def to_ascii_file(demux_fp, output_fp, samples=None, out_format='fastq'):
    """Dump the sequences of a demux file as FASTQ or FASTA records

    Parameters
    ----------
    demux_fp : str
        Path to the demux file to read
    output_fp : str
        Path of the file to write; it is opened in binary write mode
    samples : list of str, optional
        The samples to export. When None (the default), every key found in
        the demux file is exported.
    out_format : {'fastq', 'fasta'}, optional
        The record format of the output file. Default: 'fastq'

    Raises
    ------
    ValueError
        If `out_format` is not 'fastq' or 'fasta'
    """
    # Reject unknown formats up front, before any file is opened
    if out_format not in ('fastq', 'fasta'):
        raise ValueError("'out_format' should be either 'fastq' or 'fasta', "
                         "found: %s" % out_format)
    if out_format == 'fastq':
        record_formatter = format_fastq_record
    else:
        record_formatter = format_fasta_record

    with open_file(demux_fp, 'r') as demux:
        sample_names = demux.keys() if samples is None else samples
        # `_to_ascii` is handed the sample names as bytes
        encoded_names = [name.encode() for name in sample_names]

        with open(output_fp, 'wb') as out:
            for record in _to_ascii(demux, encoded_names, record_formatter):
                out.write(record)
def stats(demux):
    """Collect the summary statistics stored as attributes of a demux file

    Parameters
    ----------
    demux : {str, h5py.File, h5py.Group}
        The file or group to read the statistics from

    Returns
    -------
    stat
        A `stat` built from the 'n', 'max', 'min', 'std', 'mean', 'median',
        'hist' and 'hist_edge' attributes
    """
    field_names = ('n', 'max', 'min', 'std', 'mean', 'median', 'hist',
                   'hist_edge')
    with open_file(demux) as fh:
        stored = fh.attrs
        # Every field of the result is read from the attribute of the same
        # name while the file is still open
        return stat(**{name: stored[name] for name in field_names})
def test_filehandle(self):
    """An already-open file handle is handed back as-is"""
    with tempfile.TemporaryFile('r') as original:
        with open_file(original) as passed_through:
            self.assertTrue(original is passed_through)
        # Leaving `open_file` must not close a handle it did not open
        self.assertFalse(original.closed)
def test_file_closed(self):
    """A file opened from a path is closed when the context exits"""
    # The temporary file is held in a `with` block so that it is closed
    # (and removed) even when the assertion below fails
    with tempfile.NamedTemporaryFile('r') as f:
        with open_file(f.name) as fh:
            pass
        self.assertTrue(fh.closed)
def test_hdf5IO_open(self):
    """A path to an HDF5 file is opened as an `h5py.File`"""
    # Reserve a path on disk; the handle is closed on leaving the `with`
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        name = tmp.name
    # Registered before anything can fail, so the file is removed even if
    # the assertion below does not hold
    self.addCleanup(os.remove, name)

    # Turn the empty temporary file into a valid (empty) HDF5 file
    h5py.File(name, 'w').close()

    with open_file(name) as fh_inner:
        self.assertIsInstance(fh_inner, h5py.File)
def test_file_closed_harder(self):
    """The file is closed even if an exception is raised in the context"""
    # The temporary file is held in a `with` block so that it is closed
    # (and removed) whatever the outcome of the test
    with tempfile.NamedTemporaryFile('r') as f:
        # `assertRaises` fails the test if the context manager swallows
        # the exception instead of propagating it
        with self.assertRaises(
                TypeError, msg="`open_file` didn't propagate exceptions"):
            with open_file(f.name) as fh:
                raise TypeError
        self.assertTrue(fh.closed)
def to_per_sample_files(demux_fp, samples=None, out_dir='./', n_jobs=1,
                        out_format='fastq'):
    """Write one sequence file per sample

    Parameters
    ----------
    demux_fp : str
        The demux file path
    samples : list of str, optional
        The samples to export. When None (the default), every key found in
        the demux file is exported.
    out_dir : str, optional
        Directory in which the per sample files are written. None is
        treated like the default, the current directory.
    n_jobs : int, optional
        Number of jobs handed to `joblib.Parallel`. Defaults to 1
    out_format : {'fastq', 'fasta'}
        The format of the output files. FASTQ files are named
        '<sample>.fastq' and FASTA files '<sample>.fna'.

    Raises
    ------
    ValueError
        If `out_format` is not 'fastq' or 'fasta'
    """
    # Reject unknown formats up front, before any file is opened
    if out_format not in ('fastq', 'fasta'):
        raise ValueError("'out_format' should be either 'fastq' or 'fasta', "
                         "found: %s" % out_format)
    if out_format == 'fastq':
        formatter, file_name_fmt = format_fastq_record, "%s.fastq"
    else:
        formatter, file_name_fmt = format_fasta_record, "%s.fna"

    if samples is None:
        with open_file(demux_fp, 'r') as demux:
            # Materialize the keys: the view returned by `keys()` belongs to
            # the file, which is closed when this block is left
            samples = list(demux.keys())

    target_dir = './' if out_dir is None else out_dir

    # (sample name as bytes, output path) for every requested sample
    targets = [(name.encode(), os.path.join(target_dir, file_name_fmt % name))
               for name in samples]

    with joblib.Parallel(n_jobs=n_jobs) as par:
        par(joblib.delayed(_to_file)(demux_fp, sample, sample_fp, formatter)
            for sample, sample_fp in targets)
def parser(lines):
    """Yield records, each one a list of lines beginning at a label line

    Lines for which `ignore` is true are skipped. A new record starts at
    every line for which `is_label_line` is true; lines seen before the
    first label line are grouped into a record of their own.
    """
    with open_file(lines) as handle:
        record = []
        for raw in handle:
            try:
                raw = str(raw.decode('utf-8'))
            except AttributeError:
                # No usable `decode` (e.g. already a str): keep it as it is
                pass

            line = raw if constructor is None else constructor(raw)

            if ignore(line):
                continue

            # A label line closes the record collected so far, if any
            if is_label_line(line) and record:
                yield record
                record = []
            record.append(line)

        # The last record of the file is not followed by a label line
        if record:
            yield record
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list of ArtifactInfo or None, str
        Whether the validation succeeded, the artifact information (None on
        failure) and the error message (empty string on success)
    """
    pt_sample_ids = set(prep_info)
    with open_file(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        # this if is important so we don't regenerate the demux file if the
        # user uploads fastq or fna
        if demux_fp != new_demux_fp:
            copy(demux_fp, new_demux_fp)
            demux_fp = new_demux_fp

        with open_file(demux_fp, 'r+') as f:
            # Iterate over a snapshot of the names: renaming members while
            # iterating over the group itself can skip entries or visit the
            # freshly renamed ones, which are not keys of `id_map`
            for old in list(f):
                f.move(old, id_map[old])
        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        to_ascii_file(demux_fp, fastq_fp, out_format='fastq')
        # `_gzip_file` hands back the path to use from now on and an error
        # message, which is None when nothing went wrong
        fastq_fp, error_msg = _gzip_file(fastq_fp)
        if error_msg is not None:
            return False, None, error_msg

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        to_ascii_file(demux_fp, fasta_fp, out_format='fasta')
        fasta_fp, error_msg = _gzip_file(fasta_fp)
        if error_msg is not None:
            return False, None, error_msg

    filepaths = [(fastq_fp, 'preprocessed_fastq'),
                 (fasta_fp, 'preprocessed_fasta'),
                 (demux_fp, 'preprocessed_demux')]
    if log_fp:
        filepaths.append((log_fp, 'log'))
    return True, [ArtifactInfo(None, 'Demultiplexed', filepaths)], ""
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir): """Validate and fix a new 'Demultiplexed' artifact Parameters ---------- qclient : qiita_client.QiitaClient The Qiita server client job_id : str The job id prep_info : dict of {str: dict of {str: str}} The prep information keyed by sample id files : dict of {str: list of str} The files to add to the new artifact, keyed by filepath type out_dir : str The output directory Returns ------- dict The results of the job """ qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files") supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq', 'preprocessed_demux', 'log'} unsupported_fp_types = set(files) - supported_fp_types if unsupported_fp_types: error_msg = ("Filepath type(s) %s not supported by artifact type " "Demultiplexed. Supported filepath types: %s" % (', '.join(unsupported_fp_types), ', '.join(sorted(supported_fp_types)))) return False, None, error_msg # At most one file of each type can be provided offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1) if offending: errors = ["%s (%d): %s" % (fp_t, len(files[fp_t]), ', '.join(files[fp_t])) for fp_t in sorted(offending)] error_msg = ("Only one filepath of each file type is supported, " "offending types:\n%s" % "; ".join(errors)) return False, None, error_msg # Check which files we have available: fasta = (files['preprocessed_fasta'][0] if 'preprocessed_fasta' in files else None) fastq = (files['preprocessed_fastq'][0] if 'preprocessed_fastq' in files else None) demux = (files['preprocessed_demux'][0] if 'preprocessed_demux' in files else None) log = (files['log'][0] if 'log' in files else None) if demux: # If demux is available, use that one to perform the validation and # generate the fasta and fastq from it success, a_info, error_msg = _validate_demux_file( qclient, job_id, prep_info, out_dir, demux, log_fp=log) elif fastq: # Generate the demux file from the fastq demux = join(out_dir, "%s.demux" % 
splitext(basename(fastq))[0]) with open_file(demux, "w") as f: to_hdf5(fastq, f) # Validate the demux, providing the original fastq success, a_info, error_msg = _validate_demux_file( qclient, job_id, prep_info, out_dir, demux, fastq_fp=fastq, log_fp=log) elif fasta: # Generate the demux file from the fasta demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0]) with open_file(demux, "w") as f: to_hdf5(fasta, f) # Validate the demux, providing the original fasta success, a_info, error_msg = _validate_demux_file( qclient, job_id, prep_info, out_dir, demux, fasta_fp=fasta, log_fp=log) else: error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or " "'preprocessed_fasta' file should be provided.") return False, None, error_msg return success, a_info, error_msg
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list of ArtifactInfo or None, str
        Whether the validation succeeded, the artifact information (None on
        failure) and the error message (empty string on success)
    """
    pt_sample_ids = set(prep_info)
    with open_file(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        # The demux file may already live in `out_dir` (e.g. when it was
        # generated there from a fastq/fasta file); copying a file onto
        # itself fails, so only copy when the destination differs
        if demux_fp != new_demux_fp:
            copy(demux_fp, new_demux_fp)
        # NOTE(review): errors raised by the in-place rename below are not
        # handled here
        with open_file(new_demux_fp, 'r+') as f:
            # Iterate over a snapshot of the names: renaming members while
            # iterating over the group itself can skip entries or visit the
            # freshly renamed ones, which are not keys of `id_map`
            for old in list(f):
                f.move(old, id_map[old])
        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        to_ascii_file(demux_fp, fastq_fp, out_format='fastq')

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        to_ascii_file(demux_fp, fasta_fp, out_format='fasta')

    filepaths = [(fastq_fp, 'preprocessed_fastq'),
                 (fasta_fp, 'preprocessed_fasta'),
                 (demux_fp, 'preprocessed_demux')]
    if log_fp:
        filepaths.append((log_fp, 'log'))
    return True, [ArtifactInfo(None, 'Demultiplexed', filepaths)], ""
def _to_file(demux_fp, sample, fp, formatter):
    """Write the formatted records of a single sample to a file

    Parameters
    ----------
    demux_fp : str
        The demux file path
    sample : bytes
        The sample to export, as passed by `to_per_sample_files`
    fp : str
        Path of the file to write; it is opened in binary write mode
    formatter : callable
        The record formatter handed to `_to_ascii`
    """
    with open_file(demux_fp, 'r') as demux, open(fp, 'wb') as out:
        out.writelines(_to_ascii(demux, [sample], formatter))
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""Iterate over the records of a fastq file, four lines at a time.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If ``True``, a ``ValueError`` is raised when
        the sequence label and the quality label of a record differ.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, a ``ValueError`` is raised when a
        quality score outside the range [0, 62] is found.
    phred_offset : {33, 64}, optional
        The Phred offset used to convert the quality symbols to integers.

    Returns
    -------
    label, seq, qual
        Yields, for each record, the label with its id marker dropped, the
        sequence (decoded as UTF-8 when it can be decoded) and the quality
        scores as converted by the function matching `phred_offset`.

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64, if the file ends in the
        middle of a record, or under the conditions described for `strict`
        and `enforce_qual_range`.
    """
    if phred_offset not in (33, 64):
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)
    to_phred = ascii_to_phred33 if phred_offset == 33 else ascii_to_phred64

    with open_file(data, 'rb') as handle:
        # The same iterator is passed four times, so every step of the zip
        # consumes four consecutive lines; a short final group is padded
        # with None
        line_iter = iter(handle)
        for seqid, seq, qualid, qual in zip_longest(line_iter, line_iter,
                                                    line_iter, line_iter):
            seqid = seqid.strip()
            # A blank label line (e.g. a trailing empty line) is not an error
            if seqid == b'':
                continue

            # `seqid` itself is never None: a group only exists if at least
            # its first line was read
            if any(field is None for field in (seq, qualid, qual)):
                raise ValueError("Incomplete FASTQ record found at end "
                                 "of file")

            seq, qualid, qual = seq.strip(), qualid.strip(), qual.strip()

            seqid = _drop_id_marker(seqid)

            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                # No usable `decode` (e.g. already a str): keep it as it is
                pass

            qualid = _drop_id_marker(qualid)
            if strict and seqid != qualid:
                raise ValueError('ID mismatch: {} != {}'.format(
                    seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = to_phred(qual)
            if enforce_qual_range:
                if (qual < 0).any() or (qual > 62).any():
                    raise ValueError("Failed qual conversion for seq id: %s. "
                                     "This may be because you passed an "
                                     "incorrect value for phred_offset."
                                     % seqid)

            yield (seqid, seq, qual)
def test_hdf5IO(self):
    """An open `h5py.File` is handed back as-is"""
    # In-memory file: nothing is written to disk (backing_store=False)
    f = h5py.File('test', mode='w', driver='core', backing_store=False)
    # The test owns this handle, so it is closed once the test finishes,
    # whatever the outcome
    self.addCleanup(f.close)
    with open_file(f) as fh:
        self.assertIs(fh, f)
def test_BytesIO(self):
    """A BytesIO object (handy e.g. in tests) is handed back as-is."""
    stream = BytesIO(b"File contents")
    with open_file(stream) as handed_back:
        self.assertTrue(handed_back is stream)
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir): """Validate and fix a new 'Demultiplexed' artifact Parameters ---------- qclient : qiita_client.QiitaClient The Qiita server client job_id : str The job id prep_info : dict of {str: dict of {str: str}} The prep information keyed by sample id files : dict of {str: list of str} The files to add to the new artifact, keyed by filepath type out_dir : str The output directory Returns ------- dict The results of the job """ qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files") supported_fp_types = { 'preprocessed_fasta', 'preprocessed_fastq', 'preprocessed_demux', 'log' } unsupported_fp_types = set(files) - supported_fp_types if unsupported_fp_types: error_msg = ("Filepath type(s) %s not supported by artifact type " "Demultiplexed. Supported filepath types: %s" % (', '.join(unsupported_fp_types), ', '.join( sorted(supported_fp_types)))) return False, None, error_msg # At most one file of each type can be provided offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1) if offending: errors = [ "%s (%d): %s" % (fp_t, len(files[fp_t]), ', '.join(files[fp_t])) for fp_t in sorted(offending) ] error_msg = ("Only one filepath of each file type is supported, " "offending types:\n%s" % "; ".join(errors)) return False, None, error_msg # Check which files we have available: fasta = (files['preprocessed_fasta'][0] if 'preprocessed_fasta' in files else None) fastq = (files['preprocessed_fastq'][0] if 'preprocessed_fastq' in files else None) demux = (files['preprocessed_demux'][0] if 'preprocessed_demux' in files else None) log = (files['log'][0] if 'log' in files else None) if demux: # If demux is available, use that one to perform the validation and # generate the fasta and fastq from it success, a_info, error_msg = _validate_demux_file(qclient, job_id, prep_info, out_dir, demux, log_fp=log) elif fastq: # Generate the demux file from the fastq demux = join(out_dir, "%s.demux" % 
splitext(basename(fastq))[0]) with open_file(demux, "w") as f: to_hdf5(fastq, f) # Validate the demux, providing the original fastq success, a_info, error_msg = _validate_demux_file(qclient, job_id, prep_info, out_dir, demux, fastq_fp=fastq, log_fp=log) elif fasta: # Generate the demux file from the fasta demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0]) with open_file(demux, "w") as f: to_hdf5(fasta, f) # Validate the demux, providing the original fasta success, a_info, error_msg = _validate_demux_file(qclient, job_id, prep_info, out_dir, demux, fasta_fp=fasta, log_fp=log) else: error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or " "'preprocessed_fasta' file should be provided.") return False, None, error_msg return success, a_info, error_msg