def test_force_constructor(self):
    it = load([self.noext], constructor=FastaIterator)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'AATTGG', 'SequenceID': 'seq1',
            'Qual': None, 'QualID': None},
           {'Sequence': 'ATATA', 'SequenceID': 'seq2',
            'Qual': None, 'QualID': None}]
    self.assertEqual(obs, exp)
def to_hdf5(fp, h5file, max_barcode_length=12):
    """Represent demux data in an h5file

    Parameters
    ----------
    fp : filepath
        The filepath containing either FASTA or FASTQ data.
    h5file : h5py.File
        The file to write into.

    Notes
    -----
    A group, per sample, will be created and within that group, 5 datasets
    will be constructed that correspond to sequence, original_barcode,
    corrected_barcode, barcode_errors, and qual.

    The filepath is required as two passes over the file are essential.

    The expectation is that the filepath being operated on is the result of
    split_libraries.py or split_libraries_fastq.py from QIIME. This code
    makes assumptions about items in the comment line that are added by
    split libraries. Specifically, the code looks for a "new_bc", an
    "ori_bc" and a "bc_diffs" field, and additionally assumes the sample ID
    is encoded in the ID.
    """
    # walk over the file and collect summary stats
    sample_stats, full_stats = _summarize_lengths(_per_sample_lengths(fp))

    # construct the datasets, storing per sample stats and full file stats
    buffers = _construct_datasets(sample_stats, h5file)
    _set_attr_stats(h5file, full_stats)
    h5file.attrs['has-qual'] = _has_qual(fp)

    for rec in load(fp):
        result = search(r'^(?P<sample>.+?)_\d+? .*orig_bc=(?P<orig_bc>.+?) '
                        r'new_bc=(?P<corr_bc>.+?) bc_diffs=(?P<bc_diffs>\d+)',
                        rec['SequenceID'])
        if result is None:
            raise ValueError("%s doesn't appear to be split libraries "
                             "output!" % fp)

        sample = result.group('sample')
        bc_diffs = result.group('bc_diffs')
        corr_bc = result.group('corr_bc')
        orig_bc = result.group('orig_bc')

        sequence = rec['Sequence']
        qual = rec['Qual']

        pjoin = partial(os.path.join, sample)
        buffers[pjoin(dset_paths['sequence'])].write(sequence)
        buffers[pjoin(dset_paths['barcode_original'])].write(orig_bc)
        buffers[pjoin(dset_paths['barcode_corrected'])].write(corr_bc)
        buffers[pjoin(dset_paths['barcode_error'])].write(bc_diffs)
        if qual is not None:
            buffers[pjoin(dset_paths['qual'])].write(qual)
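# A minimal sketch of the header convention described in the Notes of
# to_hdf5 above, showing how the regular expression pulls the sample and
# barcode fields apart. The sequence ID used here is a made-up,
# split-libraries style example, not taken from real data.
from re import search

seq_id = ('sampleA_12 M00176:17:000000000 '
          'orig_bc=ACGTACGTACGT new_bc=ACGTACGTACGT bc_diffs=0')
match = search(r'^(?P<sample>.+?)_\d+? .*orig_bc=(?P<orig_bc>.+?) '
               r'new_bc=(?P<corr_bc>.+?) bc_diffs=(?P<bc_diffs>\d+)', seq_id)
print(match.group('sample'))    # sampleA
print(match.group('orig_bc'))   # ACGTACGTACGT
print(match.group('corr_bc'))   # ACGTACGTACGT
print(match.group('bc_diffs'))  # 0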
def test_transform(self):
    """load should pass transform methods to the iterators"""
    def rev_f(st):
        st['Sequence'] = st['Sequence'][::-1]
        st['Qual'] = st['Qual'][::-1] if st['Qual'] is not None else None

    it = load([self.fq1gz, self.fna1], transform=rev_f, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'CGTA', 'SequenceID': 's1', 'QualID': 's1',
            'Qual': array([40, 40, 40, 40])},
           {'Sequence': 'GGTTAA', 'SequenceID': 's2', 'QualID': 's2',
            'Qual': array([40, 40, 39, 39, 39, 39])},
           {'Sequence': 'CGTA', 'SequenceID': 's1', 'QualID': None,
            'Qual': None},
           {'Sequence': 'GGTTAA', 'SequenceID': 's2', 'QualID': None,
            'Qual': None}]

    self.assertEqual(len(obs), len(exp))
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        if e['Qual'] is None:
            self.assertEqual(o['Qual'], e['Qual'])
        else:
            self.assertTrue((o['Qual'] == e['Qual']).all())
def test_multiple_files(self):
    """load should handle multiple files of different types"""
    it = load([self.fq1, self.fna1], phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': 's1',
            'Qual': array([40, 40, 40, 40])},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': 's2',
            'Qual': array([39, 39, 39, 39, 40, 40])},
           {'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': None,
            'Qual': None},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': None,
            'Qual': None}]

    self.assertEqual(len(obs), len(exp))
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        if e['Qual'] is None:
            self.assertEqual(o['Qual'], e['Qual'])
        else:
            self.assertTrue((o['Qual'] == e['Qual']).all())
def test_single_files(self):
    """load should handle a single file, and can be gzipped"""
    it = load(self.fna1)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': None,
            'Qual': None},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': None,
            'Qual': None}]
    self.assertEqual(obs, exp)

    it = load(self.fq1, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': 's1',
            'Qual': array([40, 40, 40, 40])},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': 's2',
            'Qual': array([39, 39, 39, 39, 40, 40])}]
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        self.assertTrue((o['Qual'] == e['Qual']).all())

    it = load(self.fna1gz)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': None,
            'Qual': None},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': None,
            'Qual': None}]
    self.assertEqual(obs, exp)

    it = load(self.fq1gz, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': 's1',
            'Qual': array([40, 40, 40, 40])},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': 's2',
            'Qual': array([39, 39, 39, 39, 40, 40])}]
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        self.assertTrue((o['Qual'] == e['Qual']).all())
def _per_sample_lengths(fp):
    """Determine the lengths of all sequences per sample

    Parameters
    ----------
    fp : filepath
        The sequence file to walk over

    Returns
    -------
    dict
        {sample_id: [sequence_length]}
    """
    lengths = defaultdict(list)
    for record in load(fp):
        sample_id = record['SequenceID'].split('_', 1)[0]
        lengths[sample_id].append(len(record['Sequence']))
    return lengths
def _per_sample_lengths(fp):
    """Determine the lengths of all sequences per sample

    Parameters
    ----------
    fp : filepath
        The sequence file to walk over

    Returns
    -------
    dict
        {sample_id: [sequence_length]}
    """
    lengths = defaultdict(list)
    for record in load(fp):
        sample_id = record['SequenceID'].split(' ')[0].rsplit('_', 1)[0]
        lengths[sample_id].append(len(record['Sequence']))
    return lengths
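# A quick sketch of how the two ID-parsing strategies above differ on a
# sample id that itself contains an underscore. The sequence id used here
# is a made-up illustration, not taken from real data.
seq_id = 'sample_A_42 orig_bc=ACGT new_bc=ACGT bc_diffs=0'

# Splitting on the first underscore truncates underscore-containing ids:
print(seq_id.split('_', 1)[0])                 # sample

# Dropping the comment after the first space, then stripping only the
# trailing per-sample read counter, keeps the full sample id intact:
print(seq_id.split(' ')[0].rsplit('_', 1)[0])  # sample_A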
def test_multiple_types_fasta_fastq_qual(self):
    with self.assertRaises(ValueError):
        load([self.fna1, self.fq1], qual=self.qual1)

def test_file_path_does_not_exist(self):
    with self.assertRaises(IOError):
        load('this-seqs-file-had-better-not-exist-or-this-test-will-'
             'fail.fna')

def test_unknown_filetype(self):
    with self.assertRaises(IOError):
        load('seqs.mpeg')

def test_no_seqs(self):
    for null in ('', [], (), None):
        with self.assertRaises(ValueError):
            load(null)
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list or None, str
        Whether the validation succeeded, the artifact information (None on
        failure) and an error message
    """
    pt_sample_ids = set(prep_info)
    with File(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep
        # template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")

        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux
        # file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information '
                             'to map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same as
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file
                # do not match the ones in the prep template and we can't
                # fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing '
                             'sample ids to the prep information sample '
                             'ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        # Need to catch an error
        with File(new_demux_fp, 'r+') as f:
            for old in f:
                f.move(old, id_map[old])

        # When we fix, we always generate the FASTQ and FASTA files.
        # By setting them to None, they will be regenerated below
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]

    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])

    return True, [[None, 'Demultiplexed', filepaths]], ""
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    dict
        The results of the job
    """
    pt_sample_ids = set(prep_info)
    with File(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep
        # template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")

        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux
        # file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the "run_prefix" columns '
                              'from the prep information do not match the '
                              'ones in the demux file. Please, correct the '
                              'column "run_prefix" in the prep information '
                              'to map the existing sample ids to the prep '
                              'information sample ids.')
        else:
            # Attempt 2: the sample ids in the demux table are the same as
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file
                # do not match the ones in the prep template and we can't
                # fix it
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the demultiplexed files do '
                              'not match the ones in the prep information. '
                              'Please, provide the column "run_prefix" in '
                              'the prep information to map the existing '
                              'sample ids to the prep information sample '
                              'ids.')

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        # Need to catch an error
        with File(new_demux_fp, 'r+') as f:
            for old in f:
                f.move(old, id_map[old])

        # When we fix, we always generate the FASTQ and FASTA files.
        # By setting them to None, they will be regenerated below
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]

    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])

    return format_payload(
        success=True,
        artifacts_info=[[None, 'Demultiplexed', filepaths]])
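# A small sketch of the "Attempt 2" prefix fix used above: if the prep
# information ids carry a study prefix (e.g. '1.sampleA') while the demux
# file ids do not, the mapping simply re-attaches that prefix. The ids here
# are made up for illustration only.
pt_sample_ids = {'1.sampleA', '1.sampleB'}
demux_sample_ids = {'sampleA', 'sampleB'}

prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
if pt_sample_ids.issuperset(prefixed):
    id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
print(id_map)  # maps 'sampleA' -> '1.sampleA' and 'sampleB' -> '1.sampleB'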
def _has_qual(fp):
    """Check if it looks like we have qual"""
    iter_ = load(fp)
    rec = next(iter(iter_))
    return rec['Qual'] is not None
def test_single_files(self):
    """load should handle a single file, and can be gzipped"""
    it = load(self.fna1)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': None,
            'Qual': None},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': None,
            'Qual': None}]
    self.assertEqual(obs, exp)

    it = load(self.fq1, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': 's1',
            'Qual': array([40, 40, 40, 40])},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': 's2',
            'Qual': array([39, 39, 39, 39, 40, 40])}]
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        self.assertTrue((o['Qual'] == e['Qual']).all())

    it = load(self.qs1, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Qual': array([2, 27, 27, 27]),
            'QualID': 'CRESSIA_242:1:2204:1453:1918#0/1',
            'Sequence': 'TTAA',
            'SequenceID': 'CRESSIA_242:1:2204:1453:1918#0/1'},
           {'Qual': array([2, 2, 2, 2]),
            'QualID': 'CRESSIA_242:1:2204:1491:1920#0/1',
            'Sequence': 'AAAA',
            'SequenceID': 'CRESSIA_242:1:2204:1491:1920#0/1'}]
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        self.assertTrue((o['Qual'] == e['Qual']).all())

    it = load(self.fna1gz)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': None,
            'Qual': None},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': None,
            'Qual': None}]
    self.assertEqual(obs, exp)

    it = load(self.fq1gz, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Sequence': 'ATGC', 'SequenceID': 's1', 'QualID': 's1',
            'Qual': array([40, 40, 40, 40])},
           {'Sequence': 'AATTGG', 'SequenceID': 's2', 'QualID': 's2',
            'Qual': array([39, 39, 39, 39, 40, 40])}]
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        self.assertTrue((o['Qual'] == e['Qual']).all())

    it = load(self.qs1gz, phred_offset=64)
    obs = [rec.copy() for rec in it]
    exp = [{'Qual': array([2, 27, 27, 27]),
            'QualID': 'CRESSIA_242:1:2204:1453:1918#0/1',
            'Sequence': 'TTAA',
            'SequenceID': 'CRESSIA_242:1:2204:1453:1918#0/1'},
           {'Qual': array([2, 2, 2, 2]),
            'QualID': 'CRESSIA_242:1:2204:1491:1920#0/1',
            'Sequence': 'AAAA',
            'SequenceID': 'CRESSIA_242:1:2204:1491:1920#0/1'}]
    for o, e in zip(obs, exp):
        self.assertEqual(o['Sequence'], e['Sequence'])
        self.assertEqual(o['SequenceID'], e['SequenceID'])
        self.assertEqual(o['QualID'], e['QualID'])
        self.assertTrue((o['Qual'] == e['Qual']).all())