def test_contigset_consolidate_int_names(self):
    """Consolidate two FASTA files whose record names are plain integers.

    The 'B.cereus.1' contig of the copied reference is written to two
    separate files under the names '5141' and '5142'.  After the combined
    ContigSet is consolidated, both names must still resolve to the
    original sequence.
    """
    # build set to merge
    workdir = tempfile.mkdtemp(suffix="dataset-unittest")
    source_fasta = os.path.join(workdir, 'infile.fasta')
    numbered_fastas = [os.path.join(workdir, 'tempfile1.fasta'),
                       os.path.join(workdir, 'tempfile2.fasta')]

    # copy fasta reference to hide fai and ensure FastaReader is used
    reference_fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
    backticks('cp {i} {o}'.format(i=reference_fasta, o=source_fasta))
    reference_contigs = ContigSet(source_fasta)
    template = reference_contigs.get_contig('B.cereus.1')

    # todo: modify the names first:
    int_names = ['5141', '5142']
    for fasta_path, int_name in zip(numbered_fastas, int_names):
        with FastaWriter(fasta_path) as writer:
            writer.writeRecord(int_name, template.sequence)

    merged = ContigSet(*numbered_fastas)
    log.debug(merged.toExternalFiles())
    merged.consolidate()
    log.debug(merged.toExternalFiles())

    # open obs and compare to exp: every renamed copy carries the same
    # sequence as the template contig
    for int_name in int_names:
        self.assertEqual(merged.get_contig(int_name).sequence[:],
                         template.sequence)
def test_contigset_consolidate_genomic_consensus(self):
    """
    Verify that the contigs output by GenomicConsensus (e.g. quiver) can
    be consolidated.

    Three windowed records are each written to their own FASTA file once
    per name suffix.  For every suffix the three files are loaded as one
    ContigSet, consolidated and written to XML; the reloaded dataset must
    then contain exactly one record.
    """
    # Local import keeps this test self-contained.
    import os

    FASTA1 = ("lambda_NEB3011_0_60",
              "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
    FASTA2 = ("lambda_NEB3011_120_180",
              "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
    FASTA3 = ("lambda_NEB3011_60_120",
              "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
    suffixes = ["", "|quiver", "|plurality", "|arrow", "|poa"]

    # A private scratch directory avoids reusing the name of an already
    # deleted NamedTemporaryFile, which another process could claim first.
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")

    # files[w][s] is the FASTA holding window w written with suffix s
    files = []
    for w, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
        _files = []
        for s, suffix in enumerate(suffixes):
            tmpfile = os.path.join(
                outdir, "window{w}_suffix{s}.fasta".format(w=w, s=s))
            with open(tmpfile, "w") as f:
                f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
            _files.append(tmpfile)
        files.append(_files)

    # Check every suffix variant that was written, not just the first three.
    for i in range(len(suffixes)):
        ds = ContigSet(*[f[i] for f in files])
        out1 = os.path.join(
            outdir, "consolidated{i}.contigset.xml".format(i=i))
        fa1 = os.path.join(outdir, "consolidated{i}.fasta".format(i=i))
        ds.consolidate(fa1)
        ds.write(out1)
        with ContigSet(out1) as ds_new:
            self.assertEqual(len([rec for rec in ds_new]), 1,
                             "failed on %d" % i)
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None,
                       skip_empty=True):
    """
    Merge several contig files into one consolidated ContigSet.

    :param resource_file_extension: extension (without a leading dot) that
                                    replaces the trailing ``xml`` of
                                    output_file when new_resource_file has
                                    to be derived
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Drop inputs whose ContigSet reports no external files
    :return: Output file
    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            # Each input is opened only to inspect it; close it again
            # right away so probe datasets do not pile up.
            with ContigSet(file_name) as cs:
                if len(cs.toExternalFiles()) > 0:
                    _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        # Only derivable when the output name ends in 'xml'; otherwise
        # new_resource_file stays unset and is passed through as-is.
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + resource_file_extension
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    tbr.write(output_file)
    return output_file
def test_contigset_consolidate(self):
    """Consolidate a ContigSet whose records are spread over two FASTA files.

    Two contigs are written unchanged (one per file) and a third,
    'B.cereus.1', is written as two windowed pieces: '<name>_10_20' in the
    first file and '<name>_0_10' (with 12 extra bases appended) in the
    second.  Before consolidation the set holds four records; afterwards
    it must hold three, with the windowed pieces joined in window order.
    """
    # build set to merge
    workdir = tempfile.mkdtemp(suffix="dataset-unittest")
    source_fasta = os.path.join(workdir, 'infile.fasta')
    first_fasta = os.path.join(workdir, 'tempfile1.fasta')
    second_fasta = os.path.join(workdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    reference_fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
    backticks('cp {i} {o}'.format(i=reference_fasta, o=source_fasta))
    reference_contigs = ContigSet(source_fasta)

    single_names = ['A.baumannii.1', 'A.odontolyticus.1']
    split_name = 'B.cereus.1'
    padding = 'ATCGATCGATCG'
    # The reader itself is never used below; the lookup is kept as-is.
    reader = reference_contigs.resourceReaders()[0]
    split_contig = reference_contigs.get_contig(split_name)
    single_contigs = []
    for single_name in single_names:
        single_contigs.append(reference_contigs.get_contig(single_name))

    # todo: modify the names first:
    with FastaWriter(first_fasta) as writer:
        writer.writeRecord(single_contigs[0])
        writer.writeRecord(split_contig.name + '_10_20',
                           split_contig.sequence)
    with FastaWriter(second_fasta) as writer:
        writer.writeRecord(split_contig.name + '_0_10',
                           split_contig.sequence + padding)
        writer.writeRecord(single_contigs[1])

    # The '_0_10' piece (sequence + padding) comes before the '_10_20' piece.
    expected_split_seq = (split_contig.sequence + padding +
                          split_contig.sequence)
    expected_single_seqs = []
    for contig in single_contigs:
        expected_single_seqs.append(contig.sequence)

    merged = ContigSet(first_fasta, second_fasta)
    merged.induceIndices()
    log.debug(merged.toExternalFiles())
    self.assertEqual(len(merged), 4)
    self.assertEqual(len(list(merged)), 4)
    merged.consolidate()
    log.debug(merged.toExternalFiles())

    # open acc and compare to exp
    for position, single_name in enumerate(single_names):
        self.assertEqual(merged.get_contig(single_name).sequence[:],
                         expected_single_seqs[position])
    self.assertEqual(merged.get_contig(split_name).sequence[:],
                     expected_split_seq)

    self.assertEqual(len(merged._openReaders), 1)
    self.assertEqual(len(merged.index), 3)
    self.assertEqual(len(merged._indexMap), 3)
    self.assertEqual(len(merged), 3)
    self.assertEqual(len(list(merged)), 3)

    # test merge: only checks that adding the two sets does not raise
    left = ContigSet(first_fasta)
    right = ContigSet(second_fasta)
    combined = left + right
def test_contigset_consolidate(self):
    """Check that consolidation joins windowed contigs and keeps the rest.

    The first FASTA gets one whole contig plus the '_10_20' window of
    'B.cereus.1'; the second gets the '_0_10' window (sequence followed by
    12 extra bases) plus another whole contig.  The combined set must go
    from four records to three when consolidated, served by one reader.
    """
    # build set to merge
    scratch = tempfile.mkdtemp(suffix="dataset-unittest")
    ref_copy, chunk_a, chunk_b = [
        os.path.join(scratch, fname)
        for fname in ('infile.fasta', 'tempfile1.fasta', 'tempfile2.fasta')]

    # copy fasta reference to hide fai and ensure FastaReader is used
    ref_source = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
    backticks('cp {i} {o}'.format(i=ref_source, o=ref_copy))
    ref_set = ContigSet(ref_copy)

    whole_names = ['A.baumannii.1', 'A.odontolyticus.1']
    windowed_name = 'B.cereus.1'
    extra_bases = 'ATCGATCGATCG'
    # Fetched but not used afterwards; the call is kept unchanged.
    reader = ref_set.resourceReaders()[0]
    windowed = ref_set.get_contig(windowed_name)
    whole_first, whole_second = [ref_set.get_contig(name)
                                 for name in whole_names]

    # todo: modify the names first:
    with FastaWriter(chunk_a) as writer:
        writer.writeRecord(whole_first)
        writer.writeRecord(windowed.name + '_10_20', windowed.sequence)
    with FastaWriter(chunk_b) as writer:
        writer.writeRecord(windowed.name + '_0_10',
                           windowed.sequence + extra_bases)
        writer.writeRecord(whole_second)

    # Expected join: the '_0_10' window, then the '_10_20' window.
    want_windowed = windowed.sequence + extra_bases + windowed.sequence
    want_whole = {whole_names[0]: whole_first.sequence,
                  whole_names[1]: whole_second.sequence}

    acc = ContigSet(chunk_a, chunk_b)
    acc.induceIndices()
    log.debug(acc.toExternalFiles())
    self.assertEqual(len(acc), 4)
    self.assertEqual(len(list(acc)), 4)
    acc.consolidate()
    log.debug(acc.toExternalFiles())

    # open acc and compare to exp
    for name in whole_names:
        self.assertEqual(acc.get_contig(name).sequence[:], want_whole[name])
    self.assertEqual(acc.get_contig(windowed_name).sequence[:],
                     want_windowed)

    self.assertEqual(len(acc._openReaders), 1)
    self.assertEqual(len(acc.index), 3)
    self.assertEqual(len(acc._indexMap), 3)
    self.assertEqual(len(acc), 3)
    self.assertEqual(len(list(acc)), 3)

    # test merge: only checks that adding the two sets does not raise
    set_a = ContigSet(chunk_a)
    set_b = ContigSet(chunk_b)
    set_sum = set_a + set_b
def __gather_contigset(input_files, output_file, new_resource_file):
    """Copied from pbcoretools.chunking.gather:__gather_contigset()

    Inputs whose ContigSet reports no external files are always skipped.
    The remaining inputs are combined, consolidated into new_resource_file,
    given a new UUID and written to output_file with relative paths.

    :param input_files: List of file paths
    :param output_file: path the resulting ContigSet is written to
    :param new_resource_file: path handed to ContigSet.consolidate()
    :return: output_file, unchanged
    """
    non_empty_files = []
    for file_name in input_files:
        # Each input is opened only to inspect it; close it again right
        # away so probe datasets do not pile up.
        with ContigSet(file_name) as cs:
            if len(cs.toExternalFiles()) > 0:
                non_empty_files.append(file_name)
    tbr = ContigSet(*non_empty_files)
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    tbr.write(output_file, relPaths=True)
    return output_file
def gather_contigset(input_files, output_file, new_resource_file=None,
                     skip_empty=True):
    """
    Combine contig files into one consolidated ContigSet and write it out.

    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated; when not given and
                              output_file ends in ``xml``, that ending is
                              replaced by ``fasta``
    :param skip_empty: accepted for interface compatibility; this
                       implementation never reads it
    :return: Output file
    :rtype: str
    """
    combined = ContigSet(*input_files)
    derive_resource_name = (not new_resource_file and
                            output_file.endswith('xml'))
    if derive_resource_name:
        xml_stem = output_file[:-3]
        new_resource_file = xml_stem + 'fasta'
    combined.consolidate(new_resource_file)
    combined.write(output_file)
    return output_file
def test_fastq_consolidate(self):
    """Consolidate a length-filtered FASTQ ContigSet into a new FASTQ file.

    The first 240 lines of the source FASTQ are copied out and must load
    as 60 records.  After requiring length > 1000 the set must report 23
    records, and the consolidated file must contain that same number.
    """
    # Local import keeps this test self-contained.
    import os

    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    # A private scratch directory avoids reusing the name of an already
    # deleted NamedTemporaryFile, which another process could claim first.
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fq_out = os.path.join(outdir, 'subset.fastq')
    cfq_out = os.path.join(outdir, 'consolidated.fastq')

    with open(fq_out, 'w') as fqh, open(fn, 'r') as fih:
        fqh.writelines(itertools.islice(fih, 240))

    cset = ContigSet(fq_out)
    self.assertEqual(sum(1 for _ in cset), 60)

    cset.filters.addRequirement(length=[('>', 1000)])
    self.assertEqual(sum(1 for _ in cset), 23)

    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 23)
    # Close the reader once the consolidated records have been counted.
    with FastqReader(cfq_out) as cfq:
        self.assertEqual(cset_l, sum(1 for _ in cfq))
def test_empty_fastq_consolidate(self):
    """Consolidate pairs of FASTQ inputs where either or both may be empty.

    Four scenarios reuse the same two input paths and the same output
    path: both inputs filled (120 records expected), only the first
    filled (60), only the second filled (60), and both empty (0).  In
    each case the record count must be the same before consolidation,
    after it, and in the consolidated FASTQ file.
    """
    # Local import keeps this test self-contained.
    import os

    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    # A private scratch directory avoids reusing the name of an already
    # deleted NamedTemporaryFile, which another process could claim first.
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fq1_out = os.path.join(outdir, 'input1.fastq')
    fq2_out = os.path.join(outdir, 'input2.fastq')
    cfq_out = os.path.join(outdir, 'consolidated.fastq')

    def write_slice(path, span):
        """Overwrite path with source lines [start, stop), or leave it
        empty when span is None."""
        with open(path, 'w') as fqh:
            if span is None:
                # No need to open the source just to write nothing.
                return
            with open(fn, 'r') as fih:
                fqh.writelines(itertools.islice(fih, span[0], span[1]))

    # (label, lines for input 1, lines for input 2, expected record count)
    scenarios = [
        ("two full", (0, 240), (240, 480), 120),
        ("one full one empty", (0, 240), None, 60),
        ("one empty one full", None, (0, 240), 60),
        ("both empty", None, None, 0),
    ]

    for label, span1, span2, expected in scenarios:
        write_slice(fq1_out, span1)
        write_slice(fq2_out, span2)

        cset = ContigSet(fq1_out, fq2_out)
        before = sum(1 for _ in cset)
        self.assertEqual(
            before, expected,
            "%s: %d records before consolidation, expected %d"
            % (label, before, expected))

        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        self.assertEqual(
            cset_l, expected,
            "%s: %d records after consolidation, expected %d"
            % (label, cset_l, expected))

        # Close the reader once the consolidated records have been counted.
        with FastqReader(cfq_out) as cfq:
            written = sum(1 for _ in cfq)
        self.assertEqual(
            cset_l, written,
            "%s: dataset has %d records, consolidated file has %d"
            % (label, cset_l, written))
def consolidateXml(args):
    """Consolidate the ContigSet named by ``args.infile`` and save it.

    The dataset is opened from ``args.infile``, consolidated into
    ``args.datafile`` and then written to ``args.xmlfile``.

    :param args: object providing ``infile``, ``datafile`` and ``xmlfile``
    """
    contig_set = ContigSet(args.infile)
    contig_set.consolidate(args.datafile)
    contig_set.write(args.xmlfile)