def test_contigset_consolidate_int_names(self):
    """Consolidate two FASTA files whose record names are all digits.

    The sequence of contig 'B.cereus.1' is written to two separate FASTA
    files under the names '5141' and '5142'. A ContigSet built from both
    files is consolidated, and each name must still resolve to the
    original sequence afterwards.
    """
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    try:
        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            assert obs_file.get_contig(name).sequence[:] == seq
    finally:
        # mkdtemp() leaves removal to the caller; without this every run
        # leaves a scratch directory behind, pass or fail.
        shutil.rmtree(outdir, ignore_errors=True)
def test_contigset_consolidate_int_names(self):
    """Consolidate two FASTA files whose record names are all digits.

    The sequence of contig 'B.cereus.1' is written to two separate FASTA
    files under the names '5141' and '5142'. A ContigSet built from both
    files is consolidated, and each name must still resolve to the
    original sequence afterwards.
    """
    # Function-scope import: this block previously shelled out to `cp`,
    # so shutil may not be imported at module level.
    import shutil

    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    try:
        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used.
        # Done in-process rather than through a shell command string, so
        # paths with spaces/metacharacters work and a failed copy raises
        # here instead of surfacing later as a confusing read error.
        shutil.copyfile(
            ReferenceSet(data.getXml(9)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
    finally:
        # mkdtemp() leaves removal to the caller; without this every run
        # leaves a scratch directory behind, pass or fail.
        shutil.rmtree(outdir, ignore_errors=True)
def test_contigset_consolidate(self):
    """Consolidate a ContigSet whose contig is split across two FASTA files.

    Two FASTA files are written, each holding one singleton contig plus
    one piece of 'B.cereus.1' (named with '_10_20' and '_0_10' suffixes).
    Before consolidation the set must report 4 records; afterwards it
    must report 3, with the two pieces joined into a single
    'B.cereus.1' record and the singletons unchanged.
    """
    # Function-scope import: this block previously shelled out to `cp`,
    # so shutil may not be imported at module level.
    import shutil

    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    try:
        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used.
        # Done in-process rather than through a shell command string, so
        # paths with spaces/metacharacters work and a failed copy raises
        # here instead of surfacing later as a confusing read error.
        shutil.copyfile(
            ReferenceSet(data.getXml(9)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)

        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
        double = 'B.cereus.1'
        # NOTE(review): `reader` is never used below; the call is kept in
        # case resourceReaders() has side effects -- confirm before removing.
        reader = rs1.resourceReaders()[0]
        exp_double = rs1.get_contig(double)
        exp_singles = [rs1.get_contig(name) for name in singletons]

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord(exp_singles[0])
            writer.writeRecord(exp_double.name + '_10_20',
                               exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord(exp_double.name + '_0_10',
                               exp_double.sequence + 'ATCGATCGATCG')
            writer.writeRecord(exp_singles[1])

        # Expected merged sequence: the '_0_10' piece (sequence plus the
        # 12-base tail) followed by the '_10_20' piece.
        exp_double_seq = ''.join([exp_double.sequence,
                                  'ATCGATCGATCG',
                                  exp_double.sequence])
        exp_single_seqs = [rec.sequence for rec in exp_singles]

        acc_file = ContigSet(outFas1, outFas2)
        acc_file.induceIndices()
        log.debug(acc_file.toExternalFiles())
        self.assertEqual(len(acc_file), 4)
        self.assertEqual(len(list(acc_file)), 4)
        acc_file.consolidate()
        log.debug(acc_file.toExternalFiles())

        # open acc and compare to exp
        for name, seq in zip(singletons, exp_single_seqs):
            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
        self.assertEqual(acc_file.get_contig(double).sequence[:],
                         exp_double_seq)

        self.assertEqual(len(acc_file._openReaders), 1)
        self.assertEqual(len(acc_file.index), 3)
        self.assertEqual(len(acc_file._indexMap), 3)
        self.assertEqual(len(acc_file), 3)
        self.assertEqual(len(list(acc_file)), 3)

        # test merge:
        acc1 = ContigSet(outFas1)
        acc2 = ContigSet(outFas2)
        acc3 = acc1 + acc2
    finally:
        # mkdtemp() leaves removal to the caller; without this every run
        # leaves a scratch directory behind, pass or fail.
        shutil.rmtree(outdir, ignore_errors=True)
def test_contigset_consolidate(self):
    """Consolidate a ContigSet whose contig is split across two FASTA files.

    Two FASTA files are written, each holding one singleton contig plus
    one piece of 'B.cereus.1' (named with '_10_20' and '_0_10' suffixes).
    Before consolidation the set must report 4 records; afterwards it
    must report 3, with the two pieces joined into a single
    'B.cereus.1' record and the singletons unchanged.
    """
    # Function-scope import: this block previously shelled out to `cp`,
    # so shutil may not be imported at module level.
    import shutil

    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    try:
        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used.
        # Done in-process rather than through a shell command string, so
        # paths with spaces/metacharacters work and a failed copy raises
        # here instead of surfacing later as a confusing read error.
        source_fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
        shutil.copyfile(source_fasta, inFas)
        rs1 = ContigSet(inFas)

        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
        double = 'B.cereus.1'
        # NOTE(review): `reader` is never used below; the call is kept in
        # case resourceReaders() has side effects -- confirm before removing.
        reader = rs1.resourceReaders()[0]
        exp_double = rs1.get_contig(double)
        exp_singles = [rs1.get_contig(name) for name in singletons]

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord(exp_singles[0])
            writer.writeRecord(exp_double.name + '_10_20',
                               exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord(exp_double.name + '_0_10',
                               exp_double.sequence + 'ATCGATCGATCG')
            writer.writeRecord(exp_singles[1])

        # Expected merged sequence: the '_0_10' piece (sequence plus the
        # 12-base tail) followed by the '_10_20' piece.
        exp_double_seq = ''.join(
            [exp_double.sequence, 'ATCGATCGATCG', exp_double.sequence])
        exp_single_seqs = [rec.sequence for rec in exp_singles]

        acc_file = ContigSet(outFas1, outFas2)
        acc_file.induceIndices()
        log.debug(acc_file.toExternalFiles())
        self.assertEqual(len(acc_file), 4)
        self.assertEqual(len(list(acc_file)), 4)
        acc_file.consolidate()
        log.debug(acc_file.toExternalFiles())

        # open acc and compare to exp
        for name, seq in zip(singletons, exp_single_seqs):
            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
        self.assertEqual(
            acc_file.get_contig(double).sequence[:], exp_double_seq)

        self.assertEqual(len(acc_file._openReaders), 1)
        self.assertEqual(len(acc_file.index), 3)
        self.assertEqual(len(acc_file._indexMap), 3)
        self.assertEqual(len(acc_file), 3)
        self.assertEqual(len(list(acc_file)), 3)

        # test merge:
        acc1 = ContigSet(outFas1)
        acc2 = ContigSet(outFas2)
        acc3 = acc1 + acc2
    finally:
        # mkdtemp() leaves removal to the caller; without this every run
        # leaves a scratch directory behind, pass or fail.
        shutil.rmtree(outdir, ignore_errors=True)
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None, skip_empty=True):
    """Merge several contig set files into one consolidated ContigSet.

    :param resource_file_extension: suffix appended to ``output_file``
                                    (with its trailing ``xml`` removed) to
                                    derive ``new_resource_file`` when none
                                    is supplied
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other
                              contig files are consolidated
    :param skip_empty: Ignore inputs that list no external files
                       (doesn't do much yet)

    :return: Output file
    :rtype: str
    """
    if skip_empty:
        # Keep only inputs that reference at least one external file.
        input_files = [path for path in input_files
                       if len(ContigSet(path).toExternalFiles()) > 0]

    merged = ContigSet(*input_files)

    # Derive the resource path from an XML output path only when the
    # caller did not provide one; otherwise it is passed through as given.
    if not new_resource_file and output_file.endswith('xml'):
        new_resource_file = output_file[:-3] + resource_file_extension

    merged.consolidate(new_resource_file)
    merged.newUuid()
    sanitize_dataset_tags(merged)
    merged.write(output_file)
    return output_file
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None, skip_empty=True):
    """Merge several contig set files into one consolidated ContigSet.

    :param resource_file_extension: suffix appended to ``output_file``
                                    (with its trailing ``xml`` removed) to
                                    derive ``new_resource_file`` when none
                                    is supplied
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other
                              contig files are consolidated
    :param skip_empty: Ignore inputs that list no external files
                       (doesn't do much yet)

    :return: Output file
    :rtype: str
    """
    if skip_empty:
        # Keep only inputs that reference at least one external file.
        input_files = [path for path in input_files
                       if len(ContigSet(path).toExternalFiles()) > 0]

    merged = ContigSet(*input_files)

    # Derive the resource path from an XML output path only when the
    # caller did not provide one; otherwise it is passed through as given.
    if not new_resource_file and output_file.endswith('xml'):
        new_resource_file = output_file[:-3] + resource_file_extension

    merged.consolidate(new_resource_file)
    merged.newUuid()
    merged.write(output_file)
    return output_file
def _get_fasta_path(file_name):
    """Return the FASTA path behind ``file_name``.

    Paths that do not end in ``.contigset.xml`` are returned unchanged.
    A ``.contigset.xml`` path is opened as a ContigSet, which must list
    exactly one external file; that file's path is returned.
    """
    if not file_name.endswith(".contigset.xml"):
        return file_name

    fasta_files = ContigSet(file_name).toExternalFiles()
    assert len(fasta_files) == 1
    return fasta_files[0]
def __gather_contigset(input_files, output_file, new_resource_file):
    """Merge contig set files into one ContigSet written with relative paths.

    Copied from pbcoretools.chunking.gather:__gather_contigset()

    Inputs that list no external files are always skipped. The remaining
    inputs are combined, consolidated into ``new_resource_file``, given a
    new UUID and written to ``output_file``, which is returned.
    """
    # Empty-input skipping is unconditional in this copy.
    non_empty = [path for path in input_files
                 if len(ContigSet(path).toExternalFiles()) > 0]

    merged = ContigSet(*non_empty)
    merged.consolidate(new_resource_file)
    merged.newUuid()
    merged.write(output_file, relPaths=True)
    return output_file