def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    vcf_path = rc.task.output_files[1]
    dataset_path = rc.task.output_files[2]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--maskRadius", str(Constants.DEFAULT_MASK_RADIUS)
            if bool(rc.task.options[Constants.MASKING_ID]) else "0",
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options, log_level):
    #'--log-file foo.log',
    #'--verbose',
    #'--debug', # requires 'ipdb'
    #'-j NWORKERS',
    #'--algorithm quiver',
    #'--diploid', # binary
    #'--minConfidence 40',
    #'--minCoverage 5',
    #'--alignmentSetRefWindows',
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    system(cmd.format(**locals()))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))
        # Convert to contigset.xml
        import pysam
        pysam.faidx(fasta)
        # pylint: disable=no-member
        # I do not know why pylint does not see this defined.
        ds = ContigSet(fasta, strict=True)
        ds.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(
            fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
def test_contigset_build(self):
    ds1 = ContigSet(data.getXml(3), skipMissing=True)
    assert type(ds1).__name__ == 'ContigSet'
    assert type(ds1._metadata).__name__ == 'ContigSetMetadata'
    ds2 = ContigSet(data.getXml(3), skipMissing=True)
    assert type(ds2).__name__ == 'ContigSet'
    assert type(ds2._metadata).__name__ == 'ContigSetMetadata'
def _get_fasta_path(file_name):
    if file_name.endswith(".contigset.xml"):
        ds = ContigSet(file_name)
        fasta_files = ds.toExternalFiles()
        assert len(fasta_files) == 1
        return fasta_files[0]
    return file_name
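# A minimal usage sketch for _get_fasta_path (above): ContigSet XML inputs are
# unwrapped to their single underlying FASTA resource, while plain FASTA paths
# are returned unchanged. The file names here are hypothetical.
def _example_get_fasta_path():
    paths = ["polished.contigset.xml", "polished.fasta"]
    return [_get_fasta_path(p) for p in paths]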
def test_contigset_build(self):
    ds1 = ContigSet(data.getXml(3), skipMissing=True)
    self.assertEquals(type(ds1).__name__, 'ContigSet')
    self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
    ds2 = ContigSet(data.getXml(3), skipMissing=True)
    self.assertEquals(type(ds2).__name__, 'ContigSet')
    self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
def test_contigset_consolidate_int_names(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')
    # copy fasta reference to hide fai and ensure FastaReader is used
    shutil.copyfile(
        ReferenceSet(data.getXml(8)).toExternalFiles()[0],
        inFas)
    rs1 = ContigSet(inFas)
    double = 'B.cereus.1'
    exp_double = rs1.get_contig(double)
    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord('5141', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord('5142', exp_double.sequence)
    exp_double_seqs = [exp_double.sequence, exp_double.sequence]
    exp_names = ['5141', '5142']
    obs_file = ContigSet(outFas1, outFas2)
    log.debug(obs_file.toExternalFiles())
    obs_file.consolidate()
    log.debug(obs_file.toExternalFiles())
    # open obs and compare to exp
    for name, seq in zip(exp_names, exp_double_seqs):
        assert obs_file.get_contig(name).sequence[:] == seq
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    dataset_path = rc.task.output_files[1]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if rc.task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
def test_contigset_consolidate_genomic_consensus(self):
    """
    Verify that the contigs output by GenomicConsensus (e.g. quiver) can
    be consolidated.
    """
    FASTA1 = ("lambda_NEB3011_0_60",
              "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
    FASTA2 = ("lambda_NEB3011_120_180",
              "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
    FASTA3 = ("lambda_NEB3011_60_120",
              "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
    files = []
    for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
        _files = []
        for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
            tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
            with open(tmpfile, "w") as f:
                f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
            _files.append(tmpfile)
        files.append(_files)
    for i in range(3):
        ds = ContigSet(*[f[i] for f in files])
        out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
        fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
        ds.consolidate(fa1)
        ds.write(out1)
        with ContigSet(out1) as ds_new:
            self.assertEqual(len([rec for rec in ds_new]), 1,
                             "failed on %d" % i)
def _write_fasta_or_contigset(file_name):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(251)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file)
        cs.write(file_name)
def test_contigset_build(self):
    ds1 = ContigSet(data.getXml(3))
    self.assertEquals(type(ds1).__name__, 'ContigSet')
    self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
    ds2 = ContigSet(data.getXml(3))
    self.assertEquals(type(ds2).__name__, 'ContigSet')
    self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
    for contigmd in ds2.metadata.contigs:
        self.assertEquals(type(contigmd).__name__, 'ContigMetadata')
def makeReport(inReadsFN, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report:
         consensus_isoforms_readlength_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
         number of consensus isoforms
         average length of consensus isoforms
    Attributes of the report are extracted from this file.
    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Collect read lengths of the consensus isoforms
    reader = ContigSet(inReadsFN)
    rs = [len(r.sequence) for r in reader]
    reader.close()
    readlengths = np.array(rs)

    # Plot read length histogram
    readlength_plot = create_readlength_plot(readlengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 title="Read Length of Consensus Isoforms Reads",
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    # Produce attributes based on summary.
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    if inSummaryFN.endswith(".json"):
        attributes = _report_to_attributes(inSummaryFN)
        r = load_report_from_json(inSummaryFN)
        # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
        # reports; should we be?
    else:
        attributes = summaryToAttributes(inSummaryFN)

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    title="Transcript Clustering",
                    attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)
    return report
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None,
                       skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)

    :return: Output file
    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + resource_file_extension
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    tbr.write(output_file)
    return output_file
def gather_contigset(input_files, output_file, new_resource_file=None,
                     skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)

    :return: Output file
    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + 'fasta'
    tbr.consolidate(new_resource_file)
    tbr.write(output_file)
    return output_file
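# A minimal sketch of calling gather_contigset (above) to merge per-chunk
# ContigSet XMLs into one dataset; the chunk file names are hypothetical.
# Because the output name ends in 'xml', the consolidated FASTA resource is
# written to "gathered.contigset.fasta" ('xml' swapped for 'fasta').
def _example_gather_contigset():
    chunk_files = ["chunk0.contigset.xml", "chunk1.contigset.xml"]
    return gather_contigset(chunk_files, "gathered.contigset.xml")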
def _write_fasta_or_contigset(file_name, make_faidx=False, n_records=251):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(n_records)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
        f.flush()
    if make_faidx:
        pysam.faidx(fasta_file)
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file, strict=make_faidx)
        cs.write(file_name)
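# A small sketch tying the helper above to ContigSet: write dummy contigs,
# build the .fai index, and re-open the dataset to count records. This assumes
# pysam and pbcore.io.ContigSet are available, as in the helper itself; the
# output file name is hypothetical.
def _example_write_and_count(xml_name="dummy.contigset.xml"):
    _write_fasta_or_contigset(xml_name, make_faidx=True)
    with ContigSet(xml_name) as ds:
        return len(ds)  # expected to equal the 251 records written above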
def write_contigset_records(pbcore_writer_class, records, file_name):
    """
    Write the chunked FASTA records to a file, then wrap that FASTA in a
    ContigSet XML. The output file name is expected to end with
    'contigset.xml'.
    """
    fasta_file_name = ".".join(file_name.split(".")[:-2]) + ".fasta"
    write_pbcore_records(pbcore_writer_class, records, fasta_file_name)
    log.debug("Writing ContigSet XML to {f}".format(f=file_name))
    ds = ContigSet(fasta_file_name)
    ds.write(file_name)
def makeReport(inReadsFN, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report:
         consensus_isoforms_readlength_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
         number of consensus isoforms
         average length of consensus isoforms
    Attributes of the report are extracted from this file.
    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Collect read lengths of the consensus isoforms
    reader = ContigSet(inReadsFN)
    rs = [len(r.sequence) for r in reader]
    reader.close()
    readlengths = np.array(rs)

    # Plot read length histogram
    readlength_plot = create_readlength_plot(readlengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    # Produce attributes based on summary.
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    attributes = _report_to_attributes(inSummaryFN)
    r = load_report_from_json(inSummaryFN)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    title=meta_rpt.title,
                    attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)
    return meta_rpt.apply_view(report)
def run_after(self, rtc, output_dir):
    json_file = rtc.task.output_files[0]
    chunks = load_pipeline_chunks_from_json(json_file)
    n_rec = 0
    with ContigSet(self.INPUT_FILES[0]) as f:
        n_rec = len(f)
    n_rec_chunked = 0
    for chunk in chunks:
        d = chunk.chunk_d
        with ContigSet(d['$chunk.contigset_id']) as cs:
            n_rec_chunked += len([r for r in cs])
        self._check_unchunked_files(d)
    self.assertEqual(n_rec_chunked, n_rec)
def test_merged_contigset(self):
    fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    with ContigSet(upstreamData.getLambdaFasta(),
                   upstreamData.getFasta()) as cset:
        assert len(list(cset)) == 49
        assert len(cset) == 49
        cset.consolidate()
        cset.write(fn)
        log.debug("Writing to {f}".format(f=fn))
        assert len(list(cset)) == 49
        assert len(cset) == 49
    with ContigSet(fn) as cset:
        assert len(list(cset)) == 49
        assert len(cset) == 49
def test_missing_fai_error_message(self):
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    # copy fasta reference to hide fai and ensure FastaReader is used
    shutil.copyfile(
        ReferenceSet(data.getXml(8)).toExternalFiles()[0],
        inFas)
    rs1 = ContigSet(inFas)
    with pytest.raises(IOError) as cm:
        rs1.assertIndexed()
    # compare against the raised exception's message, not the ExceptionInfo
    assert str(cm.value) == (
        "Companion FASTA index (.fai) file not found or malformatted! "
        "Use 'samtools faidx' to generate FASTA index.")
def test_contigset_consolidate_int_names(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')
    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)
    double = 'B.cereus.1'
    exp_double = rs1.get_contig(double)
    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord('5141', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord('5142', exp_double.sequence)
    exp_double_seqs = [exp_double.sequence, exp_double.sequence]
    exp_names = ['5141', '5142']
    obs_file = ContigSet(outFas1, outFas2)
    log.debug(obs_file.toExternalFiles())
    obs_file.consolidate()
    log.debug(obs_file.toExternalFiles())
    # open obs and compare to exp
    for name, seq in zip(exp_names, exp_double_seqs):
        self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
def test_contigset_write(self):
    fasta = upstreamData.getLambdaFasta()
    ds = ContigSet(fasta)
    assert isinstance(ds.resourceReaders()[0], IndexedFastaReader)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'test.fasta')
    w = FastaWriter(outfn)
    for rec in ds:
        w.writeRecord(rec)
    w.close()
    fas = FastaReader(outfn)
    for rec in fas:
        # make sure a __repr__ didn't slip through:
        assert not rec.sequence.startswith('<')
def test_len_fastq(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 24):
                fqh.write(line)
    cset = ContigSet(fq_out)
    assert not cset.isIndexed
    assert isinstance(cset.resourceReaders()[0], FastqReader)
    assert sum(1 for _ in cset) == sum(1 for _ in FastqReader(fq_out))
    assert sum(1 for _ in cset) == 6
def test_contigset_write(self):
    fasta = upstreamData.getLambdaFasta()
    ds = ContigSet(fasta)
    self.assertTrue(isinstance(ds.resourceReaders()[0],
                               IndexedFastaReader))
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'test.fasta')
    w = FastaWriter(outfn)
    for rec in ds:
        w.writeRecord(rec)
    w.close()
    fas = FastaReader(outfn)
    for rec in fas:
        # make sure a __repr__ didn't slip through:
        self.assertFalse(rec.sequence.startswith('<'))
def test_missing_fai_error_message(self):
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)
    with self.assertRaises(IOError) as cm:
        rs1.assertIndexed()
    self.assertEqual(
        str(cm.exception),
        ("Companion FASTA index (.fai) file not found or malformatted! "
         "Use 'samtools faidx' to generate FASTA index."))
def run_fasta_to_reference(input_file_name, output_file_name,
                           organism, reference_name, ploidy):
    """Copied from pbcoretools/tasks/converters.py:run_fasta_to_reference()"""
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference",
        "--organism", organism,
        "--ploidy", ploidy,
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    system(" ".join(args))
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f!r}".format(f=output_file_name))
        ds_ref.write(output_file_name)
def run_fasta_to_reference(input_file_name, output_file_name,
                           organism=None,
                           reference_name=None,
                           ploidy="haploid"):
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference",
        "--organism", str(organism) if organism != "" else "unknown",
        "--ploidy", str(ploidy) if ploidy != "" else "unknown",
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f}".format(f=output_file_name))
        ds_ref.write(output_file_name)
    return 0
def getMetrics(cls):
    super(TestIsoSeqCluster, cls).getMetrics()
    cls.hq_fasta_file = cls.lq_fasta_file = None
    for file_id, file_info in cls.datastore.get_file_dict().iteritems():
        if file_info.file_type_id == FileTypes.DS_CONTIG.file_type_id:
            file_name = op.basename(file_info.path)
            if file_name.startswith("hq_isoforms"):
                cls.hq_fasta_file = file_info.path
                with ContigSet(cls.hq_fasta_file) as ds:
                    n = len(ds)
                    cls.metric_dict["num_polished_hq_isoforms_fasta"] = n
            elif file_name.startswith("lq_isoforms"):
                cls.lq_fasta_file = file_info.path
                with ContigSet(cls.lq_fasta_file) as ds:
                    n = len(ds)
                    cls.metric_dict["num_polished_lq_isoforms_fasta"] = n
def test_len_fastq(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 24):
                fqh.write(line)
    cset = ContigSet(fq_out)
    self.assertFalse(cset.isIndexed)
    self.assertTrue(isinstance(cset.resourceReaders()[0], FastqReader))
    self.assertEqual(sum(1 for _ in cset),
                     sum(1 for _ in FastqReader(fq_out)))
    self.assertEqual(sum(1 for _ in cset), 6)
def test_write_contigset_records(self):
    records = [FastaRecord("chr1", "acgt"), FastaRecord("chr2", "tgca")]
    tmp_contigs = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    write_contigset_records(FastaWriter, records, tmp_contigs)
    with ContigSet(tmp_contigs) as ds_in:
        rec2 = [(rec.id, rec.sequence) for rec in ds_in]
        assert rec2 == [("chr1", "acgt"), ("chr2", "tgca")]
def contigset_to_fasta(rtc):
    with ContigSet(rtc.task.input_files[0]) as ds_in:
        if len(ds_in.externalResources) != 1:
            raise ValueError("This task assumes that the ContigSet contains " +
                             "only a single FASTA file.")
        file_name = ds_in.externalResources[0].resourceId
        os.symlink(file_name, rtc.task.output_files[0])
    return 0
def as_contigset(fasta_file, xml_file):
    if fasta_file == xml_file or xml_file is None:
        if not op.isfile(fasta_file) or op.getsize(fasta_file) == 0:
            return ContigSet()
        return ContigSet(fasta_file)
    file_size = op.getsize(fasta_file)

    fai_file = fasta_file + ".fai"
    if op.exists(fai_file):
        os.remove(fai_file)

    ds = ContigSet(fasta_file, generateIndices=True)
    ds.write(xml_file)
    if not file_size > 0:
        with open(fai_file, "w") as fai:
            fai.write("")
    return ds
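# A small usage sketch for as_contigset (above). With an XML target it wraps
# the FASTA in a ContigSet and generates indices; with xml_file=None it simply
# opens the FASTA (or returns an empty ContigSet when the file is missing or
# empty). The file names are hypothetical.
def _example_as_contigset():
    ds = as_contigset("isoforms.fasta", "isoforms.contigset.xml")
    ds_plain = as_contigset("isoforms.fasta", None)
    return ds, ds_plain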
def test_contigset_consolidate_genomic_consensus(self):
    """
    Verify that the contigs output by GenomicConsensus (e.g. quiver) can
    be consolidated.
    """
    FASTA1 = (
        "lambda_NEB3011_0_60",
        "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
    FASTA2 = (
        "lambda_NEB3011_120_180",
        "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
    FASTA3 = (
        "lambda_NEB3011_60_120",
        "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
    files = []
    for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
        _files = []
        for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
            tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
            with open(tmpfile, "w") as f:
                f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
            _files.append(tmpfile)
        files.append(_files)
    for i in range(3):
        ds = ContigSet(*[f[i] for f in files])
        out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
        fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
        ds.consolidate(fa1)
        ds.write(out1)
        with ContigSet(out1) as ds_new:
            assert len([rec for rec in ds_new]) == 1
def test_contigset_empty(self):
    fa_file = tempfile.NamedTemporaryFile(suffix=".fasta").name
    ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    open(fa_file, "w").write("")
    ds = ContigSet(fa_file, strict=False)
    ds.write(ds_file)
    fai_file = fa_file + ".fai"
    open(fai_file, "w").write("")
    ds = ContigSet(fa_file, strict=True)
    ds.write(ds_file)
    self.assertEqual(len(ds), 0)
def test_contigset_empty(self):
    fa_file = tempfile.NamedTemporaryFile(suffix=".fasta")
    ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml")
    open(fa_file.name, "w").write("")
    ds = ContigSet(fa_file.name, strict=False)
    ds.write(ds_file.name)
    fai_file = fa_file.name + ".fai"
    open(fai_file, "w").write("")
    ds = ContigSet(fa_file.name, strict=True)
    ds.write(ds_file.name)
    assert len(ds) == 0
    fa_file.close()
    ds_file.close()
def run_after(self, rtc, output_dir):
    rpt = None
    uuids = []
    for file_name in rtc.task.output_files:
        if file_name.endswith(".json"):
            rpt = load_report_from_json(file_name)
        elif file_name.endswith(".xml"):
            uuids.append(ContigSet(file_name, strict=True).uuid)
        else:
            assert file_name.endswith(".csv")
    self.assertEqual(sorted(rpt._dataset_uuids), sorted(uuids))
def gather_contigset(input_files, output_file, new_resource_file=None,
                     skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)

    :return: Output file
    :rtype: str
    """
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + 'fasta'
    tbr.consolidate(new_resource_file)
    tbr.write(output_file)
    return output_file
def test_fastq_consolidate(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    cset = ContigSet(fq_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 60)
    cset.filters.addRequirement(length=[('>', 1000)])
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 23)
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    self.assertEqual(cset_l, 23)
    self.assertEqual(cset_l, sum(1 for _ in cfq))
def test_not_all_lowercase_contigs(self):
    """
    Test that no contigs in the output ContigSet have only lowercase
    characters.
    """
    n_uppercase = 0
    for file_id, file_info in self.datastore.get_file_dict().iteritems():
        if file_info.file_type_id == FileTypes.DS_CONTIG.file_type_id:
            if not file_info.is_chunked:
                with ContigSet(file_info.path) as contigs:
                    for contig in contigs:
                        c = Counter(contig.sequence)
                        n_uppercase += c['A'] + c['C'] + c['G'] + c['T']
    self.assertTrue(n_uppercase > 0, "All contigs are lowercase-only")
def _open_files(self, *input_filenames):
    """Open file handlers and return them."""
    readers = []
    for fn in input_filenames:
        if ContigSetReaderWrapper.get_file_type(fn) == "FASTA":
            readers.append(FastaReader(fn))
        elif ContigSetReaderWrapper.get_file_type(fn) == "FASTQ":
            readers.append(FastqReader(fn))
        elif ContigSetReaderWrapper.get_file_type(fn) == "CONTIGSET":
            readers.append(ContigSet(fn))
        else:
            raise IOError(
                "Could not read %s as FASTA/FASTQ/CONTIGSET file." % fn)
    return readers
def write_cluster_summary(summary_fn, isoforms_fa, hq_fa=None, lq_fa=None):
    """Extract the number of consensus isoforms predicted and the total
    number of bases in all consensus isoforms from isoforms_fa, and write
    the two attributes to summary_fn.

    If hq_fa (polished high-quality isoforms) is not None, report
    the number of polished HQ clusters.
    If lq_fa (polished low-quality isoforms) is not None, report
    the number of polished LQ clusters.
    """
    try:
        summary = ClusterSummary()
        dataset_uuids = []
        with ContigSet(isoforms_fa) as reader:
            for r in reader:
                summary.num_consensus_isoforms += 1
                summary.num_total_bases += len(r.sequence[:])
            dataset_uuids.append(reader.uuid)

        if hq_fa is not None and op.getsize(hq_fa) > 0:
            summary.num_polished_hq_isoforms = 0
            with ContigSet(hq_fa) as reader:
                for r in reader:
                    summary.num_polished_hq_isoforms += 1
                dataset_uuids.append(reader.uuid)
        if lq_fa is not None and op.getsize(lq_fa) > 0:
            summary.num_polished_lq_isoforms = 0
            with ContigSet(lq_fa) as reader:
                for r in reader:
                    summary.num_polished_lq_isoforms += 1
                dataset_uuids.append(reader.uuid)
        summary.write(summary_fn, dataset_uuids=dataset_uuids)
    except ZeroDivisionError:
        errMsg = "No consensus isoforms predicted."
        logging.error(errMsg)
        raise RuntimeError(errMsg)
def n_reads_in_contigset(contigset_file):
    """Return number of reads in a contigset"""
    cs = ContigSet(contigset_file)
    cs.assertIndexed()
    return int(cs.numRecords)
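# A minimal sketch combining as_contigset and n_reads_in_contigset (both
# defined above): wrap a FASTA with generated indices, then count its records
# via the ContigSet index. Assumes "isoforms.fasta" (hypothetical) exists and
# is non-empty.
def _example_count_contigset_reads():
    as_contigset("isoforms.fasta", "isoforms.contigset.xml")
    return n_reads_in_contigset("isoforms.contigset.xml")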
def test_empty_fastq_consolidate(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq1_out = tempfile.NamedTemporaryFile(suffix="1.fastq").name
    fq2_out = tempfile.NamedTemporaryFile(suffix="2.fastq").name
    cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name

    # Two full
    with open(fq1_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    with open(fq2_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240, 480):
                fqh.write(line)
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 120)
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    self.assertEqual(cset_l, 120)
    self.assertEqual(cset_l, sum(1 for _ in cfq))

    # one full one empty
    with open(fq1_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    with open(fq2_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            fqh.write("")
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 60)
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    self.assertEqual(cset_l, 60)
    self.assertEqual(cset_l, sum(1 for _ in cfq))

    # one empty one full
    with open(fq1_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            fqh.write("")
    with open(fq2_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 60)
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    self.assertEqual(cset_l, 60)
    self.assertEqual(cset_l, sum(1 for _ in cfq))

    # both empty
    with open(fq1_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            fqh.write("")
    with open(fq2_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            fqh.write("")
    cset = ContigSet(fq1_out, fq2_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 0)
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    self.assertEqual(cset_l, 0)
    self.assertEqual(cset_l, sum(1 for _ in cfq))
def test_contigset_consolidate(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')
    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)
    singletons = ['A.baumannii.1', 'A.odontolyticus.1']
    double = 'B.cereus.1'
    reader = rs1.resourceReaders()[0]
    exp_double = rs1.get_contig(double)
    exp_singles = [rs1.get_contig(name) for name in singletons]
    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord(exp_singles[0])
        writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord(exp_double.name + '_0_10',
                           exp_double.sequence + 'ATCGATCGATCG')
        writer.writeRecord(exp_singles[1])
    exp_double_seq = ''.join([exp_double.sequence,
                              'ATCGATCGATCG',
                              exp_double.sequence])
    exp_single_seqs = [rec.sequence for rec in exp_singles]

    acc_file = ContigSet(outFas1, outFas2)
    acc_file.induceIndices()
    log.debug(acc_file.toExternalFiles())
    self.assertEqual(len(acc_file), 4)
    self.assertEqual(len(list(acc_file)), 4)
    acc_file.consolidate()
    log.debug(acc_file.toExternalFiles())

    # open acc and compare to exp
    for name, seq in zip(singletons, exp_single_seqs):
        self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
    self.assertEqual(acc_file.get_contig(double).sequence[:], exp_double_seq)

    self.assertEqual(len(acc_file._openReaders), 1)
    self.assertEqual(len(acc_file.index), 3)
    self.assertEqual(len(acc_file._indexMap), 3)
    self.assertEqual(len(acc_file), 3)
    self.assertEqual(len(list(acc_file)), 3)
def consolidateXml(args):
    """Combine BAMs and apply the filters described in the XML file, producing
    one consolidated XML"""
    dset = ContigSet(args.infile)
    dset.consolidate(args.datafile)
    dset.write(args.xmlfile)
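# A minimal sketch of driving consolidateXml (above) without a CLI parser,
# using argparse.Namespace to supply the expected attributes; the file names
# are hypothetical.
from argparse import Namespace

def _example_consolidate_xml():
    args = Namespace(infile="split.contigset.xml",
                     datafile="consolidated.fasta",
                     xmlfile="consolidated.contigset.xml")
    consolidateXml(args)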
def makeReport(inReadsFN, hq_isoforms_fq, lq_isoforms_fq, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report:
         consensus_isoforms_readlength_hist.png

    hq_isoforms_fq/lq_isoforms_fq --- input FASTQ files which have all HQ/LQ
    isoforms produced by pbtranscript.py cluster.
    These two files are required to plot the average QV histograms:
         hq_lq_isoforms_avgqv_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
         number of consensus isoforms
         average length of consensus isoforms
    Attributes of the report are extracted from this file.
    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Collect read lengths of the consensus isoforms
    reader = ContigSet(inReadsFN)
    rs = [len(r.sequence) for r in reader]
    reader.close()
    readlengths = np.array(rs).astype(float)

    # Plot read length histogram
    readlength_plot = create_readlength_plot(readlengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    # Collect average QVs
    hq_qvs = [np.mean(r.quality) for r in ContigSet(hq_isoforms_fq)]
    lq_qvs = [np.mean(r.quality) for r in ContigSet(lq_isoforms_fq)]
    avgqvs = np.array(hq_qvs + lq_qvs)

    # Plot average QV histogram
    avgqv_plot = create_avgqv_plot(avgqvs, outDir)
    avgqv_group = PlotGroup(Constants.PG_AVGQV,
                            plots=[avgqv_plot],
                            thumbnail=avgqv_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    # Produce attributes based on summary.
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    attributes = _report_to_attributes(inSummaryFN)
    r = load_report_from_json(inSummaryFN)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    attributes=attributes,
                    plotgroups=[readlength_group, avgqv_group],
                    dataset_uuids=dataset_uuids)
    return spec.apply_view(report)