def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options, log_level):
    # Flags that can be passed through 'options':
    #'--log-file foo.log',
    #'--verbose',
    #'--debug', # requires 'ipdb'
    #'-j NWORKERS',
    #'--algorithm quiver',
    #'--diploid', # binary
    #'--minConfidence 40',
    #'--minCoverage 5',
    #'--alignmentSetRefWindows',
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    system(cmd.format(**locals()))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))
        # Convert to contigset.xml
        import pysam
        pysam.faidx(fasta)  # pylint: disable=no-member
        # I do not know why pylint does not see this defined.
        ds = ContigSet(fasta, strict=True)
        ds.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(
            fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
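# A minimal usage sketch for run() above. All file paths and the options
# string are hypothetical placeholders, and say()/system() are assumed to
# be this module's logging and shell helpers (they are not stdlib names).
run(referenceset='reference.referenceset.xml',
    fastq='consensus.fastq',
    gff='variants.gff',
    fasta='consensus.fasta',
    contigset='consensus.contigset.xml',
    alignmentset='mapped.alignmentset.xml',
    options='--algorithm quiver -j 8',
    log_level='INFO')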
def test_contigset_consolidate_genomic_consensus(self):
    """
    Verify that the contigs output by GenomicConsensus (e.g. quiver) can
    be consolidated.
    """
    FASTA1 = (
        "lambda_NEB3011_0_60",
        "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
    FASTA2 = (
        "lambda_NEB3011_120_180",
        "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
    FASTA3 = (
        "lambda_NEB3011_60_120",
        "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
    files = []
    for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
        _files = []
        for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
            tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
            with open(tmpfile, "w") as f:
                f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
            _files.append(tmpfile)
        files.append(_files)
    for i in range(3):
        ds = ContigSet(*[f[i] for f in files])
        out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
        fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
        ds.consolidate(fa1)
        ds.write(out1)
        with ContigSet(out1) as ds_new:
            assert len([rec for rec in ds_new]) == 1
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    vcf_path = rc.task.output_files[1]
    dataset_path = rc.task.output_files[2]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--maskRadius", (str(Constants.DEFAULT_MASK_RADIUS)
                         if bool(rc.task.options[Constants.MASKING_ID])
                         else "0"),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    dataset_path = rc.task.output_files[1]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if rc.task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None, skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other
                              contig files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)
    :return: Output file
    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + resource_file_extension
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    sanitize_dataset_tags(tbr)
    tbr.write(output_file)
    return output_file
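# A sketch of gathering chunked ContigSet XMLs with __gather_contigset()
# above; the chunk file names are hypothetical. With 'fasta' as the
# resource extension, the consolidated resource is written to
# 'gathered.contigset.fasta' alongside the output XML.
gathered = __gather_contigset(
    'fasta',
    ['chunk0.contigset.xml', 'chunk1.contigset.xml'],
    'gathered.contigset.xml')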
def test_contigset_consolidate_genomic_consensus(self):
    """
    Verify that the contigs output by GenomicConsensus (e.g. quiver) can
    be consolidated.
    """
    FASTA1 = ("lambda_NEB3011_0_60",
              "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
    FASTA2 = ("lambda_NEB3011_120_180",
              "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
    FASTA3 = ("lambda_NEB3011_60_120",
              "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
    files = []
    for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
        _files = []
        for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
            tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
            with open(tmpfile, "w") as f:
                f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
            _files.append(tmpfile)
        files.append(_files)
    for i in range(3):
        ds = ContigSet(*[f[i] for f in files])
        out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
        fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
        ds.consolidate(fa1)
        ds.write(out1)
        with ContigSet(out1) as ds_new:
            self.assertEqual(len([rec for rec in ds_new]), 1,
                             "failed on %d" % i)
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None, skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other
                              contig files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)
    :return: Output file
    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + resource_file_extension
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    tbr.write(output_file)
    return output_file
def _write_fasta_or_contigset(file_name):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(251)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file)
        cs.write(file_name)
def _write_fasta_or_contigset(file_name, make_faidx=False, n_records=251):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(n_records)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
        f.flush()
    if make_faidx:
        pysam.faidx(fasta_file)
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file, strict=make_faidx)
        cs.write(file_name)
def test_contigset_empty(self):
    fa_file = tempfile.NamedTemporaryFile(suffix=".fasta").name
    ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    open(fa_file, "w").write("")
    ds = ContigSet(fa_file, strict=False)
    ds.write(ds_file)
    fai_file = fa_file + ".fai"
    open(fai_file, "w").write("")
    ds = ContigSet(fa_file, strict=True)
    ds.write(ds_file)
    self.assertEqual(len(ds), 0)
def write_contigset_records(pbcore_writer_class, records, file_name):
    """
    Write the chunked FASTA file, then wrap it in a ContigSet XML.

    file_name must end with '.contigset.xml'.
    """
    fasta_file_name = ".".join(file_name.split(".")[:-2]) + ".fasta"
    write_pbcore_records(pbcore_writer_class, records, fasta_file_name)
    log.debug("Writing ContigSet XML to {f}".format(f=file_name))
    ds = ContigSet(fasta_file_name)
    ds.write(file_name)
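# Illustrative call to write_contigset_records() above, assuming
# pbcore.io's FastaWriter/FastaRecord as the writer class and record
# type; the record content and output name are made up. This would emit
# 'chunk0.fasta' plus the wrapping 'chunk0.contigset.xml'.
from pbcore.io import FastaWriter, FastaRecord
records = [FastaRecord("chr1", "ACGTACGTACGT")]
write_contigset_records(FastaWriter, records, "chunk0.contigset.xml")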
def test_contigset_empty(self):
    fa_file = tempfile.NamedTemporaryFile(suffix=".fasta")
    ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml")
    open(fa_file.name, "w").write("")
    ds = ContigSet(fa_file.name, strict=False)
    ds.write(ds_file.name)
    fai_file = fa_file.name + ".fai"
    open(fai_file, "w").write("")
    ds = ContigSet(fa_file.name, strict=True)
    ds.write(ds_file.name)
    assert len(ds) == 0
    fa_file.close()
    ds_file.close()
def __gather_contigset(input_files, output_file, new_resource_file):
    """Copied from pbcoretools.chunking.gather:__gather_contigset()
    """
    skip_empty = True
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    tbr.write(output_file, relPaths=True)
    return output_file
def as_contigset(fasta_file, xml_file):
    if fasta_file == xml_file or xml_file is None:
        if not op.isfile(fasta_file) or op.getsize(fasta_file) == 0:
            return ContigSet()
        return ContigSet(fasta_file)
    file_size = op.getsize(fasta_file)
    fai_file = fasta_file + ".fai"
    if op.exists(fai_file):
        os.remove(fai_file)
    ds = ContigSet(fasta_file, generateIndices=True)
    ds.write(xml_file)
    if not file_size > 0:
        with open(fai_file, "w") as fai:
            fai.write("")
    return ds
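# Two example calls to as_contigset() above (the file names are
# hypothetical): with an XML path it regenerates the .fai index and
# writes the ContigSet; with None it just opens the FASTA as an
# in-memory set.
ds = as_contigset("polished.fasta", "polished.contigset.xml")
ds_in_memory = as_contigset("polished.fasta", None)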
def gather_contigset(input_files, output_file, new_resource_file=None,
                     skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other
                              contig files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)
    :return: Output file
    :rtype: str
    """
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + 'fasta'
    tbr.consolidate(new_resource_file)
    tbr.write(output_file)
    return output_file
def test_fastq_consolidate(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cset_out = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    cset = ContigSet(fq_out)
    cset_l = sum(1 for _ in cset)
    assert cset_l == 60
    cset.filters.addRequirement(length=[('>', 1000)])
    cset_l = sum(1 for _ in cset)
    assert cset_l == 23
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    assert cset_l == 23
    assert cset_l == sum(1 for _ in cfq)
    cset.write(cset_out)
def test_fastq_consolidate(self):
    fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
          'lambda/2590980/0008/Analysis_Results/'
          'm141115_075238_ethan_c100699872550000001'
          '823139203261572_s1_p0.1.subreads.fastq')
    fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
    cset_out = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
    with open(fq_out, 'w') as fqh:
        with open(fn, 'r') as fih:
            for line in itertools.islice(fih, 240):
                fqh.write(line)
    cset = ContigSet(fq_out)
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 60)
    cset.filters.addRequirement(length=[('>', 1000)])
    cset_l = sum(1 for _ in cset)
    self.assertEqual(cset_l, 23)
    cset.consolidate(cfq_out)
    cset_l = sum(1 for _ in cset)
    cfq = FastqReader(cfq_out)
    self.assertEqual(cset_l, 23)
    self.assertEqual(cset_l, sum(1 for _ in cfq))
    cset.write(cset_out)
def consolidateXml(args):
    """Combine BAMs and apply the filters described in the XML file,
    producing one consolidated XML"""
    dset = ContigSet(args.infile)
    dset.consolidate(args.datafile)
    dset.write(args.xmlfile)
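# Hypothetical driver for consolidateXml() above, building the argparse
# namespace it expects; the attribute names follow the function body and
# the file names are placeholders.
import argparse
args = argparse.Namespace(infile="in.contigset.xml",
                          datafile="consolidated.fasta",
                          xmlfile="consolidated.contigset.xml")
consolidateXml(args)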