def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options,
        log_level):
    """Run ``variantCaller`` and try to wrap its FASTA output in a ContigSet.

    The command line is built from the arguments and passed to ``system``.
    Afterwards the FASTA is indexed with ``pysam.faidx`` and written out as a
    ContigSet XML with relative paths.  Any exception raised during that
    conversion is reported through ``say`` and then swallowed, so a failed
    conversion does not propagate to the caller.

    Example flags noted by the original author (presumably supplied through
    ``options`` -- confirm against callers):
        --log-file foo.log, --verbose, --debug (requires 'ipdb'),
        -j NWORKERS, --algorithm quiver, --diploid (binary),
        --minConfidence 40, --minCoverage 5, --alignmentSetRefWindows
    """
    template = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    command = template.format(
        log_level=log_level,
        options=options,
        referenceset=referenceset,
        fastq=fastq,
        gff=gff,
        fasta=fasta,
        alignmentset=alignmentset)
    system(command)
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))

        # Imported lazily, inside the try, so a missing pysam is reported
        # like any other conversion failure.
        import pysam
        # pylint does not see faidx as defined, hence the suppression.
        pysam.faidx(fasta)  # pylint: disable=no-member

        wrapped = ContigSet(fasta, strict=True)
        wrapped.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(
            fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
Esempio n. 2
0
 def test_contigset_consolidate_genomic_consensus(self):
     """
     Verify that the contigs output by GenomicConsensus (e.g. quiver) can
     be consolidated.

     Each of three windowed contigs is written to its own FASTA file once
     per header suffix.  For the first three suffixes, the matching file of
     every contig is loaded into one ContigSet, consolidated, written out,
     and the re-read dataset must contain exactly one record.
     """
     windows = [
         ("lambda_NEB3011_0_60",
          "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG"),
         ("lambda_NEB3011_120_180",
          "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG"),
         ("lambda_NEB3011_60_120",
          "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT"),
     ]
     suffixes = ["", "|quiver", "|plurality", "|arrow", "|poa"]
     # files[w][s] is the FASTA for window w written with suffix s.
     files = []
     for header, seq in windows:
         per_suffix = []
         for suffix in suffixes:
             path = tempfile.NamedTemporaryFile(suffix=".fasta").name
             with open(path, "w") as handle:
                 handle.write(
                     ">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
             per_suffix.append(path)
         files.append(per_suffix)
     # NOTE(review): only suffix indices 0-2 are exercised; "|arrow" and
     # "|poa" files are written but never consolidated -- confirm intent.
     for i in range(3):
         ds = ContigSet(*[group[i] for group in files])
         out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
         fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
         ds.consolidate(fa1)
         ds.write(out1)
         with ContigSet(out1) as ds_new:
             n_records = sum(1 for _ in ds_new)
             assert n_records == 1
def resolved_tool_contract_runner(resolved_contract):
    """
    Translate a resolved tool contract into CLI arguments and run the tool.

    Paths are read positionally from the contract: the input files are the
    alignment file and the reference; the output files are GFF, VCF,
    ContigSet XML and FASTQ.  The consensus FASTA path is derived from the
    ContigSet XML path by swapping its extension.  When ``args_runner``
    returns 0, the FASTA is indexed with ``pysam.faidx`` and wrapped in a
    ContigSet that is written to the XML path.

    :param resolved_contract: resolved tool contract exposing ``task``
    :return: the value returned by ``args_runner``
    """
    task = resolved_contract.task
    alignment_path = task.input_files[0]
    reference_path = task.input_files[1]
    gff_path = task.output_files[0]
    vcf_path = task.output_files[1]
    dataset_path = task.output_files[2]
    # Escape the dots and anchor at the end of the string so that only a
    # trailing ".contigset.xml" extension is rewritten; the unescaped,
    # unanchored pattern could match arbitrary characters anywhere in the
    # path (e.g. inside a directory name).
    fasta_path = re.sub(r"\.contigset\.xml$", ".fasta", dataset_path)
    fastq_path = task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(task.nproc),
        "--minCoverage", str(task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(task.options[Constants.MIN_CONFIDENCE_ID]),
        # Masking enabled -> default radius; disabled -> radius "0".
        "--maskRadius", (str(Constants.DEFAULT_MASK_RADIUS)
                         if task.options[Constants.MASKING_ID] else "0"),
        "--algorithm", task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    parsed_args = get_parser().arg_parser.parser.parse_args(args)
    # Keep the exit code under its own name instead of rebinding the
    # contract variable.
    exit_code = args_runner(parsed_args)
    if exit_code == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return exit_code
Esempio n. 4
0
def resolved_tool_contract_runner(resolved_contract):
    """
    Build the command-line arguments from a resolved tool contract and run.

    Input files (by position): alignment file, reference.  Output files (by
    position): GFF, VCF, ContigSet XML, FASTQ.  The FASTA output path is the
    ContigSet XML path with its ``.contigset.xml`` extension replaced by
    ``.fasta``.  If ``args_runner`` returns 0, the FASTA is indexed via
    ``pysam.faidx`` and a ContigSet wrapping it is written to the XML path.

    :param resolved_contract: resolved tool contract exposing ``task``
    :return: the value returned by ``args_runner``
    """
    task = resolved_contract.task
    alignment_path, reference_path = task.input_files[0], task.input_files[1]
    gff_path = task.output_files[0]
    vcf_path = task.output_files[1]
    dataset_path = task.output_files[2]
    # The dots are escaped and the pattern is anchored so that only the
    # real trailing extension is replaced; an unescaped "." matches any
    # character and could rewrite an unrelated part of the path.
    fasta_path = re.sub(r"\.contigset\.xml$", ".fasta", dataset_path)
    fastq_path = task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(task.nproc),
        "--minCoverage", str(task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(task.options[Constants.MIN_CONFIDENCE_ID]),
        # Radius is the default when masking is on, otherwise "0".
        "--maskRadius", (str(Constants.DEFAULT_MASK_RADIUS)
                         if task.options[Constants.MASKING_ID] else "0"),
        "--algorithm", task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    parsed_args = get_parser().arg_parser.parser.parse_args(args)
    # Distinct name for the return code; the contract is no longer shadowed.
    exit_code = args_runner(parsed_args)
    if exit_code == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return exit_code
Esempio n. 5
0
def resolved_tool_contract_runner(resolved_contract):
    """
    Translate a resolved tool contract into CLI arguments and run the tool.

    Input files (by position): alignment file, reference.  Output files (by
    position): GFF, ContigSet XML, FASTQ.  The FASTA output path is derived
    from the ContigSet XML path.  ``--diploid`` is appended when the
    diploid-mode option is truthy.  If ``args_runner`` returns 0, the FASTA
    is indexed via ``pysam.faidx`` and wrapped in a ContigSet written to the
    XML path.

    :param resolved_contract: resolved tool contract exposing ``task``
    :return: the value returned by ``args_runner``
    """
    task = resolved_contract.task
    alignment_path = task.input_files[0]
    reference_path = task.input_files[1]
    gff_path = task.output_files[0]
    dataset_path = task.output_files[1]
    # Escaped and end-anchored so that only a trailing ".contigset.xml" is
    # replaced; the unescaped pattern treated "." as a wildcard and could
    # match in the middle of the path.
    fasta_path = re.sub(r"\.contigset\.xml$", ".fasta", dataset_path)
    fastq_path = task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(task.nproc),
        "--minCoverage", str(task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    parsed_args = get_parser().arg_parser.parser.parse_args(args)
    # Keep the exit code under its own name instead of rebinding the
    # contract variable.
    exit_code = args_runner(parsed_args)
    if exit_code == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return exit_code
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options, log_level):
    """Invoke ``variantCaller`` and then attempt a FASTA -> ContigSet wrap.

    The shell command is formatted from the arguments and executed through
    ``system``.  The conversion step (index the FASTA with ``pysam.faidx``,
    build a strict ContigSet, write it with relative paths) is best-effort:
    any exception is logged via ``say`` together with its traceback and the
    conversion is skipped.

    Example flags noted by the original author (presumably supplied through
    ``options`` -- confirm against callers):
        --log-file foo.log, --verbose, --debug (requires 'ipdb'),
        -j NWORKERS, --algorithm quiver, --diploid (binary),
        --minConfidence 40, --minCoverage 5, --alignmentSetRefWindows
    """
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    fields = {
        'log_level': log_level,
        'options': options,
        'referenceset': referenceset,
        'fastq': fastq,
        'gff': gff,
        'fasta': fasta,
        'alignmentset': alignmentset,
    }
    system(cmd.format(**fields))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))

        # Lazy import: an ImportError here is handled like any other
        # conversion failure below.
        import pysam
        # pylint does not see faidx as defined, hence the suppression.
        pysam.faidx(fasta) # pylint: disable=no-member

        contigs = ContigSet(fasta, strict=True)
        contigs.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
Esempio n. 7
0
def __gather_contigset(resource_file_extension,
                       input_files,
                       output_file,
                       new_resource_file=None,
                       skip_empty=True):
    """
    Merge several ContigSets into one consolidated ContigSet.

    :param resource_file_extension: extension used to derive the
                                    consolidated resource path from
                                    ``output_file`` when none is given
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: drop inputs whose ContigSet has no external files

    :return: Output file

    :rtype: str
    """
    if skip_empty:
        input_files = [
            path for path in input_files
            if len(ContigSet(path).toExternalFiles()) > 0
        ]
    merged = ContigSet(*input_files)
    # Derive "<output minus 'xml'><extension>" only for *xml outputs;
    # otherwise the resource path stays as given (possibly None).
    if not new_resource_file and output_file.endswith('xml'):
        new_resource_file = output_file[:-3] + resource_file_extension
    merged.consolidate(new_resource_file)
    merged.newUuid()
    sanitize_dataset_tags(merged)
    merged.write(output_file)
    return output_file
 def test_contigset_consolidate_genomic_consensus(self):
     """
     Verify that the contigs output by GenomicConsensus (e.g. quiver) can
     be consolidated.

     Three windowed contigs are each written to a separate FASTA file once
     per header suffix.  For the first three suffixes, the matching files
     of all contigs are consolidated and the resulting dataset must hold
     exactly one record.
     """
     windows = [
         ("lambda_NEB3011_0_60",
          "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG"),
         ("lambda_NEB3011_120_180",
          "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG"),
         ("lambda_NEB3011_60_120",
          "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT"),
     ]
     suffixes = ["", "|quiver", "|plurality", "|arrow", "|poa"]
     # files[w][s] is the FASTA for window w written with suffix s.
     files = []
     for header, seq in windows:
         written = []
         for suffix in suffixes:
             path = tempfile.NamedTemporaryFile(suffix=".fasta").name
             with open(path, "w") as handle:
                 handle.write(
                     ">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
             written.append(path)
         files.append(written)
     # NOTE(review): only suffix indices 0-2 are exercised; "|arrow" and
     # "|poa" files are written but never consolidated -- confirm intent.
     for i in range(3):
         ds = ContigSet(*[group[i] for group in files])
         out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
         fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
         ds.consolidate(fa1)
         ds.write(out1)
         with ContigSet(out1) as ds_new:
             n_records = sum(1 for _ in ds_new)
             self.assertEqual(n_records, 1, "failed on %d" % i)
Esempio n. 9
0
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None,
                       skip_empty=True):
    """
    Merge several ContigSets into one consolidated ContigSet.

    :param resource_file_extension: extension used to derive the
                                    consolidated resource path from
                                    ``output_file`` when none is given
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: drop inputs whose ContigSet has no external files

    :return: Output file

    :rtype: str
    """
    if skip_empty:
        input_files = [
            path for path in input_files
            if len(ContigSet(path).toExternalFiles()) > 0
        ]
    merged = ContigSet(*input_files)
    # Derive "<output minus 'xml'><extension>" only for *xml outputs;
    # otherwise the resource path stays as given (possibly None).
    if not new_resource_file and output_file.endswith('xml'):
        new_resource_file = output_file[:-3] + resource_file_extension
    merged.consolidate(new_resource_file)
    merged.newUuid()
    merged.write(output_file)
    return output_file
def _write_fasta_or_contigset(file_name):
    """Write a 251-record dummy FASTA and, for XML targets, a ContigSet.

    The FASTA path is ``file_name`` with a trailing ``.contigset.xml``
    replaced by ``.fasta``; any other name is used unchanged.  Records are
    named ``chr0`` .. ``chr250`` and all carry the same 12-base sequence.
    When ``file_name`` ends in ``.xml``, a ContigSet wrapping the FASTA is
    written to ``file_name``.

    :param file_name: target path, either a FASTA or a ``.contigset.xml``
    """
    # Escape the dots and anchor the pattern: an unescaped "." matches any
    # character, so a name such as "a_contigset_xml_b.fasta" used to be
    # rewritten and the data landed in a different file than requested.
    fasta_file = re.sub(r"\.contigset\.xml$", ".fasta", file_name)
    records = [">chr%d\nacgtacgtacgt" % x for x in range(251)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(records))
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file)
        cs.write(file_name)
Esempio n. 11
0
def _write_fasta_or_contigset(file_name):
    """Create a dummy FASTA with 251 records, plus a ContigSet if XML.

    A trailing ``.contigset.xml`` in ``file_name`` is swapped for ``.fasta``
    to obtain the FASTA path; other names are written to directly.  The
    records are ``chr0`` .. ``chr250`` with an identical 12-base sequence.
    If ``file_name`` ends in ``.xml``, the FASTA is wrapped in a ContigSet
    that is written to ``file_name``.

    :param file_name: target path, either a FASTA or a ``.contigset.xml``
    """
    # The pattern is escaped and end-anchored; previously "." acted as a
    # wildcard, so look-alike text inside the name (e.g. "_contigset_xml")
    # was replaced and the output went to an unexpected path.
    fasta_file = re.sub(r"\.contigset\.xml$", ".fasta", file_name)
    records = [">chr%d\nacgtacgtacgt" % x for x in range(251)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(records))
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file)
        cs.write(file_name)
def _write_fasta_or_contigset(file_name, make_faidx=False, n_records=251):
    """Write a dummy FASTA and, for XML targets, a ContigSet wrapping it.

    The FASTA path is ``file_name`` with a trailing ``.contigset.xml``
    replaced by ``.fasta``; any other name is used unchanged.  Records are
    named ``chr0`` .. ``chr<n_records-1>`` and share one 12-base sequence.

    :param file_name: target path, either a FASTA or a ``.contigset.xml``
    :param make_faidx: if true, index the FASTA with ``pysam.faidx`` and
                       open the ContigSet with ``strict=True``
    :param n_records: number of FASTA records to write
    """
    # Escape the dots and anchor the pattern: an unescaped "." matches any
    # character, so look-alike text inside the name could be rewritten and
    # the data written to a different file than requested.
    fasta_file = re.sub(r"\.contigset\.xml$", ".fasta", file_name)
    records = [">chr%d\nacgtacgtacgt" % x for x in range(n_records)]
    # Leaving the with-block closes (and therefore flushes) the file before
    # it is indexed, so no explicit flush is needed.
    with open(fasta_file, "w") as f:
        f.write("\n".join(records))
    if make_faidx:
        pysam.faidx(fasta_file)
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file, strict=make_faidx)
        cs.write(file_name)
 def test_contigset_empty(self):
     """
     An empty FASTA can be wrapped in a ContigSet and written out.

     The dataset is built and written first with ``strict=False``, then
     again with ``strict=True`` after an empty ``.fai`` file has been
     created next to the FASTA; the strict dataset must have length 0.
     """
     fa_file = tempfile.NamedTemporaryFile(suffix=".fasta").name
     ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     # Context managers close the handles deterministically rather than
     # leaving them to garbage collection.
     with open(fa_file, "w") as fa_out:
         fa_out.write("")
     ds = ContigSet(fa_file, strict=False)
     ds.write(ds_file)
     # Presumably strict mode expects the index file to exist -- confirm.
     fai_file = fa_file + ".fai"
     with open(fai_file, "w") as fai_out:
         fai_out.write("")
     ds = ContigSet(fa_file, strict=True)
     ds.write(ds_file)
     self.assertEqual(len(ds), 0)
Esempio n. 14
0
 def test_contigset_empty(self):
     """
     Wrapping an empty FASTA in a ContigSet works and yields no records.

     A non-strict ContigSet is created from the empty FASTA and written.
     An empty ``.fai`` file is then placed beside the FASTA, a strict
     ContigSet is created and written, and its length is asserted to be 0.
     """
     fa_file = tempfile.NamedTemporaryFile(suffix=".fasta").name
     ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     # Use ``with`` so each handle is closed as soon as the empty file has
     # been created, instead of relying on garbage collection.
     with open(fa_file, "w") as fa_out:
         fa_out.write("")
     ds = ContigSet(fa_file, strict=False)
     ds.write(ds_file)
     # Presumably strict mode expects the index file to exist -- confirm.
     fai_file = fa_file + ".fai"
     with open(fai_file, "w") as fai_out:
         fai_out.write("")
     ds = ContigSet(fa_file, strict=True)
     ds.write(ds_file)
     self.assertEqual(len(ds), 0)
Esempio n. 15
0
def write_contigset_records(pbcore_writer_class, records, file_name):
    """
    Write the records to a FASTA file and wrap that file in a ContigSet XML.

    ``file_name`` is the ContigSet XML path (expected to end in
    ``contigset.xml``).  The FASTA path is obtained by dropping the last two
    dot-separated components of ``file_name`` and appending ``.fasta``.

    :param pbcore_writer_class: writer class handed to
                                ``write_pbcore_records``
    :param records: records handed to ``write_pbcore_records``
    :param file_name: path of the ContigSet XML to write
    """
    # Strip the final two dot-separated pieces (e.g. "contigset" and "xml").
    stem_parts = file_name.split(".")[:-2]
    fasta_file_name = ".".join(stem_parts) + ".fasta"
    write_pbcore_records(pbcore_writer_class, records, fasta_file_name)
    log.debug("Writing ContigSet XML to {f}".format(f=file_name))
    ContigSet(fasta_file_name).write(file_name)
Esempio n. 16
0
def write_contigset_records(pbcore_writer_class, records, file_name):
    """
    Write a chunk's records as FASTA, then emit a ContigSet XML for it.

    ``file_name`` is the ContigSet XML path (expected to end in
    ``contigset.xml``).  Its last two dot-separated components are removed
    and ``.fasta`` is appended to form the FASTA path.

    :param pbcore_writer_class: writer class handed to
                                ``write_pbcore_records``
    :param records: records handed to ``write_pbcore_records``
    :param file_name: path of the ContigSet XML to write
    """
    pieces = file_name.split(".")
    # Everything before the final two components is kept as the stem.
    fasta_file_name = ".".join(pieces[:-2]) + ".fasta"
    write_pbcore_records(pbcore_writer_class, records, fasta_file_name)
    log.debug("Writing ContigSet XML to {f}".format(f=file_name))
    contigs = ContigSet(fasta_file_name)
    contigs.write(file_name)
Esempio n. 17
0
 def test_contigset_empty(self):
     """
     An empty FASTA can be wrapped in a ContigSet and written out.

     The dataset is built and written first with ``strict=False``, then
     again with ``strict=True`` after an empty ``.fai`` file has been
     created next to the FASTA; the strict dataset must have length 0.
     """
     # The temp files are context-managed so they are closed even when a
     # step below raises; previously close() was only reached on success.
     with tempfile.NamedTemporaryFile(suffix=".fasta") as fa_file, \
             tempfile.NamedTemporaryFile(suffix=".contigset.xml") as ds_file:
         # Truncate via a second handle that is closed deterministically.
         with open(fa_file.name, "w") as fa_out:
             fa_out.write("")
         ds = ContigSet(fa_file.name, strict=False)
         ds.write(ds_file.name)
         # Presumably strict mode expects the index file -- confirm.
         fai_file = fa_file.name + ".fai"
         with open(fai_file, "w") as fai_out:
             fai_out.write("")
         ds = ContigSet(fa_file.name, strict=True)
         ds.write(ds_file.name)
         assert len(ds) == 0
Esempio n. 18
0
def __gather_contigset(input_files, output_file, new_resource_file):
    """Copied from pbcoretools.chunking.gather:__gather_contigset()

    Inputs whose ContigSet has no external files are always skipped.  The
    remaining inputs are merged, consolidated into ``new_resource_file``,
    given a new UUID and written to ``output_file`` with relative paths.

    :return: ``output_file``
    """
    non_empty = [
        path for path in input_files
        if len(ContigSet(path).toExternalFiles()) > 0
    ]
    merged = ContigSet(*non_empty)
    merged.consolidate(new_resource_file)
    merged.newUuid()
    merged.write(output_file, relPaths=True)
    return output_file
Esempio n. 19
0
def as_contigset(fasta_file, xml_file):
    """Return a ContigSet for ``fasta_file``, optionally writing it as XML.

    When ``xml_file`` is None or equals ``fasta_file``, nothing is written:
    an empty ContigSet is returned if the FASTA is missing or zero-sized,
    otherwise a ContigSet opened on the FASTA.

    Otherwise any existing ``.fai`` next to the FASTA is removed, a
    ContigSet is built with ``generateIndices=True`` and written to
    ``xml_file``.  If the FASTA was zero-sized, an empty ``.fai`` file is
    written afterwards.

    :return: the ContigSet
    """
    if fasta_file == xml_file or xml_file is None:
        has_content = op.isfile(fasta_file) and op.getsize(fasta_file) != 0
        return ContigSet(fasta_file) if has_content else ContigSet()

    # Size is sampled before the index is touched or the dataset is built.
    size_before = op.getsize(fasta_file)

    index_path = fasta_file + ".fai"
    if op.exists(index_path):
        os.remove(index_path)

    contigs = ContigSet(fasta_file, generateIndices=True)
    contigs.write(xml_file)
    if size_before <= 0:
        with open(index_path, "w") as index_out:
            index_out.write("")
    return contigs
Esempio n. 20
0
def as_contigset(fasta_file, xml_file):
    """Wrap ``fasta_file`` in a ContigSet and, if requested, write the XML.

    No XML is written when ``xml_file`` is None or the same as
    ``fasta_file``; in that case a missing or zero-sized FASTA yields an
    empty ContigSet and anything else a ContigSet opened on the FASTA.

    In the writing case an existing ``.fai`` beside the FASTA is deleted,
    the ContigSet is created with ``generateIndices=True`` and written to
    ``xml_file``; for a zero-sized FASTA an empty ``.fai`` is then created.

    :return: the ContigSet
    """
    if fasta_file == xml_file or xml_file is None:
        if op.isfile(fasta_file) and op.getsize(fasta_file) != 0:
            return ContigSet(fasta_file)
        return ContigSet()

    # Size is sampled before the index is touched or the dataset is built.
    original_size = op.getsize(fasta_file)

    index_path = fasta_file + ".fai"
    if op.exists(index_path):
        os.remove(index_path)

    dataset = ContigSet(fasta_file, generateIndices=True)
    dataset.write(xml_file)
    if original_size <= 0:
        with open(index_path, "w") as index_out:
            index_out.write("")
    return dataset
Esempio n. 21
0
def gather_contigset(input_files, output_file, new_resource_file=None,
                     skip_empty=True):
    """
    Merge several ContigSets into one consolidated ContigSet.

    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated; when not given and
                              ``output_file`` ends in 'xml', that suffix is
                              replaced by 'fasta'
    :param skip_empty: Ignore empty files (currently unused by the body)

    :return: Output file

    :rtype: str
    """
    merged = ContigSet(*input_files)
    if not new_resource_file and output_file.endswith('xml'):
        new_resource_file = output_file[:-3] + 'fasta'
    merged.consolidate(new_resource_file)
    merged.write(output_file)
    return output_file
Esempio n. 22
0
def gather_contigset(input_files,
                     output_file,
                     new_resource_file=None,
                     skip_empty=True):
    """
    Combine the given ContigSets, consolidate them and write the result.

    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated; when not given and
                              ``output_file`` ends in 'xml', that suffix is
                              replaced by 'fasta'
    :param skip_empty: Ignore empty files (currently unused by the body)

    :return: Output file

    :rtype: str
    """
    combined = ContigSet(*input_files)
    needs_default = not new_resource_file
    if needs_default and output_file.endswith('xml'):
        new_resource_file = output_file[:-3] + 'fasta'
    combined.consolidate(new_resource_file)
    combined.write(output_file)
    return output_file
Esempio n. 23
0
 def test_fastq_consolidate(self):
     """
     Consolidating a filtered FASTQ-backed ContigSet keeps the record count.

     The first 240 lines of a reference FASTQ are copied to a temp file and
     opened as a ContigSet (60 records expected).  After a length > 1000
     filter, 23 records are expected both before and after consolidation,
     and the consolidated FASTQ must contain the same number of records.
     """
     fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
           'lambda/2590980/0008/Analysis_Results/'
           'm141115_075238_ethan_c100699872550000001'
           '823139203261572_s1_p0.1.subreads.fastq')
     fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     cset_out = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name

     def count(records):
         # Count by iteration rather than len().
         return sum(1 for _ in records)

     with open(fq_out, 'w') as fqh, open(fn, 'r') as fih:
         fqh.writelines(itertools.islice(fih, 240))
     cset = ContigSet(fq_out)
     assert count(cset) == 60
     cset.filters.addRequirement(length=[('>', 1000)])
     assert count(cset) == 23
     cset.consolidate(cfq_out)
     n_after = count(cset)
     cfq = FastqReader(cfq_out)
     assert n_after == 23
     assert n_after == count(cfq)
     cset.write(cset_out)
 def test_fastq_consolidate(self):
     """
     Consolidating a filtered FASTQ-backed ContigSet keeps the record count.

     The first 240 lines of a reference FASTQ are copied to a temp file and
     opened as a ContigSet (60 records expected).  After a length > 1000
     filter, 23 records are expected both before and after consolidation,
     and the consolidated FASTQ must contain the same number of records.
     """
     fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
           'lambda/2590980/0008/Analysis_Results/'
           'm141115_075238_ethan_c100699872550000001'
           '823139203261572_s1_p0.1.subreads.fastq')
     fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     cset_out = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name

     def count(records):
         # Count by iteration rather than len().
         return sum(1 for _ in records)

     with open(fq_out, 'w') as fqh, open(fn, 'r') as fih:
         fqh.writelines(itertools.islice(fih, 240))
     cset = ContigSet(fq_out)
     self.assertEqual(count(cset), 60)
     cset.filters.addRequirement(length=[('>', 1000)])
     self.assertEqual(count(cset), 23)
     cset.consolidate(cfq_out)
     n_after = count(cset)
     cfq = FastqReader(cfq_out)
     self.assertEqual(n_after, 23)
     self.assertEqual(n_after, count(cfq))
     cset.write(cset_out)
Esempio n. 25
0
def consolidateXml(args):
    """Consolidate the ContigSet named by ``args`` and write a new XML.

    Opens ``args.infile`` as a ContigSet, consolidates it into
    ``args.datafile`` and writes the resulting dataset XML to
    ``args.xmlfile``.

    :param args: object providing ``infile``, ``datafile`` and ``xmlfile``
    """
    contigs = ContigSet(args.infile)
    contigs.consolidate(args.datafile)
    contigs.write(args.xmlfile)
Esempio n. 26
0
def consolidateXml(args):
    """Produce one consolidated ContigSet XML from the input dataset.

    The ContigSet at ``args.infile`` is loaded, its contents are
    consolidated into ``args.datafile``, and the dataset XML is written to
    ``args.xmlfile``.

    :param args: object providing ``infile``, ``datafile`` and ``xmlfile``
    """
    source = ContigSet(args.infile)
    source.consolidate(args.datafile)
    source.write(args.xmlfile)