def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    vcf_path = rc.task.output_files[1]
    dataset_path = rc.task.output_files[2]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[3]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--outputFilename", vcf_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--maskRadius", str(Constants.DEFAULT_MASK_RADIUS) if \
                        bool(rc.task.options[Constants.MASKING_ID]) else "0",
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)  # note: rebinds rc from the resolved contract to the exit code
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
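A note on the path derivation above: the pattern in re.sub(".contigset.xml",
".fasta", ...) leaves its dots unescaped, so each one matches any character.
It works for the intended inputs, but a more defensive variant (a
hypothetical helper, not part of the original code) anchors and escapes the
suffix:

import re

def fasta_path_for(dataset_path):
    # Rewrite only a literal ".contigset.xml" suffix at the end of the path.
    return re.sub(r"\.contigset\.xml$", ".fasta", dataset_path)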
def run(referenceset, fastq, gff, fasta, contigset, alignmentset, options,
        log_level):
    #'--log-file foo.log',
    #'--verbose',
    #'--debug', # requires 'ipdb'
    #'-j NWORKERS',
    #'--algorithm quiver',
    #'--diploid', # binary
    #'--minConfidence 40',
    #'--minCoverage 5',
    #'--alignmentSetRefWindows',
    cmd = "variantCaller --log-level {log_level} {options} --referenceFilename {referenceset} -o {fastq} -o {gff} -o {fasta} {alignmentset}"
    system(cmd.format(**locals()))
    try:
        say('Converting fasta {!r} to contigset {!r}'.format(fasta, contigset))
        # Convert to contigset.xml

        import pysam
        pysam.faidx(fasta)  # pylint: disable=no-member
        # I do not know why pylint does not see this defined.

        ds = ContigSet(fasta, strict=True)
        ds.write(contigset, relPaths=True)
        say('Successfully wrapped fasta {!r} in contigset {!r}'.format(
            fasta, contigset))
    except Exception:
        say(traceback.format_exc())
        say('Skipping conversion to contigset.')
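For reference, a hypothetical invocation of run() (all paths below are
made-up placeholders, not files from the original source):

run(referenceset="reference.referenceset.xml",
    fastq="consensus.fastq",
    gff="variants.gff",
    fasta="consensus.fasta",
    contigset="consensus.contigset.xml",
    alignmentset="mapped.alignmentset.xml",
    options="--algorithm quiver",
    log_level="INFO")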
Example #3
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3), skipMissing=True)
     assert type(ds1).__name__ == 'ContigSet'
     assert type(ds1._metadata).__name__ == 'ContigSetMetadata'
     ds2 = ContigSet(data.getXml(3), skipMissing=True)
     assert type(ds2).__name__ == 'ContigSet'
     assert type(ds2._metadata).__name__ == 'ContigSetMetadata'
Example #6
def _get_fasta_path(file_name):
    if file_name.endswith(".contigset.xml"):
        ds = ContigSet(file_name)
        fasta_files = ds.toExternalFiles()
        assert len(fasta_files) == 1
        return fasta_files[0]
    return file_name
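A quick usage sketch (file names are hypothetical): a contigset XML that
wraps exactly one FASTA resource resolves to the underlying file, while a
plain path passes through unchanged.

fasta = _get_fasta_path("reads.contigset.xml")  # path of the wrapped FASTA
same = _get_fasta_path("reads.fasta")           # returns "reads.fasta" as-is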
Example #7
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3), skipMissing=True)
     self.assertEquals(type(ds1).__name__, 'ContigSet')
     self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
     ds2 = ContigSet(data.getXml(3), skipMissing=True)
     self.assertEquals(type(ds2).__name__, 'ContigSet')
     self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
Example #8
    def test_contigset_consolidate_int_names(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            assert obs_file.get_contig(name).sequence[:] == seq
Example #9
def resolved_tool_contract_runner(resolved_contract):
    rc = resolved_contract
    alignment_path = rc.task.input_files[0]
    reference_path = rc.task.input_files[1]
    gff_path = rc.task.output_files[0]
    dataset_path = rc.task.output_files[1]
    fasta_path = re.sub(".contigset.xml", ".fasta", dataset_path)
    fastq_path = rc.task.output_files[2]
    args = [
        alignment_path,
        "--verbose",
        "--reference", reference_path,
        "--outputFilename", gff_path,
        "--outputFilename", fasta_path,
        "--outputFilename", fastq_path,
        "--numWorkers", str(rc.task.nproc),
        "--minCoverage", str(rc.task.options[Constants.MIN_COVERAGE_ID]),
        "--minConfidence", str(rc.task.options[Constants.MIN_CONFIDENCE_ID]),
        "--algorithm", rc.task.options[Constants.ALGORITHM_ID],
        "--alignmentSetRefWindows",
    ]
    if rc.task.options[Constants.DIPLOID_MODE_ID]:
        args.append("--diploid")
    args_ = get_parser().arg_parser.parser.parse_args(args)
    rc = args_runner(args_)
    if rc == 0:
        pysam.faidx(fasta_path)
        ds = ContigSet(fasta_path, strict=True)
        ds.write(dataset_path)
    return rc
 def test_contigset_consolidate_genomic_consensus(self):
     """
     Verify that the contigs output by GenomicConsensus (e.g. quiver) can
     be consolidated.
     """
     FASTA1 = ("lambda_NEB3011_0_60",
         "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
     FASTA2 = ("lambda_NEB3011_120_180",
         "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
     FASTA3 = ("lambda_NEB3011_60_120",
         "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
     files = []
     for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
         _files = []
         for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
             tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
             with open(tmpfile, "w") as f:
                 f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
             _files.append(tmpfile)
         files.append(_files)
     for i in range(3):
         ds = ContigSet(*[f[i] for f in files])
         out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
         fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
         ds.consolidate(fa1)
         ds.write(out1)
         with ContigSet(out1) as ds_new:
             self.assertEqual(len([rec for rec in ds_new]), 1,
                              "failed on %d" % i)
def _write_fasta_or_contigset(file_name):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(251)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file)
        cs.write(file_name)
Example #14
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3))
     self.assertEquals(type(ds1).__name__, 'ContigSet')
     self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
     ds2 = ContigSet(data.getXml(3))
     self.assertEquals(type(ds2).__name__, 'ContigSet')
     self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
     for contigmd in ds2.metadata.contigs:
         self.assertEquals(type(contigmd).__name__, 'ContigMetadata')
def makeReport(inReadsFN, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report:
         consensus_isoforms_readlength_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
         number of consensus isoforms
         average length of consensus isoforms
    Attributes of the report are extracted from this file.

    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Collect read lengths of the consensus isoforms
    reader = ContigSet(inReadsFN)
    rs = [len(r.sequence) for r in reader]
    reader.close()
    readlengths = np.array(rs)

    # Plot read length histogram
    readlength_plot = create_readlength_plot(readlengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 title="Read Length of Consensus Isoforms Reads",
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    # Produce attributes based on summary.
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    if inSummaryFN.endswith(".json"):
        attributes = _report_to_attributes(inSummaryFN)
        r = load_report_from_json(inSummaryFN)
        # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
        # reports; should we be?
    else:
        attributes = summaryToAttributes(inSummaryFN)

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    title="Transcript Clustering",
                    attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)

    return report
Example #16
def __gather_contigset(resource_file_extension, input_files, output_file,
                       new_resource_file=None,
                       skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)

    :return: Output file

    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + resource_file_extension
    tbr.consolidate(new_resource_file)
    tbr.newUuid()
    tbr.write(output_file)
    return output_file
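A minimal sketch of driving this gather step, assuming per-chunk contigset
outputs with the hypothetical names below:

chunks = ["chunk0.contigset.xml", "chunk1.contigset.xml"]
# Empty chunks are skipped; the rest are consolidated into a companion
# resource (here gathered.contigset.fasta) and wrapped in one output XML.
__gather_contigset("fasta", chunks, "gathered.contigset.xml")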
Example #17
def gather_contigset(input_files,
                     output_file,
                     new_resource_file=None,
                     skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)

    :return: Output file

    :rtype: str
    """
    if skip_empty:
        _input_files = []
        for file_name in input_files:
            cs = ContigSet(file_name)
            if len(cs.toExternalFiles()) > 0:
                _input_files.append(file_name)
        input_files = _input_files
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + 'fasta'
    tbr.consolidate(new_resource_file)
    tbr.write(output_file)
    return output_file
def _write_fasta_or_contigset(file_name, make_faidx=False, n_records=251):
    fasta_file = re.sub(".contigset.xml", ".fasta", file_name)
    rec = [">chr%d\nacgtacgtacgt" % x for x in range(n_records)]
    with open(fasta_file, "w") as f:
        f.write("\n".join(rec))
        f.flush()
    if make_faidx:
        pysam.faidx(fasta_file)
    if file_name.endswith(".xml"):
        cs = ContigSet(fasta_file, strict=make_faidx)
        cs.write(file_name)
Example #19
def write_contigset_records(pbcore_writer_class, records, file_name):
    """
    Write the chunked FASTA records and a companion ContigSet XML.

    The file name is expected to end with ".contigset.xml".

    """
    fasta_file_name = ".".join(file_name.split(".")[:-2]) + ".fasta"
    write_pbcore_records(pbcore_writer_class, records, fasta_file_name)
    log.debug("Writing ContigSet XML to {f}".format(f=file_name))
    ds = ContigSet(fasta_file_name)
    ds.write(file_name)
Example #21
def makeReport(inReadsFN, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report:
         consensus_isoforms_readlength_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
         number of consensus isoforms
         average length of consensus isoforms
    Attributes of the report are extracted from this file.

    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Collect read lengths of the consensus isoforms
    reader = ContigSet(inReadsFN)
    rs = [len(r.sequence) for r in reader]
    reader.close()
    readlengths = np.array(rs)

    # Plot read length histogram
    readlength_plot = create_readlength_plot(readlengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    # Produce attributes based on summary.
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    attributes = _report_to_attributes(inSummaryFN)
    r = load_report_from_json(inSummaryFN)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    title=meta_rpt.title,
                    attributes=attributes,
                    plotgroups=[readlength_group],
                    dataset_uuids=dataset_uuids)

    return meta_rpt.apply_view(report)
Example #22
 def run_after(self, rtc, output_dir):
     json_file = rtc.task.output_files[0]
     chunks = load_pipeline_chunks_from_json(json_file)
     n_rec = 0
     with ContigSet(self.INPUT_FILES[0]) as f:
         n_rec = len(f)
     n_rec_chunked = 0
     for chunk in chunks:
         d = chunk.chunk_d
         with ContigSet(d['$chunk.contigset_id']) as cs:
             n_rec_chunked += len([r for r in cs])
         self._check_unchunked_files(d)
     self.assertEqual(n_rec_chunked, n_rec)
Example #23
 def test_merged_contigset(self):
     fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     with ContigSet(upstreamData.getLambdaFasta(),
                    upstreamData.getFasta()) as cset:
         assert len(list(cset)) == 49
         assert len(cset) == 49
         cset.consolidate()
         cset.write(fn)
         log.debug("Writing to {f}".format(f=fn))
         assert len(list(cset)) == 49
         assert len(cset) == 49
     with ContigSet(fn) as cset:
         assert len(list(cset)) == 49
         assert len(cset) == 49
Example #24
    def test_missing_fai_error_message(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)
        with pytest.raises(IOError) as cm:
            rs1.assertIndexed()
        assert str(cm.value) == (
            "Companion FASTA index (.fai) file not found or malformatted! "
            "Use 'samtools faidx' to generate FASTA index.")
    def test_contigset_consolidate_int_names(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(
                      i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
                      o=inFas))
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
Example #26
 def test_contigset_write(self):
     fasta = upstreamData.getLambdaFasta()
     ds = ContigSet(fasta)
     assert isinstance(ds.resourceReaders()[0], IndexedFastaReader)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'test.fasta')
     w = FastaWriter(outfn)
     for rec in ds:
         w.writeRecord(rec)
     w.close()
     fas = FastaReader(outfn)
     for rec in fas:
         # make sure a __repr__ didn't slip through:
         assert not rec.sequence.startswith('<')
Example #27
 def test_len_fastq(self):
     fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
           'lambda/2590980/0008/Analysis_Results/'
           'm141115_075238_ethan_c100699872550000001'
           '823139203261572_s1_p0.1.subreads.fastq')
     fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     with open(fq_out, 'w') as fqh:
         with open(fn, 'r') as fih:
             for line in itertools.islice(fih, 24):
                 fqh.write(line)
     cset = ContigSet(fq_out)
     assert not cset.isIndexed
     assert isinstance(cset.resourceReaders()[0], FastqReader)
     assert sum(1 for _ in cset) == sum(1 for _ in FastqReader(fq_out))
     assert sum(1 for _ in cset) == 6
 def test_contigset_write(self):
     fasta = upstreamData.getLambdaFasta()
     ds = ContigSet(fasta)
     self.assertTrue(isinstance(ds.resourceReaders()[0],
                                IndexedFastaReader))
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'test.fasta')
     w = FastaWriter(outfn)
     for rec in ds:
         w.writeRecord(rec)
     w.close()
     fas = FastaReader(outfn)
     for rec in fas:
         # make sure a __repr__ didn't slip through:
         self.assertFalse(rec.sequence.startswith('<'))
Example #29
    def test_missing_fai_error_message(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(i=ReferenceSet(
            data.getXml(9)).toExternalFiles()[0],
                                      o=inFas))
        rs1 = ContigSet(inFas)
        with self.assertRaises(IOError) as cm:
            rs1.assertIndexed()
        self.assertEqual(
            str(cm.exception),
            ("Companion FASTA index (.fai) file not found or malformatted! "
             "Use 'samtools faidx' to generate FASTA index."))
Example #30
def run_fasta_to_reference(input_file_name, output_file_name,
                           organism, reference_name,
                           ploidy):
    """Copied from pbcoretools/tasks/converters.py:run_fasta_to_reference()
    """
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference",
        "--organism", organism,
        "--ploidy", ploidy,
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    system(" ".join(args))
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f!r}".format(f=output_file_name))
        ds_ref.write(output_file_name)
Example #31
def run_fasta_to_reference(input_file_name,
                           output_file_name,
                           organism=None,
                           reference_name=None,
                           ploidy="haploid"):
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference", "--organism",
        str(organism) if organism != "" else "unknown", "--ploidy",
        str(ploidy) if ploidy != "" else "unknown", "--debug", fasta_file_name,
        output_dir_name, reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f}".format(f=output_file_name))
        ds_ref.write(output_file_name)
    return 0
Example #32
 def getMetrics(cls):
     super(TestIsoSeqCluster, cls).getMetrics()
     cls.hq_fasta_file = cls.lq_fasta_file = None
     for file_id, file_info in cls.datastore.get_file_dict().iteritems():
         if file_info.file_type_id == FileTypes.DS_CONTIG.file_type_id:
             file_name = op.basename(file_info.path)
             if file_name.startswith("hq_isoforms"):
                 cls.hq_fasta_file = file_info.path
                 with ContigSet(cls.hq_fasta_file) as ds:
                     n = len(ds)
                     cls.metric_dict["num_polished_hq_isoforms_fasta"] = n
             elif file_name.startswith("lq_isoforms"):
                 cls.lq_fasta_file = file_info.path
                 with ContigSet(cls.lq_fasta_file) as ds:
                     n = len(ds)
                     cls.metric_dict["num_polished_lq_isoforms_fasta"] = n
 def test_len_fastq(self):
     fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
           'lambda/2590980/0008/Analysis_Results/'
           'm141115_075238_ethan_c100699872550000001'
           '823139203261572_s1_p0.1.subreads.fastq')
     fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     with open(fq_out, 'w') as fqh:
         with open(fn, 'r') as fih:
             for line in itertools.islice(fih, 24):
                 fqh.write(line)
     cset = ContigSet(fq_out)
     self.assertFalse(cset.isIndexed)
     self.assertTrue(isinstance(cset.resourceReaders()[0], FastqReader))
     self.assertEqual(sum(1 for _ in cset),
                      sum(1 for _ in FastqReader(fq_out)))
     self.assertEqual(sum(1 for _ in cset), 6)
    def test_missing_fai_error_message(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(
            i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
            o=inFas))
        rs1 = ContigSet(inFas)
        with self.assertRaises(IOError) as cm:
            rs1.assertIndexed()
        self.assertEqual(
            str(cm.exception),
            ( "Companion FASTA index (.fai) file not found or malformatted! "
             "Use 'samtools faidx' to generate FASTA index."))
 def test_write_contigset_records(self):
     records = [FastaRecord("chr1", "acgt"), FastaRecord("chr2", "tgca")]
     tmp_contigs = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     write_contigset_records(FastaWriter, records, tmp_contigs)
     with ContigSet(tmp_contigs) as ds_in:
         rec2 = [(rec.id, rec.sequence) for rec in ds_in]
         assert rec2 == [("chr1", "acgt"), ("chr2", "tgca")]
Example #36
def contigset_to_fasta(rtc):
    with ContigSet(rtc.task.input_files[0]) as ds_in:
        if len(ds_in.externalResources) != 1:
            raise ValueError("This task assumes that the ContigSet contains "+
                             "only a single FASTA file.")
        file_name = ds_in.externalResources[0].resourceId
        os.symlink(file_name, rtc.task.output_files[0])
    return 0
Example #37
def as_contigset(fasta_file, xml_file):
    if fasta_file == xml_file or xml_file is None:
        if not op.isfile(fasta_file) or op.getsize(fasta_file) == 0:
            return ContigSet()
        return ContigSet(fasta_file)
    file_size = op.getsize(fasta_file)

    fai_file = fasta_file + ".fai"
    if op.exists(fai_file):
        os.remove(fai_file)

    ds = ContigSet(fasta_file, generateIndices=True)
    ds.write(xml_file)
    if not file_size > 0:
        with open(fai_file, "w") as fai:
            fai.write("")
    return ds
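Usage sketch for as_contigset (paths hypothetical): wrap a FASTA file in a
contigset XML, generating the .fai index as a side effect.

ds = as_contigset("contigs.fasta", "contigs.contigset.xml")
print(ds.numRecords)  # number of contigs indexed from the FASTA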
Example #38
 def test_contigset_consolidate_genomic_consensus(self):
     """
     Verify that the contigs output by GenomicConsensus (e.g. quiver) can
     be consolidated.
     """
     FASTA1 = (
         "lambda_NEB3011_0_60",
         "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
     FASTA2 = (
         "lambda_NEB3011_120_180",
         "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
     FASTA3 = (
         "lambda_NEB3011_60_120",
         "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
     files = []
     for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
         _files = []
         for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
             tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
             with open(tmpfile, "w") as f:
                 f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
             _files.append(tmpfile)
         files.append(_files)
     for i in range(3):
         ds = ContigSet(*[f[i] for f in files])
         out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
         fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
         ds.consolidate(fa1)
         ds.write(out1)
         with ContigSet(out1) as ds_new:
             assert len([rec for rec in ds_new]) == 1
Example #39
 def test_contigset_empty(self):
     fa_file = tempfile.NamedTemporaryFile(suffix=".fasta").name
     ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     open(fa_file, "w").write("")
     ds = ContigSet(fa_file, strict=False)
     ds.write(ds_file)
     fai_file = fa_file + ".fai"
     open(fai_file, "w").write("")
     ds = ContigSet(fa_file, strict=True)
     ds.write(ds_file)
     self.assertEqual(len(ds), 0)
Example #40
 def test_contigset_empty(self):
     fa_file = tempfile.NamedTemporaryFile(suffix=".fasta")
     ds_file = tempfile.NamedTemporaryFile(suffix=".contigset.xml")
     open(fa_file.name, "w").write("")
     ds = ContigSet(fa_file.name, strict=False)
     ds.write(ds_file.name)
     fai_file = fa_file.name + ".fai"
     open(fai_file, "w").write("")
     ds = ContigSet(fa_file.name, strict=True)
     ds.write(ds_file.name)
     assert len(ds) == 0
     fa_file.close()
     ds_file.close()
Example #41
 def run_after(self, rtc, output_dir):
     rpt = None
     uuids = []
     for file_name in rtc.task.output_files:
         if file_name.endswith(".json"):
             rpt = load_report_from_json(file_name)
         elif file_name.endswith(".xml"):
             uuids.append(ContigSet(file_name, strict=True).uuid)
         else:
             assert file_name.endswith(".csv")
     self.assertEqual(sorted(rpt._dataset_uuids), sorted(uuids))
Example #42
def gather_contigset(input_files, output_file, new_resource_file=None,
                     skip_empty=True):
    """
    :param input_files: List of file paths
    :param output_file: File Path
    :param new_resource_file: the path of the file to which the other contig
                              files are consolidated
    :param skip_empty: Ignore empty files (doesn't do much yet)

    :return: Output file

    :rtype: str
    """
    tbr = ContigSet(*input_files)
    if not new_resource_file:
        if output_file.endswith('xml'):
            new_resource_file = output_file[:-3] + 'fasta'
    tbr.consolidate(new_resource_file)
    tbr.write(output_file)
    return output_file
 def test_fastq_consolidate(self):
     fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
           'lambda/2590980/0008/Analysis_Results/'
           'm141115_075238_ethan_c100699872550000001'
           '823139203261572_s1_p0.1.subreads.fastq')
     fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
     with open(fq_out, 'w') as fqh:
         with open(fn, 'r') as fih:
             for line in itertools.islice(fih, 240):
                 fqh.write(line)
     cset = ContigSet(fq_out)
     cset_l = sum(1 for _ in cset)
     self.assertEqual(cset_l, 60)
     cset.filters.addRequirement(length=[('>', 1000)])
     cset_l = sum(1 for _ in cset)
     self.assertEqual(cset_l, 23)
     cset.consolidate(cfq_out)
     cset_l = sum(1 for _ in cset)
     cfq = FastqReader(cfq_out)
     self.assertEqual(cset_l, 23)
     self.assertEqual(cset_l, sum(1 for _ in cfq))
Example #44
 def test_not_all_lowercase_contigs(self):
     """
     Test that no contigs in the output ContigSet have only lowercase
     characters.
     """
     n_uppercase = 0
     for file_id, file_info in self.datastore.get_file_dict().iteritems():
         if file_info.file_type_id == FileTypes.DS_CONTIG.file_type_id:
             if not file_info.is_chunked:
                 with ContigSet(file_info.path) as contigs:
                     for contig in contigs:
                         c = Counter(contig.sequence)
                         n_uppercase += c['A'] + c['C'] + c['G'] + c['T']
     self.assertTrue(n_uppercase > 0, "All contigs are lowercase-only")
Example #45
 def _open_files(self, *input_filenames):
     """Open file handers and return."""
     readers = []
     for fn in input_filenames:
         if ContigSetReaderWrapper.get_file_type(fn) == "FASTA":
             readers.append(FastaReader(fn))
         elif ContigSetReaderWrapper.get_file_type(fn) == "FASTQ":
             readers.append(FastqReader(fn))
         elif ContigSetReaderWrapper.get_file_type(fn) == "CONTIGSET":
             readers.append(ContigSet(fn))
         else:
             raise IOError(
                 "Could not read %s as FASTA/FASTQ/CONTIGSET file." % fn)
     return readers
Example #46
def write_cluster_summary(summary_fn, isoforms_fa, hq_fa=None, lq_fa=None):
    """Extract number of consensus isoforms predicted, and total
    number of bases in all consensuus isoforms from isoforms_fa and write
    the two attributes to summary_fn.

    if hq_fa (polished high-quality isoforms) is not None, report
        the number of polished hq clusters
    if lq_fa (polished high-quality isoforms) is not None, report
        the number of polished hq clusters
    """
    try:
        summary = ClusterSummary()
        dataset_uuids = []
        with ContigSet(isoforms_fa) as reader:
            for r in reader:
                summary.num_consensus_isoforms += 1
                summary.num_total_bases += len(r.sequence[:])
            dataset_uuids.append(reader.uuid)

        if hq_fa is not None and op.getsize(hq_fa) > 0:
            summary.num_polished_hq_isoforms = 0
            with ContigSet(hq_fa) as reader:
                for r in reader:
                    summary.num_polished_hq_isoforms += 1
                dataset_uuids.append(reader.uuid)
        if lq_fa is not None and op.getsize(lq_fa) > 0:
            summary.num_polished_lq_isoforms = 0
            with ContigSet(lq_fa) as reader:
                for r in reader:
                    summary.num_polished_lq_isoforms += 1
                dataset_uuids.append(reader.uuid)
        summary.write(summary_fn, dataset_uuids=dataset_uuids)
    except ZeroDivisionError:
        errMsg = "No consensus isoforms predicted."
        logging.error(errMsg)
        raise RuntimeError(errMsg)
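A hypothetical call, assuming the FASTA/FASTQ inputs already exist:

write_cluster_summary("cluster_summary.json",
                      isoforms_fa="consensus_isoforms.fasta",
                      hq_fa="hq_isoforms.fastq",
                      lq_fa="lq_isoforms.fastq")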
Example #48
def n_reads_in_contigset(contigset_file):
    """Return number of reads in a contigset"""
    cs = ContigSet(contigset_file)
    cs.assertIndexed()
    return int(cs.numRecords)
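Note that assertIndexed() raises IOError unless a .fai companion exists, so
the contigset must be indexed first. A sketch using the same generateIndices
flag seen in as_contigset above (file names hypothetical):

cs = ContigSet("contigs.fasta", generateIndices=True)  # writes contigs.fasta.fai
cs.write("contigs.contigset.xml")
n = n_reads_in_contigset("contigs.contigset.xml")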
Example #49
    def test_empty_fastq_consolidate(self):
        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
              'lambda/2590980/0008/Analysis_Results/'
              'm141115_075238_ethan_c100699872550000001'
              '823139203261572_s1_p0.1.subreads.fastq')
        fq1_out = tempfile.NamedTemporaryFile(suffix="1.fastq").name
        fq2_out = tempfile.NamedTemporaryFile(suffix="2.fastq").name
        cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name

        # Two full
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240, 480):
                    fqh.write(line)
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        self.assertEqual(cset_l, 120)
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        self.assertEqual(cset_l, 120)
        self.assertEqual(cset_l, sum(1 for _ in cfq))

        # one full one empty
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        self.assertEqual(cset_l, 60)
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        self.assertEqual(cset_l, 60)
        self.assertEqual(cset_l, sum(1 for _ in cfq))

        # one empty one full
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                for line in itertools.islice(fih, 240):
                    fqh.write(line)
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        self.assertEqual(cset_l, 60)
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        self.assertEqual(cset_l, 60)
        self.assertEqual(cset_l, sum(1 for _ in cfq))

        # both empty
        with open(fq1_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        with open(fq2_out, 'w') as fqh:
            with open(fn, 'r') as fih:
                fqh.write("")
        cset = ContigSet(fq1_out, fq2_out)
        cset_l = sum(1 for _ in cset)
        self.assertEqual(cset_l, 0)
        cset.consolidate(cfq_out)
        cset_l = sum(1 for _ in cset)
        cfq = FastqReader(cfq_out)
        self.assertEqual(cset_l, 0)
        self.assertEqual(cset_l, sum(1 for _ in cfq))
    def test_contigset_consolidate(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(
                      i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
                      o=inFas))
        rs1 = ContigSet(inFas)

        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
        double = 'B.cereus.1'
        reader = rs1.resourceReaders()[0]
        exp_double = rs1.get_contig(double)
        exp_singles = [rs1.get_contig(name) for name in singletons]

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord(exp_singles[0])
            writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord(exp_double.name + '_0_10',
                               exp_double.sequence + 'ATCGATCGATCG')
            writer.writeRecord(exp_singles[1])

        exp_double_seq = ''.join([exp_double.sequence,
                                  'ATCGATCGATCG',
                                  exp_double.sequence])
        exp_single_seqs = [rec.sequence for rec in exp_singles]

        acc_file = ContigSet(outFas1, outFas2)
        acc_file.induceIndices()
        log.debug(acc_file.toExternalFiles())
        self.assertEqual(len(acc_file), 4)
        self.assertEqual(len(list(acc_file)), 4)
        acc_file.consolidate()
        log.debug(acc_file.toExternalFiles())

        # open acc and compare to exp
        for name, seq in zip(singletons, exp_single_seqs):
            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
        self.assertEqual(acc_file.get_contig(double).sequence[:],
                         exp_double_seq)

        self.assertEqual(len(acc_file._openReaders), 1)
        self.assertEqual(len(acc_file.index), 3)
        self.assertEqual(len(acc_file._indexMap), 3)
        self.assertEqual(len(acc_file), 3)
        self.assertEqual(len(list(acc_file)), 3)
Example #51
def consolidateXml(args):
    """Combine BAMs and apply the filters described in the XML file, producing
    one consolidated XML"""
    dset = ContigSet(args.infile)
    dset.consolidate(args.datafile)
    dset.write(args.xmlfile)
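consolidateXml expects an argparse-style namespace; a minimal sketch of
calling it directly, with the attribute names taken from the function body
and hypothetical file names:

import argparse

args = argparse.Namespace(infile="filtered.contigset.xml",
                          datafile="consolidated.fasta",
                          xmlfile="consolidated.contigset.xml")
consolidateXml(args)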
Example #52
def makeReport(inReadsFN, hq_isoforms_fq, lq_isoforms_fq, inSummaryFN, outDir):
    """
    Generate a report with ID, tables, attributes and plot groups.

    inReadsFN --- an input FASTA file which has all consensus
    isoforms produced by pbtranscript.py cluster.
    This file is required to plot a read length histogram as part of
    the report:
         consensus_isoforms_readlength_hist.png

    hq_isoforms_fq/lq_isoforms_fq --- input FASTQ files which have
    all HQ/LQ isoforms produced by pbtranscript.py cluster.
    These two files will be required to plot the average QV histograms:
         hq_lq_isoforms_avgqv_hist.png

    inSummaryFN --- a summary TXT file with cluster attributes,
    including two attributes:
         number of consensus isoforms
         average length of consensus isoforms
    Attributes of the report are extracted from this file.

    """
    log.info("Plotting read length histogram from file: {f}".
             format(f=inReadsFN))

    # Collect read lengths of the consensus isoforms
    reader = ContigSet(inReadsFN)
    rs = [len(r.sequence) for r in reader]
    reader.close()
    readlengths = np.array(rs).astype(float)

    # Plot read length histogram
    readlength_plot = create_readlength_plot(readlengths, outDir)
    readlength_group = PlotGroup(Constants.PG_READLENGTH,
                                 plots=[readlength_plot],
                                 thumbnail=readlength_plot.thumbnail)

    # Collect average qvs
    hq_qvs = [np.mean(r.quality) for r in ContigSet(hq_isoforms_fq)]
    lq_qvs = [np.mean(r.quality) for r in ContigSet(lq_isoforms_fq)]
    avgqvs = np.array(hq_qvs + lq_qvs)

    # Plot average qv histogram
    avgqv_plot = create_avgqv_plot(avgqvs, outDir)
    avgqv_group = PlotGroup(Constants.PG_AVGQV,
                            plots=[avgqv_plot],
                            thumbnail=avgqv_plot.thumbnail)

    log.info("Plotting summary attributes from file: {f}".
             format(f=inSummaryFN))
    # Produce attributes based on summary.
    dataset_uuids = [ContigSet(inReadsFN).uuid]
    attributes = _report_to_attributes(inSummaryFN)
    r = load_report_from_json(inSummaryFN)
    # FIXME(nechols)(2016-03-22) not using the dataset UUIDs from these
    # reports; should we be?

    table = attributesToTable(attributes)
    log.info(str(table))

    # A report consists of an ID, tables, attributes, and plot groups.
    report = Report(Constants.R_ID,
                    attributes=attributes,
                    plotgroups=[readlength_group, avgqv_group],
                    dataset_uuids=dataset_uuids)

    return spec.apply_view(report)