Beispiel #1
0
def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its coverage object.

    Only contigs that appear both in ``contigs`` and in at least one GFF
    record end up in the result. Records for any other seqid are logged at
    info level and skipped.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference; each item must expose
        ``id`` and ``name`` attributes
    :returns: (dict) contig id -> ContigCoverage
    """
    # Build the id -> name table once so each record costs a single hash
    # lookup instead of two linear scans over ``contigs``. ``setdefault``
    # keeps the first name seen for a repeated id, matching a front-to-back
    # search.
    names_by_id = {}
    for contig in contigs:
        names_by_id.setdefault(contig.id, contig.name)

    cov_map = {}

    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            seqid = rec.seqid
            if seqid not in names_by_id:
                log.info("Skipping seqid '{i}'.".format(i=seqid))
                continue

            contig_cov = cov_map.get(seqid)
            if contig_cov is None:
                # First record for this contig: create its accumulator.
                contig_cov = ContigCoverage(seqid, names_by_id[seqid])
                cov_map[seqid] = contig_cov

            contig_cov.add_data(rec)
    finally:
        # Release the reader even when a record fails to parse or add_data raises.
        reader.close()

    return cov_map
Beispiel #2
0
def _get_contigs_to_plot(alignment_summ_gff, contigs):
    """
    Returns a dict (string: ContigCoverage) that maps a contig ID to its coverage object.

    GFF records whose seqid does not match the id of any item in ``contigs``
    are reported at info level and ignored. Contigs without any record in the
    GFF are absent from the returned dict.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (list) top contigs from reference; each item must expose
        ``id`` and ``name`` attributes
    :returns: (dict) contig id -> ContigCoverage
    """
    # Index names by contig id up front: membership tests and name lookups
    # inside the record loop become O(1). Iterating in order and only filling
    # missing keys preserves "first contig with this id wins".
    id_to_name = {}
    for contig in contigs:
        if contig.id not in id_to_name:
            id_to_name[contig.id] = contig.name

    cov_map = {}

    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            if rec.seqid not in id_to_name:
                log.info(
                    "Unable to find gff '{i}' in alignment contig ids.".format(i=rec.seqid))
                continue

            try:
                contig_cov = cov_map[rec.seqid]
            except KeyError:
                # First record seen for this contig.
                contig_cov = ContigCoverage(rec.seqid, id_to_name[rec.seqid])
                cov_map[rec.seqid] = contig_cov

            contig_cov.add_data(rec)
    finally:
        # Always close the reader, including on a mid-file failure.
        reader.close()

    return cov_map
Beispiel #3
0
def _get_contig_coverage(alignment_summ_gff, contigs):
    """
    Modifies the passed contigs object to include coverage information.

    Every GFF record whose seqid is a key of ``contigs`` is passed to that
    entry's ``add_coverage_data``. Nothing is returned; ``contigs`` is
    updated in place.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (dict) contig id -> ContigInfo object
    """
    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            # Some contigs don't have any coverage, but make it into the gff file
            if rec.seqid in contigs:
                contigs[rec.seqid].add_coverage_data(rec)
    finally:
        # Close the reader even if add_coverage_data or GFF parsing raises.
        reader.close()
def _get_contig_coverage(alignment_summ_gff, contigs):
    """
    Modifies the passed contigs object to include coverage information.

    Records are matched to contigs by ``rec.seqid``; unmatched records are
    silently ignored. The function returns None and works purely by
    mutating the values of ``contigs``.

    :param alignment_summ_gff: (str) path to alignment_summ_gff
    :param contigs: (dict) contig id -> ContigInfo object
    """
    reader = GffReader(alignment_summ_gff)
    try:
        for rec in reader:
            seqid = rec.seqid
            # Some contigs don't have any coverage, but make it into the gff file
            if seqid not in contigs:
                continue
            contigs[seqid].add_coverage_data(rec)
    finally:
        # Guarantee the underlying file is released on every exit path.
        reader.close()
Beispiel #5
0
def _extract_alignment_summ_data(aln_summ_gff, contigs):
    """
    Collect per-contig reference statistics and variant containers from
    alignment_summary.gff.

    Only the first whitespace-delimited token of each record's seqid is used
    as the contig id. Records whose id is not among ``contigs`` are skipped.

    :param aln_summ_gff: (str) path to alignment_summary.gff
    :param contigs: (list) top contigs from reference; each item must expose
        ``id`` and ``name`` attributes
    :returns: 2 dictionaries containing data extracted from alignment_summary.gff:
        ``ref_data`` maps contig id -> 4-slot list indexed by the module
        constants (LENGTH, GAPS and COV are filled here; the remaining slot
        stays 0), and ``var_map`` maps contig id -> ContigVariants
    :raises KeyError: if a selected record lacks the "gaps" or "cov2" attribute
    :raises ValueError: if "gaps" is not exactly two comma-separated fields or
        the numeric fields do not parse
    """
    # One id -> name table gives O(1) filtering and name lookup per record.
    # setdefault keeps the first name when an id occurs more than once.
    names_by_id = {}
    for contig in contigs:
        names_by_id.setdefault(contig.id, contig.name)

    ref_data = {}
    var_map = {}

    log.info("Reading GFF data from {f}".format(f=aln_summ_gff))

    reader = GffReader(aln_summ_gff)
    try:
        for rec in reader:
            seqid = rec.seqid.split()[0]
            if seqid not in names_by_id:
                continue

            # first data set
            stats = ref_data.setdefault(seqid, [0, 0, 0, 0])
            # Contig length is taken as the largest region end seen so far.
            stats[LENGTH] = max(rec.end, stats[LENGTH])
            # "gaps" is "<count>,<total length>"; only the length is accumulated.
            _num_gaps, len_gaps = rec.attributes["gaps"].split(",")
            stats[GAPS] += int(len_gaps)
            # Weight the first cov2 field by the (inclusive) region length.
            stats[COV] += float(rec.attributes["cov2"].split(",")[0]) * \
                (rec.end - rec.start + 1)

            # second data set
            contig_var = var_map.get(seqid)
            if contig_var is None:
                contig_var = ContigVariants(seqid, names_by_id[seqid])
                var_map[seqid] = contig_var

            contig_var.add_data(rec)
    finally:
        # Close the reader even when a malformed record aborts the loop.
        reader.close()

    return ref_data, var_map
Beispiel #6
0
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict

    For every variant record, the inclusive record length
    (``end - start + 1``) is added to the ERR slot of the matching contig in
    ``ref_data``. The contig id is the first whitespace-delimited token of
    the record's seqid. ``ref_data`` is modified in place; nothing is returned.

    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff

    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    try:
        for record in reader:
            err_len = record.end - record.start + 1
            seqid = record.seqid.split()[0]
            if seqid in ref_data:
                ref_data[seqid][ERR] += err_len
            else:
                # the variants might not be present in the top 25 contigs,
                # so we can just raise a warning in the log.
                msg = "Unable to find {r} in {f}".format(r=seqid, f=variants_gff)
                # ``warning`` is the supported spelling; ``warn`` is a
                # deprecated alias on logging.Logger.
                log.warning(msg)
    finally:
        # Release the reader on every exit path, including parse errors.
        reader.close()
Beispiel #7
0
def _append_variants_gff_data(ref_data, variants_gff):
    """
    Adds data from variants gff to the ref_data dict

    Each variant record contributes its inclusive length
    (``end - start + 1``) to ``ref_data[<contig id>][ERR]``, where the contig
    id is the first whitespace-delimited token of the record's seqid. Records
    for contigs missing from ``ref_data`` only produce a log warning.
    Returns None; ``ref_data`` is updated in place.

    :param ref_data: (dict) dict of data pulled from alignment_summary.gff
    :param variants_gff: (str) path to variants_gff

    :type variants_gff: str
    """
    reader = GffReader(variants_gff)
    try:
        for record in reader:
            err_len = record.end - record.start + 1
            seqid = record.seqid.split()[0]
            if seqid not in ref_data:
                # the variants might not be present in the top 25 contigs,
                # so we can just raise a warning in the log.
                msg = "Unable to find {r} in {f}".format(
                    r=seqid, f=variants_gff)
                # Use ``warning``: ``Logger.warn`` is a deprecated alias.
                log.warning(msg)
                continue
            ref_data[seqid][ERR] += err_len
    finally:
        # Make sure the GFF file handle is closed even if a record is malformed.
        reader.close()
def main():
    """
    Command-line entry point: copy an alignment_summary.gff to ``--output``,
    appending per-region variant counters and a consensus-QV attribute.

    Steps visible in this function:
      1. Parse ``--variantsGff`` (required), ``--output``/``-o`` and the
         positional ``inputAlignmentSummaryGff``.
      2. Read the alignment summary once to create a zeroed summary per region.
      3. Read the variants GFF and add each variant's length to the
         ins/del/sub counter of every region that contains its start.
      4. Re-read the alignment summary as text and write it out, inserting
         the tool headers after the leading ``#`` lines and appending
         ``cQv``/``ins``/``del``/``sub`` attributes to ``region`` records.

    NOTE(review): the body uses Python 2 ``print >> file`` statements.
    NOTE(review): this function ends at the last line visible to the reviewer;
    no file handles opened for step 4 are closed in the visible code.
    """
    # Extra "##key value" header lines emitted once, right before the first
    # non-comment line of the output.
    headers = [
        ("source", "GenomicConsensus %s" % __VERSION__),
        ("pacbio-alignment-summary-version", "0.6"),
        ("source-commandline", " ".join(sys.argv)),
    ]

    desc = "Augment the alignment_summary.gff file with consensus and variants information."
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--variantsGff",
                        type=str,
                        help="Input variants.gff or variants.gff.gz filename",
                        required=True)
    # NOTE(review): --output is not marked required; if omitted,
    # options.output is None when passed to open() below — confirm intended.
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        help="Output alignment_summary.gff filename")
    parser.add_argument("inputAlignmentSummaryGff",
                        type=str,
                        help="Input alignment_summary.gff filename")

    options = parser.parse_args()

    inputVariantsGff = GffReader(options.variantsGff)
    inputAlignmentSummaryGff = GffReader(options.inputAlignmentSummaryGff)

    # First pass over the alignment summary: one zeroed summary per record,
    # keyed by a Region built from (seqid, start, end).
    summaries = {}
    for gffRecord in inputAlignmentSummaryGff:
        region = Region(gffRecord.seqid, gffRecord.start, gffRecord.end)
        summaries[region] = {"ins": 0, "del": 0, "sub": 0, "cQv": (0, 0, 0)}
    inputAlignmentSummaryGff.close()

    # Maps a variant record's type to the summary counter it increments.
    # A variant type outside these three raises KeyError in the loop below.
    counterNames = {
        "insertion": "ins",
        "deletion": "del",
        "substitution": "sub"
    }
    # Every variant is compared against every region (nested scan).
    for variantGffRecord in inputVariantsGff:
        for region in summaries:
            summary = summaries[region]
            # A variant is attributed to a region by its start position only.
            if (region.seqid == variantGffRecord.seqid
                    and region.start <= variantGffRecord.start <= region.end):
                counterName = counterNames[variantGffRecord.type]
                variantLength = max(len(variantGffRecord.reference),
                                    len(variantGffRecord.variantSeq))
                summary[counterName] += variantLength
            # TODO: base consensusQV on effective coverage
            # NOTE(review): this assignment sits inside the variant loop, so
            # it runs for every region on each variant and never runs when the
            # variants file has no records (cQv then stays (0, 0, 0)).
            summary["cQv"] = (20, 20, 20)

    # Second pass: treat the alignment summary as plain text so that lines are
    # copied through verbatim, with attributes appended to region records.
    inputAlignmentSummaryGff = open(options.inputAlignmentSummaryGff)
    outputAlignmentSummaryGff = open(options.output, "w")

    inHeader = True

    for line in inputAlignmentSummaryGff:
        line = line.rstrip()

        # Pass any metadata line straight through
        # NOTE(review): line[0] raises IndexError on a blank line.
        if line[0] == "#":
            print >> outputAlignmentSummaryGff, line.strip()
            continue

        if inHeader:
            # We are at the end of the header -- write the tool-specific headers
            for k, v in headers:
                print >> outputAlignmentSummaryGff, ("##%s %s" % (k, v))
            inHeader = False

        # Parse the line
        rec = Gff3Record.fromString(line)

        # Only "region" records are written; in the visible code other record
        # types are parsed and then dropped.
        if rec.type == "region":
            # NOTE(review): the lookup key is a plain tuple while the dict was
            # filled with Region objects; presumably Region hashes and compares
            # equal to such a tuple (e.g. a namedtuple) — confirm.
            summary = summaries[(rec.seqid, rec.start, rec.end)]
            if "cQv" in summary:
                cQvTuple = summary["cQv"]
                line += ";%s=%s" % ("cQv", ",".join(
                    str(int(f)) for f in cQvTuple))
            for counterName in counterNames.values():
                if counterName in summary:
                    line += ";%s=%d" % (counterName, summary[counterName])
            print >> outputAlignmentSummaryGff, line