コード例 #1
0
def excessive_overlap(record, excess=15, excess_divergent=30):
    """
    Find excessive overlaps between coding genes in the genome.

    Every unordered pair of coding genes in ``record.features`` is compared
    using the first CDS sub-feature of each gene. A pair is reported when the
    two CDS intervals share at least ``excess`` bases on the same strand, or
    at least ``excess_divergent`` bases on different strands.

    Returns a tuple ``(good, bad, results, qc_features)``:

    * ``results`` holds ``(gene_a, gene_b, first_shared, last_shared)``
      tuples, where the two coordinates are the first and last shared
      positions (both inclusive).
    * ``qc_features`` holds one ``gen_qc_feature`` per reported pair.
    * ``bad`` is a weighted score (overlap length divided by the smaller
      threshold, summed over pairs and truncated to int), and ``good`` is the
      number of coding genes minus that score, floored at zero.
    """
    results = []
    bad = 0
    qc_features = []

    for gene_a, gene_b in itertools.combinations(
            coding_genes(record.features), 2):
        # Get the CDS from the subfeature list.
        # TODO: not recursive.
        cds_a = list(genes(gene_a.sub_features, feature_type="CDS"))
        cds_b = list(genes(gene_b.sub_features, feature_type="CDS"))

        if not cds_a:
            log.warning("Gene missing subfeatures; %s", get_gff3_id(gene_a))
            continue

        if not cds_b:
            log.warning("Gene missing subfeatures; %s", get_gff3_id(gene_b))
            continue

        cds_a = cds_a[0]
        cds_b = cds_b[0]

        # The shared region of two intervals [start, end) is
        # [max(starts), min(ends)). Computing it arithmetically avoids
        # materialising one set entry per base for every pair of genes.
        overlap_start = max(int(cds_a.location.start),
                            int(cds_b.location.start))
        overlap_end = min(int(cds_a.location.end), int(cds_b.location.end))
        overlap_len = overlap_end - overlap_start
        if overlap_len <= 0:
            # Nothing shared, so there is nothing to report.
            continue

        if cds_a.location.strand == cds_b.location.strand:
            threshold = excess
        else:
            threshold = excess_divergent

        if overlap_len >= threshold:
            bad += float(overlap_len) / float(min(excess, excess_divergent))
            # Last shared position is inclusive, hence the "- 1".
            first_shared = overlap_start
            last_shared = overlap_end - 1
            qc_features.append(
                gen_qc_feature(first_shared,
                               last_shared,
                               "Excessive Overlap",
                               id_src=gene_a))
            results.append((gene_a, gene_b, first_shared, last_shared))

    # Good isn't accurate here. It's a triangle number and just ugly, but we
    # don't care enough to fix it.
    good = len(list(coding_genes(record.features)))
    good = max(int(good - bad), 0)
    return good, int(bad), results, qc_features
コード例 #2
0
def bad_gene_model(record):
    """Find coding genes whose exon and CDS sub-features do not agree.

    For every coding gene, exons longer than 10 bases and all CDS
    sub-features are collected. A problem is recorded when:

    * the gene has no such exon or no CDS (logged, listed in ``results``
      only, no QC feature);
    * the number of exons differs from the number of CDSs;
    * after sorting both by start, an exon and its paired CDS differ in
      length.

    Returns ``(good, bad, results, qc_features)`` where ``good`` counts the
    exon/CDS pairs of equal length and ``bad`` is the number of entries in
    ``results``. Each problem is counted exactly once; the count is no longer
    inflated by adding a second tally of the same entries.
    """
    results = []
    good = 0
    qc_features = []

    def by_start(feature):
        return feature.location.start

    for gene in coding_genes(record.features):
        exons = [x for x in genes(gene.sub_features, feature_type='exon') if len(x) > 10]
        CDSs = list(genes(gene.sub_features, feature_type='CDS'))

        if not exons or not CDSs:
            log.warning("Could not handle %s, %s", exons, CDSs)
            results.append((
                get_gff3_id(gene),
                None,
                None,
                '{0} exons, {1} CDSs'.format(len(exons), len(CDSs))
            ))
            continue

        if len(exons) != len(CDSs):
            results.append((
                get_gff3_id(gene),
                None,
                None,
                'Mismatched number of exons and CDSs in gff3 representation',
            ))
            qc_features.append(gen_qc_feature(
                gene.location.start, gene.location.end,
                'Mismatched number of exons and CDSs in gff3 representation',
                strand=gene.strand,
                id_src=gene
            ))
            continue

        # Pair exons and CDSs positionally, in order of start coordinate.
        for exon, cds in zip(sorted(exons, key=by_start), sorted(CDSs, key=by_start)):
            if len(exon) != len(cds):
                results.append((
                    get_gff3_id(gene),
                    exon,
                    cds,
                    'CDS does not extend to full length of gene',
                ))
                qc_features.append(gen_qc_feature(
                    exon.location.start, exon.location.end,
                    'CDS does not extend to full length of gene',
                    strand=exon.strand,
                    id_src=gene
                ))
            else:
                good += 1

    return good, len(results), results, qc_features
コード例 #3
0
    def sd_spacing(record, feature):
        """Shine-Dalgarno spacing
        """
        # Returns the distance between each RBS of `feature` and a CDS of the
        # same feature: the string "None" when there is no RBS, a single
        # string when there is exactly one RBS, otherwise a list of strings.
        # The docstring above is left untouched on purpose; it may be read at
        # runtime (e.g. as a report column title) -- TODO confirm.

        # Look the RBSs up on the feature we were given, not on a `gene`
        # name that is not defined in this function.
        rbss = get_rbs_from(feature)
        if len(rbss) == 0:
            return "None"

        # The CDS list does not depend on the RBS, so build it only once.
        cdss = list(
            genes(feature.sub_features, feature_type="CDS", sort=True))

        resp = []
        for rbs in rbss:
            if rbs.location.strand > 0:
                chosen = min(
                    cdss,
                    key=lambda x: x.location.start - rbs.location.end)
                resp.append(str(chosen.location.start - rbs.location.end))
            else:
                # NOTE(review): this key is the negated spacing, so with
                # several CDSs it selects the one with the largest spacing;
                # confirm that is intended.
                chosen = min(
                    cdss,
                    key=lambda x: x.location.end - rbs.location.start)
                resp.append(str(rbs.location.start - chosen.location.end))

        if len(resp) == 1:
            return str(resp[0])
        return resp
コード例 #4
0
def missing_tags(record):
    """Flag coding genes whose first CDS carries no ``product`` qualifier.

    Genes without any CDS sub-feature are logged and skipped; they count as
    neither good nor bad.

    Returns ``(good, bad, results, qc_features)`` where ``results`` lists the
    offending CDS features and ``qc_features`` holds one "Missing product
    tag" QC feature per offending CDS.
    """
    qc_features = []
    untagged = []
    tagged_count = 0
    untagged_count = 0

    for gene in coding_genes(record.features):
        cds_children = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_children:
            log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue

        # Only the first CDS of the gene is inspected.
        first_cds = cds_children[0]

        if "product" in first_cds.qualifiers:
            tagged_count += 1
            continue

        log.info("Missing product tag on %s", get_gff3_id(gene))
        marker = gen_qc_feature(
            first_cds.location.start,
            first_cds.location.end,
            "Missing product tag",
            strand=first_cds.strand,
        )
        qc_features.append(marker)
        untagged.append(first_cds)
        untagged_count += 1

    return tagged_count, untagged_count, untagged, qc_features
コード例 #5
0
 def start_codon(record, feature):
     """Start Codon
     """
     # Returns the first three bases of the feature's CDS as a string when
     # there is exactly one CDS; otherwise a list with one
     # "codon (start..end:strand)" entry per CDS (empty when there is none).
     cds_list = list(genes(feature.sub_features, feature_type="CDS", sort=True))

     if len(cds_list) == 1:
         only_cds = cds_list[0]
         return str(only_cds.extract(record).seq[0:3])

     labelled = []
     for cds in cds_list:
         codon = cds.extract(record).seq[0:3]
         labelled.append(
             "{0} ({1.location.start}..{1.location.end}:{1.location.strand})"
             .format(codon, cds))
     return labelled
コード例 #6
0
def exact_coding_density(record, mean=92.5, sd=20):
    """
    Find the exact coding density of the genome.

    Marks every base covered by at least one CDS of a coding gene and returns
    the covered fraction of ``record.seq`` as a float between 0 and 1. Bases
    covered by several CDSs are counted once.

    ``mean`` and ``sd`` are accepted for signature compatibility but are not
    used by this function. An empty sequence yields ``0.0``.
    """
    genome_length = len(record.seq)
    if genome_length == 0:
        return 0.0

    covered = numpy.zeros(genome_length)

    for gene in coding_genes(record.features):
        for cds in genes(gene.sub_features, feature_type="CDS"):
            # Locations are treated as 0-based and end-exclusive (the
            # Biopython FeatureLocation convention), so the covered bases are
            # exactly covered[start:end]. The earlier "i - 1" indexing
            # shifted the window one base to the left and wrapped to the last
            # base of the genome when a CDS started at position 0.
            covered[int(cds.location.start):int(cds.location.end)] = 1

    return float(covered.sum()) / genome_length
コード例 #7
0
def weird_starts(record):
    """Find coding genes whose first CDS begins with an unusual start codon.

    The first CDS of every coding gene is extracted from ``record.seq``; its
    first and last three bases are stored on the CDS as the ``__start`` and
    ``__stop`` attributes. Start codons other than ATG, TTG and GTG are
    reported. Genes without a CDS are logged and skipped.

    Returns ``(good, bad, results, qc_features, overall)`` where ``results``
    lists the offending CDS features (each with an ``__error`` message) and
    ``overall`` maps every start codon seen to its number of occurrences.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []

    overall = {}
    for gene in coding_genes(record.features):
        cdss = list(genes(gene.sub_features, feature_type='CDS'))
        if not cdss:
            log.warning("No CDS for gene %s", get_gff3_id(gene))
            continue
        seq = cdss[0]

        seq_str = str(seq.extract(record.seq))
        start_codon = seq_str[0:3]
        # Slice the last three bases; indexing with [-3] yields one character.
        stop_codon = seq_str[-3:]
        seq.__start = start_codon
        seq.__stop = stop_codon
        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon in ('ATG', 'TTG', 'GTG'):
            good += 1
            continue

        log.warning("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
        seq.__error = 'Unusual start codon %s' % start_codon

        # Coordinates of the start codon itself.
        # NOTE(review): on the reverse strand this passes s > e to
        # gen_qc_feature; confirm that helper accepts that ordering.
        if seq.strand > 0:
            s = seq.location.start
            e = seq.location.start + 3
        else:
            s = seq.location.end
            e = seq.location.end - 3

        results.append(seq)

        qc_features.append(gen_qc_feature(
            s, e,
            'Weird start codon',
            strand=seq.strand,
            id_src=gene
        ))
        bad += 1

    return good, bad, results, qc_features, overall
コード例 #8
0
def coding_density(record, mean=92.5, sd=20):
    """
    Score the coding density of the genome.

    Sums the lengths of all CDS sub-features of the coding genes (overlapping
    CDSs are counted more than once) and divides by the sequence length.

    Returns a pair of ints: ``norm`` evaluated at the density percentage with
    the given ``mean`` and ``sd``, times 100, and the density percentage
    itself.
    """
    cds_bases = 0
    for gene in coding_genes(record.features):
        for cds in genes(gene.sub_features, feature_type="CDS"):
            cds_bases += len(cds)

    coding_fraction = float(cds_bases) / float(len(record.seq))
    score = norm(100 * coding_fraction, mean=mean, sd=sd) * 100
    percent = 100 * coding_fraction
    return int(score), int(percent)
コード例 #9
0
 def length(record, feature):
     """CDS Length (AA)
     """
     # Total length of the feature's CDS sub-features divided by three, minus
     # one, returned as a string. NOTE(review): the "- 1" presumably drops
     # the stop codon -- confirm.
     cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True))
     aa_length = (sum(len(cds) for cds in cdss) / 3) - 1
     # True division produces a float, so a whole length would be rendered
     # as e.g. "99.0"; show whole values as plain integers instead.
     if aa_length == int(aa_length):
         aa_length = int(aa_length)
     return str(aa_length)
コード例 #10
0
def missing_rbs(record, lookahead_min=5, lookahead_max=15):
    """
    Identify gene features with missing or badly placed RBSs.

    For a gene without an annotated RBS, the window ``lookahead_min`` to
    ``lookahead_max`` bases upstream of the gene is extracted and scanned
    with ``NaiveSDCaller``. The gene is reported as bad either way; the
    ``__upstream`` attribute receives the (possibly highlighted) window
    sequence and ``__message`` says whether a candidate was found.

    For a gene with an annotated RBS, the distance between the first RBS and
    the first CDS is measured; the gene is reported as bad when that distance
    exceeds ``lookahead_max``. A gene that has an RBS but no CDS is logged
    and skipped.

    Returns ``(good, bad, results, qc_features, any_rbss)`` where ``results``
    lists the offending genes and ``any_rbss`` tells whether any gene had an
    annotated RBS at all.
    """
    results = []
    good = 0
    bad = 0
    qc_features = []
    sd_finder = NaiveSDCaller()

    any_rbss = False

    for gene in coding_genes(record.features):
        # Check if there are RBSs, TODO: make this recursive. Each feature in
        # gene.sub_features can also have sub_features.
        rbss = get_rbs_from(gene)

        # No RBS found
        if len(rbss) == 0:
            # Get the sequence lookahead_min to lookahead_max upstream
            if gene.strand > 0:
                start = gene.location.start - lookahead_max
                end = gene.location.start - lookahead_min
            else:
                start = gene.location.end + lookahead_min
                end = gene.location.end + lookahead_max
            # We have to ensure the feature is ON the genome, otherwise we may
            # be trying to access a location outside of the length of the
            # genome, which would be bad.
            (start,
             end) = __ensure_location_in_bounds(start=start,
                                                end=end,
                                                parent_length=len(record))
            # Temporary feature to extract sequence
            tmp = SeqFeature(FeatureLocation(start, end, strand=gene.strand),
                             type="domain")
            # Get the sequence
            seq = str(tmp.extract(record.seq))
            # Set the default properties
            gene.__upstream = seq.lower()
            gene.__message = "No RBS annotated, None found"

            # Try and do an automated shinefind call
            sds = sd_finder.list_sds(seq)
            if len(sds) > 0:
                sd = sds[0]
                gene.__upstream = sd_finder.highlight_sd(
                    seq.lower(), sd["start"], sd["end"])
                gene.__message = "Unannotated but valid RBS"

            qc_features.append(
                gen_qc_feature(start,
                               end,
                               "Missing RBS",
                               strand=gene.strand,
                               id_src=gene))

            bad += 1
            results.append(gene)
            continue

        any_rbss = True
        if len(rbss) > 1:
            # Report how many RBSs were found, not the id of the first one.
            log.warning("%s RBSs found for gene %s", len(rbss),
                        get_gff3_id(gene))

        # get first RBS/CDS
        cdss = list(genes(gene.sub_features, feature_type="CDS"))
        if not cdss:
            # Without a CDS there is nothing to measure the RBS against.
            log.warning("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue
        cds = cdss[0]
        rbs = rbss[0]

        # Get the distance between the two
        if gene.strand > 0:
            distance = cds.location.start - rbs.location.end
        else:
            distance = rbs.location.start - cds.location.end

        # If the RBS is too far away, annotate that
        if distance > lookahead_max:
            gene.__message = "RBS too far away (%s nt)" % distance

            qc_features.append(
                gen_qc_feature(
                    rbs.location.start,
                    rbs.location.end,
                    gene.__message,
                    strand=gene.strand,
                    id_src=gene,
                ))

            bad += 1
            results.append(gene)
        else:
            good += 1

    return good, bad, results, qc_features, any_rbss
コード例 #11
0
def gene_model_correction_issues(record):
    """Report locus-tag problems recorded in ``cpt_gmc`` qualifiers.

    Each coding gene and each of its CDS children is checked for a
    ``cpt_gmc`` qualifier. Recognised problem texts are turned into result
    tuples ``(gene, cds, message)`` and QC features, and the ``locus_tag``
    qualifiers of the gene and CDS are rewritten as a side effect.
    Unrecognised CDS problem texts are only logged.

    Returns ``(good, bad, results, qc_features)`` where ``bad`` counts the
    genes with at least one reported problem and ``good`` the remaining
    genes.
    """
    results = []
    qc_features = []
    clean_genes = 0
    flagged_genes = 0

    for gene in coding_genes(record.features):
        child_cdss = list(genes(gene.sub_features, feature_type="CDS"))
        gene_notes = [
            values for key, values in gene.qualifiers.items()
            if key == "cpt_gmc"
        ]

        gene_results = []
        gene_qc = []

        def mark(feature, message):
            # Queue a QC feature that spans `feature`.
            gene_qc.append(
                gen_qc_feature(
                    feature.location.start,
                    feature.location.end,
                    message,
                    strand=feature.strand,
                ))

        # Problems recorded on the gene itself.
        for values in gene_notes:
            if "Missing Locus Tag" not in values:
                continue
            # Missing locus tag is an either/or thing: if it hits here there
            # shouldn't be anything else wrong with the gene.
            gene.qualifiers["locus_tag"] = [""]
            # Translation from bp_genbank2gff3.py
            first_cds = child_cdss[0]
            first_cds.qualifiers["locus_tag"] = first_cds.qualifiers["Name"]
            gene_results.append(
                (gene, first_cds, "Gene is missing a locus_tag"))
            mark(gene, "Gene is missing a locus_tag")

        # Problems recorded on the child CDSs.
        for cds in child_cdss:
            problems = [
                values[0] for key, values in cds.qualifiers.items()
                if key == "cpt_gmc"
            ]
            for problem in problems:
                if problem == "BOTH Missing Locus Tag":
                    gene.qualifiers["locus_tag"] = [""]
                    cds.qualifiers["locus_tag"] = [""]
                    gene_results.append(
                        (gene, cds,
                         "Both gene and CDS are missing locus tags"))
                    mark(cds, "CDS is missing a locus_tag")
                    mark(gene, "Gene is missing a locus_tag")
                elif problem == "Different locus tag from associated gene.":
                    gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
                    cds.qualifiers["locus_tag"] = cds.qualifiers[
                        "cpt_gmc_locus"]
                    gene_results.append(
                        (gene, cds, "Gene and CDS have differing locus tags"))
                    mark(gene, "Gene and CDS have differing locus tags")
                elif problem == "Missing Locus Tag":
                    # The gene's tag is copied from its Name; the CDS has none.
                    gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
                    cds.qualifiers["locus_tag"] = [""]
                    gene_results.append(
                        (gene, cds, "CDS is missing a locus_tag"))
                    mark(cds, "CDS is missing a locus_tag")
                else:
                    log.warn("Cannot handle %s", problem)

        if gene_results:
            flagged_genes += 1
        else:
            clean_genes += 1

        qc_features.extend(gene_qc)
        results.extend(gene_results)

    return clean_genes, flagged_genes, results, qc_features
コード例 #12
0
def weird_starts(record):
    """Find coding genes whose first CDS begins with an unusual start codon.

    The first CDS of every coding gene is extracted from ``record.seq``; its
    first and last three bases are stored on the CDS as the ``__start`` and
    ``__stop`` attributes. Start codons other than ATG, TTG and GTG are
    reported. Genes without a CDS are logged and skipped.

    A CDS shorter than three bases is treated as fatal: a message is written
    to stderr and the process exits with status 2.

    Returns ``(good, bad, results, qc_features, overall)`` where ``results``
    lists the offending CDS features (each with an ``__error`` message) and
    ``overall`` maps every start codon seen to its number of occurrences.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []

    overall = {}
    for gene in coding_genes(record.features):
        cdss = list(genes(gene.sub_features, feature_type="CDS"))
        if not cdss:
            log.warning("No CDS for gene %s", get_gff3_id(gene))
            continue
        seq = cdss[0]

        seq_str = str(seq.extract(record.seq))
        if len(seq_str) < 3:
            sys.stderr.write("Fatal Error: CDS of length less than 3 at " +
                             str(seq.location) + '\n')
            # sys.exit is always available; the bare exit() helper is only
            # injected by the site module.
            sys.exit(2)

        # NOTE: CDS lengths that are not a multiple of three are not
        # checked here.
        start_codon = seq_str[0:3]
        # Slice the last three bases; indexing with [-3] yields one character.
        stop_codon = seq_str[-3:]
        seq.__start = start_codon
        seq.__stop = stop_codon
        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon in ("ATG", "TTG", "GTG"):
            good += 1
            continue

        log.warning("Weird start codon (%s) on %s", start_codon,
                    get_gff3_id(gene))
        seq.__error = "Unusual start codon %s" % start_codon

        # Coordinates of the start codon itself.
        # NOTE(review): on the reverse strand this passes s > e to
        # gen_qc_feature; confirm that helper accepts that ordering.
        if seq.strand > 0:
            s = seq.location.start
            e = seq.location.start + 3
        else:
            s = seq.location.end
            e = seq.location.end - 3

        results.append(seq)
        qc_features.append(
            gen_qc_feature(s,
                           e,
                           "Weird start codon",
                           strand=seq.strand,
                           id_src=gene))
        bad += 1

    return good, bad, results, qc_features, overall
コード例 #13
0
def excessive_gap(
    record,
    excess=50,
    excess_divergent=200,
    min_gene=30,
    slop=30,
    lookahead_min=5,
    lookahead_max=15,
):
    """
    Identify excessive gaps between gene features.

    CDS sub-features of the genes (sorted by start) are merged into
    contiguous regions: a CDS extends the current region when it starts
    within ``excess`` bases of the region's end. The gaps between consecutive
    regions, plus the gaps at both ends of the sequence, are then examined.
    A gap is reported when it is larger than ``excess`` (flanking genes on
    the same strand) or ``excess_divergent`` (different strands, or no
    flanking gene found on one side). Defaults are 50 and 200 bases.

    Each reported gap is searched for putative genes with ``MGAFinder``
    (minimum gene length ``min_gene``, searched with ``slop`` extra bases on
    each side), filtered by ``require_sd`` using ``lookahead_min`` and
    ``lookahead_max``.

    Returns ``(good, bad, results, qc_features)``:

    * ``results`` holds one list per reported gap:
      ``[gap_start, gap_end, strand_before, strand_after, n_putative_genes]``.
    * ``qc_features`` holds one QC feature per reported gap plus one "gene"
      feature (with RBS and CDS sub-features) per putative gene.
    * ``bad`` counts the reported gaps containing at least one putative gene
      and ``good`` is ``len(genes) + 1 - bad``.
    """
    results = []
    good = 0
    bad = 0

    contiguous_regions = []

    sorted_genes = sorted(genes(record.features),
                          key=lambda feature: feature.location.start)
    if len(sorted_genes) == 0:
        log.warning("NO GENES FOUND")
        return good, bad, results, []

    current_gene = None
    for gene in sorted_genes:
        # If the gene's start is contiguous to the "current_gene", then we
        # extend current_gene
        log.debug("gene.id %s", gene.id)
        for cds in genes(gene.sub_features, feature_type="CDS"):
            log.debug("\t%s %s", cds.id, cds.location)
            if current_gene is None:
                current_gene = [int(cds.location.start), int(cds.location.end)]

            if cds.location.start <= current_gene[1] + excess:
                # Don't want to decrease size
                if int(cds.location.end) >= current_gene[1]:
                    current_gene[1] = int(cds.location.end)
            else:
                # If it's discontiguous, we append the region and clear.
                contiguous_regions.append(current_gene)
                current_gene = [int(cds.location.start), int(cds.location.end)]

    if current_gene is None:
        # Genes exist but none of them has a CDS: there is no region to
        # measure gaps against.
        log.warning("NO CDS FEATURES FOUND")
        return good, bad, results, []

    # The last open region is never closed inside the loop above.
    contiguous_regions.append(current_gene)

    def first_gene_at(loc):
        # First gene accepted by feature_test_location for `loc`, or None.
        matches = feature_lambda(
            sorted_genes,
            feature_test_location,
            {"loc": loc},
            subfeatures=False,
        )
        return next(iter(matches), None)

    # Walk consecutive pairs: sequence start -> first region -> ... -> last
    # region -> sequence end.
    bounds = [(1, 1)] + contiguous_regions + [(len(record.seq), None)]
    for a, b in zip(bounds, bounds[1:]):
        gap_size = abs(b[0] - a[1])
        if gap_size <= min(excess, excess_divergent):
            continue

        # Either lookup may come back empty, e.g. at the ends of the genome.
        a_feat = first_gene_at(a[1])
        b_feat = first_gene_at(b[0])

        result_obj = [
            a[1],
            b[0],
            None if not a_feat else a_feat.location.strand,
            None if not b_feat else b_feat.location.strand,
        ]

        if a_feat is None or b_feat is None:
            if gap_size > excess_divergent:
                results.append(result_obj)
        elif a_feat.location.strand == b_feat.location.strand:
            if gap_size > excess:
                results.append(result_obj)
        elif gap_size > excess_divergent:
            results.append(result_obj)

    better_results = []
    qc_features = []
    of = MGAFinder(11, "CDS", "closed", min_gene)
    # of = OrfFinder(11, 'CDS', 'closed', min_gene)

    for result_obj in results:
        start = result_obj[0]
        end = result_obj[1]
        qc_features.append(
            gen_qc_feature(start, end,
                           "Excessive gap, %s bases" % abs(end - start)))
        # NOTE(review): `start - slop` can be negative near the beginning of
        # the record, which makes this slice count from the end; confirm
        # whether it should be clamped at 0.
        putative_genes = of.putative_genes_in_sequence(
            str(record[start - slop:end + slop].seq))
        putative_genes = list(
            require_sd(putative_genes, record, start, lookahead_min,
                       lookahead_max))
        for putative_gene in putative_genes:
            # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
            strand = putative_gene[2]
            possible_gene_start = start + putative_gene[0]
            possible_gene_end = start + putative_gene[1]

            possible_cds = SeqFeature(
                _ordered_location(possible_gene_start, possible_gene_end,
                                  strand),
                type="CDS",
            )

            # Now we adjust our boundaries for the RBS that's required
            # There are only two cases, the rbs is upstream of it, or downstream
            if putative_gene[5] < possible_gene_start:
                possible_gene_start = putative_gene[5]
            else:
                possible_gene_end = putative_gene[6]

            possible_rbs = SeqFeature(
                _ordered_location(putative_gene[5], putative_gene[6], strand),
                type="Shine_Dalgarno_sequence",
            )

            possible_gene = SeqFeature(
                _ordered_location(possible_gene_start, possible_gene_end,
                                  strand),
                type="gene",
                qualifiers={"note": ["Possible gene"]},
            )
            possible_gene.sub_features = [possible_rbs, possible_cds]
            qc_features.append(possible_gene)

        better_results.append(result_obj + [len(putative_genes)])

    # Bad gaps are those with more than zero possible genes found. The count
    # is the LAST element of each entry; index 2 is a strand (possibly None).
    bad = sum(1 for entry in better_results if entry[-1] > 0)
    # Generally taking "good" here as every possible gap in the genome
    # Thus, good is TOTAL - gaps
    good = len(sorted_genes) + 1 - bad
    # and bad is just gaps
    return good, bad, better_results, qc_features


def _ordered_location(pos_a, pos_b, strand):
    """Build a FeatureLocation whose start never exceeds its end."""
    if pos_a <= pos_b:
        return FeatureLocation(pos_a, pos_b, strand=strand)
    return FeatureLocation(pos_b, pos_a, strand=strand)