Ejemplo n.º 1
0
def writeGff3(data, handle, parentGff3):
    """Write the parents of the features named in ``data`` to ``handle`` as GFF3.

    For each record parsed from ``parentGff3`` the features whose ID (as
    returned by ``get_id``) is a key of ``data`` are selected. Each selected
    feature gets a ``note`` qualifier (created when missing); when its entry
    in ``data`` has a truthy ``"cleavage"`` value, a note built from the
    entry's ``"cleavageSite"`` pair is appended. The record's feature list is
    then replaced by ``fetchParent(...)`` of every selected feature and the
    record is written out.

    Args:
        data: Mapping of feature ID to a dict with ``"cleavage"`` and
            ``"cleavageSite"`` entries. ``"cleavageSite"`` must supply exactly
            two values for the ``%s and %s`` template.
        handle: Output handle passed to ``GFF.write``.
        parentGff3: GFF3 input passed to ``GFF.parse``.
    """
    for rec in GFF.parse(parentGff3):
        selected = list(
            feature_lambda(
                rec.features,
                feature_test_id,
                {"id": data.keys()},
                subfeatures=False,
            )
        )
        # Start from an empty feature list; only parents of hits are kept.
        rec.features = []
        for hit in selected:
            notes = hit.qualifiers.setdefault("note", [])

            entry = data[get_id(hit)]
            if entry["cleavage"]:
                notes.append(
                    "Cleavage between %s and %s" % entry["cleavageSite"])
            rec.features.append(fetchParent(hit))

        GFF.write([rec], handle)
Ejemplo n.º 2
0
def shinefind(
    genbank_file,
    gff3_output=None,
    table_output=None,
    lookahead_min=5,
    lookahead_max=15,
    top_only=False,
    add=False,
):
    """Search upstream of every CDS in a GenBank file for Shine-Dalgarno sites.

    For each record, every CDS feature is handed to
    ``NaiveSDCaller.testFeatureUpstream``; the resulting hits are converted to
    features with ``NaiveSDCaller.to_features``. One table row is written per
    hit, or a single placeholder row (``None`` / ``-1``) when a CDS has no
    hits. The (optionally augmented) record is written to ``sys.stdout`` in
    GenBank format and the SD features alone are written as GFF3.

    Args:
        genbank_file: GenBank input passed to ``SeqIO.parse``.
        gff3_output: Handle receiving the SD features as GFF3. When ``None``
            the GFF3 output is skipped.
        table_output: Handle receiving the tab-separated report. When ``None``
            the report is skipped.
        lookahead_min: Passed to the SD caller as ``sd_min`` and added to each
            hit's ``spacing`` in the report.
        lookahead_max: Passed to the SD caller as ``sd_max``.
        top_only: If true, only the first hit of each CDS is reported.
        add: If true, reported SD features are also appended to the record
            that is written to ``sys.stdout``.
    """

    def write_row(columns):
        # table_output defaults to None, so only write when a handle exists.
        if table_output is not None:
            table_output.write("\t".join(map(str, columns)) + "\n")

    write_row([
        "ID",
        "Name",
        "Terminus",
        "Terminus",
        "Strand",
        "Upstream Sequence",
        "SD",
        "Spacing",
    ])

    sd_finder = NaiveSDCaller()
    # Records are streamed one at a time; nothing below needs them all at once.
    for record in SeqIO.parse(genbank_file, "genbank"):
        # Sometimes TWO CDS features share the same start. Only handle ONE.
        seen_starts = set()
        # Holds only the SD features, for the separate GFF3 output.
        gff3_output_record = SeqRecord(record.seq, record.id)

        # Iterate over a snapshot: with ``add`` set, SD features are appended
        # to record.features inside this loop.
        for feature in list(record.features):
            if feature.type != "CDS":
                continue

            strand = feature.location.strand
            # The biological start is location.start on the forward strand and
            # location.end otherwise (an unset strand is treated as non-forward).
            start_position = (feature.location.start
                              if (strand or 0) > 0 else feature.location.end)
            if start_position in seen_starts:
                continue
            seen_starts.add(start_position)

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)

            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(sds,
                                                strand,
                                                start,
                                                end,
                                                feature_id=feature.id)

            human_strand = "+" if strand == 1 else "-"

            log.debug("Found %s SDs", len(sds))
            for (sd, sd_feature) in zip(sds, sd_features):
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
                    sd["hit"],
                    int(sd["spacing"]) + lookahead_min,
                ])

                if add:
                    # Append the RBS to the record written to stdout
                    record.features.append(sd_feature)
                # Also register the feature with the separate GFF3 output
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    # Hits are reported in caller order; keep only the first.
                    break

            if not sds:
                # No hit at all: still emit a row so every CDS is listed.
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    seq,
                    None,
                    -1,
                ])

        record.features = sorted(record.features,
                                 key=lambda x: x.location.start)
        SeqIO.write([record], sys.stdout, "genbank")

        gff3_output_record.features = sorted(gff3_output_record.features,
                                             key=lambda x: x.location.start)
        gff3_output_record.annotations = {}
        if gff3_output is not None:
            GFF.write([gff3_output_record], gff3_output)
Ejemplo n.º 3
0
                top_hits[qseq][fn] = (evalue, sseq, dice)

    sys.stdout.write("# Query Feature\tLocation\t")
    sys.stdout.write("\t".join(["%s\tevalue\tdice" % x for x in blast_names]))
    sys.stdout.write("\n")
    for rec in GFF.parse(args.gff3):
        for feat in fsort(
                feature_lambda(rec.features,
                               feature_test_type, {"types": "CDS"},
                               subfeatures=False)):
            sys.stdout.write(feat._parent._parent.qualifiers["Name"][0])
            sys.stdout.write("\t")
            sys.stdout.write(str(feat.location))

            for db in blast_names:
                fid = get_id(feat)
                if fid in top_hits:
                    if fn in top_hits[fid]:
                        sys.stdout.write("\t")
                        sys.stdout.write(";".join([
                            "%s %s" % (x, y) for (x, y) in top_hits[fid][fn][1]
                        ]))
                        sys.stdout.write("\t")
                        sys.stdout.write(str(top_hits[fid][fn][0]))
                        sys.stdout.write("\t")
                        sys.stdout.write(str(top_hits[fid][fn][2]))
                    else:
                        sys.stdout.write("\tNone")
                        sys.stdout.write("\tNone")
                        sys.stdout.write("\tNone")
                else:
Ejemplo n.º 4
0
def find_lipoprotein(gff3_file,
                     fasta_genome,
                     lipobox_mindist=10,
                     lipobox_maxdist=60):
    """Yield records whose genes carry a lipobox motif near the N-terminus.

    Each gene's first CDS is translated (table 11, ``cds=True``) and searched
    with the lipobox pattern(s). For every match a ``Lipobox`` sub-feature
    covering the last four matched residues is appended to the gene. Only
    genes with at least one match are kept on the record.

    Args:
        gff3_file: GFF3 input passed to ``GFF.parse``.
        fasta_genome: FASTA input supplying the sequences for the GFF3 records.
        lipobox_mindist: Minimum number of residues preceding the lipobox.
        lipobox_maxdist: Maximum number of residues preceding the lipobox.

    Yields:
        A one-element list containing each record, with ``features`` replaced
        by the matching genes.
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))

    CASES = [
        re.compile('^.{%s,%s}[ACGSILMFTV][^REKD][GASNL]C' %
                   (lipobox_mindist, lipobox_maxdist)),
        # re.compile('^.{%s,%s}AWAC' % (lipobox_mindist, lipobox_maxdist)),
        # Cases that share matches each add their own Lipobox sub-feature
        # (same ID) to the gene; the gene itself is only reported once.
    ]

    for record in GFF.parse(gff3_file, base_dict=seq_dict):
        good_features = []

        genes = list(
            feature_lambda(record.features,
                           feature_test_type, {'type': 'gene'},
                           subfeatures=True))
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'CDS'},
                               subfeatures=False))
            if not cdss:
                continue

            # Only the first CDS of a gene is examined.
            cds = cdss[0]

            try:
                tmpseq = str(
                    cds.extract(record.seq).translate(table=11,
                                                      cds=True)).replace(
                                                          "*", "")
            except Exception:
                # Best effort: a CDS that cannot be extracted/translated as a
                # complete coding sequence is skipped. ``Exception`` (not a
                # bare except) lets KeyboardInterrupt/SystemExit through.
                continue

            matched = False
            for case in CASES:
                m = case.search(tmpseq)
                if not m:
                    continue

                # m.end() is a residue index; the lipobox is the final four
                # residues of the match, so scale by 3 for nucleotides.
                if cds.location.strand > 0:
                    start = cds.location.start + (3 * (m.end() - 4))
                    end = cds.location.start + (3 * m.end())
                else:
                    start = cds.location.end - (3 * (m.end() - 4))
                    end = cds.location.end - (3 * m.end())

                tmp = SeqFeature(FeatureLocation(
                    min(start, end),
                    max(start, end),
                    strand=cds.location.strand),
                                 type='Lipobox',
                                 qualifiers={
                                     'source': 'CPT_LipoRy',
                                     'ID': '%s.lipobox' % get_id(gene),
                                 })
                tmp.qualifiers['sequence'] = str(
                    tmp.extract(record).seq.translate())

                gene.sub_features.append(tmp)
                matched = True

            if matched:
                # Report the gene once, however many cases matched.
                good_features.append(gene)

        record.features = good_features
        yield [record]
def _fasta_id_matches(rec, seq_dict):
    """Return True if the record ID or its first feature's Alias is a FASTA ID."""
    alias = ""
    # Guard against records without features before peeking at the first one.
    if rec.features and "Alias" in rec.features[0].qualifiers:
        alias = rec.features[0].qualifiers["Alias"][0]
    return rec.id in seq_dict or alias in seq_dict


def _location_name_description(feat):
    """Build the ``[Location=...;Name=...]`` description for a feature."""
    important_data = {"Location": feat.location}
    if "Name" in feat.qualifiers:
        important_data["Name"] = feat.qualifiers.get("Name", [""])[0]

    return "[{}]".format(";".join(
        "{key}={value}".format(key=k, value=v)
        for (k, v) in important_data.items()))


def main(fasta, gff3, feature_filter=None, nodesc=False):
    """Yield one-element lists of ``SeqRecord`` for the selected features.

    Three modes, chosen by ``feature_filter``:

    * ``"nice_cds"``: records come from ``gff2gb.gff3_to_genbank``; every CDS
      is emitted under its ``locus_tag`` qualifier (falling back to
      ``get_id``), spaces replaced by ``-``. Repeated IDs within a record get
      a ``_<n>`` suffix from the second occurrence on.
    * ``"unique_cds"``: every CDS is emitted as ``<record id>____<feature id>``
      (a random hex suffix is added on collision), and the rewritten record is
      written to ``sys.stderr`` as GFF3.
    * anything else: features of type ``feature_filter`` are emitted under
      their ``id`` (falling back to ``get_id`` when it is empty).

    The process exits with status 1 after printing
    ``No Fasta ID matches GFF`` when a record cannot be matched to a FASTA
    sequence.

    Args:
        fasta: FASTA input.
        gff3: GFF3 input.
        feature_filter: Mode or feature type, see above.
        nodesc: If true, emitted records get an empty description.
    """
    if feature_filter == "nice_cds":
        from gff2gb import gff3_to_genbank as cpt_Gff2Gbk

        for rec in cpt_Gff2Gbk(gff3, fasta, 11):
            seen_counts = {}
            if rec.seq[0] == "?":
                print("No Fasta ID matches GFF")
                sys.exit(1)
            for feat in sorted(rec.features, key=lambda x: x.location.start):
                if feat.type != "CDS":
                    continue

                # Computed once so the de-duplication key and the emitted ID
                # are always the same string.
                base_id = str(
                    feat.qualifiers.get("locus_tag",
                                        get_id(feat)).replace(" ", "-"))

                seen_counts[base_id] = seen_counts.get(base_id, 0) + 1
                occurrence = seen_counts[base_id]
                # The first occurrence keeps the bare ID; later ones get _2, _3...
                suffix = "_" + str(occurrence) if occurrence > 1 else ""

                if nodesc:
                    description = ""
                else:
                    feat.qualifiers["ID"] = [feat._ID]
                    product = feat.qualifiers.get("product", "")
                    description = "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format(
                        feat, product)

                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=base_id + suffix,
                        description=description,
                    )
                ]

    elif feature_filter == "unique_cds":
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
        seen_ids = set()

        for rec in GFF.parse(gff3, base_dict=seq_dict):
            if not _fasta_id_matches(rec, seq_dict):
                print("No Fasta ID matches GFF")
                sys.exit(1)
            newfeats = []
            for feat in sorted(
                    feature_lambda(rec.features,
                                   feature_test_type, {"type": "CDS"},
                                   subfeatures=False),
                    key=lambda f: f.location.start,
            ):
                nid = rec.id + "____" + feat.id
                if nid in seen_ids:
                    # Collision: make the ID unique with a random suffix.
                    nid = nid + "__" + uuid.uuid4().hex
                feat.qualifiers["ID"] = nid
                newfeats.append(feat)
                seen_ids.add(nid)

                description = ("" if nodesc else
                               _location_name_description(feat))

                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=nid.replace(" ", "-"),
                        description=description,
                    )
                ]
            rec.features = newfeats
            rec.annotations = {}
            GFF.write([rec], sys.stderr)
    else:
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
        for rec in GFF.parse(gff3, base_dict=seq_dict):
            if not _fasta_id_matches(rec, seq_dict):
                print("No Fasta ID matches GFF")
                sys.exit(1)
            for feat in sorted(
                    feature_lambda(
                        rec.features,
                        feature_test_type,
                        {"type": feature_filter},
                        subfeatures=False,
                    ),
                    key=lambda f: f.location.start,
            ):
                feat_id = feat.id
                if len(feat_id) == 0:
                    feat_id = get_id(feat)

                description = ("" if nodesc else
                               _location_name_description(feat))

                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=feat_id.replace(" ", "-"),
                        description=description,
                    )
                ]
def main(fasta, gff3, feature_filter=None, nodesc=False):
    """Yield one-element lists of ``SeqRecord`` for the selected features.

    ``feature_filter`` selects the mode:

    * ``'nice_cds'``: CDS features of the records returned by
      ``gff2gb.gff3_to_genbank`` are emitted under their ``locus_tag``
      qualifier (or ``get_id`` when absent).
    * ``'unique_cds'``: CDS features are emitted as
      ``<record id>____<feature id>`` with a random hex suffix on collision;
      the rewritten record is written to ``sys.stderr`` as GFF3.
    * anything else: features of that type are emitted under their ``id``
      (or ``get_id`` when the id is empty).

    Spaces in emitted IDs are replaced by ``-``. With ``nodesc`` set, the
    description of every emitted record is empty.
    """

    def start_of(item):
        # Sort key: features are emitted in order of their start coordinate.
        return item.location.start

    def bracketed_summary(feature):
        # "[Location=...;Name=...]"; Name is only present when annotated.
        fields = {'Location': feature.location}
        if 'Name' in feature.qualifiers:
            fields['Name'] = feature.qualifiers.get('Name', [''])[0]
        rendered = [
            '{key}={value}'.format(key=label, value=content)
            for (label, content) in fields.items()
        ]
        return '[{}]'.format(';'.join(rendered))

    if feature_filter == 'nice_cds':
        from gff2gb import gff3_to_genbank
        for genome in gff3_to_genbank(gff3, fasta):
            ordered = sorted(genome.features, key=start_of)
            for cds in (f for f in ordered if f.type == 'CDS'):
                description = ''
                if not nodesc:
                    cds.qualifiers['ID'] = [cds._ID]
                    product = cds.qualifiers.get('product', '')
                    description = '{1} [Location={0.location};ID={0.qualifiers[ID][0]}]'.format(
                        cds, product)

                sequence = cds.extract(genome).seq
                tag = cds.qualifiers.get('locus_tag', get_id(cds))
                yield [
                    SeqRecord(sequence,
                              id=tag.replace(' ', '-'),
                              description=description)
                ]
        return

    # Both remaining modes read the GFF3 against the FASTA sequences.
    sequences = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    if feature_filter == 'unique_cds':
        used_ids = set()
        for rec in GFF.parse(gff3, base_dict=sequences):
            kept = []
            candidates = feature_lambda(rec.features,
                                        feature_test_type, {'type': 'CDS'},
                                        subfeatures=False)
            for cds in sorted(candidates, key=start_of):
                unique_id = rec.id + '____' + cds.id
                if unique_id in used_ids:
                    unique_id = unique_id + '__' + uuid.uuid4().hex
                cds.qualifiers['ID'] = unique_id
                kept.append(cds)
                used_ids.add(unique_id)

                description = '' if nodesc else bracketed_summary(cds)

                yield [
                    SeqRecord(cds.extract(rec).seq,
                              id=unique_id.replace(' ', '-'),
                              description=description)
                ]
            rec.features = kept
            rec.annotations = {}
            GFF.write([rec], sys.stderr)
        return

    for rec in GFF.parse(gff3, base_dict=sequences):
        chosen = feature_lambda(rec.features,
                                feature_test_type, {'type': feature_filter},
                                subfeatures=False)
        for feat in sorted(chosen, key=start_of):
            label = feat.id
            if not len(label):
                label = get_id(feat)

            description = '' if nodesc else bracketed_summary(feat)

            yield [
                SeqRecord(feat.extract(rec).seq,
                          id=label.replace(' ', '-'),
                          description=description)
            ]
Ejemplo n.º 7
0
def shinefind(fasta,
              gff3,
              gff3_output=None,
              table_output=None,
              lookahead_min=5,
              lookahead_max=15,
              top_only=False,
              add=False):
    """Search upstream of each gene's CDS for Shine-Dalgarno sites.

    For every gene that has a CDS and no existing RBS annotation (``RBS``,
    ``Shine_Dalgarno_sequence`` or a ``regulatory`` sub-feature whose
    ``regulatory_class`` is ``ribosome_binding_site``), the first CDS is
    handed to ``NaiveSDCaller.testFeatureUpstream``. One table row is written
    per hit, or a single placeholder row (``None`` / ``-1``) when there is no
    hit. Each record is written to ``sys.stdout`` as GFF3, and the SD features
    alone are written to ``gff3_output``.

    Args:
        fasta: FASTA input supplying the sequences for the GFF3 records.
        gff3: GFF3 input passed to ``GFF.parse``.
        gff3_output: Handle receiving the SD features as GFF3. When ``None``
            this output is skipped.
        table_output: Handle receiving the tab-separated report. When ``None``
            the report is skipped.
        lookahead_min: Passed to the SD caller as ``sd_min`` and added to each
            hit's ``spacing`` in the report.
        lookahead_max: Passed to the SD caller as ``sd_max``.
        top_only: If true, only the first hit of each gene is reported.
        add: If true, each reported SD feature is appended to its gene and the
            gene's boundaries are widened to include it.
    """

    def write_row(columns):
        # table_output defaults to None, so only write when a handle exists.
        if table_output is not None:
            table_output.write('\t'.join(map(str, columns)) + "\n")

    write_row([
        'ID', 'Name', 'Terminus', 'Terminus', 'Strand', 'Upstream Sequence',
        'SD', 'Spacing'
    ])

    sd_finder = NaiveSDCaller()
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Holds only the SD features, for the separate GFF3 output.
        gff3_output_record = SeqRecord(record.seq, record.id)

        # Loop over all gene features
        for gene in feature_lambda(record.features,
                                   feature_test_type, {'type': 'gene'},
                                   subfeatures=True):

            # Get the CDS(s) from this gene.
            cdss = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'CDS'},
                               subfeatures=True))
            # Genes without a CDS have nothing to search upstream of.
            if not cdss:
                continue
            # Only the first CDS is used; >1 CDS per gene is not expected.
            feature = cdss[0]

            # Three different ways RBSs can be stored that we expect.
            rbs_rbs = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'RBS'},
                               subfeatures=False))
            rbs_sds = list(
                feature_lambda(gene.sub_features,
                               feature_test_type,
                               {'type': 'Shine_Dalgarno_sequence'},
                               subfeatures=False))
            regulatory_elements = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'regulatory'},
                               subfeatures=False))
            rbs_regulatory = list(
                feature_lambda(regulatory_elements,
                               feature_test_quals,
                               {'regulatory_class': ['ribosome_binding_site']},
                               subfeatures=False))
            rbss = rbs_rbs + rbs_sds + rbs_regulatory

            # If someone has already annotated an RBS, leave the gene alone
            if rbss:
                log.debug("Has %s RBSs", len(rbss))
                continue

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)

            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(sds,
                                                feature.location.strand,
                                                start,
                                                end,
                                                feature_id=feature.id)

            human_strand = '+' if feature.location.strand == 1 else '-'

            log.debug('Found %s SDs', len(sds))
            for (sd, sd_feature) in zip(sds, sd_features):
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    sd_finder.highlight_sd(seq, sd['start'], sd['end']),
                    sd['hit'],
                    int(sd['spacing']) + lookahead_min,
                ])

                if add:
                    # Append the RBS to the gene feature
                    gene.sub_features.append(sd_feature)
                    # Pick out start/end locations for all sub_features
                    locations = [x.location.start for x in gene.sub_features] + \
                        [x.location.end for x in gene.sub_features]
                    # Update gene's start/end to be inclusive
                    gene.location._start = min(locations)
                    gene.location._end = max(locations)
                # Also register the feature with the separate GFF3 output
                sd_feature = fix_gene_boundaries(sd_feature)
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    # Hits are reported in caller order; keep only the first.
                    break

            if not sds:
                # No hit at all: still emit a row so every gene is listed.
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    seq,
                    None,
                    -1,
                ])

        record.annotations = {}
        GFF.write([record], sys.stdout)

        gff3_output_record.features = sorted(gff3_output_record.features,
                                             key=lambda x: x.location.start)
        gff3_output_record.annotations = {}
        if gff3_output is not None:
            GFF.write([gff3_output_record], gff3_output)
Ejemplo n.º 8
0
def feature_test_id(feature, **kwargs):
    """Return True when the feature's ID (via ``get_id``) is in ``kwargs["id"]``."""
    feature_identifier = get_id(feature)
    return feature_identifier in kwargs["id"]