Ejemplo n.º 1
0
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record using ClusterFinder results.

    Every ClusterFinder cluster returned by find_cf_clusters() is compared
    with the existing cluster features of seq_record:

    * If it overlaps an existing cluster, that cluster's location is adjusted
      and its 'probability' qualifier is set to the ClusterFinder probability.
      With options.borderpredict set, the existing cluster takes over the
      ClusterFinder borders (only when its midpoint lies inside the
      ClusterFinder cluster) and is then widened again so that all of its
      signature genes stay inside. Without it, the existing cluster is only
      ever extended to cover the ClusterFinder cluster.
    * If it overlaps no existing cluster, a new "cluster" feature is created
      with a cf_* product type derived from the 'sec_met' qualifiers of the
      CDS features it contains.

    Finally all cluster features are renumbered, starting at
    options.clusternr_offset, and options.next_clusternr is set to the first
    unused number.

    Returns None; seq_record and options are modified in place.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    #Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    #Fetched once up front instead of once per (cf_cluster, cluster) pair.
    #NOTE(review): assumes the utils getters have no side effects -- confirm.
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            if options.borderpredict: #Predict gene cluster borders using ClusterFinder
                #Floor division keeps the midpoint an integer regardless of
                #which division semantics are in effect.
                if ((cluster.location.end + cluster.location.start) // 2) in cf_cluster.location:
                    #Collect the signature genes BEFORE the location is
                    #replaced, as the lookup is done for the cluster as it
                    #currently is.
                    cluster_cds_features = utils.get_cluster_cds_features(cluster, seq_record)
                    cluster_sig_genes = [gene for gene in secmet_cds_features if gene in cluster_cds_features]
                    cluster.location = cf_cluster.location
                    #Widen the new borders so no signature gene is cut off
                    for sig_gene in cluster_sig_genes:
                        startpoint = min([sig_gene.location.start, sig_gene.location.end])
                        endpoint = max([sig_gene.location.start, sig_gene.location.end])
                        if cluster.location.start > startpoint:
                            cluster.location = FeatureLocation(startpoint, cluster.location.end)
                        if cluster.location.end < endpoint:
                            cluster.location = FeatureLocation(cluster.location.start, endpoint)
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start, cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start, cf_cluster.location.end)
            cluster.qualifiers['probability'] = [ "%01.4f" % cf_cluster.probability ]
        if not overlaps:
            #Derive the product type of the new cluster from its CDS features
            cf_cluster_CDSs = utils.get_cluster_cds_features(cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [feat for feat in CDS.qualifiers['sec_met'] if "Type: " in feat]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [ "%01.4f" % cf_cluster.probability ]
            newclusters.append(new_cluster)
    seq_record.features.extend(newclusters)
    #Re-number clusters
    clusters = utils.get_cluster_features(seq_record)
    #NOTE(review): passing a comparison function positionally to list.sort()
    #only works on Python 2 -- revisit if this module is ported.
    clusters.sort(compare_feature_locations)
    clusternr = options.clusternr_offset
    for cluster in clusters:
        cluster.qualifiers['note'] = ["Cluster number: %s" % clusternr]
        clusternr += 1
    options.next_clusternr = clusternr
Ejemplo n.º 2
0
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Builds one dict per cluster feature of record with the keys 'start'
    (1-based), 'end', 'idx', 'orfs', 'borders', 'tta_codons', 'type',
    'knowncluster' and 'BGCid', plus 'probability' when the cluster carries
    that qualifier and 'unordered' when options.input_type is 'prot'.
    'knowncluster' and 'BGCid' stay "-" unless exactly one
    'knownclusterblast' qualifier starts with '1.' and can be parsed.

    Returns the list of cluster dicts in the order the clusters are returned
    by utils.get_cluster_features().
    """
    js_clusters = []
    # The misc_feature list does not depend on the cluster, so fetch it once.
    # NOTE(review): assumes the converters called below do not add or remove
    # misc_feature entries on the record -- confirm.
    all_misc_features = utils.get_all_features_of_type(
        record, 'misc_feature')

    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        # misc_features inside the cluster that are annotated as TTA codons
        tta_codons = [
            feature for feature in all_misc_features
            if utils.features_overlap(cluster, feature)
            and 'note' in feature.qualifiers
            and any(note.startswith('tta leucine codon')
                    for note in feature.qualifiers['note'])
        ]

        js_cluster = {}
        # Convert the 0-based start to a 1-based coordinate for display
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The top-ranked hit is the entry numbered "1."
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # Expected shape: "<rank>. <BGC id>\t<description>"
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                if best_hit is None:
                    # Keep the "-" placeholders rather than crashing on a
                    # hit line that does not have the expected shape.
                    logging.warning(
                        "Could not parse best knowncluster hit %r" %
                        bestcluster[0])
                else:
                    js_cluster['knowncluster'] = best_hit.group(2)
                    js_cluster['BGCid'] = best_hit.group(1)
                    logging.debug(
                        'Found closest cluster "%s" for cluster no. %s' %
                        (js_cluster['knowncluster'],
                         utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters
Ejemplo n.º 3
0
def write_RiPP(txt, info, options):
    """Write RiPP table to TXT

    Writes a tab-separated header line followed by one line per core peptide
    feature found in the clusters listed in info.clusternrs.

    txt must provide write(); info must provide clusternrs and seq_record;
    options is not used. Values are taken from the 'note' qualifiers of each
    core peptide feature. An IndexError is raised if a peptide lacks one of
    the mandatory notes (class, core sequence, molecular weight, monoisotopic
    mass, number of bridges); the alternative weights note is optional.
    """
    #TXT columns: RiPP ID, annotation, core peptide, mol weight, monoisotopic_mass, alt mol weights, nr bridges
    txt.write("\t".join([
        "RiPP ID", "annotation", "core peptide", "molecular weight",
        "monoisotopic_mass", "alternative molecular weights",
        "number of bridges"
    ]) + "\n")

    def note_values(note_quals, marker, prefix):
        "Return the text after prefix for every note containing marker"
        return [
            qual.partition(prefix)[2] for qual in note_quals if marker in qual
        ]

    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        RiPP_features = _find_core_peptides(cluster_feature, info.seq_record)
        for RiPP in RiPP_features:
            #The ID is the accession of the first CDS overlapping the peptide.
            #It is resolved per peptide so that a peptide without any
            #overlapping CDS gets an empty ID instead of shifting the IDs of
            #all peptides that follow it.
            RiPP_ID = ""
            for cds in cluster_gene_features:
                if utils.features_overlap(cds, RiPP):
                    RiPP_ID = utils.get_gene_acc(cds).partition(".")[0]
                    break
            note_quals = RiPP.qualifiers['note']
            annotation = note_values(note_quals, "predicted class:",
                                     "predicted class: ")[0]
            core_peptide = note_values(note_quals, "predicted core seq:",
                                       "predicted core seq: ")[0]
            mol_weight = note_values(note_quals, "molecular weight: ",
                                     "molecular weight: ")[0]
            monoiso_mass = note_values(note_quals, "monoisotopic mass: ",
                                       "monoisotopic mass: ")[0]
            #Optional note: look inside each note string; testing the note
            #list itself for membership never matches a "key: value" entry.
            alt_weight_notes = note_values(note_quals, "alternative weights:",
                                           "alternative weights: ")
            if alt_weight_notes:
                alt_mol_weights = alt_weight_notes[0].replace(" ", "")
            else:
                alt_mol_weights = ""
            nr_bridges = note_values(note_quals, "number of bridges: ",
                                     "number of bridges: ")[0]
            txt.write("\t".join([
                RiPP_ID, annotation, core_peptide, mol_weight, monoiso_mass,
                alt_mol_weights, nr_bridges
            ]) + "\n")
Ejemplo n.º 4
0
    def test_features_overlap(self):
        "Test utils.features_overlap()"
        # Feature 1 must not be reported as overlapping feature 0,
        # whichever of the two is passed first.
        self.assertFalse(
            utils.features_overlap(self.features[0], self.features[1]))
        self.assertFalse(
            utils.features_overlap(self.features[1], self.features[0]))

        # Features 3, 4 and 5 must each be reported as overlapping
        # feature 0, again independent of the argument order.
        for other_idx in (3, 4, 5):
            self.assertTrue(
                utils.features_overlap(self.features[0],
                                       self.features[other_idx]))
            self.assertTrue(
                utils.features_overlap(self.features[other_idx],
                                       self.features[0]))
Ejemplo n.º 5
0
def find_overlapping_groups(cdsfeatures):
    """Identify groups of genes with overlaps

    For every feature, the first feature in cdsfeatures that it overlaps with
    (according to utils.features_overlap) decides its group: the feature is
    appended to every existing group containing that partner, or, if no such
    group exists yet, a new two-element group is started. A feature without
    any overlap forms a group of its own.

    Returns a list of groups, each group being a list of features.
    """
    overlapping_groups = []
    for cdsfeature in cdsfeatures:
        # NOTE(review): the feature is also compared with itself here; what
        # that yields depends on utils.features_overlap -- confirm whether
        # self-comparison is intended.
        for othercdsfeature in cdsfeatures:
            if not utils.features_overlap(cdsfeature, othercdsfeature):
                continue
            added = False
            for group in overlapping_groups:
                if othercdsfeature in group:
                    group.append(cdsfeature)
                    # Remember the feature found a home, so that no
                    # duplicate group is opened for it below.
                    added = True
            if not added:
                overlapping_groups.append([cdsfeature, othercdsfeature])
            # Only the first overlapping partner is considered
            break
        else:
            # Loop finished without a break: no overlap with any feature
            overlapping_groups.append([cdsfeature])
    return overlapping_groups
Ejemplo n.º 6
0
def write_BGC(txt, info, options):
    """Write BGC table to TXT

    Writes a tab-separated header line followed by one line per cluster
    number in info.clusternrs.

    txt must provide write(); info must provide clusternrs, seq_record,
    clustertypes and accessions (the latter two indexed by cluster number);
    options is not used.
    """
    #TXT columns: BGC ID, BGC_type, detection_rules_used, BGC_range, genes, subclusters,
    # NRPSs_PKSs, signature_genes, RiPPs, pred_structure, monomers
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])
        if 'subclusterblast' in cluster_feature.qualifiers:
            #Keep everything after the first tab of each hit line
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        #TODO The subclusterblast module should probably be changed for the precalcs to provide a list here of the 100% hits instead of all hits
        #Genes carrying at least one NRPS/PKS domain annotation
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features if 'sec_met' in cds.qualifiers
        ])
        #Look the core peptides up once and reuse the result
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if len(core_peptides) != 0:
            #One entry per peptide: the first CDS overlapping it; peptides
            #without an overlapping CDS contribute nothing
            ripp_list = []
            for peptide in core_peptides:
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"
        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)
        #Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")
Ejemplo n.º 7
0
def _first_qualifier_value(qualifiers, prefix, label):
    "Return the text after label in the first qualifier starting with prefix, or an empty string if there is none"
    for qual in qualifiers:
        if qual.startswith(prefix):
            return qual.partition(label)[2]
    return ""


def write_NRPS_PKS(txt, info, options):
    """Write NRPS/PKS table to TXT

    Writes a tab-separated header line followed by one line per aSDomain
    feature of every CDS that carries an 'NRPS/PKS Domain:' entry in its
    'sec_met' qualifier, for all clusters listed in info.clusternrs.

    txt must provide write(); info must provide clusternrs and seq_record;
    options is not used. Prediction columns whose value is not present in the
    domain's 'specificity' qualifier are left empty.
    """
    #TXT columns: Cluster_ID, NRPSPKS_ID, annotation, aSDomain, score, evalue, domain_type, subtype, domain_start, domain_end,
    # KR activity, KR stereochemistry, NRPSPredictor2, Stachelhaus, Minowa, pkssignature, consensus
    txt.write("\t".join([
        "Cluster_ID", "NRPSPKS_ID", "annotation", "aSDomain", "score",
        "evalue", "domain_type", "subtype", "domain_start", "domain_end",
        "KR activity", "KR stereochemistry", "NRPSPredictor2", "Stachelhaus",
        "Minowa", "pkssignature", "consensus"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        cluster_id = "{seq_id}_c{cluster_nr}".format(seq_id=info.seq_record.id,
                                                     cluster_nr=BGCnr)
        NRPSs_PKSs = [
            cds for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ]
        #The domain list is the same for every gene of the cluster, so it is
        #fetched once, and only if there is a gene to report on
        if NRPSs_PKSs:
            cluster_aSDomains = utils.get_cluster_aSDomain_features(
                cluster_feature, info.seq_record)
        else:
            cluster_aSDomains = []
        for cds in NRPSs_PKSs:
            enzyme_ID = utils.get_gene_acc(cds).partition(".")[0]
            #Test and selection use the same criterion, so finding a subtype
            #entry guarantees there is one to read
            subtype_quals = [
                qual for qual in cds.qualifiers['sec_met']
                if "NRPS/PKS subtype: " in qual
            ]
            if subtype_quals:
                enzyme_annotation = subtype_quals[0].partition(
                    "NRPS/PKS subtype: ")[2]
            else:
                logging.warning("No enzyme annotation for %s" % enzyme_ID)
                enzyme_annotation = ""
            gene_id = utils.get_gene_id(cds)
            #Domains of this gene: must overlap it and name it as locus_tag
            #(the qualifier is compared both as a whole and by first element)
            aSDomains = [
                dom for dom in cluster_aSDomains
                if utils.features_overlap(cds, dom) and gene_id in
                [dom.qualifiers['locus_tag'], dom.qualifiers['locus_tag'][0]]
            ]
            for aSDomain in aSDomains:
                domtype = aSDomain.qualifiers['domain'][0]
                if "domain_subtype" in aSDomain.qualifiers:
                    subtype = aSDomain.qualifiers['domain_subtype'][0]
                else:
                    subtype = ""
                aSDomain_ID = aSDomain.qualifiers['asDomain_id'][0]
                score = str(aSDomain.qualifiers['score'][0])
                evalue = str(aSDomain.qualifiers['evalue'][0])
                dom_start = str(aSDomain.location.start)
                dom_end = str(aSDomain.location.end)
                if 'specificity' in aSDomain.qualifiers:
                    specificity = aSDomain.qualifiers['specificity']
                else:
                    specificity = []
                kr_activity = _first_qualifier_value(
                    specificity, "KR activity", "KR activity: ")
                kr_stereochemistry = _first_qualifier_value(
                    specificity, "KR stereochemistry", "KR stereochemistry: ")
                NRPSPredictor2 = _first_qualifier_value(
                    specificity, "NRPSpredictor2", "NRPSpredictor2 SVM: ")
                Stachelhaus = _first_qualifier_value(
                    specificity, "Stachelhaus", "Stachelhaus code: ")
                Minowa = _first_qualifier_value(
                    specificity, "Minowa", "Minowa: ")
                pkssignature = _first_qualifier_value(
                    specificity, "PKS signature", "PKS signature: ")
                consensus = _first_qualifier_value(
                    specificity, "consensus", "consensus: ")

                txt.write("\t".join([
                    cluster_id, enzyme_ID, enzyme_annotation, aSDomain_ID,
                    score, evalue, domtype, subtype, dom_start, dom_end,
                    kr_activity, kr_stereochemistry, NRPSPredictor2,
                    Stachelhaus, Minowa, pkssignature, consensus
                ]) + "\n")
Ejemplo n.º 8
0
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record

    Every ClusterFinder cluster returned by find_cf_clusters() is compared
    with the existing cluster features of seq_record:

    * If it overlaps an existing cluster, that cluster's 'probability'
      qualifier is set to the ClusterFinder probability. With
      options.borderpredict set, the existing cluster keeps its location and
      a "cluster_border" feature is added instead (only when the existing
      cluster's midpoint lies inside the ClusterFinder cluster); the
      ClusterFinder location is first widened so that it contains all
      signature genes of the existing cluster. Without options.borderpredict
      the existing cluster is extended to cover the ClusterFinder cluster.
    * If it overlaps no existing cluster and options.borderpredict_only is
      absent or false, a new "cluster" feature is created with a cf_* product
      type derived from the 'sec_met' qualifiers of the CDS features it
      contains.

    Clusters are renumbered via renumber_clusters() only if new cluster
    features were added. Returns None; seq_record is modified in place.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue

            overlaps = True

            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer regardless of
                # which division semantics are in effect.
                cluster_midpoint = (cluster.location.end +
                                    cluster.location.start) // 2
                if cluster_midpoint in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    # NOTE(review): the probability is stored unformatted
                    # here, while the cluster qualifiers below use "%01.4f"
                    # -- confirm this difference is intended.
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            # Derive the product type of the new cluster from its CDS features
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [
                        feat for feat in CDS.qualifiers['sec_met']
                        if "Type: " in feat
                    ]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)

    # Renumbering is only needed when the set of clusters has changed
    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)