def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    ClusterFinder clusters (as returned by ``find_cf_clusters``) are merged
    into the cluster features already present on ``seq_record``:

    * A ClusterFinder cluster that overlaps an existing cluster changes that
      cluster's location and stores the ClusterFinder probability on it.
      With ``options.borderpredict`` set, the existing cluster takes over the
      ClusterFinder borders when its midpoint lies inside the ClusterFinder
      cluster, widened where needed so that none of its sec_met CDS features
      is cut off. Without it, the existing cluster is extended so that it
      also spans the ClusterFinder cluster.
    * A ClusterFinder cluster that overlaps nothing becomes a new ``cluster``
      feature whose product is derived from the ``sec_met`` "Type: "
      qualifiers of the CDS features it contains.

    Afterwards all clusters are sorted with ``compare_feature_locations`` and
    renumbered starting at ``options.clusternr_offset``; the next free number
    is stored in ``options.next_clusternr``.

    Args:
        seq_record: record whose features are modified in place.
        options: run options; ``borderpredict`` and ``clusternr_offset`` are
            read, ``next_clusternr`` is written.

    Returns:
        None.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    # seq_record.features is only extended after the loops below, so the
    # sec_met CDS list can be fetched once instead of once per cluster pair.
    secmet_cds_features = list(utils.get_secmet_cds_features(seq_record))
    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            if options.borderpredict:
                # Predict gene cluster borders using ClusterFinder.
                # Floor division keeps the midpoint an integer position.
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Collect the signature genes BEFORE the location is
                    # replaced: the lookup depends on the current borders.
                    features_in_cluster = utils.get_cluster_cds_features(
                        cluster, seq_record)
                    cluster_sig_genes = [
                        gene for gene in secmet_cds_features
                        if gene in features_in_cluster
                    ]
                    cluster.location = cf_cluster.location
                    # Widen the new borders so every signature gene that was
                    # in the cluster is still fully inside it.
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(sig_gene.location.start,
                                         sig_gene.location.end)
                        endpoint = max(sig_gene.location.start,
                                       sig_gene.location.end)
                        if cluster.location.start > startpoint:
                            cluster.location = FeatureLocation(
                                startpoint, cluster.location.end)
                        if cluster.location.end < endpoint:
                            cluster.location = FeatureLocation(
                                cluster.location.start, endpoint)
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                # ClusterFinder cluster extends past both ends
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        if not overlaps:
            # Stand-alone ClusterFinder cluster: derive its type from the
            # sec_met "Type: " qualifiers of the CDS features inside it.
            cf_cluster_CDSs = utils.get_cluster_cds_features(cf_cluster,
                                                             seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' not in CDS.qualifiers:
                    continue
                type_sec_met_qualifiers = [
                    feat for feat in CDS.qualifiers['sec_met']
                    if "Type: " in feat
                ]
                for qualifier in type_sec_met_qualifiers:
                    if "cf_fatty_acid" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_fatty_acid"
                        elif cf_type == "cf_saccharide":
                            cf_type = "cf_fatty_acid-saccharide"
                    if "cf_saccharide" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_saccharide"
                        elif cf_type == "cf_fatty_acid":
                            cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)
    seq_record.features.extend(newclusters)
    # Re-number clusters
    clusters = utils.get_cluster_features(seq_record)
    clusters.sort(compare_feature_locations)
    clusternr = options.clusternr_offset
    for cluster in clusters:
        # NOTE: this replaces the whole 'note' qualifier of the cluster.
        cluster.qualifiers['note'] = ["Cluster number: %s" % clusternr]
        clusternr += 1
    options.next_clusternr = clusternr
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON.

    Builds one dict per cluster feature of ``record`` with the keys
    ``start``, ``end``, ``idx``, ``orfs``, ``borders``, ``tta_codons``,
    ``type``, ``knowncluster`` and ``BGCid``; ``probability`` is added when
    the cluster has that qualifier and ``unordered`` when
    ``options.input_type`` is ``'prot'``.

    Args:
        record: sequence record holding the cluster features.
        annotations: passed through to ``convert_cds_features``.
        options: run options; ``input_type`` is read here.

    Returns:
        list of dicts, one per cluster, in the order returned by
        ``utils.get_cluster_features``.
    """
    js_clusters = []
    # The misc_features do not depend on the cluster: fetch them once.
    all_misc_features = list(
        utils.get_all_features_of_type(record, 'misc_feature'))

    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        # TTA codon markers are misc_features inside the cluster that carry
        # a note starting with 'tta leucine codon'.
        tta_codons = []
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue
            if any(note.startswith('tta leucine codon')
                   for note in feature.qualifiers['note']):
                tta_codons.append(feature)

        js_cluster = {}
        # presumably converts the 0-based start to a 1-based coordinate
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True

        # Defaults, used when there is no (parsable) best knowncluster hit
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The top hit is the single entry numbered "1."
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # Expected form: "<rank>. <BGC id><TAB><description>"
                match = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                if match is None:
                    # A malformed entry keeps the "-" defaults instead of
                    # raising AttributeError on match.group().
                    logging.warning(
                        "Could not parse best knowncluster hit %r",
                        bestcluster[0])
                else:
                    js_cluster['knowncluster'] = match.group(2)
                    js_cluster['BGCid'] = match.group(1)
                    logging.debug(
                        'Found closest cluster "%s" for cluster no. %s' %
                        (js_cluster['knowncluster'],
                         utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)
    return js_clusters
def write_RiPP(txt, info, options):
    """Write RiPP table to TXT.

    Writes a header line followed by one tab-separated line per core
    peptide found by ``_find_core_peptides`` in each cluster listed in
    ``info.clusternrs``.

    Args:
        txt: writable text handle (only ``write`` is used).
        info: object providing ``clusternrs`` and ``seq_record``.
        options: unused here; kept for a uniform writer signature.

    Raises:
        IndexError: if a core peptide lacks one of the mandatory notes
            (predicted class, core seq, molecular weight, monoisotopic mass,
            number of bridges).
        KeyError: if a core peptide has no 'note' qualifier.
    """

    def first_value(notes, marker, separator):
        """Return the text after separator in the first note containing marker."""
        return [
            note.partition(separator)[2] for note in notes if marker in note
        ][0]

    # TXT columns: RiPP ID, annotation, core peptide, mol weight,
    # monoisotopic_mass, alt mol weights, nr bridges
    txt.write("\t".join([
        "RiPP ID", "annotation", "core peptide", "molecular weight",
        "monoisotopic_mass", "alternative molecular weights",
        "number of bridges"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        for peptide in _find_core_peptides(cluster_feature, info.seq_record):
            # Resolve the gene ID per peptide rather than via a parallel
            # index, so a peptide without an overlapping CDS cannot shift
            # the IDs of the peptides after it.
            RiPP_ID = ""
            for cds in cluster_gene_features:
                if utils.features_overlap(cds, peptide):
                    RiPP_ID = utils.get_gene_acc(cds).partition(".")[0]
                    break

            note_quals = peptide.qualifiers['note']
            annotation = first_value(note_quals, "predicted class:",
                                     "predicted class: ")
            core_peptide = first_value(note_quals, "predicted core seq:",
                                       "predicted core seq: ")
            mol_weight = first_value(note_quals, "molecular weight: ",
                                     "molecular weight: ")
            monoiso_mass = first_value(note_quals, "monoisotopic mass: ",
                                       "monoisotopic mass: ")
            # Optional note. Search inside each note: a plain
            # `"..." in note_quals` would compare against whole list
            # elements and never match.
            alt_weight_notes = [
                qual for qual in note_quals if "alternative weights:" in qual
            ]
            if alt_weight_notes:
                alt_mol_weights = alt_weight_notes[0].partition(
                    "alternative weights: ")[2].replace(" ", "")
            else:
                alt_mol_weights = ""
            nr_bridges = first_value(note_quals, "number of bridges: ",
                                     "number of bridges: ")
            txt.write("\t".join([
                RiPP_ID, annotation, core_peptide, mol_weight, monoiso_mass,
                alt_mol_weights, nr_bridges
            ]) + "\n")
def test_features_overlap(self):
    "Test utils.features_overlap()"
    anchor = self.features[0]

    # features[1] is expected not to overlap features[0], whichever
    # argument comes first.
    separate = self.features[1]
    self.assertFalse(utils.features_overlap(anchor, separate))
    self.assertFalse(utils.features_overlap(separate, anchor))

    # features[3], [4] and [5] are each expected to overlap features[0],
    # again in both argument orders.
    for index in (3, 4, 5):
        candidate = self.features[index]
        self.assertTrue(utils.features_overlap(anchor, candidate))
        self.assertTrue(utils.features_overlap(candidate, anchor))
def find_overlapping_groups(cdsfeatures):
    """Identify groups of genes with overlaps.

    Each feature is compared against the features of ``cdsfeatures`` in
    order, using ``utils.features_overlap``, until the first overlap is
    found; features without any overlap become a single-element group.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the upstream file
    before relying on the exact grouping.

    Args:
        cdsfeatures: iterable of features accepted by
            ``utils.features_overlap``; it is iterated once per element.

    Returns:
        list of lists of features.
    """
    #Identify groups of genes with overlaps
    overlapping_groups = []
    for cdsfeature in cdsfeatures:
        overlaps = False
        for othercdsfeature in cdsfeatures:
            if utils.features_overlap(cdsfeature, othercdsfeature):
                overlaps = True
                added = False
                # Add the feature to every existing group that already holds
                # its overlap partner, rebuilding the group list on the way.
                overlapping_groups2 = []
                for group in overlapping_groups:
                    if othercdsfeature in group:
                        group.append(cdsfeature)
                    overlapping_groups2.append(group)
                overlapping_groups = overlapping_groups2
                # NOTE(review): `added` is not set between its
                # initialisation above and this test, so the pair is always
                # appended as a new group, even when an existing group was
                # just extended. Looks unintended; confirm.
                if not added:
                    overlapping_groups.append([cdsfeature, othercdsfeature])
                    added = True
                # Only the first overlapping partner is considered.
                break
        if not overlaps:
            overlapping_groups.append([cdsfeature])
    return overlapping_groups
def write_BGC(txt, info, options):
    """Write BGC table to TXT.

    Writes a header line followed by one tab-separated line per cluster
    number in ``info.clusternrs``.

    Args:
        txt: writable text handle (only ``write`` is used).
        info: object providing ``clusternrs``, ``seq_record``,
            ``clustertypes`` and ``accessions`` (the latter two indexed by
            cluster number).
        options: unused here; kept for a uniform writer signature.
    """
    # TXT columns: BGC ID, BGC_type, detection_rules_used, BGC_range, genes,
    # subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
    # monomers
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])

        if 'subclusterblast' in cluster_feature.qualifiers:
            # Keep only the part after the first tab of each hit
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        # TODO The subclusterblast module should probably be changed for the
        # precalcs to provide a list here of the 100% hits instead of all hits

        # Genes with at least one 'NRPS/PKS Domain:' sec_met qualifier
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features if 'sec_met' in cds.qualifiers
        ])

        # Look the core peptides up once; each one is reported through the
        # first cluster gene it overlaps.
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if core_peptides:
            ripp_list = []
            for peptide in core_peptides:
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"

        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)

        # Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")
def write_NRPS_PKS(txt, info, options):
    """Write NRPS/PKS table to TXT.

    Writes a header line followed by one tab-separated line per aSDomain
    feature of every NRPS/PKS gene (a CDS with at least one
    'NRPS/PKS Domain:' sec_met qualifier) in each cluster listed in
    ``info.clusternrs``.

    Args:
        txt: writable text handle (only ``write`` is used).
        info: object providing ``clusternrs`` and ``seq_record``.
        options: unused here; kept for a uniform writer signature.
    """
    # TXT columns: cluster ID, NRPS/PKS ID, annotation, aSDomain, score,
    # evalue, domain type, subtype, domain start, domain end, KR activity,
    # KR stereochemistry, NRPSPredictor2, Stachelhaus, Minowa, pkssignature,
    # consensus
    txt.write("\t".join([
        "Cluster_ID", "NRPSPKS_ID", "annotation", "aSDomain", "score",
        "evalue", "domain_type", "subtype", "domain_start", "domain_end",
        "KR activity", "KR stereochemistry", "NRPSPredictor2", "Stachelhaus",
        "Minowa", "pkssignature", "consensus"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        # Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        cluster_id = "{seq_id}_c{cluster_nr}".format(
            seq_id=info.seq_record.id, cluster_nr=BGCnr)
        NRPSs_PKSs = [
            cds for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ]
        if not NRPSs_PKSs:
            continue
        # The cluster's aSDomain features are the same for every gene in
        # it, so fetch them once per cluster rather than once per CDS.
        cluster_domains = list(
            utils.get_cluster_aSDomain_features(cluster_feature,
                                                info.seq_record))
        for cds in NRPSs_PKSs:
            enzyme_ID = utils.get_gene_acc(cds).partition(".")[0]
            # Select with the same predicate that is used for the check, so
            # a hit can always be split on the marker.
            subtype_quals = [
                qual for qual in cds.qualifiers['sec_met']
                if "NRPS/PKS subtype: " in qual
            ]
            if subtype_quals:
                enzyme_annotation = subtype_quals[0].partition(
                    "NRPS/PKS subtype: ")[2]
            else:
                logging.warning("No enzyme annotation for %s", enzyme_ID)
                enzyme_annotation = ""
            # Domains belonging to this gene: overlapping and carrying its
            # gene ID as locus_tag (either as the qualifier value itself or
            # as its first element).
            gene_id = utils.get_gene_id(cds)
            aSDomains = [
                dom for dom in cluster_domains
                if utils.features_overlap(cds, dom) and gene_id in
                [dom.qualifiers['locus_tag'], dom.qualifiers['locus_tag'][0]]
            ]
            for aSDomain in aSDomains:
                domtype = aSDomain.qualifiers['domain'][0]
                if "domain_subtype" in aSDomain.qualifiers:
                    subtype = aSDomain.qualifiers['domain_subtype'][0]
                else:
                    subtype = ""
                aSDomain_ID = aSDomain.qualifiers['asDomain_id'][0]
                score = str(aSDomain.qualifiers['score'][0])
                evalue = str(aSDomain.qualifiers['evalue'][0])
                dom_start = str(aSDomain.location.start)
                dom_end = str(aSDomain.location.end)

                # Every prediction column stays "" when the domain has no
                # 'specificity' qualifier or no entry with that prefix.
                if 'specificity' in aSDomain.qualifiers:
                    specificity = aSDomain.qualifiers['specificity']
                else:
                    specificity = []
                kr_activity = _first_prefixed_value(
                    specificity, "KR activity", "KR activity: ")
                kr_stereochemistry = _first_prefixed_value(
                    specificity, "KR stereochemistry", "KR stereochemistry: ")
                NRPSPredictor2 = _first_prefixed_value(
                    specificity, "NRPSpredictor2", "NRPSpredictor2 SVM: ")
                Stachelhaus = _first_prefixed_value(
                    specificity, "Stachelhaus", "Stachelhaus code: ")
                Minowa = _first_prefixed_value(
                    specificity, "Minowa", "Minowa: ")
                pkssignature = _first_prefixed_value(
                    specificity, "PKS signature", "PKS signature: ")
                consensus = _first_prefixed_value(
                    specificity, "consensus", "consensus: ")

                txt.write("\t".join([
                    cluster_id, enzyme_ID, enzyme_annotation, aSDomain_ID,
                    score, evalue, domtype, subtype, dom_start, dom_end,
                    kr_activity, kr_stereochemistry, NRPSPredictor2,
                    Stachelhaus, Minowa, pkssignature, consensus
                ]) + "\n")


def _first_prefixed_value(qualifiers, prefix, separator):
    """Return the text after separator in the first entry starting with prefix.

    Returns "" when no entry of ``qualifiers`` starts with ``prefix``.
    """
    for qual in qualifiers:
        if qual.startswith(prefix):
            return qual.partition(separator)[2]
    return ""
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    ClusterFinder clusters (from ``find_cf_clusters``) are combined with the
    cluster features already present on ``seq_record``:

    * For each existing cluster overlapping a ClusterFinder cluster, the
      ClusterFinder probability is stored on the existing cluster. With
      ``options.borderpredict`` set and the existing cluster's midpoint
      inside the ClusterFinder cluster, a ``cluster_border`` feature is
      appended to the record (the existing cluster's location is left
      untouched in that branch). Without ``options.borderpredict`` the
      existing cluster is extended to also span the ClusterFinder cluster.
    * A ClusterFinder cluster overlapping no existing cluster becomes a new
      ``cluster`` feature, unless ``options.borderpredict_only`` is present
      and true.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source. In particular, ``renumber_clusters`` is placed inside the final
    ``if len(newclusters)`` block; confirm against the upstream file.

    Args:
        seq_record: record whose features are modified in place.
        options: run options; ``borderpredict`` and (optionally)
            ``borderpredict_only`` are read here.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]
            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Only act when the midpoint of the existing cluster lies
                # inside the ClusterFinder cluster.
                if ((cluster.location.end + cluster.location.start) / 2) in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still included in the cluster
                    # (this widens cf_cluster.location itself, so the change
                    # is visible to later iterations using this cf_cluster)
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    # Record the predicted borders as a separate feature.
                    # The probability is stored unformatted here, unlike the
                    # "%01.4f" strings used on cluster features below.
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            # Without border prediction: extend the existing cluster so it
            # also covers the ClusterFinder cluster.
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        # Stand-alone ClusterFinder cluster: add it as a new cluster unless
        # only border prediction was requested.
        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            # Derive the cluster type from the sec_met "Type: " qualifiers
            # of the CDS features inside the ClusterFinder cluster.
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [
                        feat for feat in CDS.qualifiers['sec_met']
                        if "Type: " in feat
                    ]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)
    # Renumbering is only needed when clusters were added.
    if len(newclusters):
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)