def test_get_cluster_cds_features(self):
    "Test utils.get_cluster_cds_features()"
    # The fixture record must yield exactly two clusters (tuple unpacking),
    # matching the first and last entries of self.features.
    first_cluster, last_cluster = utils.get_cluster_features(self.record)
    self.assertEqual(self.features[0], first_cluster)
    self.assertEqual(self.features[-1], last_cluster)

    # Pair each cluster with the window of fixture features it should return.
    expected_windows = [
        (first_cluster, slice(3, 6)),
        (last_cluster, slice(-3, -1)),
    ]
    for cluster, window in expected_windows:
        found = utils.get_cluster_cds_features(cluster, self.record)
        self.assertEqual(self.features[window], found)
def write(seq_records, options):
    """Export antiSMASH cluster information as TXT tables.

    For every record with at least one cluster feature, one file per table
    name is created in the ``txt`` subdirectory of
    ``options.full_outputfolder_path`` (named ``<record id up to the first
    dot>_<table>.txt``) and filled by the matching
    ``write_tables.write_<table>`` function.

    Args:
        seq_records: iterable of records to export
        options: run options; ``input_type`` and ``full_outputfolder_path``
            are read here, and the object is handed on to the table writers

    Returns:
        None. Nothing is written when ``options.input_type`` is ``'prot'``.
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    #Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    #Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    #Define table names
    tables = ("genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS",
              "smCOG", "RiPP", "transltable")
    #For each gene cluster, write out info to TXT files
    for seq_record in seq_records:
        # Looked up once and reused for both the emptiness check and the loop.
        clusters = utils.get_cluster_features(seq_record)
        if not len(clusters) > 0:
            continue

        #Gather all information
        info = utils.Storage()
        info.clustertypes, info.clustergenes, info.accessions = {}, {}, {}
        info.cdsmotifs, info.clusternrs = {}, []
        for cluster in clusters:
            clusternr = utils.get_cluster_number(cluster)
            info.clusternrs.append(clusternr)
            info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
            # One lookup feeds both the gene ID and the accession list.
            cluster_cds = list(
                utils.get_cluster_cds_features(cluster, seq_record))
            info.clustergenes[clusternr] = [
                utils.get_gene_id(cds) for cds in cluster_cds
            ]
            info.accessions[clusternr] = [
                utils.get_gene_acc(cds) for cds in cluster_cds
            ]
            info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                seq_record, ["CDS_motif"])
        info.seq_record = seq_record

        record_prefix = seq_record.id.partition(".")[0]
        txt_files = {}
        try:
            #Open up TXT files
            for table in tables:
                txt_files[table] = open(
                    path.join(txt_outfolder,
                              "%s_%s.txt" % (record_prefix, table)), "w")
            #Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table],
                                                        info, options)
        finally:
            # Close whatever was opened, even if a writer or open() failed.
            for handle in txt_files.values():
                handle.close()
def write(seq_records, options):
    """Write the proteins of all gene clusters to a single FASTA file.

    The file is named after the ID of the first record and placed in
    ``options.outputfoldername``.

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    fasta_name = "%s_genecluster_proteins.fa" % seq_records[0].id
    output_name = path.join(options.outputfoldername, fasta_name)
    logging.debug("Writing seq_records to %r" % output_name)

    def iter_cluster_proteins():
        # Yield (cluster type, cluster number, CDS qualifiers) in file order.
        for record in seq_records:
            for cluster in utils.get_cluster_features(record):
                kind = utils.get_cluster_type(cluster)
                number = utils.get_cluster_number(cluster)
                for cds in utils.get_cluster_cds_features(cluster, record):
                    yield kind, number, cds.qualifiers

    with open(output_name, 'w+') as handle:
        for kind, number, qual in iter_cluster_proteins():
            header_fields = (qual['locus_tag'][0], qual['protein_id'][0],
                             kind, number, qual['product'][0])
            handle.write('>%s:%s %s #%s - %s\n' % header_fields)
            # Sequence is wrapped at 60 characters per line.
            wrapped = textwrap.wrap(qual['translation'][0], 60)
            handle.write('%s\n' % '\n'.join(wrapped))
def store_percentage_identities(seq_record):
    """Store pairwise percentage identities on the sec_met CDS features.

    For each cluster, the CDS features carrying a ``sec_met`` qualifier are
    grouped with ``utils.get_cdhit_table``. For every group with more than
    one gene, each member gets one ``"Percentage identity: ..."`` entry in
    its ``sec_met`` qualifier, listing ``gene=value`` pairs taken from
    ``utils.get_pct_identity_table``.

    Any existing "Percentage identity" entry is replaced. The original code
    used ``del ann`` on the loop variable, which removes nothing from the
    list, so repeated calls piled up duplicate entries.

    Args:
        seq_record: record whose CDS qualifiers are updated in place
    """
    clusters = utils.get_cluster_features(seq_record)
    cfg = config.get_config()
    for cluster in clusters:
        features = [
            feature
            for feature in utils.get_cluster_cds_features(cluster, seq_record)
            if 'sec_met' in feature.qualifiers
        ]
        # The second return value (gene -> cluster mapping) is not needed.
        cdhit_table, _ = utils.get_cdhit_table(
            features, float(cfg.cdh_display_cutoff))
        for cdhit_cluster in cdhit_table:
            if len(cdhit_cluster["genes"]) <= 1:
                continue
            cl_features = [
                feature for feature in features
                if utils.get_gene_id(feature) in cdhit_cluster["genes"]
            ]
            pct_table = utils.get_pct_identity_table(cl_features)
            for cds in cl_features:
                identities = pct_table[utils.get_gene_id(cds)]
                result = ",".join([
                    "%s=%s" % (othercds, identities[othercds])
                    for othercds in identities.keys()
                ])
                sec_met = cds.qualifiers['sec_met']
                # Slice assignment drops stale entries while keeping the
                # same list object referenced by the qualifiers dict.
                sec_met[:] = [
                    ann for ann in sec_met
                    if not ann.startswith("Percentage identity")
                ]
                sec_met.append("Percentage identity: %s" % (result))
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    ClusterFinder clusters derived from the record's Pfam features are
    merged with the cluster features already on the record:

    * an existing cluster that overlaps a ClusterFinder cluster has its
      location adjusted and receives a ``probability`` qualifier;
    * a ClusterFinder cluster that overlaps no existing cluster is added as
      a new ``cluster`` feature whose ``product`` is derived from the
      "Type: " entries of the ``sec_met`` qualifiers of its CDS features.

    Finally all clusters are sorted with ``compare_feature_locations`` and
    renumbered, starting at ``options.clusternr_offset``.

    Args:
        seq_record: record whose features are modified in place
        options: run options; ``borderpredict`` and ``clusternr_offset`` are
            read, ``next_clusternr`` is set to the first unused number
    """
    # Function-scope import: only the final sort needs it.
    import functools

    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    #Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            if options.borderpredict:
                #Predict gene cluster borders using ClusterFinder
                # Floor division matches Python 2 integer "/" and keeps the
                # midpoint integral on Python 3.
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Signature genes are gathered only on this path, where
                    # they are used, and before the location is replaced so
                    # they reflect the cluster's current extent.
                    cluster_cds = list(
                        utils.get_cluster_cds_features(cluster, seq_record))
                    cluster_sig_genes = [
                        gene
                        for gene in utils.get_secmet_cds_features(seq_record)
                        if gene in cluster_cds
                    ]
                    cluster.location = cf_cluster.location
                    # Widen the new borders so no signature gene is cut off.
                    for sig_gene in cluster_sig_genes:
                        startpoint = min([sig_gene.location.start,
                                          sig_gene.location.end])
                        endpoint = max([sig_gene.location.start,
                                        sig_gene.location.end])
                        if cluster.location.start > startpoint:
                            cluster.location = FeatureLocation(
                                startpoint, cluster.location.end)
                        if cluster.location.end < endpoint:
                            cluster.location = FeatureLocation(
                                cluster.location.start, endpoint)
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        if not overlaps:
            cf_cluster_CDSs = utils.get_cluster_cds_features(cf_cluster,
                                                             seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' not in CDS.qualifiers:
                    continue
                for qualifier in CDS.qualifiers['sec_met']:
                    if "Type: " not in qualifier:
                        continue
                    # Seeing both types (in any order) gives the combined
                    # product name.
                    if "cf_fatty_acid" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_fatty_acid"
                        elif cf_type == "cf_saccharide":
                            cf_type = "cf_fatty_acid-saccharide"
                    if "cf_saccharide" in qualifier:
                        if cf_type == "cf_putative":
                            cf_type = "cf_saccharide"
                        elif cf_type == "cf_fatty_acid":
                            cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)
    seq_record.features.extend(newclusters)
    #Re-number clusters
    clusters = utils.get_cluster_features(seq_record)
    # cmp_to_key works on Python 2.7 and 3; a positional cmp function is
    # Python-2-only.
    clusters.sort(key=functools.cmp_to_key(compare_feature_locations))
    clusternr = options.clusternr_offset
    for cluster in clusters:
        cluster.qualifiers['note'] = ["Cluster number: %s" % clusternr]
        clusternr += 1
    options.next_clusternr = clusternr
def write_gene(txt, info, options):
    "Write gene table to TXT"
    #TXT columns: gene ID, gene start, gene end, gene strand, smCOG, locus_tag/geneID, annotation
    column_names = [
        "gene ID", "gene start", "gene end", "gene strand", "smCOG",
        "locus_tag", "annotation"
    ]
    txt.write("\t".join(column_names) + "\n")
    for BGCnr in info.clusternrs:
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        # One output row per CDS of the cluster.
        for cds in utils.get_cluster_cds_features(cluster_feature,
                                                  info.seq_record):
            row = [
                utils.get_gene_acc(cds).partition(".")[0],
                str(cds.location.start),
                str(cds.location.end),
                # Anything other than strand 1 is reported as "-".
                "+" if cds.strand == 1 else "-",
                "",  # smCOG column, not used for now
                utils.get_gene_id(cds).partition(".")[0],
                utils.get_gene_annotation(cds),
            ]
            txt.write("\t".join(row) + "\n")
def generate_details_div(cluster, seq_record, options, js_domains,
                         details=None):
    """Generate the details div for one cluster.

    Collects the NRPS/PKS domain annotations of the cluster's ``sec_met``
    CDS features. If any ORF has at least one parsed domain, an SVG
    placeholder div is appended to ``details`` and the collected data is
    appended to ``js_domains``.

    Args:
        cluster: dict describing the cluster; only ``cluster['idx']`` is read
        seq_record: record containing the cluster
        options: unused in this function
        js_domains: list that receives this cluster's domain data
        details: existing details element to extend; a new one with a
            'Detailed annotation' header is created when None

    Returns:
        The details element, or the ``details`` argument unchanged when no
        cluster with that number exists in the record.
    """
    cluster_rec = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if cluster_rec is None:
        return details

    if details is None:
        details = pq('<div>')
        details.addClass('details')
        header = pq('<h3>')
        header.text('Detailed annotation')
        details.append(header)

    orfs = []
    js_cluster_domains = {
        'id': "cluster-%s-details" % cluster['idx'],
        'orfs': orfs
    }
    for feature in utils.get_cluster_cds_features(cluster_rec, seq_record):
        qualifiers = feature.qualifiers
        if 'sec_met' not in qualifiers:
            continue

        # Use the stored translation when present, else fall back to the helper.
        if 'translation' in qualifiers:
            sequence = qualifiers['translation'][0]
        else:
            sequence = str(utils.get_aa_sequence(feature))
        orf_id = utils.get_gene_id(feature)

        domains = []
        for qual in qualifiers['sec_met']:
            if qual.startswith('NRPS/PKS Domain:'):
                js_domain = _parse_domain(qual, feature, seq_record)
                if len(js_domain) > 0:
                    domains.append(js_domain)

        # ORFs without any parsed domain are left out.
        if domains:
            orfs.append({
                'id': orf_id,
                'sequence': sequence,
                'domains': domains,
            })

    if orfs:
        details_svg = pq('<div>')
        details_svg.addClass('details-svg')
        details_svg.attr('id', '%s-svg' % js_cluster_domains['id'])
        details.append(details_svg)
        js_domains.append(js_cluster_domains)

    return details
def write_RiPP(txt, info, options):
    """Write RiPP table to TXT.

    One row is written per core peptide feature found by
    ``_find_core_peptides`` in each cluster listed in ``info.clusternrs``.
    Values are parsed from the peptide's ``note`` qualifier.

    Two defects of the earlier version are fixed here:

    * the "alternative weights" column was always empty, because the check
      tested list membership of the bare string instead of looking for a
      note containing it;
    * gene IDs were kept in a separate list indexed in parallel with the
      peptides, which shifted IDs (and eventually raised IndexError) when a
      peptide overlapped no CDS. Each peptide now looks up its own ID and
      gets an empty ID when no CDS overlaps.

    Args:
        txt: writable file-like object
        info: storage object; ``clusternrs`` and ``seq_record`` are read
        options: unused in this function

    Raises:
        IndexError: if a peptide lacks one of the mandatory notes
            (predicted class, core sequence, molecular weight, monoisotopic
            mass, number of bridges).
    """
    #TXT columns: RiPP ID, annotation, core peptide, mol weight, monoisotopic_mass, alt mol weights, nr bridges
    txt.write("\t".join([
        "RiPP ID", "annotation", "core peptide", "molecular weight",
        "monoisotopic_mass", "alternative molecular weights",
        "number of bridges"
    ]) + "\n")

    def note_value(note_quals, marker, prefix):
        # Text after ``prefix`` in the first note that contains ``marker``.
        return [
            qual.partition(prefix)[2] for qual in note_quals if marker in qual
        ][0]

    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        for RiPP in _find_core_peptides(cluster_feature, info.seq_record):
            # ID of the first CDS overlapping this peptide, if any.
            RiPP_ID = ""
            for cds in cluster_gene_features:
                if utils.features_overlap(cds, RiPP):
                    RiPP_ID = utils.get_gene_acc(cds).partition(".")[0]
                    break

            note_quals = RiPP.qualifiers['note']
            annotation = note_value(note_quals, "predicted class:",
                                    "predicted class: ")
            core_peptide = note_value(note_quals, "predicted core seq:",
                                      "predicted core seq: ")
            mol_weight = note_value(note_quals, "molecular weight: ",
                                    "molecular weight: ")
            monoiso_mass = note_value(note_quals, "monoisotopic mass: ",
                                      "monoisotopic mass: ")
            # Optional note: search the note texts, not the list itself.
            alt_quals = [
                qual for qual in note_quals if "alternative weights:" in qual
            ]
            if alt_quals:
                alt_mol_weights = alt_quals[0].partition(
                    "alternative weights: ")[2].replace(" ", "")
            else:
                alt_mol_weights = ""
            nr_bridges = note_value(note_quals, "number of bridges: ",
                                    "number of bridges: ")
            txt.write("\t".join([
                RiPP_ID, annotation, core_peptide, mol_weight, monoiso_mass,
                alt_mol_weights, nr_bridges
            ]) + "\n")
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON.

    Each cluster becomes a dict with the keys ``start`` (1-based), ``end``,
    ``idx``, ``orfs``, ``borders``, ``tta_codons``, ``type``,
    ``knowncluster`` and ``BGCid``. ``probability`` is added when the
    cluster has that qualifier, and ``unordered`` for protein input.

    ``knowncluster``/``BGCid`` stay ``"-"`` unless the cluster's
    ``knownclusterblast`` qualifier has exactly one entry starting with
    ``'1.'`` that matches ``<rank>. <BGC id><TAB><description>``. Any other
    case is logged as a warning, including an entry that does not match the
    pattern, which previously raised AttributeError.

    Args:
        record: record holding the cluster features
        annotations: passed through to ``convert_cds_features``
        options: run options; ``input_type`` is read here

    Returns:
        list of dicts, one per cluster feature of the record
    """
    js_clusters = []
    # Does not depend on the cluster, so fetch it once for all clusters.
    all_misc_features = list(
        utils.get_all_features_of_type(record, 'misc_feature'))
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        # misc_features overlapping the cluster with a TTA codon note.
        tta_codons = []
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue
            for note in feature.qualifiers['note']:
                if note.startswith('tta leucine codon'):
                    tta_codons.append(feature)
                    break

        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            best_hit = None
            if len(bestcluster) == 1:
                # Raw string: same pattern, without invalid string escapes.
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
            if best_hit is None:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                js_cluster['knowncluster'] = best_hit.group(2)
                js_cluster['BGCid'] = best_hit.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s' %
                              (js_cluster['knowncluster'],
                               utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)
    return js_clusters
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene display data for one gene cluster.

    Args:
        seq_record: record containing the cluster
        smcogdict: mapping of gene ID to a sequence whose first element is
            compared against the two COG lists
        gtrcoglist: collection of COG entries marking a gene as "gtr"
        transportercoglist: collection of COG entries marking a transporter
        geneclusternr: number of the cluster to describe

    Returns:
        tuple: ``(clustergenes, clustertype, annotations, colors, starts,
        ends, strands, pksnrpsprots, gtrs, transporters, clustersize)``.
        The per-gene lists are in ``clustergenes`` order; ``annotations``
        maps gene ID to product (or ``'Unannotated gene'``);
        ``clustersize`` is ``max(ends) - min(starts)``.

    Raises:
        ValueError: if the cluster has no CDS features (``max``/``min`` of
            an empty list).
    """
    # Sets give O(1) membership tests in the per-gene loop below.
    allcoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record))
    pksnrpscoregenes = set(
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record))
    feature_by_id = utils.get_feature_dict(seq_record)
    # Look the cluster up once; it is needed for both genes and type.
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(cluster, seq_record)
    ]
    clustertype = utils.get_cluster_type(cluster)
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for j in clustergenes:
        cdsfeature = feature_by_id[j]
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if 'product' in cdsfeature.qualifiers:
            annotations[j] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[j] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        strands.append("-" if cdsfeature.strand == -1 else "+")
        colors.append("#810E15" if j in allcoregenes else "grey")
        if j in pksnrpscoregenes:
            pksnrpsprots.append(j)
        if j in smcogdict and len(smcogdict[j]) > 0:
            # Only the first smCOG entry of a gene is considered.
            first_cog = smcogdict[j][0]
            if first_cog in gtrcoglist:
                gtrs.append(j)
            if first_cog in transportercoglist:
                transporters.append(j)
    clustersize = max(ends) - min(starts)
    return (clustergenes, clustertype, annotations, colors, starts, ends,
            strands, pksnrpsprots, gtrs, transporters, clustersize)
def create_blast_inputs(genecluster, seq_record):
    """Build the BLAST query names and sequences for one gene cluster.

    For the "plants" taxon the cluster's CDS features are first passed
    through ``filter_overlap``.

    Returns:
        tuple of three parallel lists: the ``|``-separated query names
        (``input|c<nr>|<start>-<end>|<strand>|<accession>|<annotation>``),
        the amino acid sequences, and the gene accessions.
    """
    options = config.get_config()
    #Create input fasta files for BLAST search
    cluster_cds = utils.get_cluster_cds_features(genecluster, seq_record)
    if options.taxon == "plants":
        cluster_cds = filter_overlap(cluster_cds)

    def plain(position):
        # Position as text with any ">" / "<" markers removed.
        return str(position).replace(">", "").replace("<", "")

    queryclusternames = []
    queryclusterseqs = []
    queryclusterprotsnames = []
    for cds in cluster_cds:
        strand = "+" if cds.strand == 1 else "-"
        name_parts = [
            "input",
            "c" + str(utils.get_cluster_number(genecluster)),
            plain(cds.location.start) + "-" + plain(cds.location.end),
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ]
        fullname = "|".join(name_parts)
        queryclusterseqs.append(str(utils.get_aa_sequence(cds)))
        queryclusternames.append(fullname)
        queryclusterprotsnames.append(utils.get_gene_acc(cds))
    return queryclusternames, queryclusterseqs, queryclusterprotsnames
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON.

    Each cluster becomes a dict with the keys ``start`` (1-based), ``end``,
    ``idx``, ``orfs``, ``type``, ``knowncluster``, ``BGCid`` and
    ``domains``. Optional keys: ``geo`` (when ``options.coexpress``),
    ``probability`` (when the cluster has that qualifier), ``unordered``
    (protein input) and ``cdhitclusters`` (when ``options.enable_cdhit``).

    ``knowncluster``/``BGCid`` stay ``"-"`` unless the cluster's
    ``knownclusterblast`` qualifier has exactly one entry starting with
    ``'1.'`` that matches ``<rank>. <BGC id><TAB><description>``. Any other
    case is logged as a warning, including an entry that does not match the
    pattern, which previously raised AttributeError.

    Args:
        record: record holding the cluster features
        annotations: passed through to ``convert_cds_features``
        options: run options; ``coexpress``, ``input_type`` and
            ``enable_cdhit`` are read here

    Returns:
        list of dicts, one per cluster feature of the record
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)
        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            best_hit = None
            if len(bestcluster) == 1:
                # Raw string: same pattern, without invalid string escapes.
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
            if best_hit is None:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                js_cluster['knowncluster'] = best_hit.group(2)
                js_cluster['BGCid'] = best_hit.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s' %
                              (js_cluster['knowncluster'],
                               utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)
    return js_clusters
def create_blast_inputs(genecluster, seq_record):
    """Build the BLAST query names and sequences for one gene cluster.

    Returns:
        tuple of three parallel lists: the ``|``-separated query names
        (``input|c<nr>|<start>-<end>|<strand>|<accession>|<annotation>``),
        the amino acid sequences, and the gene accessions.
    """
    #Create input fasta files for BLAST search
    names, sequences, accessions = [], [], []
    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        # Anything other than strand 1 is reported as "-".
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = "%s-%s" % (cds.location.nofuzzy_start,
                          cds.location.nofuzzy_end)
        fields = [
            "input", cluster_tag, span, strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds)
        ]
        fullname = "|".join(fields)
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(fullname)
        accessions.append(utils.get_gene_acc(cds))
    return names, sequences, accessions
def write(seq_records, options):
    """Write all cluster proteins to a FASTA file.

    The output file is ``<id of first record>_genecluster_proteins.fa``
    inside ``options.outputfoldername``.

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    first_id = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % first_id)
    logging.debug("Writing seq_records to %r", output_name)

    header_template = '>%s:%s %s #%s - %s\n'
    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(cluster,
                                                             seq_record)
                for feature in cluster_cds:
                    qual = feature.qualifiers
                    header_fields = (qual['locus_tag'][0],
                                     qual['protein_id'][0], clustertype,
                                     clusternr, qual['product'][0])
                    handle.write(header_template % header_fields)
                    # Protein sequence, wrapped at 60 characters per line.
                    lines = textwrap.wrap(qual['translation'][0], 60)
                    handle.write('\n'.join(lines) + '\n')
def write_signature_gene_info(txt, info, options):
    "Write signature gene table to TXT"
    #TXT columns: signature_gene, pHMM_hit, e-value, bit score, nr of seeds
    column_names = [
        "signature gene", "pHMM hits", "e-value", "bit score",
        "number of seeds"
    ]
    txt.write("\t".join(column_names) + "\n")
    marker = 'Domains detected: '
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        for cds in utils.get_cluster_cds_features(cluster_feature,
                                                  info.seq_record):
            # Signature genes are the CDS features carrying sec_met notes.
            if 'sec_met' not in cds.qualifiers:
                continue
            detected = [
                qual for qual in cds.qualifiers['sec_met']
                if qual.startswith(marker)
            ]
            if not detected:
                continue
            gene_ID = utils.get_gene_acc(cds).partition(".")[0]
            # Only the first matching note is used; domains are separated by
            # ";" (a note without ";" yields a single domain).
            for domain in detected[0].partition(marker)[2].split(";"):
                row = [
                    gene_ID,
                    domain.partition(" (")[0].replace(" ", ""),
                    domain.partition("E-value: ")[2].partition(",")[0],
                    domain.partition("bitscore: ")[2].partition(",")[0],
                    domain.partition("seeds: ")[2].partition(")")[0],
                ]
                txt.write("\t".join(row) + "\n")
def test_find_clusters(self):
    # Annotate every gene except GENE_X, alternating between two cluster
    # types, then check which genes end up in which cluster.
    nseqdict = {"Metabolite0": "?", "Metabolite1": "?"}
    self.config.next_clusternr = 1

    annotated = 0
    for gene_id in self.feature_by_id:
        if gene_id == "GENE_X":
            continue
        clustertype = "Metabolite%d" % (annotated % 2)
        hmm_detection._update_sec_met_entry(self.feature_by_id[gene_id],
                                            self.results_by_id[gene_id],
                                            clustertype, nseqdict)
        annotated += 1

    hmm_detection.find_clusters(self.record, self.rulesdict)

    # Gene IDs per cluster, sorted so the order within a cluster is irrelevant.
    result_clusters = []
    for feature in utils.get_cluster_features(self.record):
        gene_ids = [
            utils.get_gene_id(f)
            for f in utils.get_cluster_cds_features(feature, self.record)
        ]
        gene_ids.sort()
        result_clusters.append(gene_ids)

    expected_clusters = [["GENE_1", "GENE_2"], ["GENE_3"],
                         ["GENE_4", "GENE_5"]]
    self.assertEqual(result_clusters,
                     expected_clusters,
                     msg="\nResult : %s\nExpected : %s" %
                     (result_clusters, expected_clusters))
def run_coexpress(seq_record, all_gene_expressions, geo):
    """Run the CoExpress analysis on every cluster of a record.

    For each cluster, the genes that have an entry in
    ``all_gene_expressions[seq_record.id]`` get pairwise correlation values
    (``"cor"``) and distances (``"dist"``, ``100 * (1 - correlation)``).
    Remote genes from any record are then added to ``"dist"`` when they are
    listed in ``options.hmm_results`` and correlate with a cluster gene at
    0.9 or above (but below 1.0 after capping). Remote genes with fewer
    than two edges, or without an edge to a cluster gene that is itself in
    ``options.hmm_results``, are dropped again. The result is handed to
    ``update_features``.

    The per-gene dicts in ``cluster_genes`` are the same objects stored in
    ``all_gene_expressions``, so the ``"cor"`` and ``"dist"`` keys are
    written into those shared entries.

    Args:
        seq_record: record whose clusters are analysed
        all_gene_expressions: mapping of record ID to a mapping of gene ID
            to an expression dict (an ``"exp"`` key is checked for here)
        geo: dict; ``geo["info"]["id"]`` is used for logging and passed on

    Returns:
        None.
    """
    options = get_config()
    cl_count = 1
    cl_list = utils.get_cluster_features(seq_record)
    gene_expressions = all_gene_expressions[seq_record.id]
    logging.info('Running CoExpress analysis on the clusters..')
    for cluster in cl_list:
        logging.debug(
            'Running CoExpress analysis on record "%s".. (Cluster %s of %s)' %
            (geo["info"]["id"], cl_count, len(cl_list)))
        features = utils.get_cluster_cds_features(cluster, seq_record)
        cl_count += 1
        # Only genes with expression data take part in the analysis.
        cluster_genes = {}
        for feature in features:
            gene_id = utils.get_gene_id(feature)
            if gene_id in gene_expressions:
                cluster_genes[gene_id] = gene_expressions[gene_id]
        #calculate correlation value between genes
        for gene_1 in cluster_genes:
            if "cor" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["cor"] = {}
            if "exp" not in cluster_genes[gene_1]:
                continue
            for gene_2 in cluster_genes:
                if "cor" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["cor"] = {}
                if gene_2 == gene_1:
                    continue
                if "exp" not in cluster_genes[gene_2]:
                    continue
                # Already stored from the other direction: skip recomputation.
                if gene_1 in cluster_genes[gene_2]["cor"]:
                    continue
                cor_val = calc_correlation_value(cluster_genes[gene_1],
                                                 cluster_genes[gene_2])
                cluster_genes[gene_1]["cor"][gene_2] = cor_val
                cluster_genes[gene_2]["cor"][gene_1] = cor_val
        #calculate distance value for building dendogram
        for gene_1 in cluster_genes:
            if "dist" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["dist"] = {}
            for gene_2 in cluster_genes:
                if "dist" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["dist"] = {}
                # Pairs without a correlation value get the maximum distance.
                dist = 100.0
                if "cor" in cluster_genes[gene_1] and gene_2 in cluster_genes[
                        gene_1]["cor"]:
                    cor_val = min(1.00, cluster_genes[gene_1]["cor"][gene_2])
                    dist = 100.0 * (1.0 - cor_val)
                cluster_genes[gene_1]["dist"][gene_2] = dist
                cluster_genes[gene_2]["dist"][gene_1] = dist
        # check for remote genes, add if correlation value >= 0.9
        # (a capped value of exactly 1.00 is excluded by the comparison below)
        for gene_1 in cluster_genes:
            for seqid in all_gene_expressions:
                prefix = "%s:" % seqid.replace(":", "_")
                for gene_2 in all_gene_expressions[seqid]:
                    if (prefix + gene_2
                        ) not in options.hmm_results:  # only add biosynthetic remote genes
                        continue
                    if gene_2 == gene_1:
                        continue
                    if gene_2 in cluster_genes:
                        continue
                    cor_val = min(
                        1.00,
                        calc_correlation_value(
                            cluster_genes[gene_1],
                            all_gene_expressions[seqid][gene_2]))
                    if 1.00 > cor_val >= 0.9:
                        cluster_genes[gene_1]["dist"][gene_2] = 100.0 * (
                            1.0 - cor_val)
        # review the remote genes, discard genes with less than 2 edges
        if True:
            # Count, per remote gene, how many cluster genes link to it.
            edges_count = {}
            for gene_1 in cluster_genes:
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if gene_2 not in cluster_genes:
                        if gene_2 not in edges_count:
                            edges_count[gene_2] = 0
                        edges_count[gene_2] += 1
            for gene_1 in cluster_genes:
                new_dists = {}
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if (gene_2 in cluster_genes) or (edges_count[gene_2] >= 2):
                        new_dists[gene_2] = cluster_genes[gene_1]["dist"][
                            gene_2]
                cluster_genes[gene_1]["dist"] = new_dists
        # review the remote genes, discard genes without any connection to cluster's biosynthetic genes
        if True:
            have_connections = []
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for gene_1 in cluster_genes:
                if (prefix + gene_1) in options.hmm_results:
                    for gene_2 in cluster_genes[gene_1]["dist"]:
                        if (gene_2 not in cluster_genes) and (
                                gene_2 not in have_connections):
                            have_connections.append(gene_2)
            for gene_1 in cluster_genes:
                new_dists = {}
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if (gene_2 in cluster_genes) or (gene_2 in have_connections):
                        new_dists[gene_2] = cluster_genes[gene_1]["dist"][
                            gene_2]
                cluster_genes[gene_1]["dist"] = new_dists
        #update seq_record
        update_features(features, cluster_genes, geo)
    # NOTE(review): the block below never runs. It reads "overlaps", which is
    # not assigned anywhere in this function, and "prefix" left over from the
    # loop above, so it needs rework before being re-enabled.
    if False:  #This feature is temporarily disabled, saved for next version #options.coexpress_signal_cluster_size < len(overlaps):
        logging.info('Running expression signal analysis on seq_record..')
        signals = []
        n = options.coexpress_signal_cluster_size - 1
        #build list of cluster locations (for annotating signal regions)
        clrefs = []
        for cluster in cl_list:
            clrefs.append(((cluster.location.start, cluster.location.end),
                           utils.get_cluster_number(cluster)))
        clrefs = sorted(clrefs, key=lambda cl: cl[0][0])
        #build signals
        for i in xrange(0, len(overlaps) - n):
            genes = []
            for overlap in overlaps[i:i + n]:
                # Prefer a feature with expression data; else keep the first.
                gene = overlap[0]
                for feature in overlap:
                    if utils.get_gene_id(feature) in gene_expressions:
                        gene = feature
                        break
                genes.append(gene)
            cors = []
            checked = []
            hits = []
            for x in xrange(0, len(genes)):
                gene_x = utils.get_gene_id(genes[x])
                if prefix + gene_x in options.hmm_results:
                    hits.append(options.hmm_results[prefix +
                                                    gene_x][0].query_id)
                for y in xrange(0, len(genes)):
                    if ((x, y) in checked) or ((y, x) in checked):
                        continue
                    cor_val = 0
                    gene_y = utils.get_gene_id(genes[y])
                    if (gene_x in gene_expressions) and (gene_y in
                                                         gene_expressions):
                        cor_val = calc_correlation_value(
                            gene_expressions[gene_x], gene_expressions[gene_y])
                    cors.append(cor_val)
                    checked.append((x, y))
            sloc = (genes[0].location.start + genes[-1].location.end) / 2
            cor_val = 0
            if len(cors) > 0 and len(list(set(hits))) > 1:
                cor_val = np.median(cors)
            # Number of the cluster containing sloc, or -1 if none does.
            cl_idx = -1
            for clref in clrefs:
                if sloc < clref[0][0]:
                    continue
                if sloc <= clref[0][1]:
                    cl_idx = clref[1]
                    break
            signals.append((sloc, cor_val, cl_idx))
        if "coexpress_signal" not in options:
            options.coexpress_signal = {}
        if geo["info"]["id"] not in options.coexpress_signal:
            options.coexpress_signal[geo["info"]["id"]] = {}
        options.coexpress_signal[geo["info"]["id"]][seq_record.id] = signals
def write(seq_records, options):
    """Write a gene cluster overview as a TXT file and an XLS workbook.

    One row per cluster is written to ``geneclusters.txt`` and to sheet '0'
    of a workbook saved as ``<id of the last record>.geneclusters.xls``,
    both inside ``options.full_outputfolder_path``.

    Changes from the earlier version: the TXT file is closed reliably,
    worksheet fallbacks no longer use a bare ``except`` (which also caught
    KeyboardInterrupt/SystemExit), ``dict.has_key`` is replaced by ``in``,
    and an empty ``seq_records`` no longer fails on the workbook file name
    (no workbook is saved in that case).

    Args:
        seq_records: iterable of records to report
        options: run options; ``input_type``, ``full_outputfolder_path`` and
            ``knownclusterblast`` are read

    Returns:
        None. Nothing is written when ``options.input_type`` is ``'prot'``.
    """
    if options.input_type == 'prot':
        return
    outfolder = options.full_outputfolder_path
    wb = Workbook()
    font1 = Font()
    style1 = XFStyle()
    style1.font = font1
    font1.bold = True
    ws0 = wb.add_sheet('0')
    ws0.write(0, 0, "Input accession number", style1)
    ws0.write(0, 1, "Input name", style1)
    ws0.write(0, 2, "Gene cluster type", style1)
    ws0.write(0, 3, "Gene cluster genes", style1)
    ws0.write(0, 4, "Gene cluster gene accessions", style1)
    if options.knownclusterblast:
        ws0.write(0, 5, "Compound with gene cluster of highest homology",
                  style1)

    def write_cell(row, col, value, fallback):
        # Write value; if the worksheet rejects it, write the fallback note.
        try:
            ws0.write(row, col, value)
        except Exception:
            ws0.write(row, col, fallback)

    too_many_genes = ("Too many genes to be contained in Excel cell; "
                      "see txt file in downloadable zip archive.")
    #For each gene cluster, write out info
    row = 1
    last_record_id = None
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        for seq_record in seq_records:
            last_record_id = seq_record.id
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                # One lookup feeds both the gene ID and the accession list.
                cluster_cds = list(
                    utils.get_cluster_cds_features(cluster, seq_record))
                clustergenes = [utils.get_gene_id(cds) for cds in cluster_cds]
                accessions = [utils.get_gene_acc(cds) for cds in cluster_cds]
                joined_genes = ";".join(clustergenes)
                joined_accessions = ";".join(accessions)

                ws0.write(row, 0, seq_record.id)
                write_cell(
                    row, 1, seq_record.description,
                    "Name to long to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                ws0.write(row, 2, clustertype)
                write_cell(row, 3, joined_genes, too_many_genes)
                write_cell(row, 4, joined_accessions, too_many_genes)
                if hasattr(seq_record, 'closestcompounddict') and \
                        clusternr in seq_record.closestcompounddict:
                    ws0.write(row, 5,
                              seq_record.closestcompounddict[clusternr])
                row += 1
                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    joined_genes, joined_accessions
                ]) + "\n")
    # The workbook is named after the last record processed.
    if last_record_id is not None:
        wb.save(path.join(outfolder,
                          "%s.geneclusters.xls" % last_record_id))
def get_inter_cluster_relation(seq_records, geo_id):
    """Predict relations between pairs of clusters for one GEO record.

    A graph is built from the genes of all clusters that have a geo entry
    for ``geo_id``, with edges added through ``update_g``. After removing
    isolated nodes and partitioning the graph with
    ``community.best_partition``, every pair of clusters is checked: within
    each community and connected component, nodes with a clustering
    coefficient of 0 are pruned. The remainder is reported when it has at
    least 2 genes and 1 ``sec_met`` gene from each cluster, and at least 3
    ``sec_met`` genes overall.

    Changes from the earlier version: cluster numbers are compared with
    ``!=`` instead of ``is not`` (identity of ints is an interpreter detail),
    and isolated nodes are removed while iterating over a copy of the node
    list, so the graph is never modified during its own iteration.

    Args:
        seq_records: iterable of records; clusters are numbered from 1 in
            iteration order across all records
        geo_id: only geo entries whose ``'rec_id'`` equals this are used

    Returns:
        list of dicts with the keys ``'source'`` and ``'target'`` (each
        ``{'id': cluster number, 'links': edges inside that cluster}``) and
        ``'links'`` (edges between the two clusters).
    """
    logging.debug('Calculating inter cluster relations on geo_record "%s"..' %
                  (geo_id))
    data = []
    full_g = nx.Graph()
    cluster_genes = {}
    bio_genes = set()
    cur_cluster1 = 0
    # First, inspect all cluster to get cluster_genes
    for record in seq_records:
        for cluster in utils.get_cluster_features(record):
            cur_cluster1 += 1
            cluster_genes[cur_cluster1] = set()
            for cluster_gene in utils.get_cluster_cds_features(
                    cluster, record):
                # We only care about cluster_genes that have a geo match
                for cluster_gene_geo in utils.parse_geo_feature(cluster_gene):
                    # We only care about data from the current geo_id
                    if cluster_gene_geo['rec_id'] != geo_id:
                        continue
                    cur_gene1 = utils.get_gene_id(cluster_gene)
                    cur_gene1_distances = cluster_gene_geo['dist']
                    cur_gene1_neighbors = set(cur_gene1_distances)
                    # Add each gene to cluster_genes, and to the full_g(raph) and to bio_genes
                    cluster_genes[cur_cluster1].add(cur_gene1)
                    full_g.add_node(cur_gene1)
                    if 'sec_met' in cluster_gene.qualifiers:
                        bio_genes.add(cur_gene1)
                    # Get intra-cluster edges
                    interactions = cur_gene1_neighbors.intersection(
                        cluster_genes[cur_cluster1])
                    update_g(cur_gene1, interactions, cur_gene1_distances,
                             full_g)
                    # From the second cluster onwards, we'll add inter-cluster edges backwards, i.e.: 2-1, 3-1, 3-2, 4-1, 4-2, etc...
                    if cur_cluster1 != 1:
                        for cur_cluster2 in cluster_genes:
                            if cur_cluster1 != cur_cluster2:
                                interactions = cur_gene1_neighbors.intersection(
                                    cluster_genes[cur_cluster2])
                                update_g(cur_gene1, interactions,
                                         cur_gene1_distances, full_g)
    # Remove single nodes (iterate over a copy: the graph changes in the loop)
    for node in list(full_g.nodes()):
        if full_g.degree(node) == 0:
            full_g.remove_node(node)
    # Get communities
    community_dict = community.best_partition(full_g)
    number_of_clusters = len(cluster_genes)
    # Now check inter-cluster interactions
    for i in range(1, number_of_clusters + 1):
        cluster1 = cluster_genes[i]
        for j in range(i + 1, number_of_clusters + 1):
            cluster2 = cluster_genes[j]
            cluster3 = cluster1.union(cluster2)
            cluster_pair_g = full_g.subgraph(cluster3)
            communities_present = np.unique(
                [community_dict[n] for n in cluster3 if n in community_dict])
            # CRITERIA 1 = only intra-community edges
            for cur_community in communities_present:
                cur_community_nodes = [
                    n for n in cluster3
                    if n in community_dict
                    and community_dict[n] == cur_community
                ]
                cur_community_g = cluster_pair_g.subgraph(cur_community_nodes)
                decomposed_g = list(
                    nx.connected_component_subgraphs(cur_community_g))
                for cur_g in decomposed_g:
                    # CRITERIA 2 = no isolates. anything with a clustering_coefficient=0 will be pruned out.
                    clustering_coefficient = nx.clustering(cur_g)
                    pred_nodes = [
                        n for n in clustering_coefficient
                        if clustering_coefficient[n] > 0
                    ]
                    pred_g = cur_g.subgraph(pred_nodes)
                    pred_edges = pred_g.edges()
                    prediction = set(pred_g.nodes())
                    prediction_cluster1 = prediction.intersection(cluster1)
                    prediction_cluster2 = prediction.intersection(cluster2)
                    bio_prediction = prediction.intersection(bio_genes)
                    bio_prediction_cluster1 = prediction_cluster1.intersection(
                        bio_genes)
                    bio_prediction_cluster2 = prediction_cluster2.intersection(
                        bio_genes)
                    #CRITERIA 3 = at least 2 genes per cluster
                    #CRITERIA 5 = at least 1 bio per cluster
                    #CRITERIA 4 = at least 3 bio
                    if not (len(prediction_cluster1) >= 2
                            and len(prediction_cluster2) >= 2
                            and len(bio_prediction_cluster1) >= 1
                            and len(bio_prediction_cluster2) >= 1
                            and len(bio_prediction) >= 3):
                        continue
                    # Split the edges by which cluster(s) their ends are in.
                    pred_edges1 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster1
                    ]
                    pred_edges2 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster2
                    ]
                    pred_edges12 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster2
                    ]
                    pred_edges21 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster1
                    ]
                    data.append({
                        'source': {'id': i, 'links': pred_edges1},
                        'target': {'id': j, 'links': pred_edges2},
                        'links': pred_edges12 + pred_edges21,
                    })
    return data
def write_BGC(txt, info, options):
    """Write the BGC table to TXT.

    A header row is always written, followed by one tab-separated row per
    cluster number in ``info.clusternrs``.

    Columns: BGC ID, BGC type, detection rules used, BGC_range, genes,
    subclusters, NRPSs/PKSs, signature_genes, RiPPs, predicted structure,
    monomers.

    Arguments:
        txt: writable file-like object that receives the table.
        info: storage object; this function reads ``clusternrs``,
            ``seq_record``, ``clustertypes`` and ``accessions``.
        options: not used by this function.
    """
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        # The rule list is wrapped in double quotes inside its column
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])
        if 'subclusterblast' in cluster_feature.qualifiers:
            # Keep only the text after the first tab of every qualifier
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        #TODO The subclusterblast module should probably be changed for the precalcs to provide a list here of the 100% hits instead of all hits
        # CDSs carrying at least one 'NRPS/PKS Domain:' sec_met qualifier
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features if 'sec_met' in cds.qualifiers
        ])
        # Look the core peptides up once; they are needed both for the
        # emptiness check and for the overlap search below.
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if len(core_peptides) != 0:
            ripp_list = []
            for peptide in core_peptides:
                # Only the first overlapping CDS is reported per peptide
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"
        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)
        #Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")
def _first_specificity_value(specificities, prefix, label):
    """Return the text after *label* in the first entry starting with *prefix*.

    Returns an empty string when no entry of *specificities* starts with
    *prefix*.
    """
    for qual in specificities:
        if qual.startswith(prefix):
            return qual.partition(label)[2]
    return ""


def write_NRPS_PKS(txt, info, options):
    """Write the NRPS/PKS table to TXT.

    A header row is always written. After that, one tab-separated row is
    written per aSDomain feature that overlaps an NRPS/PKS CDS of a cluster
    listed in ``info.clusternrs``.

    Columns: Cluster_ID, NRPSPKS_ID, annotation, aSDomain, score, evalue,
    domain_type, subtype, domain_start, domain_end, KR activity,
    KR stereochemistry, NRPSPredictor2, Stachelhaus, Minowa, pkssignature,
    consensus.

    Arguments:
        txt: writable file-like object that receives the table.
        info: storage object; this function reads ``clusternrs`` and
            ``seq_record``.
        options: not used by this function.
    """
    txt.write("\t".join([
        "Cluster_ID", "NRPSPKS_ID", "annotation", "aSDomain", "score",
        "evalue", "domain_type", "subtype", "domain_start", "domain_end",
        "KR activity", "KR stereochemistry", "NRPSPredictor2", "Stachelhaus",
        "Minowa", "pkssignature", "consensus"
    ]) + "\n")
    # (prefix that selects the qualifier, label after which the value starts),
    # listed in the order of the last seven output columns.
    specificity_fields = (
        ("KR activity", "KR activity: "),
        ("KR stereochemistry", "KR stereochemistry: "),
        ("NRPSpredictor2", "NRPSpredictor2 SVM: "),
        ("Stachelhaus", "Stachelhaus code: "),
        ("Minowa", "Minowa: "),
        ("PKS signature", "PKS signature: "),
        ("consensus", "consensus: "),
    )
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        cluster_id = "{seq_id}_c{cluster_nr}".format(seq_id=info.seq_record.id,
                                                     cluster_nr=BGCnr)
        # CDSs carrying at least one 'NRPS/PKS Domain:' sec_met qualifier
        NRPSs_PKSs = [
            cds for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ]
        if not NRPSs_PKSs:
            continue
        # The cluster's aSDomain features do not depend on the CDS, so they
        # are collected once per cluster instead of once per CDS.
        cluster_domains = list(
            utils.get_cluster_aSDomain_features(cluster_feature,
                                                info.seq_record))
        for cds in NRPSs_PKSs:
            enzyme_ID = utils.get_gene_acc(cds).partition(".")[0]
            sec_met = cds.qualifiers['sec_met']
            subtype_quals = [
                qual for qual in sec_met
                if qual.startswith("NRPS/PKS subtype")
            ]
            # Requiring a prefixed qualifier as well avoids an IndexError when
            # the marker only occurs in the middle of a qualifier.
            if subtype_quals and any(
                    "NRPS/PKS subtype: " in qual for qual in sec_met):
                enzyme_annotation = subtype_quals[0].partition(
                    "NRPS/PKS subtype: ")[2]
            else:
                logging.warning("No enzyme annotation for %s", enzyme_ID)
                enzyme_annotation = ""
            # Domains that overlap this CDS and name it as their locus tag
            # (the qualifier is compared both as a whole and by first element)
            aSDomains = [
                dom for dom in cluster_domains
                if utils.features_overlap(cds, dom)
                and utils.get_gene_id(cds) in
                [dom.qualifiers['locus_tag'], dom.qualifiers['locus_tag'][0]]
            ]
            for aSDomain in aSDomains:
                domtype = aSDomain.qualifiers['domain'][0]
                if "domain_subtype" in aSDomain.qualifiers:
                    subtype = aSDomain.qualifiers['domain_subtype'][0]
                else:
                    subtype = ""
                aSDomain_ID = aSDomain.qualifiers['asDomain_id'][0]
                score = str(aSDomain.qualifiers['score'][0])
                evalue = str(aSDomain.qualifiers['evalue'][0])
                dom_start = str(aSDomain.location.start)
                dom_end = str(aSDomain.location.end)
                if 'specificity' in aSDomain.qualifiers:
                    specificities = aSDomain.qualifiers['specificity']
                else:
                    specificities = []
                # Missing predictions are written as empty columns
                predictions = [
                    _first_specificity_value(specificities, prefix, label)
                    for prefix, label in specificity_fields
                ]
                txt.write("\t".join([
                    cluster_id, enzyme_ID, enzyme_annotation, aSDomain_ID,
                    score, evalue, domtype, subtype, dom_start, dom_end
                ] + predictions) + "\n")
def generate_sidepanel(cluster, seq_record, options, sidepanel=None):
    """Generate the sidepanel div for one cluster.

    The panel receives, in this order: a 'structure' section (linked
    structure image plus a warning text), a 'more-details' section listing
    the monomer prediction and the per-gene substrate predictions, and, when
    the cluster type contains 'nrps', a section with NORINE cross-links.

    Arguments:
        cluster: mapping; the 'idx' and 'type' entries are read.
        seq_record: record the cluster feature is looked up in.
        options: ``outputfoldername`` is read; ``docking`` is read and is
            created as an empty dict when it is missing.
        sidepanel: existing element to append to; a new
            ``<div class="sidepanel">`` is created when it is None.

    Returns:
        The sidepanel element, or the ``sidepanel`` argument unchanged when
        no cluster feature exists for ``cluster['idx']``.
    """

    def make_link(href, label):
        # Anchor with the given target URL and link text
        link = pq('<a>')
        link.attr('href', href)
        link.attr('target', '_new')
        link.text(label)
        return link

    record_cluster = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if record_cluster is None:
        return sidepanel

    if sidepanel is None:
        sidepanel = pq('<div>')
        sidepanel.addClass('sidepanel')

    # --- predicted core structure -------------------------------------
    structure_div = pq('<div>')
    structure_div.addClass('structure')
    structure_heading = pq('<h3>')
    structure_heading.text('Predicted core structure')
    structure_div.append(structure_heading)
    image_link = pq('<a>')
    image_link.attr(
        'href',
        _get_structure_image_url(record_cluster, options.outputfoldername))
    image_link.attr('target', '_new')
    structure_div.append(image_link)
    image = pq('<img>')
    image.attr(
        'src',
        _get_structure_image_url(record_cluster, options.outputfoldername))
    image_link.append(image)

    note = pq('<div>')
    note.addClass('as-structure-warning')
    if 'docking' not in options:
        options.docking = {}
    cluster_idx = cluster['idx']
    if cluster_idx in options.docking and options.docking[cluster_idx]:
        note.text('Rough prediction of core scaffold based on assumed '
                  'PKS linker matching; tailoring reactions not taken '
                  'into account')
    else:
        note.text('Rough prediction of core scaffold based on assumed '
                  'PKS/NRPS colinearity; tailoring reactions not taken '
                  'into account')
    structure_div.append(note)
    sidepanel.append(structure_div)

    # --- prediction details -------------------------------------------
    details_div = pq('<div>')
    details_div.addClass('more-details')
    details_heading = pq('<h3>')
    details_heading.text('Prediction details')
    details_div.append(details_heading)
    prediction_list = pq('<dl>')
    prediction_list.addClass('prediction-text')
    details_div.append(prediction_list)
    sidepanel.append(details_div)

    term = pq('<dt>')
    term.text('Monomers prediction:')
    prediction_list.append(term)
    entry = pq('<dd>')
    entry.text(_get_monomer_prediction(record_cluster))
    prediction_list.append(entry)

    for feature in utils.get_cluster_cds_features(record_cluster, seq_record):
        if 'sec_met' not in feature.qualifiers:
            continue

        gene_header_written = False
        cds_predictions = []
        for qual in feature.qualifiers['sec_met']:
            if not qual.startswith('NRPS/PKS Domain:'):
                continue
            is_amp_binding = qual.startswith("NRPS/PKS Domain: AMP-binding")
            substrate_preds = _parse_substrate_predictions(qual)

            domain_predictions = []
            for key, val in substrate_preds:
                # The gene id is emitted once, ahead of its first prediction
                if not gene_header_written:
                    term = pq('<dt>')
                    term.text(utils.get_gene_id(feature))
                    prediction_list.append(term)
                    gene_header_written = True
                entry = pq('<dd>')
                entry.html('%s: %s<br>' % (key, val))
                prediction_list.append(entry)
                if is_amp_binding:
                    filtered = _filter_norine_as(val.split(","))
                    if len(filtered) > 0:
                        # The unfiltered values are what gets collected
                        domain_predictions.extend(val.split(","))

            if len(substrate_preds) > 0:
                if is_amp_binding:
                    cds_predictions.append(list(set(domain_predictions)))
                spacer = pq('<dd>')
                spacer.append(pq('<br>'))
                prediction_list.append(spacer)

        if len(cds_predictions) > 0:
            strict_url = _get_norine_url_for_specArray(cds_predictions)
            if strict_url:
                entry = pq('<dd>')
                entry.append("Search NORINE for peptide in ")
                entry.append(make_link(strict_url, "strict mode"))
                entry.append(" // ")
                relaxed_url = _get_norine_url_for_specArray(cds_predictions,
                                                            be_strict=False)
                entry.append(make_link(relaxed_url, "relaxed mode"))
                entry.append(pq('<br>'))
                entry.append(pq('<br>'))
                prediction_list.append(entry)

    # --- database cross-links (clusters whose type contains 'nrps') ----
    if 'nrps' not in cluster['type']:
        return sidepanel

    cross_refs = pq("<div>")
    refs_heading = pq('<h3>')
    refs_heading.text('Database cross-links')
    cross_refs.append(refs_heading)
    links = pq("<div>")
    links.addClass('prediction-text')
    links.append(
        make_link('http://bioinfo.lifl.fr/norine/form2.jsp',
                  "Link to NORINE database query form"))
    links.append("<br>")
    strict_lookup = _get_norine_url_for_cluster(record_cluster)
    logging.debug("NORINE URL string: %s" % strict_lookup)
    strict_link = make_link(strict_lookup, "strict mode")
    links.append("Direct lookup in NORINE database in ")
    links.append(strict_link)
    links.append(" // ")
    relaxed_lookup = _get_norine_url_for_cluster(record_cluster,
                                                 be_strict=False)
    links.append(make_link(relaxed_lookup, "relaxed mode"))
    cross_refs.append(links)
    sidepanel.append(cross_refs)

    return sidepanel
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record

    The clusters returned by find_cf_clusters() are merged with the cluster
    features that are already on the record:

    * a ClusterFinder cluster that overlaps an existing cluster either
      yields a separate "cluster_border" feature (options.borderpredict set)
      or widens the existing cluster's location (otherwise); the existing
      cluster is then given a 'probability' qualifier;
    * a ClusterFinder cluster that overlaps nothing becomes a new "cluster"
      feature, unless options.borderpredict_only is set.

    seq_record and its cluster features are modified in place; nothing is
    returned.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)
    for cf_cluster in cf_clusters:
        overlaps = False
        # Default product type; only refined further down for clusters that
        # overlap no existing cluster
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True
            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]
            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Only act when the midpoint of the existing cluster lies
                # inside the ClusterFinder cluster.
                # NOTE(review): '/' yields a float under Python 3 -- confirm
                # the 'in' test on the location still works if this is ported
                if ((cluster.location.end + cluster.location.start) /
                        2) in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    # Record the (possibly widened) ClusterFinder range as a
                    # separate feature; the existing cluster keeps its location
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            # Without border prediction: grow the existing cluster so that it
            # also covers the ClusterFinder cluster (both sides, left side
            # only, or right side only)
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            # Every overlapping cluster gets the probability, formatted with
            # four decimals
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            # Derive the product type of the new cluster from the "Type: "
            # sec_met qualifiers of the CDSs it contains
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [
                        feat for feat in CDS.qualifiers['sec_met']
                        if "Type: " in feat
                    ]
                    for qualifier in type_sec_met_qualifiers:
                        # Seeing both types, in either order, ends up as the
                        # combined "cf_fatty_acid-saccharide"
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)
    # New clusters are only added after the loop, so they are never compared
    # against the remaining ClusterFinder clusters
    if len(newclusters):
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)