def perform_docking_domain_analysis(options, clusterpksgenes, genecluster, seq_record, pksnrpsvars):
    """Score candidate PKS gene orders and return the first ranked one.

    Steps, in order:
      1. pick the starter and ending genes with find_first_and_last_genes();
      2. inside a TemporaryDirectory(change=True) context, run
         extract_nterminus() / extract_cterminus() against the
         "docking_analysis" directory located next to this module;
      3. enumerate candidate orders with find_possible_orders() and score
         them with rank_biosynthetic_orders();
      4. hand the result to write_gene_orders_to_html().

    Side effect: `genecluster` is appended to
    pksnrpsvars.dockingdomainanalysis.

    Returns the first element of the gene-order list produced by
    rank_biosynthetic_orders().
    """
    feature_by_id = utils.get_feature_dict(seq_record)
    #log("Predicting PKS gene order by docking domain sequence " \
    #    "analysis", stdout=True)
    first_and_last = find_first_and_last_genes(
        clusterpksgenes, pksnrpsvars.domainnamesdict)
    startergene, endinggene = first_and_last

    # The extraction helpers run inside the temporary directory context.
    with TemporaryDirectory(change=True):
        dockinganalysis_dir = utils.get_full_path(__file__, "docking_analysis")
        nterm_residues = extract_nterminus(
            dockinganalysis_dir, clusterpksgenes, seq_record, startergene,
            feature_by_id)
        cterm_residues = extract_cterminus(
            dockinganalysis_dir, clusterpksgenes, seq_record, endinggene,
            feature_by_id)

    candidate_orders = find_possible_orders(
        clusterpksgenes, startergene, endinggene)
    ranking = rank_biosynthetic_orders(
        nterm_residues, cterm_residues, startergene, endinggene,
        candidate_orders)
    geneorders, order_scores = ranking

    #Write html outfile with docking domain analysis output
    write_gene_orders_to_html(
        options, geneorders, order_scores, genecluster, startergene,
        endinggene)
    #log("Predicting PKS gene order by docking domain sequence " \
    #    "analysis succeeded.", stdout=True)
    pksnrpsvars.dockingdomainanalysis.append(genecluster)
    return geneorders[0]
def getECs(seq_record, options):
    """Annotate CDS features of `seq_record` with EC numbers from KEGG.

    Returns immediately (None) when the module-level `name` is not listed in
    options.ecpred.  Otherwise the feature dictionary of the record is passed
    to _getKEGG_speciesLocusTag() and the result to _get_ECNumberDict(); for
    every key of the returned dictionary:

      * if EC numbers were found, a note is appended to the feature's 'note'
        qualifier and the 'EC_number' qualifier is set (a warning is logged
        when an existing 'EC_number' value is replaced);
      * otherwise a warning is logged.

    The features are modified in place; nothing is returned.
    """
    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning..." % name)
        return

    CDSFeatureDict = utils.get_feature_dict(seq_record)
    logging.debug("Predicting EC numbers using KEGG online queries")
    KEGGspeciesLocusTagDict = _getKEGG_speciesLocusTag(CDSFeatureDict)
    ECDict = _get_ECNumberDict(KEGGspeciesLocusTagDict)

    # logging.debug("Found %s EC predictions" % len(ECDict.keys()))
    for key in ECDict:
        Feature = CDSFeatureDict[key]
        ec_numbers = ECDict[key]

        if len(ec_numbers) > 0:
            # Start from this feature's own notes.  The list must be chosen
            # per feature: reusing one list across iterations would leak the
            # notes of a previous feature into features without a 'note'
            # qualifier and make several features share a single list object.
            if 'note' in Feature.qualifiers:
                notes = Feature.qualifiers['note']
            else:
                notes = []

            logging.debug("Found EC numbers: %s" % ", ".join(ec_numbers))
            notes.append('EC number prediction based on KEGG query: %s' % ec_numbers)
            Feature.qualifiers['note'] = notes

            if 'EC_number' in Feature.qualifiers:
                logging.warning('ECpredictor[kegg]: Overwriting existing EC annotation: %s  with %s' % \
                                (", ".join(Feature.qualifiers['EC_number']), ", ".join(ec_numbers)))
            Feature.qualifiers['EC_number'] = ec_numbers
        else:
            logging.warning('ECpredictor[KEGG]: Could not find EC number for %s' % utils.get_gene_id(Feature))
def test_get_feature_dict(self):
    "Test utils.get_feature_dict()"
    feature_dict = utils.get_feature_dict(self.rec)
    # Expected keys: the first locus_tag of every CDS feature on the record.
    expected_ids = sorted(
        [feature.qualifiers['locus_tag'][0]
         for feature in self.rec.features
         if feature.type == "CDS"])
    actual_ids = sorted(feature_dict.keys())
    # Compare order-independently by sorting both sides first.
    self.assertListEqual(expected_ids, actual_ids)
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist, transportercoglist, geneclusternr):
    """Collect per-gene information for the cluster numbered `geneclusternr`.

    Returns an 11-tuple:
      clustergenes  - gene ids of the CDS features in the cluster
      clustertype   - result of utils.get_cluster_type() for the cluster
      annotations   - gene id -> first 'product' qualifier, or
                      'Unannotated gene' when there is none
      colors        - "#810E15" for genes returned by
                      utils.get_secmet_cds_features(), "grey" otherwise
      starts, ends  - location.start / location.end of every gene
      strands       - "-" when strand == -1, "+" otherwise
      pksnrpsprots  - genes also returned by utils.get_pksnrps_cds_features()
      gtrs          - genes whose first smcogdict entry is in gtrcoglist
      transporters  - genes whose first smcogdict entry is in
                      transportercoglist
      clustersize   - max(ends) - min(starts)

    colors, starts, ends and strands are parallel to clustergenes.
    Raises ValueError (from max/min) when the cluster contains no genes.
    """
    allcoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record)
    ]
    pksnrpscoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record)
    ]
    feature_by_id = utils.get_feature_dict(seq_record)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(
            utils.get_cluster_by_nr(seq_record, geneclusternr), seq_record)
    ]
    clustertype = utils.get_cluster_type(
        utils.get_cluster_by_nr(seq_record, geneclusternr))

    annotations = {}
    colors, starts, ends, strands = [], [], [], []
    pksnrpsprots, gtrs, transporters = [], [], []

    for gene_id in clustergenes:
        cds = feature_by_id[gene_id]

        if 'product' in cds.qualifiers:
            annotations[gene_id] = cds.qualifiers['product'][0]
        else:
            annotations[gene_id] = 'Unannotated gene'

        starts.append(cds.location.start)
        ends.append(cds.location.end)
        strands.append("-" if cds.strand == -1 else "+")
        colors.append("#810E15" if gene_id in allcoregenes else "grey")

        if gene_id in pksnrpscoregenes:
            pksnrpsprots.append(gene_id)

        # Only the first smCOG entry of a gene decides its category.
        if gene_id in smcogdict and len(smcogdict[gene_id]) > 0:
            first_smcog = smcogdict[gene_id][0]
            if first_smcog in gtrcoglist:
                gtrs.append(gene_id)
            if first_smcog in transportercoglist:
                transporters.append(gene_id)

    clustersize = max(ends) - min(starts)
    return (clustergenes, clustertype, annotations, colors, starts, ends,
            strands, pksnrpsprots, gtrs, transporters, clustersize)
def find_colinear_order(clusterpksnrpsgenes, seq_record, domainnamesdict):
    """Return the gene ids of `clusterpksnrpsgenes` in assumed colinear order.

    The list is reversed when more genes lie on strand -1 than on strand 1,
    and reversed (again) when the gene that is then first has more than one
    domain and one of them is "Thioesterase" or "TD".

    NOTE: `clusterpksnrpsgenes` is reversed in place, so the caller's list is
    modified.  An empty list raises IndexError.
    """
    feature_by_id = utils.get_feature_dict(seq_record)

    #If NRPS genes, mixed NRPS/PKS genes, PKS genes without detected docking
    #domains, or clusters with a 1-3 PKS genes, assume colinearity
    strands = [
        feature_by_id[utils.get_gene_id(gene)].strand
        for gene in clusterpksnrpsgenes
    ]
    # Majority vote; strands other than 1 / -1 do not count either way.
    direction = strands.count(1) - strands.count(-1)
    if direction < 0:
        clusterpksnrpsgenes.reverse()

    #Reverse if first gene encodes a multidomain protein with a TE/TD domain
    first_gene_domains = domainnamesdict[
        utils.get_gene_id(clusterpksnrpsgenes[0])]
    has_terminal_domain = ("Thioesterase" in first_gene_domains
                           or "TD" in first_gene_domains)
    if has_terminal_domain and len(first_gene_domains) > 1:
        clusterpksnrpsgenes.reverse()

    return [utils.get_gene_id(gene) for gene in clusterpksnrpsgenes]
def filter_nonterminal_docking_domains(seq_record, pksnrpsvars):
    """Drop docking-domain hits that are not near either end of their gene.

    For every gene in pksnrpsvars.domaindict, a hit whose name (hit[0]) is one
    of the docking domain names is kept only if it lies within 50 positions
    of the start or of the end of the gene's amino acid sequence (hit[1] and
    hit[2] are the hit coordinates).  Hits are deleted from the existing lists
    in place, and genes left without any hit are removed from the dictionary.
    """
    dockingdomains = [
        'NRPS-COM_Nterm', 'NRPS-COM_Cterm', 'PKS_Docking_Cterm',
        'PKS_Docking_Nterm'
    ]
    domaindict = pksnrpsvars.domaindict
    # Snapshot of the keys: entries are deleted from the dict in the loop.
    hitgenes = list(domaindict.keys())
    feature_by_id = utils.get_feature_dict(seq_record)

    for hitgene in hitgenes:
        gene_length = len(utils.get_aa_sequence(feature_by_id[hitgene]))
        hits = domaindict[hitgene]

        internal_indices = [
            index for index, hit in enumerate(hits)
            if hit[0] in dockingdomains
            and gene_length - max(hit[1], hit[2]) >= 50
            and min(hit[1], hit[2]) >= 50
        ]
        # Delete from the back so the remaining indices stay valid.
        for index in reversed(internal_indices):
            del hits[index]

        if hits == []:
            del domaindict[hitgene]
def _annotate(seq_record, options, results):
    """Append one new feature to `seq_record` per accepted HSP in `results`.

    An HSP is skipped when its bitscore is not above _min_score(options), its
    e-value is not below _max_evalue(options), or its query_id is not a key of
    the record's feature dictionary.  The new feature has the type
    options.FeatureTags.fullhmmer_tag, the location computed by
    _calculate_start_end() on the strand of the matching feature, and
    qualifiers describing the hit.  Accepted HSPs are numbered from 1 within
    each result; the number is the suffix of the 'asDomain_id' qualifier.
    """
    logging.debug("generating feature objects for PFAM hits")
    min_score = _min_score(options)
    max_evalue = _max_evalue(options)
    feature_by_id = utils.get_feature_dict(seq_record)

    for result in results:
        database = path.basename(result.target)
        domain_nr = 1
        for hsp in result.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]
            start, end = _calculate_start_end(feature, hsp)
            location = FeatureLocation(start, end, strand=feature.strand)
            newFeature = SeqFeature(location=location,
                                    type=options.FeatureTags.fullhmmer_tag)

            quals = defaultdict(list)
            quals['label'].append(result.id)
            if 'locus_tag' in feature.qualifiers:
                quals['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                quals['locus_tag'] = [hsp.query_id]
            quals['domain'] = [hsp.hit_id]
            quals['asDomain_id'] = [
                'fullhmmer_' + '_'.join(quals['locus_tag']) + '_'
                + '{:04d}'.format(domain_nr)
            ]
            domain_nr += 1
            quals['evalue'] = [str("{:.2E}".format(float(hsp.evalue)))]
            quals['score'] = [str(hsp.bitscore)]
            quals['aSTool'] = ["fullhmmer"]
            quals['detection'] = ["hmmscan"]
            quals['database'] = [database]

            # 'transl_table' must hold exactly one value when present;
            # translation table 1 is used when the qualifier is absent.
            if 'transl_table' in feature.qualifiers:
                (transl_table,) = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            domain_seq = newFeature.extract(seq_record.seq)
            quals['translation'] = [
                str(domain_seq.translate(table=transl_table))
            ]

            quals['note'].append("%s-Hit: %s. Score: %s. E-value: %s. Domain range: %s..%s." % \
                                 (database, hsp.hit_id, hsp.bitscore, hsp.evalue, hsp.hit_start, hsp.hit_end))
            quals['description'] = [hsp.hit_description]

            # Hits without a known PFAM id simply get no db_xref.
            try:
                pfamid = name_to_pfamid[hsp.hit_id]
            except KeyError:
                pass
            else:
                quals['db_xref'].append("PFAM: %s" % pfamid)

            newFeature.qualifiers = quals
            seq_record.features.append(newFeature)
def main():
    """Run a mock-up cluster detection per input file and write result.js.

    sys.argv[1] names a text file with one sequence file path per line.  For
    every listed file the detection plugins are loaded, the sequences are
    parsed, and for each record the pre-computed hits in options.hmm_results
    are filtered, turned into clusters and analysed.  Per-record statistics
    are collected in a nested dictionary {file path: {record id: stats}} with
    the keys "total_clusters", "total_genes", "total_cdhit",
    "genes_with_hits", "largest_cdhit", "largest_domain_variations",
    "per_hits", "cluster_types", "average_cdhit" and
    "average_domain_variations".  The two "average_*" values are medians.

    The dictionary is written to 'result.js' in the current directory as
    ``var result = <json>;``.
    """
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files (the list file is closed again right after reading)
    with open(sys.argv[1], 'r') as file_list:
        files = [path.expanduser(line.replace("\n", "")) for line in file_list]

    # mockup antismash run per files
    for i, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        print("Processing %s... (%d/%d)" % (fpath, i, len(files)))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(
            options)  #To-DO: get antismash logging to works!

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        for seq_record in seq_records:
            # NOTE(review): rebinding seq_records here does not change what
            # this loop iterates over, so records of <= 1000 bp are still
            # processed unless *no* record is longer than 1000 bp.  Behavior
            # kept as is -- confirm whether the filter was meant to run once
            # before the loop.
            if options.input_type == 'nucl':
                seq_records = [
                    record for record in seq_records
                    if len(record.seq) > 1000
                ]
                if len(seq_records) < 1:
                    continue

            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for feature in utils.get_cds_features(seq_record):
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix +
                                                                 gene_id]
                    for res in results_by_id[gene_id]:
                        results.append(res)

            # ignore short aa's
            min_length_aa = 100
            short_cds_buffer = []
            # temporarily remove short aa.  Iterate over a copy: removing
            # from the list being iterated skips the element that follows
            # each removed one.
            for f in list(seq_record.features):
                if (f.type == "CDS"
                        and len(f.qualifiers['translation'][0]) < min_length_aa
                        and utils.get_gene_id(f) not in results_by_id):
                    short_cds_buffer.append(f)
                    seq_record.features.remove(f)

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)

            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            stats = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }
            res_object[fpath][seq_record.id] = stats

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    stats["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    if domain_name not in stats["per_hits"]:
                        stats["per_hits"][domain_name] = 0
                    stats["per_hits"][domain_name] += 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            # put the temporarily removed short CDS features back
            seq_record.features.extend(short_cds_buffer)
            stats["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            #Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            #before writing to output, remove all hmm_detection's subdir prefixes from clustertype
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = []
                    for name in prod.split('-'):
                        prod_name.append(name.split('/')[-1])
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [
                                (ct.split('/')[-1])
                                for ct in row.split('Type: ')[-1].split('-')
                            ]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                # strip the prefix from the domain name only,
                                # leaving the " (E-value..." part untouched
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split(
                                        '/')[-1] + ' (E-value' +
                                    cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            #on plants, remove plant clustertype from hybrid types, and replace single
            #plant clustertype with "putative"
            # NOTE(review): list(set(...)) does not keep the order of the
            # type names -- confirm that the resulting order does not matter.
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(
                                set(row.split('Type: ')[-1].split('-')))
                            if (len(clustertypes) > 1) and ("plant" in
                                                            clustertypes):
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find largest cdhit number & largest domain diversity in a cluster
            stats["average_cdhit"] = 0
            stats["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                if cluster_type not in stats["cluster_types"]:
                    stats["cluster_types"][cluster_type] = 0
                stats["cluster_types"][cluster_type] += 1
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(
                    utils.get_cluster_domains(cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > stats["largest_cdhit"]:
                    stats["largest_cdhit"] = num_cdhit
                if num_domain > stats["largest_domain_variations"]:
                    stats["largest_domain_variations"] = num_domain
            # despite the "average_*" key names these are medians
            if len(cdhit_numbers) > 0:
                stats["average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                stats["average_domain_variations"] = numpy.median(
                    domain_numbers)

    with open('result.js', 'w') as h:
        h.write('var result = %s;' % json.dumps(res_object, indent=4))
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Detect gene clusters on `seq_record` from pre-computed HMM results.

    Hits are read from options.hmm_results, whose keys have the form
    "<record id with ':' replaced by '_'>:<gene id>".  The hits are filtered
    with filter_results(); options.hmm_results is then brought in line with
    the filtered result (surviving genes are overwritten, dropped genes are
    deleted).  Cluster rules are applied, the matching CDS features are
    updated and find_clusters() is run on the record.

    When options.ignore_short_aa is set, CDS features whose translation is
    shorter than 50 amino acids (100 when options.eukaryotic is set) and that
    have no hit are taken out of the record during detection and re-added
    afterwards.

    `seq_record` and options.hmm_results are modified in place; nothing is
    returned.
    """
    logging.info('Detecting gene clusters using HMM library')
    feature_by_id = utils.get_feature_dict(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    results = []
    sig_by_name = {}  # filled below but not read anywhere in this function
    results_by_id = {}
    for sig in get_sig_profiles():
        sig_by_name[sig.name] = sig

    # Key prefix of this record in options.hmm_results.
    prefix = "%s:" % seq_record.id.replace(":", "_")

    for feature in utils.get_cds_features(seq_record):
        gene_id = utils.get_gene_id(feature)
        if (prefix + gene_id) in options.hmm_results:
            results_by_id[gene_id] = options.hmm_results[prefix + gene_id]
            for res in results_by_id[gene_id]:
                results.append(res)

    short_cds_buffer = []
    if options.ignore_short_aa:
        # Temporarily filter out cds with < prot_min_length AA length
        min_length_aa = 50
        if options.eukaryotic:
            min_length_aa = 100
        # Iterate over a copy: removing from the list being iterated skips
        # the element that follows each removed one.
        for f in list(seq_record.features):
            if (f.type == "CDS"
                    and len(f.qualifiers['translation'][0]) < min_length_aa
                    and utils.get_gene_id(f) not in results_by_id):
                short_cds_buffer.append(f)
                seq_record.features.remove(f)

    #Get overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    #Filter results by comparing scores of different models (for PKS systems)
    results_to_delete = [gene_id for gene_id in results_by_id]
    results, results_by_id = filter_results(results, results_by_id, overlaps,
                                            feature_by_id)

    #Update filtered results back to the options.hmm_results
    for gene_id in results_by_id:
        results_to_delete.remove(gene_id)
        if (prefix + gene_id) in options.hmm_results:
            options.hmm_results[(prefix + gene_id)] = results_by_id[gene_id]
    # whatever is left did not survive the filtering
    for gene_id in results_to_delete:
        if (prefix + gene_id) in options.hmm_results:
            del options.hmm_results[(prefix + gene_id)]

    #Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    #Rearrange hybrid clusters name in typedict alphabetically
    fix_hybrid_clusters_typedict(typedict)

    #Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    #Save final results to seq_record
    for cds in results_by_id.keys():
        feature = feature_by_id[cds]
        if typedict[cds] != "none":
            _update_sec_met_entry(feature, results_by_id[cds], typedict[cds],
                                  nseqdict)

    find_clusters(seq_record, rulesdict, overlaps)

    #Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    #Rearrange hybrid clusters name alphabetically
    fix_hybrid_clusters(seq_record)

    #Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    # Re-add the short CDSs
    seq_record.features.extend(short_cds_buffer)
    utils.sort_features(seq_record)

    #If all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)

    #Display %identity
    if options.enable_cdhit:
        store_percentage_identities(seq_record)
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Detect gene clusters on `seq_record` by running hmmsearch.

    The multi-FASTA of the record is searched with the 'bgc_seeds.hmm'
    library located next to this module.  An HSP is kept when its bitscore is
    above the cutoff of its signature profile; the profile is looked up by
    the HSP's query_id first and by the accession of the run result (text
    before the first '.') second.  The kept hits go through the filter
    functions, cluster rules are applied, the matching CDS features are
    updated and find_clusters() is run on the record.

    `seq_record` is modified in place; nothing is returned.
    """
    feature_by_id = utils.get_feature_dict(seq_record)
    full_fasta = utils.get_multifasta(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    sig_by_name = dict(
        (profile.name, profile) for profile in _signature_profiles)

    hmm_library = utils.get_full_path(__file__, 'bgc_seeds.hmm')
    runresults = utils.run_hmmsearch(hmm_library, full_fasta,
                                     use_tempfile=True)

    results = []
    results_by_id = {}
    for runresult in runresults:
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            if hsp.query_id in sig_by_name:
                sig = sig_by_name[hsp.query_id]
            elif acc in sig_by_name:
                sig = sig_by_name[acc]
            else:
                logging.error(
                    'BUG: Failed to find signature for ID %s / ACC %s',
                    hsp.query_id, acc)
                continue
            # Store result if it is above cut-off
            if hsp.bitscore <= sig.cutoff:
                continue
            results.append(hsp)
            results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    #Get overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    #Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id)

    # Filter results of overlapping genes (only for plants)
    if options.taxon == 'plants':
        results, results_by_id = filter_result_overlapping_genes(
            results, results_by_id, overlaps, feature_by_id)

    #Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    #Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    #Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    #Save final results to seq_record
    for gene_id in results_by_id.keys():
        _update_sec_met_entry(feature_by_id[gene_id], results_by_id[gene_id],
                              typedict[gene_id], nseqdict)

    find_clusters(seq_record, rulesdict)

    #Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    #Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    #If all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)