def test_run(self):
    """Run the GFF parser on every test sequence and check the CDS counts.

    After parsing, the first two records in ``self.sequences`` are expected
    to hold one and zero CDS features respectively.
    """
    for record in self.sequences:
        gff_parser.run(record, self.config)

    expected_result = (1, 0)
    detected_result = tuple(
        len(utils.get_cds_features(self.sequences[index]))
        for index in (0, 1))
    failure_msg = "\nResult : %s\nExpected : %s" % (detected_result,
                                                    expected_result)
    self.assertEqual(detected_result, expected_result, msg=failure_msg)
def fastaseqlengths(seq_record):
    """Return the protein length of every CDS in seq_record.

    The result is a dict keyed by ``utils.get_gene_acc(cds)``; each value is
    the length of ``str(utils.get_aa_sequence(cds))``.
    """
    lengths_by_accession = {}
    for cds_feature in utils.get_cds_features(seq_record):
        protein = str(utils.get_aa_sequence(cds_feature))
        lengths_by_accession[utils.get_gene_acc(cds_feature)] = len(protein)
    return lengths_by_accession
def find_nr_cds(clusterpositions, seq_record):
    """Count the CDSs touching a candidate cluster and snap its borders to them.

    A CDS counts when either of its ends lies inside the cluster or either
    cluster border lies inside the CDS.  Returns ``(positions, count)``.
    ``positions`` is ``[min CDS start, max CDS end]``, except that the input
    ``clusterpositions`` is returned unchanged when no CDS matches, when
    ``seq_record`` is None, or when the matching CDSs include both a start at
    0 and an end at ``len(seq_record.seq)``.
    """
    def touches_cluster(cds):
        cds_start = int(cds.location.start)
        cds_end = int(cds.location.end)
        return (clusterpositions[0] <= cds_start <= clusterpositions[1]
                or clusterpositions[0] <= cds_end <= clusterpositions[1]
                or cds_start <= clusterpositions[0] <= cds_end
                or cds_start <= clusterpositions[1] <= cds_end)

    overlapping = [cds for cds in utils.get_cds_features(seq_record)
                   if touches_cluster(cds)]
    if not overlapping:
        return clusterpositions, 0

    starts = [int(cds.location.start) for cds in overlapping]
    ends = [int(cds.location.end) for cds in overlapping]

    # Avoid getting the complete genome as cluster if one CDS starts at the
    # end and finishes at the start of the genome.
    keep_input_positions = seq_record is None or (
        0 in starts and len(seq_record.seq) in ends)
    if keep_input_positions:
        return clusterpositions, len(overlapping)
    return [min(starts), max(ends)], len(overlapping)
def generate_searchgtr_htmls(seq_records, options):
    """Write one SEARCHGTr submission form per matching CDS and record its link.

    For every CDS whose smCOG annotation is in ``gtrcoglist`` an HTML file
    ``<gene_id>_searchgtr.html`` is written to the ``html`` sub-folder of
    ``options.full_outputfolder_path`` (created on demand).  The file holds
    the first template part with "GlycTr" replaced by the gene id, then the
    gene id and protein sequence, then the second template part.

    ``options.searchgtr_links`` is reset and filled with
    ``"<record id>_<gene id>" -> relative link`` entries.
    """
    #Generate lists of COGs that are glycosyltransferases or transporters
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    searchgtrformtemplateparts = load_searchgtr_search_form_template()
    options.searchgtr_links = {}
    html_dir = options.full_outputfolder_path + os.sep + "html"
    for seq_record in seq_records:
        smcogdict, _ = utils.get_smcog_annotations(seq_record)
        for feature in utils.get_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            # A missing annotation yields None, which is never in the list.
            if smcogdict.get(gene_id) not in gtrcoglist:
                continue
            if not os.path.exists(html_dir):
                os.mkdir(html_dir)
            page_name = gene_id + "_searchgtr.html"
            formfileloc = html_dir + os.sep + page_name
            link_loc = "html" + os.sep + page_name
            options.searchgtr_links[seq_record.id + "_" + gene_id] = link_loc
            specificformtemplate = searchgtrformtemplateparts[0].replace(
                "GlycTr", gene_id)
            # The context manager closes the file even if a write fails.
            with open(formfileloc, "w") as formfile:
                formfile.write(specificformtemplate)
                formfile.write("%s\n%s" % (gene_id,
                                           utils.get_aa_sequence(feature)))
                formfile.write(searchgtrformtemplateparts[1])
def predict_class_from_gene_cluster(seq_record, cluster):
    '''Predict the lantipeptide class from the gene cluster.

    Collects the first word of every ';'-separated item in the
    "Domains detected:" sec_met entries of all CDS features lying completely
    inside ``cluster`` and maps the collected domain names to a class.

    Returns 'Class-I', 'Class-II', 'Class-III' or None when no known domain
    was found.
    '''
    prefix = 'Domains detected:'
    found_domains = []
    for feature in utils.get_cds_features(seq_record):
        if feature.location.start < cluster.location.start or \
           feature.location.end > cluster.location.end:
            continue
        if 'sec_met' not in feature.qualifiers:
            continue
        for entry in feature.qualifiers['sec_met']:
            if not entry.startswith(prefix):
                continue
            for domain in entry[len(prefix):].split(';'):
                words = domain.split()
                # An empty list or a trailing ';' yields an empty item; skip
                # it instead of failing on words[0].
                if words:
                    found_domains.append(words[0])

    if 'Lant_dehyd_N' in found_domains or 'Lant_dehyd_C' in found_domains:
        return 'Class-I'
    if 'DUF4135' in found_domains:
        return 'Class-II'
    if 'Pkinase' in found_domains:
        # this could be class 3 or class 4, but as nobody has seen class 4
        # in vivo yet, we'll ignore that
        return 'Class-III'

    # Ok, no biosynthetic enzymes found, let's try the prepeptide
    if 'Gallidermin' in found_domains:
        return 'Class-I'

    return None
def find_lan_a_features(seq_record, cluster):
    """Return the CDS features inside cluster that may encode a precursor.

    A CDS lying completely inside ``cluster`` is selected when its protein is
    shorter than 80 residues, or when the third word of its first
    "Domains detected:" sec_met entry is in ``known_precursor_domains``.
    """
    def is_candidate(feature):
        outside = (feature.location.start < cluster.location.start
                   or feature.location.end > cluster.location.end)
        if outside:
            return False
        if len(utils.get_aa_sequence(feature)) < 80:
            return True
        if 'sec_met' not in feature.qualifiers:
            return False
        first_domain = next(
            (entry.split()[2] for entry in feature.qualifiers['sec_met']
             if entry.startswith('Domains detected:')),
            None)
        return (first_domain is not None
                and first_domain in known_precursor_domains)

    return [feature for feature in utils.get_cds_features(seq_record)
            if is_candidate(feature)]
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the ClusterBlast cluster and protein tables for searchtype.

    A dict from ``utils.get_gene_acc(cds)`` to ``utils.get_gene_accession(cds)``
    is built for every CDS of ``seq_record`` and handed to
    ``load_geneclusterproteins``.  Returns ``(clusters, proteins)``.
    """
    accession_by_gene = {}
    for cds_feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(cds_feature)
        accession_by_gene[utils.get_gene_acc(cds_feature)] = accession
    return (load_geneclusters(searchtype),
            load_geneclusterproteins(accession_by_gene, searchtype))
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the ClusterBlast cluster table and the four protein tables.

    A dict from ``utils.get_gene_acc(cds)`` to ``utils.get_gene_accession(cds)``
    is built for every CDS of ``seq_record`` and handed to
    ``load_geneclusterproteins``.  Returns ``(clusters, proteinlocations,
    proteinstrands, proteinannotations, proteintags)``.
    """
    # The configuration is fetched but not read here; the call is kept as is.
    options = config.get_config()
    accession_by_gene = {}
    for cds_feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(cds_feature)
        accession_by_gene[utils.get_gene_acc(cds_feature)] = accession
    clusters = load_geneclusters(searchtype)
    protein_tables = load_geneclusterproteins(accession_by_gene, searchtype)
    proteinlocations, proteinstrands, proteinannotations, proteintags = protein_tables
    return (clusters, proteinlocations, proteinstrands, proteinannotations,
            proteintags)
def seq_record_convert_nucl_to_prot(seq_records, options):
    """Build one protein SeqRecord per CDS of the first nucleotide record.

    Only ``seq_records[0]`` is read.  Each protein record takes its sequence
    from the CDS 'translation' qualifier, its id and name from the 'product'
    qualifier and its description from the "NRPS/PKS subtype: " sec_met entry
    ("Unknown protein" when absent).  It carries CDS_motif features for

    * every "NRPS/PKS Domain: " sec_met entry, located at the "(start-end)"
      range given in that entry, and
    * every existing CDS_motif feature starting inside the CDS, with its
      location converted from nucleotide to protein coordinates.

    Returns the list of protein records.
    """
    seq_record = seq_records[0]
    cdsfeatures = utils.get_cds_features(seq_record)
    cdsmotifs = utils.get_all_features_of_type(seq_record, ["CDS_motif"])

    #Find corresponding cdsmotifs for each cdsfeature
    cdsmotifdict = {}
    for cdsfeature in cdsfeatures:
        product = cdsfeature.qualifiers['product'][0]
        for cdsmotif in cdsmotifs:
            if cdsfeature.location.start <= cdsmotif.location.start <= cdsfeature.location.end:
                cdsmotifdict.setdefault(product, []).append(cdsmotif)

    #For each cdsfeature, write a protein SeqRecord with CDS_motif features (abMotifs AND sec_met)
    prot_seq_records = []
    for cdsfeature in cdsfeatures:
        product = cdsfeature.qualifiers['product'][0]

        #Extract sec_met info from feature
        sec_met = cdsfeature.qualifiers.get('sec_met', [])
        subtypes = [qual for qual in sec_met if "NRPS/PKS subtype: " in qual]
        if subtypes:
            cds_description = subtypes[0].partition("NRPS/PKS subtype: ")[2]
        else:
            cds_description = "Unknown protein"
        cds_domains = [qual for qual in sec_met if "NRPS/PKS Domain: " in qual]

        #Create protein seq_record
        prot_seq_record = SeqRecord(Seq(cdsfeature.qualifiers['translation'][0], IUPAC.protein),
                                    id=product, name=product,
                                    description=cds_description)
        utils.fix_record_name_id(prot_seq_record, options)

        #Add CDS_motif features based on NRPS/PKS domains
        cdsmotif_features = []
        for cds_domain in cds_domains:
            domainstart, domainend = cds_domain.partition(" (")[2].partition("). ")[0].split("-")
            domainlocation = FeatureLocation(int(domainstart), int(domainend))
            domain_feature = SeqFeature(domainlocation, type="CDS_motif")
            domain_feature.qualifiers['note'] = [cds_domain]
            cdsmotif_features.append(domain_feature)

        #Add CDS_motif features based on NRPS/PKS abMotifs
        for cdsmotif in cdsmotifdict.get(product, []):
            oldstart, oldend = cdsmotif.location.start, cdsmotif.location.end
            # Floor division keeps the protein coordinates integral even
            # when "/" means true division.
            newstart = (oldstart - cdsfeature.location.start) // 3
            newend = (oldend - cdsfeature.location.start) // 3
            # NOTE(review): this rebinds the location on the motif object
            # itself; presumably that object is still referenced by the
            # nucleotide record -- confirm this is intended.
            cdsmotif.location = FeatureLocation(newstart, newend)
            cdsmotif_features.append(cdsmotif)

        prot_seq_record.features.extend(cdsmotif_features)
        prot_seq_records.append(prot_seq_record)
    return prot_seq_records
def find_col_id(geo_dataset, seq_records):
    """Find the column whose first-row value names a gene and store its index.

    ``geo_dataset["info"]["col_id"]`` is set to

    * 0 when the dataset type is "CSV",
    * otherwise the index ``i`` of the first ``data[0][i]`` (over the values
      of ``geo_dataset["data"]``) that equals, case-insensitively, the gene
      id of any CDS in ``seq_records``,
    * -1 when no such column exists.

    The same ``geo_dataset`` object is modified in place and returned.
    """
    info = geo_dataset["info"]
    if info["type"] == "CSV":
        info["col_id"] = 0
        return geo_dataset

    # Collect the upper-cased gene ids once instead of rescanning every
    # record for every cell.
    gene_ids = set()
    for seq_record in seq_records:
        for feature in utils.get_cds_features(seq_record):
            gene_ids.add(utils.get_gene_id(feature).upper())

    if gene_ids:
        for data in geo_dataset["data"].values():
            for i, cell in enumerate(data[0]):
                if cell.upper() in gene_ids:
                    info["col_id"] = i
                    return geo_dataset

    info["col_id"] = -1
    return geo_dataset
def getECs(seq_record, options):
    """Annotate the CDS features of seq_record with EFICAz EC predictions.

    Does nothing unless the module-level ``name`` is listed in
    ``options.ecpred``.  ``options.cpus`` defaults to 1 when unset.

    For every CDS:

    * a 4-digit prediction replaces any existing 'EC_number' qualifier
      (with a warning) and adds an explanatory note;
    * otherwise a 3-digit prediction is used; it replaces an existing
      'EC_number' only when that holds no 4-digit number, and a note is
      added when prediction details are available.

    The (possibly extended) note list is written back to the 'note'
    qualifier of every CDS.
    """
    logging.debug("Predicting EC numbers with EFICAz")

    if name not in options.ecpred:
        logging.debug("ECprediction %s not selected, returning...", name)
        return

    if 'cpus' not in options:
        options.cpus = 1

    EFICAzECs = EFICAzECPrediction(seq_record, options)
    EFICAzECs.runECpred()
    logging.debug("Found %s predictions for EC4",
                  len(EFICAzECs.getEC4Dict().keys()))

    for feature in utils.get_cds_features(seq_record):
        featureID = utils.get_gene_id(feature)
        notes = feature.qualifiers.get('note', [])

        # Look the predictions up once per feature.
        ec4 = EFICAzECs.getEC4(featureID)
        ec3 = EFICAzECs.getEC3(featureID)

        if ec4:
            logging.debug("Annotating %s", featureID)
            if 'EC_number' in feature.qualifiers:
                logging.warning('ECpredictor[eficaz]: Overwriting existing EC annotation: %s with %s',
                                ", ".join(feature.qualifiers['EC_number']),
                                ", ".join(ec4))
            feature.qualifiers['EC_number'] = ec4
            notes.append("EFICAz EC number prediction: EC4: {0}; {1}".format(
                ", ".join(ec4),
                "; ".join(EFICAzECs.getEC4Info(featureID))))

        # Only annotate 3 digit EC if no 4 digit EC is available
        if ec3 and not ec4:
            if 'EC_number' in feature.qualifiers:
                # Keep an existing annotation that already has four digits.
                if not re.search(r"\d+\.\d+\.\d+\.\d+",
                                 " ".join(feature.qualifiers['EC_number'])):
                    logging.warning('ECpredictor[eficaz]: Overwriting existing EC annotation: %s with %s',
                                    ", ".join(feature.qualifiers['EC_number']),
                                    ", ".join(ec3))
                    feature.qualifiers['EC_number'] = ec3
            if EFICAzECs.getEC3Info(featureID):
                notes.append("EFICAz EC number prediction: EC3: {0}; {1}".format(
                    ", ".join(ec3),
                    "; ".join(EFICAzECs.getEC3Info(featureID))))
            if 'EC_number' not in feature.qualifiers:
                feature.qualifiers['EC_number'] = ec3

        feature.qualifiers['note'] = notes

    logging.debug("Finished EC number prediction with EFICAz")
def find_nr_cds(clusterpositions, seq_record):
    """Count the CDSs touching a candidate cluster and snap its borders to them.

    A CDS counts when either of its ends lies inside the cluster or either
    cluster border lies inside the CDS.  Returns ``(positions, count)``;
    ``positions`` is ``[min CDS start, max CDS end]`` of the matching CDSs,
    or the unchanged ``clusterpositions`` (with count 0) when none match.
    """
    def touches_cluster(cds):
        cds_start = int(cds.location.start)
        cds_end = int(cds.location.end)
        return (clusterpositions[0] <= cds_start <= clusterpositions[1]
                or clusterpositions[0] <= cds_end <= clusterpositions[1]
                or cds_start <= clusterpositions[0] <= cds_end
                or cds_start <= clusterpositions[1] <= cds_end)

    overlapping = [cds for cds in utils.get_cds_features(seq_record)
                   if touches_cluster(cds)]
    if not overlapping:
        return clusterpositions, 0

    first_start = min([int(cds.location.start) for cds in overlapping])
    last_end = max([int(cds.location.end) for cds in overlapping])
    return [first_start, last_end], len(overlapping)
def _getMultiFastaList(self):
    """Return one FASTA-formatted string per translated CDS of self.seq_record.

    Each entry is ``">gene_id\\nsequence\\n"``.  Dashes are stripped from the
    'translation' qualifier, and CDSs whose translation is empty are logged
    and left out.
    """
    fasta_entries = []
    for cds_feature in utils.get_cds_features(self.seq_record):
        gene_id = utils.get_gene_id(cds_feature)
        protein = cds_feature.qualifiers['translation'][0]
        if "-" in str(protein):
            protein = Seq(str(protein).replace("-", ""), generic_protein)

        # Never write empty fasta entries
        if len(protein) != 0:
            fasta_entries.append(">%s\n%s\n" % (gene_id, protein))
        else:
            logging.debug("No translation for %s, skipping" % gene_id)
    return fasta_entries
def remove_irrelevant_allorfs(seq_record):
    """Prune redundant ORF features from seq_record.features in place.

    CDS features whose 'note' qualifier contains "auto-all-orf" are treated
    as automatically added ORFs; all other CDS features are "other" genes.

    * An auto ORF without a 'sec_met' qualifier is removed.
    * An auto ORF with 'sec_met' is removed when it overlaps another gene
      that also has 'sec_met'.
    * Otherwise the auto ORF is kept and every overlapping other gene
      (none of which has 'sec_met') is removed instead.
    """
    auto_orf_features = []
    other_features = []
    for feature in utils.get_cds_features(seq_record):
        if "auto-all-orf" in feature.qualifiers.get('note', []):
            auto_orf_features.append(feature)
        else:
            other_features.append(feature)

    to_delete = []
    for autofeature in auto_orf_features:
        if 'sec_met' not in autofeature.qualifiers:
            to_delete.append(autofeature)
            continue
        overlapping = [otherfeature for otherfeature in other_features
                       if overlaps(autofeature, otherfeature)]
        if any('sec_met' in otherfeature.qualifiers
               for otherfeature in overlapping):
            to_delete.append(autofeature)
        else:
            # No overlapping gene carries sec_met here, so all of them go.
            to_delete.extend(overlapping)

    # Slice assignment keeps the same list object, as index deletion did.
    seq_record.features[:] = [feature for feature in seq_record.features
                              if feature not in to_delete]
def find_flavoprotein(seq_record, cluster):
    """Look for an epiD-like flavoprotein responsible for aminovinylcystein.

    Returns True when a CDS lying completely inside ``cluster`` has a first
    "Domains detected:" sec_met entry whose third word is exactly
    'Flavoprotein', False otherwise.
    """
    for feature in utils.get_cds_features(seq_record):
        if feature.location.start < cluster.location.start or \
           feature.location.end > cluster.location.end:
            continue
        if 'sec_met' not in feature.qualifiers:
            continue
        domain = None
        for entry in feature.qualifiers['sec_met']:
            if entry.startswith('Domains detected:'):
                domain = entry.split()[2]
                break
        if domain is None:
            continue
        # Compare for equality: "domain in 'Flavoprotein'" is a substring
        # test and would also accept names such as 'F' or 'protein'.
        if domain == 'Flavoprotein':
            return True
    return False
def find_short_chain_dehydrogenase(seq_record, cluster):
    """Look for an eciO-like short-chain dehydrogenase responsible for N-terminal lactone.

    Returns True when a CDS lying completely inside ``cluster`` has a first
    "Domains detected:" sec_met entry whose third word is 'adh_short' or
    'adh_short_C2', False otherwise.
    """
    wanted_domains = ('adh_short', 'adh_short_C2')
    for feature in utils.get_cds_features(seq_record):
        outside_cluster = (feature.location.start < cluster.location.start
                           or feature.location.end > cluster.location.end)
        if outside_cluster or 'sec_met' not in feature.qualifiers:
            continue
        first_domain = next(
            (entry.split()[2] for entry in feature.qualifiers['sec_met']
             if entry.startswith('Domains detected:')),
            None)
        if first_domain is not None and first_domain in wanted_domains:
            return True
    return False
def find_p450_oxygenase(seq_record, cluster):
    """Look for a p450 oxygenase.

    Returns True when a CDS lying completely inside ``cluster`` has a first
    "Domains detected:" sec_met entry whose third word is exactly 'p450',
    False otherwise.
    """
    for feature in utils.get_cds_features(seq_record):
        if feature.location.start < cluster.location.start or \
           feature.location.end > cluster.location.end:
            continue
        if 'sec_met' not in feature.qualifiers:
            continue
        domain = None
        for entry in feature.qualifiers['sec_met']:
            if entry.startswith('Domains detected:'):
                domain = entry.split()[2]
                break
        if domain is None:
            continue
        # Compare for equality: "domain in 'p450'" is a substring test and
        # would also accept names such as 'p' or '45'.
        if domain == 'p450':
            return True
    return False
def find_clusters(seq_record, rulesdict, overlaps):
    """Detect gene clusters from the identified core genes and add them to seq_record.

    CDS features are visited in the order returned by
    ``utils.get_cds_features``.  Every CDS with a "Type: " sec_met entry
    either extends the most recent cluster or starts a new one.  Finished
    clusters are widened by their 'extension' value (clamped to the record),
    appended to ``seq_record.features`` as features of type "cluster", and
    the running cluster number is stored back in ``cfg.next_clusternr``.

    ``rulesdict[type][1]`` is used as cutoff and ``rulesdict[type][2]`` as
    extension of a cluster type.  ``overlaps[1]`` is indexed by gene id and
    its values are subtracted from each other to get a gene distance --
    presumably the gene's ordinal position; verify against
    ``utils.get_overlaps_table``.
    """
    #Function that detects the gene clusters based on the identified core genes
    features = utils.get_cds_features(seq_record)
    clustertype = ""  # not read again in this function
    clusters = []
    cfg = config.get_config()
    clusternr = cfg.next_clusternr
    last_cutoff = 0
    cluster_cds = []  # gene ids of the core genes in the current cluster

    for feature in features:
        within_cutoff = False
        if ('sec_met' in feature.qualifiers) and (len([
                feat for feat in feature.qualifiers['sec_met']
                if "Type: " in feat
        ]) > 0):
            feature_start = min(feature.location.start, feature.location.end)
            feature_end = max(feature.location.start, feature.location.end)
            # Text after "Type: " in the first matching entry; hybrid types
            # are '-'-separated.
            feature_type = [
                feat for feat in feature.qualifiers['sec_met']
                if "Type: " in feat
            ][0].partition("Type: ")[2]
            # A hybrid type takes the largest cutoff/extension of its parts.
            feature_cutoff = max(
                [rulesdict[value][1] for value in feature_type.split("-")])
            feature_extension = max(
                [rulesdict[value][2] for value in feature_type.split("-")])
            if (cfg.enable_dynamic_cutoff):
                multiply_cutoff = get_dynamic_cutoff_multiplier(
                    utils.get_gene_id(feature), overlaps)
                feature_cutoff = int(feature_cutoff * multiply_cutoff)
                feature_extension = int(feature_extension * multiply_cutoff)
            cluster = None

            if len(clusters) > 0:
                cluster = clusters[-1]
                cluster_start = cluster.location.start  # not read again
                cluster_end = cluster.location.end
                # Check cutoff
                cutoff = max(last_cutoff, feature_cutoff)
                within_cutoff = feature_start <= cluster_end + cutoff
                # Smallest gene distance to any core gene already in the
                # cluster, minus one, compared with the configured limit.
                within_gene_num_cutoff = (min([
                    abs(overlaps[1][utils.get_gene_id(feature)] -
                        overlaps[1][ncds]) for ncds in cluster_cds
                ]) - 1 <= cfg.gene_num_cutoff)
                if (cfg.gene_num_cutoff_only):
                    within_cutoff = within_gene_num_cutoff
                else:
                    within_cutoff = within_cutoff or within_gene_num_cutoff

            # Also taken for the very first core gene, because within_cutoff
            # stays False while no cluster exists.
            if not within_cutoff:
                if len(clusters) > 0:
                    # Finalize the last extended cluster
                    cluster = clusters[-1]
                    cluster.location = FeatureLocation(
                        max(
                            0, cluster.location.start -
                            cluster.qualifiers['extension'][0]),
                        min(
                            len(seq_record), cluster.location.end +
                            cluster.qualifiers['extension'][0]))
                # Create new cluster
                new_cluster = SeqFeature(FeatureLocation(
                    feature_start, feature_end), type="cluster")
                new_cluster.qualifiers['note'] = [
                    "Cluster number: " + str(clusternr)
                ]
                new_cluster.qualifiers['cutoff'] = [feature_cutoff]
                new_cluster.qualifiers['extension'] = [feature_extension]
                new_cluster.qualifiers['product'] = [feature_type]
                clusters.append(new_cluster)
                cluster = clusters[-1]
                cluster_cds = [utils.get_gene_id(feature)]
                clusternr += 1

            # Update cluster
            last_cutoff = feature_cutoff
            cluster.location = FeatureLocation(
                min(cluster.location.start, feature_start),
                max(cluster.location.end, feature_end))
            cluster.qualifiers['cutoff'] = [
                max(cluster.qualifiers['cutoff'][0], feature_cutoff)
            ]
            cluster.qualifiers['extension'] = [
                max(cluster.qualifiers['extension'][0], feature_extension)
            ]
            # Union of the type names seen so far; the join order is
            # whatever order the set iterates in.
            cluster.qualifiers['product'] = [
                "-".join(
                    list(
                        set(cluster.qualifiers['product'][0].split('-')) |
                        set(feature_type.split('-'))))
            ]
            # In a hybrid product the generic "other" part is dropped.
            if "-" in cluster.qualifiers['product'][0]:
                cluster.qualifiers['product'] = [
                    "-".join([
                        ct
                        for ct in cluster.qualifiers['product'][0].split('-')
                        if ct != "other"
                    ])
                ]
            if (utils.get_gene_id(feature) not in cluster_cds):
                cluster_cds.append(utils.get_gene_id(feature))

    if len(clusters) > 0:
        # Finalize the last extended cluster
        cluster = clusters[-1]
        cluster.location = FeatureLocation(
            max(0,
                cluster.location.start - cluster.qualifiers['extension'][0]),
            min(len(seq_record),
                cluster.location.end + cluster.qualifiers['extension'][0]))

    seq_record.features.extend(clusters)
    cfg.next_clusternr = clusternr
def find_clusters(seq_record, rulesdict):
    """Detect gene clusters from the identified core genes and add them to seq_record.

    CDS features are visited in the order returned by
    ``utils.get_cds_features``.  Every CDS whose "Type: " sec_met entry is
    not "none" either extends the most recent cluster or starts a new one.
    Finished clusters are widened by their 'extension' value (clamped to the
    record), get a 'contig_edge' qualifier, and are appended to
    ``seq_record.features`` as features of type "cluster".  The running
    cluster number is stored back in ``cfg.next_clusternr``.

    ``rulesdict[type][1]`` is used as cutoff and ``rulesdict[type][2]`` as
    extension of a cluster type.
    """
    #Function that detects the gene clusters based on the identified core genes
    features = utils.get_cds_features(seq_record)
    clusters = []
    cfg = config.get_config()
    clusternr = cfg.next_clusternr

    for feature in features:
        within_cutoff = False

        # Skip CDSs without a "Type: " sec_met entry.
        if ('sec_met' not in feature.qualifiers) or (len([
                feat for feat in feature.qualifiers['sec_met']
                if "Type: " in feat
        ]) <= 0):
            continue

        feature_start = min(feature.location.start, feature.location.end)
        feature_end = max(feature.location.start, feature.location.end)
        # Text after "Type: " in the first matching entry; hybrid types are
        # '-'-separated.
        feature_type = [
            feat for feat in feature.qualifiers['sec_met']
            if "Type: " in feat
        ][0].partition("Type: ")[2]
        if feature_type == "none":
            continue

        # A hybrid type takes the largest cutoff/extension of its parts.
        feature_cutoff = max(
            [rulesdict[value][1] for value in feature_type.split("-")])
        feature_extension = max(
            [rulesdict[value][2] for value in feature_type.split("-")])
        cluster = None

        if len(clusters) > 0:
            cluster = clusters[-1]
            cluster_end = cluster.location.end
            # Check cutoff
            cutoff = max(cluster.qualifiers['cutoff'][0], feature_cutoff)
            # The allowed gap is at least the sum of both extensions.
            cutoff = max(
                cutoff, cluster.qualifiers['extension'][0] + feature_extension)
            within_cutoff = feature_start <= cluster_end + cutoff

        # Also taken for the very first core gene, because within_cutoff
        # stays False while no cluster exists.
        if not within_cutoff:
            if len(clusters) > 0:
                # Finalize the last extended cluster
                cluster = clusters[-1]
                cluster.location = FeatureLocation(
                    max(
                        0, cluster.location.start -
                        cluster.qualifiers['extension'][0]),
                    min(
                        len(seq_record), cluster.location.end +
                        cluster.qualifiers['extension'][0]))
            # Create new cluster
            new_cluster = SeqFeature(FeatureLocation(feature_start,
                                                     feature_end),
                                     type="cluster")
            new_cluster.qualifiers['note'] = [
                "Cluster number: " + str(clusternr)
            ]
            new_cluster.qualifiers['cutoff'] = [feature_cutoff]
            new_cluster.qualifiers['extension'] = [feature_extension]
            new_cluster.qualifiers['product'] = [feature_type]
            clusters.append(new_cluster)
            cluster = clusters[-1]
            clusternr += 1

        # Update cluster
        cluster.location = FeatureLocation(
            min(cluster.location.start, feature_start),
            max(cluster.location.end, feature_end))
        cluster.qualifiers['cutoff'] = [
            max(cluster.qualifiers['cutoff'][0], feature_cutoff)
        ]
        cluster.qualifiers['extension'] = [
            max(cluster.qualifiers['extension'][0], feature_extension)
        ]
        # Union of the type names seen so far; the join order is whatever
        # order the set iterates in.
        cluster.qualifiers['product'] = [
            "-".join(
                list(
                    set(cluster.qualifiers['product'][0].split('-')) |
                    set(feature_type.split('-'))))
        ]
        # In a hybrid product the generic "other" part is dropped.
        if "-" in cluster.qualifiers['product'][0]:
            cluster.qualifiers['product'] = [
                "-".join([
                    ct for ct in cluster.qualifiers['product'][0].split('-')
                    if ct != "other"
                ])
            ]

    if len(clusters) > 0:
        # Finalize the last extended cluster
        cluster = clusters[-1]
        cluster.location = FeatureLocation(
            max(0,
                cluster.location.start - cluster.qualifiers['extension'][0]),
            min(len(seq_record),
                cluster.location.end + cluster.qualifiers['extension'][0]))

    for cluster in clusters:
        #Add a note to specify whether a cluster lies on the contig/scaffold edge or not
        # NOTE(review): stored as a bare string, unlike the list-valued
        # qualifiers set above -- confirm readers expect that.
        if cluster.location.start == 0 or cluster.location.end == len(
                seq_record):
            cluster.qualifiers['contig_edge'] = "True"
        else:
            cluster.qualifiers['contig_edge'] = "False"

    seq_record.features.extend(clusters)
    cfg.next_clusternr = clusternr
def main():
    """Run a mock-up detection pass over a list of genomes and dump statistics.

    ``sys.argv[1]`` names a text file with one genome path per line (blank
    lines are ignored).  For every genome the HMM hits stored in
    ``options.hmm_results`` are filtered, clusters are detected and analysed,
    and per-record counters are collected.  The collected object is written
    to ``result.js`` in the working directory as ``var result = <json>;``.
    """
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files; the list file is closed as soon as it has been read
    files = []
    with open(sys.argv[1], 'r') as list_file:
        for line in list_file:
            entry = line.replace("\n", "")
            if entry:
                files.append(path.expanduser(entry))

    # mockup antismash run per files
    for i, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        print("Processing %s... (%d/%d)" % (fpath, i, len(files)))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(options)  #To-DO: get antismash logging to works!

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        for seq_record in seq_records:
            if options.input_type == 'nucl':
                # NOTE(review): rebinding seq_records here does not change
                # what this loop iterates over, so records of 1000 bp or
                # less are still processed unless every record is that
                # short -- confirm the intent.
                seq_records = [
                    record for record in seq_records
                    if len(record.seq) > 1000
                ]
                if len(seq_records) < 1:
                    continue
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for feature in utils.get_cds_features(seq_record):
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix + gene_id]
                    results.extend(results_by_id[gene_id])

            # ignore short aa's: temporarily take CDSs without hits and with
            # a short translation out of the record.  The kept list is built
            # separately because removing from a list while iterating over
            # it skips the element after each removal.
            min_length_aa = 100
            short_cds_buffer = []
            kept_features = []
            for f in seq_record.features:
                if (f.type == "CDS"
                        and len(f.qualifiers['translation'][0]) < min_length_aa
                        and utils.get_gene_id(f) not in results_by_id):
                    short_cds_buffer.append(f)
                else:
                    kept_features.append(f)
            seq_record.features[:] = kept_features

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)

            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            stats = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }
            res_object[fpath][seq_record.id] = stats

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    stats["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    stats["per_hits"][domain_name] = (
                        stats["per_hits"].get(domain_name, 0) + 1)

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            seq_record.features.extend(short_cds_buffer)
            stats["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            #Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            #before writing to output, remove all hmm_detection's subdir prefixes from clustertype
            for cluster in utils.get_cluster_features(seq_record):
                cluster.qualifiers['product'] = [
                    "-".join([name.split('/')[-1] for name in prod.split('-')])
                    for prod in cluster.qualifiers['product']
                ]
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [
                                ct.split('/')[-1]
                                for ct in row.split('Type: ')[-1].split('-')
                            ]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                # Keep the "(E-value ...)" tail and strip the
                                # path prefix from the domain name only.
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split('/')[-1]
                                    + ' (E-value'
                                    + cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            #on plants, remove plant clustertype from hybrid types, and replace single
            #plant clustertype with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(
                                set(row.split('Type: ')[-1].split('-')))
                            if (len(clustertypes) > 1) and ("plant" in clustertypes):
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find largest cdhit number & largest domain diversity in a cluster
            # (the "average_*" entries hold the median over all clusters)
            stats["average_cdhit"] = 0
            stats["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                stats["cluster_types"][cluster_type] = (
                    stats["cluster_types"].get(cluster_type, 0) + 1)
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(
                    utils.get_cluster_domains(cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > stats["largest_cdhit"]:
                    stats["largest_cdhit"] = num_cdhit
                if num_domain > stats["largest_domain_variations"]:
                    stats["largest_domain_variations"] = num_domain
            if len(cdhit_numbers) > 0:
                stats["average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                stats["average_domain_variations"] = numpy.median(
                    domain_numbers)

    with open('result.js', 'w') as h:
        h.write('var result = %s;' % json.dumps(res_object, indent=4))
def get_targetGenomeInfo(seq_records, options):
    """Collect EC numbers and products per locus tag and write a protein FASTA.

    Every CDS of every record is visited.  A CDS without a 'locus_tag'
    qualifier gets one assigned: its gene id, or a generated
    ``automodelorfNNNNN`` name when the gene id is "no_tag_found".  CDSs with
    a 'translation' qualifier are written as ``>locus_tag`` FASTA entries to
    ``targetGenome_locusTag_aaSeq.fa`` inside ``options.metabolicmodeldir``.

    Returns ``(ec_dict, product_dict, fasta_filename)`` where ``ec_dict`` maps
    locus tags to their 'EC_number' lists and ``product_dict`` maps locus
    tags to their first 'product' value.
    """
    targetFastaFilename = options.metabolicmodeldir + os.sep + 'targetGenome_locusTag_aaSeq.fa'

    targetGenome_locusTag_ec_dict = {}
    targetGenome_locusTag_prod_dict = {}
    counter_for_temp_locusTags = 1

    # The context manager closes the FASTA file even if a record fails.
    with open(targetFastaFilename, 'w') as fp:
        # Reads GenBank file
        for seq_record in seq_records:
            logging.debug(
                '[MetabolicModeling] processing sequence id "%s" out of %s sequences',
                seq_record.id, len(seq_records))
            for feature in utils.get_cds_features(seq_record):
                qualifiers = feature.qualifiers

                # Retrieving "locus_tag (i.e., ORF name)" for each CDS
                locusTag = qualifiers.get('locus_tag', ['-'])[0]
                logging.debug("Found locus_tag %s for feature %s", locusTag,
                              utils.get_gene_id(feature))

                # Assign own locus tag, if not set:
                if locusTag == "-":
                    if not utils.get_gene_id(feature) == "no_tag_found":
                        locusTag = utils.get_gene_id(feature)
                    else:
                        locusTag = "automodelorf{0:05d}".format(
                            counter_for_temp_locusTags)
                    qualifiers['locus_tag'] = [locusTag]
                    logging.debug("replaced locus tag to %s for %s.",
                                  locusTag, utils.get_gene_id(feature))
                    counter_for_temp_locusTags += 1

                # Note that the numbers of CDS and "translation" do not match.
                # There are occasions that CDS does not have "translation".
                if 'translation' in qualifiers:
                    # Retrieving "translation (i.e., amino acid sequences)" for each CDS
                    fp.write('>%s\n%s\n' % (str(locusTag),
                                            str(qualifiers['translation'][0])))

                # Used to find "and" relationship in the GPR association
                if 'product' in qualifiers:
                    targetGenome_locusTag_prod_dict[locusTag] = qualifiers['product'][0]

                # Watch multiple EC_number's
                if 'EC_number' in qualifiers:
                    targetGenome_locusTag_ec_dict[locusTag] = qualifiers['EC_number']

    # Check if the gbk file has EC_number
    # Additional conditions should be given upon setup of in-house EC_number assigner
    logging.debug("len(targetGenome_locusTag_ec_dict.keys):")
    logging.debug(len(targetGenome_locusTag_ec_dict))
    logging.debug("len(targetGenome_locusTag_prod_dict.keys):")
    logging.debug(len(targetGenome_locusTag_prod_dict))

    return targetGenome_locusTag_ec_dict, targetGenome_locusTag_prod_dict, targetFastaFilename
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Function to be executed by module.

    Takes the HMM hits stored for this record in ``options.hmm_results``
    (keyed "<record id with ':' replaced by '_'>:<gene id>"), filters them,
    writes the filtered hits back to ``options.hmm_results``, applies the
    cluster rules, annotates the CDS features and adds cluster features to
    ``seq_record``.

    When ``options.ignore_short_aa`` is set, CDSs without hits whose
    translation is shorter than 50 residues (100 for eukaryotes) are taken
    out of the record during detection and re-added afterwards.
    """
    logging.info('Detecting gene clusters using HMM library')
    feature_by_id = utils.get_feature_dict(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    results = []
    results_by_id = {}
    # NOTE(review): sig_by_name is filled but not read again in this
    # function; kept so get_sig_profiles() is still called.
    sig_by_name = {}
    for sig in get_sig_profiles():
        sig_by_name[sig.name] = sig

    prefix = "%s:" % seq_record.id.replace(":", "_")
    for feature in utils.get_cds_features(seq_record):
        gene_id = utils.get_gene_id(feature)
        if (prefix + gene_id) in options.hmm_results:
            results_by_id[gene_id] = options.hmm_results[prefix + gene_id]
            results.extend(results_by_id[gene_id])

    short_cds_buffer = []
    if options.ignore_short_aa:
        # Temporarily filter out cds with < prot_min_length AA length
        min_length_aa = 50
        if options.eukaryotic:
            min_length_aa = 100
        # Build the kept list separately: removing from a list while
        # iterating over it skips the element after each removal.
        kept_features = []
        for f in seq_record.features:
            if (f.type == "CDS"
                    and len(f.qualifiers['translation'][0]) < min_length_aa
                    and utils.get_gene_id(f) not in results_by_id):
                short_cds_buffer.append(f)
            else:
                kept_features.append(f)
        seq_record.features[:] = kept_features

    #Get overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    #Filter results by comparing scores of different models (for PKS systems)
    ids_before_filter = list(results_by_id)
    results, results_by_id = filter_results(results, results_by_id, overlaps,
                                            feature_by_id)

    #Update filtered results back to the options.hmm_results
    for gene_id in ids_before_filter:
        key = prefix + gene_id
        if key not in options.hmm_results:
            continue
        if gene_id in results_by_id:
            options.hmm_results[key] = results_by_id[gene_id]
        else:
            # The gene lost all of its hits in the filter step.
            del options.hmm_results[key]

    #Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    #Rearrange hybrid clusters name in typedict alphabetically
    fix_hybrid_clusters_typedict(typedict)

    #Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    #Save final results to seq_record
    for cds in results_by_id.keys():
        feature = feature_by_id[cds]
        if typedict[cds] != "none":
            _update_sec_met_entry(feature, results_by_id[cds], typedict[cds],
                                  nseqdict)

    find_clusters(seq_record, rulesdict, overlaps)

    #Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record,
                                 nseqdict)

    #Rearrange hybrid clusters name alphabetically
    fix_hybrid_clusters(seq_record)

    #Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    # Re-add the short CDSs
    seq_record.features.extend(short_cds_buffer)
    utils.sort_features(seq_record)

    #If all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)

    #Display %identity
    if options.enable_cdhit:
        store_percentage_identities(seq_record)
def test_get_cds_features(self):
    """Test utils.get_cds_features().

    The helper must return the same list as asking
    ``utils.get_all_features_of_type`` for "CDS" features of ``self.rec``.
    """
    cds_features = utils.get_cds_features(self.rec)
    self.assertListEqual(
        cds_features, utils.get_all_features_of_type(self.rec, "CDS"))