def __init__(self, seq_record, options):
    """Initialize ASF object.

    Fills in ``options.activeSiteFinderConf`` and
    ``options.activeSiteFinderHMMDir`` when they are missing, parses the XML
    configuration file and stores the parsed tree on the instance.

    Arguments:
        seq_record: sequence record to work on; stored unchanged.
        options: configuration object supporting ``in`` checks and attribute
            assignment.

    Exits the process with status 1 when the configuration file cannot be
    read or parsed.
    """
    # Set options
    if 'activeSiteFinderConf' not in options:
        options.activeSiteFinderConf = path.join(
            utils.get_full_path(__file__, ''), "config", "SignatureResources.xml")
    if 'activeSiteFinderHMMDir' not in options:
        options.activeSiteFinderHMMDir = path.join(
            utils.get_full_path(__file__, ''), "hmm")

    # Assign variables
    try:
        XMLtree = ET.parse(options.activeSiteFinderConf)
    except (ET.ParseError, IOError):
        # An unreadable or missing file raises IOError rather than
        # ParseError; the log message promises to cover both cases.
        logging.exception("Could not load/parse ActiveSiteFinder configuration file %s.",
                          options.activeSiteFinderConf)
        sys.exit(1)

    XMLroot = XMLtree.getroot()
    HmmProfilesFilenameObj = XMLroot.findall(".//Execute/database")

    self.seq_record = seq_record
    self.options = options
    self.XMLtree = XMLtree
    self.XMLroot = XMLroot
    self.HmmProfilesFilenameObj = HmmProfilesFilenameObj
def check_prereqs():
    """Check binaries, rule files and the combined HMM file.

    Verifies that every non-optional entry of ``_required_binaries`` can be
    located, that ``hmmdetails.txt`` and ``cluster_rules.txt`` have exactly
    three tabs per line, builds the combined HMM file from the profiles named
    in ``hmmdetails.txt`` if it is missing, and runs hmmpress when a pressed
    file is missing.

    Returns:
        A list of human-readable failure messages; empty when all is well.
    """
    failure_messages = []
    for binary_name, optional in _required_binaries:
        if utils.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate executable for %r" % binary_name)

    hmm_files = []
    # Check if hmmdetails.txt is readable and well-formatted.
    # enumerate() keeps the reported line number correct even after a
    # misformatted line was skipped.
    with open(utils.get_full_path(__file__, "hmmdetails.txt"), "r") as details:
        for lineno, line in enumerate(details, 1):
            if line.count("\t") != 3:
                failure_messages.append(
                    "Failed to use HMM profile from line %s due to misformatting:\n %r" % (lineno, line))
                continue
            hmm_files.append(line.split('\t')[3].strip())

    # Check if cluster_rules.txt is readable and well-formatted
    with open(utils.get_full_path(__file__, "cluster_rules.txt"), "r") as rules:
        for lineno, line in enumerate(rules, 1):
            if line.count("\t") != 3:
                failure_messages.append(
                    "Failed to use cluster rules from the line %s due to misformatting:\n %r" % (lineno, line))

    hmm = utils.get_full_path(__file__, _markov_model)
    if utils.locate_file(hmm) is None:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(utils.get_full_path(__file__, hmm_file), 'r') as handle:
                        all_hmms_handle.write(handle.read())
        except (IOError, OSError):
            # open() raises IOError on Python 2, which is not an OSError there
            failure_messages.append('Failed to generate file {!r}'.format(hmm))

    # Press the HMM file once, as soon as the first pressed file is missing
    for ext in _binary_extensions:
        binary = "{}{}".format(hmm, ext)
        if utils.locate_file(binary) is None:
            _, err, retcode = utils.run_hmmpress(hmm)
            if retcode != 0:
                failure_messages.append('Failed to hmmpress {!r}: {!r}'.format(
                    hmm, err))
            break

    return failure_messages
def load_geneclusterproteins(accessiondict, searchtype):
    """Load the protein entries of a gene cluster database into memory.

    Arguments:
        accessiondict: container of locus tags; tags found in it get the
            prefix ``h_``.
        searchtype: one of ``"general"``, ``"subclusters"`` or
            ``"knownclusters"``; selects the FASTA file that is read.

    Returns:
        A dict mapping protein name (header field 6) to a ``Protein`` built
        from the ``|``-separated FASTA header fields.

    Raises:
        ValueError: if ``searchtype`` is not one of the supported values.
    """
    options = config.get_config()
    if 'clusterblastdir' not in options:
        options.clusterblastdir = path.dirname(
            utils.get_full_path(__file__, ''))
    # Both the default and the preconfigured case place the sibling
    # databases two levels above this module's directory.
    parent_dir = path.dirname(path.dirname(utils.get_full_path(__file__, '')))
    options.subclusterblastdir = path.join(parent_dir, 'subclusterblast')
    options.knownclusterblastdir = path.join(parent_dir, 'knownclusterblast')

    # Load gene cluster database proteins info into memory
    databases = {
        "general": ("ClusterBlast", options.clusterblastdir,
                    "geneclusterprots.fasta"),
        "subclusters": ("SubClusterBlast", options.subclusterblastdir,
                        "subclusterprots.fasta"),
        "knownclusters": ("KnownClusterBlast", options.knownclusterblastdir,
                          "knownclusterprots.fasta"),
    }
    if searchtype not in databases:
        raise ValueError("Unknown searchtype %r" % (searchtype,))
    label, directory, filename = databases[searchtype]
    logging.debug(label + ": Loading gene cluster database proteins into "
                  "memory...")
    gclusterprotsfile = path.join(directory, filename)

    proteins = {}
    with open(gclusterprotsfile, 'r') as handle:
        for line in handle:
            line = line.rstrip("\n")
            # only FASTA header lines carry the annotation fields
            if not line or line[0] != ">":
                continue
            tabs = line.split("|")
            locustag = tabs[4]
            if locustag in accessiondict:
                locustag = "h_" + locustag
            location = tabs[2]
            strand = tabs[3]
            annotations = tabs[5]
            name = tabs[6]
            proteins[name] = Protein(name, locustag, location, strand, annotations)
    return proteins
def load_geneclusters(searchtype):
    """Load a gene cluster database into memory.

    Arguments:
        searchtype: one of ``"general"``, ``"subclusters"`` or
            ``"knownclusters"``; selects the tab-separated file that is read.

    Returns:
        A dict mapping ``"<accession>_<clusternr>"`` to the list
        ``[clusterprots, clusterdescription, clustertype, clustertags]``.

    Raises:
        ValueError: if ``searchtype`` is not one of the supported values.
    """
    options = config.get_config()
    if 'clusterblastdir' not in options:
        options.clusterblastdir = path.dirname(
            utils.get_full_path(__file__, ''))
    # Both the default and the preconfigured case place the sibling
    # databases two levels above this module's directory.
    parent_dir = path.dirname(path.dirname(utils.get_full_path(__file__, '')))
    options.subclusterblastdir = path.join(parent_dir, 'subclusterblast')
    options.knownclusterblastdir = path.join(parent_dir, 'knownclusterblast')

    databases = {
        "general": ("ClusterBlast", options.clusterblastdir,
                    "geneclusters.txt"),
        "subclusters": ("SubClusterBlast", options.subclusterblastdir,
                        "subclusters.txt"),
        "knownclusters": ("KnownClusterBlast", options.knownclusterblastdir,
                          "knownclusters.txt"),
    }
    if searchtype not in databases:
        raise ValueError("Unknown searchtype %r" % (searchtype,))
    label, directory, filename = databases[searchtype]
    logging.debug(
        label + ": Loading gene clusters database into memory...")

    # Read everything at once and close the file straight away
    with open(path.join(directory, filename), "r") as handle:
        filetext = handle.read()

    clusters = {}
    for line in filetext.split("\n"):
        # lines without a tab carry no cluster record
        if "\t" not in line:
            continue
        tabs = line.split("\t")
        accession = tabs[0]
        clusterdescription = tabs[1]
        clusternr = tabs[2]
        clustertype = tabs[3]
        clustername = accession + "_" + clusternr
        clustertags = tabs[4].split(";")
        clusterprots = tabs[5].split(";")
        clusters[clustername] = [
            clusterprots, clusterdescription, clustertype, clustertags
        ]
    return clusters
def load_geneclusterproteins(accessiondict, searchtype):
    """Load the protein entries of a gene cluster database into memory.

    Arguments:
        accessiondict: container of locus tags; tags found in it get the
            prefix ``h_``.
        searchtype: one of ``"general"``, ``"subclusters"`` or
            ``"knownclusters"``. For ``"general"`` the plant database is read
            when ``options.taxon == "plants"``.

    Returns:
        A 4-tuple of dicts keyed by protein name (header field 6):
        ``(proteinlocations, proteinstrands, proteinannotations, proteintags)``.

    Raises:
        ValueError: if ``searchtype`` is not one of the supported values.
    """
    options = config.get_config()
    if 'clusterblastdir' not in options:
        options.clusterblastdir = path.dirname(utils.get_full_path(__file__, ''))
    # Both the default and the preconfigured case place the sibling
    # databases two levels above this module's directory.
    parent_dir = path.dirname(path.dirname(utils.get_full_path(__file__, '')))
    options.subclusterblastdir = path.join(parent_dir, 'subclusterblast')
    options.knownclusterblastdir = path.join(parent_dir, 'knownclusterblast')

    # Load gene cluster database proteins info into memory
    if searchtype == "general":
        label = "ClusterBlast"
        directory = options.clusterblastdir
        # options.taxon is only consulted for the general search
        if options.taxon == "plants":
            filename = "plantgeneclusterprots.fasta"
        else:
            filename = "geneclusterprots.fasta"
    elif searchtype == "subclusters":
        label = "SubClusterBlast"
        directory = options.subclusterblastdir
        filename = "subclusterprots.fasta"
    elif searchtype == "knownclusters":
        label = "KnownClusterBlast"
        directory = options.knownclusterblastdir
        filename = "knownclusterprots.fasta"
    else:
        raise ValueError("Unknown searchtype %r" % (searchtype,))
    logging.info(label + ": Loading gene cluster database proteins into "
                 "memory...")

    with open(path.join(directory, filename), "r") as handle:
        filetext = handle.read()
    # treat carriage returns as line breaks as well
    lines = filetext.replace("\r", "\n").split("\n")

    proteinlocations = {}
    proteinstrands = {}
    proteinannotations = {}
    proteintags = {}
    for line in lines:
        # only FASTA header lines carry the annotation fields
        if not line or line[0] != ">":
            continue
        tabs = line.split("|")
        protein = tabs[6]
        locustag = tabs[4]
        if locustag in accessiondict:
            locustag = "h_" + locustag
        proteintags[protein] = locustag
        proteinlocations[protein] = tabs[2]
        proteinstrands[protein] = tabs[3]
        proteinannotations[protein] = tabs[5]
    return proteinlocations, proteinstrands, proteinannotations, proteintags
def test_labyrinthopeptin(self):
    """Test lantipeptide prediction for labyrinthopeptin

    The record starts with 7 features and is expected to hold 11 after
    the analysis has run.
    """
    gbk_path = utils.get_full_path(__file__, 'labyrinthopeptin.gbk')
    record = seqio.read(gbk_path)
    self.assertEqual(7, len(record.features))

    specific_analysis(record, None)
    self.assertEqual(11, len(record.features))
def test_nisin(self):
    """Test lantipeptide prediction for nisin A

    Checks the feature count before and after the analysis, then the
    predicted masses, bridges, sequences and class of the single
    core/leader peptide pair in cluster 1.
    """
    gbk_path = utils.get_full_path(__file__, 'nisin.gbk')
    record = seqio.read(gbk_path)
    self.assertEqual(38, len(record.features))
    specific_analysis(record, None)
    self.assertEqual(40, len(record.features))

    core_hits = h._find_core_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(core_hits))
    core = core_hits[0]

    leader_hits = h._find_leader_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(leader_hits))
    leader_feature = leader_hits[0]

    # real monoisotopic mass is 3351.51, but we overpredict a Dha
    self.assertAlmostEqual(3333.6, h._get_monoisotopic_mass(core))
    # real mw is 3354.5, see above
    self.assertAlmostEqual(3336.0, h._get_molecular_weight(core))
    self.assertEqual([3354.0, 3372.1, 3390.1, 3408.1],
                     h._get_alternative_weights(core))
    self.assertEqual(5, h._get_number_bridges(core))
    self.assertEqual("MSTKDFNLDLVSVSKKDSGASPR",
                     h._get_leader_peptide_sequence(leader_feature))
    self.assertEqual("ITSISLCTPGCKTGALMGCNMKTATCHCSIHVSK",
                     h._get_core_peptide_sequence(core))
    self.assertEqual('Class I', h._get_core_peptide_class(core))
def perform_docking_domain_analysis(options, clusterpksgenes, genecluster, seq_record, pksnrpsvars):
    """Predict the PKS gene order of a cluster from docking domain residues.

    Extracts N- and C-terminal residues inside a temporary working
    directory, ranks all possible gene orders, writes the ranking to HTML
    and records the cluster in ``pksnrpsvars.dockingdomainanalysis``.

    Returns:
        The first entry of the ranked gene orders.
    """
    features = utils.get_feature_dict(seq_record)
    first_gene, last_gene = find_first_and_last_genes(
        clusterpksgenes, pksnrpsvars.domainnamesdict)

    # The residue extraction runs with a temporary directory as cwd
    with TemporaryDirectory(change=True):
        data_dir = utils.get_full_path(__file__, "docking_analysis")
        nterm_residues = extract_nterminus(
            data_dir, clusterpksgenes, seq_record, first_gene, features)
        cterm_residues = extract_cterminus(
            data_dir, clusterpksgenes, seq_record, last_gene, features)

    candidate_orders = find_possible_orders(clusterpksgenes, first_gene, last_gene)
    ranked_orders, order_scores = rank_biosynthetic_orders(
        nterm_residues, cterm_residues, first_gene, last_gene, candidate_orders)

    # Write html outfile with docking domain analysis output
    write_gene_orders_to_html(options, ranked_orders, order_scores,
                              genecluster, first_gene, last_gene)
    pksnrpsvars.dockingdomainanalysis.append(genecluster)
    return ranked_orders[0]
def test_microbisporicin(self):
    """Test lantipeptide prediction for microbisporicin

    Checks the feature count before and after the analysis, then the
    predicted properties of the single core/leader peptide pair in
    cluster 1, including the extra modifications.
    """
    gbk_path = utils.get_full_path(__file__, 'microbisporicin.gbk')
    record = seqio.read(gbk_path)
    self.assertEqual(56, len(record.features))
    specific_analysis(record, None)
    self.assertEqual(58, len(record.features))

    core_hits = h._find_core_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(core_hits))
    core = core_hits[0]

    leader_hits = h._find_leader_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(leader_hits))
    leader_feature = leader_hits[0]

    # NOTE: this is not the correct weight for microbisporicin
    # there are some additional modifications we do not predict yet
    self.assertAlmostEqual(2212.9, h._get_monoisotopic_mass(core))
    self.assertAlmostEqual(2214.5, h._get_molecular_weight(core))
    self.assertEqual(4, h._get_number_bridges(core))
    self.assertEqual("MPADILETRTSETEDLLDLDLSIGVEEITAGPA",
                     h._get_leader_peptide_sequence(leader_feature))
    self.assertEqual("VTSWSLCTPGCTSPGGGSNCSFCC",
                     h._get_core_peptide_sequence(core))
    self.assertEqual('Class I', h._get_core_peptide_class(core))
    self.assertEqual(['AviCys', 'Cl', 'OH'],
                     h._get_core_peptide_extra_modifications(core))
def check_prereqs(options):
    """Check if all required applications are around

    Also checks that every HMM file in ``_markov_models`` exists and runs
    hmmpress on it when one of its pressed files is missing.

    Returns:
        A list of failure messages; empty when nothing is wrong.
    """
    failure_messages = [
        "Failed to locate file: %r" % binary_name
        for binary_name, optional in _required_binaries
        if utils.locate_executable(binary_name) is None and not optional
    ]

    for model_name in _markov_models:
        hmm = utils.get_full_path(__file__, model_name)
        if utils.locate_file(hmm) is None:
            failure_messages.append("Failed to locate file %r" % hmm)
            continue

        # all() stops at the first missing pressed file, so the files are
        # probed in the same order and no further than before
        pressed = all(utils.locate_file("%s%s" % (hmm, ext)) is not None
                      for ext in _binary_extensions)
        if pressed:
            continue

        try:
            _, err, retcode = utils.execute(['hmmpress', hmm])
        except OSError as e:
            retcode = 1
            err = str(e)
        if retcode != 0:
            failure_messages.append("Failed to hmmpress %r: %r" % (hmm, err))

    return failure_messages
def test_epicidin(self):
    """Test lantipeptide prediction for epicidin 280

    Checks the feature count before and after the analysis, then the
    predicted properties of the single core/leader peptide pair in
    cluster 1, including the extra modifications.
    """
    gbk_path = utils.get_full_path(__file__, 'epicidin_280.gbk')
    record = seqio.read(gbk_path)
    self.assertEqual(21, len(record.features))
    specific_analysis(record, None)
    self.assertEqual(23, len(record.features))

    core_hits = h._find_core_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(core_hits))
    core = core_hits[0]

    leader_hits = h._find_leader_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(leader_hits))
    leader_feature = leader_hits[0]

    self.assertAlmostEqual(3115.7, h._get_monoisotopic_mass(core))
    self.assertAlmostEqual(3117.7, h._get_molecular_weight(core))
    self.assertEqual([3135.7, 3153.7, 3171.7],
                     h._get_alternative_weights(core))
    self.assertEqual(3, h._get_number_bridges(core))
    self.assertEqual("MENKKDLFDLEIKKDNMENNNELEAQ",
                     h._get_leader_peptide_sequence(leader_feature))
    self.assertEqual("SLGPAIKATRQVCPKATRFVTVSCKKSDCQ",
                     h._get_core_peptide_sequence(core))
    self.assertEqual('Class I', h._get_core_peptide_class(core))
    self.assertEqual(['Lac'],
                     h._get_core_peptide_extra_modifications(core))
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Predict a SMILES string per gene cluster and draw its structure.

    Creates ``<outputfoldername>/structures`` if needed, changes into the
    ``NRPeditor`` directory, and for every cluster with an entry in
    ``pksnrpsvars.compound_pred_dict`` writes ``genecluster<nr>.smi`` and
    calls ``depict_smile``. Clusters whose depiction fails are recorded in
    ``pksnrpsvars.failedstructures``. Every cluster is passed to
    ``_update_sec_met_entry`` with its SMILES string, or ``"N/A"`` when
    none was generated.

    The original working directory is restored even if an error occurs.
    """
    # Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)

    originaldir = os.getcwd()
    structure_drawing_dir = utils.get_full_path(__file__, '') + os.sep + "NRPeditor"
    os.chdir(structure_drawing_dir)
    try:
        # Combine predictions into a prediction of the final chemical
        # structure and generate images
        for genecluster in utils.get_cluster_features(seq_record):
            smiles_string = "N/A"
            geneclusternr = utils.get_cluster_number(genecluster)
            if geneclusternr in pksnrpsvars.compound_pred_dict:
                prediction = pksnrpsvars.compound_pred_dict[geneclusternr]
                smiles_filename = "genecluster" + str(geneclusternr) + ".smi"
                if prediction == "ectoine":
                    # ectoine gets a predefined SMILES string
                    smiles_string = "CC1=NCCC(N1)C(=O)O"
                    with open(smiles_filename, "w") as smilesfile:
                        smilesfile.write(smiles_string)
                    depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                    if depictstatus == "failed":
                        pksnrpsvars.failedstructures.append(geneclusternr)
                    elif geneclusternr in pksnrpsvars.failedstructures:
                        # failedstructures holds cluster numbers, so the
                        # membership test has to use the number as well
                        pksnrpsvars.failedstructures.remove(geneclusternr)
                else:
                    # use information on peptide / polyketide sequence to
                    # generate structure image
                    residues = prediction.replace("(", "").replace(")", "").replace(" + ", " ").replace("-", " ")
                    residue_names = residues.split(" ")
                    nrresidues = len(residue_names)
                    if nrresidues > 1:
                        # Windows and macOS call the program by bare name,
                        # every other platform from the current directory
                        if sys.platform in ('win32', 'darwin'):
                            program = 'main'
                        else:
                            program = './main'
                        structcommand = (program + ' input 100 4000 1000 AA DDV DIM '
                                         + str(nrresidues + 1) + ' "')
                        structcommand += "".join(
                            res + " " for res in residue_names if len(res) > 1)
                        structcommand += 'TE"'
                        pipe = os.popen(structcommand)
                        try:
                            smilesinfo = pipe.read()
                        finally:
                            pipe.close()
                        smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
                        if sys.platform != 'win32':
                            # NOTE(review): the original called
                            # smiles_string.replace("[X]", "[*:X]") here and
                            # discarded the result; the output is kept
                            # unchanged, confirm which form is wanted.
                            # Number the X placeholders consecutively from 1.
                            numbered = []
                            counter = 0
                            for char in smiles_string:
                                if char == "X":
                                    counter += 1
                                    numbered.append(str(counter))
                                else:
                                    numbered.append(char)
                            smiles_string = "".join(numbered)
                        with open(smiles_filename, "w") as smilesfile:
                            smilesfile.write(smiles_string)
                        depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                        if depictstatus == "failed":
                            pksnrpsvars.failedstructures.append(geneclusternr)
            _update_sec_met_entry(genecluster, smiles_string)
    finally:
        os.chdir(originaldir)
def alignsmcogs(smcog, inputnr):
    """Align the input sequence to the smCOG's multiple sequence alignment.

    Runs ``muscle -profile`` with ``<smcog>_muscle.fasta`` (lower-cased) as
    the first profile and ``input<inputnr>.fasta`` as the second; the
    result is written to ``muscle<inputnr>.fasta``.
    """
    # Align to multiple sequence alignment, output as fasta file
    infile1 = utils.get_full_path(__file__, "%s_muscle.fasta" % str(smcog).lower())
    # The command is the same on every platform; building it without a
    # platform check keeps it defined on platforms not listed before.
    musclecommand = ["muscle", "-quiet", "-profile",
                     "-in1", infile1,
                     "-in2", "input" + str(inputnr) + ".fasta",
                     "-out", "muscle" + str(inputnr) + ".fasta"]
    utils.execute(musclecommand)
def _run_treegraph(command, posix_note, win32_note, limit=1200):
    """Run *command*, kill it after *limit* seconds, and return its output.

    stderr is merged into stdout. ``posix_note`` and ``win32_note`` are
    appended to ``"Now in <cwd>"`` in the log entry written on timeout.
    """
    process = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    started = time.time()
    # poll() returns None while the process runs; any exit code, including
    # a non-zero one, ends the wait
    while process.poll() is None:
        if (time.time() - started) > limit:
            if sys.platform == 'win32':
                subprocess.Popen("taskkill /F /T /PID %i" % process.pid,
                                 shell=True, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
                logging.info("Now in " + os.getcwd() + win32_note)
            else:
                os.kill(process.pid, signal.SIGKILL)
                logging.info("Now in " + os.getcwd() + posix_note)
            break
        time.sleep(2)
    out, _ = process.communicate()
    return out


def converttree(inputnr, smcogsoutputfolder, tag):
    """Convert a Newick tree to XTG and draw it as PNG using TreeGraph.

    Converts ``tree<inputnr>.nwk`` to ``tree<inputnr>.xtg``, renders it to
    ``<tag up to the first dot>.png``, copies the image into
    ``smcogsoutputfolder`` and removes the intermediate files. Each step is
    skipped when the previous TreeGraph output mentions an exception.
    """
    jarfile = utils.get_full_path(__file__, 'TreeGraph.jar')
    image_name = tag.split('.')[0] + ".png"

    # Log texts are kept verbatim; the POSIX text of the convert step has
    # no space before the tree number.
    output = _run_treegraph(
        ['java', '-Djava.awt.headless=true', '-jar', jarfile,
         '-convert', 'tree%s.nwk' % inputnr, '-xtg', 'tree%s.xtg' % inputnr],
        " TreeGraph -convert on tree" + str(inputnr) + " ran out out of time",
        " TreeGraph -convert on tree " + str(inputnr) + " ran out out of time")
    if "exception" in output or "Exception" in output:
        return

    output = _run_treegraph(
        ['java', '-Djava.awt.headless=true', '-jar', jarfile,
         '-image', 'tree%s.xtg' % inputnr, image_name],
        " TreeGraph -image on tree " + str(inputnr) + " ran out out of time",
        " TreeGraph -image on tree " + str(inputnr) + " ran out out of time")
    if "exception" in output or "Exception" in output:
        return

    shutil.copy(image_name, smcogsoutputfolder)
    os.remove(image_name)
    os.remove("tree" + str(inputnr) + ".xtg")
    os.remove("trimmed_alignment" + str(inputnr) + ".fasta")
def get_supported_cluster_types():
    """Get a list of all supported cluster types

    Reads the first tab-separated column of every line in
    ``cluster_rules.txt`` and drops the header line.
    """
    # Use a context manager so the rules file is closed again
    with open(utils.get_full_path(__file__, 'cluster_rules.txt'), "r") as rules:
        clustertypes = [line.split("\t")[0] for line in rules]
    # skip first line containing the header
    return clustertypes[1:]
def load_searchgtr_search_form_template():
    """Load the SEARCHGTR search form template.

    Returns:
        The content of ``searchgtr_form.html``, with carriage returns
        replaced by newlines, split at each ``FASTASEQUENCE`` marker.
    """
    template_path = path.join(utils.get_full_path(__file__, ''),
                              "searchgtr_form.html")
    # Use a context manager so the template file is closed again
    with open(template_path, "r") as handle:
        searchgtrformtemplate = handle.read()
    searchgtrformtemplate = searchgtrformtemplate.replace("\r", "\n")
    return searchgtrformtemplate.split("FASTASEQUENCE")
def setUp(self):
    """Install a fresh config and build two 2000-base all-"A" contigs."""
    self.config = Namespace()
    config.set_config(self.config)
    self.config.gff3 = utils.get_full_path(__file__, "test_gff.gff")
    self.config.single_entries = False

    contigs = []
    for contig_id in ("CONTIG_1", "CONTIG_2"):
        contig = FakeRecord(seq="A" * 2000)
        contig.id = contig_id
        contigs.append(contig)
    self.sequences = contigs
def test_sco_cluster3(self):
    """Test lantipeptide prediction for SCO cluster #3

    Checks the feature count before and after the analysis and the class
    of the single core peptide found in cluster 1.
    """
    gbk_path = utils.get_full_path(__file__, 'sco_cluster3.gbk')
    record = seqio.read(gbk_path)
    self.assertEqual(69, len(record.features))
    specific_analysis(record, None)
    self.assertEqual(71, len(record.features))

    core_hits = h._find_core_peptides(utils.get_cluster_by_nr(record, 1), record)
    self.assertEqual(1, len(core_hits))
    core = core_hits[0]
    self.assertEqual('Class I', h._get_core_peptide_class(core))
def run_nrpspks_specific_hmmer(seq_record, withinclustergenes, pksnrpsvars):
    """Run the NRPS/PKS-specific hmmscan searches on the given genes.

    Stores the parsed results on ``pksnrpsvars`` as ``motifdict``
    (abmotifs.hmm), ``domaindict`` (nrpspksdomains.hmm) and
    ``ksdomaindict`` (ksdomains.hmm), and filters non-terminal docking
    domains after the domain search.
    """
    fasta = utils.get_specific_multifasta(withinclustergenes)

    # Analyse for abMotifs
    motif_db = utils.get_full_path(__file__, "abmotifs.hmm")
    motif_hits = utils.run_hmmscan(motif_db, fasta, ["-E", "0.25"])
    motif_lengths = utils.hmmlengths(
        utils.get_full_path(__file__, "abmotifs.hmm"))
    pksnrpsvars.motifdict = parse_hmmscan_results(motif_hits, motif_lengths)

    # Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
    cutoff_opts = ["--cut_tc"]
    domain_db = utils.get_full_path(__file__, "nrpspksdomains.hmm")
    domain_hits = utils.run_hmmscan(domain_db, fasta, cutoff_opts)
    domain_lengths = utils.hmmlengths(
        utils.get_full_path(__file__, "nrpspksdomains.hmm"))
    pksnrpsvars.domaindict = parse_hmmscan_results(domain_hits, domain_lengths)
    filter_nonterminal_docking_domains(seq_record, pksnrpsvars)

    # Analyse KS domains & PKS/NRPS protein domain composition to detect
    # NRPS/PKS types; the same options list is reused for this search
    ks_lengths = utils.hmmlengths(
        utils.get_full_path(__file__, "ksdomains.hmm"))
    ks_db = utils.get_full_path(__file__, "ksdomains.hmm")
    ks_hits = utils.run_hmmscan(ks_db, fasta, cutoff_opts)
    pksnrpsvars.ksdomaindict = parse_hmmscan_results(ks_hits, ks_lengths)
def filter_results(results, results_by_id):
    """Filter results by comparing scores of different models (for PKS systems).

    Each line of ``filterhmmdetails.txt`` is a comma-separated group of
    competing HMM names. For every CDS that has hits from more than one
    model of a group, hits overlapping by more than 20 positions are
    grouped and only the best-scoring hit of each group is kept.

    Both arguments are modified in place and also returned.

    Returns:
        The tuple ``(results, results_by_id)``.
    """
    # Read the whole file up front so the handle is closed again
    with open(utils.get_full_path(__file__, "filterhmmdetails.txt"), "r") as handle:
        filter_lines = handle.read().split("\n")

    for line in filter_lines:
        filterhmms = line.split(",")
        for cds in results_by_id.keys():
            cdsresults = results_by_id[cds]
            hmmhits = [hit.query_id for hit in cdsresults]
            # Check if multiple competing HMM hits are present
            competing_hits = set(hmmhits) & set(filterhmms)
            if len(competing_hits) <= 1:
                continue

            # Identify overlapping hits
            overlapping_groups = []
            for hit in cdsresults:
                for otherhit in [cdsresult for cdsresult in cdsresults if hit != cdsresult]:
                    # length of the intersection of the two half-open
                    # intervals; zero or negative when they do not overlap
                    overlap = (min(hit.hit_end, otherhit.hit_end)
                               - max(hit.hit_start, otherhit.hit_start))
                    if overlap <= 20:
                        continue
                    for group in overlapping_groups:
                        hit_known = hit in group
                        other_known = otherhit in group
                        if hit_known or other_known:
                            # extend the first group that knows either hit
                            if not other_known:
                                group.append(otherhit)
                            if not hit_known:
                                group.append(hit)
                            break
                    else:
                        overlapping_groups.append([hit, otherhit])

            # Remove worst-scoring of overlapping hits
            for group in overlapping_groups:
                # max() returns the first hit carrying the highest score
                best_hit = max(group, key=lambda member: member.bitscore)
                to_delete = [hit for hit in group if hit != best_hit]
                for res in list(results):
                    if res in to_delete:
                        results.remove(res)
                        results_by_id[cds].remove(res)
    return results, results_by_id
def run(seq_record, options):
    """run hmmsearch against PFAM for all CDS features

    Falls back to this module's directory for ``options.pfamdir`` when it
    is not set, scans the record's multi-FASTA against ``Pfam-A.hmm`` and
    hands the results to ``_annotate``.
    """
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')
    hits = utils.run_hmmscan(pfam_db, fasta)
    _annotate(seq_record, options, hits)
def load_id_lines():
    """Return the header lines of the SANDPUMA FASTA file without ``>``.

    Reads ``sandpuma/flat/fullset0_smiles.faa`` and keeps every line that
    starts with ``>``, stripped of surrounding whitespace and of leading
    ``>`` characters.
    """
    fasta_file = path.join(utils.get_full_path(__file__, 'sandpuma'),
                           'flat', 'fullset0_smiles.faa')
    with open(fasta_file, 'r') as fh:
        return [line.strip().lstrip(">") for line in fh if line.startswith(">")]
def get_supported_cluster_types():
    """Get a list of all supported cluster types

    Reads the first tab-separated column of ``cluster_rules.txt`` (header
    line skipped), then does the same for the ``cluster_rules.txt`` of
    every subdirectory of this module's directory, prefixing those names
    with ``"<subdirectory>/"``.
    """
    def _read_rule_names(rules_path):
        # first column of every line but the header; closes the file again
        with open(rules_path, "r") as rules:
            names = [line.split("\t")[0] for line in rules]
        return names[1:]

    clustertypes = _read_rule_names(
        utils.get_full_path(__file__, 'cluster_rules.txt'))

    module_dir = path.dirname(path.abspath(__file__))
    for fname in listdir(module_dir):
        dir_path = path.join(module_dir, fname)
        if path.isdir(dir_path):
            sub_names = _read_rule_names(path.join(dir_path, "cluster_rules.txt"))
            clustertypes.extend(fname + "/" + name for name in sub_names)
    return clustertypes
def filter_result_overlapping_genes(results, results_by_id, overlaps, feature_by_id):
    """Filter results of overlapping genes.

    CDS ids are grouped by ``overlaps[1][cds]``. Within a group, a hit is
    removed when its feature is shorter than the longest feature hit by the
    same model, or shorter than the longest feature hit by any model listed
    on the same line of ``filterhmmdetails.txt``. CDS entries left without
    hits are dropped from ``results_by_id``.

    Both ``results`` and ``results_by_id`` are modified in place and also
    returned.

    Returns:
        The tuple ``(results, results_by_id)``.
    """
    def _feature_length(hit):
        feature = feature_by_id[hit.hit_id]
        return abs(feature.location.end - feature.location.start)

    # Groups of competing HMM names, one per distinct line of the file
    filterhmm_list = []
    with open(utils.get_full_path(__file__, "filterhmmdetails.txt"), "r") as handle:
        for line in handle.read().split("\n"):
            filterhmms = line.split(",")
            if filterhmms not in filterhmm_list:
                filterhmm_list.append(filterhmms)

    overlap_id_with_result = {}
    for cds in results_by_id.keys():
        members = overlap_id_with_result.setdefault(overlaps[1][cds], [])
        if cds not in members:
            members.append(cds)

    for overlap_id in overlap_id_with_result.keys():
        group_cds = overlap_id_with_result[overlap_id]

        # Longest feature per model within this overlap group
        best_hit_scores = {}
        for cds in group_cds:
            for hit in results_by_id[cds]:
                length = _feature_length(hit)
                if (hit.query_id not in best_hit_scores) or (
                        best_hit_scores[hit.query_id] < length):
                    best_hit_scores[hit.query_id] = length

        for cds in group_cds:
            to_delete = []
            for hit in results_by_id[cds]:
                length = _feature_length(hit)
                if length < best_hit_scores[hit.query_id]:
                    to_delete.append(hit)
                    continue
                # filter for filterhmmdetails.txt; any() stops at the first
                # better competitor, so a hit is queued at most once even
                # when its model is listed on several lines
                outcompeted = any(
                    length < best_hit_scores[similar_hit]
                    for filterhmms in filterhmm_list
                    if hit.query_id in filterhmms
                    for similar_hit in filterhmms
                    if similar_hit in best_hit_scores)
                if outcompeted:
                    to_delete.append(hit)

            for hit in to_delete:
                results.remove(hit)
                results_by_id[cds].remove(hit)
            if to_delete and len(results_by_id[cds]) < 1:
                del results_by_id[cds]
    return results, results_by_id
def check_prereqs(options):
    """Check if all required applications are around

    Reports every non-optional entry of ``_required_binaries`` and
    ``_required_files`` that cannot be located.

    Returns:
        A list of failure messages; empty when nothing is missing.
    """
    missing_binaries = [
        "Failed to locate file: %r" % binary_name
        for binary_name, optional in _required_binaries
        if utils.locate_executable(binary_name) is None and not optional
    ]
    missing_files = [
        "Failed to locate file: %r" % file_name
        for file_name, optional in _required_files
        if utils.locate_file(
            path.join(utils.get_full_path(__file__, ''), file_name)) is None
        and not optional
    ]
    return missing_binaries + missing_files
def load_cog_annotations():
    """Load the smCOG type annotations from a file

    Reads ``cog_annotations.txt``, which holds three tab-separated fields
    per line: the smCOG id, a field that is ignored, and a one-letter type
    code. Unknown type codes map to ``'other'``. Blank lines are skipped.

    Returns:
        A dict mapping smCOG id to its type name.
    """
    type_keys = {
        'B': 'biosynthetic-additional',
        'T': 'transport',
        'R': 'regulatory',
        'O': 'other'
    }
    annotations = {}
    # Use a context manager so the annotation file is closed again
    with open(utils.get_full_path(__file__, 'cog_annotations.txt'), 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; nothing to unpack
                continue
            cog, _, type_ = line.split('\t', 3)
            annotations[cog] = type_keys.get(type_, 'other')
    return annotations
def run_nrpspredictor(seq_record, nrpsnames, nrpsseqs, options):
    """Run NRPSPredictor2 on the given NRPS sequences.

    Inside a temporary working directory: writes the sequences to
    ``nrpsseqs.fasta``, runs the code prediction step, runs the
    NRPSPredictor2 SVM via Java with ``input.sig`` as input, and moves
    ``ctg<record_idx>_nrpspredictor2_codes.txt`` into
    ``options.raw_predictions_outputfolder``, replacing an older copy.
    """
    # NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this
    # domain, extract 8 Angstrom residues and insert this into NRPSPredictor
    with TemporaryDirectory(change=True):
        nrpsseqs_file = "nrpsseqs.fasta"
        NRPSPredictor2_dir = utils.get_full_path(__file__, "NRPSPredictor2")
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)

        # Get NRPSPredictor2 code predictions, output sig file for input
        # for NRPSPredictor2 SVMs
        nrpscodepred.run_nrpscodepred(options)

        # Run NRPSPredictor2 SVM
        datadir = path.join(NRPSPredictor2_dir, 'data')
        libdir = path.join(NRPSPredictor2_dir, 'lib')
        jarfile = path.join(NRPSPredictor2_dir, 'build', 'NRPSpredictor2.jar')
        classpath = [
            jarfile,
            '%s/java-getopt-1.0.13.jar' % libdir,
            '%s/Utilities.jar' % libdir,
            '%s/libsvm.jar' % libdir,
        ]
        record_prefix = "ctg" + str(options.record_idx)
        codes_filename = record_prefix + "_nrpspredictor2_codes.txt"
        commands = [
            'java',
            '-Ddatadir=%s' % datadir,
            # os.pathsep is ":" on POSIX and ";" on Windows, and is defined
            # on every platform
            '-cp', os.pathsep.join(classpath),
            'org.roettig.NRPSpredictor2.NRPSpredictor2',
            '-i', 'input.sig',
            '-r', path.join(options.raw_predictions_outputfolder,
                            record_prefix + '_nrpspredictor2_svm.txt'),
            '-s', '1',
            '-b', '1' if options.eukaryotic else '0',
        ]
        _, err, _ = utils.execute(commands)
        if err != '':
            logging.debug('running nrpspredictor2 gave error %r' % err)

        # Copy NRPSPredictor results and move back to original directory
        try:
            os.remove(path.join(options.raw_predictions_outputfolder,
                                codes_filename))
        except OSError:
            # best effort: there may be no older result to replace
            pass
        shutil.move(codes_filename, options.raw_predictions_outputfolder)
def check_prereqs():
    """Check required binaries and keep the pressed HMM files up to date.

    For every HMM file in ``_markov_models`` the pressed files are checked:
    a missing one triggers hmmpress, and one older than the HMM file causes
    all ``<hmm>.h3?`` files to be removed and hmmpress to be run again.

    Returns:
        A list of failure messages; empty when nothing is wrong.
    """
    # NOTE(review): nesting below was reconstructed from a flattened
    # source line; confirm against the project history.
    failure_messages = []
    for binary_name, optional in _required_binaries:
        if utils.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate executable for %r" % binary_name)
    for hmm in _markov_models:
        hmm = utils.get_full_path(__file__, hmm)
        if utils.locate_file(hmm) is None:
            failure_messages.append("Failed to locate file %r" % hmm)
            continue
        for ext in _binary_extensions:
            binary = "%s%s" % (hmm, ext)
            if utils.locate_file(binary) is None:
                # a pressed file is missing: press once, then stop checking
                _, err, retcode = utils.run_hmmpress(hmm)
                if retcode != 0:
                    failure_messages.append("Failed to hmmpress %r: %r" % (hmm, err))
                break
            else:
                binary_mtime = path.getmtime(binary)
                hmm_mtime = path.getmtime(hmm)
                if hmm_mtime > binary_mtime:
                    # the HMM file changed after pressing: drop the stale
                    # pressed files and press again
                    try:
                        from glob import glob
                        for f in glob("%s.h3?" % hmm):
                            logging.debug("removing outdated file %s", f)
                            os.remove(f)
                    except OSError as e:
                        failure_messages.append("Failed to remove outdated binary file for %s: %s" % \
                            (hmm, e))
                        break
                    _, err, retcode = utils.run_hmmpress(hmm)
                    if retcode != 0:
                        failure_messages.append("Failed to hmmpress %r: %r" % (hmm, err))
                        import datetime
                        failure_messages.append("HMM binary files outdated. %s (changed: %s) vs %s (changed: %s)" % \
                            (hmm, datetime.datetime.fromtimestamp(hmm_mtime), binary, datetime.datetime.fromtimestamp(binary_mtime)))
                    break
    return failure_messages
def generate_webpage(seq_records, options):
    """Render ``index.html`` and the cluster JavaScript for all records.

    Loads the ``index.tpl`` template, fills in title, colour scheme, URLs,
    version and download links, adds one separator per record that has
    clusters plus one entry per cluster, and writes the page to
    ``<outputfoldername>/index.html``.
    """
    page = pq(filename=utils.get_full_path(__file__, 'index.tpl'), parser='html')
    first_id = seq_records[0].id

    cluster_total = count_all_clusters(seq_records)
    set_title(page, first_id, cluster_total)
    set_colourscheme(page, options)
    set_urls(page, options)
    set_version(page)
    set_download_links(page, seq_records[0].id, options)
    generate_searchgtr_htmls(seq_records, options)

    records = js.convert_records(seq_records, options)
    extra_data = dict(js_domains=[], clusterblast_clusters=[],
                      subclusterblast_clusters=[], knownclusterblast_clusters=[])

    if 'triggered_limit' in options and options.triggered_limit:
        add_truncation_notice(page, options)

    records_written = 0
    for idx in range(len(records)):
        record = records[idx]
        record['seq_id'] = utils.ascii_string(record['seq_id'])
        if len(record['clusters']) > 0:
            add_separator(page, record['seq_id'], record['orig_id'], options)
        # odd/even flag alternates per cluster and restarts for each record
        odd = True
        for cluster in record['clusters']:
            add_cluster(page, cluster, seq_records[idx], options, extra_data,
                        odd, seq_records[0].id)
            records_written += 1
            odd = not odd

    if records_written == 0:
        add_no_result_note(page, options)

    write_geneclusters_js(records, options.outputfoldername, extra_data)

    index_path = path.join(options.outputfoldername, 'index.html')
    with open(index_path, 'w') as out_handle:
        out_handle.write('<!doctype html>\n')
        out_handle.write(page.outerHtml())
def run(seq_record, options):
    """run hmmsearch against PFAM for all CDS features

    Falls back to this module's directory for ``options.pfamdir`` when it
    is not set. With ``options.skip_cleanup`` set, results are read from
    ``fullhmmer.txt`` in the output folder if that file exists, and written
    there otherwise. The results are handed to ``_annotate``.
    """
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')
    logging.info('Running whole-genome pfam search')

    # A results file is only used when intermediate files are kept
    cache_file = None
    if options.skip_cleanup:
        cache_file = path.join(options.full_outputfolder_path, 'fullhmmer.txt')

    if cache_file is None:
        hits = utils.run_hmmscan(pfam_db, fasta)
    elif path.exists(cache_file):
        hits = list(SearchIO.parse(cache_file, 'hmmer3-text'))
    else:
        hits = utils.run_hmmscan(pfam_db, fasta, results_file=cache_file)

    _annotate(seq_record, options, hits)