def extract_cterminus(da_dir, clusterpksgenes, seq_record, endinggene, feature_by_id):
    """Profile-align the C-terminal tail of every non-ending PKS gene.

    For each gene id in ``clusterpksgenes`` other than ``endinggene``, the
    last 100 residues of its translation (looked up through
    ``feature_by_id``) are written to ``input.fasta`` and profile-aligned
    with muscle against ``cterm.fasta`` from ``da_dir``; the alignment goes
    to ``muscle.fasta``. Both files are relative to the current directory.

    Returns a dict mapping gene id to the value ``extractpositions`` yields
    for positions ``[55, 64]`` relative to ``"EryAII_ref"``.
    ``seq_record`` is accepted but not used in this function.
    """
    reference_profile = os.path.join(da_dir, 'cterm.fasta')
    query_fasta = "input.fasta"

    # Collect all tails up front, exactly as the alignment loop expects them.
    tails = [
        (gene, str(utils.get_aa_sequence(feature_by_id[gene]))[-100:])
        for gene in clusterpksgenes
        if gene != endinggene
    ]

    interacting_residues = {}
    for gene, tail in tails:
        utils.writefasta([gene], [tail], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet",
            "-in1", reference_profile,
            "-in2", "input.fasta",
            "-out", "muscle.fasta"
        ])
        interacting_residues[gene] = extractpositions(
            "muscle.fasta", [55, 64], "EryAII_ref", gene)
    return interacting_residues
def runblast(query, target):
    """Run blastp for ``query`` against the database ``target``.

    Results are written in tabular format (``-outfmt 6``) to a file named
    after everything in ``query`` before its first ``.``, plus ``.out``.
    Nothing is returned; the outcome of the call is not inspected.
    """
    result_file = query.partition(".")[0] + ".out"
    utils.execute([
        "blastp",
        "-db", target,
        "-query", query,
        "-outfmt", "6",
        "-max_target_seqs", "10000",
        "-evalue", "1e-05",
        "-out", result_file
    ])
def extract_nterminus(da_dir, clusterpksgenes, seq_record, startergene, feature_by_id):
    """Profile-align the N-terminal head of every non-starting PKS gene.

    For each gene id in ``clusterpksgenes`` other than ``startergene``, the
    first 50 residues of its translation (looked up through
    ``feature_by_id``) are written to ``input.fasta`` and profile-aligned
    with muscle against ``nterm.fasta`` from ``da_dir``; the alignment goes
    to ``muscle.fasta``. Both files are relative to the current directory.

    Returns a dict mapping gene id to the value ``extractpositions`` yields
    for positions ``[2, 15]`` relative to ``"EryAIII_5_6_ref"``.
    ``seq_record`` is accepted but not used in this function.
    """
    reference_profile = os.path.join(da_dir, 'nterm.fasta')
    query_fasta = "input.fasta"

    # Collect all heads up front, exactly as the alignment loop expects them.
    heads = [
        (gene, str(utils.get_aa_sequence(feature_by_id[gene]))[:50])
        for gene in clusterpksgenes
        if gene != startergene
    ]

    interacting_residues = {}
    for gene, head in heads:
        utils.writefasta([gene], [head], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet",
            "-in1", reference_profile,
            "-in2", "input.fasta",
            "-out", "muscle.fasta"
        ])
        interacting_residues[gene] = extractpositions(
            "muscle.fasta", [2, 15], "EryAIII_5_6_ref", gene)
    return interacting_residues
def run_pplacer(reference_alignment, alignment_file, data_dir, reference_tree):
    """Place the query domains onto the reference tree with pplacer.

    Runs ``pplacer`` with the given reference tree, reference alignment and
    the ``RAxML_bestTree.647KS_RAxML.refpkg`` reference package from
    ``data_dir``, writing ``pplacer_tree.jplace`` into the current working
    directory, then runs ``guppy sing`` on that placement file.

    Returns the path ``<cwd>/pplacer_tree.sing.tre``.
    Exits the process with status 1 if either program reports a non-zero
    exit status.
    """
    #Locations of files
    #reference_tree = os.path.join(data_dir, "RAxML_bestTree.647KS_RAxML.tre")
    workdir = os.getcwd()
    pplacer_json = os.path.join(workdir, "pplacer_tree.jplace")
    #Reference package creation: taxit create --aln-fasta test_set_for_development.fasta --tree-stats test_set_for_development.log --tree-file test_set_for_development.nwk -P test_set_for_development.refpkg -l test
    #Reference package creation: taxit create --aln-fasta 647KS_mcformat.afa --tree-stats RAxML_info.647KS_RAxML.tre --tree-file RAxML_bestTree.647KS_RAxML.tre -P RAxML_bestTree.647KS_RAxML.refpkg -l 647KS
    pplacer_cmd = [
        "pplacer",
        "-t", reference_tree,
        "-r", reference_alignment,
        "-o", pplacer_json,
        "-c", os.path.join(data_dir, "RAxML_bestTree.647KS_RAxML.refpkg"),
        alignment_file
    ]
    out, err, retcode = utils.execute(pplacer_cmd)
    # Any non-zero status is a failure, not just 1 (a crash or a kill by
    # signal reports other values).
    if retcode != 0:
        logging.error(
            "Running pplacer failed. Check if the program is installed appropriately."
        )
        sys.exit(1)

    guppy_cmd = ["guppy", "sing", pplacer_json]
    out, err, retcode = utils.execute(guppy_cmd)
    if retcode != 0:
        logging.error(
            "Running guppy failed. Check if the program is installed appropriately."
        )
        sys.exit(1)
    return workdir + os.sep + "pplacer_tree.sing.tre"
def alignsmcogs(smcog, inputnr):
    """Profile-align ``input<inputnr>.fasta`` to the alignment of one smCOG.

    The reference alignment is ``<smcog, lower-cased>_muscle.fasta`` located
    next to this module; muscle writes the combined alignment, in fasta
    format, to ``muscle<inputnr>.fasta`` in the current directory.
    """
    #Align to multiple sequence alignment, output as fasta file
    infile1 = utils.get_full_path(__file__, "%s_muscle.fasta" % str(smcog).lower())
    # The command is the same on every platform. Building it unconditionally
    # avoids leaving it undefined when sys.platform is none of
    # 'linux2', 'win32' or 'darwin' (e.g. 'linux' on Python 3).
    musclecommand = [
        "muscle", "-quiet", "-profile",
        "-in1", infile1,
        "-in2", "input" + str(inputnr) + ".fasta",
        "-out", "muscle" + str(inputnr) + ".fasta"
    ]
    utils.execute(musclecommand)
def make_blastDB(query_fasta, options):
    """Build a protein BLAST database from ``query_fasta``.

    The database is written as ``targetBlastDB`` inside
    ``options.metabolicmodeldir``. Afterwards the presence of
    ``targetBlastDB.psq`` is used as the success check; if that file is
    missing the process exits with status 1.
    """
    db_dir = options.metabolicmodeldir + os.sep + 'targetBlastDB'
    DBprogramName = utils.locate_executable('makeblastdb')
    utils.execute(
        [DBprogramName, '-in', query_fasta, '-out', db_dir, '-dbtype', 'prot'])

    #Checks if DB is properly created; otherwise shutdown
    if not os.path.isfile(db_dir + '.psq'):
        # logging.error rather than logging.exception: there is no active
        # exception here, so no traceback should be attached.
        logging.error("error in make_blastDB: blast DB not created")
        #FIXME: don't use sys.exit
        sys.exit(1)
def convert_to_tabular(tempdir):
    """Convert ``matches.daa`` in ``tempdir`` to ``input.out`` via ``diamond view``.

    Returns whatever ``utils.execute`` returns for the call.
    """
    daa_file = path.join(tempdir, "matches.daa")
    tabular_file = path.join(tempdir, "input.out")
    return utils.execute(["diamond", "view", "-a", daa_file, "-o", tabular_file])
def check_prereqs(options):
    """Check if all required applications and profile files are around.

    Returns a list of human-readable failure messages (empty when all is
    well). Required binaries that cannot be located are reported. Each
    profile in ``_markov_models`` must exist; when one of its pressed
    companion files is missing, ``hmmpress`` is run once for that profile
    and a failure of that run is reported.
    ``options`` is accepted but not used in this function.
    """
    problems = []

    for binary_name, optional in _required_binaries:
        located = utils.locate_executable(binary_name)
        if located is None and not optional:
            problems.append("Failed to locate file: %r" % binary_name)

    for model in _markov_models:
        model_path = utils.get_full_path(__file__, model)
        if utils.locate_file(model_path) is None:
            problems.append("Failed to locate file %r" % model_path)
            continue

        for ext in _binary_extensions:
            if utils.locate_file("%s%s" % (model_path, ext)) is not None:
                continue
            # First missing companion file: press the profile once, then
            # stop looking at the remaining extensions.
            try:
                out, err, retcode = utils.execute(['hmmpress', model_path])
            except OSError as e:
                retcode, err = 1, str(e)
            if retcode != 0:
                problems.append("Failed to hmmpress %r: %r" % (model_path, err))
            break

    return problems
def _runEFICAz(self, chunkDir): cwd = os.getcwd() try: os.chdir(chunkDir) except OSError: logging.exception("Can't chdir to %s" % chunkDir) sys.exit(1) fastafile = os.path.basename(self.ChunkFilenames[chunkDir]) ecpredfile = fastafile + ".ecpred" # Only perform calculations if result file does not already exist (from previous run) if not os.path.isfile(os.path.join(self.basedirName, ecpredfile)): EFICAzExecutable = utils.locate_executable(EFICAzBinary) if not EFICAzExecutable: logging.exception( "EFICAz executable not found, bailing out, analysis not posible" ) sys.exit(1) cmdline = [EFICAzExecutable, fastafile] logging.debug("executing %s in directory %s" % (" ".join(cmdline), chunkDir)) try: utils.execute(cmdline) except: logging.exception('cannot execute EFICAz!') sys.exit(1) else: # As this method is executed in an own thread, it does not have the ability to change # the variables within th eobject; # As a workaround we just copy the "old" file to the tempdir... try: shutil.copy( os.path.abspath(os.path.join(self.basedirName, ecpredfile)), self.ChunkFilenames[chunkDir] + ".ecpred") except: logging.exception("Could not copy existing eficaz result file %s to tempfile %s", \ os.path.isfile(os.path.abspath(self.basedirName, ecpredfile)), \ self.ChunkFilenames[chunkDir]+".ecpred" ) sys.exit(1) os.chdir(cwd)
def align_ks_domains(reference_alignment, ks_names, ks_seqs, data_dir):
    """Align query KS domain sequences to the reference KS alignment.

    The queries are written to ``in_seq.fasta`` in the current working
    directory as alternating name and sequence lines (names are written
    as given), aligned with muscle, then profile-aligned against
    ``reference_alignment``. The combined alignment is passed through
    ``reformat`` into ``aligned.fasta``.

    Returns the path of ``aligned.fasta``. Exits with status 1 when either
    muscle run reports a non-zero exit status.
    ``data_dir`` is accepted but not used in this function.
    """
    #Set file names and write query domains to temp input file
    workdir = os.getcwd()
    in_temp = os.path.join(workdir, "in_seq.fasta")
    in_temp_aligned = os.path.join(workdir, "in_seq_aligned.fasta")
    out_temp = os.path.join(workdir, "out_seq.fasta")
    alignment_file = os.path.join(workdir, "aligned.fasta")
    with open(in_temp, "w") as tmp_input:
        for name, seq in zip(ks_names, ks_seqs):
            tmp_input.write("%s\n%s\n" % (name, seq))

    #Generate alignment of query sequences
    # NOTE(review): splitting the rendered command on single spaces assumes
    # none of the paths contain spaces - confirm for the deployment.
    muscle_cmd = str(MuscleCommandline(input=in_temp, out=in_temp_aligned))
    out, err, retcode = utils.execute(muscle_cmd.split(" "))
    # Any non-zero status means failure, not just 1.
    if retcode != 0:
        logging.error(
            "Alignment of query KS sequences with Muscle failed. Check if Muscle is installed appropriately."
        )
        sys.exit(1)

    #Align the query alignment to the reference alignment using muscle --profile
    muscle_cmd = str(
        MuscleCommandline(profile='True',
                          in1=reference_alignment,
                          in2=in_temp_aligned,
                          out=out_temp))
    out, err, retcode = utils.execute(muscle_cmd.split(" "))
    if retcode != 0:
        logging.error(
            "Alignment of query+reference KS sequences with Muscle failed. Check if Muscle is installed appropriately."
        )
        sys.exit(1)

    # Read through a context manager so the handle is closed promptly.
    with open(out_temp, 'r') as aligned_handle:
        f_temp_input = aligned_handle.read()
    reformat(input=f_temp_input, out_filename=alignment_file)

    #Remove temporary files
    for f in [in_temp, out_temp]:
        os.remove(f)
    return alignment_file
def run_diamond(query, target, tempdir, options):
    """Run ``diamond blastp`` of ``query`` against ``target``.

    Matches are written in diamond's archive format to ``matches.daa``
    (a relative path); ``tempdir`` is handed to diamond as its scratch
    directory and ``options.cpus`` as the thread count.

    Returns whatever ``utils.execute`` returns for the call.
    """
    search = ["diamond", "blastp"]
    search += ["--db", target]
    search += ["--threads", str(options.cpus)]
    search += ["--query", query]
    search += ["--compress", "0"]
    search += ["--max-target-seqs", "10000"]
    search += ["--evalue", "1e-05"]
    search += ["--daa", "matches.daa"]
    search += ["--tmpdir", tempdir]
    return utils.execute(search)
def test_execute_with_input(self):
    "Test utils.execute() with stdin input"
    command = ['fake', '--with', 'parameters']
    # Call trace the tracker in self.tt is expected to have recorded.
    expected_trace = """ Called subprocess.Popen( ['fake', '--with', 'parameters'], stderr=-1, stdin=-1, stdout=-1) Called proc.communicate(input='fake input')"""

    result = utils.execute(command, input='fake input')
    stdout_text, stderr_text, exit_code = result

    self.assertEqual('output', stdout_text)
    self.assertEqual('error', stderr_text)
    self.assertEqual(0, exit_code)
    assert_same_trace(self.tt, expected_trace)
def run_blastp(target_fasta='', blastp_result='', db_dir='', evalue=1e-30):
    """Run blastp for ``target_fasta`` against ``db_dir`` unless a result exists.

    When ``blastp_result`` is already a file the search is skipped and a
    warning is logged. Otherwise blastp writes comma-separated output with
    the columns ``qseqid sseqid evalue score length pident`` to
    ``blastp_result``. A non-zero exit status is only logged at debug level.

    Returns None in both cases.
    """
    if os.path.isfile(blastp_result):
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning("Found blast file %s, skipping new caluclation",
                        blastp_result)
        return

    # Execute blast if output file is not present; the executable is only
    # looked up when it is actually needed.
    BLASTPprogramName = utils.locate_executable('blastp')
    args = [
        BLASTPprogramName,
        '-query', target_fasta,
        '-out', blastp_result,
        '-db', db_dir,
        '-evalue', str(evalue),
        '-outfmt', "10 qseqid sseqid evalue score length pident"
    ]
    out, err, retcode = utils.execute(args)
    if retcode != 0:
        logging.debug("out: %r, err: %r, retcode: %s", out, err, retcode)
def run_nrpspredictor(seq_record, nrpsnames, nrpsseqs, options):
    """Run the NRPSPredictor2 code prediction and SVM on the given sequences.

    Works inside a temporary directory: writes the sequences to
    ``nrpsseqs.fasta``, calls ``nrpscodepred.run_nrpscodepred`` and then the
    NRPSPredictor2 jar on ``input.sig``. The SVM output is written to
    ``ctg<record_idx>_nrpspredictor2_svm.txt`` in
    ``options.raw_predictions_outputfolder`` and the
    ``ctg<record_idx>_nrpspredictor2_codes.txt`` file is moved there too.
    ``seq_record`` is accepted but not used in this function.
    """
    #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor
    with TemporaryDirectory(change=True):
        nrpsseqs_file = "nrpsseqs.fasta"
        NRPSPredictor2_dir = utils.get_full_path(__file__, "NRPSPredictor2")
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)
        #Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
        nrpscodepred.run_nrpscodepred(options)
        #Run NRPSPredictor2 SVM
        datadir = path.join(NRPSPredictor2_dir, 'data')
        libdir = path.join(NRPSPredictor2_dir, 'lib')
        jarfile = path.join(NRPSPredictor2_dir, 'build', 'NRPSpredictor2.jar')
        classpath = [
            jarfile,
            '%s/java-getopt-1.0.13.jar' % libdir,
            '%s/Utilities.jar' % libdir,
            '%s/libsvm.jar' % libdir
        ]
        # os.pathsep is ':' on POSIX and ';' on Windows - the same values
        # the old sys.platform switch produced, but defined on every
        # platform (the switch left the separator unbound on e.g. 'linux').
        svm_output = path.join(
            options.raw_predictions_outputfolder,
            "ctg" + str(options.record_idx) + '_nrpspredictor2_svm.txt')
        commands = [
            'java',
            '-Ddatadir=%s' % datadir,
            '-cp', os.pathsep.join(classpath),
            'org.roettig.NRPSpredictor2.NRPSpredictor2',
            '-i', 'input.sig',
            '-r', svm_output,
            '-s', '1',
            '-b', '1' if options.eukaryotic else '0'
        ]
        out, err, retcode = utils.execute(commands)
        if err != '':
            logging.debug('running nrpspredictor2 gave error %r' % err)

        #Copy NRPSPredictor results and move back to original directory
        codes_file = "ctg" + str(options.record_idx) + "_nrpspredictor2_codes.txt"
        try:
            os.remove(path.join(options.raw_predictions_outputfolder, codes_file))
        except OSError:
            # Best effort: a stale copy from an earlier run may simply not exist.
            pass
        shutil.move(codes_file, options.raw_predictions_outputfolder)
def run_diamond(query, target, tempdir, options):
    """Run ``diamond blastp`` of ``query`` against ``target``, tabular output.

    Matches are written to ``input.out`` (a relative path) in output
    format 6; ``tempdir`` is handed to diamond as its scratch directory and
    ``options.cpus`` as the thread count.

    Returns whatever ``utils.execute`` returns for the call.
    """
    search = ["diamond", "blastp"]
    search += ["--db", target]
    search += ["--threads", str(options.cpus)]
    search += ["--query", query]
    search += ["--compress", "0"]
    search += ["--max-target-seqs", "10000"]
    search += ["--evalue", "1e-05"]
    search += ["--out", "input.out"]
    # 6 is blast tabular format, just as in blastp
    search += ["--outfmt", "6"]
    search += ["--tmpdir", tempdir]
    return utils.execute(search)
def hmmsearch(fasta,hmm):
    """Run ``hmmsearch --noali`` of ``hmm`` against ``fasta`` and report a score.

    Returns a dict mapping the first sequence name in ``fasta`` to the score
    column of the first parsed domain row, or to ``"0"`` when the report
    contains ``[No targets detected``. The dict is empty when the domain
    section yields no rows.
    """
    query_name = fastanames(fasta)[0]
    report, err, retcode = utils.execute(["hmmsearch", "--noali", hmm, fasta])
    report = report.replace("\r", "\n")

    section_start = report.find('Domain annotation for each sequence:')
    section_end = report.find('Internal pipeline statistics summary:')
    section = report[section_start:section_end].split('\n')

    # Each hit: (domain nr, start, end, score, evalue)
    hits = []
    if "[No targets detected" in report:
        scores = [str(0)]
    else:
        # Drop the four header and four trailer lines of the section.
        for row in section[4:-4]:
            columns = [field for field in row.split(" ") if field != ""]
            hits.append(
                (columns[0], columns[6], columns[7], columns[2], columns[4]))
        scores = [hit[3] for hit in hits]

    # Every row belongs to the same query, so only the first score is kept.
    if not scores:
        return {}
    return {query_name: scores[0]}
def _kr_alignment_columns(aligned_reference, residue_positions):
    """Return alignment columns holding the given 0-based reference residues."""
    wanted = set(residue_positions)
    columns = []
    residue_index = 0
    for column, char in enumerate(aligned_reference):
        if char == "-":
            continue
        if residue_index in wanted:
            columns.append(column)
        residue_index += 1
    return columns


def _kr_activity(signature):
    """Classify a 4-residue activity signature as "active" or "inactive"."""
    middle_ok = ("S", "A", "G")
    # K-[SAG]-Y-[NG]; this also covers the narrower K-[SAG]-Y-N rule.
    if signature[0] == "K" and signature[1] in middle_ok \
            and signature[2] == "Y" and signature[3] in ("N", "G"):
        return "active"
    if signature[0] == "E" and signature[1] in middle_ok \
            and signature[2] == "H" and signature[3] == "H":
        return "active"
    return "inactive"


def _kr_stereochemistry(signature):
    """Map an 8-residue stereochemistry signature to its class label."""
    has_ldd = signature[0:3] == "LDD"
    if not has_ldd and signature[3] == "W" and signature[4] != "H" \
            and signature[5:] == "YAN":
        return "A1"
    if not has_ldd and signature[3:] == "WHYAN":
        return "A2"
    if has_ldd and signature[5] == "Y" and signature[6] != "P" \
            and signature[7] == "N":
        return "B1"
    if has_ldd and signature[5:] == "YPN":
        return "B2"
    if signature[5] != "Y":
        return "C1"
    if signature[7] != "N":
        return "C2"
    return "?"


def run_kr_analysis(infile2, out_file):
    """Predict activity and stereochemistry for the KR domains in ``infile2``.

    Each query sequence is written to ``infile.fasta`` and profile-aligned
    with muscle against ``KRdomains_muscle.fasta`` (located next to this
    module); the alignment is read back from ``muscle.fasta``. The residues
    aligned to fixed positions of the reference sequence form an activity
    signature and a stereochemistry signature, which are then classified.

    One tab-separated line ``name, activity, stereochemistry`` per query is
    written to ``out_file``. Nothing is returned.
    """
    ##Core script
    #Extract activity and stereochemistry signatures from KR domains
    infile = utils.get_full_path(__file__, "KRdomains_muscle.fasta")
    muscle_file = "muscle.fasta"
    refsequence = "MAPSI|PKS|CAM00062.1|Erythromycin_synthase_modules_1_and_2|Sacc_KR1"
    # 0-based residue indices in the ungapped reference sequence.
    positions_act = [110, 134, 147, 151]
    positions_ste = [90, 91, 92, 139, 144, 147, 149, 151]

    dict2 = fastadict(infile2)
    namesb = fastanames(infile2)
    seqsb = fastaseqs(namesb, dict2)

    # name -> (activity signature, stereochemistry signature); the first
    # occurrence of a name wins, as with the former list.index() lookups.
    signatures = {}
    for name in namesb:
        seq = seqsb[namesb.index(name)]
        writefasta([name], [seq], "infile.fasta")
        #Run muscle and collect sequence positions from file
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", infile, "-in2",
            "infile.fasta", "-out", "muscle.fasta"
        ])
        muscle_dict = fastadict(muscle_file)
        muscle_seqs = lseqs(muscle_dict)
        muscle_names = lnames(muscle_dict)
        refseq = muscle_seqs[muscle_names.index(refsequence)]
        query_seq = muscle_seqs[muscle_names.index(name)]

        #Extract positions from query sequence
        seq_act = "".join(
            query_seq[col]
            for col in _kr_alignment_columns(refseq, positions_act))
        seq_ste = "".join(
            query_seq[col]
            for col in _kr_alignment_columns(refseq, positions_ste))
        if name not in signatures:
            signatures[name] = (seq_act, seq_ste)

    #Check activity
    activitydict = {}
    for name in namesb:
        activitydict[name] = _kr_activity(signatures[name][0])

    #Predict stereochemistry
    stereodict = {}
    for name in namesb:
        stereodict[name] = _kr_stereochemistry(signatures[name][1])

    #Output to file; the context manager makes sure it is flushed and closed
    with open(out_file, "w") as outfile:
        for name in namesb:
            outfile.write(name + "\t" + activitydict[name] + "\t" +
                          stereodict[name] + "\n")
def run_glimmer(seq_record, options):
    """Run glimmer3 to annotate prokaryotic sequences.

    Inside a temporary directory the record is written as fasta and the
    ``long-orfs``, ``extract``, ``build-icm`` and ``glimmer3`` programs from
    the gene finding base directory are run in turn. Every prediction line
    of the ``.predict`` result becomes a CDS feature appended to
    ``seq_record.features``. If a step reports a failure, an error is
    logged and the function returns without adding features.
    """
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # Leading dashes are dropped from the id before it is used in file names.
        name = seq_record.id.lstrip('-')
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        longorfs_file = '%s.longorfs' % name
        icm_file = '%s.icm' % name
        result_file = '%s.predict' % name

        # run long-orfs
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        long_orfs = [path.join(basedir, 'long-orfs')]
        long_orfs.extend([
            '-l', '-n', '-t', '1.15', '--trans_table', '11', fasta_file,
            longorfs_file
        ])
        out, err, _ = execute(long_orfs)
        if 'ERROR' in err:
            logging.error("Locating long orfs failed: %r" % err)
            return

        # run extract
        extract = [
            path.join(basedir, 'extract'), '-t', fasta_file, longorfs_file
        ]
        out, err, retcode = execute(extract)
        if out == '':
            logging.error("Failed to extract genes from model, aborting: %r" %
                          err)
            return

        # the extracted sequences are fed to build-icm on stdin
        build_icm = [path.join(basedir, 'build-icm'), '-r', icm_file]
        out, err, retcode = execute(build_icm, input=out)
        if err != '':
            logging.error("Failed to build gene model: %r" % err)
            return

        # run glimmer3
        glimmer = [path.join(basedir, 'glimmer3')]
        glimmer.extend([
            '-l', '-o', '50', '-g', '90', '-q', '3000', '-t', '30',
            '--trans_table', '11', fasta_file, icm_file, name
        ])
        out, err, retcode = execute(glimmer)
        if 'ERROR' in err:
            logging.error("Failed to run glimmer3: %r" % err)
            return

        with open(result_file, 'r') as predictions:
            for line in predictions:
                # skip first line
                if line.startswith('>'):
                    continue
                try:
                    orf_id, start, end, strand, score = line.split()
                    start = int(start)
                    end = int(end)
                    # Validated only; the orientation is taken from the
                    # coordinate order below.
                    int(strand)
                except ValueError:
                    logging.error('Malformatted glimmer output line %r' %
                                  line.rstrip())
                    # Skip the line instead of carrying on with unparsed
                    # values (same handling as run_prodigal).
                    continue

                if start > end:
                    bpy_strand = -1
                    start, end = end, start
                else:
                    bpy_strand = 1

                loc = FeatureLocation(start - 1, end, strand=bpy_strand)
                feature = SeqFeature(
                    location=loc,
                    id=orf_id,
                    type="CDS",
                    qualifiers={
                        'locus_tag':
                        ['ctg%s_%s' % (options.record_idx, orf_id)],
                        'note': ['Glimmer score: %s' % score]
                    })
                seq_record.features.append(feature)
def make_blastdb(inputfile, dbname):
    """Build a protein BLAST database named ``dbname`` from ``inputfile``.

    Nothing is returned; the outcome of the call is not inspected.
    """
    utils.execute([
        "makeblastdb",
        "-in", inputfile,
        "-out", dbname,
        "-dbtype", "prot"
    ])
def run_sandpuma(seq_record, nrpsnames, nrpsseqs, options):
    """Run SANDPUMA on the set of NRPS sequences from this genome.

    The five result files end up in ``options.raw_predictions_outputfolder``
    under names prefixed with ``ctg<record_idx>``. When
    ``options.dbgsandpuma`` is a non-empty directory name, previously stored
    result files are copied from there and nothing is executed.
    Raises RuntimeError when the SANDPUMA script writes anything to stderr.
    ``seq_record`` is accepted but not used in this function.
    """
    prefix = "ctg" + str(options.record_idx)
    # (file name SANDPUMA produces, file name in the output folder)
    result_files = [
        ("query.rep", prefix + "_nrpspredictor3_svm.txt"),
        ("ind.res.tsv", prefix + "_ind.res.tsv"),
        ("pid.res.tsv", prefix + "_pid.res.tsv"),
        ("sandpuma.tsv", prefix + "_sandpuma.tsv"),
        ("ens.res.tsv", prefix + "_ens.res.tsv"),
    ]

    # In debug mode, simply copy over previous predictions.
    if options.dbgsandpuma != '':
        for _, stored_name in result_files:
            shutil.copy(
                path.join(options.dbgsandpuma, stored_name),
                path.join(options.raw_predictions_outputfolder, stored_name))
        return

    #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor
    sandpumadir = utils.get_full_path(__file__, "sandpuma")
    with TemporaryDirectory(change=True):
        #Extract A domains from the NRPS sequences and write to FASTA file
        nrpsseqs_file = "input_adomains.fasta"
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)

        #Run SANDPUMA on the FASTA file
        script = sandpumadir + os.sep + 'predictnrps_nodep_par.sh'
        stderr_text = utils.execute(
            [script, 'input_adomains.fasta', sandpumadir, str(options.cpus)])[1]
        if stderr_text != '':
            logging.error('Running SANDPUMA gave an error')
            raise RuntimeError("Sandpuma failed to run: %s" % stderr_text)

        #Copy SANDPUMA (including NRPSPredictor2) results and move back to original directory
        for produced_name, stored_name in result_files:
            shutil.move(
                produced_name,
                options.raw_predictions_outputfolder + os.sep + stored_name)
def run_prodigal(seq_record, options):
    """Run prodigal to annotate prokaryotic sequences.

    Inside a temporary directory the record is written as fasta and
    prodigal is run with ``-f sco`` output. Metagenomic mode (``-p meta``)
    is used when ``options.genefinding`` is ``"prodigal-m"`` or the record
    is shorter than 20000 bases. Every result line starting with ``>``
    becomes a CDS feature appended to ``seq_record.features``. If prodigal
    reports an error on stderr, it is logged and no features are added.
    """
    # Default to looking prodigal up without a directory prefix; this keeps
    # basedir defined whichever of the two settings is missing.
    basedir = ""
    if "prodigal" in options and "basedir" in options.prodigal:
        basedir = options.prodigal.basedir

    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # Leading dashes are dropped from the id before it is used in file names.
        name = seq_record.id.lstrip('-')
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        if options.genefinding == "prodigal-m" or len(seq_record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])
        err = execute(prodigal)[1]
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r" % err)
            return

        with open(result_file, 'r') as predictions:
            for line in predictions:
                # only lines starting with '>' carry a prediction
                if not line.startswith('>'):
                    continue
                try:
                    # Unpacking is inside the try so a line with the wrong
                    # number of fields is reported and skipped as well.
                    gene_id, start, end, prodigalStrand = \
                        line[1:].rstrip().split("_")
                    start = int(start)
                    end = int(end)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r' %
                                  line.rstrip())
                    continue

                strand = 1 if prodigalStrand == "+" else -1
                if start > end:
                    strand = -1
                    start, end = end, start

                loc = FeatureLocation(start - 1, end, strand=strand)
                feature = SeqFeature(
                    location=loc,
                    id=gene_id,
                    type="CDS",
                    qualifiers={
                        'locus_tag':
                        ['ctg%s_%s' % (options.record_idx, gene_id)]
                    })
                seq_record.features.append(feature)
def _execute_tool(self, analysisResource, fileName=None, stdin_data=None):
    """Perform the external program execution described by ``analysisResource``.

    The command line is assembled from the ``./Execute`` element: program
    name, parameters (each with an optional prefix), the database located
    below ``self.options.activeSiteFinderHMMDir`` and then either
    ``fileName`` or a stdin marker. With ``fileName`` the program reads the
    file; otherwise ``stdin_data`` is passed on stdin.

    Returns the list of results parsed by ``SearchIO.parse`` using the
    format named in ``./Execute/BioPythonParser``. Returns an empty list
    when the database cannot be located, the program cannot be started,
    exits non-zero, or its output cannot be parsed.
    """
    # Assemble command line list
    # extract program name from XML
    executeObj = analysisResource.find('./Execute')
    cmdlineList = [executeObj.attrib['program']]

    # Cycle through parameters in XML
    for parameter in analysisResource.findall('./Execute/parameters/parameter'):
        if 'prefix' in parameter.attrib:
            cmdlineList.append(parameter.attrib['prefix'])
        cmdlineList.append(parameter.text)

    # Get database name
    database = analysisResource.find('./Execute/database')
    if 'prefix' in database.attrib:
        cmdlineList.append(database.attrib['prefix'])
    # Add searchpath
    database_path = utils.locate_file(
        path.join(self.options.activeSiteFinderHMMDir, database.text))
    if database_path is None:
        # A missing database would otherwise put None on the command line
        # and fail in " ".join() below.
        logging.warning('ASF: %s; database %s not found',
                        analysisResource.attrib['name'], database.text)
        return []
    cmdlineList.append(database_path)

    if fileName:
        # Get (optional) input file prefix (e.g. -query in blast)
        if 'inputfile_prefix' in executeObj.attrib:
            cmdlineList.append(executeObj.attrib['inputfile_prefix'])
        cmdlineList.append(fileName)

    if stdin_data:
        # Get (optional) prefix for stdin (e.g. "-" for hmmpfam / hmmscan)
        if 'STDINprefix' in executeObj.attrib:
            cmdlineList.append(executeObj.attrib['STDINprefix'])

    logging.debug("ASF: %s; external program call:\n%s",
                  analysisResource.attrib['name'], " ".join(cmdlineList))

    try:
        if fileName:
            logging.debug("Executing tool with file input")
            out, _, retcode = utils.execute(cmdlineList)
        else:
            logging.debug("Executing tools with STDIN input")
            out, _, retcode = utils.execute(cmdlineList, input=stdin_data)
    except OSError:
        logging.warning('OS error on execution of: %s', " ".join(cmdlineList))
        return []

    if retcode != 0:
        logging.warning('%s returned %s', cmdlineList[0], retcode)
        return []

    # Log the program output itself, not the StringIO wrapper object.
    logging.debug('External program output: %s', out)
    res_stream = StringIO(out)

    # Get Biopython parser information from XML
    biopython_parser = analysisResource.find('./Execute/BioPythonParser')
    try:
        results = list(SearchIO.parse(res_stream, biopython_parser.text))
    except Exception as e:
        logging.warning(
            'Error parsing results for active site finder analysis: %s ; no hits will be reported',
            e)
        results = []

    return results
def _minowa_cal_alignment_columns(aligned_reference, residue_positions):
    """Return alignment columns holding the given 0-based reference residues."""
    wanted = set(residue_positions)
    columns = []
    residue_index = 0
    for column, char in enumerate(aligned_reference):
        if char == "-":
            continue
        if residue_index in wanted:
            columns.append(column)
        residue_index += 1
    return columns


def run_minowa_cal(infile2, outfile):
    """Score every CAL domain in ``infile2`` against the substrate HMMs.

    Each query is written to ``infile.fasta`` and profile-aligned with
    muscle against ``CAL_domains_muscle.fasta`` (located next to this
    module). The residues aligned to the reference positions listed in
    ``CALpositions.txt`` (gaps become ``X``) are written to
    ``hmm_infile.fasta`` and searched with each HMM in ``CAL_HMMs``.

    For each query ``outfile`` receives a ``\\\\`` line, the query name, a
    ``Substrate:``/``Score:`` header and one tab-separated line per
    substrate in the order given by ``sortdictkeysbyvalues``.
    Nothing is returned.
    """
    ## Core
    infile = utils.get_full_path(__file__, "CAL_domains_muscle.fasta")
    muscle_file = "muscle.fasta"
    startpos = 43
    refsequence = "Q54297_CAL1"
    # One <name>.hmm file per substrate is expected in the CAL_HMMs directory.
    substrates = ["Acetyl-CoA", "AHBA", "fatty_acid", "NH2", "shikimic_acid"]

    with open(outfile, "w") as out_file:
        dict2 = fastadict(infile2)
        names = fastanames(infile2)
        seqs = fastaseqs(names, dict2)

        # The reference positions do not depend on the query: read them once.
        with open(utils.get_full_path(__file__, "CALpositions.txt"), "r") as handle:
            text = handle.read()
        text = text.replace("\r", "\n").strip().replace(' ', '_')
        positions = [int(pos) - startpos for pos in text.split("\t")]

        cal_hmms_dir = utils.get_full_path(__file__, 'CAL_HMMs')

        for query in names:
            seq = seqs[names.index(query)]
            writefasta([query], [seq], "infile.fasta")
            out_file.write("\\\\" + "\n" + query + "\n")

            #Run muscle and collect sequence positions from file
            utils.execute([
                "muscle", "-profile", "-quiet", "-in1", infile, "-in2",
                "infile.fasta", "-out", "muscle.fasta"
            ])

            #Count residues in ref sequence and put positions in list
            muscle_dict = fastadict(muscle_file)
            muscle_seqs = lseqs(muscle_dict)
            muscle_names = lnames(muscle_dict)
            refseq = muscle_seqs[muscle_names.index(refsequence)]
            poslist = _minowa_cal_alignment_columns(refseq, positions)

            #Extract positions from query sequence and create fasta file to use as input for hmm searches
            query_seq = muscle_seqs[muscle_names.index(query)]
            signature = "".join(
                "X" if query_seq[col] == "-" else query_seq[col]
                for col in poslist)
            writefasta([query], [signature], "hmm_infile.fasta")

            #Compare scores and output prediction
            raw_scores = []
            for substrate in substrates:
                hmmresults = hmmsearch(
                    "hmm_infile.fasta",
                    path.join(cal_hmms_dir, substrate + ".hmm"))
                raw_scores.append(hmmscores(hmmresults)[0])

            #Sort names & scores by scores:
            scoredict = {}
            for substrate, score in zip(substrates, raw_scores):
                scoredict[substrate] = float(score)

            out_file.write("Substrate:")
            out_file.write("\t")
            out_file.write("Score:")
            out_file.write("\n")
            for substrate in sortdictkeysbyvalues(scoredict):
                out_file.write(substrate)
                out_file.write("\t")
                out_file.write(str(scoredict[substrate]))
                out_file.write("\n")
def run_minowa_a(infile2, outfile):
    """Score every A domain in a FASTA file against the A-domain substrate HMMs.

    Each query sequence is profile-aligned with MUSCLE against the reference
    alignment ``A_domains_muscle.fasta``.  The alignment columns that hold
    the reference residues listed in ``Apositions.txt`` (tab-separated
    integers, shifted by the start offset 65) are then read from the query,
    gaps being replaced by ``X``, and the resulting signature is searched
    with every HMM in the ``A_HMMs`` directory.

    Arguments:
        infile2 -- path of the FASTA file with the query A domains
        outfile -- path of the text report; it is opened in write mode

    Report layout, per query: a line with two backslashes, the query name,
    a ``Substrate:<TAB>Score:`` header, one ``name<TAB>score`` line per HMM
    in the order returned by ``sortdictkeysbyvalues``, and an empty line.

    Side effects: (over)writes ``infile.fasta``, ``muscle.fasta`` and
    ``hmm_infile.fasta`` in the current working directory.

    Returns None.
    """
    reference_alignment = utils.get_full_path(__file__,
                                              "A_domains_muscle.fasta")
    positions_file = utils.get_full_path(__file__, "Apositions.txt")
    a_hmms_dir = utils.get_full_path(__file__, 'A_HMMs')
    query_file = "infile.fasta"
    muscle_file = "muscle.fasta"
    signature_file = "hmm_infile.fasta"
    refsequence = "P0C062_A1"
    startpos = 65
    # Reported substrate names, in the order the HMMs are searched.
    substrates = (
        "2-3-diaminoproprionate", "3mGlu", "5NhOrn", "Abu", "Ahp",
        "alaninol", "Asn", "Asp", "beta-Lys", "bOHTyr", "Cys", "DHB",
        "fOHOrn", "Glu", "guanidinoacetic_acid", "His", "Hpg2Cl", "Ile",
        "Leu", "MeAsp", "OHOrn", "Orn", "Phenylacetate", "Pro", "Qna",
        "Sar", "Thr-4-Cl", "Trp", "Val", "3-HPA", "4-MHA", "Aad", "Aeo",
        "Ala", "Arg", "B-Ala", "Bmt", "capreomycidine", "Dab", "DHpg",
        "Gln", "Gly", "hAsn", "homoTyr", "Hpg", "Kyn", "Lys", "mPro",
        "OmAsp", "Phe", "pipecolate", "QA", "Sal", "Ser", "Thr", "Tyr",
    )
    # Each HMM file is "<name>.hmm" except where the reported name is
    # spelled differently from the file on disk.
    hmm_file_stems = {"2-3-diaminoproprionate": "2-3-diaminopropionate"}

    with open(outfile, "w") as out_file:
        query_dict = fastadict(infile2)
        query_names = fastanames(infile2)
        query_seqs = fastaseqs(query_names, query_dict)

        # The reference positions do not depend on the query, so they are
        # read once instead of once per sequence.
        with open(positions_file, "r") as handle:
            text = handle.read()
        text = text.replace("\r", "\n").strip().replace(' ', '_')
        positions = set(int(field) - startpos for field in text.split("\t"))

        # NOTE(review): the flattened original does not show where the
        # per-query loop ends; everything below is treated as one iteration
        # because each step consumes the files written for the current query.
        for query, seq in zip(query_names, query_seqs):
            writefasta([query], [seq], query_file)
            out_file.write("\\\\" + "\n" + query + "\n")

            # Align the single query against the reference profile.
            utils.execute([
                "muscle", "-profile", "-quiet", "-in1", reference_alignment,
                "-in2", query_file, "-out", muscle_file
            ])
            muscle_dict = fastadict(muscle_file)
            muscle_seqs = lseqs(muscle_dict)
            muscle_names = lnames(muscle_dict)

            # Map ungapped reference residue numbers to alignment columns.
            refseq = muscle_seqs[muscle_names.index(refsequence)]
            columns = []
            residue_nr = 0
            for column, residue in enumerate(refseq):
                if residue == "-":
                    continue
                if residue_nr in positions:
                    columns.append(column)
                residue_nr += 1

            # Read the same columns from the query; gaps become "X".
            query_seq = muscle_seqs[muscle_names.index(query)]
            signature = "".join(
                "X" if query_seq[column] == "-" else query_seq[column]
                for column in columns)
            writefasta([query], [signature], signature_file)

            # Score the signature against every substrate HMM.
            scoredict = {}
            for substrate in substrates:
                hmm_stem = hmm_file_stems.get(substrate, substrate)
                hmmresults = hmmsearch(
                    signature_file, path.join(a_hmms_dir, hmm_stem + ".hmm"))
                scoredict[substrate] = float(hmmscores(hmmresults)[0])

            out_file.write("Substrate:\tScore:\n")
            for substrate in sortdictkeysbyvalues(scoredict):
                out_file.write(substrate + "\t" + str(scoredict[substrate])
                               + "\n")
            # NOTE(review): the flattened original leaves the nesting of this
            # final newline ambiguous; it is written once per query, like the
            # entry separator in run_pkssignature_analysis -- confirm.
            out_file.write("\n")
def run_pkssignature_analysis(infile2, outfile):
    """Extract AT-domain signatures and compare them with known PKS signatures.

    Each query sequence is profile-aligned with MUSCLE against the reference
    alignment ``AT_domains_muscle.fasta``.  The alignment columns that hold
    the reference residues listed in ``ATpositions.txt`` (tab-separated
    integers, shifted by the start offset 7) are read from the query to form
    its signature; gap characters are kept as they are.  Every signature is
    then compared position by position with each entry of
    ``pks_signatures.fasta``.

    Arguments:
        infile2 -- path of the FASTA file with the query AT domains
        outfile -- path of the text report; it is opened in write mode after
                   all signatures have been extracted

    Report layout, per query: ``//``, then ``name<TAB>signature``, then one
    ``reference<TAB>reference signature<TAB>score`` line for each of the
    first ten names returned by ``sortdictkeysbyvalues`` whose score is
    above 50, then two empty lines.

    Side effects: (over)writes ``infile.fasta`` and ``muscle.fasta`` in the
    current working directory.

    Returns None.
    """
    reference_alignment = utils.get_full_path(__file__,
                                              "AT_domains_muscle.fasta")
    positions_file = utils.get_full_path(__file__, "ATpositions.txt")
    query_file = "infile.fasta"
    muscle_file = "muscle.fasta"
    refsequence = "P0AAI9_AT1"
    startpos = 7

    query_dict = fastadict(infile2)
    querysignames = fastanames(infile2)
    query_seqs = fastaseqs(querysignames, query_dict)

    # The reference positions do not depend on the query, so they are read
    # once instead of once per sequence.
    with open(positions_file, "r") as handle:
        text = handle.read()
    text = text.strip().replace(' ', '_')
    positions = set(int(field) - startpos for field in text.split("\t"))

    querysigseqs = []
    for query, seq in zip(querysignames, query_seqs):
        writefasta([query], [seq], query_file)

        # Align the single query against the reference profile.
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", reference_alignment,
            "-in2", query_file, "-out", muscle_file
        ])
        muscle_dict = fastadict(muscle_file)
        muscle_seqs = lseqs(muscle_dict)
        muscle_names = lnames(muscle_dict)

        # Map ungapped reference residue numbers to alignment columns.
        refseq = muscle_seqs[muscle_names.index(refsequence)]
        columns = []
        residue_nr = 0
        for column, residue in enumerate(refseq):
            if residue == "-":
                continue
            if residue_nr in positions:
                columns.append(column)
            residue_nr += 1

        query_seq = muscle_seqs[muscle_names.index(query)]
        querysigseqs.append("".join(query_seq[column] for column in columns))

    # Load the reference PKS signatures; a repeated name resolves to its
    # first sequence, as the original index-based lookup did.
    signatures_file = utils.get_full_path(__file__, "pks_signatures.fasta")
    signaturesdict = fastadict(signatures_file)
    signaturenames = fastanames(signatures_file)
    signatureseqs = fastaseqs(signaturenames, signaturesdict)
    signature_by_name = {}
    for name, sigseq in zip(signaturenames, signatureseqs):
        signature_by_name.setdefault(name, sigseq)

    with open(outfile, "w") as out_file:
        for query, querysigseq in zip(querysignames, querysigseqs):
            scoredict = {}
            for name in signaturenames:
                # Pairwise comparison: a reference shorter than the query
                # scores no match for the missing positions instead of
                # raising IndexError.
                matches = sum(
                    1 for query_aa, ref_aa
                    in zip(querysigseq, signature_by_name[name])
                    if query_aa == ref_aa)
                # The divisor 24 is hard-coded; presumably the expected
                # signature length -- confirm.
                scoredict[name] = (float(matches) / 24) * 100

            out_file.write("//\n" + query + "\t" + querysigseq + "\n")
            for name in sortdictkeysbyvalues(scoredict)[:10]:
                score = scoredict[name]
                if score > 50:
                    out_file.write(name + "\t" + signature_by_name[name]
                                   + "\t" + ("%.0f" % score) + "\n")
            out_file.write("\n\n")
def run_glimmerhmm(seq_record, options):
    """Predict genes with GlimmerHMM and add them to the record as CDS features.

    The record is written as FASTA into a temporary working directory and
    ``glimmerhmm`` is run on it with the training folder
    ``train_<options.glimmerhmm_train_folder>`` and the ``-g`` flag.  The
    tab-separated output captured from the command is parsed into genes, and
    one ``SeqFeature`` of type ``CDS`` per gene is appended to
    ``seq_record.features``, with locus tag
    ``ctg<options.record_idx>_<orfname>``.

    Arguments:
        seq_record -- record to annotate; it is modified in place
        options -- configuration object; ``glimmerhmm_train_folder`` and
                   ``record_idx`` are read here

    Returns None.  If the stderr output of GlimmerHMM contains ``ERROR`` the
    problem is logged and no features are added.  Missing predictions are
    logged as errors but do not raise.
    """
    # NOTE(review): the returned directory is not used in this function; the
    # call is kept in case it has side effects -- confirm.
    utils.get_genefinding_basedir(options)

    with TemporaryDirectory(change=True):
        # Write FASTA file and run GlimmerHMM
        utils.fix_record_name_id(seq_record, options)
        # Drop leading dashes from the id before using it as a file name.
        name = seq_record.id.lstrip('-') or "unknown"
        fasta_file = '%s.fasta' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        glimmerhmm = [
            'glimmerhmm',
            fasta_file,
            utils.get_full_path(
                __file__, "train_%s" % options.glimmerhmm_train_folder),
            "-g",
        ]
        out, err, retcode = execute(glimmerhmm)
        if 'ERROR' in err:
            logging.error("Failed to run GlimmerHMM: %r" % err)
            return

    # Parse GlimmerHMM predictions
    if "CDS" not in out:
        logging.error("GlimmerHMM gene prediction failed: no genes found.")
    # The first two lines and the last one are skipped (presumably header
    # lines and the empty string after the final newline -- confirm).
    lines = out.replace("\r", " ").split("\n")[2:-1]

    orfnames = []
    positions = []
    strands = []
    starts = []
    ends = []
    last_index = len(lines) - 1
    for x, line in enumerate(lines):
        columns = line.split("\t")
        if len(columns) <= 1:
            continue

        # A gene is closed by its last line: the final line of the output or
        # the line just before the next "mRNA" line.  The very first line
        # never closes a gene.
        closes_gene = x != 0 and (x == last_index or "mRNA" in lines[x + 1])
        if not closes_gene:
            if "mRNA" not in line:
                starts.append(int(columns[3]))
                ends.append(int(columns[4]))
            continue

        bpy_strand = 1 if columns[6] == "+" else -1
        strands.append(bpy_strand)
        starts.append(int(columns[3]))
        ends.append(int(columns[4]))

        # NOTE(review): the padding shrinks by one zero per gene ("orf000000",
        # "orf00001", ...) and vanishes from the sixth gene on.  It is kept
        # unchanged because the names end up in the emitted locus tags.
        orfnr = len(orfnames)
        orfnames.append("orf" + (5 - orfnr) * "0" + str(orfnr))

        # Reverse-strand exons are stored in reverse order of appearance.
        if bpy_strand == -1:
            starts.reverse()
            ends.reverse()
        # Pair each start with its own end.  A coordinate of 0 is raised to 1
        # first, and the start is then lowered by one (presumably converting
        # 1-based output to 0-based locations -- confirm).
        positions.append([
            [(1 if start == 0 else start) - 1, 1 if end == 0 else end]
            for start, end in zip(starts, ends)
        ])
        starts = []
        ends = []

    if not orfnames:
        logging.error("GlimmerHMM gene prediction failed. Please check the " \
            "format of your input FASTA file.")

    # Create seq_record features for identified genes
    for orfname, bpy_strand, genepositions in zip(orfnames, strands,
                                                  positions):
        exon_locations = [
            FeatureLocation(exonstart, exonend, strand=bpy_strand)
            for exonstart, exonend in genepositions
        ]
        if len(exon_locations) == 1:
            # Genes with only one CDS get a plain location.
            loc = exon_locations[0]
        else:
            # Genes with multiple exons get a compound location.
            loc = CompoundLocation(exon_locations)
        feature = SeqFeature(
            location=loc,
            id=orfname,
            type="CDS",
            qualifiers={
                'locus_tag': ['ctg%s_%s' % (options.record_idx, orfname)]
            })
        seq_record.features.append(feature)