def main(protId, config_file):
    """Build the nexus file for one protein and colourize the species tree.

    Loads the configuration, switches into the protein's working directory
    (``<path_work_dir>/<protId>``), collects the taxon labels of the species
    tree, then calls ``generateNexusFile`` and ``colourizeTree.main``.  The
    original working directory is restored before returning.

    Side effects: rebinds the module globals ``prot_config``, ``prot_id``,
    ``taxonset`` and ``nexus_file``, which ``generateNexusFile`` is called
    without arguments and therefore presumably reads.

    Args:
        protId: protein identifier; names the working directory and files.
        config_file: path handed to ``configure.setParams``.
    """
    global prot_config, prot_id, taxonset, nexus_file

    start_dir = os.getcwd()

    prot_id = protId
    prot_config = configure.setParams(config_file)
    species_id = prot_config.species

    # Everything below runs relative to the per-protein working directory.
    os.chdir(prot_config.path_work_dir + '/' + prot_id)

    cache_dir = prot_config.path_cache
    nexus_file = 'nexus_' + prot_id + '.nexus'

    # Taxon labels of the species tree, with quotes stripped and blanks
    # replaced by underscores.
    species_trees = dendropy.TreeList.get_from_path(
        prot_config.species_tree, "newick")
    taxonset = [
        str(taxon).replace("'", "").replace(" ", "_")
        for taxon in species_trees.taxon_namespace
    ]

    generateNexusFile()
    colourizeTree.main(
        nexus_file,
        prot_config.hamstr_oma_tree_map,
        prot_id,
        prot_config.species_tree,
        prot_config.plot_figtree,
        prot_config.species_MaxLikMatrix,
        species_id,
        cache_dir,
    )

    os.chdir(start_dir)
def _run_protein_pipeline(params, prot_id, query_seq, config_file):
    """Run every stage enabled in *params* for a single protein."""
    if params.preprocessing:
        preprocessing.Preprocessing(prot_id, query_seq, config_file)
    if params.traceability_calculation:
        traceabilityCalculation.main(prot_id, config_file)
    if params.mapTraceabilitySpeciesTree:
        mapToSpeciesTree.main(prot_id, config_file)


def main(argv):
    """Command-line entry point of protTrace.

    Parses ``-i/--id``, ``-f/--fasta``, ``-c/--config`` and ``-h/--help``
    from *argv*, loads the configuration and runs the enabled stages for
    every protein of the id list or, failing that, of the fasta file.

    Args:
        argv: argument list without the program name (e.g. ``sys.argv[1:]``).

    Raises:
        SystemExit: with status 2 on invalid options, on ``-h`` (status kept
            from the original implementation) and when no configuration
            file is given.
    """
    usage = ('Invalid arguments:\nUsage:\tprotTrace.py -i <omaIdsFile> | '
             '-f <fastaSeqsFile> -c <configFile> [-help]')
    id_list, fasta_list, config_file = '', '', ''

    try:
        opts, args = getopt.getopt(
            argv, "f:i:c:h", ["fasta=", "id=", "config=", "help"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # getopt only yields the options declared above, so no catch-all branch
    # is needed here.
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print("USAGE:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-h]\n\t-i\t\tText file containing protein OMA ids (1 id per line)\n\t-f\t\tList of input protein sequences in fasta format\n\t-c\t\tConfiguration file for setting program's dependencies")
            sys.exit(2)
        elif opt in ('-i', '--id'):
            id_list = arg
        elif opt in ('-f', '--fasta'):
            fasta_list = arg
        elif opt in ('-c', '--config'):
            config_file = arg

    # Without this guard abspath('') silently resolves to the current
    # directory and the failure only surfaces inside the configuration code.
    if not config_file:
        print(usage)
        sys.exit(2)
    config_file = os.path.abspath(config_file)

    # Load the tool parameters once; each stage re-reads them from the path.
    proteinParams = configure.setParams(config_file)

    if id_list != '':
        with open(id_list) as id_handle:
            for line in id_handle:
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. a trailing newline) carry no id.
                    continue
                oma_id = fields[0]
                print('##### Running for OMA id: %s #####' % oma_id)
                # The id mode has no query sequence; the string 'None' is the
                # placeholder the preprocessing stage has always received.
                _run_protein_pipeline(
                    proteinParams, oma_id, 'None', config_file)
    elif fasta_list != '':
        with open(fasta_list) as fa:
            for seqs in fa:
                if '>' in seqs:
                    print('##### Running for fasta id: %s #####'
                          % seqs[1:].rstrip('\n'))
                    inputId = seqs.split()[0][1:]
                    # Only the line following the header is used as the
                    # sequence, exactly as before.
                    querySeq = next(fa, None)
                    if querySeq is None:
                        print('WARNING: No sequence line found for fasta id: '
                              '%s. Skipping!' % inputId)
                        break
                    _run_protein_pipeline(
                        proteinParams, inputId, querySeq, config_file)
def Preprocessing(prot_id, querySeq, config_file):
    """Prepare all per-protein inputs for the traceability simulation.

    Loads the configuration with ``configure.setParams``, creates and enters
    ``<path_work_dir>/<prot_id>`` and then runs the following steps, each
    guarded by its own configuration flag:

    1. parse the proteome of the query species (``search_proteome``),
    2. search the OMA ortholog group (``search_ortholog_groups``),
    3. collect the ortholog sequences (``search_ortholog_sequences``),
    4. extend the ortholog set with HaMStR / HaMStR-OneSeq,
    5. align the orthologs (``perform_msa``),
    6. reconstruct the tree and scaling factor (``calculate_scaling_factor``),
    7. estimate indel parameters (``calculate_indel``),
    8. write domain constraints and the REvolver XML configuration
       (``traceability_calculation``).

    The previous working directory is restored before returning.

    Side effects: rebinds the module globals listed in the ``global``
    statement below and creates files inside the working directory and the
    cache directory.

    Args:
        prot_id: protein identifier; names the working directory and files.
        querySeq: query sequence text; written verbatim after the header of
            the ortholog file when no OMA group sequences are fetched.
        config_file: path handed to ``configure.setParams``.

    Raises:
        SystemExit: when a directory cannot be created, the ortholog file
            cannot be read, or the user interrupts the HaMStR step.
    """
    # Remember where we started so the cwd can be restored at the end.
    rootDir = os.getcwd()
    print 'Prot_id: ', prot_id
    # Configuration object holding the values of the program configuration file.
    prot_config = configure.setParams(config_file)
    # Module-level state shared with the helper functions called below.
    global cache, cache_dir, work_dir, omaIdFile, orth_file, aln_file, phy_file, id_file, proteome_file, tree_file, trans_file, hmm_file, xml_file, REvolver_output_dir, species_id, indel_file, scale_file
    # Names of the temporary and output files; all are relative to work_dir
    # unless they are built from work_dir explicitly.
    species_id = prot_config.species
    proteome_file = 'proteome_' + prot_id
    id_file = 'ogIds_' + prot_id + '.txt'
    work_dir = prot_config.path_work_dir + '/' + prot_id
    cache_dir = prot_config.path_cache
    omaIdFile = work_dir + '/omaId.txt'
    orth_file = 'ogSeqs_' + prot_id + '.fa'
    aln_file = 'ogSeqs_' + prot_id + '.aln'
    phy_file = 'ogSeqs_' + prot_id + '.phy'
    tree_file = 'RAxML_bestTree.' + prot_id  ### CHANGE HERE IF RAxML OUTPUT NAMES CHANGE ###
    trans_file = 'ogSeqs_' + prot_id + '.trans'
    hmm_file = prot_id + '.hmm'
    xml_file = 'revolver_config_' + prot_id + '.xml'
    REvolver_output_dir = work_dir + '/REvolver_output/'
    indel_file = 'indel_' + prot_id
    scale_file = 'scale_' + prot_id
    delTemp = prot_config.delete_temp
    cache = prot_config.reuse_cache
    # Create the working directory that holds temporary and output files.
    if not os.path.exists(work_dir):
        print '##### Creating working directory:\n', work_dir
        try:
            os.mkdir(work_dir)
        # NOTE(review): bare except also hides non-filesystem errors;
        # consider narrowing to OSError.
        except:
            sys.exit('ERROR: Working directory cannot be created!')
    # All relative file names above are resolved against work_dir from here on.
    os.chdir(work_dir)
    # Step 1: parse the proteome of the configured species from the OMA
    # database sequences file.
    if prot_config.search_proteome:
        startProcessTime = time.time()
        parseOmaProteome(species_id, prot_config.path_oma_seqs, prot_config.makeblastdb, proteome_file)
        # Later steps receive the proteome as an absolute path.
        proteome_file = os.path.abspath(proteome_file)
        print '#####\tTIME TAKEN: %s mins\tSearch proteome#####' % ((time.time() - startProcessTime) / 60)
    # Step 2: search the ortholog group for the input OMA id.
    # For fasta input the OMA id is first parsed from the OMA database,
    # followed by the extraction of the OMA group.
    if prot_config.search_ortholog_groups:
        startProcessTime = time.time()
        run = findOmaOrthologs(prot_id, querySeq, prot_config.path_oma_group, prot_config.path_oma_seqs, proteome_file, prot_config.formatdb, prot_config.blastp, delTemp, species_id)
        print '#####\tTIME TAKEN: %s mins\tSearch OGs#####' % ((time.time() - startProcessTime) / 60)
    # Step 3: collect the sequences of all OMA ids in the ortholog group from
    # the OMA database sequences file.
    if prot_config.search_ortholog_sequences:
        if cache and os.path.exists(orth_file):
            print 'OMA orthologs sequences file exist. Reusing it!'
        else:
            # NOTE(review): `run` is only bound when search_ortholog_groups is
            # enabled; with that flag off and no reusable file this raises
            # NameError. Presumably a value of 2 means an OMA group was
            # found -- confirm against findOmaOrthologs.
            if run == 2:
                startProcessTime = time.time()
                findOmaSequences(prot_id, prot_config.path_oma_seqs, species_id, prot_config.hamstr_oma_tree_map)
                print '#####\tTIME TAKEN: %s mins\tSearch OGSeqs#####' % ((time.time() - startProcessTime) / 60)
            else:
                # No group sequences fetched: the ortholog file contains only
                # the query, headed by the species id.
                print '##### Preparing ortholog file #####'
                fOrth = open(orth_file, 'w')
                fOrth.write('>' + species_id + '\n' + querySeq)
                fOrth.close()
    # Step 4: extend the ortholog set with a HaMStR search, using the OMA
    # ortholog sequences as core-ortholog set.
    try:
        f = open(orth_file).read().split('\n')
        # More than three '\n'-separated entries is treated as "two or more
        # sequences" (assumes one header line plus one sequence line per
        # record -- TODO confirm). Otherwise only HaMStR-OneSeq is tried.
        if len(f) > 3:
            if prot_config.run_hamstr:
                startProcessTime = time.time()
                if cache and os.path.exists(cache_dir + '/' + orth_file):
                    print 'Pre-HaMStR computed orthologs file found in Cache for re-use!'
                    os.system('cp %s %s' % (cache_dir + '/' + orth_file, orth_file))
                else:
                    success = hamstr_search.main(prot_config.hamstr, orth_file, prot_id, prot_config.hamstr_oma_tree_map, prot_config.formatdb, prot_config.blastp, delTemp)
                    if success:
                        print '#####\tTIME TAKEN: %s mins\tHaMStR#####' % ((time.time() - startProcessTime) / 60)
                        # Store the extended ortholog file in the cache.
                        os.system('cp %s %s' % (orth_file, cache_dir + '/' + orth_file))
                    else:
                        # HaMStR did not succeed: fall back to HaMStR-OneSeq
                        # if it is enabled.
                        if prot_config.run_hamstrOneSeq:
                            print '##### HaMStROneSeq search for orthologs #####'
                            startProcessTime = time.time()
                            if cache and os.path.exists(cache_dir + '/' + orth_file):
                                print 'Pre-HaMStR computed orthologs file found in Cache for re-use!'
                                os.system('cp %s %s' % (cache_dir + '/' + orth_file, orth_file))
                            else:
                                # Rewrite the orthologs file so that it holds
                                # only the first record of the query species.
                                ortholog_temp = open(orth_file).read().split('\n')
                                rewrite_orth_file = open(orth_file, 'w')
                                for orthLines in range(len(ortholog_temp) - 1):
                                    if '>' in ortholog_temp[orthLines] and species_id in ortholog_temp[orthLines]:
                                        rewrite_orth_file.write(ortholog_temp[orthLines] + '\n' + ortholog_temp[orthLines + 1])
                                        break
                                rewrite_orth_file.close()
                                run_hamstrOneSeq(prot_config.hamstr, os.path.abspath(orth_file), prot_config.hamstr_oma_tree_map, prot_id, prot_config.formatdb, prot_config.blastp, proteome_file, delTemp)
                                print '#####\tTIME TAKEN: %s mins\tHaMStR-OneSeq#####' % ((time.time() - startProcessTime) / 60)
                                os.system('cp %s %s' % (orth_file, cache_dir + '/' + orth_file))
            elif prot_config.run_hamstrOneSeq:
                # HaMStR disabled but HaMStR-OneSeq enabled.
                startProcessTime = time.time()
                print '##### HaMStROneSeq search for orthologs #####'
                if cache and os.path.exists(cache_dir + '/' + orth_file):
                    print 'Pre-HaMStR computed orthologs file found in Cache for re-use!'
                    os.system('cp %s %s' % (cache_dir + '/' + orth_file, orth_file))
                else:
                    # Keep only the query species records in the orthologs
                    # file and list the taxa of all other records in
                    # inputTaxaSet_oneSeq.txt.
                    ortholog_temp = open(orth_file).read().split('\n')
                    rewrite_orth_file = open(orth_file, 'w')
                    inputTaxaSet = open('inputTaxaSet_oneSeq.txt', 'w')
                    for orthLines in range(len(ortholog_temp) - 1):
                        if '>' in ortholog_temp[orthLines] and species_id in ortholog_temp[orthLines]:
                            rewrite_orth_file.write(ortholog_temp[orthLines] + '\n' + ortholog_temp[orthLines + 1])
                        elif '>' in ortholog_temp[orthLines] and not species_id in ortholog_temp[orthLines]:
                            # Header without the leading '>'; looked up by
                            # substring match in the HaMStR/OMA mapping file.
                            inOmaId = ortholog_temp[orthLines].split()[0][1:]
                            for mapLine in open(prot_config.hamstr_oma_tree_map):
                                if inOmaId in mapLine:
                                    # First column of the first matching
                                    # mapping line is recorded.
                                    inputTaxaSet.write(mapLine.split()[0] + '\n')
                                    break
                    inputTaxaSet.close()
                    rewrite_orth_file.close()
                    run_hamstrOneSeq(prot_config.hamstr, os.path.abspath(orth_file), prot_config.hamstr_oma_tree_map, prot_id, prot_config.formatdb, prot_config.blastp, proteome_file, delTemp)
                    print '#####\tTIME TAKEN: %s mins\tHaMStR-OneSeq#####' % ((time.time() - startProcessTime) / 60)
                    os.system('cp %s %s' % (orth_file, cache_dir + '/' + orth_file))
            else:
                print '#####\tCalculating traceability with ONLY OMA set\t#####'
        elif len(f) > 0 and len(f) < 4:
            # Single-sequence ortholog file: only HaMStR-OneSeq applies.
            if prot_config.run_hamstrOneSeq:
                print '##### HaMStROneSeq search for orthologs #####'
                startProcessTime = time.time()
                if cache and os.path.exists(cache_dir + '/' + orth_file):
                    print 'Pre-HaMStR computed orthologs file found in Cache for re-use!'
                    os.system('cp %s %s' % (cache_dir + '/' + orth_file, orth_file))
                else:
                    run_hamstrOneSeq(prot_config.hamstr, os.path.abspath(orth_file), prot_config.hamstr_oma_tree_map, prot_id, prot_config.formatdb, prot_config.blastp, proteome_file, delTemp)
                    print '#####\tTIME TAKEN: %s mins\tHaMStR-OneSeq#####' % ((time.time() - startProcessTime) / 60)
                    os.system('cp %s %s' % (orth_file, cache_dir + '/' + orth_file))
        else:
            # NOTE(review): str.split always returns at least one element, so
            # len(f) is never 0 and this branch looks unreachable; an empty
            # file ends up in the single-sequence branch above -- confirm.
            sys.exit('ERROR: No sequence found in OMA sequences! The ortholog sequences file is empty!')
    except IOError:
        sys.exit('ERROR: Orthologs sequences file is invalid!')
    except KeyboardInterrupt:
        sys.exit('Keyboard interruption by user!!!')
    # Step 5: multiple sequence alignment of the ortholog sequences.
    if prot_config.perform_msa:
        print '##### Performing MSA of the orthologs sequences #####'
        startProcessTime = time.time()
        performMSA(prot_config.msa, prot_config.clustalw)
        print '#####\tTIME TAKEN: %s mins\tMAFFT#####' % ((time.time() - startProcessTime) / 60)
    # Step 6: call the tree reconstruction module, which generates the tree
    # from the degapped alignment and calculates the scaling factor based on
    # the maximum likelihood distance between species.
    if prot_config.calculate_scaling_factor:
        print '##### Tree reconstruction and scaling factor calculation #####'
        startProcessTime = time.time()
        treeReconstruction.main(prot_config.tree_reconstruction, prot_config.msa, prot_config.clustalw, prot_config.degapping, orth_file, prot_config.aa_substitution_matrix, prot_id, prot_config.treePuzzle, prot_config.parameters_treePuzzle, prot_config.hamstr_oma_tree_map, prot_config.species_MaxLikMatrix, scale_file, tree_file, delTemp, prot_config.default_scaling_factor, cache_dir)
        print '#####\tTIME TAKEN: %s mins\tRAxML#####' % ((time.time() - startProcessTime) / 60)
    # Step 7: indel parameters.
    if prot_config.calculate_indel:
        if cache and os.path.exists(indel_file):
            print 'Pre-computed indel found for re-use!'
        else:
            # Transform the alignment before estimating indels.
            print '##### Transforming MSA based on indel blocks #####'
            alignmentLength = 0
            try:
                alignmentLength = transformAlignment.main(phy_file, trans_file)
            # NOTE(review): every failure is swallowed here and the indel
            # calculation then runs with alignmentLength == 0.
            except:
                pass
            calculateIndels(tree_file, trans_file, alignmentLength, prot_config.iqtree24, prot_config.default_indel, prot_config.default_indel_distribution)
    # Step 8: domain constraint file and XML configuration for REvolver.
    if prot_config.traceability_calculation:
        # Create the output directory for REvolver.
        if not os.path.exists(REvolver_output_dir):
            print '##### Creating REvolver output directory:\n', REvolver_output_dir
            try:
                os.mkdir(REvolver_output_dir)
            # NOTE(review): bare except; consider narrowing to OSError.
            except:
                sys.exit('ERROR: REvolver output directory cannot be created!')
        print '##### Generating domain constraints for REvolver #####'
        hmmscan(prot_config.hmmscan, orth_file, prot_config.pfam_database, hmm_file, prot_id, species_id)
        # Prepare the XML config file used as input for REvolver.
        print '##### Preparing XML configuration file for REvolver #####'
        if os.path.exists(scale_file):
            # The scaling factor is the first line of the scale file.
            f = open(scale_file).read().split('\n')
            scaling_factor = f[0]
        else:
            print 'WARNING: Scaling factor file not found. Using default value:', prot_config.default_scaling_factor
            scaling_factor = prot_config.default_scaling_factor
            # Persist the default so that the scale file exists afterwards.
            # NOTE(review): an older comment here spoke of a random pick
            # between 0.50 and 2.00; the code writes the configured default.
            # write() requires the default to be a string -- confirm.
            writeScale = open(scale_file, 'w')
            writeScale.write(scaling_factor)
            writeScale.close()
        if os.path.exists(indel_file):
            # Line 1: indel rate, line 2: indel length distribution parameter.
            # NOTE(review): a file with fewer than two lines raises IndexError.
            f = open(indel_file).read().split('\n')
            indel = f[0]
            p = f[1]
        else:
            print 'WARNING: Indel file not found. Using default value:', prot_config.default_indel
            indel = prot_config.default_indel
            p = prot_config.default_indel_distribution
            # Persist the defaults so that the indel file exists afterwards.
            # NOTE(review): an older comment here spoke of random picks
            # (indel 0.05-1.9, distribution 0.0-0.9); the code writes the
            # configured defaults, which must be strings for the
            # concatenation below -- confirm.
            writeIndel = open(indel_file, 'w')
            writeIndel.write(indel + '\n' + p)
            writeIndel.close()
        prepareXML(xml_file, prot_config.pfam_database, prot_config.hmmfetch, prot_config.aa_substitution_matrix, indel, p, scaling_factor, prot_config.simulation_tree, prot_id, hmm_file, REvolver_output_dir)
    # Restore the working directory the caller started in.
    os.chdir(rootDir)
def main(p_id, config_file):
    """Run the REvolver/BLAST traceability simulation in parallel.

    Loads the configuration, enters ``<path_work_dir>/<p_id>`` and, unless a
    cached ``decay_summary_<p_id>.txt_parameter`` file may be reused, maps
    ``actual_traceability_calculation`` over ``simulation_runs`` run indices
    with a process pool.  The per-taxon detections of all runs are written to
    ``full_decay_results_<p_id>.txt`` (one line per taxon) and their
    detection fraction to ``decay_summary_<p_id>.txt`` before ``decayParams``
    is called.  The previous working directory is restored on return.

    Side effects: rebinds the module globals ``prot_id``, ``prot_config``,
    ``blastHitId``, ``xml_file``, ``proteome_file`` and ``taxonset``, which
    the worker function is mapped without and therefore presumably reads.

    Args:
        p_id: protein identifier; names the working directory and files.
        config_file: path handed to ``configure.setParams``.

    Raises:
        SystemExit: when ``omaId.txt`` is missing, on keyboard interruption,
            or when the multiprocessing step fails.
    """
    global prot_id, prot_config, blastHitId, xml_file, proteome_file, taxonset

    prot_id = p_id
    rootDir = os.getcwd()
    prot_config = configure.setParams(config_file)
    cache = prot_config.reuse_cache
    nr_proc = prot_config.nr_processors
    work_dir = prot_config.path_work_dir + '/' + prot_id
    os.chdir(work_dir)

    oma_id_path = work_dir + '/omaId.txt'
    if not os.path.exists(oma_id_path):
        sys.exit('### ERROR: No reciprocal BLAST hit id found!!!! ###')
    with open(oma_id_path) as id_handle:
        # The hit id is the first line of the file.
        blastHitId = id_handle.read().split('\n')[0]

    xml_file = 'revolver_config_' + prot_id + '.xml'
    proteome_file = 'proteome_' + prot_id

    trees = dendropy.TreeList.get_from_path(
        prot_config.simulation_tree, "newick")
    # Taxon labels without quotes, in reversed namespace order.
    taxonset = [str(element).replace("'", "")
                for element in trees.taxon_namespace][::-1]

    print('##### Running REvolver / BLAST cycles: #####')
    start_time = time.time()

    cached = cache and os.path.exists(
        'decay_summary_%s.txt_parameter' % prot_id)
    if not cached:
        # Same coercion as the sequential implementation, so a value read as
        # text from the configuration also works.
        n_runs = int(prot_config.simulation_runs)

        pool = Pool(processes=nr_proc)
        try:
            results = pool.map(actual_traceability_calculation, range(n_runs))
        except KeyboardInterrupt as e:
            pool.terminate()
            pool.join()
            print("Interrupting REvolver")
            sys.exit(e)
        except Exception:
            # Without results nothing can be aggregated; stop with the
            # original message instead of failing later on an unbound name.
            pool.terminate()
            pool.join()
            sys.exit("ERROR: Multiprocessing step <-> Traceability Calculations.")
        else:
            pool.close()
            pool.join()

        print('#####\tTIME TAKEN: %s mins REvolver/BLAST#####'
              % ((time.time() - start_time) / 60))

        # Each worker result maps taxon -> detection; collect them per taxon.
        detection_probability = {}
        for res in results:
            for key, value in res.items():
                detection_probability.setdefault(key, []).append(value)

        with open('full_decay_results_%s.txt' % prot_id, 'w') as ffull:
            with open('decay_summary_%s.txt' % prot_id, 'w') as fsum:
                for taxa in taxonset:
                    detections = detection_probability[taxa]
                    ffull.write(taxa + ' ')
                    ffull.write(''.join(str(element) for element in detections))
                    ffull.write('\n')
                    count = sum(int(element) for element in detections)
                    fsum.write(str(float(count) / float(n_runs)) + '\n')

        print('##### Calculating decay parameters #####')
        decayParams(prot_config.R, prot_id, prot_config.decay_script)

    os.chdir(rootDir)
def main(prot_id, config_file):
    """Run the REvolver/BLAST traceability simulation sequentially.

    Loads the configuration, enters ``<path_work_dir>/<prot_id>`` and, unless
    a cached ``decay_summary_<prot_id>.txt_parameter`` file may be reused,
    performs ``simulation_runs`` cycles of ``run_revolver`` followed by
    ``run_blast``.  A taxon counts as detected in a run when a BLAST output
    line has the taxon in its first tab-separated column and the reciprocal
    hit id (first line of ``omaId.txt``) in its second.  The detections are
    written to ``full_decay_results_<prot_id>.txt``, the detection fractions
    to ``decay_summary_<prot_id>.txt``, and ``decayParams`` is called.  The
    previous working directory is restored on return.

    Args:
        prot_id: protein identifier; names the working directory and files.
        config_file: path handed to ``configure.setParams``.

    Raises:
        SystemExit: when ``omaId.txt`` is missing, on keyboard interruption,
            or when one run fails ``max_trials`` times in a row.
    """
    max_trials = 10

    rootDir = os.getcwd()
    prot_config = configure.setParams(config_file)
    cache = prot_config.reuse_cache
    work_dir = prot_config.path_work_dir + '/' + prot_id
    os.chdir(work_dir)

    oma_id_path = work_dir + '/omaId.txt'
    if not os.path.exists(oma_id_path):
        sys.exit('### ERROR: No reciprocal BLAST hit id found!!!! ###')
    with open(oma_id_path) as id_handle:
        blastHitId = id_handle.read().split('\n')[0]

    xml_file = 'revolver_config_' + prot_id + '.xml'
    proteome_file = 'proteome_' + prot_id

    trees = dendropy.TreeList.get_from_path(
        prot_config.simulation_tree, "newick")
    # Taxon labels without quotes, in reversed namespace order.
    taxonset = [str(element).replace("'", "")
                for element in trees.taxon_namespace][::-1]
    detection_probability = dict((taxa, []) for taxa in taxonset)

    print('##### Running REvolver / BLAST cycles: #####')
    start_time = time.time()
    # Printed for information only; the run itself goes through run_revolver.
    command = 'java -Xmx2G -Xms2G -cp "%s" revolver %s' % (
        prot_config.REvolver, xml_file)
    print('REvolver calculations command:  %s' % command)

    cached = cache and os.path.exists(
        'decay_summary_%s.txt_parameter' % prot_id)
    if not cached:
        n_runs = int(prot_config.simulation_runs)
        for i in range(n_runs):
            print('Run:  %s' % (i + 1))
            success = False
            trials = 0
            while not success and trials < max_trials:
                trials += 1
                try:
                    run_revolver(prot_config.REvolver, xml_file)
                    blastOutput = run_blast(
                        prot_config.blastp, prot_id, proteome_file)
                    hit_rows = [line.split('\t')
                                for line in blastOutput.split('\n')]
                    # Collect the whole run first so a failing trial cannot
                    # leave partial entries behind for the retry.
                    run_detection = {}
                    for taxa in taxonset:
                        run_detection[taxa] = int(any(
                            len(row) > 1
                            and row[0] == taxa
                            and row[1] == blastHitId
                            for row in hit_rows))
                except KeyboardInterrupt:
                    sys.exit('Keyboard interruption by user!!!')
                except (Exception, SystemExit) as err:
                    # Best-effort retry, as before (SystemExit included, in
                    # case a helper aborts via sys.exit), but no longer silent.
                    print('WARNING: REvolver/BLAST trial %d of run %d failed: %s'
                          % (trials, i + 1, err))
                else:
                    for taxa in taxonset:
                        detection_probability[taxa].append(run_detection[taxa])
                    success = True
            # Abort only when every trial failed; a success on the last
            # permitted trial is a success.
            if not success:
                sys.exit('TOO MANY TRIALS FOR REVOLVER!!! Check REvolver configuration file.')

        print('#####\tTIME TAKEN: %s mins REvolver/BLAST#####'
              % ((time.time() - start_time) / 60))

        with open('full_decay_results_%s.txt' % prot_id, 'w') as ffull:
            with open('decay_summary_%s.txt' % prot_id, 'w') as fsum:
                for taxa in taxonset:
                    detections = detection_probability[taxa]
                    ffull.write(taxa + ' ')
                    ffull.write(''.join(str(element) for element in detections))
                    ffull.write('\n')
                    count = sum(int(element) for element in detections)
                    fsum.write(str(float(count) / float(n_runs)) + '\n')

        print('##### Calculating decay parameters #####')
        decayParams(prot_config.R, prot_id, prot_config.decay_script)

    os.chdir(rootDir)
def main(prot_id, config_file):
    """Run the REvolver/BLAST traceability simulation sequentially.

    Loads the configuration, enters ``<path_work_dir>/<prot_id>`` and, unless
    a cached ``decay_summary_<prot_id>.txt_parameter`` file may be reused,
    performs ``simulation_runs`` cycles of ``run_revolver`` followed by
    ``run_blast``.  A taxon counts as detected in a run when a BLAST output
    line has the taxon in its first tab-separated column and the reciprocal
    hit id (first line of ``omaId.txt``) in its second.  The detections are
    written to ``full_decay_results_<prot_id>.txt``, the detection fractions
    to ``decay_summary_<prot_id>.txt``, and ``decayParams`` is called.  The
    previous working directory is restored on return.

    Args:
        prot_id: protein identifier; names the working directory and files.
        config_file: path handed to ``configure.setParams``.

    Raises:
        SystemExit: when ``omaId.txt`` is missing, on keyboard interruption,
            or when one run fails ``max_trials`` times in a row.
    """
    max_trials = 10

    rootDir = os.getcwd()
    prot_config = configure.setParams(config_file)
    cache = prot_config.reuse_cache
    work_dir = prot_config.path_work_dir + '/' + prot_id
    os.chdir(work_dir)

    oma_id_path = work_dir + '/omaId.txt'
    if not os.path.exists(oma_id_path):
        sys.exit('### ERROR: No reciprocal BLAST hit id found!!!! ###')
    with open(oma_id_path) as id_handle:
        blastHitId = id_handle.read().split('\n')[0]

    xml_file = 'revolver_config_' + prot_id + '.xml'
    proteome_file = 'proteome_' + prot_id

    trees = dendropy.TreeList.get_from_path(
        prot_config.simulation_tree, "newick")
    # Taxon labels without quotes, in reversed namespace order.
    taxonset = [str(element).replace("'", "")
                for element in trees.taxon_namespace][::-1]
    detection_probability = dict((taxa, []) for taxa in taxonset)

    print('##### Running REvolver / BLAST cycles: #####')
    start_time = time.time()
    # Printed for information only; the run itself goes through run_revolver.
    command = 'java -Xmx2G -Xms2G -cp "%s" revolver %s' % (
        prot_config.REvolver, xml_file)
    print('REvolver calculations command:  %s' % command)

    cached = cache and os.path.exists(
        'decay_summary_%s.txt_parameter' % prot_id)
    if not cached:
        n_runs = int(prot_config.simulation_runs)
        for i in range(n_runs):
            print('Run:  %s' % (i + 1))
            success = False
            trials = 0
            while not success and trials < max_trials:
                trials += 1
                try:
                    run_revolver(prot_config.REvolver, xml_file)
                    blastOutput = run_blast(
                        prot_config.blastp, prot_id, proteome_file)
                    hit_rows = [line.split('\t')
                                for line in blastOutput.split('\n')]
                    # Collect the whole run first so a failing trial cannot
                    # leave partial entries behind for the retry.
                    run_detection = {}
                    for taxa in taxonset:
                        run_detection[taxa] = int(any(
                            len(row) > 1
                            and row[0] == taxa
                            and row[1] == blastHitId
                            for row in hit_rows))
                except KeyboardInterrupt:
                    sys.exit('Keyboard interruption by user!!!')
                except (Exception, SystemExit) as err:
                    # Best-effort retry, as before (SystemExit included, in
                    # case a helper aborts via sys.exit), but no longer silent.
                    print('WARNING: REvolver/BLAST trial %d of run %d failed: %s'
                          % (trials, i + 1, err))
                else:
                    for taxa in taxonset:
                        detection_probability[taxa].append(run_detection[taxa])
                    success = True
            # Abort only when every trial failed; a success on the last
            # permitted trial is a success.
            if not success:
                sys.exit('TOO MANY TRIALS FOR REVOLVER!!! Check REvolver configuration file.')

        print('#####\tTIME TAKEN: %s mins REvolver/BLAST#####'
              % ((time.time() - start_time) / 60))

        with open('full_decay_results_%s.txt' % prot_id, 'w') as ffull:
            with open('decay_summary_%s.txt' % prot_id, 'w') as fsum:
                for taxa in taxonset:
                    detections = detection_probability[taxa]
                    ffull.write(taxa + ' ')
                    ffull.write(''.join(str(element) for element in detections))
                    ffull.write('\n')
                    count = sum(int(element) for element in detections)
                    fsum.write(str(float(count) / float(n_runs)) + '\n')

        print('##### Calculating decay parameters #####')
        decayParams(prot_config.R, prot_id, prot_config.decay_script)

    os.chdir(rootDir)