Python ProcessPool.apply Examples

Programming Language: Python
Namespace/Package Name: pathos.multiprocessing
Class/Type: ProcessPool
Method/Function: apply
Examples at hotexamples.com: 1
Python ProcessPool.apply - 1 example found. This is the top-rated real-world Python example of pathos.multiprocessing.ProcessPool.apply extracted from open source projects. You can rate examples to help us improve their quality.
Frequently Used Methods
Show Hide
ProcessPool(30)
clear(30)
close(30)
join(30)
map(30)
uimap(25)
imap(23)
restart(22)
apipe(12)
terminate(8)
amap(6)
_clear(1)
apply(1)
get(1)
pipe(1)
ready(1)
Example #1
Show file
def main():

	"""
	Main Function
	"""
	now = datetime.datetime.now()
	
	c_args = parse_args(__file__)

	#### Define global variables ####	
	genome_info_file=os.path.abspath(c_args['genome_info'])
	project_name=c_args['project_name']
	seq_dir=os.path.abspath(c_args['seq_dir'])
	output_dir=os.path.abspath(c_args['output_dir'])
	
	filterparam={"match_cutoff":c_args["percent_match"],
				"hitcov_cutoff":c_args["hitcoverage"],
				"chicov_cutoff":c_args["chimcoverage"]}
	
	iteration=0                                     #### iteration number
	resume_iteration=None                           #### resume iteration from this number
	iter_dir=None                                   #### path for current iteration directory
	project_dir=None                                #### path for project directory 
	projectdir_dict=defaultdict()                   #### dict of project dir output
	iterdir_dict=defaultdict()                      #### dict of iter dir output
	current_timestamp=now.strftime("%Y%m%d_%H%M%S") ### current timestamp 
		
		
	if c_args["resume_iter"]!=None:
		resume_iteration=int(c_args['resume_iter'])
		
	### create project_dir paths ###
	project_dir=os.path.join(output_dir,project_name)
	
	### log file ###
	log_file=os.path.join(os.path.abspath("../log"),project_name+"_"+current_timestamp+".log")
	
	print "[{}]: Analysis Start\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))

	#### create log file for the project ####
	logging.basicConfig(filename=log_file,format="[%(asctime)s] %(levelname)-8s %(message)s",datefmt='%a, %d %b %Y %H:%M:%S',level=logging.INFO)
	logging.info("This is a run-log for DeNoGAP2 pipeline executed on {}\n".format(current_timestamp))
	#logging.info("[Command executed] {}".format(command))

	#### read/check genome information file #####
	logging.info("Reading genome information file {}".format(genome_info_file))
	print "[{}]: Reading genome information file {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),genome_info_file)
	
	genome_dict=read_genome_info(genome_info_file)
	
	### list of reference genome names and other genome names ###
	list_ref_genome=[genome_name for genome_name in genome_dict
						if int(genome_dict[genome_name]["REFERENCE"])==1]
	
	list_other_genome=[genome_name for genome_name in genome_dict
						if int(genome_dict[genome_name]["REFERENCE"])==0]				
	
	### read names for the fasta files in the data directory #####
	logging.info("Reading fasta file names from the sequence directory {}".format(seq_dir))
	print "[{}]: Reading fasta file names from the sequence directory {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),seq_dir)
	
	fasta_dict=su().get_file_name_from_dir(seq_dir)
	fasta_list=fasta_dict.values()
	
	#### check if protein sequences for all genome names specified in information file are present in the data directory ####
	for genome_name in genome_dict:
		if not genome_name in fasta_dict:
			logging.error("ERROR, Protein sequence fasta file for {} not found in {}\n".format(genome_name,data_dir))
			print "[{}]: ERROR, Protein sequence fasta file for {} not found in {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),genome_name,data_dir)
			sys.exit()	
	
	#### If not already present set-up project directory #####
	logging.info("Setting up project directory for DeNoGAP-HMM {}".format(project_dir))
	print "[{}]: Setting up project directory for DeNoGAP-HMM {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),project_dir)
	
	if not os.path.exists(project_dir):
		logging.info("Creating new project dir {}\n".format(project_dir))
		print "[{}]: Creating new project dir {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),project_dir)
		os.makedirs(project_dir)
		projectdir_dict=SetDir().mk_project_dirs(project_dir)
	else:
		logging.info("Using existing project dir {}\n".format(project_dir))
		print "[{}]: Using existing project dir {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),project_dir)
		projectdir_dict=SetDir().mk_project_dirs(project_dir)
		
	##### Create and load sequences into index database #####	
	logging.info("Creating Sequence index database {}".format(os.path.join(project_dir,"Sequence_index.idx")))
	print "[{}]: Creating Sequence index database {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),os.path.join(project_dir,"Sequence_index.idx"))
	
	fasta_seq_idx=SeqIO.index_db(os.path.join(projectdir_dict["TMP"],"Sequences_indexdb.idx"),
								 fasta_list,"fasta")					 
		
	#### Initial homolog mapping for reference genomes
	#### Starts with iteration 0
	#### If folder for iteration 0 is already present then program will
	#### terminate further execution to avoid any overwriting
	
	if resume_iteration==None:

		### setup dir for the iteration ###
		iterdir_dict=SetDir().mk_hmm_iter_dirs(project_dir,iteration)
	
		logging.info("Starting pairwise sequence comparision step\n")
		print "[{}]: Starting pairwise sequence comparision step\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		#### make sequence database ###
		logging.info("Creating sequence database for pairwise comparision\n")
		print "[{}]: Creating sequence database for pairwise comparision\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))

		seqdb_dict=defaultdict(dict)
		
		for genome_name in list_ref_genome:
			seq_dict=SeqIO.index(fasta_dict[genome_name],'fasta')
			seqdb_dict[genome_name]=seq_dict
			seq_dict.close()
				
		seqdb_dir=projectdir_dict["SEQ_DB"]
		seqdb_file="DBSEQ.fasta"
		
		logging.info("Saved sequence database {} for pairwise comparision at {}\n".format(seqdb_file,seqdb_dir))
		print "[{}]: Saved sequence database {} for pairwise comparision {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),seqdb_file,seqdb_dir)		
				
		seqdb_path=su().write_seqfile(seqdb_dir,seqdb_file,seqdb_dict)
		
		#### Align query sequences against sequence database ####
		
		logging.info("Aligning Sequences\n")
		print "[{}]: Aligning Sequences\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		parsed_alignment_dict=defaultdict(dict)		
		
		for genome_name in list_ref_genome:
			
				logging.info("QUERY: {}\n".format(genome_name))
				print "[{}]: QUERY: {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),genome_name)
					
				outpath=os.path.join(iterdir_dict["ALL_MATCH"],genome_name+".hmmalign.txt")
				domtabpath=os.path.join(iterdir_dict["ALL_MATCH"],genome_name+".domtab.txt")
				bestmatchpath=os.path.join(iterdir_dict["BEST_MATCH"],genome_name+".bhh.txt")
				partialmatchpath=os.path.join(iterdir_dict["PARTIAL_MATCH"],genome_name+".phh.txt")
				chimeramatchpath=os.path.join(iterdir_dict["CHIMERIC_MATCH"],genome_name+".chh.txt")                                                             
					                        
						
				hmmer_proc_returncode,hmmerstderr=Hmmer("phmmer").run_hmmer(c_args,
				                                                            seqdb_path,
				                                                            fasta_dict[genome_name],
				                                                            outpath,
				                                                            domtabpath)
					
				if hmmer_proc_returncode!=0:
					print "[{}]: HMMER Execution Failed, Exiting with an error {}, "\
						"returncode: {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),
												  hmmerstderr,
					                              hmmer_proc_returncode)
					logging.error("HMMER Execution Failed, Exiting with an error {}, "\
						"returncode: {}\n".format(hmmerstderr,
												  hmmer_proc_returncode))
					sys.exit()
						
				domhmmer_dict=Hmmer("phmmer").parse_hmmer_domtab(domtabpath)
				simhmmer_dict=Hmmer("phmmer").parse_hmmer_similarity(outpath)
				hmmer_result=Hmmer("phmmer").add_hmmer_stats(domhmmer_dict,simhmmer_dict)
					
				domhmmer_dict=None
				simhmmer_dict=None
				gc.collect()
					
				fixed_hmmer_result=FixMultiHSP("phmmer").fix_hsps(hmmer_result,
																  c_args['avg_accuracy'])
				filtered_hmmer_result=Hmmer("phmmer").filter_hits(filterparam,
																  fixed_hmmer_result)
					
				hmmer_result=None
				fixed_hmmer_result=None
				gc.collect()
					
				write_hmmer_result(genome_name,
					                bestmatchpath,
					                filtered_hmmer_result["BEST"])
				write_hmmer_result(genome_name,
					                partialmatchpath,
					                filtered_hmmer_result["PARTIAL"])
				write_hmmer_result(genome_name,
					                chimeramatchpath,
					                filtered_hmmer_result["CHIMERA"])
		
		
		logging.info("MCL Clustering Best-Hits\n")
		print "[{}]: MCL Clustering Best-Hits\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		mcl_abc_file=os.path.join(projectdir_dict["TMP"],"seq.abc")
			                   	
		os.system("cat {}/* | cut -f 1,2,11 | sed '1d' > {}".format(
				iterdir_dict["BEST_MATCH"],mcl_abc_file))
				
		mcl_cluster=sc().mcl_clustering(mcl_abc_file,c_args["mcl_inflation"])
		
		logging.info("Adding Group-IDs\n")
		print "[{}]: Adding Group-ID\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		grouped_cluster=sc().add_cluster_ids(mcl_cluster,start_at=1000)
		
		hmm_cluster_file=os.path.join(iterdir_dict['CLUSTER'],
		                              "Hmmcluster_"+now.strftime("%Y%m%d_%H%M%S")+".txt")
		
		logging.info("Saving Cluster file at {}\n".format(hmm_cluster_file))
		print "[{}]: Saving Cluster file at {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),
		                                                 hmm_cluster_file)
		
		cp_cmd="cp {0} {1}".format(grouped_cluster,hmm_cluster_file)
		proc=subprocess.Popen([cp_cmd],shell=True,stdout=subprocess.PIPE,
		                      stderr=subprocess.PIPE)
		proc.wait()
		stdout,stderr=proc.communicate()
		
		if proc.returncode!=0:
			print stderr
			sys.exit()
		else:	                     
			logging.info("Pairwise sequence comparision completed successfully\n")
			print "[{}]: Pairwise sequence comparision completed successfully\n"\
				  .format(now.strftime("%Y:%m:%d_%H:%M:%S"))
				  
	elif resume_iteration!=None:
		"""
		Execute iterative block for each genome
		read cluster file from iteration to resume from
		build hmm models/ sequence database
		scan hmm models/ sequence database iteratively against each genome
		"""
		
		while len(list_other_genome)!=0:
		
			iteration=int(resume_iteration)+1
			iterdir_dict=SetDir().mk_hmm_iter_dirs(project_dir,iteration)
			
			### getting model cluster directory #####
			model_cluster_dir=os.path.join(projectdir_dict["BASE"],
										   "iter_{0}".format(resume_iteration),"CLUSTER")
			
			### getting model cluster file ####							   
			model_cluster_file=[os.path.join(model_cluster_dir,file_name) 
			                     for file_name in os.listdir(model_cluster_dir)
			                        if file_name.startswith("Hmmcluster")]
			
			### Call function to build hmm models and if singleton then add 
			### to sequence database                        
			#hdb().build_hmmdb(model_cluster_file[0],seq_dir,
			#								outdir_dict["HMM_DB"],
			#								outdir_dict["SEQ_DB"])
			model_dir=os.path.join(projectdir_dict["HMMER_DB"],"HMM_MODELS")
			os.makedirs(model_dir)
			
			#### Run hmmbuild ####
			
			with open(model_cluster_file[0],"r") as clustfile:
			
				pool=ProcessPool(nodes=c_args["cpu"])
				[pool.apply(build_hmm,args=(clustline,fasta_dict,model_dir))
					     for clustline in clustfile]