Example #1
0
def main():

	"""
	Main Function
	"""
	now = datetime.datetime.now()
	
	c_args = parse_args(__file__)

	#### Define global variables ####	
	genome_info_file=os.path.abspath(c_args['genome_info'])
	project_name=c_args['project_name']
	seq_dir=os.path.abspath(c_args['seq_dir'])
	output_dir=os.path.abspath(c_args['output_dir'])
	
	filterparam={"match_cutoff":c_args["percent_match"],
				"hitcov_cutoff":c_args["hitcoverage"],
				"chicov_cutoff":c_args["chimcoverage"]}
	
	iteration=0                                     #### iteration number
	resume_iteration=None                           #### resume iteration from this number
	iter_dir=None                                   #### path for current iteration directory
	project_dir=None                                #### path for project directory 
	projectdir_dict=defaultdict()                   #### dict of project dir output
	iterdir_dict=defaultdict()                      #### dict of iter dir output
	current_timestamp=now.strftime("%Y%m%d_%H%M%S") ### current timestamp 
		
		
	if c_args["resume_iter"]!=None:
		resume_iteration=int(c_args['resume_iter'])
		
	### create project_dir paths ###
	project_dir=os.path.join(output_dir,project_name)
	
	### log file ###
	log_file=os.path.join(os.path.abspath("../log"),project_name+"_"+current_timestamp+".log")
	
	print "[{}]: Analysis Start\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))

	#### create log file for the project ####
	logging.basicConfig(filename=log_file,format="[%(asctime)s] %(levelname)-8s %(message)s",datefmt='%a, %d %b %Y %H:%M:%S',level=logging.INFO)
	logging.info("This is a run-log for DeNoGAP2 pipeline executed on {}\n".format(current_timestamp))
	#logging.info("[Command executed] {}".format(command))

	#### read/check genome information file #####
	logging.info("Reading genome information file {}".format(genome_info_file))
	print "[{}]: Reading genome information file {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),genome_info_file)
	
	genome_dict=read_genome_info(genome_info_file)
	
	### list of reference genome names and other genome names ###
	list_ref_genome=[genome_name for genome_name in genome_dict
						if int(genome_dict[genome_name]["REFERENCE"])==1]
	
	list_other_genome=[genome_name for genome_name in genome_dict
						if int(genome_dict[genome_name]["REFERENCE"])==0]				
	
	### read names for the fasta files in the data directory #####
	logging.info("Reading fasta file names from the sequence directory {}".format(seq_dir))
	print "[{}]: Reading fasta file names from the sequence directory {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),seq_dir)
	
	fasta_dict=su().get_file_name_from_dir(seq_dir)
	fasta_list=fasta_dict.values()
	
	#### check if protein sequences for all genome names specified in information file are present in the data directory ####
	for genome_name in genome_dict:
		if not genome_name in fasta_dict:
			logging.error("ERROR, Protein sequence fasta file for {} not found in {}\n".format(genome_name,data_dir))
			print "[{}]: ERROR, Protein sequence fasta file for {} not found in {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),genome_name,data_dir)
			sys.exit()	
	
	#### If not already present set-up project directory #####
	logging.info("Setting up project directory for DeNoGAP-HMM {}".format(project_dir))
	print "[{}]: Setting up project directory for DeNoGAP-HMM {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),project_dir)
	
	if not os.path.exists(project_dir):
		logging.info("Creating new project dir {}\n".format(project_dir))
		print "[{}]: Creating new project dir {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),project_dir)
		os.makedirs(project_dir)
		projectdir_dict=SetDir().mk_project_dirs(project_dir)
	else:
		logging.info("Using existing project dir {}\n".format(project_dir))
		print "[{}]: Using existing project dir {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),project_dir)
		projectdir_dict=SetDir().mk_project_dirs(project_dir)
		
	##### Create and load sequences into index database #####	
	logging.info("Creating Sequence index database {}".format(os.path.join(project_dir,"Sequence_index.idx")))
	print "[{}]: Creating Sequence index database {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),os.path.join(project_dir,"Sequence_index.idx"))
	
	fasta_seq_idx=SeqIO.index_db(os.path.join(projectdir_dict["TMP"],"Sequences_indexdb.idx"),
								 fasta_list,"fasta")					 
		
	#### Initial homolog mapping for reference genomes
	#### Starts with iteration 0
	#### If folder for iteration 0 is already present then program will
	#### terminate further execution to avoid any overwriting
	
	if resume_iteration==None:

		### setup dir for the iteration ###
		iterdir_dict=SetDir().mk_hmm_iter_dirs(project_dir,iteration)
	
		logging.info("Starting pairwise sequence comparision step\n")
		print "[{}]: Starting pairwise sequence comparision step\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		#### make sequence database ###
		logging.info("Creating sequence database for pairwise comparision\n")
		print "[{}]: Creating sequence database for pairwise comparision\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))

		seqdb_dict=defaultdict(dict)
		
		for genome_name in list_ref_genome:
			seq_dict=SeqIO.index(fasta_dict[genome_name],'fasta')
			seqdb_dict[genome_name]=seq_dict
			seq_dict.close()
				
		seqdb_dir=projectdir_dict["SEQ_DB"]
		seqdb_file="DBSEQ.fasta"
		
		logging.info("Saved sequence database {} for pairwise comparision at {}\n".format(seqdb_file,seqdb_dir))
		print "[{}]: Saved sequence database {} for pairwise comparision {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),seqdb_file,seqdb_dir)		
				
		seqdb_path=su().write_seqfile(seqdb_dir,seqdb_file,seqdb_dict)
		
		#### Align query sequences against sequence database ####
		
		logging.info("Aligning Sequences\n")
		print "[{}]: Aligning Sequences\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		parsed_alignment_dict=defaultdict(dict)		
		
		for genome_name in list_ref_genome:
			
				logging.info("QUERY: {}\n".format(genome_name))
				print "[{}]: QUERY: {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),genome_name)
					
				outpath=os.path.join(iterdir_dict["ALL_MATCH"],genome_name+".hmmalign.txt")
				domtabpath=os.path.join(iterdir_dict["ALL_MATCH"],genome_name+".domtab.txt")
				bestmatchpath=os.path.join(iterdir_dict["BEST_MATCH"],genome_name+".bhh.txt")
				partialmatchpath=os.path.join(iterdir_dict["PARTIAL_MATCH"],genome_name+".phh.txt")
				chimeramatchpath=os.path.join(iterdir_dict["CHIMERIC_MATCH"],genome_name+".chh.txt")                                                             
					                        
						
				hmmer_proc_returncode,hmmerstderr=Hmmer("phmmer").run_hmmer(c_args,
				                                                            seqdb_path,
				                                                            fasta_dict[genome_name],
				                                                            outpath,
				                                                            domtabpath)
					
				if hmmer_proc_returncode!=0:
					print "[{}]: HMMER Execution Failed, Exiting with an error {}, "\
						"returncode: {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),
												  hmmerstderr,
					                              hmmer_proc_returncode)
					logging.error("HMMER Execution Failed, Exiting with an error {}, "\
						"returncode: {}\n".format(hmmerstderr,
												  hmmer_proc_returncode))
					sys.exit()
						
				domhmmer_dict=Hmmer("phmmer").parse_hmmer_domtab(domtabpath)
				simhmmer_dict=Hmmer("phmmer").parse_hmmer_similarity(outpath)
				hmmer_result=Hmmer("phmmer").add_hmmer_stats(domhmmer_dict,simhmmer_dict)
					
				domhmmer_dict=None
				simhmmer_dict=None
				gc.collect()
					
				fixed_hmmer_result=FixMultiHSP("phmmer").fix_hsps(hmmer_result,
																  c_args['avg_accuracy'])
				filtered_hmmer_result=Hmmer("phmmer").filter_hits(filterparam,
																  fixed_hmmer_result)
					
				hmmer_result=None
				fixed_hmmer_result=None
				gc.collect()
					
				write_hmmer_result(genome_name,
					                bestmatchpath,
					                filtered_hmmer_result["BEST"])
				write_hmmer_result(genome_name,
					                partialmatchpath,
					                filtered_hmmer_result["PARTIAL"])
				write_hmmer_result(genome_name,
					                chimeramatchpath,
					                filtered_hmmer_result["CHIMERA"])
		
		
		logging.info("MCL Clustering Best-Hits\n")
		print "[{}]: MCL Clustering Best-Hits\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		mcl_abc_file=os.path.join(projectdir_dict["TMP"],"seq.abc")
			                   	
		os.system("cat {}/* | cut -f 1,2,11 | sed '1d' > {}".format(
				iterdir_dict["BEST_MATCH"],mcl_abc_file))
				
		mcl_cluster=sc().mcl_clustering(mcl_abc_file,c_args["mcl_inflation"])
		
		logging.info("Adding Group-IDs\n")
		print "[{}]: Adding Group-ID\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"))
		
		grouped_cluster=sc().add_cluster_ids(mcl_cluster,start_at=1000)
		
		hmm_cluster_file=os.path.join(iterdir_dict['CLUSTER'],
		                              "Hmmcluster_"+now.strftime("%Y%m%d_%H%M%S")+".txt")
		
		logging.info("Saving Cluster file at {}\n".format(hmm_cluster_file))
		print "[{}]: Saving Cluster file at {}\n".format(now.strftime("%Y:%m:%d_%H:%M:%S"),
		                                                 hmm_cluster_file)
		
		cp_cmd="cp {0} {1}".format(grouped_cluster,hmm_cluster_file)
		proc=subprocess.Popen([cp_cmd],shell=True,stdout=subprocess.PIPE,
		                      stderr=subprocess.PIPE)
		proc.wait()
		stdout,stderr=proc.communicate()
		
		if proc.returncode!=0:
			print stderr
			sys.exit()
		else:	                     
			logging.info("Pairwise sequence comparision completed successfully\n")
			print "[{}]: Pairwise sequence comparision completed successfully\n"\
				  .format(now.strftime("%Y:%m:%d_%H:%M:%S"))
				  
	elif resume_iteration!=None:
		"""
		Execute iterative block for each genome
		read cluster file from iteration to resume from
		build hmm models/ sequence database
		scan hmm models/ sequence database iteratively against each genome
		"""
		
		while len(list_other_genome)!=0:
		
			iteration=int(resume_iteration)+1
			iterdir_dict=SetDir().mk_hmm_iter_dirs(project_dir,iteration)
			
			### getting model cluster directory #####
			model_cluster_dir=os.path.join(projectdir_dict["BASE"],
										   "iter_{0}".format(resume_iteration),"CLUSTER")
			
			### getting model cluster file ####							   
			model_cluster_file=[os.path.join(model_cluster_dir,file_name) 
			                     for file_name in os.listdir(model_cluster_dir)
			                        if file_name.startswith("Hmmcluster")]
			
			### Call function to build hmm models and if singleton then add 
			### to sequence database                        
			#hdb().build_hmmdb(model_cluster_file[0],seq_dir,
			#								outdir_dict["HMM_DB"],
			#								outdir_dict["SEQ_DB"])
			model_dir=os.path.join(projectdir_dict["HMMER_DB"],"HMM_MODELS")
			os.makedirs(model_dir)
			
			#### Run hmmbuild ####
			
			with open(model_cluster_file[0],"r") as clustfile:
			
				pool=ProcessPool(nodes=c_args["cpu"])
				[pool.apply(build_hmm,args=(clustline,fasta_dict,model_dir))
					     for clustline in clustfile]