def main():
    """Run the DeNoGAP2 HMM homolog-mapping pipeline from the command line.

    Reads its configuration from ``parse_args(__file__)`` and uses these
    keys: ``genome_info``, ``project_name``, ``seq_dir``, ``output_dir``,
    ``percent_match``, ``hitcoverage``, ``chimcoverage``, ``resume_iter``,
    ``avg_accuracy``, ``mcl_inflation`` and ``cpu``.

    Two modes are supported:

    * ``resume_iter`` is None -- iteration 0: every reference genome is
      compared against a database built from all reference genomes with
      phmmer, hits are filtered and written, and best hits are clustered
      with MCL into an ``Hmmcluster_<timestamp>.txt`` file.
    * ``resume_iter`` is set -- the cluster file of that iteration is
      read and ``build_hmm`` is called for each of its lines.

    Progress is written both to a log file under ``../log`` (relative to
    the current working directory) and to stdout.

    Raises:
        SystemExit: with status 1 when a genome has no FASTA file, when
            HMMER returns a non-zero code, when copying the cluster file
            fails, or when no cluster file exists for the resumed
            iteration.
    """
    now = datetime.datetime.now()
    c_args = parse_args(__file__)

    def console(message):
        """Print *message* prefixed with the current wall-clock time."""
        # The time is taken per message; formatting the start-up time
        # made every progress line carry the same timestamp.
        stamp = datetime.datetime.now().strftime("%Y:%m:%d_%H:%M:%S")
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("[{}]: {}".format(stamp, message))

    #### Define global variables ####
    genome_info_file = os.path.abspath(c_args['genome_info'])
    project_name = c_args['project_name']
    seq_dir = os.path.abspath(c_args['seq_dir'])
    output_dir = os.path.abspath(c_args['output_dir'])
    filterparam = {"match_cutoff": c_args["percent_match"],
                   "hitcov_cutoff": c_args["hitcoverage"],
                   "chicov_cutoff": c_args["chimcoverage"]}

    iteration = 0                      #### iteration number
    resume_iteration = None            #### resume iteration from this number
    iter_dir = None                    #### path for current iteration directory
    project_dir = None                 #### path for project directory
    projectdir_dict = defaultdict()    #### dict of project dir output
    iterdir_dict = defaultdict()       #### dict of iter dir output
    current_timestamp = now.strftime("%Y%m%d_%H%M%S")  ### run timestamp

    if c_args["resume_iter"] is not None:
        resume_iteration = int(c_args['resume_iter'])

    ### create project_dir paths ###
    project_dir = os.path.join(output_dir, project_name)

    ### log file ###
    log_dir = os.path.abspath("../log")
    # basicConfig opens the log file immediately, so the directory has
    # to exist before it is called.
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    log_file = os.path.join(log_dir,
                            project_name + "_" + current_timestamp + ".log")

    console("Analysis Start\n")

    #### create log file for the project ####
    logging.basicConfig(filename=log_file,
                        format="[%(asctime)s] %(levelname)-8s %(message)s",
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        level=logging.INFO)
    logging.info("This is a run-log for DeNoGAP2 pipeline executed on {}\n"
                 .format(current_timestamp))

    #### read/check genome information file #####
    logging.info("Reading genome information file {}".format(genome_info_file))
    console("Reading genome information file {}\n".format(genome_info_file))

    genome_dict = read_genome_info(genome_info_file)

    ### list of reference genome names and other genome names ###
    list_ref_genome = [genome_name for genome_name in genome_dict
                       if int(genome_dict[genome_name]["REFERENCE"]) == 1]
    list_other_genome = [genome_name for genome_name in genome_dict
                         if int(genome_dict[genome_name]["REFERENCE"]) == 0]

    ### read names for the fasta files in the data directory #####
    logging.info("Reading fasta file names from the sequence directory {}"
                 .format(seq_dir))
    console("Reading fasta file names from the sequence directory {}\n"
            .format(seq_dir))

    fasta_dict = su().get_file_name_from_dir(seq_dir)
    fasta_list = list(fasta_dict.values())

    #### check that a fasta file exists for every genome named in the
    #### information file
    for genome_name in genome_dict:
        if genome_name not in fasta_dict:
            # The directory that was scanned is seq_dir; the previous
            # message referenced an undefined name and raised NameError.
            logging.error("ERROR, Protein sequence fasta file for {} not found in {}\n"
                          .format(genome_name, seq_dir))
            console("ERROR, Protein sequence fasta file for {} not found in {}\n"
                    .format(genome_name, seq_dir))
            sys.exit(1)

    #### If not already present set-up project directory #####
    logging.info("Setting up project directory for DeNoGAP-HMM {}"
                 .format(project_dir))
    console("Setting up project directory for DeNoGAP-HMM {}\n"
            .format(project_dir))

    if not os.path.exists(project_dir):
        logging.info("Creating new project dir {}\n".format(project_dir))
        console("Creating new project dir {}\n".format(project_dir))
        os.makedirs(project_dir)
    else:
        logging.info("Using existing project dir {}\n".format(project_dir))
        console("Using existing project dir {}\n".format(project_dir))
    # Called for both new and existing project directories.
    projectdir_dict = SetDir().mk_project_dirs(project_dir)

    ##### Create and load sequences into index database #####
    seq_index_path = os.path.join(projectdir_dict["TMP"],
                                  "Sequences_indexdb.idx")
    # Report the path that is actually handed to SeqIO.index_db.
    logging.info("Creating Sequence index database {}".format(seq_index_path))
    console("Creating Sequence index database {}\n".format(seq_index_path))

    fasta_seq_idx = SeqIO.index_db(seq_index_path, fasta_list, "fasta")

    #### Initial homolog mapping for reference genomes
    #### Starts with iteration 0
    #### NOTE(review): the original comment says the program terminates
    #### if the iteration-0 folder already exists; no such check is
    #### visible here -- presumably it lives in mk_hmm_iter_dirs.
    if resume_iteration is None:

        ### setup dir for the iteration ###
        iterdir_dict = SetDir().mk_hmm_iter_dirs(project_dir, iteration)

        logging.info("Starting pairwise sequence comparision step\n")
        console("Starting pairwise sequence comparision step\n")

        #### make sequence database ###
        logging.info("Creating sequence database for pairwise comparision\n")
        console("Creating sequence database for pairwise comparision\n")

        seqdb_dict = defaultdict(dict)
        seqdb_dir = projectdir_dict["SEQ_DB"]
        seqdb_file = "DBSEQ.fasta"

        try:
            for genome_name in list_ref_genome:
                seqdb_dict[genome_name] = SeqIO.index(fasta_dict[genome_name],
                                                      'fasta')
            # The per-genome indexes stay open until the database has
            # been written; they used to be closed before write_seqfile
            # received them.
            seqdb_path = su().write_seqfile(seqdb_dir, seqdb_file, seqdb_dict)
        finally:
            for seq_dict in seqdb_dict.values():
                seq_dict.close()

        logging.info("Saved sequence database {} for pairwise comparision at {}\n"
                     .format(seqdb_file, seqdb_dir))
        console("Saved sequence database {} for pairwise comparision {}\n"
                .format(seqdb_file, seqdb_dir))

        #### Align query sequences against sequence database ####
        logging.info("Aligning Sequences\n")
        console("Aligning Sequences\n")

        parsed_alignment_dict = defaultdict(dict)

        for genome_name in list_ref_genome:

            logging.info("QUERY: {}\n".format(genome_name))
            console("QUERY: {}\n".format(genome_name))

            outpath = os.path.join(iterdir_dict["ALL_MATCH"],
                                   genome_name + ".hmmalign.txt")
            domtabpath = os.path.join(iterdir_dict["ALL_MATCH"],
                                      genome_name + ".domtab.txt")
            bestmatchpath = os.path.join(iterdir_dict["BEST_MATCH"],
                                         genome_name + ".bhh.txt")
            partialmatchpath = os.path.join(iterdir_dict["PARTIAL_MATCH"],
                                            genome_name + ".phh.txt")
            chimeramatchpath = os.path.join(iterdir_dict["CHIMERIC_MATCH"],
                                            genome_name + ".chh.txt")

            hmmer_proc_returncode, hmmerstderr = Hmmer("phmmer").run_hmmer(
                c_args, seqdb_path, fasta_dict[genome_name],
                outpath, domtabpath)

            if hmmer_proc_returncode != 0:
                console("HMMER Execution Failed, Exiting with an error {}, "
                        "returncode: {}\n".format(hmmerstderr,
                                                  hmmer_proc_returncode))
                logging.error("HMMER Execution Failed, Exiting with an error {}, "
                              "returncode: {}\n".format(hmmerstderr,
                                                        hmmer_proc_returncode))
                # Non-zero status so calling scripts can see the failure.
                sys.exit(1)

            domhmmer_dict = Hmmer("phmmer").parse_hmmer_domtab(domtabpath)
            simhmmer_dict = Hmmer("phmmer").parse_hmmer_similarity(outpath)
            hmmer_result = Hmmer("phmmer").add_hmmer_stats(domhmmer_dict,
                                                           simhmmer_dict)

            # Drop the intermediate parse results before the next step.
            domhmmer_dict = None
            simhmmer_dict = None
            gc.collect()

            fixed_hmmer_result = FixMultiHSP("phmmer").fix_hsps(
                hmmer_result, c_args['avg_accuracy'])
            filtered_hmmer_result = Hmmer("phmmer").filter_hits(
                filterparam, fixed_hmmer_result)

            hmmer_result = None
            fixed_hmmer_result = None
            gc.collect()

            write_hmmer_result(genome_name, bestmatchpath,
                               filtered_hmmer_result["BEST"])
            write_hmmer_result(genome_name, partialmatchpath,
                               filtered_hmmer_result["PARTIAL"])
            write_hmmer_result(genome_name, chimeramatchpath,
                               filtered_hmmer_result["CHIMERA"])

        # NOTE(review): the source indentation was lost; clustering is
        # placed after the per-genome loop because it concatenates every
        # file in BEST_MATCH -- confirm against the original layout.
        logging.info("MCL Clustering Best-Hits\n")
        console("MCL Clustering Best-Hits\n")

        mcl_abc_file = os.path.join(projectdir_dict["TMP"], "seq.abc")

        # NOTE(review): `sed '1d'` removes only the first line of the
        # concatenated stream, and the exit status of the pipeline is
        # ignored -- confirm both are intended.
        os.system("cat {}/* | cut -f 1,2,11 | sed '1d' > {}".format(
            iterdir_dict["BEST_MATCH"], mcl_abc_file))

        mcl_cluster = sc().mcl_clustering(mcl_abc_file,
                                          c_args["mcl_inflation"])

        logging.info("Adding Group-IDs\n")
        console("Adding Group-ID\n")

        grouped_cluster = sc().add_cluster_ids(mcl_cluster, start_at=1000)

        hmm_cluster_file = os.path.join(
            iterdir_dict['CLUSTER'],
            "Hmmcluster_" + now.strftime("%Y%m%d_%H%M%S") + ".txt")

        logging.info("Saving Cluster file at {}\n".format(hmm_cluster_file))
        console("Saving Cluster file at {}\n".format(hmm_cluster_file))

        # Argument list instead of a shell string so paths with spaces
        # survive; presumably grouped_cluster is a file path -- verify.
        proc = subprocess.Popen(["cp", str(grouped_cluster), hmm_cluster_file],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() alone drains both pipes and waits for exit;
        # calling wait() first can block when a pipe buffer fills up.
        stdout, stderr = proc.communicate()

        if proc.returncode != 0:
            print(stderr)
            sys.exit(1)
        else:
            logging.info("Pairwise sequence comparision completed successfully\n")
            console("Pairwise sequence comparision completed successfully\n")

    else:
        # Execute iterative block for each genome:
        #   read cluster file from the iteration to resume from,
        #   build hmm models / sequence database,
        #   scan hmm models / sequence database iteratively against
        #   each genome.
        #
        # NOTE(review): nothing visible in this block removes entries
        # from list_other_genome or advances resume_iteration, so the
        # loop's exit condition is not shown here -- confirm.
        while list_other_genome:

            iteration = int(resume_iteration) + 1
            iterdir_dict = SetDir().mk_hmm_iter_dirs(project_dir, iteration)

            ### getting model cluster directory #####
            model_cluster_dir = os.path.join(
                projectdir_dict["BASE"],
                "iter_{0}".format(resume_iteration), "CLUSTER")

            ### getting model cluster file ####
            model_cluster_file = [
                os.path.join(model_cluster_dir, file_name)
                for file_name in os.listdir(model_cluster_dir)
                if file_name.startswith("Hmmcluster")]

            if not model_cluster_file:
                # Report the missing input instead of failing with a
                # bare IndexError on model_cluster_file[0].
                logging.error("ERROR, No Hmmcluster file found in {}\n"
                              .format(model_cluster_dir))
                console("ERROR, No Hmmcluster file found in {}\n"
                        .format(model_cluster_dir))
                sys.exit(1)

            ### Call function to build hmm models and if singleton then
            ### add to sequence database
            model_dir = os.path.join(projectdir_dict["HMMER_DB"],
                                     "HMM_MODELS")
            # os.makedirs raises when the directory already exists, e.g.
            # on a re-run of the same project.
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)

            #### Run hmmbuild ####
            with open(model_cluster_file[0], "r") as clustfile:
                pool = ProcessPool(nodes=c_args["cpu"])
                # One build_hmm call per line of the cluster file.
                for clustline in clustfile:
                    pool.apply(build_hmm,
                               args=(clustline, fasta_dict, model_dir))