## standard-library modules used by the excerpts below; pipeline helpers such as
## check_dependency, times, resolve_polytomies, midpointRooting, read_fasta, write_in_fa,
## write_pickle and find_executable are defined elsewhere in the package
## (find_executable is also available from distutils.spawn)
import os, glob, random, shutil, subprocess, time
from collections import defaultdict

def aln_to_Newick(path, folders_dict, raxml_timelimit, raxml_path, threads):
    """ function: build core gene SNP tree using SNP alignment
        input: SNP_whole_matrix.aln
        output: strain_tree.nwk
    """
    cluster_seq_path=folders_dict['cluster_seq_path']
    log_path=folders_dict['log_path']
    output_path = '_'.join([cluster_seq_path+'temp_coretree',
                            time.strftime('%Y%m%d-%H%M%S',time.gmtime()),
                            str(random.randint(0,1000000))])
    os.system('mkdir %s'%output_path)
    SNP_matrix_path=cluster_seq_path+'SNP_whole_matrix.aln'
    cwd = os.getcwd()
    os.chdir(output_path)

    ## run fasttree
    start = time.time()
    fasttree_program= 'fasttree' if check_dependency('fasttree') else 'FastTree'
    os.system(fasttree_program+' -gtr -nt -gamma -nosupport -mlacc 2 -slownni '
              +SNP_matrix_path+' > initial_tree.newick0 2> '+log_path+'fasttree.log')
    print ' fasttree time-cost:', times(start)
    resolve_polytomies('initial_tree.newick0','initial_tree.newick')

    ## run raxml
    start = time.time()
    out_fname = "tree_infer.newick"
    ## pick the RAxML executable up front so that the branch-length optimization below
    ## also works when no topology search is run (raxml_timelimit<=0)
    raxml_program= 'raxml' if check_dependency('raxml') else 'raxmlHPC'
    if raxml_timelimit>0:
        print '%s%d%s'%('RAxML tree optimization within the timelimit of ',raxml_timelimit,' minutes')
        ## 'exec' ensures terminate() kills raxml itself rather than only the shell
        end_time = time.time() + int(raxml_timelimit*60)
        process = subprocess.Popen('exec '+raxml_program+' -f d -T '+str(threads)
                  +' -j -s '+SNP_matrix_path+' -n topology -c 25 -m GTRCAT -p 344312987'
                  +' -t initial_tree.newick > '+log_path+'raxml.log', shell=True)
        while time.time() < end_time:
            if os.path.isfile('RAxML_result.topology'):
                break
            time.sleep(10)
        process.terminate()

        checkpoint_files = glob.glob('RAxML_checkpoint*')
        if os.path.isfile('RAxML_result.topology'):
            checkpoint_files.append('RAxML_result.topology')
        if len(checkpoint_files) > 0:
            last_tree_file = checkpoint_files[-1]
            shutil.copy(last_tree_file, 'raxml_tree.newick')
        else:
            shutil.copy('initial_tree.newick', 'raxml_tree.newick')
    else:
        shutil.copy('initial_tree.newick', 'raxml_tree.newick')

    print 'RAxML branch length optimization and rooting'
    os.system(raxml_program+' -f e -T '+str(threads)+' -s '+SNP_matrix_path
              +' -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick > '
              +log_path+'raxml.log')
    shutil.copy('RAxML_result.branches', out_fname)
    print ' raxml time-cost:', times(start)

    midpointRooting(out_fname,'strain_tree.nwk')
    shutil.copy('strain_tree.nwk', cluster_seq_path+'strain_tree.nwk')
    os.chdir(cwd)
    os.system('rm -r %s'%output_path)
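## Usage sketch (illustrative only; the paths below are assumptions, not part of the pipeline):
## aln_to_Newick expects SNP_whole_matrix.aln under folders_dict['cluster_seq_path'] and
## writes strain_tree.nwk next to it.
#
# folders_dict = {'cluster_seq_path': './data/TestSet/geneCluster/',
#                 'log_path': './data/TestSet/log/'}
# aln_to_Newick('./data/TestSet/', folders_dict,
#               raxml_timelimit=30, raxml_path='', threads=4)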
def diamond_run(output_path, dmd_ref_file, threads, diamond_evalue,
                diamond_max_target_seqs, diamond_identity, diamond_query_cover,
                diamond_subject_cover, diamond_path, diamond_no_self_hits=0):
    """ run diamond using sensitive alignment mode """
    if diamond_path == '':
        ## default: use the diamond binary bundled in ../tools/
        diam = ''.join([os.path.dirname(os.path.realpath(__file__)), '/../tools/diamond'])
    else:
        diam = diamond_path
    print 'diamond inputfile:', dmd_ref_file
    input_prefix = dmd_ref_file.split('.faa')[0]
    if input_prefix == 'reference':
        output_m8_filename = 'query_matches.m8'
    else:
        output_m8_filename = '%s%s' % (input_prefix, '.m8')
    makedb_command = ''.join([diam, ' makedb -p ', threads,
                              ' --in ', output_path, dmd_ref_file,
                              ' -d ', output_path, 'nr_', input_prefix,
                              '> ', output_path, 'diamond_makedb_', input_prefix, '.log 2>&1'])
    start = time.time()
    os.system(makedb_command)
    print 'diamond build index (makedb):', times(start)
    print 'command line record:', makedb_command
    ## option to enable --no-self-hits
    if diamond_no_self_hits == 0:
        option_no_self_hits = ''
    else:
        option_no_self_hits = ' --no-self-hits'
    blastp_command = ''.join([diam, ' blastp --sensitive -p ', threads,
                              ' -e ', diamond_evalue,
                              ' --id ', diamond_identity,
                              ' --query-cover ', diamond_query_cover,
                              ' --subject-cover ', diamond_subject_cover,
                              option_no_self_hits,
                              ' -k ', diamond_max_target_seqs,
                              ' -d ', output_path, 'nr_', input_prefix,
                              ' -f 6 qseqid sseqid bitscore',
                              ' -q ', output_path, dmd_ref_file,
                              ' -o ', output_path, output_m8_filename,
                              ' -t ./ > ', output_path, 'diamond_blastp_', input_prefix, '.log 2>&1'])
    start = time.time()
    os.system(blastp_command)
    print 'diamond alignment (blastp):', times(start)
    print 'diamond_max_target_seqs used: %s' % diamond_max_target_seqs
    print 'command line record:', blastp_command
    ## remove diamond binary database (dmnd) file
    os.system(''.join(['rm ', output_path, 'nr_', input_prefix, '.dmnd']))
def diamond_run(output_path, dmd_ref_file, threads, diamond_evalue,
                diamond_max_target_seqs, diamond_identity, diamond_query_cover,
                diamond_subject_cover, diamond_path, diamond_no_self_hits=0):
    """ run diamond using sensitive alignment mode
        (variant that locates the diamond executable on the PATH via find_executable
        instead of using the bundled binary)
    """
    if diamond_path == '':
        diamond_path = find_executable('diamond')
    diam = diamond_path
    print 'diamond inputfile:', dmd_ref_file
    input_prefix = dmd_ref_file.split('.faa')[0]
    if input_prefix == 'reference':
        output_m8_filename = 'query_matches.m8'
    else:
        output_m8_filename = '%s%s' % (input_prefix, '.m8')
    makedb_command = ''.join([diam, ' makedb -p ', threads,
                              ' --in ', output_path, dmd_ref_file,
                              ' -d ', output_path, 'nr_', input_prefix,
                              '> ', output_path, 'diamond_makedb_', input_prefix, '.log 2>&1'])
    start = time.time()
    os.system(makedb_command)
    print 'diamond build index (makedb):', times(start)
    print 'command line record:', makedb_command
    ## option to enable --no-self-hits
    if diamond_no_self_hits == 0:
        option_no_self_hits = ''
    else:
        option_no_self_hits = ' --no-self-hits'
    blastp_command = ''.join([diam, ' blastp --sensitive -p ', threads,
                              ' -e ', diamond_evalue,
                              ' --id ', diamond_identity,
                              ' --query-cover ', diamond_query_cover,
                              ' --subject-cover ', diamond_subject_cover,
                              option_no_self_hits,
                              ' -k ', diamond_max_target_seqs,
                              ' -d ', output_path, 'nr_', input_prefix,
                              ' -f 6 qseqid sseqid bitscore',
                              ' -q ', output_path, dmd_ref_file,
                              ' -o ', output_path, output_m8_filename,
                              ' -t ./ > ', output_path, 'diamond_blastp_', input_prefix, '.log 2>&1'])
    start = time.time()
    os.system(blastp_command)
    print 'diamond alignment (blastp):', times(start)
    print 'diamond_max_target_seqs used: %s' % diamond_max_target_seqs
    print 'command line record:', blastp_command
    ## remove diamond binary database (dmnd) file
    os.system(''.join(['rm ', output_path, 'nr_', input_prefix, '.dmnd']))
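## Usage sketch (illustrative; the folder is an assumption): aligning the merged protein
## file reference.faa against itself writes query_matches.m8 with the three columns
## qseqid, sseqid, bitscore into the same folder. Numeric options are passed as strings
## because they are concatenated directly into the command line.
#
# diamond_run(output_path='./protein_faa/diamond_matches/', dmd_ref_file='reference.faa',
#             threads='4', diamond_evalue='0.001', diamond_max_target_seqs='600',
#             diamond_identity='0', diamond_query_cover='0', diamond_subject_cover='0',
#             diamond_path='')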
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file= ''.join([clustering_path,input_prefix,'_cluster.output'])
    representative_outputfile= ''.join([clustering_path,input_prefix,'_representative','.faa'])
    subproblem_seqs_path= '%ssubproblem_cluster_seqs/'%clustering_path
    subproblem_merged_faa= ''.join([clustering_path,input_prefix,'.faa'])
    subproblem_faa_dict= read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input:
        subproblem_geneCluster_dt= defaultdict(list)
        cluster_input_lines= [iline for iline in cluster_input]
        subproblem_geneCluster_dt= {}
        subproblem_run_number= input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input_lines):#cluster_input
            ## use time to avoid clusterID conflict
            clusterID= "GCs%s_%07d%s"%(subproblem_run_number, gid, time.strftime('%M%S',time.gmtime()))
            gene_ids= iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID]= gene_ids
            ## representative_seq
            representative_seq=subproblem_faa_dict[gene_ids[0]]
            ## write in representative strain
            with open(representative_outputfile, 'a') as representative_output:
                write_in_fa(representative_output, clusterID, representative_seq)
    ## write subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path,input_prefix,'_dicts.cpk']), subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix,': ', times(start), '\n'
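## Usage sketch (illustrative; the path, prefix and gene-ID format are assumptions): each
## line of <input_prefix>_cluster.output is one MCL cluster, i.e. tab-separated gene IDs,
## and the first ID of each line is written out as the cluster representative, e.g.
##   strainA_gene_0001<TAB>strainB_gene_0042<TAB>strainC_gene_0107
#
# build_representative_cluster('./protein_faa/diamond_matches/', threads=4,
#                              input_prefix='subproblem_1')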
def filter_hits_single(output_path, threads, input_prefix='',
                       gather_seq_length_flag=0, no_filtering=True):
    """ turn a diamond m8 table into an ABC file (query, subject, bitscore) for MCL """
    start = time.time()
    if input_prefix == '':
        ## default empty for all-vs.-all m8 file
        m8_filename = 'query_matches.m8'
        filtered_hits_filename = 'filtered_hits.abc'
    else:
        ## add input prefix for sub_problem abc files in divide-and-conquer method
        m8_filename = '%s%s' % (input_prefix, '.m8')
        filtered_hits_filename = '%s%s' % (input_prefix, '_filtered_hits.abc')
    if no_filtering == True:
        m8_fpath = os.path.abspath(output_path + m8_filename)
        filtered_hits_fpath = os.path.abspath(output_path + filtered_hits_filename)
        os.system(' '.join(['ln -s', m8_fpath, filtered_hits_fpath]))
    else:
        with open(output_path+m8_filename,'rb') as m8_file,\
             open(output_path+filtered_hits_filename,'wb') as abc_file:
            print('filter hits:')
            #using_BS=1; using_BAL=0
            for iline in m8_file:
                cols_list = iline.rstrip().split('\t')
                #query, ref, bit_score from column (0,1,-1)
                abc_file.write('%s\n' % '\t'.join([cols_list[ind] for ind in (0, 1, -1)]))
                ## BAL scoring option not used
                # if using_BS:## using bscore
                #     abc_file.write('%s\n'%'\t'.join([cols_list[ind] for ind in (0,1,-1)]))
                # elif using_BAL:## using bscore/aln_len
                #     abc_file.write('%s\n'%'\t'.join([cols_list[0], cols_list[1],
                #         str(round( float(cols_list[-1])/float(cols_list[3]),5 ))]))
    if input_prefix == '':
        print 'filter hits runtime:', times(start), '\n'
    else:
        print 'filter hits runtime for ', input_prefix, ':', times(start), '\n'
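## Minimal sketch of the column selection above (values made up): with the three-column m8
## layout produced by diamond_run (-f 6 qseqid sseqid bitscore), indices (0, 1, -1) simply
## keep query, subject and bitscore in MCL's ABC format.
#
# iline = 'strainA_gene_0001\tstrainB_gene_0042\t523.1\n'
# cols_list = iline.rstrip().split('\t')
# print '\t'.join([cols_list[ind] for ind in (0, 1, -1)])   # strainA_gene_0001 strainB_gene_0042 523.1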
def mcl_run(clustering_path, threads, input_prefix, mcl_inflation):
    """ run MCL on the filtered hits of one subproblem (divide-and-conquer clustering) """
    start = time.time()
    cwd = os.getcwd()
    os.chdir(clustering_path)
    command_mcl= ''.join(['mcl ',input_prefix,'_filtered_hits.abc --abc ',
                          '-o ',input_prefix,'_cluster.output -I ',str(mcl_inflation),
                          ' -te ',str(threads),' > ','mcl-',input_prefix,'.log 2>&1'])
    os.system(command_mcl)
    print 'run command line mcl in ',clustering_path,': \n', command_mcl
    print 'mcl runtime for ', input_prefix,': ', times(start), '\n'
    os.chdir(cwd)
def mcl_run(output_path, threads, mcl_inflation, input_prefix=''):
    """ run MCL on the filtered hits and write all clusters to allclusters.tsv """
    start = time.time()
    if input_prefix == '':
        ## default empty for abc file from all-vs.-all m8 file
        filtered_hits_filename = 'filtered_hits.abc'
    else:
        filtered_hits_filename = '%s%s'%(input_prefix,'_filtered_hits.abc')
    command_mcl= ''.join(['mcl ',output_path,filtered_hits_filename,' --abc ',
                          '-o ',output_path,'allclusters.tsv -I ',str(mcl_inflation),
                          ' -te ',str(threads),' > ',output_path,'mcl.log 2>&1'])
    os.system(command_mcl)
    print 'command line mcl:', command_mcl
    print 'mcl runtime:', times(start), '\n'
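## Usage sketch (illustrative; the folder and inflation value are assumptions): clustering
## the all-vs.-all hits with inflation 1.5 on 4 threads composes roughly
##   mcl <output_path>filtered_hits.abc --abc -o <output_path>allclusters.tsv -I 1.5 -te 4
#
# mcl_run('./protein_faa/diamond_matches/', threads=4, mcl_inflation=1.5)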
    max_strain_fraction_presence_association=params.max_strain_fraction_presence_association,
    optional_table_column=params.optional_table_column,
    clean_temporary_files=params.clean_temporary_files
    )

if 1 in params.steps:#step 01:
    myPangenome.make_strain_list()
    print '====== step01: strain list successfully loaded'

## deactivated (activation:'2' -> 2)
if '2' in params.steps:# step02:
    print '====== starting step02: download NCBI RefSeq GenBank file from strain list'
    start = time.time()
    fetch_refseq(path, strain_list)
    print '====== time for step02:'
    print times(start),'\n'

if 3 in params.steps:# step03:
    print '====== starting step03: extract sequences from GenBank file'
    start = time.time()
    myPangenome.extract_gbk_sequences()
    print '====== time for step03:'
    print times(start),'\n'

if 4 in params.steps:# step04:
    print '====== starting step04: extract metadata from GenBank file'
    start = time.time()
    myPangenome.extract_gbk_metadata()
    print '====== time for step04:'
    print times(start),'\n'
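## Note on the step switches above (my reading, not part of the original script): params.steps
## is presumably a list of integers (e.g. argparse with nargs='+' and type=int), so
## 1 in params.steps matches a requested step, while the string '2' never appears in an
## integer list, which keeps step02 (the RefSeq download) deactivated.
#
# hypothetical parser matching that assumption:
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('-st', '--steps', nargs='+', type=int, default=range(1, 12))
# params = parser.parse_args(['-st', '1', '3', '4'])
# print 1 in params.steps, '2' in params.steps   # True False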