import math
import os
import subprocess
from operator import itemgetter

import run_parallel_cmds

# cut_file, run_cmd and find_L are helpers assumed to be defined elsewhere in
# this module/package; they are used by the functions below.
def parallel_blat_split(target_fasta, query_fasta, out_file, QUERY_SPLIT, nJobs=60):
    '''Alternative implementation of parallel_blat (below) that shards the
    query with the `split` utility; it carries a distinct name so that it
    does not silently shadow, or get shadowed by, the cut_file-based version.
    - Currently only parallelizes on query space
    - Assumes the query fasta file takes two lines per sequence (not wrapped)'''
    target_length = float(subprocess.check_output('grep -c \'>\' ' + target_fasta, shell=True))
    query_length = float(subprocess.check_output('grep -c \'>\' ' + query_fasta, shell=True))
    # Unwrap multi-line fasta records: one header line + one sequence line each.
    os.system('awk \'/^>/{print s? s"\\n"$0:$0;s="";next}{s=s sprintf("%s",$0)}END{if(s)print s}\' ' + query_fasta + ' > ' + query_fasta + '_nospace')
    query_fasta = query_fasta + '_nospace'
    print('Query length: ' + str(query_length) + ' Target length: ' + str(target_length) + ' Query Split: ' + str(QUERY_SPLIT))
    # split -l counts lines while query_length counts records (two lines per
    # record), so this yields roughly 2*QUERY_SPLIT chunks; rounding the line
    # count up to an even number keeps records intact across chunk boundaries.
    query_size = int(query_length / QUERY_SPLIT)
    if query_size % 2 != 0:
        query_size += 1
    split_dir = out_file[:-1] + '_split'  # note: drops the last character of out_file
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    subprocess.call(['split', '-dl', str(query_size), query_fasta, split_dir + '/split'])
    cmds = []
    for (n, f) in enumerate(os.listdir(split_dir)):
        cmds.append('./blat -noHead ' + target_fasta + ' ' + split_dir + '/' + f + ' ' + out_file + '_' + str(n))
    run_parallel_cmds.run_cmds(tuple(cmds), nJobs)
    os.system('cat ' + out_file + '_* > ' + out_file)
    os.system('rm ' + split_dir + '/*')
    os.system('rm ' + out_file + '_*')
    os.system('rm ' + query_fasta)  # remove the unwrapped temporary copy
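# `run_parallel_cmds.run_cmds(cmds, nJobs)` above comes from a sibling module
# in this repo. As a rough sketch of its assumed contract -- run each shell
# command with at most nJobs in flight, blocking until all finish -- a
# thread-pool stand-in (hypothetical, not the repo's implementation) could be:
def _run_cmds_sketch(cmds, nJobs):
    from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool
    pool = ThreadPool(nJobs)
    # Threads suffice here: the real work happens in the child processes.
    pool.map(lambda c: subprocess.call(c, shell=True), cmds)
    pool.close()
    pool.join()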
def parallel_blat(target_fasta, query_fasta, out_file, QUERY_SPLIT, nJobs=20):
    '''Takes target, query and output file names and parallelizes blat over
    the query space by running QUERY_SPLIT jobs through run_parallel_cmds.
    - Currently only parallelizes on query space
    - Assumes the query fasta file takes two lines per sequence (not wrapped)'''
    target_length = float(subprocess.check_output('grep -c \'>\' ' + target_fasta, shell=True))
    query_length = float(subprocess.check_output('grep -c \'>\' ' + query_fasta, shell=True))
    # Unwrap multi-line fasta records: one header line + one sequence line each.
    os.system('awk \'/^>/{print s? s"\\n"$0:$0;s="";next}{s=s sprintf("%s",$0)}END{if(s)print s}\' ' + query_fasta + ' > ' + query_fasta + '_nospace')
    query_fasta = query_fasta + '_nospace'
    # Alternative heuristics for choosing QUERY_SPLIT, kept for reference:
    #QUERY_SPLIT = min(int(math.ceil(float(query_length)/float(target_length))),50)
    #QUERY_SPLIT = max(int(math.ceil(float(query_length)/float(target_length))),500)
    #QUERY_SPLIT = int(min(QUERY_SPLIT,query_length))
    print('Query length: ' + str(query_length) + ' Target length: ' + str(target_length) + ' Query Split: ' + str(QUERY_SPLIT))
    split_size = int(math.floor(float(query_length) / QUERY_SPLIT))
    # Cut the (unwrapped) query into QUERY_SPLIT pieces of split_size records
    # each; the last piece absorbs the remainder. Line ranges are 1-based and
    # each record spans two lines.
    for n in range(QUERY_SPLIT):
        if n == QUERY_SPLIT - 1:
            cut_file(query_fasta, query_fasta + '_' + str(n + 1), 2 * n * split_size + 1, int(2 * query_length))
        else:
            cut_file(query_fasta, query_fasta + '_' + str(n + 1), 2 * n * split_size + 1, 2 * (n + 1) * split_size)
    os.system('rm ' + out_file + '_*')  # clear stale partial outputs
    cmds = []
    for a in range(1, QUERY_SPLIT + 1):
        cmds.append('blat -noHead ' + target_fasta + ' ' + query_fasta + '_' + str(a) + ' ' + out_file + '_' + str(a))
    run_parallel_cmds.run_cmds(tuple(cmds), nJobs)
    os.system('cat ' + out_file + '_* > ' + out_file)
    os.system('rm ' + out_file + '_*')
    os.system('rm ' + query_fasta + '_*')
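# `cut_file` used by parallel_blat above is defined elsewhere in this codebase.
# A minimal sketch consistent with how it is called here -- copy the 1-based,
# inclusive line range [start, end] of infile into outfile -- might shell out
# to sed (hypothetical helper name, not the repo's implementation):
def _cut_file_sketch(infile, outfile, start, end):
    # sed -n suppresses default output; 'start,endp' prints only that range.
    os.system("sed -n '" + str(int(start)) + ',' + str(int(end)) + "p' " + infile + ' > ' + outfile)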
def rc_gnu(infile, tempfile, outfile, nCPU, python_path='python ', shannon_dir=''):
    '''Reverse-complements infile into outfile by splitting it into nCPU
    chunks and running rc_s.py on each chunk in parallel. Returns (N, L):
    the number of reads and their mean length.'''
    chunks = nCPU
    if chunks == 1:
        run_cmd(python_path + ' ' + shannon_dir + 'rc_s.py ' + infile + ' ' + outfile)
        return find_L(infile)  # find_L supplies the (N, L) pair in the single-chunk case
    file_length = float(subprocess.check_output('grep -c \'>\' ' + infile, shell=True))
    split_size = int(math.ceil(float(file_length) / chunks))
    # Stream the input once, writing it out in pieces of split_size records
    # (two lines per record) while tallying read count and total length.
    piece_no = 1
    curr_seqs = []
    read_tot = 0
    no_reads = 0
    for line in open(infile):
        curr_seqs.append(line)
        fields = line.strip().split()
        if fields and fields[0][0] != '>':
            read_tot += len(fields[0])
            no_reads += 1
        if len(curr_seqs) == split_size * 2:
            infile_piece = open(tempfile + '_' + str(piece_no), 'w')
            infile_piece.write(''.join(curr_seqs))
            infile_piece.close()
            piece_no += 1
            curr_seqs = []
    if curr_seqs:
        infile_piece = open(tempfile + '_' + str(piece_no), 'w')
        infile_piece.write(''.join(curr_seqs))
        infile_piece.close()
    else:
        piece_no -= 1
    N = no_reads
    L = read_tot / max(N, 1)
    chunks = piece_no
    cmds = []
    for i in range(chunks):
        cmds.append(python_path + ' ' + shannon_dir + 'rc_s.py ' + tempfile + '_' + str(i + 1) + ' ' + outfile + '_' + str(i + 1))
    run_parallel_cmds.run_cmds(cmds, chunks)
    file_list = ' '.join([outfile + '_' + str(i + 1) for i in range(chunks)])
    run_cmd('cat ' + file_list + ' > ' + outfile)
    run_cmd('rm ' + outfile + '_* ')
    run_cmd('rm ' + tempfile + '_* ')
    print(N)
    print(L)
    return (N, L)
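# Example invocation of rc_gnu with hypothetical file names: reverse-complement
# reads.fasta into reads_rc.fasta using 8 parallel chunks, with temp-file
# prefix reads_tmp. The returned N and L are the read count and mean read
# length tallied while splitting:
#   N, L = rc_gnu('reads.fasta', 'reads_tmp', 'reads_rc.fasta', 8)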
# Disabled for now: double-stranded handling.
#if double_stranded:
#    mb_sf_param_string += " --ds "
if only_reads:
    mb_sf_param_string += " --only_reads "
if filter_FP_flag:
    mb_sf_param_string += " --filter_FP "
mb_sf_param_string += " --nJobs " + str(nJobs) + " "

if main_server_parameter_string and inDisk:
    if run_parallel and nJobs > 1:
        cmds = []
        for param_str in main_server_parameter_string.split():
            cmds.append(python_path + " " + shannon_dir + "run_MB_SF_fn.py " + param_str + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K) + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + param_str + " --shannon_dir " + shannon_dir + " --python_path " + python_path)
        run_parallel_cmds.run_cmds(tuple(cmds), nJobs)
    else:
        for param_str in main_server_parameter_string.split():
            run_cmd(python_path + " " + shannon_dir + "run_MB_SF_fn.py " + param_str + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K) + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + param_str + " --shannon_dir " + shannon_dir + " --python_path " + python_path)
elif inMem:
    param_str = {}
    contig_size = {}
    for comp in new_components:
        dir_base = comp_directory_name + "/" + sample_name + str(comp)
        param_str[comp] = dir_base + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K) + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + dir_base + " --shannon_dir " + shannon_dir + " --python_path " + python_path
        contig_size[comp] = sum(len(cw_vec) for cw_vec in contig_weights[comp])
    # Schedule the largest components first.
    contig_vec = contig_size.items()
    sorted_contig_vec = sorted(contig_vec, key=itemgetter(1), reverse=True)


def get_column(matrix, i):
    return [row[i] for row in matrix]
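# `run_cmd` used above is another helper defined elsewhere in this repo. A
# minimal stand-in consistent with how it is called here -- run a single shell
# command, echoing it for the log -- might be (hypothetical, not the repo's
# implementation):
def _run_cmd_sketch(cmd):
    print(cmd)       # echo the command before running it
    os.system(cmd)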