Ejemplo n.º 1
0
def parallel_blat(target_fasta, query_fasta, out_file, QUERY_SPLIT, nJobs=20):
    """Align ``query_fasta`` against ``target_fasta`` with BLAT, one job per piece.

    The query is first rewritten to ``<query_fasta>_nospace`` with every
    record on exactly two lines (header, sequence).  That file is cut by line
    number into ``QUERY_SPLIT`` pieces, one ``blat -noHead`` command is built
    per piece, and the batch is handed to ``run_parallel_cmds.run_cmds``.
    The per-piece outputs are then concatenated into ``out_file`` and the
    piece files are deleted.

    Only the query is split; every job receives the whole target.

    Args:
        target_fasta: Path of the FASTA file used as the BLAT database.
        query_fasta: Path of the FASTA file holding the query sequences.
        out_file: Path of the merged result.  Piece ``k`` is written to
            ``<out_file>_<k>``; anything matching ``<out_file>_*`` is
            deleted both before and after the run.
        QUERY_SPLIT: Number of pieces (int).  Zero raises
            ``ZeroDivisionError``.
        nJobs: Forwarded unchanged to ``run_parallel_cmds.run_cmds``
            (presumably the maximum number of concurrent jobs -- confirm
            against that module).

    Note:
        Paths are concatenated unquoted into shell command lines, so they
        must not contain whitespace or shell metacharacters.
    """
    # Record counts are the number of lines containing '>' in each file.
    target_length = float(
        subprocess.check_output('grep -c \'>\' ' + target_fasta, shell=True))
    query_length = float(
        subprocess.check_output('grep -c \'>\' ' + query_fasta, shell=True))

    # Join wrapped sequence lines so each record is exactly header + sequence;
    # the line arithmetic below relies on that two-line layout.
    unwrapped = query_fasta + '_nospace'
    os.system(
        'awk \'/^>/{print s? s"\\n"$0:$0;s="";next}{s=s sprintf("%s",$0)}END{if(s)print s}\' '
        + query_fasta + ' > ' + unwrapped)

    print('Query length: ' + str(query_length) + ' Target length: ' +
          str(target_length) + ' Query Split: ' + str(QUERY_SPLIT))

    # Line numbers are integers: do the arithmetic on an int record count so
    # cut_file never receives a float bound.
    n_records = int(query_length)
    records_per_piece = n_records // QUERY_SPLIT

    piece_ids = range(1, QUERY_SPLIT + 1)
    for piece in piece_ids:
        first_line = 2 * (piece - 1) * records_per_piece + 1
        if piece == QUERY_SPLIT:
            # The last piece absorbs the remainder of the division.
            last_line = 2 * n_records
        else:
            last_line = 2 * piece * records_per_piece
        cut_file(unwrapped, unwrapped + '_' + str(piece), first_line,
                 last_line)

    # Drop outputs left over from an earlier run so they are not merged below.
    os.system('rm ' + out_file + '_*')

    cmds = tuple(
        'blat -noHead ' + target_fasta + ' ' + unwrapped + '_' + str(piece) +
        ' ' + out_file + '_' + str(piece) for piece in piece_ids)
    run_parallel_cmds.run_cmds(cmds, nJobs)

    # Merge the per-piece results, then remove every intermediate file.
    os.system('cat ' + out_file + '_* > ' + out_file)
    os.system('rm ' + out_file + '_*')
    os.system('rm ' + unwrapped + '_*')
Ejemplo n.º 2
0
def parallel_blat(target_fasta, query_fasta, out_file, QUERY_SPLIT, nJobs=60):
    """Align ``query_fasta`` against ``target_fasta`` with BLAT, one job per piece.

    The query is first rewritten to ``<query_fasta>_nospace`` with every
    record on exactly two lines (header, sequence).  That file is chopped by
    the ``split`` utility into fixed-size pieces inside a scratch directory,
    one ``./blat -noHead`` command is built per piece, and the batch is
    handed to ``run_parallel_cmds.run_cmds``.  The per-piece outputs are
    then concatenated into ``out_file`` and the intermediates are deleted.

    Only the query is split; every job receives the whole target.

    Args:
        target_fasta: Path of the FASTA file used as the BLAT database.
        query_fasta: Path of the FASTA file holding the query sequences.
        out_file: Path of the merged result.  Piece ``k`` is written to
            ``<out_file>_<k>``; anything matching ``<out_file>_*`` is
            deleted both before and after the run.
        QUERY_SPLIT: Divisor used to derive the piece size (int).  Zero
            raises ``ZeroDivisionError``.
        nJobs: Forwarded unchanged to ``run_parallel_cmds.run_cmds``
            (presumably the maximum number of concurrent jobs -- confirm
            against that module).

    Note:
        Paths are concatenated unquoted into shell command lines, so they
        must not contain whitespace or shell metacharacters.  ``./blat`` is
        resolved relative to the current working directory.
    """
    # Record counts are the number of lines containing '>' in each file.
    target_length = float(
        subprocess.check_output('grep -c \'>\' ' + target_fasta, shell=True))
    query_length = float(
        subprocess.check_output('grep -c \'>\' ' + query_fasta, shell=True))

    # Join wrapped sequence lines so each record is exactly header + sequence.
    os.system(
        'awk \'/^>/{print s? s"\\n"$0:$0;s="";next}{s=s sprintf("%s",$0)}END{if(s)print s}\' '
        + query_fasta + ' > ' + query_fasta + '_nospace')
    query_fasta = query_fasta + '_nospace'

    print('Query length: ' + str(query_length) + ' Target length: ' +
          str(target_length) + ' Query Split: ' + str(QUERY_SPLIT))

    # Lines per piece.  It is derived from the record count while the file
    # has two lines per record, so roughly 2 * QUERY_SPLIT pieces result.
    query_size = int(query_length / QUERY_SPLIT)
    if query_size % 2 != 0:
        # Keep the size even so a header is never separated from its sequence.
        query_size += 1
    # With fewer records than QUERY_SPLIT the size would be 0, `split` would
    # reject it and the merged output would be silently empty; use one whole
    # record (two lines) as the minimum piece size.
    query_size = max(query_size, 2)

    # NOTE(review): the last character of out_file is dropped when naming the
    # scratch directory -- kept as-is, confirm this is intended.
    split_dir = out_file[:-1] + '_split'
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    subprocess.call(
        ['split', '-dl', str(query_size), query_fasta, split_dir + '/split'])

    # Drop outputs left over from an earlier run so they are not merged below.
    os.system('rm -f ' + out_file + '_*')

    # Sorted so the piece -> output-number mapping is reproducible.
    cmds = tuple(
        './blat -noHead ' + target_fasta + ' ' + split_dir + '/' + piece +
        ' ' + out_file + '_' + str(index)
        for index, piece in enumerate(sorted(os.listdir(split_dir))))
    run_parallel_cmds.run_cmds(cmds, nJobs)

    # Merge the per-piece results, then remove every intermediate file.
    os.system('cat ' + out_file + '_* > ' + out_file)
    os.system('rm ' + split_dir + '/*')
    os.system('rm ' + out_file + '_*')
    os.system('rm ' + query_fasta + '_*')
Ejemplo n.º 3
0
def parallel_blat(target_fasta, query_fasta, out_file, QUERY_SPLIT, nJobs=20):
    """Run BLAT of ``query_fasta`` against ``target_fasta`` split over several jobs.

    Steps:

    1. Count records (lines containing ``>``) in both FASTA files.
    2. Rewrite the query as ``<query_fasta>_nospace`` with each record on
       exactly two lines (header, sequence).
    3. Cut that file by line number into ``QUERY_SPLIT`` pieces named
       ``<query_fasta>_nospace_<k>``.
    4. Build one ``blat -noHead`` command per piece and pass the batch to
       ``run_parallel_cmds.run_cmds``.
    5. Concatenate the per-piece outputs into ``out_file`` and delete the
       piece files.

    Only the query is split; every job receives the whole target.

    Args:
        target_fasta: Path of the FASTA file used as the BLAT database.
        query_fasta: Path of the FASTA file holding the query sequences.
        out_file: Path of the merged result.  Piece ``k`` is written to
            ``<out_file>_<k>``; anything matching ``<out_file>_*`` is
            deleted both before and after the run.
        QUERY_SPLIT: Number of pieces (int).  Zero raises
            ``ZeroDivisionError``.
        nJobs: Forwarded unchanged to ``run_parallel_cmds.run_cmds``
            (presumably the maximum number of concurrent jobs -- confirm
            against that module).

    Note:
        Paths are concatenated unquoted into shell command lines, so they
        must not contain whitespace or shell metacharacters.
    """
    count_cmd = 'grep -c \'>\' '
    target_length = float(
        subprocess.check_output(count_cmd + target_fasta, shell=True))
    query_length = float(
        subprocess.check_output(count_cmd + query_fasta, shell=True))

    # Join wrapped sequence lines; the slicing below assumes two lines per
    # record.
    flat_query = query_fasta + '_nospace'
    os.system(
        'awk \'/^>/{print s? s"\\n"$0:$0;s="";next}{s=s sprintf("%s",$0)}END{if(s)print s}\' '
        + query_fasta + ' > ' + flat_query)

    print('Query length: ' + str(query_length) + ' Target length: ' +
          str(target_length) + ' Query Split: ' + str(QUERY_SPLIT))

    # Work with an int record count so every bound given to cut_file is an
    # int line number (the float count would make the final bound a float).
    total_records = int(query_length)
    chunk_records = total_records // QUERY_SPLIT
    final_line = 2 * total_records

    pieces = []
    for index in range(QUERY_SPLIT):
        label = str(index + 1)
        start = 2 * index * chunk_records + 1
        # The last piece runs to the end of the file to absorb the remainder.
        is_last = index == QUERY_SPLIT - 1
        stop = final_line if is_last else 2 * (index + 1) * chunk_records
        cut_file(flat_query, flat_query + '_' + label, start, stop)
        pieces.append(label)

    # Drop outputs left over from an earlier run so they are not merged below.
    os.system('rm ' + out_file + '_*')

    cmds = tuple(
        'blat -noHead ' + target_fasta + ' ' + flat_query + '_' + label +
        ' ' + out_file + '_' + label for label in pieces)
    run_parallel_cmds.run_cmds(cmds, nJobs)

    # Merge the per-piece results, then remove every intermediate file.
    os.system('cat ' + out_file + '_* > ' + out_file)
    os.system('rm ' + out_file + '_*')
    os.system('rm ' + flat_query + '_*')
Ejemplo n.º 4
0
def rc_gnu(infile,
           tempfile,
           outfile,
           nCPU,
           python_path='python ',
           shannon_dir=''):
    """Run ``rc_s.py`` over ``infile``, split into pieces processed in parallel.

    With ``nCPU == 1`` the script is run once on the whole file and the
    result of ``find_L(infile)`` is returned.  Otherwise the input is cut
    into pieces ``<tempfile>_<k>``, one ``rc_s.py`` command per piece is
    handed to ``run_parallel_cmds.run_cmds``, the piece outputs
    ``<outfile>_<k>`` are concatenated in order into ``outfile``, and all
    intermediate files are removed.

    Args:
        infile: Path of the input file.  Assumed to hold two lines per
            record (a ``>`` header line followed by one sequence line).
        tempfile: Path prefix for the input pieces.
        outfile: Path of the merged output; also the prefix of the
            per-piece outputs.
        nCPU: Number of pieces to aim for.
        python_path: Interpreter command placed in front of the script.
        shannon_dir: Prefix prepended directly to ``rc_s.py`` (so it needs
            its own trailing separator).

    Returns:
        ``find_L(infile)`` when ``nCPU == 1``; otherwise ``(N, L)`` where
        ``N`` is the number of non-header, non-blank lines and ``L`` is the
        total length of their first whitespace-separated token divided by
        ``max(N, 1)``.
    """
    rc_script = python_path + ' ' + shannon_dir + 'rc_s.py '
    if nCPU == 1:
        run_cmd(rc_script + infile + ' ' + outfile)
        return find_L(infile)

    # Number of records = number of lines containing '>'.
    file_length = float(
        subprocess.check_output('grep -c \'>\' ' + infile, shell=True))
    # Two lines per record, so a piece of `ceil(records / nCPU)` records
    # spans twice that many lines.
    lines_per_piece = 2 * int(math.ceil(file_length / nCPU))

    piece_no = 0
    pending = []
    read_tot = 0
    no_reads = 0
    with open(infile) as source:
        for line in source:
            pending.append(line)
            fields = line.strip().split()
            # Every non-blank line that is not a header counts as one read.
            if fields and fields[0][0] != '>':
                read_tot += len(fields[0])
                no_reads += 1
            if len(pending) == lines_per_piece:
                piece_no += 1
                with open(tempfile + '_' + str(piece_no), 'w') as piece:
                    piece.write(''.join(pending))
                pending = []

    # Whatever is left over forms a final, shorter piece.
    if pending:
        piece_no += 1
        with open(tempfile + '_' + str(piece_no), 'w') as piece:
            piece.write(''.join(pending))

    N = no_reads
    L = (read_tot) / max(N, 1)

    # The number of pieces actually written can be smaller than nCPU.
    chunks = piece_no
    suffixes = [str(k) for k in range(1, chunks + 1)]
    cmds = [
        rc_script + tempfile + '_' + suffix + ' ' + outfile + '_' + suffix
        for suffix in suffixes
    ]
    run_parallel_cmds.run_cmds(cmds, chunks)

    file_list = ' '.join([outfile + '_' + suffix for suffix in suffixes])

    run_cmd('cat ' + file_list + ' > ' + outfile)
    run_cmd('rm ' + outfile + '_* ')
    run_cmd('rm ' + tempfile + '_*  ')
    print(N)
    print(L)
    return (N, L)
Ejemplo n.º 5
0
# Finish assembling the option string shared by every run_MB_SF_fn.py
# invocation, then either launch one invocation per entry of
# main_server_parameter_string (inDisk) or prepare per-component argument
# strings and a size ranking (inMem).
#
# The bare string literal below is disabled code (the --ds flag); it is
# evaluated as an expression and discarded, so it has no effect.
'''if double_stranded:
	mb_sf_param_string += "  --ds " '''
if only_reads:
	mb_sf_param_string += "  --only_reads "
if filter_FP_flag:
	mb_sf_param_string += "  --filter_FP "
mb_sf_param_string += "  --nJobs "	+ str(nJobs) + " "


if main_server_parameter_string and inDisk:
	# One command per whitespace-separated token of
	# main_server_parameter_string (presumably one token per component
	# path -- confirm against where the string is built).
	if run_parallel and nJobs > 1:
		# Build the whole batch first and hand it to run_parallel_cmds.
		cmds = []
		for param_str in main_server_parameter_string.split():
			# param_str is inserted twice: right after the script name and
			# again after the --dir_name value.
			cmds.append(python_path + " " + shannon_dir + "run_MB_SF_fn.py " + param_str + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + param_str + " --shannon_dir " + shannon_dir + " --python_path " + python_path)
		cmds = tuple(cmds)
		run_parallel_cmds.run_cmds(cmds,nJobs)
		#run_cmd(gnu_parallel_path + " -j " + str(nJobs) + " " + python_path + " " + shannon_dir + "run_MB_SF.py {} --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " --shannon_dir " + shannon_dir + " --python_path " + python_path +  " ::: " + main_server_parameter_string)
	else:
		# Same command line as above, run one at a time through run_cmd.
		for param_str in main_server_parameter_string.split():
				run_cmd(python_path + " " + shannon_dir + "run_MB_SF_fn.py " + param_str + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + param_str + " --shannon_dir " + shannon_dir + " --python_path " + python_path)
elif inMem:
	# Here param_str is a dict keyed by component (in the branch above the
	# same name is a plain string loop variable).
	param_str={}; contig_size = {}

	for comp in new_components:
		# Per-component path prefix: <comp_directory_name>/<sample_name><comp>
		dir_base = comp_directory_name + "/" + sample_name + str(comp)	
		# Argument string only (no interpreter or script name in front);
		# dir_base appears at the start and again after the --dir_name value.
		param_str[comp] = dir_base + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + dir_base + " --shannon_dir " + shannon_dir + " --python_path " + python_path
		# Size of a component = total number of elements over all entries
		# of contig_weights[comp].
		contig_size[comp] = sum(len(cw_vec) for cw_vec in contig_weights[comp])

	# (component, size) pairs ordered from largest to smallest size.
	contig_vec = contig_size.items()
	sorted_contig_vec = sorted(contig_vec,key=itemgetter(1),reverse=True)
	def get_column(matrix, i):
Ejemplo n.º 6
0
# Finish assembling the option string shared by every run_MB_SF_fn.py
# invocation, then either launch one invocation per entry of
# main_server_parameter_string (inDisk) or prepare per-component argument
# strings and a size ranking (inMem).
#
# The bare string literal below is disabled code (the --ds flag); it is
# evaluated as an expression and discarded, so it has no effect.
'''if double_stranded:
	mb_sf_param_string += "  --ds " '''
if only_reads:
	mb_sf_param_string += "  --only_reads "
if filter_FP_flag:
	mb_sf_param_string += "  --filter_FP "
mb_sf_param_string += "  --nJobs "	+ str(nJobs) + " "


if main_server_parameter_string and inDisk:
	# One command per whitespace-separated token of
	# main_server_parameter_string (presumably one token per component
	# path -- confirm against where the string is built).
	if run_parallel and nJobs > 1:
		# Build the whole batch first and hand it to run_parallel_cmds.
		cmds = []
		for param_str in main_server_parameter_string.split():
			# param_str is inserted twice: right after the script name and
			# again after the --dir_name value.
			cmds.append(python_path + " " + shannon_dir + "run_MB_SF_fn.py " + param_str + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + param_str + " --shannon_dir " + shannon_dir + " --python_path " + python_path)
		cmds = tuple(cmds)
		run_parallel_cmds.run_cmds(cmds,nJobs)
		#run_cmd(gnu_parallel_path + " -j " + str(nJobs) + " " + python_path + " " + shannon_dir + "run_MB_SF.py {} --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " --shannon_dir " + shannon_dir + " --python_path " + python_path +  " ::: " + main_server_parameter_string)
	else:
		# Same command line as above, run one at a time through run_cmd.
		for param_str in main_server_parameter_string.split():
				run_cmd(python_path + " " + shannon_dir + "run_MB_SF_fn.py " + param_str + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + param_str + " --shannon_dir " + shannon_dir + " --python_path " + python_path)
elif inMem:
	# Here param_str is a dict keyed by component (in the branch above the
	# same name is a plain string loop variable).
	param_str={}; contig_size = {}

	for comp in new_components:
		# Per-component path prefix: <comp_directory_name>/<sample_name><comp>
		dir_base = comp_directory_name + "/" + sample_name + str(comp)	
		# Argument string only (no interpreter or script name in front);
		# dir_base appears at the start and again after the --dir_name value.
		param_str[comp] = dir_base + " --run_alg " + mb_sf_param_string + " --kmer_size " + str(K)  + " " + paired_end_flag + " --dir_name " + comp_directory_name + " " + dir_base + " --shannon_dir " + shannon_dir + " --python_path " + python_path
		# Size of a component = total number of elements over all entries
		# of contig_weights[comp].
		contig_size[comp] = sum(len(cw_vec) for cw_vec in contig_weights[comp])

	# (component, size) pairs ordered from largest to smallest size.
	contig_vec = contig_size.items()
	sorted_contig_vec = sorted(contig_vec,key=itemgetter(1),reverse=True)
	def get_column(matrix, i):