Example No. 1
timer['after_generate_ref'] = time.time()
timer['for_ref'] = timer['after_generate_ref'] - timer['after_sp']

if compare_ans:
	curr_ref = sample_output_name+'algo_output/reference.fasta'
	reconstr = sample_output_name+'algo_output/reconstructed.fasta'
	all_seq = sample_output_name+'algo_output/all_sequences.fasta'
	reconstr_per = sample_output_name+'algo_output/reconstr_per.txt'
	reconstr_rev_per = sample_output_name+'algo_output/reconstr_rev_per.txt'
	reconstr_log = sample_output_name+'algo_output/reconstr_log.txt'
	reconstr_rev_log = sample_output_name+'algo_output/reconstr_rev_log.txt'
	all_seq_per = sample_output_name+'algo_output/all_seq_per.txt'
	self_repeats = sample_output_name+'algo_output/self_repeats.txt'
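	# Align the reconstruction against the reference, either with mummer
	# (maximal exact matches of length >= 80) or with the parallel BLAT wrapper,
	# then summarize the hits with the tester helpers.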
	if not blat:
		run_cmd('mummer -maxmatch -l 80 ' + reconstr + ' ' + curr_ref + ' > ' + reconstr_per)
		tester.analyzer(reconstr_per,reconstr_log,exp_file,N)
		run_cmd('mummer -maxmatch -l 80 ' + curr_ref + ' ' + reconstr + ' > ' + reconstr_rev_per)
		tester.reverse_analyzer(reconstr_rev_per,reconstr_rev_log,reconstr,N)
	else:
		run_cmd(python_path + ' ' + shannon_dir + 'parallel_blat_python.py ' + reconstr + ' ' + curr_ref + ' ' + reconstr_per)
		tester.analyzer_blat_noExp(reconstr_per,reconstr_log,exp_file,N)
	if false_positive:
		tester.false_positive(reconstr,reconstr_per,reconstr_rev_log)

if run_trinity or compare_trinity:
	curr_ref = sample_output_name+'algo_output/reference.fasta'
Example No. 2
# Imports needed by this snippet: os/time from the standard library, plus the
# sibling Shannon modules referenced below (tester, multibridging, filter_FP).
import os
import time

import filter_FP
import multibridging
import tester


def run_MB_SF(arguments,inMem=False,contigs=[],weights=[],rps=[]):
	print(arguments)
	inDisk = not inMem
	arguments = arguments.strip().split()
	L = 100
	N = 8743351

	sn = ''
	sparsity = 0.5
	start_loc = 1
	stop_loc = 120000000


	paired_end = 0
	add_errors = 1
	double_stranded = 0

	sim = 0
	to_set_exp = 0
	generate_reads = 0
	trimmomatic = 0 #currently not enabled
	run_seecer =0
	run_jellyfish =0        # runs jellyfish to get kmers
	run_extension_corr =0   # runs contig based error correction to filter k1mers used to build kmer graph
	run_cpp = 0             # builds condensed kmer graph
	mb = 0                  # runs multibridging
	sparse_flow = 0
	parallelize_sf = 0
	generate_ref = 0
	compare_ans = 0         # compares reconstructed transcripts to reference transcripts to measure performance of assembler
	run_trinity = 0         # Runs a competing algorithm trinity     
	compare_trinity = 0
	plots = 0
	plots_express = 0
	run_rsem_eval = 0
	run_cuffinks = 0        # Runs a competing algorithm cufflinks
	compare_cufflinks = 0
	compare_soap =0
	compare_oasis=0
	compare_trans=0

	false_positive=0


	bowtie = 0 
	extract_bam = 0
	oracle_set = 0


	err_string = ' '
	pairedend_string = ' '
	mb_string = ' '
	trinity_string = ' '
	jellyfish_dir = ' jellyfish'	
	jellyfish_kmer_cutoff = 1
	seecer_dir = ' /data/sreeramk/packages/SEECER-0.1.3/SEECER/bin/run_seecer.sh'
	K_value = 24
	blat = 1

	# ----------------------------------------------------------------------------------


	# Sets parameters from terminal
	sample_name = None
	filter_FP_flag = False
	shannon_dir = ''
	only_k1 = ' --only_k1 '  #Default: write only k1mers
	only_reads = False  #default: only_reads = false
	nJobs = 1
	python_path = 'python'
	n_inp = arguments
	if len(n_inp) > 1:
		sample_name = arguments[0]
		if '--run_alg' in n_inp:
			mb = 1
			sparse_flow = 1
		if '--ds' in n_inp:
			double_stranded = 1
		if '--paired_end' in n_inp:
			paired_end = 1
		if '--compare' in n_inp:
			compare_ans = 1
		if '--kmer_size' in n_inp:
			K_value = int(n_inp[n_inp.index('--kmer_size')+1])
		if '--only_reads' in n_inp:
			only_reads = True
			run_jellyfish = 1
			run_extension_corr = 1
		if '--nJobs' in n_inp:
			nJobs = int(n_inp[n_inp.index('--nJobs')+1])
		if '--both' in n_inp:
			only_k1 = ' '  # Don't write only k1mers
			#print(K_value)
		if '--dir_name' in n_inp:
			directory_name = n_inp[n_inp.index('--dir_name')+1]
		if '--shannon_dir' in n_inp:
			shannon_dir = n_inp[n_inp.index('--shannon_dir')+1]
		if '--python_path' in n_inp:
			python_path = n_inp[n_inp.index('--python_path')+1]
		if '--filter_FP' in n_inp and paired_end:
			filter_FP_flag = True

	if paired_end:
		F = 350 #Fragment size
		F_sd = 0 #Fragment size Standard Deviation
		sn = sn+'_F_'+str(F)
		pairedend_string = ' -p ' + str(F) + ',' + str(F_sd) + ' '
	else:
		F=L


	if add_errors:
		error_rate = 0.01
		err_string = ' -r ' + str(error_rate) + ' '
		sn = sn + '_ERR'
		mb_string += ' -e '

	if double_stranded:
		sn = sn + '_DS' 
		ds_string = ' '
		#mb_string += ' -d '
	else:
		ds_string = ' --stranded '
		trinity_string += ' --SS_lib_type FR '


	def run_cmd(s1):
		os.system(s1)

	sample_output_name = sample_name

	base_dir = '' 
	bed_file=' ./Genome/GSE51861_isoform.bed'
	ref_file=' ./Genome/hg19.fa' # reference chromosome
	exp_file= sample_name + 'algo_input/random_out.exp'
	reads_file = sample_name + 'algo_input/reads'
	if paired_end:
		reads_string = reads_file + '_1.fasta '+reads_file + '_2.fasta '
	else:	
		reads_string = reads_file + '.fasta '

	timer = {}
	timer['start'] = time.time()	

	if not os.path.exists(sample_name+'algo_input'):
		os.makedirs(sample_name+'algo_input')
	if not os.path.exists(sample_output_name+'algo_output'):
		os.makedirs(sample_output_name+'algo_output')
	if not os.path.exists(sample_output_name+'intermediate'):
		os.makedirs(sample_output_name+'intermediate')

	'''if mb or sparse_flow:
	        os.system('cp *.py ' + sample_output_name + 'algo_output/')'''


	if trimmomatic:
		run_cmd('java -jar /home/sreeramkannan/Packages/trinityrnaseq_r20131110/trinity-plugins/Trimmomatic-0.30/trimmomatic.jar SE -phred33 SRR453566_1.fastq reads_trimmomatic.fastq ILLUMINACLIP:/home/sreeramkannan/Packages/trinityrnaseq_r20131110/trinity-plugins/Trimmomatic-0.30/adapters/TruSeq2-SE.fa:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:4:30 MINLEN:75 CROP:75 TOPHRED64')
		run_cmd('java -jar /home/sreeramkannan/Packages/trinityrnaseq_r20131110/trinity-plugins/Trimmomatic-0.30/trimmomatic.jar PE -phred33 reads_1.fastq reads_2.fastq reads_trim_1.fastq reads_trim_UP1.fastq reads_trim_2.fastq reads_trim_UP2.fastq LEADING:20 TRAILING:20 SLIDINGWINDOW:4:30 MINLEN:50 CROP:50 TOPHRED64')



	if run_jellyfish:
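		# Count (K+1)-mers with jellyfish; '-C' switches to canonical counting
		# when the data are double stranded, and the 'dump -L' call below keeps
		# only k1mers seen at least jellyfish_kmer_cutoff times.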
		run_jfs = ' '
		if double_stranded:
			run_jfs += ' -C '
		#run_cmd('rm '+sample_name+'algo_input/jelly*')  #Remove old jellyfish files
		run_cmd(jellyfish_dir+' count -m ' + str(K_value+1) + run_jfs+ ' -o ' + sample_name+'algo_input/jellyfish_p1_output -s 20000000 -c 4 -t ' + str(nJobs) + ' ' +reads_string)

		'''if os.path.isfile(sample_name+'algo_input/jellyfish_p1_output_1'):
			run_cmd(jellyfish_dir+' merge -o '+ sample_name+'algo_input/jellyfish_p1_output.jf ' + sample_name+'algo_input/jellyfish_p1_output\_*')
		else:
			run_cmd('mv ' + sample_name+'algo_input/jellyfish_p1_output_0 ' +sample_name+'algo_input/jellyfish_p1_output.jf')'''
		
		run_cmd(jellyfish_dir+' dump -c -t -L ' + str(jellyfish_kmer_cutoff) + ' ' + sample_name+'algo_input/jellyfish_p1_output > ' + sample_name+'algo_input/k1mer.dict_org')
		if (not run_extension_corr) and double_stranded:
			tester.double_strandify(sample_name+'algo_input/k1mer.dict_org', sample_name+'algo_input/k1mer.dict')
		if (not run_extension_corr) and (not double_stranded):
			run_cmd('mv ' + sample_name+'algo_input/k1mer.dict_org ' + sample_name+'algo_input/k1mer.dict')

	if run_extension_corr:
		if double_stranded:
			str_ec = ' -d '
		else:
			str_ec = ' '
		run_cmd(python_path + ' ext_corr.py ' + str_ec + sample_name+'algo_input/k1mer.dict_org ' +sample_name+'algo_input/k1mer.dict 3 75')

	'''if run_jellyfish or run_extension_corr:
			run_cmd('python kp1mer_to_ kmer.py ' + sample_name+'algo_input/k1mer.dict ' + sample_name+'algo_input/kmer.dict')'''


	if run_cpp:
		run_cmd('tr \'\\t\' \'\\n\' <' + sample_name+'algo_input/kmer.dict > ' + sample_name+'algo_input/kmer.dict_2l ')
		run_cmd('tr \'\\t\' \'\\n\' <' + sample_name+'algo_input/k1mer.dict > ' + sample_name+'algo_input/k1mer.dict_2l ')
		run_cmd('./condenser ' + sample_name+'algo_input/kmer.dict_2l '+sample_name+'algo_input/k1mer.dict_2l  ' + sample_name+'algo_input/'  + ' ' + str(K_value) + ' | tee ' + sample_name + '_cpp_terminal_output.txt')

	with open(sample_name + '_terminal_output.txt', 'w') as f7:
	    f7.write(" \n")

	    
	if inDisk and mb:
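		# Disk-backed multibridging: pass the kmer/k1mer dictionary files under
		# algo_input/ to multibridging.py and write its output under 'intermediate/'.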
		#run_cmd('rm '+sample_output_name+'intermediate/*')
		jf_s = ' '; cpp_s = ' ';
		use_jellyfish = 1; use_cpp = 0 #force set parameter for jellyfish
		if use_jellyfish:
			jf_s = '  '+ sample_name+'algo_input/kmer.dict '+sample_name+'algo_input/k1mer.dict '
		if use_cpp:
			cpp_s = '-c ' +sample_name+'algo_input/nodes.txt '+sample_name+'algo_input/edges.txt '
		if not paired_end:
			run_cmd(python_path + ' ' + shannon_dir + 'multibridging.py  -f --kmer=' +str(K_value) + mb_string + only_k1 +  jf_s + cpp_s + reads_file+'.fasta ' + sample_output_name + 'intermediate ' + ' | tee ' + sample_name + '_terminal_output.txt') # ' 2>&1 | tee ./' + sample_name + 'algo_input/log.txt')
		else:
			run_cmd(python_path + ' ' + shannon_dir + 'multibridging.py -f --kmer='+ str(K_value) + mb_string + only_k1 + jf_s + cpp_s + reads_file+'_1.fasta '+reads_file+'_2.fasta ' + sample_output_name+ 'intermediate ' + ' | tee ' + sample_name + '_terminal_output.txt') # 2>&1 | tee ./' + sample_name + 'algo_input/log.txt')
	elif inMem and mb: 
		jf_s = ''; cpp_s = ''
		#In Memory
		if paired_end:
			multibridging.main('-f --kmer='+ str(K_value) + mb_string + only_k1 + jf_s + cpp_s + reads_file+'_1.fasta '+reads_file+'_2.fasta ' + sample_output_name+ 'intermediate ', inMem, contigs, weights, rps)
		else:
			multibridging.main('-f --kmer='+ str(K_value) + mb_string + only_k1 + jf_s + cpp_s + reads_file+'.fasta ' + sample_output_name+ 'intermediate ', inMem, contigs, weights, rps)

	timer['after_mb'] = time.time()
	#timer['for_mb'] = timer['after_mb'] - timer['after_gen_reads']


	if sparse_flow:
		reconstr_file = sample_output_name+'algo_output/reconstructed.fasta'
		#run_cmd('rm '+sample_output_name+'algo_output/reconstructed_comp_*.fasta')
		#run_cmd('rm '+sample_output_name+'algo_output/reconstructed.fasta')

		run_cmd(python_path + ' ' + shannon_dir + 'algorithm_SF.py -1 '+ sample_output_name)
		ncomp = 0
		iteration_string = " "
		while os.path.isfile(sample_output_name + 'intermediate/nodes'+str(ncomp)+'.txt'):
			if 0: #ncomp==0:  and ncomp!=2:  #testing purpose
				ncomp += 1
				continue
			print('Component:',ncomp)
			if not parallelize_sf:
				os.system(python_path + ' '  + shannon_dir + 'algorithm_SF.py ' + str(ncomp) + ' '+ sample_output_name) # + ' | tee ' + sample_name + '_' + str(ncomp) + '_terminal_output.txt')
			iteration_string += str(ncomp) + " "
			ncomp = ncomp + 1

		if parallelize_sf:
			os.system('parallel ' + python_path + ' ' + shannon_dir + 'algorithm_SF.py {} ' + sample_output_name+ " ::: " + iteration_string)
		os.system("cat " + sample_output_name+'algo_output/reconstructed_comp_*.fasta' +  " >> " + reconstr_file)
		#filter_trans(sample_output_name+'algo_output/reconstructed.fasta', sample_output_name+'algo_output/reconstructed_short.fasta', 200)

	if sparse_flow:
	    if os.path.exists(directory_name+"/before_sp_log.txt"):
	        f_log = open(directory_name+"/before_sp_log.txt", 'a')
	    else:
	        f_log = open(directory_name+"/before_sp_log.txt", 'w')
	        
	    num_transcripts = 0
	    with open(sample_output_name+"algo_output/reconstructed.fasta", 'r') as reconstructed_transcripts:
	        num_transcripts = len(reconstructed_transcripts.readlines())
	    

	    f_log.write(str(time.asctime()) + ": " +sample_output_name + " has completed: " + str(num_transcripts) + " transcripts" + "\n")
	    print(str(time.asctime()) + ": " +sample_output_name + " has completed: " + str(num_transcripts) + " transcripts")
	    f_log.close()

	if filter_FP_flag:
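		# Optional post-processing (enabled via '--filter_FP' with paired-end
		# reads): filter_FP re-screens reconstructed.fasta against the read
		# pairs and writes its results under algo_output/.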
		reads_1 = sample_name+'algo_input/reads_1.fasta'
		reads_2 = sample_name+'algo_input/reads_2.fasta'
		rec_fasta = sample_name+"algo_output/reconstructed.fasta"
		out_dir = sample_name+"algo_output/"
		filter_FP.filter_FP(rec_fasta, reads_1, reads_2, out_dir)


	timer['after_sp'] = time.time()
	#timer['for_sp'] = timer['after_sp'] - timer['after_mb']

	if compare_ans:
		curr_ref = sample_output_name+'algo_output/reference.fasta'
		reconstr = sample_output_name+'algo_output/reconstructed.fasta'
		all_seq = sample_output_name+'algo_output/all_sequences.fasta'
		reconstr_per = sample_output_name+'algo_output/reconstr_per.txt'
		reconstr_rev_per = sample_output_name+'algo_output/reconstr_rev_per.txt'
		reconstr_log = sample_output_name+'algo_output/reconstr_log.txt'
		reconstr_rev_log = sample_output_name+'algo_output/reconstr_rev_log.txt'
		all_seq_per = sample_output_name+'algo_output/all_seq_per.txt'
		self_repeats = sample_output_name+'algo_output/self_repeats.txt'
		if not blat:
			run_cmd('mummer -maxmatch -l 80 ' + reconstr + ' ' + curr_ref + ' > ' + reconstr_per)
			tester.analyzer(reconstr_per,reconstr_log,exp_file,N)
			run_cmd('mummer -maxmatch -l 80 ' + curr_ref + ' ' + reconstr + ' > ' + reconstr_rev_per)
			tester.reverse_analyzer(reconstr_rev_per,reconstr_rev_log,reconstr,N)
		else:
			run_cmd(python_path + ' ' + shannon_dir + 'parallel_blat_python.py ' + reconstr + ' ' + curr_ref + ' ' + reconstr_per)
			tester.analyzer_blat_noExp(reconstr_per,reconstr_log,exp_file,N)
		if false_positive:
			tester.false_positive(reconstr,reconstr_per,reconstr_rev_log)	
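A minimal sketch of how this entry point might be driven, assuming the usual Shannon layout in which the first token of `arguments` is a sample prefix (here the hypothetical './my_sample/') with the read files already prepared under '<prefix>algo_input/'; every path and flag value below is illustrative, not part of the original script.

if __name__ == '__main__':
	# Hypothetical driver: '--run_alg' turns on multibridging + sparse flow,
	# '--ds' marks the reads as double stranded, '--kmer_size 24' matches the
	# default K_value, '--dir_name' gives the directory for before_sp_log.txt,
	# and '--compare' triggers the reference comparison at the end.
	run_MB_SF('./my_sample/ --run_alg --ds --kmer_size 24 --nJobs 4 --dir_name ./my_sample --shannon_dir ./ --python_path python --compare')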
Example No. 3
def run_MB_SF(arguments, inMem=False, contigs=[], weights=[], rps=[]):
    print(arguments)
    inDisk = not inMem
    arguments = arguments.strip().split()
    L = 100
    N = 8743351

    sn = ''
    sparsity = 0.5
    start_loc = 1
    stop_loc = 120000000

    paired_end = 0
    add_errors = 1
    double_stranded = 0

    sim = 0
    to_set_exp = 0
    generate_reads = 0
    trimmomatic = 0  #currently not enabled
    run_seecer = 0
    run_jellyfish = 0  # runs jellyfish to get kmers
    run_extension_corr = 0  # runs contig based error correction to filter k1mers used to build kmer graph
    run_cpp = 0  # builds condensed kmer graph
    mb = 0  # runs multibridging
    sparse_flow = 0
    parallelize_sf = 0
    generate_ref = 0
    compare_ans = 0  # compares reconstructed transcripts to reference transcripts to measure performance of assembler
    run_trinity = 0  # Runs a competing algorithm trinity
    compare_trinity = 0
    plots = 0
    plots_express = 0
    run_rsem_eval = 0
    run_cuffinks = 0  # Runs a competing algorithm cufflinks
    compare_cufflinks = 0
    compare_soap = 0
    compare_oasis = 0
    compare_trans = 0

    false_positive = 0

    bowtie = 0
    extract_bam = 0
    oracle_set = 0

    err_string = ' '
    pairedend_string = ' '
    mb_string = ' '
    trinity_string = ' '
    jellyfish_dir = ' jellyfish'
    jellyfish_kmer_cutoff = 1
    seecer_dir = ' /data/sreeramk/packages/SEECER-0.1.3/SEECER/bin/run_seecer.sh'
    K_value = 24
    blat = 1

    # ----------------------------------------------------------------------------------

    # Sets parameters from terminal
    sample_name = None
    filter_FP_flag = False
    shannon_dir = ''
    only_k1 = ' --only_k1 '  #Default: write only k1mers
    only_reads = False  #default: only_reads = false
    nJobs = 1
    python_path = 'python'
    n_inp = arguments
    if len(n_inp) > 1:
        sample_name = arguments[0]
        if '--run_alg' in n_inp:
            mb = 1
            sparse_flow = 1
        if '--ds' in n_inp:
            double_stranded = 1
        if '--paired_end' in n_inp:
            paired_end = 1
        if '--compare' in n_inp:
            compare_ans = 1
        if '--kmer_size' in n_inp:
            K_value = int(n_inp[n_inp.index('--kmer_size') + 1])
        if '--only_reads' in n_inp:
            only_reads = True
            run_jellyfish = 1
            run_extension_corr = 1
        if '--nJobs' in n_inp:
            nJobs = int(n_inp[n_inp.index('--nJobs') + 1])
        if '--both' in n_inp:
            only_k1 = ' '  # Don't write only k1mers
            #print(K_value)
        if '--dir_name' in n_inp:
            directory_name = n_inp[n_inp.index('--dir_name') + 1]
        if '--shannon_dir' in n_inp:
            shannon_dir = n_inp[n_inp.index('--shannon_dir') + 1]
        if '--python_path' in n_inp:
            python_path = n_inp[n_inp.index('--python_path') + 1]
        if '--filter_FP' in n_inp and paired_end:
            filter_FP_flag = True

    if paired_end:
        F = 350  #Fragment size
        F_sd = 0  #Fragment size Standard Deviation
        sn = sn + '_F_' + str(F)
        pairedend_string = ' -p ' + str(F) + ',' + str(F_sd) + ' '
    else:
        F = L

    if add_errors:
        error_rate = 0.01
        err_string = ' -r ' + str(error_rate) + ' '
        sn = sn + '_ERR'
        mb_string += ' -e '

    if double_stranded:
        sn = sn + '_DS'
        ds_string = ' '
        #mb_string += ' -d '
    else:
        ds_string = ' --stranded '
        trinity_string += ' --SS_lib_type FR '

    def run_cmd(s1):
        os.system(s1)

    sample_output_name = sample_name

    base_dir = ''
    bed_file = ' ./Genome/GSE51861_isoform.bed'
    ref_file = ' ./Genome/hg19.fa'  # reference chromosome
    exp_file = sample_name + 'algo_input/random_out.exp'
    reads_file = sample_name + 'algo_input/reads'
    if paired_end:
        reads_string = reads_file + '_1.fasta ' + reads_file + '_2.fasta '
    else:
        reads_string = reads_file + '.fasta '

    timer = {}
    timer['start'] = time.time()

    if not os.path.exists(sample_name + 'algo_input'):
        os.makedirs(sample_name + 'algo_input')
    if not os.path.exists(sample_output_name + 'algo_output'):
        os.makedirs(sample_output_name + 'algo_output')
    if not os.path.exists(sample_output_name + 'intermediate'):
        os.makedirs(sample_output_name + 'intermediate')
    '''if mb or sparse_flow:
	        os.system('cp *.py ' + sample_output_name + 'algo_output/')'''

    if trimmomatic:
        run_cmd(
            'java -jar /home/sreeramkannan/Packages/trinityrnaseq_r20131110/trinity-plugins/Trimmomatic-0.30/trimmomatic.jar SE -phred33 SRR453566_1.fastq reads_trimmomatic.fastq ILLUMINACLIP:/home/sreeramkannan/Packages/trinityrnaseq_r20131110/trinity-plugins/Trimmomatic-0.30/adapters/TruSeq2-SE.fa:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:4:30 MINLEN:75 CROP:75 TOPHRED64'
        )
        run_cmd(
            'java -jar /home/sreeramkannan/Packages/trinityrnaseq_r20131110/trinity-plugins/Trimmomatic-0.30/trimmomatic.jar PE -phred33 reads_1.fastq reads_2.fastq reads_trim_1.fastq reads_trim_UP1.fastq reads_trim_2.fastq reads_trim_UP2.fastq LEADING:20 TRAILING:20 SLIDINGWINDOW:4:30 MINLEN:50 CROP:50 TOPHRED64'
        )

    if run_jellyfish:
        run_jfs = ' '
        if double_stranded:
            run_jfs += ' -C '
        #run_cmd('rm '+sample_name+'algo_input/jelly*')  #Remove old jellyfish files
        run_cmd(jellyfish_dir + ' count -m ' + str(K_value + 1) + run_jfs +
                ' -o ' + sample_name +
                'algo_input/jellyfish_p1_output -s 20000000 -c 4 -t ' +
                str(nJobs) + ' ' + reads_string)
        '''if os.path.isfile(sample_name+'algo_input/jellyfish_p1_output_1'):
			run_cmd(jellyfish_dir+' merge -o '+ sample_name+'algo_input/jellyfish_p1_output.jf ' + sample_name+'algo_input/jellyfish_p1_output\_*')
		else:
			run_cmd('mv ' + sample_name+'algo_input/jellyfish_p1_output_0 ' +sample_name+'algo_input/jellyfish_p1_output.jf')'''

        run_cmd(jellyfish_dir + ' dump -c -t -L ' +
                str(jellyfish_kmer_cutoff) + ' ' + sample_name +
                'algo_input/jellyfish_p1_output > ' + sample_name +
                'algo_input/k1mer.dict_org')
        if (not run_extension_corr) and double_stranded:
            tester.double_strandify(sample_name + 'algo_input/k1mer.dict_org',
                                    sample_name + 'algo_input/k1mer.dict')
        if (not run_extension_corr) and (not double_stranded):
            run_cmd('mv ' + sample_name + 'algo_input/k1mer.dict_org ' +
                    sample_name + 'algo_input/k1mer.dict')

    if run_extension_corr:
        if double_stranded:
            str_ec = ' -d '
        else:
            str_ec = ' '
        run_cmd(python_path + ' ext_corr.py ' + str_ec + sample_name +
                'algo_input/k1mer.dict_org ' + sample_name +
                'algo_input/k1mer.dict 3 75')
    '''if run_jellyfish or run_extension_corr:
			run_cmd('python kp1mer_to_ kmer.py ' + sample_name+'algo_input/k1mer.dict ' + sample_name+'algo_input/kmer.dict')'''

    if run_cpp:
        run_cmd('tr \'\\t\' \'\\n\' <' + sample_name +
                'algo_input/kmer.dict > ' + sample_name +
                'algo_input/kmer.dict_2l ')
        run_cmd('tr \'\\t\' \'\\n\' <' + sample_name +
                'algo_input/k1mer.dict > ' + sample_name +
                'algo_input/k1mer.dict_2l ')
        run_cmd('./condenser ' + sample_name + 'algo_input/kmer.dict_2l ' +
                sample_name + 'algo_input/k1mer.dict_2l  ' + sample_name +
                'algo_input/' + ' ' + str(K_value) + ' | tee ' + sample_name +
                '_cpp_terminal_output.txt')

    with open(sample_name + '_terminal_output.txt', 'w') as f7:
        f7.write(" \n")

    if inDisk and mb:
        #run_cmd('rm '+sample_output_name+'intermediate/*')
        jf_s = ' '
        cpp_s = ' '
        use_jellyfish = 1
        use_cpp = 0  #force set parameter for jellyfish
        if use_jellyfish:
            jf_s = '  ' + sample_name + 'algo_input/kmer.dict ' + sample_name + 'algo_input/k1mer.dict '
        if use_cpp:
            cpp_s = '-c ' + sample_name + 'algo_input/nodes.txt ' + sample_name + 'algo_input/edges.txt '
        if not paired_end:
            run_cmd(python_path + ' ' + shannon_dir +
                    'multibridging.py  -f --kmer=' + str(K_value) + mb_string +
                    only_k1 + jf_s + cpp_s + reads_file + '.fasta ' +
                    sample_output_name + 'intermediate ' + ' | tee ' +
                    sample_name + '_terminal_output.txt'
                    )  # ' 2>&1 | tee ./' + sample_name + 'algo_input/log.txt')
        else:
            run_cmd(python_path + ' ' + shannon_dir +
                    'multibridging.py -f --kmer=' + str(K_value) + mb_string +
                    only_k1 + jf_s + cpp_s + reads_file + '_1.fasta ' +
                    reads_file + '_2.fasta ' + sample_output_name +
                    'intermediate ' + ' | tee ' + sample_name +
                    '_terminal_output.txt'
                    )  # 2>&1 | tee ./' + sample_name + 'algo_input/log.txt')
    elif inMem and mb:
        jf_s = ''
        cpp_s = ''
        #In Memory
        if paired_end:
            multibridging.main(
                '-f --kmer=' + str(K_value) + mb_string + only_k1 + jf_s +
                cpp_s + reads_file + '_1.fasta ' + reads_file + '_2.fasta ' +
                sample_output_name + 'intermediate ', inMem, contigs, weights,
                rps)
        else:
            multibridging.main(
                '-f --kmer=' + str(K_value) + mb_string + only_k1 + jf_s +
                cpp_s + reads_file + '.fasta ' + sample_output_name +
                'intermediate ', inMem, contigs, weights, rps)

    timer['after_mb'] = time.time()
    #timer['for_mb'] = timer['after_mb'] - timer['after_gen_reads']

    if sparse_flow:
        reconstr_file = sample_output_name + 'algo_output/reconstructed.fasta'
        #run_cmd('rm '+sample_output_name+'algo_output/reconstructed_comp_*.fasta')
        #run_cmd('rm '+sample_output_name+'algo_output/reconstructed.fasta')

        run_cmd(python_path + ' ' + shannon_dir + 'algorithm_SF.py -1 ' +
                sample_output_name)
        ncomp = 0
        iteration_string = " "
        while os.path.isfile(sample_output_name + 'intermediate/nodes' +
                             str(ncomp) + '.txt'):
            if 0:  #ncomp==0:  and ncomp!=2:  #testing purpose
                ncomp += 1
                continue
            print('Component:', ncomp)
            if not parallelize_sf:
                os.system(
                    python_path + ' ' + shannon_dir + 'algorithm_SF.py ' +
                    str(ncomp) + ' ' + sample_output_name
                )  # + ' | tee ' + sample_name + '_' + str(ncomp) + '_terminal_output.txt')
            iteration_string += str(ncomp) + " "
            ncomp = ncomp + 1

        if parallelize_sf:
            os.system('parallel ' + python_path + ' ' + shannon_dir +
                      'algorithm_SF.py {} ' + sample_output_name + " ::: " +
                      iteration_string)
        os.system("cat " + sample_output_name +
                  'algo_output/reconstructed_comp_*.fasta' + " >> " +
                  reconstr_file)
        #filter_trans(sample_output_name+'algo_output/reconstructed.fasta', sample_output_name+'algo_output/reconstructed_short.fasta', 200)

    if sparse_flow:
        if os.path.exists(directory_name + "/before_sp_log.txt"):
            f_log = open(directory_name + "/before_sp_log.txt", 'a')
        else:
            f_log = open(directory_name + "/before_sp_log.txt", 'w')

        num_transcripts = 0
        with open(sample_output_name + "algo_output/reconstructed.fasta",
                  'r') as reconstructed_transcripts:
            num_transcripts = len(reconstructed_transcripts.readlines())

        f_log.write(
            str(time.asctime()) + ": " + sample_output_name +
            " has completed: " + str(num_transcripts) + " transcripts" + "\n")
        print(
            str(time.asctime()) + ": " + sample_output_name +
            " has completed: " + str(num_transcripts) + " transcripts")
        f_log.close()

    if filter_FP_flag:
        reads_1 = sample_name + 'algo_input/reads_1.fasta'
        reads_2 = sample_name + 'algo_input/reads_2.fasta'
        rec_fasta = sample_name + "algo_output/reconstructed.fasta"
        out_dir = sample_name + "algo_output/"
        filter_FP.filter_FP(rec_fasta, reads_1, reads_2, out_dir)

    timer['after_sp'] = time.time()
    #timer['for_sp'] = timer['after_sp'] - timer['after_mb']

    if compare_ans:
        curr_ref = sample_output_name + 'algo_output/reference.fasta'
        reconstr = sample_output_name + 'algo_output/reconstructed.fasta'
        all_seq = sample_output_name + 'algo_output/all_sequences.fasta'
        reconstr_per = sample_output_name + 'algo_output/reconstr_per.txt'
        reconstr_rev_per = sample_output_name + 'algo_output/reconstr_rev_per.txt'
        reconstr_log = sample_output_name + 'algo_output/reconstr_log.txt'
        reconstr_rev_log = sample_output_name + 'algo_output/reconstr_rev_log.txt'
        all_seq_per = sample_output_name + 'algo_output/all_seq_per.txt'
        self_repeats = sample_output_name + 'algo_output/self_repeats.txt'
        if not blat:
            run_cmd('mummer -maxmatch -l 80 ' + reconstr + ' ' + curr_ref +
                    ' > ' + reconstr_per)
            tester.analyzer(reconstr_per, reconstr_log, exp_file, N)
            run_cmd('mummer -maxmatch -l 80 ' + curr_ref + ' ' + reconstr +
                    ' > ' + reconstr_rev_per)
            tester.reverse_analyzer(reconstr_rev_per, reconstr_rev_log,
                                    reconstr, N)
        else:
            run_cmd(python_path + ' ' + shannon_dir +
                    'parallel_blat_python.py ' + reconstr + ' ' + curr_ref +
                    ' ' + reconstr_per)
            tester.analyzer_blat_noExp(reconstr_per, reconstr_log, exp_file, N)
        if false_positive:
            tester.false_positive(reconstr, reconstr_per, reconstr_rev_log)
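For the inMem=True path, a sketch under the assumption that the caller already holds contigs, weights, and read-partition data in memory in whatever form multibridging.main expects; the placeholder values and the sample prefix below are purely illustrative.

# Hypothetical in-memory invocation: with inMem=True the on-disk kmer/k1mer
# dictionaries are skipped (jf_s and cpp_s stay empty) and the contig data is
# handed straight to multibridging.main.
contigs = ['ACGTACGTACGTACGT']   # placeholder contig sequences
weights = [12.0]                 # placeholder per-contig weights
rps = [[]]                       # placeholder read/partition info
run_MB_SF('./my_sample/ --run_alg --kmer_size 24 --dir_name ./my_sample',
          inMem=True, contigs=contigs, weights=weights, rps=rps)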
Example No. 4
timer['for_ref'] = timer['after_generate_ref'] - timer['after_sp']

if compare_ans:
    curr_ref = sample_output_name + 'algo_output/reference.fasta'
    reconstr = sample_output_name + 'algo_output/reconstructed.fasta'
    all_seq = sample_output_name + 'algo_output/all_sequences.fasta'
    reconstr_per = sample_output_name + 'algo_output/reconstr_per.txt'
    reconstr_rev_per = sample_output_name + 'algo_output/reconstr_rev_per.txt'
    reconstr_log = sample_output_name + 'algo_output/reconstr_log.txt'
    reconstr_rev_log = sample_output_name + 'algo_output/reconstr_rev_log.txt'
    all_seq_per = sample_output_name + 'algo_output/all_seq_per.txt'
    self_repeats = sample_output_name + 'algo_output/self_repeats.txt'
    if not blat:
        run_cmd('mummer -maxmatch -l 80 ' + reconstr + ' ' + curr_ref + ' > ' +
                reconstr_per)
        tester.analyzer(reconstr_per, reconstr_log, exp_file, N)
        run_cmd('mummer -maxmatch -l 80 ' + curr_ref + ' ' + reconstr + ' > ' +
                reconstr_rev_per)
        tester.reverse_analyzer(reconstr_rev_per, reconstr_rev_log, reconstr,
                                N)
    else:
        run_cmd(python_path + ' ' + shannon_dir + 'parallel_blat_python.py ' +
                reconstr + ' ' + curr_ref + ' ' + reconstr_per)
        tester.analyzer_blat_noExp(reconstr_per, reconstr_log, exp_file, N)
    if false_positive:
        tester.false_positive(reconstr, reconstr_per, reconstr_rev_log)

if run_trinity or compare_trinity:
    curr_ref = sample_output_name + 'algo_output/reference.fasta'
    run_cmd('mkdir ' + sample_output_name + 'algo_output/Trinity/')
    trinity_per = sample_output_name + 'algo_output/Trinity/trinity_per.txt'