Ejemplo n.º 1
0
# Orientation pair (first component, second component) ->
#   (label when the first component's k-mer starts before the second's,
#    label when the second component's k-mer starts before the first's,
#    whether the extracted span is reverse-complemented).
_ORIENTATION_LABELS = {
    (1, 1): ('1+2+', '2+1+', False),
    (-1, -1): ('2+1+', '1+2+', True),
    (1, -1): ('1+2-', '2-1+', False),
    (-1, 1): ('2-1+', '1+2-', True),
}


def _classify_site_pair(first_comp, second_comp, f_index, f_orient,
                        r_index, r_orient, read_seq, l_kmer):
    """Name the arrangement of two k-mer hits and extract the span covering both.

    Args:
        first_comp: Name of the first component (used in the category name).
        second_comp: Name of the second component.
        f_index: Start index of the first component's best k-mer in read_seq.
        f_orient: Orientation of that k-mer; only 1 and -1 are recognised.
        r_index: Start index of the second component's best k-mer.
        r_orient: Orientation of that k-mer; only 1 and -1 are recognised.
        read_seq: The read the indices refer to (sliceable sequence).
        l_kmer: Length of each k-mer.

    Returns:
        A ``(category_name, motif)`` tuple, or ``None`` when the orientation
        pair is not one of the four (+/-1, +/-1) combinations.
        ``category_name`` has the form
        ``<first>::<second>_<arrangement>_<spacing>``.

    The caller is expected to have rejected reads where both indices are
    equal; that case is not meaningful here.
    """
    entry = _ORIENTATION_LABELS.get((f_orient, r_orient))
    if entry is None:
        return None
    label_first_leads, label_second_leads, on_reverse = entry

    left, right = sorted((f_index, r_index))
    # Gap between the end of the left k-mer and the start of the right one;
    # negative when the two k-mers overlap.
    spacing = right - left - l_kmer
    motif = read_seq[left:right + l_kmer]
    if on_reverse:
        motif = reverse_comp(motif)

    label = label_first_leads if f_index < r_index else label_second_leads
    category_name = (first_comp + '::' + second_comp + '_' + label
                     + '_' + str(spacing))
    return category_name, motif


def parse_pfm_dict(fasta_exprun, fasta_dir):
    """Collect motifs spanning both components' best k-mers, grouped by arrangement.

    Reads the JASPAR database ("pfm_vertebrates.txt"), the experiment summary
    ("ERP008935_info.csv") and the FASTA file ``fasta_dir + fasta_exprun +
    '.fa'``. For every read, the best-scoring 6-mer for each of the two
    components is located; when the two hits are close enough, the span
    covering both is passed to ``pfm_dict_writer`` under a category name
    encoding component order, orientation and spacing.

    Args:
        fasta_exprun: Experiment run identifier; also the FASTA file stem.
        fasta_dir: Directory prefix of the FASTA file (concatenated as-is, so
            it must end with a path separator).

    Returns:
        The accumulator produced by successive ``pfm_dict_writer`` calls
        (starts as an empty dict).

    Raises:
        SystemExit: When ``find_component`` returns an empty matrix for either
            component.
    """
    l_kmer = 6
    max_gap = 5      # largest tolerated start-to-start distance beyond l_kmer
    min_offset = 3   # hits starting closer than this are discarded

    with open("pfm_vertebrates.txt") as handle_database:
        database_content = handle_database.read()

    with open("ERP008935_info.csv") as handle_expsum:
        expsum_content = handle_expsum.read()

    # NOTE(review): assumes fasta_parser fully consumes the handle before
    # returning (the result is used with len() and indexing below) - confirm.
    with open(fasta_dir + fasta_exprun + '.fa') as handle_fasta:
        fasta_dict = extract_motif.fasta_parser(handle_fasta)

    first_comp, second_comp, pfm_comp1, pfm_comp2 = find_component(
        fasta_exprun, database_content, expsum_content)

    # Explicit len() checks: the matrices may be array-like, where plain
    # truthiness could be ambiguous.
    if len(pfm_comp1) > 0 and len(pfm_comp2) > 0:
        print(first_comp, second_comp)
    else:
        # Same exception the site-provided exit() raised, without depending
        # on the interactive-only helper.
        raise SystemExit

    new_pfm_dict = dict()
    ref_pwm_short = extract_motif.tf_proc(
        database_content, [first_comp, second_comp], l_kmer)

    print(len(fasta_dict))
    for count_run, read_nmer in enumerate(fasta_dict, start=1):
        if count_run % 10000 == 0:
            print(count_run)  # progress indicator

        read_seq = fasta_dict[read_nmer]
        f_index, f_orient, _, _ = find_best_kmer(
            ref_pwm_short[first_comp], read_seq, l_kmer)
        r_index, r_orient, _, _ = find_best_kmer(
            ref_pwm_short[second_comp], read_seq, l_kmer)

        offset = abs(f_index - r_index)
        if offset > max_gap + l_kmer or offset < min_offset:
            continue

        classified = _classify_site_pair(
            first_comp, second_comp, f_index, f_orient,
            r_index, r_orient, read_seq, l_kmer)
        if classified is None:
            continue

        # Record the motif for every orientation combination. Previously this
        # call was nested under the (-1, +1) branch only, so the categories
        # computed for the other three combinations were silently dropped.
        category_name, motif = classified
        new_pfm_dict = pfm_dict_writer(new_pfm_dict, category_name, motif)

    print(new_pfm_dict)
    return new_pfm_dict
Ejemplo n.º 2
0
    return pfm


# Command-line arguments: experiment run identifier and the directory prefix
# holding the downloaded FASTA files (concatenated as-is with file names).
fasta_exprun = sys.argv[1]
fasta_dir = sys.argv[2]
l_kmer = 6
JASPAR_database = "pfm_vertebrates.txt"
with open(JASPAR_database) as handle_database:
    database_content = handle_database.read()

exp_summary = "ERP008935_info.csv"
with open(exp_summary) as handle_expsum:
    expsum_content = handle_expsum.read()

# Download based on the existing FASTA: find every run in the experiment
# summary that shares this run's component pair.
first_comp, second_comp, pfm_comp1, pfm_comp2 = find_component(
    fasta_exprun, database_content, expsum_content)
# Matches summary lines such as "ERR<digits>...<first>_<second>". The
# component names are escaped so they are matched literally.
exp_entries = re.findall(
    'ERR[0-9]*?.*' + re.escape(first_comp) + '_' + re.escape(second_comp),
    expsum_content)
for exp_entry in exp_entries:
    temp_exp = re.findall('ERR[0-9]*', exp_entry)[0]
    # Skip runs that already have a file under Motif/.
    if os.path.isfile('Motif/' + temp_exp + '.fa'):
        continue
    else:
        # Fetch the original SELEX data only when it is not already present.
        # The script's exit status is not checked (best-effort download).
        if not os.path.isfile(fasta_dir + temp_exp + '.fa'):
            call(['bash', 'sra_dl_single.sh', temp_exp])
        print(temp_exp)