Esempio n. 1
0
def genotype_missing(remPercent):
	"""Delete a random share of genotype positions, then rerun the hifi pipeline.

	Loads ``genotype_ori.txt``, removes up to ``int(remPercent * total)``
	randomly drawn positions, writes the result to ``genotype.txt`` and copies
	it to ``genotype_<remPercent>.txt``. Afterwards it calls hifi_run,
	seed_correction (mode "filterbyrefid") and hifiAccuCheck for "chr9".

	Args:
		remPercent: multiplied by the number of loaded positions to get the
			number of deletions, so presumably a fraction in [0, 1]
			(TODO confirm against callers). Values <= 0 delete nothing.
	"""
	import shutil

	ori_genotype_file_name = "genotype_ori.txt"
	geno_title_info, geno_dict = load_raw_data(ori_genotype_file_name)
	ori_geno_total = len(geno_dict)
	print("ori_geno_total " + str(ori_geno_total))

	if remPercent > 0:
		print("remPercent is " + str(remPercent))
		# Sort once: every removal below deletes the same entry from both the
		# dict and the sorted list, so the list never needs to be rebuilt.
		geno_sorted_list = sort_dict_by_key(geno_dict)
		for _ in range(int(remPercent * ori_geno_total)):
			if len(geno_sorted_list) <= 1:
				# Nothing more can be drawn; the list never grows again.
				break
			# NOTE(review): randrange excludes its upper bound, so the entry
			# currently last in the sorted list is never drawn. Kept as-is;
			# presumably the last position has to survive - TODO confirm.
			random_index = random.randrange(0, len(geno_sorted_list) - 1)
			position = geno_sorted_list[random_index][0]
			if position in geno_dict:
				# Delete with the same key that was just tested.
				del geno_dict[position]
				del geno_sorted_list[random_index]

	print("new geno total " + str(len(geno_dict)))
	output_files("genotype.txt", geno_title_info, geno_dict)
	# Keep a per-run copy without spawning a shell.
	shutil.copyfile("genotype.txt", "genotype_" + str(remPercent) + ".txt")

	chr_name = "chr9"
	file_name = "haplotype.txt"
	hifi_run(file_name, chr_name)
	seed_correction(file_name, chr_name, "filterbyrefid")
	hifiAccuCheck("imputed_" + file_name, chr_name)
	print("filterbyrefid accuracy")
	hifiAccuCheck("non_one.txt", chr_name)
Esempio n. 2
0
	def run(self):
		"""Load the inputs, impute per maf_dict key, then write and check results.

		Steps, in order:
		1. load_seed_geno_ref, merge_pos, hg_merge and get_maf prepare the
		   instance state (presumably get_maf fills self.maf_dict - TODO confirm).
		2. combine_msg is called once with the hard-coded key 186.
		3. The remaining non-negative keys of self.maf_dict are visited from
		   largest to smallest; for each one its positions are appended to
		   self.pos_to_impute and to_impute_window runs for 'X' and then 'N'.
		4. self.h_xn_imputed_dict and self.h_xn_dict are written to
		   "h_xn_new.txt" / "h_xn_1.txt" and passed to the comparison helpers.
		"""
		self.load_seed_geno_ref()
		self.merge_pos()
		self.hg_merge()
		self.get_maf()

		# NOTE(review): 186 is handled up front by combine_msg and skipped in
		# the loop below; what the value stands for is not visible here.
		initial_maf_num = 186

		# Largest key first. sorted() also works when keys() is not a list.
		maf_num_list = sorted(self.maf_dict.keys(), reverse=True)
		print(maf_num_list)

		self.combine_msg(initial_maf_num)
		for maf_num in maf_num_list:
			if maf_num < 0 or maf_num == initial_maf_num:
				continue
			print("current maf_num: " + str(maf_num))
			self.pos_to_impute.extend(self.maf_dict[maf_num])
			pre_imputed_size = len(self.h_xn_imputed_dict)
			self.to_impute_window('X')
			self.to_impute_window('N')
			newly_imputed = len(self.h_xn_imputed_dict) - pre_imputed_size
			print("newly_imputed_size " + str(newly_imputed))

		self.output_dict("h_xn_new.txt", self.h_xn_imputed_dict)
		seed_std_compare("h_xn_new.txt", self.chr_name)

		self.output_dict("h_xn_1.txt", self.h_xn_dict)
		seed_std_compare("h_xn_1.txt", self.chr_name)
		hifiAccuCheck("h_xn_1.txt", self.chr_name)
Esempio n. 3
0
def _select_std_seeds(seed_hetero_list, seed_number, add_range):
	"""Pick ``seed_number - 1`` entries of *seed_hetero_list* as seeds.

	Args:
		seed_hetero_list: sequence of entries where ``entry[0]`` is the key and
			``entry[1]`` the seed object. Only "random" mode reads the seed's
			``allele_new`` attribute.
		seed_number: total seed count wanted by the caller; one slot is left
			for the seed the caller adds itself.
		add_range: where to take the seeds from:
			"begining" (the spelling "beginning" is accepted too) - the first entries;
			"middle" - entries starting at index ``len // 2``;
			"end" - entries from the back, skipping the very last one;
			"random" - uniformly drawn entries whose ``allele_new`` is neither
			"N" nor "X"; the last entry is never drawn.
			Any other value selects nothing.

	Returns:
		dict mapping ``entry[0]`` to ``entry[1]`` for the chosen entries.

	Raises:
		IndexError: the list is too short for "begining"/"middle"/"end".
		ValueError: "random" mode cannot supply enough distinct seeds.
	"""
	selected = {}
	wanted = seed_number - 1

	if add_range in ("begining", "beginning"):
		for i in range(wanted):
			entry = seed_hetero_list[i]
			selected[entry[0]] = entry[1]
	elif add_range == "middle":
		# Floor division keeps the index an int on every Python version.
		middle_point = len(seed_hetero_list) // 2
		for i in range(wanted):
			entry = seed_hetero_list[middle_point + i]
			selected[entry[0]] = entry[1]
	elif add_range == "end":
		from_back = list(reversed(seed_hetero_list))
		# Index 0 (the last entry of the list) is deliberately left out.
		for i in range(1, seed_number):
			entry = from_back[i]
			selected[entry[0]] = entry[1]
	elif add_range == "random":
		# randrange excludes its upper bound, so the last entry is never drawn.
		upper = len(seed_hetero_list) - 1
		drawable = 0
		for entry in seed_hetero_list[:max(upper, 0)]:
			if entry[1].allele_new not in ("N", "X"):
				drawable += 1
		if wanted > drawable:
			# The rejection loop below would otherwise never terminate.
			raise ValueError(
				"cannot draw %d random seeds: only %d usable entries"
				% (wanted, drawable))
		while len(selected) < wanted:
			entry = seed_hetero_list[random.randrange(0, upper)]
			if entry[0] in selected or entry[1].allele_new in ("N", "X"):
				continue
			selected[entry[0]] = entry[1]

	return selected


def generate_std_seed_run(seed_number, add_range):
	"""Build a seed file from the standard haplotype and run hifi on it.

	Loads the chr9 standard haplotype and genotype files below ``file_path``,
	groups the seeds with group_seed, selects ``seed_number - 1`` hetero seeds
	according to *add_range* and adds the last SNP of the standard haplotype.
	The selection is written to "haplotype_std.txt" and passed through
	seed_std_compare, refMerger, hifi_run and hifiAccuCheck.

	Args:
		seed_number: number of seeds to write, counting the always-added last SNP.
		add_range: "begining" (or "beginning"), "middle", "end" or "random";
			any other value leaves only the last SNP in the seed file.

	Raises:
		IndexError: too few hetero seeds for "begining"/"middle"/"end".
		ValueError: too few usable hetero seeds for "random".
	"""
	chr_name = "chr9"
	hap_std_dict = load_seed_data(file_path + "ASW_" + chr_name + "_child_hap_refed.txt")[1]
	hap_std_list = sort_dict_by_key(hap_std_dict)

	genotype_file = file_path + "genotype_NA10847_" + chr_name + ".txt"
	geno_dict = load_raw_data(genotype_file)[1]
	_, seed_hetero_dict = group_seed(hap_std_dict, geno_dict)
	seed_hetero_list = sort_dict_by_key(seed_hetero_dict)

	selected_seed_dict = _select_std_seeds(seed_hetero_list, seed_number, add_range)

	# always add the last snp into seed, hifi requirement
	selected_seed_dict[hap_std_list[-1][0]] = hap_std_list[-1][1]

	file_name = "haplotype_std.txt"
	output_revised_seed(file_name, selected_seed_dict)
	seed_std_compare(file_name, chr_name)
	refMerger(file_name, chr_name, 0)
	file_name = "haplotype.txt"
	hifi_run(file_name, chr_name)
	hifiAccuCheck("imputed_" + file_name, chr_name)
Esempio n. 4
0
def hifi_revise(seed_input_file, chr_name):
	"""Run hifi_test on a seed file, check the imputed output, print the time.

	Args:
		seed_input_file: seed file name given to hifi_test; the file checked
			afterwards is this name prefixed with "imputed_".
		chr_name: chromosome name forwarded to hifiAccuCheck.
	"""
	began_at = time.time()

	hifi_test(seed_input_file)
	imputed_file = "imputed_" + seed_input_file
	hifiAccuCheck(imputed_file, chr_name)

	seconds_taken = time.time() - began_at
	# NOTE(review): the label names to_impute_window, but the interval measured
	# here covers hifi_test plus hifiAccuCheck.
	print("***********************to_impute_window time is: " + format(seconds_taken, "0.3f") + "s")
Esempio n. 5
0
def hifi_revise(seed_input_file, chr_name):
    """Time one hifi_test run followed by its accuracy check.

    Args:
        seed_input_file: seed file name passed to hifi_test; hifiAccuCheck is
            then given "imputed_" + this name.
        chr_name: chromosome name passed to hifiAccuCheck.
    """
    # NOTE(review): the label mentions to_impute_window although the timed
    # section is hifi_test followed by hifiAccuCheck.
    timing_label = "***********************to_impute_window time is: "

    t_start = time.time()
    hifi_test(seed_input_file)
    hifiAccuCheck("imputed_" + seed_input_file, chr_name)
    t_spent = time.time() - t_start

    print(timing_label + format(t_spent, "0.3f") + "s")
Esempio n. 6
0
def depth_cutoff():
	"""Run seed calling and imputation checks for sample 11 at depths 0, 1, 2.

	Changes the working directory to the sample's ``prem_rmsk_indel`` folder
	below ``solid_path``. For each depth threshold it then calls snpPick on
	the sample's SAM file, seed_std_compare on the called seed file, refMerger
	on the combined seed file, hifi_test on "haplotype.txt" and hifiAccuCheck
	on the imputed haplotype file.

	Side effects: os.chdir into the data folder, and whatever the called
	helpers write.
	"""
	# Only this one sample is processed.
	sample_id = 11

	data_path = solid_path + "song_" + str(sample_id) + "/prem_rmsk_indel/"
	os.chdir(data_path)
	os.system("pwd")

	# These depend on the sample only, so build them once outside the loop.
	chr_name = solid_chr[sample_id]
	sam_file = "song_" + str(sample_id) + "_prem_" + chr_name + "_sorted_rmsk_indel.sam"
	sam_file_name = "song_" + str(sample_id) + "_prem_" + chr_name + "_sorted_rmsk_indel_"
	print("**************** song_" + str(sample_id))
	print("sam_file " + sam_file)

	for depth_threshold in range(0, 3):
		snpPick(data_path + sam_file, depth_threshold, chr_name)
		seed_prefix = data_path + sam_file_name + str(depth_threshold)
		seed_std_compare(seed_prefix + "_called_seed.txt", chr_name)
		refMerger(seed_prefix + "_combined_seed.txt", chr_name, 0)
		hifi_test("haplotype.txt")
		hifiAccuCheck(data_path + "imputed_haplotype.txt", chr_name)
Esempio n. 7
0
def simulation_run():
	"""Run hifiAccuCheck for every depth / error-rate combination.

	For each depth in ``dept_list`` and each rate in ``error_rate_list`` the
	file ``imputedhaplotype.txt`` inside
	``<simulation_path><depth>x/NA12878_hg18ch6_A_<depth>x_<rate>er/`` is
	passed to hifiAccuCheck together with the module-level ``chr_name``.
	The imputed files are only read here; nothing in this function creates
	them.
	"""
	for depth in dept_list:
		depth_dir = simulation_path + depth + 'x/'
		print(" ".join(("depth_path", depth_dir)))
		for error_rate in error_rate_list:
			run_dir = "NA12878_hg18ch6_A_" + depth + "x_" + error_rate + "er/"
			imputed_file = depth_dir + run_dir + "imputedhaplotype.txt"
			hifiAccuCheck(imputed_file, chr_name)