Ejemplo n.º 1
0
def returnDeletions(snp_pos, snp_af, phase, deletions):
	"""Build one statistics row for every deletion that qualifies for a SNP.

	A deletion qualifies when its distance to the SNP, as computed by
	distanceSNP_Deletion, is at most DIST_THRESHOLD and its allele frequency
	lies within [1 - MAJOR_AF_THRESHOLD, MAJOR_AF_THRESHOLD].

	Parameters:
		snp_pos   -- position of the SNP
		snp_af    -- allele frequency of the SNP (forwarded to gamma.returnGamma)
		phase     -- phase data of the SNP
		deletions -- iterable of mappings with the keys 'pos', 'length', 'af'
		             and 'phase'

	Returns a list of rows laid out as
	['DEL', pos, length, af, distance, R, g, g_n, a, b, c, e]; the list is
	empty when no deletion qualifies.
	"""
	rows = []
	for deletion in deletions:
		distance = distanceSNP_Deletion(snp_pos, deletion['pos'], deletion['length'])
		if not (distance <= DIST_THRESHOLD):
			continue # too far away from the SNP
		if not (1 - MAJOR_AF_THRESHOLD <= deletion['af'] <= MAJOR_AF_THRESHOLD):
			continue # allele frequency outside the accepted band

		del_phase = deletion['phase']
		# Only the first returned value ends up in the row. NOTE(review): if this
		# is scipy.stats.pearsonr the pair is (coefficient, p-value) -- confirm.
		correlation, _p_value = pearsonr(phase, del_phase)
		g_value, g_n_value = gamma.returnGamma(phase, del_phase, snp_af, deletion['af'])
		cell_a, cell_b, cell_c, cell_e = vcf.return2x2Table(phase, del_phase)

		rows.append([
			'DEL', deletion['pos'], deletion['length'], deletion['af'], distance,
			correlation, g_value, g_n_value, cell_a, cell_b, cell_c, cell_e,
		])
	return rows
Ejemplo n.º 2
0
def preprocessData (vcf_filename, chromosome, individuals=('parents', 'children')):
	[hit_snps_positions, hit_snps, discarded_snps_positions, genes, indices_individuals, deletions] = gatherData(vcf_filename, chromosome, individuals)	
	
	vcf_file = vcf.openVCFFile(vcf_filename)
	vcf.discardVCFHeaders(vcf_file)
	vcf_file.readline() # discard the column description header

	output = [] # results are stored here and later written to file
	n_hit_snps, n_discarded_snps, n_nonhit_snps = 0, 0, 0 

	for variant in vcf_file.readlines (): # move through the phased variants in this file
		data = [value for value in variant.split()]
		if len(data[3]) == 1: # variant is a SNP and no deletion
			snp_pos = int(data[1])
			snp_type = hg19.determineType(snp_pos, genes)
			dist_tss = hg19.distanceToTSS(snp_pos, genes)
			if snp_pos in hit_snps_positions: # SNP is a hit SNP
				[phase, snp_af] = processHitSNP (snp_pos, data, hit_snps, indices_individuals)
				[max_r, p, d, dist_snp_del, min_dist_snp_del] = returnSnpDeletionPair (snp_pos, phase, deletions)	
				if d is not None:
					[g,g_n] = gamma.returnGamma(phase, d['phase'], snp_af, d['af'])
					[a,b,c,e] = vcf.return2x2Table(phase, d['phase']) 
					n_hit_snps = n_hit_snps + 1
					output.append([chromosome, 1, snp_pos, snp_type, snp_af, d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss, min_dist_snp_del, g, g_n, a, b, c, e])		
				else:
					n_discarded_snps = n_discarded_snps + 1
					discarded_snps_positions.append(snp_pos) 
			elif snp_pos not in discarded_snps_positions: # non-hit SNP 
				phase 	= vcf.returnPhase(data, indices_individuals) 
				snp_af 	= vcf.determineAlleleFrequency (phase)
				[max_r, p, d, dist_snp_del, min_dist_snp_del] = returnSnpDeletionPair (snp_pos, phase, deletions)	
				if d is not None:
					[g,g_n] = gamma.returnGamma(phase, d['phase'], snp_af, d['af'])
					[a,b,c,e] = vcf.return2x2Table(phase, d['phase']) 
					n_nonhit_snps = n_nonhit_snps + 1
					output.append([chromosome, 0, snp_pos, snp_type, snp_af, d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss, min_dist_snp_del, g, g_n, a, b, c, e])		
				else:
					n_discarded_snps = n_discarded_snps + 1
					discarded_snps_positions.append(snp_pos)  
	writeToOutputFile(output, chromosome)
	vcf_file.close()	
	print '# hit SNPs: ', n_hit_snps, '\t#discarded SNPs:', n_discarded_snps, '\t# non hit SNPs: ', n_nonhit_snps