def obtainSignificantHitSNPs (chromosomes, individuals=('parents')):
	"""Pool the hit SNPs of several chromosomes, filter them and write the result.

	For each chromosome the per-chromosome hit SNP records are obtained from
	collectAllData. The pooled records are passed through fdr (called with
	0.05) and then through materialSignificanceTest; whatever remains is
	handed to writeToOutputFile. The number of records is printed before and
	after each filtering step.

	Args:
		chromosomes: iterable of chromosome identifiers; every element is
			forwarded unchanged to the helper functions.
		individuals: selection of individuals, forwarded to
			vcf.returnColumns, collectAllData and writeToOutputFile.
			NOTE(review): the default ('parents') is the plain string
			'parents', not a 1-tuple (there is no trailing comma) --
			confirm that this is what the helpers expect.

	Returns:
		None. The result is passed to writeToOutputFile.
	"""
	hit_snps = [] # significant hit SNPs pooled over all chromosomes

	for chromosome in chromosomes:
		# Single-argument print(...) with explicit formatting emits the same
		# text on Python 2 and Python 3. The double space reproduces what the
		# former statement `print 'label ', value` wrote.
		print('Processing chromosome  %s' % (chromosome,))

		# obtain the SNP data; only the allele info is needed here, the
		# positions are unpacked but not used in this function
		[_unused_positions, allele_info] = readHitSNPData (chromosome)

		# name of the VCF file belonging to this chromosome
		vcf_filename = returnVCFFileName (chromosome)

		# retrieve indices of the individuals
		indices_individuals = vcf.returnColumns (vcf_filename, individuals)
		n_individuals = len(indices_individuals)

		# retrieve the deletions on the chromosome (.96 is the threshold
		# argument passed to vcf.returnDeletions)
		deletions = vcf.returnDeletions (chromosome, indices_individuals, .96)

		# retrieve the already processed hit snp data, incorporate the
		# relevant info and add it to the pooled list
		hit_snps.extend(collectAllData (chromosome, individuals, deletions, allele_info, n_individuals))

	print('Before FDR:  %d' % len(hit_snps))
	# apply fdr
	hit_snps = fdr(hit_snps, 0.05)
	print('After FDR:  %d' % len(hit_snps))

	# apply material significance level (r^2 >= .8)
	hit_snps = materialSignificanceTest (hit_snps)
	print('After Testing for material significance (R^2 >= .8):  %d' % len(hit_snps))
	writeToOutputFile (hit_snps, individuals)
# Example no. 2
# 0
def processSNPs (vcf_filename, chromosome, individuals=('parents', 'children')):
	"""Scan a VCF file and write every retained SNP of one chromosome to the output.

	Each variant line whose fourth column is a single character is treated
	as a SNP. A SNP listed in the hit SNP positions is labelled 'HITSNP' and
	processed with pr.processHitSNP; any other SNP that is not listed in the
	discarded positions is labelled 'NONHITSNP'. A SNP is written with
	writeSNPToOutputFile only when its allele frequency lies within
	[1 - MAJOR_AF_THRESHOLD, MAJOR_AF_THRESHOLD]. If returnDeletions yields
	a non-empty result for the SNP, that result is appended to the record.

	Args:
		vcf_filename: name of the VCF file, passed to vcf.returnColumns and
			vcf.openVCFFile.
		chromosome: chromosome identifier, forwarded to the data readers and
			to writeSNPToOutputFile.
		individuals: selection of individuals forwarded to
			vcf.returnColumns.

	Returns:
		None. Output is produced through writeSNPToOutputFile.
	"""
	# gather the required data
	[hit_snps_positions, hit_snps, discarded_snps_positions] = pr.readHitSNPs (chromosome)
	genes = hg19.read (chromosome)
	indices_individuals = vcf.returnColumns (vcf_filename, individuals)
	deletions = vcf.returnDeletions (chromosome, indices_individuals, MAJOR_AF_THRESHOLD)

	vcf_file = vcf.openVCFFile(vcf_filename)
	try:
		vcf.discardVCFHeaders(vcf_file)
		vcf_file.readline() # discard the column description header

		# Iterate lazily instead of readlines() so that the whole file does
		# not have to be held in memory at once.
		for variant in vcf_file: # move through the genetic variants
			data = variant.split()
			if not data: # blank line (e.g. at the end of the file)
				continue
			if len(data[3]) != 1: # not a SNP (or a deletion)
				continue

			snp_pos = int(data[1])
			snp_type = hg19.determineType(snp_pos, genes)
			dist_tss = hg19.distanceToTSS(snp_pos, genes)

			if snp_pos in hit_snps_positions: # SNP is a hit SNP
				label = 'HITSNP'
				[phase, snp_af] = pr.processHitSNP (snp_pos, data, hit_snps, indices_individuals)
			elif snp_pos not in discarded_snps_positions:
				label = 'NONHITSNP'
				phase = vcf.returnPhase(data, indices_individuals)
				snp_af = vcf.determineAlleleFrequency (phase)
			else: # discarded SNP: nothing to write
				continue

			# keep only SNPs whose allele frequency is inside the threshold band
			if not (1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD):
				continue

			snp_output = [[label, snp_pos, snp_af, snp_type, dist_tss]]
			dels = returnDeletions(snp_pos, snp_af, phase, deletions) # TODO implement
			if len(dels) > 0:
				snp_output.append(dels)
			writeSNPToOutputFile(snp_output, chromosome)
	finally:
		# close the handle even when a helper raises part-way through
		vcf_file.close()
# Example no. 3
# 0
def gatherData (vcf_filename, chromosome, individuals):
	"""Load everything needed to process the SNPs of one chromosome.

	Args:
		vcf_filename: name of the VCF file, passed to vcf.returnColumns.
		chromosome: chromosome identifier, passed to readHitSNPs, hg19.read
			and vcf.returnDeletions.
		individuals: selection of individuals, passed to vcf.returnColumns.

	Returns:
		A list of six items, in this order: the three values returned by
		readHitSNPs (hit SNP positions, hit SNPs, discarded SNP positions),
		the result of hg19.read, the column indices of the individuals, and
		the result of vcf.returnDeletions called with MAJOR_AF_THRESHOLD.
	"""
	# readHitSNPs must yield exactly three values
	snp_positions, snps, discarded_positions = readHitSNPs(chromosome)
	gene_data = hg19.read(chromosome)
	columns = vcf.returnColumns(vcf_filename, individuals)
	return [
		snp_positions,
		snps,
		discarded_positions,
		gene_data,
		columns,
		vcf.returnDeletions(chromosome, columns, MAJOR_AF_THRESHOLD),
	]