def obtainSignificantHitSNPs(chromosomes, individuals=('parents')):
    """Pool the hit SNPs of all chromosomes, filter them and write the result.

    For each chromosome the hit SNP data is read, the deletions of the
    selected individuals are retrieved, and both are combined through
    collectAllData. The pooled list is then passed through fdr (level 0.05)
    and materialSignificanceTest, and finally handed to writeToOutputFile.
    The number of SNPs remaining after each step is printed.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        individuals: selection of individuals; forwarded unchanged to
            vcf.returnColumns, collectAllData and writeToOutputFile.

    Returns:
        None. The surviving SNPs are passed to writeToOutputFile.
    """
    # NOTE(review): the default ('parents') is the plain string 'parents',
    # not a 1-tuple (that would be ('parents',)). processSNPs uses a real
    # tuple as its default - confirm which form vcf.returnColumns expects.

    hit_snps = []  # hit SNPs pooled over all chromosomes

    for chromosome in chromosomes:
        # Single-argument print(...) yields the same text as the former
        # Python 2 statement "print label, value" and also parses on Python 3.
        print('%s %s' % ('Processing chromosome ', chromosome))

        # obtain the SNP data; only the hit allele info is needed here,
        # the positions returned alongside it are not used
        _, allele_info = readHitSNPData(chromosome)

        # name of the vcf file for this chromosome
        vcf_filename = returnVCFFileName(chromosome)

        # retrieve indices of the individuals
        indices_individuals = vcf.returnColumns(vcf_filename, individuals)
        n_individuals = len(indices_individuals)

        # retrieve the deletions on the chromosome
        # NOTE(review): processSNPs and gatherData pass MAJOR_AF_THRESHOLD in
        # this position; confirm whether .96 should be that constant.
        deletions = vcf.returnDeletions(chromosome, indices_individuals, .96)

        # retrieve the already processed hit snp data, incorporate the
        # relevant info and add this chromosome's SNPs to the pooled list
        hit_snps.extend(collectAllData(chromosome, individuals, deletions,
                                       allele_info, n_individuals))

    print('%s %s' % ('Before FDR: ', len(hit_snps)))

    # apply fdr
    hit_snps = fdr(hit_snps, 0.05)
    print('%s %s' % ('After FDR: ', len(hit_snps)))

    # apply material significance level (r^2 >= .8, per the original note)
    hit_snps = materialSignificanceTest(hit_snps)
    print('%s %s' % ('After Testing for material significance (R^2 >= .8): ',
                     len(hit_snps)))

    writeToOutputFile(hit_snps, individuals)
def _iterRemainingLines(file_obj):
    """Yield the remaining lines of file_obj one at a time using readline()."""
    while True:
        line = file_obj.readline()
        if not line:  # readline() returns an empty value at end of file
            return
        yield line


def processSNPs(vcf_filename, chromosome, individuals=('parents', 'children')):
    """Scan a VCF file and write out every SNP that passes the frequency filter.

    Each variant line whose fourth column has length one is treated as a SNP.
    Hit SNPs (position listed by pr.readHitSNPs) are processed through
    pr.processHitSNP; other SNPs that are not in the discarded list get their
    phase and allele frequency from the vcf module. A SNP is written via
    writeSNPToOutputFile only if its allele frequency lies within
    [1 - MAJOR_AF_THRESHOLD, MAJOR_AF_THRESHOLD]; the result of
    returnDeletions is appended to the record when it is non-empty.

    Args:
        vcf_filename: name of the VCF file to read.
        chromosome: chromosome identifier, used to load the hit SNPs, the
            genes and the deletions, and passed on to the output writer.
        individuals: selection of individuals forwarded to vcf.returnColumns.

    Returns:
        None. Results are emitted through writeSNPToOutputFile.
    """
    # gather the required data
    [hit_snps_positions, hit_snps, discarded_snps_positions] = pr.readHitSNPs(chromosome)
    genes = hg19.read(chromosome)
    indices_individuals = vcf.returnColumns(vcf_filename, individuals)
    deletions = vcf.returnDeletions(chromosome, indices_individuals, MAJOR_AF_THRESHOLD)

    # open file; try/finally guarantees it is closed even if a line fails
    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        # Stream the variants line by line instead of loading the whole file
        # with readlines().
        for variant in _iterRemainingLines(vcf_file):
            data = variant.split()
            if not data:
                continue  # blank line: nothing to parse
            if len(data[3]) != 1:
                continue  # per the original note: not a SNP (e.g. a deletion)

            snp_pos = int(data[1])
            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)

            if snp_pos in hit_snps_positions:  # SNP is a hit SNP
                label = 'HITSNP'
                [phase, snp_af] = pr.processHitSNP(snp_pos, data, hit_snps, indices_individuals)
            elif snp_pos not in discarded_snps_positions:
                label = 'NONHITSNP'
                phase = vcf.returnPhase(data, indices_individuals)
                snp_af = vcf.determineAlleleFrequency(phase)
            else:
                continue  # discarded SNP: no output

            # keep only SNPs whose allele frequency is inside the threshold band
            if not (1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD):
                continue

            snp_output = [[label, snp_pos, snp_af, snp_type, dist_tss]]
            dels = returnDeletions(snp_pos, snp_af, phase, deletions)  # TODO implement
            if len(dels) > 0:
                snp_output.append(dels)
            writeSNPToOutputFile(snp_output, chromosome)
    finally:
        vcf_file.close()
def gatherData(vcf_filename, chromosome, individuals):
    """Load everything needed to process the SNPs of one chromosome.

    Args:
        vcf_filename: name of the VCF file, used to look up the columns of
            the selected individuals.
        chromosome: chromosome identifier passed to readHitSNPs, hg19.read
            and vcf.returnDeletions.
        individuals: selection of individuals forwarded to vcf.returnColumns.

    Returns:
        A list of six items, in this order: hit SNP positions, hit SNPs,
        discarded SNP positions, genes, indices of the individuals, and
        deletions.
    """
    # the three-part result of readHitSNPs is unpacked right away
    positions_of_hits, hits, positions_discarded = readHitSNPs(chromosome)

    gene_annotation = hg19.read(chromosome)

    # the individuals' column indices are needed to retrieve the deletions
    columns = vcf.returnColumns(vcf_filename, individuals)
    chromosome_deletions = vcf.returnDeletions(
        chromosome, columns, MAJOR_AF_THRESHOLD)

    return [
        positions_of_hits,
        hits,
        positions_discarded,
        gene_annotation,
        columns,
        chromosome_deletions,
    ]