def returnDeletions(snp_pos, snp_af, phase, deletions):
    """Build one 'DEL' record for every deletion that qualifies for a SNP.

    A deletion qualifies when its distance to the SNP (as reported by
    distanceSNP_Deletion) is at most DIST_THRESHOLD and its allele
    frequency lies in [1 - MAJOR_AF_THRESHOLD, MAJOR_AF_THRESHOLD].

    Arguments:
    snp_pos   -- position of the SNP, forwarded to distanceSNP_Deletion
    snp_af    -- allele frequency of the SNP, forwarded to gamma.returnGamma
    phase     -- phase data of the SNP, compared against each deletion's
                 'phase' entry
    deletions -- iterable of dicts providing the keys 'pos', 'length',
                 'af' and 'phase'

    Returns a list of rows shaped as
    ['DEL', pos, length, af, distance, R, g, g_n, a, b, c, e]
    where R is the first value returned by pearsonr, (g, g_n) come from
    gamma.returnGamma and (a, b, c, e) from vcf.return2x2Table.
    """
    records = []
    for deletion in deletions:
        distance = distanceSNP_Deletion(snp_pos, deletion['pos'], deletion['length'])

        # Guard 1: the deletion has to be close enough to the SNP.
        if not (distance <= DIST_THRESHOLD):
            continue
        # Guard 2: the deletion's allele frequency has to fall inside the
        # band bounded by the major-AF threshold and its complement.
        if not (1 - MAJOR_AF_THRESHOLD <= deletion['af'] <= MAJOR_AF_THRESHOLD):
            continue

        del_phase = deletion['phase']
        # Only the correlation coefficient is reported; the second value
        # returned by pearsonr is not used.
        correlation, _ = pearsonr(phase, del_phase)
        gamma_value, gamma_norm = gamma.returnGamma(phase, del_phase, snp_af, deletion['af'])
        cell_a, cell_b, cell_c, cell_e = vcf.return2x2Table(phase, del_phase)

        records.append([
            'DEL',
            deletion['pos'],
            deletion['length'],
            deletion['af'],
            distance,
            correlation,
            gamma_value,
            gamma_norm,
            cell_a,
            cell_b,
            cell_c,
            cell_e,
        ])
    return records
def preprocessData(vcf_filename, chromosome, individuals=('parents', 'children')):
    """Pair every SNP in a phased VCF file with a deletion and write the result.

    The function collects hit SNPs, genes, individual indices and
    deletions through gatherData, then walks the variant lines of the VCF
    file. Every variant whose fourth column is a single character is
    treated as a SNP (original note: "variant is a SNP and no deletion").
    For each such SNP a deletion is looked up with returnSnpDeletionPair;
    SNPs for which no deletion is returned are counted as discarded.

    Arguments:
    vcf_filename -- path of the VCF file, passed to gatherData and
                    vcf.openVCFFile
    chromosome   -- chromosome identifier; stored as the first field of
                    every output row and passed to writeToOutputFile
    individuals  -- forwarded unchanged to gatherData

    Each output row is
    [chromosome, hit flag (1 = hit SNP, 0 = non-hit SNP), snp_pos,
     snp_type, snp_af, deletion pos, deletion length, deletion af, max_r,
     p, dist_snp_del, dist_tss, min_dist_snp_del, g, g_n, a, b, c, e].

    Returns None. The rows are handed to writeToOutputFile and a summary
    of the hit / discarded / non-hit counts is printed to stdout.
    """
    [hit_snps_positions, hit_snps, discarded_snps_positions, genes,
     indices_individuals, deletions] = gatherData(vcf_filename, chromosome, individuals)

    # These collections are only used for membership tests, once per
    # variant line; sets keep that lookup constant-time.
    # NOTE(review): assumes the positions are hashable (ints) -- confirm
    # against gatherData.
    hit_positions = set(hit_snps_positions)
    discarded_positions = set(discarded_snps_positions)

    output = []  # results are stored here and later written to file
    n_hit_snps, n_discarded_snps, n_nonhit_snps = 0, 0, 0

    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        # Iterate lazily instead of readlines() so the whole file is never
        # held in memory at once.
        for variant in vcf_file:
            data = variant.split()
            if not data:
                continue  # blank line: nothing to parse
            if len(data[3]) != 1:
                continue  # not a single-character variant, i.e. not a SNP

            snp_pos = int(data[1])
            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)

            if snp_pos in hit_positions:
                # Hit SNP: phase and allele frequency come from the hit data.
                is_hit = 1
                [phase, snp_af] = processHitSNP(snp_pos, data, hit_snps, indices_individuals)
            elif snp_pos not in discarded_positions:
                # Non-hit SNP: derive phase and allele frequency from the line.
                is_hit = 0
                phase = vcf.returnPhase(data, indices_individuals)
                snp_af = vcf.determineAlleleFrequency(phase)
            else:
                continue  # position was discarded earlier

            [max_r, p, d, dist_snp_del, min_dist_snp_del] = returnSnpDeletionPair(snp_pos, phase, deletions)
            if d is None:
                # No deletion could be paired with this SNP.
                n_discarded_snps += 1
                discarded_positions.add(snp_pos)
                # The original list is still extended so that any other
                # holder of that list observes the same mutation as before.
                discarded_snps_positions.append(snp_pos)
                continue

            [g, g_n] = gamma.returnGamma(phase, d['phase'], snp_af, d['af'])
            [a, b, c, e] = vcf.return2x2Table(phase, d['phase'])
            if is_hit:
                n_hit_snps += 1
            else:
                n_nonhit_snps += 1
            output.append([chromosome, is_hit, snp_pos, snp_type, snp_af,
                           d['pos'], d['length'], d['af'], max_r, p,
                           dist_snp_del, dist_tss, min_dist_snp_del,
                           g, g_n, a, b, c, e])
    finally:
        # Close the handle even when one of the per-variant helpers raises.
        vcf_file.close()

    writeToOutputFile(output, chromosome)

    # A single pre-formatted string emits exactly what the former
    # comma-separated print statement produced (including the separating
    # spaces it inserted) and is valid in both Python 2 and Python 3.
    print('# hit SNPs:  %s \t#discarded SNPs: %s \t# non hit SNPs:  %s'
          % (n_hit_snps, n_discarded_snps, n_nonhit_snps))