def processPhasedData (vcf_filename, chromosome):
    """Pair every frequency-filtered SNP of a chromosome with a deletion.

    The VCF file is scanned line by line.  A line whose fourth
    whitespace-separated field is a single character is treated as a SNP.
    Its phase is read for the columns reported by
    vcf.returnColumnsOfChildren, and the SNP is kept only when both its
    allele frequency and the complement of that frequency are at most
    MAJOR_AF_THRESHOLD.  For a kept SNP, maximumR selects a deletion `d`
    (or None); when a deletion is found one row is collected:

        [chromosome, type_snp, snp_pos, snp_type, snp_af,
         d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss]

    where type_snp is 1 if the position is listed as a hit SNP and 0
    otherwise.  All rows are handed to writeToOutputFile once the scan is
    complete.

    Arguments:
        vcf_filename -- path of the VCF file, passed on to the vcf helpers
        chromosome   -- chromosome identifier, passed on to readHitSNPs,
                        hg19.read, vcf.returnDeletions and writeToOutputFile
    """
    # Only the hit positions are needed here.  Indexing (instead of unpacking
    # a fixed number of values) keeps this working whether readHitSNPs returns
    # two values or the three that gatherData unpacks.  The set makes the
    # per-SNP membership test below a constant-time lookup.
    hit_snp_positions = frozenset(readHitSNPs(chromosome)[0])
    genes = hg19.read(chromosome)
    indices_children = vcf.returnColumnsOfChildren(vcf_filename)
    deletions = vcf.returnDeletions(chromosome, indices_children, MAJOR_AF_THRESHOLD)

    output = []  # output rows are collected here
    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        # Stream the file instead of readlines(): the whole VCF never has to
        # be held in memory at once.
        for variant in vcf_file:
            data = variant.split()
            if not data:
                continue  # blank line (e.g. trailing newline at end of file)
            if len(data[3]) != 1:
                continue  # not a SNP

            # Hit SNPs used to be handled by a dedicated branch; they now go
            # through the same path as every other SNP and are only flagged
            # through type_snp in the output row.
            phase = vcf.returnPhase(data, indices_children)
            snp_af = vcf.determineAlleleFrequency(phase)
            if not (snp_af <= MAJOR_AF_THRESHOLD and 1 - snp_af <= MAJOR_AF_THRESHOLD):
                continue

            snp_pos = int(data[1])
            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)
            [max_r, p, d] = maximumR(snp_pos, phase, deletions)
            if d is None:
                continue  # no deletion to report for this SNP

            type_snp = 1 if snp_pos in hit_snp_positions else 0
            dist_snp_del = distanceSNP_CNV(snp_pos, d['pos'], d['length'])
            output.append([chromosome, type_snp, snp_pos, snp_type, snp_af,
                           d['pos'], d['length'], d['af'],
                           max_r, p, dist_snp_del, dist_tss])
    finally:
        # Release the handle even when parsing or a helper raises.
        vcf_file.close()

    writeToOutputFile(output, chromosome)
def processSNPs (vcf_filename, chromosome, individuals=('parents', 'children')):
    """Scan a VCF file and write one output record per retained SNP.

    A line whose fourth whitespace-separated field is a single character is
    treated as a SNP.  Phase and allele frequency come from
    pr.processHitSNP when the position is a hit SNP, and from
    vcf.returnPhase / vcf.determineAlleleFrequency otherwise; positions
    listed as discarded (and not as hit SNPs) are skipped.  A SNP is retained
    when

        1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD

    For each retained SNP, writeSNPToOutputFile receives a list whose first
    element is

        ['HITSNP' | 'NONHITSNP', snp_pos, snp_af, snp_type, dist_tss]

    followed by the result of returnDeletions when that result is non-empty.

    Arguments:
        vcf_filename -- path of the VCF file, passed on to the vcf helpers
        chromosome   -- chromosome identifier, passed on to the data readers
                        and to writeSNPToOutputFile
        individuals  -- passed to vcf.returnColumns to select the columns to
                        read (default: ('parents', 'children'))
    """
    # gather the required data
    [hit_snps_positions, hit_snps, discarded_snps_positions] = pr.readHitSNPs(chromosome)
    genes = hg19.read(chromosome)
    indices_individuals = vcf.returnColumns(vcf_filename, individuals)
    deletions = vcf.returnDeletions(chromosome, indices_individuals, MAJOR_AF_THRESHOLD)

    # Build the lookup sets once so the per-SNP membership tests are
    # constant-time regardless of the container type returned above.
    hit_positions = frozenset(hit_snps_positions)
    discarded_positions = frozenset(discarded_snps_positions)

    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        # Stream the file instead of readlines(): the whole VCF never has to
        # be held in memory at once.
        for variant in vcf_file:  # move through the genetic variants
            data = variant.split()
            if not data:
                continue  # blank line (e.g. trailing newline at end of file)
            if len(data[3]) != 1:
                continue  # not a SNP

            snp_pos = int(data[1])
            if snp_pos in hit_positions:
                label = 'HITSNP'
                [phase, snp_af] = pr.processHitSNP(snp_pos, data, hit_snps, indices_individuals)
            elif snp_pos not in discarded_positions:
                label = 'NONHITSNP'
                phase = vcf.returnPhase(data, indices_individuals)
                snp_af = vcf.determineAlleleFrequency(phase)
            else:
                continue  # discarded SNP

            if not (1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD):
                continue

            # Annotation is only looked up for SNPs that pass the filter, the
            # same way processPhasedData does it.
            # NOTE(review): assumes hg19.determineType / hg19.distanceToTSS
            # have no side effects that later calls depend on -- confirm.
            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)

            snp_output = [[label, snp_pos, snp_af, snp_type, dist_tss]]
            dels = returnDeletions(snp_pos, snp_af, phase, deletions)  # TODO implement
            if len(dels) > 0:
                snp_output.append(dels)
            writeSNPToOutputFile(snp_output, chromosome)
    finally:
        # Release the handle even when parsing or a helper raises.
        vcf_file.close()
def gatherData (vcf_filename, chromosome, individuals):
    """Collect the inputs needed to process one chromosome.

    Arguments:
        vcf_filename -- path of the VCF file, passed to vcf.returnColumns
        chromosome   -- chromosome identifier, passed to readHitSNPs,
                        hg19.read and vcf.returnDeletions
        individuals  -- passed to vcf.returnColumns to select the columns

    Returns a six-element list, in this order:
        the three values returned by readHitSNPs (hit SNP positions, hit
        SNPs, discarded SNP positions), the result of hg19.read, the column
        indices from vcf.returnColumns, and the result of
        vcf.returnDeletions for those columns at MAJOR_AF_THRESHOLD.
    """
    positions, snps, discarded = readHitSNPs(chromosome)
    gene_table = hg19.read(chromosome)
    columns = vcf.returnColumns(vcf_filename, individuals)
    return [
        positions,
        snps,
        discarded,
        gene_table,
        columns,
        vcf.returnDeletions(chromosome, columns, MAJOR_AF_THRESHOLD),
    ]