def __list_species_snps(self):
    """Print the coordinate of every SNP that qualifies as a species SNP.

    A record is printed only when it is a substitution, passes the
    ``self.MIN_QUAL`` quality check and ``species_snp()`` is true.
    The coordinate is rendered with a single space as separator.
    """
    for record in self.vcf.each_snp():
        snp = VcfSnp(record)
        # Each later check only runs when the previous one passed.
        if not snp.is_a_substitution():
            continue
        if not snp.has_high_quality(self.MIN_QUAL):
            continue
        if snp.species_snp():
            print(snp.coordinate(' '))
def process_snps(vcf):
    """Build a histogram of minor-allele ratios over the high-quality SNPs.

    For every SNP that passes ``MIN_QUAL`` and has exactly one alternative
    allele, take the smaller of the reference and alternative allele counts
    and divide it by the total number of chromosomes
    (``vcf.num_of_samples * 2``). SNPs with more than one alternative
    allele are ignored.

    Returns a ``defaultdict`` that maps the ratio, rounded to two decimals,
    to the number of SNPs showing that ratio.
    """
    vcf.load_meta_header()
    total_n_chrms = vcf.num_of_samples * 2
    mafs = defaultdict(int)
    for line in vcf.each_snp():
        snp = VcfSnp(line)
        if not snp.has_high_quality(MIN_QUAL):
            continue
        a_counts = snp.alternative_allele_counts()
        a_total = snp.total_num_alleles()
        if len(a_counts) != 1:
            # More than one alternative allele: not handled here.
            continue
        n_chrms_with_alt_allele = a_counts[0]
        n_chrms_with_ref_allele = a_total - n_chrms_with_alt_allele
        minor_count = min(n_chrms_with_ref_allele, n_chrms_with_alt_allele)
        # float() forces true division: with integer counts a plain `/`
        # floors to 0 on Python 2, which would put every SNP in bucket 0.
        mafs[round(float(minor_count) / total_n_chrms, 2)] += 1
    return mafs
# Build the shared fixtures: a dict of VcfSnp objects, one per scenario,
# each constructed from a literal VCF-style record. Keys visible here:
# "non_syn", "syn_coding", "intron", "intergenic", "not_annotated",
# "full_snp", "all_same_gtype" and "second_vars".
# NOTE(review): in this view the "second_vars" literal is split across two
# physical lines right after "Chr1 8239092 ."; a plain double-quoted string
# cannot contain a raw newline, so confirm the literal against the original
# file (it may hold a control character that was rendered as a line break).
# NOTE(review): the field separators inside the records show up as single
# spaces here; VCF is normally tab-separated -- presumably VcfSnp splits on
# any whitespace; verify.
def setUp(self): self.fixtures = { "non_syn": VcfSnp( "Chr1 627540 . A G 11.30 . AC1=1;AC=1;AF1=0.5;AN=2;DP4=3,2,2,0;DP=14;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Tcc/Ccc|S8P|668|RGS12|protein_coding|CODING|ENSMMUT00000009994|exon_1_625681_627664);FQ=14.2;MQ=53;PV4=1,0.31,0.26,1;SF=5;VDB=0.0279 " ), "syn_coding": VcfSnp( "Chr1 24428 . T G 7.59 . AC1=2;AC=2;AF1=1;AN=2;DP4=0,0,2,0;DP=2;EFF=SYNONYMOUS_CODING(LOW|SILENT|ggA/ggC|G145|368|HMX1|protein_coding|CODING|ENSMMUT00000019076|exon_1_24407_24477);FQ=-33;MQ=22;SF=4;VDB=0.0133 GT:GQ:SP:PL . ." ), "intron": VcfSnp( "Chr1 26208 . N T 11.10 . AC1=2;AC=2;AF1=1;AN=2;DP4=0,0,0,2;DP=2;EFF=INTRON(MODIFIER||||368|HMX1|protein_coding|CODING|ENSMMUT00000019076|);FQ=-33;MQ=25;SF=14;VDB=0.0099 GT:GQ:SP:PL . . . . . . . . . . . . . ." ), "intergenic": VcfSnp( "Chr1 2986 . G T 9.52 . AC1=1;AC=1;AF1=0.5;AN=2;DP4=6,0,0,1;DP=10;EFF=INTERGENIC(MODIFIER|||||||||);FQ=12.3;MQ=60;PV4=0.14,1,1,1;SF=5 GT:GQ:SP:PL . . . . . 0/1:41:0:39,0,96 . . . . . . . . . ." ), "not_annotated": VcfSnp( "20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4" ), "full_snp": VcfSnp( "Chr1 7053 . A G 203.90 . AC1=1;AC=12;AF1=0.5;AN=20;DP4=78,96,122,120;DP=442;EFF=INTERGENIC(MODIFIER|||||||||);FQ=225;MQ=45;PV4=0.75,1,1.6e-09,1;SF=0,1,2,7,9,11,14,15,16,19;VDB=0.0399 GT:GQ:SP:PL 0/1:99:1:255,0,255 1/1:99:0:255,111,0 0/1:99:1:255,0,255 . . . . 0/1:99:4:218,0,255 . 1/1:99:0:255,132,0 . 0/1:99:5:255,0,255 . . 0/1:99:0:234,0,255 0/1:99:0:108,0,126 0/1:99:1:255,0,254 . . 0/1:99:2:255,0,255" ), "all_same_gtype": VcfSnp( "Chr1 2222 . A G 203.90 . AC1=1;AC=12;AF1=0.5;AN=20;DP4=78,96,122,120;DP=442;EFF=INTERGENIC(MODIFIER|||||||||);FQ=225;MQ=45;PV4=0.75,1,1.6e-09,1;SF=0,1,2,7,9,11,14,15,16,19;VDB=0.0399 GT:GQ:SP:PL 0/1:99:1:255,0,255 0/1:99:0:255,111,0 0/1:99:1:255,0,255 . . . . 0/1:99:4:218,0,255" ), "second_vars": VcfSnp( "Chr1 8239092 . 
G T,C 222.00 AC1=2;AC=12,6;AF1=1;AN=18;DP4=0,0,186,177;DP=385;EFF=INTERGENIC(MODIFIER|||||||||);FQ=-126;MQ=40;SF=3,4,7,8,9,10,12,14,15;VDB=0.0384;RDP=37,55,44,34,43,39,48,49,45,56,40,61,43,45,43,28,51,52,45,46 GT:GQ:SP:PL . . . 2/2:99:0:255,.,.,99,.,0 1/1:99:0:255,123,0,.,.,. . . 2/2:99:0:255,.,.,138,.,0 1/1:99:0:255,132,0,.,., . 1/1:99:0:255,166,0,.,.,. 1/1:99:0:255,111,0,.,.,. . 2/2:99:0:255,.,.,126,.,0 . 1/1:99:0:255,123,0,.,.,. 1/1:99:0:255,66,0,.,.,. . . . ." ) }
def process_snps(vcf, min_num_samples):
    """Tally the SNPs in ``vcf`` by their ``func_cons`` value.

    A SNP is skipped (and counted as skipped) when it has at least
    ``min_num_samples`` genotypes and all of them are the same. Every other
    SNP increments the counter for its ``func_cons``.

    Returns a ``defaultdict`` with one entry per ``func_cons`` seen plus the
    bookkeeping keys ``"SAME_GENOTYPE_ON_ALL_SAMPLES_SKIPPED"`` and
    ``"TOTAL"`` (all records visited, skipped ones included).

    Raises ``Exception`` on the first SNP whose ``annotated`` equals False.
    """
    n_skipped = 0
    n_seen = 0
    counts = defaultdict(lambda: 0)  # func_cons -> count
    for record in vcf.each_snp():
        snp = VcfSnp(record)
        # Deliberately `== False`: a value such as None does not trip this.
        if snp.annotated == False:
            raise Exception('Found a snp that is not annotated: %s' % record)
        enough_samples = len(snp.gtypes()) >= min_num_samples
        if enough_samples and snp.all_gtypes_the_same():
            n_skipped += 1
        else:
            counts[snp.func_cons] += 1
        n_seen += 1
    counts["SAME_GENOTYPE_ON_ALL_SAMPLES_SKIPPED"] = n_skipped
    counts["TOTAL"] = n_seen
    return counts
def __process_snps(self):
    """Consume SNP lines from ``self.stream`` until a header line or the end.

    Each SNP bumps the counter for its coordinate in ``self.subs``
    (substitutions) or ``self.indels`` (everything else); annotated SNPs
    also mark their gene in ``self.genes_partial``.

    ``self.more_samples_to_process`` is set to True when a header line
    stops the scan (presumably the start of the next sample -- confirm
    against the caller) and to False when the stream is exhausted.
    """
    for record in self.stream:
        if self.__in_header(record):
            self.more_samples_to_process = True
            return
        snp = VcfSnp(record)
        counter = self.subs if snp.is_a_substitution() else self.indels
        counter[snp.coordinate()] += 1
        if snp.annotated:
            self.genes_partial[snp.gene] = True
    # Ran off the end of the stream without meeting another header.
    self.more_samples_to_process = False
def process_snps(vcf, fd_cov):
    """Print every SNP record combined with its coverage entry.

    Walks the SNPs in ``vcf`` and, for each one, pops the next entry from
    ``fd_cov`` through ``pop_coor`` and prints the result of ``add_rdp``
    applied to the whitespace-split record and that coverage value.

    NOTE(review): the coordinate returned by ``pop_coor`` is never compared
    with the SNP's own coordinate, so both inputs are assumed to be in the
    same order -- confirm with the caller.
    """
    for line in vcf.each_snp():
        # The parsed SNP is not used below; the constructor call is kept in
        # case it validates the record -- TODO confirm and drop if it does not.
        VcfSnp(line)
        _, a_cov = pop_coor(fd_cov)
        # Parenthesised single-argument print gives the same output on
        # Python 2 and is valid syntax on Python 3.
        print(add_rdp(line.split(), a_cov))
def process_snps(vcf):
    """Count, per gene, the SNPs whose impact is ``"HIGH"``.

    SNPs with an empty gene name are left out. Returns a ``defaultdict``
    mapping gene name to the number of HIGH-impact SNPs seen for it.

    Raises ``Exception`` on the first SNP whose ``annotated`` equals False.
    """
    per_gene = defaultdict(lambda: 0)
    for record in vcf.each_snp():
        snp = VcfSnp(record)
        # Deliberately `== False`: a value such as None does not trip this.
        if snp.annotated == False:
            raise Exception('Found a snp that is not annotated: %s' % record)
        if snp.impact != "HIGH" or snp.gene == "":
            continue
        per_gene[snp.gene] += 1
    return per_gene
def prepare(vcf, grps_pheno, grps_haplo):
    """Assemble the call matrix and its labels for the heatmap.

    Returns a 3-tuple:
      * the matrix of calls, transposed so that the sample axis comes first
        and the SNP axis second,
      * the list of SNP coordinates, in file order,
      * the list of sample labels ``<id>_<first 5 chars of group>_<haplo>``,
        in group order.
    """
    labels = []
    for grp in grps_pheno.groups:
        for sample_id in grps_pheno.indices_for_grp(grp):
            pheno = grp[0:5]
            haplo = grps_haplo.what_is(sample_id)
            labels.append(sample_id + "_" + pheno + "_" + haplo)

    rows, sites = [], []
    for record in vcf.each_snp():
        snp = VcfSnp(record)
        sites.append(snp.coordinate())
        gts = snp.gtypes()  # col_num -> gt_set
        # Samples are visited in the same group order used for the labels,
        # so row positions line up with `labels`.
        rows.append([
            make_the_call(gts, vcf.id_to_col[sample_id], snp)
            for grp in grps_pheno.groups
            for sample_id in grps_pheno.indices_for_grp(grp)
        ])

    by_sample = np.transpose(np.array(rows))
    return by_sample, sites, labels
def __calculate_snp_freq(self):
    """Return the SNP frequency (# of SNPs per kbp).

    Only substitutions that pass ``self.MIN_QUAL`` are counted. For 'wes'
    experiments the SNP must also be in a coding region. When ``self.drop``
    is true, species SNPs are left out. The count is divided by
    ``self.GENOME_SIZE[self.exp_type]`` and scaled by 1000 (per-kbp
    assumes the genome size is expressed in bp -- confirm).
    """
    counted = 0
    seen = 0
    for record in self.vcf.each_snp():
        snp = VcfSnp(record)
        seen += 1
        if not (snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL)):
            continue
        if self.exp_type == 'wgs':
            in_scope = True
        else:
            in_scope = self.exp_type == 'wes' and snp.in_coding_region()
        # The species check is only consulted when dropping is enabled.
        if in_scope and (not self.drop or not self.__is_a_species_snp(snp)):
            counted += 1
    message = "Total/counted: %d/%d" % (seen, counted)
    logging.info(message)
    return (float(counted) / self.GENOME_SIZE[self.exp_type]) * 1000
def process_snps(self):
    """Decide, SNP by SNP, whether the site is interesting and report it.

    A site is interesting when the samples of some group show a different
    genotype from the other groups. The variant-allele count is gathered
    per group (#g1, #g2 ...) and the condition applied is:
        std_dev(#g1, #g2, #g3 ...) > X
    The counting is done in ``process_genotypes`` and the filtering and
    reporting in ``check_filters_and_report``.
    """
    # NOTE(review): this value is never used below; the attribute read is
    # kept in case `col_to_id` is computed on access -- confirm and drop.
    ict = self.vcf.col_to_id
    for record in self.vcf.each_snp():
        # Fresh per-SNP tally -- key: group, value: num of alt alleles seen.
        self.h = self.o_groups.fresh_hash()
        parsed = VcfSnp(record)
        self.process_genotypes(parsed)
        self.check_filters_and_report(record)
def __calculate_snp_freq(self):
    """Return the SNP frequency (# of SNPs per kbp).

    Only substitutions that pass ``self.MIN_QUAL`` are counted. For 'wes'
    experiments the SNP must also be in a coding region. When ``self.drop``
    is true, species SNPs are left out. The count is divided by
    ``self.GENOME_SIZE[self.exp_type]`` and scaled by 1000 (per-kbp
    assumes the genome size is expressed in bp -- confirm).
    """
    counted = 0
    seen = 0
    for record in self.vcf.each_snp():
        snp = VcfSnp(record)
        seen += 1
        if not (snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL)):
            continue
        if self.exp_type == 'wgs':
            in_scope = True
        else:
            in_scope = self.exp_type == 'wes' and snp.in_coding_region()
        # The species check is only consulted when dropping is enabled.
        if in_scope and (not self.drop or not self.__is_a_species_snp(snp)):
            counted += 1
    message = "Total/counted: %d/%d" % (seen, counted)
    logging.info(message)
    return (float(counted) / self.GENOME_SIZE[self.exp_type]) * 1000
def check_targets(chunk):
    """Ask the server which SNPs in ``chunk`` are on target and print them.

    ``chunk`` is a sequence of SNP record lines whose first two
    whitespace-separated fields are the chromosome and the coordinate.
    The sites are POSTed as JSON to ``SERVER_URL``; the response is read as
    a JSON list with one entry per submitted site, in the same order.
    Records whose entry is 0 are printed unchanged; all others are printed
    with the ``OT`` INFO field added.

    NOTE(review): the HTTP status is not checked and the request has no
    timeout, so a server error surfaces as a JSON decoding failure and a
    stalled server blocks indefinitely -- consider handling both.
    """
    sites = []
    for line in chunk:
        chrm, coor = line.split()[0:2]
        sites.append({'Chrm': chrm, 'Start': int(coor)})
    payload = json.dumps({'sites': sites})
    response = requests.post(SERVER_URL, data=payload)

    # Parenthesised single-argument print gives the same output on Python 2
    # and is valid syntax on Python 3.
    for i, on_target in enumerate(json.loads(response.text)):
        if on_target == 0:  # not on target
            print(chunk[i])
        else:
            print(VcfSnp(chunk[i]).add_info('OT'))