Ejemplo n.º 1
0
 def __list_species_snps(self):
   """Print ``coordinate(' ')`` for every snp that passes all three filters.

   A record from ``self.vcf.each_snp()`` is reported only when it is a
   substitution, has high quality with respect to ``self.MIN_QUAL`` and
   ``species_snp()`` holds. The checks run in that order and stop at the
   first one that fails.
   """
   for record in self.vcf.each_snp():
     candidate = VcfSnp(record)
     if not candidate.is_a_substitution():
       continue
     if not candidate.has_high_quality(self.MIN_QUAL):
       continue
     if candidate.species_snp():
       print(candidate.coordinate(' '))
Ejemplo n.º 2
0
def process_snps(vcf):
    """Build a histogram of minor allele frequencies for biallelic snps.

    For every snp that passes ``MIN_QUAL`` and has exactly one alternative
    allele, take the smaller of the reference and alternative allele
    counts, divide it by ``vcf.num_of_samples * 2`` and round the ratio to
    two decimals. Snps with more than one alternative allele are ignored.

    NOTE(review): the denominator is the chromosome count derived from the
    number of samples, not ``snp.total_num_alleles()`` (the alleles seen at
    the site) as the original description suggested -- confirm which one is
    intended.

    Returns:
        A ``defaultdict`` mapping the rounded ratio to the number of snps
        that produced it.
    """
    vcf.load_meta_header()
    total_n_chrms = vcf.num_of_samples * 2
    mafs = defaultdict(int)
    for l in vcf.each_snp():
        snp = VcfSnp(l)
        if not snp.has_high_quality(MIN_QUAL):
            continue
        a_counts = snp.alternative_allele_counts()
        a_total = snp.total_num_alleles()
        if len(a_counts) != 1:  # more than one alternative allele: skip
            continue
        n_chrms_with_alt_allele = a_counts[0]
        n_chrms_with_ref_allele = a_total - n_chrms_with_alt_allele
        n_minor = min(n_chrms_with_ref_allele, n_chrms_with_alt_allele)
        # float() forces true division; under Python 2 two ints would
        # truncate and every snp would land in the 0 bucket.
        mafs[round(float(n_minor) / total_n_chrms, 2)] += 1
    return mafs
Ejemplo n.º 3
0
 def setUp(self):
     """Create ``self.fixtures``: a dict of label -> VcfSnp.

     Each VcfSnp is built from a literal VCF record line. The labels name
     the trait of the record that the fixture is meant to exhibit.
     """
     self.fixtures = {
         # INFO carries EFF=NON_SYNONYMOUS_CODING; no genotype columns.
         "non_syn":
         VcfSnp(
             "Chr1    627540  .       A       G       11.30   .       AC1=1;AC=1;AF1=0.5;AN=2;DP4=3,2,2,0;DP=14;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Tcc/Ccc|S8P|668|RGS12|protein_coding|CODING|ENSMMUT00000009994|exon_1_625681_627664);FQ=14.2;MQ=53;PV4=1,0.31,0.26,1;SF=5;VDB=0.0279 "
         ),
         # INFO carries EFF=SYNONYMOUS_CODING.
         "syn_coding":
         VcfSnp(
             "Chr1    24428   .       T       G       7.59    .       AC1=2;AC=2;AF1=1;AN=2;DP4=0,0,2,0;DP=2;EFF=SYNONYMOUS_CODING(LOW|SILENT|ggA/ggC|G145|368|HMX1|protein_coding|CODING|ENSMMUT00000019076|exon_1_24407_24477);FQ=-33;MQ=22;SF=4;VDB=0.0133 GT:GQ:SP:PL     .       ."
         ),
         # INFO carries EFF=INTRON; every sample column is ".".
         "intron":
         VcfSnp(
             "Chr1    26208   .       N       T       11.10   .       AC1=2;AC=2;AF1=1;AN=2;DP4=0,0,0,2;DP=2;EFF=INTRON(MODIFIER||||368|HMX1|protein_coding|CODING|ENSMMUT00000019076|);FQ=-33;MQ=25;SF=14;VDB=0.0099 GT:GQ:SP:PL     .       .       .       .       .       .       .       .       .       .       .       .       .    ."
         ),
         # INFO carries EFF=INTERGENIC; a single sample has a 0/1 call.
         "intergenic":
         VcfSnp(
             "Chr1    2986    .       G       T       9.52    .       AC1=1;AC=1;AF1=0.5;AN=2;DP4=6,0,0,1;DP=10;EFF=INTERGENIC(MODIFIER|||||||||);FQ=12.3;MQ=60;PV4=0.14,1,1,1;SF=5   GT:GQ:SP:PL     .       .       .       .       .       0/1:41:0:39,0,96        .       .       .       .       .       .       .       .       .    ."
         ),
         # INFO has no EFF entry; the record lists two ALT alleles (G,T).
         "not_annotated":
         VcfSnp(
             "20     1110696 rs6040355 A      G,T     67   PASS   NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2   2/2:35:4"
         ),
         # Record with a mix of 0/1, 1/1 and missing (".") sample columns.
         "full_snp":
         VcfSnp(
             "Chr1    7053    .       A       G       203.90  .       AC1=1;AC=12;AF1=0.5;AN=20;DP4=78,96,122,120;DP=442;EFF=INTERGENIC(MODIFIER|||||||||);FQ=225;MQ=45;PV4=0.75,1,1.6e-09,1;SF=0,1,2,7,9,11,14,15,16,19;VDB=0.0399   GT:GQ:SP:PL     0/1:99:1:255,0,255      1/1:99:0:255,111,0      0/1:99:1:255,0,255      .       .       .      .       0/1:99:4:218,0,255      .       1/1:99:0:255,132,0      .       0/1:99:5:255,0,255      .       .       0/1:99:0:234,0,255      0/1:99:0:108,0,126      0/1:99:1:255,0,254      .       .       0/1:99:2:255,0,255"
         ),
         # Every sample that has a call is 0/1; the rest are ".".
         "all_same_gtype":
         VcfSnp(
             "Chr1    2222    .       A       G       203.90  .       AC1=1;AC=12;AF1=0.5;AN=20;DP4=78,96,122,120;DP=442;EFF=INTERGENIC(MODIFIER|||||||||);FQ=225;MQ=45;PV4=0.75,1,1.6e-09,1;SF=0,1,2,7,9,11,14,15,16,19;VDB=0.0399   GT:GQ:SP:PL     0/1:99:1:255,0,255      0/1:99:0:255,111,0      0/1:99:1:255,0,255      .       .       .      .       0/1:99:4:218,0,255"
         ),
         # Two ALT alleles (T,C) with both 1/1 and 2/2 calls.
         # NOTE(review): unlike the other fixtures, this record has no
         # FILTER column between QUAL (222.00) and INFO -- confirm that
         # this is intentional.
         "second_vars":
         VcfSnp(
             "Chr1    8239092 .       G       T,C     222.00  AC1=2;AC=12,6;AF1=1;AN=18;DP4=0,0,186,177;DP=385;EFF=INTERGENIC(MODIFIER|||||||||);FQ=-126;MQ=40;SF=3,4,7,8,9,10,12,14,15;VDB=0.0384;RDP=37,55,44,34,43,39,48,49,45,56,40,61,43,45,43,28,51,52,45,46   GT:GQ:SP:PL     .     .     .       2/2:99:0:255,.,.,99,.,0    1/1:99:0:255,123,0,.,.,.        .       .       2/2:99:0:255,.,.,138,.,0        1/1:99:0:255,132,0,.,., .       1/1:99:0:255,166,0,.,.,.        1/1:99:0:255,111,0,.,.,.        .       2/2:99:0:255,.,.,126,.,0        .       1/1:99:0:255,123,0,.,.,.       1/1:99:0:255,66,0,.,.,. .       .       .       ."
         )
     }
Ejemplo n.º 4
0
def process_snps(vcf, min_num_samples):
    """Tally snps by their ``func_cons`` value.

    A snp is skipped (and counted as skipped) when it has at least
    ``min_num_samples`` genotypes and all of them are the same. Every other
    snp increments the bucket named by ``snp.func_cons``.

    Returns:
        A ``defaultdict`` of ``func_cons`` -> count, plus two bookkeeping
        keys: ``"SAME_GENOTYPE_ON_ALL_SAMPLES_SKIPPED"`` and ``"TOTAL"``
        (the number of snps that were not skipped).

    Raises:
        Exception: if a snp reports ``annotated == False``.
    """
    n_same_gtype = 0
    n_counted = 0
    per_consequence = defaultdict(lambda: 0)
    for line in vcf.each_snp():
        snp = VcfSnp(line)
        if snp.annotated == False:
            raise Exception('Found a snp that is not annotated: %s' % line)
        enough_samples = len(snp.gtypes()) >= min_num_samples
        if enough_samples and snp.all_gtypes_the_same():
            n_same_gtype += 1
            continue
        per_consequence[snp.func_cons] += 1
        n_counted += 1
    per_consequence["SAME_GENOTYPE_ON_ALL_SAMPLES_SKIPPED"] = n_same_gtype
    per_consequence["TOTAL"] = n_counted
    return per_consequence
Ejemplo n.º 5
0
    def __process_snps(self):
        """Consume lines from ``self.stream`` until a header line shows up.

        Each non-header line is parsed as a snp and its coordinate is
        counted in ``self.subs`` (substitutions) or ``self.indels``
        (everything else). Genes of annotated snps are flagged in
        ``self.genes_partial``.

        ``self.more_samples_to_process`` is set to True when a header line
        interrupts the loop, and to False when the stream is exhausted.
        """
        for line in self.stream:
            if self.__in_header(line):
                self.more_samples_to_process = True
                break
            snp = VcfSnp(line)
            tally = self.subs if snp.is_a_substitution() else self.indels
            tally[snp.coordinate()] += 1
            if snp.annotated:
                self.genes_partial[snp.gene] = True
        else:
            # Loop ended without hitting a header: nothing left to read.
            self.more_samples_to_process = False
Ejemplo n.º 6
0
def process_snps(vcf, fd_cov):
  """Print ``add_rdp(fields, a_cov)`` for every snp line in ``vcf``.

  The coverage stream ``fd_cov`` is read in lock-step with the snps: one
  ``pop_coor(fd_cov)`` call is made per snp line, and the second element
  of its result is handed to ``add_rdp`` together with the whitespace-split
  snp line.
  """
  for l in vcf.each_snp():
    # The parsed snp is not used afterwards. NOTE(review): the call is kept
    # in case the constructor validates the record -- confirm and drop.
    VcfSnp(l)
    _, a_cov = pop_coor(fd_cov)
    # Single-argument print(...) prints the same under Python 2 and is
    # valid Python 3, unlike the bare print statement.
    print(add_rdp(l.split(), a_cov))
Ejemplo n.º 7
0
def process_snps(vcf):
  """Count, per gene, the snps whose impact is "HIGH".

  Snps with an empty gene name are not counted.

  Returns:
    A ``defaultdict`` mapping gene -> number of HIGH impact snps.

  Raises:
    Exception: if a snp reports ``annotated == False``.
  """
  high_impact_hits = defaultdict(lambda: 0)
  for record in vcf.each_snp():
    snp = VcfSnp(record)
    if snp.annotated == False:
      raise Exception('Found a snp that is not annotated: %s' % record)
    if not (snp.impact == "HIGH" and snp.gene != ""):
      continue
    high_impact_hits[snp.gene] += 1
  return high_impact_hits
Ejemplo n.º 8
0
 def __list_species_snps(self):
     """Print ``coordinate(' ')`` of each snp accepted by the filters.

     Accepted means: the snp is a substitution, it has high quality with
     respect to ``self.MIN_QUAL``, and ``species_snp()`` holds.
     """
     snps = (VcfSnp(line) for line in self.vcf.each_snp())
     for snp in snps:
         wanted = (snp.is_a_substitution()
                   and snp.has_high_quality(self.MIN_QUAL)
                   and snp.species_snp())
         if wanted:
             print(snp.coordinate(' '))
Ejemplo n.º 9
0
def prepare(vcf, grps_pheno, grps_haplo):
  """Collect the per-sample calls of every snp for the heatmap.

  Samples are visited group by group, in the order given by
  ``grps_pheno.groups`` and ``grps_pheno.indices_for_grp``.

  Returns a 3-tuple:
    * the transposed call matrix (built with one row per snp, then
      transposed),
    * the list of snp coordinates,
    * the list of sample labels ``<id>_<first 5 chars of group>_<haplo>``.
  """
  labels = []
  for grp in grps_pheno.groups:
    for sample_id in grps_pheno.indices_for_grp(grp):
      pheno = grp[0:5]
      haplo = grps_haplo.what_is(sample_id)
      labels.append(sample_id + "_" + pheno + "_" + haplo)

  rows, sites = [], []
  for line in vcf.each_snp():
    snp = VcfSnp(line)
    sites.append(snp.coordinate())
    gts = snp.gtypes()  # col_num -> gt_set
    rows.append([
      make_the_call(gts, vcf.id_to_col[sample_id], snp)
      for grp in grps_pheno.groups
      for sample_id in grps_pheno.indices_for_grp(grp)
    ])

  return np.transpose(np.array(rows)), sites, labels
Ejemplo n.º 10
0
def prepare(vcf, grps_pheno, grps_haplo):
    """Assemble the heatmap inputs from the snps in ``vcf``.

    Returns ``(matrix, sites, labels)`` where ``matrix`` is the transposed
    array of calls (one list of calls is collected per snp before
    transposing), ``sites`` holds each snp coordinate and ``labels`` holds
    one ``<id>_<first 5 chars of group>_<haplo>`` string per sample.
    """

    def grouped_ids():
        # Single definition of the sample order shared by labels and calls.
        for grp in grps_pheno.groups:
            for _id in grps_pheno.indices_for_grp(grp):
                yield grp, _id

    labels = []
    for grp, _id in grouped_ids():
        pheno = grp[0:5]
        haplo = grps_haplo.what_is(_id)
        labels.append(_id + "_" + pheno + "_" + haplo)

    per_snp_calls = []
    sites = []
    for line in vcf.each_snp():
        snp = VcfSnp(line)
        sites.append(snp.coordinate())
        gts = snp.gtypes()  # col_num -> gt_set
        calls = [
            make_the_call(gts, vcf.id_to_col[_id], snp)
            for _, _id in grouped_ids()
        ]
        per_snp_calls.append(calls)

    matrix = np.transpose(np.array(per_snp_calls))
    return matrix, sites, labels
Ejemplo n.º 11
0
  def __calculate_snp_freq(self):
    """
    Return the snp frequency (number of counted snps per kbp).

    A snp is counted when it is a substitution with high quality
    (``self.MIN_QUAL``) and:
      * the experiment type is 'wgs', or it is 'wes' and the snp lies in a
        coding region;
      * ``self.drop`` is off, or the snp is not a species snp.

    The total and counted numbers are logged at INFO level.
    """
    total = 0
    num_snps = 0
    for line in self.vcf.each_snp():
      snp = VcfSnp(line)
      total += 1
      if not (snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL)):
        continue
      if self.exp_type == 'wgs':
        in_scope = True
      else:
        in_scope = self.exp_type == 'wes' and snp.in_coding_region()
      if in_scope and not (self.drop and self.__is_a_species_snp(snp)):
        num_snps += 1

    logging.info("Total/counted: %d/%d" % (total, num_snps))
    snps_per_base = float(num_snps) / self.GENOME_SIZE[self.exp_type]
    return snps_per_base * 1000
Ejemplo n.º 12
0
 def process_snps(self):
     """Decide, for each snp in ``self.vcf``, whether it is interesting.

     By interesting we mean that the samples in any group have a different
     genotype compared to the other groups. To do that we count the var
     allele freq per each of the samples in the group (#g1, #g2 ...) and
     apply the following condition:

         std_dev(#g1, #g2, #g3 ...) > X

     For every snp line this method resets ``self.h`` to a fresh hash from
     ``self.o_groups``, hands the parsed snp to ``process_genotypes`` and
     then calls ``check_filters_and_report`` with the raw line.
     """
     for l in self.vcf.each_snp():
         # key: group, value: num of alt alleles seen
         self.h = self.o_groups.fresh_hash()
         snp = VcfSnp(l)
         self.process_genotypes(snp)
         self.check_filters_and_report(l)
Ejemplo n.º 13
0
    def __calculate_snp_freq(self):
        """
        Compute the snp frequency, expressed as snps per kbp.

        Only high quality substitutions are counted. For 'wes' experiments
        the snp must also be in a coding region. When ``self.drop`` is set,
        species snps are left out. Logs the total and counted numbers.
        """

        def is_counted(snp):
            # Quality gate first, then experiment scope, then species drop.
            if not snp.is_a_substitution():
                return False
            if not snp.has_high_quality(self.MIN_QUAL):
                return False
            in_scope = (self.exp_type == 'wgs' or
                        (self.exp_type == 'wes' and snp.in_coding_region()))
            if not in_scope:
                return False
            return not (self.drop and self.__is_a_species_snp(snp))

        total = 0
        num_snps = 0
        for line in self.vcf.each_snp():
            snp = VcfSnp(line)
            total += 1
            if is_counted(snp):
                num_snps += 1

        logging.info("Total/counted: %d/%d" % (total, num_snps))
        genome_size = self.GENOME_SIZE[self.exp_type]
        return (float(num_snps) / genome_size) * 1000
Ejemplo n.º 14
0
def check_targets(chunk):
    """Print every snp line in ``chunk``, tagging on-target ones with OT.

    The chromosome and coordinate (first two whitespace-separated fields)
    of each line are posted as JSON to ``SERVER_URL``. The response body is
    parsed as a JSON sequence with one entry per submitted site: ``0`` means
    not on target and the line is printed unchanged; anything else prints
    the result of ``VcfSnp(line).add_info('OT')``.

    Args:
        chunk: indexable sequence of snp lines.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Check if the snps are on target or not
    data = {'sites': []}
    for l in chunk:
        chrm, coor = l.split()[0:2]
        data['sites'].append({'Chrm': chrm, 'Start': int(coor)})
    r = requests.post(SERVER_URL, data=json.dumps(data))
    # Fail loudly instead of interpreting an error payload as target flags.
    r.raise_for_status()

    # Add the OT INFO field if they are on target.
    # Single-argument print(...) prints the same under Python 2 and is
    # valid Python 3, unlike the bare print statement.
    for i, ont in enumerate(json.loads(r.text)):
        if ont == 0:  # not on target
            print(chunk[i])
        else:
            print(VcfSnp(chunk[i]).add_info('OT'))