Ejemplos de VcfSnp en Python, ejemplos de drdvcf.VcfSnp en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: snp_frequency.py Proyecto: drio/py.analysis

 def __list_species_snps(self):
     """Print the coordinate of each SNP that passes all three filters.

     A SNP read from ``self.vcf.each_snp()`` is printed only when, in this
     order, ``is_a_substitution()``, ``has_high_quality(self.MIN_QUAL)`` and
     ``species_snp()`` all return a truthy value. The coordinate is
     requested with ``' '`` as its argument.
     """
     for line in self.vcf.each_snp():
         candidate = VcfSnp(line)
         # Guard clauses keep the original short-circuit evaluation order.
         if not candidate.is_a_substitution():
             continue
         if not candidate.has_high_quality(self.MIN_QUAL):
             continue
         if candidate.species_snp():
             print(candidate.coordinate(' '))

Ejemplo n.º 2

0

Mostrar archivo

def process_snps(vcf):
    """Build a histogram of minor-allele ratios over high-quality SNPs.

    Only SNPs that pass ``has_high_quality(MIN_QUAL)`` and report exactly
    one alternative allele count are considered; SNPs with more alternative
    alleles are ignored. For each such SNP the number of chromosomes
    carrying the less frequent allele (reference or alternative) is divided
    by ``vcf.num_of_samples * 2`` and rounded to two decimals; that value is
    the histogram key.

    NOTE(review): the denominator is the number of samples times two, not
    ``snp.total_num_alleles()``; confirm this is the intended ratio.

    Args:
        vcf: object exposing ``load_meta_header()``, ``num_of_samples`` and
            ``each_snp()``.

    Returns:
        A ``defaultdict`` mapping each rounded ratio to the number of SNPs
        that produced it.
    """
    vcf.load_meta_header()
    total_n_chrms = vcf.num_of_samples * 2
    mafs = defaultdict(int)
    for l in vcf.each_snp():
        snp = VcfSnp(l)
        if not snp.has_high_quality(MIN_QUAL):
            continue
        a_counts = snp.alternative_allele_counts()
        a_total = snp.total_num_alleles()
        if len(a_counts) != 1:  # more (or fewer) than 1 alternative allele
            continue
        n_chrms_with_alt_allele = a_counts[0]
        n_chrms_with_ref_allele = a_total - n_chrms_with_alt_allele
        n_chrms_with_minor_allele = min(n_chrms_with_ref_allele,
                                        n_chrms_with_alt_allele)
        # float() forces true division: with two ints, Python 2's "/"
        # truncates and every ratio below 1 would collapse to 0.
        ratio = float(n_chrms_with_minor_allele) / total_n_chrms
        mafs[round(ratio, 2)] += 1
    return mafs

Ejemplo n.º 3

0

Mostrar archivo

 def setUp(self):
     """Populate ``self.fixtures`` with named VcfSnp objects.

     Each entry maps a short scenario name to a ``VcfSnp`` built from one
     hard-coded VCF-style record string. The strings are whitespace
     sensitive and must not be reformatted.
     """
     self.fixtures = {
         # Record whose EFF entry is NON_SYNONYMOUS_CODING; it carries no
         # FORMAT/sample columns after the INFO field.
         "non_syn":
         VcfSnp(
             "Chr1    627540  .       A       G       11.30   .       AC1=1;AC=1;AF1=0.5;AN=2;DP4=3,2,2,0;DP=14;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Tcc/Ccc|S8P|668|RGS12|protein_coding|CODING|ENSMMUT00000009994|exon_1_625681_627664);FQ=14.2;MQ=53;PV4=1,0.31,0.26,1;SF=5;VDB=0.0279 "
         ),
         # Record whose EFF entry is SYNONYMOUS_CODING.
         "syn_coding":
         VcfSnp(
             "Chr1    24428   .       T       G       7.59    .       AC1=2;AC=2;AF1=1;AN=2;DP4=0,0,2,0;DP=2;EFF=SYNONYMOUS_CODING(LOW|SILENT|ggA/ggC|G145|368|HMX1|protein_coding|CODING|ENSMMUT00000019076|exon_1_24407_24477);FQ=-33;MQ=22;SF=4;VDB=0.0133 GT:GQ:SP:PL     .       ."
         ),
         # Record whose EFF entry is INTRON.
         "intron":
         VcfSnp(
             "Chr1    26208   .       N       T       11.10   .       AC1=2;AC=2;AF1=1;AN=2;DP4=0,0,0,2;DP=2;EFF=INTRON(MODIFIER||||368|HMX1|protein_coding|CODING|ENSMMUT00000019076|);FQ=-33;MQ=25;SF=14;VDB=0.0099 GT:GQ:SP:PL     .       .       .       .       .       .       .       .       .       .       .       .       .    ."
         ),
         # Record whose EFF entry is INTERGENIC, with a single called sample.
         "intergenic":
         VcfSnp(
             "Chr1    2986    .       G       T       9.52    .       AC1=1;AC=1;AF1=0.5;AN=2;DP4=6,0,0,1;DP=10;EFF=INTERGENIC(MODIFIER|||||||||);FQ=12.3;MQ=60;PV4=0.14,1,1,1;SF=5   GT:GQ:SP:PL     .       .       .       .       .       0/1:41:0:39,0,96        .       .       .       .       .       .       .       .       .    ."
         ),
         # Record whose INFO field has no EFF= entry at all.
         "not_annotated":
         VcfSnp(
             "20     1110696 rs6040355 A      G,T     67   PASS   NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2   2/2:35:4"
         ),
         # Record with many sample columns holding both 0/1 and 1/1 calls.
         "full_snp":
         VcfSnp(
             "Chr1    7053    .       A       G       203.90  .       AC1=1;AC=12;AF1=0.5;AN=20;DP4=78,96,122,120;DP=442;EFF=INTERGENIC(MODIFIER|||||||||);FQ=225;MQ=45;PV4=0.75,1,1.6e-09,1;SF=0,1,2,7,9,11,14,15,16,19;VDB=0.0399   GT:GQ:SP:PL     0/1:99:1:255,0,255      1/1:99:0:255,111,0      0/1:99:1:255,0,255      .       .       .      .       0/1:99:4:218,0,255      .       1/1:99:0:255,132,0      .       0/1:99:5:255,0,255      .       .       0/1:99:0:234,0,255      0/1:99:0:108,0,126      0/1:99:1:255,0,254      .       .       0/1:99:2:255,0,255"
         ),
         # Record in which every called sample has the 0/1 genotype.
         "all_same_gtype":
         VcfSnp(
             "Chr1    2222    .       A       G       203.90  .       AC1=1;AC=12;AF1=0.5;AN=20;DP4=78,96,122,120;DP=442;EFF=INTERGENIC(MODIFIER|||||||||);FQ=225;MQ=45;PV4=0.75,1,1.6e-09,1;SF=0,1,2,7,9,11,14,15,16,19;VDB=0.0399   GT:GQ:SP:PL     0/1:99:1:255,0,255      0/1:99:0:255,111,0      0/1:99:1:255,0,255      .       .       .      .       0/1:99:4:218,0,255"
         ),
         # Record with two alternative alleles (T,C).
         # NOTE(review): unlike the other fixtures there is no "." column
         # between the quality (222.00) and the INFO field - confirm this
         # is intentional.
         "second_vars":
         VcfSnp(
             "Chr1    8239092 .       G       T,C     222.00  AC1=2;AC=12,6;AF1=1;AN=18;DP4=0,0,186,177;DP=385;EFF=INTERGENIC(MODIFIER|||||||||);FQ=-126;MQ=40;SF=3,4,7,8,9,10,12,14,15;VDB=0.0384;RDP=37,55,44,34,43,39,48,49,45,56,40,61,43,45,43,28,51,52,45,46   GT:GQ:SP:PL     .     .     .       2/2:99:0:255,.,.,99,.,0    1/1:99:0:255,123,0,.,.,.        .       .       2/2:99:0:255,.,.,138,.,0        1/1:99:0:255,132,0,.,., .       1/1:99:0:255,166,0,.,.,.        1/1:99:0:255,111,0,.,.,.        .       2/2:99:0:255,.,.,126,.,0        .       1/1:99:0:255,123,0,.,.,.       1/1:99:0:255,66,0,.,.,. .       .       .       ."
         )
     }

Ejemplo n.º 4

0

Mostrar archivo

def process_snps(vcf, min_num_samples):
    """Count SNPs per ``func_cons`` value, skipping uniform-genotype SNPs.

    A SNP is skipped (and tallied as skipped) when it has at least
    ``min_num_samples`` genotypes and ``all_gtypes_the_same()`` is truthy.
    Every other SNP increments the counter for its ``func_cons`` attribute.

    Args:
        vcf: object whose ``each_snp()`` yields SNP lines.
        min_num_samples: minimum number of genotypes a SNP must have before
            the "all genotypes identical" skip rule applies.

    Returns:
        A ``defaultdict`` mapping ``func_cons`` -> count, plus the two
        summary keys ``"SAME_GENOTYPE_ON_ALL_SAMPLES_SKIPPED"`` and
        ``"TOTAL"``.

    Raises:
        Exception: if a SNP's ``annotated`` attribute equals ``False``.
    """
    counts = defaultdict(lambda: 0)  # func_cons -> count
    n_skipped = 0
    n_counted = 0
    for line in vcf.each_snp():
        record = VcfSnp(line)
        # Equality (not truthiness) is kept on purpose so that values such
        # as None are treated exactly as before.
        if record.annotated == False:  # noqa: E712
            raise Exception('Found a snp that is not annotated: %s' % line)

        uniform = (len(record.gtypes()) >= min_num_samples
                   and record.all_gtypes_the_same())
        if uniform:
            n_skipped += 1
            continue

        counts[record.func_cons] += 1
        n_counted += 1

    counts["SAME_GENOTYPE_ON_ALL_SAMPLES_SKIPPED"] = n_skipped
    counts["TOTAL"] = n_counted
    return counts

Ejemplo n.º 5

0

Mostrar archivo

    def __process_snps(self):
        """Consume ``self.stream`` until a header line or the end of input.

        Each non-header line is parsed as a ``VcfSnp``; its coordinate is
        counted in ``self.subs`` when it is a substitution and in
        ``self.indels`` otherwise. Genes of annotated SNPs are recorded in
        ``self.genes_partial``.

        On return ``self.more_samples_to_process`` is ``True`` when the
        loop stopped at a line for which ``self.__in_header`` was truthy,
        and ``False`` when the stream was exhausted.
        """
        for line in self.stream:
            if self.__in_header(line):
                self.more_samples_to_process = True
                return

            record = VcfSnp(line)
            # Pick the counter first, then bump the coordinate in it.
            tally = self.subs if record.is_a_substitution() else self.indels
            tally[record.coordinate()] += 1
            if record.annotated:
                self.genes_partial[record.gene] = True

        self.more_samples_to_process = False

Ejemplo n.º 6

0

Mostrar archivo

def process_snps(vcf, fd_cov):
    """Print each SNP line combined with its coverage record.

    For every line yielded by ``vcf.each_snp()`` one record is popped from
    ``fd_cov`` via ``pop_coor`` and the whitespace-split SNP fields are
    passed, together with the coverage part of that record, to ``add_rdp``;
    the result is printed.

    Args:
        vcf: object whose ``each_snp()`` yields SNP lines.
        fd_cov: coverage source handed to ``pop_coor`` once per SNP.
    """
    for l in vcf.each_snp():
        # NOTE(review): the parsed SNP is not used; the call is kept in case
        # the constructor validates the line - confirm before removing.
        VcfSnp(l)
        _coor_cov, a_cov = pop_coor(fd_cov)
        # Call form of print: valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print(add_rdp(l.split(), a_cov))

Ejemplo n.º 7

0

Mostrar archivo

def process_snps(vcf):
    """Count, per gene, the SNPs whose ``impact`` is ``"HIGH"``.

    SNPs with an empty ``gene`` are not counted.

    Args:
        vcf: object whose ``each_snp()`` yields SNP lines.

    Returns:
        A ``defaultdict`` mapping gene -> number of HIGH-impact SNPs.

    Raises:
        Exception: if a SNP's ``annotated`` attribute equals ``False``.
    """
    gene_hits = defaultdict(lambda: 0)
    for line in vcf.each_snp():
        record = VcfSnp(line)
        # Equality (not truthiness) is kept on purpose so that values such
        # as None are treated exactly as before.
        if record.annotated == False:  # noqa: E712
            raise Exception('Found a snp that is not annotated: %s' % line)
        if record.impact != "HIGH" or record.gene == "":
            continue
        gene_hits[record.gene] += 1
    return gene_hits

Ejemplo n.º 8

0

Mostrar archivo

 def __list_species_snps(self):
     """Print the coordinate of every SNP accepted by the three filters.

     The filters are applied in this order and stop at the first falsy
     result: ``is_a_substitution()``, ``has_high_quality(self.MIN_QUAL)``,
     ``species_snp()``. Accepted SNPs are printed via ``coordinate(' ')``.
     """

     def _accepted(snp):
         # Same short-circuit chain as a single "and" expression.
         return (snp.is_a_substitution()
                 and snp.has_high_quality(self.MIN_QUAL)
                 and snp.species_snp())

     for raw in self.vcf.each_snp():
         snp = VcfSnp(raw)
         if _accepted(snp):
             print(snp.coordinate(' '))

Ejemplo n.º 9

0

Mostrar archivo

Archivo: heat-grp-snps.py Proyecto: drio/py.analysis

def prepare(vcf, grps_pheno, grps_haplo):
    """Assemble the call matrix, site list and sample labels for the heatmap.

    Args:
        vcf: object providing ``each_snp()`` and the ``id_to_col`` mapping.
        grps_pheno: object providing ``groups`` and ``indices_for_grp(grp)``.
        grps_haplo: object providing ``what_is(sample_id)``.

    Returns:
        A 3-tuple ``(matrix, sites, labels)``: the transposed numpy array of
        per-SNP call rows, the list of SNP coordinates (one per SNP), and
        the list of labels built as ``<id>_<first 5 chars of group>_<haplo>``.
    """
    labels = []
    for group in grps_pheno.groups:
        for sample_id in grps_pheno.indices_for_grp(group):
            pheno = group[0:5]
            haplo = grps_haplo.what_is(sample_id)
            labels.append(sample_id + "_" + pheno + "_" + haplo)

    rows, sites = [], []
    for line in vcf.each_snp():
        snp = VcfSnp(line)
        sites.append(snp.coordinate())
        gts = snp.gtypes()  # col_num -> gt_set
        # One call per sample, visiting groups and ids in the same order
        # used to build the labels above.
        rows.append([
            make_the_call(gts, vcf.id_to_col[sample_id], snp)
            for group in grps_pheno.groups
            for sample_id in grps_pheno.indices_for_grp(group)
        ])

    calls = np.array(rows)
    return np.transpose(calls), sites, labels

Ejemplo n.º 10

0

Mostrar archivo

def prepare(vcf, grps_pheno, grps_haplo):
    """Build the heatmap inputs from the SNPs in *vcf*.

    Args:
        vcf: object providing ``each_snp()`` and the ``id_to_col`` mapping.
        grps_pheno: object providing ``groups`` and ``indices_for_grp(grp)``.
        grps_haplo: object providing ``what_is(sample_id)``.

    Returns:
        ``(matrix, a_sites, a_groups)`` where ``matrix`` is the transposed
        numpy array of call rows (one row per SNP before transposing),
        ``a_sites`` lists each SNP coordinate and ``a_groups`` lists one
        ``<id>_<group[0:5]>_<haplo>`` label per sample.
    """

    def _grouped_ids():
        # Walk every (group, sample id) pair in group order.
        for grp in grps_pheno.groups:
            for sample_id in grps_pheno.indices_for_grp(grp):
                yield grp, sample_id

    a_groups = []
    for grp, sample_id in _grouped_ids():
        pheno = grp[0:5]
        haplo = grps_haplo.what_is(sample_id)
        a_groups.append(sample_id + "_" + pheno + "_" + haplo)

    matrix = []
    a_sites = []
    for line in vcf.each_snp():
        snp = VcfSnp(line)
        row = []
        a_sites.append(snp.coordinate())
        gts = snp.gtypes()  # col_num -> gt_set
        for _grp, sample_id in _grouped_ids():
            column = vcf.id_to_col[sample_id]
            row.append(make_the_call(gts, column, snp))
        matrix.append(row)

    return np.transpose(np.array(matrix)), a_sites, a_groups

Ejemplo n.º 11

0

Mostrar archivo

Archivo: snp_frequency.py Proyecto: drio/py.analysis

  def __calculate_snp_freq(self):
    """
    Return the SNP frequency (number of counted SNPs per kbp).

    A SNP is counted when it is a substitution of high quality
    (``self.MIN_QUAL``) and:
      - the experiment type is 'wgs', or it is 'wes' and the SNP lies in a
        coding region; and
      - ``self.drop`` is falsy, or the SNP is not a species SNP.
    The count is divided by ``self.GENOME_SIZE[self.exp_type]`` and
    multiplied by 1000. Totals are logged at INFO level.
    """
    counted = 0
    seen = 0
    for line in self.vcf.each_snp():
      snp = VcfSnp(line)
      seen += 1
      if not (snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL)):
        continue
      # 'wgs' takes every SNP; 'wes' only those in coding regions.
      in_scope = (self.exp_type == 'wgs'
                  or (self.exp_type == 'wes' and snp.in_coding_region()))
      # The species check only runs when dropping is enabled.
      if in_scope and (not self.drop or not self.__is_a_species_snp(snp)):
        counted += 1

    logging.info("Total/counted: %d/%d" % (seen, counted))
    per_base = float(counted) / self.GENOME_SIZE[self.exp_type]
    return per_base * 1000

Ejemplo n.º 12

0

Mostrar archivo

 def process_snps(self):
     """Process every SNP: tally its genotypes per group, then filter/report.

     Per the original author's description, a SNP is interesting when the
     samples in any group have a different genotype compared to the other
     groups. To decide that, the variant allele frequency is counted for
     the samples of each group (#g1, #g2, ...) and this condition applied:

         std_dev(#g1, #g2, #g3 ...) > X

     This method only drives the loop: for each SNP line it resets
     ``self.h`` to a fresh hash from ``self.o_groups``, then calls
     ``self.process_genotypes`` with the parsed SNP and
     ``self.check_filters_and_report`` with the raw line.
     """
     for l in self.vcf.each_snp():
         # key: group, value: num of alt alleles seen
         self.h = self.o_groups.fresh_hash()
         snp = VcfSnp(l)
         self.process_genotypes(snp)
         self.check_filters_and_report(l)

Ejemplo n.º 13

0

Mostrar archivo

    def __calculate_snp_freq(self):
        """
        Compute the SNP frequency (# of SNPs per kbp).

        SNPs that are not substitutions or do not reach ``self.MIN_QUAL``
        are dropped. For 'wes' experiments SNPs outside coding regions are
        dropped too; experiment types other than 'wgs'/'wes' count nothing.
        When ``self.drop`` is truthy, species SNPs are dropped as well.
        The total and counted numbers are logged at INFO level.
        """
        num_snps = 0
        total = 0
        for line in self.vcf.each_snp():
            snp = VcfSnp(line)
            total += 1

            if not snp.is_a_substitution():
                continue
            if not snp.has_high_quality(self.MIN_QUAL):
                continue

            if self.exp_type == 'wgs':
                eligible = True
            elif self.exp_type == 'wes':
                eligible = snp.in_coding_region()
            else:
                eligible = False
            if not eligible:
                continue

            # The species check is only evaluated when dropping is enabled.
            if self.drop and self.__is_a_species_snp(snp):
                continue
            num_snps += 1

        logging.info("Total/counted: %d/%d" % (total, num_snps))
        genome_size = self.GENOME_SIZE[self.exp_type]
        return (float(num_snps) / genome_size) * 1000

Ejemplo n.º 14

0

Mostrar archivo

def check_targets(chunk):
    """Print each SNP line of *chunk*, tagging the on-target ones with 'OT'.

    The first two whitespace-separated fields of every line (chromosome and
    coordinate) are sent as JSON to ``SERVER_URL`` in a single POST. The
    response body is parsed as a JSON sequence that is expected to hold one
    entry per line: ``0`` means "not on target" and the line is printed
    unchanged; any other value prints the line with the ``OT`` info field
    added through ``VcfSnp.add_info``.

    Args:
        chunk: indexable sequence of SNP lines.
    """
    # Check if the snps are on target or not
    sites = []
    for l in chunk:
        chrm, coor = l.split()[0:2]
        sites.append({'Chrm': chrm, 'Start': int(coor)})
    data_json = json.dumps({'sites': sites})
    r = requests.post(SERVER_URL, data=data_json)

    # Add the OT INFO field if they are on target.
    # print is used in call form so the block is valid on Python 3 and,
    # with a single argument, prints the same text on Python 2.
    for i, ont in enumerate(json.loads(r.text)):
        if ont == 0:  # not on target
            print(chunk[i])
        else:
            print(VcfSnp(chunk[i]).add_info('OT'))