Beispiel #1
0
    def _iterate_vcf(self, vcf_ittr, distin_dict, reg):
        """
        """

        pick_mode = distin_dict['pick_mode']
        # 辞書のキーが0。名前の文字列を示している。
        gr_list = [distin_dict[0], distin_dict[1]]
        log.info("gr_list {}.".format(gr_list))

        # At first, we check difference of genotype between two sample
        # that described at the beginning of each group
        top_smpl_list = [
            glv.conf.g_members_dict[gr_list[0]][0],
            glv.conf.g_members_dict[gr_list[1]][0]
        ]
        log.info("top_smpl_list {}.".format(top_smpl_list))

        # ================================================================
        start = time.time()
        # write out to file
        out_txt_file = distin_dict['variant']['out_path']
        utl.save_to_tmpfile(out_txt_file)

        # ここがparallele化できるか
        # f.writeの最後のflash必要か。
        with open(out_txt_file, mode='a') as f:

            # write header
            f.write("{}\n".format(distin_dict['variant']['hdr_text']))

            # access to vcf using iterater
            for record in vcf_ittr:

                # 1. Skip same GT between top two sample
                if self._skip_same_GT_between_top2sample(
                        record, top_smpl_list) > 0:
                    continue

                # 2. Check GT in your own group
                if self._skip_different_GT_in_own_group(
                        record, top_smpl_list, gr_list) > 0:
                    continue

                # 3. Select different allele combination among 2x2 allele
                asel = AlleleSelect()
                asel.select_diff_allele(record, top_smpl_list, gr_list)

                # skip if pick_mode is different
                #                if utl.is_my_pick_mode(
                #                    asel.var_type, distin_dict['pick_mode']) != True:
                #                    continue

                # 4. Save variant information as text file
                for var_type, line in zip(asel.var_types, asel.lines):
                    if utl.is_my_pick_mode(var_type,
                                           distin_dict['pick_mode']) == True:
                        f.write("{}\n".format(line))

        log.info("variant {} {}".format(utl.elapsed_time(time.time(), start),
                                        distin_dict['variant']['base_nam']))
Beispiel #2
0
    def _skip_same_GT_between_top2sample(self, record, tsl):

        # for REF 20200708
        sample0 = tsl[0]
        sample1 = tsl[1]

        s0_0, s0_1, s1_0, s1_1 = \
            AlleleSelect.record_call_for_sample(record, sample0, sample1)

        skip = glv.SKIP_DONT_SKIP

        # ./. only 0
        if Variant.is_None(s0_0, s0_1, s1_0, s1_1):
            skip = glv.SKIP_None
            #log.debug("SKIP_None {}{}/{}{}".format(s0_0,s0_1,s1_0,s1_1))
            return skip

        # same h**o: AA,AA
        if Variant.is_same_homo(s0_0, s0_1, s1_0, s1_1):
            skip = glv.SKIP_SAME_HOMO
            #log.debug("SKIP_SAME_HOMO {}{}/{}{}".format(s0_0,s0_1,s1_0,s1_1))
            return skip

        # same hetero: AB,AB
        if Variant.is_same_hetero(s0_0, s0_1, s1_0, s1_1):
            skip = glv.SKIP_SAME_HETERO
            #log.debug("SKIP_SAME_HETERO {}{}/{}{}".format(
            #    s0_0,s0_1,s1_0,s1_1))
            return skip

        return skip
Beispiel #3
0
    def _get_allele_line(self, record, sample_fullname_list):
        '''
        '''
        #line = [record.CHROM, record.POS, record.REF]
        #alt_list = [alt.value for alt in record.ALT]
        #line += [",".join(alt_list)]
        line = list()
        line += [
            AlleleSelect.allele_convert(
                "{}/{}".format(record.call_for_sample[fn].gt_alleles[0],
                               record.call_for_sample[fn].gt_alleles[1]),
                "allele") for fn in sample_fullname_list
        ]

        line_str = '\t'.join(map(str, line))
        return line_str
Beispiel #4
0
    def _skip_different_GT_in_own_group(self, record, tsl, gr_list):

        skip = glv.SKIP_DONT_SKIP

        # check twice, group0, and group1
        for gr_no in range(2):
            # pick sample name belong to a group
            for (sample_no, sample_name) in enumerate(
                glv.conf.g_members_dict[gr_list[gr_no]]):

                if sample_no == 0:
                    continue    # self

                sample0 = tsl[gr_no]
                sample1 = sample_name

                # もし、サンプル間でvariantが見つかった場合は、
                s0_0, s0_1, s1_0, s1_1 = \
                    AlleleSelect.record_call_for_sample(
                        record, sample0, sample1)

                # compare alleles with first sample
                if s0_0 == s1_0 and s0_1 == s1_1:

                    #log.debug("SKIP_SAME_HOMO {},({}){} {}{}/{}{}".format(
                    #    gr_list[gr_no],
                    #    sample_no, sample_name,
                    #    record.call_for_sample[tsl[gr_no]].gt_alleles[0],
                    #    record.call_for_sample[tsl[gr_no]].gt_alleles[1],
                    #    record.call_for_sample[sample_name].gt_alleles[0],
                    #    record.call_for_sample[sample_name].gt_alleles[1]))
                    pass

                else:
                    skip = glv.SKIP_DIFF_INGROUP
                    #log.debug("SKIP_SAME_HOMO {},({}){} {}{}/{}{}".format(
                    #    gr_list[gr_no],
                    #    sample_no, sample_name,
                    #    record.call_for_sample[tsl[gr_no]].gt_alleles[0],
                    #    record.call_for_sample[tsl[gr_no]].gt_alleles[1],
                    #    record.call_for_sample[sample_name].gt_alleles[0],
                    #    record.call_for_sample[sample_name].gt_alleles[1]))
                    return skip

        return skip
Beispiel #5
0
    def get_excluded_region(self):

        SEQUENCE_EXCLUDED_REGION = list()

        #logf_l = ["{} self.pos={} rel_pos={} rel_end_pos={} "]
        #logf_l += ["region_len={} template_len={}"]
        #logf = "".join(logf_l)

        region = "{}:{}-{}".format(self.chrom, self.abs_frag_pad_pre_stt,
                                   self.abs_frag_pad_aft_end)

        reader = vcfpy.Reader.from_path(glv.conf.vcf_file)
        vcf_ittr = reader.fetch(region)

        # access to vcf using iterater
        for record in vcf_ittr:
            sample0 = glv.conf.g_members_dict[self.g0_name][0]
            sample1 = glv.conf.g_members_dict[self.g1_name][0]

            # もし、サンプル間でvariantが見つかった場合は、
            s0_0, s0_1, s1_0, s1_1 = \
                AlleleSelect.record_call_for_sample(record, sample0, sample1)

            if Variant.is_same_gt(s0_0, s0_1, s1_0, s1_1) == False:

                # 20200713 here
                if self.pos != record.POS:
                    rel_pos = self._get_relpos(record.POS)  #
                    # そのポジションのrefのvseq分を登録する
                    # 長さがfragment長を超える場合は、
                    # 調整する。
                    # 見つかったのはPOS
                    # REFのlength

                    # self.pos|
                    #     1036|
                    # ATGCATGCA ref_len=1
                    #         T
                    #         C
                    #         1036 + 1 - 1
                    region_len = len(record.REF)
                    rel_end_pos = rel_pos + region_len - 1

                    #  pos   len     end
                    #  1036 (10)     1045
                    #            1041 temp_len

                    #                    log.debug(logf.format(
                    #                        1, self.pos, rel_pos, rel_end_pos, region_len,
                    #                        self.seq_template_ref_len))

                    #log.debug("{}, {}".format(
                    #    rel_end_pos, self.seq_template_ref_len))
                    #log.debug("{}, {}".format(
                    #    type(rel_end_pos), type(self.seq_template_ref_len)))

                    if rel_end_pos > self.seq_template_ref_len:
                        diff_len = rel_end_pos - self.seq_template_ref_len
                        region_len = region_len - diff_len

#                        log.debug(logf.format(
#                            2, self.pos, rel_pos, rel_end_pos, region_len,
#                            self.seq_template_ref_len))

                    SEQUENCE_EXCLUDED_REGION += [
                        "{},{}".format(rel_pos, region_len)
                    ]

        self.SEQUENCE_EXCLUDED_REGION = " ".join(SEQUENCE_EXCLUDED_REGION)
Beispiel #6
0
    def _iterate_vcf(self, vcf_ittr, distin_dict, proc_cnt):
        """
        """

        # basic informations
        gr_list = [distin_dict[0], distin_dict[1]]

        reg = distin_dict['region']
        reg_dict = glv.conf.regions_dict[reg]
        pick_mode = distin_dict['pick_mode']
        indel_size = distin_dict['indel_size']
        min_indel_len, max_indel_len = \
            [int(i) for i in indel_size.split('-')]

        # At first, we check difference of genotype between two sample
        # that described at the beginning of each group
        top_smpl_list = [
            glv.conf.group_members_dict[gr_list[0]][0],
            glv.conf.group_members_dict[gr_list[1]][0]
        ]

        # logging current target
        utl.print_distin_info("variant", distin_dict, proc_cnt)

        start = time.time()

        # File name to export variant
        out_txt_file = distin_dict['variant']['out_path']

        utl.save_to_tmpfile(out_txt_file)

        #------------------------------------------------------
        # To add an allele_int column for all sample
        # Members of the specified group come first
        # gr0:s1 g0:s2 g0:s3 g1:s4 g1:s5 g1:s6 s7 s8 s9 s10

        sample_nickname_ordered_list, \
        sample_fullname_ordered_list = \
            utl.get_ordered_sample_list(gr_list)

        sample_added_header = "{}\t{}".format(
            distin_dict['variant']['hdr_text'],
            "\t".join(sample_nickname_ordered_list))

        # Can I parallelize here?
        with open(out_txt_file, mode='a') as f:

            # write sample added header
            f.write("{}\n".format(sample_added_header))

            # access to vcf using iterater
            for record in vcf_ittr:

                # 1. Skip same GT between top two sample
                if self._skip_same_GT_between_top2sample(
                        record, top_smpl_list) > 0:
                    continue

                # 2. Check GT in your own group
                if self._skip_different_GT_in_own_group(
                        record, top_smpl_list, gr_list) > 0:
                    continue

                # 3. Select different allele combination among 2x2 allele
                asel = AlleleSelect(min_indel_len, max_indel_len)
                asel.select_diff_allele(record, top_smpl_list, gr_list)

                # from record, construct allele_int of the member
                # who is paying attention
                allele_int_line = ""

                # 4. Save variant information as text file
                for var_type, line in zip(asel.var_types, asel.lines):
                    if utl.is_my_pick_mode(var_type,
                                           distin_dict['pick_mode']) == True:

                        # make allele_int line
                        if allele_int_line == "":
                            #self._get_ai_line(
                            allele_int_line = \
                                self._get_allele_line(
                                    record, sample_fullname_ordered_list)

                        # add allele line
                        f.write("{}\t{}\n".format(line, allele_int_line))

        log.info("variant {} > {}.txt\n".format(
            utl.elapsed_time(time.time(), start),
            distin_dict['variant']['base_nam']))
Beispiel #7
0
    def print_allele(self):
        ''' When show_genotype is specified, the genotype of the specified
        regions and members are output to a file.
            main
            variant.py print_allele
            allele_select.py cls allele_int
        '''

        proc_name = "genotype"
        log.info("-------------------------------")
        log.info("Start processing {}\n".format(proc_name))

        # header
        header = list()
        header += ["CHROM", "POS", "Rlen", "Alen", "diff", "REF", "ALT"]
        header += glv.conf.group_members_dict['all']

        # reader
        reader = vcfpy.Reader.from_path(glv.conf.vcf_file_path)

        total_cnt = len(glv.conf.region_name_list)

        # Save to file for each region
        for proc_cnt, region_name in enumerate(glv.conf.region_name_list, 1):

            region = glv.conf.regions_dict[region_name]['reg']

            # Create a list of fullname for the specified members
            sample_fullname_list = list()
            for nickname in glv.conf.group_members_dict['all']:
                sample_fullname_list.append(utl.get_fullname(nickname))

            # if group priority
            #sample_fullname_list = \
            #    utl.get_sample_list_from_groupname(
            #        group_list, "fullname")

            # out file name
            outf_pref = "005_genotype"
            basename = "{}~{}~{}".format(outf_pref, region_name,
                                         glv.conf.show_genotype)
            out_file_path = "{}/{}.txt".format(glv.conf.out_dir_path, basename)

            # backup
            utl.save_to_tmpfile(out_file_path)

            log.info("")
            log.info("{} / {}, {}({}) > {}".format(proc_cnt, total_cnt,
                                                   region_name, region,
                                                   out_file_path))

            start = time.time()
            with open(out_file_path, mode='w') as f:

                f.write("{}\n".format('\t'.join(map(str, header))))

                vcf_ittr = reader.fetch(region)
                for record in vcf_ittr:

                    # Main informations
                    line = [record.CHROM, record.POS]

                    alt_list = [alt.value for alt in record.ALT]

                    # variant length and diff
                    len_ref = len(record.REF)
                    lens_alt_list = list()
                    for alt in alt_list:
                        lens_alt_list.append(len(alt))

                    diff_len = abs(len_ref - lens_alt_list[0])
                    lens_alt = ",".join(map(str, lens_alt_list))

                    line += [len_ref]
                    line += [lens_alt]
                    line += [diff_len]

                    line += [record.REF]
                    line += [",".join(alt_list)]

                    line += [
                        AlleleSelect.allele_convert(
                            "{}/{}".format(
                                record.call_for_sample[fn].gt_alleles[0],
                                record.call_for_sample[fn].gt_alleles[1]),
                            glv.conf.show_genotype)
                        for fn in sample_fullname_list
                    ]

                    f.write("{}\n".format('\t'.join(map(str, line))))

            log.info("genotype {} > {}.txt\n".format(
                utl.elapsed_time(time.time(), start), out_file_path))