Esempio n. 1
0
    def CONVERT_IN(self, MHC, _reference, _out, _hg, _Genetic_Map):
        """Convert target and reference data into the VCF files used for phasing.

        Steps, in execution order:

        1. Convert the target LINKAGE files to Beagle format.
        2. Refine the reference marker positions with ``redefineBP``.
        3. Build the target marker list, filter/refine it with the R script
           and subset the target panel with ``Panel_Subset``.
        4. Run ``Bgl2GC`` on both the target and the reference.
        5. Convert both to VCF and rewrite ``/`` as ``|`` in the reference VCF.
        6. If ``self.FLAG_AdaptiveGeneticMap`` is truthy, build the refined
           genetic map.

        Intermediate files are removed along the way unless
        ``self.__save_intermediates`` is truthy.

        Args:
            MHC: Path prefix of the target data. ``MHC + '.QC.nopheno.ped'``,
                ``MHC + '.QC.dat'`` and ``MHC + '.QC.bim'`` are read; the
                other ``MHC + '.QC.*'`` files are written here.
            _reference: Path prefix of the reference panel
                (``.markers`` and ``.bgl.phased`` are read).
            _out: Output prefix; only used to name the ``.bgl.log`` file.
            _hg: Not used by this method; kept so the signature is unchanged.
            _Genetic_Map: Path of the genetic map; only read when
                ``self.FLAG_AdaptiveGeneticMap`` is truthy.

        Returns:
            list: ``[MHC + '.QC.vcf', self.OUTPUT_dir_ref + '.phased.vcf']``,
            the two files to be passed into Beagle phasing.

        Raises:
            SystemExit: With status 1 when the refined genetic map file does
                not exist after the commands that should have produced it.

        Side effects:
            Increments ``self.idx_process`` and sets
            ``self.refined_REF_markers``, ``self.GCchangeBGL`` and (adaptive
            genetic map only) ``self.refined_Genetic_Map``.
        """

        print("[{}] Converting data to beagle format.".format(
            self.idx_process))
        self.idx_process += 1

        # The LINKAGE inputs and the conversion log are deliberately left on
        # disk; no cleanup is performed for them in this method.
        RUN_Bash(self.LINKAGE2BEAGLE +
                 ' pedigree={} data={} beagle={} standard=true > {}'.format(
                     MHC + '.QC.nopheno.ped', MHC + '.QC.dat',
                     MHC + '.QC.bgl', _out + '.bgl.log'))

        ### Converting data to reference_markers_Position (Dispersing same genomic position of some markers.)

        from src.redefineBPv1BH import redefineBP

        RefinedMarkers = redefineBP(_reference + '.markers',
                                    self.OUTPUT_dir_ref + '.refined.markers')
        # Kept on the instance (and on disk) because 'CONVERT_OUT' uses it.
        self.refined_REF_markers = RefinedMarkers

        ### Converting data to target_markers_Position and extract not_including snp.

        target_markers = MHC + '.QC.markers'
        pre_markers = MHC + '.QC.pre.markers'
        pre_bgl = MHC + '.QC.pre.bgl.phased'
        selected_snps = join(self.OUTPUT_dir, 'selected_snp.txt')

        # Columns 2, 4, 5 and 6 of the *.bim file become the marker list.
        RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' %
                 (MHC + '.QC.bim', target_markers))

        RUN_Bash(
            'Rscript src/excluding_snp_and_refine_target_position-v1COOK02222017.R {} {} {}'
            .format(target_markers, RefinedMarkers, pre_markers))
        if not self.__save_intermediates:
            os.system(' '.join(['rm', target_markers]))

        RUN_Bash('mv {} {}'.format(MHC + '.QC.bgl', pre_bgl))

        # First column of the filtered marker file = SNP ids to keep.
        RUN_Bash("awk '{print $1}' %s > %s" % (pre_markers, selected_snps))

        from src.Panel_subset import Panel_Subset
        # The return value is not needed here; the following steps read
        # MHC + '.QC.refined.bgl.phased' and MHC + '.QC.refined.markers',
        # presumably written by Panel_Subset for the given output prefix.
        Panel_Subset(MHC + '.QC.pre', 'all', selected_snps,
                     MHC + '.QC.refined')

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(pre_bgl))
            RUN_Bash('rm {}'.format(pre_markers))
            RUN_Bash('rm {}'.format(selected_snps))

        ### Converting data to GC_change_beagle format.

        from src.bgl2GC_trick_bgl import Bgl2GC

        # target
        GCchangeBGL, GCchangeMarkers = Bgl2GC(MHC + '.QC.refined.bgl.phased',
                                              MHC + '.QC.refined.markers',
                                              MHC + '.QC.GCchange.bgl',
                                              MHC + '.QC.GCchange.markers')

        self.GCchangeBGL = GCchangeBGL  # it will be used in 'CONVERT_OUT' with Genetic Map

        # reference
        GCchangeBGL_REF, GCchangeMarkers_REF = Bgl2GC(
            _reference + '.bgl.phased', RefinedMarkers,
            self.OUTPUT_dir_ref + '.GCchange.bgl.phased',
            self.OUTPUT_dir_ref + '.GCchange.markers')

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(MHC + '.QC.refined.bgl.phased'))
            RUN_Bash('rm {}'.format(MHC + '.QC.refined.markers'))
            # RefinedMarkers is intentionally NOT removed: it is used in
            # 'CONVERT_OUT' when not using Multiple Markers.

        ### Converting data to vcf_format

        # target
        MHC_QC_VCF = MHC + '.QC.vcf'
        RUN_Bash(self.BEAGLE2VCF +
                 ' 6 {} {} 0 > {}'.format(GCchangeMarkers, GCchangeBGL,
                                          MHC_QC_VCF))

        # reference
        reference_vcf = self.OUTPUT_dir_ref + '.vcf'
        RUN_Bash(self.BEAGLE2VCF +
                 ' 6 {} {} 0 > {}'.format(GCchangeMarkers_REF, GCchangeBGL_REF,
                                          reference_vcf))

        ### Converting data to reference_phased

        # sed replaces every '/' in the reference VCF with '|'.
        REF_PHASED_VCF = self.OUTPUT_dir_ref + '.phased.vcf'
        RUN_Bash('sed "s%/%|%g" {} > {}'.format(reference_vcf, REF_PHASED_VCF))

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(reference_vcf))
            # The GCchange *.bgl / *.markers files of target and reference are
            # kept: 'GCchangeBGL' and 'GCchangeMarkers_REF' are used in
            # 'CONVERT_OUT'.

        # At this point:
        #   (1) `MHC_QC_VCF`     := MHC + '.QC.vcf'
        #   (2) `REF_PHASED_VCF` := self.OUTPUT_dir_ref + '.phased.vcf'
        # These two files are to be passed into Beagle phasing.

        if self.FLAG_AdaptiveGeneticMap:

            ############### < Adaptive Genetic Map > ###############
            # Equivalent shell recipe:
            #   awk '{print $1" "$2" "$3}' $geneticMap > $geneticMap.first
            #   awk '{print $2}' $REFERENCE.GCchange.markers > $geneticMap.second
            #   paste -d " " $geneticMap.first $geneticMap.second > $geneticMap.refined.map
            #   rm $geneticMap.first
            #   rm $geneticMap.second

            map_first = self.OUTPUT_dir_GM + '.first'
            map_second = self.OUTPUT_dir_GM + '.second'
            refined_genetic_map = self.OUTPUT_dir_GM + '.refined.map'

            RUN_Bash('awk \'{print $1" "$2" "$3}\' %s > %s' %
                     (_Genetic_Map, map_first))
            RUN_Bash('awk \'{print $2}\' %s > %s' %
                     (GCchangeMarkers_REF, map_second))
            # Column-binding like this assumes that the *.first and *.second
            # files have the same number of rows.
            RUN_Bash('paste -d " " {} {} > {}'.format(map_first, map_second,
                                                      refined_genetic_map))

            # NOTE(review): the output is produced through a shell
            # redirection, so the file may exist even if `paste` itself
            # failed - confirm whether a stronger check is wanted.
            if os.path.exists(refined_genetic_map):

                self.refined_Genetic_Map = refined_genetic_map

                if not self.__save_intermediates:
                    os.system('rm {}'.format(map_first))
                    os.system('rm {}'.format(map_second))
                    # GCchangeMarkers_REF is kept (its removal is disabled).

            else:
                print(std_ERROR_MAIN_PROCESS_NAME +
                      "Failed to generate Refined Genetic Map.")
                # Exit with a non-zero status so callers can detect the
                # failure (a bare sys.exit() reports success).
                sys.exit(1)

        return [MHC_QC_VCF, REF_PHASED_VCF]
Esempio n. 2
0
def Make_EXON234_Panel(infile,
                       outfile,
                       BEAGLE2LINKAGE,
                       PLINK,
                       __save_intermediates=False):
    """Build the panel files under the `outfile` prefix from `infile`.

    Steps:

    1. Filter ``infile + '.markers'``, dropping markers whose name starts
       with ``AA_``, ``SNP_`` or ``INS_`` or has the form ``HLA_<gene>_<2 digits>``.
    2. Run ``HLA2EXON234`` on the filtered markers and the phased Beagle file.
    3. Disperse marker positions (``redefineBP``), sort the markers by
       position and reorder the Beagle file accordingly (``BGL2SortBGL_WS``).
    4. Convert to LINKAGE, assemble ``*.ped`` / ``*.map`` / ``*.refallele``
       and run PLINK to create the binary fileset and the frequency file.

    Args:
        infile: Path prefix of the input panel; ``infile + '.markers'`` and
            ``infile + '.bgl.phased'`` are read.
        outfile: Path prefix for every generated file.
        BEAGLE2LINKAGE: Shell command of the Beagle-to-LINKAGE converter
            (including its 'java -jar' part); it receives the Beagle file on
            stdin.
        PLINK: Shell command used to invoke PLINK.
        __save_intermediates: When truthy, intermediate files are kept
            instead of being removed.

    Returns:
        The `outfile` prefix, unchanged.

    Raises:
        SystemExit: With status 1 when the sorted Beagle file was not
            generated.
    """

    def _discard(*paths):
        """Remove intermediate files, unless they were requested to be kept."""
        if not __save_intermediates:
            for path in paths:
                os.system('rm {}'.format(path))

    ### STEP1_Collect_SNP_HLA_DATA

    # Markers matching this pattern are left out of the STEP1 marker file.
    # (This regex filter takes the place of a disabled grep/cat pipeline that
    # wrote the same '*.STEP1_SNP_4dit.markers' file.)
    p_MkRef_ToExclude = re.compile(r'^(AA_|SNP_|INS_|HLA_\w+_\d{2}$)')

    step1_markers = outfile + ".STEP1_SNP_4dit.markers"

    with open(infile + ".markers", 'r') as f_in_markers, \
            open(step1_markers, 'w') as f_out_markers:
        for line in f_in_markers:
            fields = line.split()

            if not fields:
                # Blank line: there is no marker name to test, and indexing
                # the first field would raise IndexError.
                continue

            if not p_MkRef_ToExclude.match(fields[0]):
                # To save
                f_out_markers.write(line)

    ### STEP2_EXON234_MARKERS

    outbgl, outmarker = HLA2EXON234(step1_markers,
                                    infile + ".bgl.phased",
                                    outfile + ".STEP2_exon234.bgl.phased",
                                    infile + ".markers",
                                    outfile + ".STEP2_exon234.markers")

    _discard(step1_markers)

    ### STEP3_SORT

    # Dispersing genomic positions of given marker file (*.markers)
    refined_outmarker = redefineBP(outmarker,
                                   outfile + ".STEP3_refined.markers")

    # Sorting the dispersed marker file (numerically, by the 2nd column).
    command = 'sort -gk 2 {} > {}'.format(refined_outmarker,
                                          outfile + '.markers')
    if not os.system(command):
        # Only clean up when the sort succeeded.
        _discard(outmarker, refined_outmarker)

    # Sorting beagle file to the order of the above sorted markers file
    sorted_outbgl = BGL2SortBGL_WS(outfile + '.markers', outbgl,
                                   outfile + ".bgl.phased")
    if not os.path.exists(sorted_outbgl):
        print(std_ERROR_MAIN_PROCESS_NAME +
              "Failed to generate '{}'.".format(sorted_outbgl))
        # Non-zero status so the failure is visible to the calling process
        # (a bare sys.exit() reports success).
        sys.exit(1)

    _discard(outbgl)

    ### STEP_4_Make_plink_file

    step4_prefix = outfile + ".STEP4_tmp"
    tmp_ped = step4_prefix + ".ped"
    tmp_ped_left = tmp_ped + ".left"
    tmp_ped_right = tmp_ped + ".right"

    # Produces *.ped and *.dat (cf. 'java -jar' is included in 'BEAGLE2LINKAGE'.)
    command = 'cat {} | {} {}'.format(sorted_outbgl, BEAGLE2LINKAGE,
                                      step4_prefix)
    if not os.system(command):
        _discard(step4_prefix + ".dat")  # *.dat file is unnecessary.

    # Left part: ['FID', 'IID', 'PID', 'MID', 'Sex']
    os.system('cut -d \' \' -f-5 {} > {}'.format(tmp_ped, tmp_ped_left))

    # Right part: genotype information.
    os.system('cut -d \' \' -f6- {} > {}'.format(tmp_ped, tmp_ped_right))

    # `paste` cycles through the characters of its -d list; the /dev/null
    # operands contribute empty columns, so the delimiters spell the literal
    # text " -9 " between the left and right parts (presumably the PED
    # phenotype column set to missing - confirm).
    command = 'paste -d \' -9 \' {} /dev/null /dev/null /dev/null {} > {}'.format(
        tmp_ped_left, tmp_ped_right, outfile + ".ped")
    if not os.system(command):
        _discard(tmp_ped, tmp_ped_left, tmp_ped_right)

    # Split the sorted marker file into (1) rsid, (2) bp, (3) allele1.
    map_rsid = outfile + ".STEP4_map.rsid"
    map_bp = outfile + ".STEP4_map.bp"
    map_allele1 = outfile + ".STEP4_map.allele1"

    for column, target in enumerate((map_rsid, map_bp, map_allele1), 1):
        os.system("cut -d ' ' -f{} {} > {}".format(column,
                                                   outfile + ".markers",
                                                   target))

    # Same delimiter-cycling trick as above: each output line reads
    # "6 <rsid> 0 <bp>".
    os.system(
        "paste -d '6  0 ' /dev/null /dev/null {} /dev/null /dev/null {} > {}"
        .format(map_rsid, map_bp, outfile + ".map"))

    # (2019. 07. 09.) The right-hand column of '*.refallele' is taken from
    # '*.STEP4_map.allele1' (not '*.STEP4_map.bp').
    os.system("paste -d ' ' {} {} > {}".format(map_rsid, map_allele1,
                                               outfile + ".refallele"))

    # bed, bim, fam files.
    command = ' '.join([
        PLINK,
        '--ped {} --map {} --make-bed --reference-allele {} --out {}'.format(
            outfile + ".ped", outfile + ".map", outfile + ".refallele",
            outfile)
    ])
    if not os.system(command):
        _discard(map_rsid, map_bp, map_allele1, outfile + ".ped",
                 outfile + ".map", outfile + ".log", outfile + ".refallele")

    # Allele Frequency file(*.frq)
    command = ' '.join([
        PLINK, '--bfile {} --keep-allele-order --freq --out {}'.format(
            outfile, outfile + ".FRQ")
    ])
    if not os.system(command):
        _discard(outfile + ".FRQ.log")

    return outfile