Example #1
0
    def Doubling(self, MHC, PHASED_RESULT):
        """Run the target-data doubling step on a phased VCF.

        The gzipped file ``PHASED_RESULT + '.vcf.gz'`` is decompressed and
        split into its '##' header lines, its first remaining header line
        and its body. The first '#' of that header line is stripped, the line
        is put on top of the body, and the result is handed to
        ``Doubling_vcf.R``. In the script's output the first 'CHROM' of each
        line is turned back into '#CHROM', and the '##' header is prepended.

        Args:
            MHC: Prefix of the final output file.
            PHASED_RESULT: Prefix of the phased result ('.vcf.gz' is appended).

        Returns:
            ``MHC + '.QC.phasing_out_Doubled.vcf'``.

        Exits the process with status 1 when the R script left no output file.
        """

        ### Target data doubling step.
        print("[{}] Performing Doubling".format(self.idx_process))
        self.idx_process += 1

        vcf = PHASED_RESULT + '.vcf'
        header1 = vcf + '.header1'
        header2 = vcf + '.header2'
        noshop_header2 = vcf + '.noshop.header2'
        body = vcf + '.body'
        to_be_doubled = PHASED_RESULT + '.tobeDoubled.vcf'
        doubled_pre = PHASED_RESULT + '.Doubled.pre.vcf'
        doubled_pre2 = PHASED_RESULT + '.Doubled.pre2.vcf'
        doubled = MHC + '.QC.phasing_out_Doubled.vcf'

        RUN_Bash('gzip -d -f {}'.format(vcf + '.gz'))

        RUN_Bash('grep ^## {} > {}'.format(vcf, header1))  # Header part with '##'
        RUN_Bash('grep -v ^## {} | head -n 1 > {}'.format(
            vcf, header2))  # Header part with '#'
        RUN_Bash('grep -v ^# {} > {}'.format(
            vcf, body))  # Body part without '#' or '##'

        # Strip the first '#' of the column header before passing it to R.
        RUN_Bash('sed "s%#%%" {} > {}'.format(header2, noshop_header2))
        RUN_Bash('cat {} {} > {}'.format(noshop_header2, body, to_be_doubled))

        RUN_Bash(
            'Rscript /scratch3/users/nanje/MHC-Imputation-Accuracy/templates/src/Doubling_vcf.R {} {}'
            .format(to_be_doubled, doubled_pre))

        # Check the file the R script is asked to write. The previous check
        # looked at the script's *input*, which exists whether or not the
        # script succeeded, so a failed doubling was never detected here.
        if not os.path.exists(doubled_pre):
            print(
                std_ERROR_MAIN_PROCESS_NAME +
                "Doubled phased file('{}') can't be found (or wasn't generated at all)."
                .format(doubled_pre))
            sys.exit(1)  # non-zero status so that callers can see the failure

        # Put the '#' back in front of 'CHROM'.
        RUN_Bash('sed "s%CHROM%#CHROM%" {} > {}'.format(
            doubled_pre, doubled_pre2))

        RUN_Bash('cat {} {} > {}'.format(header1, doubled_pre2, doubled))

        if not self.__save_intermediates:
            # The decompressed PHASED_RESULT + '.vcf' itself is left in place.
            for intermediate in (header1, header2, noshop_header2, body,
                                 to_be_doubled, doubled_pre, doubled_pre2):
                os.system(' '.join(['rm', intermediate]))

        return doubled
Example #2
0
    def IMPUTE_HapMap_Map(self, _out, _MHC_QC_VCF, _REF_PHASED_VCF, _nthreads):
        """Run the imputation command that uses only the HapMap genetic map.

        The command stored in ``self.BEAGLE4`` is run with ``self.HapMap_Map``
        as its ``map=`` argument. Its stdout and stderr are redirected to
        ``<_out>.MHC.QC.raw_imputation_out.log``. Afterwards the gzipped VCF
        written under the same prefix is decompressed.

        Args:
            _out: Output prefix.
            _MHC_QC_VCF: Target VCF passed as ``gt=``.
            _REF_PHASED_VCF: Phased reference VCF passed as ``ref=``.
            _nthreads: Value passed as ``nthreads=``.

        Returns:
            Path of the decompressed result,
            ``<_out>.MHC.QC.raw_imputation_out.vcf``.

        Raises:
            CookHLAImputationError: If the command exits with a non-zero status.
        """

        OUT = _out + '.MHC.QC.raw_imputation_out'
        log_path = OUT + '.log'

        # Report the log file that is actually written below.
        print("[{}] Performing HLA imputation (see {} for progress).".format(
            self.idx_process, log_path))
        self.idx_process += 1

        # Equivalent shell form:
        #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf \
        #       out=$MHC.QC.imputation_out impute=true gprobs=true \
        #       lowmem=true map=HapMap_Map.txt overlap=3000
        command = '{} gt={} ref={} out={} impute=true gprobs=true lowmem=true map={} overlap=3000 nthreads={}'.format(
            self.BEAGLE4, _MHC_QC_VCF, _REF_PHASED_VCF, OUT, self.HapMap_Map,
            _nthreads)

        # The context manager closes the log on failure as well as on success.
        with open(log_path, 'w') as f_log:
            imputation_start = time()
            try:
                # str.split() splits on runs of whitespace, like the previous
                # re.split('\s+', ...), without producing empty arguments.
                subprocess.run(command.split(),
                               check=True,
                               stdout=f_log,
                               stderr=f_log)
            except subprocess.CalledProcessError as err:
                raise CookHLAImputationError(
                    std_ERROR_MAIN_PROCESS_NAME +
                    "HapMapMap Imputation failed.\n") from err
            imputation_end = time()

        imputation_time = (imputation_end - imputation_start) / 60
        sys.stdout.write(
            "HapMapMap Imputation time: {}(min)\n".format(imputation_time))

        RUN_Bash('gzip -d -f {}.vcf.gz'.format(OUT))

        return OUT + '.vcf'
Example #3
0
    def CONVERT_OUT(self, MHC, _reference, _out, raw_IMP_Result):
        """Post-process the raw imputation VCF into Beagle, PLINK and allele files.

        The steps, in order:
          1. Pipe ``raw_IMP_Result`` through ``self.VCF2BEAGLE`` and
             decompress the resulting '.bgl.gz'.
          2. Decode the GC-encoded Beagle file with ``GCtricedBGL2OriginalBGL``.
          3. Run ``src/complete_header.R`` to write
             ``self.HLA_IMPUTED_Result_MHC + '.bgl.phased'``.
          4. Build '.ped', '.map', '.fam' and PLINK binary files under the
             same prefix.
          5. Call ``BGL2Alleles`` on the '.bgl.phased' file.

        Intermediate files are removed along the way unless the instance was
        created with intermediates kept.

        Args:
            MHC: Target data prefix.
            _reference: Reference panel prefix (its '.bim' is reused).
            _out: Output prefix used for temporary files.
            raw_IMP_Result: Path of the raw imputed VCF.

        Returns:
            Whatever ``BGL2Alleles`` returns.
        """

        discard_intermediates = not self.__save_intermediates
        gc_prefix = MHC + '.QC.imputation_GCchange'
        target_fam = MHC + '.fam'
        reference_bim = _reference + '.bim'
        tmp = _out + '.tmp'

        ### Converting imputation result in vcf file to beagle format.

        RUN_Bash('cat {} | {} 0 {}'.format(raw_IMP_Result, self.VCF2BEAGLE,
                                           gc_prefix))

        if discard_intermediates:
            os.system('rm {}'.format(gc_prefix + '.int'))

        RUN_Bash('gzip -d -f {}'.format(gc_prefix + '.bgl.gz'))

        ### Converting imputation GC_beagle to original beagle(Decoding GC-encoding).

        GC_decodedBGL = GCtricedBGL2OriginalBGL(gc_prefix + '.bgl',
                                                self.refined_REF_markers,
                                                MHC + '.QC.imputation_ori.bgl')

        if discard_intermediates:
            for suffix in ('.bgl', '.markers'):
                os.system('rm {}'.format(gc_prefix + suffix))

        result_prefix = self.HLA_IMPUTED_Result_MHC
        phased_bgl = result_prefix + '.bgl.phased'
        result_ped = result_prefix + '.ped'
        result_map = result_prefix + '.map'

        RUN_Bash('Rscript src/complete_header.R {} {} {}'.format(
            self.GCchangeBGL, GC_decodedBGL, phased_bgl))

        if discard_intermediates:
            os.system('rm {}'.format(GC_decodedBGL))

        ### Converting imputation genotypes to PLINK .ped format.

        RUN_Bash('cat {} | {} {}'.format(phased_bgl, self.BEAGLE2LINKAGE,
                                         tmp))
        # Keep columns 6 onward of the generated .ped.
        RUN_Bash("cut -d ' ' -f6- {} > {}".format(tmp + '.ped', tmp))
        # Prefix each row with the target .fam columns; '\015' (carriage
        # return) characters are stripped from the result.
        RUN_Bash("paste -d ' ' {} {} | tr -d '\015' > {}".format(
            target_fam, tmp, result_ped))  # *.ped
        RUN_Bash('cut -f1-4 {} > {}'.format(reference_bim,
                                            result_map))  # *.map
        RUN_Bash('cp {} {}'.format(target_fam, result_prefix + '.fam'))

        # Create PLINK bed format
        RUN_Bash('{} --ped {} --map {} --make-bed --out {}'.format(
            self.PLINK, result_ped, result_map, result_prefix))
        # The '.bim' under the result prefix is replaced by the reference one.
        RUN_Bash('cp {} {}'.format(reference_bim, result_prefix + '.bim'))

        if discard_intermediates:
            # The raw imputed VCF and self.GCchangeBGL are deliberately kept.
            for leftover in (tmp, tmp + '.ped', tmp + '.dat',
                             self.refined_REF_markers, target_fam):
                RUN_Bash('rm {}'.format(leftover))

        # BGL2Allele.py
        alleles = BGL2Alleles(phased_bgl, result_prefix + '.alleles', 'all')

        self.idx_process += 1
        return alleles
Example #4
0
    def IMPUTE(self, _out, _MHC_QC_VCF, _REF_PHASED_VCF, _aver_erate,
               _Refined_Genetic_Map, _window, _overlap, _ne, _nthreads):
        """Run the imputation command, with or without the adaptive genetic map.

        When ``self.FLAG_AdaptiveGeneticMap`` is truthy, the first line of the
        file ``_aver_erate`` is passed as ``err=`` and ``_Refined_Genetic_Map``
        as ``map=`` (with ``ne=10000`` fixed). Otherwise the plain command is
        used. In both cases the command stored in ``self.BEAGLE5`` is run with
        stdout and stderr redirected to
        ``<_out>.MHC.QC.raw_imputation_out.log``, and the gzipped VCF written
        under the same prefix is decompressed afterwards.

        Args:
            _out: Output prefix.
            _MHC_QC_VCF: Target VCF passed as ``gt=``.
            _REF_PHASED_VCF: Phased reference VCF passed as ``ref=``.
            _aver_erate: Path of a file whose first line is the error rate
                (read only in the adaptive-genetic-map case).
            _Refined_Genetic_Map: Genetic map passed as ``map=`` (same case).
            _window: Accepted but not used by this method.
            _overlap: Accepted but not used by this method.
            _ne: Accepted but not used by this method.
            _nthreads: Value passed as ``nthreads=``.

        Returns:
            Path of the decompressed result,
            ``<_out>.MHC.QC.raw_imputation_out.vcf``.

        Raises:
            CookHLAImputationError: If the command exits with a non-zero status.
        """

        OUT = _out + '.MHC.QC.raw_imputation_out'
        log_path = OUT + '.log'

        # Report the log file that is actually written below.
        print("[{}] Performing HLA imputation (see {} for progress).".format(
            self.idx_process, log_path))
        self.idx_process += 1

        if self.FLAG_AdaptiveGeneticMap:  ### With Adaptive Genetic Map
            label = 'AGM'

            with open(_aver_erate, 'r') as f:
                aver_erate = f.readline().rstrip('\n')

            # Equivalent shell form:
            #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf \
            #       out=$MHC.QC.imputation_out impute=true gp=true \
            #       map=$geneticMap.refined.map err=$aver_erate \
            #       ne=10000 nthreads=$nthreads
            command = '{} gt={} ref={} out={} impute=true gp=true err={} map={} ne=10000 nthreads={}'.format(
                self.BEAGLE5, _MHC_QC_VCF, _REF_PHASED_VCF, OUT, aver_erate,
                _Refined_Genetic_Map, _nthreads)

        else:  ### Plain
            label = 'Plain'

            # Equivalent shell form:
            #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf \
            #       out=$MHC.QC.imputation_out impute=true gp=true \
            #       nthreads=$nthreads
            command = '{} gt={} ref={} out={} impute=true gp=true nthreads={}'.format(
                self.BEAGLE5, _MHC_QC_VCF, _REF_PHASED_VCF, OUT, _nthreads)

        # One shared run path for both variants. The context manager closes
        # the log on failure as well as on success; previously the plain
        # branch never closed it at all.
        with open(log_path, 'w') as f_log:
            imputation_start = time()
            try:
                # str.split() splits on runs of whitespace, like the previous
                # re.split('\s+', ...), without producing empty arguments.
                subprocess.run(command.split(),
                               check=True,
                               stdout=f_log,
                               stderr=f_log)
            except subprocess.CalledProcessError as err:
                raise CookHLAImputationError(
                    std_ERROR_MAIN_PROCESS_NAME +
                    "{} Imputation failed.\n".format(label)) from err
            imputation_end = time()

        imputation_time = (imputation_end - imputation_start) / 60
        sys.stdout.write("{} Imputation time: {}(min)\n".format(
            label, imputation_time))

        RUN_Bash('gzip -d -f {}.vcf.gz'.format(OUT))

        return OUT + '.vcf'
Example #5
0
    def __init__(self,
                 idx_process,
                 MHC,
                 _reference,
                 _out,
                 _hg,
                 _window,
                 _overlap,
                 _ne,
                 _nthreads,
                 _AdaptiveGeneticMap,
                 _Average_Erate,
                 _LINKAGE2BEAGLE,
                 _BEAGLE2LINKAGE,
                 _BEAGLE2VCF,
                 _VCF2BEAGLE,
                 _PLINK,
                 _BEAGLE5,
                 _answer=None,
                 f_save_intermediates=False,
                 _HapMap_Map=None,
                 f_measureAcc_v2=False):
        """Store the run configuration and execute the whole pipeline.

        Construction is not passive: after the attributes are set, the
        constructor calls ``CONVERT_IN``, then ``IMPUTE_HapMap_Map`` (when
        ``_HapMap_Map`` is given) or ``IMPUTE``, then ``CONVERT_OUT``. If
        ``_answer`` names an existing, non-empty file, the imputation accuracy
        is measured as well. Finally the reference-panel files produced
        during conversion are removed.

        Results are left on the instance:
          * ``raw_IMP_Reuslt``: return value of the imputation step.
          * ``HLA_IMPUTATION_OUT``: return value of ``CONVERT_OUT``.
          * ``accuracy``: measured accuracy, or None when not measured.
        """

        ### Class variables

        # General
        self.idx_process = idx_process
        self.__save_intermediates = f_save_intermediates

        # (***) Truthy only when both the adaptive genetic map and the average
        # error rate were supplied; decides whether the map is used.
        self.FLAG_AdaptiveGeneticMap = _AdaptiveGeneticMap and _Average_Erate

        # Prefixes
        out_dir = os.path.dirname(_out)
        self.OUTPUT_dir = out_dir
        self.OUTPUT_dir_ref = join(out_dir, os.path.basename(_reference))
        if self.FLAG_AdaptiveGeneticMap:
            self.OUTPUT_dir_GM = join(out_dir,
                                      os.path.basename(_AdaptiveGeneticMap))
        else:
            self.OUTPUT_dir_GM = None
        self.HLA_IMPUTED_Result_MHC = MHC + '.HLA_IMPUTATION_OUT'

        # Result
        self.raw_IMP_Reuslt = None
        self.HLA_IMPUTATION_OUT = None  # Final Imputation output ('*.imputed.alleles').
        self.accuracy = None

        # Dependencies
        self.LINKAGE2BEAGLE = _LINKAGE2BEAGLE
        self.BEAGLE2LINKAGE = _BEAGLE2LINKAGE
        self.BEAGLE2VCF = _BEAGLE2VCF
        self.VCF2BEAGLE = _VCF2BEAGLE
        self.PLINK = _PLINK
        self.BEAGLE5 = _BEAGLE5

        # Adaptive Genetic Map
        self.__AGM__ = _AdaptiveGeneticMap
        self.__AVER__ = _Average_Erate

        # Filled in by 'CONVERT_IN'
        self.refined_REF_markers = None  # used in 'CONVERT_OUT'
        self.refined_Genetic_Map = None  # used in 'IMPUTE'
        self.GCchangeBGL = None  # used in 'CONVERT_OUT'

        # HapMap_Map: the attribute is only created when a map was given, and
        # it overrides the genetic-map prefix chosen above.
        if _HapMap_Map:
            self.HapMap_Map = _HapMap_Map
            self.OUTPUT_dir_GM = join(out_dir, os.path.basename(_HapMap_Map))

        ###### < Main - 'CONVERT_IN', 'IMPUTE', 'CONVERT_OUT' > ######

        ### (1) CONVERT_IN

        target_vcf, ref_phased_vcf = self.CONVERT_IN(
            MHC, _reference, _out, _hg, _Genetic_Map=self.__AGM__)

        ### (2) IMPUTE

        if _HapMap_Map:
            # NOTE(review): seven positional arguments are passed here;
            # confirm that IMPUTE_HapMap_Map accepts `_window`, `_overlap`
            # and `_ne`.
            self.raw_IMP_Reuslt = self.IMPUTE_HapMap_Map(
                _out, target_vcf, ref_phased_vcf, _window, _overlap, _ne,
                _nthreads)
        else:
            self.raw_IMP_Reuslt = self.IMPUTE(
                _out, target_vcf, ref_phased_vcf, self.__AVER__,
                self.refined_Genetic_Map, _window, _overlap, _ne, _nthreads)

        ### (3) CONVERT_OUT

        self.HLA_IMPUTATION_OUT = self.CONVERT_OUT(MHC, _reference, _out,
                                                   self.raw_IMP_Reuslt)

        ###### < Get Accuracy > ######

        if _answer:

            print(std_MAIN_PROCESS_NAME +
                  "Calculating accuracy of each HLA gene. (answer: '{}')".
                  format(_answer))

            if not os.path.exists(_answer):
                print(
                    std_WARNING_MAIN_PROCESS_NAME +
                    "Given answer file doesn't exist. Please check '--answer/-an' argument again.\n"
                    "Skipping calculating imputation accuracy.")

            elif os.path.getsize(_answer) == 0:
                print(
                    std_WARNING_MAIN_PROCESS_NAME +
                    "Given answer file doesn't have any content. Please check '--answer/-an' argument again.\n"
                    "Skipping calculating imputation accuracy.")

            elif f_measureAcc_v2:
                # measureAcc_v2
                self.accuracy = measureAccuracy(
                    _answer,
                    self.HLA_IMPUTATION_OUT,
                    'all',
                    outfile=self.HLA_IMPUTATION_OUT + '.accuracy',
                    __only4digits=True)

            else:
                # measureAcc_v3.5
                measureAcc_start = time()

                measured = CookHLA_measureAcc(_answer,
                                              self.HLA_IMPUTATION_OUT,
                                              self.HLA_IMPUTATION_OUT)
                self.accuracy = measured.accuracy

                measureAcc_end = time()

                elapsed_min = (measureAcc_end - measureAcc_start) / 60
                print("\nAccuracy : {}".format(self.accuracy))
                print("measureAccuracy time: {}(min)\n".format(elapsed_min))

        ###### < Clean-up > ######

        # Reference panel files are removed regardless of the
        # save-intermediates setting.
        for stale in (ref_phased_vcf,
                      self.OUTPUT_dir_ref + '.GCchange.bgl.phased',
                      self.OUTPUT_dir_ref + '.GCchange.markers'):
            RUN_Bash('rm {}'.format(stale))

        if self.FLAG_AdaptiveGeneticMap:
            RUN_Bash('rm {}'.format(self.refined_Genetic_Map))
Example #6
0
    def CONVERT_IN(self, MHC, _reference, _out, _hg, _Genetic_Map):
        """Prepare target and reference data as VCF input for the imputation.

        The steps, in order:
          1. Run ``self.LINKAGE2BEAGLE`` on the target '.QC.nopheno.ped' and
             '.QC.dat' to get a Beagle file.
          2. Refine the reference marker positions with ``redefineBP`` (the
             result is kept in ``self.refined_REF_markers``).
          3. Build the target marker list, filter it with an R script and
             subset the target panel with ``Panel_Subset``.
          4. Apply ``Bgl2GC`` to target and reference (the target Beagle file
             is kept in ``self.GCchangeBGL``).
          5. Convert both to VCF with ``self.BEAGLE2VCF`` and replace every
             '/' with '|' in the reference VCF.
          6. If ``self.FLAG_AdaptiveGeneticMap`` is truthy, build the refined
             genetic map and store its path in ``self.refined_Genetic_Map``.

        Args:
            MHC: Target data prefix.
            _reference: Reference panel prefix.
            _out: Output prefix (used for the '.bgl.log' file).
            _hg: Accepted but not used by this method.
            _Genetic_Map: Genetic map file used in step 6.

        Returns:
            A two-item list: ``MHC + '.QC.vcf'`` and
            ``self.OUTPUT_dir_ref + '.phased.vcf'``.

        Exits the process if the refined genetic map file was not created.
        """

        print("[{}] Converting data to beagle format.".format(
            self.idx_process))
        self.idx_process += 1

        discard_intermediates = not self.__save_intermediates
        ref_prefix = self.OUTPUT_dir_ref
        qc_prefix = MHC + '.QC'

        RUN_Bash(self.LINKAGE2BEAGLE +
                 ' pedigree={} data={} beagle={} standard=true > {}'.format(
                     qc_prefix + '.nopheno.ped', qc_prefix + '.dat',
                     qc_prefix + '.bgl', _out + '.bgl.log'))

        ### Converting data to reference_markers_Position (Dispersing same genomic position of some markers.)

        from src.redefineBPv1BH import redefineBP

        refined_ref_markers = redefineBP(_reference + '.markers',
                                         ref_prefix + '.refined.markers')
        # => This will be used in 'CONVERT_OUT'.
        self.refined_REF_markers = refined_ref_markers

        ### Converting data to target_markers_Position and extract not_including snp.

        target_markers = qc_prefix + '.markers'
        pre_markers = qc_prefix + '.pre.markers'

        RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' %
                 (qc_prefix + '.bim', target_markers))

        RUN_Bash(
            'Rscript src/excluding_snp_and_refine_target_position-v1COOK02222017.R {} {} {}'
            .format(target_markers, refined_ref_markers, pre_markers))
        if discard_intermediates:
            os.system(' '.join(['rm', target_markers]))

        pre_bgl = qc_prefix + '.pre.bgl.phased'
        RUN_Bash('mv {} {}'.format(qc_prefix + '.bgl', pre_bgl))

        selected_snps = join(self.OUTPUT_dir, 'selected_snp.txt')
        RUN_Bash("awk '{print $1}' %s > %s" % (pre_markers, selected_snps))

        from src.Panel_subset import Panel_Subset
        Panel_Subset(qc_prefix + '.pre', 'all', selected_snps,
                     qc_prefix + '.refined')

        if discard_intermediates:
            for leftover in (pre_bgl, pre_markers, selected_snps):
                RUN_Bash('rm {}'.format(leftover))

        ### Converting data to GC_change_beagle format.

        from src.bgl2GC_trick_bgl import Bgl2GC

        refined_bgl = qc_prefix + '.refined.bgl.phased'
        refined_markers = qc_prefix + '.refined.markers'

        # target
        target_gc_bgl, target_gc_markers = Bgl2GC(
            refined_bgl, refined_markers, qc_prefix + '.GCchange.bgl',
            qc_prefix + '.GCchange.markers')

        # It will be used in 'CONVERT_OUT'.
        self.GCchangeBGL = target_gc_bgl

        # reference
        ref_gc_bgl, ref_gc_markers = Bgl2GC(
            _reference + '.bgl.phased', refined_ref_markers,
            ref_prefix + '.GCchange.bgl.phased',
            ref_prefix + '.GCchange.markers')

        if discard_intermediates:
            # The refined reference markers stay; 'CONVERT_OUT' needs them.
            for leftover in (refined_bgl, refined_markers):
                RUN_Bash('rm {}'.format(leftover))

        ### Converting data to vcf_format

        # target
        target_vcf = qc_prefix + '.vcf'
        RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
            target_gc_markers, target_gc_bgl, target_vcf))

        # reference
        reference_vcf = ref_prefix + '.vcf'
        RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
            ref_gc_markers, ref_gc_bgl, reference_vcf))

        ### Converting data to reference_phased

        ref_phased_vcf = ref_prefix + '.phased.vcf'
        RUN_Bash('sed "s%/%|%g" {} > {}'.format(reference_vcf,
                                                ref_phased_vcf))

        if discard_intermediates:
            RUN_Bash('rm {}'.format(reference_vcf))

        # `target_vcf` and `ref_phased_vcf` are the two files handed to the
        # Beagle step.

        if self.FLAG_AdaptiveGeneticMap:

            ############### < Adaptive Genetic Map > ###############
            # Shell equivalent:
            #   awk '{print $1" "$2" "$3}' $geneticMap > $geneticMap.first
            #   awk '{print $2}' $REFERENCE.GCchange.markers > $geneticMap.second
            #   paste -d " " $geneticMap.first $geneticMap.second > $geneticMap.refined.map
            #   rm $geneticMap.first
            #   rm $geneticMap.second

            gm_prefix = self.OUTPUT_dir_GM
            gm_first = gm_prefix + '.first'
            gm_second = gm_prefix + '.second'
            refined_map = gm_prefix + '.refined.map'

            RUN_Bash('awk \'{print $1" "$2" "$3}\' %s > %s' %
                     (_Genetic_Map, gm_first))
            RUN_Bash('awk \'{print $2}\' %s > %s' %
                     (ref_gc_markers, gm_second))
            # Binding the columns like this assumes that the *.first and
            # *.second files have the same number of rows.
            RUN_Bash('paste -d " " {} {} > {}'.format(gm_first, gm_second,
                                                      refined_map))

            if not os.path.exists(refined_map):
                print(std_ERROR_MAIN_PROCESS_NAME +
                      "Failed to generate Refined Genetic Map.")
                sys.exit()

            self.refined_Genetic_Map = refined_map

            if discard_intermediates:
                os.system('rm {}'.format(gm_first))
                os.system('rm {}'.format(gm_second))

        return [target_vcf, ref_phased_vcf]
Example #7
0
    def Make_ExonN_Panel(self, _exonN, _EXON234_Panel, _out):
        """Derive a single-exon reference panel from the exon 2/3/4 panel.

        (1) Make a regular expression to nominate HLA allele binary marker of exon 2, 3, 4
        (2) To make exon 2 panel, for example, subset out exon 3, 4 HLA allele binary markers. Maybe with Plink('--recode').
        (3) With subsetted *.ped and *.map files, make a *.nopheno and *.markers files.
        (4) Feed them to LINKAGE2BEAGLE => Beagle file generated.
        (5) Get a frequency file.

        After that, the new panel is passed through ``Bgl2GC`` and
        ``self.BEAGLE2VCF``, and every '/' in the resulting VCF is replaced
        with '|' to give ``_out + '.phased.vcf'``.

        Args:
            _exonN: Key into ``p_ExonN`` selecting the exon pattern to keep.
            _EXON234_Panel: Prefix of the input panel ('.bgl.phased' and
                '.markers' are read).
            _out: Prefix of every file written.

        Returns:
            ``_out``, unchanged.
        """

        ### Subsetting `_exonN` HLA allele binary markers and SNP markers. ((1) + (2) above.)

        # Generated exonN Panel.
        bgl_exonN = _out + '.bgl.phased'
        markers_exonN = _out + '.markers'

        # All four files are managed by one `with` so they are closed even if
        # an exception interrupts the loop. They are opened in the same order
        # as before: both inputs first, then both outputs.
        with open(_EXON234_Panel + '.bgl.phased', 'r') as f_bgl_exon234, \
                open(_EXON234_Panel + '.markers', 'r') as f_markers_exon234, \
                open(bgl_exonN, 'w') as f_out_bgl, \
                open(markers_exonN, 'w') as f_out_markers:

            for line_bgl_exon234 in f_bgl_exon234:

                # NOTE(review): if the pattern does not match a line, `m` is
                # None and `.group` raises AttributeError; confirm whether
                # such lines can occur in the input.
                m = p_1st_two_columns.match(line_bgl_exon234)
                row_type, marker_id = m.group(1), m.group(2)

                if row_type != 'M':
                    # Header part of *.bgl file ('P pedigree', 'I id',
                    # 'fID father', 'mID mother', 'C gender', etc.):
                    # just forward the line to the output file.
                    f_out_bgl.write(line_bgl_exon234)
                    continue

                # Main body ('M   rs969931', 'M  rs2745406', ...): one line
                # of the '*.markers' file is consumed per 'M' row.
                line_markers_exon234 = f_markers_exon234.readline()

                # Normal SNP markers are always forwarded; 'HLA_' markers
                # only when they match the requested exon pattern.
                if (not marker_id.startswith('HLA_')
                        or p_ExonN[_exonN].match(marker_id)):
                    f_out_bgl.write(line_bgl_exon234)
                    f_out_markers.write(line_markers_exon234)

        ### With those subsetted *.bgl and *.markers files, generate *.ped and *.map. ((3) + (4) above.)

        command = 'cat {} | {} {}'.format(
            bgl_exonN, self.BEAGLE2LINKAGE, _out + ".STEP4_tmp"
        )  # *.ped, *.dat (cf. 'java -jar' is included in 'BEAGLE2LINKAGE'.)
        if not os.system(command):
            # Remove
            if not self.__save_intermediates:
                os.system('rm {}'.format(
                    _out + ".STEP4_tmp.dat"))  # *.dat file is unnecessary.

        command = 'cut -d \' \' -f-5 {} > {}'.format(
            _out + ".STEP4_tmp.ped", _out +
            ".STEP4_tmp.ped.left")  # ['FID', 'IID', 'PID', 'MID', 'Sex']
        os.system(command)

        command = 'cut -d \' \' -f6- {} > {}'.format(
            _out + ".STEP4_tmp.ped",
            _out + ".STEP4_tmp.ped.right")  # genotype information part.
        os.system(command)

        # The /dev/null operands contribute empty fields, so the delimiter
        # list ' -9 ' ends up as literal text between the left and right part.
        command = 'paste -d \' -9 \' {} /dev/null /dev/null /dev/null {} > {}'.format(
            _out + ".STEP4_tmp.ped.left", _out + ".STEP4_tmp.ped.right",
            _out + ".ped")
        if not os.system(command):
            # Remove
            if not self.__save_intermediates:
                os.system('rm {}'.format(_out + ".STEP4_tmp.ped"))
                os.system('rm {}'.format(_out + ".STEP4_tmp.ped.left"))
                os.system('rm {}'.format(_out + ".STEP4_tmp.ped.right"))

        # (1) rsid, (2) bp, (3) allele1
        os.system(' '.join([
            "cut -d \' \' -f1", _out + ".markers", ">",
            _out + ".STEP4_map.rsid"
        ]))

        os.system(' '.join([
            "cut -d \' \' -f2", _out + ".markers", ">", _out + ".STEP4_map.bp"
        ]))

        os.system(' '.join([
            "cut -d \' \' -f3", _out + ".markers", ">",
            _out + ".STEP4_map.allele1"
        ]))

        # Same /dev/null trick as above, with the delimiter list '6  0 '.
        os.system(' '.join([
            "paste -d \'6  0 \'", "/dev/null", "/dev/null",
            _out + ".STEP4_map.rsid", "/dev/null", "/dev/null",
            _out + ".STEP4_map.bp", ">", _out + ".map"
        ]))

        # (2019. 07. 09.) The right-hand column of '*.refallele' is
        # '.STEP4_map.allele1', not '.STEP4_map.bp'.
        os.system(' '.join([
            "paste -d \' \'", _out + ".STEP4_map.rsid",
            _out + ".STEP4_map.allele1", ">", _out + ".refallele"
        ]))

        # bed, bim, fam files.
        command = ' '.join([
            self.PLINK,
            '--ped {} --map {} --make-bed --reference-allele {} --out {}'.
            format(_out + ".ped", _out + ".map", _out + ".refallele", _out)
        ])
        if not os.system(command):
            # Remove
            if not self.__save_intermediates:
                for suffix in (".STEP4_map.rsid", ".STEP4_map.bp",
                               ".STEP4_map.allele1", ".ped", ".map", ".log",
                               ".refallele"):
                    os.system('rm {}'.format(_out + suffix))

        # Allele Frequency file(*.frq)
        command = ' '.join([
            self.PLINK,
            '--bfile {} --keep-allele-order --freq --out {}'.format(
                _out, _out + ".FRQ")
        ])
        if not os.system(command):
            # Remove
            if not self.__save_intermediates:
                os.system('rm {}'.format(_out + ".FRQ.log"))

        # Prephasing : Exon234 panel is used.
        # Each imputation : Exon2,3,4 panel, each.
        #
        # For each Exon 2,3,4 panel to be used in Beagle Imputation,
        # preprocessing must be done to them (ex. GC change).
        # (cf. redefining BP is already done in 'Make_EXON234_Panel.py', so it
        # won't be done here.)
        #
        # Below code is originally 'CONVERT_IN' part.

        # reference
        GCchangeBGL_REF, GCchangeMarkers_REF = Bgl2GC(
            bgl_exonN, markers_exonN, _out + '.GCchange.bgl.phased',
            _out + '.GCchange.markers')

        reference_vcf = _out + '.vcf'

        RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
            GCchangeMarkers_REF, GCchangeBGL_REF, reference_vcf))

        ### Converting data to reference_phased

        RUN_Bash('sed "s%/%|%g" {} > {}'.format(reference_vcf,
                                                _out + '.phased.vcf'))

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(reference_vcf))

        return _out
Example #8
0
def MakeGeneticMap(
        _input,
        _reference,
        _out,
        _p_src="/scratch3/users/nanje/MHC-Imputation-Accuracy/templates/MakeGeneticMap",
        _p_dependency="./dependency",
        __save_intermediates=False):
    """Build a genetic map and an average error-rate file by running MaCH.

    Stages (each one guarded by a local control flag):

    1. RANDOM            - rewrite the fam file(s) with columns 3 and 4 set to
                           0 and pass them to the RANDOMIZE_FAM R script,
                           which writes '<prefix>.rearranged.fam'.
    2. SUBSET_BGL        - take 100-sample subsets, convert the target subset
                           to a Beagle file through PLINK + linkage2beagle,
                           subset the reference with ``Panel_Subset`` and run
                           the 'bgl2GC_trick_bgl' R script on both.
    3. MAKING_MACH_INPUT - turn the GCchange files into MaCH input
                           (dat / ped / haps / snps).
    4. RUNMACH           - run mach1 ('--rounds 20 --greedy').
    5. BUILDING_MAP      - run 'STEP4-buildMap.R' and 'STEP5-collapseHLA.R'.

    Small sample mode: when the target has fewer than 100 samples, both
    subsets are drawn from the reference panel (rows 101-200 of the rearranged
    reference fam stand in for the target, rows 1-100 are the reference
    subset). This requires at least 200 reference samples.

    Args:
        _input: PLINK file prefix of the target data ('.fam' etc. are read).
        _reference: PLINK/Beagle file prefix of the reference panel.
        _out: Output prefix. Its directory is created if missing and is also
            where all intermediate files are written.
        _p_src: Directory holding the R / shell helper scripts.
        _p_dependency: Not used by this function; kept so existing callers
            keep working.
        __save_intermediates: If true, no intermediate file is deleted.
            With the default (False) every scratch file is removed.

    Returns:
        ``(_out + '.mach_step.avg.clpsB', _out + '.aver.erate')`` when both
        files exist at the end, otherwise ``(-1, -1)``.

    Raises:
        SystemExit: if ``_out`` is empty, or if small sample mode is needed
            but the reference panel has fewer than 200 samples.
    """

    def _discard(*_paths):
        # Single choke point for deleting scratch files, so that
        # '__save_intermediates' is honoured by every stage.
        if __save_intermediates:
            return
        for _path in _paths:
            RUN_Bash('rm {}'.format(_path))

    # Sample N check
    N_sample_target = getSampleNumbers(_input + '.fam')
    f_SmallSampleMode = N_sample_target < 100

    N_sample_reference = getSampleNumbers(_reference + '.fam')
    if f_SmallSampleMode and (N_sample_reference < 200):
        print(
            std_ERROR_MAIN_PROCESS_NAME +
            "If Target data has less than 100 samples, Reference panel must have at least 200 samples."
        )
        sys.exit()

    _p_plink = "/users/nanje/miniconda3/bin/plink"
    _p_linkage2beagle = "/usr/local/bin/linkage2beagle.jar"
    _p_beagle2linkage = "/usr/local/bin/beagle2linkage.jar"
    _p_transpose = "/usr/local/bin/transpose.jar"
    _p_mach = "/usr/local/bin/mach1"

    PLINK = "{} --noweb --silent --allow-no-sex".format(_p_plink)
    LINKAGE2BEAGLE = 'java -jar {}'.format(_p_linkage2beagle)
    RANDOMIZE_FAM = 'Rscript {}/STEP0_randomize_the_sample_about_fam_03_06_2017-COOK-V1.R'.format(
        _p_src)
    BGL2GC_TRICK_BGL = 'Rscript {}/bgl2GC_trick_bgl-v1.1COOK-02222017.R'.format(
        _p_src)
    BGL2BED = "{}/Panel-BGL2BED.sh".format(_p_src)
    STEP4_buildMap = "Rscript {}/STEP4-buildMap.R".format(_p_src)
    STEP5_collapseHLA = "Rscript {}/STEP5-collapseHLA.R".format(_p_src)

    # awk programs contain braces, hence '%' formatting instead of str.format.
    AWK_TRICK_FAM = 'awk \'{print $1" "$2" ""0"" ""0"" "$5" "$6}\' %s > %s'
    AWK_FIRST_100 = 'head -100 %s | awk \'{print $1" "$2}\' > %s'
    AWK_SECOND_100 = 'head -200 %s | tail -n 100 | awk \'{print $1" "$2}\' > %s'

    # Intermediate path.
    if not _out:
        print(std_ERROR_MAIN_PROCESS_NAME +
              'The argument "{0}" has not been given. Please check it again.\n'
              .format("--out"))
        sys.exit()

    _out = _out.rstrip('/')
    OUTPUT_dir = os.path.dirname(_out)
    if OUTPUT_dir:
        os.makedirs(OUTPUT_dir, exist_ok=True)

    OUTPUT_INPUT = os.path.join(
        OUTPUT_dir, os.path.basename(_input))  # Generated in output folder
    OUTPUT_REF = os.path.join(OUTPUT_dir, os.path.basename(_reference))

    INPUT_SUBSET = OUTPUT_INPUT + '.subset'
    REF_SUBSET = OUTPUT_REF + '.subset'
    INPUT_GC = INPUT_SUBSET + '.GCchange'
    REF_GC = REF_SUBSET + '.GCchange'

    ###### < Control Flags > ######

    RANDOM = 1
    SUBSET_BGL = 1
    MAKING_MACH_INPUT = 1
    RUNMACH = 1
    BUILDING_MAP = 1
    Cleanup = 1

    if f_SmallSampleMode:
        # Only the 'RANDOM' and 'SUBSET_BGL' stages differ in this mode.
        print(std_MAIN_PROCESS_NAME + "Generating AGM with Small Samples.")

        # Only the reference fam file is randomized; the target subset is
        # taken from the reference as well.
        fam_sources = [(_reference, OUTPUT_REF)]
        subset_bfile = _reference
    else:
        fam_sources = [(_input, OUTPUT_INPUT), (_reference, OUTPUT_REF)]
        subset_bfile = _input

    if RANDOM:

        for _src, _dst in fam_sources:
            RUN_Bash(AWK_TRICK_FAM % (_src + '.fam', _dst + '.trick.fam'))

        for _src, _dst in fam_sources:
            RUN_Bash(RANDOMIZE_FAM +
                     ' {} {}'.format(_dst + '.trick.fam',
                                     _dst + '.rearranged.fam'))

        _discard(*[_dst + '.trick.fam' for _src, _dst in fam_sources])

    if SUBSET_BGL:

        # Sample lists (FID, IID) for the target and the reference subsets.
        if f_SmallSampleMode:
            RUN_Bash(AWK_SECOND_100 % (OUTPUT_REF + '.rearranged.fam',
                                       INPUT_SUBSET + '.samples'))
        else:
            RUN_Bash(AWK_FIRST_100 % (OUTPUT_INPUT + '.rearranged.fam',
                                      INPUT_SUBSET + '.samples'))
        RUN_Bash(AWK_FIRST_100 % (OUTPUT_REF + '.rearranged.fam',
                                  REF_SUBSET + '.samples'))

        # Target subset: PLINK ped/map and bed/bim/fam.
        RUN_Bash(PLINK + ' --bfile {} --keep {} --recode --out {}'.format(
            subset_bfile, INPUT_SUBSET + '.samples', INPUT_SUBSET))
        RUN_Bash(PLINK + ' --bfile {} --keep {} --make-bed --out {}'.format(
            subset_bfile, INPUT_SUBSET + '.samples', INPUT_SUBSET))

        # Target subset: ped without the phenotype column + dat -> Beagle.
        RUN_Bash("cut -d ' ' -f1-5,7- {} > {}".format(
            INPUT_SUBSET + '.ped', INPUT_SUBSET + '.nopheno.ped'))
        RUN_Bash('awk \'{print "M " $2}\' %s > %s' %
                 (INPUT_SUBSET + '.map', INPUT_SUBSET + '.dat'))

        RUN_Bash(LINKAGE2BEAGLE + ' {} {} > {}'.format(
            INPUT_SUBSET + '.dat', INPUT_SUBSET + '.nopheno.ped',
            INPUT_SUBSET + '.bgl.phased'))

        RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' %
                 (INPUT_SUBSET + '.bim', INPUT_SUBSET + '.markers'))

        # Reference subset.
        Panel_Subset(_reference, REF_SUBSET + '.samples', 'all', REF_SUBSET)

        # GC trick on the target subset first, then on the reference subset.
        for _subset in (INPUT_SUBSET, REF_SUBSET):
            RUN_Bash(BGL2GC_TRICK_BGL + ' {} {} {} {}'.format(
                _subset + '.bgl.phased', _subset + '.markers',
                _subset + '.GCchange.bgl.phased',
                _subset + '.GCchange.markers'))

        if not f_SmallSampleMode:
            # In small sample mode this file is never created.
            _discard(OUTPUT_INPUT + '.rearranged.fam')
        _discard(INPUT_SUBSET + '.samples',
                 OUTPUT_REF + '.rearranged.fam',
                 REF_SUBSET + '.samples',
                 INPUT_SUBSET + '.bgl.phased',
                 INPUT_SUBSET + '.markers',
                 REF_SUBSET + '.bgl.phased',
                 REF_SUBSET + '.markers')

    if MAKING_MACH_INPUT:

        RUN_Bash('bash {} {} {} {} {}'.format(
            BGL2BED, INPUT_GC, INPUT_GC, _p_beagle2linkage, _p_plink))
        RUN_Bash(PLINK + ' --bfile {} --recode --out {}'.format(
            INPUT_GC, INPUT_GC))

        RUN_Bash('awk \'{print "M", $2}\' %s > %s' %
                 (INPUT_GC + '.map', INPUT_GC + '.dat'))  # -d
        RUN_Bash('cut -d " " -f1-5,7- %s > %s' %
                 (INPUT_GC + '.ped', INPUT_GC + '.nophe.ped'))  # -p

        RUN_Bash('cat {} | java -jar {} > {}'.format(
            REF_GC + '.bgl.phased', _p_transpose,
            REF_GC + '.bgl.phased.tr'))
        RUN_Bash('cut -d " " -f1,2,6- %s | tail -n+3 > %s' %
                 (REF_GC + '.bgl.phased.tr', REF_GC + '.haps'))  # -h
        RUN_Bash('cut -d " " -f1 %s > %s' %
                 (REF_GC + '.markers', REF_GC + '.haps.snps'))  # -s

    if RUNMACH:

        RUN_Bash(_p_mach +
                 ' -d {} -p {} -h {} -s {} --rounds 20 --greedy --prefix {}'.
                 format(INPUT_GC + '.dat', INPUT_GC + '.nophe.ped',
                        REF_GC + '.haps', REF_GC + '.haps.snps',
                        _out + '.mach_step'))

    if BUILDING_MAP:

        RUN_Bash(STEP4_buildMap + ' {} {} {} {} {} {}'.format(
            _out + '.mach_step.erate', _out + '.mach_step.rec',
            REF_GC + '.markers', _out + '.mach_step.gmap.avg',
            _out + '.mach_step.gmap.last', _out + '.aver.erate'))

        RUN_Bash(STEP5_collapseHLA +
                 ' {} {} {}'.format(_out + '.mach_step.gmap.avg',
                                    _out + '.mach_step.avg.clpsA',
                                    _out + '.mach_step.avg.clpsB'))

    # Final output check (done before cleanup; only these two files are
    # handed back to the caller).
    Flag_OUTPUT = True

    for _final in (_out + '.aver.erate', _out + '.mach_step.avg.clpsB'):
        if not os.path.exists(_final):
            print(std_WARNING_MAIN_PROCESS_NAME +
                  "'{}' wasn't created.".format(_final))
            Flag_OUTPUT = False

    if Cleanup:

        # The '*' is left for the shell to expand.
        _discard(INPUT_SUBSET + '.*',
                 REF_SUBSET + '.*',
                 _out + '.mach_step.avg.clpsA',
                 _out + '.mach_step.erate',
                 _out + '.mach_step.gmap.avg',
                 _out + '.mach_step.gmap.last',
                 _out + '.mach_step.rec')

    if Flag_OUTPUT:
        return (_out + '.mach_step.avg.clpsB', _out + '.aver.erate')
    else:
        return (-1, -1)
Example #9
0
    def __init__(self,
                 idx_process,
                 MHC,
                 _reference,
                 _out,
                 _hg,
                 _nthreads,
                 _AdaptiveGeneticMap,
                 _Average_Erate,
                 _LINKAGE2BEAGLE,
                 _BEAGLE2LINKAGE,
                 _BEAGLE2VCF,
                 _VCF2BEAGLE,
                 _PLINK,
                 _BEAGLE4,
                 _CSH,
                 _answer=None,
                 f_save_intermediates=False,
                 _MultP=1,
                 _given_prephased=None,
                 f_prephasing=False,
                 f_remove_raw_IMP_results=False,
                 f_measureAcc_v2=False):
        """Run the whole imputation pipeline while constructing the object.

        Steps, in order:

        1. Build the exon 2/3/4 reference panels (and, when requested, their
           genetic maps) with ``HLA_MultipleRefs``.
        2. ``CONVERT_IN`` once, using the Exon234 panel.
        3. ``IMPUTE`` for every (exon, overlap) pair; serially when
           ``_MultP == 1``, otherwise in a process pool of at most 9 workers.
        4. ``CONVERT_OUT`` to produce ``self.HLA_IMPUTATION_OUT``.
        5. Optionally measure accuracy against ``_answer``.
        6. Remove intermediate files unless ``f_save_intermediates`` is set.

        Args:
            idx_process: Running step index used in progress messages.
            MHC: Prefix used for the imputation output file names.
            _reference: Reference panel prefix.
            _out: Output prefix; its directory receives the generated panels.
            _hg: Passed through to ``HLA_MultipleRefs`` and ``CONVERT_IN``.
            _nthreads: Passed through to ``IMPUTE``.
            _AdaptiveGeneticMap: Adaptive genetic map file, or a falsy value.
            _Average_Erate: Average error-rate file, or a falsy value. The
                adaptive genetic map is used only when both are given.
            _LINKAGE2BEAGLE, _BEAGLE2LINKAGE, _BEAGLE2VCF, _VCF2BEAGLE,
            _PLINK, _BEAGLE4: Commands of the external tools.
            _CSH: Passed through to ``CONVERT_OUT``.
            _answer: Optional answer file for accuracy measurement. A missing
                or empty file only produces a warning.
            f_save_intermediates: Keep intermediate files when true.
            _MultP: Number of processes requested for imputation.
            _given_prephased: Passed through to ``CONVERT_IN``.
            f_prephasing: Passed through to ``CONVERT_IN``, ``IMPUTE`` and
                ``CONVERT_OUT``.
            f_remove_raw_IMP_results: Also delete each raw imputation result
                and its log during cleanup.
            f_measureAcc_v2: Use ``measureAccuracy`` instead of
                ``CookHLA_measureAcc``.
        """

        ### General
        self.idx_process = idx_process
        self.__save_intermediates = f_save_intermediates

        self.FLAG_AdaptiveGeneticMap = _AdaptiveGeneticMap and _Average_Erate  # (***) Deciding whether to use Adaptive genetic map or not.

        # Prefixes
        self.OUTPUT_dir = os.path.dirname(_out)
        self.OUTPUT_dir_ref = join(self.OUTPUT_dir,
                                   os.path.basename(_reference))
        self.OUTPUT_dir_GM = join(
            self.OUTPUT_dir, os.path.basename(
                _AdaptiveGeneticMap)) if self.FLAG_AdaptiveGeneticMap else None

        # Result
        self.Exon234_Panel = None
        self.dict_ExonN_Panel = {_exonN: None for _exonN in __EXON__}
        self.dict_ExonN_AGM = {_exonN: None for _exonN in __EXON__}
        self.dict_IMP_Result = {
            _exonN: {_overlap: None
                     for _overlap in __overlap__}
            for _exonN in __EXON__
        }
        self.accuracy = None
        self.HLA_IMPUTATION_OUT = None

        self.dict_DOUBLED_PHASED_RESULT = {_exonN: None for _exonN in __EXON__}
        self.dict_REF_PHASED_VCF = {_exonN: None for _exonN in __EXON__}

        # Dependencies
        self.LINKAGE2BEAGLE = _LINKAGE2BEAGLE
        self.BEAGLE2LINKAGE = _BEAGLE2LINKAGE
        self.BEAGLE2VCF = _BEAGLE2VCF
        self.VCF2BEAGLE = _VCF2BEAGLE
        self.PLINK = _PLINK
        self.BEAGLE4 = _BEAGLE4

        # Adaptive Genetic Map
        self.__AGM__ = _AdaptiveGeneticMap if self.FLAG_AdaptiveGeneticMap else None
        self.__AVER__ = _Average_Erate if self.FLAG_AdaptiveGeneticMap else None

        ###### < Reference panel for Exon 2, 3, 4 > ######

        multiple_panels = HLA_MultipleRefs(_reference,
                                           self.OUTPUT_dir_ref,
                                           _hg,
                                           self.BEAGLE2LINKAGE,
                                           self.BEAGLE2VCF,
                                           self.PLINK,
                                           _MultP=_MultP,
                                           __AGM__=self.__AGM__,
                                           _out_AGM=self.OUTPUT_dir_GM)

        self.dict_ExonN_Panel = multiple_panels.ExonN_Panel
        self.Exon234_Panel = multiple_panels.EXON234_Panel

        self.dict_ExonN_AGM = multiple_panels.ExonN_AGM if self.FLAG_AdaptiveGeneticMap else {
            _exonN: None
            for _exonN in __EXON__
        }

        ###### < Main - 'CONVERT_IN', 'IMPUTE', 'CONVERT_OUT' > ######

        ### (1) CONVERT_IN

        IMPUTATION_INPUT = self.CONVERT_IN(MHC,
                                           self.Exon234_Panel,
                                           _out,
                                           _hg,
                                           _given_prephased=_given_prephased,
                                           f_prephasing=f_prephasing)
        # Only one time of pre-phasing with Exon234 reference panel.

        ### (2) Imputation

        if _MultP == 1:

            imputation_serial_start = time()

            ## Serial implementation of main.
            for _exonN in __EXON__:
                for _overlap in __overlap__:

                    self.dict_IMP_Result[_exonN][_overlap] = \
                        self.IMPUTE(MHC, _out, IMPUTATION_INPUT, self.dict_ExonN_Panel[_exonN] + '.phased.vcf',
                                    _overlap, _exonN, _nthreads, self.__AVER__, self.dict_ExonN_AGM[_exonN], f_prephasing=f_prephasing)

            imputation_serial_end = time()

            imputation_serial_time = (imputation_serial_end -
                                      imputation_serial_start) / 60
            print(std_MAIN_PROCESS_NAME +
                  "Total imputation time of Serial implementation: {}(min)\n".
                  format(imputation_serial_time))

        else:

            ## Parallel implementation of main.

            imputation_parallel_start = time()

            # Never start more than 9 worker processes.
            pool = mp.Pool(processes=min(_MultP, 9))

            dict_Pool = {
                _exonN: {
                    _overlap: pool.apply_async(
                        self.IMPUTE,
                        (MHC, _out, IMPUTATION_INPUT,
                         self.dict_ExonN_Panel[_exonN] + '.phased.vcf',
                         _overlap, _exonN, _nthreads, self.__AVER__,
                         self.dict_ExonN_AGM[_exonN], f_prephasing))
                    for _overlap in __overlap__
                }
                for _exonN in __EXON__
            }

            pool.close()
            pool.join()

            # '.get()' re-raises any exception raised inside a worker.
            for _exonN in __EXON__:
                for _overlap in __overlap__:
                    self.dict_IMP_Result[_exonN][_overlap] = dict_Pool[_exonN][
                        _overlap].get()

            imputation_parallel_end = time()

            imputation_parallel_time = (imputation_parallel_end -
                                        imputation_parallel_start) / 60
            print(
                std_MAIN_PROCESS_NAME +
                "Total imputation time of Parallel implementation (with {} core(s)): {}(min)\n"
                .format(_MultP, imputation_parallel_time))

        self.idx_process += 1

        ### (3) CONVERT_OUT

        self.HLA_IMPUTATION_OUT = self.CONVERT_OUT(self.dict_IMP_Result,
                                                   MHC + '.HLA_IMPUTATION_OUT',
                                                   _CSH,
                                                   f_prephasing=f_prephasing)
        print(std_MAIN_PROCESS_NAME +
              'IMPUTATION_OUT:\n{}'.format(self.HLA_IMPUTATION_OUT))

        ## Acquiring accuracy

        if _answer:

            print(std_MAIN_PROCESS_NAME +
                  "Calculating accuracy of each HLA gene. (answer: '{}')".
                  format(_answer))

            if not os.path.exists(_answer):
                print(
                    std_WARNING_MAIN_PROCESS_NAME +
                    "Given answer file doesn't exist. Please check '--answer/-an' argument again.\n"
                    "Skipping calculating imputation accuracy.")
            elif os.path.getsize(_answer) == 0:
                print(
                    std_WARNING_MAIN_PROCESS_NAME +
                    "Given answer file doesn't have any content. Please check '--answer/-an' argument again.\n"
                    "Skipping calculating imputation accuracy.")

            else:
                if f_measureAcc_v2:
                    # measureAcc_v2
                    self.accuracy = measureAccuracy(
                        _answer,
                        self.HLA_IMPUTATION_OUT,
                        'all',
                        outfile=self.HLA_IMPUTATION_OUT + '.accuracy',
                        __only4digits=True)

                else:
                    # measureAcc_v3.5
                    measureAcc_start = time()

                    t = CookHLA_measureAcc(_answer, self.HLA_IMPUTATION_OUT,
                                           self.HLA_IMPUTATION_OUT)
                    self.accuracy = t.accuracy

                    measureAcc_end = time()

                    measureAcc_time = (measureAcc_end - measureAcc_start) / 60
                    print("\nAccuracy : {}".format(self.accuracy))
                    print("measureAccuracy time: {}(min)\n".format(
                        measureAcc_time))

        ### General Removal
        if not self.__save_intermediates:

            # File suffixes shared by the Exon234 panel and each ExonN panel.
            _panel_suffixes = ('.bed', '.bim', '.fam', '.FRQ.frq', '.markers',
                               '.bgl.phased', '.GCchange.markers',
                               '.GCchange.bgl.phased', '.phased.vcf')

            # 'Exon234 panel'
            for _suffix in _panel_suffixes:
                RUN_Bash('rm {}'.format(self.Exon234_Panel + _suffix))
            RUN_Bash(
                'rm {}'.format(self.Exon234_Panel +
                               '.refined.markers'))  # only in Exon234 panel

            # 'Exon 2,3,4 panel'
            for _exonN in __EXON__:
                for _suffix in _panel_suffixes:
                    RUN_Bash('rm {}'.format(self.dict_ExonN_Panel[_exonN] +
                                            _suffix))

            # 'Exon 2,3,4 AGM'
            # Without an adaptive genetic map these entries are None;
            # skip them instead of running 'rm None'.
            _agm_files = [multiple_panels.EXON234_AGM]
            _agm_files.extend(self.dict_ExonN_AGM[_exonN]
                              for _exonN in __EXON__)
            for _agm in _agm_files:
                if _agm:
                    RUN_Bash('rm {}'.format(_agm))

            # 'CONVERT_OUT'
            for _exonN in __EXON__:
                for _overlap in __overlap__:
                    _raw_result = self.dict_IMP_Result[_exonN][_overlap]

                    for _hla in HLA_names:
                        RUN_Bash('rm {}'.format(_raw_result +
                                                '.HLA_{}'.format(_hla)))

                    # Once per result (not once per HLA gene): the raw
                    # result and its log exist only once.
                    if f_remove_raw_IMP_results:
                        RUN_Bash('rm {}'.format(_raw_result))

                        # Drop a trailing '.vcf' as a suffix; str.rstrip()
                        # would strip a *set of characters* instead.
                        if _raw_result.endswith('.vcf'):
                            _log_prefix = _raw_result[:-len('.vcf')]
                        else:
                            _log_prefix = _raw_result
                        RUN_Bash('rm {}'.format(_log_prefix + '.log'))
Example #10
0
    def IMPUTE(self,
               MHC,
               _out,
               _IMPUTATION_INPUT,
               _REF_PHASED_VCF,
               _overlap,
               _exonN,
               _nthreads,
               _aver_erate,
               _Refined_Genetic_Map,
               f_prephasing=False):

        if os.path.getsize(_IMPUTATION_INPUT) == 0:
            print(
                std_ERROR_MAIN_PROCESS_NAME +
                "Input file for imputation('{}') contains nothing. Please check it again."
                .format(_IMPUTATION_INPUT))
            sys.exit()

        # print("[{}] Performing HLA imputation (see {}.MHC.QC.imputation_out.log for progress).".format(self.idx_process, _out))
        print("\n[{}] Performing HLA imputation({} / overlap:{}).".format(
            self.idx_process, _exonN, _overlap))
        # self.idx_process += 1

        raw_HLA_IMPUTATION_OUT = MHC + (
            '.QC.{}.{}.doubled.raw_imputation_out'.format(_exonN, _overlap)
            if f_prephasing else '.QC.{}.{}.raw_imputation_out'.format(
                _exonN, _overlap))

        if self.FLAG_AdaptiveGeneticMap:  # With Adatpive Genetic Map
            """
            ### MM + AGM
            
            # prephasing
            java -jar beagle4.jar gt=$MHC.QC.phasing_out_double.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.double.imputation_out impute=true lowmem=true gprobs=true ne=10000 overlap=${OVERLAP} err=$aver_erate map=$geneticMap.refined.map
            
            # No-prephasing
            java -jar beagle4.jar gt=$MHC.QC.vcf                    ref=$REFERENCE.phased.vcf out=$MHC.QC.double.imputation_out impute=true lowmem=true gprobs=true ne=10000 overlap=${OVERLAP} err=$aver_erate map=$geneticMap.refined.map
            
            """

            # aver_erate
            with open(_aver_erate, 'r') as f:
                aver_erate = f.readline().rstrip('\n')

            command = '{} gt={} ref={} out={} impute=true lowmem=true gprobs=true ne=10000 overlap={} err={} map={} nthreads={}'.format(
                self.BEAGLE4, _IMPUTATION_INPUT, _REF_PHASED_VCF,
                raw_HLA_IMPUTATION_OUT, _overlap, aver_erate,
                _Refined_Genetic_Map, _nthreads)
            # print(command)

            try:
                f_log = open(raw_HLA_IMPUTATION_OUT + '.log', 'w')

                imputation_start = time()
                subprocess.run(re.split('\s+', command),
                               check=True,
                               stdout=f_log,
                               stderr=f_log)
                imputation_end = time()

            except subprocess.CalledProcessError:
                raise CookHLAImputationError(
                    std_ERROR_MAIN_PROCESS_NAME +
                    "Imputation({} / overlap:{}) failed.\n".format(
                        _exonN, _overlap))
                # sys.stderr.write(std_ERROR_MAIN_PROCESS_NAME + "Imputation({} / overlap:{}) failed.\n".format(_exonN, _overlap))
                # return -1
            else:
                # print(std_MAIN_PROCESS_NAME+"Imputation({} / overlap:{}) done.".format(_exonN, _overlap))
                # os.system("rm {}".format(raw_HLA_IMPUTATION_OUT+'.err.log'))
                f_log.close()

                imputation_time = (imputation_end - imputation_start) / 60
                sys.stdout.write(
                    "Imputation({} / overlap:{}) time: {}(min)\n".format(
                        _exonN, _overlap, imputation_time))

        else:  # Without Adaptive Genetic Map
            """
            ### MM
            
            # prephasing
            java -jar beagle4.jar gt=$MHC.QC.phasing_out_double.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.double.imputation_out impute=true lowmem=true overlap=$OVERLAP gprobs=true
            
            # No-prephasing
            java -jar beagle4.jar gt=$MHC.QC.vcf                    ref=$REFERENCE.phased.vcf out=$MHC.QC.double.imputation_out impute=true lowmem=true overlap=$OVERLAP gprobs=true

            """

            command = '{} gt={} ref={} out={} impute=true lowmem=true overlap={} gprobs=true nthreads={}'.format(
                self.BEAGLE4, _IMPUTATION_INPUT, _REF_PHASED_VCF,
                raw_HLA_IMPUTATION_OUT, _overlap, _nthreads)
            # print(command)

            try:
                f_log = open(raw_HLA_IMPUTATION_OUT + '.log', 'w')

                imputation_start = time()
                subprocess.run(re.split('\s+', command),
                               check=True,
                               stdout=f_log,
                               stderr=f_log)
                imputation_end = time()

            except subprocess.CalledProcessError:
                raise CookHLAImputationError(
                    std_ERROR_MAIN_PROCESS_NAME +
                    "Imputation({} / overlap:{}) failed.\n".format(
                        _exonN, _overlap))

            else:
                # print(std_MAIN_PROCESS_NAME+"Imputation({} / overlap:{}) done.".format(_exonN, _overlap))
                f_log.close()

                imputation_time = (imputation_end - imputation_start) / 60
                sys.stdout.write(
                    "Imputation({} / overlap:{}) time: {}(min)\n".format(
                        _exonN, _overlap, imputation_time))

        RUN_Bash('gzip -d -f {}.vcf.gz'.format(raw_HLA_IMPUTATION_OUT))

        return raw_HLA_IMPUTATION_OUT + '.vcf'
Example #11
0
    def CONVERT_IN(self,
                   MHC,
                   _reference,
                   _out,
                   _hg,
                   _given_prephased=None,
                   f_prephasing=False):
        """Convert target and reference data into the VCF inputs used for imputation.

        Args:
            MHC: File prefix of the target data. The files ``MHC + '.QC.nopheno.ped'``,
                ``MHC + '.QC.dat'`` and ``MHC + '.QC.bim'`` are read, and several
                ``MHC + '.QC.*'`` files are written.
            _reference: File prefix of the reference panel (``.markers`` and
                ``.bgl.phased`` are read).
            _out: Output prefix; only used here for the ``.bgl.log`` file.
            _hg: Not used by this method.
            _given_prephased: Optional path of an already phased result. When it is
                given together with ``f_prephasing``, every conversion step is
                skipped and only ``self.Doubling`` is run on it.
            f_prephasing: When true, the target VCF is phased (``self.Phasing``)
                and doubled (``self.Doubling``) before being returned.

        Returns:
            The value returned by ``self.Doubling`` when ``f_prephasing`` is true,
            otherwise the path of the target VCF (``MHC + '.QC.vcf'``).

        Side effects:
            Sets ``self.GCchangeBGL`` and increments ``self.idx_process`` (except
            on the early "given pre-phased" path, which touches neither here).
        """

        if _given_prephased and f_prephasing:

            print(
                "(Test Purpose) Given pre-phased result will be used. ('{}')".
                format(_given_prephased))

            ############### < Multiple Markers > ###############

            ### Phasing & Doubling (only on Target Sample.)

            # Phasing is skipped because a previously phased result was given.
            PHASED_RESULT = _given_prephased

            # `Doubling` appends '.vcf' itself, so hand it the bare prefix.
            # NOTE: str.rstrip('.vcf') must not be used here; it strips any
            # trailing run of the characters '.', 'v', 'c', 'f' (e.g.
            # 'abc.vcf' -> 'ab') instead of removing the suffix.
            if PHASED_RESULT.endswith('.vcf'):
                PHASED_RESULT = PHASED_RESULT[:-len('.vcf')]

            # Doubling
            DOUBLED_PHASED_RESULT = self.Doubling(MHC, PHASED_RESULT)

            return DOUBLED_PHASED_RESULT

        OUTPUT_dir_Exon234_ref = join(self.OUTPUT_dir,
                                      os.path.basename(_reference))

        print("[{}] Converting data to beagle format.".format(
            self.idx_process))
        self.idx_process += 1

        RUN_Bash(self.LINKAGE2BEAGLE +
                 ' pedigree={} data={} beagle={} standard=true > {}'.format(
                     MHC + '.QC.nopheno.ped', MHC + '.QC.dat', MHC +
                     '.QC.bgl', _out + '.bgl.log'))

        ### Converting data to reference_markers_Position (Dispersing same genomic position of some markers.)

        # The reference markers are copied unchanged into the output directory.
        RefinedMarkers = OUTPUT_dir_Exon234_ref + '.refined.markers'
        RUN_Bash('cp {} {}'.format(_reference + '.markers', RefinedMarkers))

        ### Converting data to target_markers_Position and extract not_including snp.

        RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' %
                 (MHC + '.QC.bim', MHC + '.QC.markers'))

        RUN_Bash(
            'Rscript src/excluding_snp_and_refine_target_position-v1COOK02222017.R {} {} {}'
            .format(MHC + '.QC.markers', RefinedMarkers,
                    MHC + '.QC.pre.markers'))
        if not self.__save_intermediates:
            os.system(' '.join(['rm', MHC + '.QC.markers']))

        RUN_Bash('mv {} {}'.format(MHC + '.QC.bgl',
                                   MHC + '.QC.pre.bgl.phased'))

        selected_snps = join(self.OUTPUT_dir, 'selected_snp.txt')
        RUN_Bash("awk '{print $1}' %s > %s" %
                 (MHC + '.QC.pre.markers', selected_snps))

        from src.Panel_subset import Panel_Subset
        # Writes the MHC + '.QC.refined' files consumed by Bgl2GC below; the
        # return value is not needed.
        Panel_Subset(MHC + '.QC.pre', 'all', selected_snps,
                     MHC + '.QC.refined')

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(MHC + '.QC.pre.bgl.phased'))
            RUN_Bash('rm {}'.format(MHC + '.QC.pre.markers'))
            RUN_Bash('rm {}'.format(selected_snps))

        ### Converting data to GC_change_beagle format.

        from src.bgl2GC_trick_bgl import Bgl2GC

        # target
        GCchangeBGL, GCchangeMarkers = Bgl2GC(MHC + '.QC.refined.bgl.phased',
                                              MHC + '.QC.refined.markers',
                                              MHC + '.QC.GCchange.bgl',
                                              MHC + '.QC.GCchange.markers')

        self.GCchangeBGL = GCchangeBGL  # it will be used in 'CONVERT_OUT' with Genetic Map

        # reference
        GCchangeBGL_REF, GCchangeMarkers_REF = Bgl2GC(
            _reference + '.bgl.phased', RefinedMarkers,
            OUTPUT_dir_Exon234_ref + '.GCchange.bgl.phased',
            OUTPUT_dir_Exon234_ref + '.GCchange.markers')

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(MHC + '.QC.refined.bgl.phased'))
            RUN_Bash('rm {}'.format(MHC + '.QC.refined.markers'))
            # `RefinedMarkers` is deliberately kept: it is used in
            # 'CONVERT_OUT' when not using Multiple Markers.

        ### Converting data to vcf_format

        # target
        MHC_QC_VCF_exonN = MHC + '.QC.vcf'
        RUN_Bash(self.BEAGLE2VCF +
                 ' 6 {} {} 0 > {}'.format(GCchangeMarkers, GCchangeBGL,
                                          MHC_QC_VCF_exonN))

        # reference
        reference_vcf = OUTPUT_dir_Exon234_ref + '.vcf'
        RUN_Bash(self.BEAGLE2VCF +
                 ' 6 {} {} 0 > {}'.format(GCchangeMarkers_REF, GCchangeBGL_REF,
                                          reference_vcf))

        ### Converting data to reference_phased

        # Rewrite every '/' as '|' in the reference VCF.
        REF_PHASED_VCF = OUTPUT_dir_Exon234_ref + '.phased.vcf'
        RUN_Bash('sed "s%/%|%g" {} > {}'.format(reference_vcf,
                                                REF_PHASED_VCF))

        if not self.__save_intermediates:
            RUN_Bash('rm {}'.format(reference_vcf))
            # The GCchange bgl/markers files are deliberately kept here;
            # 'GCchangeBGL' and 'GCchangeMarkers_REF' are used in 'CONVERT_OUT'.

        # (1) `MHC_QC_VCF_exonN` := MHC + '.QC.vcf',
        # (2) `REF_PHASED_VCF` := OUTPUT_dir_Exon234_ref + '.phased.vcf'
        #
        # These two files are to be passed into Beagle phasing.

        if not f_prephasing:
            return MHC_QC_VCF_exonN

        ############### < Multiple Markers > ###############

        ### Phasing & Doubling (only on Target Sample.)

        # Phasing
        PHASED_RESULT = self.Phasing(MHC, MHC_QC_VCF_exonN, REF_PHASED_VCF)

        # Doubling
        DOUBLED_PHASED_RESULT = self.Doubling(MHC, PHASED_RESULT)

        return DOUBLED_PHASED_RESULT