def Doubling(self, MHC, PHASED_RESULT):
    """Run the target-data doubling step on a phased VCF.

    The phased result ``PHASED_RESULT + '.vcf.gz'`` is decompressed and split
    into its '##' meta header, its '#CHROM' column-header line and its body.
    The column header (with the leading '#' stripped) plus the body are fed
    to ``Doubling_vcf.R``; afterwards the '#CHROM' marker is restored and the
    '##' header is prepended again.

    Args:
        MHC: Output prefix; the result is written to
            ``MHC + '.QC.phasing_out_Doubled.vcf'``.
        PHASED_RESULT: Prefix of the phased result
            (``PHASED_RESULT + '.vcf.gz'`` is the input).

    Returns:
        Path of the doubled VCF (``MHC + '.QC.phasing_out_Doubled.vcf'``).

    Exits the process with status 1 when the R script produced no output.
    """
    ### Target data doubling step.
    print("[{}] Performing Doubling".format(self.idx_process))
    self.idx_process += 1

    vcf = PHASED_RESULT + '.vcf'
    to_be_doubled = PHASED_RESULT + '.tobeDoubled.vcf'
    doubled_pre = PHASED_RESULT + '.Doubled.pre.vcf'
    doubled_pre2 = PHASED_RESULT + '.Doubled.pre2.vcf'
    doubled_out = MHC + '.QC.phasing_out_Doubled.vcf'

    RUN_Bash('gzip -d -f {}'.format(vcf + '.gz'))

    # Header part with '##'
    RUN_Bash('grep ^## {} > {}'.format(vcf, vcf + '.header1'))
    # Header part with '#'
    RUN_Bash('grep -v ^## {} | head -n 1 > {}'.format(vcf, vcf + '.header2'))
    # Body part without '#' or '##'
    RUN_Bash('grep -v ^# {} > {}'.format(vcf, vcf + '.body'))

    # The R script reads a plain table, so drop the '#' of '#CHROM'.
    RUN_Bash('sed "s%#%%" {} > {}'.format(vcf + '.header2',
                                          vcf + '.noshop.header2'))
    RUN_Bash('cat {} {} > {}'.format(vcf + '.noshop.header2', vcf + '.body',
                                     to_be_doubled))

    # NOTE(review): the script path is hard-coded to one user's scratch
    # directory; confirm it is valid in the deployment environment.
    RUN_Bash(
        'Rscript /scratch3/users/nanje/MHC-Imputation-Accuracy/templates/src/Doubling_vcf.R {} {}'
        .format(to_be_doubled, doubled_pre))

    # Check the file the R script is supposed to WRITE. (The previous check
    # looked at the script's input, which always exists at this point, and
    # reported a file name that is never generated.)
    if not os.path.exists(doubled_pre):
        print(
            std_ERROR_MAIN_PROCESS_NAME +
            "Doubled phased file('{}') can't be found(or wasn't generated at all."
            .format(doubled_pre))
        sys.exit(1)  # non-zero status so that callers can detect the failure

    RUN_Bash('sed "s%CHROM%#CHROM%" {} > {}'.format(doubled_pre,
                                                    doubled_pre2))
    RUN_Bash('cat {} {} > {}'.format(vcf + '.header1', doubled_pre2,
                                     doubled_out))

    if not self.__save_intermediates:
        # The decompressed '*.vcf' itself is deliberately kept.
        for intermediate in (vcf + '.header1',
                             vcf + '.header2',
                             vcf + '.noshop.header2',
                             vcf + '.body',
                             to_be_doubled,
                             doubled_pre,
                             doubled_pre2):
            os.system(' '.join(['rm', intermediate]))

    return doubled_out
def IMPUTE_HapMap_Map(self, _out, _MHC_QC_VCF, _REF_PHASED_VCF, _nthreads):
    """Run Beagle imputation using only the HapMap genetic map.

    Args:
        _out: Output prefix; Beagle writes to
            ``_out + '.MHC.QC.raw_imputation_out'``.
        _MHC_QC_VCF: Target VCF passed as ``gt=``.
        _REF_PHASED_VCF: Phased reference VCF passed as ``ref=``.
        _nthreads: Value passed as ``nthreads=``.

    Returns:
        Path of the decompressed imputation result
        (``_out + '.MHC.QC.raw_imputation_out.vcf'``).

    Raises:
        CookHLAImputationError: If the Beagle process exits with a non-zero
            status.
    """
    # Imputation function for only HapMap_Map.txt
    OUT = _out + '.MHC.QC.raw_imputation_out'
    log_path = OUT + '.log'

    # Point the user at the log file that is actually written below.
    print("[{}] Performing HLA imputation (see {} for progress).".format(
        self.idx_process, log_path))
    self.idx_process += 1

    # ### AGM - HapMap_map.txt
    # beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.imputation_out
    #        impute=true gprobs=true lowmem=true map=HapMap_Map.txt overlap=3000
    #
    # NOTE(review): `self.BEAGLE4` is not assigned in the `__init__` visible in
    # this file section (only `self.BEAGLE5` is) - confirm where it is set.
    command = '{} gt={} ref={} out={} impute=true gprobs=true lowmem=true map={} overlap=3000 nthreads={}'.format(
        self.BEAGLE4, _MHC_QC_VCF, _REF_PHASED_VCF, OUT, self.HapMap_Map,
        _nthreads)

    # The context manager closes the log on success AND on failure.
    with open(log_path, 'w') as f_log:
        imputation_start = time()
        try:
            # str.split() tokenises on any whitespace run and never yields
            # empty arguments.
            subprocess.run(command.split(),
                           check=True,
                           stdout=f_log,
                           stderr=f_log)
        except subprocess.CalledProcessError as err:
            raise CookHLAImputationError(
                std_ERROR_MAIN_PROCESS_NAME +
                "HapMapMap Imputation failed.\n") from err
        imputation_end = time()

    imputation_time = (imputation_end - imputation_start) / 60
    sys.stdout.write(
        "HapMapMap Imputation time: {}(min)\n".format(imputation_time))

    RUN_Bash('gzip -d -f {}.vcf.gz'.format(OUT))

    return OUT + '.vcf'
def CONVERT_OUT(self, MHC, _reference, _out, raw_IMP_Result):
    """Convert the raw imputation VCF back to Beagle/PLINK files and alleles.

    Steps, in order: VCF -> GC-encoded Beagle file, decode the GC encoding,
    complete the Beagle header with an R script, rebuild PLINK ped/map/fam
    and bed files, and finally extract alleles with ``BGL2Alleles``.

    Args:
        MHC: Prefix of the target data (``MHC + '.fam'`` is read, and removed
            afterwards unless intermediates are kept).
        _reference: Reference panel prefix (``_reference + '.bim'`` is read).
        _out: Prefix used for temporary ``*.tmp*`` files.
        raw_IMP_Result: Path of the imputed VCF to convert.

    Returns:
        Whatever ``BGL2Alleles`` returns for the ``*.alleles`` output.
    """
    keep_files = self.__save_intermediates
    gc_prefix = MHC + '.QC.imputation_GCchange'
    gc_bgl = gc_prefix + '.bgl'

    ### Converting imputation result in vcf file to beagle format.
    RUN_Bash('cat {} | {} 0 {}'.format(raw_IMP_Result, self.VCF2BEAGLE,
                                       gc_prefix))
    if not keep_files:
        os.system('rm {}'.format(gc_prefix + '.int'))
    RUN_Bash('gzip -d -f {}'.format(gc_bgl + '.gz'))

    ### Converting imputation GC_beagle to original beagle(Decoding GC-encoding).
    decoded_bgl = GCtricedBGL2OriginalBGL(gc_bgl, self.refined_REF_markers,
                                          MHC + '.QC.imputation_ori.bgl')
    if not keep_files:
        for gc_file in (gc_bgl, gc_prefix + '.markers'):
            os.system('rm {}'.format(gc_file))

    result_prefix = self.HLA_IMPUTED_Result_MHC
    phased_bgl = result_prefix + '.bgl.phased'
    result_ped = result_prefix + '.ped'
    result_map = result_prefix + '.map'
    target_fam = MHC + '.fam'
    reference_bim = _reference + '.bim'
    tmp_prefix = _out + '.tmp'

    RUN_Bash('Rscript src/complete_header.R {} {} {}'.format(
        self.GCchangeBGL, decoded_bgl, phased_bgl))
    if not keep_files:
        # `self.GCchangeBGL` (MHC + '.QC.GCchange.bgl') is left in place.
        os.system('rm {}'.format(decoded_bgl))

    ### Converting imputation genotypes to PLINK .ped format.
    RUN_Bash('cat {} | {} {}'.format(phased_bgl, self.BEAGLE2LINKAGE,
                                     tmp_prefix))
    RUN_Bash("cut -d ' ' -f6- {} > {}".format(tmp_prefix + '.ped',
                                              tmp_prefix))
    # *.ped
    RUN_Bash("paste -d ' ' {} {} | tr -d '\015' > {}".format(
        target_fam, tmp_prefix, result_ped))
    # *.map
    RUN_Bash('cut -f1-4 {} > {}'.format(reference_bim, result_map))
    RUN_Bash('cp {} {}'.format(target_fam, result_prefix + '.fam'))

    # Create PLINK bed format
    RUN_Bash('{} --ped {} --map {} --make-bed --out {}'.format(
        self.PLINK, result_ped, result_map, result_prefix))
    RUN_Bash('cp {} {}'.format(reference_bim, result_prefix + '.bim'))

    if not keep_files:
        # The raw imputed VCF and `self.GCchangeBGL` are intentionally kept.
        for leftover in (tmp_prefix,
                         tmp_prefix + '.ped',
                         tmp_prefix + '.dat',
                         self.refined_REF_markers,
                         target_fam):
            RUN_Bash('rm {}'.format(leftover))

    # BGL2Allele.py
    alleles = BGL2Alleles(phased_bgl, result_prefix + '.alleles', 'all')

    self.idx_process += 1

    return alleles
def IMPUTE(self, _out, _MHC_QC_VCF, _REF_PHASED_VCF, _aver_erate,
           _Refined_Genetic_Map, _window, _overlap, _ne, _nthreads):
    """Run Beagle 5 imputation, with or without the adaptive genetic map.

    When ``self.FLAG_AdaptiveGeneticMap`` is truthy, the average error rate
    is read from the first line of the file ``_aver_erate`` and passed as
    ``err=`` together with ``map=_Refined_Genetic_Map`` and ``ne=10000``;
    otherwise a plain imputation is run.

    Args:
        _out: Output prefix; Beagle writes to
            ``_out + '.MHC.QC.raw_imputation_out'``.
        _MHC_QC_VCF: Target VCF passed as ``gt=``.
        _REF_PHASED_VCF: Phased reference VCF passed as ``ref=``.
        _aver_erate: Path of the file whose first line holds the error rate
            (only read in adaptive-genetic-map mode).
        _Refined_Genetic_Map: Genetic map passed as ``map=`` (AGM mode only).
        _window, _overlap, _ne: Accepted for interface compatibility; not
            used by this method.
        _nthreads: Value passed as ``nthreads=``.

    Returns:
        Path of the decompressed imputation result
        (``_out + '.MHC.QC.raw_imputation_out.vcf'``).

    Raises:
        CookHLAImputationError: If the Beagle process exits with a non-zero
            status.
    """
    OUT = _out + '.MHC.QC.raw_imputation_out'
    log_path = OUT + '.log'

    # Point the user at the log file that is actually written below.
    print("[{}] Performing HLA imputation (see {} for progress).".format(
        self.idx_process, log_path))
    self.idx_process += 1

    if self.FLAG_AdaptiveGeneticMap:
        ### With Adaptive Genetic Map
        # Beagle 4.1:
        #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.imputation_out
        #          impute=true gprobs=true lowmem=true ne=10000
        #          map=$geneticMap.refined.map err=$aver_erate overlap=3000
        # Beagle 5.1:
        #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.imputation_out
        #          impute=true gp=true map=$geneticMap.refined.map err=$aver_erate
        #          ne=10000 (fixed) nthreads=$nthreads
        mode = "AGM"
        with open(_aver_erate, 'r') as f:
            aver_erate = f.readline().rstrip('\n')

        command = '{} gt={} ref={} out={} impute=true gp=true err={} map={} ne=10000 nthreads={}'.format(
            self.BEAGLE5, _MHC_QC_VCF, _REF_PHASED_VCF, OUT, aver_erate,
            _Refined_Genetic_Map, _nthreads)
    else:
        ### Plain
        # Beagle 4.1:
        #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.imputation_out
        #          impute=true gprobs=true lowmem=true overlap=3000
        # Beagle 5.1:
        #   beagle gt=$MHC.QC.vcf ref=$REFERENCE.phased.vcf out=$MHC.QC.imputation_out
        #          impute=true gp=true nthreads=$nthreads
        mode = "Plain"
        command = '{} gt={} ref={} out={} impute=true gp=true nthreads={}'.format(
            self.BEAGLE5, _MHC_QC_VCF, _REF_PHASED_VCF, OUT, _nthreads)

    # One shared run/timing path for both modes. The context manager closes
    # the log on success AND on failure (it was leaked before on failure, and
    # never closed at all in the Plain mode).
    with open(log_path, 'w') as f_log:
        imputation_start = time()
        try:
            # str.split() tokenises on any whitespace run and never yields
            # empty arguments.
            subprocess.run(command.split(),
                           check=True,
                           stdout=f_log,
                           stderr=f_log)
        except subprocess.CalledProcessError as err:
            raise CookHLAImputationError(
                std_ERROR_MAIN_PROCESS_NAME +
                "{} Imputation failed.\n".format(mode)) from err
        imputation_end = time()

    imputation_time = (imputation_end - imputation_start) / 60
    sys.stdout.write("{} Imputation time: {}(min)\n".format(
        mode, imputation_time))

    RUN_Bash('gzip -d -f {}.vcf.gz'.format(OUT))

    return OUT + '.vcf'
def __init__(self,
             idx_process,
             MHC,
             _reference,
             _out,
             _hg,
             _window,
             _overlap,
             _ne,
             _nthreads,
             _AdaptiveGeneticMap,
             _Average_Erate,
             _LINKAGE2BEAGLE,
             _BEAGLE2LINKAGE,
             _BEAGLE2VCF,
             _VCF2BEAGLE,
             _PLINK,
             _BEAGLE5,
             _answer=None,
             f_save_intermediates=False,
             _HapMap_Map=None,
             f_measureAcc_v2=False):
    """Run the whole imputation pipeline: CONVERT_IN -> IMPUTE -> CONVERT_OUT.

    Construction performs all the work; results are exposed as attributes
    (``raw_IMP_Reuslt``, ``HLA_IMPUTATION_OUT``, ``accuracy``).

    Args:
        idx_process: Step counter printed in progress messages; incremented
            by the pipeline steps.
        MHC: Prefix of the target data.
        _reference: Prefix of the reference panel.
        _out: Output prefix.
        _hg: Forwarded to ``CONVERT_IN``.
        _window, _overlap, _ne: Forwarded to ``IMPUTE``.
        _nthreads: Thread count handed to Beagle.
        _AdaptiveGeneticMap: Adaptive genetic map file; used only when
            ``_Average_Erate`` is also given.
        _Average_Erate: File holding the average error rate.
        _LINKAGE2BEAGLE, _BEAGLE2LINKAGE, _BEAGLE2VCF, _VCF2BEAGLE, _PLINK,
            _BEAGLE5: Command strings of the external tools.
        _answer: Optional answer file; when given, existing and non-empty,
            imputation accuracy is measured.
        f_save_intermediates: Keep intermediate files when true.
        _HapMap_Map: Optional HapMap genetic map; when given,
            ``IMPUTE_HapMap_Map`` is used instead of ``IMPUTE``.
        f_measureAcc_v2: Use ``measureAccuracy`` instead of
            ``CookHLA_measureAcc`` for the accuracy measurement.
    """
    ### Class variables

    # General
    self.idx_process = idx_process
    self.__save_intermediates = f_save_intermediates

    # (***) Deciding whether to use Adaptive genetic map or not.
    self.FLAG_AdaptiveGeneticMap = _AdaptiveGeneticMap and _Average_Erate

    # Prefixes
    self.OUTPUT_dir = os.path.dirname(_out)
    self.OUTPUT_dir_ref = join(self.OUTPUT_dir, os.path.basename(_reference))
    self.OUTPUT_dir_GM = join(
        self.OUTPUT_dir, os.path.basename(
            _AdaptiveGeneticMap)) if self.FLAG_AdaptiveGeneticMap else None
    self.HLA_IMPUTED_Result_MHC = MHC + '.HLA_IMPUTATION_OUT'

    # Result
    self.raw_IMP_Reuslt = None
    self.HLA_IMPUTATION_OUT = None  # Final Imputation output ('*.imputed.alleles').
    self.accuracy = None

    # Dependencies
    # NOTE(review): `IMPUTE_HapMap_Map` reads `self.BEAGLE4`, which is not
    # assigned here - confirm where it is defined.
    self.LINKAGE2BEAGLE = _LINKAGE2BEAGLE
    self.BEAGLE2LINKAGE = _BEAGLE2LINKAGE
    self.BEAGLE2VCF = _BEAGLE2VCF
    self.VCF2BEAGLE = _VCF2BEAGLE
    self.PLINK = _PLINK
    self.BEAGLE5 = _BEAGLE5

    # Adaptive Genetic Map
    self.__AGM__ = _AdaptiveGeneticMap
    self.__AVER__ = _Average_Erate

    # created in 'CONVERT_IN'
    self.refined_REF_markers = None  # used in 'CONVERT_OUT'
    self.refined_Genetic_Map = None  # used in 'IMPUTE'
    self.GCchangeBGL = None  # used in 'CONVERT_OUT'

    # HapMap_Map
    if _HapMap_Map:
        self.HapMap_Map = _HapMap_Map
        self.OUTPUT_dir_GM = join(self.OUTPUT_dir,
                                  os.path.basename(_HapMap_Map))

    ###### < Main - 'CONVERT_IN', 'IMPUTE', 'CONVERT_OUT' > ######

    ### (1) CONVERT_IN
    [MHC_QC_VCF, REF_PHASED_VCF] = self.CONVERT_IN(MHC,
                                                   _reference,
                                                   _out,
                                                   _hg,
                                                   _Genetic_Map=self.__AGM__)

    ### (2) IMPUTE
    if _HapMap_Map:
        # `IMPUTE_HapMap_Map(_out, _MHC_QC_VCF, _REF_PHASED_VCF, _nthreads)`
        # takes exactly these four arguments; passing window/overlap/ne as
        # well raised a TypeError.
        self.raw_IMP_Reuslt = self.IMPUTE_HapMap_Map(_out, MHC_QC_VCF,
                                                     REF_PHASED_VCF,
                                                     _nthreads)
    else:
        self.raw_IMP_Reuslt = self.IMPUTE(_out, MHC_QC_VCF, REF_PHASED_VCF,
                                          self.__AVER__,
                                          self.refined_Genetic_Map, _window,
                                          _overlap, _ne, _nthreads)

    ### (3) CONVERT_OUT
    self.HLA_IMPUTATION_OUT = self.CONVERT_OUT(MHC, _reference, _out,
                                               self.raw_IMP_Reuslt)

    ###### < Get Accuracy > ######
    if _answer:
        print(std_MAIN_PROCESS_NAME +
              "Calculating accuracy of each HLA gene. (answer: '{}')".format(
                  _answer))

        if not os.path.exists(_answer):
            print(
                std_WARNING_MAIN_PROCESS_NAME +
                "Given answer file doesn't exist. Please check '--answer/-an' argument again.\n"
                "Skipping calculating imputation accuracy.")
        elif os.path.getsize(_answer) == 0:
            print(
                std_WARNING_MAIN_PROCESS_NAME +
                "Given answer file doesn't have any content. Please check '--answer/-an' argument again.\n"
                "Skipping calculating imputation accuracy.")
        elif f_measureAcc_v2:
            # measureAcc_v2
            # The keyword is passed through a dict on purpose: written
            # literally inside a class body, `__only4digits=` would be
            # name-mangled to `_<ClassName>__only4digits`.
            self.accuracy = measureAccuracy(
                _answer,
                self.HLA_IMPUTATION_OUT,
                'all',
                outfile=self.HLA_IMPUTATION_OUT + '.accuracy',
                **{'__only4digits': True})
        else:
            # measureAcc_v3.5
            measureAcc_start = time()

            t = CookHLA_measureAcc(_answer, self.HLA_IMPUTATION_OUT,
                                   self.HLA_IMPUTATION_OUT)
            self.accuracy = t.accuracy

            measureAcc_end = time()
            measureAcc_time = (measureAcc_end - measureAcc_start) / 60

            print("\nAccuracy : {}".format(self.accuracy))
            print("measureAccuracy time: {}(min)\n".format(measureAcc_time))

    ###### < Clean up > ######
    # Reference panel (removed regardless of `f_save_intermediates`).
    RUN_Bash('rm {}'.format(REF_PHASED_VCF))
    RUN_Bash('rm {}'.format(self.OUTPUT_dir_ref + '.GCchange.bgl.phased'))
    RUN_Bash('rm {}'.format(self.OUTPUT_dir_ref + '.GCchange.markers'))

    if self.FLAG_AdaptiveGeneticMap:
        RUN_Bash('rm {}'.format(self.refined_Genetic_Map))
def CONVERT_IN(self, MHC, _reference, _out, _hg, _Genetic_Map):
    """Prepare the target and reference data for Beagle imputation.

    Converts the target to Beagle format, refines marker positions, applies
    the GC-change encoding to target and reference, converts both to VCF and,
    when ``self.FLAG_AdaptiveGeneticMap`` is truthy, builds the refined
    genetic map.

    Side effects: sets ``self.refined_REF_markers``, ``self.GCchangeBGL`` and
    (adaptive-genetic-map mode) ``self.refined_Genetic_Map``.

    Args:
        MHC: Prefix of the target data (``MHC + '.QC.*'`` files are read).
        _reference: Prefix of the reference panel.
        _out: Output prefix (used for the ``*.bgl.log`` file).
        _hg: Not used by this method.
        _Genetic_Map: Genetic map file; only read in adaptive-genetic-map
            mode.

    Returns:
        ``[MHC_QC_VCF, REF_PHASED_VCF]`` - the target VCF and the phased
        reference VCF to be handed to Beagle.

    Exits the process with status 1 when the refined genetic map could not
    be generated.
    """
    print("[{}] Converting data to beagle format.".format(self.idx_process))
    self.idx_process += 1

    RUN_Bash(self.LINKAGE2BEAGLE +
             ' pedigree={} data={} beagle={} standard=true > {}'.format(
                 MHC + '.QC.nopheno.ped', MHC + '.QC.dat', MHC + '.QC.bgl',
                 _out + '.bgl.log'))

    ### Converting data to reference_markers_Position (Dispersing same genomic position of some markers.)
    from src.redefineBPv1BH import redefineBP

    RefinedMarkers = redefineBP(_reference + '.markers',
                                self.OUTPUT_dir_ref + '.refined.markers')
    self.refined_REF_markers = RefinedMarkers  # => This will be used in 'CONVERT_OUT'.

    ### Converting data to target_markers_Position and extract not_including snp.
    RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' %
             (MHC + '.QC.bim', MHC + '.QC.markers'))

    RUN_Bash(
        'Rscript src/excluding_snp_and_refine_target_position-v1COOK02222017.R {} {} {}'
        .format(MHC + '.QC.markers', RefinedMarkers,
                MHC + '.QC.pre.markers'))
    if not self.__save_intermediates:
        os.system(' '.join(['rm', MHC + '.QC.markers']))

    RUN_Bash('mv {} {}'.format(MHC + '.QC.bgl', MHC + '.QC.pre.bgl.phased'))

    RUN_Bash("awk '{print $1}' %s > %s" %
             (MHC + '.QC.pre.markers', join(self.OUTPUT_dir,
                                            'selected_snp.txt')))

    from src.Panel_subset import Panel_Subset
    qc_refined = Panel_Subset(MHC + '.QC.pre', 'all',
                              join(self.OUTPUT_dir, 'selected_snp.txt'),
                              MHC + '.QC.refined')

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(MHC + '.QC.pre.bgl.phased'))
        RUN_Bash('rm {}'.format(MHC + '.QC.pre.markers'))
        RUN_Bash('rm {}'.format(join(self.OUTPUT_dir, 'selected_snp.txt')))

    ### Converting data to GC_change_beagle format.
    from src.bgl2GC_trick_bgl import Bgl2GC

    # target
    [GCchangeBGL, GCchangeMarkers] = Bgl2GC(MHC + '.QC.refined.bgl.phased',
                                            MHC + '.QC.refined.markers',
                                            MHC + '.QC.GCchange.bgl',
                                            MHC + '.QC.GCchange.markers')

    self.GCchangeBGL = GCchangeBGL  # it will be used in 'CONVERT_OUT' with Genetic Map

    # reference
    [GCchangeBGL_REF, GCchangeMarkers_REF
     ] = Bgl2GC(_reference + '.bgl.phased', RefinedMarkers,
                self.OUTPUT_dir_ref + '.GCchange.bgl.phased',
                self.OUTPUT_dir_ref + '.GCchange.markers')

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(MHC + '.QC.refined.bgl.phased'))
        RUN_Bash('rm {}'.format(MHC + '.QC.refined.markers'))
        # `RefinedMarkers` is kept: it is used again in 'CONVERT_OUT'.

    ### Converting data to vcf_format

    # target
    RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
        GCchangeMarkers, GCchangeBGL, MHC + '.QC.vcf'))
    MHC_QC_VCF = MHC + '.QC.vcf'

    # reference
    RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
        GCchangeMarkers_REF, GCchangeBGL_REF, self.OUTPUT_dir_ref + '.vcf'))
    reference_vcf = self.OUTPUT_dir_ref + '.vcf'

    ### Converting data to reference_phased
    RUN_Bash('sed "s%/%|%g" {} > {}'.format(
        reference_vcf, self.OUTPUT_dir_ref + '.phased.vcf'))
    REF_PHASED_VCF = self.OUTPUT_dir_ref + '.phased.vcf'

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(reference_vcf))
        # The GC-changed bgl/markers files are NOT removed here:
        # 'GCchangeBGL' is used in 'CONVERT_OUT', and the reference ones are
        # removed at the end of '__init__'.

    # (1) `MHC_QC_VCF`     := MHC + '.QC.vcf',
    # (2) `REF_PHASED_VCF` := self.OUTPUT_dir_ref + '.phased.vcf'
    # These two files are to be passed into Beagle phasing.

    if self.FLAG_AdaptiveGeneticMap:

        ############### < Adaptive Genetic Map > ###############
        # Equivalent shell recipe:
        #   awk '{print $1" "$2" "$3}' $geneticMap > $geneticMap.first
        #   awk '{print $2}' $REFERENCE.GCchange.markers > $geneticMap.second
        #   paste -d " " $geneticMap.first $geneticMap.second > $geneticMap.refined.map
        #   rm $geneticMap.first
        #   rm $geneticMap.second

        REFINED_GENTIC_MAP = self.OUTPUT_dir_GM + '.refined.map'

        RUN_Bash('awk \'{print $1" "$2" "$3}\' %s > %s' %
                 (_Genetic_Map, self.OUTPUT_dir_GM + '.first'))
        RUN_Bash('awk \'{print $2}\' %s > %s' %
                 (GCchangeMarkers_REF, self.OUTPUT_dir_GM + '.second'))
        # This column-bind assumes that the '*.first' and '*.second' files
        # have the same number of rows.
        RUN_Bash('paste -d " " {} {} > {}'.format(
            self.OUTPUT_dir_GM + '.first', self.OUTPUT_dir_GM + '.second',
            REFINED_GENTIC_MAP))

        if os.path.exists(REFINED_GENTIC_MAP):

            self.refined_Genetic_Map = REFINED_GENTIC_MAP

            if not self.__save_intermediates:
                os.system('rm {}'.format(self.OUTPUT_dir_GM + '.first'))
                os.system('rm {}'.format(self.OUTPUT_dir_GM + '.second'))
                # (Genetic Map) '*.GCchange.markers' is not removed here.

        else:
            print(std_ERROR_MAIN_PROCESS_NAME +
                  "Failed to generate Refined Genetic Map.")
            sys.exit(1)  # non-zero status so that callers can detect the failure

    __RETURN__ = [MHC_QC_VCF, REF_PHASED_VCF]

    return __RETURN__
def Make_ExonN_Panel(self, _exonN, _EXON234_Panel, _out):
    """Build the reference panel of a single exon from the Exon2/3/4 panel.

    (1) Keep every SNP marker and only those 'HLA_*' markers that match
        ``p_ExonN[_exonN]``; write them to ``_out + '.bgl.phased'`` and
        ``_out + '.markers'``.
    (2) Convert the subset Beagle file to *.ped / *.map and then to PLINK
        bed/bim/fam files.
    (3) Generate the allele frequency file (``_out + '.FRQ.*'``).
    (4) Apply the GC-change encoding and write the phased reference VCF
        (``_out + '.phased.vcf'``).

    Args:
        _exonN: Key into ``p_ExonN`` selecting the exon.
        _EXON234_Panel: Prefix of the Exon2/3/4 panel
            (``*.bgl.phased`` and ``*.markers`` are read).
        _out: Output prefix of the generated panel.

    Returns:
        ``_out`` (the prefix of the generated panel).
    """

    ### Subsetting `_exonN` HLA allele binary markers and SNP markers. ((1) above.)

    # Generated exonN Panel.
    bgl_exonN = _out + '.bgl.phased'
    markers_exonN = _out + '.markers'

    # Context managers close all four files even if an exception is raised.
    with open(_EXON234_Panel + '.bgl.phased', 'r') as f_bgl_exon234, \
            open(_EXON234_Panel + '.markers', 'r') as f_markers_exon234, \
            open(bgl_exonN, 'w') as f_out_bgl, \
            open(markers_exonN, 'w') as f_out_markers:

        for line_bgl_exon234 in f_bgl_exon234:

            m = p_1st_two_columns.match(line_bgl_exon234)
            if m is None:
                # Skip lines where the pattern finds nothing (e.g. blank
                # lines) instead of failing on `None.group`.
                continue

            row_type, marker_label = m.group(1), m.group(2)

            if row_type != 'M':
                # Header part of *.bgl file ('P pedigree', 'I id', ...):
                # just forward the line to the output file.
                f_out_bgl.write(line_bgl_exon234)
                continue

            # Main body ('M rs969931', ...). The *.markers file has one line
            # per 'M' row, so it is advanced in lockstep.
            line_markers_exon234 = f_markers_exon234.readline()

            if marker_label.startswith('HLA_') and \
                    not p_ExonN[_exonN].match(marker_label):
                # HLA allele binary marker of another exon: drop it.
                continue

            # `exonN` HLA markers and normal SNP markers are forwarded.
            f_out_bgl.write(line_bgl_exon234)
            f_out_markers.write(line_markers_exon234)

    ### With those subsetted *.bgl and *.markers files, generate *.ped and *.map. ((2) above.)

    # *.ped, *.dat (cf. 'java -jar' is included in 'BEAGLE2LINKAGE'.)
    command = 'cat {} | {} {}'.format(bgl_exonN, self.BEAGLE2LINKAGE,
                                      _out + ".STEP4_tmp")
    if os.system(command) == 0 and not self.__save_intermediates:
        os.system('rm {}'.format(_out + ".STEP4_tmp.dat"))  # *.dat file is unnecessary.

    # ['FID', 'IID', 'PID', 'MID', 'Sex']
    os.system("cut -d ' ' -f-5 {} > {}".format(_out + ".STEP4_tmp.ped",
                                               _out + ".STEP4_tmp.ped.left"))
    # genotype information part.
    os.system("cut -d ' ' -f6- {} > {}".format(
        _out + ".STEP4_tmp.ped", _out + ".STEP4_tmp.ped.right"))

    # The delimiter list ' -9 ' combined with the /dev/null columns inserts
    # the literal text " -9 " between the left and right parts.
    command = "paste -d ' -9 ' {} /dev/null /dev/null /dev/null {} > {}".format(
        _out + ".STEP4_tmp.ped.left", _out + ".STEP4_tmp.ped.right",
        _out + ".ped")
    if os.system(command) == 0 and not self.__save_intermediates:
        os.system('rm {}'.format(_out + ".STEP4_tmp.ped"))
        os.system('rm {}'.format(_out + ".STEP4_tmp.ped.left"))
        os.system('rm {}'.format(_out + ".STEP4_tmp.ped.right"))

    # (1) rsid, (2) bp, (3) allele1
    os.system(' '.join([
        "cut -d ' ' -f1", _out + ".markers", ">", _out + ".STEP4_map.rsid"
    ]))
    os.system(' '.join(
        ["cut -d ' ' -f2", _out + ".markers", ">", _out + ".STEP4_map.bp"]))
    os.system(' '.join([
        "cut -d ' ' -f3", _out + ".markers", ">",
        _out + ".STEP4_map.allele1"
    ]))

    # *.map: same /dev/null trick, producing "6 <rsid> 0 <bp>" rows.
    os.system(' '.join([
        "paste -d '6 0 '", "/dev/null", "/dev/null",
        _out + ".STEP4_map.rsid", "/dev/null", "/dev/null",
        _out + ".STEP4_map.bp", ">", _out + ".map"
    ]))

    # (2019. 07. 09.) The '*.refallele' file pairs each rsid with its
    # 'allele1' column (not with the bp column).
    os.system(' '.join([
        "paste -d ' '", _out + ".STEP4_map.rsid",
        _out + ".STEP4_map.allele1", ">", _out + ".refallele"
    ]))

    # bed, bim, fam files.
    command = ' '.join([
        self.PLINK,
        '--ped {} --map {} --make-bed --reference-allele {} --out {}'.format(
            _out + ".ped", _out + ".map", _out + ".refallele", _out)
    ])
    if os.system(command) == 0 and not self.__save_intermediates:
        for suffix in (".STEP4_map.rsid", ".STEP4_map.bp",
                       ".STEP4_map.allele1", ".ped", ".map", ".log",
                       ".refallele"):
            os.system('rm {}'.format(_out + suffix))

    # Allele Frequency file(*.frq)
    command = ' '.join([
        self.PLINK, '--bfile {} --keep-allele-order --freq --out {}'.format(
            _out, _out + ".FRQ")
    ])
    if os.system(command) == 0 and not self.__save_intermediates:
        os.system('rm {}'.format(_out + ".FRQ.log"))

    # Prephasing      : Exon234 panel is used.
    # Each imputation : Exon2,3,4 panel, each.
    #
    # For each Exon 2,3,4 panel to be used in Beagle Imputation, preprocessing
    # (ex. GC change) must be done to them. (cf. redefining BP is already done
    # in 'Make_EXON234_Panel.py', so it won't be done here.)
    # Below code is originally the 'CONVERT_IN' part.

    # reference
    [GCchangeBGL_REF,
     GCchangeMarkers_REF] = Bgl2GC(bgl_exonN, markers_exonN,
                                   _out + '.GCchange.bgl.phased',
                                   _out + '.GCchange.markers')

    RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
        GCchangeMarkers_REF, GCchangeBGL_REF, _out + '.vcf'))
    reference_vcf = _out + '.vcf'

    ### Converting data to reference_phased
    RUN_Bash('sed "s%/%|%g" {} > {}'.format(reference_vcf,
                                            _out + '.phased.vcf'))

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(reference_vcf))
        # The GC-changed bgl/markers files are left in place.

    return _out
def MakeGeneticMap( _input, _reference, _out, _p_src="/scratch3/users/nanje/MHC-Imputation-Accuracy/templates/MakeGeneticMap", _p_dependency="./dependency", __save_intermediates=False): # Sample N check N_sample_target = getSampleNumbers(_input + '.fam') f_SmallSampleMode = N_sample_target < 100 N_sample_reference = getSampleNumbers(_reference + '.fam') if f_SmallSampleMode and (N_sample_reference < 200): print( std_ERROR_MAIN_PROCESS_NAME + "If Target data has less than 100 samples, Reference panel must have at least 200 samples." ) sys.exit() _p_plink = "/users/nanje/miniconda3/bin/plink" _p_linkage2beagle = "/usr/local/bin/linkage2beagle.jar" _p_beagle2linkage = "/usr/local/bin/beagle2linkage.jar" _p_transpose = "/usr/local/bin/transpose.jar" _p_mach = "/usr/local/bin/mach1" PLINK = "{} --noweb --silent --allow-no-sex".format(_p_plink) LINKAGE2BEAGLE = 'java -jar {}'.format(_p_linkage2beagle) RANDOMIZE_FAM = 'Rscript {}/STEP0_randomize_the_sample_about_fam_03_06_2017-COOK-V1.R'.format( _p_src) BGL2GC_TRICK_BGL = 'Rscript {}/bgl2GC_trick_bgl-v1.1COOK-02222017.R'.format( _p_src) BGL2BED = "{}/Panel-BGL2BED.sh".format(_p_src) STEP4_buildMap = "Rscript {}/STEP4-buildMap.R".format(_p_src) STEP5_collapseHLA = "Rscript {}/STEP5-collapseHLA.R".format(_p_src) # Intermediate path. if not _out: print(std_ERROR_MAIN_PROCESS_NAME + 'The argument "{0}" has not been given. 
Please check it again.\n' .format("--out")) sys.exit() else: _out = _out if not _out.endswith('/') else _out.rstrip('/') if bool(os.path.dirname(_out)): os.makedirs(os.path.dirname(_out), exist_ok=True) OUTPUT_dir = os.path.dirname(_out) OUTPUT_INPUT = os.path.join( OUTPUT_dir, os.path.basename(_input)) # Generated in output folder OUTPUT_REF = os.path.join(OUTPUT_dir, os.path.basename(_reference)) ###### < Control Flags > ###### RANDOM = 1 SUBSET_BGL = 1 MAKING_MACH_INPUT = 1 RUNMACH = 1 BUILDING_MAP = 1 Cleanup = 1 if f_SmallSampleMode: # Only 'RANDOM' and 'SUBSET_BGL' two blocks become different print(std_MAIN_PROCESS_NAME + "Generating AGM with Small Samples.") if RANDOM: # RUN_Bash('awk \'{print $1" "$2" ""0"" ""0"" "$5" "$6}\' %s > %s' % (_input+'.fam', OUTPUT_INPUT+'.trick.fam')) RUN_Bash('awk \'{print $1" "$2" ""0"" ""0"" "$5" "$6}\' %s > %s' % (_reference + '.fam', OUTPUT_REF + '.trick.fam') ) # Only the reference fam file will be randomized. # RUN_Bash(RANDOMIZE_FAM + ' {} {}'.format(OUTPUT_INPUT+'.trick.fam', OUTPUT_INPUT+'.rearranged.fam')) RUN_Bash(RANDOMIZE_FAM + ' {} {}'.format(OUTPUT_REF + '.trick.fam', OUTPUT_REF + '.rearranged.fam')) # RUN_Bash('rm {}'.format(OUTPUT_INPUT+'.trick.fam')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.trick.fam')) if SUBSET_BGL: # RUN_Bash('head -100 %s | awk \'{print $1" "$2}\' > %s' % (OUTPUT_INPUT+'.rearranged.fam', OUTPUT_INPUT+'.subset.samples')) RUN_Bash( 'head -200 %s | tail -n 100 | awk \'{print $1" "$2}\' > %s' % (OUTPUT_REF + '.rearranged.fam', OUTPUT_INPUT + '.subset.samples')) RUN_Bash('head -100 %s | awk \'{print $1" "$2}\' > %s' % (OUTPUT_REF + '.rearranged.fam', OUTPUT_REF + '.subset.samples')) # RUN_Bash(PLINK + ' --bfile {} --keep {} --recode --out {}'.format(_input, OUTPUT_INPUT+'.subset.samples', OUTPUT_INPUT+'.subset')) # RUN_Bash(PLINK + ' --bfile {} --keep {} --make-bed --out {}'.format(_input, OUTPUT_INPUT+'.subset.samples', OUTPUT_INPUT+'.subset')) RUN_Bash(PLINK + ' --bfile {} --keep {} --recode 
--out {}'.format( _reference, OUTPUT_INPUT + '.subset.samples', OUTPUT_INPUT + '.subset')) RUN_Bash(PLINK + ' --bfile {} --keep {} --make-bed --out {}'.format( _reference, OUTPUT_INPUT + '.subset.samples', OUTPUT_INPUT + '.subset')) RUN_Bash("cut -d ' ' -f1-5,7- {} > {}".format( OUTPUT_INPUT + '.subset.ped', OUTPUT_INPUT + '.subset.nopheno.ped')) RUN_Bash( 'awk \'{print "M " $2}\' %s > %s' % (OUTPUT_INPUT + '.subset.map', OUTPUT_INPUT + '.subset.dat')) RUN_Bash(LINKAGE2BEAGLE + ' {} {} > {}'.format( OUTPUT_INPUT + '.subset.dat', OUTPUT_INPUT + '.subset.nopheno.ped', OUTPUT_INPUT + '.subset.bgl.phased')) RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' % (OUTPUT_INPUT + '.subset.bim', OUTPUT_INPUT + '.subset.markers')) Panel_Subset(_reference, OUTPUT_REF + '.subset.samples', 'all', OUTPUT_REF + '.subset') RUN_Bash(BGL2GC_TRICK_BGL + ' {} {} {} {}'.format( OUTPUT_INPUT + '.subset.bgl.phased', OUTPUT_INPUT + '.subset.markers', OUTPUT_INPUT + '.subset.GCchange.bgl.phased', OUTPUT_INPUT + '.subset.GCchange.markers')) RUN_Bash(BGL2GC_TRICK_BGL + ' {} {} {} {}'.format( OUTPUT_REF + '.subset.bgl.phased', OUTPUT_REF + '.subset.markers', OUTPUT_REF + '.subset.GCchange.bgl.phased', OUTPUT_REF + '.subset.GCchange.markers')) # RUN_Bash('rm {}'.format(OUTPUT_INPUT+'.rearranged.fam')) # Small Sample RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.samples')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.rearranged.fam')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.samples')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.bgl.phased')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.markers')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.bgl.phased')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.markers')) else: if RANDOM: RUN_Bash('awk \'{print $1" "$2" ""0"" ""0"" "$5" "$6}\' %s > %s' % (_input + '.fam', OUTPUT_INPUT + '.trick.fam')) RUN_Bash('awk \'{print $1" "$2" ""0"" ""0"" "$5" "$6}\' %s > %s' % (_reference + '.fam', OUTPUT_REF + '.trick.fam')) RUN_Bash(RANDOMIZE_FAM + ' {} 
{}'.format(OUTPUT_INPUT + '.trick.fam', OUTPUT_INPUT + '.rearranged.fam')) RUN_Bash(RANDOMIZE_FAM + ' {} {}'.format(OUTPUT_REF + '.trick.fam', OUTPUT_REF + '.rearranged.fam')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.trick.fam')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.trick.fam')) if SUBSET_BGL: RUN_Bash('head -100 %s | awk \'{print $1" "$2}\' > %s' % (OUTPUT_INPUT + '.rearranged.fam', OUTPUT_INPUT + '.subset.samples')) RUN_Bash('head -100 %s | awk \'{print $1" "$2}\' > %s' % (OUTPUT_REF + '.rearranged.fam', OUTPUT_REF + '.subset.samples')) RUN_Bash(PLINK + ' --bfile {} --keep {} --recode --out {}'.format( _input, OUTPUT_INPUT + '.subset.samples', OUTPUT_INPUT + '.subset')) RUN_Bash(PLINK + ' --bfile {} --keep {} --make-bed --out {}'.format( _input, OUTPUT_INPUT + '.subset.samples', OUTPUT_INPUT + '.subset')) RUN_Bash("cut -d ' ' -f1-5,7- {} > {}".format( OUTPUT_INPUT + '.subset.ped', OUTPUT_INPUT + '.subset.nopheno.ped')) RUN_Bash( 'awk \'{print "M " $2}\' %s > %s' % (OUTPUT_INPUT + '.subset.map', OUTPUT_INPUT + '.subset.dat')) RUN_Bash(LINKAGE2BEAGLE + ' {} {} > {}'.format( OUTPUT_INPUT + '.subset.dat', OUTPUT_INPUT + '.subset.nopheno.ped', OUTPUT_INPUT + '.subset.bgl.phased')) RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' % (OUTPUT_INPUT + '.subset.bim', OUTPUT_INPUT + '.subset.markers')) Panel_Subset(_reference, OUTPUT_REF + '.subset.samples', 'all', OUTPUT_REF + '.subset') RUN_Bash(BGL2GC_TRICK_BGL + ' {} {} {} {}'.format( OUTPUT_INPUT + '.subset.bgl.phased', OUTPUT_INPUT + '.subset.markers', OUTPUT_INPUT + '.subset.GCchange.bgl.phased', OUTPUT_INPUT + '.subset.GCchange.markers')) RUN_Bash(BGL2GC_TRICK_BGL + ' {} {} {} {}'.format( OUTPUT_REF + '.subset.bgl.phased', OUTPUT_REF + '.subset.markers', OUTPUT_REF + '.subset.GCchange.bgl.phased', OUTPUT_REF + '.subset.GCchange.markers')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.rearranged.fam')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.samples')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.rearranged.fam')) 
RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.samples')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.bgl.phased')) RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.markers')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.bgl.phased')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.markers')) if MAKING_MACH_INPUT: RUN_Bash('bash {} {} {} {} {}'.format( BGL2BED, OUTPUT_INPUT + '.subset.GCchange', OUTPUT_INPUT + '.subset.GCchange', _p_beagle2linkage, _p_plink)) RUN_Bash(PLINK + ' --bfile {} --recode --out {}'.format( OUTPUT_INPUT + '.subset.GCchange', OUTPUT_INPUT + '.subset.GCchange')) RUN_Bash('awk \'{print "M", $2}\' %s > %s' % (OUTPUT_INPUT + '.subset.GCchange.map', OUTPUT_INPUT + '.subset.GCchange.dat')) # -d RUN_Bash('cut -d " " -f1-5,7- %s > %s' % (OUTPUT_INPUT + '.subset.GCchange.ped', OUTPUT_INPUT + '.subset.GCchange.nophe.ped')) # -p RUN_Bash('cat {} | java -jar {} > {}'.format( OUTPUT_REF + '.subset.GCchange.bgl.phased', _p_transpose, OUTPUT_REF + '.subset.GCchange.bgl.phased.tr')) RUN_Bash('cut -d " " -f1,2,6- %s | tail -n+3 > %s' % (OUTPUT_REF + '.subset.GCchange.bgl.phased.tr', OUTPUT_REF + '.subset.GCchange.haps')) # -h RUN_Bash('cut -d " " -f1 %s > %s' % (OUTPUT_REF + '.subset.GCchange.markers', OUTPUT_REF + '.subset.GCchange.haps.snps')) # -s if RUNMACH: RUN_Bash(_p_mach + ' -d {} -p {} -h {} -s {} --rounds 20 --greedy --prefix {}'. 
format(OUTPUT_INPUT + '.subset.GCchange.dat', OUTPUT_INPUT + '.subset.GCchange.nophe.ped', OUTPUT_REF + '.subset.GCchange.haps', OUTPUT_REF + '.subset.GCchange.haps.snps', _out + '.mach_step')) if BUILDING_MAP: # RUN_Bash(STEP4_buildMap+' {} {} {} {} {} > {}'.format( RUN_Bash(STEP4_buildMap + ' {} {} {} {} {} {}'.format( _out + '.mach_step.erate', _out + '.mach_step.rec', OUTPUT_REF + '.subset.GCchange.markers', _out + '.mach_step.gmap.avg', _out + '.mach_step.gmap.last', _out + '.aver.erate')) RUN_Bash(STEP5_collapseHLA + ' {} {} {}'.format(_out + '.mach_step.gmap.avg', _out + '.mach_step.avg.clpsA', _out + '.mach_step.avg.clpsB')) # Final output check Flag_OUTPUT = True if not os.path.exists(_out + '.aver.erate'): print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out + '.aver.erate')) Flag_OUTPUT = False # if not os.path.exists(_out+'.mach_step.avg.clpsA'): # print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out+'.mach_step.avg.clpsA')) # Flag_OUTPUT = False if not os.path.exists(_out + '.mach_step.avg.clpsB'): print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out + '.mach_step.avg.clpsB')) Flag_OUTPUT = False # if not os.path.exists(_out+'.mach_step.erate'): # print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out+'.mach_step.erate')) # Flag_OUTPUT = False # if not os.path.exists(_out+'.mach_step.gmap.avg'): # print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out+'.mach_step.gmap.avg')) # Flag_OUTPUT = False # if not os.path.exists(_out+'.mach_step.gmap.last'): # print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out+'.mach_step.gmap.last')) # Flag_OUTPUT = False # if not os.path.exists(_out+'.mach_step.rec'): # print(std_WARNING_MAIN_PROCESS_NAME + "'{}' wasn't created.".format(_out+'.mach_step.rec')) # Flag_OUTPUT = False if Cleanup: RUN_Bash('rm {}'.format(OUTPUT_INPUT + '.subset.*')) RUN_Bash('rm {}'.format(OUTPUT_REF + '.subset.*')) RUN_Bash('rm 
{}'.format(_out + '.mach_step.avg.clpsA')) RUN_Bash('rm {}'.format(_out + '.mach_step.erate')) RUN_Bash('rm {}'.format(_out + '.mach_step.gmap.avg')) RUN_Bash('rm {}'.format(_out + '.mach_step.gmap.last')) RUN_Bash('rm {}'.format(_out + '.mach_step.rec')) if Flag_OUTPUT: return (_out + '.mach_step.avg.clpsB', _out + '.aver.erate') else: return (-1, -1)
def __init__(self, idx_process, MHC, _reference, _out, _hg, _nthreads,
             _AdaptiveGeneticMap, _Average_Erate, _LINKAGE2BEAGLE,
             _BEAGLE2LINKAGE, _BEAGLE2VCF, _VCF2BEAGLE, _PLINK, _BEAGLE4,
             _CSH, _answer=None, f_save_intermediates=False, _MultP=1,
             _given_prephased=None, f_prephasing=False,
             f_remove_raw_IMP_results=False, f_measureAcc_v2=False):
    """Run the whole pipeline: build panels, CONVERT_IN, IMPUTE, CONVERT_OUT.

    The constructor does all the work. It builds the per-exon reference
    panels through ``HLA_MultipleRefs``, converts the target once with
    ``CONVERT_IN``, runs ``IMPUTE`` for every (exon, overlap) pair (serially
    when ``_MultP == 1``, otherwise in a process pool capped at 9 workers),
    merges the results with ``CONVERT_OUT``, optionally measures accuracy
    against ``_answer`` and finally removes intermediate files.

    Args:
        idx_process: Step counter used as a prefix in progress messages.
        MHC: Output prefix of the target data files.
        _reference: Prefix of the reference panel.
        _out: Output prefix; its directory becomes ``self.OUTPUT_dir``.
        _hg: Forwarded to ``HLA_MultipleRefs`` and ``CONVERT_IN``.
        _nthreads: Forwarded to ``IMPUTE``.
        _AdaptiveGeneticMap: Adaptive genetic map file. It is used only when
            ``_Average_Erate`` is given as well.
        _Average_Erate: Average error-rate file (see above).
        _LINKAGE2BEAGLE, _BEAGLE2LINKAGE, _BEAGLE2VCF, _VCF2BEAGLE, _PLINK,
        _BEAGLE4: Command strings of the external tools, stored on ``self``.
        _CSH: Forwarded to ``CONVERT_OUT``.
        _answer: Optional answer file used to compute imputation accuracy.
        f_save_intermediates: Keep intermediate files when true.
        _MultP: Requested number of worker processes.
        _given_prephased: Optional pre-phased result, forwarded to
            ``CONVERT_IN``.
        f_prephasing: Whether pre-phasing is enabled.
        f_remove_raw_IMP_results: Also delete the raw imputation VCFs and
            their logs during cleanup.
        f_measureAcc_v2: Use ``measureAccuracy`` instead of
            ``CookHLA_measureAcc`` for the accuracy step.
    """
    ### General
    self.idx_process = idx_process
    self.__save_intermediates = f_save_intermediates
    # (***) The adaptive genetic map is used only when BOTH inputs are given.
    self.FLAG_AdaptiveGeneticMap = _AdaptiveGeneticMap and _Average_Erate

    # Prefixes
    self.OUTPUT_dir = os.path.dirname(_out)
    self.OUTPUT_dir_ref = join(self.OUTPUT_dir, os.path.basename(_reference))
    if self.FLAG_AdaptiveGeneticMap:
        self.OUTPUT_dir_GM = join(self.OUTPUT_dir,
                                  os.path.basename(_AdaptiveGeneticMap))
    else:
        self.OUTPUT_dir_GM = None

    # Result
    self.Exon234_Panel = None
    self.dict_ExonN_Panel = {_exonN: None for _exonN in __EXON__}
    self.dict_ExonN_AGM = {_exonN: None for _exonN in __EXON__}
    self.dict_IMP_Result = {
        _exonN: {_overlap: None for _overlap in __overlap__}
        for _exonN in __EXON__
    }
    self.accuracy = None
    self.HLA_IMPUTATION_OUT = None

    self.dict_DOUBLED_PHASED_RESULT = {_exonN: None for _exonN in __EXON__}
    self.dict_REF_PHASED_VCF = {_exonN: None for _exonN in __EXON__}

    # Dependencies
    self.LINKAGE2BEAGLE = _LINKAGE2BEAGLE
    self.BEAGLE2LINKAGE = _BEAGLE2LINKAGE
    self.BEAGLE2VCF = _BEAGLE2VCF
    self.VCF2BEAGLE = _VCF2BEAGLE
    self.PLINK = _PLINK
    self.BEAGLE4 = _BEAGLE4

    # Adaptive Genetic Map
    self.__AGM__ = _AdaptiveGeneticMap if self.FLAG_AdaptiveGeneticMap else None
    self.__AVER__ = _Average_Erate if self.FLAG_AdaptiveGeneticMap else None

    ###### < Reference panel for Exon 2, 3, 4 > ######
    multiple_panels = HLA_MultipleRefs(_reference, self.OUTPUT_dir_ref, _hg,
                                       self.BEAGLE2LINKAGE, self.BEAGLE2VCF,
                                       self.PLINK, _MultP=_MultP,
                                       __AGM__=self.__AGM__,
                                       _out_AGM=self.OUTPUT_dir_GM)

    self.dict_ExonN_Panel = multiple_panels.ExonN_Panel
    self.Exon234_Panel = multiple_panels.EXON234_Panel
    if self.FLAG_AdaptiveGeneticMap:
        self.dict_ExonN_AGM = multiple_panels.ExonN_AGM
    else:
        self.dict_ExonN_AGM = {_exonN: None for _exonN in __EXON__}

    ###### < Main - 'CONVERT_IN', 'IMPUTE', 'CONVERT_OUT' > ######

    ### (1) CONVERT_IN
    # Pre-phasing is done only once, with the Exon234 reference panel.
    IMPUTATION_INPUT = self.CONVERT_IN(MHC, self.Exon234_Panel, _out, _hg,
                                       _given_prephased=_given_prephased,
                                       f_prephasing=f_prephasing)

    ### (2) Imputation
    if _MultP == 1:
        ## Serial implementation of main.
        imputation_serial_start = time()

        for _exonN in __EXON__:
            for _overlap in __overlap__:
                self.dict_IMP_Result[_exonN][_overlap] = self.IMPUTE(
                    MHC, _out, IMPUTATION_INPUT,
                    self.dict_ExonN_Panel[_exonN] + '.phased.vcf',
                    _overlap, _exonN, _nthreads, self.__AVER__,
                    self.dict_ExonN_AGM[_exonN],
                    f_prephasing=f_prephasing)

        imputation_serial_end = time()
        imputation_serial_time = (imputation_serial_end -
                                  imputation_serial_start) / 60
        print(std_MAIN_PROCESS_NAME +
              "Total imputation time of Serial implementation: {}(min)\n".
              format(imputation_serial_time))

    else:
        ## Parallel implementation of main.
        imputation_parallel_start = time()

        # Never start more than 9 workers.
        pool = mp.Pool(processes=_MultP if _MultP <= 9 else 9)

        dict_Pool = {
            _exonN: {
                _overlap: pool.apply_async(
                    self.IMPUTE,
                    (MHC, _out, IMPUTATION_INPUT,
                     self.dict_ExonN_Panel[_exonN] + '.phased.vcf',
                     _overlap, _exonN, _nthreads, self.__AVER__,
                     self.dict_ExonN_AGM[_exonN], f_prephasing))
                for _overlap in __overlap__
            }
            for _exonN in __EXON__
        }

        pool.close()
        pool.join()

        # `.get()` re-raises any exception raised inside the worker.
        for _exonN in __EXON__:
            for _overlap in __overlap__:
                self.dict_IMP_Result[_exonN][_overlap] = \
                    dict_Pool[_exonN][_overlap].get()

        imputation_parallel_end = time()
        imputation_parallel_time = (imputation_parallel_end -
                                    imputation_parallel_start) / 60
        print(
            std_MAIN_PROCESS_NAME +
            "Total imputation time of Parallel implementation (with {} core(s)): {}(min)\n"
            .format(_MultP, imputation_parallel_time))

    self.idx_process += 1

    ### (3) CONVERT_OUT
    self.HLA_IMPUTATION_OUT = self.CONVERT_OUT(self.dict_IMP_Result,
                                               MHC + '.HLA_IMPUTATION_OUT',
                                               _CSH,
                                               f_prephasing=f_prephasing)

    print(std_MAIN_PROCESS_NAME +
          'IMPUTATION_OUT:\n{}'.format(self.HLA_IMPUTATION_OUT))

    ## Acquiring accuracy
    if _answer:
        print(std_MAIN_PROCESS_NAME +
              "Calculating accuracy of each HLA gene. (answer: '{}')".
              format(_answer))

        if not os.path.exists(_answer):
            print(
                std_WARNING_MAIN_PROCESS_NAME +
                "Given answer file doesn't exist. Please check '--answer/-an' argument again.\n"
                "Skipping calculating imputation accuracy.")
        elif os.path.getsize(_answer) == 0:
            print(
                std_WARNING_MAIN_PROCESS_NAME +
                "Given answer file doesn't have any content. Please check '--answer/-an' argument again.\n"
                "Skipping calculating imputation accuracy.")
        elif f_measureAcc_v2:
            # measureAcc_v2
            # The keyword is passed through a dict on purpose: written
            # literally inside a class body, `__only4digits=True` would be
            # name-mangled to `_<Class>__only4digits` and rejected by the
            # callee.
            self.accuracy = measureAccuracy(
                _answer, self.HLA_IMPUTATION_OUT, 'all',
                outfile=self.HLA_IMPUTATION_OUT + '.accuracy',
                **{'__only4digits': True})
        else:
            # measureAcc_v3.5
            measureAcc_start = time()

            t = CookHLA_measureAcc(_answer, self.HLA_IMPUTATION_OUT,
                                   self.HLA_IMPUTATION_OUT)
            self.accuracy = t.accuracy

            measureAcc_end = time()
            measureAcc_time = (measureAcc_end - measureAcc_start) / 60
            print("\nAccuracy : {}".format(self.accuracy))
            print("measureAccuracy time: {}(min)\n".format(measureAcc_time))

    ### General Removal
    if not self.__save_intermediates:

        # File suffixes produced for every reference panel.
        panel_suffixes = ('.bed', '.bim', '.fam', '.FRQ.frq', '.markers',
                          '.bgl.phased', '.GCchange.markers',
                          '.GCchange.bgl.phased', '.phased.vcf')

        # 'Exon234 panel' ('.refined.markers' exists only for this panel)
        for _suffix in panel_suffixes + ('.refined.markers',):
            RUN_Bash('rm {}'.format(self.Exon234_Panel + _suffix))

        # 'Exon 2,3,4 panel'
        for _exonN in __EXON__:
            for _suffix in panel_suffixes:
                RUN_Bash('rm {}'.format(self.dict_ExonN_Panel[_exonN] +
                                        _suffix))

        # 'Exon 2,3,4 AGM'
        # These entries are None when the adaptive genetic map is not used;
        # skip them instead of running `rm None`.
        if multiple_panels.EXON234_AGM:
            RUN_Bash('rm {}'.format(multiple_panels.EXON234_AGM))
        for _exonN in __EXON__:
            if self.dict_ExonN_AGM[_exonN]:
                RUN_Bash('rm {}'.format(self.dict_ExonN_AGM[_exonN]))

        # 'CONVERT_OUT'
        for _exonN in __EXON__:
            for _overlap in __overlap__:
                imp_vcf = self.dict_IMP_Result[_exonN][_overlap]

                for _hla in HLA_names:
                    RUN_Bash('rm {}'.format(imp_vcf +
                                            '.HLA_{}'.format(_hla)))

                if f_remove_raw_IMP_results:
                    RUN_Bash('rm {}'.format(imp_vcf))
                    # Remove the literal '.vcf' suffix. str.rstrip('.vcf')
                    # strips a character set and could eat into the prefix.
                    if imp_vcf.endswith('.vcf'):
                        imp_prefix = imp_vcf[:-len('.vcf')]
                    else:
                        imp_prefix = imp_vcf
                    RUN_Bash('rm {}'.format(imp_prefix + '.log'))
def IMPUTE(self, MHC, _out, _IMPUTATION_INPUT, _REF_PHASED_VCF, _overlap,
           _exonN, _nthreads, _aver_erate, _Refined_Genetic_Map,
           f_prephasing=False):
    """Run Beagle4 imputation for one (exon, overlap) pair.

    Command layout, with the adaptive genetic map (MM + AGM):
        beagle4 gt=<input> ref=<ref> out=<out> impute=true lowmem=true
            gprobs=true ne=10000 overlap=<overlap> err=<aver_erate>
            map=<refined map> nthreads=<n>
    Without it (MM):
        beagle4 gt=<input> ref=<ref> out=<out> impute=true lowmem=true
            overlap=<overlap> gprobs=true nthreads=<n>

    Args:
        MHC: Output prefix of the target data; the result prefix is derived
            from it.
        _out: Unused here (kept for interface compatibility).
        _IMPUTATION_INPUT: Target VCF passed as ``gt=``.
        _REF_PHASED_VCF: Phased reference VCF passed as ``ref=``.
        _overlap: Value for Beagle's ``overlap=`` argument.
        _exonN: Exon label; used in the output name and in messages.
        _nthreads: Value for Beagle's ``nthreads=`` argument.
        _aver_erate: File whose first line is passed as ``err=`` (read only
            when the adaptive genetic map is enabled).
        _Refined_Genetic_Map: File passed as ``map=`` (same condition).
        f_prephasing: Selects the '.doubled' variant of the output name.

    Returns:
        Path of the decompressed imputation result
        (``<raw_HLA_IMPUTATION_OUT>.vcf``).

    Raises:
        SystemExit: If the imputation input file is empty.
        CookHLAImputationError: If Beagle exits with a non-zero status.
    """
    if os.path.getsize(_IMPUTATION_INPUT) == 0:
        print(
            std_ERROR_MAIN_PROCESS_NAME +
            "Input file for imputation('{}') contains nothing. Please check it again."
            .format(_IMPUTATION_INPUT))
        # Exit with a failure status; a bare sys.exit() would report success.
        sys.exit(1)

    print("\n[{}] Performing HLA imputation({} / overlap:{}).".format(
        self.idx_process, _exonN, _overlap))

    if f_prephasing:
        raw_HLA_IMPUTATION_OUT = MHC + \
            '.QC.{}.{}.doubled.raw_imputation_out'.format(_exonN, _overlap)
    else:
        raw_HLA_IMPUTATION_OUT = MHC + \
            '.QC.{}.{}.raw_imputation_out'.format(_exonN, _overlap)

    # Build the argument vector directly instead of formatting one string and
    # splitting it on whitespace: file paths containing spaces stay intact.
    # Only the launcher string (e.g. "java -jar beagle4.jar") is split.
    beagle_command = self.BEAGLE4.split() + [
        'gt={}'.format(_IMPUTATION_INPUT),
        'ref={}'.format(_REF_PHASED_VCF),
        'out={}'.format(raw_HLA_IMPUTATION_OUT),
        'impute=true',
        'lowmem=true',
    ]

    if self.FLAG_AdaptiveGeneticMap:
        # With Adaptive Genetic Map: the error rate is the first line of
        # the average-erate file.
        with open(_aver_erate, 'r') as f:
            aver_erate = f.readline().strip()

        beagle_command += [
            'gprobs=true',
            'ne=10000',
            'overlap={}'.format(_overlap),
            'err={}'.format(aver_erate),
            'map={}'.format(_Refined_Genetic_Map),
        ]
    else:
        # Without Adaptive Genetic Map
        beagle_command += [
            'overlap={}'.format(_overlap),
            'gprobs=true',
        ]

    beagle_command.append('nthreads={}'.format(_nthreads))

    try:
        # The context manager closes the log file on failure as well.
        with open(raw_HLA_IMPUTATION_OUT + '.log', 'w') as f_log:
            imputation_start = time()
            subprocess.run(beagle_command, check=True,
                           stdout=f_log, stderr=f_log)
            imputation_end = time()
    except subprocess.CalledProcessError as err:
        raise CookHLAImputationError(
            std_ERROR_MAIN_PROCESS_NAME +
            "Imputation({} / overlap:{}) failed.\n".format(
                _exonN, _overlap)) from err

    imputation_time = (imputation_end - imputation_start) / 60
    sys.stdout.write("Imputation({} / overlap:{}) time: {}(min)\n".format(
        _exonN, _overlap, imputation_time))

    # Beagle writes a gzipped VCF; downstream steps read the plain file.
    RUN_Bash('gzip -d -f {}.vcf.gz'.format(raw_HLA_IMPUTATION_OUT))

    return raw_HLA_IMPUTATION_OUT + '.vcf'
def CONVERT_IN(self, MHC, _reference, _out, _hg, _given_prephased=None,
               f_prephasing=False):
    """Prepare the target (and the Exon234 reference) for Beagle imputation.

    When a pre-phased result is supplied together with ``f_prephasing``, the
    conversion is skipped and that result is only doubled. Otherwise the
    target is converted to Beagle format, restricted to the markers selected
    by the R script, GC-changed and written as VCF; the reference panel is
    GC-changed and written as a phased VCF. With ``f_prephasing`` the target
    VCF is then phased and doubled.

    Args:
        MHC: Prefix of the target files (``<MHC>.QC.*``).
        _reference: Prefix of the reference panel.
        _out: Output prefix; used here for the ``.bgl.log`` file.
        _hg: Unused here (kept for interface compatibility).
        _given_prephased: Optional path of an existing phased result.
        f_prephasing: Whether to phase and double the target.

    Returns:
        The path returned by ``self.Doubling`` when pre-phasing is used,
        otherwise ``<MHC>.QC.vcf``.
    """
    if _given_prephased and f_prephasing:
        print("(Test Purpose) Given pre-phased result will be used. ('{}')".
              format(_given_prephased))

        ############### < Multiple Markers > ###############
        ### Phasing & Doubling (only on Target Sample.)

        # Phasing: a previously phased result was given, so no new phasing
        # is performed.
        PHASED_RESULT = _given_prephased

        # `Doubling` expects the prefix without '.vcf'. Remove the literal
        # suffix; str.rstrip('.vcf') strips a character set and would also
        # eat trailing '.', 'v', 'c' or 'f' characters of the prefix itself.
        if PHASED_RESULT.endswith('.vcf'):
            PHASED_RESULT = PHASED_RESULT[:-len('.vcf')]

        # Doubling
        return self.Doubling(MHC, PHASED_RESULT)

    OUTPUT_dir_Exon234_ref = join(self.OUTPUT_dir,
                                  os.path.basename(_reference))

    print("[{}] Converting data to beagle format.".format(self.idx_process))
    self.idx_process += 1

    RUN_Bash(self.LINKAGE2BEAGLE +
             ' pedigree={} data={} beagle={} standard=true > {}'.format(
                 MHC + '.QC.nopheno.ped', MHC + '.QC.dat', MHC + '.QC.bgl',
                 _out + '.bgl.log'))

    ### Converting data to reference_markers_Position
    # The reference markers are copied as-is to '<ref>.refined.markers'.
    RefinedMarkers = OUTPUT_dir_Exon234_ref + '.refined.markers'
    RUN_Bash('cp {} {}'.format(_reference + '.markers', RefinedMarkers))

    ### Converting data to target_markers_Position and extract not_including snp.
    RUN_Bash('awk \'{print $2" "$4" "$5" "$6}\' %s > %s' %
             (MHC + '.QC.bim', MHC + '.QC.markers'))

    # NOTE(review): the R script path is relative, so this depends on the
    # current working directory -- confirm against how the tool is launched.
    RUN_Bash(
        'Rscript src/excluding_snp_and_refine_target_position-v1COOK02222017.R {} {} {}'
        .format(MHC + '.QC.markers', RefinedMarkers,
                MHC + '.QC.pre.markers'))
    if not self.__save_intermediates:
        os.system(' '.join(['rm', MHC + '.QC.markers']))

    RUN_Bash('mv {} {}'.format(MHC + '.QC.bgl', MHC + '.QC.pre.bgl.phased'))

    selected_snp = join(self.OUTPUT_dir, 'selected_snp.txt')
    RUN_Bash("awk '{print $1}' %s > %s" %
             (MHC + '.QC.pre.markers', selected_snp))

    from src.Panel_subset import Panel_Subset
    # Called for its output files ('<MHC>.QC.refined.*'); the return value
    # is not needed.
    Panel_Subset(MHC + '.QC.pre', 'all', selected_snp, MHC + '.QC.refined')

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(MHC + '.QC.pre.bgl.phased'))
        RUN_Bash('rm {}'.format(MHC + '.QC.pre.markers'))
        RUN_Bash('rm {}'.format(selected_snp))

    ### Converting data to GC_change_beagle format.
    from src.bgl2GC_trick_bgl import Bgl2GC

    # target
    [GCchangeBGL, GCchangeMarkers] = Bgl2GC(MHC + '.QC.refined.bgl.phased',
                                            MHC + '.QC.refined.markers',
                                            MHC + '.QC.GCchange.bgl',
                                            MHC + '.QC.GCchange.markers')

    self.GCchangeBGL = GCchangeBGL  # it will be used in 'CONVERT_OUT' with Genetic Map

    # reference
    [GCchangeBGL_REF, GCchangeMarkers_REF] = Bgl2GC(
        _reference + '.bgl.phased', RefinedMarkers,
        OUTPUT_dir_Exon234_ref + '.GCchange.bgl.phased',
        OUTPUT_dir_Exon234_ref + '.GCchange.markers')

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(MHC + '.QC.refined.bgl.phased'))
        RUN_Bash('rm {}'.format(MHC + '.QC.refined.markers'))
        # `RefinedMarkers` is intentionally kept (used in 'CONVERT_OUT').

    ### Converting data to vcf_format

    # target
    MHC_QC_VCF_exonN = MHC + '.QC.vcf'
    RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
        GCchangeMarkers, GCchangeBGL, MHC_QC_VCF_exonN))

    # reference
    reference_vcf = OUTPUT_dir_Exon234_ref + '.vcf'
    RUN_Bash(self.BEAGLE2VCF + ' 6 {} {} 0 > {}'.format(
        GCchangeMarkers_REF, GCchangeBGL_REF, reference_vcf))

    ### Converting data to reference_phased
    # Replacing '/' with '|' marks every reference genotype as phased.
    REF_PHASED_VCF = OUTPUT_dir_Exon234_ref + '.phased.vcf'
    RUN_Bash('sed "s%/%|%g" {} > {}'.format(reference_vcf, REF_PHASED_VCF))

    if not self.__save_intermediates:
        RUN_Bash('rm {}'.format(reference_vcf))

    # (1) `MHC_QC_VCF_exonN` := MHC + '.QC.vcf',
    # (2) `REF_PHASED_VCF` := OUTPUT_dir_Exon234_ref + '.phased.vcf'
    # These two files are to be passed into Beagle phasing.

    if f_prephasing:
        ############### < Multiple Markers > ###############
        ### Phasing & Doubling (only on Target Sample.)

        # Phasing
        PHASED_RESULT = self.Phasing(MHC, MHC_QC_VCF_exonN, REF_PHASED_VCF)

        # Doubling
        return self.Doubling(MHC, PHASED_RESULT)

    return MHC_QC_VCF_exonN