def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform, combineBins, scaleFactor):
    """Build the name of the epigenotype results file.

    With an *outID* the base name is ``<outID>_epigenotype-results_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_epigenotype-results_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_epigenotype-results_<bin>.tsv``.
    Option suffixes are then inserted in front of ``.tsv`` (a name without
    ``.tsv`` is left without suffixes).

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb`` and any other value adds ``vit-fb``.
        isUniform: when true, tag the name with ``uni``.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.
        scaleFactor: when not equal to 1, adds ``_s<scaleFactor>`` with every
            '.' replaced by '-'.

    Returns:
        The output file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_epigenotype-results_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # Test the base name, not the full path: a '_wm_pos_' that only
        # appears in a directory name would leave the base name unchanged
        # and the "output" name equal to the input file's own name.
        outFileStr = outBaseName.replace(
            '_wm_pos_', '_epigenotype-results_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_epigenotype-results_{:s}.tsv'.format(binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # scale factor
    if scaleFactor != 1:
        s = str(scaleFactor).replace('.', '-')
        outFileStr = outFileStr.replace('.tsv', '_s{:s}.tsv'.format(s))

    # decoding and uniform
    if decoding != 'N':
        decodeStr = 'vit' if decoding == 'V' else ('fb' if decoding == 'F' else 'vit-fb')
        prefix = '_uni-' if isUniform else '_'
        outFileStr = outFileStr.replace('.tsv', '{:s}{:s}.tsv'.format(prefix, decodeStr))
    elif isUniform:
        outFileStr = outFileStr.replace('.tsv', '_uni.tsv')
    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, classProbs, combineBins):
    """Build the name of the epigenotype v7.1 output file.

    With an *outID* the base name is ``<outID>_epigenotype-v7.1_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_epigenotype-v7.1_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_epigenotype-v7.1_<bin>.tsv``.
    Option suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb``, 'B' adds ``both`` and any other value adds ``vit-fb``.
        classProbs: 'U' tags the name with ``uni``, 'E' with ``epiril``; any
            other value adds no tag.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.

    Returns:
        The output file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_epigenotype-v7.1_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # Check the base name rather than the whole path so that a tagged
        # directory name cannot yield the input's own file name.
        outFileStr = outBaseName.replace(
            '_wm_pos_', '_epigenotype-v7.1_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_epigenotype-v7.1_{:s}.tsv'.format(binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # class-probability tag, if any
    if classProbs == 'U':
        classStr = 'uni'
    elif classProbs == 'E':
        classStr = 'epiril'
    else:
        classStr = None

    # decoding and class probabilities
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        elif decoding == 'B':
            decodeStr = 'both'
        else:
            decodeStr = 'vit-fb'
        if classStr is None:
            suffix = '_{:s}.tsv'.format(decodeStr)
        else:
            suffix = '_{:s}-{:s}.tsv'.format(classStr, decodeStr)
        outFileStr = outFileStr.replace('.tsv', suffix)
    elif classStr is not None:
        outFileStr = outFileStr.replace('.tsv', '_{:s}.tsv'.format(classStr))
    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, generation, combineBins):
    """Build the name of the epigenotype v7.4 output file.

    With an *outID* the base name is
    ``<outID>_epigenotype-v7.4_<bin>_g-<generation>.tsv``.  Without one, the
    input file's base name is reused with ``_wm_pos_`` replaced by
    ``_epigenotype-v7.4_<bin>_g-<generation>_``; if the base name carries no
    ``_wm_pos_`` tag the name is
    ``out_epigenotype-v7.4_<bin>_g-<generation>.tsv``.  Option suffixes are
    then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb``, 'B' adds ``both`` and any other value adds ``vit-fb``.
        generation: integer generation, formatted with ``{:d}``.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.

    Returns:
        The output file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_epigenotype-v7.4_{:s}_g-{:d}.tsv'.format(
            outID, binStr, generation)
    elif '_wm_pos_' in outBaseName:
        # The tag must be in the base name itself; matching it anywhere in
        # the path would leave the base name untouched by the replace.
        outFileStr = outBaseName.replace(
            '_wm_pos_',
            '_epigenotype-v7.4_{:s}_g-{:d}_'.format(binStr, generation))
    else:
        outFileStr = 'out_epigenotype-v7.4_{:s}_g-{:d}.tsv'.format(
            binStr, generation)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # decoding
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        elif decoding == 'B':
            decodeStr = 'both'
        else:
            decodeStr = 'vit-fb'
        outFileStr = outFileStr.replace('.tsv', '_{:s}.tsv'.format(decodeStr))
    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform, combineBins):
    """Build the name of the epigenotype output file.

    With an *outID* the base name is ``<outID>_epigenotype_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_epigenotype_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_epigenotype_<bin>.tsv``.  Option
    suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb`` and any other value adds ``vit-fb``.
        isUniform: when true, tag the name with ``uni``.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.

    Returns:
        The output file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_epigenotype_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # Look for the tag in the base name (not the directory part), since
        # only the base name is rewritten.
        outFileStr = outBaseName.replace(
            '_wm_pos_', '_epigenotype_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_epigenotype_{:s}.tsv'.format(binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # decoding and uniform
    if decoding != 'N':
        decodeStr = 'vit' if decoding == 'V' else ('fb' if decoding == 'F' else 'vit-fb')
        prefix = '_uni-' if isUniform else '_'
        outFileStr = outFileStr.replace('.tsv', '{:s}{:s}.tsv'.format(prefix, decodeStr))
    elif isUniform:
        outFileStr = outFileStr.replace('.tsv', '_uni.tsv')
    return outFileStr
def determineTransFileName(inFileStr, outID, binSize, combineBins, scaleFactor):
    """Build the name of the epigenotype transition file.

    With an *outID* the base name is ``<outID>_epigenotype-trans_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_epigenotype-trans_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_epigenotype-trans_<bin>.tsv``.  Option
    suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.
        scaleFactor: when not equal to 1, adds ``_s<scaleFactor>`` with every
            '.' replaced by '-'.

    Returns:
        The transition file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_epigenotype-trans_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # Only the base name is rewritten, so only the base name is tested.
        outFileStr = outBaseName.replace(
            '_wm_pos_', '_epigenotype-trans_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_epigenotype-trans_{:s}.tsv'.format(binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # scale factor
    if scaleFactor != 1:
        s = str(scaleFactor).replace('.', '-')
        outFileStr = outFileStr.replace('.tsv', '_s{:s}.tsv'.format(s))
    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform):
    """Build the name of the logistic-regression output file.

    With an *outID* the base name is ``<outID>_logreg_<bin>.tsv``.  Without
    one, the input file's base name is reused with ``_wm_pos_`` replaced by
    ``_logreg_<bin>_``; if the base name carries no ``_wm_pos_`` tag the name
    is ``out_logreg_<bin>.tsv``.  Option suffixes are then inserted in front
    of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb`` and any other value adds ``vit_fb``.
        isUniform: when true, tag the name with ``uni``.

    Returns:
        The output file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_logreg_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # Test the base name: a tag found only in a directory name would
        # otherwise return the input's own file name.
        outFileStr = outBaseName.replace('_wm_pos_', '_logreg_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_logreg_{:s}.tsv'.format(binStr)

    # decoding and uniform
    if decoding != 'N':
        decodeStr = 'vit' if decoding == 'V' else ('fb' if decoding == 'F' else 'vit_fb')
        prefix = '_uni-' if isUniform else '_'
        outFileStr = outFileStr.replace('.tsv', '{:s}{:s}.tsv'.format(prefix, decodeStr))
    elif isUniform:
        outFileStr = outFileStr.replace('.tsv', '_uni.tsv')
    return outFileStr
def determineTransFileName(inFileStr, outID, binSize, combineBins):
    """Build the name of the transition output file.

    With an *outID* the base name is ``<outID>_transition_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_transition_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_transition_<bin>.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        combineBins: when greater than 0, ``_cb-<combineBins>`` is inserted
            in front of ``.tsv``.

    Returns:
        The transition file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_transition_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # The replacement acts on the base name, so the tag is looked up
        # there as well instead of anywhere in the path.
        outFileStr = outBaseName.replace(
            '_wm_pos_', '_transition_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_transition_{:s}.tsv'.format(binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))
    return outFileStr
def processInputs(regionFileStr, bedFileAr, numProc, isStrand, isCSSR, cssrDist, outID):
    """Process every BED file against a region file and write one table.

    The region file is read with ``readCSSRFile`` when *isCSSR* is set and
    with ``readRegionFile`` otherwise.  Each BED file is handed to
    ``processBedFile`` in a worker pool and the collected results are passed
    to ``writeOutput``.

    Args:
        regionFileStr: path of the region file.
        bedFileAr: list of BED file paths.
        numProc: number of worker processes.
        isStrand: forwarded to ``readRegionFile``; also selects the
            ``_rpm_stranded`` output name.
        isCSSR: read the region file as CSSR and name the output
            ``_rpm_cssr_<dist>``.
        cssrDist: distance forwarded to ``readCSSRFile``.
        outID: output identifier, or None to use
            ``bth_util.fileBaseName(regionFileStr)``.
    """
    sampleNamesAr = getSampleNames(bedFileAr)

    # read region file
    print('Reading region file {:s}'.format(os.path.basename(regionFileStr)))
    if isCSSR:
        regionAr = readCSSRFile(regionFileStr, cssrDist)
    else:
        regionAr = readRegionFile(regionFileStr, isStrand)

    # process BED files
    useStrand = isStrand or isCSSR
    if outID is None:
        outID = bth_util.fileBaseName(regionFileStr)

    print('Begin processing with {:d} processors'.format(numProc))
    pool = multiprocessing.Pool(processes=numProc)
    try:
        results = [
            pool.apply_async(processBedFile, args=(f, regionAr, useStrand))
            for f in bedFileAr
        ]
        outDictMat = [p.get() for p in results]
    finally:
        # Release the worker processes even when a task raises.
        pool.close()
        pool.join()

    if isCSSR:
        outFileStr = '{:s}_rpm_cssr_{:s}.tsv'.format(
            outID, bth_util.binSizeToStr(cssrDist))
    elif isStrand:
        outFileStr = '{:s}_rpm_stranded.tsv'.format(outID)
    else:
        outFileStr = '{:s}_rpm.tsv'.format(outID)

    print('Writing output to', outFileStr)
    writeOutput(outFileStr, regionAr, outDictMat, sampleNamesAr, useStrand)
    print('Done')
def determineOutputFileName(inFileStr, outID, binSize, isSmoothing, isUniform):
    """Build the name of the logistic-regression output file.

    With an *outID* the base name is ``<outID>_logreg_<bin>.tsv``.  Without
    one, the input file's base name is reused with ``_wm_pos_`` replaced by
    ``_logreg_<bin>_``; if the base name carries no ``_wm_pos_`` tag the name
    is ``out_logreg_<bin>.tsv``.  An option suffix is then inserted in front
    of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        isSmoothing: when true, tag the name with ``opt``.
        isUniform: when true, tag the name with ``uni`` (``uni-opt`` when
            combined with *isSmoothing*).

    Returns:
        The output file name.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is not None:
        outFileStr = '{:s}_logreg_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # Test the base name, not the whole path, because only the base
        # name is rewritten below.
        outFileStr = outBaseName.replace('_wm_pos_', '_logreg_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_logreg_{:s}.tsv'.format(binStr)

    if isSmoothing and isUniform:
        outFileStr = outFileStr.replace('.tsv', '_uni-opt.tsv')
    elif isSmoothing:
        outFileStr = outFileStr.replace('.tsv', '_opt.tsv')
    elif isUniform:
        outFileStr = outFileStr.replace('.tsv', '_uni.tsv')
    return outFileStr
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, isIndiv):
    """Classify bins by logistic regression, optionally decode, and write results.

    Reads the weighted methylation table, assigns positions to bins of
    *binSize*, classifies each bin, and -- unless *decoding* is 'N' -- runs
    the path decoding per sample.  The result is written as a tab-separated
    file preceded by one ``#from_script`` header line.

    Args:
        inFileStr: path of the weighted methylation file.
        numProc: number of processors; values above 1 use the multi-process
            helpers.
        binSize: bin size in the units of the ``pos`` column.
        outID: output identifier forwarded to ``determineOutputFileName``.
        parentLabelAr: list whose first two items are the mother and father
            labels.
        decoding: 'V', 'F', 'A' or 'N'.
        isUniform: forwarded to the classification and decoding helpers.
        isIndiv: when true, no shared transition matrix is computed and an
            empty array is passed on instead.
    """
    decodeNames = {
        'V': 'Viterbi',
        'F': 'Forward-backward',
        'A': 'Viterbi and Forward-backward',
    }
    dType = decodeNames.get(decoding, 'None')
    inBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; indiv_transitions:{:s}\n'.format(
        inBaseName, binStr, dType.lower().replace(' and ', ','),
        str(isUniform), str(isIndiv))

    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binStr)
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', dType)
    print('Individual transition probabilities:', str(isIndiv))

    # load the table; the first line of the file is the info header
    df = pd.read_table(inFileStr, header=1)

    # make sure the parent labels are usable
    checkParents(df['sample'], parentLabelAr)

    # assign bins and classify each one
    df['bin'] = df.pos // binSize
    nbins = max(df['bin']) + 1
    dfg = df.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(nbins, numProc))
        res_class = runMultiClassification(dfg, numProc, parentLabelAr, isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        res_class = dfg.apply(classLogReg, pla=parentLabelAr, u=isUniform)
    res_class.reset_index(inplace=True)

    if decoding == 'N':
        # no decoding requested: the classification is the final result
        results = res_class
    else:
        ignoreAr = parentLabelAr + ['MPV']
        if isIndiv:
            transitions = np.array([])
        else:
            print('Generating transition matrix')
            transition = Transitions(res_class, ignore=ignoreAr)
            transitions = transition.getTransitions()
            print(transitions)

        # decode the best path for every sample
        groups = res_class.groupby('sample')
        nsamples = len(groups.groups)
        if numProc > 1:
            print('Begin {:s} decoding {:d} samples with {:d} processors'.format(dType, nsamples, numProc))
            results = runMultiPath(groups, numProc, transitions, isUniform, decoding)
        else:
            print('Begin {:s} decoding {:d} samples'.format(dType, nsamples))
            results = groups.apply(findOptimalPath, trans=transitions, u=isUniform, d=decoding)
        results.set_index(['bin', 'sample'], inplace=True)

    # name the output file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform, isIndiv)

    # header line first, then the table appended to the same file
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info)
    results.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def printHelp():
    """Print the command-line usage for epigenotyping_pe_combbin.py.

    The defaults shown are read from the module-level ``DECODE``,
    ``NUMPROC``, ``COMBINE`` and ``BINSIZE`` constants.
    """
    print('Usage:\npython epigenotyping_pe_combbin.py [-u] [-c=bin_thresh] [-d=decoding_type] [-p=num_proc]\n[-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] <input_file>')
    print('Required:')
    print('input_file\tfile of weighted methylation by position for samples')
    print('Optional:')
    print('-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]')
    print('-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE))
    print('-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]')
    # closing bracket added to the two defaults below
    print('-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC))
    print('-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE))
    print('-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]')
    print('-f=father_label\tsample name of father; for correct classification\n\t\t[default father]')
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
def printHelp():
    """Print the command-line usage for epigenotyping_pe.py.

    The bin-size default shown is read from the module-level ``BINSIZE``
    constant.
    """
    print('Usage: python epigenotyping_pe.py [-u] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_samples] [-mx=add_mother_labels] [-f=father_samples] [-fx=add_father_labels] [-b=bin_size] <input_file>')
    print('Required:')
    print('input_file\tfile of weighted methylation by position for samples')
    print('Optional:')
    print('-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]')
    print('-d=decode_type\tdecoding type to use (capitalization ignored) [default A]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"')
    print('-o=out_id\tidentifier for output file [default out or variation of\n\t\tinput file name]')
    print('-p=num_proc\tnumber of processors')
    print('-m=mother_labels\tsample name(s) of mother; for correct classification\n\t\tand MPV calculation [default mother]')
    print('-mx=add_mother_labels\tadditional samples to train as mother\n\t\tnot used for MPV calculation')
    print('-f=father_labels\tsample name(s) of father; for correct classification\n\t\tand MPV calculation [default father]')
    # the -fx entry was previously labelled add_mother_labels
    print('-fx=add_father_labels\tadditional samples to train as father\n\t\tnot used for MPV calculation')
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
def printHelp():
    """Print the command-line usage for epigenotyping_pe_v7.2.py.

    The defaults shown are read from the module-level ``DECODE``,
    ``NUMPROC``, ``COMBINE`` and ``BINSIZE`` constants.
    """
    print('Usage:\npython epigenotyping_pe_v7.2.py [-q] [-g=generation] [-c=bin_thresh] [-d=decoding_type] [-p=num_proc]\n[-o=out_id] [-m=mother_sample(s)] [-f=father_sample(s)] [-b=bin_size] <input_file>')
    print('Required:')
    print('input_file\tfile of weighted methylation by position for samples')
    print('Optional:')
    print('-q\t\tquiet, do not print progress')
    print('-g=generation\tgeneration of self-crossing; used to determine\n\t\tclassification probabilities; use 0 for uniform weight\n\t\t[default 2]')
    print('-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE))
    print('-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]')
    # closing bracket added to the two defaults below
    print('-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC))
    print('-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE))
    print('-m=mother_samples\tsample name(s) of mother; for correct classification\n\t\t[default mother]')
    print('-f=father_samples\tsample name(s) of father; for correct classification\n\t\t[default father]')
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, parentAddLabelAr, decoding, isUniform):
    """Classify bins, optionally decode per sample, and write the results.

    Reads the weighted methylation table, validates the parent labels (and
    the additional training labels when any are given), classifies every bin
    and -- unless *decoding* is 'N' -- decodes each sample with a shared
    transition matrix.  The result is written as a tab-separated file
    preceded by one ``#from_script`` header line.

    Args:
        inFileStr: path of the weighted methylation file.
        numProc: number of processors handed to the helpers.
        binSize: bin size in the units of the ``pos`` column.
        outID: output identifier forwarded to ``determineOutputFileName``.
        parentLabelAr: pair of lists -- mother labels, father labels.
        parentAddLabelAr: pair of lists with additional mother / father
            training labels.
        decoding: decoding code; 'N' disables decoding.
        isUniform: forwarded to ``runClassification``.
    """
    inBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)
    motherLabels = parentLabelAr[0]
    fatherLabels = parentLabelAr[1]

    info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; mother_samples:{:s}; father_samples:{:s}'.format(
        inBaseName,
        binStr,
        formatDecoding(decoding).lower().replace('and', ','),
        str(isUniform).lower(),
        ','.join(motherLabels),
        ','.join(fatherLabels))

    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binStr)
    print('Mother label(s):', ', '.join(motherLabels))
    print('Father label(s):', ', '.join(fatherLabels))
    if not (len(parentAddLabelAr[0]) == 0 and len(parentAddLabelAr[1]) == 0):
        addMother = parentAddLabelAr[0]
        addFather = parentAddLabelAr[1]
        print('Additional mother training label(s):',
              ', '.join(addMother) if len(addMother) != 0 else 'None')
        print('Additional father training label(s):',
              ', '.join(addFather) if len(addFather) != 0 else 'None')
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', formatDecoding(decoding))

    # load the table; the first line of the file is the info header
    df = pd.read_table(inFileStr, header=1)

    # validate the parent labels
    parentLabelAr = checkParents(df['sample'], parentLabelAr)

    # validate the additional training labels, if there are any
    if not (len(parentAddLabelAr[0]) == 0 and len(parentAddLabelAr[1]) == 0):
        parentAddLabelAr = checkParents(df['sample'], parentAddLabelAr)

    # assign bins
    df['bin'] = df.pos // binSize
    nbins = max(df['bin']) + 1
    dfBinGroup = df.groupby('bin')

    # classify each bin
    print('Begin classifying {:d} bins with {:d} processors'.format(nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, parentLabelAr, parentAddLabelAr, isUniform)
    dfClass.reset_index(inplace=True)
    del df, dfBinGroup

    if decoding == 'N':
        # no decoding requested: the classification is the final result
        dfOutput = dfClass
    else:
        totalParentLabelAr = [
            parentLabelAr[i] + parentAddLabelAr[i] for i in (0, 1)
        ]
        ignoreAr = flattenList(totalParentLabelAr) + ['MPV']
        transition = Transitions(dfClass, ignore=ignoreAr)
        transitionMatrix = transition.getTransitions()

        # decode sample by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)
        print('Begin {:s} decoding {:d} samples with {:d} processors'.format(
            formatDecoding(decoding), nsamples, numProc))
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrix, decoding)
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del dfSampleGroup

    # header line first, then the table appended to the same file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform)
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutput.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform):
    """Build the name of the logistic-regression output file.

    With an *outID* the base name is ``<outID>_logreg_<bin>.tsv``.  Without
    one, the input file's base name is reused with ``_wm_pos_`` replaced by
    ``_logreg_<bin>_``; if the base name carries no ``_wm_pos_`` tag the name
    is ``out_logreg_<bin>.tsv``.  Option suffixes are then inserted in front
    of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb`` and any other value adds ``vit_fb``.
        isUniform: when true, tag the name with ``uni``.

    Returns:
        The output file name.
    """
    outBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    if outID is None:
        # The tag is searched in the base name because that is the string
        # being rewritten; a tag in a directory name must not count.
        if '_wm_pos_' in outBaseName:
            outFileStr = outBaseName.replace(
                '_wm_pos_', '_logreg_{:s}_'.format(binStr))
        else:
            outFileStr = 'out_logreg_{:s}.tsv'.format(binStr)
    else:
        outFileStr = '{:s}_logreg_{:s}.tsv'.format(outID, binStr)

    if decoding == 'N':
        if isUniform:
            outFileStr = outFileStr.replace('.tsv', '_uni.tsv')
        return outFileStr

    # decoding requested: pick its label once, then add the uniform tag
    if decoding == 'V':
        decodeStr = 'vit'
    elif decoding == 'F':
        decodeStr = 'fb'
    else:
        decodeStr = 'vit_fb'
    template = '_uni-{:s}.tsv' if isUniform else '_{:s}.tsv'
    return outFileStr.replace('.tsv', template.format(decodeStr))
def processInputs(gffFileStrAr, fastaIndexStr, labels, calcType, outID, chrmList, numProc, numBins, binSize):
    """Bin every GFF file along the chromosomes and write one table.

    Each GFF file is handed to ``processGFFFile`` in a worker pool; the
    collected results are passed to ``writeOutput`` together with a
    ``#from_script`` info line describing the run.

    Args:
        gffFileStrAr: list of GFF file paths.
        fastaIndexStr: path of the FASTA index, read by ``readFastaIndex``.
        labels: sample labels, or None to derive them with
            ``getSampleNames``.
        calcType: 'l' names the output ``length``, anything else ``number``;
            'n' reports the unit as ``number``, anything else ``total bp``.
        outID: output identifier; the empty string adds nothing to the name.
        chrmList: chromosome list forwarded to ``readFastaIndex``.
        numProc: number of worker processes.
        numBins: number of bins, or -1 when unused.
        binSize: bin size, or -1 when unused.
    """
    if labels is None:
        labels = getSampleNames(gffFileStrAr)
    chrmDict = readFastaIndex(fastaIndexStr, chrmList)
    print('Read FASTA index.')
    aNumBins, chrmDict = determineNumBins(chrmDict, numBins, binSize)

    # output file name
    outFileStr = 'chrm_gff'
    if outID != '':
        outFileStr += '_' + outID
    outFileStr += '_' + ('length' if calcType == 'l' else 'number')
    if numBins != -1:
        outFileStr += '_n{:d}'.format(numBins)
    elif binSize != -1:
        outFileStr += '_{:s}'.format(bth_util.binSizeToStr(binSize))
    outFileStr += '.tsv'

    print('Begin processing with {:d} processors'.format(numProc))
    pool = multiprocessing.Pool(processes=numProc)
    try:
        results = [
            pool.apply_async(
                processGFFFile,
                args=(f, chrmDict, binSize, aNumBins, calcType))
            for f in gffFileStrAr
        ]
        gffDictAr = [p.get() for p in results]
    finally:
        # Release the worker processes even when a task raises.
        pool.close()
        pool.join()

    # info line for the output header
    info = "#from_script:chrom_plot_gff_pe.py; "
    gffTmpAr = [os.path.basename(x) for x in gffFileStrAr]
    info += 'gff_files:' + ','.join(gffTmpAr) + ';'
    if binSize == -1:
        info += "num_bins:{:d}".format(aNumBins)
    else:
        info += "bin_size:{:s}".format(bth_util.binSizeToStr(binSize))
    info += "; num_chrms:{:d};".format(len(chrmDict.keys()))
    info += " unit:{:s} per ".format('number' if calcType == 'n' else 'total bp')
    if binSize == -1:
        info += '10kb'
    elif (binSize // 1000000) > 0:
        info += str(binSize / 1000000) + 'mbp'
    elif (binSize // 1000) > 0:
        info += str(binSize / 1000) + 'kbp'
    else:
        info += str(binSize) + 'bp'

    print('Writing output to {:s}...'.format(outFileStr))
    writeOutput(outFileStr, gffDictAr, labels, info)
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, isSmoothing, isUniform):
    """Classify bins by logistic regression, optionally smooth, and write results.

    Reads the weighted methylation table, assigns positions to bins of
    *binSize*, classifies each bin and -- when *isSmoothing* is set -- finds
    the optimal path per sample from a transition matrix.  The result is
    written as a tab-separated file preceded by one ``#from_script`` header
    line.

    Args:
        inFileStr: path of the weighted methylation file.
        numProc: number of processors; values above 1 use the multi-process
            helpers.
        binSize: bin size in the units of the ``pos`` column.
        outID: output identifier forwarded to ``determineOutputFileName``.
        parentLabelAr: list starting with the mother and father labels.
        isSmoothing: whether to smooth the classification per sample.
        isUniform: forwarded to the classification helpers.
    """
    inBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)
    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}'.format(
        inBaseName, binStr)

    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binStr)
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Smoothing:', str(isSmoothing))
    print('Uniform classification probabilities:', str(isUniform))
    info += '; smoothing:{:s}; uni_class_prob:{:s}\n'.format(
        str(isSmoothing), str(isUniform))

    # load the table; the first line of the file is the info header
    df = pd.read_table(inFileStr, header=1)

    # make sure the parent labels are usable
    checkParents(df['sample'], parentLabelAr)

    # assign bins and classify each one
    df['bin'] = df.pos // binSize
    nbins = max(df['bin']) + 1
    dfg = df.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(nbins, numProc))
        res_class = runMultiprocessing(dfg, numProc, parentLabelAr, isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        res_class = dfg.apply(classLogRegImproved, pla=parentLabelAr, u=isUniform)
    res_class.reset_index(inplace=True)

    if not isSmoothing:
        # no smoothing requested: the classification is the final result
        results = res_class
    else:
        ignoreAr = parentLabelAr + ['MPV']
        transition = SimpleTransitions(res_class, ignore=ignoreAr)
        transProbMat = transition.run()
        print(transProbMat)

        # smooth every sample along its optimal path
        groups = res_class.groupby('sample')
        nsamples = len(groups.groups)
        if numProc > 1:
            print('Begin smoothing {:d} samples with {:d} processors'.format(nsamples, numProc))
            results = runMulti(groups, numProc, transProbMat)
        else:
            print('Begin smoothing {:d} samples'.format(nsamples))
            results = groups.apply(findOptimalPath, trans=transProbMat)
        results.set_index(['bin', 'sample'], inplace=True)

    # name the output file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, isSmoothing, isUniform)

    # header line first, then the table appended to the same file
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info)
    results.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def determineTransFileName(inFileStr, outID, binSize, combineBins, scaleFactor):
    """Build the name of the epigenotype transition file.

    With an *outID* the base name is ``<outID>_epigenotype-trans_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_epigenotype-trans_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_epigenotype-trans_<bin>.tsv``.  Option
    suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.
        scaleFactor: when not equal to 1, adds ``_s<scaleFactor>`` with every
            '.' replaced by '-'.

    Returns:
        The transition file name.
    """
    outBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    if outID is None:
        # Search the base name for the tag; matching a directory name would
        # leave the base name unchanged by the replace.
        if '_wm_pos_' in outBaseName:
            outFileStr = outBaseName.replace(
                '_wm_pos_', '_epigenotype-trans_{:s}_'.format(binStr))
        else:
            outFileStr = 'out_epigenotype-trans_{:s}.tsv'.format(binStr)
    else:
        outFileStr = '{:s}_epigenotype-trans_{:s}.tsv'.format(outID, binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # scale factor
    if scaleFactor != 1:
        s = str(scaleFactor).replace('.', '-')
        outFileStr = outFileStr.replace('.tsv', '_s{:s}.tsv'.format(s))
    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, generation, combineBins):
    """Build the name of the epigenotype v7.4 output file.

    With an *outID* the base name is
    ``<outID>_epigenotype-v7.4_<bin>_g-<generation>.tsv``.  Without one, the
    input file's base name is reused with ``_wm_pos_`` replaced by
    ``_epigenotype-v7.4_<bin>_g-<generation>_``; if the base name carries no
    ``_wm_pos_`` tag the name is
    ``out_epigenotype-v7.4_<bin>_g-<generation>.tsv``.  Option suffixes are
    then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        decoding: decoding code; 'N' adds nothing, 'V' adds ``vit``, 'F' adds
            ``fb``, 'B' adds ``both`` and any other value adds ``vit-fb``.
        generation: integer generation, formatted with ``{:d}``.
        combineBins: when greater than 0, adds ``_cb-<combineBins>``.

    Returns:
        The output file name.
    """
    outBaseName = os.path.basename(inFileStr)
    # "<bin>_g-<generation>" is shared by all three name forms
    runStr = '{:s}_g-{:d}'.format(bth_util.binSizeToStr(binSize), generation)

    if outID is None:
        # The base name is what gets rewritten, so it is also what is tested
        # for the tag (rather than the full path).
        if '_wm_pos_' in outBaseName:
            outFileStr = outBaseName.replace(
                '_wm_pos_', '_epigenotype-v7.4_{:s}_'.format(runStr))
        else:
            outFileStr = 'out_epigenotype-v7.4_{:s}.tsv'.format(runStr)
    else:
        outFileStr = '{:s}_epigenotype-v7.4_{:s}.tsv'.format(outID, runStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # decoding
    if decoding == 'N':
        return outFileStr
    if decoding == 'V':
        decodeStr = 'vit'
    elif decoding == 'F':
        decodeStr = 'fb'
    elif decoding == 'B':
        decodeStr = 'both'
    else:
        decodeStr = 'vit-fb'
    return outFileStr.replace('.tsv', '_{:s}.tsv'.format(decodeStr))
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, maxIter):
    """Classify bins by logistic regression, optionally decode, and write results.

    Reads the weighted methylation table, assigns positions to bins of
    *binSize*, classifies each bin and -- unless *decoding* is 'N' -- decodes
    each sample with a shared transition matrix.  The result is written as a
    tab-separated file preceded by one ``#from_script`` header line.

    Args:
        inFileStr: path of the weighted methylation file.
        numProc: number of processors; values above 1 use the multi-process
            helpers.
        binSize: bin size in the units of the ``pos`` column.
        outID: output identifier forwarded to ``determineOutputFileName``.
        parentLabelAr: list starting with the mother and father labels.
        decoding: 'V', 'F', 'A' or 'N'.
        isUniform: forwarded to the classification and decoding helpers.
        maxIter: accepted but not used by this function.
    """
    if decoding == 'V':
        dType = 'Viterbi'
    elif decoding == 'F':
        dType = 'Forward-backward'
    elif decoding == 'A':
        dType = 'Viterbi and Forward-backward'
    else:
        dType = 'None'
    inBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}\n'.format(
        inBaseName, binStr, dType.lower().replace(' and ', ','), str(isUniform))

    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binStr)
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', dType)

    # load the table; the first line of the file is the info header
    df = pd.read_table(inFileStr, header=1)

    # make sure the parent labels are usable
    checkParents(df['sample'], parentLabelAr)

    # assign bins and classify each one
    df['bin'] = df.pos // binSize
    nbins = max(df['bin']) + 1
    dfg = df.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(nbins, numProc))
        res_class = runMultiClassification(dfg, numProc, parentLabelAr, isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        res_class = dfg.apply(classLogReg, pla=parentLabelAr, u=isUniform)
    res_class.reset_index(inplace=True)

    if decoding == 'N':
        # no decoding requested: the classification is the final result
        results = res_class
    else:
        ignoreAr = parentLabelAr + ['MPV']
        print('Generating transition matrix')
        transition = Transitions(res_class, ignore=ignoreAr)
        transitions = transition.getTransitions()

        # decode the best path for every sample
        groups = res_class.groupby('sample')
        nsamples = len(groups.groups)
        if numProc > 1:
            print('Begin {:s} decoding {:d} samples with {:d} processors'.format(dType, nsamples, numProc))
            results = runMultiPath(groups, numProc, transitions, isUniform, decoding)
        else:
            print('Begin {:s} decoding {:d} samples'.format(dType, nsamples))
            results = groups.apply(findOptimalPath, trans=transitions, u=isUniform, d=decoding)
        results.set_index(['bin', 'sample'], inplace=True)

    # name the output file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform)

    # header line first, then the table appended to the same file
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info)
    results.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def determineOutputFileName(inFileStr, outID, binSize, isSmoothing, isUniform):
    """Build the name of the logistic-regression output file.

    With an *outID* the base name is ``<outID>_logreg_<bin>.tsv``.  Without
    one, the input file's base name is reused with ``_wm_pos_`` replaced by
    ``_logreg_<bin>_``; if the base name carries no ``_wm_pos_`` tag the name
    is ``out_logreg_<bin>.tsv``.  An option suffix is then inserted in front
    of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        isSmoothing: when true, tag the name with ``opt``.
        isUniform: when true, tag the name with ``uni`` (``uni-opt`` when
            combined with *isSmoothing*).

    Returns:
        The output file name.
    """
    outBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    if outID is None:
        # A tag that appears only in a directory name must not trigger the
        # rewrite, which acts on the base name alone.
        if '_wm_pos_' in outBaseName:
            outFileStr = outBaseName.replace(
                '_wm_pos_', '_logreg_{:s}_'.format(binStr))
        else:
            outFileStr = 'out_logreg_{:s}.tsv'.format(binStr)
    else:
        outFileStr = '{:s}_logreg_{:s}.tsv'.format(outID, binStr)

    # option tag: uni-opt, opt, uni or nothing
    if isSmoothing:
        suffix = '_uni-opt.tsv' if isUniform else '_opt.tsv'
    elif isUniform:
        suffix = '_uni.tsv'
    else:
        return outFileStr
    return outFileStr.replace('.tsv', suffix)
def determineTransFileName(inFileStr, outID, binSize, combineBins):
    """Build the name of the transition output file.

    With an *outID* the base name is ``<outID>_transition_<bin>.tsv``.
    Without one, the input file's base name is reused with ``_wm_pos_``
    replaced by ``_transition_<bin>_``; if the base name carries no
    ``_wm_pos_`` tag the name is ``out_transition_<bin>.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size; rendered with ``bth_util.binSizeToStr``.
        combineBins: when greater than 0, ``_cb-<combineBins>`` is inserted
            in front of ``.tsv``.

    Returns:
        The transition file name.
    """
    outBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    if outID is None:
        # Check the base name for the tag, since that is the string the
        # replace operates on.
        if '_wm_pos_' in outBaseName:
            outFileStr = outBaseName.replace(
                '_wm_pos_', '_transition_{:s}_'.format(binStr))
        else:
            outFileStr = 'out_transition_{:s}.tsv'.format(binStr)
    else:
        outFileStr = '{:s}_transition_{:s}.tsv'.format(outID, binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv', '_cb-{:d}.tsv'.format(combineBins))
    return outFileStr
def parseInputs(argv):
    """Parse the command-line arguments and start processing.

    Up to the first seven arguments are scanned for options; the argument
    following the recognized options is taken as the input file and handed
    to ``processInputs`` with the parsed settings.

    Recognized options: ``-o=out_id``, ``-b=bin_size``, ``-p=num_proc``,
    ``-m=mother_label``, ``-f=father_label``, ``-n`` (no smoothing), ``-u``
    (uniform class probabilities) and ``-h`` / ``--help`` / ``-help``.

    Args:
        argv: list of command-line arguments without the script name.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third item records which parent labels were given: +1 mother, +2 father
    parentLabelAr = ['mother', 'father', 0]
    isSmoothing = True
    isUniform = False
    startInd = 0

    for i in range(min(7, len(argv))):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # equality (not identity) with False is kept on purpose so a
            # zero distance is rejected the same way as a failed conversion
            if binSize == False:
                print('WARNING: cannot convert {:s} to bin size...using default {:s}'
                      .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using default {}'
                      .format(NUMPROC))
                numProc = NUMPROC
            # Count the option even when its value is invalid; otherwise the
            # option itself would later be taken as the input file.
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg == '-n':
            isSmoothing = False
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    if startInd >= len(argv):
        # only options were given: report it instead of raising IndexError
        print('ERROR: input file not specified')
        exit()
    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  isSmoothing, isUniform)
def printHelp():
    """Print the command-line usage for epigenotyping_pe.py.

    The bin-size default shown is read from the module-level ``BINSIZE``
    constant.
    """
    helpLines = [
        'Usage: python epigenotyping_pe.py [-u] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] <input_file>',
        'Required:',
        'input_file\tfile of weighted methylation by position for samples',
        'Optional:',
        '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]',
        '-d=decode_type\tdecoding type to use (capitalization ignored) [default A]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"',
        '-o=out_id\tidentifier for output file [default out or variation of\n\t\tinput file name]',
        '-p=num_proc\tnumber of processors',
        '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]',
        '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]',
        '-b=bin_size\tsize of bins in bp [default {:s}]'.format(
            bth_util.binSizeToStr(BINSIZE)),
    ]
    # one print call per entry, as before
    for line in helpLines:
        print(line)
def processInputs(bedFileStrAr, fastaIndexStr, labels, outID, chrmList, numProc, numBins, binSize, percentile):
    """Bin every BED file along the chromosomes and write one table.

    Each BED file is handed to ``processBEDFile`` in a worker pool; the
    collected results are passed to ``writeOutput`` together with a
    ``#from_script`` info line describing the run.

    Args:
        bedFileStrAr: list of BED file paths.
        fastaIndexStr: path of the FASTA index, read by ``readFastaIndex``.
        labels: sample labels, or None to derive them with
            ``getSampleNames``.
        outID: output identifier; the empty string adds nothing to the name.
        chrmList: chromosome list forwarded to ``readFastaIndex``.
        numProc: number of worker processes.
        numBins: number of bins, or -1 when unused.
        binSize: bin size, or -1 when unused.
        percentile: fraction forwarded to ``processBEDFile``; reported in
            the info line multiplied by 100.
    """
    if labels is None:
        labels = getSampleNames(bedFileStrAr)
    print('Reading FASTA index')
    chrmDict = readFastaIndex(fastaIndexStr, chrmList)
    aNumBins, chrmDict = determineNumBins(chrmDict, numBins, binSize)

    # output file name
    outFileStr = 'chrm_bed'
    if outID != '':
        outFileStr += '_' + outID
    if numBins != -1:
        outFileStr += '_n{:d}'.format(numBins)
    elif binSize != -1:
        outFileStr += '_{:s}'.format(bth_util.binSizeToStr(binSize))
    outFileStr += '.tsv'

    print('Begin processing with {:d} processors'.format(numProc))
    pool = multiprocessing.Pool(processes=numProc)
    try:
        results = [
            pool.apply_async(
                processBEDFile,
                args=(f, chrmDict, binSize, aNumBins, percentile))
            for f in bedFileStrAr
        ]
        bedDictAr = [p.get() for p in results]
    finally:
        # Release the worker processes even when a task raises.
        pool.close()
        pool.join()

    # info line for the output header
    info = "#from_script:chrom_plot_bed_mid_pe.py; "
    if binSize == -1:
        info += "num_bins:{:d}; ".format(aNumBins)
    else:
        info += "bin_size:{:s}; ".format(bth_util.binSizeToStr(binSize))
    info += "num_chrms:{:d}; ".format(len(chrmDict.keys()))
    info += "percentile:{:.1f}; ".format(percentile * 100)
    info += "unit:million reads per bin normalized by library size"

    print('Writing output to {:s}...'.format(outFileStr))
    writeOutput(outFileStr, bedDictAr, labels, info)
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform ):
    """Classify each bin, optionally decode per sample, and write the result.

    Reads the weighted-methylation table `inFileStr`, assigns positions to
    bins of `binSize`, runs runClassification over the bins and, unless
    `decoding` is 'N', runDecoding over the samples.  The table is written
    tab-separated beneath a one-line '#from_script' header.
    """
    inName = os.path.basename( inFileStr )
    binLabel = bth_util.binSizeToStr( binSize )
    uniformText = str( isUniform )

    # header line stored as the first line of the output file
    info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}'.format(
        inName,
        binLabel,
        formatDecoding( decoding ).lower().replace( 'and', ',' ),
        uniformText.lower() )

    # echo the run settings
    settings = (
        ( 'Weighted methylation file:', inName ),
        ( 'Bin size:', binLabel ),
        ( 'Mother label:', parentLabelAr[0] ),
        ( 'Father label:', parentLabelAr[1] ),
        ( 'Uniform classification probabilities:', uniformText ),
        ( 'Decoding algorithm:', formatDecoding( decoding ) ),
    )
    for caption, value in settings:
        print( caption, value )

    # load the table and validate the parent sample names
    table = pd.read_table( inFileStr, header=1 )
    checkParents( table['sample'], parentLabelAr )

    # assign each position to a bin
    table['bin'] = table.pos // binSize
    binCount = max( table['bin'] ) + 1
    byBin = table.groupby( 'bin' )

    # per-bin classification
    print( 'Begin classifying {:d} bins with {:d} processors'.format( binCount, numProc ) )
    classified = runClassification( byBin, numProc, parentLabelAr, isUniform )
    classified.reset_index( inplace=True )
    del table, byBin

    if decoding == 'N':
        # decoding switched off: classification is the final result
        result = classified
    else:
        # transitions are estimated without the parents and MPV
        skipSamples = parentLabelAr[:2] + ['MPV']
        transitionMatrix = Transitions( classified, ignore = skipSamples ).getTransitions()
        bySample = classified.groupby( 'sample' )
        sampleCount = len( bySample.groups )
        print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding( decoding ), sampleCount, numProc ) )
        result = runDecoding( bySample, numProc, transitionMatrix, decoding )
        result.set_index( ['bin', 'sample'], inplace=True )
        del bySample

    # header first, then the table appended to the same file
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
    print( 'Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as handle:
        handle.write( info + '\n' )
    result.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def determineOutputFileName(inFileStr, outID, binSize, decoding, classProbs,
                            scaleTransitions, combineBins):
    """Build the name of the epigenotype (v9) output file.

    Parameters:
        inFileStr: path of the input file; only its base name is used.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with bth_util.binSizeToStr.
        decoding: 'N' (none), 'V', 'F', 'B'; any other value is tagged
            'vit-fb'.
        classProbs: 'E' adds an 'epiril' tag.
        scaleTransitions: truthy adds a '_scaled' tag.
        combineBins: values above 0 add a '_cb-<n>' tag.

    Returns:
        The output file name.  Tags are inserted in front of '.tsv'.
    """
    binStr = bth_util.binSizeToStr(binSize)
    outBaseName = os.path.basename(inFileStr)

    if outID is None:
        # Test the base name, not the whole path: a directory containing
        # '_wm_pos_' used to leave the base name untouched, so the output
        # name equalled the input file name.
        if '_wm_pos_' in outBaseName:
            outFileStr = outBaseName.replace(
                '_wm_pos_', '_epigenotype-v9_{:s}_'.format(binStr))
        else:
            outFileStr = 'out_epigenotype-v9_{:s}.tsv'.format(binStr)
    else:
        outFileStr = '{:s}_epigenotype-v9_{:s}.tsv'.format(outID, binStr)

    # collect every tag, then splice them in with a single replace
    suffix = ''
    if combineBins > 0:
        suffix += '_cb-{:d}'.format(combineBins)
    if scaleTransitions:
        suffix += '_scaled'

    # class-probability and decoding tags share one '-'-joined component
    tags = []
    if classProbs == 'E':
        tags.append('epiril')
    if decoding != 'N':
        decodeTag = {'V': 'vit', 'F': 'fb', 'B': 'both'}
        tags.append(decodeTag.get(decoding, 'vit-fb'))
    if tags:
        suffix += '_' + '-'.join(tags)

    if suffix:
        outFileStr = outFileStr.replace('.tsv', suffix + '.tsv')
    return outFileStr
def printHelp():
    """Print the usage message for epigenotyping_combin_iter-trans.py."""
    print( 'Usage:\npython epigenotyping_combin_iter-trans.py [-u] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] [-n=max_iter] <input_file>' )
    print( 'Requried:' )
    print( 'input_file\tfile of of weighted methylation by position for samples' )
    print( 'Optional:' )
    print( '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]' )
    print( '-d=decode_type\tdecoding type to use (capitlization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
    print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
    print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
    print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
    print( '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]' )
    print( '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]' )
    print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
    # '{}' instead of '{:s}': the 's' format code raises ValueError for a
    # numeric MAXITER, which would abort the help output on its last line
    print( '-n=max_iter\tmaximum iterations to improve transition matrix [default {}]'.format(MAXITER) )
def printHelp():
    """Print the usage message for epigenotyping_pe_v8.2.py."""
    print( 'Usage:\npython epigenotyping_pe_v8.2.py [-e | -u] [-q] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample(s)] [-f=father_sample(s)] [-b=bin_size] <input_file>' )
    print( 'Requried:' )
    print( 'input_file\tfile of of weighted methylation by position for samples' )
    print( 'Optional:' )
    print( '-e\t\tclass weights for epiRILs; 1:0:1 for mother,MPV,father' )
    print( '-q\t\tquiet, do not print progress' )
    print( '-d=decode_type\tdecoding type to use (capitlization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
    print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
    # closing ']' added: the default was printed with an unbalanced bracket
    print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
    print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
    print( '-m=mother_samples\tsample name(s) of mother; for correct classification\n\t\t[default mother]' )
    print( '-f=father_samples\tsample name(s) of father; for correct classification\n\t\t[default father]' )
    print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
def printHelp():
    """Print the usage message for epigenotyping_pe_combbin_scaled.py."""
    print( 'Usage:\npython epigenotyping_pe_combbin_scaled.py [-u] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] [-t=cent_start,cent_end] [-s=scale_factor] <input_file>' )
    print( 'Requried:' )
    print( 'input_file\tfile of of weighted methylation by position for samples' )
    print( 'Optional:' )
    print( '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]' )
    print( '-d=decode_type\tdecoding type to use (capitlization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
    print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
    # closing ']' added: the default was printed with an unbalanced bracket
    print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
    print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
    print( '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]' )
    print( '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]' )
    print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
    print( '-t=cent_start,cent_end\tcoordinates for centromere [default None]\n\t\twhen included, ignores this region for transitions and decoding' )
    # '{}' instead of '{:d}': the scale factor is parsed as a float, and the
    # 'd' format code raises ValueError if the default is a float as well
    print( '-s=scale_factor\tmultiplicative factor for weighting prediction probability over transition probability [default {} (unscaled)]'.format( SCALE ) )
def processInputs( bedFileStrAr, fastaIndexStr, labels, outID, chrmList, numProc, numBins, binSize, percentile ):
    """Process every BED file in a worker pool and write one combined table.

    Parameters:
        bedFileStrAr: iterable of BED file paths, one per sample.
        fastaIndexStr: path handed to readFastaIndex.
        labels: sample labels; when None they come from getSampleNames.
        outID: identifier inserted in the output name ('' for none).
        chrmList: chromosome selection handed to readFastaIndex.
        numProc: number of worker processes.
        numBins: requested bin count, or -1 when binning by size.
        binSize: requested bin size, or -1 when binning by count.
        percentile: fraction forwarded to processBEDFile (printed as a
            percentage in the header).

    The output file is 'chrm_bed[_<outID>][_n<numBins>|_<binSize>].tsv'.
    """
    if labels is None:
        labels = getSampleNames( bedFileStrAr )
    print( 'Reading FASTA index' )
    chrmDict = readFastaIndex( fastaIndexStr, chrmList )
    aNumBins, chrmDict = determineNumBins( chrmDict, numBins, binSize )

    # assemble the output file name
    outFileStr = 'chrm_bed'
    if outID != '':
        outFileStr += '_' + outID
    if numBins != -1:
        outFileStr += '_n{:d}'.format( numBins )
    elif binSize != -1:
        outFileStr += '_{:s}'.format( bth_util.binSizeToStr( binSize ) )
    outFileStr += '.tsv'

    print( 'Begin processing with {:d} processors'.format( numProc ) )
    # the context manager shuts the workers down once all results are
    # collected; previously the pool was never closed
    with multiprocessing.Pool( processes=numProc ) as pool:
        results = [ pool.apply_async( processBEDFile, args=(f, chrmDict, binSize, aNumBins, percentile ) ) for f in bedFileStrAr ]
        bedDictAr = [ p.get() for p in results ]

    # header line describing how the table was produced
    info = "#from_script:chrom_plot_bed_mid_pe.py; "
    if binSize == -1:
        info += "num_bins:{:d}; ".format( aNumBins )
    else:
        info += "bin_size:{:s}; ".format( bth_util.binSizeToStr( binSize ) )
    info += "num_chrms:{:d}; ".format( len( chrmDict ) )
    info += "percentile:{:.1f}; ".format( percentile*100 )
    info += "unit:million reads per bin normalized by library size"

    print( 'Writing output to {:s}...'.format( outFileStr ) )
    writeOutput( outFileStr, bedDictAr, labels, info )
def printHelp():
    """Print the usage message for epigenotyping_pe_v7.3.py."""
    print( 'Usage:\tpython epigenotyping_pe_v7.3.py [-q] [-n-mpv] [-t-out] [-g=generation]\n\t[-c=bin_thresh] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_\n\tsamples][-f=father_samples] [-b=bin_size] [-t=centromere] <input_file>' )
    print()
    print( 'Requried:' )
    print( 'input_file\ttab-delimited file of of weighted methylation by position for samples' )
    print()
    print( 'Optional:' )
    print( '-q\t\tquiet; do not print progress' )
    print( '-h\t\tprint help and exit' )
    print( '-n-mpv\t\tdo not check for systematic mid-parent bias' )
    print( '-t-out\t\twrite transition matrix to file' )
    print( '-g=generation\tgeneration of self-crossing; used to determine classification\n\t\tprobabilities; use 0 for uniform weight [default {:d}]'.format( GENERATION) )
    print( '-d=decode_type\tdecoding type to use (capitlization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tAll (FB and Vit independently)="all" or "a"\n\t\tBoth (FB then Vit)="both" or "b"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
    print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
    # closing ']' added: the default was printed with an unbalanced bracket
    print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
    print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
    print( '-m=mother_samples\tcomma-separated sample name(s) of mother\n\t\t[default mother]' )
    print( '-f=father_samples\tcomma-separated sample name(s) of father\n\t\t[default father]' )
    print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
    print( '-t=centromere\tcentromere coordinates as "start,end"; can include multipe\n\t\tcentromeres as "start1,end1,start2,end2..." [default None]' )
def printHelp():
    """Print the usage message for epigenotyping_pe_v7.3.py."""
    print(
        'Usage:\tpython epigenotyping_pe_v7.3.py [-q] [-n-mpv] [-t-out] [-g=generation]\n\t[-c=bin_thresh] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_\n\tsamples][-f=father_samples] [-b=bin_size] [-t=centromere] <input_file>'
    )
    print()
    print('Requried:')
    print(
        'input_file\ttab-delimited file of of weighted methylation by position for samples'
    )
    print()
    print('Optional:')
    print('-q\t\tquiet; do not print progress')
    print('-h\t\tprint help and exit')
    print('-n-mpv\t\tdo not check for systematic mid-parent bias')
    print('-t-out\t\twrite transition matrix to file')
    print(
        '-g=generation\tgeneration of self-crossing; used to determine classification\n\t\tprobabilities; use 0 for uniform weight [default {:d}]'
        .format(GENERATION))
    print(
        '-d=decode_type\tdecoding type to use (capitlization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tAll (FB and Vit independently)="all" or "a"\n\t\tBoth (FB then Vit)="both" or "b"\n\t\tOff="false", "none", or "n"'
        .format(DECODE))
    print(
        '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]'
    )
    # closing ']' added: the default was printed with an unbalanced bracket
    print('-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC))
    print(
        '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'
        .format(COMBINE))
    print(
        '-m=mother_samples\tcomma-separated sample name(s) of mother\n\t\t[default mother]'
    )
    print(
        '-f=father_samples\tcomma-separated sample name(s) of father\n\t\t[default father]'
    )
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
    print(
        '-t=centromere\tcentromere coordinates as "start,end"; can include multipe\n\t\tcentromeres as "start1,end1,start2,end2..." [default None]'
    )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, cent, isPrint ):
    """Classify bins, optionally decode per sample, and write the result table.

    Steps, in order: read the weighted-methylation table, validate parent
    labels, assign positions to bins, optionally merge bins
    (combineBins > 0), classify each bin, optionally decode each sample
    (decoding != 'N'; 'B' runs a forward-backward pass and then a Viterbi
    pass on its scores), and write a tab-separated file beneath a one-line
    header.

    Parameters:
        inFileStr: path of the input table.
        numProc: number of processors handed to the classification and
            decoding helpers.
        binSize: bin width used for `pos // binSize`.
        outID: output identifier forwarded to determineOutputFileName.
        parentLabelAr: mother label(s) at index 0, father label(s) at 1.
        decoding: decoding code; 'N' disables decoding, 'B' is two-stage.
        classProbs: class-probability code forwarded to runClassification.
        combineBins: bin-merging threshold; 0 or less disables merging.
        cent: flat list of centromere coordinates (start, end, start, ...)
            or None.
        isPrint: when true, progress messages are printed.
    """
    # header line for the output file; only the first centromere pair is recorded here
    info = '#from_script: epigenotyping_pe_v7.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), formatClassProbs(classProbs).lower(), combineBins, ('None' if cent == None else '{:s}-{:s}'.format( bth_util.binSizeToStr( cent[0] ), bth_util.binSizeToStr( cent[1] ) ) ) )
    if isPrint:
        print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
        print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
        print( 'Mother label(s):', parentLabelAr[0] )
        print( 'Father label(s):', parentLabelAr[1] )
        print( 'Classification probabilities:', formatClassProbs( classProbs ) )
        print( 'Decoding algorithm:', formatDecoding( decoding ) )
        print( 'Combine bin feature threshold:', combineBins )
    # printable centromere description: 'start-end' pairs joined by '; '
    if cent == None:
        centStr = 'None'
    else:
        centStr = ''
        for i in range(len(cent)//2):
            si = i*2
            centStr += '; {:s}-{:s}'.format( bth_util.binSizeToStr( cent[si] ), bth_util.binSizeToStr( cent[si+1] ) )
        # drop the leading '; '
        centStr = centStr[2:]
    if isPrint:
        print( 'Centromere:', centStr )
    # build dataframe
    if isPrint:
        print( ' Reading input file', os.path.basename( inFileStr ) )
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels
    newParentLabelAr = checkParents( df['sample'], parentLabelAr )
    # samples excluded from the transition estimate: all parent labels plus
    # one 'MPV<i>' entry per mother label
    tIgnoreAr = flattenList( newParentLabelAr[:2] )
    for i in range(len(newParentLabelAr[0])):
        tIgnoreAr += [ 'MPV{:d}'.format( i ) ]
    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None
    # get centromere bins if necessary
    if cent == None:
        centBins = []
    else:
        # convert coordinates to bin indices, then expand each (start, end)
        # pair into the inclusive range of bins it covers
        cent = [ x // binSize for x in cent ]
        centBins = []
        #centBins = list( range(cent[0], cent[1]+1) )
        for i in range(len(cent) // 2 ):
            si = i * 2
            centBins += list( range(cent[si], cent[si+1]+1) )
    # combine bins if necessary
    nbins = max(df['bin'])+1
    if combineBins > 0:
        if isPrint:
            print( ' Merging bins', end=' ... ' )
        # keep the original bin in 'tBin' and remap 'bin' through the transformation
        df['tBin'] = df['bin']
        transformation = binTransformation( df, combineBins )
        # apply the transformation
        df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
    dfBinGroup = df.groupby( 'bin' )
    if combineBins > 0:
        newNBins = len( dfBinGroup.groups )
        info += '; non-functional_bins:{:d}'.format( nbins - newNBins )
        if isPrint:
            print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
    # classify by bin
    if isPrint:
        print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, newParentLabelAr, classProbs )
    dfClass.reset_index(inplace=True)
    #print( dfClass.head )
    del(df, dfBinGroup )
    # decode, if necessary
    if decoding != 'N':
        #ignoreAr = parentLabelAr[:2] + ['MPV']
        transition = Transitions( dfClass, ignore = tIgnoreAr )
        transitionMatrix = transition.getTransitions()
        # write this matrix to file
        #outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
        #tLabels = [ 'mother', 'MPV', 'father' ]
        #transData = pd.DataFrame( transitionMatrix, index=tLabels, columns= tLabels )
        #with open( outFStr, 'w' ) as f:
        #    f.write(info+'\n')
        #transData.to_csv( outFStr, sep='\t', mode='a' )
        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len( dfSampleGroup.groups )
        # 'B' starts with a forward-backward pass; the Viterbi pass follows below
        tmpDecoding = ( 'F' if decoding == 'B' else decoding )
        if isPrint:
            print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(tmpDecoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, tmpDecoding, centBins )
        if decoding == 'B':
            # build a classification-shaped frame from the log of the
            # forward-backward scores and its predictions
            dfNew = dfOutput.loc[:,['bin','sample']].copy()
            dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
            dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
            dfNew['father'] = np.log(dfOutput['fb.score.father'])
            dfNew['prediction'] = dfOutput['fb.prediction']
            #print(dfOutput.head())
            #print(dfNew.head())
            # re-estimate transitions from that frame and run Viterbi on it
            transition = Transitions( dfNew, ignore = tIgnoreAr )
            transitionMatrix = transition.getTransitions()
            dfSampleGroup = dfNew.groupby( 'sample' )
            nsamples = len( dfSampleGroup.groups )
            if isPrint:
                print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding('V'), nsamples, numProc ) )
            dfOutputN = runDecoding( dfSampleGroup, numProc, transitionMatrix, 'V', centBins )
            # copy the Viterbi columns next to the forward-backward ones
            dfOutput[['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']] = dfOutputN[['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']]
            #print( dfOutput.head() )
        # end decoding == B
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del( dfSampleGroup )
    else:
        dfOutput = dfClass
    # write output
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, classProbs, combineBins )
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation( dfOutput, transformation )
    else:
        # NOTE(review): assumes dfOutput already has a 'cBin' column when no
        # bins were merged -- confirm against runClassification/runDecoding
        dfOutputT = dfOutput.drop('cBin', axis=1)
    if isPrint:
        print( ' Writing output to', outFileStr )
    # header first; the table is then appended to the same file
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
    if isPrint:
        print( 'Done' )
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  isSmoothing, isUniform):
    """Classify each bin, optionally smooth per sample, and write the table.

    Reads `inFileStr`, assigns positions to bins of `binSize`, classifies
    every bin (through runMultiprocessing when numProc > 1, otherwise with
    a group-by apply of classLogRegImproved) and, when `isSmoothing` is
    true, finds a path per sample using the SimpleTransitions matrix.  The
    result is written tab-separated beneath a one-line header.
    """
    inName = os.path.basename(inFileStr)
    binLabel = bth_util.binSizeToStr(binSize)
    smoothText = str(isSmoothing)
    uniformText = str(isUniform)
    parallel = numProc > 1

    # echo the run settings
    for caption, value in (
            ('Weighted methylation file:', inName),
            ('Bin size:', binLabel),
            ('Mother label:', parentLabelAr[0]),
            ('Father label:', parentLabelAr[1]),
            ('Smoothing:', smoothText),
            ('Uniform classification probabilities:', uniformText)):
        print(caption, value)

    # header line (newline-terminated) for the output file
    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}'.format(
        inName, binLabel)
    info += '; smoothing:{:s}; uni_class_prob:{:s}\n'.format(
        smoothText, uniformText)

    # load the table and validate the parent sample names
    table = pd.read_table(inFileStr, header=1)
    checkParents(table['sample'], parentLabelAr)

    # bin the positions and classify each bin
    table['bin'] = table.pos // binSize
    binCount = max(table['bin']) + 1
    byBin = table.groupby('bin')
    if not parallel:
        print('Begin classifying {:d} bins'.format(binCount))
        classified = byBin.apply(classLogRegImproved, pla=parentLabelAr,
                                 u=isUniform)
    else:
        print('Begin classifying {:d} bins with {:d} processors'.format(
            binCount, numProc))
        classified = runMultiprocessing(byBin, numProc, parentLabelAr,
                                        isUniform)
    classified.reset_index(inplace=True)

    if not isSmoothing:
        output = classified
    else:
        # transition matrix estimated without the parent entries and MPV
        skip = parentLabelAr + ['MPV']
        #transProbMat = computeTransitions( res_class, ignoreAr )
        transProbMat = SimpleTransitions(classified, ignore=skip).run()
        print(transProbMat)
        bySample = classified.groupby('sample')
        sampleCount = len(bySample.groups)
        # find optimum path for all samples, group by sample
        if not parallel:
            print('Begin smoothing {:d} samples'.format(sampleCount))
            output = bySample.apply(findOptimalPath, trans=transProbMat)
        else:
            print('Begin smoothing {:d} samples with {:d} processors'.format(
                sampleCount, numProc))
            output = runMulti(bySample, numProc, transProbMat)
        output.set_index(['bin', 'sample'], inplace=True)

    # header first, then the table appended to the same file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize,
                                         isSmoothing, isUniform)
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as handle:
        handle.write(info)
    output.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def parseInputs(argv):
    """Parse command-line arguments and hand the settings to processInputs.

    Every argument except the last is examined as a potential option; the
    argument at the index equal to the number of recognized options is
    taken as the input file, so options are expected to come first.

    Recognized options: -o=, -b=, -p=, -c=, -m=, -f=, -d=, -u, -t=, -s=
    and -h/--help/-help.  An unknown argument starting with '-' prints an
    error and exits.

    Fixes relative to the previous version:
      * warnings for a bad -c or -s value used '{:s}' with a numeric
        default, which raised ValueError inside the except handler;
      * an option with an invalid value did not advance the option count,
        so the wrong argument was used as the input file;
      * a non-positive scale factor was reported but still used;
      * only the first 9 arguments were examined although 10 options exist.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot records which labels were supplied (+1 mother, +2 father)
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    centromere = None
    scaleFactor = SCALE
    startInd = 0

    for arg in argv[:-1]:
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # strToDistance signals failure with False
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            # count the option even when its value is unusable
            startInd += 1
            try:
                numProc = int(arg[3:])
            except ValueError:
                print(
                    'WARNING: number of processors must be integer...using 1')
                numProc = NUMPROC
        elif arg.startswith('-c='):
            startInd += 1
            try:
                combineBins = int(arg[3:])
            except ValueError:
                print(
                    'WARNING: bin threshold must be integer...using default {}'
                    .format(COMBINE))
                combineBins = COMBINE
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            else:
                print(
                    'WARNING: decoding option {:s} not recognized...using default {:s}'
                    .format(opt, DECODE))
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg.startswith('-t='):
            tmp = arg[3:].split(',')
            tmp2 = [bth_util.strToDistance(x) for x in tmp]
            if len(tmp2) != 2 or (False in tmp2):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = tmp2
            startInd += 1
        elif arg.startswith('-s='):
            startInd += 1
            try:
                scaleFactor = float(arg[3:])
            except ValueError:
                print(
                    'WARNING: scale factor must be numeric...using default {}'
                    .format(SCALE))
                scaleFactor = SCALE
            else:
                if scaleFactor <= 0:
                    print(
                        'WARNING: scale factor must be greater than 0...using default',
                        SCALE)
                    scaleFactor = SCALE
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for
    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  decoding, isUniform, combineBins, centromere, scaleFactor)
def parseInputs(argv):
    """Parse command-line arguments and hand the settings to processInputs.

    Every argument except the last is examined as a potential option; the
    argument at the index equal to the number of recognized options is
    taken as the input file, so options are expected to come first.

    Recognized options: -o=, -b=, -p=, -c=, -m=, -f=, -d=, -u and
    -h/--help/-help.  An unknown argument starting with '-' prints an
    error and exits.

    Fixes relative to the previous version:
      * the warning for a bad -c value used '{:s}' with a numeric default,
        which raised ValueError inside the except handler;
      * an option with an invalid value did not advance the option count,
        so the wrong argument was used as the input file;
      * only the first 7 arguments were examined although 8 options exist;
      * the unrecognized-decoding warning named "viterbi" although the
        value actually kept is DECODE.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot records which labels were supplied (+1 mother, +2 father)
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    startInd = 0

    for arg in argv[:-1]:
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # strToDistance signals failure with False
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            # count the option even when its value is unusable
            startInd += 1
            try:
                numProc = int(arg[3:])
            except ValueError:
                print(
                    'WARNING: number of processors must be integer...using 1')
                numProc = NUMPROC
        elif arg.startswith('-c='):
            startInd += 1
            try:
                combineBins = int(arg[3:])
            except ValueError:
                print(
                    'WARNING: bin threshold must be integer...using default {}'
                    .format(COMBINE))
                combineBins = COMBINE
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            else:
                print(
                    'WARNING: decoding option {:s} not recognized...using default {}'
                    .format(opt, DECODE))
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for
    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  decoding, isUniform, combineBins)
def parseInputs( argv ):
    """Parse command-line arguments and hand the settings to processInputs.

    Every argument except the last is examined as a potential option; the
    argument at the index equal to the number of recognized options is
    taken as the input file, so options are expected to come first.

    Recognized options: -o=, -b=, -p=, -c=, -m=, -f=, -d=, -u, -t=, -s=
    and -h/--help/-help.  An unknown argument starting with '-' prints an
    error and exits.

    Fixes relative to the previous version:
      * warnings for a bad -c or -s value used '{:s}' with a numeric
        default, which raised ValueError inside the except handler;
      * an option with an invalid value did not advance the option count,
        so the wrong argument was used as the input file;
      * a non-positive scale factor was reported but still used;
      * only the first 9 arguments were examined although 10 options exist.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot records which labels were supplied (+1 mother, +2 father)
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    centromere = None
    scaleFactor = SCALE
    startInd = 0

    for arg in argv[:-1]:
        if arg.startswith( '-o=' ):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith( '-b=' ):
            inStr = arg[3:]
            binSize = bth_util.strToDistance( inStr )
            # strToDistance signals failure with False
            if binSize == False:
                print( 'WARNING: cannot convert {:s} to bin size...using default {:s}'.format( inStr, bth_util.binSizeToStr(BINSIZE) ) )
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith( '-p=' ):
            # count the option even when its value is unusable
            startInd += 1
            try:
                numProc = int( arg[3:] )
            except ValueError:
                print( 'WARNING: number of processors must be integer...using 1' )
                numProc = NUMPROC
        elif arg.startswith( '-c=' ):
            startInd += 1
            try:
                combineBins = int( arg[3:] )
            except ValueError:
                print( 'WARNING: bin threshold must be integer...using default {}'.format(COMBINE) )
                combineBins = COMBINE
        elif arg.startswith( '-m=' ):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith( '-f=' ):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith( '-d=' ):
            opt = arg[3:].lower()
            if opt in ( 'false', 'none', 'n' ):
                decoding = 'N'
            elif opt in ( 'viterbi', 'v' ):
                decoding = 'V'
            elif opt in ( 'forwardbackward', 'f', 'fb' ):
                decoding = 'F'
            elif opt in ( 'all', 'a' ):
                decoding = 'A'
            else:
                print( 'WARNING: decoding option {:s} not recognized...using default {:s}'.format(opt, DECODE) )
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg.startswith( '-t=' ):
            tmp = arg[3:].split(',')
            tmp2 = [ bth_util.strToDistance( x ) for x in tmp ]
            if len(tmp2) != 2 or (False in tmp2):
                print( 'WARNING: centromere coordinates bad...not using' )
            else:
                centromere = tmp2
            startInd += 1
        elif arg.startswith( '-s=' ):
            startInd += 1
            try:
                scaleFactor = float( arg[3:] )
            except ValueError:
                print( 'WARNING: scale factor must be numeric...using default {}'.format(SCALE) )
                scaleFactor = SCALE
            else:
                if scaleFactor <= 0:
                    print( 'WARNING: scale factor must be greater than 0...using default', SCALE )
                    scaleFactor = SCALE
        elif arg in [ '-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith( '-' ):
            print( 'ERROR: {:s} is not a valid option'.format( arg ) )
            exit()
    # end for
    inFileStr = argv[startInd]
    processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, centromere, scaleFactor )
def parseInputs( argv ):
    """Parse command-line arguments and hand the settings to processInputs.

    The first seven arguments are examined as potential options; the
    argument at the index equal to the number of recognized options is
    taken as the input file, so options are expected to come first.

    Recognized options: -o=, -b=, -p=, -m=, -f=, -n (no smoothing), -u
    and -h/--help/-help.  An unknown argument starting with '-' prints an
    error and exits.

    Fix relative to the previous version: a -p option with a non-integer
    value did not advance the option count, so an option string was then
    used as the input file.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot records which labels were supplied (+1 mother, +2 father)
    parentLabelAr = ['mother', 'father', 0]
    isSmoothing = True
    isUniform = False
    startInd = 0

    for arg in argv[:7]:
        if arg.startswith( '-o=' ):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith( '-b=' ):
            inStr = arg[3:]
            binSize = bth_util.strToDistance( inStr )
            # strToDistance signals failure with False
            if binSize == False:
                print( 'WARNING: cannot convert {:s} to bin size...using default {:s}'.format( inStr, bth_util.binSizeToStr(BINSIZE) ) )
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith( '-p=' ):
            # count the option even when its value is unusable
            startInd += 1
            try:
                numProc = int( arg[3:] )
            except ValueError:
                print( 'WARNING: number of processors must be integer...using 1' )
                numProc = NUMPROC
        elif arg.startswith( '-m=' ):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith( '-f=' ):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg == '-n':
            isSmoothing = False
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in [ '-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith( '-' ):
            print( 'ERROR: {:s} is not a valid option'.format( arg ) )
            exit()
    # end for
    inFileStr = argv[startInd]
    processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, isSmoothing, isUniform )
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  decoding, isUniform, combineBins):
    """Classify (optionally merged) bins, optionally decode, write the table.

    Reads `inFileStr`, assigns positions to bins of `binSize`, merges bins
    through binTransformation when `combineBins` > 0, classifies each bin
    and, unless `decoding` is 'N', decodes every sample.  When decoding, a
    transition file containing only the header line is created and its
    name is passed to runDecoding.  The final table is written
    tab-separated beneath the same header line.
    """
    inName = os.path.basename(inFileStr)
    binLabel = bth_util.binSizeToStr(binSize)
    uniformText = str(isUniform)
    merging = combineBins > 0

    # header line stored as the first line of the output files
    info = '#from_script: epigenotyping_pe_combbin_smpt.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format(
        inName,
        binLabel,
        formatDecoding(decoding).lower().replace('and', ','),
        uniformText.lower(),
        combineBins)

    # echo the run settings
    for caption, value in (
            ('Weighted methylation file:', inName),
            ('Bin size:', binLabel),
            ('Mother label:', parentLabelAr[0]),
            ('Father label:', parentLabelAr[1]),
            ('Uniform classification probabilities:', uniformText),
            ('Decoding algorithm:', formatDecoding(decoding)),
            ('Combine bin feature threshold:', combineBins)):
        print(caption, value)

    # load the table and validate the parent sample names
    print(' Reading input file', inName)
    table = pd.read_table(inFileStr, header=1)
    checkParents(table['sample'], parentLabelAr)

    # assign positions to bins
    table['bin'] = table.pos // binSize
    transformation = None
    binCount = max(table['bin']) + 1

    if merging:
        print(' Merging bins', end=' ... ')
        # keep the original bin in 'tBin' and remap 'bin'
        table['tBin'] = table['bin']
        transformation = binTransformation(table, combineBins)

        def remapBin(oldBin):
            return transformation[oldBin]

        table['bin'] = table['tBin'].apply(remapBin)
    byBin = table.groupby('bin')
    if merging:
        mergedCount = len(byBin.groups)
        print('combined {:d} non-functional bins'.format(
            binCount - mergedCount))

    # per-bin classification
    print(' Classifying {:d} bins with {:d} processors'.format(
        binCount, numProc))
    classified = runClassification(byBin, numProc, parentLabelAr, isUniform)
    classified.reset_index(inplace=True)
    del table, byBin

    if decoding == 'N':
        # decoding switched off: classification is the final result
        decoded = classified
    else:
        # computed as before; nothing below reads it in this variant
        ignoreAr = parentLabelAr[:2] + ['MPV']
        # create the transition file containing just the header line
        transFileStr = determineTransFileName(inFileStr, outID, binSize,
                                              combineBins)
        with open(transFileStr, 'w') as handle:
            handle.write(info + '\n')
        bySample = classified.groupby('sample')
        sampleCount = len(bySample.groups)
        print(' {:s} decoding {:d} samples with {:d} processors'.format(
            formatDecoding(decoding), sampleCount, numProc))
        decoded = runDecoding(bySample, numProc, decoding, transFileStr)
        decoded.set_index(['bin', 'sample'], inplace=True)
        del bySample

    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform, combineBins)

    # merged bins are expanded back to the original bins before writing
    if merging:
        decoded.reset_index(inplace=True)
        decoded['cBin'] = decoded['bin']
        finalTable = undoBinTransformation(decoded, transformation)
    else:
        finalTable = decoded.drop('cBin', axis=1)

    # header first, then the table appended to the same file
    print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as handle:
        handle.write(info + '\n')
    finalTable.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def parseInputs( argv ):
    """Parse command-line arguments and hand the settings to processInputs.

    Every argument except the last is examined as a potential option; the
    argument at the index equal to the number of recognized options is
    taken as the input file, so options are expected to come first.

    Recognized options: -o=, -b=, -p=, -c=, -m=, -f=, -d=, -u and
    -h/--help/-help.  An unknown argument starting with '-' prints an
    error and exits.

    Fixes relative to the previous version:
      * the warning for a bad -c value used '{:s}' with a numeric default,
        which raised ValueError inside the except handler;
      * an option with an invalid value did not advance the option count,
        so the wrong argument was used as the input file;
      * only the first 7 arguments were examined although 8 options exist;
      * the unrecognized-decoding warning named "viterbi" although the
        value actually kept is DECODE.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot records which labels were supplied (+1 mother, +2 father)
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    startInd = 0

    for arg in argv[:-1]:
        if arg.startswith( '-o=' ):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith( '-b=' ):
            inStr = arg[3:]
            binSize = bth_util.strToDistance( inStr )
            # strToDistance signals failure with False
            if binSize == False:
                print( 'WARNING: cannot convert {:s} to bin size...using default {:s}'.format( inStr, bth_util.binSizeToStr(BINSIZE) ) )
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith( '-p=' ):
            # count the option even when its value is unusable
            startInd += 1
            try:
                numProc = int( arg[3:] )
            except ValueError:
                print( 'WARNING: number of processors must be integer...using 1' )
                numProc = NUMPROC
        elif arg.startswith( '-c=' ):
            startInd += 1
            try:
                combineBins = int( arg[3:] )
            except ValueError:
                print( 'WARNING: bin threshold must be integer...using default {}'.format(COMBINE) )
                combineBins = COMBINE
        elif arg.startswith( '-m=' ):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith( '-f=' ):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith( '-d=' ):
            opt = arg[3:].lower()
            if opt in ( 'false', 'none', 'n' ):
                decoding = 'N'
            elif opt in ( 'viterbi', 'v' ):
                decoding = 'V'
            elif opt in ( 'forwardbackward', 'f', 'fb' ):
                decoding = 'F'
            elif opt in ( 'all', 'a' ):
                decoding = 'A'
            else:
                print( 'WARNING: decoding option {:s} not recognized...using default {}'.format(opt, DECODE) )
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in [ '-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith( '-' ):
            print( 'ERROR: {:s} is not a valid option'.format( arg ) )
            exit()
    # end for
    inFileStr = argv[startInd]
    processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins ):
    """Classify bins, optionally decode per sample, and write the results.

    Steps, in order: read the weighted-methylation table, assign each row
    to a bin (``pos // binSize``), optionally merge bins through
    binTransformation, classify each bin with runClassification, and --
    unless ``decoding`` is 'N' -- build transition matrices and run
    runDecoding per sample.  The output table is written to the name
    returned by determineOutputFileName, preceded by a one-line ``info``
    header.

    NOTE(review): ``maxIter`` is not a parameter of this function; it is
    read as a free name and must be defined at module level -- confirm.
    """
    info = '#from_script: epigenotyping_pe_combbin.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str(isUniform) )
    print( 'Decoding algorithm:', formatDecoding( decoding ) )
    print( 'Combine bin feature threshold:', combineBins )

    # build dataframe
    print( ' Reading input file', os.path.basename( inFileStr ) )
    df = pd.read_table( inFileStr, header=1 )

    # check parent labels
    checkParents( df['sample'], parentLabelAr )

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # combine bins if necessary
    nbins = max(df['bin'])+1
    if combineBins > 0:
        print( ' Merging bins', end=' ... ' )
        df['tBin'] = df['bin']
        transformation = binTransformation( df, combineBins )
        # apply the transformation
        df['bin'] = df['tBin'].apply( lambda x: transformation[x] )

    dfBinGroup = df.groupby( 'bin' )
    if combineBins > 0:
        newNBins = len( dfBinGroup.groups )
        print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )

    # classify by bin
    print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
    dfClass.reset_index(inplace=True)
    del df, dfBinGroup

    # decode, if necessary
    if decoding != 'N':
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print( ' Obtaining initial transitions' )
        transition = Transitions( dfClass, ignore = ignoreAr )
        transitionMatrix = transition.getTransitions()
        # one copy of the initial matrix per bin
        transitionMatrixArray = np.array( [ np.copy( transitionMatrix ) for i in range(nbins ) ] )

        # Must be bound before the += below; without this the augmented
        # assignment raises UnboundLocalError.
        # NOTE(review): nothing in this function writes trInfo anywhere;
        # presumably meant for the (disabled) transition-file header.
        trInfo = info
        if maxIter > 0:
            print( ' Iteratively improving transitions with maximum', maxIter, 'iterations' )
            at = AdaptiveTransitions( dfClass, transitionMatrixArray, ignoreAr, maxIter )
            # NOTE(review): the matrix returned here is not passed on;
            # runDecoding below receives transitionMatrixArray -- confirm
            # that AdaptiveTransitions updates that array in place.
            iterations, transitionMatrix = at.run()
            trInfo += '; iterations_to_convergence:'
            if iterations == maxIter:
                trInfo += 'NA'
                # report the actual limit rather than a hard-coded 10
                print( ' Did not converge in', maxIter, 'iterations' )
            else:
                trInfo += str(iterations)
                print( ' Convergence in', iterations, 'iterations' )

        # (writing the transition matrix to a file named by
        # determineTransFileName is disabled in this version)

        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len( dfSampleGroup.groups )
        print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrixArray, decoding )
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del dfSampleGroup
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation( dfOutput, transformation )
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)
    print( ' Writing output to', outFileStr )
    # header line first; the table is appended once the handle is closed
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def parseInputs(argv):
    """Parse the command-line arguments and launch processInputs.

    Options must precede the input file; the last element of ``argv`` is
    never scanned as an option.  Recognised options:

        -o=<id>      output identifier
        -b=<size>    bin size (converted by bth_util.strToDistance)
        -p=<n>       number of processors
        -c=<n>       combine-bins threshold
        -m=<label>   mother label
        -f=<label>   father label
        -d=<mode>    decoding: none/n/false, viterbi/v,
                     forwardbackward/f/fb, all/a, both/b
        -e           set class probabilities to 'E'
        -s           scale transitions
        -q           quiet (disable printing)
        -t=<a,b,..>  centromere coordinates, an even number of values
        -h, --help, -help   print help and exit

    ``startInd`` counts the options consumed; ``argv[startInd]`` is then
    used as the input file name.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot accumulates 1 when -m is given and 2 when -f is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    classProbs = CLASSPROB
    combineBins = COMBINE
    isPrint = ISPRINT
    centromere = None
    scaleTransitions = SCALETRANS
    startInd = 0

    for i in range(min(10, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # a False result is treated as a failed conversion
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using',
                      NUMPROC)
                numProc = NUMPROC
            # the option is consumed even when its value is bad; otherwise
            # argv[startInd] would point at an option, not the input file
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                # COMBINE is passed as a print argument: formatting it
                # with {:s} fails when it is an integer
                print('WARNING: combine bins must be integer...using default',
                      COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt == 'false' or opt == 'none' or opt == 'n':
                decoding = 'N'
            elif opt == 'viterbi' or opt == 'v':
                decoding = 'V'
            elif opt == 'forwardbackward' or opt == 'f' or opt == 'fb':
                decoding = 'F'
            elif opt == 'all' or opt == 'a':
                decoding = 'A'
            elif opt == 'both' or opt == 'b':
                decoding = 'B'
            else:
                # decoding keeps its current value (DECODE unless changed)
                print(
                    'WARNING: decoding option {:s} not recognized...using default viterbi'
                    .format(opt))
            startInd += 1
        elif arg == '-e':
            classProbs = 'E'
            startInd += 1
        elif arg == '-s':
            scaleTransitions = True
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg.startswith('-t='):
            tmp = arg[3:].split(',')
            tmp2 = [bth_util.strToDistance(x) for x in tmp]
            # NOTE(review): `False in tmp2` is also true for a coordinate
            # equal to 0 (0 == False) -- confirm that is acceptable.
            if len(tmp2) % 2 != 0 or (False in tmp2):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = tmp2
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  classProbs, combineBins, centromere, scaleTransitions,
                  isPrint)
def parseInputs(argv):
    """Parse the command-line arguments and launch processInputs.

    Options must precede the input file; the last element of ``argv`` is
    never scanned as an option.  Recognised options:

        -o=<id>      output identifier
        -b=<size>    bin size (converted by bth_util.strToDistance)
        -p=<n>       number of processors
        -c=<n>       combine-bins threshold
        -m=<label>   mother label
        -f=<label>   father label
        -d=<mode>    decoding: none/n/false, viterbi/v,
                     forwardbackward/f/fb, all/a, both/b
        -g=<n>       generation
        -q           quiet (disable printing)
        -n-mpv       disable the MPV check
        -t-out       enable transition-matrix output
        -t=<a,b,..>  centromere coordinates, an even number of values
        -h, --help, -help   print help and exit

    ``startInd`` counts the options consumed; ``argv[startInd]`` is then
    used as the input file name.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot accumulates 1 when -m is given and 2 when -f is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    generation = GENERATION
    combineBins = COMBINE
    mpvCheck = MPVCHECK
    isPrint = ISPRINT
    tmOut = TMOUT
    centromere = None
    startInd = 0

    for i in range(min(11, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # a False result is treated as a failed conversion
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using',
                      NUMPROC)
                numProc = NUMPROC
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                print('WARNING: combine bins must be integer...using default',
                      COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt == 'false' or opt == 'none' or opt == 'n':
                decoding = 'N'
            elif opt == 'viterbi' or opt == 'v':
                decoding = 'V'
            elif opt == 'forwardbackward' or opt == 'f' or opt == 'fb':
                decoding = 'F'
            elif opt == 'all' or opt == 'a':
                decoding = 'A'
            elif opt == 'both' or opt == 'b':
                decoding = 'B'
            else:
                # decoding keeps its current value (DECODE unless changed)
                print(
                    'WARNING: decoding option {:s} not recognized...using default both'
                    .format(opt))
            startInd += 1
        elif arg.startswith('-g='):
            try:
                generation = int(arg[3:])
            except ValueError:
                # report the generation default (the fallback actually
                # used), passed as a print argument so an integer
                # constant cannot break the message formatting
                print('WARNING: generation must be integer...using default',
                      GENERATION)
                generation = GENERATION
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg == '-n-mpv':
            mpvCheck = False
            startInd += 1
        elif arg == '-t-out':
            tmOut = True
            startInd += 1
        elif arg.startswith('-t='):
            tmp = arg[3:].split(',')
            tmp2 = [bth_util.strToDistance(x) for x in tmp]
            # NOTE(review): `False in tmp2` is also true for a coordinate
            # equal to 0 (0 == False) -- confirm that is acceptable.
            if len(tmp2) % 2 != 0 or (False in tmp2):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = tmp2
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  generation, combineBins, centromere, isPrint, mpvCheck,
                  tmOut)
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  classProbs, combineBins, cent, scaleTransitions, isPrint):
    """Classify bins, optionally decode per sample, and write the results.

    The input table is read, rows are assigned to bins (``pos // binSize``),
    bins are optionally merged, each bin is classified, and -- unless
    ``decoding`` is 'N' -- samples are decoded with a transition matrix.
    For ``decoding == 'B'`` a forward-backward pass is run first and its
    scores are fed to a second, Viterbi pass.  Progress messages are only
    printed when ``isPrint`` is true.
    """
    baseName = os.path.basename(inFileStr)

    # header only reports the first coordinate pair
    if cent is None:
        headerCent = 'None'
    else:
        headerCent = '{:s}-{:s}'.format(bth_util.binSizeToStr(cent[0]),
                                        bth_util.binSizeToStr(cent[1]))
    info = '#from_script: epigenotyping_pe_v9.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}; scale_transitions:{:s}'.format(
        baseName, bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        formatClassProbs(classProbs).lower(), combineBins, headerCent,
        str(scaleTransitions))

    if isPrint:
        print('Weighted methylation file:', baseName)
        print('Bin size:', bth_util.binSizeToStr(binSize))
        print('Mother label(s):', parentLabelAr[0])
        print('Father label(s):', parentLabelAr[1])
        print('Classification probabilities:', formatClassProbs(classProbs))
        print('Decoding algorithm:', formatDecoding(decoding))
        print('Combine bin feature threshold:', combineBins)
        print('Scale transitions by sample size:', scaleTransitions)

    # human-readable list of every centromere coordinate pair
    if cent is None:
        centStr = 'None'
    else:
        centStr = '; '.join(
            '{:s}-{:s}'.format(bth_util.binSizeToStr(begin),
                               bth_util.binSizeToStr(end))
            for begin, end in zip(cent[0::2], cent[1::2]))
    if isPrint:
        print('Centromere:', centStr)

    # build dataframe
    if isPrint:
        print(' Reading input file', baseName)
    df = pd.read_table(inFileStr, header=1)

    # check parent labels; labels to leave out of the transition counts
    newParentLabelAr = checkParents(df['sample'], parentLabelAr)
    tIgnoreAr = flattenList(newParentLabelAr[:2])
    tIgnoreAr += [
        'MPV{:d}'.format(k) for k in range(len(newParentLabelAr[0]))
    ]

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # centromere coordinates -> inclusive runs of bin indices
    centBins = []
    if cent is not None:
        centEdges = [coord // binSize for coord in cent]
        for firstBin, lastBin in zip(centEdges[0::2], centEdges[1::2]):
            centBins += list(range(firstBin, lastBin + 1))

    # combine bins if necessary
    nbins = max(df['bin']) + 1
    mergeBins = combineBins > 0
    if mergeBins:
        if isPrint:
            print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation
        df['bin'] = df['tBin'].apply(lambda b: transformation[b])

    dfBinGroup = df.groupby('bin')
    if mergeBins:
        nMerged = nbins - len(dfBinGroup.groups)
        info += '; non-functional_bins:{:d}'.format(nMerged)
        if isPrint:
            print('combined {:d} non-functional bins'.format(nMerged))

    # classify by bin
    if isPrint:
        print(' Classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, newParentLabelAr,
                                classProbs)
    dfClass.reset_index(inplace=True)
    del df, dfBinGroup

    if decoding == 'N':
        # no decoding requested
        dfOutput = dfClass
    else:
        transitionMatrix = Transitions(dfClass,
                                       ignore=tIgnoreAr).getTransitions()

        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)
        if scaleTransitions:
            nRemaining = nsamples - len(tIgnoreAr)
            scaleFactor = float(nRemaining - 1) / float(nRemaining)
        else:
            scaleFactor = 1

        # 'B' runs forward-backward first, then Viterbi on its scores
        tmpDecoding = 'F' if decoding == 'B' else decoding
        if isPrint:
            print(' {:s} decoding {:d} samples with {:d} processors'.format(
                formatDecoding(tmpDecoding), nsamples, numProc))
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                               tmpDecoding, centBins, scaleFactor)

        if decoding == 'B':
            # log of the forward-backward scores become the inputs of
            # the second pass
            dfNew = dfOutput.loc[:, ['bin', 'sample']].copy()
            for label in ('MPV', 'mother', 'father'):
                dfNew[label] = np.log(dfOutput['fb.score.' + label])
            dfNew['prediction'] = dfOutput['fb.prediction']

            transitionMatrix = Transitions(dfNew,
                                           ignore=tIgnoreAr).getTransitions()
            dfSampleGroup = dfNew.groupby('sample')
            nsamples = len(dfSampleGroup.groups)
            if isPrint:
                print(
                    ' {:s} decoding {:d} samples with {:d} processors'.format(
                        formatDecoding('V'), nsamples, numProc))
            dfOutputN = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                                    'V', centBins, scaleFactor)
            vitCols = [
                'vit.score.mother', 'vit.score.father', 'vit.score.MPV',
                'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV',
                'vit.prediction'
            ]
            dfOutput[vitCols] = dfOutputN[vitCols]
        # end decoding == B

        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del dfSampleGroup

    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         classProbs, scaleTransitions,
                                         combineBins)
    # if combination, undo transformation by applying the predictions to additional bins
    if mergeBins:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)

    if isPrint:
        print(' Writing output to', outFileStr)
    # header line first; the table is appended once the handle is closed
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')
    if isPrint:
        print('Done')
def parseInputs( argv ):
    """Parse the command-line arguments and launch processInputs.

    Options must precede the input file; the last element of ``argv`` is
    never scanned as an option.  Recognised options:

        -o=<id>      output identifier
        -b=<size>    bin size (converted by bth_util.strToDistance)
        -p=<n>       number of processors
        -c=<n>       combine-bins threshold
        -m=<label>   mother label
        -f=<label>   father label
        -d=<mode>    decoding: none/n/false, viterbi/v,
                     forwardbackward/f/fb, all/a, both/b
        -g=<n>       generation
        -q           quiet (disable printing)
        -n-mpv       disable the MPV check
        -t-out       enable transition-matrix output
        -t=<a,b,..>  centromere coordinates, an even number of values
        -h, --help, -help   print help and exit

    ``startInd`` counts the options consumed; ``argv[startInd]`` is then
    used as the input file name.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot accumulates 1 when -m is given and 2 when -f is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    generation = GENERATION
    combineBins = COMBINE
    mpvCheck = MPVCHECK
    isPrint = ISPRINT
    tmOut = TMOUT
    centromere = None
    startInd = 0

    for i in range( min(11, len(argv)-1) ):
        arg = argv[i]
        if arg.startswith( '-o=' ):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith( '-b=' ):
            inStr = arg[3:]
            binSize = bth_util.strToDistance( inStr )
            # a False result is treated as a failed conversion
            if binSize == False:
                print( 'WARNING: cannot convert {:s} to bin size...using default {:s}'.format( inStr, bth_util.binSizeToStr(BINSIZE) ) )
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith( '-p=' ):
            try:
                numProc = int( arg[3:] )
            except ValueError:
                print( 'WARNING: number of processors must be integer...using', NUMPROC )
                numProc = NUMPROC
            startInd += 1
        elif arg.startswith( '-c=' ):
            try:
                combineBins = int( arg[3:] )
            except ValueError:
                print( 'WARNING: combine bins must be integer...using default', COMBINE )
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith( '-m=' ):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith( '-f=' ):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith( '-d=' ):
            opt = arg[3:].lower()
            if opt == 'false' or opt == 'none' or opt == 'n':
                decoding = 'N'
            elif opt == 'viterbi' or opt == 'v':
                decoding = 'V'
            elif opt == 'forwardbackward' or opt == 'f' or opt == 'fb':
                decoding = 'F'
            elif opt == 'all' or opt == 'a':
                decoding = 'A'
            elif opt == 'both' or opt == 'b':
                decoding = 'B'
            else:
                # decoding keeps its current value (DECODE unless changed)
                print( 'WARNING: decoding option {:s} not recognized...using default both'.format(opt) )
            startInd += 1
        elif arg.startswith( '-g=' ):
            try:
                generation = int( arg[3:] )
            except ValueError:
                # report the generation default (the fallback actually
                # used), passed as a print argument so an integer
                # constant cannot break the message formatting
                print( 'WARNING: generation must be integer...using default', GENERATION )
                generation = GENERATION
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg == '-n-mpv':
            mpvCheck = False
            startInd += 1
        elif arg == '-t-out':
            tmOut = True
            startInd += 1
        elif arg.startswith( '-t=' ):
            tmp = arg[3:].split(',')
            tmp2 = [ bth_util.strToDistance( x ) for x in tmp ]
            # NOTE(review): `False in tmp2` is also true for a coordinate
            # equal to 0 (0 == False) -- confirm that is acceptable.
            if len(tmp2) % 2 != 0 or (False in tmp2):
                print( 'WARNING: centromere coordinates bad...not using' )
            else:
                centromere = tmp2
            startInd += 1
        elif arg in [ '-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith( '-' ):
            print( 'ERROR: {:s} is not a valid option'.format( arg ) )
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, generation, combineBins, centromere, isPrint, mpvCheck, tmOut )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, cent ):
    """Classify bins, optionally decode per sample, and write the results.

    Steps, in order: read the weighted-methylation table, assign each row
    to a bin (``pos // binSize``), optionally merge bins through
    binTransformation, classify each bin with runClassification, and --
    unless ``decoding`` is 'N' -- compute a transition matrix, write it to
    the file named by determineTransFileName, and run runDecoding per
    sample.  The output table is written to the name returned by
    determineOutputFileName, preceded by a one-line ``info`` header.

    ``cent`` is None or a sequence of coordinates; only ``cent[0]`` and
    ``cent[1]`` are used here, as the inclusive centromere range.
    """
    # centromere label shared by the header line and the console summary
    if cent is None:
        centLabel = 'None'
    else:
        centLabel = '{:s}-{:s}'.format( bth_util.binSizeToStr( cent[0] ), bth_util.binSizeToStr( cent[1] ) )

    info = '#from_script: epigenotyping_pe_combbin-init.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; centromere:{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins, centLabel )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str(isUniform) )
    print( 'Decoding algorithm:', formatDecoding( decoding ) )
    print( 'Combine bin feature threshold:', combineBins )
    print( 'Centromere:', centLabel )

    # build dataframe
    print( ' Reading input file', os.path.basename( inFileStr ) )
    df = pd.read_table( inFileStr, header=1 )

    # check parent labels
    checkParents( df['sample'], parentLabelAr )

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # get centromere bins if necessary (inclusive range of bin indices);
    # the bin list is no longer dumped to stdout
    if cent is None:
        centro = []
    else:
        cent = [ x // binSize for x in cent ]
        centro = list( range(cent[0], cent[1]+1) )

    # combine bins if necessary
    nbins = max(df['bin'])+1
    if combineBins > 0:
        print( ' Merging bins', end=' ... ' )
        df['tBin'] = df['bin']
        transformation = binTransformation( df, combineBins )
        # apply the transformation
        df['bin'] = df['tBin'].apply( lambda x: transformation[x] )

    dfBinGroup = df.groupby( 'bin' )
    if combineBins > 0:
        newNBins = len( dfBinGroup.groups )
        print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )

    # classify by bin
    print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
    dfClass.reset_index(inplace=True)
    del df, dfBinGroup

    # decode, if necessary
    if decoding != 'N':
        ignoreAr = parentLabelAr[:2] + ['MPV']
        transition = Transitions( dfClass, ignore = ignoreAr, cent=centro )
        transitionMatrix = transition.getTransitions()

        # write this matrix to file: header line, then the labelled table
        outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
        tLabels = [ 'mother', 'MPV', 'father' ]
        transData = pd.DataFrame( transitionMatrix, index=tLabels, columns= tLabels )
        with open( outFStr, 'w' ) as f:
            f.write(info+'\n')
        transData.to_csv( outFStr, sep='\t', mode='a' )

        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len( dfSampleGroup.groups )
        print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding, centro )
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del dfSampleGroup
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation( dfOutput, transformation )
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)
    print( ' Writing output to', outFileStr )
    # header line first; the table is appended once the handle is closed
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def parseInputs( argv ):
    """Parse the command-line arguments and launch processInputs.

    Options must precede the input file; the last element of ``argv`` is
    never scanned as an option.  Recognised options:

        -o=<id>      output identifier
        -b=<size>    bin size (converted by bth_util.strToDistance)
        -p=<n>       number of processors
        -c=<n>       combine-bins threshold
        -m=<label>   mother label
        -f=<label>   father label
        -d=<mode>    decoding: none/n/false, viterbi/v,
                     forwardbackward/f/fb, all/a, both/b
        -u           set class probabilities to 'U'
        -e           set class probabilities to 'E'
        -q           quiet (disable printing)
        -t=<a,b,..>  centromere coordinates, an even number of values
        -h, --help, -help   print help and exit

    ``-u`` and ``-e`` are mutually exclusive: if the class probabilities
    already differ from CLASSPROB when either is seen, a warning is
    printed and CLASSPROB is restored.

    ``startInd`` counts the options consumed; ``argv[startInd]`` is then
    used as the input file name.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third slot accumulates 1 when -m is given and 2 when -f is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    classProbs = CLASSPROB
    combineBins = COMBINE
    isPrint = ISPRINT
    centromere = None
    startInd = 0

    for i in range( min(10, len(argv)-1) ):
        arg = argv[i]
        if arg.startswith( '-o=' ):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith( '-b=' ):
            inStr = arg[3:]
            binSize = bth_util.strToDistance( inStr )
            # a False result is treated as a failed conversion
            if binSize == False:
                print( 'WARNING: cannot convert {:s} to bin size...using default {:s}'.format( inStr, bth_util.binSizeToStr(BINSIZE) ) )
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith( '-p=' ):
            try:
                numProc = int( arg[3:] )
            except ValueError:
                print( 'WARNING: number of processors must be integer...using', NUMPROC )
                numProc = NUMPROC
            # the option is consumed even when its value is bad; otherwise
            # argv[startInd] would point at an option, not the input file
            startInd += 1
        elif arg.startswith( '-c=' ):
            try:
                combineBins = int( arg[3:] )
            except ValueError:
                # COMBINE is passed as a print argument: formatting it
                # with {:s} fails when it is an integer
                print( 'WARNING: combine bins must be integer...using default', COMBINE )
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith( '-m=' ):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith( '-f=' ):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith( '-d=' ):
            opt = arg[3:].lower()
            if opt == 'false' or opt == 'none' or opt == 'n':
                decoding = 'N'
            elif opt == 'viterbi' or opt == 'v':
                decoding = 'V'
            elif opt == 'forwardbackward' or opt == 'f' or opt == 'fb':
                decoding = 'F'
            elif opt == 'all' or opt == 'a':
                decoding = 'A'
            elif opt == 'both' or opt == 'b':
                decoding = 'B'
            else:
                # decoding keeps its current value (DECODE unless changed)
                print( 'WARNING: decoding option {:s} not recognized...using default viterbi'.format(opt) )
            startInd += 1
        elif arg == '-u':
            if classProbs != CLASSPROB:
                print( 'WARNING: cannot specify uniform and epiRIL class weights...using default' )
                classProbs = CLASSPROB
            else:
                classProbs = 'U'
            startInd += 1
        elif arg == '-e':
            if classProbs != CLASSPROB:
                print( 'WARNING: cannot specify uniform and epiRIL class weights...using default' )
                classProbs = CLASSPROB
            else:
                classProbs = 'E'
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg.startswith( '-t=' ):
            tmp = arg[3:].split(',')
            tmp2 = [ bth_util.strToDistance( x ) for x in tmp ]
            # NOTE(review): `False in tmp2` is also true for a coordinate
            # equal to 0 (0 == False) -- confirm that is acceptable.
            if len(tmp2) % 2 != 0 or (False in tmp2):
                print( 'WARNING: centromere coordinates bad...not using' )
            else:
                centromere = tmp2
            startInd += 1
        elif arg in [ '-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith( '-' ):
            print( 'ERROR: {:s} is not a valid option'.format( arg ) )
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, centromere, isPrint )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter ):
    """Classify bins, optionally decode per sample, and write the results.

    The input table is read, rows are assigned to bins (``pos // binSize``),
    bins are optionally merged, and each bin is classified.  Unless
    ``decoding`` is 'N', an initial transition matrix is computed, the
    transition file is started with the ``info`` header, and runDecoding
    is called with ``maxIter`` and that file name.  The final table is
    written after an ``info`` header line.
    """
    baseName = os.path.basename( inFileStr )
    binLabel = bth_util.binSizeToStr( binSize )

    info = '#from_script: epigenotyping_combin_smp-iter-trans.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; maximum_iterations:{:d}'.format(
        baseName,
        binLabel,
        formatDecoding( decoding ).lower().replace( 'and', ',' ),
        str( isUniform ).lower(),
        combineBins,
        maxIter )

    # run summary
    print( 'Weighted methylation file:', baseName )
    print( 'Bin size:', binLabel )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str(isUniform) )
    print( 'Decoding algorithm:', formatDecoding( decoding ) )
    print( 'Combine bin feature threshold:', combineBins )
    print( 'Maximum transition matrix iterations:', maxIter )

    # build dataframe
    print( ' Reading input file', baseName )
    df = pd.read_table( inFileStr, header=1 )

    # check parent labels
    checkParents( df['sample'], parentLabelAr )

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # combine bins if necessary
    nbins = max( df['bin'] ) + 1
    mergeBins = combineBins > 0
    if mergeBins:
        print( ' Merging bins', end=' ... ' )
        df['tBin'] = df['bin']
        transformation = binTransformation( df, combineBins )
        # apply the transformation
        df['bin'] = df['tBin'].apply( lambda b: transformation[b] )

    dfBinGroup = df.groupby( 'bin' )
    if mergeBins:
        nMerged = nbins - len( dfBinGroup.groups )
        print( 'combined {:d} non-functional bins'.format( nMerged ) )

    # classify by bin
    print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
    dfClass.reset_index( inplace=True )
    del df, dfBinGroup

    if decoding == 'N':
        # no decoding requested
        dfOutput = dfClass
    else:
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print( ' Obtaining initial transitions' )
        transitionMatrix = Transitions( dfClass, ignore=ignoreAr ).getTransitions()

        # start the transition file with the header line
        outFStr = determineTransFileName( inFileStr, outID, binSize, combineBins )
        with open( outFStr, 'w' ) as transFile:
            transFile.write( info + '\n' )

        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len( dfSampleGroup.groups )
        print( ' {:s} decoding and optimizing transition matrices for {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
        ## note: decoding will now include improved transition matrix calculations
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding, maxIter, outFStr )
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del dfSampleGroup

    # write output
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
    # if combination, undo transformation by applying the predictions to additional bins
    if mergeBins:
        dfOutput.reset_index( inplace=True )
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation( dfOutput, transformation )
    else:
        dfOutputT = dfOutput.drop( 'cBin', axis=1 )

    print( ' Writing output to', outFileStr )
    # header line first; the table is appended once the handle is closed
    with open( outFileStr, 'w' ) as outFile:
        outFile.write( info + '\n' )
    dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )