def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, isIndiv ):
    """Classify epigenotypes per genomic bin from a weighted-methylation table, optionally decode per-sample state paths, and write a TSV result.

    Parameters:
      inFileStr      -- path to the tab-separated weighted methylation input (read with header=1)
      numProc        -- number of worker processes; >1 switches to the multiprocessing helpers
      binSize        -- genomic bin width; bin index is pos // binSize
      outID          -- identifier forwarded to determineOutputFileName
      parentLabelAr  -- [mother_label, father_label] sample names expected in the input
      decoding       -- 'V' Viterbi, 'F' forward-backward, 'A' both, 'N' none
      isUniform      -- use uniform classification probabilities
      isIndiv        -- if True, skip the shared transition matrix so each sample derives its own

    Side effects: prints progress, writes the output file. Returns None.
    """
    # human-readable decoding name, also embedded (lower-cased) in the header line
    dType = ('Viterbi' if decoding == 'V' else ('Forward-backward' if decoding == 'F' else ('Viterbi and Forward-backward' if decoding == 'A' else 'None') ) )
    # provenance header written as the first line of the output file
    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; indiv_transitions:{:s}\n'.format( os.path.basename( inFileStr), bth_util.binSizeToStr( binSize ), dType.lower().replace(' and ', ','), str(isUniform), str(isIndiv) )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str( isUniform ) )
    print( 'Decoding algorithm:', dType)
    print( 'Individual transition probabilities:', str( isIndiv ) )
    # build data frame -- assumes a 'sample' and a 'pos' column; TODO confirm input schema
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels
    checkParents( df['sample'], parentLabelAr )
    # group by bin and analyze
    df['bin'] = df.pos // binSize
    nbins = max(df['bin'])+1
    dfg = df.groupby('bin')
    if numProc > 1:
        print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
        res_class = runMultiClassification( dfg, numProc, parentLabelAr, isUniform )
    else:
        print( 'Begin classifying {:d} bins'.format( nbins ) )
        # single-process path: logistic-regression classification per bin group
        res_class = dfg.apply( classLogReg, pla=parentLabelAr, u=isUniform )
    res_class.reset_index(inplace=True)
    # decode if necessary
    if decoding != 'N':
        # parents and mid-parent value are excluded from transition estimation
        ignoreAr = parentLabelAr + ['MPV']
        if isIndiv:
            # empty matrix signals downstream code to build per-sample transitions
            transitions = np.array([])
        else:
            print( 'Generating transition matrix' )
            transition = Transitions( res_class, ignore=ignoreAr )
            transitions = transition.getTransitions()
            print(transitions)
        # find optimum path for all samples
        groups = res_class.groupby( 'sample' )
        nsamples = len(groups.groups)
        if numProc > 1:
            print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format( dType, nsamples, numProc ) )
            results = runMultiPath( groups, numProc, transitions, isUniform, decoding )
        else:
            print( 'Begin {:s} decoding {:d} samples'.format( dType, nsamples ) )
            results = groups.apply( findOptimalPath, trans=transitions, u=isUniform, d=decoding )
        results.set_index( ['bin', 'sample'], inplace=True )
    else:
        results = res_class
    # output file
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, isIndiv )
    # write output: header line first, then the DataFrame appended below it
    print( 'Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as f:
        f.write(info)
    results.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, parentAddLabelAr, decoding, isUniform ):
    """Classify epigenotypes per bin (supporting multiple mother/father training samples), optionally decode per-sample paths, and write a TSV.

    Parameters:
      inFileStr        -- tab-separated weighted methylation input (header=1)
      numProc          -- number of worker processes for classification/decoding
      binSize          -- genomic bin width; bin index is pos // binSize
      outID            -- identifier forwarded to determineOutputFileName
      parentLabelAr    -- [mother_labels, father_labels], each a list of sample names
      parentAddLabelAr -- additional training labels, same [mother, father] shape; may be empty
      decoding         -- decoding mode; 'N' skips decoding entirely
      isUniform        -- use uniform classification probabilities

    Side effects: prints progress, writes the output file. Returns None.
    """
    # provenance header written as the first line of the output file
    info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; mother_samples:{:s}; father_samples:{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), ','.join(parentLabelAr[0]), ','.join(parentLabelAr[1]) )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label(s):', ', '.join(parentLabelAr[0]) )
    print( 'Father label(s):', ', '.join(parentLabelAr[1]) )
    if len(parentAddLabelAr[0]) != 0 or len(parentAddLabelAr[1]) != 0:
        print( 'Additional mother training label(s):', ('None' if len(parentAddLabelAr[0])==0 else ', '.join(parentAddLabelAr[0])) )
        print( 'Additional father training label(s):', ('None' if len(parentAddLabelAr[1]) == 0 else ', '.join(parentAddLabelAr[1])) )
    print( 'Uniform classification probabilities:', str(isUniform) )
    print( 'Decoding algorithm:', formatDecoding( decoding ) )
    # build dataframe
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels (checkParents returns a possibly-normalized copy here)
    parentLabelAr = checkParents( df['sample'], parentLabelAr )
    # check additional training data labels
    if len(parentAddLabelAr[0]) != 0 or len(parentAddLabelAr[1]) != 0:
        parentAddLabelAr = checkParents( df['sample'], parentAddLabelAr )
    # group by bin
    df['bin'] = df.pos // binSize
    nbins = max(df['bin'])+1
    dfBinGroup = df.groupby( 'bin' )
    # classify by bin
    print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, parentAddLabelAr, isUniform )
    dfClass.reset_index(inplace=True)
    #print( dfClass.head )
    # free the raw table before the (potentially large) decoding step
    del(df, dfBinGroup )
    # decode, if necessary
    if decoding != 'N':
        # exclude all parent-derived samples plus MPV from transition estimation
        totalParentLabelAr = [parentLabelAr[0] + parentAddLabelAr[0], parentLabelAr[1] + parentAddLabelAr[1]]
        ignoreAr = flattenList( totalParentLabelAr ) + ['MPV']
        transition = Transitions( dfClass, ignore = ignoreAr )
        transitionMatrix = transition.getTransitions()
        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len(dfSampleGroup.groups )
        print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding )
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del( dfSampleGroup )
    else:
        dfOutput = dfClass
    # write output: header line first, then the DataFrame appended below it
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
    print( 'Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutput.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, maxIter ):
    """Classify epigenotypes per bin, optionally decode per-sample state paths with a shared transition matrix, and write a TSV.

    Parameters:
      inFileStr     -- tab-separated weighted methylation input (header=1)
      numProc       -- number of worker processes; >1 switches to multiprocessing helpers
      binSize       -- genomic bin width; bin index is pos // binSize
      outID         -- identifier forwarded to determineOutputFileName
      parentLabelAr -- [mother_label, father_label] sample names
      decoding      -- 'V' Viterbi, 'F' forward-backward, 'A' both, 'N' none
      isUniform     -- use uniform classification probabilities
      maxIter       -- NOTE(review): accepted but never referenced in this body;
                       presumably intended for iterative transition refinement -- confirm

    Side effects: prints progress, writes the output file. Returns None.
    """
    # human-readable decoding name, also embedded (lower-cased) in the header line
    dType = ('Viterbi' if decoding == 'V' else ('Forward-backward' if decoding == 'F' else ('Viterbi and Forward-backward' if decoding == 'A' else 'None') ) )
    # provenance header written as the first line of the output file
    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}\n'.format( os.path.basename( inFileStr), bth_util.binSizeToStr( binSize ), dType.lower().replace(' and ', ','), str(isUniform) )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str( isUniform ) )
    print( 'Decoding algorithm:', dType)
    # build data frame
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels
    checkParents( df['sample'], parentLabelAr )
    # group by bin and analyze
    df['bin'] = df.pos // binSize
    nbins = max(df['bin'])+1
    dfg = df.groupby('bin')
    if numProc > 1:
        print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
        res_class = runMultiClassification( dfg, numProc, parentLabelAr, isUniform )
    else:
        print( 'Begin classifying {:d} bins'.format( nbins ) )
        # single-process path: logistic-regression classification per bin group
        res_class = dfg.apply( classLogReg, pla=parentLabelAr, u=isUniform )
    res_class.reset_index(inplace=True)
    # decode if necessary
    if decoding != 'N':
        # parents and mid-parent value are excluded from transition estimation
        ignoreAr = parentLabelAr + ['MPV']
        print( 'Generating transition matrix' )
        transition = Transitions( res_class, ignore=ignoreAr )
        transitions = transition.getTransitions()
        # find optimum path for all samples
        groups = res_class.groupby( 'sample' )
        nsamples = len(groups.groups)
        if numProc > 1:
            print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format( dType, nsamples, numProc ) )
            results = runMultiPath( groups, numProc, transitions, isUniform, decoding )
        else:
            print( 'Begin {:s} decoding {:d} samples'.format( dType, nsamples ) )
            results = groups.apply( findOptimalPath, trans=transitions, u=isUniform, d=decoding )
        results.set_index( ['bin', 'sample'], inplace=True )
    else:
        results = res_class
    # output file
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
    # write output: header line first, then the DataFrame appended below it
    print( 'Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as f:
        f.write(info)
    results.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform ):
    """Classify epigenotypes per bin, optionally decode per-sample state paths, and write a TSV.

    Parameters:
      inFileStr     -- tab-separated weighted methylation input (header=1)
      numProc       -- number of worker processes for classification/decoding
      binSize       -- genomic bin width; bin index is pos // binSize
      outID         -- identifier forwarded to determineOutputFileName
      parentLabelAr -- parent sample labels; [0]=mother, [1]=father
      decoding      -- decoding mode; 'N' skips decoding entirely
      isUniform     -- use uniform classification probabilities

    Side effects: prints progress, writes the output file. Returns None.
    """
    # provenance header written as the first line of the output file
    info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower() )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str(isUniform) )
    print( 'Decoding algorithm:', formatDecoding( decoding ) )
    # build dataframe
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels
    checkParents( df['sample'], parentLabelAr )
    # group by bin
    df['bin'] = df.pos // binSize
    nbins = max(df['bin'])+1
    dfBinGroup = df.groupby( 'bin' )
    # classify by bin
    print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
    dfClass.reset_index(inplace=True)
    #print( dfClass.head )
    # free the raw table before the (potentially large) decoding step
    del(df, dfBinGroup )
    # decode, if necessary
    if decoding != 'N':
        # first two parent labels plus mid-parent value are excluded from transitions
        ignoreAr = parentLabelAr[:2] + ['MPV']
        transition = Transitions( dfClass, ignore = ignoreAr )
        transitionMatrix = transition.getTransitions()
        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len(dfSampleGroup.groups )
        print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding )
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del( dfSampleGroup )
    else:
        dfOutput = dfClass
    # write output: header line first, then the DataFrame appended below it
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
    print( 'Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutput.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def findOptimalPath(df, trans=np.array([]), u=False, d='V'):
    """Decode the most likely state path for one sample's bins.

    df    -- per-sample classification DataFrame
    trans -- shared transition matrix; empty array means "derive one from df alone"
    u     -- uniform-probability flag passed through to the path decoders
    d     -- 'A' = all decoders, 'F' = forward-backward, anything else = simple path
    Returns the DataFrame produced by the chosen decoder's run().
    """
    # no shared matrix supplied: estimate an individual one from this sample
    if trans.size == 0:
        trans = Transitions(df, ignore=[]).getTransitions()
    if d == 'A':
        pathFinder = ProbPathAll(df, trans, u)
    elif d == 'F':
        pathFinder = ProbPathFB(df, trans, u)
    else:
        # default branch: simple decoding (Viterbi variant kept for reference)
        #pathFinder = ProbPathViterbi( df, trans, u )
        pathFinder = ProbPathSimple(df, trans)
    return pathFinder.run()
def findOptimalPath( df, trans=np.array([]), u=False, d='V' ):
    """Decode the most likely state path for a single sample.

    When *trans* is empty, a per-sample transition matrix is estimated from
    *df* itself; otherwise the supplied shared matrix is used. The decoder is
    selected by *d*: 'A' runs all decoders, 'F' runs forward-backward, and any
    other value falls back to the simple path decoder.
    """
    if trans.size == 0:
        # build an individual transition matrix from this sample's data
        transEstimator = Transitions( df, ignore=[] )
        trans = transEstimator.getTransitions()
    if d == 'A':
        decoder = ProbPathAll( df, trans, u )
    elif d == 'F':
        decoder = ProbPathFB( df, trans, u )
    else:
        # Viterbi variant retained for reference; simple decoder is in use
        #decoder = ProbPathViterbi( df, trans, u )
        decoder = ProbPathSimple( df, trans )
    result = decoder.run()
    return result
def run( self ):
    """Re-estimate the transition matrix by repeated forward-backward passes.

    Each pass decodes every sample group under the current matrix, then
    rebuilds the matrix from the fresh predictions; stops early once the
    matrix moves by less than EPS (Frobenius norm) or after self.maxIter
    passes. Returns (iterations_performed, final_transition_matrix).
    """
    iterationsDone = 0
    while iterationsDone < self.maxIter:
        iterationsDone += 1
        previous = np.copy( self.transitions )
        # decode all samples under the current transition estimate
        predicted = self.groups.apply( runFB, previous )
        # rebuild transitions from the new predictions
        self.transitions = Transitions( predicted, self.ignore ).getTransitions()
        # converged when the matrix stops changing
        if np.linalg.norm( previous - self.transitions ) < EPS:
            break
    return iterationsDone, self.transitions
def run(self):
    """Iteratively refine the transition matrix via forward-backward decoding.

    Runs at most self.maxIter refinement steps; each step decodes the sample
    groups with the current matrix and re-derives transitions from the result.
    Stops early when the matrix change (Frobenius norm) drops below EPS.
    Returns (steps_completed, final_transition_matrix).
    """
    completed = 0
    for step in range(self.maxIter):
        completed = step + 1
        priorMatrix = np.copy(self.transitions)
        # predictions for every sample group under the current estimate
        predictions = self.groups.apply(runFB, priorMatrix)
        # fold the predictions back into a new transition estimate
        estimator = Transitions(predictions, self.ignore)
        self.transitions = estimator.getTransitions()
        # stop once the estimate has stabilized
        if np.linalg.norm(priorMatrix - self.transitions) < EPS:
            break
    return completed, self.transitions
def run( self ):
    """Iteratively re-estimate the transition matrix until convergence.

    Each iteration runs forward-backward over self.data with the current
    matrix, rebuilds transitions from the predictions, and stops once the
    matrix changes by less than EPS (Frobenius norm) or self.maxIter
    iterations have run. Returns the final matrix as a labeled DataFrame
    (index/columns = self.labels).
    """
    curIter = 0
    converged = False
    while curIter < self.maxIter:
        curIter += 1
        oldTransitions = np.copy( self.transitions )
        # get predictions under the current transition estimate
        newPredictions = runFB(self.data, oldTransitions )
        # re-derive transitions from those predictions
        t = Transitions( newPredictions, [] )
        self.transitions = t.getTransitions()
        # check if transitions converged
        if np.linalg.norm( oldTransitions - self.transitions ) < EPS:
            converged = True
            break
    # FIX: track convergence explicitly -- the old check (curIter == maxIter)
    # misreported a run that converged exactly on the last allowed iteration;
    # also fixes the 'coverged' typo in the message.
    print( 'converged in {:d} iterations'.format( curIter ) if converged else 'did not converge' )
    return pd.DataFrame(self.transitions,index=self.labels,columns=self.labels)
def run(self):
    """Refine the transition matrix by repeated forward-backward passes.

    Stops when the matrix moves by less than EPS (Frobenius norm) or after
    self.maxIter passes. Returns a labeled DataFrame of the final matrix
    (index/columns = self.labels).
    """
    curIter = 0
    converged = False
    while curIter < self.maxIter:
        curIter += 1
        oldTransitions = np.copy(self.transitions)
        # predictions under the current transition estimate
        newPredictions = runFB(self.data, oldTransitions)
        # rebuild transitions from the fresh predictions
        t = Transitions(newPredictions, [])
        self.transitions = t.getTransitions()
        # converged when the matrix stops changing
        if np.linalg.norm(oldTransitions - self.transitions) < EPS:
            converged = True
            break
    # FIX: explicit convergence flag -- the old (curIter == maxIter) test
    # misreported convergence on the final allowed iteration, and the
    # message contained the typo 'coverged'.
    print('converged in {:d} iterations'.format(curIter) if converged else 'did not converge')
    return pd.DataFrame(self.transitions, index=self.labels, columns=self.labels)
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, cent, isPrint ):
    """Classify epigenotypes per bin (v7: centromere masking + bin combining), decode, and write a TSV.

    Parameters:
      inFileStr     -- tab-separated weighted methylation input (header=1)
      numProc       -- number of worker processes
      binSize       -- genomic bin width; bin index is pos // binSize
      outID         -- identifier forwarded to determineOutputFileName
      parentLabelAr -- [mother_labels, father_labels] lists of sample names
      decoding      -- decoding mode; 'B' = forward-backward then Viterbi, 'N' = none
      classProbs    -- classification probability mode (see formatClassProbs)
      combineBins   -- >0 enables merging of non-functional bins via binTransformation
      cent          -- flat list of centromere start/end coordinates, or None
      isPrint       -- suppress progress output when False

    Side effects: prints progress (if isPrint), writes the output file. Returns None.
    """
    # provenance header written as the first line of the output file
    info = '#from_script: epigenotyping_pe_v7.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), formatClassProbs(classProbs).lower(), combineBins, ('None' if cent == None else '{:s}-{:s}'.format( bth_util.binSizeToStr( cent[0] ), bth_util.binSizeToStr( cent[1] ) ) ) )
    if isPrint:
        print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
        print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
        print( 'Mother label(s):', parentLabelAr[0] )
        print( 'Father label(s):', parentLabelAr[1] )
        print( 'Classification probabilities:', formatClassProbs( classProbs ) )
        print( 'Decoding algorithm:', formatDecoding( decoding ) )
        print( 'Combine bin feature threshold:', combineBins )
    # render centromere region(s) as "start-end; start-end; ..."
    if cent == None:
        centStr = 'None'
    else:
        centStr = ''
        for i in range(len(cent)//2):
            si = i*2
            centStr += '; {:s}-{:s}'.format( bth_util.binSizeToStr( cent[si] ), bth_util.binSizeToStr( cent[si+1] ) )
        centStr = centStr[2:]
    if isPrint:
        print( 'Centromere:', centStr )
    # build dataframe
    if isPrint:
        print( ' Reading input file', os.path.basename( inFileStr ) )
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels; ignore parents and per-mother MPV columns in transitions
    newParentLabelAr = checkParents( df['sample'], parentLabelAr )
    tIgnoreAr = flattenList( newParentLabelAr[:2] )
    for i in range(len(newParentLabelAr[0])):
        tIgnoreAr += [ 'MPV{:d}'.format( i ) ]
    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None
    # get centromere bins if necessary (coordinates converted to bin indices)
    if cent == None:
        centBins = []
    else:
        cent = [ x // binSize for x in cent ]
        centBins = []
        for i in range(len(cent) // 2 ):
            si = i * 2
            centBins += list( range(cent[si], cent[si+1]+1) )
    # combine bins if necessary
    nbins = max(df['bin'])+1
    if combineBins > 0:
        if isPrint:
            print( ' Merging bins', end=' ... ' )
        df['tBin'] = df['bin']
        transformation = binTransformation( df, combineBins )
        # apply the transformation: map each original bin to its merged bin
        df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
    dfBinGroup = df.groupby( 'bin' )
    if combineBins > 0:
        newNBins = len( dfBinGroup.groups )
        info += '; non-functional_bins:{:d}'.format( nbins - newNBins )
        if isPrint:
            print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
    # classify by bin
    if isPrint:
        print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, newParentLabelAr, classProbs )
    dfClass.reset_index(inplace=True)
    del(df, dfBinGroup )
    # decode, if necessary
    if decoding != 'N':
        transition = Transitions( dfClass, ignore = tIgnoreAr )
        transitionMatrix = transition.getTransitions()
        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len( dfSampleGroup.groups )
        # 'B' runs forward-backward first, then Viterbi on its output below
        tmpDecoding = ( 'F' if decoding == 'B' else decoding )
        if isPrint:
            print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(tmpDecoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, tmpDecoding, centBins )
        if decoding == 'B':
            # second pass: Viterbi over the log forward-backward scores
            dfNew = dfOutput.loc[:,['bin','sample']].copy()
            dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
            dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
            dfNew['father'] = np.log(dfOutput['fb.score.father'])
            dfNew['prediction'] = dfOutput['fb.prediction']
            transition = Transitions( dfNew, ignore = tIgnoreAr )
            transitionMatrix = transition.getTransitions()
            dfSampleGroup = dfNew.groupby( 'sample' )
            nsamples = len( dfSampleGroup.groups )
            if isPrint:
                print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding('V'), nsamples, numProc ) )
            dfOutputN = runDecoding( dfSampleGroup, numProc, transitionMatrix, 'V', centBins )
            dfOutput[['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']] = dfOutputN[['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']]
        # end decoding == B
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del( dfSampleGroup )
    else:
        dfOutput = dfClass
    # write output
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, classProbs, combineBins )
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation( dfOutput, transformation )
    else:
        # FIX: 'cBin' is only created in the combineBins > 0 branch, so a plain
        # drop raised KeyError here; errors='ignore' keeps the intended no-op.
        dfOutputT = dfOutput.drop('cBin', axis=1, errors='ignore')
    if isPrint:
        print( ' Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
    if isPrint:
        print( 'Done' )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter=0 ):
    """Classify epigenotypes per bin (combined-bin variant), optionally refine transitions iteratively, decode, and write a TSV.

    Parameters:
      inFileStr     -- tab-separated weighted methylation input (header=1)
      numProc       -- number of worker processes
      binSize       -- genomic bin width; bin index is pos // binSize
      outID         -- identifier forwarded to determineOutputFileName
      parentLabelAr -- parent sample labels; [0]=mother, [1]=father
      decoding      -- decoding mode; 'N' skips decoding
      isUniform     -- use uniform classification probabilities
      combineBins   -- >0 enables merging of non-functional bins
      maxIter       -- maximum AdaptiveTransitions refinement iterations
                       (FIX: was referenced but never defined -> NameError;
                       added as a backward-compatible keyword, default 0 = off)

    Side effects: prints progress, writes the output file. Returns None.
    """
    # provenance header written as the first line of the output file
    info = '#from_script: epigenotyping_pe_combbin.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins )
    print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
    print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
    print( 'Mother label:', parentLabelAr[0] )
    print( 'Father label:', parentLabelAr[1] )
    print( 'Uniform classification probabilities:', str(isUniform) )
    print( 'Decoding algorithm:', formatDecoding( decoding ) )
    print( 'Combine bin feature threshold:', combineBins )
    # build dataframe
    print( ' Reading input file', os.path.basename( inFileStr ) )
    df = pd.read_table( inFileStr, header=1 )
    # check parent labels
    checkParents( df['sample'], parentLabelAr )
    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None
    # combine bins if necessary
    nbins = max(df['bin'])+1
    if combineBins > 0:
        print( ' Merging bins', end=' ... ' )
        df['tBin'] = df['bin']
        transformation = binTransformation( df, combineBins )
        # apply the transformation: map each original bin to its merged bin
        df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
    dfBinGroup = df.groupby( 'bin' )
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups )
        print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
    # classify by bin
    print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
    dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
    dfClass.reset_index(inplace=True)
    del(df, dfBinGroup )
    # decode, if necessary
    if decoding != 'N':
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print( ' Obtaining initial transitions' )
        transition = Transitions( dfClass, ignore = ignoreAr )
        transitionMatrix = transition.getTransitions()
        # replicate the initial matrix so each bin has its own copy
        transitionMatrixArray = np.array( [ np.copy( transitionMatrix ) for i in range(nbins ) ] )
        # FIX: trInfo was never initialized -> NameError when maxIter > 0;
        # it records convergence stats (currently diagnostic-only -- TODO
        # confirm whether it should be appended to the output header)
        trInfo = ''
        if maxIter > 0:
            print( ' Iteratively improving transitions with maximum', maxIter, 'iterations' )
            at = AdaptiveTransitions( dfClass, transitionMatrixArray, ignoreAr, maxIter )
            # NOTE(review): the refined matrix returned here is not what is
            # passed to runDecoding below (transitionMatrixArray is) -- verify
            # AdaptiveTransitions updates the array in place
            iterations, transitionMatrix = at.run()
            trInfo += '; iterations_to_convergence:'
            if iterations == maxIter:
                trInfo += 'NA'
                # FIX: message previously hardcoded "10" regardless of maxIter
                print( ' Did not converge in {:d} iterations'.format( maxIter ) )
            else:
                trInfo += str(iterations)
                print( ' Convergence in', iterations, 'iterations' )
        # group by sample
        dfSampleGroup = dfClass.groupby( 'sample' )
        nsamples = len( dfSampleGroup.groups )
        print( ' {:s} decoding {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
        dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrixArray, decoding )
        dfOutput.set_index( ['bin', 'sample'], inplace=True )
        del( dfSampleGroup )
    else:
        dfOutput = dfClass
    # write output
    outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation( dfOutput, transformation )
    else:
        # FIX: 'cBin' is only created in the combineBins > 0 branch, so a plain
        # drop raised KeyError here; errors='ignore' keeps the intended no-op.
        dfOutputT = dfOutput.drop('cBin', axis=1, errors='ignore')
    print( ' Writing output to', outFileStr )
    with open( outFileStr, 'w' ) as f:
        f.write(info+'\n')
    dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
    print( 'Done' )
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, cent, scaleTransitions, isPrint):
    """Classify epigenotypes per bin (v9: adds sample-size transition scaling), decode, and write a TSV.

    Parameters:
      inFileStr        -- tab-separated weighted methylation input (header=1)
      numProc          -- number of worker processes
      binSize          -- genomic bin width; bin index is pos // binSize
      outID            -- identifier forwarded to determineOutputFileName
      parentLabelAr    -- [mother_labels, father_labels] lists of sample names
      decoding         -- decoding mode; 'B' = forward-backward then Viterbi, 'N' = none
      classProbs       -- classification probability mode (see formatClassProbs)
      combineBins      -- >0 enables merging of non-functional bins
      cent             -- flat list of centromere start/end coordinates, or None
      scaleTransitions -- scale the transition matrix by (n-1)/n of usable samples
      isPrint          -- suppress progress output when False

    Side effects: prints progress (if isPrint), writes the output file. Returns None.
    """
    # provenance header written as the first line of the output file
    info = '#from_script: epigenotyping_pe_v9.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}; scale_transitions:{:s}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        formatClassProbs(classProbs).lower(), combineBins,
        ('None' if cent == None else '{:s}-{:s}'.format(
            bth_util.binSizeToStr(cent[0]), bth_util.binSizeToStr(cent[1]))),
        str(scaleTransitions))
    if isPrint:
        print('Weighted methylation file:', os.path.basename(inFileStr))
        print('Bin size:', bth_util.binSizeToStr(binSize))
        print('Mother label(s):', parentLabelAr[0])
        print('Father label(s):', parentLabelAr[1])
        print('Classification probabilities:', formatClassProbs(classProbs))
        print('Decoding algorithm:', formatDecoding(decoding))
        print('Combine bin feature threshold:', combineBins)
        print('Scale transitions by sample size:', scaleTransitions)
    # render centromere region(s) as "start-end; start-end; ..."
    if cent == None:
        centStr = 'None'
    else:
        centStr = ''
        for i in range(len(cent) // 2):
            si = i * 2
            centStr += '; {:s}-{:s}'.format(
                bth_util.binSizeToStr(cent[si]), bth_util.binSizeToStr(cent[si + 1]))
        centStr = centStr[2:]
    if isPrint:
        print('Centromere:', centStr)
    # build dataframe
    if isPrint:
        print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)
    # check parent labels; ignore parents and per-mother MPV columns in transitions
    newParentLabelAr = checkParents(df['sample'], parentLabelAr)
    tIgnoreAr = flattenList(newParentLabelAr[:2])
    for i in range(len(newParentLabelAr[0])):
        tIgnoreAr += ['MPV{:d}'.format(i)]
    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None
    # get centromere bins if necessary (coordinates converted to bin indices)
    if cent == None:
        centBins = []
    else:
        cent = [x // binSize for x in cent]
        centBins = []
        for i in range(len(cent) // 2):
            si = i * 2
            centBins += list(range(cent[si], cent[si + 1] + 1))
    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        if isPrint:
            print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation: map each original bin to its merged bin
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])
    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        info += '; non-functional_bins:{:d}'.format(nbins - newNBins)
        if isPrint:
            print('combined {:d} non-functional bins'.format(nbins - newNBins))
    # classify by bin
    if isPrint:
        print(' Classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, newParentLabelAr, classProbs)
    dfClass.reset_index(inplace=True)
    del (df, dfBinGroup)
    # decode, if necessary
    if decoding != 'N':
        transition = Transitions(dfClass, ignore=tIgnoreAr)
        transitionMatrix = transition.getTransitions()
        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)
        # scale factor (n-1)/n over the non-ignored samples; 1 disables scaling
        if scaleTransitions:
            scaleFactor = float(nsamples - len(tIgnoreAr) - 1) / float(nsamples - len(tIgnoreAr))
        else:
            scaleFactor = 1
        # 'B' runs forward-backward first, then Viterbi on its output below
        tmpDecoding = ('F' if decoding == 'B' else decoding)
        if isPrint:
            print(' {:s} decoding {:d} samples with {:d} processors'.format(
                formatDecoding(tmpDecoding), nsamples, numProc))
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                               tmpDecoding, centBins, scaleFactor)
        if decoding == 'B':
            # second pass: Viterbi over the log forward-backward scores
            dfNew = dfOutput.loc[:, ['bin', 'sample']].copy()
            dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
            dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
            dfNew['father'] = np.log(dfOutput['fb.score.father'])
            dfNew['prediction'] = dfOutput['fb.prediction']
            transition = Transitions(dfNew, ignore=tIgnoreAr)
            transitionMatrix = transition.getTransitions()
            dfSampleGroup = dfNew.groupby('sample')
            nsamples = len(dfSampleGroup.groups)
            if isPrint:
                print(
                    ' {:s} decoding {:d} samples with {:d} processors'.format(
                        formatDecoding('V'), nsamples, numProc))
            dfOutputN = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                                    'V', centBins, scaleFactor)
            dfOutput[[
                'vit.score.mother', 'vit.score.father', 'vit.score.MPV',
                'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV',
                'vit.prediction'
            ]] = dfOutputN[[
                'vit.score.mother', 'vit.score.father', 'vit.score.MPV',
                'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV',
                'vit.prediction'
            ]]
        # end decoding == B
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del (dfSampleGroup)
    else:
        dfOutput = dfClass
    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         classProbs, scaleTransitions, combineBins)
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        # FIX: 'cBin' is only created in the combineBins > 0 branch, so a plain
        # drop raised KeyError here; errors='ignore' keeps the intended no-op.
        dfOutputT = dfOutput.drop('cBin', axis=1, errors='ignore')
    if isPrint:
        print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')
    if isPrint:
        print('Done')
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter=0):
    """Classify epigenotypes per bin (combined-bin variant), optionally refine transitions iteratively, decode, and write a TSV.

    Parameters:
      inFileStr     -- tab-separated weighted methylation input (header=1)
      numProc       -- number of worker processes
      binSize       -- genomic bin width; bin index is pos // binSize
      outID         -- identifier forwarded to determineOutputFileName
      parentLabelAr -- parent sample labels; [0]=mother, [1]=father
      decoding      -- decoding mode; 'N' skips decoding
      isUniform     -- use uniform classification probabilities
      combineBins   -- >0 enables merging of non-functional bins
      maxIter       -- maximum AdaptiveTransitions refinement iterations
                       (FIX: was referenced but never defined -> NameError;
                       added as a backward-compatible keyword, default 0 = off)

    Side effects: prints progress, writes the output file. Returns None.
    """
    # provenance header written as the first line of the output file
    info = '#from_script: epigenotyping_pe_combbin.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        str(isUniform).lower(), combineBins)
    print('Weighted methylation file:', os.path.basename(inFileStr))
    print('Bin size:', bth_util.binSizeToStr(binSize))
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', formatDecoding(decoding))
    print('Combine bin feature threshold:', combineBins)
    # build dataframe
    print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)
    # check parent labels
    checkParents(df['sample'], parentLabelAr)
    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None
    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation: map each original bin to its merged bin
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])
    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        print('combined {:d} non-functional bins'.format(nbins - newNBins))
    # classify by bin
    print(' Classifying {:d} bins with {:d} processors'.format(nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, parentLabelAr, isUniform)
    dfClass.reset_index(inplace=True)
    del (df, dfBinGroup)
    # decode, if necessary
    if decoding != 'N':
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print(' Obtaining initial transitions')
        transition = Transitions(dfClass, ignore=ignoreAr)
        transitionMatrix = transition.getTransitions()
        # replicate the initial matrix so each bin has its own copy
        transitionMatrixArray = np.array(
            [np.copy(transitionMatrix) for i in range(nbins)])
        # FIX: trInfo was never initialized -> NameError when maxIter > 0;
        # it records convergence stats (currently diagnostic-only -- TODO
        # confirm whether it should be appended to the output header)
        trInfo = ''
        if maxIter > 0:
            print(' Iteratively improving transitions with maximum', maxIter,
                  'iterations')
            at = AdaptiveTransitions(dfClass, transitionMatrixArray, ignoreAr,
                                     maxIter)
            # NOTE(review): the refined matrix returned here is not what is
            # passed to runDecoding below (transitionMatrixArray is) -- verify
            # AdaptiveTransitions updates the array in place
            iterations, transitionMatrix = at.run()
            trInfo += '; iterations_to_convergence:'
            if iterations == maxIter:
                trInfo += 'NA'
                # FIX: message previously hardcoded "10" regardless of maxIter
                print(' Did not converge in {:d} iterations'.format(maxIter))
            else:
                trInfo += str(iterations)
                print(' Convergence in', iterations, 'iterations')
        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)
        print(' {:s} decoding {:d} samples with {:d} processors'.format(
            formatDecoding(decoding), nsamples, numProc))
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrixArray,
                               decoding)
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del (dfSampleGroup)
    else:
        dfOutput = dfClass
    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform, combineBins)
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        # FIX: 'cBin' is only created in the combineBins > 0 branch, so a plain
        # drop raised KeyError here; errors='ignore' keeps the intended no-op.
        dfOutputT = dfOutput.drop('cBin', axis=1, errors='ignore')
    print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter):
	"""Run the full epigenotyping pipeline on one weighted-methylation file.

	Steps: read the input table, assign positions to bins of binSize,
	optionally merge low-feature bins (combineBins > 0), classify every bin,
	optionally decode each sample while iteratively optimizing transition
	matrices (decoding != 'N', up to maxIter iterations), then write the
	per-bin/per-sample predictions to a tab-separated file.

	parentLabelAr holds the mother label at index 0 and the father label at
	index 1; outID feeds into the generated output file names.
	"""
	# provenance line prepended to every file this run writes
	headerLine = '#from_script: epigenotyping_combin_smp-iter-trans.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; maximum_iterations:{:d}'.format(
		os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
		formatDecoding(decoding).lower().replace('and', ','),
		str(isUniform).lower(), combineBins, maxIter)

	def _writeHeader(path):
		# start (truncate) an output file with the provenance header line
		with open(path, 'w') as handle:
			handle.write(headerLine + '\n')

	# echo the run parameters
	for label, value in (
			('Weighted methylation file:', os.path.basename(inFileStr)),
			('Bin size:', bth_util.binSizeToStr(binSize)),
			('Mother label:', parentLabelAr[0]),
			('Father label:', parentLabelAr[1]),
			('Uniform classification probabilities:', str(isUniform)),
			('Decoding algorithm:', formatDecoding(decoding)),
			('Combine bin feature threshold:', combineBins),
			('Maximum transition matrix iterations:', maxIter)):
		print(label, value)
	print(' Reading input file', os.path.basename(inFileStr))
	# header=1: column names come from the file's second line
	data = pd.read_table(inFileStr, header=1)
	# both parent labels must exist among the samples
	checkParents(data['sample'], parentLabelAr)
	# assign each position to a bin
	data['bin'] = data.pos // binSize
	binMap = None
	# bin count before any merging
	totalBins = max(data['bin']) + 1
	if combineBins > 0:
		print(' Merging bins', end=' ... ')
		data['tBin'] = data['bin']
		binMap = binTransformation(data, combineBins)
		# remap every original bin onto its merged bin
		data['bin'] = data['tBin'].apply(lambda b: binMap[b])
	byBin = data.groupby('bin')
	if combineBins > 0:
		print('combined {:d} non-functional bins'.format(totalBins - len(byBin.groups)))
	# classify every bin
	print(' Classifying {:d} bins with {:d} processors'.format(totalBins, numProc))
	classified = runClassification(byBin, numProc, parentLabelAr, isUniform)
	classified.reset_index(inplace=True)
	#print( dfClass.head )
	del (data, byBin)
	if decoding == 'N':
		# no decoding requested; classification output is the final result
		decoded = classified
	else:
		# parent samples and the mid-parent value are excluded from transitions
		skipSamples = parentLabelAr[:2] + ['MPV']
		print(' Obtaining initial transitions')
		transMat = Transitions(classified, ignore=skipSamples).getTransitions()
		# transition file gets the header now; runDecoding receives its path
		transFile = determineTransFileName(inFileStr, outID, binSize, combineBins)
		_writeHeader(transFile)
		#print(dfClass.head())
		bySample = classified.groupby('sample')
		print(' {:s} decoding and optimizing transition matrices for {:d} samples with {:d} processors'.format(
			formatDecoding(decoding), len(bySample.groups), numProc))
		## note: decoding will now include improved transition matrix calculations
		decoded = runDecoding(bySample, numProc, transMat, decoding, maxIter, transFile)
		decoded.set_index(['bin', 'sample'], inplace=True)
		del (bySample)
	# write the final output
	outPath = determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform, combineBins)
	if combineBins > 0:
		# propagate predictions from merged bins back to the original bins
		decoded.reset_index(inplace=True)
		decoded['cBin'] = decoded['bin']
		finalOut = undoBinTransformation(decoded, binMap)
	else:
		# NOTE(review): assumes a 'cBin' column is present when no bins were
		# combined -- verify against runClassification/runDecoding output
		finalOut = decoded.drop('cBin', axis=1)
	print(' Writing output to', outPath)
	_writeHeader(outPath)
	# append below the header line
	finalOut.to_csv(outPath, sep='\t', mode='a')
	print('Done')
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter ):
	"""Run the epigenotyping pipeline for one weighted-methylation file.

	Reads the input table, bins positions by binSize, optionally merges
	low-feature bins (combineBins > 0), classifies each bin, optionally
	decodes each sample (decoding != 'N') while iteratively optimizing
	transition matrices (up to maxIter iterations), and writes the
	per-bin/per-sample predictions to a tab-separated output file.

	inFileStr: path to the weighted-methylation input table
	numProc: number of worker processes for classification/decoding
	binSize: bin width used to group positions (pos // binSize)
	outID: identifier used when building output file names
	parentLabelAr: sample labels; index 0 is the mother, index 1 the father
	decoding: decoding algorithm code ('N' disables decoding)
	isUniform: use uniform classification probabilities when True
	combineBins: feature threshold for merging bins (0 disables merging)
	maxIter: maximum transition-matrix optimization iterations
	"""
	# provenance line prepended to every file this run writes
	info = '#from_script: epigenotyping_combin_smp-iter-trans.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; maximum_iterations:{:d}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins, maxIter )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	print( 'Combine bin feature threshold:', combineBins )
	print( 'Maximum transition matrix iterations:', maxIter )
	# build dataframe
	print( ' Reading input file', os.path.basename( inFileStr ) )
	# header=1: column names are taken from the file's second line
	df = pd.read_table( inFileStr, header=1 )
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None
	# combine bins if necessary
	# nbins is the bin count BEFORE merging; the classify message below
	# reports this pre-merge count
	nbins = max(df['bin'])+1
	if combineBins > 0:
		print( ' Merging bins', end=' ... ' )
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len(dfBinGroup.groups )
		print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
	# classify by bin
	print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	#print( dfClass.head )
	# free the raw table and the per-bin grouping before decoding
	del(df, dfBinGroup )
	# decode, if necessary
	if decoding != 'N':
		# parent samples and the mid-parent value are excluded from transitions
		ignoreAr = parentLabelAr[:2] + ['MPV']
		print( ' Obtaining initial transitions' )
		transition = Transitions( dfClass, ignore = ignoreAr )
		transitionMatrix = transition.getTransitions()
		# the transition file gets only the provenance header here; its path
		# is handed to runDecoding, which presumably appends the optimized
		# matrices -- TODO confirm in runDecoding
		outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
		with open( outFStr, 'w' ) as f:
			f.write(info+'\n')
		# group by sample
		#print(dfClass.head())
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		print( ' {:s} decoding and optimizing transition matrices for {:d} samples with {:d} processors'.format( formatDecoding(decoding), nsamples, numProc ) )
		## note: decoding will now include improved transition matrix calculations
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding, maxIter, outFStr )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# NOTE(review): assumes a 'cBin' column exists when bins were not
		# combined -- verify against runClassification/runDecoding output
		dfOutputT = dfOutput.drop('cBin', axis=1)
	print( ' Writing output to', outFileStr )
	# header first, then append the table below it
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
	print( 'Done' )