コード例 #1
0
ファイル: finite_automaton.py プロジェクト: hiImDeni/flcd
 def __init__(self, file):
     """Build the automaton and immediately populate it from *file*."""
     # Empty components first; __read_file() fills them in.
     self.q = set()  # states
     self.alphabet = set()  # sigma
     self.F = set()  # final states
     self.transitions = Transitions()  # delta: the transition mapping
     self.__file = file  # path of the definition file
     self.__read_file()  # parse the file and populate the fields above
コード例 #2
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, isIndiv ):
	"""Classify genomic bins by epigenotype, optionally decode, write a TSV.

	Pipeline: read the weighted-methylation table, bin positions by
	binSize, classify each bin with logistic regression (parallel when
	numProc > 1), optionally smooth each sample's bin sequence with an
	HMM decoder, and append results below a parameter header line.

	decoding selects the decoder: 'V' Viterbi, 'F' forward-backward,
	'A' both, 'N' none.  When isIndiv is True an empty transition
	matrix is passed so findOptimalPath estimates one per sample.
	"""
	# Human-readable name of the selected decoding algorithm.
	dType = ('Viterbi' if decoding == 'V' else ('Forward-backward' if decoding == 'F' else ('Viterbi and Forward-backward' if decoding == 'A' else 'None') ) )
	# Parameter header recording the run settings; written first to the output file.
	info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; indiv_transitions:{:s}\n'.format( os.path.basename( inFileStr), bth_util.binSizeToStr( binSize ), dType.lower().replace(' and ', ','), str(isUniform), str(isIndiv) )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str( isUniform ) )
	print( 'Decoding algorithm:', dType)
	print( 'Individual transition probabilities:', str( isIndiv ) )
	
	# build data frame
	df = pd.read_table( inFileStr, header=1 )
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin and analyze
	df['bin'] = df.pos // binSize
	nbins = max(df['bin'])+1
	dfg = df.groupby('bin')
	if numProc > 1:
		print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
		res_class = runMultiClassification( dfg, numProc, parentLabelAr, isUniform )
	else:
		print( 'Begin classifying {:d} bins'.format( nbins ) )
		res_class = dfg.apply( classLogReg, pla=parentLabelAr, u=isUniform )
	res_class.reset_index(inplace=True)
	
	# decode if necessary
	if decoding != 'N':
		# parent columns and the mid-parent value are excluded from
		# transition estimation
		ignoreAr = parentLabelAr + ['MPV']
		if isIndiv:
			# empty matrix signals findOptimalPath to build one per sample
			transitions = np.array([])
		else:
			print( 'Generating transition matrix' )
			transition = Transitions( res_class, ignore=ignoreAr )
			transitions = transition.getTransitions()
			print(transitions)
		# find optimum path for all samples
		groups = res_class.groupby( 'sample' )
		nsamples = len(groups.groups)
		
		if numProc > 1:
			print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format(  dType, nsamples, numProc ) )
			results = runMultiPath( groups, numProc, transitions, isUniform, decoding )
		else:
			print( 'Begin {:s} decoding {:d} samples'.format( dType, nsamples ) )
			results = groups.apply( findOptimalPath, trans=transitions, u=isUniform, d=decoding )
		results.set_index( ['bin', 'sample'], inplace=True )
	else:
		results = res_class
	
	# output file
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, isIndiv )
	# write output
	print( 'Writing output to', outFileStr )
	# header line first, then the table is appended below it
	with open( outFileStr, 'w' ) as f:
		f.write(info)
	results.to_csv( outFileStr, sep='\t', mode='a' )
	print( 'Done' )
コード例 #3
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, parentAddLabelAr, decoding, isUniform ):
	"""Classify genomic bins by epigenotype and optionally HMM-decode them.

	Variant that accepts multiple labels per parent: parentLabelAr is a
	pair of LISTS (mother labels, father labels) and parentAddLabelAr
	supplies optional extra training labels per parent.  Writes a
	parameter header followed by the result table as TSV.

	decoding: 'V' Viterbi, 'F' forward-backward, 'A' both, 'N' none.
	"""
	
	# Parameter header written as the first line of the output file.
	info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; mother_samples:{:s}; father_samples:{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), ','.join(parentLabelAr[0]), ','.join(parentLabelAr[1]) )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label(s):', ', '.join(parentLabelAr[0]) )
	print( 'Father label(s):', ', '.join(parentLabelAr[1]) )
	if len(parentAddLabelAr[0]) != 0 or len(parentAddLabelAr[1]) != 0:
		print( 'Additional mother training label(s):', ('None' if len(parentAddLabelAr[0])==0 else ', '.join(parentAddLabelAr[0])) )
		print( 'Additional father training label(s):', ('None' if len(parentAddLabelAr[1]) == 0 else ', '.join(parentAddLabelAr[1])) )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	
	# build dataframe
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels; the validated labels are returned
	parentLabelAr = checkParents( df['sample'], parentLabelAr )
	# check additional training data labels
	if len(parentAddLabelAr[0]) != 0 or len(parentAddLabelAr[1]) != 0:
		parentAddLabelAr = checkParents( df['sample'], parentAddLabelAr )
	
	# group by bin
	df['bin'] = df.pos // binSize
	nbins = max(df['bin'])+1
	dfBinGroup = df.groupby( 'bin' )
	
	# classify by bin
	print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, parentAddLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	#print( dfClass.head )
	del(df, dfBinGroup )
	# decode, if necessary
	if decoding != 'N':
		# all parent + additional-training columns plus the mid-parent
		# value are excluded from transition estimation
		totalParentLabelAr = [parentLabelAr[0] + parentAddLabelAr[0], parentLabelAr[1] + parentAddLabelAr[1]]
		ignoreAr = flattenList( totalParentLabelAr ) + ['MPV']
		transition = Transitions( dfClass, ignore = ignoreAr )
		transitionMatrix = transition.getTransitions()
		
		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len(dfSampleGroup.groups )
		
		print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding(decoding), nsamples, numProc ) )
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
	print( 'Writing output to', outFileStr )
	# header line first, then the table is appended below it
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutput.to_csv( outFileStr, sep='\t', mode='a' )
	
	print( 'Done' )
コード例 #4
0
ファイル: FiniteAutomata.py プロジェクト: AndreiJeler/FLCD
 def __init__(self, file_name):
     """Create an automaton and load its definition from *file_name*."""
     self.file_name = file_name   # source definition file
     # Start with empty components; read_file() populates them.
     self.Q = set()               # states
     self.E = set()               # alphabet (sigma)
     self.F = set()               # final states
     self.q0 = None               # initial state, assigned while reading
     self.delta = Transitions()   # transition function
     self.read_file()
コード例 #5
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, maxIter ):
	"""Classify genomic bins by epigenotype, optionally decode, write a TSV.

	Same pipeline as the isIndiv-aware variant but always estimates one
	global transition matrix when decoding.

	decoding: 'V' Viterbi, 'F' forward-backward, 'A' both, 'N' none.
	NOTE(review): the maxIter parameter is accepted but never referenced
	in this body -- confirm whether iterative refinement was intended.
	"""
	# Human-readable name of the selected decoding algorithm.
	dType = ('Viterbi' if decoding == 'V' else ('Forward-backward' if decoding == 'F' else ('Viterbi and Forward-backward' if decoding == 'A' else 'None') ) )
	# Parameter header; written first to the output file.
	info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}\n'.format( os.path.basename( inFileStr), bth_util.binSizeToStr( binSize ), dType.lower().replace(' and ', ','), str(isUniform) )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str( isUniform ) )
	print( 'Decoding algorithm:', dType)
	
	# build data frame
	df = pd.read_table( inFileStr, header=1 )
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin and analyze
	df['bin'] = df.pos // binSize
	nbins = max(df['bin'])+1
	dfg = df.groupby('bin')
	if numProc > 1:
		print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
		res_class = runMultiClassification( dfg, numProc, parentLabelAr, isUniform )
	else:
		print( 'Begin classifying {:d} bins'.format( nbins ) )
		res_class = dfg.apply( classLogReg, pla=parentLabelAr, u=isUniform )
	res_class.reset_index(inplace=True)
	
	# decode if necessary
	if decoding != 'N':
		# parents and mid-parent value excluded from transition estimation
		ignoreAr = parentLabelAr + ['MPV']
		print( 'Generating transition matrix' )
		transition = Transitions( res_class, ignore=ignoreAr )
		transitions = transition.getTransitions()
		# find optimum path for all samples
		groups = res_class.groupby( 'sample' )
		nsamples = len(groups.groups)
		
		if numProc > 1:
			print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format(  dType, nsamples, numProc ) )
			results = runMultiPath( groups, numProc, transitions, isUniform, decoding )
		else:
			print( 'Begin {:s} decoding {:d} samples'.format( dType, nsamples ) )
			results = groups.apply( findOptimalPath, trans=transitions, u=isUniform, d=decoding )
		results.set_index( ['bin', 'sample'], inplace=True )
	else:
		results = res_class
	
	# output file
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
	# write output
	print( 'Writing output to', outFileStr )
	# header line first, then the table is appended below it
	with open( outFileStr, 'w' ) as f:
		f.write(info)
	results.to_csv( outFileStr, sep='\t', mode='a' )
	print( 'Done' )
コード例 #6
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform ):
	"""Classify genomic bins by epigenotype and optionally HMM-decode them.

	Single-label-per-parent variant: parentLabelAr[0] is the mother
	label, parentLabelAr[1] the father label.  Writes a parameter
	header followed by the result table as TSV.

	decoding: 'V' Viterbi, 'F' forward-backward, 'A' both, 'N' none.
	"""
	
	# Parameter header written as the first line of the output file.
	info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower() )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	
	# build dataframe
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin
	df['bin'] = df.pos // binSize
	nbins = max(df['bin'])+1
	dfBinGroup = df.groupby( 'bin' )
	
	# classify by bin
	print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	#print( dfClass.head )
	del(df, dfBinGroup )
	# decode, if necessary
	if decoding != 'N':
		# first two entries (mother, father) plus the mid-parent value
		# are excluded from transition estimation
		ignoreAr = parentLabelAr[:2] + ['MPV']
		transition = Transitions( dfClass, ignore = ignoreAr )
		transitionMatrix = transition.getTransitions()
		
		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len(dfSampleGroup.groups )
		
		print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding(decoding), nsamples, numProc ) )
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
	print( 'Writing output to', outFileStr )
	# header line first, then the table is appended below it
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutput.to_csv( outFileStr, sep='\t', mode='a' )
	
	print( 'Done' )
コード例 #7
0
def findOptimalPath(df, trans=np.array([]), u=False, d='V'):
    """Decode one sample's bin sequence with the requested algorithm.

    An empty *trans* matrix means "estimate transitions from this
    sample's own data".  d selects the decoder: 'A' runs both
    algorithms, 'F' forward-backward, anything else (including the
    default 'V') falls through to the simple path.  Returns the
    decoder's output frame.
    """
    if trans.size == 0:
        # No global matrix supplied: build a per-sample one.
        trans = Transitions(df, ignore=[]).getTransitions()
    # Pick the decoder implementation.
    if d == 'A':
        decoder = ProbPathAll(df, trans, u)
    elif d == 'F':
        decoder = ProbPathFB(df, trans, u)
    else:
        decoder = ProbPathSimple(df, trans)
    return decoder.run()
コード例 #8
0
def findOptimalPath( df, trans=np.array([]), u=False, d='V' ):
	"""Decode one sample's bin sequence with the requested algorithm.

	An empty *trans* matrix requests a per-sample transition estimate.
	d: 'A' runs both decoders, 'F' forward-backward; any other value
	(including the default 'V') uses the simple path.
	"""
	if trans.size == 0:
		# Fall back to a transition matrix built from this sample alone.
		trans = Transitions( df, ignore=[] ).getTransitions()
	# Guard-clause dispatch on the decoder code.
	if d == 'A':
		return ProbPathAll( df, trans, u ).run()
	if d == 'F':
		return ProbPathFB( df, trans, u ).run()
	return ProbPathSimple( df, trans ).run()
コード例 #9
0
ファイル: finite_automaton.py プロジェクト: hiImDeni/flcd
class FiniteAutomaton:
    """Finite automaton loaded from a plain-text definition file.

    Expected file layout (space separated):
        line 1: the states, line 2: the alphabet, line 3: the final
        states, remaining lines: ``src dst symbol [symbol ...]``.
    The first state listed becomes the initial state.
    """

    def __init__(self, file):
        # NOTE: despite the set() defaults, __read_file() replaces q,
        # alphabet and F with the lists parsed from the file.
        self.q = set()  # states
        self.alphabet = set()  # sigma
        self.transitions = Transitions()
        self.F = set()  # final states
        self.__file = file
        self.__read_file()

    def __read_file(self):
        """Parse the definition file and populate the automaton."""
        with open(self.__file, 'r') as handle:
            self.q = handle.readline().strip().split(' ')
            self.alphabet = handle.readline().strip().split(' ')
            # first listed state is the initial state
            self.__q0 = self.q[0]
            self.F = handle.readline().strip().split(' ')
            # transition lines follow until the first blank line
            entry = handle.readline().strip()
            while entry:
                tokens = entry.split(' ')
                if tokens[0] not in self.q or tokens[1] not in self.q:
                    raise ValueError("State does not exist")
                for symbol in tokens[2:]:
                    if symbol not in self.alphabet:
                        raise ValueError("Symbol " + symbol +
                                         " is not in the alphabet")
                    self.transitions.add(tokens[0], tokens[1], symbol)
                entry = handle.readline().strip()

    def is_deterministic(self):
        """True iff no (state, symbol) pair has more than one target."""
        return all(
            len(self.transitions.get_transitions_to(state, letter)) <= 1
            for state in self.transitions.get_keys()
            for letter in self.alphabet)

    def is_accepted(self, word):
        """Simulate the automaton on *word*.

        Returns None for a non-deterministic automaton, otherwise True
        when the run ends in a final state and False when it dies or
        ends elsewhere.
        """
        if not self.is_deterministic():
            return None
        state = self.__q0
        for symbol in word:
            options = self.transitions.get_transitions_to(state, symbol)
            if len(options) == 0:
                return False
            # a deterministic machine has exactly one choice here
            state = options[0][0]
        return state in self.F
コード例 #10
0
def read_config(filename):
    """Load an INI configuration and derive composites and transitions.

    Reads frame size and fps from the [output] section, builds
    composites from [composites] and transitions from [transitions],
    honouring the module-level Args flags (composite, list, map) for
    overrides and diagnostics.

    Returns (size, fps, sequence, transitions, composites).
    """
    global log, Args
    # load INI files
    config = SafeConfigParser()
    config.read(filename)
    # read frame size, e.g. "1920x1080" -> [1920, 1080]
    size = config.get('output', 'size').split('x')
    size = [int(size[0]), int(size[1])]
    # read frames per second
    fps = int(config.get('output', 'fps'))
    # read composites from configuration
    log.info("reading composites from configuration...")
    composites = Composites.configure(config.items('composites'), size)
    log.debug("read %d composites:\n\t%s\t" %
              (len(composites), '\n\t'.join(sorted(composites))))
    # maybe overwrite targets by arguments
    if Args.composite:
        # check for composites in arguments
        targets = [composites[c] for c in set(Args.composite)]
    else:
        # list of all relevant composites we like to target
        targets = Composites.targets(composites)
    intermediates = Composites.intermediates(composites)
    # list targets and intermediates
    if Args.list:
        print("%d targetable composite(s):\n\t%s\t" %
              (len(targets), '\n\t'.join([t.name for t in targets])))
        print("%d intermediate composite(s):\n\t%s\t" %
              (len(intermediates), '\n\t'.join([t.name
                                                for t in intermediates])))
    # read transitions from configuration
    log.info("reading transitions from configuration...")
    transitions = Transitions.configure(config.items('transitions'),
                                        composites, targets, fps)
    log.info("read %d transition(s)" % transitions.count())
    if Args.map:
        print("transition table:\n%s" % transitions)
    # maybe overwrite targets by arguments
    if Args.composite:
        # check for composites in arguments
        sequence = Args.composite
    else:
        # generate sequence of targets
        sequence = Transitions.travel([t.name for t in targets])
    log.debug("using %d target composite(s):\n\t%s\t" %
              (len(targets), '\n\t'.join([t.name for t in targets])))

    # return config
    return size, fps, sequence, transitions, composites
コード例 #11
0
def find_parameters(filenames,pbc,model,
      dv,dw,dwrad,D0,dtimezero,temp,temp_end,nmc,nmc_update,seed,outfile, ncosF,ncosD,ncosDrad,
      move_timezero,initfile,k,
      lmax,reduction): 
    """Monte Carlo fit of diffusion / free-energy profiles (Python 2).

    Builds an MCState from the given MC parameters, loads transition
    counts (radial or plain depending on MC.do_radial), optionally
    seeds numpy's RNG and restarts from initfile, runs the MC cycles,
    prints statistics and the final model, and pickles the logger to
    <outfile>.pic (or "mc.pic" when outfile is None).

    NOTE: uses Python 2 constructs (print statements, the file()
    builtin); it will not run under Python 3.
    """
    print "python program to extract diffusion coefficient and free energy from transition counts"
    print "copyright: Gerhard Hummer (NIH, July 2012)"
    print "adapted by An Ghysels (August 2012)\n"

    # deterministic runs when a seed is supplied
    if seed is not None:
        np.random.seed(seed)

    # start Monte Carlo object
    MC = MCState(pbc,lmax)
    # settings
    MC.set_MC_params(dv,dw,dwrad,D0,dtimezero,temp,nmc,nmc_update,move_timezero,k,temp_end=temp_end,)
    #MC.print_MC_params()

    # INPUT and INITIALIZATION model/MC
    if MC.do_radial:
        data = RadTransitions(filenames)
    else:
        data = Transitions(filenames,reduction=reduction)
    MC.set_model(model,data,ncosF,ncosD,ncosDrad)

    # USE INFO from INITFILE
    if initfile is not None:
        import sys
        f = sys.stdout
        MC.use_initfile(initfile)
        MC.print_MC_params(f)
        MC.print_coeffs_laststate(f)

    logger = Logger(MC)

    # MONTE CARLO OPTIMIZATION
    do_mc_cycles(MC,logger)

    # print final results (potential and diffusion coefficient)
    #----------------------------------------------------------
    # choose filename for pickle object
    if outfile is None:
        import sys
        f = sys.stdout
        picfile = "mc.pic"
    else:
        f = file(outfile,"w+")  # print final model to a file
        picfile = outfile+".pic"

    # print to screen
    #MC.print_log_like()
    MC.print_statistics()

    MC.print_laststate(f,final=True)  # print model, coeffs
    if outfile is not None:
        f.close()

    logger.model = MC.model   # this is not a hard copy
    logger.dump(picfile)
    logger.statistics(MC)  #st=1000)
    # returns an empty tuple (kept as-is for compatibility)
    return()
コード例 #12
0
 def run(self):
     """Iteratively re-estimate the transition matrix via forward-backward.

     Each pass decodes every group in self.groups with the current
     matrix (runFB) and rebuilds the matrix from the resulting
     predictions; stops once the matrix moves less than EPS
     (Frobenius norm) or after self.maxIter passes.

     Returns (iterations_run, final_transition_matrix).
     """
     iteration = 0
     while iteration < self.maxIter:
         iteration += 1
         previous = np.copy(self.transitions)
         # decode every sample group with the current matrix ...
         predictions = self.groups.apply(runFB, previous)
         # ... then re-estimate the matrix from the fresh predictions
         self.transitions = Transitions(predictions, self.ignore).getTransitions()
         # converged once the matrix stops changing
         if np.linalg.norm(previous - self.transitions) < EPS:
             break
     return iteration, self.transitions
コード例 #13
0
	def run( self ):
		"""Iteratively re-estimate self.transitions via forward-backward.

		Each pass decodes every group in self.groups with the current
		matrix (runFB) and rebuilds the matrix from the resulting
		predictions; stops when the matrix moves less than EPS
		(Frobenius norm) or after self.maxIter passes.

		Returns (iterations_run, final_transition_matrix).
		"""
		### going to try running forward-backward, updating transitions from
		# that until convergence
		#nSamples = self.observations - len(self.ignore)
		curIter = 0
		while curIter < self.maxIter:
			curIter += 1
			oldTransitions = np.copy( self.transitions )
			# get predictions
			newPredictions = self.groups.apply( runFB, oldTransitions )
			#print( newPredictions.head() )
			# get transitions
			t = Transitions( newPredictions, self.ignore )
			self.transitions = t.getTransitions()
			#print( self.transitions )
			# check if transitions converged
			if np.linalg.norm( oldTransitions - self.transitions ) < EPS:
				break
		return curIter, self.transitions
コード例 #14
0
ファイル: value_iterator.py プロジェクト: vjache/bellman
class ValueIterator:
    """Value iteration over a grid world described by Config.

    Uses a Transitions object to simulate actions, a Rewarder for
    rewards, and Q/V tables; `path` follows the greedy policy from a
    start state to the target position.
    """

    def __init__(self, target_position):
        # (letter, number) cell acting as the goal; skipped in sweeps
        self.target_position = target_position
        self._tran = Transitions()
        self._rewards = Rewarder(target_position)
        self._q_tab = QTable()
        self._v_tab = VTable()

    def update(self, debug=False):
        """One sweep: refresh Q for every (state, action), then V from Q."""
        for s1 in self.all_states():
            for a in range(len(Config.actions)):
                s2 = self._tran.run(s1, a)
                rew = self._rewards[s1, s2]
                # s2 is falsy for a forbidden move: no future value then
                if s2:
                    q = rew + Config.gamma * self._v_tab[s2]
                else:
                    q = rew
                self._q_tab[s1, a] = q

                if debug:
                    pprint_transition(s1, a, s2, rew)

        self._v_tab.update_from_q_table(self._q_tab)

    # noinspection PyMethodMayBeStatic
    def all_states(self):
        """Yield every (i, j, orientation) state except the target cell."""
        for i in range(len(Config.letters)):
            for j in range(len(Config.numbers)):
                if (i, j) == self.target_position:
                    continue
                for o in range(len(Config.orientations)):
                    yield i, j, o

    def path(self, s0):
        """Follow the greedy policy from s0 to the target.

        Returns the alternating list [s0, a0, s1, a1, ..., s_target].
        Raises ValueError when the best action leads to a forbidden
        state.  NOTE(review): the recursion keeps no visited set, so a
        cyclic greedy policy would recurse without bound.
        """
        a, _ = self._q_tab.get_best_action(s0)
        s1 = self._tran.run(s0, a)
        if not s1:
            raise ValueError("Переход в запрещенное состояние: " + state_to_str(s0) + "-" + action_to_str(a) + "-> None")
        elif (s1[0], s1[1]) == self.target_position:
            return [s0, a, s1]
        return [s0, a] + self.path(s1)
コード例 #15
0
	def run( self ):
		"""Refine the transition matrix by repeated forward-backward passes.

		Runs runFB over self.data with the current matrix, re-estimates
		the matrix from the predictions, and stops once it changes by
		less than EPS (Frobenius norm) or self.maxIter passes elapse.

		Returns the final matrix as a pandas DataFrame labelled by
		self.labels on both axes.
		"""
		curIter = 0
		while curIter < self.maxIter:
			curIter += 1
			oldTransitions = np.copy( self.transitions )
			# decode with the current matrix
			newPredictions = runFB(self.data, oldTransitions )
			# re-estimate the matrix from the fresh predictions
			t = Transitions( newPredictions, [] )
			self.transitions = t.getTransitions()
			# converged when the matrix stops changing
			if np.linalg.norm( oldTransitions - self.transitions ) < EPS:
				break
		# FIX: message previously misspelled as 'coverged'
		print( 'did not converge' if curIter == self.maxIter else 'converged in {:d} iterations'.format( curIter ) )
		return pd.DataFrame(self.transitions,index=self.labels,columns=self.labels)
コード例 #16
0
 def run(self):
     """Refine the transition matrix by repeated forward-backward passes.

     Runs runFB over self.data with the current matrix, re-estimates
     the matrix from the predictions, and stops once it changes by
     less than EPS (Frobenius norm) or self.maxIter passes elapse.

     Returns the final matrix as a pandas DataFrame labelled by
     self.labels on both axes.
     """
     curIter = 0
     while curIter < self.maxIter:
         curIter += 1
         oldTransitions = np.copy(self.transitions)
         # decode with the current matrix
         newPredictions = runFB(self.data, oldTransitions)
         # re-estimate the matrix from the fresh predictions
         t = Transitions(newPredictions, [])
         self.transitions = t.getTransitions()
         # converged when the matrix stops changing
         if np.linalg.norm(oldTransitions - self.transitions) < EPS:
             break
     # FIX: message previously misspelled as 'coverged'
     print('did not converge' if curIter ==
           self.maxIter else 'converged in {:d} iterations'.format(curIter))
     return pd.DataFrame(self.transitions,
                         index=self.labels,
                         columns=self.labels)
コード例 #17
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, cent, isPrint ):
	"""Classify bins by epigenotype with bin combining, centromere handling
	and (optionally two-stage) decoding; write the results as TSV.

	decoding: 'V' Viterbi, 'F' forward-backward, 'A' both, 'B' a
	two-pass mode (forward-backward, then transitions re-estimated from
	its scores and Viterbi run again), 'N' none.  combineBins > 0
	merges low-feature bins before classification; the merge is undone
	for the output.  cent is a flat list of centromere coordinate
	pairs whose bins are passed to the decoder separately.
	"""
	
	# Parameter header for the output file.
	# NOTE(review): only cent[0]-cent[1] is recorded here even when cent
	# holds several ranges; centStr below handles all of them.
	info = '#from_script: epigenotyping_pe_v7.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), formatClassProbs(classProbs).lower(), combineBins, ('None' if cent == None else '{:s}-{:s}'.format( bth_util.binSizeToStr( cent[0] ), bth_util.binSizeToStr( cent[1] ) ) ) )
	if isPrint:
		print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
		print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
		print( 'Mother label(s):', parentLabelAr[0] )
		print( 'Father label(s):', parentLabelAr[1] )
		print( 'Classification probabilities:', formatClassProbs( classProbs ) )
		print( 'Decoding algorithm:', formatDecoding( decoding ) )
		print( 'Combine bin feature threshold:', combineBins )
	# human-readable list of the centromere range(s)
	if cent == None:
		centStr = 'None'
	else:
		centStr = ''
		for i in range(len(cent)//2):
			si = i*2
			centStr += '; {:s}-{:s}'.format( bth_util.binSizeToStr( cent[si] ), bth_util.binSizeToStr( cent[si+1] ) )
		centStr = centStr[2:]
			
	if isPrint:
		print( 'Centromere:', centStr )
	
	# build dataframe
	if isPrint:
		print( ' Reading input file', os.path.basename( inFileStr ) )
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels; validated labels are returned
	newParentLabelAr = checkParents( df['sample'], parentLabelAr )
	# parents plus one MPV column per mother label are excluded from
	# transition estimation
	tIgnoreAr = flattenList( newParentLabelAr[:2] )
	for i in range(len(newParentLabelAr[0])):
		tIgnoreAr += [ 'MPV{:d}'.format( i ) ]
	
	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None
	
	# get centromere bins if necessary
	if cent == None:
		centBins = []
	else:
		# convert coordinates to bin numbers and expand each pair to a range
		cent = [ x // binSize for x in cent ]
		centBins = []
		#centBins = list( range(cent[0], cent[1]+1) )
		for i in range(len(cent) // 2 ):
			si = i * 2
			centBins += list( range(cent[si], cent[si+1]+1) )
	
	# combine bins if necessary
	nbins = max(df['bin'])+1
	if combineBins > 0:
		if isPrint:
			print( ' Merging bins', end=' ... ' )
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
		
	
	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len( dfBinGroup.groups )
		info += '; non-functional_bins:{:d}'.format( nbins - newNBins )
		if isPrint:
			print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
	
	# classify by bin
	if isPrint:
		print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, newParentLabelAr, classProbs )
	dfClass.reset_index(inplace=True)
	#print( dfClass.head )
	del(df, dfBinGroup )
	# decode, if necessary
	if decoding != 'N':
		#ignoreAr = parentLabelAr[:2] + ['MPV']
		transition = Transitions( dfClass, ignore = tIgnoreAr )
		transitionMatrix = transition.getTransitions()
		# write this matrix to file
		#outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
		#tLabels = [ 'mother', 'MPV', 'father' ]
		#transData = pd.DataFrame( transitionMatrix, index=tLabels, columns= tLabels )
		#with open( outFStr, 'w' ) as f:
		#	f.write(info+'\n')
		#transData.to_csv( outFStr, sep='\t', mode='a' )
		
		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		# mode 'B' runs forward-backward first, then Viterbi below
		tmpDecoding = ( 'F' if decoding == 'B' else decoding )
		if isPrint:
			print( ' {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding(tmpDecoding), nsamples, numProc ) )
		
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, tmpDecoding, centBins )
		
		
		if decoding == 'B':
			# second pass: rebuild transitions from the FB scores (log
			# space) and decode again with Viterbi
			dfNew = dfOutput.loc[:,['bin','sample']].copy()
			dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
			dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
			dfNew['father'] = np.log(dfOutput['fb.score.father'])
			dfNew['prediction'] = dfOutput['fb.prediction']
			#print(dfOutput.head())
			#print(dfNew.head())
			transition = Transitions( dfNew, ignore = tIgnoreAr )
			transitionMatrix = transition.getTransitions()
			dfSampleGroup = dfNew.groupby( 'sample' )
			nsamples = len( dfSampleGroup.groups )
			
			if isPrint:
				print( ' {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding('V'), nsamples, numProc ) )
			dfOutputN = runDecoding( dfSampleGroup, numProc, transitionMatrix, 'V', centBins )
			# splice the Viterbi columns into the forward-backward output
			dfOutput[['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']] = dfOutputN[['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']]
			#print( dfOutput.head() )
		# end decoding == B
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, classProbs, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# NOTE(review): assumes dfOutput already carries a 'cBin' column
		# in this branch -- confirm runDecoding provides it, otherwise
		# this drop raises KeyError when decoding == 'N'.
		dfOutputT = dfOutput.drop('cBin', axis=1)
	if isPrint:
		print( ' Writing output to', outFileStr )
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
	
	if isPrint:
		print( 'Done' )
コード例 #18
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter=0 ):
	"""Classify bins by epigenotype with optional bin combining and decoding.

	Bins with too few features are merged into neighbouring bins before
	classification (combineBins > 0) and the merge is undone for the
	output.  decoding: 'V' Viterbi, 'F' forward-backward, 'A' both,
	'N' none.

	maxIter (keyword, default 0) enables iterative refinement of the
	transition matrices via AdaptiveTransitions; 0 keeps the previous
	single-estimate behaviour.  FIX: the original body referenced
	maxIter and trInfo without ever defining them, so every run with
	decoding != 'N' raised NameError.
	"""
	
	info = '#from_script: epigenotyping_pe_combbin.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	print( 'Combine bin feature threshold:', combineBins )
	
	# build dataframe
	print( ' Reading input file', os.path.basename( inFileStr ) )
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None
	
	# combine bins if necessary
	nbins = max(df['bin'])+1
	if combineBins > 0:
		print( ' Merging bins', end=' ... ' )
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
	
	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len(dfBinGroup.groups )
		print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
	
	# classify by bin
	print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	del(df, dfBinGroup )
	
	# extra header info accumulated during decoding
	# (FIX: previously referenced without being initialised)
	trInfo = ''
	# decode, if necessary
	if decoding != 'N':
		# mother, father and the mid-parent value are excluded from
		# transition estimation
		ignoreAr = parentLabelAr[:2] + ['MPV']
		print( ' Obtaining initial transitions' )
		transition = Transitions( dfClass, ignore = ignoreAr )
		transitionMatrix = transition.getTransitions()
		# one copy of the matrix per (pre-merge) bin
		transitionMatrixArray = np.array( [ np.copy( transitionMatrix ) for i in range(nbins ) ] )
		
		if maxIter > 0:
			print( ' Iteratively improving transitions with maximum', maxIter, 'iterations' )
			at = AdaptiveTransitions( dfClass, transitionMatrixArray, ignoreAr, maxIter )
			iterations, transitionMatrix = at.run()
			# NOTE(review): the refined matrix is not copied back into
			# transitionMatrixArray before decoding -- confirm intent.
			trInfo += '; iterations_to_convergence:'
			if iterations == maxIter:
				trInfo += 'NA'
				# FIX: message previously hard-coded '10 iterations'
				print( '  Did not converge in {:d} iterations'.format( maxIter ) )
			else:
				trInfo += str(iterations)
				print( '  Convergence in', iterations, 'iterations' )
		
		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		
		print( ' {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding(decoding), nsamples, numProc ) )
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrixArray, decoding )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	# record any decoding information in the output header
	info += trInfo
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# NOTE(review): assumes dfOutput already carries a 'cBin' column
		# in this branch -- confirm runDecoding provides it, otherwise
		# this drop raises KeyError when decoding == 'N'.
		dfOutputT = dfOutput.drop('cBin', axis=1)
	print( ' Writing output to', outFileStr )
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
	
	print( 'Done' )
コード例 #19
0
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  classProbs, combineBins, cent, scaleTransitions, isPrint):
    """Classify genomic bins by epigenotype, decode per sample, and write output.

    Pipeline: read the weighted-methylation table, assign bins
    (bin = pos // binSize), optionally merge low-feature bins, classify each
    bin, then -- unless decoding == 'N' -- estimate a transition matrix and
    decode each sample. Decoding 'B' runs forward-backward first,
    re-estimates transitions from its predictions, then runs Viterbi.
    Results are written to a tab-separated file; nothing is returned.
    """
    # Build the centromere description once so the output header and the
    # console report agree; the header previously listed only the first
    # region when several centromere intervals were supplied.
    if cent is None:
        centStr = 'None'
    else:
        regions = []
        for i in range(len(cent) // 2):
            si = i * 2
            regions.append('{:s}-{:s}'.format(
                bth_util.binSizeToStr(cent[si]),
                bth_util.binSizeToStr(cent[si + 1])))
        centStr = '; '.join(regions)

    # header line written at the top of the output file
    info = '#from_script: epigenotyping_pe_v9.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}; scale_transitions:{:s}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        formatClassProbs(classProbs).lower(), combineBins, centStr,
        str(scaleTransitions))
    if isPrint:
        print('Weighted methylation file:', os.path.basename(inFileStr))
        print('Bin size:', bth_util.binSizeToStr(binSize))
        print('Mother label(s):', parentLabelAr[0])
        print('Father label(s):', parentLabelAr[1])
        print('Classification probabilities:', formatClassProbs(classProbs))
        print('Decoding algorithm:', formatDecoding(decoding))
        print('Combine bin feature threshold:', combineBins)
        print('Scale transitions by sample size:', scaleTransitions)
        print('Centromere:', centStr)

    # build dataframe
    if isPrint:
        print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)

    # check parent labels; transition estimation ignores parent and MPV rows
    newParentLabelAr = checkParents(df['sample'], parentLabelAr)
    tIgnoreAr = flattenList(newParentLabelAr[:2])
    for i in range(len(newParentLabelAr[0])):
        tIgnoreAr += ['MPV{:d}'.format(i)]

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # convert centromere coordinates to bin indices; these bins are handed
    # to the decoder separately
    if cent is None:
        centBins = []
    else:
        cent = [x // binSize for x in cent]
        centBins = []
        for i in range(len(cent) // 2):
            si = i * 2
            centBins += list(range(cent[si], cent[si + 1] + 1))

    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        if isPrint:
            print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])

    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        info += '; non-functional_bins:{:d}'.format(nbins - newNBins)
        if isPrint:
            print('combined {:d} non-functional bins'.format(nbins - newNBins))

    # classify by bin; NOTE: the printed count is the pre-merge bin total
    if isPrint:
        print(' Classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, newParentLabelAr,
                                classProbs)
    dfClass.reset_index(inplace=True)
    # free the raw table; only classification results are needed below
    del df, dfBinGroup
    # decode, if necessary
    if decoding != 'N':
        transition = Transitions(dfClass, ignore=tIgnoreAr)
        transitionMatrix = transition.getTransitions()

        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)
        # optionally shrink transitions to account for the ignored samples
        if scaleTransitions:
            scaleFactor = float(nsamples - len(tIgnoreAr) -
                                1) / float(nsamples - len(tIgnoreAr))
        else:
            scaleFactor = 1
        # 'B' = forward-backward pass first, Viterbi afterwards
        tmpDecoding = ('F' if decoding == 'B' else decoding)
        if isPrint:
            print(' {:s} decoding {:d} samples with {:d} processors'.format(
                formatDecoding(tmpDecoding), nsamples, numProc))

        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                               tmpDecoding, centBins, scaleFactor)

        if decoding == 'B':
            # re-estimate transitions from the forward-backward output, then
            # run Viterbi with the refined matrix
            dfNew = dfOutput.loc[:, ['bin', 'sample']].copy()
            dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
            dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
            dfNew['father'] = np.log(dfOutput['fb.score.father'])
            dfNew['prediction'] = dfOutput['fb.prediction']
            transition = Transitions(dfNew, ignore=tIgnoreAr)
            transitionMatrix = transition.getTransitions()
            dfSampleGroup = dfNew.groupby('sample')
            nsamples = len(dfSampleGroup.groups)

            if isPrint:
                print(
                    ' {:s} decoding {:d} samples with {:d} processors'.format(
                        formatDecoding('V'), nsamples, numProc))
            dfOutputN = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                                    'V', centBins, scaleFactor)
            # copy the Viterbi columns next to the forward-backward ones
            vitCols = [
                'vit.score.mother', 'vit.score.father', 'vit.score.MPV',
                'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV',
                'vit.prediction'
            ]
            dfOutput[vitCols] = dfOutputN[vitCols]
        # end decoding == B
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del dfSampleGroup
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         classProbs, scaleTransitions,
                                         combineBins)
    # if bins were combined, propagate predictions back to the merged bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)
    if isPrint:
        print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')

    if isPrint:
        print('Done')
コード例 #20
0
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, combineBins, maxIter):
    """Classify genomic bins by epigenotype and decode each sample while
    iteratively optimizing the transition matrix.

    Reads a weighted-methylation table, assigns bins (pos // binSize),
    optionally merges low-feature bins, classifies each bin, then -- unless
    decoding == 'N' -- estimates initial transitions and decodes every
    sample, refining the transition matrix for up to maxIter iterations.
    All results are written to tab-separated files; returns None.
    """
    # header line written at the top of every output file
    info = '#from_script: epigenotyping_combin_smp-iter-trans.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; maximum_iterations:{:d}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        str(isUniform).lower(), combineBins, maxIter)
    print('Weighted methylation file:', os.path.basename(inFileStr))
    print('Bin size:', bth_util.binSizeToStr(binSize))
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', formatDecoding(decoding))
    print('Combine bin feature threshold:', combineBins)
    print('Maximum transition matrix iterations:', maxIter)

    # build dataframe
    print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)

    # check parent labels
    checkParents(df['sample'], parentLabelAr)

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])

    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        print('combined {:d} non-functional bins'.format(nbins - newNBins))

    # classify by bin
    # NOTE: the printed count is the pre-merge bin total
    print(' Classifying {:d} bins with {:d} processors'.format(nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, parentLabelAr, isUniform)
    dfClass.reset_index(inplace=True)
    #print( dfClass.head )
    # free the raw table; only classification results are needed below
    del (df, dfBinGroup)
    # decode, if necessary
    if decoding != 'N':
        # parent and mid-parent (MPV) samples are excluded from transition
        # estimation
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print(' Obtaining initial transitions')
        transition = Transitions(dfClass, ignore=ignoreAr)
        transitionMatrix = transition.getTransitions()
        # transition log file: header written here; presumably runDecoding
        # appends per-iteration matrices to it -- confirm
        outFStr = determineTransFileName(inFileStr, outID, binSize,
                                         combineBins)
        with open(outFStr, 'w') as f:
            f.write(info + '\n')
        # group by sample
        #print(dfClass.head())
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)

        print(
            ' {:s} decoding and optimizing transition matrices for {:d} samples with {:d} processors'
            .format(formatDecoding(decoding), nsamples, numProc))
        ## note: decoding will now include improved transition matrix calculations
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                               decoding, maxIter, outFStr)
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del (dfSampleGroup)
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform, combineBins)
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        # without combining, just drop the helper column before writing
        dfOutputT = dfOutput.drop('cBin', axis=1)
    print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')

    print('Done')
コード例 #21
0
ファイル: value_iterator.py プロジェクト: vjache/bellman
 def __init__(self, target_position):
     """Set up the value-iteration components for the given goal position."""
     # goal the agent is trying to reach
     self.target_position = target_position
     # reward model is tied to the same goal position
     self._rewards = Rewarder(target_position)
     # environment dynamics plus the two value tables filled in by iteration
     self._tran = Transitions()
     self._q_tab = QTable()
     self._v_tab = VTable()
コード例 #22
0
ファイル: FiniteAutomata.py プロジェクト: AndreiJeler/FLCD
class FiniteAutomatan:
    """Finite automaton parsed from a text description file.

    Expected file layout, one item per line: states, alphabet, initial
    state, final states, then one transition per line as
    ``<from> <to> <symbol> [<symbol> ...]``.
    """

    def __init__(self, file_name):
        self.file_name = file_name
        self.Q = set()              # states
        self.E = set()              # alphabet symbols
        self.q0 = None              # initial state
        self.F = set()              # final (accepting) states
        self.delta = Transitions()  # transition relation
        self.read_file()

    def read_file(self):
        """Load and validate the automaton from ``self.file_name``.

        Raises Exception when the initial state, a final state, a
        transition endpoint, or a symbol was not declared earlier on.
        """
        with open(self.file_name, 'r') as file:
            self.Q = file.readline().strip('\n').split(' ')
            self.E = file.readline().strip('\n').split(' ')
            self.q0 = file.readline().strip('\n')

            if self.q0 not in self.Q:
                raise Exception("Invalid initial state " + self.q0)

            self.F = file.readline().strip('\n').split(' ')
            for final_state in self.F:
                if final_state not in self.Q:
                    raise Exception("Invalid final state " + final_state)

            # remaining lines are transitions; stop at the first empty line
            line = file.readline().strip('\n')
            while line:
                parts = line.split(' ')
                src, dst = parts[0], parts[1]

                if src not in self.Q or dst not in self.Q:
                    raise Exception("Invalid transition " + src + "->" +
                                    dst + ", one state is invalid")

                for symbol in parts[2:]:
                    if symbol not in self.E:
                        raise Exception("Invalid symbol " + symbol +
                                        " in transition " + ' '.join(parts))
                    self.delta.add_transition(src, dst, symbol)

                line = file.readline().strip('\n')

    def print_states(self):
        """Print all states, one per line."""
        print("List of states:")
        for q in self.Q:
            print(q)

    def print_alphabet(self):
        """Print every alphabet symbol, one per line."""
        print("The alphabet of the FA:")
        for symbol in self.E:
            print(symbol)

    def print_initial_state(self):
        """Print the initial state."""
        print("The initial state of the FA: ", self.q0)

    def print_final_states(self):
        """Print all accepting states, one per line."""
        print("The list of final states:")
        for q in self.F:
            print(q)

    def print_transitions(self):
        """Print the transition relation."""
        print("The list of transitions:")
        print(self.delta)

    def is_deterministic(self):
        """Return True when no state has two transitions on the same symbol."""
        for state in self.Q:
            outgoing = self.delta.get_transitions_with_start(state)
            symbols = [t[1] for t in outgoing]
            # a repeated symbol means two choices from the same state
            if len(symbols) != len(set(symbols)):
                return False
        return True

    def is_accepted(self, string):
        """Simulate the DFA on ``string``; True when it ends in a final state.

        Raises Exception when the automaton is not deterministic.
        """
        if not self.is_deterministic():
            raise Exception("The FA is not deterministic")

        current = self.q0
        for symbol in string:
            options = self.delta.get_transitions_with_start(current)
            current = None
            # deterministic, so at most one option matches this symbol
            for option in options:
                if option[1] == symbol:
                    current = option[0]
            if current is None:
                return False
        return current in self.F
コード例 #23
0
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, combineBins, maxIter=0):
    """Classify bins by epigenotype and decode with per-bin transition matrices.

    Reads a weighted-methylation table, bins positions, optionally merges
    low-feature bins, classifies each bin, and -- unless decoding == 'N' --
    decodes every sample using one transition matrix per bin, optionally
    refined by AdaptiveTransitions. Writes a tab-separated output file;
    returns None.

    maxIter is new with a backward-compatible default of 0 (no iterative
    refinement): the original body referenced maxIter and trInfo without
    ever defining them, which raised NameError.
    """
    # header line written at the top of the output file
    info = '#from_script: epigenotyping_pe_combbin.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        str(isUniform).lower(), combineBins)
    print('Weighted methylation file:', os.path.basename(inFileStr))
    print('Bin size:', bth_util.binSizeToStr(binSize))
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', formatDecoding(decoding))
    print('Combine bin feature threshold:', combineBins)

    # build dataframe
    print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)

    # check parent labels
    checkParents(df['sample'], parentLabelAr)

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])

    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        print('combined {:d} non-functional bins'.format(nbins - newNBins))

    # classify by bin; NOTE: the printed count is the pre-merge bin total
    print(' Classifying {:d} bins with {:d} processors'.format(nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, parentLabelAr, isUniform)
    dfClass.reset_index(inplace=True)
    # free the raw table; only classification results are needed below
    del df, dfBinGroup
    # decode, if necessary
    if decoding != 'N':
        # parent and mid-parent (MPV) samples are excluded from transition
        # estimation
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print(' Obtaining initial transitions')
        transition = Transitions(dfClass, ignore=ignoreAr)
        transitionMatrix = transition.getTransitions()
        # one copy of the genome-wide matrix per bin
        transitionMatrixArray = np.array(
            [np.copy(transitionMatrix) for _ in range(nbins)])

        if maxIter > 0:
            print(' Iteratively improving transitions with maximum', maxIter,
                  'iterations')
            at = AdaptiveTransitions(dfClass, transitionMatrixArray, ignoreAr,
                                     maxIter)
            # NOTE(review): the matrix returned here is not handed to
            # runDecoding below -- presumably AdaptiveTransitions refines
            # transitionMatrixArray in place; confirm.
            iterations, transitionMatrix = at.run()
            # fixed: trInfo was previously used without being initialized
            trInfo = '; iterations_to_convergence:'
            if iterations == maxIter:
                trInfo += 'NA'
                # fixed: message hard-coded "10" regardless of maxIter
                print('  Did not converge in {:d} iterations'.format(maxIter))
            else:
                trInfo += str(iterations)
                print('  Convergence in', iterations, 'iterations')
            # record convergence in the output header (previously discarded)
            info += trInfo

        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)

        print(' {:s} decoding {:d} samples with {:d} processors'.format(
            formatDecoding(decoding), nsamples, numProc))
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrixArray,
                               decoding)
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del dfSampleGroup
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform, combineBins)
    # if bins were combined, propagate predictions back to the merged bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)
    print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')

    print('Done')
コード例 #24
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter ):
	"""Classify genomic bins by epigenotype and decode each sample while
	iteratively optimizing the transition matrix.

	Reads a weighted-methylation table, assigns bins (pos // binSize),
	optionally merges low-feature bins, classifies each bin, then -- unless
	decoding == 'N' -- estimates initial transitions and decodes every
	sample, refining the transition matrix for up to maxIter iterations.
	All results are written to tab-separated files; returns None.
	"""
	# header line written at the top of every output file
	info = '#from_script: epigenotyping_combin_smp-iter-trans.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; maximum_iterations:{:d}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins, maxIter )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	print( 'Combine bin feature threshold:', combineBins )
	print( 'Maximum transition matrix iterations:', maxIter )
	
	# build dataframe
	print( ' Reading input file', os.path.basename( inFileStr ) )
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None
	
	# combine bins if necessary
	nbins = max(df['bin'])+1
	if combineBins > 0:
		print( ' Merging bins', end=' ... ' )
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
	
	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len(dfBinGroup.groups )
		print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
	
	# classify by bin
	# NOTE: the printed count is the pre-merge bin total
	print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	#print( dfClass.head )
	# free the raw table; only classification results are needed below
	del(df, dfBinGroup )
	# decode, if necessary
	if decoding != 'N':
		# parent and mid-parent (MPV) samples are excluded from transition estimation
		ignoreAr = parentLabelAr[:2] + ['MPV']
		print( ' Obtaining initial transitions' )
		transition = Transitions( dfClass, ignore = ignoreAr )
		transitionMatrix = transition.getTransitions()
		# transition log file: header written here; presumably runDecoding
		# appends per-iteration matrices to it -- confirm
		outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
		with open( outFStr, 'w' ) as f:
			f.write(info+'\n')
		# group by sample
		#print(dfClass.head())
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		
		print( ' {:s} decoding and optimizing transition matrices for {:d} samples with {:d} processors'.format(  formatDecoding(decoding), nsamples, numProc ) )
		## note: decoding will now include improved transition matrix calculations
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding, maxIter, outFStr )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# without combining, just drop the helper column before writing
		dfOutputT = dfOutput.drop('cBin', axis=1)
	print( ' Writing output to', outFileStr )
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
	
	print( 'Done' )