def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform,
                            combineBins, scaleFactor):
    """Build the name of the epigenotype results file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the results tag,
    or a generic ``out`` prefix when the base name has no such marker.
    Option suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb`` and
            any other value adds ``vit-fb``.
        isUniform: when truthy, ``uni`` is added.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.
        scaleFactor: values other than 1 add ``_s<scaleFactor>`` with '.'
            written as '-'.

    Returns:
        The output file name as a string.
    """
    tag = '_epigenotype-results_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # scale factor
    if scaleFactor != 1:
        scaleStr = str(scaleFactor).replace('.', '-')
        outFileStr = outFileStr.replace('.tsv', '_s{:s}.tsv'.format(scaleStr))

    # decoding and uniform
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        else:
            decodeStr = 'vit-fb'
        suffix = ('_uni-' if isUniform else '_') + decodeStr
    elif isUniform:
        suffix = '_uni'
    else:
        suffix = ''
    if suffix:
        outFileStr = outFileStr.replace('.tsv', suffix + '.tsv')

    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, classProbs,
                            combineBins):
    """Build the name of the epigenotype v7.1 output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the v7.1 tag, or
    a generic ``out`` prefix when the base name has no such marker. Option
    suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb``,
            'B' adds ``both`` and any other value adds ``vit-fb``.
        classProbs: 'U' adds ``uni``, 'E' adds ``epiril``; other values add
            nothing.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.

    Returns:
        The output file name as a string.
    """
    tag = '_epigenotype-v7.1_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # class-probability and decoding labels, joined as "<probs>-<decoding>"
    labels = []
    if classProbs == 'U':
        labels.append('uni')
    elif classProbs == 'E':
        labels.append('epiril')
    if decoding != 'N':
        if decoding == 'V':
            labels.append('vit')
        elif decoding == 'F':
            labels.append('fb')
        elif decoding == 'B':
            labels.append('both')
        else:
            labels.append('vit-fb')
    if labels:
        outFileStr = outFileStr.replace(
            '.tsv', '_{:s}.tsv'.format('-'.join(labels)))

    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, generation,
                            combineBins):
    """Build the name of the epigenotype v7.4 output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the v7.4 tag, or
    a generic ``out`` prefix when the base name has no such marker. Option
    suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb``,
            'B' adds ``both`` and any other value adds ``vit-fb``.
        generation: integer written into the name as ``g-<generation>``.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.

    Returns:
        The output file name as a string.
    """
    tag = '_epigenotype-v7.4_{:s}_g-{:d}'.format(
        bth_util.binSizeToStr(binSize), generation)
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # decoding
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        elif decoding == 'B':
            decodeStr = 'both'
        else:
            decodeStr = 'vit-fb'
        outFileStr = outFileStr.replace(
            '.tsv', '_{:s}.tsv'.format(decodeStr))

    return outFileStr
예제 #4
0
def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform,
                            combineBins):
    """Build the name of the epigenotype output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the epigenotype
    tag, or a generic ``out`` prefix when the base name has no such marker.
    Option suffixes are then inserted in front of ``.tsv``.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb`` and
            any other value adds ``vit-fb``.
        isUniform: when truthy, ``uni`` is added.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.

    Returns:
        The output file name as a string.
    """
    tag = '_epigenotype_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # decoding and uniform
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        else:
            decodeStr = 'vit-fb'
        suffix = ('_uni-' if isUniform else '_') + decodeStr
    elif isUniform:
        suffix = '_uni'
    else:
        suffix = ''
    if suffix:
        outFileStr = outFileStr.replace('.tsv', suffix + '.tsv')

    return outFileStr
def determineTransFileName(inFileStr, outID, binSize, combineBins,
                           scaleFactor):
    """Build the name of the epigenotype transition file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the transition
    tag, or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.
        scaleFactor: values other than 1 add ``_s<scaleFactor>`` with '.'
            written as '-'.

    Returns:
        The transition file name as a string.
    """
    tag = '_epigenotype-trans_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # scale factor
    if scaleFactor != 1:
        scaleStr = str(scaleFactor).replace('.', '-')
        outFileStr = outFileStr.replace('.tsv', '_s{:s}.tsv'.format(scaleStr))

    return outFileStr
예제 #6
0
def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform):
    """Build the name of the logistic-regression output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the logreg tag,
    or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb`` and
            any other value adds ``vit_fb``.
        isUniform: when truthy, ``uni`` is added.

    Returns:
        The output file name as a string.
    """
    tag = '_logreg_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # decoding and uniform
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        else:
            decodeStr = 'vit_fb'
        suffix = ('_uni-' if isUniform else '_') + decodeStr
    elif isUniform:
        suffix = '_uni'
    else:
        suffix = ''
    if suffix:
        outFileStr = outFileStr.replace('.tsv', suffix + '.tsv')
    return outFileStr
def determineTransFileName(inFileStr, outID, binSize, combineBins):
    """Build the name of the transition output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the transition
    tag, or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.

    Returns:
        The transition file name as a string, whether or not bins are
        combined.
    """
    tag = '_transition_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # The return must sit outside the branch above; nested inside it, the
    # function yielded None whenever combineBins <= 0.
    return outFileStr
예제 #8
0
def processInputs(regionFileStr, bedFileAr, numProc, isStrand, isCSSR,
                  cssrDist, outID):
    """Process each BED file against the regions and write one RPM table.

    Args:
        regionFileStr: path of the region file; read with ``readCSSRFile``
            when ``isCSSR`` is set, otherwise with ``readRegionFile``.
        bedFileAr: BED file paths; each is handed to ``processBedFile`` in a
            worker process.
        numProc: number of worker processes.
        isStrand: strand flag passed to ``readRegionFile``; also selects the
            ``_rpm_stranded`` output name.
        isCSSR: selects CSSR reading and the ``_rpm_cssr_<dist>`` output name.
        cssrDist: distance passed to ``readCSSRFile`` and written into the
            CSSR output name.
        outID: output identifier; None falls back to
            ``bth_util.fileBaseName(regionFileStr)``.
    """
    sampleNamesAr = getSampleNames(bedFileAr)
    # read region file
    print('Reading region file {:s}'.format(os.path.basename(regionFileStr)))
    if isCSSR:
        regionAr = readCSSRFile(regionFileStr, cssrDist)
    else:
        regionAr = readRegionFile(regionFileStr, isStrand)

    # process BED files
    useStrand = isStrand or isCSSR
    if outID is None:
        outID = bth_util.fileBaseName(regionFileStr)

    print('Begin processing with {:d} processors'.format(numProc))
    pool = multiprocessing.Pool(processes=numProc)
    try:
        results = [pool.apply_async(processBedFile,
                                    args=(f, regionAr, useStrand))
                   for f in bedFileAr]
        outDictMat = [p.get() for p in results]
    finally:
        # Release the worker processes even when a task raises.
        pool.close()
        pool.join()

    if isCSSR:
        outFileStr = '{:s}_rpm_cssr_{:s}.tsv'.format(
            outID, bth_util.binSizeToStr(cssrDist))
    elif isStrand:
        outFileStr = '{:s}_rpm_stranded.tsv'.format(outID)
    else:
        outFileStr = '{:s}_rpm.tsv'.format(outID)
    print('Writing output to', outFileStr)
    writeOutput(outFileStr, regionAr, outDictMat, sampleNamesAr, useStrand)
    print('Done')
def determineOutputFileName(inFileStr, outID, binSize, isSmoothing, isUniform):
    """Build the name of the logistic-regression output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the logreg tag,
    or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        isSmoothing: when truthy, ``opt`` is added.
        isUniform: when truthy, ``uni`` is added (``uni-opt`` with smoothing).

    Returns:
        The output file name as a string.
    """
    tag = '_logreg_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    labels = []
    if isUniform:
        labels.append('uni')
    if isSmoothing:
        labels.append('opt')
    if labels:
        outFileStr = outFileStr.replace(
            '.tsv', '_{:s}.tsv'.format('-'.join(labels)))
    return outFileStr
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, isIndiv):
    """Classify bins by logistic regression, optionally decode, and write.

    Reads the weighted-methylation table, assigns each position to a bin of
    ``binSize``, classifies every bin (in parallel when ``numProc`` > 1),
    runs the requested decoding per sample unless ``decoding`` is 'N', and
    writes a header line followed by the tab-separated results.
    """
    if decoding == 'V':
        decodeName = 'Viterbi'
    elif decoding == 'F':
        decodeName = 'Forward-backward'
    elif decoding == 'A':
        decodeName = 'Viterbi and Forward-backward'
    else:
        decodeName = 'None'

    inBaseName = os.path.basename(inFileStr)
    binSizeStr = bth_util.binSizeToStr(binSize)
    header = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; indiv_transitions:{:s}\n'.format(
        inBaseName, binSizeStr, decodeName.lower().replace(' and ', ','),
        str(isUniform), str(isIndiv))
    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binSizeStr)
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', decodeName)
    print('Individual transition probabilities:', str(isIndiv))

    # load the table and make sure the parent samples are present
    methylDf = pd.read_table(inFileStr, header=1)
    checkParents(methylDf['sample'], parentLabelAr)

    # assign bins, then classify bin by bin
    methylDf['bin'] = methylDf.pos // binSize
    nbins = max(methylDf['bin']) + 1
    binGroups = methylDf.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
        classDf = runMultiClassification(binGroups, numProc, parentLabelAr,
                                         isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        classDf = binGroups.apply(classLogReg, pla=parentLabelAr, u=isUniform)
    classDf.reset_index(inplace=True)

    if decoding == 'N':
        # no decoding requested: the classification is the final result
        outDf = classDf
    else:
        ignoreAr = parentLabelAr + ['MPV']
        if isIndiv:
            transMat = np.array([])
        else:
            print('Generating transition matrix')
            transObj = Transitions(classDf, ignore=ignoreAr)
            transMat = transObj.getTransitions()
            print(transMat)
        # decode every sample separately
        sampleGroups = classDf.groupby('sample')
        nsamples = len(sampleGroups.groups)

        if numProc > 1:
            print('Begin {:s} decoding {:d} samples with {:d} processors'.format(
                decodeName, nsamples, numProc))
            outDf = runMultiPath(sampleGroups, numProc, transMat, isUniform,
                                 decoding)
        else:
            print('Begin {:s} decoding {:d} samples'.format(
                decodeName, nsamples))
            outDf = sampleGroups.apply(findOptimalPath, trans=transMat,
                                       u=isUniform, d=decoding)
        outDf.set_index(['bin', 'sample'], inplace=True)

    # header line first, then the table appended to the same file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform, isIndiv)
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as outFile:
        outFile.write(header)
    outDf.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
예제 #11
0
def printHelp():
    """Print command-line usage for epigenotyping_pe_combbin.py.

    Default values shown in the text are read from the module-level
    ``DECODE``, ``NUMPROC``, ``COMBINE`` and ``BINSIZE`` constants.
    """
    print(
        'Usage:\npython epigenotyping_pe_combbin.py [-u] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] <input_file>'
    )
    print('Required:')
    print(
        'input_file\tfile of weighted methylation by position for samples')
    print('Optional:')
    print(
        '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]'
    )
    print(
        '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'
        .format(DECODE))
    print(
        '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]'
    )
    # the closing bracket after each default value was missing
    print('-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC))
    print(
        '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'
        .format(COMBINE))
    print(
        '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]'
    )
    print(
        '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]'
    )
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
예제 #12
0
def printHelp():
    """Print command-line usage for epigenotyping_pe.py.

    The default bin size shown is read from the module-level ``BINSIZE``
    constant.
    """
    print(
        'Usage: python epigenotyping_pe.py [-u] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_samples] [-mx=add_mother_labels] [-f=father_samples] [-fx=add_father_labels] [-b=bin_size] <input_file>'
    )
    print('Required:')
    print(
        'input_file\tfile of weighted methylation by position for samples')
    print('Optional:')
    print(
        '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]'
    )
    print(
        '-d=decode_type\tdecoding type to use (capitalization ignored) [default A]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'
    )
    print(
        '-o=out_id\tidentifier for output file [default out or variation of\n\t\tinput file name]'
    )
    print('-p=num_proc\tnumber of processors')
    print(
        '-m=mother_labels\tsample name(s) of mother; for correct classification\n\t\tand MPV calculation [default mother]'
    )
    print(
        '-mx=add_mother_labels\tadditional samples to train as mother\n\t\tnot used for MPV calculation'
    )
    print(
        '-f=father_labels\tsample name(s) of father; for correct classification\n\t\tand MPV calculation [default father]'
    )
    # the -fx option was labelled add_mother_labels, contradicting the usage
    # line and its own description
    print(
        '-fx=add_father_labels\tadditional samples to train as father\n\t\tnot used for MPV calculation'
    )
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
예제 #13
0
def printHelp():
    """Print command-line usage for epigenotyping_pe_v7.2.py.

    Default values shown in the text are read from the module-level
    ``DECODE``, ``NUMPROC``, ``COMBINE`` and ``BINSIZE`` constants.
    """
    print(
        'Usage:\npython epigenotyping_pe_v7.2.py [-q] [-g=generation]  [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample(s)] [-f=father_sample(s)] [-b=bin_size] <input_file>'
    )
    print('Required:')
    print(
        'input_file\tfile of weighted methylation by position for samples')
    print('Optional:')
    print('-q\t\tquiet, do not print progress')
    print(
        '-g=generation\tgeneration of self-crossing; used to determine\n\t\tclassification probabilities; use 0 for uniform weight\n\t\t[default 2]'
    )
    print(
        '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'
        .format(DECODE))
    print(
        '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]'
    )
    # the closing bracket after each default value was missing
    print('-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC))
    print(
        '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'
        .format(COMBINE))
    print(
        '-m=mother_samples\tsample name(s) of mother; for correct classification\n\t\t[default mother]'
    )
    print(
        '-f=father_samples\tsample name(s) of father; for correct classification\n\t\t[default father]'
    )
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  parentAddLabelAr, decoding, isUniform):
    """Classify bins, optionally decode per sample, and write the results.

    Reads the weighted-methylation table, checks the parent (and any
    additional training) labels against its samples, classifies each bin of
    ``binSize``, decodes per sample unless ``decoding`` is 'N', and writes a
    header line followed by the tab-separated table.
    """
    inBaseName = os.path.basename(inFileStr)
    binSizeStr = bth_util.binSizeToStr(binSize)
    decodeName = formatDecoding(decoding)
    hasAddLabels = (len(parentAddLabelAr[0]) != 0
                    or len(parentAddLabelAr[1]) != 0)

    header = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; mother_samples:{:s}; father_samples:{:s}'.format(
        inBaseName, binSizeStr, decodeName.lower().replace('and', ','),
        str(isUniform).lower(), ','.join(parentLabelAr[0]),
        ','.join(parentLabelAr[1]))
    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binSizeStr)
    print('Mother label(s):', ', '.join(parentLabelAr[0]))
    print('Father label(s):', ', '.join(parentLabelAr[1]))
    if hasAddLabels:
        if len(parentAddLabelAr[0]) == 0:
            addMotherStr = 'None'
        else:
            addMotherStr = ', '.join(parentAddLabelAr[0])
        if len(parentAddLabelAr[1]) == 0:
            addFatherStr = 'None'
        else:
            addFatherStr = ', '.join(parentAddLabelAr[1])
        print('Additional mother training label(s):', addMotherStr)
        print('Additional father training label(s):', addFatherStr)
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', decodeName)

    # load the table
    methylDf = pd.read_table(inFileStr, header=1)

    # validate parent labels, then any additional training labels
    parentLabelAr = checkParents(methylDf['sample'], parentLabelAr)
    if hasAddLabels:
        parentAddLabelAr = checkParents(methylDf['sample'], parentAddLabelAr)

    # assign each position to a bin
    methylDf['bin'] = methylDf.pos // binSize
    nbins = max(methylDf['bin']) + 1
    binGroups = methylDf.groupby('bin')

    # classify bin by bin
    print('Begin classifying {:d} bins with {:d} processors'.format(
        nbins, numProc))
    classDf = runClassification(binGroups, numProc, parentLabelAr,
                                parentAddLabelAr, isUniform)
    classDf.reset_index(inplace=True)
    del methylDf, binGroups

    if decoding == 'N':
        # no decoding requested: the classification is the final result
        outDf = classDf
    else:
        allParentLabels = [parentLabelAr[0] + parentAddLabelAr[0],
                           parentLabelAr[1] + parentAddLabelAr[1]]
        ignoreAr = flattenList(allParentLabels) + ['MPV']
        transObj = Transitions(classDf, ignore=ignoreAr)
        transMat = transObj.getTransitions()

        # decode every sample separately
        sampleGroups = classDf.groupby('sample')
        nsamples = len(sampleGroups.groups)

        print('Begin {:s} decoding {:d} samples with {:d} processors'.format(
            formatDecoding(decoding), nsamples, numProc))
        outDf = runDecoding(sampleGroups, numProc, transMat, decoding)
        outDf.set_index(['bin', 'sample'], inplace=True)
        del sampleGroups

    # header line first, then the table appended to the same file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform)
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as outFile:
        outFile.write(header + '\n')
    outDf.to_csv(outFileStr, sep='\t', mode='a')

    print('Done')
def determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform):
    """Build the name of the logistic-regression output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the logreg tag,
    or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb`` and
            any other value adds ``vit_fb``.
        isUniform: when truthy, ``uni`` is added.

    Returns:
        The output file name as a string.
    """
    tag = '_logreg_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # decoding and uniform
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        else:
            decodeStr = 'vit_fb'
        suffix = ('_uni-' if isUniform else '_') + decodeStr
    elif isUniform:
        suffix = '_uni'
    else:
        suffix = ''
    if suffix:
        outFileStr = outFileStr.replace('.tsv', suffix + '.tsv')
    return outFileStr
예제 #16
0
def processInputs(gffFileStrAr, fastaIndexStr, labels, calcType, outID,
                  chrmList, numProc, numBins, binSize):
    """Process each GFF file per chromosome bin and write one table.

    Args:
        gffFileStrAr: GFF file paths; each is handed to ``processGFFFile``
            in a worker process.
        fastaIndexStr: FASTA index path, read with ``readFastaIndex``.
        labels: sample labels; None falls back to
            ``getSampleNames(gffFileStrAr)``.
        calcType: 'l' puts ``length`` in the output name, anything else
            ``number``; 'n' makes the header unit ``number``, anything else
            ``total bp``.
        outID: output identifier; an empty string adds nothing to the name.
        chrmList: chromosome list passed to ``readFastaIndex``.
        numProc: number of worker processes.
        numBins: bin count, or -1 when not used.
        binSize: bin size, or -1 when not used.
    """
    if labels is None:
        labels = getSampleNames(gffFileStrAr)

    chrmDict = readFastaIndex(fastaIndexStr, chrmList)
    print('Read FASTA index.')
    aNumBins, chrmDict = determineNumBins(chrmDict, numBins, binSize)

    # output file name
    outFileStr = 'chrm_gff'
    if outID != '':
        outFileStr += '_' + outID
    outFileStr += '_' + ('length' if calcType == 'l' else 'number')
    if numBins != -1:
        outFileStr += '_n{:d}'.format(numBins)
    elif binSize != -1:
        outFileStr += '_{:s}'.format(bth_util.binSizeToStr(binSize))
    outFileStr += '.tsv'

    print('Begin processing with {:d} processors'.format(numProc))
    pool = multiprocessing.Pool(processes=numProc)
    try:
        results = [pool.apply_async(processGFFFile,
                                    args=(f, chrmDict, binSize, aNumBins,
                                          calcType))
                   for f in gffFileStrAr]
        gffDictAr = [p.get() for p in results]
    finally:
        # Release the worker processes even when a task raises.
        pool.close()
        pool.join()

    # header line describing the run
    info = "#from_script:chrom_plot_gff_pe.py; "
    gffTmpAr = [os.path.basename(x) for x in gffFileStrAr]
    info += 'gff_files:' + ','.join(gffTmpAr) + ';'
    if binSize == -1:
        info += "num_bins:{:d}".format(aNumBins)
    else:
        info += "bin_size:{:s}".format(bth_util.binSizeToStr(binSize))
    info += "; num_chrms:{:d};".format(len(chrmDict.keys()))
    info += " unit:{:s} per ".format(
        'number' if calcType == 'n' else 'total bp')
    if binSize == -1:
        info += '10kb'
    elif (binSize // 1000000) > 0:
        info += str(binSize / 1000000) + 'mbp'
    elif (binSize // 1000) > 0:
        info += str(binSize / 1000) + 'kbp'
    else:
        info += str(binSize) + 'bp'

    print('Writing output to {:s}...'.format(outFileStr))
    writeOutput(outFileStr, gffDictAr, labels, info)
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  isSmoothing, isUniform):
    """Classify bins by logistic regression, optionally smooth, and write.

    Reads the weighted-methylation table, assigns each position to a bin of
    ``binSize``, classifies every bin (in parallel when ``numProc`` > 1),
    smooths per sample when ``isSmoothing`` is set, and writes a header line
    followed by the tab-separated results.
    """
    inBaseName = os.path.basename(inFileStr)
    binSizeStr = bth_util.binSizeToStr(binSize)
    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binSizeStr)
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Smoothing:', str(isSmoothing))
    print('Uniform classification probabilities:', str(isUniform))
    header = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}'.format(
        inBaseName, binSizeStr)
    header = header + '; smoothing:{:s}; uni_class_prob:{:s}\n'.format(
        str(isSmoothing), str(isUniform))

    # load the table and make sure the parent samples are present
    methylDf = pd.read_table(inFileStr, header=1)
    checkParents(methylDf['sample'], parentLabelAr)

    # assign bins, then classify bin by bin
    methylDf['bin'] = methylDf.pos // binSize
    nbins = max(methylDf['bin']) + 1
    binGroups = methylDf.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
        classDf = runMultiprocessing(binGroups, numProc, parentLabelAr,
                                     isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        classDf = binGroups.apply(classLogRegImproved, pla=parentLabelAr,
                                  u=isUniform)
    classDf.reset_index(inplace=True)

    if not isSmoothing:
        # no smoothing requested: the classification is the final result
        outDf = classDf
    else:
        ignoreAr = parentLabelAr + ['MPV']
        transObj = SimpleTransitions(classDf, ignore=ignoreAr)
        transProbMat = transObj.run()
        print(transProbMat)
        sampleGroups = classDf.groupby('sample')
        nsamples = len(sampleGroups.groups)

        # find the optimal path for each sample
        if numProc > 1:
            print('Begin smoothing {:d} samples with {:d} processors'.format(
                nsamples, numProc))
            outDf = runMulti(sampleGroups, numProc, transProbMat)
        else:
            print('Begin smoothing {:d} samples'.format(nsamples))
            outDf = sampleGroups.apply(findOptimalPath, trans=transProbMat)
        outDf.set_index(['bin', 'sample'], inplace=True)

    # header line first, then the table appended to the same file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize,
                                         isSmoothing, isUniform)
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as outFile:
        outFile.write(header)
    outDf.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def determineTransFileName(inFileStr, outID, binSize, combineBins, scaleFactor):
    """Build the name of the epigenotype transition file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the transition
    tag, or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.
        scaleFactor: values other than 1 add ``_s<scaleFactor>`` with '.'
            written as '-'.

    Returns:
        The transition file name as a string.
    """
    tag = '_epigenotype-trans_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # scale factor
    if scaleFactor != 1:
        scaleStr = str(scaleFactor).replace('.', '-')
        outFileStr = outFileStr.replace('.tsv', '_s{:s}.tsv'.format(scaleStr))

    return outFileStr
def determineOutputFileName(inFileStr, outID, binSize, decoding, generation,
                            combineBins):
    """Build the name of the epigenotype v7.4 output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the v7.4 tag, or
    a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        decoding: 'N' adds nothing, 'V' adds ``vit``, 'F' adds ``fb``,
            'B' adds ``both`` and any other value adds ``vit-fb``.
        generation: integer written into the name as ``g-<generation>``.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.

    Returns:
        The output file name as a string.
    """
    tag = '_epigenotype-v7.4_{:s}_g-{:d}'.format(
        bth_util.binSizeToStr(binSize), generation)
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace(
            '.tsv', '_cb-{:d}.tsv'.format(combineBins))

    # decoding
    if decoding != 'N':
        if decoding == 'V':
            decodeStr = 'vit'
        elif decoding == 'F':
            decodeStr = 'fb'
        elif decoding == 'B':
            decodeStr = 'both'
        else:
            decodeStr = 'vit-fb'
        outFileStr = outFileStr.replace(
            '.tsv', '_{:s}.tsv'.format(decodeStr))

    return outFileStr
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, maxIter):
    """Classify bins by logistic regression, optionally decode, and write.

    Reads the weighted-methylation table, assigns each position to a bin of
    ``binSize``, classifies every bin (in parallel when ``numProc`` > 1),
    runs the requested decoding per sample unless ``decoding`` is 'N', and
    writes a header line followed by the tab-separated results.
    ``maxIter`` is accepted but not used in this function.
    """
    if decoding == 'V':
        decodeName = 'Viterbi'
    elif decoding == 'F':
        decodeName = 'Forward-backward'
    elif decoding == 'A':
        decodeName = 'Viterbi and Forward-backward'
    else:
        decodeName = 'None'

    inBaseName = os.path.basename(inFileStr)
    binSizeStr = bth_util.binSizeToStr(binSize)
    header = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}\n'.format(
        inBaseName, binSizeStr, decodeName.lower().replace(' and ', ','),
        str(isUniform))
    print('Weighted methylation file:', inBaseName)
    print('Bin size:', binSizeStr)
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', decodeName)

    # load the table and make sure the parent samples are present
    methylDf = pd.read_table(inFileStr, header=1)
    checkParents(methylDf['sample'], parentLabelAr)

    # assign bins, then classify bin by bin
    methylDf['bin'] = methylDf.pos // binSize
    nbins = max(methylDf['bin']) + 1
    binGroups = methylDf.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
        classDf = runMultiClassification(binGroups, numProc, parentLabelAr,
                                         isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        classDf = binGroups.apply(classLogReg, pla=parentLabelAr, u=isUniform)
    classDf.reset_index(inplace=True)

    if decoding == 'N':
        # no decoding requested: the classification is the final result
        outDf = classDf
    else:
        ignoreAr = parentLabelAr + ['MPV']
        print('Generating transition matrix')
        transObj = Transitions(classDf, ignore=ignoreAr)
        transMat = transObj.getTransitions()
        # decode every sample separately
        sampleGroups = classDf.groupby('sample')
        nsamples = len(sampleGroups.groups)

        if numProc > 1:
            print('Begin {:s} decoding {:d} samples with {:d} processors'.format(
                decodeName, nsamples, numProc))
            outDf = runMultiPath(sampleGroups, numProc, transMat, isUniform,
                                 decoding)
        else:
            print('Begin {:s} decoding {:d} samples'.format(
                decodeName, nsamples))
            outDf = sampleGroups.apply(findOptimalPath, trans=transMat,
                                       u=isUniform, d=decoding)
        outDf.set_index(['bin', 'sample'], inplace=True)

    # header line first, then the table appended to the same file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform)
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as outFile:
        outFile.write(header)
    outDf.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def determineOutputFileName(inFileStr, outID, binSize, isSmoothing, isUniform):
    """Build the name of the logistic-regression output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the logreg tag,
    or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        isSmoothing: when truthy, ``opt`` is added.
        isUniform: when truthy, ``uni`` is added (``uni-opt`` with smoothing).

    Returns:
        The output file name as a string.
    """
    tag = '_logreg_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    if isSmoothing and isUniform:
        suffix = '_uni-opt'
    elif isSmoothing:
        suffix = '_opt'
    elif isUniform:
        suffix = '_uni'
    else:
        suffix = ''
    if suffix:
        outFileStr = outFileStr.replace('.tsv', suffix + '.tsv')
    return outFileStr
예제 #22
0
def determineTransFileName(inFileStr, outID, binSize, combineBins):
    """Build the name of the transition output file.

    The stem comes from ``outID`` when it is given. Otherwise the input
    file's base name is used with ``_wm_pos_`` replaced by the transition
    tag, or a generic ``out`` prefix when the base name has no such marker.

    Args:
        inFileStr: path of the input file.
        outID: output identifier, or None to derive the name from the input.
        binSize: bin size, rendered with ``bth_util.binSizeToStr``.
        combineBins: values greater than 0 add ``_cb-<combineBins>``.

    Returns:
        The transition file name as a string, whether or not bins are
        combined.
    """
    tag = '_transition_{:s}'.format(bth_util.binSizeToStr(binSize))
    baseName = os.path.basename(inFileStr)
    if outID is not None:
        outFileStr = '{:s}{:s}.tsv'.format(outID, tag)
    elif '_wm_pos_' in baseName:
        # Test the base name rather than the full path: a marker that only
        # occurs in a directory name cannot be replaced in the base name.
        outFileStr = baseName.replace('_wm_pos_', tag + '_')
    else:
        outFileStr = 'out{:s}.tsv'.format(tag)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv',
                                        '_cb-{:d}.tsv'.format(combineBins))

    # The return must sit outside the branch above; nested inside it, the
    # function yielded None whenever combineBins <= 0.
    return outFileStr
def parseInputs(argv):
    """Parse command-line arguments and hand them to ``processInputs``.

    Options are looked for in the first seven entries of ``argv``; the input
    file is taken from the position that follows the recognized options.

    Args:
        argv: argument list without the program name. Recognized options:
            ``-o=``, ``-b=``, ``-p=``, ``-m=``, ``-f=``, ``-n``, ``-u`` and
            the help flags ``-h``, ``--help``, ``-help``.

    The function exits after printing help, on an unknown option, or when no
    input file follows the options.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry records which parent labels were given: +1 mother, +2 father
    parentLabelAr = ['mother', 'father', 0]
    isSmoothing = True
    isUniform = False
    startInd = 0

    for i in range(min(7, len(argv))):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            # Count the option even when its value is invalid; otherwise the
            # input-file index below would point at this option.
            startInd += 1
            try:
                numProc = int(arg[3:])
            except ValueError:
                # report the value actually used instead of a fixed "1"
                print(
                    'WARNING: number of processors must be integer...using {}'
                    .format(NUMPROC))
                numProc = NUMPROC
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg == '-n':
            isSmoothing = False
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    # fail with a message rather than an IndexError when the file is missing
    if startInd >= len(argv):
        print('ERROR: input file not specified')
        exit()
    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  isSmoothing, isUniform)
예제 #24
0
def printHelp():
	"""Print the usage line and option descriptions to standard output."""
	print( 'Usage: python epigenotyping_pe.py [-u] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] <input_file>' )
	print( 'Required:' )
	print( 'input_file\tfile of weighted methylation by position for samples' )
	print( 'Optional:' )
	print( '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]' )
	print( '-d=decode_type\tdecoding type to use (capitalization ignored) [default A]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"' )
	print( '-o=out_id\tidentifier for output file [default out or variation of\n\t\tinput file name]' )
	print( '-p=num_proc\tnumber of processors' )
	print( '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]' )
	print( '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]' )
	print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
예제 #25
0
def processInputs(bedFileStrAr, fastaIndexStr, labels, outID, chrmList,
                  numProc, numBins, binSize, percentile):
    """Process each BED file in parallel and write one combined output file.

    bedFileStrAr  -- list of BED file paths; one worker task per file
    fastaIndexStr -- FASTA index path passed to readFastaIndex
    labels        -- sample labels; when None they are derived from the
                     BED file names with getSampleNames
    outID         -- optional identifier added to the output file name
    chrmList      -- chromosome list passed to readFastaIndex
    numProc       -- number of worker processes
    numBins       -- number of bins, or -1 when not used
    binSize       -- bin size, or -1 when not used
    percentile    -- fraction passed to processBEDFile; written to the
                     header as a percentage
    """
    if labels is None:
        labels = getSampleNames(bedFileStrAr)
    print('Reading FASTA index')
    chrmDict = readFastaIndex(fastaIndexStr, chrmList)

    aNumBins, chrmDict = determineNumBins(chrmDict, numBins, binSize)

    # output name: chrm_bed[_outID][_n<numBins> | _<binSize>].tsv
    outFileStr = 'chrm_bed'
    if outID:
        outFileStr += '_' + outID
    if numBins != -1:
        outFileStr += '_n{:d}'.format(numBins)
    elif binSize != -1:
        outFileStr += '_{:s}'.format(bth_util.binSizeToStr(binSize))
    outFileStr += '.tsv'

    print('Begin processing with {:d} processors'.format(numProc))
    # the context manager shuts the worker processes down once every
    # result has been collected (or if collecting one of them fails)
    with multiprocessing.Pool(processes=numProc) as pool:
        results = [
            pool.apply_async(processBEDFile,
                             args=(f, chrmDict, binSize, aNumBins, percentile))
            for f in bedFileStrAr
        ]
        bedDictAr = [p.get() for p in results]

    # header line describing how the table was produced
    info = "#from_script:chrom_plot_bed_mid_pe.py; "
    if binSize == -1:
        info += "num_bins:{:d}; ".format(aNumBins)
    else:
        info += "bin_size:{:s}; ".format(bth_util.binSizeToStr(binSize))
    info += "num_chrms:{:d}; ".format(len(chrmDict))
    info += "percentile:{:.1f}; ".format(percentile * 100)
    info += "unit:million reads per bin normalized by library size"

    print('Writing output to {:s}...'.format(outFileStr))
    writeOutput(outFileStr, bedDictAr, labels, info)
예제 #26
0
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform ):
	"""Classify each bin, optionally decode per sample, and write the results.

	inFileStr     -- path of the weighted methylation input file
	numProc       -- number of processors given to classification/decoding
	binSize       -- bin size; positions are binned as pos // binSize
	outID         -- output identifier given to determineOutputFileName
	parentLabelAr -- parent labels; [0] is the mother, [1] the father
	decoding      -- decoding code; 'N' skips decoding
	isUniform     -- uniform classification probabilities flag
	"""
	inBaseName = os.path.basename( inFileStr )
	binSizeStr = bth_util.binSizeToStr( binSize )
	decodingStr = formatDecoding( decoding )
	uniformStr = str( isUniform )

	# header line placed at the top of the output file
	info = '#from_script: epigenotyping_pe.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}'.format( inBaseName, binSizeStr, decodingStr.lower().replace('and',','), uniformStr.lower() )
	print( 'Weighted methylation file:', inBaseName )
	print( 'Bin size:', binSizeStr )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', uniformStr )
	print( 'Decoding algorithm:', decodingStr )

	# load the table; header=1 takes the column names from the second line
	df = pd.read_table( inFileStr, header=1 )

	# validate the parent labels against the sample column
	checkParents( df['sample'], parentLabelAr )

	# assign every position to a bin and group on it
	df['bin'] = df.pos // binSize
	nbins = max(df['bin'])+1
	dfBinGroup = df.groupby( 'bin' )

	# per-bin classification
	print( 'Begin classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	del df, dfBinGroup

	if decoding == 'N':
		# no decoding requested: the classification table is the output
		dfOutput = dfClass
	else:
		# transition matrix computed with parents and MPV ignored
		ignoreAr = parentLabelAr[:2] + ['MPV']
		transitionMatrix = Transitions( dfClass, ignore = ignoreAr ).getTransitions()

		# decode each sample separately
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len(dfSampleGroup.groups )

		print( 'Begin {:s} decoding {:d} samples with {:d} processors'.format(  decodingStr, nsamples, numProc ) )
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del dfSampleGroup

	# header line first, then the table appended to the same file
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform )
	print( 'Writing output to', outFileStr )
	with open( outFileStr, 'w' ) as outFile:
		outFile.write( info + '\n' )
	dfOutput.to_csv( outFileStr, sep='\t', mode='a' )

	print( 'Done' )
def determineOutputFileName(inFileStr, outID, binSize, decoding, classProbs,
                            scaleTransitions, combineBins):
    """Build the output file name from the input name and run settings.

    Base name:
      * outID given            -> '<outID>_epigenotype-v9_<bin>.tsv'
      * input base name contains '_wm_pos_' -> that part is replaced by
        '_epigenotype-v9_<bin>_'
      * otherwise              -> 'out_epigenotype-v9_<bin>.tsv'

    Suffixes are then inserted before '.tsv': '_cb-<n>' when
    combineBins > 0, '_scaled' when scaleTransitions is true, and a
    decoding / class-probability tag ('_epiril' marks classProbs == 'E').

    Returns the file name as a string.
    """
    outBaseName = os.path.basename(inFileStr)
    binStr = bth_util.binSizeToStr(binSize)

    if outID is not None:
        outFileStr = '{:s}_epigenotype-v9_{:s}.tsv'.format(outID, binStr)
    elif '_wm_pos_' in outBaseName:
        # test the base name, not the full path: a directory containing
        # '_wm_pos_' would otherwise leave the input's own name unchanged
        outFileStr = outBaseName.replace(
            '_wm_pos_', '_epigenotype-v9_{:s}_'.format(binStr))
    else:
        outFileStr = 'out_epigenotype-v9_{:s}.tsv'.format(binStr)

    # combining bins
    if combineBins > 0:
        outFileStr = outFileStr.replace('.tsv',
                                        '_cb-{:d}.tsv'.format(combineBins))

    if scaleTransitions:
        outFileStr = outFileStr.replace('.tsv', '_scaled.tsv')

    # decoding and class probabilities
    if decoding != 'N':
        # any code other than V, F or B is labelled 'vit-fb'
        decodeLabel = {'V': 'vit', 'F': 'fb', 'B': 'both'}.get(decoding,
                                                               'vit-fb')
        prefix = '_epiril-' if classProbs == 'E' else '_'
        outFileStr = outFileStr.replace(
            '.tsv', '{:s}{:s}.tsv'.format(prefix, decodeLabel))
    elif classProbs == 'E':
        outFileStr = outFileStr.replace('.tsv', '_epiril.tsv')

    return outFileStr
def printHelp():
	"""Print the usage line and option descriptions to standard output."""
	print( 'Usage:\npython epigenotyping_combin_iter-trans.py [-u] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] [-n=max_iter] <input_file>' )
	print( 'Required:' )
	print( 'input_file\tfile of weighted methylation by position for samples' )
	print( 'Optional:' )
	print( '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]' )
	print( '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
	print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
	print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
	print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
	print( '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]' )
	print( '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]' )
	print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
	# '{}' rather than '{:s}': the 's' format code raises ValueError for a numeric default
	print( '-n=max_iter\tmaximum iterations to improve transition matrix [default {}]'.format(MAXITER) )
def printHelp():
	"""Print the usage line and option descriptions to standard output."""
	print( 'Usage:\npython epigenotyping_pe_v8.2.py [-e | -u] [-q] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample(s)] [-f=father_sample(s)] [-b=bin_size] <input_file>' )
	print( 'Required:' )
	print( 'input_file\tfile of weighted methylation by position for samples' )
	print( 'Optional:' )
	print( '-e\t\tclass weights for epiRILs; 1:0:1 for mother,MPV,father' )
	print( '-q\t\tquiet, do not print progress' )
	print( '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
	print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
	print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
	print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
	print( '-m=mother_samples\tsample name(s) of mother; for correct classification\n\t\t[default mother]' )
	print( '-f=father_samples\tsample name(s) of father; for correct classification\n\t\t[default father]' )
	print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
def printHelp():
	"""Print the usage line and option descriptions to standard output."""
	print( 'Usage:\npython epigenotyping_pe_combbin_scaled.py [-u] [-c=bin_thresh] [-d=decoding_type][-p=num_proc]\n[-o=out_id] [-m=mother_sample] [-f=father_sample] [-b=bin_size] [-t=cent_start,cent_end] [-s=scale_factor] <input_file>' )
	print( 'Required:' )
	print( 'input_file\tfile of weighted methylation by position for samples' )
	print( 'Optional:' )
	print( '-u\t\tuniform class weights [default 1:2:1 for mother,\n\t\tMPV,father]' )
	print( '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tBoth="all" or "a"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
	print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
	print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
	print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
	print( '-m=mother_label\tsample name of mother; for correct classification\n\t\t[default mother]' )
	print( '-f=father_label\tsample name of father; for correct classification\n\t\t[default father]' )
	print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
	print( '-t=cent_start,cent_end\tcoordinates for centromere [default None]\n\t\twhen included, ignores this region for transitions and decoding' )
	# '{}' rather than '{:d}': the scale factor is parsed as a float elsewhere, and 'd' rejects floats
	print( '-s=scale_factor\tmultiplicative factor for weighting prediction probability over transition probability [default {} (unscaled)]'.format( SCALE ) )
def processInputs( bedFileStrAr, fastaIndexStr, labels, outID, chrmList, numProc, numBins, binSize, percentile ):
	"""Process each BED file in parallel and write one combined output file.

	bedFileStrAr  -- list of BED file paths; one worker task per file
	fastaIndexStr -- FASTA index path passed to readFastaIndex
	labels        -- sample labels; when None they are derived from the
	                 BED file names with getSampleNames
	outID         -- optional identifier added to the output file name
	chrmList      -- chromosome list passed to readFastaIndex
	numProc       -- number of worker processes
	numBins       -- number of bins, or -1 when not used
	binSize       -- bin size, or -1 when not used
	percentile    -- fraction passed to processBEDFile; written to the
	                 header as a percentage
	"""
	if labels is None:
		labels = getSampleNames( bedFileStrAr )
	print( 'Reading FASTA index' )
	chrmDict = readFastaIndex( fastaIndexStr, chrmList )

	aNumBins, chrmDict = determineNumBins( chrmDict, numBins, binSize )

	# output name: chrm_bed[_outID][_n<numBins> | _<binSize>].tsv
	outFileStr = 'chrm_bed'
	if outID:
		outFileStr += '_' + outID
	if numBins != -1:
		outFileStr += '_n{:d}'.format( numBins )
	elif binSize != -1:
		outFileStr += '_{:s}'.format( bth_util.binSizeToStr( binSize ) )
	outFileStr += '.tsv'

	print( 'Begin processing with {:d} processors'.format( numProc ) )
	# the context manager shuts the worker processes down once every
	# result has been collected (or if collecting one of them fails)
	with multiprocessing.Pool( processes=numProc ) as pool:
		results = [ pool.apply_async( processBEDFile, args=(f, chrmDict, binSize, aNumBins, percentile ) ) for f in bedFileStrAr ]
		bedDictAr = [ p.get() for p in results ]

	# header line describing how the table was produced
	info = "#from_script:chrom_plot_bed_mid_pe.py; "
	if binSize == -1:
		info += "num_bins:{:d}; ".format( aNumBins )
	else:
		info += "bin_size:{:s}; ".format( bth_util.binSizeToStr( binSize ) )
	info += "num_chrms:{:d}; ".format( len( chrmDict ) )
	info += "percentile:{:.1f}; ".format( percentile*100 )
	info += "unit:million reads per bin normalized by library size"

	print( 'Writing output to {:s}...'.format( outFileStr ) )
	writeOutput( outFileStr, bedDictAr, labels, info )
def printHelp():
	"""Print the usage line and option descriptions to standard output."""
	print( 'Usage:\tpython epigenotyping_pe_v7.3.py [-q] [-n-mpv] [-t-out] [-g=generation]\n\t[-c=bin_thresh] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_\n\tsamples][-f=father_samples] [-b=bin_size] [-t=centromere] <input_file>' )
	print()
	print( 'Required:' )
	print( 'input_file\ttab-delimited file of weighted methylation by position for samples' )
	print()
	print( 'Optional:' )
	print( '-q\t\tquiet; do not print progress' )
	print( '-h\t\tprint help and exit' )
	print( '-n-mpv\t\tdo not check for systematic mid-parent bias' )
	print( '-t-out\t\twrite transition matrix to file' )
	print( '-g=generation\tgeneration of self-crossing; used to determine classification\n\t\tprobabilities; use 0 for uniform weight [default {:d}]'.format( GENERATION) )
	print( '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tAll (FB and Vit independently)="all" or "a"\n\t\tBoth (FB then Vit)="both" or "b"\n\t\tOff="false", "none", or "n"'.format(DECODE) )
	print( '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]' )
	print( '-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC) )
	print( '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'.format(COMBINE) )
	print( '-m=mother_samples\tcomma-separated sample name(s) of mother\n\t\t[default mother]' )
	print( '-f=father_samples\tcomma-separated sample name(s) of father\n\t\t[default father]' )
	print( '-b=bin_size\tsize of bins in bp [default {:s}]'.format( bth_util.binSizeToStr( BINSIZE ) ) )
	print( '-t=centromere\tcentromere coordinates as "start,end"; can include multiple\n\t\tcentromeres as "start1,end1,start2,end2..." [default None]' )
def printHelp():
    """Print the usage line and option descriptions to standard output."""
    print(
        'Usage:\tpython epigenotyping_pe_v7.3.py [-q] [-n-mpv] [-t-out] [-g=generation]\n\t[-c=bin_thresh] [-d=decoding_type] [-p=num_proc] [-o=out_id] [-m=mother_\n\tsamples][-f=father_samples] [-b=bin_size] [-t=centromere] <input_file>'
    )
    print()
    print('Required:')
    print(
        'input_file\ttab-delimited file of weighted methylation by position for samples'
    )
    print()
    print('Optional:')
    print('-q\t\tquiet; do not print progress')
    print('-h\t\tprint help and exit')
    print('-n-mpv\t\tdo not check for systematic mid-parent bias')
    print('-t-out\t\twrite transition matrix to file')
    print(
        '-g=generation\tgeneration of self-crossing; used to determine classification\n\t\tprobabilities; use 0 for uniform weight [default {:d}]'
        .format(GENERATION))
    print(
        '-d=decode_type\tdecoding type to use (capitalization ignored) [default {:s}]\n\t\tViterbi="v" or "viterbi"\n\t\tForward-Backward="forwardbackward", "f" or "fb"\n\t\tAll (FB and Vit independently)="all" or "a"\n\t\tBoth (FB then Vit)="both" or "b"\n\t\tOff="false", "none", or "n"'
        .format(DECODE))
    print(
        '-o=out_id\tidentifier for output file [default "out" or variation of\n\t\tinput file name]'
    )
    print('-p=num_proc\tnumber of processors [default {:d}]'.format(NUMPROC))
    print(
        '-c=bin_thresh\tminimum number of features per bin to be classified\n\t\tgroups bins to reach this number [default {:d}]'
        .format(COMBINE))
    print(
        '-m=mother_samples\tcomma-separated sample name(s) of mother\n\t\t[default mother]'
    )
    print(
        '-f=father_samples\tcomma-separated sample name(s) of father\n\t\t[default father]'
    )
    print('-b=bin_size\tsize of bins in bp [default {:s}]'.format(
        bth_util.binSizeToStr(BINSIZE)))
    print(
        '-t=centromere\tcentromere coordinates as "start,end"; can include multiple\n\t\tcentromeres as "start1,end1,start2,end2..." [default None]'
    )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, cent, isPrint ):
	"""Classify samples by bin, optionally decode, and write the result table.

	Reads the tab-delimited input file, assigns each position to a bin
	(optionally merging bins), classifies each bin, runs the requested
	decoding per sample, and writes an info header line followed by the
	tab-separated output.

	inFileStr     -- path of the input file
	numProc       -- number of processors given to classification/decoding
	binSize       -- bin size; positions are binned as pos // binSize
	outID         -- output identifier given to determineOutputFileName
	parentLabelAr -- parent labels; [0] mother label(s), [1] father label(s)
	decoding      -- decoding code: 'N' skips decoding; 'B' runs a
	                 forward-backward pass and then a Viterbi pass on its
	                 scores; other values go to runDecoding unchanged
	classProbs    -- classification probability code given to
	                 runClassification
	combineBins   -- feature threshold for merging bins; used only when > 0
	cent          -- None, or a flat list of centromere coordinates
	                 [start1, end1, start2, end2, ...]
	isPrint       -- when true, print progress messages
	"""
	# header line for the output file
	# NOTE(review): only the first centromere pair is recorded here, while
	# the printed summary below lists every pair -- confirm this is intended
	info = '#from_script: epigenotyping_pe_v7.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), formatClassProbs(classProbs).lower(), combineBins, ('None' if cent is None else '{:s}-{:s}'.format( bth_util.binSizeToStr( cent[0] ), bth_util.binSizeToStr( cent[1] ) ) ) )
	if isPrint:
		print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
		print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
		print( 'Mother label(s):', parentLabelAr[0] )
		print( 'Father label(s):', parentLabelAr[1] )
		print( 'Classification probabilities:', formatClassProbs( classProbs ) )
		print( 'Decoding algorithm:', formatDecoding( decoding ) )
		print( 'Combine bin feature threshold:', combineBins )
	# printable form of all centromere pairs: 'start-end; start-end; ...'
	if cent is None:
		centStr = 'None'
	else:
		centStr = ''
		for i in range(len(cent)//2):
			si = i*2
			centStr += '; {:s}-{:s}'.format( bth_util.binSizeToStr( cent[si] ), bth_util.binSizeToStr( cent[si+1] ) )
		centStr = centStr[2:]

	if isPrint:
		print( 'Centromere:', centStr )

	# build dataframe; header=1 takes the column names from the second line
	if isPrint:
		print( ' Reading input file', os.path.basename( inFileStr ) )
	df = pd.read_table( inFileStr, header=1 )

	# check parent labels; samples to ignore when computing transitions are
	# all parent labels plus one 'MPV<i>' entry per mother label
	newParentLabelAr = checkParents( df['sample'], parentLabelAr )
	tIgnoreAr = flattenList( newParentLabelAr[:2] )
	for i in range(len(newParentLabelAr[0])):
		tIgnoreAr += [ 'MPV{:d}'.format( i ) ]

	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None

	# convert centromere coordinates to the list of bins they cover
	if cent is None:
		centBins = []
	else:
		cent = [ x // binSize for x in cent ]
		centBins = []
		for i in range(len(cent) // 2 ):
			si = i * 2
			centBins += list( range(cent[si], cent[si+1]+1) )

	# combine bins if necessary
	nbins = max(df['bin'])+1
	if combineBins > 0:
		if isPrint:
			print( ' Merging bins', end=' ... ' )
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )

	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len( dfBinGroup.groups )
		info += '; non-functional_bins:{:d}'.format( nbins - newNBins )
		if isPrint:
			print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )

	# classify by bin
	if isPrint:
		print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, newParentLabelAr, classProbs )
	dfClass.reset_index(inplace=True)
	del(df, dfBinGroup )
	# decode, if necessary
	if decoding != 'N':
		transition = Transitions( dfClass, ignore = tIgnoreAr )
		transitionMatrix = transition.getTransitions()

		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		# 'B' starts with a forward-backward pass
		tmpDecoding = ( 'F' if decoding == 'B' else decoding )
		if isPrint:
			print( ' {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding(tmpDecoding), nsamples, numProc ) )

		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, tmpDecoding, centBins )

		if decoding == 'B':
			# second pass: Viterbi decoding on the log of the
			# forward-backward scores, with transitions recomputed from
			# the forward-backward predictions
			dfNew = dfOutput.loc[:,['bin','sample']].copy()
			dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
			dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
			dfNew['father'] = np.log(dfOutput['fb.score.father'])
			dfNew['prediction'] = dfOutput['fb.prediction']
			transition = Transitions( dfNew, ignore = tIgnoreAr )
			transitionMatrix = transition.getTransitions()
			dfSampleGroup = dfNew.groupby( 'sample' )
			nsamples = len( dfSampleGroup.groups )

			if isPrint:
				print( ' {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding('V'), nsamples, numProc ) )
			dfOutputN = runDecoding( dfSampleGroup, numProc, transitionMatrix, 'V', centBins )
			# copy the Viterbi columns next to the forward-backward ones
			vitCols = ['vit.score.mother', 'vit.score.father', 'vit.score.MPV', 'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV', 'vit.prediction']
			dfOutput[vitCols] = dfOutputN[vitCols]
		# end decoding == B
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass

	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, classProbs, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# within this function 'cBin' is only created in the branch above,
		# so do not fail when the column is absent
		dfOutputT = dfOutput.drop('cBin', axis=1, errors='ignore')
	if isPrint:
		print( ' Writing output to', outFileStr )
	# header line first, then the table appended to the same file
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )

	if isPrint:
		print( 'Done' )
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr,
                  isSmoothing, isUniform):
    """Classify each bin, optionally smooth per sample, and write the results.

    inFileStr     -- path of the weighted methylation input file
    numProc       -- number of processors; values above 1 select the
                     multiprocessing helpers
    binSize       -- bin size; positions are binned as pos // binSize
    outID         -- output identifier given to determineOutputFileName
    parentLabelAr -- parent labels; [0] is the mother, [1] the father
    isSmoothing   -- when true, find the optimal path for each sample
    isUniform     -- uniform classification probabilities flag
    """
    # header line for the output file (already newline-terminated)
    info = '#from_script:epigenotype_by_logreg.py; in_file:{:s}; bin_size:{:s}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize))
    print('Weighted methylation file:', os.path.basename(inFileStr))
    print('Bin size:', bth_util.binSizeToStr(binSize))
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Smoothing:', str(isSmoothing))
    print('Uniform classification probabilities:', str(isUniform))
    info += '; smoothing:{:s}; uni_class_prob:{:s}\n'.format(
        str(isSmoothing), str(isUniform))

    # build data frame; header=1 takes the column names from the second line
    df = pd.read_table(inFileStr, header=1)
    # check parent labels
    checkParents(df['sample'], parentLabelAr)

    # put in bins and analyze
    df['bin'] = df.pos // binSize
    nbins = max(df['bin']) + 1
    dfg = df.groupby('bin')
    if numProc > 1:
        print('Begin classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
        res_class = runMultiprocessing(dfg, numProc, parentLabelAr, isUniform)
    else:
        print('Begin classifying {:d} bins'.format(nbins))
        res_class = dfg.apply(classLogRegImproved,
                              pla=parentLabelAr,
                              u=isUniform)
    res_class.reset_index(inplace=True)

    # smooth by sample
    if isSmoothing:
        # only the first two entries are the parent labels shown above;
        # anything after them is not a sample name to ignore
        ignoreAr = parentLabelAr[:2] + ['MPV']
        transition = SimpleTransitions(res_class, ignore=ignoreAr)
        transProbMat = transition.run()
        print(transProbMat)
        groups = res_class.groupby('sample')
        nsamples = len(groups.groups)

        # find optimum path for all samples, group by sample
        if numProc > 1:
            print('Begin smoothing {:d} samples with {:d} processors'.format(
                nsamples, numProc))
            results = runMulti(groups, numProc, transProbMat)
        else:
            print('Begin smoothing {:d} samples'.format(nsamples))
            results = groups.apply(findOptimalPath, trans=transProbMat)
        results.set_index(['bin', 'sample'], inplace=True)
    else:
        results = res_class

    # output file
    outFileStr = determineOutputFileName(inFileStr, outID, binSize,
                                         isSmoothing, isUniform)
    # write the header line, then append the table to the same file
    print('Writing output to', outFileStr)
    with open(outFileStr, 'w') as f:
        f.write(info)
    results.to_csv(outFileStr, sep='\t', mode='a')
    print('Done')
def parseInputs(argv):
    """Parse command-line arguments and hand them to processInputs.

    Every argument except the last is examined as a possible option.  Each
    recognized option is counted, and the argument at that count is taken
    as the input file.

    Options: -o=out_id, -b=bin_size, -p=num_proc, -c=bin_thresh,
    -m=mother_label, -f=father_label, -d=decoding_type, -u (uniform class
    probabilities), -t=cent_start,cent_end, -s=scale_factor and
    -h/--help/-help (print help and exit).  Any other argument starting
    with '-' prints an error and exits.

    argv -- list of argument strings; with no options, argv[0] is the
            input file
    """
    helpFlags = ('-h', '--help', '-help')
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # [mother label, father label, counter: +1 when -m given, +2 when -f given]
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    centromere = None
    scaleFactor = SCALE
    startInd = 0  # number of option arguments consumed so far

    # no fixed cap on the number of options: ten different ones exist
    for arg in argv[:-1]:
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # a result equal to False is treated as a failed conversion
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print(
                    'WARNING: number of processors must be integer...using default {}'
                    .format(NUMPROC))
                numProc = NUMPROC
            # consumed even when the value is rejected, so the input
            # file index stays correct
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                # '{}' rather than '{:s}': 's' raises for a numeric default
                print(
                    'WARNING: bin threshold must be integer...using default {}'
                    .format(COMBINE))
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            else:
                print(
                    'WARNING: decoding option {:s} not recognized...using default {:s}'
                    .format(opt, DECODE))
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg.startswith('-t='):
            tmp2 = [bth_util.strToDistance(x) for x in arg[3:].split(',')]
            if len(tmp2) != 2 or (False in tmp2):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = tmp2
            startInd += 1
        elif arg.startswith('-s='):
            try:
                scaleFactor = float(arg[3:])
            except ValueError:
                print(
                    'WARNING: scale factor must be numeric...using default {}'
                    .format(SCALE))
                scaleFactor = SCALE
            else:
                if scaleFactor <= 0:
                    print(
                        'WARNING: scale factor must be greater than 0...using default',
                        SCALE)
                    # actually fall back to the default named in the warning
                    scaleFactor = SCALE
            startInd += 1
        elif arg in helpFlags:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    if startInd >= len(argv):
        print('ERROR: input file not specified')
        exit()
    inFileStr = argv[startInd]
    # the last argument is skipped by the loop, so a lone help flag lands here
    if inFileStr in helpFlags:
        printHelp()
        exit()
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, combineBins, centromere, scaleFactor)
예제 #37
0
def parseInputs(argv):
    """Parse command-line arguments and hand them to processInputs.

    Every argument except the last is examined as a possible option.  Each
    recognized option is counted, and the argument at that count is taken
    as the input file.

    Options: -o=out_id, -b=bin_size, -p=num_proc, -c=bin_thresh,
    -m=mother_label, -f=father_label, -d=decoding_type, -u (uniform class
    probabilities) and -h/--help/-help (print help and exit).  Any other
    argument starting with '-' prints an error and exits.

    argv -- list of argument strings; with no options, argv[0] is the
            input file
    """
    helpFlags = ('-h', '--help', '-help')
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # [mother label, father label, counter: +1 when -m given, +2 when -f given]
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    startInd = 0  # number of option arguments consumed so far

    # no fixed cap on the number of options: eight different ones exist
    for arg in argv[:-1]:
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            # a result equal to False is treated as a failed conversion
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print(
                    'WARNING: number of processors must be integer...using default {}'
                    .format(NUMPROC))
                numProc = NUMPROC
            # consumed even when the value is rejected, so the input
            # file index stays correct
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                # '{}' rather than '{:s}': 's' raises for a numeric default
                print(
                    'WARNING: bin threshold must be integer...using default {}'
                    .format(COMBINE))
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            else:
                # report the default that is actually kept
                print(
                    'WARNING: decoding option {:s} not recognized...using default {:s}'
                    .format(opt, DECODE))
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in helpFlags:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    if startInd >= len(argv):
        print('ERROR: input file not specified')
        exit()
    inFileStr = argv[startInd]
    # the last argument is skipped by the loop, so a lone help flag lands here
    if inFileStr in helpFlags:
        printHelp()
        exit()
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, combineBins)
def parseInputs( argv ):
    """Read command-line options from ``argv`` and call processInputs.

    Only the leading entries of ``argv`` are scanned for options (at most
    nine, and never the final entry).  ``startInd`` counts the options that
    were consumed and ``argv[startInd]`` is then used as the input file.

    Options handled:
      -o=ID      output identifier
      -b=SIZE    bin size, converted with bth_util.strToDistance
      -p=N       number of processors (integer)
      -c=N       combine-bins threshold (integer)
      -m=LABEL   mother label
      -f=LABEL   father label
      -d=MODE    decoding: none/n/false, viterbi/v, forwardbackward/f/fb, all/a
      -u         use uniform class probabilities
      -t=A,B     centromere start and end coordinates
      -s=X       scale factor (number greater than 0)
      -h, --help, -help   print help and exit

    Any other argument starting with '-' is reported as an error and the
    program exits.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry is increased by 1 when -m= is given and by 2 when -f= is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    centromere = None
    scaleFactor = SCALE
    startInd = 0

    for i in range(min(9, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print('WARNING: cannot convert {:s} to bin size...using default {:s}'.format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using', NUMPROC)
                numProc = NUMPROC
            # the option was consumed even if its value was unusable;
            # otherwise argv[startInd] would point at an option, not the file
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                # COMBINE is numeric, so it is printed as an argument rather
                # than through a '{:s}' field (which rejects numbers)
                print('WARNING: combine bins must be integer...using default', COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            else:
                # decoding keeps its current value
                print('WARNING: decoding option {:s} not recognized...using default {:s}'.format(opt, DECODE))
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg.startswith('-t='):
            coords = [bth_util.strToDistance(x) for x in arg[3:].split(',')]
            if len(coords) != 2 or (False in coords):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = coords
            startInd += 1
        elif arg.startswith('-s='):
            try:
                scaleFactor = float(arg[3:])
            except ValueError:
                print('WARNING: scale factor must be numeric...using default', SCALE)
                scaleFactor = SCALE
            else:
                # the warning promises the default, so actually apply it
                if scaleFactor <= 0:
                    print('WARNING: scale factor must be greater than 0...using default', SCALE)
                    scaleFactor = SCALE
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, centromere, scaleFactor)
def parseInputs( argv ):
    """Read command-line options from ``argv`` and call processInputs.

    The first seven entries of ``argv`` (or all of them, if fewer) are
    scanned for options.  ``startInd`` counts the options that were
    consumed and ``argv[startInd]`` is then used as the input file.

    Options handled:
      -o=ID      output identifier
      -b=SIZE    bin size, converted with bth_util.strToDistance
      -p=N       number of processors (integer)
      -m=LABEL   mother label
      -f=LABEL   father label
      -n         turn smoothing off
      -u         use uniform class probabilities
      -h, --help, -help   print help and exit

    Any other argument starting with '-' is reported as an error and the
    program exits.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry is increased by 1 when -m= is given and by 2 when -f= is given
    parentLabelAr = ['mother', 'father', 0]
    isSmoothing = True
    isUniform = False
    startInd = 0

    for i in range(min(7, len(argv))):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print('WARNING: cannot convert {:s} to bin size...using default {:s}'.format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using', NUMPROC)
                numProc = NUMPROC
            # count the option even when its value was unusable; otherwise
            # argv[startInd] would point at an option instead of the file
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg == '-n':
            isSmoothing = False
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for
    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, isSmoothing, isUniform)
예제 #40
0
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  isUniform, combineBins):
    """Classify bins of a weighted-methylation table and write the results.

    Steps, in order: print the run settings, read ``inFileStr`` with
    pandas, check the parent labels, assign each row to a bin
    (``pos // binSize``), optionally merge bins when ``combineBins > 0``,
    classify per bin, optionally decode per sample (skipped when
    ``decoding`` is 'N'), and write a tab-separated output file whose
    first line is the ``info`` header built below.
    """
    inName = os.path.basename(inFileStr)
    binLabel = bth_util.binSizeToStr(binSize)
    decodeLabel = formatDecoding(decoding)
    merging = combineBins > 0

    # header line written at the top of every file produced by this run
    info = '#from_script: epigenotyping_pe_combbin_smpt.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format(
        inName, binLabel, decodeLabel.lower().replace('and', ','),
        str(isUniform).lower(), combineBins)

    settings = (
        ('Weighted methylation file:', inName),
        ('Bin size:', binLabel),
        ('Mother label:', parentLabelAr[0]),
        ('Father label:', parentLabelAr[1]),
        ('Uniform classification probabilities:', str(isUniform)),
        ('Decoding algorithm:', decodeLabel),
        ('Combine bin feature threshold:', combineBins),
    )
    for caption, setting in settings:
        print(caption, setting)

    # load the table
    print(' Reading input file', inName)
    table = pd.read_table(inFileStr, header=1)

    # make sure the parent labels are usable
    checkParents(table['sample'], parentLabelAr)

    # bin number for every position
    table['bin'] = table.pos // binSize
    transformation = None

    # bin count before any merging
    nbins = max(table['bin']) + 1
    if merging:
        print(' Merging bins', end=' ... ')
        table['tBin'] = table['bin']
        transformation = binTransformation(table, combineBins)
        # map each original bin onto its merged bin
        table['bin'] = table['tBin'].apply(lambda b: transformation[b])

    byBin = table.groupby('bin')
    if merging:
        mergedCount = len(byBin.groups)
        print('combined {:d} non-functional bins'.format(nbins - mergedCount))

    # per-bin classification
    print(' Classifying {:d} bins with {:d} processors'.format(nbins, numProc))
    dfClass = runClassification(byBin, numProc, parentLabelAr, isUniform)
    dfClass.reset_index(inplace=True)
    del table, byBin

    if decoding == 'N':
        # no decoding requested: classification result is the output
        dfOutput = dfClass
    else:
        # kept for the Transitions step, which is currently disabled here
        ignoreAr = parentLabelAr[:2] + ['MPV']

        # start the transition file with the header line
        outFStr = determineTransFileName(inFileStr, outID, binSize,
                                         combineBins)
        with open(outFStr, 'w') as transFile:
            transFile.write(info + '\n')

        # per-sample decoding
        bySample = dfClass.groupby('sample')
        nsamples = len(bySample.groups)
        print(' {:s} decoding {:d} samples with {:d} processors'.format(
            decodeLabel, nsamples, numProc))
        dfOutput = runDecoding(bySample, numProc, decoding, outFStr)
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del bySample

    # output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         isUniform, combineBins)
    if merging:
        # spread the predictions of merged bins back over the original bins
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)

    print(' Writing output to', outFileStr)
    with open(outFileStr, 'w') as outFile:
        outFile.write(info + '\n')
    # table is appended below the header line
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')

    print('Done')
def parseInputs( argv ):
    """Read command-line options from ``argv`` and call processInputs.

    Only the leading entries of ``argv`` are scanned for options (at most
    seven, and never the final entry).  ``startInd`` counts the options
    that were consumed and ``argv[startInd]`` is then used as the input
    file.

    Options handled:
      -o=ID      output identifier
      -b=SIZE    bin size, converted with bth_util.strToDistance
      -p=N       number of processors (integer)
      -c=N       combine-bins threshold (integer)
      -m=LABEL   mother label
      -f=LABEL   father label
      -d=MODE    decoding: none/n/false, viterbi/v, forwardbackward/f/fb, all/a
      -u         use uniform class probabilities
      -h, --help, -help   print help and exit

    Any other argument starting with '-' is reported as an error and the
    program exits.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry is increased by 1 when -m= is given and by 2 when -f= is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    isUniform = UNIFORM
    combineBins = COMBINE
    startInd = 0

    for i in range(min(7, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print('WARNING: cannot convert {:s} to bin size...using default {:s}'.format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using', NUMPROC)
                numProc = NUMPROC
            # count the option even when its value was unusable; otherwise
            # argv[startInd] would point at an option instead of the file
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                # COMBINE is numeric, so it is printed as an argument rather
                # than through a '{:s}' field (which rejects numbers)
                print('WARNING: combine bins must be integer...using default', COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            else:
                # decoding keeps its current value
                print('WARNING: decoding option {:s} not recognized...using default viterbi'.format(opt))
            startInd += 1
        elif arg == '-u':
            isUniform = True
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins)
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins ):
    """Classify bins of a weighted-methylation table and write the results.

    Steps, in order: print the run settings, read ``inFileStr`` with
    pandas, check the parent labels, assign each row to a bin
    (``pos // binSize``), optionally merge bins when ``combineBins > 0``,
    classify per bin, optionally decode per sample (skipped when
    ``decoding`` is 'N'), and write a tab-separated output file whose
    first line is the ``info`` header built below.

    When decoding, an initial transition matrix is obtained from the
    classification and copied once per bin; if ``maxIter > 0`` the
    transitions are then refined with AdaptiveTransitions.
    """
    info = '#from_script: epigenotyping_pe_combbin.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}'.format(os.path.basename(inFileStr), bth_util.binSizeToStr(binSize), formatDecoding(decoding).lower().replace('and', ','), str(isUniform).lower(), combineBins)
    print('Weighted methylation file:', os.path.basename(inFileStr))
    print('Bin size:', bth_util.binSizeToStr(binSize))
    print('Mother label:', parentLabelAr[0])
    print('Father label:', parentLabelAr[1])
    print('Uniform classification probabilities:', str(isUniform))
    print('Decoding algorithm:', formatDecoding(decoding))
    print('Combine bin feature threshold:', combineBins)

    # build dataframe
    print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)

    # check parent labels
    checkParents(df['sample'], parentLabelAr)

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])

    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        print('combined {:d} non-functional bins'.format(nbins - newNBins))

    # classify by bin
    print(' Classifying {:d} bins with {:d} processors'.format(nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, parentLabelAr, isUniform)
    dfClass.reset_index(inplace=True)
    del (df, dfBinGroup)

    # decode, if necessary
    if decoding != 'N':
        ignoreAr = parentLabelAr[:2] + ['MPV']
        print(' Obtaining initial transitions')
        transition = Transitions(dfClass, ignore=ignoreAr)
        transitionMatrix = transition.getTransitions()
        # one copy of the initial matrix per bin
        transitionMatrixArray = np.array([np.copy(transitionMatrix) for _ in range(nbins)])

        # trInfo was previously appended to without ever being created,
        # which raised UnboundLocalError as soon as maxIter > 0.
        # NOTE(review): starting it from the header line is an assumption
        # based on the '; key:value' text appended below; it is not written
        # to any file in this function at present.
        trInfo = info

        # NOTE(review): maxIter is not a parameter or local of this
        # function; presumably a module-level setting -- confirm.
        if maxIter > 0:
            print(' Iteratively improving transitions with maximum', maxIter, 'iterations')
            at = AdaptiveTransitions(dfClass, transitionMatrixArray, ignoreAr, maxIter)
            # NOTE(review): the returned matrix is not passed to runDecoding
            # below, which receives transitionMatrixArray -- confirm that
            # AdaptiveTransitions updates the array it was given.
            iterations, transitionMatrix = at.run()
            trInfo += '; iterations_to_convergence:'
            if iterations == maxIter:
                trInfo += 'NA'
                # report the actual limit instead of a hard-coded 10
                print('  Did not converge in', maxIter, 'iterations')
            else:
                trInfo += str(iterations)
                print('  Convergence in', iterations, 'iterations')

        # Disabled: write the transition matrix to file
        # outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
        # tLabels = [ 'mother', 'MPV', 'father' ]
        # transData = pd.DataFrame( transitionMatrix, index=tLabels, columns= tLabels )
        # with open( outFStr, 'w' ) as f:
        #     f.write(info+'\n')
        # transData.to_csv( outFStr, sep='\t', mode='a' )

        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)

        print(' {:s} decoding {:d} samples with {:d} processors'.format(formatDecoding(decoding), nsamples, numProc))
        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrixArray, decoding)
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del (dfSampleGroup)
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding, isUniform, combineBins)
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)
    print(' Writing output to', outFileStr)
    # header line first, then the table is appended below it
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')

    print('Done')
def parseInputs(argv):
    """Read command-line options from ``argv`` and call processInputs.

    Only the leading entries of ``argv`` are scanned for options (at most
    ten, and never the final entry).  ``startInd`` counts the options that
    were consumed and ``argv[startInd]`` is then used as the input file.

    Options handled:
      -o=ID      output identifier
      -b=SIZE    bin size, converted with bth_util.strToDistance
      -p=N       number of processors (integer)
      -c=N       combine-bins threshold (integer)
      -m=LABEL   mother label
      -f=LABEL   father label
      -d=MODE    decoding: none/n/false, viterbi/v, forwardbackward/f/fb,
                 all/a, both/b
      -e         set the class-probability mode to 'E'
      -s         scale transitions
      -q         quiet (no progress printing)
      -t=A,B[,C,D...]  centromere coordinates, an even number of values
      -h, --help, -help   print help and exit

    Any other argument starting with '-' is reported as an error and the
    program exits.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry is increased by 1 when -m= is given and by 2 when -f= is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    classProbs = CLASSPROB
    combineBins = COMBINE
    isPrint = ISPRINT
    centromere = None
    scaleTransitions = SCALETRANS
    startInd = 0

    for i in range(min(10, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using',
                      NUMPROC)
                numProc = NUMPROC
            # count the option even when its value was unusable; otherwise
            # argv[startInd] would point at an option instead of the file
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                # COMBINE is numeric, so it is printed as an argument rather
                # than through a '{:s}' field (which rejects numbers)
                print('WARNING: combine bins must be integer...using default',
                      COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            elif opt in ('both', 'b'):
                decoding = 'B'
            else:
                # decoding keeps its current value
                print(
                    'WARNING: decoding option {:s} not recognized...using default viterbi'
                    .format(opt))
            startInd += 1
        elif arg == '-e':
            classProbs = 'E'
            startInd += 1
        elif arg == '-s':
            scaleTransitions = True
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg.startswith('-t='):
            coords = [bth_util.strToDistance(x) for x in arg[3:].split(',')]
            # coordinates come in start/end pairs
            if len(coords) % 2 != 0 or (False in coords):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = coords
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  classProbs, combineBins, centromere, scaleTransitions,
                  isPrint)
def parseInputs(argv):
    """Read command-line options from ``argv`` and call processInputs.

    Only the leading entries of ``argv`` are scanned for options (at most
    eleven, and never the final entry).  ``startInd`` counts the options
    that were consumed and ``argv[startInd]`` is then used as the input
    file.

    Options handled:
      -o=ID      output identifier
      -b=SIZE    bin size, converted with bth_util.strToDistance
      -p=N       number of processors (integer)
      -c=N       combine-bins threshold (integer)
      -m=LABEL   mother label
      -f=LABEL   father label
      -d=MODE    decoding: none/n/false, viterbi/v, forwardbackward/f/fb,
                 all/a, both/b
      -g=N       generation (integer)
      -q         quiet (no progress printing)
      -n-mpv     turn the MPV check off
      -t-out     turn transition-matrix output on
      -t=A,B[,C,D...]  centromere coordinates, an even number of values
      -h, --help, -help   print help and exit

    Any other argument starting with '-' is reported as an error and the
    program exits.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry is increased by 1 when -m= is given and by 2 when -f= is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    generation = GENERATION
    combineBins = COMBINE
    mpvCheck = MPVCHECK
    isPrint = ISPRINT
    tmOut = TMOUT
    centromere = None
    startInd = 0

    for i in range(min(11, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print(
                    'WARNING: cannot convert {:s} to bin size...using default {:s}'
                    .format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using',
                      NUMPROC)
                numProc = NUMPROC
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                print('WARNING: combine bins must be integer...using default',
                      COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in ('false', 'none', 'n'):
                decoding = 'N'
            elif opt in ('viterbi', 'v'):
                decoding = 'V'
            elif opt in ('forwardbackward', 'f', 'fb'):
                decoding = 'F'
            elif opt in ('all', 'a'):
                decoding = 'A'
            elif opt in ('both', 'b'):
                decoding = 'B'
            else:
                # decoding keeps its current value
                print(
                    'WARNING: decoding option {:s} not recognized...using default both'
                    .format(opt))
            startInd += 1
        elif arg.startswith('-g='):
            try:
                generation = int(arg[3:])
            except ValueError:
                # report the generation default (the old message formatted
                # COMBINE through '{:s}', which rejects numbers and raised
                # a second ValueError inside this handler)
                print('WARNING: generation must be integer...using default',
                      GENERATION)
                generation = GENERATION
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg == '-n-mpv':
            mpvCheck = False
            startInd += 1
        elif arg == '-t-out':
            tmOut = True
            startInd += 1
        elif arg.startswith('-t='):
            coords = [bth_util.strToDistance(x) for x in arg[3:].split(',')]
            # coordinates come in start/end pairs
            if len(coords) % 2 != 0 or (False in coords):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = coords
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  generation, combineBins, centromere, isPrint, mpvCheck,
                  tmOut)
def processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding,
                  classProbs, combineBins, cent, scaleTransitions, isPrint):
    """Classify bins of a weighted-methylation table and write the results.

    Steps, in order: build the ``info`` header, read ``inFileStr`` with
    pandas, check the parent labels, assign each row to a bin
    (``pos // binSize``), optionally merge bins when ``combineBins > 0``,
    classify per bin, optionally decode per sample (skipped when
    ``decoding`` is 'N'), and write a tab-separated output file whose
    first line is the header.

    ``cent`` is either None or a flat list of centromere coordinates read
    as consecutive start/end pairs.  With ``decoding == 'B'`` a
    forward-backward pass is run first and a Viterbi pass is then run on
    the log of its scores.  Progress messages are printed only when
    ``isPrint`` is true.
    """
    # one 'start-end' label per centromere pair
    if cent is None:
        centStr = 'None'
        centInfo = 'None'
    else:
        centPairs = [
            '{:s}-{:s}'.format(bth_util.binSizeToStr(cent[k]),
                               bth_util.binSizeToStr(cent[k + 1]))
            for k in range(0, len(cent) - 1, 2)
        ]
        centStr = '; '.join(centPairs)
        # The header used to record only the first pair even though several
        # are supported.  Pairs are comma-joined here because '; ' already
        # separates header fields; a single pair gives the same text as before.
        centInfo = ','.join(centPairs)

    info = '#from_script: epigenotyping_pe_v9.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; class_prob:{:s}; combine_bins_threshold:{:d}; centromere_{:s}; scale_transitions:{:s}'.format(
        os.path.basename(inFileStr), bth_util.binSizeToStr(binSize),
        formatDecoding(decoding).lower().replace('and', ','),
        formatClassProbs(classProbs).lower(), combineBins, centInfo,
        str(scaleTransitions))
    if isPrint:
        print('Weighted methylation file:', os.path.basename(inFileStr))
        print('Bin size:', bth_util.binSizeToStr(binSize))
        print('Mother label(s):', parentLabelAr[0])
        print('Father label(s):', parentLabelAr[1])
        print('Classification probabilities:', formatClassProbs(classProbs))
        print('Decoding algorithm:', formatDecoding(decoding))
        print('Combine bin feature threshold:', combineBins)
        print('Scale transitions by sample size:', scaleTransitions)
        print('Centromere:', centStr)

    # build dataframe
    if isPrint:
        print(' Reading input file', os.path.basename(inFileStr))
    df = pd.read_table(inFileStr, header=1)

    # check parent labels; samples listed in tIgnoreAr are passed to
    # Transitions as its ignore list
    newParentLabelAr = checkParents(df['sample'], parentLabelAr)
    tIgnoreAr = flattenList(newParentLabelAr[:2])
    for i in range(len(newParentLabelAr[0])):
        tIgnoreAr += ['MPV{:d}'.format(i)]

    # group by bin
    df['bin'] = df.pos // binSize
    transformation = None

    # get centromere bins if necessary
    centBins = []
    if cent is not None:
        cent = [x // binSize for x in cent]
        for i in range(len(cent) // 2):
            si = i * 2
            centBins += list(range(cent[si], cent[si + 1] + 1))

    # combine bins if necessary
    nbins = max(df['bin']) + 1
    if combineBins > 0:
        if isPrint:
            print(' Merging bins', end=' ... ')
        df['tBin'] = df['bin']
        transformation = binTransformation(df, combineBins)
        # apply the transformation
        df['bin'] = df['tBin'].apply(lambda x: transformation[x])

    dfBinGroup = df.groupby('bin')
    if combineBins > 0:
        newNBins = len(dfBinGroup.groups)
        info += '; non-functional_bins:{:d}'.format(nbins - newNBins)
        if isPrint:
            print('combined {:d} non-functional bins'.format(nbins - newNBins))

    # classify by bin
    if isPrint:
        print(' Classifying {:d} bins with {:d} processors'.format(
            nbins, numProc))
    dfClass = runClassification(dfBinGroup, numProc, newParentLabelAr,
                                classProbs)
    dfClass.reset_index(inplace=True)
    del (df, dfBinGroup)

    # decode, if necessary
    if decoding != 'N':
        transition = Transitions(dfClass, ignore=tIgnoreAr)
        transitionMatrix = transition.getTransitions()
        # Disabled: write this matrix to file
        # outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
        # tLabels = [ 'mother', 'MPV', 'father' ]
        # transData = pd.DataFrame( transitionMatrix, index=tLabels, columns= tLabels )
        # with open( outFStr, 'w' ) as f:
        #     f.write(info+'\n')
        # transData.to_csv( outFStr, sep='\t', mode='a' )

        # group by sample
        dfSampleGroup = dfClass.groupby('sample')
        nsamples = len(dfSampleGroup.groups)
        if scaleTransitions:
            # (n - 1) / n, with n = number of samples not in tIgnoreAr
            scaleFactor = float(nsamples - len(tIgnoreAr) -
                                1) / float(nsamples - len(tIgnoreAr))
        else:
            scaleFactor = 1
        # 'B' starts with a forward-backward pass
        tmpDecoding = ('F' if decoding == 'B' else decoding)
        if isPrint:
            print(' {:s} decoding {:d} samples with {:d} processors'.format(
                formatDecoding(tmpDecoding), nsamples, numProc))

        dfOutput = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                               tmpDecoding, centBins, scaleFactor)

        if decoding == 'B':
            # second pass: Viterbi on the log of the forward-backward scores
            dfNew = dfOutput.loc[:, ['bin', 'sample']].copy()
            dfNew['MPV'] = np.log(dfOutput['fb.score.MPV'])
            dfNew['mother'] = np.log(dfOutput['fb.score.mother'])
            dfNew['father'] = np.log(dfOutput['fb.score.father'])
            dfNew['prediction'] = dfOutput['fb.prediction']
            transition = Transitions(dfNew, ignore=tIgnoreAr)
            transitionMatrix = transition.getTransitions()
            dfSampleGroup = dfNew.groupby('sample')
            nsamples = len(dfSampleGroup.groups)

            if isPrint:
                print(
                    ' {:s} decoding {:d} samples with {:d} processors'.format(
                        formatDecoding('V'), nsamples, numProc))
            dfOutputN = runDecoding(dfSampleGroup, numProc, transitionMatrix,
                                    'V', centBins, scaleFactor)
            # copy the Viterbi columns next to the forward-backward ones
            vitColumns = [
                'vit.score.mother', 'vit.score.father', 'vit.score.MPV',
                'vit.prob.mother', 'vit.prob.father', 'vit.prob.MPV',
                'vit.prediction'
            ]
            dfOutput[vitColumns] = dfOutputN[vitColumns]
        # end decoding == B
        dfOutput.set_index(['bin', 'sample'], inplace=True)
        del (dfSampleGroup)
    else:
        dfOutput = dfClass

    # write output
    outFileStr = determineOutputFileName(inFileStr, outID, binSize, decoding,
                                         classProbs, scaleTransitions,
                                         combineBins)
    # if combination, undo transformation by applying the predictions to additional bins
    if combineBins > 0:
        dfOutput.reset_index(inplace=True)
        dfOutput['cBin'] = dfOutput['bin']
        dfOutputT = undoBinTransformation(dfOutput, transformation)
    else:
        dfOutputT = dfOutput.drop('cBin', axis=1)
    if isPrint:
        print(' Writing output to', outFileStr)
    # header line first, then the table is appended below it
    with open(outFileStr, 'w') as f:
        f.write(info + '\n')
    dfOutputT.to_csv(outFileStr, sep='\t', mode='a')

    if isPrint:
        print('Done')
def parseInputs( argv ):
    """Read command-line options from ``argv`` and call processInputs.

    Only the leading entries of ``argv`` are scanned for options (at most
    eleven, and never the final entry).  ``startInd`` counts the options
    that were consumed and ``argv[startInd]`` is then used as the input
    file.

    Options handled:
      -o=ID      output identifier
      -b=SIZE    bin size, converted with bth_util.strToDistance
      -p=N       number of processors (integer)
      -c=N       combine-bins threshold (integer)
      -m=LABEL   mother label
      -f=LABEL   father label
      -d=MODE    decoding: none/n/false, viterbi/v, forwardbackward/f/fb,
                 all/a, both/b
      -g=N       generation (integer)
      -q         quiet (no progress printing)
      -n-mpv     turn the MPV check off
      -t-out     turn transition-matrix output on
      -t=A,B[,C,D...]  centromere coordinates, an even number of values
      -h, --help, -help   print help and exit

    Any other argument starting with '-' is reported as an error and the
    program exits.
    """
    numProc = NUMPROC
    binSize = BINSIZE
    outID = None
    # third entry is increased by 1 when -m= is given and by 2 when -f= is given
    parentLabelAr = ['mother', 'father', 0]
    decoding = DECODE
    generation = GENERATION
    combineBins = COMBINE
    mpvCheck = MPVCHECK
    isPrint = ISPRINT
    tmOut = TMOUT
    centromere = None
    startInd = 0

    # option text -> decoding code
    decodeCodes = {
        'false': 'N', 'none': 'N', 'n': 'N',
        'viterbi': 'V', 'v': 'V',
        'forwardbackward': 'F', 'f': 'F', 'fb': 'F',
        'all': 'A', 'a': 'A',
        'both': 'B', 'b': 'B',
    }

    for i in range(min(11, len(argv) - 1)):
        arg = argv[i]
        if arg.startswith('-o='):
            outID = arg[3:]
            startInd += 1
        elif arg.startswith('-b='):
            inStr = arg[3:]
            binSize = bth_util.strToDistance(inStr)
            if binSize == False:
                print('WARNING: cannot convert {:s} to bin size...using default {:s}'.format(inStr, bth_util.binSizeToStr(BINSIZE)))
                binSize = BINSIZE
            startInd += 1
        elif arg.startswith('-p='):
            try:
                numProc = int(arg[3:])
            except ValueError:
                print('WARNING: number of processors must be integer...using', NUMPROC)
                numProc = NUMPROC
            startInd += 1
        elif arg.startswith('-c='):
            try:
                combineBins = int(arg[3:])
            except ValueError:
                print('WARNING: combine bins must be integer...using default', COMBINE)
                combineBins = COMBINE
            startInd += 1
        elif arg.startswith('-m='):
            parentLabelAr[0] = arg[3:]
            parentLabelAr[2] += 1
            startInd += 1
        elif arg.startswith('-f='):
            parentLabelAr[1] = arg[3:]
            parentLabelAr[2] += 2
            startInd += 1
        elif arg.startswith('-d='):
            opt = arg[3:].lower()
            if opt in decodeCodes:
                decoding = decodeCodes[opt]
            else:
                # decoding keeps its current value
                print('WARNING: decoding option {:s} not recognized...using default both'.format(opt))
            startInd += 1
        elif arg.startswith('-g='):
            try:
                generation = int(arg[3:])
            except ValueError:
                # report the generation default (the old message formatted
                # COMBINE through '{:s}', which rejects numbers and raised
                # a second ValueError inside this handler)
                print('WARNING: generation must be integer...using default', GENERATION)
                generation = GENERATION
            startInd += 1
        elif arg == '-q':
            isPrint = False
            startInd += 1
        elif arg == '-n-mpv':
            mpvCheck = False
            startInd += 1
        elif arg == '-t-out':
            tmOut = True
            startInd += 1
        elif arg.startswith('-t='):
            coords = [bth_util.strToDistance(x) for x in arg[3:].split(',')]
            # coordinates come in start/end pairs
            if len(coords) % 2 != 0 or (False in coords):
                print('WARNING: centromere coordinates bad...not using')
            else:
                centromere = coords
            startInd += 1
        elif arg in ['-h', '--help', '-help']:
            printHelp()
            exit()
        elif arg.startswith('-'):
            print('ERROR: {:s} is not a valid option'.format(arg))
            exit()
    # end for

    inFileStr = argv[startInd]
    processInputs(inFileStr, numProc, binSize, outID, parentLabelAr, decoding, generation, combineBins, centromere, isPrint, mpvCheck, tmOut)
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, cent ):
	"""Classify, and optionally decode, the samples of one weighted methylation file.
	
	The input table is read, positions are assigned to bins (bins are merged
	first when combineBins > 0), every bin is classified, and unless decoding
	is 'N' a transition matrix is computed and written and each sample is
	decoded.  The final table is written tab-separated, preceded by a
	'#from_script' header line describing the run parameters.
	
	Parameters:
		inFileStr: path of the input table; it is read with header=1 and the
			'sample' and 'pos' columns are used
		numProc: processor count forwarded to runClassification/runDecoding
		binSize: bin width; the bin index of a row is pos // binSize
		outID: output identifier forwarded to the file-name helpers
		parentLabelAr: list whose first two entries are the mother and
			father sample labels
		decoding: decoding code; 'N' skips the decoding step
		isUniform: forwarded to runClassification and recorded in the header
		combineBins: threshold forwarded to binTransformation; bins are only
			merged when it is greater than 0
		cent: None, or a sequence whose first two entries are the centromere
			start and end coordinates
	"""
	# centromere description shared by the header line and the console summary
	centStr = 'None' if cent is None else '{:s}-{:s}'.format( bth_util.binSizeToStr( cent[0] ), bth_util.binSizeToStr( cent[1] ) )
	
	info = '#from_script: epigenotyping_pe_combbin-init.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; centromere:{:s}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins, centStr )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	print( 'Combine bin feature threshold:', combineBins )
	print( 'Centromere:', centStr )
	
	# build dataframe
	print( ' Reading input file', os.path.basename( inFileStr ) )
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None
	
	# get centromere bins if necessary
	if cent is None:
		centro = []
	else:
		# convert the coordinates to bin indices; the range is inclusive of both ends
		cent = [ x // binSize for x in cent ]
		centro = list( range(cent[0], cent[1]+1) )
		print(centro)
	
	# combine bins if necessary
	nbins = max(df['bin'])+1
	if combineBins > 0:
		print( ' Merging bins', end=' ... ' )
		# keep the original bin in 'tBin' before overwriting 'bin' with the merged bin
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
	
	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len(dfBinGroup.groups )
		print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
	
	# classify by bin
	print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	# the raw table and its grouping are no longer needed
	del(df, dfBinGroup )
	
	# decode, if necessary
	if decoding != 'N':
		ignoreAr = parentLabelAr[:2] + ['MPV']
		transition = Transitions( dfClass, ignore = ignoreAr, cent=centro )
		transitionMatrix = transition.getTransitions()
		# write this matrix to file: header line first, then the table appended
		outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
		tLabels = [ 'mother', 'MPV', 'father' ]
		transData = pd.DataFrame( transitionMatrix, index=tLabels, columns= tLabels )
		with open( outFStr, 'w' ) as f:
			f.write(info+'\n')
		transData.to_csv( outFStr, sep='\t', mode='a' )
		
		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		
		print( ' {:s} decoding {:d} samples with {:d} processors'.format(  formatDecoding(decoding), nsamples, numProc ) )
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding, centro )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# this function only adds 'cBin' in the combine branch, so tolerate its
		# absence here instead of raising KeyError
		dfOutputT = dfOutput.drop( 'cBin', axis=1, errors='ignore' )
	print( ' Writing output to', outFileStr )
	# header line first, then the table appended to the same file
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
	
	print( 'Done' )
def parseInputs( argv ):
	"""Parse command-line options and hand the run parameters to processInputs.
	
	At most the first min(10, len(argv)-1) entries of argv are examined as
	options.  Every recognized option advances startInd by one, and the input
	file is then read from argv[startInd].
	
	Recognized options:
		-o=<id>        output identifier
		-b=<size>      bin size, converted with bth_util.strToDistance
		-p=<int>       number of processors
		-c=<int>       combine-bins threshold
		-m=<label>     mother label (adds 1 to parentLabelAr[2])
		-f=<label>     father label (adds 2 to parentLabelAr[2])
		-d=<opt>       decoding: n/none/false, v/viterbi, f/fb/forwardbackward,
		               a/all, b/both
		-u / -e        class probability mode 'U' or 'E'; giving both falls
		               back to the default
		-q             disable printing
		-t=<a,b,...>   centromere coordinates (an even number of values)
		-h, --help, -help   print help and exit
	Any other argument starting with '-' prints an error and exits.
	"""
	numProc = NUMPROC
	binSize = BINSIZE
	outID = None
	parentLabelAr = ['mother', 'father', 0]
	decoding = DECODE
	classProbs = CLASSPROB
	combineBins = COMBINE
	isPrint = ISPRINT
	centromere=None
	startInd = 0
	
	for i in range( min(10, len(argv)-1) ):
		if argv[i].startswith( '-o=' ):
			outID = argv[i][3:]
			startInd += 1
		elif argv[i].startswith( '-b=' ):
			inStr = argv[i][3:]
			binSize = bth_util.strToDistance( inStr )
			if binSize == False:
				print( 'WARNING: cannot convert {:s} to bin size...using default {:s}'.format( inStr, bth_util.binSizeToStr(BINSIZE) ) )
				binSize = BINSIZE
			startInd += 1
		elif argv[i].startswith( '-p=' ):
			try:
				numProc = int( argv[i][3:] )
			except ValueError:
				print( 'WARNING: number of processors must be integer...using 1' )
				numProc = NUMPROC
			# the option was consumed even when its value was rejected; without
			# this the bad option itself would be taken as the input file
			startInd += 1
		elif argv[i].startswith( '-c=' ):
			try:
				combineBins = int( argv[i][3:] )
			except ValueError:
				# '{}' rather than '{:s}': COMBINE is used as an integer and
				# '{:s}' raises ValueError for non-string values
				print( 'WARNING: combine bins must be integer...using default {}'.format(COMBINE) )
				combineBins = COMBINE
			# consumed regardless of whether the value parsed
			startInd += 1
		elif argv[i].startswith( '-m=' ):
			parentLabelAr[0] = argv[i][3:]
			parentLabelAr[2] += 1
			startInd += 1
		elif argv[i].startswith( '-f=' ):
			parentLabelAr[1] = argv[i][3:]
			parentLabelAr[2] += 2
			startInd += 1
		elif argv[i].startswith( '-d=' ):
			opt = argv[i][3:].lower()
			if opt == 'false' or opt == 'none' or opt== 'n':
				decoding = 'N'
			elif opt == 'viterbi' or opt == 'v':
				decoding = 'V'
			elif opt == 'forwardbackward' or opt == 'f' or opt == 'fb':
				decoding = 'F'
			elif opt == 'all' or opt == 'a':
				decoding = 'A'
			elif opt == 'both' or opt == 'b':
				decoding = 'B'
			else:
				# decoding keeps whatever value it already had (DECODE unless set earlier)
				print( 'WARNING: decoding option {:s} not recognized...using default viterbi'.format(opt) )
			startInd += 1
		elif argv[i] == '-u':
			# -u and -e are mutually exclusive; a second one resets to the default
			if classProbs != CLASSPROB:
				print( 'WARNING: cannot specify uniform and epiRIL class weights...using default' )
				classProbs = CLASSPROB
			else:
				classProbs = 'U'
			startInd += 1
		elif argv[i] == '-e':
			if classProbs != CLASSPROB:
				print( 'WARNING: cannot specify uniform and epiRIL class weights...using default' )
				classProbs = CLASSPROB
			else:
				classProbs = 'E'
			startInd += 1
		elif argv[i] == '-q':
			isPrint = False
			startInd += 1
		elif argv[i].startswith( '-t=' ):
			tmp = argv[i][3:].split(',')
			tmp2 = [ bth_util.strToDistance( x ) for x in tmp ]
			# coordinates must come in pairs and all must have converted
			if len(tmp2) % 2 != 0 or (False in tmp2):
				print( 'WARNING: centromere coordinates bad...not using' )
			else:
				centromere = tmp2
			startInd += 1
		elif argv[i] in [ '-h', '--help', '-help']:
			printHelp()
			exit()
		elif argv[i].startswith( '-' ):
			print( 'ERROR: {:s} is not a valid option'.format( argv[i] ) )
			exit()
	# end for
	
	inFileStr = argv[startInd]
	# NOTE(review): ten positional arguments are passed here; confirm this matches
	# the signature of the processInputs that is in scope at run time
	processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, classProbs, combineBins, centromere, isPrint )
def processInputs( inFileStr, numProc, binSize, outID, parentLabelAr, decoding, isUniform, combineBins, maxIter ):
	"""Classify, and optionally decode, the samples of one weighted methylation file.
	
	The input table is read, positions are assigned to bins (bins are merged
	first when combineBins > 0), every bin is classified, and unless decoding
	is 'N' an initial transition matrix is computed and each sample is decoded
	by runDecoding.  The final table is written tab-separated, preceded by a
	'#from_script' header line describing the run parameters.
	
	Parameters:
		inFileStr: path of the input table; it is read with header=1 and the
			'sample' and 'pos' columns are used
		numProc: processor count forwarded to runClassification/runDecoding
		binSize: bin width; the bin index of a row is pos // binSize
		outID: output identifier forwarded to the file-name helpers
		parentLabelAr: list whose first two entries are the mother and
			father sample labels
		decoding: decoding code; 'N' skips the decoding step
		isUniform: forwarded to runClassification and recorded in the header
		combineBins: threshold forwarded to binTransformation; bins are only
			merged when it is greater than 0
		maxIter: reported as the maximum number of transition matrix
			iterations and forwarded to runDecoding
	"""
	info = '#from_script: epigenotyping_combin_smp-iter-trans.py; in_file:{:s}; bin_size:{:s}; decoding:{:s}; uni_class_prob:{:s}; combine_bins_threshold:{:d}; maximum_iterations:{:d}'.format( os.path.basename( inFileStr ), bth_util.binSizeToStr( binSize ), formatDecoding( decoding).lower().replace('and',','), str(isUniform).lower(), combineBins, maxIter )
	print( 'Weighted methylation file:', os.path.basename( inFileStr ) )
	print( 'Bin size:', bth_util.binSizeToStr( binSize ) )
	print( 'Mother label:', parentLabelAr[0] )
	print( 'Father label:', parentLabelAr[1] )
	print( 'Uniform classification probabilities:', str(isUniform) )
	print( 'Decoding algorithm:', formatDecoding( decoding ) )
	print( 'Combine bin feature threshold:', combineBins )
	print( 'Maximum transition matrix iterations:', maxIter )
	
	# build dataframe
	print( ' Reading input file', os.path.basename( inFileStr ) )
	df = pd.read_table( inFileStr, header=1 )
	
	# check parent labels
	checkParents( df['sample'], parentLabelAr )
	
	# group by bin
	df['bin'] = df.pos // binSize
	transformation = None
	
	# combine bins if necessary
	nbins = max(df['bin'])+1
	if combineBins > 0:
		print( ' Merging bins', end=' ... ' )
		# keep the original bin in 'tBin' before overwriting 'bin' with the merged bin
		df['tBin'] = df['bin']
		transformation = binTransformation( df, combineBins )
		# apply the transformation
		df['bin'] = df['tBin'].apply( lambda x: transformation[x] )
	
	dfBinGroup = df.groupby( 'bin' )
	if combineBins > 0:
		newNBins = len(dfBinGroup.groups )
		print( 'combined {:d} non-functional bins'.format( nbins - newNBins ) )
	
	# classify by bin
	print( ' Classifying {:d} bins with {:d} processors'.format( nbins, numProc ) )
	dfClass = runClassification( dfBinGroup, numProc, parentLabelAr, isUniform )
	dfClass.reset_index(inplace=True)
	# the raw table and its grouping are no longer needed
	del(df, dfBinGroup )
	
	# decode, if necessary
	if decoding != 'N':
		ignoreAr = parentLabelAr[:2] + ['MPV']
		print( ' Obtaining initial transitions' )
		transition = Transitions( dfClass, ignore = ignoreAr )
		transitionMatrix = transition.getTransitions()
		# only the header line is written to the transition file here; the path
		# is handed to runDecoding (presumably it appends the matrices -- confirm)
		outFStr = determineTransFileName(inFileStr, outID, binSize, combineBins )
		with open( outFStr, 'w' ) as f:
			f.write(info+'\n')
		
		# group by sample
		dfSampleGroup = dfClass.groupby( 'sample' )
		nsamples = len( dfSampleGroup.groups )
		
		print( ' {:s} decoding and optimizing transition matrices for {:d} samples with {:d} processors'.format(  formatDecoding(decoding), nsamples, numProc ) )
		## note: decoding will now include improved transition matrix calculations
		dfOutput = runDecoding( dfSampleGroup, numProc, transitionMatrix, decoding, maxIter, outFStr )
		dfOutput.set_index( ['bin', 'sample'], inplace=True )
		del( dfSampleGroup )
	else:
		dfOutput = dfClass
	
	# write output
	outFileStr = determineOutputFileName( inFileStr, outID, binSize, decoding, isUniform, combineBins )
	# if combination, undo transformation by applying the predictions to additional bins
	if combineBins > 0:
		dfOutput.reset_index(inplace=True)
		dfOutput['cBin'] = dfOutput['bin']
		dfOutputT = undoBinTransformation( dfOutput, transformation )
	else:
		# this function only adds 'cBin' in the combine branch, so tolerate its
		# absence here instead of raising KeyError
		dfOutputT = dfOutput.drop( 'cBin', axis=1, errors='ignore' )
	print( ' Writing output to', outFileStr )
	# header line first, then the table appended to the same file
	with open( outFileStr, 'w' ) as f:
		f.write(info+'\n')
	dfOutputT.to_csv( outFileStr, sep='\t', mode='a' )
	
	print( 'Done' )