Example #1
0
# NOTE(review): this top-of-file section appears to be a truncated fragment of a
# script-level "main" block (it cuts off mid `elif` chain just before the
# `predictIterative` definition below). Names used here — `time`, `sDir`, `mp`,
# `newVerbose`, `maxClusters`, `useCfier` — are presumably defined in the
# missing surrounding context; confirm against the full file.

# SVM params
svmMaxIter = 500  # max iterations for an SVM fit (not used in the visible code)
svmKernel = 'rbf'  # linear, poly, rbf, sigmoid

# text delimiter in output files
textDelim = '\t'

####### ####### ####### #######

####### ####### ####### #######
# BEGIN MAIN FUNCTION

# record the start time so total runtime can be reported later
tstart = time.time()
print("\nPerforming regression(s) on {}".format(sDir))

# propagate the verbosity setting into the mp helper module
mp.setParamVerbose(newVerbose)

# 0) Create the useLabel variable
# string: label for the output files
# ie: ClusVote_c<Las/Enet/Log><P for Pos>_f<P for pathsim><N for neighborhood>
# encode the cluster cap in the label when one is set
if maxClusters > 0:
    useLabel = 'Clus{}Vote'.format(maxClusters)
else:
    useLabel = 'ClusVote'
#end if
# append a classifier code: Las = Lasso, Enet = ElasticNet
useLabel = useLabel + '_c'
if useCfier == 1:
    useLabel = useLabel + 'Las'
elif useCfier == 2:
    useLabel = useLabel + 'Enet'
# NOTE(review): the fragment is cut off here — the body of the next branch
# (and the rest of this main section) is missing from this chunk.
elif useCfier == 3:
def predictIterative(printFlag) :

	if printFlag :
		print("\nPerforming regression(s) on {}".format(sDir))

	mp.setParamVerbose(verbose)


	# 0) Create the useLabel variable
	# string: label for the output files
	# ie: ClusVote_c<Las/Enet/Log/SVM><P for Pos>_f<P for pathsim><Z for z-score>
	#	<T for term weights><N for neighborhood>
	useLabel = 'Iter{}V{}_c'.format(numIterations, numVotes)
	if useCfier == 1 :
		useLabel = useLabel + 'Las'
	elif useCfier == 2 :
		useLabel = useLabel + 'Enet'
	else :
		print("ERROR: useCfier value is unrecognized: {}".format(useCfier))
	#end if
	if usePos :
		useLabel = useLabel + 'Pos'
	useLabel = useLabel + '_f'
	if useFeatPathSim :
		useLabel = useLabel + 'P'
		if limitMPLen :
			for item in limitMPLen :
				useLabel = useLabel + '{}'.format(item)
	if useFeatPathZScore :
		useLabel = useLabel + 'Z'
	if useFeatTermWeights :
		useLabel = useLabel + 'T'
	if useFeatNeighbor :
		useLabel = useLabel + 'N'
	#end if
	useLabel = useLabel + '_m{}'.format(negMultiplier)
	if retryOnZeroCoeffs :
		useLabel = useLabel + '_wRS2' # indicating resample on 0 score
	#end if
	if alwaysNewAlpha :
		useLabel = useLabel + '_aA'

	if printFlag :
		print("Using label: {}".format(useLabel))


	# 1) Load the gene-index dictionary & path names
	geneDict, pathDict = mp.getGeneAndPathDict(sDir)
	geneNames = list(geneDict.keys())
	geneNames.sort()
	pathNames = mp.removeInvertedPaths(pathDict)
	del pathDict

	# limit the metapaths by length
	#	part 1: get the desired indices
	idx = -1
	featPSIdx = list()
	for name in pathNames :
		idx += 1
		pLen = name.count('-') + 1
		if pLen in limitMPLen :
			featPSIdx.append( int(idx) )
	#end loop
	if printFlag :
		print("Limiting metapaths to {}, of length(s) {}".format(
			len(featPSIdx), limitMPLen))


	# 2) Load the network general features
	numFN = 0
	if useFeatNeighbor :
		featNbVals, featNbNames = mp.getFeaturesNeighborhood(sDir, 'LogScale')
		featNbNames = np.ravel(featNbNames)
		numFN = len(featNbNames)
	#end if
	numTW = 0
	if useFeatTermWeights :
		featTWVals, featTWNames = mp.getFeaturesTerms(sDir, 'Orig')
		featTWNames = np.ravel(featTWNames)
		numTW = len(featTWNames)
	#end if


	# 3) Loop over the list of the sample subdirectories
	dSubDirs = mp.getSubDirectoryList(sDir)

	thisRound = 0
	#for si in dSubDirs[0:1] :
	for si in dSubDirs :
		thisRound += 1

		# Display directory to examine
		sv = si.split('/')
		if printFlag :
			print("\n{}/{}/".format(sv[-3],sv[-2]))

		# Create index lists for Known, Hidden, Unknown, TrueNeg from files
		giKnown, giUnknown, giHidden, giTrueNeg = getGeneIndexLists(si, geneDict)


		# 4) Load the sample-specific features
		# PathSim features
		numFP = 0
		if useFeatPathSim :
			featPSVals = np.loadtxt(si + fSimilarity)
			# NOTE: previous version of mpPredict04a had extra zeros at end of vectors;
			#	discarding those columns
			featPSVals = featPSVals[:,0:len(pathNames)]
			featPSNames = pathNames
			numFP = len(featPSNames)

			# limit the metapaths by length
			#	part 2: keep only the desired columns
			if limitMPLen :
				featPSVals = featPSVals[:,featPSIdx]
				newFeatPSNames = list()
				for idx in featPSIdx :
					newFeatPSNames.append(featPSNames[idx])
				featPSNames = newFeatPSNames
				numFP = len(featPSNames)
		#end if

		# z-score of path counts features
		if useFeatPathZScore :
			featZSVals = np.loadtxt(si + fZScoreSim)
			featZSVals = featZSVals[:,0:len(pathNames)]
			featZSNames = pathNames
			numFP = len(featZSNames)
		#end if


		# 5) Combine the features as specified by parameters (useFeat...)
		features = np.zeros( (len(geneDict), 0), dtype=np.float32)
		featNames = list()

		if useFeatPathSim :
			if printFlag :
				print("    ... including PathSim sum features")
			features = np.hstack( (features, featPSVals) )
			featNames.extend(featPSNames)
		if useFeatPathZScore :
			if printFlag :
				print("    ... including path z-score features")
			features = np.hstack( (features, featZSVals) )
			featNames.extend(featZSNames)
		if useFeatNeighbor :
			if printFlag :
				print("    ... including neighborhood features")
			features = np.hstack( (features, featNbVals) )
			featNames.extend(np.ravel(featNbNames))
		if useFeatTermWeights :
			# Remove terms with no connection to gene set
			sumFTV = np.sum(featTWVals[giKnown,:], axis=0)
			keepIdx = np.nonzero(sumFTV)
			numTW = len(keepIdx[0])
			if printFlag :
				print("    ... including term membership features")
			features = np.hstack( (features, featTWVals[:,keepIdx[0]]) )
			featNames.extend(np.ravel(featTWNames[keepIdx]))
		# verify some features have been loaded
		if features.shape[1] == 0 :
			print("ERROR: No features were specified for classification.")
			sys.exit 
		#end if

		# Normalize the feature values
		features = mp.normalizeFeatureColumns(features)



		# Peform N recursive iterations, each voted across K random samples
		
		# Create the structure to rank the Unknown genes & paths
		geneRanks = np.zeros( (len(geneDict), 1), dtype=np.int32 )
		geneScores = np.zeros( (len(geneDict), numIterations), dtype=np.float32 )

#TODO: How to save feature rankings ??

		# set the gene indices for the first iteration
		iterKnown = giKnown
		iterUnknown = giUnknown
		iterAll = list()
		iterAll.extend(iterKnown)
		iterAll.extend(iterUnknown)
		iterAll.sort()

		for itr in range(numIterations) :

			# store the results for each random sample
#			iterNumGenes = len(iterKnown) + len(iterUnknown)
			iterNumGenes = len(iterAll)
			voteScores = np.zeros( (iterNumGenes, numVotes), dtype=np.float32)
#			voteScores = np.zeros( (len(geneDict), numVotes), dtype=np.float32)

			if printFlag :
				print("  iteration {} of {}; {} votes; cfier {}".format( (itr + 1),
					numIterations, numVotes, useCfier))
				print("  known: {}, total: {}, trainSet: {}".format( len(iterKnown),
					iterNumGenes, (len(iterKnown) * (1 + negMultiplier)) ))
			#end if


			# 6) Prepare the test/train vectors & labels
			# Extract the vectors for the pos sets
#			posTrain = features[iterKnown,:]
#			posTrainLabel = np.ones( (len(iterKnown), 1) ) * pLabel

			retryNewAlpha = True
			retrySubSample = False
			retries = 0
#			numSubSample = len(iterKnown)
			vote = 0
			while vote < numVotes :
#			for vote in range(numVotes) :
#				print("--  vote: {}, retry: {} ...  --".format(vote+1, retrySubSample))

				if retrySubSample :
					#print("--  Creating new sub-sample ...  --")
					retrySubSample = False

					numSubSample = int(numSubSample * retrySubPortion) + 1
					retryIterKnown = random.sample(iterKnown, numSubSample)
					if len(retryIterKnown) < retryMinValid :
						retryIterKnown = random.sample(iterKnown, retryMinValid)
	
					posTrain = features[retryIterKnown,:]
					posTrainLabel = np.ones( (len(retryIterKnown), 1) ) * pLabel
	
					nExamples = min( negMultiplier * len(retryIterKnown), (iterNumGenes - len(retryIterKnown)))
				else :
					numSubSample = len(iterKnown)
		
					posTrain = features[iterKnown,:]
					posTrainLabel = np.ones( (len(iterKnown), 1) ) * pLabel

					#print("--  Using full sample ...  --")
					nExamples = min( negMultiplier * len(iterKnown), (iterNumGenes - len(iterKnown)) )
				#end if

				# Extract the vectors for neg sets
				# as one-class: train with rand samp from Unknown
				#		test with all Unknown (TrueNeg + Hidden/TruePos)
				giTrainNeg = random.sample(iterUnknown, nExamples)
				negTrain = features[giTrainNeg,:]
				negTrainLabel = np.ones( (len(giTrainNeg), 1) ) * nLabel

				# Combine to create the full train & test data sets
				# as one-class:
				trainSet = np.vstack( (posTrain, negTrain) )
				trainLabel = np.vstack( (posTrainLabel, negTrainLabel) )

#				testSet = features[iterUnknown,:]
				testSet = features[iterAll,:]

				# Some versions want the labels reshaped
				trainLabel = np.reshape(trainLabel, [trainLabel.shape[0],])


				# 7) Train classifier, predict on test, collect scores

#TODO: add other classifier options ??
				if alwaysNewAlpha :
					retryNewAlpha = True
				if useCfier == 1 :	# 1 = Lasso
					if retryNewAlpha :
						cfier = lm.LassoCV(alphas=useGivenRange, positive=usePos,
							max_iter=lMaxIter, normalize=lNorm, fit_intercept=lFitIcpt)
						cfier.fit(trainSet, trainLabel)
						foundAlpha = cfier.alpha_
						retryNewAlpha = False
					else :
						cfier = lm.Lasso(alpha=foundAlpha, max_iter=lMaxIter, normalize=lNorm,
							positive=usePos, fit_intercept=lFitIcpt)
						cfier.fit(trainSet, trainLabel)
					#end if
				elif useCfier == 2 :	# 2 = ElasticNet
					cfier = lm.ElasticNetCV(l1_ratio=enRatios, positive=usePos, fit_intercept=enFitIncept,
						n_alphas=enNAlphas, normalize=enNorm, copy_X=enCopy, max_iter=enMaxIter)
					cfier.fit(trainSet, trainLabel)
					foundAlpha = cfier.alpha_
				else :
					print("ERROR: specified classifier unrecognized: useCfier = {}".format(useCfier))
				#end if


#TODO: Decide on verbose vs printFlag
				if verbose :
					# view quick statistics from this training session
					if useCfier < 3 :
						if printFlag :
							print("    Vote {}-{}; iters {:3d}, alpha {:.5f}, score {:.3f}; coeffs {}; sample {}".format(
								(itr + 1), (vote + 1), cfier.n_iter_, foundAlpha, cfier.score(trainSet, trainLabel),
								len(np.nonzero(cfier.coef_)[0]), len(posTrainLabel) ))
							if useCfier == 2 :	# 2 = ElasticNet
								print("    l1 ratio: {}".format( cfier.l1_ratio_ ))
				#end if

#TODO: What if 0 coefficients are chosen? Select new sample? Let it slide (affect avg?)
				if useCfier < 3 :
					cfPredLabel = cfier.predict(testSet)
				#end if
				cfPredLabel = np.ravel(cfPredLabel)

				# If no coeffs (train score == 0) try again
				if retryOnZeroCoeffs :
#					print("--  coeffs: {}".format(len(np.nonzero(cfier.coef_)[0])))
					if len(np.nonzero(cfier.coef_)[0]) <= 0 :
						if retries < (numVotes * 5) :
#							if printFlag :
#								print("--  No coefficients, re-sampling  --")
							retryNewAlpha = True
							retrySubSample = True
							vote = vote - 1
							retries += 1
#						else :
#							if printFlag :
#								print("--  Used all {} retries".format(retries))
					else :
						numSubSample = len(iterKnown)
				#end if

				voteScores[:,vote] = cfPredLabel

				vote += 1
			#end loop (vote)

#TODO: think about this...
			# # In case an iteration is no longer useful (all scores == 0)
			# if (vote > 0) and (cfier.score(trainSet, trainLabel) <= 0) :
			# 	break


			# 8) Place the scores into the array and store across iterations

			# first, average across the random negative samples (votes)
#TODO: really, I should either normalize the score or vote across rank
#	Does the value of the intersection matter here?
			voteScores = mp.normalizeFeatureColumns(voteScores)
			voteAvgScore = np.mean(voteScores, axis=1)

			# then, place into full gene score array
			#	NOTE: carry the scores forward from each iteration
#			for u in range(len(iterUnknown)) :
#				geneScores[iterUnknown[u],iter] = voteAvgScore[u]
			for g in range(len(iterAll)) :
				geneScores[iterAll[g],itr] = voteAvgScore[g]
			for i in range(itr + 1, numIterations) :
				geneScores[:,i] = geneScores[:,itr]
			#end loop


			# 9) Select Known & Unknown for the next round
#TODO: Base this on mis-labelled Known genes
#	for now, just take a percentage of least-confident scores

			# find the cutoff value for scores to keep
#			idxKeep = len(iterAll) - int(len(iterAll) / float(numIterations))
			cutoffIdx = iterNumGenes - int(iterNumGenes / float(numIterations))
			absScore = np.absolute(voteAvgScore)
			absScore.sort()
			cutoffVal = absScore[cutoffIdx]

			# get the upper & lower indices to extract for next round
			# x = 0
			# testVal = voteAvgScore
			# while testVal >= cutoffVal :

			# extract indices for any genes scoring less than cutoff
			iterKeep = list()
			for x in range(len(iterAll)) :
				if abs(voteAvgScore[x]) < cutoffVal :
					iterKeep.append(iterAll[x])
			#end loop

			# find intersections of Keep w/ previous Known & Unknown
			setKeep = set(iterKeep)
			newKnown = [gi for gi in iterKnown if gi in setKeep]
#			newKnown.sort()
			newUnknown = [gi for gi in iterUnknown if gi in setKeep]
#			newUnknown.sort()

			# set the gene indices for the next iteration
			iterKnown = newKnown
			iterUnknown = newUnknown
			iterAll = list()
			iterAll.extend(iterKnown)
			iterAll.extend(iterUnknown)
			iterAll.sort()

			numKnown = len(iterKnown)
			numUnknown = len(iterUnknown)
			if (numKnown <= numExitKnown) or (numUnknown <= numExitUnknown) :
				if printFlag :
					print("known: {}, unknown: {}; exiting loop".format(numKnown, numUnknown))
				break
		#end loop (itr)

		# 10) Rank the genes across the iterations
#TODO: should I average these, or just take the last column ?
#	test that option later
		useScore = np.mean(geneScores[giUnknown,0:(itr+1)], axis=1)

		ranker = np.recarray(len(giUnknown),
			dtype=[('inverse', 'f4'), ('score', 'f4'), ('geneIdx', 'i4')])

		ranker['score'] = useScore
		ranker['inverse'] = np.multiply(useScore, -1)
		ranker['geneIdx'] = giUnknown

		ranker.sort(order=['inverse', 'geneIdx'])
	


		# 11) Output the ranked genes to file

		# write the file
		fname = 'ranked_genes-' + useLabel + '_Avg.txt'
		if printFlag :
			print("  Saving ranked genes to file {}".format(fname))
		with open(si+fname, 'w') as fout :
			firstRow = True
			for row in range(len(ranker)) :
				if not firstRow :
					fout.write('\n')
				fout.write('{:3.3f}{}{}'.format(ranker['score'][row],
					textDelim, geneNames[ranker['geneIdx'][row]]))
				firstRow = False
		#end with


		# 10-b) Rank the genes across the iterations
#TODO: should I average these, or just take the last column ?
#	test that option later
		useScore = geneScores[giUnknown,itr]

		ranker = np.recarray(len(giUnknown),
			dtype=[('inverse', 'f4'), ('score', 'f4'), ('geneIdx', 'i4')])
		ranker['score'] = useScore
		ranker['inverse'] = np.multiply(useScore, -1)
		ranker['geneIdx'] = giUnknown
		ranker.sort(order=['inverse', 'geneIdx'])
	
		# 11-b) Output the ranked genes to file
		# write the file
		fname = 'ranked_genes-' + useLabel + '_Last.txt'
		if printFlag :
			print("  Saving ranked genes to file {}".format(fname))
		with open(si+fname, 'w') as fout :
			firstRow = True
			for row in range(len(ranker)) :
				if not firstRow :
					fout.write('\n')
				fout.write('{:3.3f}{}{}'.format(ranker['score'][row],
					textDelim, geneNames[ranker['geneIdx'][row]]))
				firstRow = False
		#end with

		# 10-c) Rank the genes across the iterations
#TODO: should I average these, or just take the last column ?
#	test that option later
		useScore = geneScores[giUnknown,0]
		ranker = np.recarray(len(giUnknown),
			dtype=[('inverse', 'f4'), ('score', 'f4'), ('geneIdx', 'i4')])
		ranker['score'] = useScore
		ranker['inverse'] = np.multiply(useScore, -1)
		ranker['geneIdx'] = giUnknown
		ranker.sort(order=['inverse', 'geneIdx'])
			# 11-b) Output the ranked genes to file
		# write the file
		fname = 'ranked_genes-' + useLabel + '_First.txt'
		if printFlag :
			print("  Saving ranked genes to file {}".format(fname))
		with open(si+fname, 'w') as fout :
			firstRow = True
			for row in range(len(ranker)) :
				if not firstRow :
					fout.write('\n')
				fout.write('{:3.3f}{}{}'.format(ranker['score'][row],
					textDelim, geneNames[ranker['geneIdx'][row]]))
				firstRow = False
		#end with


		# 12) Output the selected feature info to file
#TODO: this


		# 13) Output the parameters to file
#TODO: this
		fname = 'parameters-' + useLabel + '.txt'
		with open(si+fname, 'w') as fout :
			fout.write('\n')
			fout.write('Sampling Method for Neg examples\n')
			fout.write('  as One-Class w/ iterations on the weaker predictions\n')
			fout.write('\n')
			fout.write('Features Used\n')
			fout.write('PathSim sum:{}{}\n'.format(textDelim, useFeatPathSim))
			fout.write('path Z-Score:{}{}\n'.format(textDelim, useFeatPathZScore))
			fout.write('Neighborhood:{}{}\n'.format(textDelim, useFeatNeighbor))
			fout.write('Term Weights:{}{}\n'.format(textDelim, useFeatTermWeights))
			fout.write('\n')

	#TODO: collect some stats (ie: common alphas, l1 ratios, etc)
			fout.write('Classifier Parameters\n')
			if useCfier == 1 :
				fout.write('method:{}Lasso\n'.format(textDelim))
			elif useCfier == 2: 
				fout.write('method:{}ElasticNet\n'.format(textDelim))
			fout.write('positive:{}{}\n'.format(textDelim, usePos))
			# fout.write('alpha range:{}{}\n'.format(textDelim, useGivenRange))
			# fout.write('alpha chosen:{}{}\n'.format(textDelim, cfier.alpha_))
			fout.write('max_iter:{}{}\n'.format(textDelim, lMaxIter))
			fout.write('normalize:{}{}\n'.format(textDelim, lNorm))
			fout.write('fit_intercept:{}{}\n'.format(textDelim, lFitIcpt))
			fout.write('\n')
		#end with

		if printFlag :
			print("--{} of {}".format(thisRound, len(dSubDirs)))