def transformeInIncrementalDict(userDict, nseed, n=-1, proportional=-1, groupsToUse=None, values=None):
    """Build per-partition feature dicts for each selected user.

    For every user kept after shuffling/sub-sampling, the user's queries are
    cut at each percentage in *values* and ``toDict`` is called on that
    prefix; the resulting dicts are grouped by partition index.

    Parameters:
        userDict -- mapping userId -> user object; each user must expose
                    numberOfQueries, toDict(idx, groups) and label
        nseed -- seed forwarded to shuffleIndices
        n -- keep only users whose shuffled index is < n (ignored if < 0)
        proportional -- if > 0, overrides n with this percentage of the base
        groupsToUse -- feature groups forwarded to user.toDict()
        values -- percentages of each user's queries per partition
                  (defaults to [10, 20, 50, 100])

    Returns (mapOfDicts, listOfLabels): partition index -> list of per-user
    feature dicts, plus one label per kept user.
    """
    # FIX: avoid the shared mutable default argument; the effective default
    # is unchanged.
    if values is None:
        values = [10, 20, 50, 100]

    listOfLabels = list()
    mapOfDicts = defaultdict(list)

    p = shuffleIndices(len(userDict), nseed)
    if proportional > 0:
        n = int(int(proportional) / 100.0 * len(userDict))

    for pos, (userId, userc) in zip(p, userDict.iteritems()):
        # Sub-sampling: skip users whose shuffled position falls outside n.
        if n >= 0 and pos >= n:
            continue
        nq = userc.numberOfQueries - 1
        # FIX: the original inner loop reused the outer loop variable "v";
        # enumerate also replaces zip(range(len(values)), values).
        for i, pct in enumerate(values):
            # Index of the last query included in this partition.
            idxq = int(nq * (pct / 100.0))
            mapOfDicts[i].append(userc.toDict(idxq, groupsToUse))
        listOfLabels.append(userc.label)

    # Returning a map of lists of per-user dicts and the list of labels
    return mapOfDicts, listOfLabels
def transformeInDict(exampleVector, nseed, n=-1, proportional=-1):
    """Convert a sequence of (key, example) pairs into feature dicts + labels.

    NOTE(review): a later definition in this file reuses the name
    ``transformeInDict`` and shadows this one at import time; callers that
    pass a list of pairs will actually reach the later, dict-based version —
    confirm which one is intended.

    Parameters:
        exampleVector -- sequence of (key, example) pairs; each example must
                         expose toDict() and label
        nseed -- seed forwarded to shuffleIndices
        n -- keep only examples whose shuffled index is < n (ignored if < 0)
        proportional -- if > 0, overrides n with this percentage of the base

    Returns (listOfDicts, listOfLabels).
    """
    featureDicts = []
    labels = []
    shuffled = shuffleIndices(len(exampleVector), nseed)
    if proportional > 0:
        n = int(int(proportional) / 100.0 * len(exampleVector))
    for position, (_, example) in zip(shuffled, exampleVector):
        # Sub-sampling: drop examples whose shuffled position is beyond n.
        if n >= 0 and position >= n:
            continue
        featureDicts.append(example.toDict())
        labels.append(example.label)
    # Check how these features relate to the ones the random tree method finds
    return featureDicts, labels
def transformeInDict(userDict, nseed, n=-1, proportional=-1, groupsToUse=None):
    """Turn a userId -> user mapping into feature dicts and labels.

    Each kept user contributes ``user.toDict(user.numberOfQueries - 1,
    groupsToUse)`` — its features over all of its queries — plus its label.

    Parameters:
        userDict -- mapping userId -> user object (must expose
                    numberOfQueries, toDict(idx, groups) and label)
        nseed -- seed forwarded to shuffleIndices
        n -- keep only users whose shuffled index is < n (ignored if < 0)
        proportional -- if > 0, overrides n with this percentage of the base
        groupsToUse -- feature groups forwarded to user.toDict()

    Returns (listOfDicts, listOfLabels).
    """
    featureDicts = []
    labels = []
    shuffled = shuffleIndices(len(userDict), nseed)
    if proportional > 0:
        n = int(int(proportional) / 100.0 * len(userDict))
    for position, (_, user) in zip(shuffled, userDict.iteritems()):
        # Sub-sampling: drop users whose shuffled position is beyond n.
        if n >= 0 and position >= n:
            continue
        featureDicts.append(user.toDict(user.numberOfQueries - 1, groupsToUse))
        labels.append(user.label)
    # Check how these features relate to the ones the random tree method finds
    return featureDicts, labels
def runClassify(preProcessingMethod, forceBalance, proportional, nseed, explanation, gridSearch, generatePickle, hasPlotLibs, paralled, nJobs, listOfClassifiers, outfileName, nCV, measureProbas): positiveOutputFile = "positive-%s.pk" % (explanation) validationPosOutputFile = "positive-validation.pk" negativeOutputFile = "negative-%s.pk" % (explanation) validationNegOutputFile = "negative-validation.pk" testOutputFile = "test-%s.pk" % (explanation) logging.info("Using seed: %d", nseed) logging.info("Loading: %s and %s", positiveOutputFile, negativeOutputFile) logging.info("Processing method used: %s", preProcessingMethod) if forceBalance > 0: logging.warning("Forcing only %s examples for each dataset",forceBalance) if proportional > 0: logging.warning("Using proportional representation. %s percente of the base.",proportional) if forceBalance > 0 and proportional > 0: logging.error("ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!") print "ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!" 
exit(0) #### ### Load Datasets ## # logging.info("Loading the datasets...") with open(negativeOutputFile, 'rb') as input: negativeFV = pickle.load(input) with open(validationNegOutputFile, 'rb') as input: validationNegFV = pickle.load(input) with open(positiveOutputFile, 'rb') as input: positiveFV = pickle.load(input) with open(validationPosOutputFile, 'rb') as input: validationPosFV = pickle.load(input) with open(testOutputFile, 'rb') as input: testFV = pickle.load(input) logging.info("Loaded") testFV = sorted(testFV.iteritems(), key=lambda k: int(k[0])) logging.info("Transforming datasets into Dictionaries...") ld1, ll1 = transformeInDict(sorted(negativeFV.iteritems()), nseed, forceBalance, proportional) ld2, ll2 = transformeInDict(sorted(positiveFV.iteritems()), nseed, forceBalance, proportional) ldTest, llTest = transformeInDict(testFV, nseed, forceBalance, proportional) valldNeg, valllNeg = transformeInDict(sorted(validationNegFV.iteritems()), nseed, forceBalance, proportional) valldPos, valllPos = transformeInDict(sorted(validationPosFV.iteritems()), nseed, forceBalance, proportional) valY = np.array( valllNeg + valllPos) valDicts = valldNeg + valldPos logging.info("Transformed") listOfDicts = ld1 + ld2 listOfLabels = ll1 + ll2 y = np.array( listOfLabels ) greatestClass = 0 if len(ll1) > len(ll2) else 1 y_greatest = np.array((len(ll1) + len(ll2)) * [greatestClass] ) logging.info("Using %d positive examples -- class %s" % (len(ll1), ll1[0])) logging.info("Using %d negative examples -- class %s" % (len(ll2), ll2[0])) baselines = calculateBaselines(y, y_greatest) logging.info("Vectorizing dictionaries...") vec, X_noProcess = vectorizeData(listOfDicts) if X_noProcess != []: logging.info("Feature Names: %s", vec.get_feature_names()) logging.info("Vectorized") logging.info("Preprocessing data") X = preprocessing(X_noProcess, preProcessingMethod) #print "X_noProcess ----> ", X_noProcess #print "X ---> ", X logging.info("Data preprocessed") #Prepare Test data: Xtest 
= vec.transform(ldTest).toarray() Xtest = preprocessing(Xtest, preProcessingMethod) valX = vec.transform(valDicts).toarray() valX = preprocessing(valX, preProcessingMethod) #### ### Shuffer samples (TODO: Cross-validation) ## # logging.info("Shuffling the data...") n_samples = len(y) newIndices = shuffleIndices(n_samples, nseed) X = X[newIndices] y = y[newIndices] n_samples_val = len(valY) newIndices = shuffleIndices(n_samples_val, nseed) valX = valX[newIndices] valY = valY[newIndices] logging.debug("X - %s", X) # Shuffle samples logging.info("Shuffled") #### ### Run classifiers ## # precRecall, roc = {}, {} results = [] logging.info("Running classifiers...") if "dmfc" in listOfClassifiers: dmfc = DummyClassifier(strategy='most_frequent') results.append(classify(dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, Xtest)) # ================================================================ if "nbc" in listOfClassifiers or "nb" in listOfClassifiers: nbc = GaussianNB() results.append(classify(nbc, "Naive Bayes", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, Xtest)) # ================================================================ if "knnc" in listOfClassifiers or "knn" in listOfClassifiers: knnc = KNeighborsClassifier(n_neighbors=classifyParameters["KNN-K"]) results.append(classify(knnc, "KNN", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridKNN, "measureProbas":measureProbas}, Xtest)) # ================================================================ if "lrc" in listOfClassifiers or "lgr" in listOfClassifiers or "lr" in listOfClassifiers: lrc = LogisticRegression(C=classifyParameters["LR-C"]) results.append(classify(lrc, "Logistic Regression", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridLR, "measureProbas":measureProbas}, Xtest, valX, valY)) # ================================================================ if "dtc" in listOfClassifiers: dtc = 
DecisionTreeClassifier( criterion=classifyParameters["DT-criterion"], max_features=classifyParameters["DT-max_features"] ) results.append(classify(dtc, "Decision Tree", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridDT, "measureProbas":measureProbas}, Xtest)) # ================================================================ if "svmc" in listOfClassifiers or "svm" in listOfClassifiers: #if SVMKernel == "linear": # svmc = LinearSVC(C=classifyParameters["SVM-C"], class_weight=classifyParameters["SVM-class_weight"]) #else: # svmc = SVC(kernel=classifyParameters["SVM-kernel"], cache_size=classifyParameters["SVM-cacheSize"], C=classifyParameters["SVM-C"], max_iter=classifyParameters["SVM-maxIter"], probability=measureProbas, gamma=classifyParameters["SVM-gamma"], class_weight=classifyParameters["SVM-class_weight"]) #results.append(classify(svmc, "SVM", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridSVM, "measureProbas":measureProbas}, Xtest)) pass # ================================================================ if "etc" in listOfClassifiers: etc = ExtraTreesClassifier(random_state=0, n_jobs=nJobs, n_estimators=classifyParameters["ETC-n_estimators"], criterion=classifyParameters["ETC-criterion"], max_features=classifyParameters["ETC-max_features"]) results.append(classify(etc, "Random Forest", X, y, nCV, nJobs, baselines, {"tryToMeasureFeatureImportance":measureProbas, "featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridETC, "measureProbas":measureProbas}, Xtest, valX, valY)) # ================================================================ if "sgd" in listOfClassifiers: sgd = SGDClassifier(n_jobs=nJobs) results.append(classify(sgd, "SGD", X, y, nCV, nJobs, baselines, {"featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridSGD, 
"measureProbas":measureProbas}, Xtest, valX, valY)) # ================================================================ if "gbc" in listOfClassifiers: gbc = GradientBoostingClassifier(n_estimators=300,subsample=0.6,max_depth=4,random_state=nseed) results.append(classify(gbc, "GBC", X, y, nCV, nJobs, baselines, {"featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridSGD, "measureProbas":measureProbas}, Xtest, valX, valY)) # ================================================================ precRecall, roc = getCurves(results) roc["Random Classifier"] = ([0,1],[0,1]) plotGraph(precRecall, fileName=PRECRECALLNAME, xlabel="Recall", ylabel="Precision", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs) plotGraph(roc, fileName=ROCNAME, xlabel="False Positive Rate", ylabel="True Positive Rate", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs) fo = open(outfileName, "a") listProbas = [] for r in results: clfName = r[0] resultMetrics = r[1] fo.write("%s, %.3f, %.3f, %.3f, %.3f\n" % (clfName, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1)) print "%s, %.3f, %.3f, %.3f, %.3f" % (clfName, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1) yTraining = r[4] yTrainingProbas = r[5] yTest = r[6] yTestProbas = r[7] writeOutput(clfName + ".csv", yTest) listProbas.append(yTestProbas) #for t,p in zip(yTest, yTestProbas): # print t, p mergedYTest = voting(listProbas) writeOutput("merged.csv", mergedYTest) fo.close() logging.info("Done")
def runClassify(preProcessingMethod, forceBalance, proportional, minNumberOfQueries, nseed, explanation, healthUsers, gridSearch, generatePickle, hasPlotLibs, paralled, nJobs, listOfClassifiers, groupsToUse, usingIncremental, outfileName, nCV, measureProbas, incrementalVector):
    """Load the pickled user feature vectors, vectorize and preprocess them,
    run the selected classifiers (optionally in incremental-partition mode or
    in parallel via scoop) and append the metrics to *outfileName*.

    NOTE(review): this definition reuses the name ``runClassify`` and shadows
    the earlier version that has a different signature -- confirm which one
    callers expect.

    Parameters:
        preProcessingMethod -- name forwarded to preprocessing()
        forceBalance -- if > 0, cap each dataset at this many examples
        proportional -- if > 0, use this percentage of each dataset instead
        minNumberOfQueries, explanation -- used to build the pickle names
        healthUsers -- selects the health/notHealth pickles instead of the
                       regular/medical ones
        usingIncremental -- per-partition mode driven by incrementalVector
        incrementalVector -- percentages of each user's queries per partition
        remaining arguments -- forwarded to the classifiers / plotting
    """
    # Pick the pickle file names for the chosen user split.
    if healthUsers:
        positiveOutputFile = "healthUser-%d-%s.pk" % (minNumberOfQueries, explanation)
        negativeOutputFile = "notHealthUser-%d-%s.pk" % (minNumberOfQueries, explanation)
    else:
        negativeOutputFile = "regularUser-%d-%s.pk" % (minNumberOfQueries, explanation)
        positiveOutputFile = "medicalUser-%d-%s.pk" % (minNumberOfQueries, explanation)

    logging.info("Using seed: %d", nseed)
    logging.info("Loading: %s and %s", positiveOutputFile, negativeOutputFile)
    logging.info("Processing method used: %s", preProcessingMethod)

    if forceBalance > 0:
        logging.warning("Forcing only %s examples for each dataset",forceBalance)
    if proportional > 0:
        logging.warning("Using proportional representation. %s percente of the base.",proportional)
    # forceBalance and proportional are mutually exclusive sampling modes.
    if forceBalance > 0 and proportional > 0:
        logging.error("ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!")
        print "ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!"
        # NOTE(review): exits with status 0 on an error path -- verify.
        exit(0)

    ####
    ### Load Datasets
    ##
    #
    logging.info("Loading the datasets...")
    with open(negativeOutputFile, 'rb') as input:
        negativeUserFV = pickle.load(input)
    with open(positiveOutputFile, 'rb') as input:
        positiveUserFV = pickle.load(input)
    logging.info("Loaded")

    logging.info("Transforming datasets into Dictionaries...")
    if usingIncremental:
        # Incremental mode: per-partition feature dicts keyed by partition
        # index; negativeUserFV/positiveUserFV are rebound to those maps.
        negativeUserFV,ll1 = transformeInIncrementalDict(negativeUserFV, nseed, forceBalance, proportional, groupsToUse, incrementalVector)
        positiveUserFV,ll2 = transformeInIncrementalDict(positiveUserFV, nseed, forceBalance, proportional, groupsToUse, incrementalVector)
        ld1, ld2 = [], []
        lm1 = len(negativeUserFV)
        if lm1 != len(positiveUserFV):
            logging.error("ERROR MAP SIZES ARE NOT EQUAL!")
            print "ERROR MAP SIZES ARE NOT EQUAL!"
            exit(0)
        # Concatenate negative + positive examples per partition.
        incrementalFV = defaultdict(list)
        for i in range(lm1):
            incrementalFV[i] = negativeUserFV[i] + positiveUserFV[i]
    else:
        ld1, ll1 = transformeInDict(negativeUserFV, nseed, forceBalance, proportional, groupsToUse)
        ld2, ll2 = transformeInDict(positiveUserFV, nseed, forceBalance, proportional, groupsToUse)

    #Free memory
    del positiveUserFV
    del negativeUserFV
    logging.info("Transformed")

    # In incremental mode ld1/ld2 are empty, so listOfDicts is empty too.
    listOfDicts = ld1 + ld2
    listOfLabels = ll1 + ll2
    y = np.array( listOfLabels )
    # Baseline vector: predict the majority class for every example.
    greatestClass = 0 if len(ll1) > len(ll2) else 1
    y_greatest = np.array((len(ll1) + len(ll2)) * [greatestClass] )

    logging.info("Using %d regular users -- class %s" % (len(ll1), ll1[0]))
    logging.info("Using %d medical users -- class %s" % (len(ll2), ll2[0]))

    baselines = calculateBaselines(y, y_greatest)

    logging.info("Vectorizing dictionaries...")
    vec, X_noProcess = vectorizeData(listOfDicts)
    if X_noProcess != []:
        logging.info("Feature Names: %s", vec.get_feature_names())
    logging.info("Vectorized")

    logging.info("Preprocessing data")
    X = preprocessing(X_noProcess, preProcessingMethod)
    #print "X_noProcess ----> ", X_noProcess
    #print "X ---> ", X
    logging.info("Data preprocessed")

    if usingIncremental:
        # NOTE(review): fit_transform refits the vectorizer on every
        # partition, so each partition may get a different feature mapping --
        # confirm vec.transform was not intended here.
        incrementalFV = [preprocessing(vec.fit_transform(l).toarray(), preProcessingMethod) for k, l in incrementalFV.iteritems()]
    else:
        incrementalFV = None

    ####
    ### Shuffer samples (TODO: Cross-validation)
    ##
    #
    logging.info("Shuffling the data...")
    n_samples = len(y)
    newIndices = shuffleIndices(n_samples, nseed)
    if X != []:
        X = X[newIndices]
    y = y[newIndices]
    if usingIncremental:
        # Apply the same permutation to every partition matrix.
        incrementalFV = [ fv[newIndices] for fv in incrementalFV ]
    logging.debug("X - %s", X)
    # Shuffle samples
    logging.info("Shuffled")

    ####
    ### Run classifiers
    ##
    #
    precRecall, roc = {}, {}
    # clfrs collects (estimator, label, data, params) tuples so the parallel
    # branch can map parallelClassify over them.
    clfrs = []
    logging.info("Running classifiers...")
    if "dmfc" in listOfClassifiers:
        dmfc = DummyClassifier(strategy='most_frequent')
        clfrs.append( (dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "dsc" in listOfClassifiers:
        dsc = DummyClassifier(strategy='stratified')
        clfrs.append( (dsc, "DummyStratified", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "duc" in listOfClassifiers:
        duc = DummyClassifier(strategy='uniform')
        clfrs.append( (duc, "DummyUniform", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
        nbc = GaussianNB()
        clfrs.append( (nbc, "Naive Bayes", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
        knnc = KNeighborsClassifier(n_neighbors=classifyParameters["KNN-K"])
        clfrs.append( (knnc, "KNN", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridKNN, "measureProbas":measureProbas}) )
    # ================================================================
    if "lrc" in listOfClassifiers:
        lrc = LogisticRegression(C=classifyParameters["LR-C"])
        clfrs.append( (lrc, "Logistic Regression", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridLR, "measureProbas":measureProbas}))
    # ================================================================
    if "dtc" in listOfClassifiers:
        dtc = DecisionTreeClassifier( criterion=classifyParameters["DT-criterion"], max_features=classifyParameters["DT-max_features"] )
        clfrs.append( (dtc, "Decision Tree", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridDT, "measureProbas":measureProbas}) )
    # ================================================================
    if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
        # NOTE(review): SVMKernel is read from enclosing/global scope.
        if SVMKernel == "linear":
            svmc = LinearSVC(C=classifyParameters["SVM-C"], class_weight=classifyParameters["SVM-class_weight"])
        else:
            svmc = SVC(kernel=classifyParameters["SVM-kernel"], cache_size=classifyParameters["SVM-cacheSize"], C=classifyParameters["SVM-C"], max_iter=classifyParameters["SVM-maxIter"], probability=measureProbas, gamma=classifyParameters["SVM-gamma"], class_weight=classifyParameters["SVM-class_weight"])
        clfrs.append( (svmc, "SVM", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridSVM, "measureProbas":measureProbas}) )
    # ================================================================
    if "etc" in listOfClassifiers:
        etc = ExtraTreesClassifier(random_state=0, n_jobs=nJobs, n_estimators=classifyParameters["ETC-n_estimators"], criterion=classifyParameters["ETC-criterion"], max_features=classifyParameters["ETC-max_features"])
        # NOTE(review): tryToMeasureFeatureImportance is hard-coded True here
        # but set to measureProbas in the sequential branch below -- confirm.
        clfrs.append( (etc, "Random Forest", X, y, nCV, nJobs, baselines, {"tryToMeasureFeatureImportance":True, "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridETC, "measureProbas":measureProbas, "featuresOutFilename":(outfileName + ".pk")}) )

    results = []
    if paralled:
        # Distributed execution: each clfrs tuple is classified by a worker.
        from scoop import futures
        results = futures.map(parallelClassify,clfrs)
    else:
        # Sequential execution; incremental partitions are only handled here.
        if "dmfc" in listOfClassifiers:
            results.append(classify(dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "dsc" in listOfClassifiers:
            results.append(classify(dsc, "DummyStratified", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "duc" in listOfClassifiers:
            results.append(classify(duc, "DummyUniform", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
            results.append(classify(nbc, "Naive Bayes", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
            results.append(classify(knnc, "KNN", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridKNN, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "lrc" in listOfClassifiers:
            results.append(classify(lrc, "Logistic Regression", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridLR, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "dtc" in listOfClassifiers:
            results.append(classify(dtc, "Decision Tree", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridDT, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
            results.append(classify(svmc, "SVM", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridSVM, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "etc" in listOfClassifiers:
            results.append(classify(etc, "Random Forest", X, y, nCV, nJobs, baselines, {"tryToMeasureFeatureImportance":measureProbas, "featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridETC, "measureProbas":measureProbas}, incremental=incrementalFV))

    precRecall, roc = getCurves(results)
    roc["Random Classifier"] = ([0,1],[0,1])
    plotGraph(precRecall, fileName=PRECRECALLNAME, xlabel="Recall", ylabel="Precision", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)
    plotGraph(roc, fileName=ROCNAME, xlabel="False Positive Rate", ylabel="True Positive Rate", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)

    # Append one metrics row per classifier (per partition when incremental).
    fo = open(outfileName, "a")
    for r in results:
        label = r[0]
        resultMetrics = r[1]
        if usingIncremental:
            # One row per partition; metrics are indexed per partition.
            for i, part in zip(range(len(incrementalVector)), incrementalVector):
                fo.write("%s, Partition %d, %.3f, %.3f, %.3f, %.3f\n" % (label, part/10, 100.0*(resultMetrics.acc[i]), 100.0*resultMetrics.sf1[i], 100.0*resultMetrics.mf1[i], 100.0*resultMetrics.wf1[i]))
                print "%s, Partition %d, %.3f, %.3f, %.3f, %.3f" % (label, part/10, 100.0*(resultMetrics.acc[i]), 100.0*resultMetrics.sf1[i], 100.0*resultMetrics.mf1[i], 100.0*resultMetrics.wf1[i])
            print "Means ----- %s, %.3f, %.3f, %.3f, %.3f" % (label, 100.0*(np.mean(resultMetrics.acc)), 100.0*np.mean(resultMetrics.sf1), 100.0*np.mean(resultMetrics.mf1), 100.0*np.mean(resultMetrics.wf1))
        else:
            fo.write("%s, %.3f, %.3f, %.3f, %.3f\n" % (label, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1))
            print "%s, %.3f, %.3f, %.3f, %.3f" % (label, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1)
    fo.close()
    logging.info("Done")