def runClassify(preProcessingMethod, forceBalance, proportional, nseed, explanation, gridSearch,
                generatePickle, hasPlotLibs, paralled, nJobs, listOfClassifiers, outfileName,
                nCV, measureProbas):

    positiveOutputFile = "positive-%s.pk" % (explanation)
    validationPosOutputFile = "positive-validation.pk"
    negativeOutputFile = "negative-%s.pk" % (explanation)
    validationNegOutputFile = "negative-validation.pk"
    testOutputFile = "test-%s.pk" % (explanation)

    logging.info("Using seed: %d", nseed)
    logging.info("Loading: %s and %s", positiveOutputFile, negativeOutputFile)
    logging.info("Processing method used: %s", preProcessingMethod)

    if forceBalance > 0:
        logging.warning("Forcing only %s examples for each dataset", forceBalance)

    if proportional > 0:
        logging.warning("Using proportional representation. %s percent of the base.", proportional)

    if forceBalance > 0 and proportional > 0:
        logging.error("ERROR! YOU SHOULD CHOOSE EITHER FORCEBALANCE OR PROPORTIONAL DATA!")
        print "ERROR! YOU SHOULD CHOOSE EITHER FORCEBALANCE OR PROPORTIONAL DATA!"
        exit(0)

    ####
    ### Load Datasets
    ##
    #
    logging.info("Loading the datasets...")
    # 'infile' avoids shadowing the builtin 'input'
    with open(negativeOutputFile, 'rb') as infile:
        negativeFV = pickle.load(infile)
    with open(validationNegOutputFile, 'rb') as infile:
        validationNegFV = pickle.load(infile)
    with open(positiveOutputFile, 'rb') as infile:
        positiveFV = pickle.load(infile)
    with open(validationPosOutputFile, 'rb') as infile:
        validationPosFV = pickle.load(infile)
    with open(testOutputFile, 'rb') as infile:
        testFV = pickle.load(infile)
    logging.info("Loaded")

    # Sort the test set by its numeric keys so the output order is stable
    testFV = sorted(testFV.iteritems(), key=lambda k: int(k[0]))

    logging.info("Transforming datasets into Dictionaries...")
    ld1, ll1 = transformeInDict(sorted(negativeFV.iteritems()), nseed, forceBalance, proportional)
    ld2, ll2 = transformeInDict(sorted(positiveFV.iteritems()), nseed, forceBalance, proportional)
    ldTest, llTest = transformeInDict(testFV, nseed, forceBalance, proportional)
    valldNeg, valllNeg = transformeInDict(sorted(validationNegFV.iteritems()), nseed, forceBalance, proportional)
    valldPos, valllPos = transformeInDict(sorted(validationPosFV.iteritems()), nseed, forceBalance, proportional)

    valY = np.array(valllNeg + valllPos)
    valDicts = valldNeg + valldPos
    logging.info("Transformed")

    listOfDicts = ld1 + ld2
    listOfLabels = ll1 + ll2
    y = np.array(listOfLabels)
    greatestClass = 0 if len(ll1) > len(ll2) else 1
    y_greatest = np.array((len(ll1) + len(ll2)) * [greatestClass])

    # ld1/ll1 come from the negative set and ld2/ll2 from the positive one
    logging.info("Using %d negative examples -- class %s", len(ll1), ll1[0])
    logging.info("Using %d positive examples -- class %s", len(ll2), ll2[0])

    baselines = calculateBaselines(y, y_greatest)

    logging.info("Vectorizing dictionaries...")
    vec, X_noProcess = vectorizeData(listOfDicts)
    if X_noProcess != []:
        logging.info("Feature Names: %s", vec.get_feature_names())
    logging.info("Vectorized")

    logging.info("Preprocessing data")
    X = preprocessing(X_noProcess, preProcessingMethod)
    #print "X_noProcess ----> ", X_noProcess
    #print "X ---> ", X
    logging.info("Data preprocessed")

    # Prepare the test and validation data with the vocabulary fitted on training
    Xtest = vec.transform(ldTest).toarray()
    Xtest = preprocessing(Xtest, preProcessingMethod)

    valX = vec.transform(valDicts).toarray()
    valX = preprocessing(valX, preProcessingMethod)

    ####
    ### Shuffle samples (TODO: Cross-validation)
    ##
    #
    logging.info("Shuffling the data...")
    n_samples = len(y)
    newIndices = shuffleIndices(n_samples, nseed)
    X = X[newIndices]
    y = y[newIndices]

    n_samples_val = len(valY)
    newIndices = shuffleIndices(n_samples_val, nseed)
    valX = valX[newIndices]
    valY = valY[newIndices]

    logging.debug("X - %s", X)
    logging.info("Shuffled")

    ####
    ### Run classifiers
    ##
    #
    precRecall, roc = {}, {}
    results = []
    logging.info("Running classifiers...")

    if "dmfc" in listOfClassifiers:
        dmfc = DummyClassifier(strategy='most_frequent')
        results.append(classify(dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines,
                                {"measureProbas": measureProbas}, Xtest))
    # ================================================================
    if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
        nbc = GaussianNB()
        results.append(classify(nbc, "Naive Bayes", X, y, nCV, nJobs, baselines,
                                {"measureProbas": measureProbas}, Xtest))
    # ================================================================
    if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
        knnc = KNeighborsClassifier(n_neighbors=classifyParameters["KNN-K"])
        results.append(classify(knnc, "KNN", X, y, nCV, nJobs, baselines,
                                {"useGridSearch": gridSearch, "gridParameters": gridKNN,
                                 "measureProbas": measureProbas}, Xtest))
    # ================================================================
    if "lrc" in listOfClassifiers or "lgr" in listOfClassifiers or "lr" in listOfClassifiers:
        lrc = LogisticRegression(C=classifyParameters["LR-C"])
        results.append(classify(lrc, "Logistic Regression", X, y, nCV, nJobs, baselines,
                                {"useGridSearch": gridSearch, "gridParameters": gridLR,
                                 "measureProbas": measureProbas}, Xtest, valX, valY))
    # ================================================================
    if "dtc" in listOfClassifiers:
        dtc = DecisionTreeClassifier(criterion=classifyParameters["DT-criterion"],
                                     max_features=classifyParameters["DT-max_features"])
        results.append(classify(dtc, "Decision Tree", X, y, nCV, nJobs, baselines,
                                {"useGridSearch": gridSearch, "gridParameters": gridDT,
                                 "measureProbas": measureProbas}, Xtest))
    # ================================================================
    if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
        # SVM support is currently disabled in this variant; the second
        # runClassify below keeps the active SVC/LinearSVC setup.
        #if SVMKernel == "linear":
        #    svmc = LinearSVC(C=classifyParameters["SVM-C"],
        #                     class_weight=classifyParameters["SVM-class_weight"])
        #else:
        #    svmc = SVC(kernel=classifyParameters["SVM-kernel"],
        #               cache_size=classifyParameters["SVM-cacheSize"],
        #               C=classifyParameters["SVM-C"],
        #               max_iter=classifyParameters["SVM-maxIter"],
        #               probability=measureProbas,
        #               gamma=classifyParameters["SVM-gamma"],
        #               class_weight=classifyParameters["SVM-class_weight"])
        #results.append(classify(svmc, "SVM", X, y, nCV, nJobs, baselines,
        #                        {"useGridSearch": gridSearch, "gridParameters": gridSVM,
        #                         "measureProbas": measureProbas}, Xtest))
        pass
    # ================================================================
    if "etc" in listOfClassifiers:
        # Note: ExtraTreesClassifier is reported under the label "Random Forest"
        etc = ExtraTreesClassifier(random_state=0, n_jobs=nJobs,
                                   n_estimators=classifyParameters["ETC-n_estimators"],
                                   criterion=classifyParameters["ETC-criterion"],
                                   max_features=classifyParameters["ETC-max_features"])
        results.append(classify(etc, "Random Forest", X, y, nCV, nJobs, baselines,
                                {"tryToMeasureFeatureImportance": measureProbas,
                                 "featuresOutFilename": (outfileName + ".pk"),
                                 "featureNames": vec.get_feature_names(),
                                 "useGridSearch": gridSearch, "gridParameters": gridETC,
                                 "measureProbas": measureProbas}, Xtest, valX, valY))
    # ================================================================
    if "sgd" in listOfClassifiers:
        sgd = SGDClassifier(n_jobs=nJobs)
        results.append(classify(sgd, "SGD", X, y, nCV, nJobs, baselines,
                                {"featuresOutFilename": (outfileName + ".pk"),
                                 "featureNames": vec.get_feature_names(),
                                 "useGridSearch": gridSearch, "gridParameters": gridSGD,
                                 "measureProbas": measureProbas}, Xtest, valX, valY))
    # ================================================================
    if "gbc" in listOfClassifiers:
        gbc = GradientBoostingClassifier(n_estimators=300, subsample=0.6, max_depth=4,
                                         random_state=nseed)
        # Note: GBC reuses the SGD grid parameters here
        results.append(classify(gbc, "GBC", X, y, nCV, nJobs, baselines,
                                {"featuresOutFilename": (outfileName + ".pk"),
                                 "featureNames": vec.get_feature_names(),
                                 "useGridSearch": gridSearch, "gridParameters": gridSGD,
                                 "measureProbas": measureProbas}, Xtest, valX, valY))
    # ================================================================

    precRecall, roc = getCurves(results)
    roc["Random Classifier"] = ([0, 1], [0, 1])
    plotGraph(precRecall, fileName=PRECRECALLNAME, xlabel="Recall", ylabel="Precision",
              generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)
    plotGraph(roc, fileName=ROCNAME, xlabel="False Positive Rate", ylabel="True Positive Rate",
              generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)

    fo = open(outfileName, "a")
    listProbas = []
    for r in results:
        clfName = r[0]
        resultMetrics = r[1]
        fo.write("%s, %.3f, %.3f, %.3f, %.3f\n" %
                 (clfName, 100.0 * resultMetrics.acc, 100.0 * resultMetrics.sf1,
                  100.0 * resultMetrics.mf1, 100.0 * resultMetrics.wf1))
        print "%s, %.3f, %.3f, %.3f, %.3f" % \
            (clfName, 100.0 * resultMetrics.acc, 100.0 * resultMetrics.sf1,
             100.0 * resultMetrics.mf1, 100.0 * resultMetrics.wf1)

        yTraining = r[4]
        yTrainingProbas = r[5]
        yTest = r[6]
        yTestProbas = r[7]
        writeOutput(clfName + ".csv", yTest)
        listProbas.append(yTestProbas)
        #for t, p in zip(yTest, yTestProbas):
        #    print t, p

    # Merge the per-classifier test predictions into a single ensemble output
    mergedYTest = voting(listProbas)
    writeOutput("merged.csv", mergedYTest)
    fo.close()
    logging.info("Done")
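# ----------------------------------------------------------------
# The merged.csv output above relies on a voting() helper defined
# elsewhere in the project. As a rough illustration of the idea only, a
# minimal soft-voting sketch follows, under the hypothetical name
# _softVotingSketch (it is NOT the project's implementation). It assumes
# each entry of listProbas is an array of shape (n_samples, n_classes)
# holding per-class probabilities, as produced when measureProbas is on.
def _softVotingSketch(listProbas):
    # Stack to shape (n_classifiers, n_samples, n_classes)
    stacked = np.array(listProbas)
    # Average the probabilities across classifiers
    meanProbas = stacked.mean(axis=0)
    # Predicted label per sample: the class with the highest mean probability
    return meanProbas.argmax(axis=1)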
def runClassify(preProcessingMethod, forceBalance, proportional, minNumberOfQueries, nseed,
                explanation, healthUsers, gridSearch, generatePickle, hasPlotLibs, paralled,
                nJobs, listOfClassifiers, groupsToUse, usingIncremental, outfileName, nCV,
                measureProbas, incrementalVector):

    if healthUsers:
        positiveOutputFile = "healthUser-%d-%s.pk" % (minNumberOfQueries, explanation)
        negativeOutputFile = "notHealthUser-%d-%s.pk" % (minNumberOfQueries, explanation)
    else:
        negativeOutputFile = "regularUser-%d-%s.pk" % (minNumberOfQueries, explanation)
        positiveOutputFile = "medicalUser-%d-%s.pk" % (minNumberOfQueries, explanation)

    logging.info("Using seed: %d", nseed)
    logging.info("Loading: %s and %s", positiveOutputFile, negativeOutputFile)
    logging.info("Processing method used: %s", preProcessingMethod)

    if forceBalance > 0:
        logging.warning("Forcing only %s examples for each dataset", forceBalance)

    if proportional > 0:
        logging.warning("Using proportional representation. %s percent of the base.", proportional)

    if forceBalance > 0 and proportional > 0:
        logging.error("ERROR! YOU SHOULD CHOOSE EITHER FORCEBALANCE OR PROPORTIONAL DATA!")
        print "ERROR! YOU SHOULD CHOOSE EITHER FORCEBALANCE OR PROPORTIONAL DATA!"
        exit(0)

    ####
    ### Load Datasets
    ##
    #
    logging.info("Loading the datasets...")
    # 'infile' avoids shadowing the builtin 'input'
    with open(negativeOutputFile, 'rb') as infile:
        negativeUserFV = pickle.load(infile)
    with open(positiveOutputFile, 'rb') as infile:
        positiveUserFV = pickle.load(infile)
    logging.info("Loaded")

    logging.info("Transforming datasets into Dictionaries...")
    if usingIncremental:
        negativeUserFV, ll1 = transformeInIncrementalDict(negativeUserFV, nseed, forceBalance,
                                                          proportional, groupsToUse, incrementalVector)
        positiveUserFV, ll2 = transformeInIncrementalDict(positiveUserFV, nseed, forceBalance,
                                                          proportional, groupsToUse, incrementalVector)
        ld1, ld2 = [], []
        lm1 = len(negativeUserFV)
        if lm1 != len(positiveUserFV):
            logging.error("ERROR! MAP SIZES ARE NOT EQUAL!")
            print "ERROR! MAP SIZES ARE NOT EQUAL!"
            exit(0)

        # Merge the negative and positive partitions index by index
        incrementalFV = defaultdict(list)
        for i in range(lm1):
            incrementalFV[i] = negativeUserFV[i] + positiveUserFV[i]
    else:
        ld1, ll1 = transformeInDict(negativeUserFV, nseed, forceBalance, proportional, groupsToUse)
        ld2, ll2 = transformeInDict(positiveUserFV, nseed, forceBalance, proportional, groupsToUse)

    # Free memory
    del positiveUserFV
    del negativeUserFV
    logging.info("Transformed")

    listOfDicts = ld1 + ld2
    listOfLabels = ll1 + ll2
    y = np.array(listOfLabels)
    greatestClass = 0 if len(ll1) > len(ll2) else 1
    y_greatest = np.array((len(ll1) + len(ll2)) * [greatestClass])

    logging.info("Using %d regular users -- class %s", len(ll1), ll1[0])
    logging.info("Using %d medical users -- class %s", len(ll2), ll2[0])

    baselines = calculateBaselines(y, y_greatest)

    logging.info("Vectorizing dictionaries...")
    vec, X_noProcess = vectorizeData(listOfDicts)
    if X_noProcess != []:
        logging.info("Feature Names: %s", vec.get_feature_names())
    logging.info("Vectorized")

    logging.info("Preprocessing data")
    X = preprocessing(X_noProcess, preProcessingMethod)
    #print "X_noProcess ----> ", X_noProcess
    #print "X ---> ", X
    logging.info("Data preprocessed")

    if usingIncremental:
        # Note: fit_transform refits the vectorizer on each partition, so each
        # partition is vectorized with its own feature space
        incrementalFV = [preprocessing(vec.fit_transform(l).toarray(), preProcessingMethod)
                         for k, l in incrementalFV.iteritems()]
    else:
        incrementalFV = None

    ####
    ### Shuffle samples (TODO: Cross-validation)
    ##
    #
    logging.info("Shuffling the data...")
    n_samples = len(y)
    newIndices = shuffleIndices(n_samples, nseed)
    # X is empty in incremental mode, but y must stay aligned with the
    # shuffled incremental partitions
    if X != []:
        X = X[newIndices]
    y = y[newIndices]
    if usingIncremental:
        incrementalFV = [fv[newIndices] for fv in incrementalFV]
    logging.debug("X - %s", X)
    logging.info("Shuffled")

    ####
    ### Run classifiers
    ##
    #
    precRecall, roc = {}, {}
    clfrs = []
    logging.info("Running classifiers...")

    if "dmfc" in listOfClassifiers:
        dmfc = DummyClassifier(strategy='most_frequent')
        clfrs.append((dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines,
                      {"measureProbas": measureProbas}))
    # ================================================================
    if "dsc" in listOfClassifiers:
        dsc = DummyClassifier(strategy='stratified')
        clfrs.append((dsc, "DummyStratified", X, y, nCV, nJobs, baselines,
                      {"measureProbas": measureProbas}))
    # ================================================================
    if "duc" in listOfClassifiers:
        duc = DummyClassifier(strategy='uniform')
        clfrs.append((duc, "DummyUniform", X, y, nCV, nJobs, baselines,
                      {"measureProbas": measureProbas}))
    # ================================================================
    if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
        nbc = GaussianNB()
        clfrs.append((nbc, "Naive Bayes", X, y, nCV, nJobs, baselines,
                      {"measureProbas": measureProbas}))
    # ================================================================
    if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
        knnc = KNeighborsClassifier(n_neighbors=classifyParameters["KNN-K"])
        clfrs.append((knnc, "KNN", X, y, nCV, nJobs, baselines,
                      {"useGridSearch": gridSearch, "gridParameters": gridKNN,
                       "measureProbas": measureProbas}))
    # ================================================================
    if "lrc" in listOfClassifiers:
        lrc = LogisticRegression(C=classifyParameters["LR-C"])
        clfrs.append((lrc, "Logistic Regression", X, y, nCV, nJobs, baselines,
                      {"useGridSearch": gridSearch, "gridParameters": gridLR,
                       "measureProbas": measureProbas}))
    # ================================================================
    if "dtc" in listOfClassifiers:
        dtc = DecisionTreeClassifier(criterion=classifyParameters["DT-criterion"],
                                     max_features=classifyParameters["DT-max_features"])
        clfrs.append((dtc, "Decision Tree", X, y, nCV, nJobs, baselines,
                      {"useGridSearch": gridSearch, "gridParameters": gridDT,
                       "measureProbas": measureProbas}))
    # ================================================================
    if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
        if SVMKernel == "linear":
            svmc = LinearSVC(C=classifyParameters["SVM-C"],
                             class_weight=classifyParameters["SVM-class_weight"])
        else:
            svmc = SVC(kernel=classifyParameters["SVM-kernel"],
                       cache_size=classifyParameters["SVM-cacheSize"],
                       C=classifyParameters["SVM-C"],
                       max_iter=classifyParameters["SVM-maxIter"],
                       probability=measureProbas,
                       gamma=classifyParameters["SVM-gamma"],
                       class_weight=classifyParameters["SVM-class_weight"])
        clfrs.append((svmc, "SVM", X, y, nCV, nJobs, baselines,
                      {"useGridSearch": gridSearch, "gridParameters": gridSVM,
                       "measureProbas": measureProbas}))
    # ================================================================
    if "etc" in listOfClassifiers:
        # Note: ExtraTreesClassifier is reported under the label "Random Forest"
        etc = ExtraTreesClassifier(random_state=0, n_jobs=nJobs,
                                   n_estimators=classifyParameters["ETC-n_estimators"],
                                   criterion=classifyParameters["ETC-criterion"],
                                   max_features=classifyParameters["ETC-max_features"])
        clfrs.append((etc, "Random Forest", X, y, nCV, nJobs, baselines,
                      {"tryToMeasureFeatureImportance": True,
                       "featureNames": vec.get_feature_names(),
                       "useGridSearch": gridSearch, "gridParameters": gridETC,
                       "measureProbas": measureProbas,
                       "featuresOutFilename": (outfileName + ".pk")}))

    results = []
    if paralled:
        from scoop import futures
        # Materialize the lazy map so results can be iterated below
        results = list(futures.map(parallelClassify, clfrs))
    else:
        if "dmfc" in listOfClassifiers:
            results.append(classify(dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines,
                                    {"measureProbas": measureProbas}, incremental=incrementalFV))
        if "dsc" in listOfClassifiers:
            results.append(classify(dsc, "DummyStratified", X, y, nCV, nJobs, baselines,
                                    {"measureProbas": measureProbas}, incremental=incrementalFV))
        if "duc" in listOfClassifiers:
            results.append(classify(duc, "DummyUniform", X, y, nCV, nJobs, baselines,
                                    {"measureProbas": measureProbas}, incremental=incrementalFV))
        if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
            results.append(classify(nbc, "Naive Bayes", X, y, nCV, nJobs, baselines,
                                    {"measureProbas": measureProbas}, incremental=incrementalFV))
        if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
            results.append(classify(knnc, "KNN", X, y, nCV, nJobs, baselines,
                                    {"useGridSearch": gridSearch, "gridParameters": gridKNN,
                                     "measureProbas": measureProbas}, incremental=incrementalFV))
        if "lrc" in listOfClassifiers:
            results.append(classify(lrc, "Logistic Regression", X, y, nCV, nJobs, baselines,
                                    {"useGridSearch": gridSearch, "gridParameters": gridLR,
                                     "measureProbas": measureProbas}, incremental=incrementalFV))
        if "dtc" in listOfClassifiers:
            results.append(classify(dtc, "Decision Tree", X, y, nCV, nJobs, baselines,
                                    {"useGridSearch": gridSearch, "gridParameters": gridDT,
                                     "measureProbas": measureProbas}, incremental=incrementalFV))
        if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
            results.append(classify(svmc, "SVM", X, y, nCV, nJobs, baselines,
                                    {"useGridSearch": gridSearch, "gridParameters": gridSVM,
                                     "measureProbas": measureProbas}, incremental=incrementalFV))
        if "etc" in listOfClassifiers:
            results.append(classify(etc, "Random Forest", X, y, nCV, nJobs, baselines,
                                    {"tryToMeasureFeatureImportance": measureProbas,
                                     "featuresOutFilename": (outfileName + ".pk"),
                                     "featureNames": vec.get_feature_names(),
                                     "useGridSearch": gridSearch, "gridParameters": gridETC,
"measureProbas":measureProbas}, incremental=incrementalFV)) precRecall, roc = getCurves(results) roc["Random Classifier"] = ([0,1],[0,1]) plotGraph(precRecall, fileName=PRECRECALLNAME, xlabel="Recall", ylabel="Precision", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs) plotGraph(roc, fileName=ROCNAME, xlabel="False Positive Rate", ylabel="True Positive Rate", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs) fo = open(outfileName, "a") for r in results: label = r[0] resultMetrics = r[1] if usingIncremental: for i, part in zip(range(len(incrementalVector)), incrementalVector): fo.write("%s, Partition %d, %.3f, %.3f, %.3f, %.3f\n" % (label, part/10, 100.0*(resultMetrics.acc[i]), 100.0*resultMetrics.sf1[i], 100.0*resultMetrics.mf1[i], 100.0*resultMetrics.wf1[i])) print "%s, Partition %d, %.3f, %.3f, %.3f, %.3f" % (label, part/10, 100.0*(resultMetrics.acc[i]), 100.0*resultMetrics.sf1[i], 100.0*resultMetrics.mf1[i], 100.0*resultMetrics.wf1[i]) print "Means ----- %s, %.3f, %.3f, %.3f, %.3f" % (label, 100.0*(np.mean(resultMetrics.acc)), 100.0*np.mean(resultMetrics.sf1), 100.0*np.mean(resultMetrics.mf1), 100.0*np.mean(resultMetrics.wf1)) else: fo.write("%s, %.3f, %.3f, %.3f, %.3f\n" % (label, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1)) print "%s, %.3f, %.3f, %.3f, %.3f" % (label, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1) fo.close() logging.info("Done")