Example #1
# NOTE: standard-library and scikit-learn imports used by the functions below.
# Project-specific helpers (transformeInDict, transformeInIncrementalDict, classify,
# parallelClassify, vectorizeData, preprocessing, shuffleIndices, calculateBaselines,
# getCurves, plotGraph, writeOutput, voting) and module-level settings
# (classifyParameters, gridKNN, gridLR, gridDT, gridSVM, gridETC, gridSGD, SVMKernel,
# PRECRECALLNAME, ROCNAME) are assumed to be defined elsewhere in the module.
import logging
import pickle
import sys
from collections import defaultdict

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC


def runClassify(preProcessingMethod, forceBalance, proportional, nseed, explanation, gridSearch, generatePickle, hasPlotLibs, paralled, nJobs, listOfClassifiers, outfileName, nCV, measureProbas):
   
    positiveOutputFile = "positive-%s.pk" % (explanation)
    validationPosOutputFile = "positive-validation.pk"
    negativeOutputFile = "negative-%s.pk" % (explanation)
    validationNegOutputFile = "negative-validation.pk"
    testOutputFile = "test-%s.pk" % (explanation)

    logging.info("Using seed: %d", nseed)
    logging.info("Loading: %s and %s", positiveOutputFile, negativeOutputFile)
    logging.info("Processing method used: %s", preProcessingMethod)

    if forceBalance > 0:
        logging.warning("Forcing only %s examples for each dataset",forceBalance)

    if proportional > 0:
        logging.warning("Using proportional representation. %s percente of the base.",proportional)
    
    if forceBalance > 0 and proportional > 0:
        logging.error("ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!")
        print "ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!"
        exit(0)

    ####
    ### Load Datasets
    ##
    #
    logging.info("Loading the datasets...")
    with open(negativeOutputFile, 'rb') as input:
        negativeFV = pickle.load(input)
    
    with open(validationNegOutputFile, 'rb') as input:
        validationNegFV = pickle.load(input)
    
    with open(positiveOutputFile, 'rb') as input:
        positiveFV = pickle.load(input)
    
    with open(validationPosOutputFile, 'rb') as input:
        validationPosFV = pickle.load(input)

    with open(testOutputFile, 'rb') as input:
        testFV = pickle.load(input)
    logging.info("Loaded")

    testFV = sorted(testFV.iteritems(), key=lambda k: int(k[0])) 
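    # testFV is now ordered by its numeric keys, presumably so that the per-example predictions written out later stay aligned with those ids.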

    logging.info("Transforming datasets into Dictionaries...")
    ld1, ll1 = transformeInDict(sorted(negativeFV.iteritems()), nseed, forceBalance, proportional)
    ld2, ll2 = transformeInDict(sorted(positiveFV.iteritems()), nseed, forceBalance, proportional)
    ldTest, llTest = transformeInDict(testFV, nseed, forceBalance, proportional)

    valldNeg, valllNeg = transformeInDict(sorted(validationNegFV.iteritems()), nseed, forceBalance, proportional)
    valldPos, valllPos = transformeInDict(sorted(validationPosFV.iteritems()), nseed, forceBalance, proportional)
    valY = np.array( valllNeg + valllPos)
    valDicts = valldNeg + valldPos
    
    logging.info("Transformed")
    
    listOfDicts = ld1 + ld2
    listOfLabels = ll1 + ll2
    y = np.array( listOfLabels )
    
    greatestClass = 0 if len(ll1) > len(ll2) else 1
    y_greatest =  np.array((len(ll1) + len(ll2)) * [greatestClass] )
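    # y_greatest is a constant prediction of the larger class (assuming 0/1 labels); calculateBaselines scores it against y as a reference point.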

    logging.info("Using %d positive examples -- class %s" % (len(ll1), ll1[0]))
    logging.info("Using %d negative examples -- class %s" % (len(ll2), ll2[0]))
    
    baselines = calculateBaselines(y, y_greatest)
    
    logging.info("Vectorizing dictionaries...")
    vec, X_noProcess = vectorizeData(listOfDicts) 
    if len(X_noProcess) > 0:
        logging.info("Feature Names: %s", vec.get_feature_names())
    logging.info("Vectorized")
   
    logging.info("Preprocessing data")
    X = preprocessing(X_noProcess, preProcessingMethod)
    #print "X_noProcess ----> ", X_noProcess
    #print "X ---> ", X
    logging.info("Data preprocessed")

    #Prepare Test data: 
    Xtest = vec.transform(ldTest).toarray()
    Xtest = preprocessing(Xtest, preProcessingMethod)

    valX = vec.transform(valDicts).toarray()
    valX = preprocessing(valX, preProcessingMethod)
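    # Note: the test and validation sets above are transformed with the vectorizer fitted on the training data and run through the same preprocessing method.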
    
    ####
    ### Shuffle samples  (TODO: cross-validation)
    ##
    #
    logging.info("Shuffling the data...")
    n_samples = len(y)
    newIndices = shuffleIndices(n_samples, nseed)
    X = X[newIndices]
    y = y[newIndices]

    n_samples_val = len(valY)
    newIndices = shuffleIndices(n_samples_val, nseed)
    valX = valX[newIndices]
    valY = valY[newIndices]

    logging.debug("X - %s", X)
    # Shuffle samples
    logging.info("Shuffled")
    
    ####
    ### Run classifiers
    ##
    #
    precRecall, roc = {}, {}
    results = []

    logging.info("Running classifiers...")
    
    if "dmfc" in listOfClassifiers:
        dmfc = DummyClassifier(strategy='most_frequent')
        results.append(classify(dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, Xtest))
    # ================================================================
    if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
        nbc = GaussianNB()
        results.append(classify(nbc, "Naive Bayes", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, Xtest))
    # ================================================================
    if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
        knnc = KNeighborsClassifier(n_neighbors=classifyParameters["KNN-K"])
        results.append(classify(knnc, "KNN", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridKNN, "measureProbas":measureProbas}, Xtest))
    # ================================================================
    if "lrc" in listOfClassifiers or "lgr" in listOfClassifiers or "lr" in listOfClassifiers:
        lrc = LogisticRegression(C=classifyParameters["LR-C"])
        results.append(classify(lrc, "Logistic Regression", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridLR, "measureProbas":measureProbas}, Xtest, valX, valY))
    # ================================================================
    if "dtc" in listOfClassifiers:
        dtc = DecisionTreeClassifier( criterion=classifyParameters["DT-criterion"], max_features=classifyParameters["DT-max_features"] )
        results.append(classify(dtc, "Decision Tree", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridDT, "measureProbas":measureProbas}, Xtest))
    # ================================================================
    if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
        #if SVMKernel == "linear":
        #    svmc = LinearSVC(C=classifyParameters["SVM-C"], class_weight=classifyParameters["SVM-class_weight"])
        #else:
        #    svmc = SVC(kernel=classifyParameters["SVM-kernel"], cache_size=classifyParameters["SVM-cacheSize"], C=classifyParameters["SVM-C"], max_iter=classifyParameters["SVM-maxIter"], probability=measureProbas, gamma=classifyParameters["SVM-gamma"], class_weight=classifyParameters["SVM-class_weight"])
        #results.append(classify(svmc, "SVM", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridSVM, "measureProbas":measureProbas}, Xtest))
        pass
    # ================================================================
    if "etc" in listOfClassifiers:
        etc = ExtraTreesClassifier(random_state=0, n_jobs=nJobs, n_estimators=classifyParameters["ETC-n_estimators"], criterion=classifyParameters["ETC-criterion"], max_features=classifyParameters["ETC-max_features"])
        results.append(classify(etc, "Random Forest", X, y, nCV, nJobs, baselines, {"tryToMeasureFeatureImportance":measureProbas, "featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridETC, "measureProbas":measureProbas}, Xtest, valX, valY))
    
    # ================================================================
    if "sgd" in listOfClassifiers:
        sgd = SGDClassifier(n_jobs=nJobs)
        results.append(classify(sgd, "SGD", X, y, nCV, nJobs, baselines, {"featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridSGD, "measureProbas":measureProbas}, Xtest, valX, valY))

    # ================================================================
    if "gbc" in listOfClassifiers:
        gbc = GradientBoostingClassifier(n_estimators=300,subsample=0.6,max_depth=4,random_state=nseed)
        results.append(classify(gbc, "GBC", X, y, nCV, nJobs, baselines, {"featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridSGD, "measureProbas":measureProbas}, Xtest, valX, valY))
    # ================================================================
    
    
    precRecall, roc = getCurves(results)
    roc["Random Classifier"] = ([0,1],[0,1])

    plotGraph(precRecall, fileName=PRECRECALLNAME, xlabel="Recall", ylabel="Precision", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)
    plotGraph(roc, fileName=ROCNAME, xlabel="False Positive Rate", ylabel="True Positive Rate", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)
   
    fo = open(outfileName, "a")

    listProbas = []
    for r in results:
        clfName = r[0]
        resultMetrics = r[1]
        fo.write("%s, %.3f, %.3f, %.3f, %.3f\n" % (clfName, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1))
        print "%s, %.3f, %.3f, %.3f, %.3f" % (clfName, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1)
        
        yTraining = r[4]
        yTrainingProbas = r[5]
        yTest = r[6]
        yTestProbas = r[7]
        writeOutput(clfName + ".csv", yTest)
        
        listProbas.append(yTestProbas)
        #for t,p in zip(yTest, yTestProbas):
        #    print t, p

    mergedYTest = voting(listProbas)
    writeOutput("merged.csv", mergedYTest)


    fo.close()
    logging.info("Done")
def runClassify(preProcessingMethod, forceBalance, proportional, minNumberOfQueries, nseed, explanation, healthUsers, gridSearch, generatePickle, hasPlotLibs, paralled, nJobs, listOfClassifiers, groupsToUse, usingIncremental, outfileName, nCV, measureProbas, incrementalVector):
   
    if healthUsers:
        positiveOutputFile = "healthUser-%d-%s.pk" % (minNumberOfQueries, explanation)
        negativeOutputFile = "notHealthUser-%d-%s.pk" % (minNumberOfQueries, explanation)
    else:
        negativeOutputFile = "regularUser-%d-%s.pk" % (minNumberOfQueries, explanation)
        positiveOutputFile = "medicalUser-%d-%s.pk" % (minNumberOfQueries, explanation)
    
    logging.info("Using seed: %d", nseed)
    logging.info("Loading: %s and %s", positiveOutputFile, negativeOutputFile)
    logging.info("Processing method used: %s", preProcessingMethod)

    if forceBalance > 0:
        logging.warning("Forcing only %s examples for each dataset",forceBalance)

    if proportional > 0:
        logging.warning("Using proportional representation. %s percente of the base.",proportional)
    
    if forceBalance > 0 and proportional > 0:
        logging.error("ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!")
        print "ERROR! YOU SHOULD CHOOSE OR FORCEBALANCE OR PROPORTIONAL DATA!"
        exit(0)

    ####
    ### Load Datasets
    ##
    #
    logging.info("Loading the datasets...")
    with open(negativeOutputFile, 'rb') as input:
        negativeUserFV = pickle.load(input)
    
    with open(positiveOutputFile, 'rb') as input:
        positiveUserFV = pickle.load(input)
    logging.info("Loaded")

    logging.info("Transforming datasets into Dictionaries...")
    if usingIncremental:
        negativeUserFV,ll1 = transformeInIncrementalDict(negativeUserFV, nseed, forceBalance, proportional, groupsToUse, incrementalVector)
        positiveUserFV,ll2 = transformeInIncrementalDict(positiveUserFV, nseed, forceBalance, proportional, groupsToUse, incrementalVector)
        ld1, ld2 = [], []

        lm1 = len(negativeUserFV)
        if lm1 != len(positiveUserFV):
            logging.error("ERROR MAP SIZES ARE NOT EQUAL!")
            print "ERROR MAP SIZES ARE NOT EQUAL!"
            sys.exit(1)

        incrementalFV = defaultdict(list)
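        # Each partition i holds the negative examples followed by the positive ones, matching the ll1 + ll2 label order built below.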
        for i in range(lm1):
            incrementalFV[i] = negativeUserFV[i] + positiveUserFV[i]
        
    else:
        ld1, ll1 = transformeInDict(negativeUserFV, nseed, forceBalance, proportional, groupsToUse)
        ld2, ll2 = transformeInDict(positiveUserFV, nseed, forceBalance, proportional, groupsToUse)
    #Free memory
    del positiveUserFV
    del negativeUserFV

    logging.info("Transformed")
    
    listOfDicts = ld1 + ld2
    listOfLabels = ll1 + ll2
    y = np.array( listOfLabels )
    
    greatestClass = 0 if len(ll1) > len(ll2) else 1
    y_greatest =  np.array((len(ll1) + len(ll2)) * [greatestClass] )

    logging.info("Using %d regular users -- class %s" % (len(ll1), ll1[0]))
    logging.info("Using %d medical users -- class %s" % (len(ll2), ll2[0]))
    
    baselines = calculateBaselines(y, y_greatest)
    
    logging.info("Vectorizing dictionaries...")
    vec, X_noProcess = vectorizeData(listOfDicts) 
    if len(X_noProcess) > 0:
        logging.info("Feature Names: %s", vec.get_feature_names())
    logging.info("Vectorized")
   
    logging.info("Preprocessing data")
    X = preprocessing(X_noProcess, preProcessingMethod)
    #print "X_noProcess ----> ", X_noProcess
    #print "X ---> ", X
    logging.info("Data preprocessed")

    if usingIncremental:
        incrementalFV = [preprocessing(vec.fit_transform(l).toarray(), preProcessingMethod) for k, l in incrementalFV.iteritems()]
    else:
        incrementalFV = None
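    # In incremental mode each partition is vectorized and preprocessed separately (note that fit_transform refits the vectorizer for every partition); otherwise incrementalFV stays None.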

    ####
    ### Shuffle samples  (TODO: cross-validation)
    ##
    #
    logging.info("Shuffling the data...")
    n_samples = len(y)
    newIndices = shuffleIndices(n_samples, nseed)
    if len(X) > 0:
        X = X[newIndices]
    y = y[newIndices]
    if usingIncremental:
        incrementalFV = [ fv[newIndices] for fv in incrementalFV ]

    logging.debug("X - %s", X)
    # Shuffle samples
    logging.info("Shuffled")
    
    ####
    ### Run classifiers
    ##
    #
    precRecall, roc = {}, {}
    clfrs = []

    logging.info("Running classifiers...")
    
    if "dmfc" in listOfClassifiers:
        dmfc = DummyClassifier(strategy='most_frequent')
        clfrs.append( (dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "dsc" in listOfClassifiers:
        dsc = DummyClassifier(strategy='stratified')
        clfrs.append( (dsc, "DummyStratified", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "duc" in listOfClassifiers:
        duc = DummyClassifier(strategy='uniform')
        clfrs.append( (duc, "DummyUniform", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
        nbc = GaussianNB()
        clfrs.append( (nbc, "Naive Bayes", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}) )
    # ================================================================
    if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
        knnc = KNeighborsClassifier(n_neighbors=classifyParameters["KNN-K"])
        clfrs.append( (knnc, "KNN", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridKNN, "measureProbas":measureProbas}) )
    # ================================================================
    if "lrc" in listOfClassifiers:
        lrc = LogisticRegression(C=classifyParameters["LR-C"])
        clfrs.append( (lrc, "Logistic Regression", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridLR, "measureProbas":measureProbas}))
    # ================================================================
    if "dtc" in listOfClassifiers:
        dtc = DecisionTreeClassifier( criterion=classifyParameters["DT-criterion"], max_features=classifyParameters["DT-max_features"] )
        clfrs.append( (dtc, "Decision Tree", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridDT, "measureProbas":measureProbas}) )
    # ================================================================
    if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
        if SVMKernel == "linear":
            svmc = LinearSVC(C=classifyParameters["SVM-C"], class_weight=classifyParameters["SVM-class_weight"])
        else:
            svmc = SVC(kernel=classifyParameters["SVM-kernel"], cache_size=classifyParameters["SVM-cacheSize"], C=classifyParameters["SVM-C"], max_iter=classifyParameters["SVM-maxIter"], probability=measureProbas, gamma=classifyParameters["SVM-gamma"], class_weight=classifyParameters["SVM-class_weight"])

        clfrs.append( (svmc, "SVM", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridSVM, "measureProbas":measureProbas}) )
    # ================================================================
    if "etc" in listOfClassifiers:
        etc = ExtraTreesClassifier(random_state=0, n_jobs=nJobs, n_estimators=classifyParameters["ETC-n_estimators"], criterion=classifyParameters["ETC-criterion"], max_features=classifyParameters["ETC-max_features"])
        clfrs.append( (etc, "Random Forest", X, y, nCV, nJobs, baselines, {"tryToMeasureFeatureImportance":True, "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridETC, "measureProbas":measureProbas, "featuresOutFilename":(outfileName + ".pk")}) )
    
    results = []
    if paralled:
        from scoop import futures
        results = list(futures.map(parallelClassify, clfrs))  # materialize: results is iterated more than once below
    else:
        if "dmfc" in listOfClassifiers:
            results.append(classify(dmfc, "DummyMostFrequent", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "dsc" in listOfClassifiers:
            results.append(classify(dsc, "DummyStratified", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "duc" in listOfClassifiers:
            results.append(classify(duc, "DummyUniform", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "nbc" in listOfClassifiers or "nb" in listOfClassifiers:
            results.append(classify(nbc, "Naive Bayes", X, y, nCV, nJobs, baselines, {"measureProbas":measureProbas}, incremental=incrementalFV))
        if "knnc" in listOfClassifiers or "knn" in listOfClassifiers:
            results.append(classify(knnc, "KNN", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridKNN, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "lrc" in listOfClassifiers:
            results.append(classify(lrc, "Logistic Regression", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridLR, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "dtc" in listOfClassifiers:
            results.append(classify(dtc, "Decision Tree", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridDT, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "svmc" in listOfClassifiers or "svm" in listOfClassifiers:
            results.append(classify(svmc, "SVM", X, y, nCV, nJobs, baselines, {"useGridSearch":gridSearch, "gridParameters":gridSVM, "measureProbas":measureProbas}, incremental=incrementalFV))
        if "etc" in listOfClassifiers:
            results.append(classify(etc, "Random Forest", X, y, nCV, nJobs, baselines, {"tryToMeasureFeatureImportance":measureProbas, "featuresOutFilename":(outfileName + ".pk"), "featureNames":vec.get_feature_names(), "useGridSearch":gridSearch, "gridParameters":gridETC, "measureProbas":measureProbas}, incremental=incrementalFV))

    precRecall, roc = getCurves(results)
    roc["Random Classifier"] = ([0,1],[0,1])

    plotGraph(precRecall, fileName=PRECRECALLNAME, xlabel="Recall", ylabel="Precision", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)
    plotGraph(roc, fileName=ROCNAME, xlabel="False Positive Rate", ylabel="True Positive Rate", generatePickle=generatePickle, hasPlotLibs=hasPlotLibs)
   
    fo = open(outfileName, "a")

    for r in results:
        label = r[0]
        resultMetrics = r[1]
        if usingIncremental:
            for i, part in enumerate(incrementalVector):
                fo.write("%s, Partition %d, %.3f, %.3f, %.3f, %.3f\n" % (label, part/10, 100.0*(resultMetrics.acc[i]), 100.0*resultMetrics.sf1[i], 100.0*resultMetrics.mf1[i], 100.0*resultMetrics.wf1[i]))
                print "%s, Partition %d, %.3f, %.3f, %.3f, %.3f" % (label, part/10, 100.0*(resultMetrics.acc[i]), 100.0*resultMetrics.sf1[i], 100.0*resultMetrics.mf1[i], 100.0*resultMetrics.wf1[i])
            
            print "Means ----- %s, %.3f, %.3f, %.3f, %.3f" % (label, 100.0*(np.mean(resultMetrics.acc)), 100.0*np.mean(resultMetrics.sf1), 100.0*np.mean(resultMetrics.mf1), 100.0*np.mean(resultMetrics.wf1))
        else:
            fo.write("%s, %.3f, %.3f, %.3f, %.3f\n" % (label, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1))
            print "%s, %.3f, %.3f, %.3f, %.3f" % (label, 100.0*resultMetrics.acc, 100.0*resultMetrics.sf1, 100.0*resultMetrics.mf1, 100.0*resultMetrics.wf1)

    fo.close()
    logging.info("Done")