Example #1
0
def main(met_train, met_test, aqi_train, aqi_test, test):
    """Interactive loop: let the user pick a classifier, train it on the
    met/aqi train/test splits, then print its predictions for *test* and
    its accuracy.  Choice 5 exits."""
    # Menu choice -> classifier factory; each returns (model, accuracy).
    menu = {
        1: Classifiers.Random_Forest_Classifier,
        2: Classifiers.KNN,
        3: Classifiers.SVM,
        4: Classifiers.Decision_tree,
    }
    prompt = ("\n\nchose among the following classifier\n"
              "1.Rnadom Forrest\n"
              "2.K-NN\n"
              "3.SVM\n"
              "4.Decision Tree\n"
              "5.exit\n")
    while True:
        ch = int(input(prompt))
        if ch == 5:
            break
        trainer = menu.get(ch)
        if trainer is not None:
            model, accuracy = trainer(met_train, met_test,
                                      aqi_train, aqi_test)
            print(model.predict(test))
            print(accuracy)
Example #2
0
def clf_vote(dataVolume, no_rand_rate=0):
    """Majority-vote ensemble of SVM, random-forest, decision-tree and MLP.

    Generates *dataVolume* synthetic samples (last column = label),
    optionally randomises every label divisible by *no_rand_rate*, trains
    the four base classifiers on one half of the data and returns
    (prediction_wall_clock_seconds, voting_accuracy) on the other half.
    """
    # Feature columns (see createData): weekday(2) weather(2) time(6)
    # heading(4) direction-to-base-station(6) previous-BS(6).
    data = createData.createData(dataVolume)
    x, y = data[:, :-1], data[:, -1]
    if no_rand_rate != 0:
        for idx in range(len(y)):
            if y[idx] % no_rand_rate == 0:
                y[idx] = random.randint(1, 6)
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=42)
    estimators = [
        Classifiers.svm_estimator(X_train, y_train),
        Classifiers.RandomForest_best_estimator(X_train, y_train),
        Classifiers.DecisionTree_best_estimator(X_train, y_train),
        Classifiers.MLPClassifier_estimator(X_train, y_train),
    ]
    hits = 0.0
    time_start = time.time()
    predictions = [list(est.predict(X_test)) for est in estimators]
    time_pre = time.time() - time_start
    for idx in range(len(predictions[0])):
        # One vote per base classifier; ties resolve to the lowest label
        # because the tally dict is built in 1..6 order.
        ballot = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
        for pred in predictions:
            ballot[pred[idx]] += 1
        if max(ballot, key=ballot.get) == y_test[idx]:
            hits += 1
    return time_pre, hits / len(y_test)
Example #3
0
def Cluster():
    regex = re.compile("^.*.xlsx$")
    Lobj = ListObjects.ListObjects()

    #Group based on number of non-zero columns
    if os.path.exists(path):
        for Exfile in os.listdir(path):
            if regex.match(Exfile):
                read = pd.ExcelFile(path + "/" + Exfile)
                for sheet in read.sheet_names:
                    dframe = read.parse(sheet, header=None)
                    dframe = dframe.as_matrix()
                    key = group.GetNonZeroColumn(dframe, Exfile, sheet)
                    if key != None:
                        group.ClassifyPattern(key, dframe, Lobj)

    else:
        print "Path " + path + " Doesn't exist"

    #Group Based on weight
    group.SubClassify("weight", Lobj)
    #Not worth classifying patterns on rows, as there are cases which gets
    #are affected due to internal column rotation
    #group.SubClassify("row",Lobj)
    group.SubClassify("zeroweight", Lobj)
    keys = Lobj.PatternObjectList.keys()
    #group.ExcelWrite(Lobj)
    DB = db.Write2Db("pattern", "set1")
    for key in keys:
        (column, weight, zeroweight) = key.split("_")
        matrices = Lobj.PatternObjectList[key]
        DB.Insert(key, column, weight, zeroweight, matrices)
    return
Example #4
0
def real_vs_fake_flixster():
    """Fit and report logistic regression on the Flixster real-vs-fake
    user data (file_index=4), using the full set without a split."""
    X, T = RFData.load_real_fake_data_flixster(file_index=4)
    Classifiers.log_reg(X, T)
Example #5
0
def real_vs_fake_libimseti():
    """Fit and report logistic regression on the LibimSeTi real-vs-fake
    user data (file_index=11), using the full set without a split.

    The original imported ``LibimSeTiData`` here but never used it (the
    loader lives in RFData); the dead import was removed.
    """
    X, T = RFData.load_real_fake_data_libimseti(file_index=11)
    Classifiers.log_reg(X, T)
Example #6
0
def real_vs_fake():
    """Classify real vs. fake users on the ML-1M data with logistic
    regression, then rank features (movies) by their cross-validated LR
    coefficients and print the ~10 strongest movies for each direction of
    the coefficient ranking."""
    X, T = RFData.load_real_fake_data_ML_1m(file_index=49)
    #X, T = RFData.load_real_fake_data_ML_100k()
    #print(type(Y[0]))
    # Classifiers.log_reg(X, Y)
    # 80/20 positional split (no shuffling).
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]

    Classifiers.log_reg(X_train, T_train)

    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    # 10-fold CV: collect per-fold coefficient ranks and sum raw coefficients.
    cv = StratifiedKFold(n_splits=10)
    coefs = []
    avg_coefs = np.zeros(shape=(len(X_train[1]),))

    random_state = np.random.RandomState(0)
    for train, test in cv.split(X_train, T_train):
        x, t = X_train[train], T_train[train]
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        # rank the coefs:
        ranks = ss.rankdata(model.coef_[0])
        coefs.append(ranks)
        # print(len(model.coef_[0]),len(X_train[0]))
        avg_coefs += model.coef_[0]

    # Each row becomes [average rank, feature index, summed coefficient],
    # sorted by average rank.
    coefs = np.average(coefs, axis=0)
    coefs = [[coefs[i], i, avg_coefs[i]] for i in range(len(coefs))]
    coefs = np.asarray(list(sorted(coefs)))

    values = coefs[:, 2]
    # Split point: the coefficient closest to zero separates the two ends
    # of the ranking.
    index_zero = np.where(values == np.min(np.abs(values)))
    top_male = index_zero[0][0]
    top_female = index_zero[0][-1]
    # L_* = feature indices, R_* = rank positions, C_* = |coefficient| for
    # each end.  3952 is presumably the ML-1M movie count -- TODO confirm.
    L_m = coefs[:top_male, 1]
    R_m = 3952 - coefs[:top_male, 0]
    C_m = np.abs(coefs[:top_male, 2])
    L_f = coefs[coefs.shape[0] - top_female:, 1]
    L_f = list(reversed(L_f))
    R_f = coefs[coefs.shape[0] - top_female:, 0]
    R_f = list(reversed(R_f))
    C_f = coefs[coefs.shape[0] - top_female:, 2]
    C_f = list(reversed(np.abs(C_f)))
    id_index, index_id = MD.load_movie_id_index_dict()
    movies = []
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f.readlines():
            movies.append(line.replace("\n", ""))

    # Print up to ten strongest movies for each direction.
    for index, val in enumerate(L_m[0:min(10,len(L_m))]):
        print(index, movies[id_index[int(val)+1]], C_m[index])
    for index, val in enumerate(L_f[0:min(10,len(L_f))]):
        print(index, movies[id_index[int(val)+1]], C_f[index])
 def ExtraTrees_LBGLCM(self):
     """Train an Extra-Trees classifier on LBGLCM features and record the
     fitted model, its test accuracy and its label mapping in the module
     globals."""
     global accuracies, all_classifiers, labels
     features = self.compute_LBGLCM()
     tree_count = int(self.NotreesXtra.text())
     feature_mode = self.FeaturesXtra.currentText()
     # Classifiers.Xtra returns (trained clf, x_test, y_test, label dict).
     clf, x_test, y_test, label_map = Classifiers.Xtra(
         features, tree_count, feature_mode)
     predictions = Classifiers.pred(clf, x_test)
     test_accuracy = Classifiers.display_results(predictions, y_test)
     all_classifiers.append(clf)
     accuracies.append(test_accuracy)
     labels.append(label_map)
 def RandomTrees_LBGLCM(self):
     """Train a random-forest classifier on LBGLCM features and record the
     fitted model, its test accuracy and its label mapping in the module
     globals."""
     global accuracies, all_classifiers, labels
     lbglcm_feat = self.compute_LBGLCM()
     # NOTE(review): unlike ExtraTrees_LBGLCM, the tree count is passed as
     # a raw string (no int() conversion) -- confirm RF_train accepts that.
     n_trees = self.notreesRF.text()
     max_feats = self.FeaturesRF.currentText()
     clf, x_rf2, y_rf2, dict2 = Classifiers.RF_train(
         lbglcm_feat, n_trees, max_feats
     )  #Collecting the trained classifier, x_test, y_test and labels
     Y_pred_rf2 = Classifiers.pred(clf, x_rf2)
     acc_test = Classifiers.display_results(Y_pred_rf2, y_rf2)
     all_classifiers.append(clf)
     accuracies.append(acc_test)
     labels.append(dict2)
 def GB_LBGLCM(self):
     """Train a gradient-boosting classifier on LBGLCM features and record
     the fitted model, its test accuracy and its label mapping in the
     module globals."""
     global accuracies, all_classifiers
     features = self.compute_LBGLCM()
     estimator_count = int(self.Estimators_gb.text())
     feature_mode = self.Features_gb.currentText()
     learning_rate = float(self.lineEdit_4.text())
     # Classifiers.GB returns (trained clf, x_test, y_test, label dict).
     clf, x_test, y_test, label_map = Classifiers.GB(
         features, estimator_count, feature_mode, learning_rate)
     predictions = Classifiers.pred(clf, x_test)
     test_accuracy = Classifiers.display_results(predictions, y_test)
     all_classifiers.append(clf)
     accuracies.append(test_accuracy)
     labels.append(label_map)
 def RandomTrees_GLCM(self):
     """Train a random-forest classifier on GLCM features and record the
     fitted model, its test accuracy and its label mapping in the module
     globals."""
     global accuracies, all_classifiers, labels
     glcm_feat = self.compute_GLCM()
     # NOTE(review): the tree count is passed as a raw string here (no
     # int() conversion) -- confirm RF_train accepts that.
     n_trees = self.notreesRF.text()
     max_feats = self.FeaturesRF.currentText()
     clf, x_rf1, y_rf1, dict1 = Classifiers.RF_train(
         glcm_feat, n_trees, max_feats
     )  #Collecting the trained classifier, x_test, y_test and labels
     Y_pred_rf1 = Classifiers.pred(clf,
                                   x_rf1)  #Predicting the x_test labels
     acc_test = Classifiers.display_results(Y_pred_rf1,
                                            y_rf1)  #accuracy of prediction
     all_classifiers.append(clf)
     accuracies.append(acc_test)
     labels.append(dict1)
Example #11
0
	def __init__(self,num_filters):
		"""Build the pre-trained 40 Hz 3-D CNN feature extractor and the
		random-forest classifier whose hyper-parameters came from a prior
		grid search (results quoted below)."""
		self.featExtractor = Classifiers.IMU_CNN_3D_FEATURE_EXTRACTOR(suffix="40Hz",num_filters=num_filters,patience=250,layers=3,kern_size=32,divide_kernel_size=True)
		# Restore the best checkpoint saved during the extractor's training.
		self.featExtractor.loadBestWeights()
		#Output of grid search:
		#{'bootstrap': False, 'max_depth': 142, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 60}
		#Initializing RF
		self.clf_RF = RandomForestClassifier(n_estimators=60,min_samples_split=2,min_samples_leaf=2,max_features='auto',max_depth=142,bootstrap=False)
Example #12
0
def Analyze_SubTopic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate classifier *cl* per category on the per-category topic
    score CSVs, z-score the per-document predictions within each category,
    and write all scores to *outfile*.

    grid_search reveals little variation. Linear 100 or linear 10 seem best, but not a huge effect
    Number of cases doesn't matter for MNB because most are small sample sizes (largest is 6,000).
    '''
    print 'SUBTOPIC ANALYSIS'

    #num=1000
    #metadata=ImportMeta(-1)
    path = 'Twitter/Data/'
    PREDS = {}
    # metadata values look like (label, category, ...); iterate distinct
    # categories, skipping the header-like pseudo-categories.
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        # Large categories get a fixed lighter CV config.
        if cat == 'Student' or cat == 'indUnk':
            args = {'n_iter': 20, 'test_size': .9, 'random_state': 0}
        else:
            args = CVargs.copy()
        print 'RUNNINING ', cat, ' SUBTOPIC SCORES'
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        vec = np.array([[float(l) for l in line[1:]]
                        for line in data])  #exclude cases where sex is unknown
        labels = np.array([metadata[line[0]][0]
                           for line in data])  # if 'age' not in line])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **args)
        print 'standardizing scores'
        preds = {}
        # Collapse each document's CV scores to a mean; +/-inf entries are
        # replaced with the finite extreme (documents with no finite score
        # are skipped).
        for k, score in Preds.iteritems():
            if np.inf in score:
                original = len(score)
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                try:
                    max(x)
                except:
                    continue
                x.append(max(x))
                preds[k] = np.mean(x)
            elif -1 * np.inf in score:
                original = len(score)
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                try:
                    min(x)
                except:
                    continue
                x.append(min(x))
                preds[k] = np.mean(x)
            else:
                preds[k] = np.mean(score)
        # z-score within this category before merging into the global dict.
        m = np.mean(preds.values())
        sd = np.std(preds.values())
        for k, score in preds.iteritems():
            preds[k] = (score - m) / sd
        PREDS.update(preds)
    Write_Scores(PREDS, ['id', 'subtopic_score'], outfile)
    return
 def CNN(self):
     """Train the CNN on the selected image directory and record the
     fitted model and its test accuracy in the module globals."""
     global accuracies, all_classifiers
     epoch_count = int(self.epochs.text())
     dataset_dir = self.FileLocation.text()
     split_fraction = float(self.validation_split.text())
     # Classifiers.CNN returns (accuracy, trained clf, validation datagen).
     accuracy, clf, val_datagen = Classifiers.CNN(
         dataset_dir, epoch_count, split_fraction)
     accuracies.append(accuracy[0])
     all_classifiers.append(clf)
Example #14
0
def Analyze_KBest_Scores(metadata, bidx, cl, outfile, **CVargs):
    filename = 'KBest'
    vec, ids, words = importArray(filename)

    labels = np.array([metadata[idx][0]
                       for idx in ids])  # if 'age' not in line])
    IDX = np.array(ids)
    #
    #filename='Twitter/Data/Twitter_KBest_Scores.csv'
    #data=ImportCSVFeatureData(filename,-1)
    #print 'drawing samples'
    #vec=np.array([[float(l) for l in line[1:]] for line in data])   #exclude cases where sex is unknown
    #labels=np.array([metadata[line[0]][0] for line in data])# if 'age' not in line])
    #IDX=np.array([line[0] for line in data])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)

    print 'drawing samples'
    labels = np.array([metadata[idx][0]
                       for idx in ids])  # if 'age' not in line])
    IDX = np.array(ids)

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Brown/Results/Raw_Preds.csv'
    Write_Scores(preds, ['id', 'kbest_score'], outfile)
    return
def main():
    """End-to-end anomaly-detection demo: load reference data, extract
    per-person motion features, train the classifiers, visualise per-frame
    predictions for every model, and report test metrics."""

    u_data, v_data, fg_imgs, original_imgs, abnormal_fg_imgs = load_data(
    )  #load data from ref_data

    weight = Weight_matrix().get_weight_matrix(
    )  #use normalization method for feature correction

    thisFeatureExtractor = Feature_extractor(original_imgs, fg_imgs,
                                             abnormal_fg_imgs, u_data, v_data,
                                             weight)

    # Frames 80-140 train, 140-199 test (see below).
    train_data, train_labels = thisFeatureExtractor.get_features_and_labels(
        80, 140)  #training frames

    ########################## To see the train data features distribution, uncomment next line##################################
    #uvPlot(train_data[:,0],train_data[:,1],train_labels,False)
    #############################################################################################################################

    classifiers = Classifiers(train_data, train_labels)

    test_data, test_labels = thisFeatureExtractor.get_features_and_labels(
        140, 199)  #testing frames

    for name, model in classifiers.models.items():  #get each classifier
        for ind, original_img in enumerate(
                original_imgs[:-1]):  #get each frame

            pos, thisImg, _, _ = thisFeatureExtractor.getPosition(
                fg_imgs, ind)  #get the position of each person in this frame

            features, _ = thisFeatureExtractor.get_features_and_labels(
                ind, ind + 1,
                False)  #get the features for each person in the frame

            labels = classifiers.models[name].predict(features)  #predict label

            plot(pos, labels, thisImg, name)  #show

        classifiers.prediction_metrics(
            test_data, test_labels,
            name)  #metrics for each classifier based on the test data
def quickDemo(trials=5000):
    """Benchmark SVM, nearest-neighbour and MLP on *trials* randomly
    sampled training rows, printing each classifier's elapsed time / 5.

    ``trials`` was hard-coded at 5000 and is now a parameter with the same
    default; the sampling pool follows the actual data size instead of the
    magic constant 39739.
    """
    X, y = cl.makeData(train)
    index = rn.sample(range(len(X)), trials)
    smallX = np.empty((trials, len(X[0])))
    smally = np.empty(trials, dtype='|S30')
    for count, i in enumerate(index):
        smallX[count] = X[i]
        smally[count] = y[i]
    for classifier in (cl.SVM, cl.NearestNeighbor, cl.MLP):
        start = time.time()
        classifier(smallX, smally)
        # /5 mirrors the original reporting -- presumably an average over
        # five internal runs; TODO confirm.
        print((time.time() - start) / 5)
Example #17
0
def Analyze_Nonword_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate classifier *cl* on the non-word feature CSV, z-score the
    per-document predictions and write them to *outfile*.

    check rows 1535 and 15349 for inf data. Should no longer have to recode 8 and 12 (herndanV and LnM)
    
    '''
    print 'NONWORD ANALYSIS'

    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Twitter_Nonword_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #exclude cases where sex is unknown
    #vec[:,8]=vec[:,8]*-1    #herndanV is always neg (changed in Make_Twitter_Data now)
    #vec[:,12]=vec[:,12]*-1   #LnM is always neg (changed in Make_Twitter_Data now)
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Collapse each document's CV scores to a mean; +/-inf entries are
    # replaced with the finite extreme (documents with no finite score are
    # skipped).
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    # z-score the mean predictions before writing.
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Nonwords_Preds.csv'
    Write_Scores(preds, ['id', 'nonword_score'], outfile)

    return
def dataVolume_runtime(dataVolume, no_rand_rate=2):
    """Generate *dataVolume* synthetic samples, randomise every label
    divisible by *no_rand_rate* (label noise), cross-validate five base
    classifiers plus the voting ensemble, and return their mean accuracies
    along with the ensemble's prediction time.

    Returns: (DT_acc, kNN_acc, RF_acc, SVM_acc, MLP_acc, vote_acc,
    vote_time).
    """
    # data = cd.createData(dataVolume)
    # Feature columns (see createData): weekday(2) weather(2) time(6)
    # heading(4) direction-to-base-station(6) previous-BS(6).
    data = createData.createData(
        dataVolume)
    # data = data_normalized.createData(dataVolume)
    x, y = data[:, :-1], data[:, -1]
    # NOTE(review): no_rand_rate=0 would raise ZeroDivisionError here;
    # clf_vote guards that case but this function does not.
    for i in range(len(y)):
        if y[i] % no_rand_rate == 0:
            y[i] = random.randint(1, 6)
    print('data set done!')

    #DecisionTree:
    bestTree = clf.DecisionTree_best_estimator(x, y)
    DTScore = cross_val_score(bestTree, x, y)

    #k-Neighbors:
    k_best = clf.k_neighbors_best_estimator(x, y)
    k_Neighbors_score = cross_val_score(k_best, x, y)

    #randomforest:
    rf_best = clf.RandomForest_best_estimator(x, y)
    rf_score = cross_val_score(rf_best, x, y)

    #svm:
    svm_model = clf.svm_estimator(x, y)
    svm_score = cross_val_score(svm_model, x, y)

    #MLPClassifier
    nn_best = clf.MLPClassifier_estimator(x, y)
    nn_score = cross_val_score(nn_best, x, y, cv=6)

    #vote
    vote_time, vote_acc = vote.clf_vote(dataVolume * 2,
                                        no_rand_rate=no_rand_rate)

    # return DTScore,k_Neighbors_score,rf_score,svm_score,dttime,knntime,rftime,svmtime
    return float(sum(DTScore)) / len(DTScore),float(sum(k_Neighbors_score)) / len(k_Neighbors_score),\
           float(sum(rf_score)) / len(rf_score),float(sum(svm_score)) / len(svm_score), \
           float(sum(nn_score)) / len(nn_score),vote_acc,vote_time
Example #19
0
def Analyze_Raw(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate classifier *cl* on the raw feature matrix, z-score the
    per-document predictions and write them to *outfile*.

    mnb max's out at 69/70% accurate at 3,000 (or 600 training) texts.  Does not increase in accuracy after that.
    svm: grid search showed ideal is linear kernal with C=1,10, or 100; also max's out at 74% accurate for 3,000 (goes to 76 at 10,000)
    '''
    print 'running Raw analysis'

    #metadata=ImportMeta(-1)
    filename = 'Raw'
    vec, ids, words = importArray(filename)
    print 'drawing samples'
    #vec=data[0:,1:] #grab all but zeroth column
    #labels=data[0:,0]   #grab all of zeroth column
    labels = np.array([metadata[idx][0]
                       for idx in ids])  # if 'age' not in line])
    IDX = np.array(ids)

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Collapse each document's CV scores to a mean; +/-inf entries are
    # replaced with the finite extreme (documents with no finite score are
    # skipped).
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Raw_Preds.csv'
    Write_Scores(preds, ['id', 'raw_score'], outfile)
    return
Example #20
0
def Analyze_Raw_Topic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate classifier *cl* on the raw topic score CSV, z-score the
    per-document predictions and write them to *outfile*.

    grid_search shows C>=1 is ideal. remains 71% from 500 through 7000
    remains at 71% at sample sizes from 500 through 10000.
    '''
    print 'RAW TOPIC ANALYSIS'

    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #exclude cases where sex is unknown
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Collapse each document's CV scores to a mean; +/-inf entries are
    # replaced with the finite extreme (documents with no finite score are
    # skipped).
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Raw_Topic_Preds.csv'
    Write_Scores(preds, ['id', 'rawTopic_score'], outfile)

    return
def classify(selected_classifier, directory_of_image, trained_classifiers,
             labels):
    """Predict the surface-defect class of one image with the selected
    pipeline.

    Feature pipelines (GLCM/LBGLCM x RF/ExtraTrees/GB) share an index into
    *trained_classifiers* and *labels*; the CNN pipeline loads and scales
    the raw image itself.  Returns the class name, or None for an unknown
    *selected_classifier* (matching the original fall-through behaviour).
    """
    # Pipeline name -> shared index into trained_classifiers/labels.
    # (The six original branches were identical apart from this index; the
    # local `dict` variable also shadowed the builtin and was removed.)
    feature_pipelines = {
        'GLCM+Random Forest': 0,
        "LBGLCM + Random Forest": 1,
        "GLCM + Extra Trees Classifier": 2,
        "LBGLCM + Extra Trees Classifier": 3,
        "GLCM + Gradient Boosting Classifier": 4,
        "LBGLCM + Gradient Boosting Classifier": 5,
    }
    if selected_classifier in feature_pipelines:
        idx = feature_pipelines[selected_classifier]
        feat = extract(selected_classifier, directory_of_image)
        Ans = Classifiers.pred(trained_classifiers[idx], feat)
        return labels[idx][Ans[0]]

    if selected_classifier == 'Convolutional Neural Networks':
        test_image = image.load_img(directory_of_image, target_size=(64, 64))
        test_image = image.img_to_array(test_image)
        test_image = np.expand_dims(test_image, axis=0)
        test_image /= 255.
        Ans = Classifiers.pred(trained_classifiers[6], test_image)
        # The CNN outputs per-class scores; argmax indexes this fixed list.
        cnn_classes = ('Crazing', 'Inclusion', 'Patches', 'Pitted Surface',
                       'RS', 'Scratch')
        return cnn_classes[np.argmax(Ans[0])]
Example #22
0
def Analyze_Individual(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate classifier *cl* on the individual feature file
    (excluding rows whose second field is 1.0), z-score the per-document
    predictions and write them to *outfile*.

    grid search shows C>=1 is optimal
    accuracy is unrelated to sample size (remains 84-89% throughout)
    '''
    print 'INDIVIDUAL ANALYSIS'
    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Twitter_Individual_Scores.txt'
    data = ImportFeatureData(filename, -1)
    vec = np.array([line[2:] for line in data if line[1] != 1.0
                    ])  #exclude cases where sex is never mentioned
    labels = np.array([
        metadata[line[0]][0] for line in data if line[1] != 1.0
    ])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data if line[1] != 1.0])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    # Collapse each document's CV scores to a mean; +/-inf entries are
    # replaced with the finite extreme (documents with no finite score are
    # skipped).
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Individual_Preds.csv'
    Write_Scores(preds, ['id', 'indiv_score'], outfile)

    return
Example #23
0
def worker(positives, negatives, classifiersToUse, feats, outFile, i, return_dict):
    """thread worker function: run the selected classifiers and accumulate
    each result row's metrics into the shared *return_dict*."""
    # positives, negatives, featuresToUse, whereToPrint, verbose, classifiersToUse
    results = Classifiers.runClassifiers(positives, negatives, feats, "output.txt", False, classifiersToUse)
    print("done ", i)

    # Metrics occupy r[1:8] of every result row, in this order.
    metric_keys = ("accuracy", "pos_precision", "pos_recall", "pos_f1",
                   "neg_precision", "neg_recall", "neg_f1")
    for r in results:
        for key, value in zip(metric_keys, r[1:]):
            return_dict[key] += value

    return
Example #24
0
def Analyze_LIWC(metadata, bidx, cl, outfile, **CVargs):
    """Cross-validate classifier *cl* on the LIWC score CSV, z-score the
    per-document predictions and write them to *outfile*."""
    filename = 'Twitter/Data/Twitter_LIWC_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #exclude cases where sex is unknown
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    preds = {}
    # Collapse each document's CV scores to a mean; +/-inf entries are
    # replaced with the finite extreme (documents with no finite score are
    # skipped).
    for k, score in Preds.iteritems():
        if np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                max(x)
            except:
                continue
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -1 * np.inf in score:
            original = len(score)
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            try:
                min(x)
            except:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='LIWC_Preds.csv'
    Write_Scores(preds, ['id', 'liwc_score'], outfile)
    return
    def main(self):
        """Run preprocessing (optionally scaling) and, when the configured
        model is logistic regression, train and test it on the prepared
        features; otherwise just report the unsupported model."""
        pre = Preprocessing.Preprocessing(self.features_to_consider)

        if self.normalise:
            pre.scale()  # TODO different scaling options

        # Guard clause: only logistic regression is implemented.
        if self.classification_model != 'logisticRegression':
            print("other model")
            return

        logReg_predict = Classifiers.Classifiers(pre.features_dataframe,
                                                 self.features_to_consider)
        if self.splitByDate:
            logReg_predict.splitDataset(self.trainingDateStart,
                                        self.trainingDateEnd,
                                        self.testDateStart,
                                        self.testDateEnd)
        logReg_predict.LogRegTrainModel()
        logReg_predict.LogRegTestModel()
def elim(df):
    """Drop rows of *df* whose (lat, long) pair occurs again later, keeping
    only the LAST occurrence of each duplicated coordinate pair.

    Note: the pairwise comparison is O(n^2); kept because the entries are
    arrays compared with np.array_equal.
    """
    latlong, _ = Classifiers.cut_loca_price(df)
    # duplicate[j] marks row j as superseded by an identical later row.
    # (The original built an all-True array and then inverted it; starting
    # from zeros(dtype=bool) is equivalent and direct.)
    duplicate = np.zeros(len(latlong), dtype=bool)
    for i in range(len(latlong)):
        for j in range(i):
            if np.array_equal(latlong[i], latlong[j]):
                duplicate[j] = True
    tokeep = np.where(np.invert(duplicate))
    return df.iloc[tokeep[0]]

# LOCATION=Constants.filelocations.GOOGLE_NOMINATIM_HOUSEPRICE
# df=DO.readgeofile(LOCATION)
# train,test,train_index,test_index=DO.dataseperator(df)
# print train['Latitude'][301],train['Longitude'][301]
# print train['Latitude'][577],train['Longitude'][577]
# redf=elim(train)
# print redf

    
    
Example #27
0
    def evaluate(self):
        """Evaluate the configured classifier with stratified 5-fold
        cross-validation, optionally reducing dimensionality per fold, and
        return EvaluationMetrics aggregated over all held-out folds."""
        # Evaluate using k-fold crossvalidation
        k = 5
        k_fold = StratifiedKFold(k)

        print("Performing %s-fold crossvalidation" % k)

        real_labels = []
        predicted_labels = []

        for train, test in k_fold.split(self.feature_vector,
                                        self.target_vector):
            training_set = self.feature_vector[train]
            testing_set = self.feature_vector[test]

            # Fit the reducer on the training fold only, then apply it to
            # both folds (avoids test-set leakage).
            if self.type_dimensionality_reduction:
                mapping = DimensionalityReduction.reduce_dimensionality(
                    self.feature_vector[train], self.target_vector[train],
                    self.type_dimensionality_reduction)
                training_set = mapping.transform(training_set)
                testing_set = mapping.transform(testing_set)

            classifier = Classifiers.train(self.classifier_name, training_set,
                                           self.target_vector[train],
                                           self.classifier_variables)

            real_labels.extend(self.target_vector[test])
            predicted_labels.extend(classifier.predict(testing_set))

        # Get the evaluation metrics
        resulting_metrics = EvaluationMetrics(real_labels, predicted_labels,
                                              self.labels_order)
        print("\t\tDone!!")

        return resulting_metrics
Example #28
0
def main():
    """Evaluate every persisted classifier on the '200' feature directory.

    For each matching feature set, loads the dataset, instantiates the
    Classifiers registry, runs each stored model's predict(), and dumps the
    measured test time to a JSON file next to the model.
    """
    directory = '/xxxx/features'
    # Only the feature directory literally named '200' is processed.
    feature_names = sorted(
        (name for name in os.listdir(directory) if name == '200'),
        reverse=True)
    for feature_name in feature_names:
        # K=dictKvalue[feature_name]
        K = 40
        data_dir = os.path.join(directory, feature_name)
        dataset = load_dataset('sigactcuboid', data_dir)
        classifiers = Classifiers(dataset.get_data(), dataset.feature_name,
                                  dataset.model_dir, K)

        # Run each persisted model on the test data and record its timing.
        for name in classifiers.models:
            model_directory = os.path.join(dataset.model_dir)
            modname = os.path.join(
                model_directory,
                name + '_' + 'train' + '_' + feature_name + '.pkl')
            model_data = load_model(modname)
            labels, scores, test_time = model_data.predict()

            timefile = os.path.join(
                model_directory,
                name + '_' + 'test' + '_' + feature_name + '.json')
            times2json(test_time, timefile)
Example #29
0
# Load the covtype dataset once. The original opened and fully re-read the
# file twice (once for X, once for _Y) and never closed either handle; a
# single read inside a context manager fixes both the waste and the leak.
with open('covtype.data') as data_file:
    _rows = data_file.read().splitlines()[:SIZE_DATA]

# Every comma-separated column except the last is an integer feature; the
# final column is the class label.
X = [list(map(int, row.split(',')[:-1])) for row in _rows]
_Y = [row.split(',')[-1] for row in _rows]

larg = largestClass(_Y)
# treat the largest class as positive, the rest as negative
Y = [1 if x == larg else -1 for x in _Y]

# Fixed-size training split of 5000 samples, expressed as a fraction.
xTrain, xTest, yTrain, yTest = cv.train_test_split(X,
                                                   Y,
                                                   train_size=5000 / len(X))

# In[2]:

import Classifiers as clfs  # (original imported this module twice)

clfs.KNN(xTrain, xTest, yTrain, yTest)
clfs.RandomForest(xTrain, xTest, yTrain, yTest)
clfs.BoostedDecisionTree(xTrain, xTest, yTrain, yTest)
clfs.NeuralNets(xTrain, xTest, yTrain, yTest)
#clfs.SVM(xTrain, xTest, yTrain, yTest)
clfs.linearSVC(xTrain, xTest, yTrain, yTest)
clfs.XGBoost(xTrain, xTest, yTrain, yTest)

# In[ ]:
	def parse_doc(self, doc, doc_id, connClassifier, argPosClassifier, senseClassifier, argClassifier, implicitsenseClassifier,PSarg1Classifier):
         # Build the discourse-relation list for one document (Python 2 code).
         # Pass 1 scans each sentence for explicit connectives and emits
         # 'Explicit' relation dicts; pass 2 turns adjacent sentence pairs that
         # produced no explicit relation into 'Implicit' relations.
         # Returns: list of dicts with DocID/Type/Connective/Arg1/Arg2/Sense.
         # NOTE(review): tab/space indentation in this method is inconsistent
         # and several identifiers below are invalid — see inline notes.
         store=0         
         output = []
         num_sentences = len(doc['sentences'])
         token_id = 0
         token_id_sentence=0
         # Pass 1: explicit-connective detection, sentence by sentence.
         for i in range(num_sentences):
              total=set(range(num_sentences))
              covered=set()
              uncovered=set()
              sentence1 = doc['sentences'][i]
              len_sentence1 = len(sentence1['words'])
              j=0
              while j < len_sentence1:
                  
                  # Classify whether the token span starting at j is a connective;
                  # 'skip' is how many extra tokens the candidate span consumed.
                  wordString,connLabel,skip=Classifiers.classifyConnective(sentence1,j,connClassifier)       
                  if connLabel=='N' or connLabel == 'False':
                      token_id+=skip+1
                      j+=skip+1
                      continue
                  
                  argPosLabel,senseLabel,arg1List,arg2List=Classifiers.classifyOther(sentence1,wordString,j,skip,argPosClassifier,senseClassifier,argClassifier)
                  #print doc_id
                  # A PS (previous-sentence) Arg1 is impossible for the first sentence.
                  if (argPosLabel=='PS' and i==0):
                      token_id+=skip+1
                      j+=skip+1
                      continue
                  try:
                      sentence2 = doc['sentences'][i-1]
                      len_sentence2 = len(sentence2['words'])
                      words = sentence2['words']
                  except IndexError:
                      store=i
                  covered.add(i) 
                  relation = {}
                  relation['DocID'] = doc_id
                  relation['Connective'] = {}
                  relation['Arg1'] = {}
                  relation['Arg2'] = {}

                  relation['Connective']['TokenList'] = range(token_id,token_id+skip+1)
                  relation['Type'] = 'Explicit'

                  if argPosLabel=='PS':

                      #relation['Arg1']['TokenList'] = range((token_id_sentence - len_sentence2), token_id_sentence - 1)
                      
                      #relation['Arg2']['TokenList'] = range(token_id_sentence, (token_id_sentence + len_sentence1) - 1)
                      # NOTE(review): '3_ArgExtractor' is not a valid Python
                      # identifier; this line cannot parse as written — confirm
                      # the intended module name.
                      arg1List = 3_ArgExtractor.arg(doc['sentences'][i-1]['parsetree'], wordString, PSarg1Classifier,doc['sentences'][i-1]['words'])
                      relation['Arg1']['TokenList'] = [i+token_id-j for i in arg1List]
                      #l = list(set(range(token_id-j, token_id -j + len_sentence1-1))-set([token_id]))
                      #l.sort()
                      #relation['Arg2']['TokenList'] =l
                      # NOTE(review): 'kong-finalPSArg2Extractor' is likewise not
                      # a valid identifier and 'PSarg2classifier' is undefined in
                      # this scope — verify against the original module.
                      relation['Arg2']['TokenList'] = kong-finalPSArg2Extractor.argsExtract(PSarg2classifier,doc['sentences'][i-1]['parsetree'],relation['Connective']['TokenList'])
                  elif argPosLabel=='SS':
                      covered.add(i) 
                      relation['Arg1']['TokenList']=[i+token_id-j for i in arg1List] 
                      relation['Arg2']['TokenList']=[i+token_id-j for i in arg2List]
                    

                  relation['Sense'] = [senseLabel]
                  output.append(relation)
                  token_id += skip
                  token_id+=1
                  j+=skip+1
              token_id_sentence+=len_sentence1
           
         # NOTE(review): 'total'/'covered' are rebound inside the loop above, so
         # only the last iteration's sets survive here — confirm intended.
         uncovered=list(total-covered)
         uncovered.sort()
         token_id=0	
	
	 featureSet = []
         # Pass 2: adjacent sentence pairs with no explicit relation become
         # 'Implicit' relations whose sense comes from implicitsenseClassifier.
         for i in range(num_sentences-1):
             if i in uncovered:
                 sentence1 = doc['sentences'][i]
                 len_sentence1 = len(sentence1['words'])
                 token_id += len_sentence1
                 sentence2 = doc['sentences'][i+1]
                 len_sentence2 = len(sentence2['words'])
	         relation = {}
	         relation['Type'] = 'Implicit'
                 relation['DocID'] = doc_id
                 relation['Arg1'] = {}
                 relation['Arg1']['TokenList'] = range((token_id - len_sentence1), token_id - 1)
                 relation['Arg2'] = {}
       	         relation['Arg2']['TokenList'] = range(token_id, (token_id + len_sentence2) - 1)
		 print sentence1, sentence2, doc_id
		 feature = ImplicitClassifier2.extractFeatures(sentence1, sentence2, doc_id, model)
		 featureSet.append(feature)
		 senseType = implicitsenseClassifier.classify(feature)
		 print senseType
		 senseType = unicode(senseType)
	         relation['Sense'] = [senseType]
        	 relation['Connective'] = {}
               	 relation['Connective']['TokenList'] = []
	         output.append(relation)
         # Persist the extracted implicit features for offline inspection.
	 f = pickle.dump(featureSet, open('implicitFeatureSet.p', 'wb'))
         return output
Example #31
0
im = []       # raw test images
iiList = []   # integral image for each entry of im
label = []    # 0 = non-face, 1 = face (parallel to im/iiList)
numNot = 200  # Number of non-face images to use
numYes = 200  # Number of     face images to use

# Previously trained strong (boosted) classifier.
strong = pickle.load(open("strongClassifier.pkl", 'rb'))

filesNOT = glob.glob(
    '../faces/test/non-face/*.pgm')  # List of filenames of non-faces
filesYES = glob.glob(
    '../faces/test/face/*.pgm')  # List of filenames of     faces

# Load both image sets: negatives first (label 0), then positives (label 1).
for files, limit, lbl, message in (
        (filesNOT, numNot, 0, 'Loaded not images'),
        (filesYES, numYes, 1, 'Loaded is  images')):
    for count in range(limit):
        i = cv2.imread(files[count], -1)   # Read image unchanged
        im.append(i)
        ii = Classifiers.getIntegralImage(i)
        iiList.append(ii)
        label.append(lbl)
    print(message)

label = np.array(label)  # Convert list to 1D array

print(strong.getPerformance(iiList, label))

predict = strong.predict(iiList)
        with open(os.path.join(TRAINING_SET_FOLDER, filename), 'r') as f:
            for line in f:

                line2 = line.strip().split('\t')
                if len(line2) == 2:
                    data.append(line2)
    #print(len(data))
    #data = data[:500]

    N = len(data)
    limit = (4 * N) // 5  # limite entre indices d'entraƮnement et de test

    labels_training = [line[0] for line in data[:limit]]
    X_training = [line[1] for line in data[:limit]]

    X_token_tr = c.tokenize(X_training)
    labels_bin_tr = [
        1 if label in ['Positive'] else 0 for label in labels_training
    ]
    labels_bin_tr = np.array(labels_bin_tr)

    labels_learning = [line[0] for line in data[limit:]]
    X_learning = [line[1] for line in data[limit:]]

    X_token_te = c.tokenize(X_learning)
    labels_bin_te = [
        1 if label in ['Positive'] else 0 for label in labels_learning
    ]
    labels_bin_te = np.array(labels_bin_te)

    print("Proportion de revues positives:", np.mean(labels_bin_te))