Example #1
def runBestRegressionModelKFoldwFS(dataSets=[], regModels=[], names=[]):

    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        myTrain = skb(f_regression, k=5).fit_transform(myTrain, myVal)
        splits = kf(n_splits=10, shuffle=True, random_state=42)
        infinity = float("inf")  # tracks the best (lowest) RMSE seen so far
        index = -1
        count = -1
        for reg in regModels:
            count = count + 1
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg,
                            myTrain,
                            myVal,
                            cv=splits,
                            scoring='neg_mean_squared_error')
            meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
            print(regsNames[names[count]], meanSquareRootError)
            if meanSquareRootError < infinity:
                infinity = meanSquareRootError
                index = count
                # remember the winner: name, intercept, coefficients,
                # exp(coefficients), fold scores, RMSE
                best = (regsNames[names[index]], reg.intercept_, reg.coef_,
                        np.exp(reg.coef_), cvsScores, infinity)
        print(filesReg[ds], regsNames[names[index]], infinity)
        myResults[filesReg[ds]] = dict(enumerate(best, start=1))
        print('\n')
    return myResults
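The aliases in these examples are never defined in the snippets themselves; judging by usage, `kf`, `cvs`, and `skb` are presumably `KFold`, `cross_val_score`, and `SelectKBest` from scikit-learn. A minimal, self-contained sketch of the same select-then-score pattern under that assumption, on synthetic data since `dataEncoding` is not shown:

# Minimal sketch, assuming kf = KFold, cvs = cross_val_score, skb = SelectKBest.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

X, y = make_regression(n_samples=200, n_features=20, noise=10, random_state=0)
X = SelectKBest(f_regression, k=5).fit_transform(X, y)  # keep the 5 best features
splits = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(), X, y,
                         cv=splits, scoring='neg_mean_squared_error')
print(np.sqrt(-scores.mean()))  # cross-validated RMSE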
Example #2
def runBestRegsCompKFold(dataSets=[], regModels=[], names=[]):

    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        #myTrain = skb(f_regression, k=3).fit_transform(myTrain,myVal)
        for name in myTrain.columns:
            if (not (myTrain[name].dtype == 'O')):
                myTrain[name] = pre.minmax_scale(myTrain[name].astype('float'))
        splits = kf(n_splits=10, shuffle=True, random_state=42)
        infinity = float("inf")
        index = -1
        count = -1
        for reg in regModels:
            count = count + 1
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg,
                            myTrain,
                            myVal,
                            cv=splits,
                            scoring='neg_mean_squared_error')
            meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
            print(RegsCompNames[names[count]], meanSquareRootError)
            if meanSquareRootError < infinity:
                infinity = meanSquareRootError
                index = count
                best = (RegsCompNames[names[index]], cvsScores, infinity)
        print(filesReg[ds], RegsCompNames[names[index]], infinity)
        myResults[filesReg[ds]] = dict(enumerate(best, start=1))
        print('\n')
    return myResults
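One caveat worth noting: min-max scaling the whole matrix before cross-validation lets statistics from the held-out folds leak into training. A sketch of the leak-free alternative with a scikit-learn Pipeline, again on synthetic data since `dataEncoding` is not shown:

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

X, y = make_regression(n_samples=200, n_features=10, noise=5, random_state=0)
# The scaler is refit on each training fold, so test folds stay unseen.
model = make_pipeline(MinMaxScaler(), LinearRegression())
splits = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=splits,
                         scoring='neg_mean_squared_error')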
Example #3
def runBestRegressionModelKFoldPrintFolderErrors(dataSets=[],
                                                 regModels=[],
                                                 names=[]):

    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        splits = kf(n_splits=2, shuffle=True, random_state=42)
        infinity = float("inf")
        index = -1
        count = -1
        for reg in regModels:
            count = count + 1
            xval_err = 0
            for train, test in splits.split(myTrain):
                # .ix was removed from pandas; use positional .iloc instead
                reg.fit(myTrain.iloc[train], myVal.iloc[train])
                p = reg.predict(myTrain.iloc[test])
                e = p - myVal.iloc[test]
                print(e)
                xval_err += np.dot(e, e)
            rmse_cv = np.sqrt(xval_err / len(myTrain))  # only 2 folds here
            print(rmse_cv)
            input("Press any key")
            if rmse_cv < infinity:
                infinity = rmse_cv
                index = count
                best = (regsNames[names[index]], reg.intercept_, reg.coef_,
                        np.exp(reg.coef_), infinity)
        print(filesReg[ds], regsNames[names[index]], infinity)
        myResults[filesReg[ds]] = dict(enumerate(best, start=1))
        print('\n')
    return myResults
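The manual loop above pools squared errors over all folds before taking one square root, which differs slightly from averaging per-fold RMSEs. A self-contained sketch of the pooled computation (positional `.iloc`, as in the fixed code):

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
y = pd.Series(X['a'] * 2 - X['b'] + rng.normal(size=100))

xval_err = 0.0
for train, test in KFold(n_splits=2, shuffle=True, random_state=42).split(X):
    reg = LinearRegression().fit(X.iloc[train], y.iloc[train])
    e = reg.predict(X.iloc[test]) - y.iloc[test]
    xval_err += np.dot(e, e)
print(np.sqrt(xval_err / len(X)))  # pooled cross-validated RMSE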
Example #4
def main():

    #train, validation = train_test_split(train_full, test_size = 0.2, random_state = None)
    train_full = pd.read_csv('D:/Sem1/INF552/Project/Data/Final1/train.csv',
                             converters={
                                 'Date Occurred': date_time_converter,
                                 'Time Occurred': format_time,
                                 'Crime Code': format_crime_code
                             })
    print("Completed reading csv.....")
    train_full = train_full[np.isfinite(train_full['Date Occurred'])]
    #train_full = train_full[~np.isnan(train_full['Time Occurred'])]
    train_full = train_full[~np.isnan(train_full['Crime Code'])]
    print(len(train_full))
    X_full = train_full.iloc[:, [3, 4, 8]].values
    y_full = train_full.iloc[:, 5].values

    #divide training data into 10 pairs of train and dev
    kfn = kf(n_splits=10, shuffle=False)  # random_state only applies when shuffle=True
    kfn.get_n_splits(X_full)
    i = 0
    for train_index, dev_index in kfn.split(X_full):
        #print("train:", train_index, "dev:", dev_index)
        i += 1
        print("In iteration: ")
        print(i)
        X_train, X_dev = X_full[train_index], X_full[dev_index]
        y_train, y_dev = y_full[train_index], y_full[dev_index]
        apply_models(X_train, y_train, X_dev, y_dev)
Example #5
def runRidgeRegressiontoEstAlpha(dataSets=[]):

    from sklearn.linear_model import Ridge

    print('Ridge Regression')
    print('alpha\t RMSE_train\t RMSE_10cv\n')
    alpha = np.linspace(.01, 20, 50)

    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        #for name in myTrain.columns:
        #if (not(myTrain[name].dtype=='O')):
        #myTrain[name]=pre.minmax_scale(myTrain[name].astype('float'))
        t_rmse = np.array([])
        cv_rmse = np.array([])

        for a in alpha:
            ridge = Ridge(fit_intercept=True, alpha=a)
            ridge.fit(myTrain, myVal)
            p = ridge.predict(myTrain)
            err = p - myVal
            total_error = np.dot(err, err)
            rmse_train = np.sqrt(total_error / len(p))

            splits = kf(n_splits=10, shuffle=True, random_state=42)
            xval_err = 0
            for train, test in splits.split(myTrain):
                ridge.fit(myTrain.iloc[train], myVal.iloc[train])
                p = ridge.predict(myTrain.iloc[test])
                e = p - myVal.iloc[test]
                xval_err += np.dot(e, e)
            rmse_10cv = np.sqrt(xval_err / len(myTrain))

            t_rmse = np.append(t_rmse, [rmse_train])
            cv_rmse = np.append(cv_rmse, [rmse_10cv])
            print('{:.3f}\t {:.4f}\t\t {:.4f}'.format(a, rmse_train,
                                                      rmse_10cv))
        input("Press Any Key")

    # t_rmse / cv_rmse are reset per dataset, so this plots the last one only
    plt.plot(alpha, t_rmse, label='RMSE-Train')
    plt.plot(alpha, cv_rmse, label='RMSE-CV')
    plt.legend()
    plt.ylabel('RMSE')
    plt.xlabel('Alpha')
    plt.show()
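scikit-learn ships an estimator that performs exactly this alpha sweep internally; a sketch with RidgeCV on synthetic data (the `dataEncoding` inputs are not shown):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=10, noise=5, random_state=0)
ridge = RidgeCV(alphas=np.linspace(0.01, 20, 50),
                scoring='neg_mean_squared_error',
                cv=KFold(n_splits=10, shuffle=True, random_state=42))
ridge.fit(X, y)
print(ridge.alpha_)  # alpha with the best cross-validated score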
Example #6
    def cross_validation_svm(self):
        """
        Performs cross-validation for the SVM classifier
        """
        print("Searching for the best SVM Classifier parameters...")
        C_value = [0.5, 1, 1.5]
        kernel = ['linear', 'poly', 'rbf', 'sigmoid']
        param_grid = dict(C=C_value, kernel=kernel)
        model = SVC()
        # random_state only takes effect when shuffle=True
        kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        grid = gridsearchcv(estimator=model,
                            param_grid=param_grid,
                            scoring=self.scoring,
                            cv=kfold)
        grid_result = grid.fit(self.x_data_scale, self.data_y_train)
        self.C_svm = grid_result.best_params_['C']
        self.kernel = grid_result.best_params_['kernel']
        print("...Done")
Example #7
    def cross_validation_ada_boost(self):
        """
        Performs cross-validation for the AdaBoost classifier
        """
        print("Searching for the best AdaBoost Classifier parameters...")
        n_estimators = [25, 50, 75]
        learning_rate_ada = [0.5, 1, 1.5]
        grid_param = dict(n_estimators=n_estimators,
                          learning_rate=learning_rate_ada)
        model = AdaBoostClassifier()
        kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        grid = gridsearchcv(estimator=model,
                            param_grid=grid_param,
                            scoring=self.scoring,
                            cv=kfold)
        grid_result = grid.fit(self.x_data_scale, self.data_y_train)
        self.n_estimators = grid_result.best_params_['n_estimators']
        self.learning_rate_ada = grid_result.best_params_['learning_rate']
        print("...Done")
Example #8
    def cross_validation_knn(self):
        """
        Performs cross-validation for the K Nearest Neighbour classifier
        to find the best parameters
        """
        print(
            "Searching for the best K Nearest Neighbour Classifier parameters..."
        )
        neighbors = [1, 3, 5, 7, 9]
        grid_param = dict(n_neighbors=neighbors)
        model = KNeighborsClassifier()
        kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        grid = gridsearchcv(estimator=model,
                            param_grid=grid_param,
                            scoring=self.scoring,
                            cv=kfold)
        grid_result = grid.fit(self.x_data_scale, self.data_y_train)

        self.n_neighbors = grid_result.best_params_['n_neighbors']
        print("...Done")
Example #9
    def cross_validation_random_forest(self):
        """
        Performs cross-validation for the Random Forest classifier
        """
        print(
            "Searching for the best Random Forest Classifier parameters..."
        )
        max_depth = [10, 50, 100]
        bootstrap = [True, False]
        grid_param = dict(max_depth=max_depth, bootstrap=bootstrap)
        model = RandomForestClassifier()
        kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        grid = gridsearchcv(estimator=model,
                            param_grid=grid_param,
                            scoring=self.scoring,
                            cv=kfold)
        grid_result = grid.fit(self.x_data_scale, self.data_y_train)
        self.max_depths = grid_result.best_params_['max_depth']
        self.bootstrap = grid_result.best_params_['bootstrap']
        print("...Done")
Example #10
    def cross_validation_nn(self):
        """
        Performs cross-validation for the Neural Network classifier
        """
        print(
            "Searching for the best Neural Network Classifier parameters..."
        )
        solver = ['lbfgs', 'sgd', 'adam']
        learning_rate = ['constant', 'invscaling', 'adaptive']
        param_grid = dict(solver=solver, learning_rate=learning_rate)
        model = MLPClassifier()
        kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        grid = gridsearchcv(estimator=model,
                            param_grid=param_grid,
                            scoring=self.scoring,
                            cv=kfold)
        grid_result = grid.fit(self.x_data_scale, self.data_y_train)
        self.solver = grid_result.best_params_['solver']
        self.learning_rate = grid_result.best_params_['learning_rate']
        print("...Done")
Example #11
    def cross_validation_logistic_regression(self):
        """
        Performs cross-validation for the Logistic Regression classifier
        to find the best parameters
        """
        print(
            "Searching for the best Logistic Regression Classifier parameters..."
        )
        C = [1, 10, 50]
        tol = [0.005, 0.003, 0.001]
        grid_param = dict(C=C, tol=tol)
        model = LogisticRegression(solver='newton-cg',
                                   multi_class='multinomial')
        kfold = kf(n_splits=self.num_folds, shuffle=True, random_state=self.seed)
        grid = gridsearchcv(estimator=model,
                            param_grid=grid_param,
                            scoring=self.scoring,
                            cv=kfold)
        grid_result = grid.fit(self.x_data_scale, self.data_y_train)
        self.C = grid_result.best_params_['C']
        self.tol = grid_result.best_params_['tol']
        print("...Done")
Example #12
    def __init__(self,
                 Cs=500,
                 cv=10,
                 sampler='skf',
                 solver='liblinear',
                 **kwargs):

        super().__init__()

        self.penalty = 'l1'
        self.solver = solver
        self.Cs = Cs
        self.sampler = sampler
        self.cv_folds = cv

        if self.sampler == 'skf':
            self.cv = skf(n_splits=self.cv_folds)

        elif self.sampler == 'sss':
            self.cv = sss(n_splits=self.cv_folds)

        elif self.sampler == 'kf':
            self.cv = kf(n_splits=self.cv_folds)

        elif self.sampler == 'ss':
            self.cv = ss(n_splits=self.cv_folds)

        else:
            raise ValueError(
                'Selected sampler is not valid. Please choose '
                '"skf" for stratified K-fold or "sss" for '
                'stratified shuffle split, or "kf" and "ss" for '
                'the respective non-stratified methods.')

        for k, v in kwargs.items():
            setattr(self, k, v)

        self.x = None
        self.y = None
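The if/elif chain suggests the four aliases are `StratifiedKFold`, `StratifiedShuffleSplit`, `KFold`, and `ShuffleSplit`; under that assumption, a dict dispatch is a compact alternative:

# Sketch assuming skf/sss/kf/ss map to the four scikit-learn splitters below.
from sklearn.model_selection import (KFold, ShuffleSplit, StratifiedKFold,
                                     StratifiedShuffleSplit)

SAMPLERS = {'skf': StratifiedKFold, 'sss': StratifiedShuffleSplit,
            'kf': KFold, 'ss': ShuffleSplit}

def make_cv(sampler='skf', n_splits=10):
    try:
        return SAMPLERS[sampler](n_splits=n_splits)
    except KeyError:
        raise ValueError('Selected sampler is not valid. Choose one of: '
                         + ', '.join(sorted(SAMPLERS)))

cv = make_cv('kf', n_splits=5)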
Example #13
    def kFoldValidation(self, XMap, Y, k=10, classNum=2):
        randomIndex = random.sample(range(len(Y)), len(Y))
        for clfName in XMap:
            XMap[clfName] = XMap[clfName][randomIndex]
        y = Y[randomIndex]
        cmTotal = np.zeros((classNum, classNum))
        # random_state only takes effect when shuffle=True
        index = kf(n_splits=k, shuffle=True,
                   random_state=666).split(list(range(len(Y))))
        for line in index:
            trainIndex = line[0]
            testIndex = line[1]
            trainX, testX = {}, {}
            trainy = y[trainIndex]
            testy = y[testIndex]
            for clfName in XMap:
                from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
                # LDA projects to at most n_classes - 1 dimensions
                featureProcessor = LinearDiscriminantAnalysis(
                    n_components=classNum - 1)

                trainX[clfName] = XMap[clfName][trainIndex]
                testX[clfName] = XMap[clfName][testIndex]
                featureProcessor.fit(trainX[clfName], trainy)
                trainX[clfName] = featureProcessor.transform(trainX[clfName])
                testX[clfName] = featureProcessor.transform(testX[clfName])

            import copy
            self.baseModels = copy.deepcopy(self.oriBaseModels)
            self.metaModel = copy.deepcopy(self.oriMetaModel)
            self.fit(trainX, trainy)
            print("训练集表现:")
            y_train = self.predict(trainX)
            cm = confusion_matrix(trainy, y_train)
            print(cm)
            y_pred = self.predict(testX)
            cm = confusion_matrix(testy, y_pred)
            cmTotal += np.array(cm)
            print("测试集表现:")
            print(cm)
        print(cmTotal)
        return cmTotal
Example #14
def Gaussian_Naive_Bayes(test_set):
    errors = 0
    for i in range(0, test_set['class_label'].count()):
        class_value = predict_class_gnb(test_set.iloc[i])
        if class_value != test_set.iloc[i]['class_label']:
            errors += 1

    return (test_set['class_label'].count() -
            errors) * 100 / test_set['class_label'].count()


data = pd.read_csv('bank_data',
                   header=None,
                   names=['f1', 'f2', 'f3', 'f4', 'class_label'])
fraction_list = [.01, .02, .05, .1, .625, 1]
kfold = kf(n_splits=3, shuffle=True, random_state=1)  # keyword-only args in current scikit-learn

gnb_sum_kfold = {}
for i in fraction_list:
    gnb_sum_kfold[i] = 0.0

for tr_ind, te_ind in kfold.split(data):
    for fraction in fraction_list:
        gnb_sum_acc = 0
        for ii in range(0, 5):
            train_set = data.iloc[tr_ind].sample(frac=fraction)
            test_set = data.iloc[te_ind]
            frequency = train_set['class_label'].value_counts()

            distribution_mean = train_set.groupby('class_label').mean()
            distribution_variance = train_set.groupby('class_label').var()
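The snippet computes the per-class priors, means, and variances but is cut off before `predict_class_gnb`, which `Gaussian_Naive_Bayes` calls. A hypothetical sketch of that missing function, consistent with the globals computed above:

import numpy as np

def predict_class_gnb(row):  # hypothetical reconstruction, not the original
    best_class, best_score = None, -np.inf
    for c in frequency.index:
        # log prior plus a sum of per-feature Gaussian log-likelihoods
        score = np.log(frequency[c] / frequency.sum())
        for f in ['f1', 'f2', 'f3', 'f4']:
            mu = distribution_mean.loc[c, f]
            var = distribution_variance.loc[c, f]
            score += -0.5 * np.log(2 * np.pi * var) - (row[f] - mu) ** 2 / (2 * var)
        if score > best_score:
            best_class, best_score = c, score
    return best_class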
Example #15
def testClassifier(featureCollectionName="", scale=True):
    def scala(x):  # binarize: 1 if the feature value is positive, else 0
        res = []
        for i in range(len(x)):
            temp = []
            for n in x[i]:
                v = 1 if n > 0 else 0
                temp.append(v)
            res.append(temp)
        return np.array(res)

    print("读取数据")
    collection = db[featureCollectionName]
    data = collection.find({})  #从mongo中查询这个特征以及对应的性别标签
    data = list(data)  #[:500]
    df = pd.DataFrame(data)

    dfClean = df.drop(columns=['gender', '_id', 'uid'])
    X, Y = dfClean.values, df['gender']
    if scale:
        X = scala(X)
    print("开始交叉验证")
    index = kf(n_splits=10, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for line in index:
        # clf = LogisticRegression(max_iter=50, solver='newton-cg', C=0.1)  # word-frequency based, ~0.75
        clf = LogisticRegression(max_iter=50,
                                 solver='newton-cg',
                                 C=0.1,
                                 class_weight={
                                     0: 0.5,
                                     1: 0.7
                                 })  # word-frequency based, ~0.75
        # clf = DecisionTreeClassifier()
        # clf = RandomForestClassifier()
        # clf = MLPClassifier(hidden_layer_sizes=(200,50))
        # clf = SVC(C=0.8)
        trainIndex = line[0]
        testIndex = line[1]
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        # featureProcessor = LinearDiscriminantAnalysis(n_components=500)
        # featureProcessor.fit(trainX, trainy)
        # trainX = featureProcessor.transform(trainX)
        # testX = featureProcessor.transform(testX)

        clf.fit(trainX, trainy)
        count += 1
        print(count, "训练集表现:")
        y_train = clf.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = clf.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print(count, "测试集表现:")
        print(cm)
        pickle.dump(clf, open('genderClassfier.pkl', 'wb'))  # rewritten every fold; keeps the last model
        # dataList = pickle.load(open('genderClassfier.pkl','rb'))
    print(cmTotal)
    print((cmTotal[0, 0] + cmTotal[1, 1]) / (sum(sum(cmTotal))))
    print("召回率是", cmTotal[1, 1] / (cmTotal[1, 0] + cmTotal[1, 1]), "精度是",
          cmTotal[1, 1] / (cmTotal[0, 1] + cmTotal[1, 1]))
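The recall/precision arithmetic on the summed confusion matrix can be cross-checked with sklearn.metrics; a tiny sketch on made-up labels:

import numpy as np
from sklearn.metrics import precision_score, recall_score

y_true = np.array([1, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1, 1])
print("recall:", recall_score(y_true, y_pred))        # TP / (TP + FN)
print("precision:", precision_score(y_true, y_pred))  # TP / (TP + FP)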
Example #16
def testClassifierComplexFeatures():
    featureNames = ['lastLoginTime', 'registerTime', 'homeTeams', 'fans', 'theOrg', 'follow',
                    'location', 'uid', 'onlineTime', 'userNumCame',
                    'communityRPScore', 'HPLevel', 'gender']
    featureNames = set(featureNames)
    featureNamesMap = {}
    for name in featureNames:
        featureNamesMap[name] = 1
    featureNamesMap['_id'] = 0
    print("读取数据") # , db['bigram'],db['postagBigramFreq']]
    uids = db[userOriFeatureCollection].find({}, featureNamesMap)
    dataList = []
    count = 0
    progressCount = 0
    for line in uids:
        progressCount += 1
        print("Reading user", progressCount)
        # print(set(uid.keys()) & featureNames)
        if len(set(line.keys()) & featureNames) < 7 or 'gender' not in line:
            continue
        # if line['gender'] == 'm' and len(dataList) > 0 and dataList[-1]['gender'] == 'f':
        #     pass
        if line['gender'] == 'm' and random.uniform(0, 1) > 0.05:
            continue
        count += 1
        if 'communityRPScore' in line and line['communityRPScore'] is not None and line['communityRPScore'] < 0:
            line['communityRPBad'] = 1
        if 'userName' in line:
            # was: line = len(line['userName']), which clobbered the whole record
            line['userName'] = len(line['userName'])
        if 'location' in line:
            line['location'] = 1
        if 'follow' in line:
            line['follow'] = len(line['follow'])
        if 'fans' in line:
            line['fans'] = len(line['fans'])
        if 'theOrg' in line:
            if line['theOrg'] == '小黑屋住户':  # '小黑屋住户' = banned user
                line['theOrgBlock'] = 1
            line['theOrg'] = 1
        if 'homeTeams' in line:
            line['homeTeams'] = len(line['homeTeams'])
        if 'registerTime' in line:
            line['registerTime'] = time2timestamp(line['registerTime'])-\
                                   time2timestamp('2003-1-1')
            line['registerTime'] = int(line['registerTime']/3600)
        if 'lastLoginTime' in line:
            line['lastLoginTime'] = int(time.time()) - time2timestamp(line['lastLoginTime'])
            line['lastLoginTime'] = int(line['lastLoginTime']/3600)
        # if count == 200000:
        #     break
        dataList.append(line)
    df = pd.DataFrame(dataList)
    dfClean = df[list(featureNames)].drop(columns=['gender', 'uid']).fillna(0)
    X, Y = dfClean.values, df['gender']
    print(Y)
    print("开始交叉验证")
    index = kf(n_splits=10, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for line in index:
        # clf = LogisticRegression(max_iter=1000, solver='lbfgs', C=100)#基于词频0.75
        # clf = DecisionTreeClassifier()
        # clf = RandomForestClassifier()
        clf = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=666, n_jobs=6)
        #clf = MLPClassifier(hidden_layer_sizes=(50,10), max_iter=500, learning_rate_init=0.01)
        # clf = SVC(C=0.9)
        trainIndex = line[0]
        testIndex = line[1]
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        # featureProcessor = LinearDiscriminantAnalysis(n_components=150)
        # featureProcessor.fit(trainX, trainy)
        # trainX = featureProcessor.transform(trainX)
        # testX = featureProcessor.transform(testX)

        clf.fit(trainX, trainy)
        count += 1
        print(count, "训练集表现:")
        y_train = clf.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = clf.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print(count, "测试集表现:")
        print(cm)
    print(cmTotal)
    print((cmTotal[0, 0] + cmTotal[1, 1]) / (sum(sum(cmTotal))))
Example #17
    classifier = LogisticRegression(random_state=seed, solver='newton-cg', multi_class='multinomial')
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_dev)
    
    cm = confusion_matrix(y_dev, y_pred)
    ac = accuracy_score(y_dev, y_pred)
    print("********************************************************************************")
    print(classification_report(y_dev, y_pred))
    print(cm)
    print(ac)
    print("********************************************************************************")


#train, validation = train_test_split(train_full, test_size = 0.2, random_state = None)
train_full = pd.read_csv('D:/Sem1/INF552/Project/Data/Final1/train.csv')
train_full = train_full[np.isfinite(train_full['Victim Age'])]
X_full = train_full.iloc[:, [11]].values
y_full = train_full.iloc[:, 8].values

#divide training data into 10 pairs of train and dev
kfn = kf(n_splits=10, shuffle=False)  # random_state only applies when shuffle=True
kfn.get_n_splits(X_full)



for train_index, dev_index in kfn.split(X_full):
    #print("train:", train_index, "dev:", dev_index)
    X_train, X_dev = X_full[train_index], X_full[dev_index]
    y_train, y_dev = y_full[train_index], y_full[dev_index]
    apply_models(X_train, y_train, X_dev, y_dev)
Example #18
from sklearn.naive_bayes import GaussianNB as gnb, BernoulliNB as bnb

with open('train.json') as data:
    train = json.load(data)

cuisine = []
ingredients = []
for i in train:
    cuisine.append(i["cuisine"])
    ingredients.extend(i["ingredients"])

singredients = list(set(ingredients))
traind = []
d = {singredients[i]: i for i in range(len(singredients))}
for i in train:
    row = [0] * len(singredients)
    for j in i["ingredients"]:
        row[d[j]] = 1
    traind.append(row)

k_fold = kf(n_splits=3)
ga = cvs(gnb(), traind, cuisine, cv=k_fold, n_jobs=-1)
ba = cvs(bnb(), traind, cuisine, cv=k_fold, n_jobs=-1)

f = open('2d', 'w')  # text mode; f.write below receives str
s = "Gaussian accuracy is: " + str(np.mean(ga))
print(s)
f.write(s)
s = "Bernoulli accuracy is: " + str(np.mean(ba))
print(s)
f.write(s)
f.close()
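The manual 0/1 ingredient matrix above is what `MultiLabelBinarizer` produces directly; a sketch under the same `train.json` record structure the example assumes:

from sklearn.preprocessing import MultiLabelBinarizer

train = [{"cuisine": "greek", "ingredients": ["feta", "olive oil"]},
         {"cuisine": "italian", "ingredients": ["basil", "olive oil"]}]
mlb = MultiLabelBinarizer()
traind = mlb.fit_transform(r["ingredients"] for r in train)  # one row per recipe
cuisine = [r["cuisine"] for r in train]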
Example #19
def testClassifierLSTM(featureCollectionName="", scale=True):
    def scala(x):  # binarize: 1 if the feature value is positive, else 0
        res = []
        for i in range(len(x)):
            temp = []
            for n in x[i]:
                v = 1 if n > 0 else 0
                temp.append(v)
            res.append(temp)
        return np.array(res)

    print("读取数据")
    collection = db[featureCollectionName]
    data = collection.find({})  #从mongo中查询这个特征以及对应的性别标签
    maleSpecialWords = ['武器库', 'UFC', '硬邦邦的', '龟头', '前臂', '尼玛比']
    maleSpecialWords = set(maleSpecialWords)
    femaleSpecialWords = ['小女子', '小宝贝', "美少年", '萌图', '治愈系', '防晒霜', '萌系']
    femaleSpecialWords = set(femaleSpecialWords)
    data = list(data)  #[]
    maleData, femaleData, otherData = [], [], []
    for line in data:
        gender = decideGenderBySpecialWords(line, maleSpecialWords,
                                            femaleSpecialWords)
        if gender == 'male':
            maleData.append(line)
        elif gender == 'female':
            femaleData.append(line)
        else:
            otherData.append(line)
    print("基于特殊词语判断性别,得到男性和女性个数分别是", len(maleData), len(femaleData))
    print("还剩下的用户数是", len(otherData))
    df = pd.DataFrame(otherData)
    dropFeatureNames = []
    import re

    for line in df.columns:
        if len(re.findall('[0-9]', line)) > 0:
            dropFeatureNames.append(line)

    # dfClean = df.drop(columns=['gender','_id', 'uid'])
    dfClean = df.drop(columns=dropFeatureNames + ['gender', '_id', 'uid'])
    X, Y = dfClean.values, df['gender']
    if scale:
        X = scala(X)
    Y = np.array(Y).reshape(-1, 1)
    X = X[:, :]
    # Y = list(map(lambda x: [x], Y))
    # Y = np.array(Y).reshape(-1,1)
    oneHotEncoder4Y = OneHotEncoder().fit(Y)
    print("开始交叉验证")
    index = kf(n_splits=10, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for line in index:
        num_feature = len(X[0])
        clf = deepLearning.LSTMClassifier(2,
                                          num_feature,
                                          learning_rate=1e-3,
                                          layer_num=1,
                                          hidden_size=100,
                                          timestep_size=100)
        clf.initGraph(ifDecrLR=True)
        trainIndex = line[0]
        testIndex = line[1]
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        clf.initOneHotEncoder4Y(trainy)
        batch_ys = oneHotEncoder4Y.transform(trainy).todense().astype(
            np.float32)
        batch_xs = np.array(trainX).astype(np.float32)
        for i in range(50):
            print("这是第", count, "折,第", i, "轮训练。", len(trainy))
            stepsize = 100
            for j in range(0, len(trainy), stepsize):
                batch_ys = clf.oneHotEncode(trainy[j:j + stepsize, :])
                batch_xs = np.array(trainX[j:j + stepsize, :]).astype(
                    np.float32)
                clf.fit(batch_xs, batch_ys)
            print("学习率是", clf.learning_rate, clf.global_step)
            batch_ys = clf.oneHotEncode(trainy)
            batch_xs = np.array(trainX).astype(np.float32)
            pred_train = clf.test(batch_xs, batch_ys)
            pred_train = list(
                map(lambda x: 1 if x[0] < x[1] else 0, pred_train))
            pred_train = np.array(pred_train).reshape(-1, 1)
            # print(len(trainy), len(pred_train))
            cm = confusion_matrix(trainy, pred_train)
            print("训练集混淆矩阵", cm)
            count += 1
            batch_ys = clf.oneHotEncode(testy)
            batch_xs = np.array(testX).astype(np.float32)
            print(count, "测试集表现:")
            y_pred = clf.test(batch_xs, batch_ys)
            y_pred = list(map(lambda x: 1 if x[0] < x[1] else 0, y_pred))
            y_pred = np.array(y_pred).reshape(-1, 1)
            cm = confusion_matrix(testy, y_pred)
            print("测试集混淆矩阵", cm)
            print("召回率是", cm[1, 1] / (cm[1, 0] + cm[1, 1]), "精度是",
                  cm[1, 1] / (cm[0, 1] + cm[1, 1]))
        break
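One detail in this example that trips people up: `OneHotEncoder` expects a 2-D array, which is why the labels are reshaped to a column first. A minimal check of that step:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

Y = np.array(['m', 'f', 'm']).reshape(-1, 1)  # column vector, as in the example
enc = OneHotEncoder().fit(Y)
print(enc.transform(Y).toarray())  # one column per class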
Example #20
def testClassifierComplexFeatures(featureNames=[], scale=True):
    def scala(x):  # binarize: 1 if the feature value is positive, else 0
        res = []
        for i in range(len(x)):
            temp = []
            for n in x[i]:
                v = 1 if n > 0 else 0
                temp.append(v)
            res.append(temp)
        return np.array(res)

    print("读取数据")
    dataCursorList = [db[name] for name in featureNames
                      ]  #, db['bigram'],db['postagBigramFreq']]
    uids = dataCursorList[0].find({}, {'uid': 1})
    dataList = []
    count = 0
    for uid in uids:
        uid = uid['uid']
        count += 1
        # if count == 200:
        #     break
        print("正在读取第", count, "个用户。")
        dataTemp = {}
        flag = 0
        for i in range(len(dataCursorList)):
            cursor = dataCursorList[i]
            line = cursor.find_one({'uid': uid})
            if line is None:
                flag = 1
                break
            dataTemp.update(line)
        if flag == 1:
            continue
        dataList.append(dataTemp)
    df = pd.DataFrame(dataList)
    dfClean = df.drop(columns=['gender', '_id', 'uid'])
    X, Y = dfClean.values, df['gender']
    X = scala(X)
    print("开始交叉验证")
    index = kf(n_splits=10, random_state=666).split(list(range(len(Y))))
    cmTotal = np.zeros((2, 2))
    count = 0
    for line in index:
        # clf = LogisticRegression(max_iter=1000, solver='lbfgs', C=100)  # word-frequency based, ~0.75
        # clf = DecisionTreeClassifier()
        # clf = RandomForestClassifier()
        # clf = RandomForestClassifier(n_estimators=30, max_depth=6, random_state=666)
        # clf = MLPClassifier(hidden_layer_sizes=(200,10))
        clf = SVC(C=0.9)
        trainIndex = line[0]
        testIndex = line[1]
        trainX = X[trainIndex]
        testX = X[testIndex]
        trainy = Y[trainIndex]
        testy = Y[testIndex]
        # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        # featureProcessor = LinearDiscriminantAnalysis(n_components=150)
        # featureProcessor.fit(trainX, trainy)
        # trainX = featureProcessor.transform(trainX)
        # testX = featureProcessor.transform(testX)

        clf.fit(trainX, trainy)
        count += 1
        print(count, "训练集表现:")
        y_train = clf.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = clf.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print(count, "测试集表现:")
        print(cm)
    print(cmTotal)
    print((cmTotal[0, 0] + cmTotal[1, 1]) / (sum(sum(cmTotal))))
Example #21
def indexToyExample():

    dataDict = {
        'Age': list(np.random.uniform(low=30.0, high=79.0, size=1000)),
        'Sex': list(np.random.randint(2, size=1000)),
        'SBP': list(np.random.uniform(low=90.0, high=180.0, size=1000)),
        'Smoker': list(np.random.randint(2, size=1000)),
        'CHF': list(np.random.randint(2, size=1000))
    }
    myPD = pd.DataFrame(dataDict)

    predictors = ['Age', 'Sex', 'SBP']
    target = ['CHF']

    myTrain = myPD[predictors]
    newMyTrain = myPD[predictors].copy()  # .copy() so the binned-column writes below don't hit a view
    myVal = myPD[target]

    splits = kf(n_splits=10, shuffle=True, random_state=42)
    LR = Classifiers[0].fit(myTrain, myVal)
    cvsScores = cvs(LR,
                    myTrain,
                    myVal,
                    cv=splits,
                    scoring='neg_mean_squared_error')
    LR.predict(myTrain)
    meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
    L1 = {
        1: LR.intercept_,
        2: LR.coef_,
        3: np.exp(LR.coef_),
        4: cvsScores,
        5: meanSquareRootError
    }

    LS = []
    LY = []
    for index, row in myTrain.iterrows():
        if (row['Age'] < 39):
            LS.append(1)
        elif (row['Age'] <= 49):
            LS.append(2)
        elif (row['Age'] <= 59):
            LS.append(3)
        elif (row['Age'] <= 69):
            LS.append(4)
        elif (row['Age'] <= 80):
            LS.append(5)

        if (row['SBP'] < 120):
            LY.append(1)
        elif (row['SBP'] < 129):
            LY.append(2)
        elif (row['SBP'] < 139):
            LY.append(3)
        elif (row['SBP'] < 159):
            LY.append(4)
        else:
            LY.append(5)

    newMyTrain['Age'] = LS
    newMyTrain['SBP'] = LY

    newMyPD = pd.DataFrame()

    newMyPD['Age'] = newMyTrain['Age']
    newMyPD['Sex'] = newMyTrain['Sex']
    newMyPD['SBP'] = newMyTrain['SBP']
    newMyPD['CHF'] = myVal

    Freq = {
        'Age': newMyPD.groupby(['Age', 'CHF']).size(),
        'Sex': newMyPD.groupby(['Sex', 'CHF']).size(),
        'SBP': newMyPD.groupby(['SBP', 'CHF']).size()
    }

    return myPD, myTrain, myVal, newMyPD, L1, Freq
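The elif ladders above are hand-rolled binning; `pandas.cut` expresses the same idea in one call. A sketch whose bin edges mirror the Age thresholds (up to how ties at the edges are handled):

import pandas as pd

ages = pd.Series([35, 45, 55, 65, 75])
age_bins = pd.cut(ages, bins=[0, 39, 49, 59, 69, 80], labels=[1, 2, 3, 4, 5])
print(age_bins.tolist())  # [1, 2, 3, 4, 5]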