def __init__(self, modelType):
    """
    :param modelType: 'logIT', 'logAT', 'ordRidge', 'lad', 'multiclasslogistic'
    """
    if modelType.lower() == 'logit':
        print("Using Logistic Immediate-Threshold variant")
        self.model = mord.LogisticIT(alpha=1.0, verbose=0, max_iter=1000)
    elif modelType.lower() == 'logat':
        print("Using Logistic All-Threshold variant")
        self.model = mord.LogisticAT(alpha=1.0, verbose=0, max_iter=1000)
    elif modelType.lower() == 'ordridge':
        print("Using Ordinal Ridge variant")
        # Earlier grid-search results, kept for reference:
        # Best Score: -0.4885966615485714
        # Best Param: {'alpha': 0.02, 'solver': 'sag', 'max_iter': 100000,
        #              'fit_intercept': True, 'copy_X': True, 'tol': 0.01, 'normalize': True}
        # Best Score: -0.48869761710226156
        # Best Param: {'alpha': 5e-05, 'fit_intercept': True, 'max_iter': 50000,
        #              'copy_X': True, 'normalize': False, 'solver': 'cholesky', 'tol': 5e-05}
        # self.model = mord.OrdinalRidge(alpha=0.00005, fit_intercept=True, normalize=False,
        #                                copy_X=True, max_iter=50000, tol=0.00005,
        #                                solver='cholesky')
        self.model = mord.OrdinalRidge(alpha=0.0001, fit_intercept=True, normalize=False,
                                       copy_X=True, max_iter=3000000, tol=0.0001,
                                       solver='auto')
    elif modelType.lower() == 'lad':
        print("Using Least Absolute Deviation")
        self.model = mord.LAD(epsilon=0.0, tol=0.0001, C=1.0, loss='l1',
                              fit_intercept=True, intercept_scaling=1.0, dual=True,
                              verbose=0, random_state=None, max_iter=1000)
    elif modelType.lower() == 'multiclasslogistic':
        print("Using Multiclass Logistic")
        self.model = mord.MulticlassLogistic(alpha=1.0, verbose=0, maxiter=1000)
    else:
        print("Model selection not recognised.\nDefaulted to Logistic All-Threshold variant")
        # LogisticAT, matching the All-Threshold default announced above
        self.model = mord.LogisticAT(alpha=1.0, verbose=1, max_iter=1000)
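# Note: the "Best Score" / "Best Param" comments above read like scikit-learn
# grid-search output. A minimal, hypothetical sketch of how such a search might
# have been run (X and y, the feature matrix and ordinal labels, are assumed to
# exist; the grid values echo the parameters recorded in the comments):
from sklearn.model_selection import GridSearchCV
import mord

param_grid = {
    'alpha': [5e-05, 0.0001, 0.02, 1.0],
    'solver': ['auto', 'cholesky', 'sag'],
}
search = GridSearchCV(mord.OrdinalRidge(), param_grid,
                      scoring='neg_mean_absolute_error', cv=5)
search.fit(X, y)
print("Best Score:", search.best_score_)
print("Best Param:", search.best_params_)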
def order_logit_regression():
    data = read_csv(CSV_PATH)
    bunch = Bunch(data=data.iloc[:, 1:-1], target=data.iloc[:, -1])
    d = bunch.data
    train_len = int(0.75 * d.shape[0])
    # .iloc replaces the deprecated .ix indexer; slicing is end-exclusive
    trainX, trainY = d.iloc[:train_len, :], bunch.target[:train_len]
    testX, testY = d.iloc[train_len:, :], bunch.target[train_len:]

    clf1 = mord.LogisticAT(alpha=0.5)
    clf1.fit(trainX, trainY)
    pred = clf1.predict(testX)
    draw_acc_matrix(testY, pred, train_len)
    print('Accuracy of LogisticAT: %s' % metrics.accuracy_score(testY, pred))
    print('Mean absolute error of LogisticAT: %s' %
          metrics.mean_absolute_error(pred, testY))

    clf2 = mord.LogisticIT(alpha=0.5)
    clf2.fit(trainX, trainY)
    pred2 = clf2.predict(testX)
    draw_acc_matrix(testY, pred2, train_len)
    print('Accuracy of LogisticIT: %s' % metrics.accuracy_score(testY, pred2))
    print('Mean absolute error of LogisticIT: %s' %
          metrics.mean_absolute_error(pred2, testY))

    clf3 = mord.LogisticSE(alpha=0.5)
    clf3.fit(trainX, trainY)
    pred3 = clf3.predict(testX)
    draw_acc_matrix(testY, pred3, train_len)
    print('Accuracy of LogisticSE: %s' % metrics.accuracy_score(testY, pred3))
    print('Mean absolute error of LogisticSE: %s' %
          metrics.mean_absolute_error(pred3, testY))
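# draw_acc_matrix is not defined in this excerpt. A plausible, hypothetical
# sketch, assuming it plots the confusion matrix for the held-out rows:
import matplotlib.pyplot as plt
from sklearn import metrics

def draw_acc_matrix(testY, pred, train_len):
    # heatmap of the confusion matrix on the test slice (rows train_len onward)
    cm = metrics.confusion_matrix(testY, pred)
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title('Confusion matrix (test rows %d..end)' % train_len)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.colorbar()
    plt.show()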
def build_and_evaluate_sklearn(sampleTexts, y):
    # convert labels to an array so they can be used with LogisticIT
    import numpy as np
    y = np.asarray(y)

    '''Build vector of token counts'''
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(sampleTexts)  # list of texts, each text is a string
    X = X_counts

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # use ordinal regression
    import mord as m
    clf = m.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
def main():
    data = pd.read_csv('./final_data.csv')
    X_train, X_test, y_train, y_test = split.split(data)
    clf = mord.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('accuracy_score:%.6f' % accuracy_score(y_test, y_pred))
    print('precision_score:%.6f' % precision_score(y_test, y_pred, average='macro'))
    print('recall_score:%.6f' % recall_score(y_test, y_pred, average='macro'))
    target_names = ['VeryGood', 'Good', 'Fair', 'Bad', 'VeryBad']
    print('classification_report:',
          classification_report(y_test, y_pred, target_names=target_names))
def fit_logistic_it_with_crossvalidation(X, y):
    """An ordinal model of dataset with hyperparameter cross-validation.
    Immediate-Threshold (logistic/threshold) variant.

    Parameters & returns as per other training functions.
    """
    basemod = mord.LogisticIT()
    cv = 5
    param_grid = {'alpha': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 3.0]}
    return fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid,
                                               verbose=False)
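# fit_classifier_with_crossvalidation is not shown in this excerpt. A minimal
# sketch of what such a helper might look like, assuming it wraps
# sklearn.model_selection.GridSearchCV and returns the refit best estimator:
from sklearn.model_selection import GridSearchCV

def fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid, verbose=False):
    # exhaustive search over param_grid with cv-fold cross-validation
    grid = GridSearchCV(basemod, param_grid, cv=cv)
    grid.fit(X, y)
    if verbose:
        print('Best params:', grid.best_params_, 'best score:', grid.best_score_)
    return grid.best_estimator_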
def testModel(sampleTexts, y):
    y = np.asarray(y)
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(sampleTexts)
    X = X_counts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)
    clf = m.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
def build_and_evaluate_sklearn(sampleTexts, y):
    '''Build vector of token counts'''
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(sampleTexts)  # list of texts, each text is a string
    X = X_counts
    print(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # clf = LogisticRegression()
    clf = m.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
def test_predict_proba_nonnegative():
    """
    Test that predict_proba() outputs only non-negative values.
    """
    def check_for_negative_prob(proba):
        for p in np.ravel(proba):
            assert_greater_equal(np.round(p, 7), 0)

    clf = mord.LogisticAT(alpha=0.)
    clf.fit(X, y)
    check_for_negative_prob(clf.predict_proba(X))

    clf2 = mord.LogisticIT(alpha=0.)
    clf2.fit(X, y)
    check_for_negative_prob(clf2.predict_proba(X))

    clf3 = mord.LogisticSE(alpha=0.)
    clf3.fit(X, y)
    check_for_negative_prob(clf3.predict_proba(X))
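# The test relies on module-level X and y fixtures that are not shown. A
# plausible, hypothetical setup with a small synthetic ordinal dataset:
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 5)               # 100 samples, 5 features
y = rng.randint(1, 6, size=100)     # ordinal labels 1..5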
def build_and_evaluate_sklearn(sampleTexts, y):
    '''Build vector of token counts'''
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(sampleTexts)  # list of texts, each text is a string
    X = X_counts

    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    X = X_tfidf

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    from sklearn.naive_bayes import MultinomialNB
    import mord as m
    clf = m.LogisticIT()
    # clf = MultinomialNB()

    ########### IMPORTANT ######################
    # Support Vector Machine
    # see http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
    # from sklearn.svm import LinearSVC
    # kNN algorithm
    # see http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
    # from sklearn.neighbors import KNeighborsClassifier
    # Logistic Regression
    # see http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    # from sklearn.linear_model import LogisticRegression

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
def fit_logistic_it_with_crossvalidation(X, y, alpha=1.0):
    """An ordinal model of dataset with hyperparameter cross-validation.
    Immediate-Threshold (logistic/threshold) variant.

    Parameters & returns as per other training functions.

    alpha: float
        Regularization parameter. Zero is no regularization; higher values
        increase the squared l2 regularization.
    """
    basemod = mord.LogisticIT(alpha=alpha)
    cv = 5
    param_grid = {'alpha': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 3.0]}
    return fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid,
                                               verbose=False)
def train_ordinal_logistic(train_features, train_labels, skip_grid_search,
                           evaluation, num_jobs, loss, alpha, cost,
                           ordinal_algorithm):
    """
    Returns the trained ordinal logistic model.
    loss, alpha and cost are ignored if grid search is requested.
    alpha: used only for se, it, at, and ridge and if grid search is not requested
    cost: used only for lad and if grid search is not requested
    loss: used only for lad and if grid search is not requested
    """
    # Grid search requested: find the parameters that achieve the highest
    # average score.
    if not skip_grid_search:
        penalty_weights = 'dummy'
        clf = grid_search.grid_search(evaluation, train_features, train_labels,
                                      penalty_weights, ordinal_algorithm, num_jobs)
        params = clf.best_params_
        if 'loss' in params:
            loss = params['loss']
        if 'alpha' in params:
            alpha = params['alpha']
        if 'cost' in params:
            cost = params['cost']

    # Now perform the training on the full train data.
    if ordinal_algorithm == 'logisticse':
        model = mord.LogisticSE(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticit':
        model = mord.LogisticIT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticat':
        model = mord.LogisticAT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'ordinalridge':
        model = mord.OrdinalRidge(alpha=alpha)
    elif ordinal_algorithm == 'lad':
        model = mord.LAD(C=cost, loss=loss, max_iter=10000)
    else:
        raise ValueError('Unknown ordinal_algorithm: %s' % ordinal_algorithm)
    model = model.fit(train_features, train_labels)
    return model
def __init__(self, wrangl, nsub, num_labels=None, classifier=None):
    self.wrangl = wrangl
    self.n_splits = wrangl.n_splits
    self.t = wrangl.t
    self.num_labels = None  # ensure the attribute exists before the checks below
    if wrangl.num_labels:
        self.num_labels = wrangl.num_labels
    if num_labels:
        self.num_labels = num_labels
    if self.num_labels is None:
        raise Exception('Must provide number of num_labels to Classification')
    self.nsub = nsub

    if classifier:
        self.classifier = classifier
    else:
        self.classifier = mord.LogisticIT()
    self.scaler = StandardScaler()

    self.acc = np.zeros((self.nsub, np.size(self.t), self.n_splits)) * np.nan
    self.acc_shuff = np.zeros((self.nsub, np.size(self.t), self.n_splits)) * np.nan
    self.conf_mat = np.zeros((self.nsub, np.size(self.t), self.n_splits,
                              self.num_labels, self.num_labels)) * np.nan
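# Hypothetical usage sketch: any scikit-learn-style classifier can be passed
# in, and the decoder defaults to mord.LogisticIT(). The wrangl stub below only
# mimics the attributes this constructor reads (n_splits, t, num_labels):
from types import SimpleNamespace
import numpy as np
import mord

wrangl = SimpleNamespace(n_splits=5, t=np.arange(100), num_labels=6)
clf = Classification(wrangl, nsub=20)  # uses mord.LogisticIT() by default
clf_at = Classification(wrangl, nsub=20, classifier=mord.LogisticAT(alpha=0.5))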
# (earlier lines, not shown, are assumed to map 'Low' to 1 and encode 'Infl'
# the same way)
features.loc[features.Cont == 'Medium', 'Cont'] = 2
features.loc[features.Cont == 'High', 'Cont'] = 3

le = preprocessing.LabelEncoder()
le.fit(features.loc[:, 'Type'])
features.loc[:, 'type_encoded'] = le.transform(features.loc[:, 'Type'])

X, y = features.loc[:, ('Infl', 'Cont', 'type_encoded')], data.target

clf1 = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf1.fit(X, y)
print('Mean Absolute Error of LogisticRegression: %s' %
      metrics.mean_absolute_error(clf1.predict(X), y))

clf2 = mord.LogisticAT(alpha=1.)
clf2.fit(X, y)
print('Mean Absolute Error of LogisticAT %s' %
      metrics.mean_absolute_error(clf2.predict(X), y))

clf3 = mord.LogisticIT(alpha=1.)
clf3.fit(X, y)
print('Mean Absolute Error of LogisticIT %s' %
      metrics.mean_absolute_error(clf3.predict(X), y))

clf4 = mord.LogisticSE(alpha=1.)
clf4.fit(X, y)
print('Mean Absolute Error of LogisticSE %s' %
      metrics.mean_absolute_error(clf4.predict(X), y))
# (inside a loop over corpus documents, not shown)
rawText = getTemcorpus(path2, 'latin-1')
if rawXML == False or rawText == False:
    fails += 1
    continue
categories.append(int(getRankXML(rawXML, path1)))
text.append(getLemmas(rawText))

print("Total errors:", fails)

countOBJ = CountVectorizer()   # magic object :v
tfidfOBJ = TfidfTransformer()  # magic object :v
x_count = countOBJ.fit_transform(text)
x_tdidf = tfidfOBJ.fit_transform(x_count)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_tdidf, np.array(categories),
                                                    test_size=0.2)

mordObj = m.LogisticIT()
# fit on the training split only, so the test split stays unseen
mordObj.fit(X_train, y_train)
y_pred = mordObj.predict(X_test)

from sklearn import metrics
from sklearn.metrics import confusion_matrix
print('Accuracy of prediction is', mordObj.score(X_test, y_test))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    # print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    targetNames = ['yes', 'no']
    report = metrics.classification_report(y_test, y_pred,
                                           target_names=targetNames,
                                           output_dict=True)
    result = {
        targetNames[0]: report[targetNames[0]],
        targetNames[1]: report[targetNames[1]],
    }
    return result


if __name__ == '__main__':
    models = [mord.LogisticIT()]
    cleaningLevels = [2]
    cleaningDescription = [
        "Tokens originales", "Tokens con letras", "Tokens con letras sin stopwords"
    ]
    for model in models:
        for cl in cleaningLevels:
            for lemmanized in [True]:
                # list of texts, each text is a string (an SMS)
                sampleTexts, y = readMessages(cl, lemmanized)
                # print(len(sampleTexts), "messages in corpus")
                # print(y.count(0), " spam messages in corpus")
                # print(y.count(1), " ham messages in corpus")
# TF-IDF
vectorizer = TfidfVectorizer(norm='l2', smooth_idf=True, use_idf=True)
vec = vectorizer.fit_transform(cleanReviews)
X = np.round(vec.todense(), 2)
Y = np.array(ranks)
print(len(X))
print(len(Y))

n = 3600
trainingX = np.array(X[:n])
trainingY = np.array(Y[:n])
testX = np.array(X[n:])
testY = np.array(Y[n:])

c = mord.LogisticIT()
c.fit(trainingX, trainingY)
Ypredict = c.predict(testX)
print(len(Ypredict))
print(len(testY))

print("")
print("")
print("-------------- COMPARISON --------------")
for i in range(0, len(Ypredict)):
    if i % 100 == 0:
        print("Prediction:", Ypredict[i], " Real:", testY[i])
print("")
print("------------ CONFUSION MATRIX -----------")
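# The excerpt stops right after printing the banner; a minimal completion,
# assuming sklearn's confusion_matrix and the testY / Ypredict arrays above:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(testY, Ypredict))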
# y <- labels of the texts
# X <- list of features
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(sampleTexts)
# input(type(X_counts))
X = X_counts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

import mord as m
clf = m.LogisticIT()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

archivo = open('resultados.txt', "w")
from sklearn import metrics
print("Prediction accuracy: ", clf.score(X_test, y_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred))
print("Classification report: \n", metrics.classification_report(y_test, y_pred))
archivo.write("Prediction accuracy: \n")
archivo.write(str(clf.score(X_test, y_test)))
archivo.write("\n\nConfusion matrix: \n")
archivo.write(str(metrics.confusion_matrix(y_test, y_pred)))
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    targetNames = ['1', '2', '3', '4', '5']
    report = metrics.classification_report(y_test, y_pred,
                                           target_names=targetNames,
                                           output_dict=True)
    # keep the per-class entries for all five labels
    result = {name: report[name] for name in targetNames}
    return result


if __name__ == '__main__':
    models = [mord.LogisticIT(alpha=1.0)]
    cleaningLevels = [0]
    cleaningDescription = [
        "Tokens originales", "Tokens con letras", "Tokens con letras sin stopwords"
    ]
    for model in models:
        for cl in cleaningLevels:
            for lemmanized in [True]:
                # list of texts, each text is a string (an SMS)
                sampleTexts, y = readMessages(cl, lemmanized)
                print(len(sampleTexts), "messages in corpus")
                print(y.count(1), " 1 messages in corpus")
                print(y.count(2), " 2 messages in corpus")
def regressionData(readinData, target_name, output):
    readinData = readinData.iloc[:, 1:]
    convert = 24 * 60 * 60 * 365 * 1000000000  # nanoseconds per year
    # readinData['Fst_ACCT_OPEN_DT'] = datetime.datetime(readinData['Fst_ACCT_OPEN_DT'].astype(str).split('-'))
    readinData['Fst_ACCT_OPEN_DT'] = (datetime.datetime(2017, 12, 31) -
                                      readinData['Fst_ACCT_OPEN_DT'])
    readinData['Fst_ACCT_OPEN_DT'] = readinData['Fst_ACCT_OPEN_DT'].astype(np.int64)
    readinData['Fst_ACCT_OPEN_DT'] = pd.to_numeric(readinData['Fst_ACCT_OPEN_DT'] / convert)

    varNames = readinData.columns
    target = list(varNames).index(target_name)
    target_data = readinData.iloc[:, target]
    readinData.pop(target_name)

    print("Ready for the model")
    train_data_X, test_data_X, train_data_Y, test_data_Y = train_test_split(
        readinData, target_data, test_size=0.3, random_state=0)
    train_data_X = train_data_X.astype(np.float64)
    test_data_X = test_data_X.astype(np.float64)
    train_data_Y = train_data_Y.astype(np.int64)
    test_data_Y = test_data_Y.astype(np.int64)
    print(train_data_Y.dtypes)
    print(test_data_Y.dtypes)
    print(train_data_X.dtypes)
    print(test_data_X.dtypes)

    LR = m.LogisticIT().fit(train_data_X, train_data_Y)
    predict_data_Y = LR.predict(test_data_X)
    print("finish predicting")
    # predict_data_Y_prob = LR.predict_proba(test_data_X)
    overall_acc = metrics.accuracy_score(test_data_Y, predict_data_Y)
    print(overall_acc)
    # cm = confusion_matrix(test_data_Y, predict_data_Y)
    # result_table = classification_report(test_data_Y, predict_data_Y)

    # rename columns to their Chinese display names for the output file
    readinData = readinData.rename(columns={
        'Gender_Cd': '性别',
        'Age': '年龄',
        'Fst_ACCT_OPEN_DT': '开户时长',
        'clu73': '活期存款业务活跃度',
        'Is_PP_Cust': '是否开通手机贴膜卡业务',
        'Is_EP_Cust': '是否开通第三方支付业务',
        'clu19': '是否持有信用卡',
        'clu20': '是否持有借记卡',
        'clu21': '是否持有存折',
        'clu212': '是否持有存单',
        'clu213': '是否持有定期一本通',
        'Is_INSU_Cust': '是否社保客户',
        'clu214': '是否持有活期一本通',
        'AUM_0_5': 'AUM资产在0至5万之间客户数',
        'Is_DFDK_CARD_Cust': '代发客户是否持有卡',
        'clu37': '持有定期产品数量',
        'clu38': '持有大额存单数量',
        'clu39': '理财产品数量',
        'clu40': '基金产品数量',
        'clu41': '贵金属产品数量',
        'Is_DFDK_CZ_Cust': '代发客户是否持有存折',
        'clu42': '信托产品数量',
        'clu43': '代销储蓄国债产品数量',
        'clu44': '代理保险产品数量',
        'clu45': '银证第三方存管产品数量',
        'clu46': '个人消费贷款产品数量',
        'clu47': '个人经营贷款产品数量',
        'clu471': '个人委托贷款产品数量',
        'clu48': '信用卡数量',
        'clu72': '定期存款业务活跃度',
        'clu74': '贷款业务活跃度',
        'clu75': '理财业务活跃度',
        'Is_NW_Cust': '是否开通网上银行业务',
        'Is_PB_Cust': '是否开通手机银行业务',
        'Is_WE_Cust': '是否开通微信银行业务',
        'Is_DFDK_Cust': '是否代发客户',
        'CB_CT_TX_NUM': '核心客户柜面使用频率',
        'CB_PB_TX_NUM': '核心客户手机银行使用频率',
        'CB_PP_TX_NUM': '核心客户手机贴膜卡使用频率',
        'CB_NW_TX_NUM': '核心客户网上银行使用频率',
        'CB_WE_TX_NUM': '核心客户微信银行使用频率(非动帐)',
        'CB_ATM_TX_NUM': '核心客户ATM使用频率',
        'CB_EP_TX_NUM': '核心客户第三方支付平台使用频率',
        'CB_POS_TX_NUM': '核心客户POS/TPOS使用频率',
        'indicator_new': '是否过路资金账户'
    })

    new_var_name = readinData.columns
    new_coef = LR.coef_
    scores, pvalues = chi2(train_data_X, train_data_Y)
    print("Start writing")
    resultdf = pd.DataFrame(columns=["Coef", "Variable", "pvalue"])
    for i in range(len(new_var_name)):
        temp = pd.DataFrame([[new_coef[i], new_var_name[i], pvalues[i]]],
                            columns=["Coef", "Variable", "pvalue"])
        resultdf = pd.concat([resultdf, temp], ignore_index=True)
    resultdf = pd.concat([
        resultdf,
        pd.DataFrame([[target_name, overall_acc, 1]],
                     columns=["Coef", "Variable", "pvalue"])
    ])
    # data_v2 = pd.DataFrame(columns=["prob", "result"])
    # for i in range(len(predict_data_Y_prob)):
    #     temp = pd.DataFrame([[predict_data_Y_prob[i], test_data_Y[i]]],
    #                         columns=["prob", "result"])
    #     data_v2 = pd.concat([data_v2, temp], ignore_index=True)
    resultdf.to_csv(output, encoding='utf_8_sig', index=False)
    print("Done")
def doAll(trainFileName, testFileName):
    trainSet = makeListEntries(trainFileName)
    testSet = makeListEntries(testFileName)

    """**************************************"""
    # data
    listTrainText = makeListText(trainSet)
    listTestText = makeListText(testSet)
    # target
    listTrainStars = makeListStars(trainSet)
    listTestStars = makeListStars(testSet)

    """*************************************"""
    # could do CountVectorizer
    cv = CountVectorizer(stop_words='english')
    trainCVMatr = cv.fit_transform(listTrainText)
    testCVMatr = cv.transform(listTestText)

    # could do TfidfVectorizer
    # tv = TfidfVectorizer(stop_words='english')
    # trainTVMatr = tv.fit_transform(listTrainText)
    # testTVMatr = tv.transform(listTestText)

    """*************************************"""
    # using CountVectorizer
    LR_CV_model = LogisticRegression(multi_class='multinomial', max_iter=1000,
                                     class_weight='balanced')
    LR_CV_model.fit(trainCVMatr, listTrainStars)
    # get it to predict
    LR_CV_prediction = LR_CV_model.predict(testCVMatr)
    # get accuracy score
    LR_CV_score = metrics.accuracy_score(listTestStars, LR_CV_prediction)
    LR_CV_f1 = metrics.f1_score(listTestStars, LR_CV_prediction, average='micro')
    LR_CV_r2 = metrics.r2_score(listTestStars, LR_CV_prediction,
                                multioutput='variance_weighted')
    LR_my = betterScoring(listTestStars, LR_CV_prediction)

    # this is the bit with the tfidf vectorizer
    # LR_TV_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
    # LR_TV_model.fit(trainTVMatr, listTrainStars)
    # LR_TV_prediction = LR_TV_model.predict(testTVMatr)
    # LR_TV_score = metrics.accuracy_score(listTestStars, LR_TV_prediction)

    # what do the data say?
    # print("Multiclass, logistic regression, CountVectorizer: " + str(LR_CV_score))
    # print("Multiclass, logistic regression, TfidfVectorizer: " + str(LR_TV_score))

    """*************************************"""
    # using CountVectorizer
    NB_CV_model = MultinomialNB()
    NB_CV_model.fit(trainCVMatr, listTrainStars)
    # get it to predict
    NB_CV_prediction = NB_CV_model.predict(testCVMatr)
    # get accuracy score
    NB_CV_score = metrics.accuracy_score(listTestStars, NB_CV_prediction)
    NB_CV_f1 = metrics.f1_score(listTestStars, NB_CV_prediction, average='micro')
    NB_CV_r2 = metrics.r2_score(listTestStars, NB_CV_prediction,
                                multioutput='variance_weighted')
    NB_my = betterScoring(listTestStars, NB_CV_prediction)

    # this is the bit with the tfidf vectorizer
    # NB_TV_model = MultinomialNB()
    # NB_TV_model.fit(trainTVMatr, listTrainStars)
    # NB_TV_prediction = NB_TV_model.predict(testTVMatr)
    # NB_TV_score = metrics.accuracy_score(listTestStars, NB_TV_prediction)

    # what do the data say?
    # print("Naive Bayes, CountVectorizer: " + str(NB_CV_score))
    # print("Naive Bayes, TfidfVectorizer: " + str(NB_TV_score))

    """*************************************"""
    sid = SentimentIntensityAnalyzer()
    listOfRes = []
    data2 = [json.loads(line) for line in open(testFileName, 'r')]
    for entry in data2:
        listOfRes.append(sid.polarity_scores(entry['review_body'])['compound'])

    # map compound scores onto 1..5 stars using the quantile cut-offs q0..q5
    scaledRes = []
    size = len(listOfRes)
    for i in range(size):
        num = listOfRes[i]
        score = -1
        if num >= q0 and num < q1:
            score = 1
        elif num >= q1 and num < q2:
            score = 2
        elif num >= q2 and num < q3:
            score = 3
        elif num >= q3 and num < q4:
            score = 4
        elif num >= q4 and num <= q5:
            score = 5
        # add score back in
        scaledRes.append(score)

    vader_acc = metrics.accuracy_score(listTestStars, scaledRes)
    vader_f1 = metrics.f1_score(listTestStars, scaledRes, average='micro')
    vader_r2 = metrics.r2_score(listTestStars, scaledRes,
                                multioutput='variance_weighted')
    vader_my = betterScoring(listTestStars, scaledRes)

    """*************************************"""
    # dealing with the ordinal regression
    ord_model = OrdinalClassifier(DecisionTreeClassifier())
    ord_model.fit(trainCVMatr, listTrainStars)
    ord_model_prediction = ord_model.predict(testCVMatr)
    size = len(listTestStars)
    for i in range(size):
        if ord_model_prediction[i] < 1:
            ord_model_prediction[i] = 1

    ord_acc = metrics.accuracy_score(listTestStars, ord_model_prediction)
    ord_f1 = metrics.f1_score(listTestStars, ord_model_prediction, average='micro')
    ord_r2 = metrics.r2_score(listTestStars, ord_model_prediction,
                              multioutput='variance_weighted')
    ord_my = betterScoring(listTestStars, ord_model_prediction)

    """*************************************"""
    # trying mord
    arr = np.asarray(listTrainStars)

    clf2 = mord.LogisticAT(alpha=1.)
    clf2.fit(trainCVMatr, arr)
    clf2_prediction = clf2.predict(testCVMatr)
    LAT_acc = metrics.accuracy_score(listTestStars, clf2_prediction)
    LAT_f1 = metrics.f1_score(listTestStars, clf2_prediction, average='micro')
    LAT_r2 = metrics.r2_score(listTestStars, clf2_prediction,
                              multioutput='variance_weighted')
    LAT_my = betterScoring(listTestStars, clf2_prediction)
    # print('AccuracyScore of LogisticAT %s' %
    #       metrics.accuracy_score(listTestStars, clf2.predict(testCVMatr)))

    clf3 = mord.LogisticIT(alpha=1.)
    clf3.fit(trainCVMatr, arr)
    clf3_prediction = clf3.predict(testCVMatr)
    LIT_acc = metrics.accuracy_score(listTestStars, clf3_prediction)
    LIT_f1 = metrics.f1_score(listTestStars, clf3_prediction, average='micro')
    LIT_r2 = metrics.r2_score(listTestStars, clf3_prediction,
                              multioutput='variance_weighted')
    LIT_my = betterScoring(listTestStars, clf3_prediction)
    # print('AccuracyScore of LogisticIT %s' %
    #       metrics.accuracy_score(listTestStars, clf3.predict(testCVMatr)))

    clf4 = mord.LogisticSE(alpha=1.)
    clf4.fit(trainCVMatr, arr)
    clf4_prediction = clf4.predict(testCVMatr)
    LSE_acc = metrics.accuracy_score(listTestStars, clf4_prediction)
    LSE_f1 = metrics.f1_score(listTestStars, clf4_prediction, average='micro')
    LSE_r2 = metrics.r2_score(listTestStars, clf4_prediction,
                              multioutput='variance_weighted')
    LSE_my = betterScoring(listTestStars, clf4_prediction)
    # print('AccuracyScore of LogisticSE %s' %
    #       metrics.accuracy_score(listTestStars, clf4.predict(testCVMatr)))

    """*************************************"""
    # return value
    categoryName = trainFileName.replace("dataset/prodAnalysis/train_", "")
    categoryName = categoryName.replace(".json", "")
    return [
        categoryName,
        LR_CV_score, LR_CV_f1, LR_CV_r2, LR_my,
        NB_CV_score, NB_CV_f1, NB_CV_r2, NB_my,
        vader_acc, vader_f1, vader_r2, vader_my,
        ord_acc, ord_f1, ord_r2, ord_my,
        LAT_acc, LAT_f1, LAT_r2, LAT_my,
        LIT_acc, LIT_f1, LIT_r2, LIT_my,
        LSE_acc, LSE_f1, LSE_r2, LSE_my,
    ]
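# OrdinalClassifier above is not a scikit-learn class. A common implementation
# follows Frank & Hall's binary-decomposition scheme (train K-1 classifiers for
# P(y > k) and difference them); a minimal, hypothetical sketch of that pattern:
import numpy as np
from sklearn.base import clone

class OrdinalClassifier:
    """Frank & Hall (2001): one binary classifier per ordinal threshold."""
    def __init__(self, base_clf):
        self.base_clf = base_clf
        self.clfs = {}

    def fit(self, X, y):
        y = np.asarray(y)
        self.classes_ = np.sort(np.unique(y))
        for k in self.classes_[:-1]:
            clf = clone(self.base_clf)
            clf.fit(X, (y > k).astype(int))  # binary target: label exceeds k
            self.clfs[k] = clf
        return self

    def predict(self, X):
        # P(y > k) for each threshold, then P(y = k) by differencing
        prob_gt = np.column_stack([self.clfs[k].predict_proba(X)[:, 1]
                                   for k in self.classes_[:-1]])
        probs = np.empty((X.shape[0], len(self.classes_)))
        probs[:, 0] = 1 - prob_gt[:, 0]
        for i in range(1, len(self.classes_) - 1):
            probs[:, i] = prob_gt[:, i - 1] - prob_gt[:, i]
        probs[:, -1] = prob_gt[:, -1]
        return self.classes_[np.argmax(probs, axis=1)]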
def ordinal_regression_bucketed_evaluation(annotator_df, position_df, args):
    train_df = prepare_dataset(annotator_df, position_df, keep_first_sentence=False)

    agreement_data = []
    for i, row in train_df.iterrows():
        agreement_data.append({
            "worker_id": str(row["worker_id"]),
            "sentence_id": str(row["sentence_id_y"]),
            "value": row["suspense"],
            "type": "human"
        })

    print(f"Evaluated rows - training {len(train_df)}, test {len(train_df)}")

    results_data = []
    for col in model_prediction_columns:
        for feature_col in [f"{col}"]:
            results_dict = OrderedDict()
            results_dict["measure"] = feature_col

            train_features = features(feature_col, train_df)
            train_target = train_df[annotator_prediction_column].astype(int).to_numpy()

            class_weights = class_weight.compute_class_weight(
                'balanced', numpy.unique(train_target), train_target)
            # class_weights = [max(0.5, min(c, 10.0)) for c in class_weights]
            sample_weights = [class_weights[x - 1] for x in train_target]
            print("Class Weights", class_weights)

            model = mord.LogisticIT(alpha=0.0)
            params = {}
            pipeline = Pipeline([('model', model)])
            print('Estimator: ', model)
            grid = GridSearchCV(pipeline, params,
                                scoring='neg_mean_absolute_error',
                                n_jobs=1, cv=args["folds"])
            grid.fit(train_features, train_target)
            # model__sample_weight=sample_weights)

            pred = grid.best_estimator_.predict(train_features)

            classification_report = metrics.classification_report(
                train_target, numpy.round(pred).astype(int), output_dict=True)
            classification_report = flatten(classification_report)
            results_dict = {**results_dict, **classification_report}

            agreement_triples = []
            for pred, target_value, sentence in zip(pred, train_target,
                                                    train_df["sentence_id_x"]):
                agreement_triples.append(
                    (str("model"), str(array_to_first_value(sentence)), pred))
                agreement_triples.append(
                    (str("target"), str(array_to_first_value(sentence)),
                     array_to_first_value(target_value)))
                agreement_data.append({
                    "worker_id": f"{feature_col}",
                    "sentence_id": str(sentence),
                    "value": pred,
                    "type": "model_fitted"
                })
            agreement(agreement_triples, "regression", results_dict)
            results_data.append(flatten(results_dict))

            proportion_counts = train_df[f"{feature_col}_scaled"].loc[
                train_df[f"{feature_col}_scaled"] != 0].value_counts(
                    normalize=True, sort=False)

            total = 0.0
            category_threshold_dict = OrderedDict()
            features_as_numpy = train_df[f"{feature_col}_scaled"].values
            for item, value in proportion_counts.iteritems():
                total += value
                category_threshold_dict[item] = numpy.percentile(
                    features_as_numpy, max(min(total * 100, 100.0), 0))

            for k in ["prop", "std"]:
                agreement_triples = []
                results_dict = OrderedDict()
                results_dict["measure"] = f"{feature_col}_{k}"

                predictions = []
                for pred, target_value, sentence in zip(train_features, train_target,
                                                        train_df["sentence_id_x"]):
                    if k == "std":
                        if pred >= 2.0:
                            mapped_pred = 5
                        elif pred >= 1.0:
                            mapped_pred = 4
                        elif pred < -2.0:
                            mapped_pred = 1
                        elif pred <= -1.0:
                            mapped_pred = 2
                        else:
                            mapped_pred = 3
                    else:
                        mapped_pred = 5  # Default to the biggest, reassign if less
                        for key, value in category_threshold_dict.items():
                            if pred < value:
                                mapped_pred = key
                                break
                    predictions.append(mapped_pred)

                    agreement_triples.append(
                        (str("model"), str(array_to_first_value(sentence)),
                         array_to_first_value(mapped_pred)))
                    agreement_triples.append(
                        (str("target"), str(array_to_first_value(sentence)),
                         array_to_first_value(target_value)))
                    agreement_data.append({
                        "worker_id": f"{feature_col}_{k}",
                        "sentence_id": str(sentence),
                        "value": pred,
                        "type": f"model_{k}"
                    })

                classification_report = metrics.classification_report(
                    train_target, numpy.array(predictions).astype(int),
                    output_dict=True)
                classification_report = flatten(classification_report)
                results_dict = {**results_dict, **classification_report}
                agreement(agreement_triples, "regression", results_dict)
                results_data.append(flatten(results_dict))

    results_df = pd.DataFrame(data=results_data)
    results_df.to_csv(
        f"{args['output_dir']}/sentence_model_evaluation/categorical_evaluation.csv")

    agreement_df = pandas.DataFrame(data=agreement_data)

    worker_pairwise_agreements = []
    for (worker, other_worker) in combinations(agreement_df["worker_id"].unique(), 2):
        worker_df = agreement_df.loc[agreement_df["worker_id"] == worker]
        other_worker_df = agreement_df.loc[agreement_df["worker_id"] == other_worker]

        triples = []
        agreement_dict = {}
        agreement_dict["worker_id"] = worker
        agreement_dict["type"] = worker_df["type"].values[0]
        agreement_dict["worker_id_2"] = other_worker
        agreement_dict["type_2"] = other_worker_df["type"].values[0]

        combined_df = pandas.merge(worker_df, other_worker_df,
                                   on="sentence_id", how="inner")
        if len(combined_df) > 0:
            predictions = []
            targets = []
            for i, row in combined_df.iterrows():
                triples.append(
                    ("worker", str(array_to_first_value(row["sentence_id"])),
                     array_to_first_value(row["value_x"])))
                targets.append(array_to_first_value(row["value_x"]))
                triples.append(
                    ("other", str(array_to_first_value(row["sentence_id"])),
                     array_to_first_value(row["value_y"])))
                predictions.append(array_to_first_value(row["value_y"]))

            classification_report = metrics.classification_report(
                numpy.array(targets).astype(int),
                numpy.array(predictions).astype(int), output_dict=True)
            classification_report = flatten(classification_report)
            agreement_dict = {**agreement_dict, **classification_report}
            agreement_dict["num_prediction_points"] = len(combined_df)
            agreement(triples, "agreement", agreement_dict)
            worker_pairwise_agreements.append(agreement_dict)

    cross_pairwise_agreements_df = pd.DataFrame(data=worker_pairwise_agreements)
    cross_pairwise_agreements_df.to_csv(
        f"{args['output_dir']}/sentence_model_evaluation/all_pairwise_agreements.csv")
    report = metrics.classification_report(y_test, y_pred,
                                           target_names=targetNames,
                                           output_dict=True)
    result = {
        targetNames[0]: report[targetNames[0]],
        targetNames[1]: report[targetNames[1]],
        targetNames[2]: report[targetNames[2]],
        targetNames[3]: report[targetNames[3]],
        targetNames[4]: report[targetNames[4]],
    }
    return result


if __name__ == '__main__':
    model = mord.LogisticIT(alpha=1.0)
    cl = 0
    lemmanized = True
    cleaningDescription = [
        "Tokens originales", "Tokens con letras", "Tokens con letras sin stopwords"
    ]
    # list of texts, each text is a string (an SMS)
    sampleTexts, y, stats = readMessages(cl, lemmanized)
    print(len(sampleTexts), "messages in corpus")
    print(y.count(1), " 1 messages in corpus")
    print(y.count(2), " 2 messages in corpus")
    print(y.count(3), " 3 messages in corpus")
    print(y.count(4), " 4 messages in corpus")
y = points.loc[points.interactions > 0, 'interactions']
y_bin = points.loc[points.interactions > 0, 'binary_interactions']

log_reg.fit(X, y)
print(log_reg.score(X, y))
print(log_reg.coef_)
log_reg1.fit(X1, y)
print(log_reg1.score(X1, y))
print(log_reg1.coef_)

log_reg_bin.fit(X, y_bin)
print(log_reg_bin.score(X, y_bin))
print(log_reg_bin.coef_)
log_reg_bin1.fit(X1, y_bin)
print(log_reg_bin1.score(X1, y_bin))
print(log_reg_bin1.coef_)

ord_log_reg = mord.LogisticIT()
ord_log_reg1 = mord.LogisticIT()
ord_log_reg.fit(X, y)
print(ord_log_reg.score(X, y))
print(ord_log_reg.coef_)
ord_log_reg1.fit(X1, y)
print(ord_log_reg1.score(X1, y))
print(ord_log_reg1.coef_)

Xp = points.loc[:, ['overpoints']]
Xp1 = points.loc[:, ['overpoints', 'to_end_seconds']]
y_ord = ord_log_reg.predict(Xp)
y_nom = log_reg.predict(Xp)
y_ord1 = ord_log_reg1.predict(Xp1)
y_nom1 = log_reg1.predict(Xp1)
y_binp = log_reg_bin.predict(Xp)
def run_classification(X_train, X_test, y_train, y_test, how='rfc',
                       random_state=0, n_jobs=2, cv=False, stand=False,
                       verbose=True, full_output=False, **classpar):
    """Fit and evaluate the classifier selected by `how`."""
    if stand:
        X_train = StandardScaler().fit_transform(X_train)
        X_test = StandardScaler().fit_transform(X_test)

    if how == 'or1':
        pars = {'alpha': 1e0, 'verbose': 1, 'max_iter': 1e5}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = mord.LogisticAT(**classpar)
    elif how == 'or2':
        pars = {'alpha': 1e0, 'verbose': 1, 'max_iter': 1e5}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = mord.LogisticIT(**classpar)
    elif how == 'or3':
        pars = {'alpha': 1e0, 'fit_intercept': True, 'normalize': False,
                'copy_X': True, 'max_iter': None, 'tol': 0.001, 'solver': 'auto'}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = mord.OrdinalRidge(random_state=random_state, **classpar)
    elif how == 'or4':
        pars = {'epsilon': 0.0, 'tol': 0.0001, 'C': 1.0, 'loss': 'l1',
                'fit_intercept': True, 'intercept_scaling': 1.0, 'dual': True,
                'verbose': 0, 'max_iter': 10000}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = mord.LAD(random_state=random_state, **classpar)
    elif how == 'prank':
        pars = {'n_iter': 1000, 'shuffle': True}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = ranking.PRank(random_state=random_state, **classpar)
    elif how == 'kprank':
        pars = {'n_iter': 200, 'shuffle': True, 'kernel': 'rbf', 'gamma': 1e2,
                'degree': 3, 'coef0': 1}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = ranking.KernelPRank(random_state=random_state, **classpar)
    elif how == 'rfc':
        pars = {'n_estimators': 1000, 'criterion': 'gini', 'max_depth': None,
                'min_samples_split': 2, 'min_samples_leaf': 1,
                'min_weight_fraction_leaf': 0.0, 'max_features': 'auto',
                'max_leaf_nodes': None, 'min_impurity_split': 1e-07,
                'bootstrap': True, 'oob_score': True, 'verbose': 0,
                'warm_start': False, 'class_weight': None}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = RFC(random_state=random_state, n_jobs=n_jobs, **classpar)
    elif how == 'svc':
        pars = {'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto',
                'coef0': 0.0, 'shrinking': True, 'probability': False,
                'tol': 0.001, 'cache_size': 200, 'class_weight': None,
                'verbose': False, 'max_iter': -1,
                'decision_function_shape': None}
        for par in pars:
            if par not in classpar:
                classpar.update({par: pars.get(par)})
        clasif = SVC(random_state=random_state, **classpar)
    else:
        print('Classifier not yet supported')
        return

    if cv:
        crosv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=random_state)
        # y_pred = cross_val_predict(clasif, X_train, y_train, cv=5,
        #                            n_jobs=n_jobs, verbose=1)
        # f1 = f1_score(y_test, y_pred, average='weighted')
        # ck = cohen_kappa_score(y_test, y_pred)
        # rec = recall_score(y_test, y_pred, average='weighted')
        # if verbose:
        #     print('\nF1={:.2f}, Recall={:.2f}, Cohen Kappa={:.2f}'.format(f1, rec, ck))
        # return f1, rec, ck
        f1_cv_scores = cross_val_score(clasif, X_train, y_train, cv=crosv,
                                       scoring='f1_weighted', verbose=1,
                                       n_jobs=n_jobs)
        mean_cv_f1 = np.mean(f1_cv_scores)
        if verbose:
            print(f1_cv_scores)
            print('Mean F1 score={:.3f}'.format(mean_cv_f1))
        return mean_cv_f1, f1_cv_scores
    else:
        if verbose:
            print(clasif.fit(X_train, y_train.astype(int)))
        else:
            clasif.fit(X_train, y_train.astype(int))
        y_pred = clasif.predict(X_test)
        if verbose:
            print('\n', imbmet.classification_report_imbalanced(y_test, y_pred))
        if verbose and hasattr(clasif, 'feature_importances_'):
            print('Feature importances:')
            print(clasif.feature_importances_)
        ck = cohen_kappa_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        if verbose:
            print('\nF1={:.2f}, Recall={:.2f}, Cohen Kappa={:.2f}'.format(f1, rec, ck))
        if full_output:
            return clasif, f1, rec, ck
        else:
            return f1, rec, ck
                  pca_scale=True, inputation=True, strategy='median',
                  remove_low_variance=False)

columns_to_drop = ['Response']
x = train.drop(columns_to_drop, axis=1)
y = train.Response - 1
test_x = test.drop(columns_to_drop, axis=1)

# =============================================================================
# Threshold base models
# =============================================================================
# Immediate-Threshold Model
lad_model_IT = mord.LogisticIT(alpha=1, verbose=1, max_iter=5000)
# fit model
lad_model_IT.fit(x, y)
# predict
train_y_pred = lad_model_IT.predict(x)
y_pred = lad_model_IT.predict(test_x) + 1
# evaluate
quadratic_weighted_kappa(train_y_pred, y)

# All-Threshold Model
lad_model_AT = mord.LogisticAT(alpha=0.5, verbose=1, max_iter=5000)
# fit model
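# quadratic_weighted_kappa is not defined in this excerpt. A minimal stand-in,
# assuming it means the usual quadratically weighted Cohen's kappa (QWK):
from sklearn.metrics import cohen_kappa_score

def quadratic_weighted_kappa(y_pred, y_true):
    # Cohen's kappa with quadratic weights, the standard QWK metric
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')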