Example 1
    def __init__(self, modelType):
        """
        :param modelType: 'logIT', 'logAT', 'ordRidge', 'lad', 'multiclasslogistic'
        """
        modelType = modelType.lower()

        if modelType == 'logit':
            print("Using Logistic Immediate-Threshold variant")
            self.model = mord.LogisticIT(alpha=1.0, verbose=0, max_iter=1000)
        elif modelType == 'logat':
            print("Using Logistic All-Threshold variant")
            self.model = mord.LogisticAT(alpha=1.0, verbose=0, max_iter=1000)
        elif modelType == 'ordridge':
            print("Using Ordinal Ridge variant")
            # Earlier grid-search results, kept for reference:
            # Best Score: -0.4885966615485714
            # Best Param: {'alpha': 0.02, 'solver': 'sag', 'max_iter': 100000, 'fit_intercept': True, 'copy_X': True, 'tol': 0.01, 'normalize': True}
            # self.model = mord.OrdinalRidge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')

            # Best Score: -0.48869761710226156
            # Best Param: {'alpha': 5e-05, 'fit_intercept': True, 'max_iter': 50000, 'copy_X': True, 'normalize': False, 'solver': 'cholesky', 'tol': 5e-05}
            ####  Completed: OrdinalRegression ordridge training ####
            # self.model = mord.OrdinalRidge(alpha=0.00005, fit_intercept=True, normalize=False, copy_X=True, max_iter=50000, tol=0.00005, solver='cholesky')
            # self.model = mord.OrdinalRidge(alpha=0.00001, fit_intercept=True, normalize=True, copy_X=True, max_iter=1000000, tol=0.0000001, solver='auto')

            # Note: mord.OrdinalRidge forwards these arguments to
            # scikit-learn's Ridge, which dropped 'normalize' in version 1.2;
            # on newer scikit-learn, remove it and scale features explicitly.
            self.model = mord.OrdinalRidge(alpha=0.0001,
                                           fit_intercept=True,
                                           normalize=False,
                                           copy_X=True,
                                           max_iter=3000000,
                                           tol=0.0001,
                                           solver='auto')
        elif modelType == 'lad':
            print("Using Least Absolute Deviation")
            # Note: mord.LAD builds on scikit-learn's LinearSVR, where recent
            # versions spell this loss 'epsilon_insensitive' instead of 'l1'.
            self.model = mord.LAD(epsilon=0.0,
                                  tol=0.0001,
                                  C=1.0,
                                  loss='l1',
                                  fit_intercept=True,
                                  intercept_scaling=1.0,
                                  dual=True,
                                  verbose=0,
                                  random_state=None,
                                  max_iter=1000)
        elif modelType == 'multiclasslogistic':
            print("Using Multiclass Logistic")
            self.model = mord.MulticlassLogistic(alpha=1.0,
                                                 verbose=0,
                                                 maxiter=1000)
        else:
            print("Model selection not recognised.\n"
                  "Defaulted to Logistic Immediate-Threshold variant")
            self.model = mord.LogisticIT(alpha=1.0, verbose=1, max_iter=1000)
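A minimal usage sketch for this wrapper. The enclosing class name OrdinalRegression is inferred from the training-log comment above, so treat it as an assumption; the data here is synthetic:

import numpy as np

X = np.random.rand(200, 5)
y = np.random.randint(1, 6, size=200)  # ordinal labels 1..5

wrapper = OrdinalRegression('logAT')   # prints the chosen variant
wrapper.model.fit(X, y)
print(wrapper.model.predict(X[:3]))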
Example 2
def order_logit_regression():
    data = read_csv(CSV_PATH)
    bunch = Bunch(data=data.iloc[:, 1:-1], target=data.iloc[:, -1])
    d = bunch.data

    train_len = int(0.75 * d.shape[0])
    # .ix was removed from pandas; positional slicing with .iloc is equivalent
    trainX, trainY = d.iloc[:train_len, :], bunch.target[:train_len]
    testX, testY = d.iloc[train_len:, :], bunch.target[train_len:]

    clf1 = mord.LogisticAT(alpha=0.5)
    clf1.fit(trainX, trainY)
    pred = clf1.predict(testX)
    draw_acc_matrix(testY, pred, train_len)
    print('Accuracy of LogisticAT: %s' % metrics.accuracy_score(testY, pred))
    print('Mean absolute error of LogisticAT: %s' %
          metrics.mean_absolute_error(testY, pred))

    clf2 = mord.LogisticIT(alpha=0.5)
    clf2.fit(trainX, trainY)
    pred2 = clf2.predict(testX)
    draw_acc_matrix(testY, pred2, train_len)
    print('Accuracy of LogisticIT: %s' % metrics.accuracy_score(testY, pred2))
    print('Mean absolute error of LogisticIT: %s' %
          metrics.mean_absolute_error(testY, pred2))

    clf3 = mord.LogisticSE(alpha=0.5)
    clf3.fit(trainX, trainY)
    pred3 = clf3.predict(testX)
    draw_acc_matrix(testY, pred3, train_len)
    print('Accuracy of LogisticSE: %s' % metrics.accuracy_score(testY, pred3))
    print('Mean absolute error of LogisticSE: %s' %
          metrics.mean_absolute_error(testY, pred3))
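draw_acc_matrix is a project-local helper not shown in this example; a minimal stand-in, assuming it only renders the test-fold confusion matrix (train_len is kept so the signature matches the calls above):

import matplotlib.pyplot as plt
from sklearn import metrics

def draw_acc_matrix(y_true, y_pred, train_len):
    # plot the confusion matrix of the held-out predictions
    cm = metrics.confusion_matrix(y_true, y_pred)
    plt.matshow(cm, cmap='Blues')
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.colorbar()
    plt.show()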
Example 3
def build_and_evaluate_sklearn(sampleTexts, y):
    # convert labels so that LogisticIT can consume them
    import numpy as np
    y = np.asarray(y)

    # build a vector of token counts
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(
        sampleTexts)  # list of texts, each text is a string
    X = X_counts

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # ordinal regression via mord
    import mord as m
    clf = m.LogisticIT()

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
Example 4
def main():
    data = pd.read_csv('./final_data.csv')
    X_train, X_test, y_train, y_test = split.split(data)

    clf = mord.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print('accuracy_score: %.6f' % accuracy_score(y_test, y_pred))
    print('precision_score: %.6f' %
          precision_score(y_test, y_pred, average='macro'))
    print('recall_score: %.6f' % recall_score(y_test, y_pred, average='macro'))

    target_names = ['VeryGood', 'Good', 'Fair', 'Bad', 'VeryBad']
    print('classification_report:',
          classification_report(y_test, y_pred, target_names=target_names))
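The split module is project-local and not shown; a plausible minimal version, assuming the last column of final_data.csv is the target and the split delegates to scikit-learn:

from sklearn.model_selection import train_test_split

def split(data, test_size=0.2, random_state=42):
    # hypothetical stand-in for the project's split.split
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return train_test_split(X, y, test_size=test_size,
                            random_state=random_state)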
Example 5
def fit_logistic_it_with_crossvalidation(X, y):
    """An ordinal model of dataset with hyperparameter 
    cross-validation.  Immediate-Threshold (logistic/threshold) variant.
    
    Parameters & returns as per other training functions.
    """

    basemod = mord.LogisticIT()
    cv = 5
    param_grid = {'alpha': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 3.0]}
    return fit_classifier_with_crossvalidation(X,
                                               y,
                                               basemod,
                                               cv,
                                               param_grid,
                                               verbose=False)
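fit_classifier_with_crossvalidation is defined elsewhere in this codebase; a minimal sketch of what such a helper typically does (an assumption, not the author's code): grid-search the parameter grid with cross-validation and return the refitted best model.

from sklearn.model_selection import GridSearchCV

def fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid,
                                        verbose=True):
    # exhaustively search param_grid with cv-fold cross-validation
    searcher = GridSearchCV(basemod, param_grid, cv=cv)
    searcher.fit(X, y)
    if verbose:
        print("Best params:", searcher.best_params_)
        print("Best score: %0.3f" % searcher.best_score_)
    return searcher.best_estimator_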
Example 6
def testModel(sampleTexts, y):
    y = np.asarray(y)

    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(sampleTexts)
    X = X_counts

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    clf = m.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
Example 7
def build_and_evaluate_sklearn(sampleTexts, y):
    '''Build vector of token counts'''

    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(
        sampleTexts)  #list of texts, each text is a string
    X = X_counts
    print(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    #clf = LogisticRegression()
    clf = m.LogisticIT()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
Example 8
def test_predict_proba_nonnegative():
    """
    Test that predict_proba() function outputs a tuple of non-negative values
    """
    def check_for_negative_prob(proba):
        for p in np.ravel(proba):
            assert_greater_equal(np.round(p, 7), 0)

    clf = mord.LogisticAT(alpha=0.)
    clf.fit(X, y)
    check_for_negative_prob(clf.predict_proba(X))

    clf2 = mord.LogisticIT(alpha=0.)
    clf2.fit(X, y)
    check_for_negative_prob(clf2.predict_proba(X))

    clf3 = mord.LogisticSE(alpha=0.)
    clf3.fit(X, y)
    check_for_negative_prob(clf3.predict_proba(X))
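X, y and assert_greater_equal come from the surrounding test module; an assumed stand-in for running this test in isolation:

import numpy as np
import mord

np.random.seed(0)
X = np.random.randn(100, 4)
y = np.digitize(X.dot(np.ones(4)), bins=[-2.0, 0.0, 2.0])  # ordinal 0..3

def assert_greater_equal(a, b):
    assert a >= b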
Example 9
def build_and_evaluate_sklearn(sampleTexts, y):
    '''Build vector of token counts'''
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(
        sampleTexts)  #list of texts, each text is a string
    X = X_counts

    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    X = X_tfidf

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    from sklearn.naive_bayes import MultinomialNB
    import mord as m
    clf = m.LogisticIT()
    #clf = MultinomialNB()

    ###########      IMPORTANT     ######################

    # Support Vector Machine
    # see http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
    #from sklearn.svm import LinearSVC

    # kNN algorithm
    # see http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
    #from sklearn.neighbors import KNeighborsClassifier

    # Logistic Regression
    # see http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    #from sklearn.linear_model import LogisticRegression

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
Example 10
def fit_logistic_it_with_crossvalidation(X, y, alpha=1.0):
    """An ordinal model of dataset with hyperparameter 
    cross-validation.  Immediate-Threshold (logistic/threshold) variant.
    
    Parameters & returns as per other training functions.
    
    alpha: float :
        Regularization parameter. Zero is no regularization, 
        higher values increate the squared l2 regularization.
    """

    basemod = mord.LogisticIT(alpha=alpha)
    cv = 5
    param_grid = {'alpha': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 3.0]}
    return fit_classifier_with_crossvalidation(X,
                                               y,
                                               basemod,
                                               cv,
                                               param_grid,
                                               verbose=False)
Example 11
def train_ordinal_logistic(train_features, train_labels, skip_grid_search,
                           evaluation, num_jobs, loss, alpha, cost,
                           ordinal_algorithm):
    """
  returns the trained ordinal logistic model. loss, alpha and cost are ignored if grid
  search is requested.
  alpha: used only for se, it, at, and ridge and if grid search is not requested
  cost: used only for lad and if grid search is not requested
  loss: used only for lad and if grid search is not requested
  """
    # requested grid search. find best parameters, to achieve highest average score
    if not skip_grid_search:
        penalty_weights = 'dummy'
        clf = grid_search.grid_search(evaluation, train_features, train_labels,
                                      penalty_weights, ordinal_algorithm,
                                      num_jobs)
        params = clf.best_params_
        if 'penalty' in params:
            loss = params['loss']
        if 'alpha' in params:
            alpha = params['alpha']
        if 'cost' in params:
            cost = params['cost']

    # Now perform the training on full train data.
    if ordinal_algorithm == 'logisticse':
        model = mord.LogisticSE(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticit':
        model = mord.LogisticIT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticat':
        model = mord.LogisticAT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'ordinalridge':
        model = mord.OrdinalRidge(alpha=alpha)
    elif ordinal_algorithm == 'lad':
        model = mord.LAD(C=cost, loss=loss, max_iter=10000)
    model = model.fit(train_features, train_labels)

    return model
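A hypothetical call that bypasses the project-specific grid-search helper (grid_search.grid_search is not shown in this excerpt):

model = train_ordinal_logistic(train_features, train_labels,
                               skip_grid_search=True, evaluation=None,
                               num_jobs=1, loss='l1', alpha=1.0, cost=1.0,
                               ordinal_algorithm='logisticat')
print(model.predict(train_features[:5]))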
Example 12
    def __init__(self, wrangl, nsub, num_labels=None, classifier=None):
        self.wrangl = wrangl
        self.n_splits = wrangl.n_splits
        self.t = wrangl.t
        # num_labels falls back to the wrangler's value; an explicit argument wins
        self.num_labels = None
        if wrangl.num_labels: self.num_labels = wrangl.num_labels
        if num_labels: self.num_labels = num_labels
        if self.num_labels is None:
            raise ValueError('Must provide num_labels to Classification')

        self.nsub = nsub

        if classifier:
            self.classifier = classifier
        else:
            self.classifier = mord.LogisticIT()
        self.scaler = StandardScaler()

        # preallocate result arrays filled with NaN
        self.acc = np.full((self.nsub, np.size(self.t), self.n_splits), np.nan)
        self.acc_shuff = np.full(
            (self.nsub, np.size(self.t), self.n_splits), np.nan)
        self.conf_mat = np.full((self.nsub, np.size(self.t), self.n_splits,
                                 self.num_labels, self.num_labels), np.nan)
Example 13
features.loc[features.Cont == 'Medium', 'Cont'] = 2
features.loc[features.Cont == 'High', 'Cont'] = 3

le = preprocessing.LabelEncoder()
le.fit(features.loc[:, 'Type'])
features.loc[:, 'type_encoded'] = le.transform(features.loc[:, 'Type'])

X, y = features.loc[:, ('Infl', 'Cont', 'type_encoded')], data.target

clf1 = linear_model.LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial')
clf1.fit(X, y)

print('Mean Absolute Error of LogisticRegression: %s' %
      metrics.mean_absolute_error(clf1.predict(X), y))

clf2 = mord.LogisticAT(alpha=1.)
clf2.fit(X, y)
print('Mean Absolute Error of LogisticAT: %s' %
      metrics.mean_absolute_error(clf2.predict(X), y))

clf3 = mord.LogisticIT(alpha=1.)
clf3.fit(X, y)
print('Mean Absolute Error of LogisticIT: %s' %
      metrics.mean_absolute_error(clf3.predict(X), y))

clf4 = mord.LogisticSE(alpha=1.)
clf4.fit(X, y)
print('Mean Absolute Error of LogisticSE: %s' %
      metrics.mean_absolute_error(clf4.predict(X), y))
Example 14
File: cine.py Project: zolunga/PLN
    rawText = getTemcorpus(path2, 'latin-1')

    if rawXML is False or rawText is False:
        fails += 1
        continue
    categories.append(int(getRankXML(rawXML, path1)))
    text.append(getLemmas(rawText))

print("Total errors:", fails)

countOBJ = CountVectorizer()  # bag-of-words vectorizer
tfidfOBJ = TfidfTransformer()  # tf-idf weighting
x_count = countOBJ.fit_transform(text)
x_tfidf = tfidfOBJ.fit_transform(x_count)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_tfidf,
                                                    np.array(categories),
                                                    test_size=0.2)

mordObj = m.LogisticIT()
# fit on the training split only, so the held-out fold stays unseen
mordObj.fit(X_train, y_train)

y_pred = mordObj.predict(X_test)

from sklearn import metrics
from sklearn.metrics import confusion_matrix
print('Accuracy of prediction is', mordObj.score(X_test, y_test))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
Example 15
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    # print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    targetNames = ['yes', 'no']
    report = metrics.classification_report(y_test,
                                           y_pred,
                                           target_names=targetNames,
                                           output_dict=True)
    result = {
        targetNames[0]: report[targetNames[0]],
        targetNames[1]: report[targetNames[1]],
    }
    return result


if __name__ == '__main__':
    models = [mord.LogisticIT()]
    cleaningLevels = [2]
    cleaningDescription = [
        "Tokens originales", "Tokens con letras",
        "Tokens con letras sin stopwords"
    ]

    for model in models:
        for cl in cleaningLevels:
            for lemmanized in [True]:
                # list of texts, each text is a string (a sms)
                sampleTexts, y = readMessages(cl, lemmanized)

                # print(len(sampleTexts), "messages in corpus")
                # print(y.count(0), " spam messages in corpus")
                # print(y.count(1), " ham messages in corpus")
Example 16
# TF-IDF
vectorizer = TfidfVectorizer(norm='l2', smooth_idf=True, use_idf=True)
vec = vectorizer.fit_transform(cleanReviews)

X = np.round(vec.todense(), 2)
Y = np.array(ranks)
print(len(X))
print(len(Y))

n = 3600
trainingX = np.array(X[:n])
trainingY = np.array(Y[:n])
testX = np.array(X[n:])
testY = np.array(Y[n:])

c = mord.LogisticIT()
c.fit(trainingX, trainingY)

Ypredict = c.predict(testX)
print(len(Ypredict))
print(len(testY))

print("")
print("")
print("-------------- COMPARATION --------------")
for i in range(0, len(Ypredict)):
    if i % 100 == 0:
        print("Prediction:", Ypredict[i], " Real:", testY[i])
print("")

print("------------ CONFUSION MATRIX -----------")
Example 17
# y <- labels of the texts
# X <- list of features
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(sampleTexts)
#input(type(X_counts))
X = X_counts

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

import mord as m

clf = m.LogisticIT()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

archivo = open('resultados.txt', "w")

from sklearn import metrics
print("Prediction accuracy: ", clf.score(X_test, y_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, y_pred))
print("Classification report: \n",
      metrics.classification_report(y_test, y_pred))

archivo.write("Prediction accuracy: \n")
archivo.write(str(clf.score(X_test, y_test)))
archivo.write("\n\nConfusion matrix: \n")
archivo.write(str(metrics.confusion_matrix(y_test, y_pred)))
Example 18
    print('Accuracy of prediction is', clf.score(X_test, y_test))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    targetNames = ['1', '2', '3', '4', '5']
    report = metrics.classification_report(y_test,
                                           y_pred,
                                           target_names=targetNames,
                                           output_dict=True)
    result = {
        targetNames[0]: report[targetNames[0]],
        targetNames[1]: report[targetNames[1]],
    }
    return result


if __name__ == '__main__':
    models = [mord.LogisticIT(alpha=1.0)]
    cleaningLevels = [0]
    cleaningDescription = [
        "Tokens originales", "Tokens con letras",
        "Tokens con letras sin stopwords"
    ]

    for model in models:
        for cl in cleaningLevels:
            for lemmanized in [True]:
                # list of texts, each text is a string (a sms)
                sampleTexts, y = readMessages(cl, lemmanized)

                print(len(sampleTexts), "messages in corpus")
                print(y.count(1), " 1 messages in corpus")
                print(y.count(2), " 2 messages in corpus")
Example 19
def regressionData(readinData, target_name, output):
    readinData = readinData.iloc[:, 1:]
    convert = 24 * 60 * 60 * 365 * 1000000000  # nanoseconds in one year

    #readinData['Fst_ACCT_OPEN_DT'] = datetime.datetime(readinData['Fst_ACCT_OPEN_DT'].astype(str).split('-'))
    readinData['Fst_ACCT_OPEN_DT'] = (datetime.datetime(2017, 12, 31) -
                                      readinData['Fst_ACCT_OPEN_DT'])
    #print readinData['Fst_ACCT_OPEN_DT']
    readinData['Fst_ACCT_OPEN_DT'] = readinData['Fst_ACCT_OPEN_DT'].astype(
        np.int64)
    #print readinData['Fst_ACCT_OPEN_DT']
    readinData['Fst_ACCT_OPEN_DT'] = pd.to_numeric(
        readinData['Fst_ACCT_OPEN_DT'] / convert)
    #print readinData['Fst_ACCT_OPEN_DT']

    varNames = readinData.columns
    target = list(varNames).index(target_name)
    target_data = readinData.iloc[:, target]
    #print target_data
    readinData.pop(target_name)
    #print readinData.dtypes
    print "Ready for the model"
    train_data_X, test_data_X, train_data_Y, test_data_Y = train_test_split(
        readinData, target_data, test_size=0.3, random_state=0)

    train_data_X = train_data_X.astype(np.float64)
    test_data_X = test_data_X.astype(np.float64)
    #print train_data_Y
    #print train_data_X.dtypes

    train_data_Y = train_data_Y.astype(np.int64)
    test_data_Y = test_data_Y.astype(np.int64)
    print(train_data_Y.dtypes)
    print(test_data_Y.dtypes)
    print(train_data_X.dtypes)
    print(test_data_X.dtypes)

    #print train_data_X
    LR = m.LogisticIT().fit(train_data_X, train_data_Y)
    #LR.fit(train_data_X, train_data_Y)
    predict_data_Y = LR.predict(test_data_X)
    print "finish predicting"
    #print test_data_X

    #predict_data_Y_prob = LR.predict_proba(test_data_X)
    overall_acc = metrics.accuracy_score(test_data_Y, predict_data_Y)
    print(overall_acc)
    #cm = confusion_matrix(test_data_Y, predict_data_Y)
    #result_table = classification_report(test_data_Y, predict_data_Y)

    readinData = readinData.rename(
        columns={
            'Gender_Cd': '性别',
            'Age': '年龄',
            'Fst_ACCT_OPEN_DT': '开户时长',
            'clu73': '活期存款业务活跃度',
            'Is_PP_Cust': '是否开通手机贴膜卡业务',
            'Is_EP_Cust': '是否开通第三方支付业务',
            'clu19': '是否持有信用卡',
            'clu20': '是否持有借记卡',
            'clu21': '是否持有存折',
            'clu212': '是否持有存单',
            'clu213': '是否持有定期一本通',
            'Is_INSU_Cust': '是否社保客户',
            'clu214': '是否持有活期一本通',
            'AUM_0_5': 'AUM资产在0至5万之间客户数',
            'Is_DFDK_CARD_Cust': '代发客户是否持有卡',
            'clu37': '持有定期产品数量',
            'clu38': '持有大额存单数量',
            'clu39': '理财产品数量',
            'clu40': '基金产品数量',
            'clu41': '贵金属产品数量',
            'Is_DFDK_CZ_Cust': '代发客户是否持有存折',
            'clu42': '信托产品数量',
            'clu43': '代销储蓄国债产品数量',
            'clu44': '代理保险产品数量',
            'clu45': '银证第三方存管产品数量',
            'clu46': '个人消费贷款产品数量',
            'clu47': '个人经营贷款产品数量',
            'clu471': '个人委托贷款产品数量',
            'clu48': '信用卡数量',
            'clu72': '定期存款业务活跃度',
            'clu74': '贷款业务活跃度',
            'clu75': '理财业务活跃度',
            'Is_NW_Cust': '是否开通网上银行业务',
            'Is_PB_Cust': '是否开通手机银行业务',
            'Is_WE_Cust': '是否开通微信银行业务',
            'Is_DFDK_Cust': '是否代发客户',
            'CB_CT_TX_NUM': '核心客户柜面使用频率',
            'CB_PB_TX_NUM': '核心客户手机银行使用频率',
            'CB_PP_TX_NUM': '核心客户手机贴膜卡使用频率',
            'CB_NW_TX_NUM': '核心客户网上银行使用频率',
            'CB_WE_TX_NUM': '核心客户微信银行使用频率(非动帐)',
            'CB_ATM_TX_NUM': '核心客户ATM使用频率',
            'CB_EP_TX_NUM': '核心客户第三方支付平台使用频率',
            'CB_POS_TX_NUM': '核心客户POS/TPOS使用频率',
            'indicator_new': '是否过路资金账户'
        })

    new_var_name = readinData.columns
    new_coef = LR.coef_

    scores, pvalues = chi2(train_data_X, train_data_Y)

    print "Start writing"
    resultdf = pd.DataFrame(columns=["Coef", "Variable", "pvalue"])
    for i in range(len(new_var_name)):
        temp = pd.DataFrame([[new_coef[i], new_var_name[i], pvalues[i]]],
                            columns=["Coef", "Variable", "pvalue"])
        resultdf = pd.concat([resultdf, temp], ignore_index=True)
    resultdf = pd.concat([
        resultdf,
        pd.DataFrame([[target_name, overall_acc, 1]],
                     columns=["Coef", "Variable", "pvalue"])
    ])

    #data_v2= pd.DataFrame(columns=["prob", "result"])
    #print test_data_Y[1][1]
    #print len(test_data_Y)
    #print len(predict_data_Y_prob)
    #for i in range(len(predict_data_Y_prob)):

    #    temp = pd.DataFrame([[predict_data_Y_prob[i], test_data_Y[i]]], columns=["prob", "result"])
    #    data_v2 = pd.concat([data_v2, temp], ignore_index=True)

    resultdf.to_csv(output, encoding='utf_8_sig', index=False)
    print "Done"
Example 20
def doAll(trainFileName, testFileName):
    trainSet = makeListEntries(trainFileName)
    testSet = makeListEntries(testFileName)
    """**************************************"""
    # data
    listTrainText = makeListText(trainSet)
    listTestText = makeListText(testSet)

    # target
    listTrainStars = makeListStars(trainSet)
    listTestStars = makeListStars(testSet)
    """*************************************"""
    # could do CountVectorizer
    cv = CountVectorizer(stop_words='english')

    trainCVMatr = cv.fit_transform(listTrainText)
    testCVMatr = cv.transform(listTestText)

    # could do TfidfVectorizer
    # tv = TfidfVectorizer(stop_words = 'english')

    # trainTVMatr = cv.fit_transform(listTrainText)
    # testTVMatr = cv.transform(listTestText)
    """*************************************"""
    # using CountVectorizer
    LR_CV_model = LogisticRegression(multi_class='multinomial',
                                     max_iter=1000,
                                     class_weight='balanced')
    LR_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    LR_CV_prediction = LR_CV_model.predict(testCVMatr)

    # get accuracy score
    LR_CV_score = metrics.accuracy_score(listTestStars, LR_CV_prediction)
    LR_CV_f1 = metrics.f1_score(listTestStars,
                                LR_CV_prediction,
                                average='micro')
    LR_CV_r2 = metrics.r2_score(listTestStars,
                                LR_CV_prediction,
                                multioutput='variance_weighted')
    LR_my = betterScoring(listTestStars, LR_CV_prediction)
    # this is the bit with the tfidf vectorizer
    # LR_TV_model = LogisticRegression(multi_class = 'multinomial', max_iter=1000)
    # LR_TV_model.fit(trainTVMatr, listTrainStars)

    # get it to predict
    # LR_TV_prediction = LR_TV_model.predict(testTVMatr)

    # get accuracy score
    # LR_TV_score = metrics.accuracy_score(listTestStars, LR_TV_prediction)

    # what do the data say?
    #print("Multiclass, logistic regression, CountVectorizer: " + str(LR_CV_score))
    #print("Multiclass, logistic regression, TfidfVectorizer: " + str(LR_TV_score))
    """*************************************"""
    # using CountVectorizer
    NB_CV_model = MultinomialNB()
    NB_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    NB_CV_prediction = NB_CV_model.predict(testCVMatr)

    # get accuracy score
    NB_CV_score = metrics.accuracy_score(listTestStars, NB_CV_prediction)
    NB_CV_f1 = metrics.f1_score(listTestStars,
                                NB_CV_prediction,
                                average='micro')
    NB_CV_r2 = metrics.r2_score(listTestStars,
                                NB_CV_prediction,
                                multioutput='variance_weighted')
    NB_my = betterScoring(listTestStars, NB_CV_prediction)
    # this is the bit with the tfidf vectorizer
    # NB_TV_model = MultinomialNB()
    # NB_TV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    # NB_TV_prediction = NB_TV_model.predict(testTVMatr)

    # get accuracy score
    # NB_TV_score = metrics.accuracy_score(listTestStars, NB_TV_prediction)

    # what do the data say?
    #print("Naive Bayes, CountVectorizer: " + str(NB_CV_score))
    # print("Naive Bayes, TfidfVectorizer: " + str(NB_TV_score))
    """*************************************"""
    sid = SentimentIntensityAnalyzer()
    listOfRes = []

    data2 = [json.loads(line) for line in open(testFileName, 'r')]

    for entry in data2:
        listOfRes.append(sid.polarity_scores(entry['review_body'])['compound'])

    scaledRes = []
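    # Note: q0..q5 are not defined in this excerpt; presumably they are
    # module-level quantile boundaries mapping VADER compound scores
    # onto the 1-5 star scale.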
    size = len(listOfRes)
    for i in range(size):
        num = listOfRes[i]
        score = -1
        if q0 <= num < q1:
            score = 1
        elif q1 <= num < q2:
            score = 2
        elif q2 <= num < q3:
            score = 3
        elif q3 <= num < q4:
            score = 4
        elif q4 <= num <= q5:
            score = 5

        # add score back in
        scaledRes.append(score)

    vader_acc = metrics.accuracy_score(listTestStars, scaledRes)
    vader_f1 = metrics.f1_score(listTestStars, scaledRes, average='micro')
    vader_r2 = metrics.r2_score(listTestStars,
                                scaledRes,
                                multioutput='variance_weighted')
    vader_my = betterScoring(listTestStars, scaledRes)
    """*************************************"""
    # dealing with the ordinal regression
    ord_model = OrdinalClassifier(DecisionTreeClassifier())
    ord_model.fit(trainCVMatr, listTrainStars)
    ord_model_prediction = ord_model.predict(testCVMatr)

    size = len(listTestStars)
    for i in range(size):
        if (ord_model_prediction[i] < 1):
            ord_model_prediction[i] = 1

    ord_acc = metrics.accuracy_score(listTestStars, ord_model_prediction)
    ord_f1 = metrics.f1_score(listTestStars,
                              ord_model_prediction,
                              average='micro')
    ord_r2 = metrics.r2_score(listTestStars,
                              ord_model_prediction,
                              multioutput='variance_weighted')
    ord_my = betterScoring(listTestStars, ord_model_prediction)
    """*************************************"""
    # trying mord

    arr = np.asarray(listTrainStars)
    clf2 = mord.LogisticAT(alpha=1.)
    clf2.fit(trainCVMatr, arr)
    clf2_prediction = clf2.predict(testCVMatr)

    LAT_acc = metrics.accuracy_score(listTestStars, clf2_prediction)
    LAT_f1 = metrics.f1_score(listTestStars, clf2_prediction, average='micro')
    LAT_r2 = metrics.r2_score(listTestStars,
                              clf2_prediction,
                              multioutput='variance_weighted')
    LAT_my = betterScoring(listTestStars, clf2_prediction)
    #print('AccuracyScore of LogisticAT %s' %
    #metrics.accuracy_score(listTestStars, clf2.predict(testCVMatr)))

    clf3 = mord.LogisticIT(alpha=1.)
    clf3.fit(trainCVMatr, arr)
    clf3_prediction = clf3.predict(testCVMatr)

    LIT_acc = metrics.accuracy_score(listTestStars, clf3_prediction)
    LIT_f1 = metrics.f1_score(listTestStars, clf3_prediction, average='micro')
    LIT_r2 = metrics.r2_score(listTestStars,
                              clf3_prediction,
                              multioutput='variance_weighted')
    LIT_my = betterScoring(listTestStars, clf3_prediction)
    #print('AccuracyScore of LogisticIT %s' %
    #metrics.accuracy_score(listTestStars, clf3.predict(testCVMatr)))

    clf4 = mord.LogisticSE(alpha=1.)
    clf4.fit(trainCVMatr, arr)
    clf4_prediction = clf4.predict(testCVMatr)

    LSE_acc = metrics.accuracy_score(listTestStars, clf4_prediction)
    LSE_f1 = metrics.f1_score(listTestStars, clf4_prediction, average='micro')
    LSE_r2 = metrics.r2_score(listTestStars,
                              clf4_prediction,
                              multioutput='variance_weighted')
    LSE_my = betterScoring(listTestStars, clf4_prediction)
    #print('AccuracyScore of LogisticSE %s' %
    #metrics.accuracy_score(listTestStars, clf4.predict(testCVMatr)))
    """*************************************"""

    # return value
    categoryName = trainFileName.replace("dataset/prodAnalysis/train_", "")
    categoryName = categoryName.replace(".json", "")
    return [
        categoryName,
        LR_CV_score,
        LR_CV_f1,
        LR_CV_r2,
        LR_my,
        NB_CV_score,
        NB_CV_f1,
        NB_CV_r2,
        NB_my,
        vader_acc,
        vader_f1,
        vader_r2,
        vader_my,
        ord_acc,
        ord_f1,
        ord_r2,
        ord_my,
        LAT_acc,
        LAT_f1,
        LAT_r2,
        LAT_my,
        LIT_acc,
        LIT_f1,
        LIT_r2,
        LIT_my,
        LSE_acc,
        LSE_f1,
        LSE_r2,
        LSE_my,
    ]
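OrdinalClassifier is not a scikit-learn estimator; a sketch of the usual Frank & Hall style implementation this example presumably relies on (an assumption, not the author's code): train one binary classifier per threshold "is y greater than k?" and difference the cumulative probabilities per class.

import numpy as np
from sklearn.base import clone

class OrdinalClassifier:
    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.models = {}

    def fit(self, X, y):
        y = np.asarray(y)
        self.classes_ = np.sort(np.unique(y))
        # one binary problem per threshold: is y greater than class k?
        for k in self.classes_[:-1]:
            model = clone(self.base_estimator)
            model.fit(X, (y > k).astype(int))
            self.models[k] = model
        return self

    def predict_proba(self, X):
        # P(y > k) for each threshold, taken from column 1 of predict_proba
        cum = np.column_stack([self.models[k].predict_proba(X)[:, 1]
                               for k in self.classes_[:-1]])
        first = 1.0 - cum[:, [0]]          # P(y = smallest class)
        middle = cum[:, :-1] - cum[:, 1:]  # P(y = k) for interior classes
        last = cum[:, [-1]]                # P(y = largest class)
        return np.hstack([first, middle, last])

    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]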
Example 21
def ordinal_regression_bucketed_evaluation(annotator_df, position_df, args):
    train_df = prepare_dataset(annotator_df,
                               position_df,
                               keep_first_sentence=False)

    agreement_data = []

    for i, row in train_df.iterrows():
        agreement_data.append({
            "worker_id": str(row["worker_id"]),
            "sentence_id": str(row["sentence_id_y"]),
            "value": row["suspense"],
            "type": "human"
        })

    print(f"Evaluated rows - training {len(train_df)}, test {len(train_df)}")

    results_data = []

    for col in model_prediction_columns:
        for feature_col in [f"{col}"]:
            results_dict = OrderedDict()
            results_dict["measure"] = feature_col

            train_features = features(feature_col, train_df)
            train_target = train_df[annotator_prediction_column].astype(
                int).to_numpy()

            class_weights = class_weight.compute_class_weight(
                'balanced', classes=numpy.unique(train_target), y=train_target)

            # class_weights = [max(0.5, min(c, 10.0)) for c in class_weights]

            sample_weights = [class_weights[x - 1] for x in train_target]

            print("Class Weights", class_weights)

            model = mord.LogisticIT(alpha=0.0)

            params = {}

            pipeline = Pipeline([('model', model)])

            print('Estimator: ', model)
            grid = GridSearchCV(pipeline,
                                params,
                                scoring='neg_mean_absolute_error',
                                n_jobs=1,
                                cv=args["folds"])
            grid.fit(train_features, train_target)
            # model__sample_weight=sample_weights could be passed here to
            # re-enable class weighting
            pred = grid.best_estimator_.predict(train_features)
            classification_report = metrics.classification_report(
                train_target, numpy.round(pred).astype(int), output_dict=True)

            classification_report = flatten(classification_report)

            results_dict = {**results_dict, **classification_report}

            agreement_triples = []
            for p, target_value, sentence in zip(pred, train_target,
                                                 train_df["sentence_id_x"]):

                agreement_triples.append(
                    (str("model"), str(array_to_first_value(sentence)), p))
                agreement_triples.append(
                    (str("target"), str(array_to_first_value(sentence)),
                     array_to_first_value(target_value)))

                agreement_data.append({
                    "worker_id": f"{feature_col}",
                    "sentence_id": str(sentence),
                    "value": p,
                    "type": "model_fitted"
                })

            agreement(agreement_triples, "regression", results_dict)

            results_data.append(flatten(results_dict))

            proportion_counts = train_df[f"{feature_col}_scaled"].loc[
                train_df[f"{feature_col}_scaled"] != 0].value_counts(
                    normalize=True, sort=False)

            total = 0.0
            category_threshold_dict = OrderedDict()
            features_as_numpy = train_df[f"{feature_col}_scaled"].values
            for item, value in proportion_counts.items():
                total += value
                category_threshold_dict[item] = numpy.percentile(
                    features_as_numpy, max(min(total * 100, 100.0), 0))

            for k in ["prop", "std"]:

                agreement_triples = []
                results_dict = OrderedDict()
                results_dict["measure"] = f"{feature_col}_{k}"
                predictions = []
                for feature_value, target_value, sentence in zip(
                        train_features, train_target,
                        train_df["sentence_id_x"]):

                    if k == "std":

                        if feature_value >= 2.0:
                            mapped_pred = 5
                        elif feature_value >= 1.0:
                            mapped_pred = 4
                        elif feature_value < -2.0:
                            mapped_pred = 1
                        elif feature_value <= -1.0:
                            mapped_pred = 2
                        else:
                            mapped_pred = 3

                    else:
                        mapped_pred = 5  # Default to the biggest, reassign if less
                        for key, value in category_threshold_dict.items():
                            if feature_value < value:
                                mapped_pred = key
                                break

                    predictions.append(mapped_pred)

                    agreement_triples.append(
                        (str("model"), str(array_to_first_value(sentence)),
                         array_to_first_value(mapped_pred)))
                    agreement_triples.append(
                        (str("target"), str(array_to_first_value(sentence)),
                         array_to_first_value(target_value)))

                    agreement_data.append({
                        "worker_id": f"{feature_col}_{k}",
                        "sentence_id": str(sentence),
                        "value": feature_value,
                        "type": f"model_{k}"
                    })

                classification_report = metrics.classification_report(
                    train_target,
                    numpy.array(predictions).astype(int),
                    output_dict=True)

                classification_report = flatten(classification_report)

                results_dict = {**results_dict, **classification_report}

                agreement(agreement_triples, "regression", results_dict)

                results_data.append(flatten(results_dict))

    results_df = pd.DataFrame(data=results_data)
    results_df.to_csv(
        f"{args['output_dir']}/sentence_model_evaluation/categorical_evaluation.csv"
    )

    agreement_df = pandas.DataFrame(data=agreement_data)
    worker_pairwise_agreements = []
    for (worker,
         other_worker) in combinations(agreement_df["worker_id"].unique(), 2):
        worker_df = agreement_df.loc[agreement_df["worker_id"] == worker]

        other_worker_df = agreement_df.loc[agreement_df["worker_id"] ==
                                           other_worker]

        triples = []

        agreement_dict = {}
        agreement_dict["worker_id"] = worker
        agreement_dict["type"] = worker_df["type"].values[0]

        agreement_dict["worker_id_2"] = other_worker
        agreement_dict["type_2"] = other_worker_df["type"].values[0]

        combined_df = pandas.merge(worker_df,
                                   other_worker_df,
                                   on="sentence_id",
                                   how="inner")

        if len(combined_df) > 0:

            predictions = []
            targets = []

            for i, row in combined_df.iterrows():
                triples.append(
                    ("worker", str(array_to_first_value(row["sentence_id"])),
                     array_to_first_value(row["value_x"])))
                targets.append(array_to_first_value(row["value_x"]))

                triples.append(
                    ("other", str(array_to_first_value(row["sentence_id"])),
                     array_to_first_value(row["value_y"])))
                predictions.append(array_to_first_value(row["value_y"]))

            classification_report = metrics.classification_report(
                numpy.array(targets).astype(int),
                numpy.array(predictions).astype(int),
                output_dict=True)
            classification_report = flatten(classification_report)
            agreement_dict = {**agreement_dict, **classification_report}

            agreement_dict["num_prediction_points"] = len(combined_df)

            agreement(triples, "agreement", agreement_dict)
            worker_pairwise_agreements.append(agreement_dict)

    cross_pairwise_agreements_df = pd.DataFrame(
        data=worker_pairwise_agreements)
    cross_pairwise_agreements_df.to_csv(
        f"{args['output_dir']}/sentence_model_evaluation/all_pairwise_agreements.csv"
    )
Example 22
    report = metrics.classification_report(y_test,
                                           y_pred,
                                           target_names=targetNames,
                                           output_dict=True)
    result = {
        targetNames[0]: report[targetNames[0]],
        targetNames[1]: report[targetNames[1]],
        targetNames[2]: report[targetNames[2]],
        targetNames[3]: report[targetNames[3]],
        targetNames[4]: report[targetNames[4]],
    }
    return result


if __name__ == '__main__':
    model = mord.LogisticIT(alpha=1.0)
    cl = 0
    lemmanized = True
    cleaningDescription = [
        "Tokens originales", "Tokens con letras",
        "Tokens con letras sin stopwords"
    ]

    # list of texts, each text is a string (a sms)
    sampleTexts, y, stats = readMessages(cl, lemmanized)

    print(len(sampleTexts), "messages in corpus")
    print(y.count(1), " 1 messages in corpus")
    print(y.count(2), " 2 messages in corpus")
    print(y.count(3), " 3 messages in corpus")
    print(y.count(4), " 4 messages in corpus")
Example 23
y = points.loc[points.interactions > 0, 'interactions']
y_bin = points.loc[points.interactions > 0, 'binary_interactions']
log_reg.fit(X, y)
print(log_reg.score(X, y))
print(log_reg.coef_)
log_reg1.fit(X1, y)
print(log_reg1.score(X1, y))
print(log_reg1.coef_)
log_reg_bin.fit(X, y_bin)
print(log_reg_bin.score(X, y_bin))
print(log_reg_bin.coef_)
log_reg_bin1.fit(X1, y_bin)
print(log_reg_bin1.score(X1, y_bin))
print(log_reg_bin1.coef_)

ord_log_reg = mord.LogisticIT()
ord_log_reg1 = mord.LogisticIT()
ord_log_reg.fit(X, y)
print(ord_log_reg.score(X, y))
print(ord_log_reg.coef_)
ord_log_reg1.fit(X1, y)
print(ord_log_reg1.score(X1, y))
print(ord_log_reg1.coef_)

Xp = points.loc[:, ['overpoints']]
Xp1 = points.loc[:, ['overpoints', 'to_end_seconds']]
y_ord = ord_log_reg.predict(Xp)
y_nom = log_reg.predict(Xp)
y_ord1 = ord_log_reg1.predict(Xp1)
y_nom1 = log_reg1.predict(Xp1)
y_binp = log_reg_bin.predict(Xp)
Example 24
def run_classification(X_train,
                       X_test,
                       y_train,
                       y_test,
                       how='rfc',
                       random_state=0,
                       n_jobs=2,
                       cv=False,
                       stand=False,
                       verbose=True,
                       full_output=False,
                       **classpar):
    """

    """
    if stand:
        X_train = StandardScaler().fit_transform(X_train)
        X_test = StandardScaler().fit_transform(X_test)

    if how == 'or1':
        pars = {'alpha': 1e0, 'verbose': 1, 'max_iter': 1e5}
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = mord.LogisticAT(**classpar)
    elif how == 'or2':
        pars = {'alpha': 1e0, 'verbose': 1, 'max_iter': 1e5}
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = mord.LogisticIT(**classpar)
    elif how == 'or3':
        pars = {
            'alpha': 1e0,
            'fit_intercept': True,
            'normalize': False,
            'copy_X': True,
            'max_iter': None,
            'tol': 0.001,
            'solver': 'auto'
        }
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = mord.OrdinalRidge(random_state=random_state, **classpar)
    elif how == 'or4':
        pars = {
            'epsilon': 0.0,
            'tol': 0.0001,
            'C': 1.0,
            'loss': 'l1',
            'fit_intercept': True,
            'intercept_scaling': 1.0,
            'dual': True,
            'verbose': 0,
            'max_iter': 10000
        }
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = mord.LAD(random_state=random_state, **classpar)
    elif how == 'prank':
        pars = {'n_iter': 1000, 'shuffle': True}
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = ranking.PRank(random_state=random_state, **classpar)
    elif how == 'kprank':
        pars = {
            'n_iter': 200,
            'shuffle': True,
            'kernel': 'rbf',
            'gamma': 1e2,
            'degree': 3,
            'coef0': 1
        }
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = ranking.KernelPRank(random_state=random_state, **classpar)
    elif how == 'rfc':
        pars = {
            'n_estimators': 1000,
            'criterion': 'gini',
            'max_depth': None,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'min_weight_fraction_leaf': 0.0,
            'max_features': 'auto',
            'max_leaf_nodes': None,
            'min_impurity_split': 1e-07,
            'bootstrap': True,
            'oob_score': True,
            'verbose': 0,
            'warm_start': False,
            'class_weight': None
        }
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = RFC(random_state=random_state, n_jobs=n_jobs, **classpar)
    elif how == 'svc':
        pars = {
            'C': 1.0,
            'kernel': 'rbf',
            'degree': 3,
            'gamma': 'auto',
            'coef0': 0.0,
            'shrinking': True,
            'probability': False,
            'tol': 0.001,
            'cache_size': 200,
            'class_weight': None,
            'verbose': False,
            'max_iter': -1,
            'decision_function_shape': None
        }
        for par in pars:
            if par not in classpar: classpar.update({par: pars.get(par)})
        clasif = SVC(random_state=random_state, **classpar)
    else:
        print('Classifier not yet supported')
        return

    if cv:
        crosv = ShuffleSplit(n_splits=5,
                             test_size=0.3,
                             random_state=random_state)
        #         y_pred = cross_val_predict(clasif, X_train, y_train, cv=5, n_jobs=n_jobs,
        #                                    verbose=1)

        #         f1 = f1_score(y_test, y_pred, average='weighted')
        #         ck = cohen_kappa_score(y_test, y_pred)
        #         rec = recall_score(y_test, y_pred, average='weighted')

        #         if verbose:
        #             print '\nF1={:.2f}, Recall={:.2f}, Cohen Kappa={:.2f}'.format(f1, rec, ck)

        #         return f1, rec, ck

        f1_cv_scores = cross_val_score(clasif,
                                       X_train,
                                       y_train,
                                       cv=crosv,
                                       scoring='f1_weighted',
                                       verbose=1,
                                       n_jobs=n_jobs)
        mean_cv_f1 = np.mean(f1_cv_scores)
        if verbose:
            print(f1_cv_scores)
            print('Mean F1 score={:.3f}'.format(mean_cv_f1))
        return mean_cv_f1, f1_cv_scores

    else:
        if verbose:
            print(clasif.fit(X_train, y_train.astype(int)))
        else:
            clasif.fit(X_train, y_train.astype(int))

        y_pred = clasif.predict(X_test)

        if verbose:
            print('\n', imbmet.classification_report_imbalanced(y_test, y_pred))
            if hasattr(clasif, 'feature_importances_'):
                print('Feature importances:')
                print(clasif.feature_importances_)

        ck = cohen_kappa_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        if verbose:
            print('\nF1={:.2f}, Recall={:.2f}, Cohen Kappa={:.2f}'.format(
                f1, rec, ck))

        if full_output:
            return clasif, f1, rec, ck
        else:
            return f1, rec, ck
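A hypothetical invocation (X_train, X_test, y_train, y_test assumed to exist), cross-validating the Immediate-Threshold variant selected by how='or2':

mean_f1, f1_scores = run_classification(X_train, X_test, y_train, y_test,
                                        how='or2', cv=True, stand=True)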
Example 25
                        pca_scale=True,
                        inputation=True,
                        strategy='median',
                        remove_low_variance=False)

columns_to_drop = ['Response']
x = train.drop(columns_to_drop, axis=1)
y = train.Response - 1
test_x = test.drop(columns_to_drop, axis=1)

# =============================================================================
# Threshold base models
# =============================================================================

# Immediate-Threshold model
lad_model_IT = mord.LogisticIT(alpha=1, verbose=1, max_iter=5000)

# fit model
lad_model_IT.fit(x, y)

# predict
train_y_pred = lad_model_IT.predict(x)
y_pred = lad_model_IT.predict(test_x) + 1

# evaluate
print(quadratic_weighted_kappa(train_y_pred, y))

# All-Threshold Model
lad_model_AT = mord.LogisticAT(alpha=0.5, verbose=1, max_iter=5000)

# fit model