예제 #1
0
def model_training_out_of_time(df,
                               holdout,
                               target,
                               features,
                               algo,
                               standard=True,
                               show_holdout=False,
                               downsample=False):
    """Train and evaluate ``algo`` with time-based (out-of-time) cross-validation.

    Folds come from ``TimeBasedCV`` (3-year train window, 1-year test window,
    split on the 'sec_filing_date' column). Prints the mean cross-validation
    accuracy and plots a confusion matrix — the summed cross-val matrix by
    default, or the holdout matrix when ``show_holdout`` is True.

    Args:
        df: training DataFrame containing ``features``, ``target`` and a
            'sec_filing_date' column.
        holdout: out-of-time holdout DataFrame with the same columns.
        target: name of the boolean target column.
        features: list of feature columns (must include 'sec_filing_date',
            which is dropped before fitting).
        algo: a scikit-learn estimator instance.
        standard: if True, wrap ``algo`` in a StandardScaler pipeline.
        show_holdout: if True, refit on the last three test windows and
            evaluate on the holdout set.
        downsample: if True, restrict each fold's train indices to a
            majority-class down-sampled subset of ``df``.

    Returns:
        tuple: (scores dict with per-fold 'acc' and 'f1' lists, the last
        fitted classifier, the summed cross-val confusion matrix).
    """
    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    if standard:
        clf = make_pipeline(StandardScaler(), algo)
    else:
        clf = algo

    # Derive a readable name from the final estimator. The previous code read
    # clf.steps[1][1] unconditionally, which raised AttributeError whenever
    # standard=False (a bare estimator has no .steps attribute).
    final_estimator = clf.steps[-1][1] if standard else clf
    alog_name = str(final_estimator).split('(')[0]
    # Insert spaces between CamelCase words, e.g. 'RandomForestClassifier'
    # -> 'Random Forest Classifier'.
    alog_name = " ".join(re.findall('[A-Z][^A-Z]*', alog_name))
    if alog_name == 'S V C':
        alog_name = 'SVM Classifier'
    print('### ' + alog_name + ' ###')

    scores = {'acc': [], 'f1': []}
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the documented replacement.
    cf_matrix_val = np.zeros((2, 2), dtype=int)
    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    for train_index, test_index in tbcv_folds:

        if downsample:
            # Keep only train indices that survive majority-class down-sampling.
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            df_kfold_index = df_kfold.index.tolist()
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)

        f1 = f1_score(target_test, preds)

        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " +
          str(sum(scores['acc']) / len(scores['acc'])))

    if show_holdout:

        # Test model trained on last three years on holdout data
        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        clf.fit(data_train, target_train.values.ravel())
        holdout_preds = clf.predict(X_holdout)
        cf_matrix = confusion_matrix(y_holdout, holdout_preds)

        print("Holdout Score: " + str(clf.score(X_holdout, y_holdout)))
        print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))

    else:
        # Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))

    return scores, clf, cf_matrix_val
예제 #2
0
def LDA_model_out_of_time_tpot(df,
                               features,
                               target,
                               dest_all_model,
                               downsample=False):
    """Run TPOT on LDA topic-probability features with time-based CV.

    For each ``TimeBasedCV`` fold, loads the fold's pre-trained LDA model
    (path taken from ``dest_all_model``), converts train/test documents into
    topic probabilities (cached on disk under
    ``main_dir + 'data/topic_predictions/'``), fits a fresh ``TPOTClassifier``
    and records accuracy/F1. Prints the mean cross-validation accuracy and
    plots the summed confusion matrix.

    Args:
        df: DataFrame containing ``features``, ``target`` and a
            'sec_filing_date' column.
        features: list of feature columns (must include 'sec_filing_date').
        target: name of the boolean target column.
        dest_all_model: mapping from fold number (as a string, 1-based) to a
            ``(train_model_path, val_model_path)`` pair relative to
            ``lda_data_dir``.
        downsample: if True, use the '_downsample' cache files.

    Returns:
        tuple: (scores dict with per-fold 'acc' and 'f1' lists, the last
        fitted TPOT classifier, the summed cross-val confusion matrix).
    """
    X = df[features]
    y = df[target].astype('bool')

    scores = {'acc': [], 'f1': []}
    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    cf_matrix_val = np.zeros((2, 2), dtype=int)

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)

    # Cache-file infix distinguishes down-sampled from full feature caches.
    cache_infix = '_downsample' if downsample else ''

    for k_index, (train_index, test_index) in enumerate(tbcv_folds):

        dest_train, dest_val = dest_all_model[str(k_index + 1)]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" %
              (k_index + 1, k_folds))
        print("=========================================")

        lda_model_train = LdaMulticore.load(lda_data_dir + dest_train)

        def _cached_topic_proba(data, split_name):
            """Return cached topic probabilities for this fold/split, or
            compute them with the fold's LDA model and cache them to disk.

            Replaces four copy-pasted try/except blocks that used bare
            ``except:`` and leaked file handles; the narrowed exceptions
            cover a missing or unreadable/corrupt cache file.
            """
            path = (main_dir + 'data/topic_predictions/' +
                    str(k_index + 1) + cache_infix + '_data_' + split_name +
                    '.list')
            try:
                with open(path, "rb") as fh:
                    return pickle.load(fh)
            except (FileNotFoundError, EOFError, pickle.UnpicklingError):
                print("Prepare %s data" % split_name.capitalize())
                data = get_topic_proba(data, lda_model_train)
                with open(path, "wb") as fh:
                    pickle.dump(data, fh)
                return data

        data_train = _cached_topic_proba(data_train, 'train')
        data_test = _cached_topic_proba(data_test, 'test')

        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)

        f1 = f1_score(target_test, preds)

        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " +
          str(sum(scores['acc']) / len(scores['acc'])))

    # Visualize confusion matrix for cross-val data
    labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    categories = ['No Downgrade', 'Downgrade']
    make_confusion_matrix(cf_matrix_val,
                          group_names=labels,
                          categories=categories,
                          cbar=False,
                          title='Confusion Matrix: TPOT',
                          figsize=(10, 10))

    return scores, clf, cf_matrix_val
예제 #3
0
def model_training_out_of_time_tpot(df,
                                    holdout,
                                    target,
                                    features,
                                    show_holdout=False,
                                    downsample=False):
    """Train a fresh TPOTClassifier per fold with time-based cross-validation.

    Same protocol as ``model_training_out_of_time`` but the estimator is an
    AutoML ``TPOTClassifier`` (re-created for every fold). Prints the mean
    cross-validation accuracy and plots a confusion matrix — the summed
    cross-val matrix by default, or the holdout matrix when ``show_holdout``
    is True.

    Args:
        df: training DataFrame containing ``features``, ``target`` and a
            'sec_filing_date' column.
        holdout: out-of-time holdout DataFrame with the same columns.
        target: name of the boolean target column.
        features: list of feature columns (must include 'sec_filing_date',
            which is dropped before fitting).
        show_holdout: if True, refit on the last three test windows and
            evaluate on the holdout set.
        downsample: if True, restrict each fold's train indices to a
            majority-class down-sampled subset of ``df``.

    Returns:
        tuple: (scores dict with per-fold 'acc' and 'f1' lists, the last
        fitted TPOT classifier, the summed cross-val confusion matrix).
    """
    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    scores = {'acc': [], 'f1': []}
    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    cf_matrix_val = np.zeros((2, 2), dtype=int)
    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    for train_index, test_index in tbcv_folds:

        if downsample:
            # Keep only train indices that survive majority-class down-sampling.
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            df_kfold_index = df_kfold.index.tolist()
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        # .values.ravel() for consistency with the sibling training functions
        # (same label values; avoids sklearn column-vector shape warnings).
        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)
        f1 = f1_score(target_test, preds)

        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " +
          str(sum(scores['acc']) / len(scores['acc'])))

    if show_holdout:

        # Test model trained on last three years on holdout data
        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        clf.fit(data_train, target_train.values.ravel())
        holdout_preds = clf.predict(X_holdout)
        cf_matrix = confusion_matrix(y_holdout, holdout_preds)

        print("Holdout Score: " + str(clf.score(X_holdout, y_holdout)))
        print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: TPOT',
                              figsize=(10, 10))
    else:
        # Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: TPOT',
                              figsize=(10, 10))

    return scores, clf, cf_matrix_val
예제 #4
0
def model_training_out_of_time_pretrained(df,
                                          holdout,
                                          target,
                                          features,
                                          show_holdout=False,
                                          downsample=False):
    """Train a keras CNN on pre-trained 10-K word embeddings with time-based CV.

    Loads pre-trained embeddings and the matching vocabulary, then for each
    ``TimeBasedCV`` fold builds an embedding layer, trains a CNN via
    ``train_model_keras_CNN`` and accumulates a 2x2 confusion matrix. Plots
    the summed cross-val confusion matrix, or — when ``show_holdout`` is
    True — refits on the last three test windows and plots the holdout matrix.

    Embedding source attribution:
    Title: Learning Word Embeddings from 10-K Filings for Financial NLP Tasks
    Author: Saurabh Sehrawat
    Date: 2019
    Code version: 1.0
    Availability: https://github.com/ssehrawat/10K-word-embeddings

    Args:
        df: training DataFrame containing ``features``, ``target`` and a
            'sec_filing_date' column.
        holdout: out-of-time holdout DataFrame with the same columns.
        target: name of the boolean target column.
        features: list of feature columns (must include 'sec_filing_date',
            which is dropped before fitting).
        show_holdout: if True, evaluate on the holdout set.
        downsample: if True, restrict each fold's train indices to a
            majority-class down-sampled subset of ``df``.

    Returns:
        tuple: (scores dict mapping fold index — and optionally 'holdout' —
        to the keras ``history.history`` dict, the summed cross-val
        confusion matrix).
    """
    embed = torch.load(main_dir +
                       'data/10K-word-embeddings/10k_word_embeddings.tar')
    vocab_to_int = torch.load(main_dir +
                              'data/10K-word-embeddings/vocab_to_int.tar')

    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    alog_name = 'CNN'
    print('### ' + alog_name + ' ###')

    scores = dict()
    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    cf_matrix_val = np.zeros((2, 2), dtype=int)
    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)
    for k_index, (train_index, test_index) in enumerate(tbcv_folds):

        if downsample:
            # Keep only train indices that survive majority-class down-sampling.
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            df_kfold_index = df_kfold.index.tolist()
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" %
              (k_index + 1, k_folds))
        print("=========================================")

        x_train, y_train, x_val, y_val, embedding_layer, MAX_SEQUENCE_LENGTH = create_embedding_layer(
            data_train,
            target_train,
            data_test,
            target_test,
            embed,
            vocab_to_int,
            trainable=False)
        history, model = train_model_keras_CNN(x_train, y_train, x_val, y_val,
                                               embedding_layer,
                                               MAX_SEQUENCE_LENGTH)
        print(history.history)
        scores[k_index] = history.history

        # Round sigmoid/softmax outputs to hard class predictions, then
        # collapse one-hot rows to class indices for the confusion matrix.
        preds_y = model.predict(x_val)
        preds_y = np.rint(preds_y)

        preds_y = preds_y.argmax(axis=-1)
        y_val = y_val.argmax(axis=-1)
        cf_matrix_val += confusion_matrix(y_val, preds_y)

    if show_holdout:
        # Test model trained on last three years on holdout data
        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        x_train, y_train, x_val, y_val, embedding_layer, MAX_SEQUENCE_LENGTH = create_embedding_layer(
            data_train,
            target_train,
            X_holdout,
            y_holdout,
            embed,
            vocab_to_int,
            trainable=False)
        history, model = train_model_keras_CNN(x_train, y_train, x_val, y_val,
                                               embedding_layer,
                                               MAX_SEQUENCE_LENGTH)

        preds_y = model.predict(x_val)
        preds_y = np.rint(preds_y)

        preds_y = preds_y.argmax(axis=-1)
        y_val = y_val.argmax(axis=-1)
        cf_matrix = confusion_matrix(y_val, preds_y)

        scores['holdout'] = history.history

        #print("Holdout Score: " + str(clf.score(x_val, y_val)))
        #print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))
    else:
        # Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))

    return scores, cf_matrix_val