Example #1
import datetime

import pandas as pd

def prepare_data_sim(df, downsample=False):

    # Clean and convert data
    df_sim_clean = df.dropna(subset=['Sim_Cosine', 'Sim_Jaccard'])
    sec_filing_date = pd.to_datetime(df_sim_clean['sec_filing_date'],
                                     format='%Y%m%d')
    df_sim_clean['sec_filing_date'] = sec_filing_date
    df_sim_clean = df_sim_clean[
        df_sim_clean['sec_filing_date'] < datetime.datetime(2019, 1, 1)]
    df_sim_clean = df_sim_clean[
        df_sim_clean['sec_filing_date'] > datetime.datetime(2006, 12, 31)]

    # Select model features and compute prior-quantile bins
    df_sim_clean = df_sim_clean[[
        'rating_downgrade', 'sec_year', 'Sim_Cosine', 'Sim_Jaccard', 'default',
        'sec_filing_date'
    ]]
    df_sim_clean = df_sim_clean.dropna()
    df_sim_clean = calculate_previous_quantile(df_sim_clean, 5,
                                               ['Sim_Jaccard', 'Sim_Cosine'])
    df_sim_clean = df_sim_clean.dropna()

    # Create holdout set
    df_holdout = df_sim_clean[
        df_sim_clean['sec_filing_date'] > datetime.datetime(2017, 12, 31)]
    if downsample:
        df_holdout = down_sample_majority_class(df_holdout, 'rating_downgrade')

    # Create train & Validation set
    df_train = df_sim_clean[
        df_sim_clean['sec_filing_date'] <= datetime.datetime(2017, 12, 31)]
    if downsample:
        df_train = down_sample_majority_class(df_train, 'rating_downgrade')

    return df_train, df_holdout
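
The helper functions down_sample_majority_class, calculate_previous_quantile, and TimeBasedCV used throughout these examples are project-level utilities that are not shown. A minimal sketch of what down_sample_majority_class might look like, assuming it balances the target by randomly undersampling the majority class (hypothetical; the real implementation lives elsewhere in the repository):

import pandas as pd

def down_sample_majority_class(df, target_col, random_state=42):
    # Hypothetical implementation: keep every minority-class row plus an
    # equally sized random sample of the majority class.
    counts = df[target_col].value_counts()
    minority = df[df[target_col] == counts.idxmin()]
    majority = df[df[target_col] != counts.idxmin()].sample(
        n=len(minority), random_state=random_state)
    return pd.concat([minority, majority]).sort_index()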
Example #2
import datetime

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

def prepare_data_avg_embedding(df, downsample=False):
    """
    Title: Learning Word Embeddings from 10-K Filings for Financial NLP Tasks
    Author: Saurabh Sehrawat
    Date: 2019
    Code version: 1.0
    Availability: https://github.com/ssehrawat/10K-word-embeddings
    """
    # main_dir (the project data root) is assumed to be defined at module level.
    embed = torch.load(main_dir +
                       'data/10K-word-embeddings/10k_word_embeddings.tar')
    vocab_to_int = torch.load(main_dir +
                              'data/10K-word-embeddings/vocab_to_int.tar')

    df = df.dropna(subset=['diff_text'])

    # First row is a zero placeholder; it is dropped when the DataFrame is built.
    embd_matrix = np.zeros([1, 300], dtype=float)
    for index, row in tqdm(df.iterrows()):
        doc = row['diff_text']
        doc = doc.lower()
        doc = doc.split()
        embd_array = np.zeros([1, 300], dtype=float)
        for word in doc:
            if word in vocab_to_int:
                word_emd = embed[vocab_to_int[word]]
                embd_array = np.append(embd_array,
                                       word_emd.reshape(1, 300),
                                       axis=0)
        avg_embd = np.average(embd_array[1:], axis=0).reshape(1, 300)
        embd_matrix = np.concatenate((embd_matrix, avg_embd), axis=0)

    data = pd.DataFrame(embd_matrix[1:])

    sec_filing_date = pd.to_datetime(df['sec_filing_date'], format='%Y%m%d')
    # Use .values: data has a fresh RangeIndex, so assigning the raw Series
    # would align on df's filtered index and fill the new columns with NaN.
    data['sec_filing_date'] = sec_filing_date.values
    data['rating_downgrade'] = df['rating_downgrade'].values

    # Drop missing values and keep filings before 2019
    data = data.dropna()
    data = data[data['sec_filing_date'] < datetime.datetime(2019, 1, 1)]

    # Create holdout set
    df_holdout = data[
        data['sec_filing_date'] > datetime.datetime(2017, 12, 31)]

    if downsample:
        df_holdout = down_sample_majority_class(df_holdout, 'rating_downgrade')

    # Create train & Validation set
    df_train = data[data['sec_filing_date'] <= datetime.datetime(2017, 12, 31)]

    if downsample:
        df_train = down_sample_majority_class(df_train, 'rating_downgrade')

    return df_train, df_holdout, data
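
Growing arrays with np.append and np.concatenate inside the loop copies the whole buffer on every iteration, which is quadratic in the number of matched tokens. A sketch of an equivalent per-document helper that accumulates rows in a list and stacks once, assuming the same embed and vocab_to_int objects (it also returns a zero vector, rather than NaNs, for documents with no in-vocabulary tokens):

import numpy as np

def average_embedding(text, embed, vocab_to_int, dim=300):
    # Mean of the pretrained embeddings of the in-vocabulary tokens in text.
    rows = [np.asarray(embed[vocab_to_int[w]]).reshape(dim)
            for w in text.lower().split() if w in vocab_to_int]
    if not rows:
        return np.zeros((1, dim))  # the loop above would average to NaN here
    return np.mean(np.stack(rows), axis=0).reshape(1, dim)

# Equivalent replacement for the iterrows loop:
# embd_matrix = np.vstack([average_embedding(t, embed, vocab_to_int)
#                          for t in df['diff_text']])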
Example #3
import datetime

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

def prepare_data_tfidf(df, downsample=False):

    df = df.dropna(subset=['diff_text'])

    lemmatizer = WordNetLemmatizer()  # a lemmatizer, not a stemmer
    docs = []
    for index, row in df.iterrows():
        doc = row['diff_text']
        doc = doc.lower()
        doc = doc.split()

        doc = [lemmatizer.lemmatize(word) for word in doc]
        doc = ' '.join(doc)
        docs.append(doc)

    tfidf = TfidfVectorizer(max_features=1500,
                            min_df=5,
                            max_df=0.7,
                            stop_words=stopwords.words('english'))
    X = tfidf.fit_transform(docs).toarray()

    data = pd.DataFrame(X)
    sec_filing_date = pd.to_datetime(df['sec_filing_date'], format='%Y%m%d')
    # .values avoids index misalignment: data has a RangeIndex, df does not.
    data['sec_filing_date'] = sec_filing_date.values
    data['rating_downgrade'] = df['rating_downgrade'].values

    # Drop missing values and keep filings before 2019
    data = data.dropna()
    data = data[data['sec_filing_date'] < datetime.datetime(2019, 1, 1)]

    # Create holdout set
    df_holdout = data[
        data['sec_filing_date'] > datetime.datetime(2017, 12, 31)]
    if downsample:
        df_holdout = down_sample_majority_class(df_holdout, 'rating_downgrade')

    # Create train & Validation set
    df_train = data[data['sec_filing_date'] <= datetime.datetime(2017, 12, 31)]
    if downsample:
        df_train = down_sample_majority_class(df_train, 'rating_downgrade')

    return df_train, df_holdout
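
The TF-IDF columns in data are anonymous integers. If the vocabulary term behind each column is needed later (e.g. for feature importances), it can be recovered from the fitted vectorizer; a sketch, assuming the function is extended to keep tfidf in scope (get_feature_names_out requires scikit-learn >= 1.0; older releases spell it get_feature_names):

# Label the TF-IDF columns with their vocabulary terms.
terms = tfidf.get_feature_names_out()
data = pd.DataFrame(X, columns=terms)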
Example #4
import datetime

import pandas as pd

def prepare_data_LDA(df, text_feature, downsample=False):

    # Clean and convert data
    data = df.dropna(subset=[text_feature])
    sec_filing_date = pd.to_datetime(data['sec_filing_date'], format='%Y%m%d')
    data['sec_filing_date'] = sec_filing_date

    # Keep filings before 2019
    data = data[data['sec_filing_date'] < datetime.datetime(2019, 1, 1)]

    # Create holdout set
    df_holdout = data[
        data['sec_filing_date'] > datetime.datetime(2017, 12, 31)]
    if downsample:
        df_holdout = down_sample_majority_class(df_holdout, 'rating_downgrade')

    # Create train & Validation set
    df_train = data[data['sec_filing_date'] <= datetime.datetime(2017, 12, 31)]
    if downsample:
        df_train = down_sample_majority_class(df_train, 'rating_downgrade')

    return df_train, df_holdout
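
Unlike the TF-IDF and embedding variants, this function extracts no features itself; it only cleans the dates and splits the data, leaving the raw text_feature column in place for a downstream LDA step that is not shown. A hypothetical sketch of that step with gensim (both the choice of library and 'diff_text' as the text_feature value are assumptions):

from gensim import corpora, models

# Tokenize the retained text column and fit a 10-topic LDA model.
texts = [doc.lower().split() for doc in df_train['diff_text']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary)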
Example #5
import datetime
import re

import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def model_training_out_of_time(df,
                               holdout,
                               target,
                               features,
                               algo,
                               standard=True,
                               show_holdout=False,
                               downsample=False):

    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    if standard:
        clf = make_pipeline(StandardScaler(), algo)
    else:
        clf = algo

    # Derive a printable algorithm name; clf is a Pipeline only when standard=True.
    algo_name = str(clf.steps[-1][1] if standard else clf).split('(')[0]
    algo_name = " ".join(re.findall('[A-Z][^A-Z]*', algo_name))
    if algo_name == 'S V C':
        algo_name = 'SVM Classifier'
    print('### ' + algo_name + ' ###')

    scores = {'acc': [], 'f1': []}
    cf_matrix_val = np.zeros((2, 2), dtype=int)  # np.int was removed in NumPy 1.24
    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    for train_index, test_index in tbcv_folds:

        if downsample:
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            df_kfold_index = df_kfold.index.tolist()
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)

        f1 = f1_score(target_test, preds)

        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " +
          str(sum(scores['acc']) / len(scores['acc'])))

    if show_holdout:

        # Test model trained on last three years on holdout data

        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        clf.fit(data_train, target_train.values.ravel())
        holdout_preds = clf.predict(X_holdout)
        cf_matrix = confusion_matrix(y_holdout, holdout_preds)

        print("Holdout Score: " + str(clf.score(X_holdout, y_holdout)))
        print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + algo_name,
                              figsize=(10, 10))

    else:
        #Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + algo_name,
                              figsize=(10, 10))

    return scores, clf, cf_matrix_val
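
A hypothetical call, assuming df_train and df_holdout come from prepare_data_sim above; note that features must include sec_filing_date (TimeBasedCV splits on it, and the function drops it before fitting):

from sklearn.linear_model import LogisticRegression

features = ['Sim_Cosine', 'Sim_Jaccard', 'sec_filing_date']
scores, clf, cf_matrix = model_training_out_of_time(
    df_train, df_holdout,
    target='rating_downgrade',
    features=features,
    algo=LogisticRegression(max_iter=1000),
    standard=True,
    show_holdout=True)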
Example #6
import datetime

import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
from tpot import TPOTClassifier

def model_training_out_of_time_tpot(df,
                                    holdout,
                                    target,
                                    features,
                                    show_holdout=False,
                                    downsample=False):

    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    scores = {'acc': [], 'f1': []}
    cf_matrix_val = np.zeros((2, 2), dtype=int)  # np.int was removed in NumPy 1.24
    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    for train_index, test_index in tbcv_folds:

        if downsample:
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            df_kfold_index = df_kfold.index.tolist()
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        clf.fit(data_train, target_train)
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)
        f1 = f1_score(target_test, preds)

        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " +
          str(sum(scores['acc']) / len(scores['acc'])))

    if show_holdout:

        # Test model trained on last three years on holdout data

        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        clf.fit(data_train, target_train)
        holdout_preds = clf.predict(X_holdout)
        cf_matrix = confusion_matrix(y_holdout, holdout_preds)

        print("Holdout Score: " + str(clf.score(X_holdout, y_holdout)))
        print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: TPOT',
                              figsize=(10, 10))
    else:
        #Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: TPOT',
                              figsize=(10, 10))

    return scores, clf, cf_matrix_val
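
Because a fresh TPOTClassifier is created inside the loop, the pipeline search restarts from scratch in every fold, with max_time_mins=5 capping each search. After the final fit, the best pipeline TPOT found can be written out as a standalone scikit-learn script (export is part of TPOT's public API and is valid once fit has completed):

# Write the winning pipeline out as plain scikit-learn code.
clf.export('best_pipeline.py')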
Example #7
import datetime

import numpy as np
import torch
from sklearn.metrics import confusion_matrix

def model_training_out_of_time_pretrained(df,
                                          holdout,
                                          target,
                                          features,
                                          show_holdout=False,
                                          downsample=False):
    """
    Title: Learning Word Embeddings from 10-K Filings for Financial NLP Tasks
    Author: Saurabh Sehrawat
    Date: 2019
    Code version: 1.0
    Availability: https://github.com/ssehrawat/10K-word-embeddings
    """

    # main_dir (the project data root) is assumed to be defined at module level.
    embed = torch.load(main_dir +
                       'data/10K-word-embeddings/10k_word_embeddings.tar')
    vocab_to_int = torch.load(main_dir +
                              'data/10K-word-embeddings/vocab_to_int.tar')

    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    algo_name = 'CNN'
    print('### ' + algo_name + ' ###')

    scores = dict()
    cf_matrix_val = np.zeros((2, 2), dtype=int)  # np.int was removed in NumPy 1.24
    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)
    for k_index, (train_index, test_index) in enumerate(tbcv_folds):

        if downsample:
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            df_kfold_index = df_kfold.index.tolist()
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" %
              (k_index + 1, k_folds))
        print("=========================================")

        x_train, y_train, x_val, y_val, embedding_layer, MAX_SEQUENCE_LENGTH = create_embedding_layer(
            data_train,
            target_train,
            data_test,
            target_test,
            embed,
            vocab_to_int,
            trainable=False)
        history, model = train_model_keras_CNN(x_train, y_train, x_val, y_val,
                                               embedding_layer,
                                               MAX_SEQUENCE_LENGTH)
        print(history.history)
        scores[k_index] = history.history

        preds_y = model.predict(x_val)
        preds_y = np.rint(preds_y)

        preds_y = preds_y.argmax(axis=-1)
        y_val = y_val.argmax(axis=-1)
        cf_matrix_val += confusion_matrix(y_val, preds_y)

    if show_holdout:
        # Test model trained on last three years on holdout data

        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        x_train, y_train, x_val, y_val, embedding_layer, MAX_SEQUENCE_LENGTH = create_embedding_layer(
            data_train,
            target_train,
            X_holdout,
            y_holdout,
            embed,
            vocab_to_int,
            trainable=False)
        history, model = train_model_keras_CNN(x_train, y_train, x_val, y_val,
                                               embedding_layer,
                                               MAX_SEQUENCE_LENGTH)

        preds_y = model.predict(x_val)
        preds_y = np.rint(preds_y)

        preds_y = preds_y.argmax(axis=-1)
        y_val = y_val.argmax(axis=-1)
        cf_matrix = confusion_matrix(y_val, preds_y)

        scores['holdout'] = history.history

        #print("Holdout Score: " + str(clf.score(x_val, y_val)))
        #print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + algo_name,
                              figsize=(10, 10))
    else:

        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + algo_name,
                              figsize=(10, 10))

    return scores, cf_matrix_val
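
Here scores maps each fold index (plus the optional 'holdout' key) to a Keras history.history dict rather than to scalar metrics. A sketch of reducing it to a mean validation accuracy comparable with the other examples, assuming the model in the unshown train_model_keras_CNN was compiled with an 'accuracy' metric (so the 'val_accuracy' key exists):

import numpy as np

fold_keys = [k for k in scores if k != 'holdout']
# Last-epoch validation accuracy per fold, averaged across folds.
val_acc = [scores[k]['val_accuracy'][-1] for k in fold_keys]
print('Cross Validation Score: ' + str(np.mean(val_acc)))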