def model_training_out_of_time(df, holdout, target, features, algo, standard=True, show_holdout=False, downsample=False):
    """Train `algo` with out-of-time (time-based) cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain `features`, `target` and a
        'sec_filing_date' column used by the time-based splitter.
    holdout : pandas.DataFrame
        Out-of-sample data, evaluated only when `show_holdout` is True.
    target : str
        Name of the boolean target column.
    features : list of str
        Feature columns; must include 'sec_filing_date' (dropped before fit).
    algo : sklearn-style estimator
        Classifier to train.
    standard : bool
        If True, wrap `algo` in a StandardScaler pipeline.
    show_holdout : bool
        If True, refit on the last three folds' test years and score the
        holdout set; otherwise plot the cross-validation confusion matrix.
    downsample : bool
        If True, restrict each fold's training rows to a majority-class
        downsampled subset.

    Returns
    -------
    tuple
        (scores, clf, cf_matrix_val): per-fold 'acc'/'f1' lists, the
        (last-)fitted classifier, and the summed cross-val confusion matrix.
    """
    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    if standard:
        clf = make_pipeline(StandardScaler(), algo)
        # Second pipeline step is the estimator itself.
        estimator = clf.steps[1][1]
    else:
        clf = algo
        # BUGFIX: the original read clf.steps[1][1] unconditionally, which
        # raises AttributeError when `standard` is False (bare estimator).
        estimator = clf

    # Derive a readable name from the estimator repr, e.g.
    # "LogisticRegression(...)" -> "Logistic Regression".
    alog_name = str(estimator).split('(')[0]
    alog_name = " ".join(re.findall('[A-Z][^A-Z]*', alog_name))
    if alog_name == 'S V C':
        alog_name = 'SVM Classifier'
    print('### ' + alog_name + ' ###')

    scores = {'acc': [], 'f1': []}
    # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement.
    cf_matrix_val = np.zeros((2, 2), dtype=int)

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')

    for train_index, test_index in tbcv_folds:
        if downsample:
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            # Set gives O(1) membership tests (was an O(n) list scan per idx).
            df_kfold_index = set(df_kfold.index.tolist())
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]
        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]
        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)
        f1 = f1_score(target_test, preds)
        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " + str(sum(scores['acc']) / len(scores['acc'])))

    if show_holdout:
        # Test model trained on last three years on holdout data
        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        clf.fit(data_train, target_train.values.ravel())
        holdout_preds = clf.predict(X_holdout)
        cf_matrix = confusion_matrix(y_holdout, holdout_preds)
        print("Holdout Score: " + str(clf.score(X_holdout, y_holdout)))
        print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))
    else:
        # Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))
    return scores, clf, cf_matrix_val
def _load_or_build_topics(cache_path, data, lda_model, label):
    """Return cached topic-probability features, computing and caching on miss.

    `label` is 'Train' or 'Test' and only affects the progress message.
    """
    # BUGFIX: the original used bare `except:` (catches SystemExit /
    # KeyboardInterrupt too) and leaked file handles from open().
    try:
        with open(cache_path, "rb") as fh:
            return pickle.load(fh)
    except (OSError, pickle.PickleError, EOFError):
        # Cache miss or unreadable cache: recompute and persist.
        print("Prepare " + label + " data")
        data = get_topic_proba(data, lda_model)
        with open(cache_path, "wb") as fh:
            pickle.dump(data, fh)
        return data


def LDA_model_out_of_time_tpot(df, features, target, dest_all_model, downsample=False):
    """Out-of-time cross-validation of a TPOT classifier on LDA topic features.

    For each time-based fold, loads the fold's pre-trained LdaMulticore model
    (path taken from `dest_all_model[str(fold_number)]`), converts train/test
    text features to topic probabilities (cached on disk under
    `main_dir + 'data/topic_predictions/'`), then fits a fresh TPOTClassifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with `features`, `target` and a 'sec_filing_date' column.
    features, target : list of str, str
        Feature columns and boolean target column.
    dest_all_model : dict
        Maps str(fold_number) -> (train_model_path, val_model_path).
    downsample : bool
        Selects the '_downsample'-suffixed feature caches.

    Returns
    -------
    tuple
        (scores, clf, cf_matrix_val): per-fold 'acc'/'f1' lists, the last
        fitted TPOT classifier, and the summed confusion matrix.
    """
    X = df[features]
    y = df[target].astype('bool')
    scores = {'acc': [], 'f1': []}
    # BUGFIX: np.int was removed in NumPy 1.24; use the builtin int.
    cf_matrix_val = np.zeros((2, 2), dtype=int)

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)

    for k_index, (train_index, test_index) in enumerate(tbcv_folds):
        dest_train, dest_val = dest_all_model[str(k_index + 1)]
        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]
        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" % (k_index + 1, k_folds))
        print("=========================================")

        lda_model_train = LdaMulticore.load(lda_data_dir + dest_train)

        # Cache file names: '<fold>[_downsample]_data_{train,test}.list'.
        prefix = main_dir + 'data/topic_predictions/' + str(k_index + 1)
        infix = '_downsample' if downsample else ''
        data_train = _load_or_build_topics(prefix + infix + '_data_train.list',
                                           data_train, lda_model_train, 'Train')
        data_test = _load_or_build_topics(prefix + infix + '_data_test.list',
                                          data_test, lda_model_train, 'Test')

        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)
        f1 = f1_score(target_test, preds)
        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " + str(sum(scores['acc']) / len(scores['acc'])))

    # Visualize confusion matrix for cross-val data
    labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    categories = ['No Downgrade', 'Downgrade']
    make_confusion_matrix(cf_matrix_val,
                          group_names=labels,
                          categories=categories,
                          cbar=False,
                          title='Confusion Matrix: TPOT',
                          figsize=(10, 10))
    return scores, clf, cf_matrix_val
def model_training_out_of_time_tpot(df, holdout, target, features, show_holdout=False, downsample=False):
    """Out-of-time cross-validation using a fresh TPOTClassifier per fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with `features`, `target` and 'sec_filing_date'.
    holdout : pandas.DataFrame
        Out-of-sample data, evaluated only when `show_holdout` is True.
    target : str
        Boolean target column name.
    features : list of str
        Feature columns; must include 'sec_filing_date' (dropped before fit).
    show_holdout : bool
        If True, refit on the last three folds' test years and score the
        holdout set; otherwise plot the cross-validation confusion matrix.
    downsample : bool
        Restrict each fold's training rows to a majority-class downsample.

    Returns
    -------
    tuple
        (scores, clf, cf_matrix_val): per-fold 'acc'/'f1' lists, the last
        fitted TPOT classifier, and the summed cross-val confusion matrix.
    """
    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    scores = {'acc': [], 'f1': []}
    # BUGFIX: np.int was removed in NumPy 1.24; use the builtin int.
    cf_matrix_val = np.zeros((2, 2), dtype=int)

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')

    for train_index, test_index in tbcv_folds:
        if downsample:
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            # Set gives O(1) membership tests (was an O(n) list scan per idx).
            df_kfold_index = set(df_kfold.index.tolist())
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]
        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]
        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        # A fresh TPOT search per fold: generations/population bounded by a
        # 5-minute wall-clock budget.
        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        clf.fit(data_train, target_train)
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)
        f1 = f1_score(target_test, preds)
        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " + str(sum(scores['acc']) / len(scores['acc'])))

    if show_holdout:
        # Test model trained on last three years on holdout data
        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        clf.fit(data_train, target_train)
        holdout_preds = clf.predict(X_holdout)
        cf_matrix = confusion_matrix(y_holdout, holdout_preds)
        print("Holdout Score: " + str(clf.score(X_holdout, y_holdout)))
        print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: TPOT',
                              figsize=(10, 10))
    else:
        # Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: TPOT',
                              figsize=(10, 10))
    return scores, clf, cf_matrix_val
def model_training_out_of_time_pretrained(df, holdout, target, features, show_holdout=False, downsample=False):
    """Out-of-time cross-validation of a Keras CNN on pre-trained 10-K embeddings.

    Title: Learning Word Embeddings from 10-K Filings for Financial NLP Tasks
    Author: Saurabh Sehrawat
    Date: 2019
    Code version: 1.0
    Availability: https://github.com/ssehrawat/10K-word-embeddings

    Loads the pre-trained embedding matrix and vocabulary, then for each
    time-based fold builds an embedding layer (frozen, trainable=False) and
    trains a CNN via `train_model_keras_CNN`.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with `features`, `target` and 'sec_filing_date'.
    holdout : pandas.DataFrame
        Out-of-sample data, evaluated only when `show_holdout` is True.
    target : str
        Boolean target column name.
    features : list of str
        Feature columns; must include 'sec_filing_date' (dropped before fit).
    show_holdout : bool
        If True, also retrain on the last three folds' test years and add the
        holdout history under scores['holdout'].
    downsample : bool
        Restrict each fold's training rows to a majority-class downsample.

    Returns
    -------
    tuple
        (scores, cf_matrix_val): Keras history dicts keyed by fold index
        (plus 'holdout' when requested) and the summed confusion matrix.
    """
    embed = torch.load(main_dir + 'data/10K-word-embeddings/10k_word_embeddings.tar')
    vocab_to_int = torch.load(main_dir + 'data/10K-word-embeddings/vocab_to_int.tar')

    X = df[features]
    y = df[target].astype('bool')
    X_holdout = holdout[features].drop('sec_filing_date', axis=1)
    y_holdout = holdout[target].astype('bool')

    alog_name = 'CNN'
    print('### ' + alog_name + ' ###')

    scores = dict()
    # BUGFIX: np.int was removed in NumPy 1.24; use the builtin int.
    cf_matrix_val = np.zeros((2, 2), dtype=int)

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)

    for k_index, (train_index, test_index) in enumerate(tbcv_folds):
        if downsample:
            df_kfold = down_sample_majority_class(df, 'rating_downgrade')
            # Set gives O(1) membership tests (was an O(n) list scan per idx).
            df_kfold_index = set(df_kfold.index.tolist())
            train_index = [
                idx for idx in list(train_index) if idx in df_kfold_index
            ]
        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]
        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" % (k_index + 1, k_folds))
        print("=========================================")

        x_train, y_train, x_val, y_val, embedding_layer, MAX_SEQUENCE_LENGTH = create_embedding_layer(
            data_train, target_train, data_test, target_test, embed,
            vocab_to_int, trainable=False)
        history, model = train_model_keras_CNN(x_train, y_train, x_val, y_val,
                                               embedding_layer,
                                               MAX_SEQUENCE_LENGTH)
        print(history.history)
        scores[k_index] = history.history

        # Round sigmoid outputs, then reduce one-hot rows to class indices.
        preds_y = model.predict(x_val)
        preds_y = np.rint(preds_y)
        preds_y = preds_y.argmax(axis=-1)
        y_val = y_val.argmax(axis=-1)
        cf_matrix_val += confusion_matrix(y_val, preds_y)

    if show_holdout:
        # Test model trained on last three years on holdout data
        frames = [test_index for train_index, test_index in tbcv_folds[-3:]]
        frames = [item for sublist in frames for item in sublist]
        data_train = X.loc[frames].drop('sec_filing_date', axis=1)
        target_train = y.loc[frames]
        x_train, y_train, x_val, y_val, embedding_layer, MAX_SEQUENCE_LENGTH = create_embedding_layer(
            data_train, target_train, X_holdout, y_holdout, embed,
            vocab_to_int, trainable=False)
        history, model = train_model_keras_CNN(x_train, y_train, x_val, y_val,
                                               embedding_layer,
                                               MAX_SEQUENCE_LENGTH)
        preds_y = model.predict(x_val)
        preds_y = np.rint(preds_y)
        preds_y = preds_y.argmax(axis=-1)
        y_val = y_val.argmax(axis=-1)
        cf_matrix = confusion_matrix(y_val, preds_y)
        scores['holdout'] = history.history
        #print("Holdout Score: " + str(clf.score(x_val, y_val)))
        #print('\n')
        # Visualize confusion matrix for holdout data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))
    else:
        # Visualize confusion matrix for cross-val data
        labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
        categories = ['No Downgrade', 'Downgrade']
        make_confusion_matrix(cf_matrix_val,
                              group_names=labels,
                              categories=categories,
                              cbar=False,
                              title='Confusion Matrix: ' + alog_name,
                              figsize=(10, 10))
    return scores, cf_matrix_val