import configparser

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Project helpers defined elsewhere in this repo:
# remove_names, entity_recognizing, clean_text (preprocessing),
# pooled_gru (model), get_emb_matrix (embedding loader),
# f1_smart (threshold search).


def run_keras_experiment():
    # Reading file section - this should change
    print('Reading files')
    train_2018 = pd.read_csv("data/german/germeval2018.training.txt",
                             sep='\t',
                             names=['tweet', 'sub_task_1', 'sub_task_2'])
    train_2019 = pd.read_csv("data/german/germeval2019_training_subtask12.txt",
                             sep='\t',
                             names=['tweet', 'sub_task_1', 'sub_task_2'])
    train = pd.concat([train_2018, train_2019])
    test = pd.read_csv("data/german/germeval2018.test_.txt",
                       sep='\t',
                       names=['tweet', 'sub_task_1', 'sub_task_2'])
    test_2019 = pd.read_csv("data/german/germeval2019_Testdata_Subtask12.txt",
                            sep='\t',
                            names=['tweet'])
    print('Completed reading')

    print("Train shape : ", train.shape)
    print("Test shape : ", test.shape)
    print("2019 Test shape :", test_2019.shape)

    # Variables
    TEXT_COLUMN = "tweet"
    LABEL_COLUMN = "sub_task_1"

    configParser = configparser.RawConfigParser()
    configFilePath = "config.txt"
    configParser.read(configFilePath)
    EMBEDDING_FILE = configParser.get('sub_task_1_model-config', 'EMBEDDING_FILE')
    MODEL_PATH = configParser.get('sub_task_1_model-config', 'MODEL_PATH')
    PREDICTION_FILE = configParser.get('sub_task_1_model-config', 'PREDICTION_FILE')

    print(train.head())

    print("Removing usernames")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(remove_names)
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(remove_names)
    print(train.head())

    # print("Identifying names")
    # train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(entity_recognizing)
    # test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(entity_recognizing)
    # print(train.head())

    print("Converting to lower-case")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].str.lower()
    test[TEXT_COLUMN] = test[TEXT_COLUMN].str.lower()
    test_2019[TEXT_COLUMN] = test_2019[TEXT_COLUMN].str.lower()
    print(train.head())

    print("Cleaning punctuation marks")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(clean_text)
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(clean_text)
    test_2019[TEXT_COLUMN] = test_2019[TEXT_COLUMN].apply(clean_text)
    print(train.head())

    # Sequence length: mean document length plus one standard deviation
    train['doc_len'] = train[TEXT_COLUMN].apply(
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train['doc_len'].mean() +
                           train['doc_len'].std()).astype(int)

    embed_size = 300      # size of each word vector
    max_features = None   # number of unique words to use (i.e. rows in the embedding matrix)
    maxlen = max_seq_len  # max number of words per tweet to use

    # Fill up the missing values
    X = train[TEXT_COLUMN].fillna("_na_").values
    X_test = test[TEXT_COLUMN].fillna("_na_").values
    X_test_2019 = test_2019[TEXT_COLUMN].fillna("_na_").values

    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))
    X = tokenizer.texts_to_sequences(X)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test_2019 = tokenizer.texts_to_sequences(X_test_2019)

    # Pad the sequences
    X = pad_sequences(X, maxlen=maxlen)
    X_test = pad_sequences(X_test, maxlen=maxlen)
    X_test_2019 = pad_sequences(X_test_2019, maxlen=maxlen)

    # Encode the target values
    Y = train[LABEL_COLUMN].values
    le = LabelEncoder()
    le.fit(Y)
    encoded_Y = le.transform(Y)

    word_index = tokenizer.word_index
    max_features = len(word_index) + 1

    print('Loading Embeddings')
    embedding_matrix = get_emb_matrix(word_index, max_features, EMBEDDING_FILE)
    print('Finished loading Embeddings')

    print('Start Training')
    kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    bestscore = []
    y_test = np.zeros((X_test.shape[0], ))
    y_test_2019 = np.zeros((X_test_2019.shape[0], ))
    for i, (train_index, valid_index) in enumerate(kfold.split(X, encoded_Y)):
        X_train, X_val = X[train_index], X[valid_index]
        Y_train, Y_val = encoded_Y[train_index], encoded_Y[valid_index]
        filepath = MODEL_PATH
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=2,
                                     save_best_only=True,
                                     mode='min')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.6,
                                      patience=1,
                                      min_lr=0.0001,
                                      verbose=2)
        # Defined but left out of the callback list below
        earlystopping = EarlyStopping(monitor='val_loss',
                                      min_delta=0.0001,
                                      patience=2,
                                      verbose=2,
                                      mode='auto')
        callbacks = [checkpoint, reduce_lr]
        model = pooled_gru(maxlen, max_features, embed_size, embedding_matrix, 1)
        if i == 0:
            model.summary()
        model.fit(X_train,
                  Y_train,
                  batch_size=64,
                  epochs=20,
                  validation_data=(X_val, Y_val),
                  verbose=2,
                  callbacks=callbacks)
        # Restore the best checkpoint before predicting
        model.load_weights(filepath)
        y_pred = model.predict(X_val, batch_size=64, verbose=2)
        # Average the test predictions over the 5 folds
        y_test += np.squeeze(model.predict(X_test, batch_size=64, verbose=2)) / 5
        y_test_2019 += np.squeeze(
            model.predict(X_test_2019, batch_size=64, verbose=2)) / 5
        f1, threshold = f1_smart(np.squeeze(Y_val), np.squeeze(y_pred))
        print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))
        bestscore.append(threshold)
    print('Finished Training')

    # Binarize the averaged probabilities with the mean per-fold threshold
    y_test = y_test.reshape((-1, 1))
    pred_test_y = (y_test > np.mean(bestscore)).astype(int)
    test['predictions'] = le.inverse_transform(pred_test_y.ravel())

    y_test_2019 = y_test_2019.reshape((-1, 1))
    pred_test_y_2019 = (y_test_2019 > np.mean(bestscore)).astype(int)
    # Re-read the raw tweets so the saved file contains the uncleaned text
    test_2019_temp = pd.read_csv(
        "data/german/germeval2019_Testdata_Subtask12.txt",
        sep='\t',
        names=['tweet'])
    test_2019['predictions'] = le.inverse_transform(pred_test_y_2019.ravel())
    test_2019['tweet'] = test_2019_temp['tweet']

    # Save predictions
    file_path = PREDICTION_FILE
    test_2019.to_csv(file_path,
                     sep='\t',
                     encoding='utf-8',
                     header=False,
                     index=False)
    print('Saved Predictions')

    # Post analysis on the 2018 test set (gold labels available)
    tn, fp, fn, tp = confusion_matrix(test[LABEL_COLUMN],
                                      test['predictions']).ravel()
    weighted_f1 = f1_score(test[LABEL_COLUMN],
                           test['predictions'],
                           average='weighted')
    accuracy = accuracy_score(test[LABEL_COLUMN], test['predictions'])
    weighted_recall = recall_score(test[LABEL_COLUMN],
                                   test['predictions'],
                                   average='weighted')
    weighted_precision = precision_score(test[LABEL_COLUMN],
                                         test['predictions'],
                                         average='weighted')

    print("Confusion Matrix (tn, fp, fn, tp) {} {} {} {}".format(tn, fp, fn, tp))
    print("Accuracy ", accuracy)
    print("Weighted F1 ", weighted_f1)
    print("Weighted Recall ", weighted_recall)
    print("Weighted Precision ", weighted_precision)
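

# `f1_smart` is imported from elsewhere in this repo; the sketch below is an
# assumption about its behaviour, not the repo's exact implementation: sweep a
# grid of thresholds over the validation probabilities and return the best F1
# together with the threshold that achieves it. The 0.01 grid step and the
# name `f1_smart_sketch` are illustrative choices.
def f1_smart_sketch(y_true, y_proba):
    """Return (best_f1, best_threshold) via grid search over thresholds."""
    best_f1, best_threshold = 0.0, 0.5
    for threshold in np.arange(0.01, 1.0, 0.01):
        score = f1_score(y_true, (y_proba > threshold).astype(int))
        if score > best_f1:
            best_f1, best_threshold = score, float(threshold)
    return best_f1, best_threshold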
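
# `get_emb_matrix` is likewise imported from elsewhere; a minimal sketch,
# assuming the embedding file is a plain-text word2vec/fastText file with one
# "word v1 v2 ... v300" entry per line. Words absent from the pretrained
# vocabulary keep a zero row, which the model sees as an untrained embedding.
def get_emb_matrix_sketch(word_index, max_features, embedding_file,
                          embed_size=300):
    """Build a (max_features, embed_size) matrix aligned with the tokenizer."""
    embedding_matrix = np.zeros((max_features, embed_size))
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vector = parts[0], parts[1:]
            # Skip header lines and words outside the tokenizer vocabulary
            if word in word_index and len(vector) == embed_size:
                embedding_matrix[word_index[word]] = np.asarray(
                    vector, dtype='float32')
    return embedding_matrix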