def _get_train_test_and_embedding(train_csv, test_csv, sentence_length, embedding_file=None):
    UNKNOWN_WORD = "_UNK_"
    END_WORD = "_END_"
    NAN_WORD = "_NAN_"
    CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    train_data = pd.read_csv(train_csv)
    test_data = pd.read_csv(test_csv)

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(embedding_file)
    embedding_size = len(embedding_list[0])

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)

    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict, sentence_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict, sentence_length)

    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    return X_train, y_train, X_test, embedding_matrix
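# A hedged usage sketch for the helper above. The file names "train.csv",
# "test.csv" and "crawl-300d-2M.vec" are placeholders rather than files this
# repository ships, and tokenize_sentences / read_embedding_list /
# clear_embedding_list / convert_tokens_to_ids are assumed to be available
# from the project's preprocessing module.
#
#   X_train, y_train, X_test, embedding_matrix = _get_train_test_and_embedding(
#       "train.csv", "test.csv", sentence_length=500,
#       embedding_file="crawl-300d-2M.vec")
#   print(X_train.shape, y_train.shape, X_test.shape, embedding_matrix.shape)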
def main():
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values

    # This variant treats the labels as a single, mutually exclusive class per
    # comment (predictions are reduced with argmax below), so convert the
    # indicator matrix to integer class ids before one-hot encoding.
    n_classes = len(CLASSES)
    y_train = keras.utils.to_categorical(y_train.argmax(axis=-1), num_classes=n_classes)

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(args.embedding_path)
    embedding_size = len(embedding_list[0])

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)

    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict, args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict, args.sentences_length)

    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    get_model_func = lambda: get_model(
        embedding_matrix,
        args.sentences_length,
        args.dropout_rate,
        args.recurrent_units,
        args.dense_size)

    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size, get_model_func)

    if not os.path.exists(args.result_path):
        os.mkdir(args.result_path)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(args.result_path, "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())

        test_predicts_path = os.path.join(args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    # Blend the folds with a geometric mean of the predicted probabilities.
    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict
    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    #test_ids = test_data["id"].values
    #test_ids = test_ids.reshape((len(test_ids), 1))

    y_pred = test_predicts.argmax(axis=-1)

    #test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    #test_predicts["id"] = test_ids
    #test_predicts = test_predicts[["id"] + CLASSES]

    test_data['emotion_pred'] = y_pred

    submit_path = os.path.join(args.result_path, "submit")
    test_data.to_csv(submit_path, index=False)
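# A small self-contained helper mirroring the fold-blending arithmetic used
# above: the running product followed by the 1/n power is simply a per-class
# geometric mean of the fold probabilities. This is an illustrative sketch,
# not a function the rest of the repository calls.
def _geometric_mean_blend(prediction_list):
    """Blend a list of (n_samples, n_classes) probability arrays by geometric mean."""
    blended = np.ones(prediction_list[0].shape)
    for fold_predict in prediction_list:
        blended *= fold_predict
    return blended ** (1. / len(prediction_list))

# Example with toy numbers:
#   _geometric_mean_blend([np.array([[0.9, 0.1]]), np.array([[0.4, 0.2]])])
#   -> array([[0.6, 0.14142136]])   # sqrt(0.9*0.4), sqrt(0.1*0.2)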
def main():
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)

    # parser.add_argument("train_file_path")
    # parser.add_argument("test_file_path")
    # parser.add_argument("embedding_path")
    # parser.add_argument("--result-path", default="toxic_results")
    # parser.add_argument("--batch-size", type=int, default=32)
    # parser.add_argument("--sentences-length", type=int, default=200)
    # parser.add_argument("--recurrent-units", type=int, default=80)
    # parser.add_argument("--dropout-rate", type=float, default=0.3)
    # parser.add_argument("--dense-size", type=int, default=32)
    # parser.add_argument("--fold-count", type=int, default=10)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(args.embedding_path)
    embedding_size = len(embedding_list[0])

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)

    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict, args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict, args.sentences_length)

    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    get_model_func = lambda: get_model(
        embedding_matrix,
        args.sentences_length,
        args.dropout_rate,
        args.recurrent_units,
        args.dense_size)

    import gc
    del train_data, test_data, list_sentences_train, list_sentences_test
    del tokenized_sentences_train, tokenized_sentences_test, words_dict
    del embedding_list, embedding_word_dict
    del train_list_of_token_ids, test_list_of_token_ids
    gc.collect()

    print("Starting to train models...")
    models = train_folds(X_train, y_train, X_test, args.fold_count, args.batch_size, get_model_func)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(clipvalue=1, clipnorm=1),
              metrics=['accuracy'])

comment_text = "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...f****n white boys get things right next time."

list_texts_to_predict = []
list_texts_to_predict.append(comment_text)

tokenized_sentences_test, words_dict = tokenize_sentences(list_texts_to_predict, words_dict)

id_to_word = dict((id, word) for word, id in words_dict.items())
test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test, id_to_word, embedding_word_dict, sequence_length)

X_test = np.array(test_list_of_token_ids)

test_predicts_list = []
for i in range(0, 10):
    print("model{0}_weights.h5".format(i))
    model.load_weights("model{0}_weights.h5".format(i))
    test_predicts = model.predict(X_test, batch_size=1)
    print(test_predicts)
    test_predicts_list.append(test_predicts)
print(test_predicts_list)

test_predicts = np.ones(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts *= fold_predict
def main():
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    # Identify language
    #train_data['language'] = train_data['comment_text'].apply(detect_language)
    #test_data['language'] = test_data['comment_text'].apply(detect_language)

    # Translate the non-English comments to English.
    #train_data['comment_text'] = train_data.apply(lambda x: translate(x.comment_text, x.language), axis=1)
    #test_data['comment_text'] = test_data.apply(lambda x: translate(x.comment_text, x.language), axis=1)
    #train_data.to_csv("train_data_translated.csv")
    #test_data.to_csv("test_data_translated.csv")

    #train_data['comment_text'] = train_data.apply(lambda x: clean(x.comment_text), axis=1)
    #test_data['comment_text'] = test_data.apply(lambda x: clean(x.comment_text), axis=1)
    #train_data.to_csv("train_data_cleaned_after_translate.csv")
    #test_data.to_csv("test_data_cleaned_after_translate.csv")

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(args.embedding_path)
    embedding_size = len(embedding_list[0])

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)

    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)

    # Create the result directory before the first save, otherwise np.save fails.
    if not os.path.exists(args.result_path):
        os.mkdir(args.result_path)

    embedding_matrix_path = os.path.join(args.result_path, "embedding_matrix.npy")
    np.save(embedding_matrix_path, embedding_matrix)

    words_dict_path = os.path.join(args.result_path, "words_dict.npy")
    np.save(words_dict_path, words_dict)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict, args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict, args.sentences_length)

    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    print(embedding_matrix.shape)
    print(embedding_matrix.shape[0])
    print(embedding_matrix.shape[1])

    get_model_func = lambda: get_model(
        embedding_matrix,
        args.sentences_length,
        args.dropout_rate,
        args.recurrent_units,
        args.dense_size)

    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size, get_model_func)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(args.result_path, "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())
        model.save_weights("model{0}_weights.h5".format(fold_id))

        test_predicts_path = os.path.join(args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict
    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_data["id"].values
    test_ids = test_ids.reshape((len(test_ids), 1))

    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    submit_path = os.path.join(args.result_path, "submit")
    test_predicts.to_csv(submit_path, index=False)

    print("Predicting Discussion posts...")
    posts = pd.read_csv("posts_cleaned.csv")
    posts = posts.dropna()
    discussion_posts = posts['MSG_TEXT'].tolist()

    tokenized_discussion_posts, words_dict = tokenize_sentences(discussion_posts, words_dict)
    #id_to_word = dict((id, word) for word, id in words_dict.items())
    discussion_list_of_token_ids = convert_tokens_to_ids(
        tokenized_discussion_posts, id_to_word, embedding_word_dict, args.sentences_length)
    X_test = np.array(discussion_list_of_token_ids)

    discussion_predict_list = []
    for fold_id, model in enumerate(models):
        discussion_predicts = model.predict(X_test, batch_size=args.batch_size)
        discussion_predict_list.append(discussion_predicts)

    discussion_predicts = np.ones(discussion_predict_list[0].shape)
    for fold_predict in discussion_predict_list:
        discussion_predicts *= fold_predict
    discussion_predicts **= (1. / len(discussion_predict_list))
    discussion_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    discussion_predicts = pd.DataFrame(data=discussion_predicts, columns=CLASSES)
    discussion_predicts['MSG_TEXT'] = discussion_posts
    discussion_predicts = discussion_predicts[["MSG_TEXT"] + CLASSES]
    discussion_predicts_path = os.path.join(args.result_path, "discussion_predicts.csv")
    discussion_predicts.to_csv(discussion_predicts_path, index=False)
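# A hedged sketch of how the per-fold weights saved above could be restored
# later for inference without retraining. It assumes get_model() rebuilds the
# same architecture used during training and that the .npy files were written
# with np.save(model.get_weights()) as in the loop above; allow_pickle=True is
# needed because the weight list is stored as a pickled object array. This
# helper is illustrative and is not called elsewhere in the repository.
def _restore_fold_and_predict(fold_id, result_path, embedding_matrix, sentences_length,
                              dropout_rate, recurrent_units, dense_size, X, batch_size):
    """Rebuild one fold's model from its saved weight list and predict on X."""
    model = get_model(embedding_matrix, sentences_length, dropout_rate,
                      recurrent_units, dense_size)
    weights = np.load(os.path.join(result_path, "model{0}_weights.npy".format(fold_id)),
                      allow_pickle=True)
    model.set_weights(list(weights))
    return model.predict(X, batch_size=batch_size)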
def main():
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument("--modelname-prefix", type=str, default="")
    parser.add_argument("--cv", type=str, default="True")
    parser.add_argument("--use-roc", type=str, default="False")

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print('Input params')
    print(args)

    start = time.time()

    print('#' * 50)
    print("Loading data...")
    print('#' * 50)

    # Create the result directory up front; the preprocessing cache below is
    # written into it.
    if not os.path.exists(args.result_path):
        os.mkdir(args.result_path)

    tokenized_train_path = os.path.join(args.result_path, 'tokenized_sentences_train.pkl')
    tokenized_test_path = os.path.join(args.result_path, 'tokenized_sentences_test.pkl')
    words_dict_path = os.path.join(args.result_path, 'words_dict.pkl')

    if os.path.exists(tokenized_train_path) and os.path.exists(tokenized_test_path) \
            and os.path.exists(words_dict_path):
        print('Preprocessed files found. Reading preprocessed files')
        train_data = pd.read_csv(args.train_file_path)
        test_data = pd.read_csv(args.test_file_path)
        y_train = train_data[CLASSES].values

        with open(tokenized_train_path, 'rb') as f:
            tokenized_sentences_train = pickle.load(f)
        with open(tokenized_test_path, 'rb') as f:
            tokenized_sentences_test = pickle.load(f)
        with open(words_dict_path, 'rb') as f:
            words_dict = pickle.load(f)
    else:
        print('Preprocessed files not found.')
        train_data = pd.read_csv(args.train_file_path)
        test_data = pd.read_csv(args.test_file_path)

        list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
        list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
        y_train = train_data[CLASSES].values

        print('#' * 50)
        print("Tokenizing sentences in train set...")
        print('#' * 50)
        tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})

        print('#' * 50)
        print("Tokenizing sentences in test set...")
        print('#' * 50)
        tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

        print('Saving preprocessed files...')
        with open(tokenized_train_path, 'wb') as f:
            pickle.dump(tokenized_sentences_train, f)
        with open(tokenized_test_path, 'wb') as f:
            pickle.dump(tokenized_sentences_test, f)
        with open(words_dict_path, 'wb') as f:
            pickle.dump(words_dict, f)

    print('total words', len(words_dict))
    words_dict[UNKNOWN_WORD] = len(words_dict)

    print('#' * 50)
    print("Loading embeddings...")
    print('#' * 50)
    if 'glove' in args.embedding_path:
        print('Reading Glove embedding')
        embedding_list, embedding_word_dict = read_embedding_list_glove(args.embedding_path)
    else:
        print('Reading Fasttext embedding')
        embedding_list, embedding_word_dict = read_embedding_list(args.embedding_path)
    embedding_size = len(embedding_list[0])
    print('Embedding size', embedding_size)

    print('#' * 50)
    print("Preparing data...")
    print('#' * 50)
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)

    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)
    print('Embedding matrix shape:', embedding_matrix.shape)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict, args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict, args.sentences_length)

    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    # GRU cross validation
    # get_model_func = lambda: get_model(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    # GRU maxpool, avgpool
    # get_model_func = lambda: get_model_pool(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    # deepmoji style
    # get_model_func = lambda: get_model_deepmoji_style(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    # GRU maxpool, avgpool validation
    # get_model_func = lambda: get_gru_model(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    # GRU maxpool, avgpool + cnn validation
    # get_model_func = lambda: get_model_pool_gru_cnn(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    # dpcnn validation
    get_model_func = lambda: get_dpcnn_model(
        embedding_matrix,
        args.sentences_length,
        args.dropout_rate,
        args.dense_size)

    # lstm with attention cross val
    # get_model_func = lambda: get_model_att_lstm(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    # capsule net
    # get_model_func = lambda: get_capsnet_model(
    #     embedding_matrix,
    #     args.sentences_length,
    #     args.dropout_rate,
    #     args.recurrent_units,
    #     args.dense_size)

    print('#' * 50)
    print("Starting to train models...")
    print('#' * 50)
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func, args.cv, args.use_roc)

    print('#' * 50)
    print("Predicting results...")
    print('#' * 50)
    if args.cv == "True":
        test_predicts_list = []
        for fold_id, model in enumerate(models):
            model_path = os.path.join(
                args.result_path,
                "{0}_model{1}_weights.npy".format(args.modelname_prefix, fold_id))
            np.save(model_path, model.get_weights())

            test_predicts_path = os.path.join(
                args.result_path,
                "{0}_test_predicts{1}.npy".format(args.modelname_prefix, fold_id))
            test_predicts = model.predict(X_test, batch_size=args.batch_size * 2)
            test_predicts_list.append(test_predicts)
            np.save(test_predicts_path, test_predicts)

        test_predicts = np.ones(test_predicts_list[0].shape)
        for fold_predict in test_predicts_list:
            test_predicts *= fold_predict
        test_predicts **= (1. / len(test_predicts_list))
        # test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

        test_ids = test_data["id"].values
        test_ids = test_ids.reshape((len(test_ids), 1))

        test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
        test_predicts["id"] = test_ids
        test_predicts = test_predicts[["id"] + CLASSES]
        submit_path = os.path.join(args.result_path, "{}_submit".format(args.modelname_prefix))
        test_predicts.to_csv(submit_path, index=False)

        print('#' * 50)
        print('Prediction Completed...')
        print('#' * 50)
        total_time = time.time() - start
        mins, sec = divmod(total_time, 60)
        hrs, mins = divmod(mins, 60)
        print('Total time taken : {:.0f}h {:.0f}m {:.0f}s'.format(hrs, mins, sec))
    else:
        print('No Cross Validation')
        test_predicts = models.predict(X_test, batch_size=args.batch_size * 2)

        test_ids = test_data["id"].values
        test_ids = test_ids.reshape((len(test_ids), 1))

        test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
        test_predicts["id"] = test_ids
        test_predicts = test_predicts[["id"] + CLASSES]
        submit_path = os.path.join(args.result_path,
                                   "{}_submit_nocv".format(args.modelname_prefix))
        test_predicts.to_csv(submit_path, index=False)

        total_time = time.time() - start
        mins, sec = divmod(total_time, 60)
        hrs, mins = divmod(mins, 60)
        print('Total time taken : {:.0f}h {:.0f}m {:.0f}s'.format(hrs, mins, sec))
def main():
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="predict_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    start = time.time()

    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values
    # print("list_sentences_test = ", list_sentences_test)

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {})
    # print("words_dict before = ", words_dict)

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)
    # print("words_dict after = ", words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(args.embedding_path)
    embedding_size = len(embedding_list[0])

    """
    with open('embedding_list.txt', 'wb') as file:
        pickle.dump(embedding_list, file)
    with open('embedding_word_dict.txt', 'wb') as file:
        pickle.dump(embedding_word_dict, file)

    with open('embedding_list.txt', 'rb') as file:
        embedding_list = pickle.load(file)
    with open('embedding_word_dict.txt', 'rb') as file:
        embedding_word_dict = pickle.load(file)

    print(embedding_list)
    print(embedding_word_dict)
    """

    end = time.time()
    print("Time elapsed: " + str(end - start))

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)

    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict, args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict, args.sentences_length)

    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    get_model_func = lambda: get_model(
        embedding_matrix,
        args.sentences_length,
        args.dropout_rate,
        args.recurrent_units,
        args.dense_size)

    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size, get_model_func)

    if not os.path.exists(args.result_path):
        os.mkdir(args.result_path)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(args.result_path, "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())

        test_predicts_path = os.path.join(args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        print("fold_predict = ", fold_predict)
        test_predicts *= fold_predict
    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_data["id"].values
    test_ids = test_ids.reshape((len(test_ids), 1))

    print("test_predicts = ", test_predicts)
    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]

    # Write the blended predictions to a CSV file at "predict_results/finished".
    submit_path = os.path.join(args.result_path, "finished")
    test_predicts.to_csv(submit_path, index=False)