Ejemplo n.º 1
0
def main():
    """Train k-fold GRU models on precomputed arrays and write a submission.

    Command-line arguments:
        train_file_path: kept for command-line compatibility; the training
            inputs are read from the precomputed arrays below, not this CSV.
        test_file_path: CSV whose "id" column labels the submission rows.
        embedding_path: directory holding ``embeddings.npz``, ``train.npz``,
            ``test.npz`` and ``label.npz``; each is read through its
            ``arr_0`` entry.
        --result-path: output directory for weights, per-fold predictions
            and the final ``submit`` file.
        --batch-size, --sentences-length, --recurrent-units, --dropout-rate,
        --dense-size, --fold-count: forwarded to model building/training.

    Raises:
        ValueError: if ``--fold-count`` is not greater than 1.
    """
    parser = argparse.ArgumentParser(
        description=
        "Recurrent neural network for identifying and classifying toxic online comments"
    )

    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    def load_first_array(file_name):
        # np.load on an .npz archive returns an NpzFile that keeps the file
        # open; the context manager closes it once the array is in memory.
        with np.load(os.path.join(args.embedding_path, file_name)) as archive:
            return archive['arr_0']

    print("Loading data...")
    test_data = pd.read_csv(args.test_file_path)

    embedding_matrix = load_first_array('embeddings.npz')
    X_train = load_first_array('train.npz')
    X_test = load_first_array('test.npz')
    y_train = load_first_array('label.npz')

    get_model_func = lambda: get_GRU_GlobalMax_Ave(
        embedding_matrix, args.sentences_length, args.dropout_rate,
        args.recurrent_units, args.dense_size)

    # Create the output directory before training so that an unusable path
    # fails immediately instead of after every fold has been trained.
    # makedirs also handles nested paths, which os.mkdir does not.
    os.makedirs(args.result_path, exist_ok=True)

    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(args.result_path,
                                  "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())

        test_predicts_path = os.path.join(
            args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    # Geometric mean across folds: multiply everything, then take the
    # n-th root. The result is finally raised to the normalisation power.
    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict

    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_data["id"].values
    test_ids = test_ids.reshape((len(test_ids), 1))

    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    submit_path = os.path.join(args.result_path, "submit")
    test_predicts.to_csv(submit_path, index=False)
Ejemplo n.º 2
0
def _parse_bool_flag(value):
    """Turn a command-line token such as "True"/"false"/"1" into a bool.

    ``type=bool`` is unsuitable for argparse because ``bool("False")`` is
    ``True``: every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


def main():
    """Train (or reload) k-fold models and write four blended submissions.

    The per-fold test predictions are combined in four ways, each written
    to its own CSV inside ``--result-path``:
        softmax: weighted by the softmax of the fold validation scores.
        max: predictions of the single best-scoring fold.
        mean: plain arithmetic mean over folds.
        normalized: the softmax blend divided by
            PROBABILITIES_NORMALIZE_COEFFICIENT.

    With ``--test-mode`` a random ``--sample`` fraction of the training rows
    and 10% of the test rows are used. With ``--load-pretrained`` the models
    are rebuilt and their weights read back from ``--result-path`` instead
    of being trained.

    Raises:
        ValueError: if ``--load-pretrained`` is combined with a
            ``--fold-count`` larger than the number of recorded scores.
    """
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")

    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument('--test-mode', type=_parse_bool_flag, default=False)
    parser.add_argument("--embedding-path", default=None)
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.5)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument('--epoch', type=int, default=5)
    parser.add_argument('--load-pretrained', type=_parse_bool_flag, default=False)
    parser.add_argument('--sample', type=float, default=0.1)

    args = parser.parse_args()

    print("Loading data...")

    vocab_size = 50000
    embedding_dim = 64

    X_train, y_train, X_test, embedding_matrix = get_train_test_and_embedding(
        train_csv=args.train_file_path, test_csv=args.test_file_path,
        sequence_length=args.sentences_length, vocab_size=vocab_size,
        embedding_file=args.embedding_path, embedding_dim=embedding_dim)

    test_data = pd.read_csv(args.test_file_path)
    test_ids = test_data["id"].values

    get_model_func = lambda: get_model(
        embedding_matrix,
        args.sentences_length,
        args.dropout_rate,
        args.recurrent_units,
        args.dense_size)

    X_size = X_train.shape[0]
    if args.test_mode:
        print('in test mode!')
        sub_train_indices = np.random.choice(
            X_size, size=int(X_size * args.sample), replace=False)
        X_test_size = X_test.shape[0]
        sub_test_indices = np.random.choice(
            X_test_size, size=int(X_test_size * 0.1), replace=False)
        X_test = X_test[sub_test_indices]
        test_ids = test_ids[sub_test_indices]
    else:
        print(' not in test mode')
        # Sampling every index without replacement shuffles the training set.
        sub_train_indices = np.random.choice(X_size, size=X_size, replace=False)

    X_train = X_train[sub_train_indices]
    y_train = y_train[sub_train_indices]

    print("Starting to train models...")

    if not args.load_pretrained:
        models, scores = train_folds(
            X=X_train, y=y_train,
            epoch=args.epoch, fold_count=args.fold_count,
            batch_size=args.batch_size,
            get_model_func=get_model_func)
    else:
        # Validation scores recorded from an earlier 10-fold run.
        pretrained_scores = [0.98729337078270507, 0.98741052541081709, 0.98875435673905765, 0.98888426426103615, 0.98921313005210798, 0.98900847797211722, 0.98849751432495514, 0.98839252464184923, 0.98844750778401702, 0.98633935039731879]
        if args.fold_count > len(pretrained_scores):
            raise ValueError(
                "only {} pretrained fold scores are recorded, got fold-count {}".format(
                    len(pretrained_scores), args.fold_count))
        models = [get_model_func() for _ in range(args.fold_count)]
        # Keep one score per model so the blend weights line up with the folds.
        scores = pretrained_scores[:args.fold_count]

    print('the fold score is : {}'.format(scores))
    validation_scores = np.mean(scores)

    # makedirs handles nested result paths, which os.mkdir does not.
    os.makedirs(args.result_path, exist_ok=True)

    probabilities = softmax(scores)
    max_probability_index = np.argmax(scores)

    prediction_shape = (X_test.shape[0], len(CLASSES))
    test_predicts_softmax = np.zeros(shape=prediction_shape)
    test_predicts_mean = np.zeros(shape=prediction_shape)
    test_predicts_max = np.zeros(shape=prediction_shape)

    print('predicate test set!')
    for fold_id, model in enumerate(models):
        print('predicate with fold_id {}'.format(fold_id))
        model_path = os.path.join(args.result_path, 'model{0}_weights.npy'.format(fold_id))

        if args.load_pretrained:
            # Read back the file this script writes in the training branch.
            # NOTE(review): allow_pickle is needed if the weight list was
            # stored as an object array; only load files you produced.
            weights = np.load(model_path, allow_pickle=True)
            model.set_weights(weights)
        else:
            np.save(model_path, model.get_weights())

    for fold_id, model in enumerate(models):
        prob = probabilities[fold_id]
        t = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_softmax += prob * t
        test_predicts_mean += t * (1.0 / len(models))
        if fold_id == max_probability_index:
            test_predicts_max = t

    test_predicts_normalized = test_predicts_softmax / PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_ids.reshape((len(test_ids), 1))

    results_with_label = {
        'softmax': test_predicts_softmax,
        'max': test_predicts_max,
        'mean': test_predicts_mean,
        'normalized': test_predicts_normalized
    }

    # The embedding width is taken from file names such as "cbow-64-...";
    # without a path or a match, fall back to the configured dimension.
    embedding_size = embedding_dim
    if args.embedding_path:
        found_sizes = re.findall(r'(?:cbow|skip)-(\d+)-', args.embedding_path)
        if found_sizes:
            embedding_size = found_sizes[0]

    # One timestamp for the whole run keeps the four files grouped together.
    now = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

    for method in results_with_label:
        test_predicts = pd.DataFrame(data=results_with_label[method], columns=CLASSES)
        test_predicts["id"] = test_ids
        test_predicts = test_predicts[["id"] + CLASSES]

        parameters = "{}-emb-{}-batch_size-{}-sen_len-{}-RUNIT-{}-dense_s-{}".format(
            method,
            embedding_size,
            args.batch_size,
            args.sentences_length,
            args.recurrent_units,
            args.dense_size
        )

        submit_path = os.path.join(args.result_path, "{}_{}_submission_lstm_{}.csv".format(parameters, now, validation_scores))
        test_predicts.to_csv(submit_path, index=False)
Ejemplo n.º 3
0
def main():
    """Tokenize the CSV comments, build the embedding matrix and train folds.

    Reads the train/test CSV files and the embedding file named on the
    command line, converts every comment into a sequence of embedding ids,
    frees the intermediate structures and hands the arrays to
    ``train_folds``.

    Raises:
        ValueError: if ``--fold-count`` is not greater than 1.
    """
    import gc

    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and classifying toxic online comments")

    for positional in ("train_file_path", "test_file_path", "embedding_path"):
        parser.add_argument(positional)
    parser.add_argument("--result-path", default="toxic_results")

    # A lighter configuration was tried previously: batch size 32,
    # sentence length 200 and 80 recurrent units.
    typed_options = (
        ("--batch-size", int, 256),
        ("--sentences-length", int, 500),
        ("--recurrent-units", int, 64),
        ("--dropout-rate", float, 0.3),
        ("--dense-size", int, 32),
        ("--fold-count", int, 10),
    )
    for flag, caster, fallback in typed_options:
        parser.add_argument(flag, type=caster, default=fallback)

    args = parser.parse_args()

    if not args.fold_count > 1:
        raise ValueError("fold-count should be more than 1")

    print("Loading data...")
    train_frame = pd.read_csv(args.train_file_path)
    test_frame = pd.read_csv(args.test_file_path)

    train_comments = train_frame["comment_text"].fillna(NAN_WORD).values
    test_comments = test_frame["comment_text"].fillna(NAN_WORD).values
    y_train = train_frame[CLASSES].values

    print("Tokenizing sentences in train set...")
    train_tokens, vocabulary = tokenize_sentences(train_comments, {})

    print("Tokenizing sentences in test set...")
    test_tokens, vocabulary = tokenize_sentences(test_comments, vocabulary)

    vocabulary[UNKNOWN_WORD] = len(vocabulary)

    print("Loading embeddings...")
    vectors, vector_index = read_embedding_list(args.embedding_path)
    embedding_size = len(vectors[0])

    print("Preparing data...")
    vectors, vector_index = clear_embedding_list(vectors, vector_index,
                                                 vocabulary)

    # Two synthetic rows are appended: zeros for unknown words and minus
    # ones for the end-of-sentence marker.
    for special_word, fill_value in ((UNKNOWN_WORD, 0.), (END_WORD, -1.)):
        vector_index[special_word] = len(vector_index)
        vectors.append([fill_value] * embedding_size)

    embedding_matrix = np.array(vectors)

    id_to_word = {index: token for token, index in vocabulary.items()}
    train_ids = convert_tokens_to_ids(
        train_tokens, id_to_word, vector_index, args.sentences_length)
    test_ids = convert_tokens_to_ids(
        test_tokens, id_to_word, vector_index, args.sentences_length)
    X_train = np.array(train_ids)
    X_test = np.array(test_ids)

    def get_model_func():
        return get_model(embedding_matrix, args.sentences_length,
                         args.dropout_rate, args.recurrent_units,
                         args.dense_size)

    # Drop everything that is no longer needed before training starts.
    del train_frame, test_frame, train_comments, test_comments
    del train_tokens, test_tokens, vocabulary
    del vectors, vector_index
    del train_ids, test_ids
    gc.collect()

    print("Starting to train models...")
    trained_models = train_folds(X_train, y_train, X_test, args.fold_count,
                                 args.batch_size, get_model_func)
Ejemplo n.º 4
0
def main():
    """Train k-fold LSTM/GRU models and write test and train predictions.

    Inputs are precomputed arrays stored in the ``embedding_path``
    directory (``embeddings.npz``, ``train.npz``, ``test.npz``,
    ``label.npz``, each read through its ``arr_0`` entry). The train and
    test CSV files supply the "id" columns for the output files.

    Outputs written to ``--result-path``: per-fold weights and test
    predictions, the blended ``submit`` file, and a ``valid`` file holding
    the training-set predictions returned by ``train_folds``.

    Raises:
        ValueError: if ``--fold-count`` is not greater than 1.
    """
    parser = argparse.ArgumentParser(
        description=
        "Recurrent neural network for identifying and classifying toxic online comments"
    )

    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument("--aug-count", type=int, default=1)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    def load_first_array(file_name):
        # np.load on an .npz archive returns an NpzFile that keeps the file
        # open; the context manager closes it once the array is in memory.
        with np.load(os.path.join(args.embedding_path, file_name)) as archive:
            return archive['arr_0']

    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    embedding_matrix = load_first_array('embeddings.npz')
    X_train = load_first_array('train.npz')
    X_test = load_first_array('test.npz')
    y_train = load_first_array('label.npz')

    get_model_func = lambda: get_LSTMGRU_GlobalMaxAve(
        embedding_matrix, args.sentences_length, args.dropout_rate,
        args.recurrent_units, args.dense_size)

    # Create the output directory before training so that an unusable path
    # fails immediately instead of after every fold has been trained.
    # makedirs also handles nested paths, which os.mkdir does not.
    os.makedirs(args.result_path, exist_ok=True)

    print("Starting to train models...")
    models, train_preds = train_folds(X_train,
                                      y_train,
                                      args.fold_count,
                                      args.batch_size,
                                      get_model_func,
                                      aug=args.aug_count)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(args.result_path,
                                  "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())

        test_predicts_path = os.path.join(
            args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    # Geometric mean across folds: multiply everything, then take the
    # n-th root. The result is finally raised to the normalisation power.
    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict

    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_data["id"].values
    test_ids = test_ids.reshape((len(test_ids), 1))

    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    submit_path = os.path.join(args.result_path, "submit")
    test_predicts.to_csv(submit_path, index=False)

    train_ids = train_data["id"].values
    train_ids = train_ids.reshape((len(train_ids), 1))

    train_predicts = pd.DataFrame(data=train_preds, columns=CLASSES)
    train_predicts["id"] = train_ids
    train_predicts = train_predicts[["id"] + CLASSES]
    valid_path = os.path.join(args.result_path, "valid")
    train_predicts.to_csv(valid_path, index=False)
Ejemplo n.º 5
0
def main():
    """Train k-fold models, then score the test set and discussion posts.

    Steps:
        1. Tokenize the train/test CSV comments and build the embedding
           matrix from ``embedding_path``.
        2. Save the embedding matrix and word dictionary to
           ``--result-path``.
        3. Train the folds, save each model's weights and test predictions,
           and write the blended ``submit`` file.
        4. Score the ``MSG_TEXT`` column of ``posts_cleaned.csv`` (read from
           the working directory) with the same models and write
           ``discussion_predicts.csv``.

    Raises:
        ValueError: if ``--fold-count`` is not greater than 1.
    """
    parser = argparse.ArgumentParser(
        description=
        "Recurrent neural network for identifying and classifying toxic online comments"
    )

    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    def blend_fold_predictions(fold_predictions):
        # Geometric mean across folds, then raised to the normalisation
        # power.
        blended = np.ones(fold_predictions[0].shape)
        for fold_predict in fold_predictions:
            blended *= fold_predict
        blended **= (1. / len(fold_predictions))
        blended **= PROBABILITIES_NORMALIZE_COEFFICIENT
        return blended

    # The directory must exist before the first np.save below; creating it
    # only after training made a fresh run crash when saving the embedding
    # matrix. makedirs also handles nested paths.
    os.makedirs(args.result_path, exist_ok=True)

    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values

    y_train = train_data[CLASSES].values

    print("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(
        list_sentences_train, {})

    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(
        list_sentences_test, words_dict)

    words_dict[UNKNOWN_WORD] = len(words_dict)

    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(
        args.embedding_path)
    embedding_size = len(embedding_list[0])

    print("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    # Two synthetic rows: zeros for unknown words, minus ones for the
    # end-of-sentence marker.
    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)

    embedding_matrix_path = os.path.join(args.result_path,
                                         "embedding_matrix.npy")
    np.save(embedding_matrix_path, embedding_matrix)
    words_dict_path = os.path.join(args.result_path, "words_dict.npy")
    np.save(words_dict_path, words_dict)

    id_to_word = {word_id: word for word, word_id in words_dict.items()}
    train_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_train,
                                                    id_to_word,
                                                    embedding_word_dict,
                                                    args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_test,
                                                   id_to_word,
                                                   embedding_word_dict,
                                                   args.sentences_length)
    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)
    print(embedding_matrix.shape)
    print(embedding_matrix.shape[0])
    print(embedding_matrix.shape[1])
    get_model_func = lambda: get_model(embedding_matrix, args.sentences_length,
                                       args.dropout_rate, args.recurrent_units,
                                       args.dense_size)

    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(args.result_path,
                                  "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())
        # NOTE(review): this copy lands in the working directory, not in
        # result_path; left unchanged in case other tooling expects it.
        model.save_weights("model{0}_weights.h5".format(fold_id))

        test_predicts_path = os.path.join(
            args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    test_predicts = blend_fold_predictions(test_predicts_list)

    test_ids = test_data["id"].values
    test_ids = test_ids.reshape((len(test_ids), 1))

    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    submit_path = os.path.join(args.result_path, "submit")
    test_predicts.to_csv(submit_path, index=False)

    print("Predicting Discussion posts...")
    posts = pd.read_csv("posts_cleaned.csv")
    posts = posts.dropna()
    discussion_posts = posts['MSG_TEXT'].tolist()
    tokenized_discussion_posts, words_dict = tokenize_sentences(
        discussion_posts, words_dict)
    # Rebuild the reverse mapping: tokenizing the posts may have added ids
    # to words_dict that the earlier id_to_word does not contain.
    # Assumes tokenize_sentences only adds entries — TODO confirm.
    id_to_word = {word_id: word for word, word_id in words_dict.items()}
    discussion_list_of_token_ids = convert_tokens_to_ids(
        tokenized_discussion_posts, id_to_word, embedding_word_dict,
        args.sentences_length)
    X_discussion = np.array(discussion_list_of_token_ids)
    discussion_predict_list = [
        model.predict(X_discussion, batch_size=args.batch_size)
        for model in models
    ]

    discussion_predicts = blend_fold_predictions(discussion_predict_list)

    discussion_predicts = pd.DataFrame(data=discussion_predicts,
                                       columns=CLASSES)
    discussion_predicts['MSG_TEXT'] = discussion_posts
    discussion_predicts = discussion_predicts[["MSG_TEXT"] + CLASSES]
    discussion_predicts_path = os.path.join(args.result_path,
                                            "discussion_predicts.csv")
    discussion_predicts.to_csv(discussion_predicts_path, index=False)
Ejemplo n.º 6
0
def _print_banner(message):
    """Print ``message`` framed above and below by a 50-character '#' rule."""
    rule = '#' * 50
    print(rule)
    print(message)
    print(rule)


def _print_elapsed(start):
    """Print the wall-clock time elapsed since ``start`` (a ``time.time()`` value)."""
    total_time = time.time() - start
    mins, sec = divmod(total_time, 60)
    hrs, mins = divmod(mins, 60)
    print('Total time taken : {:.0f}h {:.0f}m {:.0f}s'.format(hrs, mins, sec))


def _load_or_tokenize(result_path, train_data, test_data):
    """Return ``(tokenized_train, tokenized_test, words_dict)``.

    The three objects are read from pickle files in ``result_path`` when all
    of them exist; otherwise the ``comment_text`` columns of ``train_data``
    and ``test_data`` are tokenized and the results are pickled for next time.
    """
    train_pkl = os.path.join(result_path, 'tokenized_sentences_train.pkl')
    test_pkl = os.path.join(result_path, 'tokenized_sentences_test.pkl')
    words_pkl = os.path.join(result_path, 'words_dict.pkl')

    # All three files are required: loading with one missing would crash,
    # so a partial cache is treated as no cache and rebuilt.
    if all(os.path.exists(path) for path in (train_pkl, test_pkl, words_pkl)):
        print('Preprocessed files found. Reading preprocess files')
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # point --result-path at a directory whose contents you trust.
        with open(train_pkl, 'rb') as f:
            tokenized_sentences_train = pickle.load(f)
        with open(test_pkl, 'rb') as f:
            tokenized_sentences_test = pickle.load(f)
        with open(words_pkl, 'rb') as f:
            words_dict = pickle.load(f)
        return tokenized_sentences_train, tokenized_sentences_test, words_dict

    print('Preprocessed files not found.')
    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values

    _print_banner("Tokenizing sentences in train set...")
    tokenized_sentences_train, words_dict = tokenize_sentences(
        list_sentences_train, {})

    # The test set extends the dictionary started by the train set.
    _print_banner("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(
        list_sentences_test, words_dict)

    print('Saving preprocess files...')
    with open(train_pkl, 'wb') as f:
        pickle.dump(tokenized_sentences_train, f)
    with open(test_pkl, 'wb') as f:
        pickle.dump(tokenized_sentences_test, f)
    with open(words_pkl, 'wb') as f:
        pickle.dump(words_dict, f)
    return tokenized_sentences_train, tokenized_sentences_test, words_dict


def _write_submission(predictions, test_ids, submit_path):
    """Write ``predictions`` as CSV with an ``id`` column followed by CLASSES."""
    frame = pd.DataFrame(data=predictions, columns=CLASSES)
    frame["id"] = test_ids
    frame = frame[["id"] + CLASSES]
    frame.to_csv(submit_path, index=False)


def main():
    """Train the toxic-comment classifier and write test-set predictions.

    Steps: parse the command line, load (or build and cache) the tokenized
    train/test sentences, load and prune the word embeddings, convert the
    sentences to id sequences, train via ``train_folds`` and write a
    submission file into ``--result-path``.

    With ``--cv True`` (the default) ``train_folds`` is expected to return
    one model per fold; each fold's weights and predictions are saved and
    the submission is the geometric mean of the fold predictions. With any
    other ``--cv`` value the return value is used as a single model.

    Raises:
        ValueError: if ``--fold-count`` is not greater than 1.
    """
    parser = argparse.ArgumentParser(
        description=
        "Recurrent neural network for identifying and classifying toxic online comments"
    )

    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument("--modelname-prefix", type=str, default="")
    # --cv and --use-roc are string flags ("True"/"False"), passed through
    # to train_folds as given.
    parser.add_argument("--cv", type=str, default="True")
    parser.add_argument("--use-roc", type=str, default="False")

    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")
    print('Input params')
    print(args)

    # Create the output directory up front: the preprocessing cache below is
    # written into it long before any model or prediction file.
    os.makedirs(args.result_path, exist_ok=True)

    start = time.time()
    _print_banner("Loading data...")

    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)
    y_train = train_data[CLASSES].values

    (tokenized_sentences_train, tokenized_sentences_test,
     words_dict) = _load_or_tokenize(args.result_path, train_data, test_data)

    print('total words', len(words_dict))
    words_dict[UNKNOWN_WORD] = len(words_dict)

    _print_banner("Loading embeddings...")
    # The embedding file format is chosen from the path name.
    if 'glove' in args.embedding_path:
        print('Reading Glove embedding')
        embedding_list, embedding_word_dict = read_embedding_list_glove(
            args.embedding_path)
    else:
        print('Reading Fasttext embedding')
        embedding_list, embedding_word_dict = read_embedding_list(
            args.embedding_path)

    embedding_size = len(embedding_list[0])
    print('Embedding size', embedding_size)

    _print_banner("Preparing data...")
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)

    # Append two synthetic rows: an all-zero vector for unknown words and an
    # all-minus-one vector for the end-of-sentence marker.
    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)

    embedding_matrix = np.array(embedding_list)
    print('Embedding matrix shape:', embedding_matrix.shape)

    id_to_word = {word_id: word for word, word_id in words_dict.items()}
    train_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_train,
                                                    id_to_word,
                                                    embedding_word_dict,
                                                    args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_test,
                                                   id_to_word,
                                                   embedding_word_dict,
                                                   args.sentences_length)
    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    # Model factory. The DPCNN builder is the active choice. Alternatives
    # previously wired in here were get_model, get_model_pool,
    # get_model_deepmoji_style, get_gru_model, get_model_pool_gru_cnn,
    # get_model_att_lstm and get_capsnet_model, each called as
    #     builder(embedding_matrix, args.sentences_length, args.dropout_rate,
    #             args.recurrent_units, args.dense_size)
    get_model_func = lambda: get_dpcnn_model(
        embedding_matrix, args.sentences_length, args.dropout_rate,
        args.dense_size)

    _print_banner("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func, args.cv, args.use_roc)

    _print_banner("Predicting results...")

    test_ids = test_data["id"].values
    predict_batch_size = args.batch_size * 2

    if args.cv == "True":
        test_predicts_list = []
        for fold_id, model in enumerate(models):
            model_path = os.path.join(
                args.result_path,
                "{0}_model{1}_weights.npy".format(args.modelname_prefix,
                                                  fold_id))
            # NOTE(review): get_weights() presumably returns arrays of
            # differing shapes; recent NumPy releases may refuse to pack such
            # a list into a single array - confirm against the pinned NumPy.
            np.save(model_path, model.get_weights())

            test_predicts_path = os.path.join(
                args.result_path,
                "{0}_test_predicts{1}.npy".format(args.modelname_prefix,
                                                  fold_id))
            fold_predicts = model.predict(X_test,
                                          batch_size=predict_batch_size)
            test_predicts_list.append(fold_predicts)
            np.save(test_predicts_path, fold_predicts)

        # Geometric mean across folds: multiply all fold predictions, then
        # take the n-th root.
        test_predicts = np.ones(test_predicts_list[0].shape)
        for fold_predict in test_predicts_list:
            test_predicts *= fold_predict
        test_predicts **= (1. / len(test_predicts_list))

        submit_path = os.path.join(args.result_path,
                                   "{}_submit".format(args.modelname_prefix))
        _write_submission(test_predicts, test_ids, submit_path)
        _print_banner('Prediction Completed...')
    else:
        print('No Cross Validation')
        # Without cross validation train_folds' result is used directly as
        # one model.
        test_predicts = models.predict(X_test, batch_size=predict_batch_size)
        submit_path = os.path.join(
            args.result_path, "{}_submit_nocv".format(args.modelname_prefix))
        _write_submission(test_predicts, test_ids, submit_path)

    _print_elapsed(start)