def GRNN_preprocess(csv_folder, output_folder, save_word2vec_data=True):
    """Run the GRNN preprocessing pipeline over the train, val and test splits.

    Every split is loaded through ``prep_data``. A word2vec model is then
    trained with ``train_word2vec_model``, the embedding and word map are
    loaded back with ``load_word2vec_embeddings_grnn``, and each split is
    encoded with ``encode_data``.

    Args:
        csv_folder: Forwarded to ``prep_data`` for each split.
        output_folder: Folder handed to the word2vec helpers and to
            ``encode_data``; the optional word2vec dump is written here too.
        save_word2vec_data: When true, the train texts are stored with
            ``torch.save`` as ``word2vec_data_grnn.pth.tar`` in
            ``output_folder`` before the other splits are read.

    Returns:
        The tuple ``(embedding, word_map, train_size, val_size, test_size)``.
    """
    # --- load the three splits (train first, so its texts can be dumped) ---
    print('Reading and preprocessing train_data...\n')
    train_texts, train_labels, train_size = prep_data(csv_folder, 'train')

    if save_word2vec_data:
        dump_path = os.path.join(output_folder, 'word2vec_data_grnn.pth.tar')
        torch.save(train_texts, dump_path)

    print('Reading and preprocessing val_data...\n')
    val_texts, val_labels, val_size = prep_data(csv_folder, 'val')

    print('Reading and preprocessing test_data...\n')
    test_texts, test_labels, test_size = prep_data(csv_folder, 'test')

    # --- word2vec training, then vocabulary / embedding lookup ---
    print('\n Train word2vec model...')
    train_word2vec_model(data_folder=output_folder, model='grnn')
    print('\nEND TRAINING WORD2VEC MODEL\n')

    embedding, word_map = load_word2vec_embeddings_grnn(output_folder)

    # --- encode every split with the shared word map ---
    loaded_splits = (
        ('train', train_texts, train_labels),
        ('val', val_texts, val_labels),
        ('test', test_texts, test_labels),
    )
    for split_name, split_texts, split_labels in loaded_splits:
        encode_data(split_name, split_texts, word_map, split_labels,
                    output_folder)

    print('END PREPROCESSING!\n')
    return embedding, word_map, train_size, val_size, test_size
def HAN_preprocess(csv_folder,
                   output_folder,
                   sentence_limit,
                   word_limit,
                   min_word_count=5,
                   save_word2vec_data=True):
    """Run the HAN preprocessing pipeline and train the word2vec model.

    The train split is read with ``read_csv`` and used to build the word map,
    which is written to ``word_map.json`` in ``output_folder``. The train,
    val and test splits are each passed to ``split_preprocessing``, and
    finally ``train_word2vec_model`` is invoked on ``output_folder``.

    Args:
        csv_folder: Forwarded to ``read_csv`` for each split.
        output_folder: Destination for ``word_map.json``, the optional
            word2vec dump, and whatever ``split_preprocessing`` produces.
        sentence_limit: Forwarded to ``read_csv`` and ``split_preprocessing``.
        word_limit: Forwarded to ``read_csv`` and ``split_preprocessing``.
        min_word_count: Words whose train-split count is below this value are
            left out of the word map.
        save_word2vec_data: When true, the train texts are stored with
            ``torch.save`` as ``word2vec_data.pth.tar`` in ``output_folder``.

    Returns:
        The tuple ``(word_map, n_classes)``.
    """
    print('Reading and preprocessing train_data...\n')
    train_texts, train_labels, word_counter, n_classes = read_csv(
        csv_folder, 'train', sentence_limit, word_limit)

    if save_word2vec_data:
        dump_path = os.path.join(output_folder, 'word2vec_data.pth.tar')
        torch.save(train_texts, dump_path)

    # Vocabulary: '<pad>' takes index 0, then every sufficiently frequent
    # word in counter order, and '<unk>' is added last.
    word_map = {'<pad>': 0}
    frequent_words = (token for token, freq in word_counter.items()
                      if freq >= min_word_count)
    for token in frequent_words:
        word_map[token] = len(word_map)
    word_map['<unk>'] = len(word_map)

    word_map_path = os.path.join(output_folder, 'word_map.json')
    with open(word_map_path, 'w') as j:
        json.dump(word_map, j)

    split_preprocessing('train', train_texts, train_labels, output_folder,
                        sentence_limit, word_limit, word_map)

    # The val and test splits go through the same read-then-process steps.
    remaining_splits = (
        ('val', 'Reading and preprocessing val data...\n'),
        ('test', 'Reading and preprocessing test data...\n'),
    )
    for split_name, banner in remaining_splits:
        print(banner)
        split_texts, split_labels, _, _ = read_csv(
            csv_folder, split_name, sentence_limit, word_limit)
        split_preprocessing(split_name, split_texts, split_labels,
                            output_folder, sentence_limit, word_limit,
                            word_map)

    print('END PREPROCESSING!\n')

    print('\n Train word2vec model...')
    train_word2vec_model(data_folder=output_folder, model='han')
    print('\nEND TRAINING WORD2VEC MODEL\n')

    return word_map, n_classes
    # NOTE(review): these statements follow a `return` at the same indentation,
    # so as written they can never execute. They look like the tail of a
    # separate driver script whose beginning is not visible here; `train`,
    # `test` and `c` are not defined anywhere in the visible code -- confirm
    # where this fragment belongs before relying on it.

    # Drop rows with missing values from `test`
    # (presumably a pandas DataFrame -- TODO confirm).
    test = test.dropna()
    # Keep only the 'anger' and 'body' columns of each frame.
    train_temp = train.loc[:, ['anger', 'body']]
    test_temp = test.loc[:, ['anger', 'body']]

    # Write both selections as CSV without the index column or a header row.
    train_temp.to_csv('./data/train.csv', index=False, header=False)
    test_temp.to_csv('./data/test.csv', index=False, header=False)

    # Build the input files from ./data into ./outdata. The commented-out
    # values below are an alternative (smaller) set of limits.
    create_input_files(csv_folder='./data',
                       output_folder='./outdata',
                       # sentence_limit=15,
                       # word_limit=20,
                       # min_word_count=5)
                       sentence_limit=30,
                       word_limit=100,
                       min_word_count=10)

    # Train word2vec on the generated data using the skip-gram algorithm.
    train_word2vec_model(data_folder='./outdata', algorithm='skipgram')
    # Open label.txt for appending; result.txt is opened in append mode and
    # closed right away without anything being written to it here.
    file1 = open("label.txt", "a")
    file2 = open("result.txt", "a")
    file2.close()

    # For every entry of c.LABELS: print it, append it to label.txt, then run
    # train.py followed by eval.py as shell commands.
    # NOTE(review): file1 is not flushed before the scripts are launched; if
    # train.py / eval.py read label.txt they may not see the label just
    # written -- confirm.
    for i in c.LABELS:
        print(i)
        file1.write(i)
        file1.write('\n')
        os.system('python3 train.py')
        os.system('python3 eval.py')

    file1.close()

# Example #4
# 0
from utils import create_input_files, train_word2vec_model

if __name__ == '__main__':
    # Create the input files from the Yahoo Answers CSV folder, then train a
    # skip-gram word2vec model on the folder those files were written to.
    yahoo_csv_dir = '/users5/yjtian/tyj/demo/Hierarchical-Attention-Network/yahoo_answers_csv'
    han_data_dir = '/users5/yjtian/tyj/demo/HAN/data'
    limits = dict(sentence_limit=15, word_limit=20, min_word_count=5)

    create_input_files(csv_folder=yahoo_csv_dir,
                       output_folder=han_data_dir,
                       **limits)

    train_word2vec_model(data_folder=han_data_dir, algorithm='skipgram')
# Example #5
# 0
from utils import create_input_files, train_word2vec_model

if __name__ == '__main__':
    # Step 1: generate the input files from ./yahoo_answers_csv.
    # Step 2: train skip-gram word2vec on the folder they were written to.
    generated_data_dir = '/media/ssd/han data'

    input_file_options = {
        'csv_folder': './yahoo_answers_csv',
        'output_folder': generated_data_dir,
        'sentence_limit': 15,
        'word_limit': 20,
        'min_word_count': 5,
    }
    create_input_files(**input_file_options)

    train_word2vec_model(data_folder=generated_data_dir,
                         algorithm='skipgram')
import sys
from utils import create_input_files_fromdb, train_word2vec_model

def parse_cli_args(argv):
    """Extract the script's three positional arguments from *argv*.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).

    Returns:
        ``(data_folder, hostname, database)`` where ``data_folder`` is
        ``'./data'`` with the first argument appended directly (no path
        separator is inserted), or ``None`` when fewer than three positional
        arguments were supplied. Extra arguments are ignored.
    """
    if len(argv) < 4:
        return None
    suffix, hostname, database = argv[1:4]
    return './data' + suffix, hostname, database


if __name__ == '__main__':
    parsed = parse_cli_args(sys.argv)
    if parsed is None:
        # Exit with a readable message instead of an IndexError traceback.
        program = sys.argv[0] if sys.argv else 'script'
        sys.exit('usage: {} <output_suffix> <hostname> <database>'.format(
            program))
    data_folder, hostname, database = parsed

    # Build the input files from the database, then train skip-gram word2vec
    # on the folder they were written to.
    create_input_files_fromdb(output_folder=data_folder,
                              hostname=hostname,
                              database=database,
                              sentence_limit=15,
                              word_limit=20,
                              min_word_count=5)

    train_word2vec_model(data_folder=data_folder, algorithm='skipgram')