Example 1
0
def load_data(args):
    """Build torchtext fields and data iterators for train/dev/test sets.

    The text field is non-sequential (each example is already a numeric
    feature value) and is run through the `preprocess` pipeline; labels
    are plain integers with no vocabulary.

    Args:
        args: namespace providing ``train_path``, ``dev_path`` and
            ``test_path`` (plus whatever ``gen_iter``/``gen_iter_test``
            read from it, e.g. batch size).

    Returns:
        Tuple of (text_field, label_field,
                  train_data, train_iter,
                  dev_data, dev_iter,
                  test_data, test_iter).
    """
    # No vocabulary is built here: inputs are pre-computed numeric
    # features, normalised per-example by the `preprocess` pipeline.
    text_field = data.Field(sequential=False, use_vocab=False, batch_first=True)
    text_field.preprocessing = data.Pipeline(preprocess)
    label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

    train_data, train_iter = gen_iter(args.train_path, text_field, label_field, args)
    dev_data, dev_iter = gen_iter(args.dev_path, text_field, label_field, args)
    # Test set has no labels, hence the separate iterator builder.
    test_data, test_iter = gen_iter_test(args.test_path, text_field, label_field, args)

    return text_field, label_field, \
        train_data, train_iter, \
        dev_data, dev_iter, \
        test_data, test_iter
          
Example 2
0
def load_data(args):
    """Build torchtext fields and iterators for character/word-level data.

    Sentences are tokenized with ``jieba.lcut``, padded/truncated to a
    fixed length of 25 tokens, and wrapped with BOS/EOS markers. The
    vocabulary is built from train + dev data after the datasets are
    loaded.

    Args:
        args: namespace providing ``test_path`` (plus whatever
            ``gen_iter``/``gen_iter_test`` read from it).

    Returns:
        Tuple of (text_field, label_field,
                  train_data, train_iter,
                  dev_data, dev_iter,
                  test_data, test_iter).
    """
    text_field = data.Field(sequential=True,
                            use_vocab=True,
                            fix_length=25,      # pad/truncate every sentence to 25 tokens
                            batch_first=True,
                            eos_token='<EOS>',
                            init_token='<BOS>',
                            pad_token='<PAD>',
                            tokenize=jieba.lcut)
    label_field = data.Field(sequential=False,
                             use_vocab=False,
                             batch_first=True)

    # NOTE(review): train/dev paths are hard-coded while the test path
    # comes from args — confirm whether args.train_path/args.dev_path
    # should be used here instead for consistency.
    train_data, train_iter = gen_iter('data/train_3000.tsv', text_field,
                                      label_field, args)
    dev_data, dev_iter = gen_iter('data/valid_3000.tsv', text_field,
                                  label_field, args)
    test_data, test_iter = gen_iter_test(args.test_path, text_field,
                                         label_field, args)
    # Vocabulary covers both train and dev so dev tokens are not all <unk>.
    text_field.build_vocab(train_data, dev_data)

    return text_field, label_field, \
        train_data, train_iter, \
        dev_data, dev_iter, \
        test_data, test_iter
Example 3
0
def load_data(args):
    """Preprocess raw data, train word2vec, split 9:1, and build iterators.

    Steps:
        1. Preprocess the raw file and train a word2vec model on it
           (saved to ``args.w2v_model_path``).
        2. Split the data 9:1 into train/dev; the train split is
           augmented by appending every pair in reversed (q2, q1) order.
        3. Write the splits to ``args.train_path`` / ``args.dev_path``,
           preprocess the test file to ``args.to_test_path``, then load
           everything through torchtext fields and iterators.

    Args:
        args: namespace providing ``data_path``, ``w2v_model_path``,
            ``train_path``, ``dev_path``, ``test_path`` and
            ``to_test_path``.

    Returns:
        Tuple of (text_field, label_field,
                  train_data, train_iter,
                  dev_data, dev_iter,
                  test_data, test_iter).
    """
    df = preprocess(args.data_path)
    word2vec_model = train_word2vec_model(df)
    word2vec_model.save(args.w2v_model_path)

    # Token lists are joined into whitespace-separated strings so they
    # can round-trip through the TSV files written below.
    df = df[['q1_list', 'q2_list', 'label']]
    df['q1_list'] = df['q1_list'].apply(lambda x: ' '.join(x))
    df['q2_list'] = df['q2_list'].apply(lambda x: ' '.join(x))

    # NOTE(review): int() truncation means head(0.9n) + tail(0.1n) can
    # drop up to one row and assumes the frame is pre-shuffled — confirm.
    train_df = df.head(int(len(df) * 0.9))

    # Augment training data with reversed question pairs (q2, q1, label):
    # similarity is symmetric, so this doubles the train set for free.
    train_df_rev = pd.DataFrame()
    train_df_rev['q1_list'] = train_df['q2_list']
    train_df_rev['q2_list'] = train_df['q1_list']
    train_df_rev['label'] = train_df['label']
    train_df = pd.concat([train_df, train_df_rev])

    dev_df = df.tail(int(len(df) * 0.1))

    train_df.to_csv(args.train_path, index=False, encoding='utf-8', sep='\t', header=None)
    dev_df.to_csv(args.dev_path, index=False, encoding='utf-8', sep='\t', header=None)

    text_field = data.Field(sequential=True, use_vocab=True, batch_first=True,
                            eos_token='<EOS>', init_token='<BOS>', pad_token='<PAD>')
    label_field = data.Field(sequential=False, use_vocab=False)

    # Test data has ids instead of labels; write it in the same joined
    # TSV format before loading.
    df_test = preprocess_test(args.test_path)
    df_test['q1_list'] = df_test['q1_list'].apply(lambda x: ' '.join(x))
    df_test['q2_list'] = df_test['q2_list'].apply(lambda x: ' '.join(x))
    df_test = df_test[['id', 'q1_list', 'q2_list']]
    df_test.to_csv(args.to_test_path, index=False, encoding='utf-8', sep='\t', header=None)

    train_data, train_iter = gen_iter(args.train_path, text_field, label_field, args)
    dev_data, dev_iter = gen_iter(args.dev_path, text_field, label_field, args)
    test_data, test_iter = gen_iter_test(args.to_test_path, text_field, label_field, args)
    # Vocabulary covers both train and dev so dev tokens are not all <unk>.
    text_field.build_vocab(train_data, dev_data)

    return text_field, label_field, \
        train_data, train_iter, \
        dev_data, dev_iter, \
        test_data, test_iter