def load_data(args):
    """Build torchtext fields and batched iterators for the three splits.

    Reads pre-vectorized train/dev/test files from the paths carried on
    *args* and returns the fields together with each split's (dataset,
    iterator) pair.

    Args:
        args: namespace providing train_path, dev_path and test_path,
            plus whatever gen_iter/gen_iter_test consume.

    Returns:
        (text_field, label_field,
         train_data, train_iter, dev_data, dev_iter, test_data, test_iter)
    """
    # Non-sequential, vocab-free fields: the files are assumed to hold
    # already-numeric features. `preprocess` (defined elsewhere in this
    # module) is attached as a torchtext Pipeline — presumably it parses
    # each cell into numbers; confirm against its definition.
    text_field = data.Field(sequential=False, use_vocab=False, batch_first=True)
    text_field.preprocessing = data.Pipeline(preprocess)
    label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

    train_data, train_iter = gen_iter(args.train_path, text_field, label_field, args)
    dev_data, dev_iter = gen_iter(args.dev_path, text_field, label_field, args)
    test_data, test_iter = gen_iter_test(args.test_path, text_field, label_field, args)

    return (text_field, label_field,
            train_data, train_iter,
            dev_data, dev_iter,
            test_data, test_iter)
def load_data(args):
    """Build tokenized torchtext fields and iterators for train/dev/test.

    Questions are tokenized with jieba, padded/truncated to 25 tokens, and
    wrapped with <BOS>/<EOS>/<PAD> markers; the vocabulary is built from
    the train and dev datasets after loading.

    Args:
        args: namespace providing train_path, dev_path and test_path,
            plus whatever gen_iter/gen_iter_test consume.

    Returns:
        (text_field, label_field,
         train_data, train_iter, dev_data, dev_iter, test_data, test_iter)
    """
    text_field = data.Field(sequential=True, use_vocab=True, fix_length=25,
                            batch_first=True, eos_token='<EOS>',
                            init_token='<BOS>', pad_token='<PAD>',
                            tokenize=jieba.lcut)
    label_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # Fix: read the split locations from args instead of the previously
    # hard-coded 'data/train_3000.tsv' / 'data/valid_3000.tsv', so this
    # variant honors the same CLI configuration as the sibling loaders
    # (test_path was already taken from args).
    train_data, train_iter = gen_iter(args.train_path, text_field, label_field, args)
    dev_data, dev_iter = gen_iter(args.dev_path, text_field, label_field, args)
    test_data, test_iter = gen_iter_test(args.test_path, text_field, label_field, args)

    # Vocabulary must cover both train and dev tokens before iteration.
    text_field.build_vocab(train_data, dev_data)

    return (text_field, label_field,
            train_data, train_iter,
            dev_data, dev_iter,
            test_data, test_iter)
def load_data(args):
    """Train word2vec, split the data 9:1, and build torchtext iterators.

    Pipeline:
      1. Preprocess the raw file at args.data_path and train a word2vec
         model, saved to args.w2v_model_path.
      2. Split 9:1 into train/dev; augment the train split by mirroring
         each question pair (q1, q2) -> (q2, q1).
      3. Write train/dev/test TSVs and load them as torchtext
         datasets/iterators, building the vocabulary from train + dev.

    Args:
        args: namespace providing data_path, w2v_model_path, train_path,
            dev_path, test_path and to_test_path, plus whatever
            gen_iter/gen_iter_test consume.

    Returns:
        (text_field, label_field,
         train_data, train_iter, dev_data, dev_iter, test_data, test_iter)
    """
    df = preprocess(args.data_path)
    word2vec_model = train_word2vec_model(df)
    word2vec_model.save(args.w2v_model_path)

    df = df[['q1_list', 'q2_list', 'label']]
    # torchtext expects whitespace-joined token strings, not token lists.
    df['q1_list'] = df['q1_list'].apply(lambda x: ' '.join(x))
    df['q2_list'] = df['q2_list'].apply(lambda x: ' '.join(x))

    # 9:1 split via a single cut index so the two parts cover every row.
    # The previous head(int(0.9*n)) / tail(int(0.1*n)) pair silently
    # dropped one row whenever len(df) was not a multiple of 10.
    split = int(len(df) * 0.9)
    train_df = df.head(split)
    dev_df = df.iloc[split:]

    # Augment training data with the mirrored question pairs.
    train_df_rev = pd.DataFrame()
    train_df_rev['q1_list'] = train_df['q2_list']
    train_df_rev['q2_list'] = train_df['q1_list']
    train_df_rev['label'] = train_df['label']
    train_df = pd.concat([train_df, train_df_rev])

    train_df.to_csv(args.train_path, index=False, encoding='utf-8', sep='\t', header=None)
    dev_df.to_csv(args.dev_path, index=False, encoding='utf-8', sep='\t', header=None)

    text_field = data.Field(sequential=True, use_vocab=True, batch_first=True,
                            eos_token='<EOS>', init_token='<BOS>',
                            pad_token='<PAD>')
    label_field = data.Field(sequential=False, use_vocab=False)

    # The test file has no labels; it is preprocessed separately and
    # written to args.to_test_path for gen_iter_test to consume.
    df_test = preprocess_test(args.test_path)
    df_test['q1_list'] = df_test['q1_list'].apply(lambda x: ' '.join(x))
    df_test['q2_list'] = df_test['q2_list'].apply(lambda x: ' '.join(x))
    df_test = df_test[['id', 'q1_list', 'q2_list']]
    df_test.to_csv(args.to_test_path, index=False, encoding='utf-8', sep='\t', header=None)

    train_data, train_iter = gen_iter(args.train_path, text_field, label_field, args)
    dev_data, dev_iter = gen_iter(args.dev_path, text_field, label_field, args)
    test_data, test_iter = gen_iter_test(args.to_test_path, text_field, label_field, args)

    # Vocabulary must cover both train and dev tokens before iteration.
    text_field.build_vocab(train_data, dev_data)

    return (text_field, label_field,
            train_data, train_iter,
            dev_data, dev_iter,
            test_data, test_iter)