def main():
    train_file_paths = ["data/yelp/sentiment.train.0", "data/yelp/sentiment.train.1"]
    dev_file_paths = ["data/yelp/sentiment.dev.0", "data/yelp/sentiment.dev.1"]
    test_file_paths = ["data/yelp/sentiment.test.0", "data/yelp/sentiment.test.1"]

    word2idx, idx2word, embedding = build_vocab(train_file_paths, glove_path=config.glove_path)

    if config.train:
        # prepare data loader for training
        train_loader = get_loader(train_file_paths[1], train_file_paths[0], word2idx,
                                  debug=config.debug, batch_size=config.batch_size)
        # prepare data loader for evaluation
        dev_loader = get_loader(dev_file_paths[1], dev_file_paths[0], word2idx,
                                shuffle=False, debug=config.debug, batch_size=config.batch_size)
        data_loaders = [train_loader, dev_loader]

        trainer = Trainer(embedding, data_loaders)
        trainer.train()
    else:
        test_loader = get_loader(test_file_paths[1], test_file_paths[0], word2idx,
                                 debug=config.debug, shuffle=False, batch_size=16)
        data_loaders = [test_loader]

        trainer = Trainer(embedding, data_loaders)
        trainer.inference(config.model_path, config.output_dir, idx2word)
def prepare_data(args):
    if config.MAINTAIN_CANDIDATES >= 1 and config.FIX_VOCAB >= 1:
        # read data and metadata from pickled files
        with open(P_DATA_DIR + 'metadata.pkl', 'rb') as f:
            metadata = pkl.load(f)
        with open(P_DATA_DIR + 'data.pkl', 'rb') as f:
            data_ = pkl.load(f)
        # read content of data and metadata
        candidates = data_['candidates']
        candid2idx, idx2candid = metadata['candid2idx'], metadata['idx2candid']
    else:
        # get candidates (restaurants)
        candidates, candid2idx, idx2candid = data_utils.load_candidates(
            candidates_f=os.path.join(DATA_DIR, 'candidates.txt'))
        # get data
        train, test, val = data_utils.load_dialog(data_dir=DATA_DIR, candid_dic=candid2idx)
        # get metadata
        metadata = data_utils.build_vocab(train + test + val, candidates)

        # write data to file
        data_ = {
            'candidates': candidates,
            'train': train,
            'test': test,
            'val': val
        }
        with open(P_DATA_DIR + 'data.pkl', 'wb') as f:
            pkl.dump(data_, f)

        # save metadata to disk
        metadata['candid2idx'] = candid2idx
        metadata['idx2candid'] = idx2candid
        # # build embeddings
        # w2idx = metadata['w2idx']
        # print('Loading word2vec...')
        # w2v_model = gensim.models.Word2Vec.load(
        #     os.path.join(W2V_DIR, 'dic_18_unk_short.bin'))
        # embeddings = list()
        # for word, _ in w2idx.items():
        #     if w2v_model.__contains__(word.strip()):
        #         vector = w2v_model.__getitem__(word.strip())
        #     else:
        #         vector = w2v_model.__getitem__('unk')
        #     embeddings.append(vector)
        # embeddings = np.asarray(embeddings)
        # metadata['embeddings'] = embeddings
        with open(P_DATA_DIR + 'metadata.pkl', 'wb') as f:
            pkl.dump(metadata, f)
def train_data(data_dir="data"):
    data_path = os.path.join(data_dir, "ptb.train.txt")
    _, word_to_id = build_vocab(data_path)
    with open(data_path, "r") as ftrain:
        for line in ftrain:
            words = line.strip().split()
            word_ids = [word_to_id[w] for w in words]
            yield word_ids[0:-1], word_ids[1:]
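# Minimal usage sketch (assumed, not from the original snippet): train_data is a
# generator yielding (input_ids, target_ids) pairs, i.e. the sentence shifted by
# one token for next-word language modelling.
for input_ids, target_ids in train_data("data"):
    print(len(input_ids), len(target_ids))
    break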
def main():
    config = Config()
    args = add_arguments(config)

    # read sentences
    data_path = config.dataset_base_path + args.dataset + '/'

    train_neg = open(data_path + 'train.0').readlines()
    train_pos = open(data_path + 'train.1').readlines()
    train_labels = [0] * len(train_neg) + [1] * len(train_pos)
    train = train_neg + train_pos
    train = [s.split() for s in train]
    train = [s[:30] if len(s) > 30 else s for s in train]

    dev_neg = open(data_path + 'dev.0').readlines()
    dev_pos = open(data_path + 'dev.1').readlines()
    dev_labels = [0] * len(dev_neg) + [1] * len(dev_pos)
    dev = dev_neg + dev_pos
    dev = [s.split() for s in dev]
    dev = [s[:30] if len(s) > 30 else s for s in dev]

    test_neg = open(data_path + 'test.0').readlines()
    test_pos = open(data_path + 'test.1').readlines()
    test_labels = [0] * len(test_neg) + [1] * len(test_pos)
    test = test_neg + test_pos
    test = [s.split() for s in test]
    test = [s[:30] if len(s) > 30 else s for s in test]

    word2id, _, _, _, _, _ = build_vocab(train, filter_stop_words=0)

    if args.model == 'cmu':
        with open(args.dataset + '.train.text', 'w') as fd:
            for l in train:
                fd.write(' '.join(l) + '\n')
        with open(args.dataset + '.train.labels', 'w') as fd:
            for l in train_labels:
                fd.write(str(l) + '\n')
        with open(args.dataset + '.dev.text', 'w') as fd:
            for l in dev:
                fd.write(' '.join(l) + '\n')
        with open(args.dataset + '.dev.labels', 'w') as fd:
            for l in dev_labels:
                fd.write(str(l) + '\n')
        with open(args.dataset + '.test.text', 'w') as fd:
            for l in test:
                fd.write(' '.join(l) + '\n')
        with open(args.dataset + '.test.labels', 'w') as fd:
            for l in test_labels:
                fd.write(str(l) + '\n')
        with open('vocab', 'w') as fd:
            for w in word2id:
                fd.write('%s\n' % w)
    return
def main():
    config = model_config.Config()
    BATCH_SIZE = config.batch_size

    data = data_utils.read_data(config.corpus_path)
    data_utils.build_vocab(config.corpus_path, config.vocab_path)
    vocab = data_utils.read_dictionary()
    # random.shuffle(data)
    train_data = data[BATCH_SIZE:]
    test_data = data[:BATCH_SIZE]

    model = BiLSTM_CRF_Model(config=config, vocab=vocab, tag2label=data_utils.tag2label)
    model.build_graph()
    model.fit(train_data, val_data=test_data)
def main():
    if args.no_attention:
        args.no_pf = True

    training_dataset = json.load(open(args.training_dataset, 'r'))
    source_vocab, target_vocab, source_vocab_list, target_vocab_list = data_utils.build_vocab(
        training_dataset, args.vocab_filename)

    if args.test:
        test_dataset = json.load(open(args.test_dataset, 'r'))
        test(test_dataset, source_vocab, target_vocab, source_vocab_list, target_vocab_list)
    else:
        validation_dataset = json.load(open(args.validation_dataset, 'r'))
        # print("Val data %s" % validation_dataset)
        train(training_dataset, validation_dataset, source_vocab, target_vocab,
              source_vocab_list, target_vocab_list, args.no_train)
def load_data():
    # get the data paths
    train_path = './data/train/train.txt'
    valid_path = './data/valid/valid.txt'
    test_path = './data/test.txt'

    # build the complete vocabulary, then convert the text data to lists of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)

    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    print(train_data[:5])
    print(word_to_id)
    print(vocabulary)
    print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
    return train_data, valid_data, test_data, vocabulary, reversed_dictionary
def prepare_data(task_id, is_oov=False):
    # get candidates (restaurants)
    candidates, candid2idx, idx2candid = data_utils.load_candidates(
        task_id=task_id, candidates_f=DATA_DIR + 'dialog-babi-candidates.txt')
    # get data
    train, test, val = data_utils.load_dialog_task(
        data_dir=DATA_DIR, task_id=task_id, candid_dic=candid2idx, isOOV=is_oov)
    # get metadata
    metadata = data_utils.build_vocab(train + test + val, candidates)

    # write data to file
    data_ = {
        'candidates': candidates,
        'train': train,
        'test': test,
        'val': val
    }
    if is_oov:
        with open(P_DATA_DIR + str(task_id) + '_oov.data.pkl', 'wb') as f:
            pkl.dump(data_, f)
    else:
        with open(P_DATA_DIR + str(task_id) + '.data.pkl', 'wb') as f:
            pkl.dump(data_, f)

    # save metadata to disk
    metadata['candid2idx'] = candid2idx
    metadata['idx2candid'] = idx2candid
    if is_oov:
        with open(P_DATA_DIR + str(task_id) + '_oov.metadata.pkl', 'wb') as f:
            pkl.dump(metadata, f)
    else:
        with open(P_DATA_DIR + str(task_id) + '.metadata.pkl', 'wb') as f:
            pkl.dump(metadata, f)
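# Hedged usage sketch (not part of the original snippet): prepare_data is driven
# per bAbI dialog task; the task id below is purely illustrative.
if __name__ == '__main__':
    prepare_data(task_id=1, is_oov=False)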
def main():
    source_serialize = args.network in ('seq2seq', 'seq2tree')
    target_serialize = args.network in ('seq2seq', 'tree2seq')
    if args.no_attention:
        args.no_pf = True

    train_data = json.load(open(args.train_data, 'r'))
    source_vocab, target_vocab, source_vocab_list, target_vocab_list = data_utils.build_vocab(
        train_data, args.vocab_filename, args.input_format, args.output_format)

    if args.test:
        test_data = json.load(open(args.test_data, 'r'))
        test(test_data, source_vocab, target_vocab, source_vocab_list, target_vocab_list,
             source_serialize, target_serialize)
    else:
        val_data = json.load(open(args.val_data, 'r'))
        train(train_data, val_data, source_vocab, target_vocab, source_vocab_list,
              target_vocab_list, source_serialize, target_serialize)
def ptb_raw_data(data_path=None):
    """Load the original PTB dataset."""

    def _file_to_word_ids(filename, word_to_id):
        """Convert word strings to word indices according to the given word dictionary.

        Each line in the data file is an example; words are separated by blanks.
        """
        data = read_words(filename)
        return [word_to_id[x] for x in data if x in word_to_id]

    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    words, word_to_id = build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    return train_data, valid_data, test_data, words, word_to_id
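# Hedged usage sketch (assumed, not from the original snippet): the PTB directory
# path is hypothetical; only ptb_raw_data as defined above is relied on.
train_ids, valid_ids, test_ids, words, word_to_id = ptb_raw_data("data/ptb")
print("vocab size: %d, train tokens: %d" % (len(word_to_id), len(train_ids)))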
def get_train_config():
    train_contents, train_labels = read_file(FLAGS.train_file)

    # 1. Build the vocabulary from the training data first
    if not os.path.exists(FLAGS.vocab_file):
        words = build_vocab(train_contents, FLAGS.vocab_file)
    else:
        words, _ = read_vocab(FLAGS.vocab_file)

    # 2. Read the category labels, build the category-to-id mapping, and save it to file
    categories, cat_to_id = read_category()

    # 3. Generate the training configuration
    vocab_size = len(words)
    num_classes = len(categories)
    # using the maximum content length would blow up memory
    # seq_len = max([len(content) for content in train_contents])
    seq_len = 600
    filter_sizes = [int(i) for i in FLAGS.filter_sizes.split(',')]

    # create the required directories and the config file
    make_path(FLAGS)
    config_path = os.path.join(FLAGS.config_path, 'config')
    train_config = config_model(seq_len, vocab_size, num_classes, filter_sizes)
    if not os.path.isfile(config_path):
        save_config(train_config, config_path)
    return train_config
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

MAXLEN = 30
np.random.seed(FLAGS.seed)

train_file = "dataset/train.tsv"
valid_file = "dataset/valid.tsv"
test_file = "dataset/test.tsv"

train_statement, train_ch, train_topic, train_speaker, train_job, train_state, \
    train_party, train_location, train_y = load_data(train_file)

topic_list = itertools.chain.from_iterable(train_topic)
topic_index, _ = build_vocab(topic_list)

valid_statement, valid_ch, valid_topic, valid_speaker, valid_job, valid_state, \
    valid_party, valid_location, valid_y = load_data(valid_file)

test_statement, test_ch, test_topic, test_speaker, test_job, test_state, \
    test_party, test_location, test_y = load_data(test_file)

# get dataset statistics
train_tokens = texts_to_tokens(train_statement)
valid_tokens = texts_to_tokens(valid_statement)
test_tokens = texts_to_tokens(test_statement)

train_sq_len = get_sequence_length(train_tokens)
valid_sq_len = get_sequence_length(valid_tokens)
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

MAXLEN = 30
np.random.seed(FLAGS.seed)

train_file = "dataset/train.tsv"
valid_file = "dataset/valid.tsv"
test_file = "dataset/test.tsv"

train_statement, train_ch, train_topic, train_speaker, train_job, train_state, \
    train_party, train_location, train_y = load_data(train_file)

topic_list = itertools.chain.from_iterable(train_topic)
topic_index, _ = build_vocab(topic_list)

# build the location vocabulary from the location field
train_location_list = itertools.chain.from_iterable(train_location)
location_index, _ = build_vocab(train_location_list)

valid_statement, valid_ch, valid_topic, valid_speaker, valid_job, valid_state, \
    valid_party, valid_location, valid_y = load_data(valid_file)

test_statement, test_ch, test_topic, test_speaker, test_job, test_state, \
    test_party, test_location, test_y = load_data(test_file)

train_tokens = texts_to_tokens(train_statement)
valid_tokens = texts_to_tokens(valid_statement)
test_tokens = texts_to_tokens(test_statement)

train_sq_len = get_sequence_length(train_tokens)
train_tokens = texts_to_tokens(train_statement)
valid_tokens = texts_to_tokens(valid_statement)
test_tokens = texts_to_tokens(test_statement)

train_sq_len = get_sequence_length(train_tokens)
# MAXLEN = max(train_sq_len)
# AVR = sum(train_sq_len) / len(train_sq_len)
# print(AVR)
valid_sq_len = get_sequence_length(valid_tokens)
test_sq_len = get_sequence_length(test_tokens)

# create vocabulary from the data itself
wordlist = itertools.chain.from_iterable(train_tokens)
word_index, _ = build_vocab(wordlist)

# load dependency-based embeddings
dep_embedding_path = "dep_embedding/deps.contexts"
dep_embedding_index = load_embedding(dep_embedding_path)
dep_embedding_matrix = get_embedding_matrix(word_index, dep_embedding_index,
                                            FLAGS.word_embedding_size)
print("finish loading dep embedding")

# load fastText embeddings
fast_embedding_path = "fast-text/wiki.simple.vec"
fast_embedding_index = load_embedding(fast_embedding_path)
fast_embedding_matrix = get_embedding_matrix(word_index, fast_embedding_index,
                                             FLAGS.word_embedding_size)
print("finish loading fast embedding")

# load GloVe embeddings
embedding_path = "glove.6B/glove.6B.{}d.txt".format(FLAGS.word_embedding_size)
embedding_index = load_embedding(embedding_path)
embedding_matrix = get_embedding_matrix(word_index, embedding_index,
                                        FLAGS.word_embedding_size)
import os

import numpy as np
import torch
from tqdm import tqdm

from data_utils import batch_iter, build_vocab

base_dir = 'data'
train_sen_dir = os.path.join(base_dir, 'train_sen_data.txt')
train_BIO_dir = os.path.join(base_dir, 'train_BIO_label_data.txt')
test_sen_dir = os.path.join(base_dir, 'test_sen_data.txt')
test_BIO_dir = os.path.join(base_dir, 'test_BIO_label_data.txt')

# load the data
training_data = batch_iter(train_sen_dir, train_BIO_dir)
test_data = batch_iter(test_sen_dir, test_BIO_dir)

# load the vocabulary and tag dictionary
word_to_ix = build_vocab()
tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, config.EMBEDDING_DIM, config.HIDDEN_DIM)
model.load_state_dict(torch.load('14lstm_params.pkl'))

with torch.no_grad():
    total = []
    with open('eval1.txt', 'w', encoding='utf8') as f:
        for sentence, tags in tqdm(training_data):
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            score, tag_seq = model(sentence_in)
            tol = []
            for index, i in enumerate(tag_seq):
                    config, save_model=True, model_path=model_path)
    else:
        from data_utils import build_vocab, build_dataset, build_iterator, load_embeddings
        from model_utils import load_model, train_model

        # split each document into words
        logger.info("Split words...")
        train_df["text_words"] = train_df["text"].apply(lambda x: x.split())
        X_train, y_train = train_df["text_words"], train_df["label"]
        train_df.drop(columns=["text"], inplace=True)

        logger.info("Building dataset...")
        vocab2id = build_vocab(docs=X_train, min_count=config.min_count)
        pkl.dump(
            vocab2id,
            open(os.path.join(args.model_dir, "vocab_{}.vocab".format(args.model)), "wb"))
        train_data = build_dataset(X_train, vocab2id, max_doc_len=config.max_doc_len)
        train_df.drop(columns=["text_words"], inplace=True)

        logger.info("Loading embeddings...")
        embeddings = load_embeddings(args.embedding_path, vocab2id)
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

# directories
print('preparing the directories...')
for dir_path in [args.vocab_dir]:
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

# vocab
print('building vocabulary ...')
vocab = Vocab(lower=True)

filelist = train_files + dev_files + test_files
example_gen = example_generator(filelist, False, args.max_p_len, single_pass=True)
vocab = build_vocab(example_gen, vocab)

unfiltered_vocab_size = vocab.size()
print("unfiltered_vocab_size: %d" % unfiltered_vocab_size)
print("token_min_cnt: %d" % args.token_min_cnt)

vocab.filter_tokens_by_cnt(min_cnt=args.token_min_cnt)
filtered_num = unfiltered_vocab_size - vocab.size()
print('after filtering {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))

# embeddings
print('assigning embeddings...')
print("emb_dim: %d" % args.emb_dim)
if args.embed_file is None:
VALIDATE_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

dataset = get_dataset(DATA_PATH)
s = dataset[['question1', 'question2', 'is_duplicate']].values

print("Splitting dataset")
train, test_and_val = train_test_split(s, test_size=0.3)
same_idx = np.where(train[:, 2] == 1)[0]
train_set = train[same_idx]

print("Building vocab")
vocab = build_vocab(train_set)

print("Creating DataLoader")
dlt = DQDataset(train_set, vocab)
dl = DataLoader(dlt, batch_size=TRAIN_BATCH_SIZE, collate_fn=collate_fn)

test_val_ds = DQDataset(test_and_val, vocab)
VAL_SPLIT_SIZE = len(test_val_ds) - len(test_val_ds) // 3
TEST_SPLIT_SIZE = len(test_val_ds) // 3
val_set, test_set = random_split(test_val_ds, [VAL_SPLIT_SIZE, TEST_SPLIT_SIZE])

vdl = DataLoader(val_set, batch_size=VALIDATE_BATCH_SIZE,
FLAGS._parse_flags()

MAXLEN = 30
np.random.seed(FLAGS.seed)

train_file = "dataset/train.tsv"
valid_file = "dataset/valid.tsv"
test_file = "dataset/test.tsv"

# statement, credit_history, topic, speaker, job, state, party, location, label
train_statement, train_ch, train_topic, train_speaker, train_job, train_state, \
    train_party, train_location, train_y = load_data(train_file)

topic_list = itertools.chain.from_iterable(train_topic)
topic_index, _ = build_vocab(topic_list)

valid_statement, valid_ch, valid_topic, valid_speaker, valid_job, valid_state, \
    valid_party, valid_location, valid_y = load_data(valid_file)

test_statement, test_ch, test_topic, test_speaker, test_job, test_state, \
    test_party, test_location, test_y = load_data(test_file)

# text
train_tokens = texts_to_tokens(train_statement)
valid_tokens = texts_to_tokens(valid_statement)
test_tokens = texts_to_tokens(test_statement)

# text sequence