from itertools import chain

import torch
from more_itertools import ilen
from tqdm import tqdm

# Vocabulary and PREPROCESSED_DIR are defined elsewhere in the repo.


def main(args):
    train_input_filepath = args.train_input
    dev_input_filepath = args.dev_input

    # Build the vocabulary from the training and dev data, then persist it.
    vocab = Vocabulary()
    vocab.load_word_from_data(train_path=train_input_filepath,
                              dev_path=dev_input_filepath)
    vocab.save(vocab_file=PREPROCESSED_DIR + 'vocab_file.vocab')
    print("vocab size: {}".format(len(vocab)))

    # Find the longest sentence across both files so that every sequence
    # can be padded to a common length.
    max_length = -1
    with open(train_input_filepath, 'r') as f_t, open(dev_input_filepath, 'r') as f_d:
        for line in chain(f_t, f_d):
            max_length = max(max_length, len(line.rstrip().split()))

    # Convert each file into a list of padded index tensors and serialize it.
    for file_name in [train_input_filepath, dev_input_filepath]:
        output_file = file_name.split('/')[-1] + '.preprocessed'
        data = []
        with open(file_name, 'r') as fin:
            total_len = ilen(fin)  # count the lines, then rewind
            fin.seek(0)
            bar = tqdm(total=total_len)
            for line in fin:
                tokens = line.rstrip().split()
                data.append(
                    torch.LongTensor(vocab.sentence2index(tokens, max_length)))
                bar.update(1)
            bar.close()
        torch.save(data, PREPROCESSED_DIR + output_file)
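# For context, main() only needs an object with train_input and dev_input
# attributes. A minimal sketch of an entry point follows; the flag names are
# assumptions, not taken from the original repo.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Preprocess train/dev corpora.')
    parser.add_argument('--train-input', dest='train_input', required=True,
                        help='path to the tokenized training file')
    parser.add_argument('--dev-input', dest='dev_input', required=True,
                        help='path to the tokenized dev file')
    main(parser.parse_args())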
import collections


def make_data_set_and_vocab(trainpath=None, vectorpath=None, threshold=0):
    vocab = Vocabulary()
    if vectorpath is not None:
        vocab.load(vectorpath)

    # Count word frequencies over the training corpus.
    # make_wakati is an external Japanese tokenizer defined elsewhere.
    counter = collections.Counter()
    with open(trainpath, 'r') as f:
        for line in f:
            for word in make_wakati(line.strip()):
                counter[word] += 1

    # most_common() yields words in descending frequency order, so we can
    # stop as soon as a count drops to the threshold.
    for word, cnt in counter.most_common():
        if cnt <= threshold:
            break
        if word not in vocab:
            vocab.add_word(word)
    vocab.save('vocab')

    # Build the dataset from here.
    data_set = MyDataset(trainpath=trainpath, vocab=vocab)
    return data_set, vocab
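# Usage sketch, assuming MyDataset is a torch.utils.data.Dataset and
# 'train.txt' points at a tokenizable corpus; both names are hypothetical.
from torch.utils.data import DataLoader

data_set, vocab = make_data_set_and_vocab(trainpath='train.txt', threshold=1)
loader = DataLoader(data_set, batch_size=32, shuffle=True)
for batch in loader:
    ...  # feed batches to the model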
import time

print("Reading data", time.asctime(time.localtime(time.time())))

# Estimate the vocabulary from the training data.
# To reuse a cached vocabulary instead:
# voc = pickle.load(open("voc.pkl", "rb"))
voc = Vocabulary()
with open(data_path, "r") as data:
    for line in data:
        voc.add_words(line.strip().split())
voc.prune(top_words)
voc.export_vocabulary(top_words, "voc.tsv")
voc.save("voc.pkl")

print("Starting training", time.asctime(time.localtime(time.time())))

reader = Reader(data_path, voc, n_contexts, window_size, k)

# Assemble the computation graph and pull out the tensors (terminals)
# needed by the training loop.
terminals = assemble_graph(top_words, n_dims)

first_batch = None

in_words_ = terminals['in_words']
out_words_ = terminals['out_words']
labels_ = terminals['labels']
train_ = terminals['train']
loss_ = terminals['loss']
adder_ = terminals['adder']
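# A minimal sketch of how these terminals might be consumed, assuming
# assemble_graph builds a TensorFlow 1.x graph and Reader exposes a
# next_batch() method returning (in_words, out_words, labels) arrays.
# Everything below other than the fetched terminals is an assumption,
# including n_iterations and the Reader API.
import tensorflow as tf  # TF1-style API; use tf.compat.v1 under TF2

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(n_iterations):  # n_iterations: hypothetical
        in_words, out_words, labels = reader.next_batch()  # hypothetical API
        _, loss_val = sess.run(
            [train_, loss_],
            feed_dict={in_words_: in_words,
                       out_words_: out_words,
                       labels_: labels})
        if step % 100 == 0:
            print("step {} loss {:.4f}".format(step, loss_val))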