def get_train_and_dev(self, train_file_path, grammar_file, primitive_types):
    src_freq = 3
    code_freq = 3
    grammar = ASDLGrammar.grammar_from_text(
        open(grammar_file).read(), primitive_types)
    transition_system = TransitionSystem(grammar)
    train_examples = self.preprocess_dataset(train_file_path, transition_system)

    full_train_examples = train_examples[:]
    np.random.shuffle(train_examples)
    # hold out 200 shuffled examples as the dev set
    dev_examples = train_examples[:200]
    train_examples = train_examples[200:]

    # vocabulary over the natural-language source sentences
    src_vocab = VocabEntry.from_corpus(
        [e.sentence for e in train_examples], size=5000, freq_cutoff=src_freq)

    # vocabulary over primitive tokens produced by GenToken actions
    primitive_tokens = [
        [a.action.token for a in e.tgt_actions
         if isinstance(a.action, GenTokenAction)]
        for e in train_examples
    ]
    primitive_vocab = VocabEntry.from_corpus(primitive_tokens, size=5000,
                                             freq_cutoff=code_freq)

    # generate vocabulary for the code tokens!
    code_tokens = [
        transition_system.tokenize_code(e.code, mode='decoder')
        for e in train_examples
    ]
    code_vocab = VocabEntry.from_corpus(code_tokens, size=5000,
                                        freq_cutoff=code_freq)

    vocab = Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)

    return train_examples, dev_examples, vocab
def _build_vocab(self, vocab_file, method, cased):
    corpus = []
    for dataset_type in ["train", "test"]:
        data_file = "qna_data/vi_{}.json".format(dataset_type)
        json_samples = read_json_data(data_file)
        for json_sample in json_samples:
            for key in ["question", "text"]:
                # preprocessed fields are stored under keys like "<method>_<cased>_question"
                pre_key = "{}_{}_{}".format(method, cased, key)
                pre_text = json_sample[pre_key]
                corpus.append(pre_text.split())

    self.vocab = VocabEntry.from_corpus(corpus, freq_cutoff=3)
    self.vocab.save_json(vocab_file)
    print("Saved vocab to file {}".format(vocab_file))
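`read_json_data` (used above) and `write_json_data` (used in `build_data` further below) are not shown; a minimal sketch, assuming they are thin wrappers around the standard `json` module:

import json

def read_json_data(path):
    # load a list of JSON samples from disk
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def write_json_data(path, data):
    # dump data back to disk, keeping non-ASCII (e.g. Vietnamese) text readable
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)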
def extract_BNC():
    stpwd = stopwords.words('english')
    stpwd = [w.upper() for w in stpwd]
    print(stpwd)

    vocab_path = paths.vocab_bnc
    source_path = paths.bnc_folder
    target_path = paths.bnc_extracted

    ndocs = 0
    dataset = []
    for (dirpath, dirnames, filenames) in os.walk(source_path):
        for fn in filenames:
            if fn.endswith(".xml"):
                print(dirpath, fn)
                fp = dirpath + "/" + fn
                tree = ET.parse(fp).getroot()
                sentences = find_rec(tree, "s", [])
                print(len(sentences))
                for sent in sentences:
                    st = []
                    for word in sent.findall("w"):
                        w = word.attrib["hw"]       # head word (lemma)
                        pos = word.attrib["pos"]    # part-of-speech tag
                        if w.upper() not in stpwd:
                            st.append(w + "_" + pos)
                    # pad each sentence with <s> markers on both sides
                    st = ["<s>"] * cconfig.context_size + st + \
                         ["<s>"] * cconfig.context_size
                    dataset.append(st)
                ndocs += 1
                if ndocs > 1000:
                    break

    vocab = VocabEntry.from_corpus(
        dataset, size=cconfig.vocab_size, freq_cutoff=cconfig.freq_cutoff)
    pickle.dump(vocab, open(vocab_path, 'wb'))

    processed_dataset = []
    for sent in dataset:
        # drop tokens mapped to the <unk> id (3) by the vocabulary
        np_sent = np.array([vocab[w] for w in sent if vocab[w] != 3])
        processed_dataset.append(np_sent)
    processed_dataset = np.array(processed_dataset, dtype=object)  # ragged array of sentences
    np.save(target_path, processed_dataset)
    print(len(processed_dataset))
    print(sum([len(s) for s in processed_dataset]))
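`find_rec` is not defined in this snippet; a plausible minimal implementation, assuming it recursively collects every descendant element with the given tag (roughly what `ElementTree`'s `iter()` provides), could look like this:

def find_rec(node, tag, acc):
    # depth-first walk over the XML tree, accumulating matching elements
    for child in node:
        if child.tag == tag:
            acc.append(child)
        find_rec(child, tag, acc)
    return acc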
def load_vocab(self):
    # Load the vocabulary or create vocabulary if not exists
    if self.args.vocab is not None:
        if not os.path.isfile(self.args.vocab):
            print('create new vocab and save to %s' % self.args.vocab)
            corpus = []
            for story in self.trn[0]:
                for sent in story:
                    corpus.append(sent)
            if self.args.rebuild_vocab:
                self.vocab = VocabEntry.from_corpus(
                    corpus, 50000,
                    remove_singleton=not self.args.include_singleton)
            else:
                self.vocab = VocabEntry.from_dict(self.w2id)
            torch.save(self.vocab, self.args.vocab)
        else:
            self.vocab = torch.load(self.args.vocab)
    else:
        print('vocab file is required')
        exit(0)
def build_data(self):
    corpus = self.train_paragraph_texts + \
        self.train_question_texts + \
        self.test_question_texts + \
        self.test_paragraph_texts

    vocab = VocabEntry.from_corpus(corpus, freq_cutoff=1)
    vocab_file = "qna_data/{}_vocab.json".format(self.method)
    vocab.save_json(vocab_file)

    self.train_questions = vocab.padd_sents(self.train_question_texts,
                                            start_end=False)
    self.train_paragraphs = vocab.padd_sents(self.train_paragraph_texts,
                                             start_end=False)
    self.test_questions = vocab.padd_sents(self.test_question_texts,
                                           start_end=False)
    self.test_paragraphs = vocab.padd_sents(self.test_paragraph_texts,
                                            start_end=False)

    save_data = {
        "train_questions": self.train_questions,
        "train_paragraphs": self.train_paragraphs,
        "test_questions": self.test_questions,
        "test_paragraphs": self.test_paragraphs,
    }
    save_file = "qna_data/{}_dataset.json".format(self.method)
    write_json_data(save_file, save_data)

    self.vocab = vocab
    self._to_numpy()

    print("corpus len: ", len(corpus))
    print(corpus[0])
    print("max length: ", vocab.max_sent_len)
def main(argv):
    EPOCHS = int(argv[0])
    BATCH_SIZE = int(argv[1])
    LEARNING_RATE = float(argv[2])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    words = []
    definitions = []
    sub_fasttext_dict = {}
    with open("../data/words_defs_dict.train", "br") as f:
        words, definitions, sub_fasttext_dict = pickle.load(f)
    print("number of words:", len(words))

    evaluator = Evaluator()  # renamed from `eval` to avoid shadowing the builtin

    vocab = VocabEntry.from_corpus(definitions, 1000000, 0)
    for w in words:
        vocab.add(w)
    print("vocab length:", len(vocab))

    assert len(words) == len(definitions)
    training_data = [(definitions[i], words[i]) for i in range(len(words))]

    model = LSTMModel(100, 100, vocab, sub_fasttext_dict, device)
    model.to(device)
    loss_function = nn.CosineEmbeddingLoss(margin=0.0, reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    dt = str(datetime.fromtimestamp(time.time()))[:-7]
    print(dt)
    dt = dt.replace(' ', '_')
    dt = dt.replace(':', '-')

    start = timeit.default_timer()
    losses = []
    best_loss = float('inf')
    for epoch in range(EPOCHS):
        epoch_losses = []
        count = 0
        for src_sents, tgt_word in batch_iter(training_data, BATCH_SIZE, False):
            model.zero_grad()
            x_lengths = [len(sent) for sent in src_sents]
            x = vocab.to_input_tensor(src_sents, device)
            init_hidden = model.initHidden(len(src_sents), device)
            tag_scores = model(x, init_hidden, x_lengths)

            y_indices = vocab.words2indices(tgt_word)
            y_array = model.embedding.source[0](
                torch.tensor(y_indices, device=device)).double()
            y_pred = tag_scores[0].squeeze(dim=1).double().to(device)
            y_match = torch.ones(y_pred.shape[0], device=device)

            loss = loss_function(y_pred, y_array, y_match)
            loss.backward()
            optimizer.step()

            count += 1
            epoch_losses.append(loss.item())  # store the scalar, not the graph
            if count % 200 == 0:
                print("Time elapsed", timeit.default_timer() - start,
                      "Epoch", epoch, " Count", count, ": Loss", loss.item())
                losses.append(loss.item())

        eloss = sum(epoch_losses) / len(epoch_losses)
        if eloss < best_loss:
            best_loss = eloss
            title = 'ft_model' + dt + '.pt'
            torch.save(model.state_dict(), title)
            print("model saved as:", title, "with epoch loss of", eloss)

    stop = timeit.default_timer()
    print('Time: ', stop - start)

    import matplotlib.pyplot as plt
    plt.plot(losses)
    plt.show()
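`main` expects the number of epochs, the batch size and the learning rate as positional command-line arguments, so a typical entry point (the script name train_ft.py is an assumption) would be:

import sys

if __name__ == "__main__":
    # e.g. python train_ft.py 20 64 0.001
    main(sys.argv[1:])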
datetm = datetime.fromtimestamp(ts)
dt = str(datetm)[:-7]
print(dt)
dt = dt.replace(' ', '_')
dt = dt.replace(':', '-')
print(dt)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('use device: %s' % device)

# In[12]:

words, defs, ft_dict = pickle.load(
    open("../data/words_defs_dict_1M.train", "rb"))
vocab = VocabEntry.from_corpus(defs, 1000000, 0)
for w in ft_dict:
    vocab.add(w)

# In[13]:

def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=True):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim,
                             padding_idx=src_pad_token_idx)
    # initialise the embedding layer with the pretrained weight matrix
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix))
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim
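A minimal sketch of how create_emb_layer might be wired up with the fastText vectors loaded above; the 100-dimensional vector size, the word2id attribute on VocabEntry and the '<pad>' token index are assumptions, not shown in the snippet:

import numpy as np

emb_dim = 100  # assumed fastText vector size
weights_matrix = np.zeros((len(vocab), emb_dim), dtype=np.float32)
for word, idx in vocab.word2id.items():      # word2id assumed on VocabEntry
    if word in ft_dict:
        weights_matrix[idx] = ft_dict[word]  # copy the pretrained vector

pad_idx = vocab['<pad>']                     # '<pad>' token index (assumption)
emb_layer, num_embeddings, emb_dim = create_emb_layer(weights_matrix, pad_idx)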