import json

import neologdn
from natto import MeCab  # assumes the natto-py binding, where MeCab('-Owakati') is callable

import batcher  # project-local module providing Example, Batch, and Batcher
from batcher import Batcher
# get_args, get_hps, Vocab, util, and get_fw_bw_rep are assumed to be defined
# or imported elsewhere in this project.


def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if args.data_path:
        # Stream batches from a preprocessed data file. (Named data_batcher so
        # it does not shadow the batcher module.)
        data_batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        batch = data_batcher.next_batch()  # inspect or consume batches as needed
    else:
        # Build a single batch from one JSON article (see json_batch below).
        batch = json_batch(args.json_path, hps, vocab)
def build_batch(pair_list, vocab, hp):
    """Build a Batch from a list of (article, abstract_sentences) pairs."""
    instance_list = [
        batcher.Example(article=article, abstract_sentences=abstract,
                        vocab=vocab, hps=hp)
        for article, abstract in pair_list
    ]
    return batcher.Batch(example_list=instance_list, hps=hp, vocab=vocab)
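# A minimal usage sketch for build_batch (assumption: `vocab` and `hps` come
# from Vocab(...) and get_hps() as in main above). The pairs are illustrative
# only; articles and abstracts must already be whitespace-tokenized, e.g. with
# MeCab '-Owakati' as in json_batch below.
def _demo_build_batch(vocab, hps):
    pairs = [
        ('今日 の 天気 は 晴れ です', ['天気 予報']),
        ('株価 が 大きく 上昇 した', ['市場 ニュース']),
    ]
    return build_batch(pairs, vocab, hps)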
def json_batch(fname, hps, vocab):
    """Read one JSON article ({'body', 'title'}) and wrap it in a one-example Batch."""
    with open(fname) as f:
        art = json.load(f)
    article = neologdn.normalize(art['body'])
    abstract = neologdn.normalize(art['title'])
    m = MeCab('-Owakati')  # whitespace-tokenize the Japanese text
    parsed_article = m.parse(article)
    abs_words = m.parse(abstract).split()
    ex = batcher.Example(parsed_article, abs_words, vocab, hps)
    return batcher.Batch([ex], hps, vocab)
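# A minimal usage sketch for json_batch (assumption: the input file is a JSON
# object with the 'body' and 'title' keys json_batch reads; the strings here
# are illustrative only).
def _demo_json_batch(hps, vocab):
    import tempfile
    art = {'title': '天気予報', 'body': '今日の天気は晴れです。'}
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
        json.dump(art, f, ensure_ascii=False)
        fname = f.name
    return json_batch(fname, hps, vocab)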
def build_query_batch(query, vocab, hp):
    """Tile a single query across the batch: beam-search decoding expects
    batch_size copies of the same example.

    Note: renamed from build_batch to avoid clashing with the pair-list
    variant above.
    """
    instance = batcher.Example(article=query, abstract_sentences='',
                               vocab=vocab, hps=hp)
    return batcher.Batch(example_list=[instance] * hp.batch_size,
                         hps=hp, vocab=vocab)
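# A minimal usage sketch for build_query_batch (assumption: a raw query goes
# through the same normalize-then-tokenize pipeline json_batch uses; the query
# string is illustrative only).
def _demo_query_batch(vocab, hps):
    query = neologdn.normalize('今日の天気は晴れです。')
    tokenized = MeCab('-Owakati').parse(query)
    return build_query_batch(tokenized, vocab, hps)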
def get_separate_enc_states(model, sess, enc_sentences, vocab, hps):
    """Encode each sentence independently and return one fw/bw representation per sentence."""
    reps = []
    examples = []
    for enc_sent in enc_sentences:
        sent_str = ' '.join(enc_sent)
        doc_indices = [0] * len(enc_sent)  # just filler, shouldn't do anything
        ex = batcher.Example(sent_str, [], [[]], doc_indices, None, vocab, hps)
        examples.append(ex)
    chunks = util.chunks(examples, hps.batch_size)
    if len(chunks[-1]) != hps.batch_size:
        # If the last chunk is not full, artificially fill it with copies
        # of the final example.
        for _ in range(hps.batch_size - len(chunks[-1])):
            chunks[-1].append(examples[-1])
    for chunk in chunks:
        batch = batcher.Batch(chunk, hps, vocab)
        batch_enc_states, _ = model.run_encoder(sess, batch)
        for batch_idx, enc_states in enumerate(batch_enc_states):
            start_idx = 0
            end_idx = batch.enc_lens[batch_idx] - 1
            rep = get_fw_bw_rep(enc_states, start_idx, end_idx)
            reps.append(rep)
    reps = reps[:len(enc_sentences)]  # removes the filler examples
    return reps
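# A minimal usage sketch for get_separate_enc_states (assumption: `model` is a
# loaded summarization model exposing run_encoder, and `sess` is its TensorFlow
# session; the toy tokenized sentences are illustrative only).
def _demo_separate_enc_states(model, sess, vocab, hps):
    sents = [['今日', 'の', '天気'], ['株価', 'が', '上昇']]
    reps = get_separate_enc_states(model, sess, sents, vocab, hps)
    assert len(reps) == len(sents)  # one representation per input sentence
    return reps


if __name__ == '__main__':
    main()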