def build_batch_by_article(article, vocab):
    """Tokenize a raw article with jieba and wrap it in a beam-sized Batch."""
    words = jieba.cut(article)
    art_str = " ".join(words)
    example = Example(art_str, '', vocab)
    # Repeat the single example beam_size times so beam search can decode it.
    ex_list = [example for _ in range(config.beam_size)]
    batch = Batch(ex_list, vocab, config.beam_size)
    return batch
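# Usage sketch for build_batch_by_article above. The Vocab constructor and
# the module paths are assumptions based on common pointer-generator
# layouts, not confirmed by the snippet itself.
import jieba
from data import Vocab  # assumed module layout
import config           # assumed hyperparameter module

vocab = Vocab(config.vocab_path, config.vocab_size)  # assumed signature
batch = build_batch_by_article(u"机器学习正在改变自然语言处理。", vocab)
# `batch` now holds the same example repeated beam_size times, ready to be
# fed to a beam-search decoder.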
def test_batch():
    # python test_batcher.py --data_path=../data/squad-v1/dev_raw.json --pointer_gen
    input_gen = text_generator(
        data.example_generator(hps.data_path, hps.single_pass))
    example_list = []
    for _ in range(hps.batch_size):
        p, q, a, ap = next(input_gen)
        example_list.append(Example(p, q, a, ap, vocab, hps))
    batch = Batch(example_list, hps, vocab)

    print('batch answer pos:', batch.ans_indices)
    print('enc batch:', batch.enc_batch)
    print('enc batch words:',
          id2sentence(batch.enc_batch, vocab, batch.para_oovs_batch))
    print('enc len:', batch.enc_lens)
    if hps.pointer_gen:
        print('max para oovs:', batch.max_para_oovs)
        print('para oovs:', batch.para_oovs_batch)
        print('enc batch extend vocab:', batch.enc_batch_extend_vocab)
        print('enc batch extend vocab words:',
              id2sentence(batch.enc_batch_extend_vocab, vocab,
                          batch.para_oovs_batch))
    print('dec batch:', batch.dec_batch)
    print('dec batch words:',
          id2sentence(batch.dec_batch, vocab, batch.para_oovs_batch))
    print('target batch:', batch.target_batch)
    print('tgt batch words:',
          id2sentence(batch.target_batch, vocab, batch.para_oovs_batch))
    print('origin para:', batch.original_paragraphs)
    print('origin question:', batch.original_questions)
    print('origin answer:', batch.original_answers)
def build_batch_by_article(article, vocab):
    """Tokenize a raw article with Stanford CoreNLP and wrap it in a beam-sized Batch."""
    nlp = StanfordCoreNLP(path, lang='en')
    words = nlp.word_tokenize(article)
    art_str = " ".join(words)
    example = Example(art_str, [""], vocab)
    # Repeat the single example beam_size times so beam search can decode it.
    ex_list = [example for _ in range(config.beam_size)]
    batch = Batch(ex_list, vocab, config.beam_size)
    return batch
def generate_summary(spacy_article, ideal_summary_length_tokens=60):
    """Generates a summary of the given article.

    Note that this is slow (~20 seconds on a single CPU).

    Args:
        spacy_article: spaCy-processed text. The model was trained on the
            output of doc.spacy_text(), so for best results the input here
            should also come from doc.spacy_text().
        ideal_summary_length_tokens: Target summary length in tokens; the
            decoded length is constrained to roughly 2/3 to 3/2 of this
            value, further limited by the article length.

    Returns:
        Tuple of a unicode summary of the text and a scalar score of its
        quality. The score is approximately an average log-likelihood of
        the summary (so it is < 0.) and typically lies in [-.5, -.2].
        Summaries with scores below -.4 are usually not very good.
    """
    assert isinstance(spacy_article, Doc)

    # These imports are slow - lazy import.
    from batcher import Batch, Example
    from beam_search import run_beam_search
    from io_processing import process_article, process_output

    if _model is None:
        _load_model()

    # Handle short inputs: an article already at or below the ideal length
    # is returned unchanged with a perfect score.
    article_tokens, _, orig_article_tokens = process_article(spacy_article)
    if len(article_tokens) <= ideal_summary_length_tokens:
        return spacy_article.text, 0.

    # Integer division keeps the length bounds integral under Python 3.
    min_summary_length = min(10 + len(article_tokens) // 10,
                             2 * ideal_summary_length_tokens // 3)
    max_summary_length = min(10 + len(article_tokens) // 5,
                             3 * ideal_summary_length_tokens // 2)

    # Make input data: beam search expects the same example repeated
    # beam_size times in a single batch.
    example = Example(' '.join(article_tokens), abstract='', vocab=_vocab,
                      hps=_hps)
    batch = Batch([example] * _beam_size, _hps, _vocab)

    # Generate output
    hyp, score = run_beam_search(
        _sess,
        _model,
        _vocab,
        batch,
        _beam_size,
        max_summary_length,
        min_summary_length,
        _settings.trace_path,
    )

    # Extract the output ids from the hypothesis and convert back to words.
    return process_output(hyp.token_strings[1:], orig_article_tokens), score
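# Usage sketch for generate_summary, assuming `Doc` is spacy.tokens.Doc and
# that a stock spaCy pipeline is an acceptable stand-in for the project's
# doc.spacy_text() preprocessing (the docstring notes results are best with
# the latter). The pipeline name is an assumption.
import spacy

nlp = spacy.load('en_core_web_sm')  # assumed pipeline
doc = nlp(u"Some long news article text ...")
summary, score = generate_summary(doc, ideal_summary_length_tokens=60)
print(summary)
print('score: %.3f' % score)  # ~average log-likelihood, typically in [-.5, -.2]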
def test_example():
    ex = Example(paragraph, question, answer, answer_indices, vocab, hps)
    print('enc len:', ex.enc_len)
    print('enc input:', ex.enc_input)
    print('enc input words:',
          ' '.join([vocab.id2word(i) for i in ex.enc_input]))
    print('dec len:', ex.dec_len)
    print('dec input:', ex.dec_input)
    print('dec input words:',
          ' '.join([vocab.id2word(i) for i in ex.dec_input]))
    print('dec target:', ex.target)
    print('dec target words:',
          ' '.join([vocab.id2word(i) for i in ex.target]))
    if hps.pointer_gen:
        print('enc input extend vocab:', ex.enc_input_extend_vocab)
        print('paragraph oov:', ex.paragraph_oovs)
    print('original paragraph:', ex.original_paragraph)
    print('original question:', ex.original_question)
    print('original answer:', ex.original_answer)
    print('ans start pos:', ex.answer_start_idx)
    print('ans end pos:', ex.answer_end_idx)
def output_to_batch(current_batch, results, batcher, dis_batcher):
    # Build a new generator batch and a new discriminator batch.
    example_list = []
    db_example_list = []

    for i in range(FLAGS.batch_size):
        decoded_words_all = []
        encode_words = current_batch.original_review_inputs[i]

        for j in range(FLAGS.max_dec_sen_num):
            output_ids = [int(t) for t in results['Greedy_outputs'][i][0:]]
            decoded_words = data.outputids2words(output_ids, batcher._vocab,
                                                 None)
            # Remove the [STOP] token from decoded_words, if necessary
            try:
                # index of the (first) [STOP] symbol
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass

            # Drop sentences that are too short or largely repeat the
            # previous sentence.
            if len(decoded_words) < 2:
                continue
            if len(decoded_words_all) > 0:
                new_set1 = set(decoded_words_all[-1].split())
                new_set2 = set(decoded_words)
                if len(new_set1 & new_set2) > 0.5 * len(new_set2):
                    continue

            # Make sure every sentence ends with terminal punctuation.
            if decoded_words[-1] not in ('.', '!', '?'):
                decoded_words.append('.')
            decoded_output = ' '.join(decoded_words).strip()
            decoded_words_all.append(decoded_output)

        decoded_words_all = ' '.join(decoded_words_all).strip()
        # Truncate at the document-level stop token, if present.
        try:
            fst_stop_idx = decoded_words_all.index(data.STOP_DECODING_DOCUMENT)
            decoded_words_all = decoded_words_all[:fst_stop_idx]
        except ValueError:
            pass
        decoded_words_all = decoded_words_all.replace("[UNK] ", "")
        decoded_words_all = decoded_words_all.replace("[UNK]", "")
        decoded_words_all, _ = re.subn(r"(! ){2,}", "", decoded_words_all)
        decoded_words_all, _ = re.subn(r"(\. ){2,}", "", decoded_words_all)

        if decoded_words_all.strip() == "":
            # Decoding produced nothing: fall back to the ground-truth output,
            # labeled with a small negative score for the discriminator.
            new_dis_example = bd.Example(current_batch.original_review_output[i],
                                         -0.0001, dis_batcher._vocab,
                                         dis_batcher._hps)
            new_example = Example(current_batch.original_review_output[i],
                                  batcher._vocab, batcher._hps, encode_words)
        else:
            new_dis_example = bd.Example(decoded_words_all, 1,
                                         dis_batcher._vocab, dis_batcher._hps)
            new_example = Example(decoded_words_all, batcher._vocab,
                                  batcher._hps, encode_words)

        example_list.append(new_example)
        db_example_list.append(new_dis_example)

    return (Batch(example_list, batcher._hps, batcher._vocab),
            bd.Batch(db_example_list, dis_batcher._hps, dis_batcher._vocab))
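# Shape sketch (assumption) for the `results` dict consumed above, inferred
# from the indexing results['Greedy_outputs'][i][0:]: one row of greedy-
# decoded token ids per batch entry. Shape and dtype are illustrative only;
# the inner length (40) is an arbitrary decode length.
import numpy as np

fake_results = {
    'Greedy_outputs': np.zeros((FLAGS.batch_size, 40), dtype=np.int32),
}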
def article_to_batch(self, article):
    abstract_sentences = ''
    example = Example(article, abstract_sentences, self.vocab, self.hps)  # Process into an Example.
    # Repeat the example batch_size times so the decoder sees one article per batch.
    repeated_example = [example for _ in range(self.hps.batch_size)]
    return Batch(repeated_example, self.hps, self.vocab)
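# Usage sketch: article_to_batch only needs `self.vocab` and `self.hps`, so
# any thin wrapper carrying those attributes can host it. DecodeHelper is
# hypothetical, shown only to make the call shape concrete.
class DecodeHelper(object):
    def __init__(self, vocab, hps):
        self.vocab = vocab
        self.hps = hps

    article_to_batch = article_to_batch  # bind the function above as a method

# helper = DecodeHelper(vocab, hps)
# batch = helper.article_to_batch("raw article text to decode")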