def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (background_text, context_text, response_text, span_text, b_start, b_end, r_start, r_end, example_id) = next(input_gen) except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) example = Example(background_text, context_text, response_text, span_text, b_start, b_end, r_start, r_end, example_id, self._vocab, self._hps) self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except (StopIteration, RuntimeError):  # if there are no more examples
            # (under PEP 479, a StopIteration escaping the generator surfaces as RuntimeError)
            logger.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                logger.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # Use the <s> and </s> tags in abstract to get a list of sentences.
        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # place the Example in the example queue.
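# The fill_example_queue variants above and below all wrap a text_generator helper that is not
# shown in this section. The following is a minimal sketch of what such a helper typically looks
# like in a pointer-generator-style batcher, assuming data.example_generator yields tf.Example
# protos with 'article' and 'abstract' byte features and that tensorflow is imported as tf;
# adapt the feature names and error handling to your own schema.
def text_generator(self, example_generator):
    """Yields (article, abstract) pairs of byte strings extracted from tf.Example protos."""
    while True:
        try:
            e = next(example_generator)  # e is a tf.Example
        except StopIteration:
            return  # end this generator cleanly so callers see StopIteration, not RuntimeError
        try:
            article_text = e.features.feature['article'].bytes_list.value[0]
            abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        except (ValueError, IndexError):
            tf.logging.error('Failed to get article or abstract from example')
            continue
        if len(article_text) == 0:
            tf.logging.warning('Found an example with empty article text. Skipping it.')
        else:
            yield (article_text, abstract_text)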
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass, self._hps.decode_only, self._hps.language)) while True: try: (article, abstract, tags, title) = next(input_gen) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: raise Exception("single_pass mode is off but the example generator is out of data; error.") if self._hps.mode in ['train', 'eval']: abstract_sentences_all = data.abstract2sents(abstract); # Use the <s> and </s> tags in abstract to get a list of sentences. for i in range(self._hps.max_keyphrase_num): sent = abstract_sentences_all[i % len(abstract_sentences_all)] abstract_sentences = [sent.strip()] example = Example(title, article, tags, abstract_sentences, abstract_sentences_all, self._vocab, self._hps, self._stop_words) # Process into an Example. self._example_queue.put(example) # place the Example in the example queue. elif self._hps.mode == "decode": abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(title, article, tags, [abstract_sentences[0]], abstract_sentences, self._vocab, self._hps, self._stop_words) # Process into an Example. self._example_queue.put(example) # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" print self._single_pass input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (comment, label, keywords, topics) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) example = Example(comment, label, keywords, topics, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def test_batch():
    # python test_batcher.py --data_path=../data/squad-v1/dev_raw.json --pointer_gen
    input_gen = text_generator(
        data.example_generator(hps.data_path, hps.single_pass))
    example_list = []
    for _ in range(hps.batch_size):
        p, q, a, ap = next(input_gen)
        example_list.append(Example(p, q, a, ap, vocab, hps))
    batch = Batch(example_list, hps, vocab)

    print('batch answer pos:', batch.ans_indices)
    print('enc batch:', batch.enc_batch)
    print('enc batch words:',
          id2sentence(batch.enc_batch, vocab, batch.para_oovs_batch))
    print('enc len:', batch.enc_lens)
    if hps.pointer_gen:
        print('max para oovs:', batch.max_para_oovs)
        print('para oovs:', batch.para_oovs_batch)
        print('enc batch extend vocab:', batch.enc_batch_extend_vocab)
        print('enc batch extend vocab words:',
              id2sentence(batch.enc_batch_extend_vocab, vocab, batch.para_oovs_batch))
    print('dec batch:', batch.dec_batch)
    print('dec batch words:',
          id2sentence(batch.dec_batch, vocab, batch.para_oovs_batch))
    print('target batch:', batch.target_batch)
    print('tgt batch words:',
          id2sentence(batch.target_batch, vocab, batch.para_oovs_batch))
    print('origin para:', batch.original_paragraphs)
    print('origin question:', batch.original_questions)
    print('origin answer:', batch.original_answers)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (article, abstract) = next(input_gen)
        except StopIteration:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        article = [sent.strip() for sent in data.article2sents(article)]
        example = Example(article, abstract_sentences, self._vocab, self._concept_vocab)
        self._example_queue.put(example)
def fill_example_queue(self):
    # create a generator object
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
            article, abstract = article.decode(), abstract.decode()
        except StopIteration:  # if there are no more examples:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # split the abstract into sentences
        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        # print("abstract_sentences:", abstract_sentences)
        example = Example(article, abstract_sentences, self._vocab)  # process into an Example.
        self._example_queue.put(example)  # place the Example object in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (article, abstract) = next( input_gen ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) abstract_sentences = [ abstract.strip() ] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            if self._single_pass:
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]  # Use the <s> and </s> tags in abstract to get a list of sentences.
        # abstract = str(abstract, encoding='utf8')
        abstract_sentences = [abstract]
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # place the Example in the example queue.
def read_articles_abstracts(source_dir, dataset_split):
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)
    print('Creating list')
    ex_list = [ex for ex in example_generator]
    a = 0
def test_stop():
    from batcher import text_generator
    input_gen = text_generator(
        data.example_generator(hps.data_path, hps.single_pass))
    i = 0
    while True:
        sample = next(input_gen)
        i += 1
        print(i, sample[0])
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass)) self._example_queue = [] for item in input_gen: article, abstract = str(item[0]), str(item[1]) abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] example = Example(article, abstract_sentences, self._vocab, self._hps) if example.flag == False: self._example_queue.append(example)
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text)
    if singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']], [0, 0]))
    if singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']], [0, 0]))

    total = len(source_files) * 1000 if ('cnn' in dataset_articles
                                         or 'newsroom' in dataset_articles) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)
    ex_gen = example_generator_extended(example_generator, total, single_feat_len, pair_feat_len)
    print('Creating list')
    ex_list = [ex for ex in ex_gen]
    print('Converting...')
    list(futures.map(load_and_evaluate_example, ex_list))
    # for ex in tqdm(ex_list, total=total):
    #     load_and_evaluate_example(ex)

    print('Evaluating ROUGE...')
    results_dict = rouge_eval_references.rouge_eval(ref_dir, dec_dir)
    # print("Results_dict: ", results_dict)
    rouge_eval_references.rouge_log(results_dict, my_log_dir)

    util.print_execution_time(start_time)
def fill_example_queue(self):
    input_gen = text_generator(data.example_generator(self.examples, self.single_pass))
    while True:
        try:
            # read the next example from file. paragraph, question and answer are all strings.
            (paragraph, question, answer, answer_position) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            self._finished_reading = True
            print('finished reading one round of data')
            break
        example = Example(paragraph, question, answer, answer_position, self.vocab,
                          self.max_enc_steps, self.max_dec_steps, self.dynamic_vocab)
        self._example_queue.put(example)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            self._finished_reading = True
            break

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab, self._hps)
        self._example_queue.put(example)
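# A queue-filling method like the ones above only does useful work when it runs on a background
# thread feeding a bounded queue. Below is a minimal sketch of how such threads are commonly
# started in a Batcher constructor; the attribute and parameter names here are illustrative
# assumptions, not taken from the code in this section.
import queue
from threading import Thread

def _start_example_queue_threads(self, num_threads=16, queue_max=100):
    """Spawn daemon threads that each run fill_example_queue to keep the queue topped up."""
    self._example_queue = queue.Queue(queue_max)  # bounded, so readers apply back-pressure
    self._example_q_threads = []
    for _ in range(num_threads):
        t = Thread(target=self.fill_example_queue)
        t.daemon = True  # daemon threads exit when the main program exits
        t.start()
        self._example_q_threads.append(t)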
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data, self._single_pass,self._device_id, data_as_tf_example=self._data_as_tf_example)) count = 0 query = None word_edge_list = None query_edge_list = None if self._data_as_tf_example: while True: try: article, abstract, word_edge_list, query, query_edge_list, epoch_num = input_gen.next() # read the next example from file. article and abstract are both strings. #tf.logging.info(random.randint(1,101)) except StopIteration: # if there are no more examples: tf.logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: raise Exception("single_pass mode is off but the example generator is out of data; error.") abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps, word_edge_list=word_edge_list, query=query, query_edge_list=query_edge_list, epoch_num=epoch_num, bert_vocab=self.bert_vocab) self._example_queue.put(example) else: while True: try: curr_data = input_gen.next() count = count + 1 article = curr_data['article'] abstract = curr_data['abstract'].strip() if self._hps.word_gcn.value: word_edge_list = curr_data['word_edge_list'] if self._hps.query_encoder.value: query = curr_data['query'] if self._hps.query_gcn.value: query_edge_list = curr_data['query_edge_list'] except Exception as e: # if there are no more examples: tf.logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: tf.logging.info(e) raise Exception("single_pass mode is off but the example generator is out of data; error.") abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps, word_edge_list=word_edge_list, query=query, query_edge_list=query_edge_list, epoch_num=epoch_num) self._example_queue.put(example) # place the Example in the example queue.
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    vocab_counter = collections.Counter()
    for dataset_split in dataset_splits:
        source_dir = os.path.join(FLAGS.data_root, FLAGS.dataset_name)
        source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
        total = len(source_files) * 1000
        example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                                   True, False, should_check_valid=False)

        for example_idx, example in enumerate(tqdm(example_generator, total=total)):
            raw_article_sents, article, abstracts, doc_indices = util.unpack_tf_example(
                example, names_to_types)
            article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
            # groundtruth_summ_sent_tokens = [sent.strip().split() for sent in groundtruth_summary_text.strip().split('\n')]
            groundtruth_summ_sent_tokens = [
                [token for token in abstract.strip().split() if token not in ['<s>', '</s>']]
                for abstract in abstracts
            ]
            all_tokens = util.flatten_list_of_lists(article_sent_tokens) + \
                         util.flatten_list_of_lists(groundtruth_summ_sent_tokens)
            vocab_counter.update(all_tokens)

    print("Writing vocab file...")
    with open(os.path.join('logs', "vocab_" + FLAGS.dataset_name), 'w') as writer:
        for word, count in vocab_counter.most_common(VOCAB_SIZE):
            writer.write(word + ' ' + str(count) + '\n')
    print("Finished writing vocab file")
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
    total = len(source_files) * 1000
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)

    # Read output of BERT and put into a dictionary with:
    #   key=(article idx, source indices {a tuple of length 1 or 2, depending on whether it is a singleton or a pair})
    #   value=score
    qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
    ex_gen = example_generator_extended(example_generator, total, qid_ssi_to_importances,
                                        None, FLAGS.singles_and_pairs)
    print('Creating list')
    ex_list = [ex for ex in ex_gen]

    # # Main function to get results on all test examples
    # pool = mp.Pool(mp.cpu_count())
    # ssi_list = list(tqdm(pool.imap(evaluate_example, ex_list), total=total))
    # pool.close()

    # Main function to get results on all test examples
    ssi_list = list(map(evaluate_example, ex_list))

    # save ssi_list
    with open(os.path.join(my_log_dir, 'ssi.pkl'), 'wb') as f:
        pickle.dump(ssi_list, f)
    with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f:
        ssi_list = pickle.load(f)
    print('Evaluating BERT model F1 score...')
    suffix = util.all_sent_selection_eval(ssi_list)
    print('Evaluating ROUGE...')
    results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
    rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    ssis_restricted = [ssi_triple[1][:ssi_triple[2]] for ssi_triple in ssi_list]
    ssi_lens = [len(source_indices) for source_indices in util.flatten_list_of_lists(ssis_restricted)]
    num_singles = ssi_lens.count(1)
    num_pairs = ssi_lens.count(2)
    print('Percent singles/pairs: %.2f %.2f' % (
        num_singles * 100. / len(ssi_lens), num_pairs * 100. / len(ssi_lens)))

    util.print_execution_time(start_time)
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
            total = len(source_files) * 1000
            example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                                       True, False, should_check_valid=False)
            out_dir = os.path.join('data', 'bert', dataset_name, 'article_embeddings', 'input_article')
            util.create_dirs(out_dir)
            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb')
            inst_id = 0
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article = ' '.join(raw_article_sents)
                writer.write((article + '\n').encode())
def fill_example_queue(self):
    input_gen = self.text_generator(data.example_generator(self._data_path))
    while True:
        try:
            band_name = next(input_gen)
        except StopIteration:
            print("The example generator has exhausted saved_data")
            raise Exception("single_pass off: Error! example generator out of saved_data.")
        example = Example(band_name, self._vocab, self._hps)
        self._example_queue.put(example)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception("single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]  # Use the <s> and </s> tags in abstract to get a list of sentences.
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500)) # counter = 0 while True: try: ( article, abstracts, doc_indices_str, raw_article_sents ) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) all_abstract_sentences = [[ sent.strip() for sent in data.abstract2sents(abstract) ] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) f = 0 d = open("cont.txt", "w+") while True: try: (article, abstract, rel_scores) = next(input_gen) f = f + 1 #print("article") #print("article"+str(article)) #print("abstract") #print("abstract"+str(abstract)) #print("rel_scores") #print("rel_scores"+str(rel_scores)) #input_gen.next() # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) #print(article) #print(abstract) #print(rel_scores) #abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract, rel_scores, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue. d.write("ASDasdwqwq ewqewqewq ewq ewqe \n") d.write(str(f) + "\n")
def fill_example_queue(self): """ Reads data from file and processes into Examples which are then placed into the example queue. """ input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: # read the next example from file. article and abstract are both strings. (article, abstract) = input_gen.next() except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted " "data.") if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread " "is stopping.") self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) example = Example(article, abstract, self._vocab, self._hps) if self._hps.attn_only_entities: n_people_enc_tokens = sum( 1 for token in example.enc_input[:self._hps.max_enc_steps] if 3 <= token < 3 + len(data.PERSON_TOKENS)) if n_people_enc_tokens < 2: continue self._example_queue.put(example)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    f = open("inputs.txt", "a")
    while True:
        try:
            # read the next example from file. source1, source2 and target are all strings.
            (source1, source2, target) = next(input_gen)
            f.write(source1 + "\t" + source2 + "\t" + target + "\n")
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception("single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(target)]  # Use the <s> and </s> tags in target to get a list of sentences.
        # Earlier variants that were tried:
        # example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        # example = Example2(article, ' '.join(abstract_sentences), abstract_sentences, self._vocab)
        # example = Example2(' '.join(abstract_sentences), article, abstract_sentences, self._vocab)
        # example = Example2(article, article, abstract_sentences, self._vocab)
        # example = Example2(' '.join(abstract_sentences), ' '.join(abstract_sentences), abstract_sentences, self._vocab)
        example = Example2(source1, source2, target.split(), self._vocab)
        self._example_queue.put(example)  # place the Example in the example queue.
    f.close()
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    source_files = sorted(glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))
    total = len(source_files) * 1000 if ('cnn' in FLAGS.dataset_name
                                         or 'newsroom' in FLAGS.dataset_name
                                         or 'xsum' in FLAGS.dataset_name) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + FLAGS.dataset_split + '*',
                                               True, False, should_check_valid=False)

    for example_idx, example in enumerate(tqdm(example_generator, total=total)):
        raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
            example, names_to_types)
        article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
        groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
        if doc_indices is None:
            doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
        doc_indices = [int(doc_idx) for doc_idx in doc_indices]
        rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(
            doc_indices, article_sent_tokens)
        groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
            groundtruth_similar_source_indices_list, FLAGS.sentence_limit)
def fill_example_queue(self): """ Reads data from file and processes into Examples which are then placed into the example queue. """ input_gen = data.example_generator(self.data_path, self.single_pass) while True: try: # read the next example from file. sentence and label are both strings. line = input_gen.next() except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self.single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self.finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) if len(line) == 0: tf.logging.warning( "Found an example with empty sentence text. Skipping it.") else: # Process into an Example. fields = line.split("\t") if len(fields) == 2: sentence, label = fields[0], fields[1] else: sentence, label = fields[0], "" example = Example(sentence, label, self.vocab, self.hps) # place the Example in the example queue. self.example_queue.put(example)
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'singles':
        FLAGS.sentence_limit = 1
    else:
        FLAGS.sentence_limit = 2

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                # dataset_splits = ['val_test', 'test', 'val', 'train']
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            if dataset_split == 'val_test':
                source_dataset_split = 'val'
            else:
                source_dataset_split = dataset_split

            source_files = sorted(glob.glob(source_dir + '/' + source_dataset_split + '*'))
            total = len(source_files) * 1000
            example_generator = data.example_generator(source_dir + '/' + source_dataset_split + '*',
                                                       True, False, should_check_valid=False)
            out_dir = os.path.join('data', 'bert', dataset_name, FLAGS.singles_and_pairs, 'input')
            util.create_dirs(out_dir)

            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb')
            header_list = ['should_merge', 'sent1', 'sent2', 'example_idx', 'inst_id', 'ssi']
            writer.write(('\t'.join(header_list) + '\n').encode())
            inst_id = 0
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]
                groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
                if dataset_name != 'duc_2004' or doc_indices is None or (
                        dataset_name != 'duc_2004' and
                        len(doc_indices) != len(util.flatten_list_of_lists(article_sent_tokens))):
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                rel_sent_indices, _, _ = ssi_functions.get_rel_sent_indices(doc_indices, article_sent_tokens)
                similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                possible_pairs = [x for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))]  # all pairs
                possible_pairs = filter_pairs_by_sent_position(possible_pairs, rel_sent_indices=rel_sent_indices)
                possible_singles = [(i,) for i in range(len(raw_article_sents))]
                positives = [ssi for ssi in similar_source_indices_list]

                if dataset_split == 'test' or dataset_split == 'val_test':
                    if FLAGS.singles_and_pairs == 'singles':
                        possible_combinations = possible_singles
                    else:
                        possible_combinations = possible_pairs + possible_singles
                    negatives = [ssi for ssi in possible_combinations
                                 if not (ssi in positives or ssi[::-1] in positives)]

                    for ssi_idx, ssi in enumerate(positives):
                        if len(ssi) == 0:
                            continue
                        if chronological_ssi and len(ssi) >= 2:
                            if ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                        writer.write(get_string_bert_example(raw_article_sents, ssi, 1, example_idx, inst_id).encode())
                        inst_id += 1
                    for ssi in negatives:
                        writer.write(get_string_bert_example(raw_article_sents, ssi, 0, example_idx, inst_id).encode())
                        inst_id += 1
                else:
                    positive_sents = list(set(util.flatten_list_of_lists(positives)))
                    negative_pairs = [pair for pair in possible_pairs
                                      if not any(i in positive_sents for i in pair)]
                    negative_singles = [sing for sing in possible_singles if not sing[0] in positive_sents]
                    random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist()
                    random_negative_singles = np.random.permutation(len(negative_singles)).tolist()

                    for ssi in similar_source_indices_list:
                        if len(ssi) == 0:
                            continue
                        if chronological_ssi and len(ssi) >= 2:
                            if ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                        is_pair = len(ssi) == 2
                        writer.write(get_string_bert_example(raw_article_sents, ssi, 1, example_idx, inst_id).encode())
                        inst_id += 1

                        # False (negative) sentence single/pair
                        if is_pair:
                            if len(random_negative_pairs) == 0:
                                continue
                            negative_indices = negative_pairs[random_negative_pairs.pop()]
                        else:
                            if len(random_negative_singles) == 0:
                                continue
                            negative_indices = negative_singles[random_negative_singles.pop()]
                        article_lcs_paths = None
                        writer.write(get_string_bert_example(raw_article_sents, negative_indices, 0, example_idx, inst_id).encode())
                        inst_id += 1
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.summarizer == 'all':
        summary_methods = list(summarizers.keys())
    else:
        summary_methods = [FLAGS.summarizer]
    if FLAGS.dataset_name == 'all':
        dataset_names = datasets
    else:
        dataset_names = [FLAGS.dataset_name]

    sheets_strs = []
    for summary_method in summary_methods:
        summary_fn = summarizers[summary_method]
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name

            original_dataset_name = 'xsum' if 'xsum' in dataset_name else 'cnn_dm' if 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name else ''
            vocab = Vocab('logs/vocab' + '_' + original_dataset_name, 50000)  # create a vocabulary

            source_dir = os.path.join(data_dir, dataset_name)
            source_files = sorted(glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))
            total = len(source_files) * 1000 if ('cnn' in dataset_name or 'newsroom' in dataset_name
                                                 or 'xsum' in dataset_name) else len(source_files)
            example_generator = data.example_generator(source_dir + '/' + FLAGS.dataset_split + '*',
                                                       True, False, should_check_valid=False)

            if dataset_name == 'duc_2004':
                abs_source_dir = os.path.join(os.path.expanduser('~') + '/data/tf_data/with_coref', dataset_name)
                abs_example_generator = data.example_generator(abs_source_dir + '/' + FLAGS.dataset_split + '*',
                                                               True, False, should_check_valid=False)
                abs_names_to_types = [('abstract', 'string_list')]

            triplet_ssi_list = []
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if dataset_name == 'duc_2004':
                    abs_example = next(abs_example_generator)
                    groundtruth_summary_texts = util.unpack_tf_example(abs_example, abs_names_to_types)
                    groundtruth_summary_texts = groundtruth_summary_texts[0]
                    groundtruth_summ_sents_list = [[sent.strip() for sent in data.abstract2sents(abstract)]
                                                   for abstract in groundtruth_summary_texts]
                else:
                    groundtruth_summary_texts = [groundtruth_summary_text]
                    groundtruth_summ_sents_list = []
                    for groundtruth_summary_text in groundtruth_summary_texts:
                        groundtruth_summ_sents = [sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]
                        groundtruth_summ_sents_list.append(groundtruth_summ_sents)

                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                if doc_indices is None:
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                log_dir = os.path.join(log_root, dataset_name + '_' + summary_method)
                dec_dir = os.path.join(log_dir, 'decoded')
                ref_dir = os.path.join(log_dir, 'reference')
                util.create_dirs(dec_dir)
                util.create_dirs(ref_dir)

                parser = PlaintextParser.from_string(' '.join(raw_article_sents), Tokenizer("english"))
                summarizer = summary_fn()
                summary = summarizer(parser.document, 5)  # summarize the document with 5 sentences
                summary = [str(sentence) for sentence in summary]
                summary_tokenized = []
                for sent in summary:
                    summary_tokenized.append(sent.lower())

                rouge_functions.write_for_rouge(groundtruth_summ_sents_list, summary_tokenized,
                                                example_idx, ref_dir, dec_dir, log=False)

                decoded_sent_tokens = [sent.split() for sent in summary_tokenized]
                sentence_limit = 2
                sys_ssi_list, _, _ = get_simple_source_indices_list(decoded_sent_tokens, article_sent_tokens,
                                                                    vocab, sentence_limit, min_matched_tokens)
                triplet_ssi_list.append((groundtruth_similar_source_indices_list, sys_ssi_list, -1))

            print('Evaluating Lambdamart model F1 score...')
            suffix = util.all_sent_selection_eval(triplet_ssi_list)
            print(suffix)

            results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
            print(("Results_dict: ", results_dict))
            sheets_str = rouge_functions.rouge_log(results_dict, log_dir, suffix=suffix)
            sheets_strs.append(dataset_name + '_' + summary_method + '\n' + sheets_str)

    for sheets_str in sheets_strs:
        print(sheets_str + '\n')
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'
    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix,
                                                       [['single', '.'], ['sentence', '.']], [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix,
                                                   [['single', '.'], ['sentence', '.']], [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))

        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))

        total = len(source_files) * 1000 if ('cnn' in in_dataset or 'newsroom' in in_dataset
                                             or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(source_dir + '/' + split + '*',
                                                   True, False, should_check_valid=False)
        # for example in tqdm(example_generator, total=total):
        ex_gen = example_generator_extended(example_generator, total, single_feat_len,
                                            pair_feat_len, FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        if FLAGS.num_instances != -1:
            ex_list = ex_list[:FLAGS.num_instances]

        print('Converting...')
        # all_features = pool.map(convert_article_to_lambdamart_features, ex_list)
        # all_features = ray.get([convert_article_to_lambdamart_features.remote(ex) for ex in ex_list])
        if FLAGS.lr:
            all_instances = list(futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = [inst.features for inst in all_instances]
            x = np.array(x)
            y = [inst.relevance for inst in all_instances]
            y = np.expand_dims(np.array(y), 1)
            x_y = np.concatenate((x, y), 1)
            np.save(writer, x_y)
        else:
            list(futures.map(convert_article_to_lambdamart_features, ex_list))
            # writer.write(''.join(all_features))

            # all_features = []
            # for example in tqdm(ex_gen, total=total):
            #     all_features.append(convert_article_to_lambdamart_features(example))
            # all_features = util.flatten_list_of_lists(all_features)
            # num1 = sum(x == 1 for x in all_features)
            # num2 = sum(x == 2 for x in all_features)
            # print 'Single sent: %d instances. Pair sent: %d instances.' % (num1, num2)

            # for example in tqdm(ex_gen, total=total):
            #     features = convert_article_to_lambdamart_features(example)
            #     writer.write(features)

        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        writer = open(final_out_path, 'wb')
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                text = f.read()
            writer.write(text)
        writer.close()

    util.print_execution_time(start_time)
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix,
                                                       [['single', '.'], ['sentence', '.']], [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix,
                                                   [['single', '.'], ['sentence', '.']], [0, 0], [0, 0]))

    total = len(source_files) * 1000 if ('cnn' in dataset_articles
                                         or 'newsroom' in dataset_articles) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)

    if FLAGS.mode == 'write_to_file':
        ex_gen = example_generator_extended(example_generator, total, single_feat_len,
                                            pair_feat_len, FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        print('Converting...')
        # if len(sys.argv) > 1 and sys.argv[1] == '-m':
        list(futures.map(write_to_lambdamart_examples_to_file, ex_list))
        # else:
        #     instances_list = []
        #     for ex in tqdm(ex_list):
        #         instances_list.append(write_to_lambdamart_examples_to_file(ex))

        file_names = sorted(glob.glob(os.path.join(temp_in_dir, '*')))
        instances_str = ''
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                instances_str += f.read()
        with open(temp_in_path, 'w') as f:
            f.write(instances_str)

    # RUN LAMBDAMART SCORING COMMAND HERE

    if FLAGS.mode == 'generate_summaries':
        qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
        ex_gen = example_generator_extended(example_generator, total, qid_ssi_to_importances,
                                            pair_feat_len, FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        ssi_list = list(futures.map(evaluate_example, ex_list))

        # save ssi_list (binary mode so pickle works under Python 3)
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'wb') as f:
            pickle.dump(ssi_list, f)
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f:
            ssi_list = pickle.load(f)
        print('Evaluating Lambdamart model F1 score...')
        suffix = util.all_sent_selection_eval(ssi_list)
        # for ex in tqdm(ex_list, total=total):
        #     load_and_evaluate_example(ex)

        print('Evaluating ROUGE...')
        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
        # print("Results_dict: ", results_dict)
        rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    util.print_execution_time(start_time)