def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (article, abstract) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) abstract_sentences = [ sent.strip() for sent in data.abstract2sents(abstract) ] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (article, abstract) = input_gen.next()
        except StopIteration:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [
            sent.strip() for sent in data.abstract2sents(abstract)]
        article = [sent.strip() for sent in data.article2sents(article)]
        example = Example(article, abstract_sentences, self._vocab,
                          self._concept_vocab)
        self._example_queue.put(example)
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    source_dir = os.path.join(data_dir, FLAGS.dataset)
    source_files = sorted(glob.glob(source_dir + '/*'))

    for i in range(4):
        ref_dir = os.path.join(log_dir, 'reference_' + str(i), 'reference')
        dec_dir = os.path.join(log_dir, 'reference_' + str(i), 'decoded')
        util.create_dirs(ref_dir)
        util.create_dirs(dec_dir)
        for source_idx, source_file in enumerate(source_files):
            human_summary_texts = get_human_summary_texts(source_file)
            summaries = []
            for summary_text in human_summary_texts:
                summary = data.abstract2sents(summary_text)
                summaries.append(summary)
            candidate = summaries[i]
            references = [
                summaries[idx] for idx in range(len(summaries)) if idx != i]
            rouge_functions.write_for_rouge(references, candidate, source_idx,
                                            ref_dir, dec_dir)
        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
        # print("Results_dict: ", results_dict)
        rouge_functions.rouge_log(results_dict,
                                  os.path.join(log_dir, 'reference_' + str(i)))
def fill_example_queue(self):
    # Create a generator object over the input data.
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # Read the next example from file. article and abstract are both strings.
            (article, abstract) = input_gen.__next__()
            article, abstract = article.decode(), abstract.decode()
        except StopIteration:  # if there are no more examples:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # Split the abstract into sentences.
        abstract_sentences = [
            sent.strip() for sent in data.abstract2sents(abstract)]
        # print("abstract_sentences:", abstract_sentences)
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # Place the Example object into the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # Read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except RuntimeError:
            # If there are no more examples. (Under PEP 479, a StopIteration that
            # escapes from inside the generator body surfaces here as RuntimeError.)
            logger.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                logger.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # Use the <s> and </s> tags in abstract to get a list of sentences.
        abstract_sentences = [
            sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # Place the Example in the example queue.
def get_summary_from_example(e):
    summary_texts = []
    # The abstract texts were saved under the key 'abstract' in the data files.
    for abstract in e.features.feature['abstract'].bytes_list.value:
        summary_texts.append(abstract)
    all_abstract_sentences = [
        [sent.strip() for sent in data.abstract2sents(abstract)]
        for abstract in summary_texts]
    summary_text = '\n'.join(all_abstract_sentences[0])
    return summary_text
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass)) self._example_queue = [] for item in input_gen: article, abstract = str(item[0]), str(item[1]) abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] example = Example(article, abstract_sentences, self._vocab, self._hps) if example.flag == False: self._example_queue.append(example)
def get_decode_data(hps, vocab, data_path, randomize=False):
    tf.logging.info('Fetching data..')
    filelist = glob.glob(data_path)
    inputs = []
    total_examples = 0
    total_batches = 0
    for f in filelist:
        reader = open(f, 'rb')
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                break
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            e = example_pb2.Example.FromString(example_str)
            try:
                article_text = e.features.feature['article'].bytes_list.value[0].decode()
                if len(article_text) == 0:
                    # tf.logging.warning('Found an example with empty article text. Skipping it.')
                    pass
                else:
                    abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode()
                    abstract_sentences = [
                        sent.strip() for sent in data.abstract2sents(abstract_text)]
                    example = Example(article_text, abstract_sentences, vocab, hps)
                    inputs.append(example)
                    total_examples = total_examples + 1
            except ValueError:
                # tf.logging.error('Failed to get article or abstract from example')
                continue

    batches = []
    tf.logging.info('Creating batches..')
    if randomize:
        random.shuffle(inputs)
        example = inputs[0]
        b = [example for _ in range(hps.beam_size)]
        batches.append(Batch(b, hps, vocab))
        total_batches = 1
        total_examples = 1
    else:
        for i in range(0, len(inputs)):
            b = [inputs[i] for _ in range(hps.beam_size)]
            batches.append(Batch(b, hps, vocab))
            total_batches = total_batches + 1
    tf.logging.info('[TOTAL Batches] : %i', total_batches)
    tf.logging.info('[TOTAL Examples] : %i', total_examples)
    tf.logging.info('Creating batches..COMPLETE')
    return batches
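The readers in this section all consume the same length-prefixed binary record format: an 8-byte struct-packed length followed by a serialized tf.Example. For reference, a writer for that format might look like the following sketch, inferred from the reader loops rather than taken from any of these projects.

# Sketch, inferred from the reader loop above: each record is an 8-byte
# struct-packed length followed by the serialized tf.Example bytes.
# `writer` is assumed to be a file object opened in binary mode ('wb').
import struct
from tensorflow.core.example import example_pb2

def write_example(writer, article, abstract):
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
    tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))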
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            self._finished_reading = True
            break

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab, self._hps)
        self._example_queue.put(example)
def get_specific_example(hps, vocab, example_number):
    file_id, number = divmod(example_number, 1000)
    path = '/home/ubuntu/W266/final_0/W266_Final/data/final_chunked/validation_%03d.bin' % file_id
    print(f'Fetching example {number} from: {path}')
    filelist = glob.glob(path)
    inputs = []
    total_examples = 0
    total_batches = 0
    for f in filelist:
        reader = open(f, 'rb')
        while True:
            len_bytes = reader.read(8)
            if not len_bytes:
                break
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
            e = example_pb2.Example.FromString(example_str)
            try:
                article_text = e.features.feature['article'].bytes_list.value[0].decode()
                if len(article_text) == 0:
                    # tf.logging.warning('Found an example with empty article text. Skipping it.')
                    pass
                else:
                    abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode()
                    abstract_sentences = [
                        sent.strip() for sent in data.abstract2sents(abstract_text)]
                    example = Example(article_text, abstract_sentences, vocab, hps)
                    inputs.append(example)
                    total_examples = total_examples + 1
            except ValueError:
                # tf.logging.error('Failed to get article or abstract from example')
                continue

    batches = []
    tf.logging.info('Creating batches..')
    example = inputs[number]
    b = [example for _ in range(hps.beam_size)]
    batches.append(Batch(b, hps, vocab))
    total_batches = 1
    total_examples = 1
    tf.logging.info('[TOTAL Batches] : %i', total_batches)
    tf.logging.info('[TOTAL Examples] : %i', total_examples)
    tf.logging.info('Creating batches..COMPLETE')
    return batches
def fill_example_queue(self):
    input_gen = self.text_generator()
    while True:
        try:
            # Read the next example from file. content, query and summary are all strings.
            (content, query, summary) = input_gen.next()
        except StopIteration:  # if there are no more examples:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # Use the <s> and </s> tags to get a list of sentences from each field.
        content_sentence = [sent.strip() for sent in data.abstract2sents(content)]
        query_sentence = [sent.strip() for sent in data.abstract2sents(query)]
        summary_sentence = [sent.strip() for sent in data.abstract2sents(summary)]
        example = Example(content_sentence, query_sentence, summary_sentence,
                          self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # Place the Example in the example queue.
def example_generator(self):
    # Generator over (article, abstract) pairs.
    while True:
        try:
            (article, abstract) = self.text_gen.__next__()
        except StopIteration:
            print("example generator: iteration finished")
            break
        # Split the abstract into sentences.
        abstract_sentences = [
            sent.strip() for sent in data.abstract2sents(abstract)]
        # Process into an Example.
        example = Example(article, abstract_sentences[0], self._vocab)
        # Yield the Example object to the queue consumer.
        yield example
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # Read the next example from file. article and abstract are both strings.
            (article, abstract) = input_gen.next()
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception("single_pass mode is off but the example generator is out of data; error.")

        # Use the <s> and </s> tags in abstract to get a list of sentences.
        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # Place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500)) # counter = 0 while True: try: ( article, abstracts, doc_indices_str, raw_article_sents ) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) all_abstract_sentences = [[ sent.strip() for sent in data.abstract2sents(abstract) ] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    f = open("inputs.txt", "a")
    while True:
        try:
            # Read the next example from file. source1, source2 and target are all strings.
            (source1, source2, target) = input_gen.next()
            f.write(source1 + "\t" + source2 + "\t" + target + "\n")
        except StopIteration:  # if there are no more examples:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # Use the <s> and </s> tags in the target to get a list of sentences.
        abstract_sentences = [sent.strip() for sent in data.abstract2sents(target)]
        # example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        # example = Example2(article, ' '.join(abstract_sentences), abstract_sentences, self._vocab)
        # example = Example2(' '.join(abstract_sentences), article, abstract_sentences, self._vocab)
        # example = Example2(article, article, abstract_sentences, self._vocab)
        # example = Example2(' '.join(abstract_sentences), ' '.join(abstract_sentences), abstract_sentences, self._vocab)
        example = Example2(source1, source2, target.split(), self._vocab)
        self._example_queue.put(example)  # Place the Example in the example queue.
    f.close()
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    if FLAGS.summarizer == 'all':
        summary_methods = list(summarizers.keys())
    else:
        summary_methods = [FLAGS.summarizer]
    if FLAGS.dataset_name == 'all':
        dataset_names = datasets
    else:
        dataset_names = [FLAGS.dataset_name]

    sheets_strs = []
    for summary_method in summary_methods:
        summary_fn = summarizers[summary_method]
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name
            original_dataset_name = 'xsum' if 'xsum' in dataset_name else 'cnn_dm' if 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name else ''

            vocab = Vocab('logs/vocab' + '_' + original_dataset_name, 50000)  # create a vocabulary
            source_dir = os.path.join(data_dir, dataset_name)
            source_files = sorted(glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))

            total = len(source_files) * 1000 if ('cnn' in dataset_name or 'newsroom' in dataset_name or 'xsum' in dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + FLAGS.dataset_split + '*', True, False,
                should_check_valid=False)

            if dataset_name == 'duc_2004':
                abs_source_dir = os.path.join(
                    os.path.expanduser('~') + '/data/tf_data/with_coref', dataset_name)
                abs_example_generator = data.example_generator(
                    abs_source_dir + '/' + FLAGS.dataset_split + '*', True, False,
                    should_check_valid=False)
                abs_names_to_types = [('abstract', 'string_list')]

            triplet_ssi_list = []
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if dataset_name == 'duc_2004':
                    abs_example = next(abs_example_generator)
                    groundtruth_summary_texts = util.unpack_tf_example(abs_example, abs_names_to_types)
                    groundtruth_summary_texts = groundtruth_summary_texts[0]
                    groundtruth_summ_sents_list = [
                        [sent.strip() for sent in data.abstract2sents(abstract)]
                        for abstract in groundtruth_summary_texts]
                else:
                    groundtruth_summary_texts = [groundtruth_summary_text]
                    groundtruth_summ_sents_list = []
                    for groundtruth_summary_text in groundtruth_summary_texts:
                        groundtruth_summ_sents = [
                            sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]
                        groundtruth_summ_sents_list.append(groundtruth_summ_sents)
                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                if doc_indices is None:
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                log_dir = os.path.join(log_root, dataset_name + '_' + summary_method)
                dec_dir = os.path.join(log_dir, 'decoded')
                ref_dir = os.path.join(log_dir, 'reference')
                util.create_dirs(dec_dir)
                util.create_dirs(ref_dir)

                parser = PlaintextParser.from_string(' '.join(raw_article_sents), Tokenizer("english"))
                summarizer = summary_fn()
                summary = summarizer(parser.document, 5)  # Summarize the document with 5 sentences
                summary = [str(sentence) for sentence in summary]

                summary_tokenized = []
                for sent in summary:
                    summary_tokenized.append(sent.lower())

                rouge_functions.write_for_rouge(groundtruth_summ_sents_list, summary_tokenized,
                                                example_idx, ref_dir, dec_dir, log=False)

                decoded_sent_tokens = [sent.split() for sent in summary_tokenized]
                sentence_limit = 2
                sys_ssi_list, _, _ = get_simple_source_indices_list(
                    decoded_sent_tokens, article_sent_tokens, vocab,
                    sentence_limit, min_matched_tokens)
                triplet_ssi_list.append(
                    (groundtruth_similar_source_indices_list, sys_ssi_list, -1))

            print('Evaluating Lambdamart model F1 score...')
            suffix = util.all_sent_selection_eval(triplet_ssi_list)
            print(suffix)

            results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
            print(("Results_dict: ", results_dict))
            sheets_str = rouge_functions.rouge_log(results_dict, log_dir, suffix=suffix)
            sheets_strs.append(dataset_name + '_' + summary_method + '\n' + sheets_str)

    for sheets_str in sheets_strs:
        print(sheets_str + '\n')
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" def irrelevant_perturbation(sentences): summary = ' '.join(sentences) + '\n' irrelevant_summary = self.irrelevant_dict[summary] return irrelevant_summary.split() def syntax_perturbation(sentences): summary = ' '.join(sentences) summary = summary.split() original_summary = deepcopy(summary) sentence_len = len(summary) done = False pos1 = 0 pos2 = -1 while not done: pos1 += 1 pos2 -= 1 summary[pos1] = original_summary[pos2] summary[pos2] = original_summary[pos1] done = True if summary == original_summary: done = False return summary def semantic_perturbation(sentences): summary = ' '.join(sentences) summary = summary.split() original_summary = deepcopy(summary) try: tokenized_text = word_tokenize(' '.join(summary)) except: return sentences pos_tag = nltk.pos_tag(tokenized_text) change = 0 for pi in range(len(pos_tag)): antonym = '' try: for syn in wordnet.synsets(pos_tag[pi][0]): for l in syn.lemmas(): if l.antonyms(): antonym = l.antonyms()[0].name( ) # get the first antonym of the first lemma break if antonym != '': if change < 2: tokenized_text[pi] = antonym change += 1 break except: tokenized_text[pi] = '[UNK]' if summary == original_summary: change = 0 for k in range(len(summary)): try: summary[k] = semantic_change_simple[summary[k]] change += 1 except: pass if change >= 2: break summary = tokenized_text return summary def grammar_perturbation(sentences): summary = ' '.join(sentences) summary = summary.split() original_summary = deepcopy(summary) change = 0 for k in range(len(summary)): try: summary[k] = grammar_tweek_negation[summary[k]] change += 1 except: pass if change >= 2: break if summary == original_summary: change = 0 for k in range(len(summary)): try: summary[k] = grammar_tweek_custom[summary[k]] change += 1 except: pass if change >= 2: break if summary == original_summary: change = 0 for word in original_summary: new_word = singularize(word) if change >= 2: summary.append(word) else: summary.append(new_word) if new_word != word: change += 1 return summary input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (article, abstract) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) abstract_sentences = [ sent.strip() for sent in data.abstract2sents(abstract) ] # Use the <s> and </s> tags in abstract to get a list of sentences. # perturbation abstract_sentences = grammar_perturbation(abstract_sentences) # if lead3 #abstract_sentences = (article.split('.')[0] + '. ' + article.split('.')[1] + '. ' + article.split('.')[2]+ '.').split() example = Example(article, abstract_sentences, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" if self._example_generator is None: input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500, is_original=('with_coref' not in self._data_path))) else: input_gen = self.text_generator(self._example_generator) if self._hps.pg_mmr and self._hps.ssi_data_path != '': # if use pg_mmr and bert print(util.bcolors.OKGREEN + "Loading SSI from BERT at %s" % os.path.join(self._hps.ssi_data_path, 'ssi.pkl') + util.bcolors.ENDC) with open(os.path.join(self._hps.ssi_data_path, 'ssi.pkl')) as f: ssi_triple_list = pickle.load(f) # ssi_list = [ssi_triple[1] for ssi_triple in ssi_triple_list] else: ssi_triple_list = None counter = 0 while True: try: ( article, abstracts, doc_indices_str, raw_article_sents, ssi, article_lcs_paths_list ) = next( input_gen ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True if ssi_triple_list is not None and counter < len( ssi_triple_list): raise Exception( 'Len of ssi list (%d) is greater than number of examples (%d)' % (len(ssi_triple_list), counter)) break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) if ssi_triple_list is not None: if counter >= len(ssi_triple_list): raise Exception( 'Len of ssi list (%d) is less than number of examples (>=%d)' % (len(ssi_triple_list), counter)) ssi_length_extractive = ssi_triple_list[counter][2] ssi = ssi_triple_list[counter][1] ssi = ssi[:ssi_length_extractive] article = article abstracts = [abstract for abstract in abstracts] if type(doc_indices_str) != str: doc_indices_str = doc_indices_str raw_article_sents = [sent for sent in raw_article_sents] all_abstract_sentences = [[ sent.strip() for sent in data.abstract2sents(abstract) ] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] # join_separator = ' [SEP] ' if self._hps.sep else ' ' if self._hps.by_instance: # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article for abs_idx, abstract_sentence in enumerate( abstract_sentences): inst_ssi = ssi[abs_idx] if len(inst_ssi) == 0: continue inst_abstract_sentences = abstract_sentence inst_raw_article_sents = util.reorder( raw_article_sents, inst_ssi) inst_article = ' '.join([ ' '.join(util.process_sent(sent, whitespace=True)) for sent in inst_raw_article_sents ]) inst_doc_indices = [0] * len(inst_article.split()) inst_article_lcs_paths_list = article_lcs_paths_list[ abs_idx] if len( inst_article ) == 0: # See https://github.com/abisee/pointer-generator/issues/1 logging.warning( 'Found an example with empty article text. 
Skipping it.\n*********************************************' ) elif len(inst_article.strip().split() ) < 3 and self._hps.skip_with_less_than_3: print( 'Article has less than 3 tokens, so skipping\n*********************************************' ) elif len(inst_abstract_sentences.strip().split() ) < 3 and self._hps.skip_with_less_than_3: print( 'Abstract has less than 3 tokens, so skipping\n*********************************************' ) else: inst_example = Example(None, [inst_abstract_sentences], all_abstract_sentences, None, inst_raw_article_sents, None, [inst_article_lcs_paths_list], self._vocab, self._hps) self._example_queue.put(inst_example) else: example = Example(None, abstract_sentences, all_abstract_sentences, None, raw_article_sents, ssi, article_lcs_paths_list, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue. # print "example num", counter counter += 1
def bin2txt(data_path, finished_dir):
    import glob
    import json
    import struct
    import nltk
    import data
    from tensorflow.core.example import example_pb2
    from collections import OrderedDict

    def example_generator(file_path):
        with open(file_path, 'rb') as reader:
            while True:
                len_bytes = reader.read(8)
                if not len_bytes:
                    break  # finished reading this file
                str_len = struct.unpack('q', len_bytes)[0]
                example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                yield example_pb2.Example.FromString(example_str)

    def text_generator(example_generator):
        while True:
            e = example_generator.next()  # e is a tf.Example
            try:
                # The article and abstract texts were saved under the keys 'article' and 'abstract' in the data files.
                article_text = e.features.feature['article'].bytes_list.value[0]
                abstract_text = e.features.feature['abstract'].bytes_list.value[0]
            except ValueError:
                tf.logging.error('Failed to get article or abstract from example')
                continue
            if len(article_text) == 0:  # See https://github.com/abisee/pointer-generator/issues/1
                tf.logging.warning('Found an example with empty article text. Skipping it.')
            else:
                yield (article_text, abstract_text)

    counter = 0
    filelist = glob.glob(data_path)  # get the list of datafiles
    assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
    filelist = sorted(filelist)
    for f in filelist:
        input_gen = text_generator(example_generator(f))
        with open(finished_dir + '/' + f.split('/')[-1].replace('.bin', '.txt'), 'w') as writer:
            while True:
                try:
                    # Read the next example from file. article and abstract are both strings.
                    (article, abstract) = input_gen.next()
                    # Use the <s> and </s> tags in abstract to get a list of sentences.
                    abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
                    abstract = ' '.join(abstract_sentences)
                    abstract_sentences = [
                        ' '.join(nltk.word_tokenize(sent))
                        for sent in nltk.sent_tokenize(abstract)]
                    json_format = json.dumps(
                        OrderedDict([('uuid', 'uuid-%i' % counter),
                                     ('article', article),
                                     ('summary', ''),
                                     ('reference', abstract)]))
                    counter += 1
                    writer.write(json_format)
                    writer.write('\n')
                except StopIteration:  # if there are no more examples:
                    tf.logging.info(
                        "The example generator for this example queue filling thread has exhausted data.")
                    break
                except UnicodeDecodeError:
                    continue
        print("finished " + f)
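A possible invocation of bin2txt, with placeholder paths that are illustrative only and not taken from the original project; the output directory is assumed to exist already.

# Hypothetical paths for illustration only.
bin2txt('finished_files/chunked/train_*.bin', 'finished_files/txt')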
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" if self._example_generator is None: input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500, is_original=False)) else: input_gen = self.text_generator(self._example_generator) counter = 0 while True: try: (article, abstracts, doc_indices_str, raw_article_sents, ssi) = next(input_gen) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: raise Exception("single_pass mode is off but the example generator is out of data; error.") article = article abstracts = [abstract for abstract in abstracts] if type(doc_indices_str) != str: doc_indices_str = doc_indices_str raw_article_sents = [sent for sent in raw_article_sents] all_abstract_sentences = [[sent.strip() for sent in data.abstract2sents( abstract)] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] if self._hps.by_instance: # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article for abs_idx, abstract_sentence in enumerate(abstract_sentences): inst_ssi = ssi[abs_idx] if len(inst_ssi) == 0: continue inst_abstract_sentences = abstract_sentence inst_raw_article_sents = util.reorder(raw_article_sents, inst_ssi) inst_article = ' '.join([' '.join(util.process_sent(sent, whitespace=True)) for sent in inst_raw_article_sents]) inst_doc_indices = [0] * len(inst_article.split()) if len(inst_article) == 0: # See https://github.com/abisee/pointer-generator/issues/1 logging.warning( 'Found an example with empty article text. Skipping it.\n*********************************************') elif len(inst_article.strip().split()) < 3 and self._hps.skip_with_less_than_3: print( 'Article has less than 3 tokens, so skipping\n*********************************************') elif len(inst_abstract_sentences.strip().split()) < 3 and self._hps.skip_with_less_than_3: print( 'Abstract has less than 3 tokens, so skipping\n*********************************************') else: inst_example = Example(inst_article, [inst_abstract_sentences], all_abstract_sentences, inst_doc_indices, inst_raw_article_sents, None, self._vocab, self._hps) self._example_queue.put(inst_example) else: example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, self._vocab, self._hps) # Process into an Example. self._example_queue.put(example) # place the Example in the example queue. # print "example num", counter counter += 1