def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (background_text, context_text, response_text, span_text, b_start, b_end, r_start, r_end, example_id) = next(input_gen) except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) example = Example(background_text, context_text, response_text, span_text, b_start, b_end, r_start, r_end, example_id, self._vocab, self._hps) self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except (StopIteration, RuntimeError):  # if there are no more examples
            # (under PEP 479, a StopIteration escaping the generator surfaces as RuntimeError)
            logger.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                logger.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # Use the <s> and </s> tags in abstract to get a list of sentences.
        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # place the Example in the example queue.
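# The fill_example_queue variants above and below all wrap a text_generator helper that is not
# shown in this section. The following is a minimal sketch of what such a helper typically looks
# like in a pointer-generator-style batcher, assuming data.example_generator yields tf.Example
# protos with 'article' and 'abstract' byte features and that tensorflow is imported as tf;
# adapt the feature names and error handling to your own schema.
def text_generator(self, example_generator):
    """Yields (article, abstract) pairs of byte strings extracted from tf.Example protos."""
    while True:
        try:
            e = next(example_generator)  # e is a tf.Example
        except StopIteration:
            return  # end this generator cleanly so callers see StopIteration, not RuntimeError
        try:
            article_text = e.features.feature['article'].bytes_list.value[0]
            abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        except (ValueError, IndexError):
            tf.logging.error('Failed to get article or abstract from example')
            continue
        if len(article_text) == 0:
            tf.logging.warning('Found an example with empty article text. Skipping it.')
        else:
            yield (article_text, abstract_text)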
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass, self._hps.decode_only, self._hps.language)) while True: try: (article, abstract, tags, title) = next(input_gen) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: raise Exception("single_pass mode is off but the example generator is out of data; error.") if self._hps.mode in ['train', 'eval']: abstract_sentences_all = data.abstract2sents(abstract); # Use the <s> and </s> tags in abstract to get a list of sentences. for i in range(self._hps.max_keyphrase_num): sent = abstract_sentences_all[i % len(abstract_sentences_all)] abstract_sentences = [sent.strip()] example = Example(title, article, tags, abstract_sentences, abstract_sentences_all, self._vocab, self._hps, self._stop_words) # Process into an Example. self._example_queue.put(example) # place the Example in the example queue. elif self._hps.mode == "decode": abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(title, article, tags, [abstract_sentences[0]], abstract_sentences, self._vocab, self._hps, self._stop_words) # Process into an Example. self._example_queue.put(example) # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" print self._single_pass input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (comment, label, keywords, topics) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) example = Example(comment, label, keywords, topics, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def test_batch():
    # python test_batcher.py --data_path=../data/squad-v1/dev_raw.json --pointer_gen
    input_gen = text_generator(
        data.example_generator(hps.data_path, hps.single_pass))
    example_list = []
    for _ in range(hps.batch_size):
        p, q, a, ap = next(input_gen)
        example_list.append(Example(p, q, a, ap, vocab, hps))
    batch = Batch(example_list, hps, vocab)

    print('batch answer pos:', batch.ans_indices)
    print('enc batch:', batch.enc_batch)
    print('enc batch words:',
          id2sentence(batch.enc_batch, vocab, batch.para_oovs_batch))
    print('enc len:', batch.enc_lens)
    if hps.pointer_gen:
        print('max para oovs:', batch.max_para_oovs)
        print('para oovs:', batch.para_oovs_batch)
        print('enc batch extend vocab:', batch.enc_batch_extend_vocab)
        print('enc batch extend vocab words:',
              id2sentence(batch.enc_batch_extend_vocab, vocab, batch.para_oovs_batch))
    print('dec batch:', batch.dec_batch)
    print('dec batch words:',
          id2sentence(batch.dec_batch, vocab, batch.para_oovs_batch))
    print('target batch:', batch.target_batch)
    print('tgt batch words:',
          id2sentence(batch.target_batch, vocab, batch.para_oovs_batch))
    print('origin para:', batch.original_paragraphs)
    print('origin question:', batch.original_questions)
    print('origin answer:', batch.original_answers)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (article, abstract) = next(input_gen)
        except StopIteration:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        article = [sent.strip() for sent in data.article2sents(article)]
        example = Example(article, abstract_sentences, self._vocab, self._concept_vocab)
        self._example_queue.put(example)
def fill_example_queue(self):
    # create a generator object
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
            article, abstract = article.decode(), abstract.decode()
        except StopIteration:  # if there are no more examples:
            tf.logging.info(
                "The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info(
                    "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # split the abstract into sentences
        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        # print("abstract_sentences:", abstract_sentences)
        example = Example(article, abstract_sentences, self._vocab)  # process into an Example.
        self._example_queue.put(example)  # place the Example object in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: (article, abstract) = next( input_gen ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) abstract_sentences = [ abstract.strip() ] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            if self._single_pass:
                self._finished_reading = True
                break
            else:
                raise Exception(
                    "single_pass mode is off but the example generator is out of data; error.")

        # abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]  # Use the <s> and </s> tags in abstract to get a list of sentences.
        # abstract = str(abstract, encoding='utf8')
        abstract_sentences = [abstract]
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # place the Example in the example queue.
def read_articles_abstracts(source_dir, dataset_split):
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)
    print('Creating list')
    ex_list = [ex for ex in example_generator]
    a = 0
def test_stop():
    from batcher import text_generator
    input_gen = text_generator(
        data.example_generator(hps.data_path, hps.single_pass))
    i = 0
    while True:
        sample = next(input_gen)
        i += 1
        print(i, sample[0])
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass)) self._example_queue = [] for item in input_gen: article, abstract = str(item[0]), str(item[1]) abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] example = Example(article, abstract_sentences, self._vocab, self._hps) if example.flag == False: self._example_queue.append(example)
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text)
    if singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']], [0, 0]))
    if singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']], [0, 0]))

    total = len(source_files) * 1000 if ('cnn' in dataset_articles
                                         or 'newsroom' in dataset_articles) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)
    ex_gen = example_generator_extended(example_generator, total, single_feat_len, pair_feat_len)
    print('Creating list')
    ex_list = [ex for ex in ex_gen]
    print('Converting...')
    list(futures.map(load_and_evaluate_example, ex_list))
    # for ex in tqdm(ex_list, total=total):
    #     load_and_evaluate_example(ex)

    print('Evaluating ROUGE...')
    results_dict = rouge_eval_references.rouge_eval(ref_dir, dec_dir)
    # print("Results_dict: ", results_dict)
    rouge_eval_references.rouge_log(results_dict, my_log_dir)

    util.print_execution_time(start_time)
def fill_example_queue(self):
    input_gen = text_generator(data.example_generator(self.examples, self.single_pass))
    while True:
        try:
            # read the next example from file. paragraph, question and answer are all strings.
            (paragraph, question, answer, answer_position) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            self._finished_reading = True
            print('finished reading one round of data')
            break
        example = Example(paragraph, question, answer, answer_position, self.vocab,
                          self.max_enc_steps, self.max_dec_steps, self.dynamic_vocab)
        self._example_queue.put(example)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            self._finished_reading = True
            break

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
        example = Example(article, abstract_sentences, self._vocab, self._hps)
        self._example_queue.put(example)
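# A queue-filling method like the ones above only does useful work when it runs on a background
# thread feeding a bounded queue. Below is a minimal sketch of how such threads are commonly
# started in a Batcher constructor; the attribute and parameter names here are illustrative
# assumptions, not taken from the code in this section.
import queue
from threading import Thread

def _start_example_queue_threads(self, num_threads=16, queue_max=100):
    """Spawn daemon threads that each run fill_example_queue to keep the queue topped up."""
    self._example_queue = queue.Queue(queue_max)  # bounded, so readers apply back-pressure
    self._example_q_threads = []
    for _ in range(num_threads):
        t = Thread(target=self.fill_example_queue)
        t.daemon = True  # daemon threads exit when the main program exits
        t.start()
        self._example_q_threads.append(t)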
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator(data.example_generator(self._data, self._single_pass,self._device_id, data_as_tf_example=self._data_as_tf_example)) count = 0 query = None word_edge_list = None query_edge_list = None if self._data_as_tf_example: while True: try: article, abstract, word_edge_list, query, query_edge_list, epoch_num = input_gen.next() # read the next example from file. article and abstract are both strings. #tf.logging.info(random.randint(1,101)) except StopIteration: # if there are no more examples: tf.logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: raise Exception("single_pass mode is off but the example generator is out of data; error.") abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps, word_edge_list=word_edge_list, query=query, query_edge_list=query_edge_list, epoch_num=epoch_num, bert_vocab=self.bert_vocab) self._example_queue.put(example) else: while True: try: curr_data = input_gen.next() count = count + 1 article = curr_data['article'] abstract = curr_data['abstract'].strip() if self._hps.word_gcn.value: word_edge_list = curr_data['word_edge_list'] if self._hps.query_encoder.value: query = curr_data['query'] if self._hps.query_gcn.value: query_edge_list = curr_data['query_edge_list'] except Exception as e: # if there are no more examples: tf.logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: tf.logging.info(e) raise Exception("single_pass mode is off but the example generator is out of data; error.") abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract_sentences, self._vocab, self._hps, word_edge_list=word_edge_list, query=query, query_edge_list=query_edge_list, epoch_num=epoch_num) self._example_queue.put(example) # place the Example in the example queue.
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    vocab_counter = collections.Counter()
    for dataset_split in dataset_splits:
        source_dir = os.path.join(FLAGS.data_root, FLAGS.dataset_name)
        source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
        total = len(source_files) * 1000
        example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                                   True, False, should_check_valid=False)

        for example_idx, example in enumerate(tqdm(example_generator, total=total)):
            raw_article_sents, article, abstracts, doc_indices = util.unpack_tf_example(
                example, names_to_types)
            article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
            # groundtruth_summ_sent_tokens = [sent.strip().split() for sent in groundtruth_summary_text.strip().split('\n')]
            groundtruth_summ_sent_tokens = [
                [token for token in abstract.strip().split() if token not in ['<s>', '</s>']]
                for abstract in abstracts
            ]
            all_tokens = util.flatten_list_of_lists(article_sent_tokens) + \
                         util.flatten_list_of_lists(groundtruth_summ_sent_tokens)
            vocab_counter.update(all_tokens)

    print("Writing vocab file...")
    with open(os.path.join('logs', "vocab_" + FLAGS.dataset_name), 'w') as writer:
        for word, count in vocab_counter.most_common(VOCAB_SIZE):
            writer.write(word + ' ' + str(count) + '\n')
    print("Finished writing vocab file")
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
    total = len(source_files) * 1000
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)

    # Read output of BERT and put into a dictionary with:
    #   key=(article idx, source indices {a tuple of length 1 or 2, depending on whether it is a singleton or a pair})
    #   value=score
    qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
    ex_gen = example_generator_extended(example_generator, total, qid_ssi_to_importances,
                                        None, FLAGS.singles_and_pairs)
    print('Creating list')
    ex_list = [ex for ex in ex_gen]

    # # Main function to get results on all test examples
    # pool = mp.Pool(mp.cpu_count())
    # ssi_list = list(tqdm(pool.imap(evaluate_example, ex_list), total=total))
    # pool.close()

    # Main function to get results on all test examples
    ssi_list = list(map(evaluate_example, ex_list))

    # save ssi_list
    with open(os.path.join(my_log_dir, 'ssi.pkl'), 'wb') as f:
        pickle.dump(ssi_list, f)
    with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f:
        ssi_list = pickle.load(f)
    print('Evaluating BERT model F1 score...')
    suffix = util.all_sent_selection_eval(ssi_list)
    print('Evaluating ROUGE...')
    results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
    rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    ssis_restricted = [ssi_triple[1][:ssi_triple[2]] for ssi_triple in ssi_list]
    ssi_lens = [len(source_indices) for source_indices in util.flatten_list_of_lists(ssis_restricted)]
    num_singles = ssi_lens.count(1)
    num_pairs = ssi_lens.count(2)
    print('Percent singles/pairs: %.2f %.2f' % (
        num_singles * 100. / len(ssi_lens), num_pairs * 100. / len(ssi_lens)))

    util.print_execution_time(start_time)
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
            total = len(source_files) * 1000
            example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                                       True, False, should_check_valid=False)
            out_dir = os.path.join('data', 'bert', dataset_name, 'article_embeddings', 'input_article')
            util.create_dirs(out_dir)
            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb')
            inst_id = 0
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article = ' '.join(raw_article_sents)
                writer.write((article + '\n').encode())
def fill_example_queue(self):
    input_gen = self.text_generator(data.example_generator(self._data_path))
    while True:
        try:
            band_name = next(input_gen)
        except StopIteration:
            print("The example generator has exhausted saved_data")
            raise Exception("single_pass off: Error! example generator out of saved_data.")
        example = Example(band_name, self._vocab, self._hps)
        self._example_queue.put(example)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    while True:
        try:
            # read the next example from file. article and abstract are both strings.
            (article, abstract) = next(input_gen)
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception("single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]  # Use the <s> and </s> tags in abstract to get a list of sentences.
        example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        self._example_queue.put(example)  # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500)) # counter = 0 while True: try: ( article, abstracts, doc_indices_str, raw_article_sents ) = input_gen.next( ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) all_abstract_sentences = [[ sent.strip() for sent in data.abstract2sents(abstract) ] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue.
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) f = 0 d = open("cont.txt", "w+") while True: try: (article, abstract, rel_scores) = next(input_gen) f = f + 1 #print("article") #print("article"+str(article)) #print("abstract") #print("abstract"+str(abstract)) #print("rel_scores") #print("rel_scores"+str(rel_scores)) #input_gen.next() # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) #print(article) #print(abstract) #print(rel_scores) #abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences. example = Example(article, abstract, rel_scores, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue. d.write("ASDasdwqwq ewqewqewq ewq ewqe \n") d.write(str(f) + "\n")
def fill_example_queue(self): """ Reads data from file and processes into Examples which are then placed into the example queue. """ input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass)) while True: try: # read the next example from file. article and abstract are both strings. (article, abstract) = input_gen.next() except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted " "data.") if self._single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread " "is stopping.") self._finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) example = Example(article, abstract, self._vocab, self._hps) if self._hps.attn_only_entities: n_people_enc_tokens = sum( 1 for token in example.enc_input[:self._hps.max_enc_steps] if 3 <= token < 3 + len(data.PERSON_TOKENS)) if n_people_enc_tokens < 2: continue self._example_queue.put(example)
def fill_example_queue(self):
    input_gen = self.text_generator(
        data.example_generator(self._data_path, self._single_pass))
    f = open("inputs.txt", "a")
    while True:
        try:
            # read the next example from file. source1, source2 and target are all strings.
            (source1, source2, target) = next(input_gen)
            f.write(source1 + "\t" + source2 + "\t" + target + "\n")
        except StopIteration:  # if there are no more examples:
            tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
            if self._single_pass:
                tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                self._finished_reading = True
                break
            else:
                raise Exception("single_pass mode is off but the example generator is out of data; error.")

        abstract_sentences = [sent.strip() for sent in data.abstract2sents(target)]  # Use the <s> and </s> tags in target to get a list of sentences.
        # Earlier variants that were tried:
        # example = Example(article, abstract_sentences, self._vocab)  # Process into an Example.
        # example = Example2(article, ' '.join(abstract_sentences), abstract_sentences, self._vocab)
        # example = Example2(' '.join(abstract_sentences), article, abstract_sentences, self._vocab)
        # example = Example2(article, article, abstract_sentences, self._vocab)
        # example = Example2(' '.join(abstract_sentences), ' '.join(abstract_sentences), abstract_sentences, self._vocab)
        example = Example2(source1, source2, target.split(), self._vocab)
        self._example_queue.put(example)  # place the Example in the example queue.
    f.close()
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    source_files = sorted(glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))
    total = len(source_files) * 1000 if ('cnn' in FLAGS.dataset_name
                                         or 'newsroom' in FLAGS.dataset_name
                                         or 'xsum' in FLAGS.dataset_name) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + FLAGS.dataset_split + '*',
                                               True, False, should_check_valid=False)

    for example_idx, example in enumerate(tqdm(example_generator, total=total)):
        raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
            example, names_to_types)
        article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
        groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
        if doc_indices is None:
            doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
        doc_indices = [int(doc_idx) for doc_idx in doc_indices]
        rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(
            doc_indices, article_sent_tokens)
        groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
            groundtruth_similar_source_indices_list, FLAGS.sentence_limit)
def fill_example_queue(self): """ Reads data from file and processes into Examples which are then placed into the example queue. """ input_gen = data.example_generator(self.data_path, self.single_pass) while True: try: # read the next example from file. sentence and label are both strings. line = input_gen.next() except StopIteration: # if there are no more examples: tf.logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self.single_pass: tf.logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self.finished_reading = True break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) if len(line) == 0: tf.logging.warning( "Found an example with empty sentence text. Skipping it.") else: # Process into an Example. fields = line.split("\t") if len(fields) == 2: sentence, label = fields[0], fields[1] else: sentence, label = fields[0], "" example = Example(sentence, label, self.vocab, self.hps) # place the Example in the example queue. self.example_queue.put(example)
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'singles':
        FLAGS.sentence_limit = 1
    else:
        FLAGS.sentence_limit = 2

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                # dataset_splits = ['val_test', 'test', 'val', 'train']
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            if dataset_split == 'val_test':
                source_dataset_split = 'val'
            else:
                source_dataset_split = dataset_split

            source_files = sorted(glob.glob(source_dir + '/' + source_dataset_split + '*'))
            total = len(source_files) * 1000
            example_generator = data.example_generator(source_dir + '/' + source_dataset_split + '*',
                                                       True, False, should_check_valid=False)
            out_dir = os.path.join('data', 'bert', dataset_name, FLAGS.singles_and_pairs, 'input')
            util.create_dirs(out_dir)

            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb')
            header_list = ['should_merge', 'sent1', 'sent2', 'example_idx', 'inst_id', 'ssi']
            writer.write(('\t'.join(header_list) + '\n').encode())
            inst_id = 0
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]
                groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
                if dataset_name != 'duc_2004' or doc_indices is None or (
                        dataset_name != 'duc_2004' and
                        len(doc_indices) != len(util.flatten_list_of_lists(article_sent_tokens))):
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                rel_sent_indices, _, _ = ssi_functions.get_rel_sent_indices(doc_indices, article_sent_tokens)
                similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                possible_pairs = [x for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))]  # all pairs
                possible_pairs = filter_pairs_by_sent_position(possible_pairs, rel_sent_indices=rel_sent_indices)
                possible_singles = [(i,) for i in range(len(raw_article_sents))]
                positives = [ssi for ssi in similar_source_indices_list]

                if dataset_split == 'test' or dataset_split == 'val_test':
                    if FLAGS.singles_and_pairs == 'singles':
                        possible_combinations = possible_singles
                    else:
                        possible_combinations = possible_pairs + possible_singles
                    negatives = [ssi for ssi in possible_combinations
                                 if not (ssi in positives or ssi[::-1] in positives)]

                    for ssi_idx, ssi in enumerate(positives):
                        if len(ssi) == 0:
                            continue
                        if chronological_ssi and len(ssi) >= 2:
                            if ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                        writer.write(get_string_bert_example(raw_article_sents, ssi, 1, example_idx, inst_id).encode())
                        inst_id += 1
                    for ssi in negatives:
                        writer.write(get_string_bert_example(raw_article_sents, ssi, 0, example_idx, inst_id).encode())
                        inst_id += 1
                else:
                    positive_sents = list(set(util.flatten_list_of_lists(positives)))
                    negative_pairs = [pair for pair in possible_pairs
                                      if not any(i in positive_sents for i in pair)]
                    negative_singles = [sing for sing in possible_singles if not sing[0] in positive_sents]
                    random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist()
                    random_negative_singles = np.random.permutation(len(negative_singles)).tolist()

                    for ssi in similar_source_indices_list:
                        if len(ssi) == 0:
                            continue
                        if chronological_ssi and len(ssi) >= 2:
                            if ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                        is_pair = len(ssi) == 2
                        writer.write(get_string_bert_example(raw_article_sents, ssi, 1, example_idx, inst_id).encode())
                        inst_id += 1

                        # False (negative) sentence single/pair
                        if is_pair:
                            if len(random_negative_pairs) == 0:
                                continue
                            negative_indices = negative_pairs[random_negative_pairs.pop()]
                        else:
                            if len(random_negative_singles) == 0:
                                continue
                            negative_indices = negative_singles[random_negative_singles.pop()]
                        article_lcs_paths = None
                        writer.write(get_string_bert_example(raw_article_sents, negative_indices, 0, example_idx, inst_id).encode())
                        inst_id += 1
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.summarizer == 'all':
        summary_methods = list(summarizers.keys())
    else:
        summary_methods = [FLAGS.summarizer]
    if FLAGS.dataset_name == 'all':
        dataset_names = datasets
    else:
        dataset_names = [FLAGS.dataset_name]

    sheets_strs = []
    for summary_method in summary_methods:
        summary_fn = summarizers[summary_method]
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name

            original_dataset_name = 'xsum' if 'xsum' in dataset_name else 'cnn_dm' if 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name else ''
            vocab = Vocab('logs/vocab' + '_' + original_dataset_name, 50000)  # create a vocabulary

            source_dir = os.path.join(data_dir, dataset_name)
            source_files = sorted(glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))
            total = len(source_files) * 1000 if ('cnn' in dataset_name or 'newsroom' in dataset_name
                                                 or 'xsum' in dataset_name) else len(source_files)
            example_generator = data.example_generator(source_dir + '/' + FLAGS.dataset_split + '*',
                                                       True, False, should_check_valid=False)

            if dataset_name == 'duc_2004':
                abs_source_dir = os.path.join(os.path.expanduser('~') + '/data/tf_data/with_coref', dataset_name)
                abs_example_generator = data.example_generator(abs_source_dir + '/' + FLAGS.dataset_split + '*',
                                                               True, False, should_check_valid=False)
                abs_names_to_types = [('abstract', 'string_list')]

            triplet_ssi_list = []
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if dataset_name == 'duc_2004':
                    abs_example = next(abs_example_generator)
                    groundtruth_summary_texts = util.unpack_tf_example(abs_example, abs_names_to_types)
                    groundtruth_summary_texts = groundtruth_summary_texts[0]
                    groundtruth_summ_sents_list = [[sent.strip() for sent in data.abstract2sents(abstract)]
                                                   for abstract in groundtruth_summary_texts]
                else:
                    groundtruth_summary_texts = [groundtruth_summary_text]
                    groundtruth_summ_sents_list = []
                    for groundtruth_summary_text in groundtruth_summary_texts:
                        groundtruth_summ_sents = [sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]
                        groundtruth_summ_sents_list.append(groundtruth_summ_sents)

                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                if doc_indices is None:
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                log_dir = os.path.join(log_root, dataset_name + '_' + summary_method)
                dec_dir = os.path.join(log_dir, 'decoded')
                ref_dir = os.path.join(log_dir, 'reference')
                util.create_dirs(dec_dir)
                util.create_dirs(ref_dir)

                parser = PlaintextParser.from_string(' '.join(raw_article_sents), Tokenizer("english"))
                summarizer = summary_fn()
                summary = summarizer(parser.document, 5)  # summarize the document with 5 sentences
                summary = [str(sentence) for sentence in summary]
                summary_tokenized = []
                for sent in summary:
                    summary_tokenized.append(sent.lower())

                rouge_functions.write_for_rouge(groundtruth_summ_sents_list, summary_tokenized,
                                                example_idx, ref_dir, dec_dir, log=False)

                decoded_sent_tokens = [sent.split() for sent in summary_tokenized]
                sentence_limit = 2
                sys_ssi_list, _, _ = get_simple_source_indices_list(decoded_sent_tokens, article_sent_tokens,
                                                                    vocab, sentence_limit, min_matched_tokens)
                triplet_ssi_list.append((groundtruth_similar_source_indices_list, sys_ssi_list, -1))

            print('Evaluating Lambdamart model F1 score...')
            suffix = util.all_sent_selection_eval(triplet_ssi_list)
            print(suffix)

            results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
            print(("Results_dict: ", results_dict))
            sheets_str = rouge_functions.rouge_log(results_dict, log_dir, suffix=suffix)
            sheets_strs.append(dataset_name + '_' + summary_method + '\n' + sheets_str)

    for sheets_str in sheets_strs:
        print(sheets_str + '\n')
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'
    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix,
                                                       [['single', '.'], ['sentence', '.']], [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix,
                                                   [['single', '.'], ['sentence', '.']], [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))

        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))

        total = len(source_files) * 1000 if ('cnn' in in_dataset or 'newsroom' in in_dataset
                                             or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(source_dir + '/' + split + '*',
                                                   True, False, should_check_valid=False)
        # for example in tqdm(example_generator, total=total):
        ex_gen = example_generator_extended(example_generator, total, single_feat_len,
                                            pair_feat_len, FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        if FLAGS.num_instances != -1:
            ex_list = ex_list[:FLAGS.num_instances]

        print('Converting...')
        # all_features = pool.map(convert_article_to_lambdamart_features, ex_list)
        # all_features = ray.get([convert_article_to_lambdamart_features.remote(ex) for ex in ex_list])
        if FLAGS.lr:
            all_instances = list(futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = [inst.features for inst in all_instances]
            x = np.array(x)
            y = [inst.relevance for inst in all_instances]
            y = np.expand_dims(np.array(y), 1)
            x_y = np.concatenate((x, y), 1)
            np.save(writer, x_y)
        else:
            list(futures.map(convert_article_to_lambdamart_features, ex_list))
            # writer.write(''.join(all_features))

            # all_features = []
            # for example in tqdm(ex_gen, total=total):
            #     all_features.append(convert_article_to_lambdamart_features(example))
            # all_features = util.flatten_list_of_lists(all_features)
            # num1 = sum(x == 1 for x in all_features)
            # num2 = sum(x == 2 for x in all_features)
            # print 'Single sent: %d instances. Pair sent: %d instances.' % (num1, num2)

            # for example in tqdm(ex_gen, total=total):
            #     features = convert_article_to_lambdamart_features(example)
            #     writer.write(features)

        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        writer = open(final_out_path, 'wb')
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                text = f.read()
            writer.write(text)
        writer.close()

    util.print_execution_time(start_time)
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix,
                                                       [['single', '.'], ['sentence', '.']], [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix,
                                                   [['single', '.'], ['sentence', '.']], [0, 0], [0, 0]))

    total = len(source_files) * 1000 if ('cnn' in dataset_articles
                                         or 'newsroom' in dataset_articles) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                               True, False, should_check_valid=False)

    if FLAGS.mode == 'write_to_file':
        ex_gen = example_generator_extended(example_generator, total, single_feat_len,
                                            pair_feat_len, FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        print('Converting...')
        # if len(sys.argv) > 1 and sys.argv[1] == '-m':
        list(futures.map(write_to_lambdamart_examples_to_file, ex_list))
        # else:
        #     instances_list = []
        #     for ex in tqdm(ex_list):
        #         instances_list.append(write_to_lambdamart_examples_to_file(ex))

        file_names = sorted(glob.glob(os.path.join(temp_in_dir, '*')))
        instances_str = ''
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                instances_str += f.read()
        with open(temp_in_path, 'w') as f:
            f.write(instances_str)

    # RUN LAMBDAMART SCORING COMMAND HERE

    if FLAGS.mode == 'generate_summaries':
        qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
        ex_gen = example_generator_extended(example_generator, total, qid_ssi_to_importances,
                                            pair_feat_len, FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        ssi_list = list(futures.map(evaluate_example, ex_list))

        # save ssi_list (binary mode so pickle works under Python 3)
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'wb') as f:
            pickle.dump(ssi_list, f)
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f:
            ssi_list = pickle.load(f)
        print('Evaluating Lambdamart model F1 score...')
        suffix = util.all_sent_selection_eval(ssi_list)
        # for ex in tqdm(ex_list, total=total):
        #     load_and_evaluate_example(ex)

        print('Evaluating ROUGE...')
        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
        # print("Results_dict: ", results_dict)
        rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    util.print_execution_time(start_time)