def write_for_attnvis(self, article, abstract, decoded_words, attn_dists, p_gens, ex_index, ssi=None):
    """Dump one decoded example as JSON for the in-browser attention visualizer.

    The file can be read by the tool at https://github.com/abisee/attn_vis

    Args:
      article: The original article string; it is split on whitespace into words.
      abstract: The human (correct) abstract string.
      decoded_words: List of strings; the words of the generated summary.
      attn_dists: List of arrays; the attention distributions.
      p_gens: List of scalars; the p_gen values. Only stored when
        FLAGS.pointer_gen is set.
      ex_index: Integer example index; becomes the zero-padded file name.
      ssi: Optional extra value stored under the 'ssi' key when it is not None.
    """
    vis_dir = os.path.join(self._decode_dir, 'attn_vis_data')

    # Keys are inserted in the same order as the visualizer file has always had.
    payload = {}
    payload['article_lst'] = [make_html_safe(word) for word in article.split()]
    payload['decoded_lst'] = [make_html_safe(word) for word in decoded_words]
    payload['abstract_str'] = make_html_safe(abstract)
    payload['attn_dists'] = attn_dists
    if FLAGS.pointer_gen:
        payload['p_gens'] = p_gens
    if ssi is not None:
        payload['ssi'] = ssi

    util.create_dirs(vis_dir)
    json_path = os.path.join(vis_dir, '%06d.json' % ex_index)
    with open(json_path, 'w') as json_file:
        json.dump(payload, json_file)
def main(unused_argv):
    """Evaluate each human summary against the other human summaries with ROUGE.

    For each of four held-out positions, the summary at that position is
    written as the candidate and the remaining summaries as references, for
    every source file under the dataset directory. ROUGE is then run and logged
    in a per-position directory under `log_dir`.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    source_files = sorted(glob.glob(os.path.join(data_dir, FLAGS.dataset) + '/*'))

    for held_out in range(4):
        run_dir = os.path.join(log_dir, 'reference_' + str(held_out))
        ref_dir = os.path.join(run_dir, 'reference')
        dec_dir = os.path.join(run_dir, 'decoded')
        for directory in (ref_dir, dec_dir):
            util.create_dirs(directory)

        for source_idx, source_file in enumerate(source_files):
            summaries = [
                data.abstract2sents(text)
                for text in get_human_summary_texts(source_file)
            ]
            candidate = summaries[held_out]
            # Everything except the held-out summary serves as a reference.
            references = summaries[:held_out] + summaries[held_out + 1:]
            rouge_functions.write_for_rouge(references, candidate, source_idx,
                                            ref_dir, dec_dir)

        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
        rouge_functions.rouge_log(results_dict, run_dir)
def load_cache_and_dataset(loader_name, cache_prefix, dataset_path, check_cache=True):
    """Find a loader by name, load its dataset (from the pickle cache when
    possible) and return it split into train, validate and test subsets.

    :param loader_name: Name of the dataset loader to use (e.g. LSP, FLIC).
    :param cache_prefix: Prefix of the cache directory.
    :param dataset_path: Path to dataset.
    :param check_cache: Should the cache be checked for pickled datasets?
    :returns: Tuple of ``(fresh_cache, train_set, validate_set, test_set,
              cache_path)``, where ``fresh_cache`` is a boolean indicating
              whether the cache is still appropriate to use in later stages
              of the training pipeline (will be false iff the cache was
              written to during loading), ``{train, validate, test}_set``
              are the relevant chunks of the data set, and ``cache_path`` is
              a complete path to the cache directory (including the
              appropriate prefix).

    Exits the process with status 1 when ``loader_name`` is not in
    ``datasets.ALLOWED_LOADERS``."""
    # Get loader
    if loader_name not in datasets.ALLOWED_LOADERS:
        print("'{}' is not a valid loader. Allowed loaders: {}".format(
            loader_name, ', '.join(datasets.ALLOWED_LOADERS)
        ), file=sys.stderr)
        sys.exit(1)
    loader = getattr(datasets, loader_name)

    # Caching
    # NOTE(review): `cache_prefix` is accepted but never read; the directory is
    # built from the module-level `args.cache` instead. Confirm which is
    # intended before switching.
    cache_dir = path.join(args.cache, loader_name)
    logging.info("Checking cache directory '{}'".format(cache_dir))
    create_dirs(cache_dir)

    # Now actually load the dataset, if possible
    pickle_path = path.join(cache_dir, 'dataset_meta.pickle')
    if check_cache and path.exists(pickle_path):
        logging.info("Loading pickled dataset")
        # Pickle streams are bytes: binary mode is required on Python 3 and is
        # harmless on Python 2.
        with open(pickle_path, 'rb') as fp:
            train_set, validate_set, test_set = pickle.load(fp)
    else:
        if check_cache:
            msg = "Pickled dataset not found"
        else:
            msg = "Ignoring cached dataset, if any"
        logging.info(msg + '; loading full dataset')
        whole_dataset = loader(dataset_path)
        # Training set will contain 1/2 of the set
        train_set, others = whole_dataset.split(2)
        # Validation and test sets will each contain 1/4 of the set
        validate_set, test_set = others.split(2)
        logging.info("Pickling dataset for future use")
        with open(pickle_path, 'wb') as fp:
            pickle.dump((train_set, validate_set, test_set), fp)
        # The cache was just (re)written, so it is no longer "fresh".
        check_cache = False

    return check_cache, train_set, validate_set, test_set, cache_dir
def convert_to_importance_model():
    """Copy the current log root into a new directory named for the importance settings.

    The destination is FLAGS.log_root with '_imp<imp_loss_wt>' appended, plus
    '_oneminus' when FLAGS.imp_loss_oneminus is set. The directory is created
    and the whole source tree is copied into it with copy_tree.
    """
    logging.info("converting non-importance model to importance model..")

    source_root = FLAGS.log_root
    suffix = '_imp' + str(FLAGS.imp_loss_wt)
    if FLAGS.imp_loss_oneminus:
        suffix = suffix + '_oneminus'
    new_log_root = source_root + suffix

    print("copying models from %s to %s..." % (source_root, new_log_root))
    util.create_dirs(new_log_root)
    copy_tree(source_root, new_log_root)
    print("copied.")
def run(self, prediction_date: datetime):
    """Return the feature frame for `prediction_date`, using the pickle cache.

    If a pickle already exists at the path given by `_get_cached_path()`, it is
    loaded and returned. Otherwise the feature function is called with
    `prediction_date` and the stored parameters, and its result is pickled to
    that path before being returned.

    Args:
        prediction_date: Passed as the first argument to the feature function.

    Returns:
        The object read from the cache, or the freshly computed one.
    """
    full_feature_path = self._get_cached_path()
    if os.path.exists(full_feature_path):
        return pd.read_pickle(full_feature_path)

    df = self._feature_function(prediction_date, **self._params_dict)
    # Create the parent directory, not a directory at the pickle path itself:
    # the latter would make the to_pickle call below fail.
    create_dirs(os.path.dirname(full_feature_path))
    df.to_pickle(full_feature_path)
    return df
def run(self):
    """Return the feature frame, reading it from the pickle cache when present.

    On a cache miss the feature function is called with the stored parameters,
    the parent directory of the cache path is created, and the result is
    pickled there before being returned.
    """
    cache_path = self._get_cached_path()

    # Cache hit: nothing to compute.
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    frame = self._feature_function(**self._params_dict)
    create_dirs(os.path.dirname(cache_path))
    frame.to_pickle(cache_path)
    return frame
def main(unused_argv):
    """Write the test/val/train URL sets to .bin files, then chunk them."""
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    for directory in (out_full_dir, out_dir):
        util.create_dirs(directory)

    # Same order as before: test, then val, then train.
    url_sets = (
        (all_test_urls, 'test'),
        (all_val_urls, 'val'),
        (all_train_urls, 'train'),
    )
    for urls, split_name in url_sets:
        write_to_bin(urls, out_dir, split_name)

    # Chunk the data. This splits each of train.bin, val.bin and test.bin into
    # smaller chunks, each containing e.g. 1000 examples, and saves them in
    # finished_files/chunks
    chunk_all(out_full_dir, out_dir)
def main(unused_argv):
    """Write every article of the selected datasets/splits to a one-column TSV.

    For each dataset and split, the examples are read from the matching files
    under `data_dir`, the article sentences are joined with single spaces, and
    one article per line is written (UTF-8 encoded) to
    data/bert/<dataset>/article_embeddings/input_article/<split>.tsv.
    Processing of a split stops after FLAGS.num_instances examples unless that
    flag is -1.
    """
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        # Later code reads the dataset name back from FLAGS, so keep it in sync.
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            source_pattern = source_dir + '/' + dataset_split + '*'
            source_files = sorted(glob.glob(source_pattern))
            # Progress-bar estimate only: assumes 1000 examples per source file.
            total = len(source_files) * 1000
            example_generator = data.example_generator(
                source_pattern, True, False, should_check_valid=False)

            out_dir = os.path.join('data', 'bert', dataset_name,
                                   'article_embeddings', 'input_article')
            util.create_dirs(out_dir)

            # The context manager guarantees the TSV is flushed and closed,
            # including when an example fails to unpack.
            with open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb') as writer:
                for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                    if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                        break
                    raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    article = ' '.join(raw_article_sents)
                    writer.write((article + '\n').encode())
def main(unused_argv):
    """Compute highlights for every example of each split and merge them.

    For each split, the examples built from articles.tsv and summaries.tsv are
    processed in a worker pool by `process_one_example`. Afterwards the
    per-example files found in the temp_highlight directory are concatenated,
    in sorted file-name order, into highlight.tsv.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_split == 'all':
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    for dataset_split in dataset_splits:
        processed_data_path = os.path.join(processed_data_dir, FLAGS.dataset_name, dataset_split)
        articles_path = os.path.join(processed_data_path, 'articles.tsv')
        abstracts_path = os.path.join(processed_data_path, 'summaries.tsv')
        highlight_path = os.path.join(processed_data_path, 'highlight.tsv')
        out_full_dir = os.path.join(processed_data_path, 'temp_highlight')
        pretty_html_path = os.path.join(processed_data_path, 'pretty_html.html')
        util.create_dirs(os.path.dirname(pretty_html_path))
        util.create_dirs(out_full_dir)

        total = util.num_lines_in_file(articles_path)

        # The input handles stay open until the pool has consumed every
        # example, because create_example_list may read from them lazily.
        # Opening the HTML file in 'wb' truncates any previous output; nothing
        # is written through this handle here (presumably the workers write to
        # pretty_html_path themselves -- confirm).
        with open(articles_path) as f_art, \
                open(abstracts_path) as f_abs, \
                open(pretty_html_path, 'wb'):
            ex_list = create_example_list(f_art, f_abs, pretty_html_path, out_full_dir, total)
            pool = mp.Pool(mp.cpu_count())
            try:
                _ = list(tqdm(pool.imap(process_one_example, ex_list), total=total))
            finally:
                # close() stops accepting work; join() waits for workers to
                # exit so no processes are left behind between splits.
                pool.close()
                pool.join()

        file_list = sorted(glob.glob(os.path.join(out_full_dir, '*')))
        with open(highlight_path, 'wb') as f_hl:
            for file_name in tqdm(file_list):
                with open(file_name) as f_single_file_hl:
                    highlights = f_single_file_hl.read()
                f_hl.write(highlights.encode())
def process_images(bot, update, user_data):
    """Download the largest photo of a message and try to read it as a receipt.

    The photo with the greatest `file_size` is downloaded into the payer's temp
    directory and passed to `rp.read`. The parsed bill is stored in
    `user_data['bill']`.

    Returns:
        SET_BILL_NAME when the bill has no name (the user is asked for one),
        SUBMIT when the bill was recognised (the image is moved to the payer's
        receipts directory and a confirmation button is sent), or
        ConversationHandler.END when `rp.read` raises `rp.InvalidReceipt`.
    """
    message = update.message
    bot.send_chat_action(message.chat.id, ChatAction.TYPING)
    logging.info('receipt received')
    payer = message.chat.first_name

    # max() keeps the first of equally sized photos, like the manual scan it
    # replaces; default=None mirrors the previous result for an empty list.
    photo = max(message.photo, key=lambda p: p.file_size, default=None)

    temp_dir, receipts_dir = cfg.temp_dir % payer, cfg.receipts_dir % payer
    create_dirs([temp_dir, receipts_dir])
    all_images_path = f'{temp_dir}/{photo.file_id}'
    receipts_path = f'{receipts_dir}/{photo.file_id}'

    photo.get_file().download(custom_path=all_images_path)
    logging.info(f'image downloaded: {photo.file_id}')

    logging.info(f'reading image text: {photo.file_id}')
    try:
        bill = rp.read(all_images_path, cfg.logos_path, payer)
        user_data['bill'] = bill
        if bill.name is None:
            update.message.reply_text(
                'Não consegui identificar o nome dessa conta. Que conta é essa?'
            )
            return SET_BILL_NAME
        # Recognised receipt: keep the image with the payer's receipts.
        os.rename(all_images_path, receipts_path)
        bill_confirmation_button(update.message, bill)
        return SUBMIT
    except rp.InvalidReceipt:
        logging.info(f'recibo invalido: {photo.file_id}')
        update.message.reply_text(
            'Recibo invalido\nPor enquanto só sei cadastrar recibos do banco Itaú :/'
        )
        return ConversationHandler.END
def convert_singpairmix_to_tf_examples(dataset_name, processed_data_dir, tf_example_dir, dataset_split='all'):
    """Convert processed TSV files of a dataset into chunked tf.Example .bin files.

    For each split, articles.tsv, summaries.tsv and highlight.tsv are read line
    by line in lockstep; each line triple becomes one example written to
    <tf_example_dir>/<dataset_name>/all/<split>.bin, which is then chunked into
    <tf_example_dir>/<dataset_name>.

    Args:
        dataset_name: Name of the dataset; also selects the chunk size
            (1000 for cnn_dm/newsroom/xsum, otherwise 1).
        processed_data_dir: Root directory holding <dataset_name>/<split>/*.tsv.
        tf_example_dir: Root directory for the output.
        dataset_split: A single split name, or 'all' for every split
            (only 'test' for duc_2004).
    """
    out_dir = os.path.join(tf_example_dir, dataset_name)
    out_full_dir = os.path.join(out_dir, 'all')
    util.create_dirs(out_full_dir)

    if dataset_split == 'all':
        if dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [dataset_split]

    if dataset_name == 'cnn_dm' or dataset_name == 'newsroom' or dataset_name == 'xsum':
        chunk_size = 1000
    else:
        chunk_size = 1

    for split in dataset_splits:
        processed_data_path = os.path.join(processed_data_dir, dataset_name, split)
        articles_path = os.path.join(processed_data_path, 'articles.tsv')
        abstracts_path = os.path.join(processed_data_path, 'summaries.tsv')
        highlight_path = os.path.join(processed_data_path, 'highlight.tsv')
        total = util.num_lines_in_file(articles_path)

        # All four handles are closed on exit, even if an example fails. The
        # .bin file must be closed before it is chunked below.
        with open(articles_path) as f_art, \
                open(abstracts_path) as f_abs, \
                open(highlight_path) as f_hl, \
                open(os.path.join(out_full_dir, split + '.bin'), 'wb') as writer:
            for example_idx in tqdm(range(total)):
                raw_article_sents = f_art.readline().strip().split('\t')
                groundtruth_summ_sents = f_abs.readline().strip().split('\t')
                summary_text = '\n'.join(groundtruth_summ_sents)
                article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]
                # No document indices are available in the TSV input, so every
                # token is assigned to document 0.
                doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices_str = ' '.join([str(idx) for idx in doc_indices])
                similar_source_indices = [source_indices.split(',') for source_indices in f_hl.readline().split('\t')]

                write_bert_tf_example(similar_source_indices, raw_article_sents, summary_text, None,
                                      doc_indices_str, None, writer, dataset_name)

        util.chunk_file(split, out_full_dir, out_dir, chunk_size=chunk_size)
def __init__(self):
    """Create the RTP process wrappers, copy settings from `config`, make sure
    the sync/dump directories exist and reset the recording state."""
    # wrappers around the external rtpplay / rtpdump processes
    self.rtpplay = rtp.RTPPlay()
    self.rtpdump = rtp.RTPDump()

    # rtpdump parameters
    self.dump_address, self.dump_port = config.RTPDUMP_ADDRESS, config.RTPDUMP_PORT

    # rtpplay parameters
    self.preview_address, self.preview_port = (
        config.RTPDUMP_PREVIEW_ADDRESS, config.RTPDUMP_PREVIEW_PORT)

    # directories used when saving/previewing files
    self.sync_dir, self.dump_dir = config.SYNC_DIR, config.DUMP_DIR

    # name/file extension of recorded video files
    self.video_basename, self.video_file_ext = (
        config.VIDEO_BASENAME, config.VIDEO_FILE_EXT)

    # commit files share the video's name and differ only in extension
    self.commit_file_ext = config.COMMIT_FILE_EXT

    # upper bound on how long we wait for a process to complete an action
    self.max_block_time = config.MAX_BLOCK_TIME

    # unit tests swap this for a custom function
    self.file_exists = os.path.exists

    # the directory structure must exist before anything is recorded
    util.create_dirs(self.sync_dir, self.dump_dir)

    # guards critical operations so they are atomic
    self.__lock = threading.Lock()

    # state variables
    self._commit_time = self._start_time = self._dump_file = None
def __init__(self):
    """Create the playback process wrappers, copy settings from `config`, make
    sure the sync/dump directories exist and reset the playback state."""
    self.__lock = threading.Lock()

    # one wrapper for regular playback, one for live playback
    self.rtpplay = rtp.RTPPlay()
    self.rtpplay_live = rtp.RTPPlay()

    # settings copied from the config module
    self.play_address, self.play_port = config.RTPPLAY_ADDRESS, config.RTPPLAY_PORT
    self.sync_dir, self.dump_dir = config.SYNC_DIR, config.DUMP_DIR
    self.max_block_time = config.MAX_BLOCK_TIME

    util.create_dirs(self.sync_dir, self.dump_dir)

    # playback state
    self._is_playing = False
    self._armed_file = None
    self._is_live_playing = False

    # replaced with custom functions in unit tests
    self.file_exists, self.file_getsize, self.listdir = (
        os.path.exists, os.path.getsize, os.listdir)
def _split_summary_sents(groundtruth_summary_text):
    """Split ground-truth summary text into stripped sentences, one inner list per summary."""
    if FLAGS.dataset_name == 'duc_2004':
        # For duc_2004 the value is iterated as a collection of summary texts.
        return [[sent.strip() for sent in gt_summ_text.strip().split('\n')]
                for gt_summ_text in groundtruth_summary_text]
    return [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]


def main(unused_argv):
    """Export source sentence groups for an external summarizer, or score its output.

    FLAGS.mode == 'write':
        For each split, write one line per non-empty sentence group to
        <split>.Ndocument (the reordered source sentences joined by spaces) and
        the owning example index to <split>.Nexampleidx. For non-test splits
        the matching ground-truth summary sentence is written to
        <split>.Nsummary. For the test split the sentence groups come from the
        pickled SSI list instead of the ground truth.

    FLAGS.mode == 'evaluate':
        Read the generated summary lines and their example indices, group
        consecutive lines with the same index into one decoded summary, write
        reference/decoded files for ROUGE and run the evaluation.

    Raises:
        Exception: on bad flags, an unknown mode, an SSI list shorter than the
            number of test examples, mismatched or empty summary/index files.
    """
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    out_dir = os.path.join(
        os.path.expanduser('~') + '/data/kaiqiang_data', FLAGS.dataset_name)

    if FLAGS.mode == 'write':
        util.create_dirs(out_dir)
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        elif FLAGS.dataset_split == 'all':
            dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            is_test = dataset_split == 'test'

            if is_test:
                ssi_data_path = os.path.join(
                    'logs/%s_bert_both_sentemb_artemb_plushidden' % FLAGS.dataset_name,
                    'ssi.pkl')
                print(util.bcolors.OKGREEN +
                      "Loading SSI from BERT at %s" % ssi_data_path +
                      util.bcolors.ENDC)
                # Pickle data is bytes; binary mode works on Python 2 and 3.
                with open(ssi_data_path, 'rb') as f:
                    ssi_triple_list = pickle.load(f)

            source_dir = os.path.join(data_dir, FLAGS.dataset_name)
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))
            # Progress-bar estimate: these datasets are assumed to hold 1000
            # examples per file, the others one per file.
            total = len(source_files) * 1000 if (
                'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
                or 'xsum' in FLAGS.dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_document_path = os.path.join(out_dir, dataset_split + '.Ndocument')
            out_summary_path = os.path.join(out_dir, dataset_split + '.Nsummary')
            out_example_idx_path = os.path.join(out_dir, dataset_split + '.Nexampleidx')

            doc_writer = open(out_document_path, 'w')
            # No summary file is produced for the test split.
            sum_writer = None if is_test else open(out_summary_path, 'w')
            ex_idx_writer = open(out_example_idx_path, 'w')
            try:
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                        break
                    raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    article_sent_tokens = [
                        util.process_sent(sent) for sent in raw_article_sents
                    ]
                    groundtruth_summ_sents = _split_summary_sents(groundtruth_summary_text)

                    if doc_indices is None:
                        doc_indices = [0] * len(
                            util.flatten_list_of_lists(article_sent_tokens))
                    doc_indices = [int(doc_idx) for doc_idx in doc_indices]

                    if is_test:
                        if example_idx >= len(ssi_triple_list):
                            raise Exception(
                                'Len of ssi list (%d) is less than number of examples (>=%d)'
                                % (len(ssi_triple_list), example_idx))
                        # Element 1 is the list of sentence groups, element 2
                        # how many of them to keep.
                        ssi_length_extractive = ssi_triple_list[example_idx][2]
                        groundtruth_similar_source_indices_list = (
                            ssi_triple_list[example_idx][1][:ssi_length_extractive])
                    else:
                        groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                            groundtruth_similar_source_indices_list,
                            FLAGS.sentence_limit)

                    for ssi_idx, ssi in enumerate(
                            groundtruth_similar_source_indices_list):
                        if len(ssi) == 0:
                            continue
                        my_article = ' '.join(util.reorder(raw_article_sents, ssi))
                        doc_writer.write(my_article + '\n')
                        if sum_writer is not None:
                            sum_writer.write(groundtruth_summ_sents[0][ssi_idx] + '\n')
                        ex_idx_writer.write(str(example_idx) + '\n')
            finally:
                # Close (and therefore flush) the outputs even on failure.
                doc_writer.close()
                if sum_writer is not None:
                    sum_writer.close()
                ex_idx_writer.close()

    elif FLAGS.mode == 'evaluate':
        summary_dir = '/home/logan/data/kaiqiang_data/logan_ACL/trained_on_' + FLAGS.train_dataset + '/' + FLAGS.dataset_name
        out_summary_path = os.path.join(summary_dir, 'test' + 'Summary.txt')
        out_example_idx_path = os.path.join(out_dir, 'test' + '.Nexampleidx')
        decode_dir = 'logs/kaiqiang_%s_trainedon%s' % (FLAGS.dataset_name,
                                                      FLAGS.train_dataset)
        rouge_ref_dir = os.path.join(decode_dir, 'reference')
        rouge_dec_dir = os.path.join(decode_dir, 'decoded')
        util.create_dirs(rouge_ref_dir)
        util.create_dirs(rouge_dec_dir)

        def num_lines_in_file(file_path):
            with open(file_path) as f:
                num_lines = sum(1 for line in f)
            return num_lines

        def process_example(sents, ex_idx, groundtruth_summ_sents):
            final_decoded_words = []
            for sent in sents:
                final_decoded_words.extend(sent.split(' '))
            rouge_functions.write_for_rouge(groundtruth_summ_sents,
                                            None,
                                            ex_idx,
                                            rouge_ref_dir,
                                            rouge_dec_dir,
                                            decoded_words=final_decoded_words,
                                            log=False)

        num_lines_summary = num_lines_in_file(out_summary_path)
        num_lines_example_indices = num_lines_in_file(out_example_idx_path)
        if num_lines_summary != num_lines_example_indices:
            raise Exception(
                'Num lines summary != num lines example indices: (%d, %d)' %
                (num_lines_summary, num_lines_example_indices))
        if num_lines_summary == 0:
            raise Exception('No summary lines found in %s' % out_summary_path)

        source_dir = os.path.join(data_dir, FLAGS.dataset_name)
        example_generator = data.example_generator(source_dir + '/' + 'test' + '*',
                                                   True,
                                                   False,
                                                   should_check_valid=False)

        def flush_group(sents, group_ex_idx):
            # Pull the next example from the dataset and pair its ground truth
            # with the sentences collected for this group.
            # NOTE(review): this assumes every example contributed at least one
            # line; an example without lines would shift the pairing. Confirm.
            example = next(example_generator)
            raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                example, names_to_types)
            process_example(sents, group_ex_idx,
                            _split_summary_sents(groundtruth_summary_text))

        with open(out_summary_path) as summary_reader, \
                open(out_example_idx_path) as ex_idx_reader:
            prev_ex_idx = 0
            sents = []
            for line_idx in tqdm(range(num_lines_summary)):
                line = summary_reader.readline()
                ex_idx = int(ex_idx_reader.readline())
                if ex_idx == prev_ex_idx:
                    sents.append(line)
                else:
                    # The collected sentences belong to the previous index.
                    # Writing them under the new index (as before) shifted every
                    # file by one and let the final group overwrite the one
                    # before it.
                    flush_group(sents, prev_ex_idx)
                    prev_ex_idx = ex_idx
                    sents = [line]
        # The last group is still pending once the input is exhausted.
        flush_group(sents, prev_ex_idx)

        print("Now starting ROUGE eval...")
        # The same length limit was used for xsum and for every other dataset.
        l_param = 100
        results_dict = rouge_functions.rouge_eval(rouge_ref_dir,
                                                  rouge_dec_dir,
                                                  l_param=l_param)
        rouge_functions.rouge_log(results_dict, decode_dir)
    else:
        raise Exception('mode flag was not evaluate or write.')
self.terminal = sys.stdout self.log = open("temp/logfile.txt", "w") def write(self, message): self.terminal.write(message) if not "\r" in message: self.log.write(message) self.log.flush() if __name__ == '__main__': if file_exists("mem/data_temp"): print("Danger: corrupted data file!") else: #Create directories create_dirs("mem/backup/") create_dirs("mem/important/") create_dirs("temp/") #Logging sys.stdout = Logger() #Check Python version print("Using Python version " +\ str(sys.version_info.major) + "." +\ str(sys.version_info.minor) + "." +\ str(sys.version_info.micro) + " " +\ sys.version_info.releaselevel + " " +\ str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit") if sys.version_info.major != 2: print("Not supported; use Python 2") elif 0: print("")
# Bin edges 0.00, 0.01, ..., 0.99.
bin_values = [x / 100. for x in list(range(100))]
# Display names for the dataset identifiers.
pretty_dataset_names = {
    'cnn_dm': 'CNN/DM',
    'xsum': 'XSum',
    'duc_2004': 'DUC-04'
}

# Global matplotlib font and axes settings.
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
# plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 20

# Output locations: one .pkl and one .pdf per dataset/split under stuff/plots.
util.create_dirs('stuff/plots')
plot_data_file = os.path.join(
    'stuff/plots', FLAGS.dataset_name + '_' + FLAGS.dataset_split + '.pkl')
plot_file = os.path.join(
    'stuff/plots', FLAGS.dataset_name + '_' + FLAGS.dataset_split + '.pdf')


def plot_histograms(all_list_of_hist_pairs):
    # Creates a figure with one subplot row per entry of
    # `all_list_of_hist_pairs` and one column per item of its first entry.
    # NOTE(review): nothing is drawn or returned in the visible code; the
    # function looks truncated here -- confirm against the full file.
    nrows = len(all_list_of_hist_pairs)
    ncols = len(all_list_of_hist_pairs[0])
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
    # Make a 1-D axes array 2-D (a single row).
    if axes.ndim == 1:
        axes = axes.reshape(1, -1)
    fig.set_size_inches(10, 5)
    fig.subplots_adjust(wspace=0.075, hspace=0.05)
def main(unused_argv):
    """Prepare articles for coreference resolution, or merge the results back.

    FLAGS.mode == 'prepare':
        Write each article (sentences joined by spaces) to its own file under
        <coref_root>/<dataset>/to_coref and list those files in
        corenlp_lists/all_<split>.txt.

    FLAGS.mode == 'create':
        For each example, look up the processed coreference file with the
        matching name, attach its cleaned-up content as a JSON 'corefs' feature
        (an empty list when no file matches) and write the example to
        <data_root>/with_coref/<dataset>/<split>_NNNNN.bin, 1000 examples per
        file.

    Examples whose article sentences are None are skipped in both modes.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)

    if FLAGS.dataset_name == 'all':
        datasets = dataset_names
    else:
        datasets = [FLAGS.dataset_name]

    for dataset in datasets:
        coref_dir = os.path.join(FLAGS.coref_root, dataset)
        to_coref_dir = os.path.join(coref_dir, 'to_coref')
        corenlp_lists_dir = os.path.join(coref_dir, 'corenlp_lists')
        data_coref_dir = os.path.join(FLAGS.data_root, 'with_coref', dataset)

        util.create_dirs(to_coref_dir)
        util.create_dirs(corenlp_lists_dir)
        util.create_dirs(data_coref_dir)

        source_dir = os.path.join(FLAGS.data_root, dataset)

        if FLAGS.dataset_split == 'all':
            dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))

            # Progress-bar estimate only.
            total = len(source_files) * 1000 if (
                'cnn' in dataset or 'newsroom' in dataset
                or 'xsum' in dataset) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            if FLAGS.mode == 'prepare':
                corenlp_list = []
                out_idx = 0
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    raw_article_sents, article, abstract, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    if raw_article_sents is None:
                        continue
                    raw_article = ' '.join(raw_article_sents)
                    file_name = os.path.join(
                        to_coref_dir, '%s_%06d.bin' % (dataset_split, out_idx))
                    with open(file_name, 'wb') as f:
                        f.write(raw_article)
                    corenlp_list.append(file_name)
                    out_idx += 1

                # Write the file list once, after the loop. It used to be
                # rewritten in full after every example, which is quadratic in
                # the number of examples and gives the same final content.
                if corenlp_list:
                    with open(
                            os.path.join(corenlp_lists_dir,
                                         'all_' + dataset_split + '.txt'),
                            'wb') as f:
                        f.write('\n'.join(corenlp_list))

            elif FLAGS.mode == 'create':
                process_coref_dir = os.path.join(coref_dir, 'processed')
                coref_files = sorted(
                    glob.glob(
                        os.path.join(process_coref_dir, dataset_split + '*')))
                # Map '<split>_<idx>.bin' (file name without '.json') -> path.
                coref_dict = {
                    c.split('/')[-1].split('.json')[0]: c
                    for c in coref_files
                }
                print(len(coref_files), len(source_files))

                out_idx = 0
                out_file_name = os.path.join(
                    data_coref_dir,
                    dataset_split + '_{:05d}.bin'.format(out_idx // 1000))
                writer = open(out_file_name, 'wb')
                try:
                    for example_idx, example in enumerate(
                            tqdm(example_generator, total=total)):
                        raw_article_sents, article, abstract, doc_indices = util.unpack_tf_example(
                            example, names_to_types)
                        if raw_article_sents is None:
                            continue

                        # Start a new output file every 1000 written examples.
                        if out_idx % 1000 == 0 and out_idx != 0:
                            writer.close()
                            out_file_name = os.path.join(
                                data_coref_dir, dataset_split +
                                '_{:05d}.bin'.format(out_idx // 1000))
                            writer = open(out_file_name, 'wb')

                        file_name = '%s_%06d.bin' % (dataset_split, out_idx)
                        if file_name in coref_dict:
                            file_path = coref_dict[file_name]
                            corefs = get_corefs(file_path)
                            fixed_corefs = fix_trailing_apostrophe_s(corefs)
                            corefs_relevant_info = remove_irrelevant(fixed_corefs)
                            corefs_json = json.dumps(corefs_relevant_info)
                        else:
                            corefs_json = json.dumps([])

                        example.features.feature['corefs'].bytes_list.value.extend(
                            [corefs_json])

                        # NOTE(review): a convert_data.make_example(...) call
                        # used to sit here. Its result was never used, and it
                        # read `corefs`, which is undefined (NameError) or stale
                        # whenever no coref file matches. The original example
                        # with the added feature is what gets written; confirm
                        # that is intended rather than a rebuilt example.
                        convert_data.write_tf_example(example, writer)

                        out_idx += 1
                finally:
                    writer.close()

    util.print_execution_time(start_time)
def main(unused_argv):
    """Write BERT classification input (positive/negative sentence singles and pairs).

    For each dataset and split, a TSV is written to
    data/bert/<dataset>/<singles_and_pairs>/input/<split>.tsv with a header row
    followed by one row per instance.

    * test / val_test splits: every ground-truth group is a positive, and every
      other candidate single/pair (pairs only unless running in 'singles' mode)
      is a negative.
    * other splits: every ground-truth group is a positive, followed by one
      randomly chosen negative of the same arity made only of sentences that
      appear in no positive, while such negatives remain.
    """
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'singles':
        FLAGS.sentence_limit = 1
    else:
        FLAGS.sentence_limit = 2

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        # Later code reads the dataset name back from FLAGS, so keep it in sync.
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                # dataset_splits = ['val_test', 'test', 'val', 'train']
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            # 'val_test' reads the validation data but is written like a test split.
            if dataset_split == 'val_test':
                source_dataset_split = 'val'
            else:
                source_dataset_split = dataset_split

            source_files = sorted(
                glob.glob(source_dir + '/' + source_dataset_split + '*'))
            # Progress-bar estimate only: assumes 1000 examples per source file.
            total = len(source_files) * 1000
            example_generator = data.example_generator(
                source_dir + '/' + source_dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_dir = os.path.join('data', 'bert', dataset_name,
                                   FLAGS.singles_and_pairs, 'input')
            util.create_dirs(out_dir)

            exhaustive_negatives = dataset_split == 'test' or dataset_split == 'val_test'

            # The context manager guarantees the TSV is flushed and closed.
            with open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb') as writer:
                header_list = [
                    'should_merge', 'sent1', 'sent2', 'example_idx', 'inst_id',
                    'ssi'
                ]
                writer.write(('\t'.join(header_list) + '\n').encode())
                inst_id = 0
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    article_sent_tokens = [
                        util.process_sent(sent, whitespace=True)
                        for sent in raw_article_sents
                    ]
                    # Document indices are only kept for duc_2004 and only when
                    # present; everything else is treated as one document.
                    if dataset_name != 'duc_2004' or doc_indices is None:
                        doc_indices = [0] * len(
                            util.flatten_list_of_lists(article_sent_tokens))
                    doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                    rel_sent_indices, _, _ = ssi_functions.get_rel_sent_indices(
                        doc_indices, article_sent_tokens)
                    similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list,
                        FLAGS.sentence_limit)

                    num_sents = len(raw_article_sents)
                    possible_pairs = list(
                        itertools.combinations(range(num_sents), 2))  # all pairs
                    possible_pairs = filter_pairs_by_sent_position(
                        possible_pairs, rel_sent_indices=rel_sent_indices)
                    possible_singles = [(i, ) for i in range(num_sents)]
                    positives = list(similar_source_indices_list)

                    if exhaustive_negatives:
                        if FLAGS.singles_and_pairs == 'singles':
                            possible_combinations = possible_singles
                        else:
                            possible_combinations = possible_pairs + possible_singles
                        negatives = [
                            ssi for ssi in possible_combinations
                            if not (ssi in positives or ssi[::-1] in positives)
                        ]

                        for ssi in positives:
                            if len(ssi) == 0:
                                continue
                            if chronological_ssi and len(ssi) >= 2 and ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                            writer.write(
                                get_string_bert_example(raw_article_sents, ssi, 1,
                                                        example_idx,
                                                        inst_id).encode())
                            inst_id += 1
                        for ssi in negatives:
                            writer.write(
                                get_string_bert_example(raw_article_sents, ssi, 0,
                                                        example_idx,
                                                        inst_id).encode())
                            inst_id += 1
                    else:
                        positive_sents = list(
                            set(util.flatten_list_of_lists(positives)))
                        negative_pairs = [
                            pair for pair in possible_pairs
                            if not any(i in positive_sents for i in pair)
                        ]
                        negative_singles = [
                            sing for sing in possible_singles
                            if not sing[0] in positive_sents
                        ]
                        # Random order in which negatives are consumed; pairs
                        # are permuted before singles so the random stream is
                        # used in the same order as before.
                        random_negative_pairs = np.random.permutation(
                            len(negative_pairs)).tolist()
                        random_negative_singles = np.random.permutation(
                            len(negative_singles)).tolist()

                        for ssi in similar_source_indices_list:
                            if len(ssi) == 0:
                                continue
                            if chronological_ssi and len(ssi) >= 2 and ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                            is_pair = len(ssi) == 2
                            writer.write(
                                get_string_bert_example(raw_article_sents, ssi, 1,
                                                        example_idx,
                                                        inst_id).encode())
                            inst_id += 1

                            # False sentence single/pair
                            if is_pair:
                                if len(random_negative_pairs) == 0:
                                    continue
                                negative_indices = negative_pairs[
                                    random_negative_pairs.pop()]
                            else:
                                if len(random_negative_singles) == 0:
                                    continue
                                negative_indices = negative_singles[
                                    random_negative_singles.pop()]
                            writer.write(
                                get_string_bert_example(raw_article_sents,
                                                        negative_indices, 0,
                                                        example_idx,
                                                        inst_id).encode())
                            inst_id += 1
def _read_tab_joined_files(file_names, unfix_brackets=False):
    """Read each file and return its lines joined by tabs, one string per file.

    Args:
        file_names: Iterable of paths (may be wrapped in tqdm by the caller).
        unfix_brackets: If True, pass the raw text through
            util.unfix_bracket_tokens_in_sent before splitting into lines.
    """
    joined = []
    for file_name in file_names:
        with open(file_name) as f:
            text = f.read()
        if unfix_brackets:
            text = util.unfix_bracket_tokens_in_sent(text)
        joined.append('\t'.join(text.split('\n')))
    return joined


def _non_empty_stripped_lines(text):
    """Split text on newlines and return the stripped, non-empty lines."""
    return [line.strip() for line in text.split('\n') if line.strip() != '']


def _write_system_summaries(processed_dir, summaries):
    """Write summaries to processed_dir, one per line, in two flavours.

    'summaries_tokenized.txt' receives each summary unchanged and
    'summaries.txt' receives it after fix_punctuations.
    """
    with open(os.path.join(processed_dir, 'summaries.txt'), 'w') as writer, \
            open(os.path.join(processed_dir, 'summaries_tokenized.txt'),
                 'w') as writer_tokenized:
        for summ in summaries:
            writer_tokenized.write(summ + '\n')
            writer.write(fix_punctuations(summ) + '\n')


def main(unused_argv):
    """Convert reference data and each system's raw output to a common layout.

    Step 1 (only when the module-level flag
    preprocess_article_and_human_summaries is set): iterate the TF examples and
    write the tab-separated reference summaries to
    <raw_root>/reference/summaries.txt and the articles to
    <processed_root>/article/articles.txt (punctuation-fixed) and
    articles_tokenized.txt (as tokenized).

    Step 2: for every entry of the module-level `systems` list, read that
    system's raw output, reorder it to follow the reference order via
    reorder_list_like, and write summaries.txt / summaries_tokenized.txt under
    <processed_root>/<system>.

    Args:
        unused_argv: Leftover command-line arguments; must have length 1.

    Raises:
        Exception: If unexpected positional arguments were supplied.
    """
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:
        # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    util.create_dirs(processed_root)
    util.create_dirs(os.path.join(raw_root, 'reference'))
    util.create_dirs(os.path.join(processed_root, 'article'))

    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    source_files = sorted(
        glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))
    # Only used as the progress-bar length.
    total = len(source_files) * 1000 if (
        'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
        or 'xsum' in FLAGS.dataset_name) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' +
                                               FLAGS.dataset_split + '*',
                                               True,
                                               False,
                                               should_check_valid=False)

    # Filled by the preprocessing pass; consumed by the 'novel' system below.
    # NOTE(review): when the preprocessing flag is off this stays empty, so
    # 'novel' has nothing to align against -- confirm that is acceptable.
    reference_articles = []
    if preprocess_article_and_human_summaries:
        # `with` guarantees all three files are flushed and closed before the
        # per-system pass reads <raw_root>/reference/summaries.txt back in.
        with open(os.path.join(raw_root, 'reference', 'summaries.txt'),
                  'w') as writer, \
                open(os.path.join(processed_root, 'article', 'articles.txt'),
                     'w') as writer_article, \
                open(os.path.join(processed_root, 'article',
                                  'articles_tokenized.txt'),
                     'w') as writer_tokenized_article:
            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                if (FLAGS.num_instances != -1
                        and example_idx >= FLAGS.num_instances):
                    break
                (raw_article_sents, groundtruth_similar_source_indices_list,
                 groundtruth_summary_text, corefs,
                 doc_indices) = util.unpack_tf_example(example, names_to_types)

                groundtruth_summ_sents = [
                    util.unfix_bracket_tokens_in_sent(sent.strip())
                    for sent in groundtruth_summary_text.strip().split('\n')
                ]
                writer.write('\t'.join(groundtruth_summ_sents) + '\n')

                reference_article = '\t'.join([
                    util.unfix_bracket_tokens_in_sent(sent.strip())
                    for sent in raw_article_sents
                ])
                reference_articles.append(reference_article)
                writer_article.write(
                    fix_punctuations(reference_article) + '\n')
                writer_tokenized_article.write(reference_article + '\n')

    for system in systems:
        print('Processing ' + system + '...')
        raw_dir = os.path.join(raw_root, system)
        processed_dir = os.path.join(processed_root, system)
        util.create_dirs(processed_dir)

        if system == 'reference':
            # reference_summaries is read by every later branch, so
            # 'reference' has to be processed before the other systems.
            with open(os.path.join(raw_dir, 'summaries.txt')) as f:
                text = f.read()
            with open(os.path.join(processed_dir, 'summaries.txt'),
                      'w') as writer:
                writer.write(fix_punctuations(text))
            reference_summaries = _non_empty_stripped_lines(text)
            with open(os.path.join(processed_dir, 'summaries_tokenized.txt'),
                      'w') as writer_tokenized:
                writer_tokenized.write(text + '\n')

        elif system == 'abs-rl-rerank':
            decoded_files = sorted(
                glob.glob(
                    os.path.join(raw_dir, 'rnn-ext_abs_rl_rerank', 'decoded',
                                 '*.dec')))
            sys_ref_files = sorted(
                glob.glob(os.path.join(raw_dir, 'reference', '*.ref')))
            summaries = _read_tab_joined_files(decoded_files,
                                               unfix_brackets=True)
            sys_ref_summaries = _read_tab_joined_files(sys_ref_files,
                                                       unfix_brackets=True)
            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            _write_system_summaries(processed_dir, reordered_summaries)

        elif system == 'pg':
            decoded_files = sorted(
                glob.glob(
                    os.path.join(raw_dir, 'pointer-gen-cov',
                                 '*_decoded.txt')))
            summaries = _read_tab_joined_files(tqdm(decoded_files))
            ref_files = sorted(
                glob.glob(
                    os.path.join(raw_dir, 'reference', '*_reference.txt')))
            sys_ref_summaries = _read_tab_joined_files(tqdm(ref_files))
            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            _write_system_summaries(processed_dir, reordered_summaries)

        elif system == 'bottom-up':
            with open(
                    os.path.join(raw_dir,
                                 'bottom_up_cnndm_015_threshold.out')) as f:
                text_with_slash_t = f.read()
            text_with_slash_t = util.unfix_bracket_tokens_in_sent(
                text_with_slash_t)
            summaries = _non_empty_stripped_lines(
                slash_t_to_tab_separated(text_with_slash_t))

            with open(os.path.join(raw_dir,
                                   'test.txt.tgt.tagged.shuf.noslash')) as f:
                text_with_slash_t = f.read()
            sys_ref_summaries = _non_empty_stripped_lines(
                slash_t_to_tab_separated(text_with_slash_t))

            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            _write_system_summaries(processed_dir, reordered_summaries)

        elif system == 'dca':
            with open(os.path.join(raw_dir, 'cnndm_m6_m7.txt')) as f:
                lines = f.read().split('\n')
            summary_texts = []
            sys_ref_summary_texts = []
            # The first line is skipped; every other non-blank line must hold
            # exactly three tab-separated fields (the unpacking below raises
            # ValueError otherwise).
            for line in tqdm(lines[1:]):
                if line.strip() == '':
                    continue
                sys_ref_summary, _, summary = line.split('\t')
                summary_texts.append(summary.replace('u . s .', 'u.s.'))
                sys_ref_summary_texts.append(
                    sys_ref_summary.replace('u . s .', 'u.s.'))

            summaries = [get_sents(summary) for summary in tqdm(summary_texts)]
            sys_ref_summaries = [
                get_sents(sys_ref_summary)
                for sys_ref_summary in tqdm(sys_ref_summary_texts)
            ]
            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            _write_system_summaries(processed_dir, reordered_summaries)

        elif system == 'novel':
            with open(os.path.join(raw_dir, 'rl-novelty-lm.out')) as f:
                lines = f.read().split('\n')
            summary_texts = []
            sys_article_texts = []
            # One JSON object per non-blank line with 'article' and
            # 'prediction' keys.
            for line in tqdm(lines):
                if line.strip() == '':
                    continue
                obj = json.loads(line)
                summary_texts.append(
                    util.unfix_bracket_tokens_in_sent(obj['prediction']))
                sys_article_texts.append(
                    util.unfix_bracket_tokens_in_sent(obj['article']))

            # tqdm takes the bar length from the lists themselves.
            summaries = [get_sents(summary) for summary in tqdm(summary_texts)]
            sys_articles = [
                get_sents(article) for article in tqdm(sys_article_texts)
            ]
            # This system is aligned on articles rather than on reference
            # summaries.
            reordered_summaries = reorder_list_like(summaries, sys_articles,
                                                    reference_articles)
            _write_system_summaries(processed_dir, reordered_summaries)
if file_exists("temp/logfile.txt"): shutil.copyfile("temp/logfile.txt", "temp/logfile_old.txt") self.terminal = sys.stdout self.log = open("temp/logfile.txt", "w") def write(self, message): self.terminal.write(message) if not "\r" in message: self.log.write(message) self.log.flush() if __name__ == '__main__': if file_exists("mem/data_temp"): print("Danger: corrupted data file!") else: #Create directories create_dirs("mem/backup/") create_dirs("mem/important/") create_dirs("temp/") #Logging sys.stdout = Logger() #Check Python version print("Using Python version " +\ str(sys.version_info.major) + "." +\ str(sys.version_info.minor) + "." +\ str(sys.version_info.micro) + " " +\ sys.version_info.releaselevel + " " +\ str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit") if sys.version_info.major != 2: print("Not supported; use Python 2") elif 0: print("")
def main(unused_argv):
    """Compute summary-to-source sentence alignments and export them.

    For every requested dataset split this iterates the TF examples, pairs
    each article with a summary (taken from the example itself when
    FLAGS.exp_name is 'reference', otherwise from the decoded files of the
    experiment), calls ssi_functions.get_simple_source_indices_list to find
    the source sentences matching each summary sentence, and then, depending
    on the flags:

      * FLAGS.sent_dataset  -- collects merge examples and writes them with
        convert_data.write_with_generator;
      * FLAGS.ssi_dataset   -- writes one LambdaMART example per article to a
        '.bin' file and chunks it with util.chunk_file;
      * FLAGS.highlight     -- appends highlighted HTML to an
        '*_extracted_sents.html' file;
      * FLAGS.print_output  -- pickles the per-article source indices.

    FLAGS.all_actions switches all four of the above on.

    Args:
        unused_argv: Leftover command-line arguments; must have length 1.

    Raises:
        Exception: If unexpected positional arguments were supplied, or if no
            summary files are found for the experiment.
    """
    print('Running statistics on %s' % FLAGS.exp_name)
    if len(unused_argv
           ) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.all_actions:
        FLAGS.sent_dataset = True
        FLAGS.ssi_dataset = True
        FLAGS.print_output = True
        FLAGS.highlight = True

    # The vocabulary file is chosen by dataset family: 'xsum' has its own,
    # 'cnn_dm' and 'duc_2004' share the 'cnn_dm' one, anything else gets an
    # empty suffix.
    original_dataset_name = 'xsum' if 'xsum' in FLAGS.dataset_name else 'cnn_dm' if (
        'cnn_dm' in FLAGS.dataset_name
        or 'duc_2004' in FLAGS.dataset_name) else ''
    vocab = Vocab(FLAGS.vocab_path + '_' + original_dataset_name,
                  FLAGS.vocab_size)  # create a vocabulary

    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    util.create_dirs(html_dir)

    if FLAGS.dataset_split == 'all':
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    for dataset_split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

        if FLAGS.exp_name == 'reference':
            # summary_dir = log_dir + default_exp_name + '/decode_test_' + str(max_enc_steps) + \
            #                 'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/reference'
            # summary_files = sorted(glob.glob(summary_dir + '/*_reference.A.txt'))
            # Reference summaries live inside the TF examples themselves.
            summary_dir = source_dir
            summary_files = source_files
        else:
            if FLAGS.exp_name == 'cnn_dm':
                summary_dir = log_dir + FLAGS.exp_name + '/decode_test_400maxenc_4beam_35mindec_100maxdec_ckpt-238410/decoded'
            else:
                ckpt_folder = util.find_largest_ckpt_folder(log_dir +
                                                            FLAGS.exp_name)
                summary_dir = log_dir + FLAGS.exp_name + '/' + ckpt_folder + '/decoded'
                # summary_dir = log_dir + FLAGS.exp_name + '/decode_test_' + str(max_enc_steps) + \
                #             'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/decoded'
            summary_files = sorted(glob.glob(summary_dir + '/*'))
        if len(summary_files) == 0:
            raise Exception('No files found in %s' % summary_dir)
        example_generator = data.example_generator(source_dir + '/' +
                                                   dataset_split + '*',
                                                   True,
                                                   False,
                                                   is_original=True)
        # NOTE(review): pros, num_extracted_list, distances,
        # relative_distances, html_str and name are assigned but not read
        # anywhere in this function.
        pros = {
            'annotators': 'dcoref',
            'outputFormat': 'json',
            'timeout': '5000000'
        }
        all_merge_examples = []
        num_extracted_list = []
        distances = []
        relative_distances = []
        html_str = ''
        extracted_sents_in_article_html = ''
        name = FLAGS.dataset_name + '_' + FLAGS.exp_name
        if FLAGS.coreference_replacement:
            name += '_coref'
        highlight_file_name = os.path.join(
            html_dir, FLAGS.dataset_name + '_' + FLAGS.exp_name)
        if FLAGS.consider_stopwords:
            highlight_file_name += '_stopwords'
        if FLAGS.highlight:
            # Opened in binary mode; the HTML is encoded before each write.
            extracted_sents_in_article_html_file = open(
                highlight_file_name + '_extracted_sents.html', 'wb')
        if FLAGS.kaiqiang:
            # NOTE(review): these two files are opened here and closed at the
            # end of the split, but nothing in this function writes to them.
            kaiqiang_article_texts = []
            kaiqiang_abstract_texts = []
            util.create_dirs(kaiqiang_dir)
            kaiqiang_article_file = open(
                os.path.join(
                    kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split +
                    '_' + str(FLAGS.min_matched_tokens) + '_articles.txt'),
                'wb')
            kaiqiang_abstract_file = open(
                os.path.join(
                    kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split +
                    '_' + str(FLAGS.min_matched_tokens) + '_abstracts.txt'),
                'wb')
        if FLAGS.ssi_dataset:
            # The output directory name encodes the options that affect the
            # generated data (tag tokens, singles only, stopwords).
            if FLAGS.tag_tokens:
                with_coref_and_ssi_dir = lambdamart_dir + '_and_tag_tokens'
            else:
                with_coref_and_ssi_dir = lambdamart_dir
            lambdamart_out_dir = os.path.join(with_coref_and_ssi_dir,
                                              FLAGS.dataset_name)
            if FLAGS.sentence_limit == 1:
                lambdamart_out_dir += '_singles'
            if FLAGS.consider_stopwords:
                lambdamart_out_dir += '_stopwords'
            lambdamart_out_full_dir = os.path.join(lambdamart_out_dir, 'all')
            util.create_dirs(lambdamart_out_full_dir)
            lambdamart_writer = open(
                os.path.join(lambdamart_out_full_dir, dataset_split + '.bin'),
                'wb')

        simple_similar_source_indices_list_plus_empty = []
        # example_idx counts every example read; instance_idx counts only the
        # examples that were fully processed.
        example_idx = -1
        instance_idx = 0
        # Only used as the progress-bar length.
        total = len(source_files) * 1000 if (
            'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
            or 'xsum' in FLAGS.dataset_name) else len(source_files)
        random_choices = None
        if FLAGS.randomize:
            # Random sub-sampling is only set up for 'cnn_dm'; 11490 is the
            # hard-coded size of the permutation to sample from.
            if FLAGS.dataset_name == 'cnn_dm':
                list_order = np.random.permutation(11490)
                random_choices = list_order[:FLAGS.num_instances]
        for example in tqdm(example_generator, total=total):
            example_idx += 1
            if FLAGS.num_instances != -1 and instance_idx >= FLAGS.num_instances:
                break
            if random_choices is not None and example_idx not in random_choices:
                continue
            # for file_idx in tqdm(range(len(source_files))):
            #     example = get_tf_example(source_files[file_idx])
            article_text = example.features.feature[
                'article'].bytes_list.value[0].decode().lower()
            if FLAGS.exp_name == 'reference':
                summary_text, all_summary_texts = get_summary_from_example(
                    example)
            else:
                # Decoded summaries are matched to examples by position in the
                # sorted file list.
                summary_text = get_summary_text(summary_files[example_idx])
            article_tokens = split_into_tokens(article_text)
            if 'raw_article_sents' in example.features.feature and len(
                    example.features.feature['raw_article_sents'].bytes_list.
                    value) > 0:
                # Use the sentence split stored in the example, dropping
                # sentences that are empty after stripping.
                raw_article_sents = example.features.feature[
                    'raw_article_sents'].bytes_list.value
                raw_article_sents = [
                    sent.decode() for sent in raw_article_sents
                    if sent.decode().strip() != ''
                ]
                article_sent_tokens = [
                    util.process_sent(sent, whitespace=True)
                    for sent in raw_article_sents
                ]
            else:
                # article_text = util.to_unicode(article_text)
                # sent_pros = {'annotators': 'ssplit', 'outputFormat': 'json', 'timeout': '5000000'}
                # sents_result_dict = nlp.annotate(str(article_text), properties=sent_pros)
                # article_sent_tokens = [[token['word'] for token in sent['tokens']] for sent in sents_result_dict['sentences']]
                # No stored sentence split: fall back to NLTK on the
                # lower-cased article text.
                raw_article_sents = nltk.tokenize.sent_tokenize(article_text)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
            if FLAGS.top_n_sents != -1:
                article_sent_tokens = article_sent_tokens[:FLAGS.top_n_sents]
                raw_article_sents = raw_article_sents[:FLAGS.top_n_sents]
            article_sents = [' '.join(sent) for sent in article_sent_tokens]
            # NOTE(review): article_tokens_string is not read later in this
            # function; the nested try only retries with a latin-1 decode and
            # re-raises if that fails too.
            try:
                article_tokens_string = str(' '.join(article_sents))
            except:
                try:
                    article_tokens_string = str(' '.join(
                        [sent.decode('latin-1') for sent in article_sents]))
                except:
                    raise
            if len(article_sent_tokens) == 0:
                continue

            summary_sent_tokens = split_into_sent_tokens(summary_text)
            if 'doc_indices' in example.features.feature and len(
                    example.features.feature['doc_indices'].bytes_list.value
            ) > 0:
                doc_indices_str = example.features.feature[
                    'doc_indices'].bytes_list.value[0].decode()
                if '1' in doc_indices_str:
                    # A '1' in the string is taken to mean that more than one
                    # document is present.
                    doc_indices = [
                        int(x) for x in doc_indices_str.strip().split()
                    ]
                    rel_sent_positions = importance_features.get_sent_indices(
                        article_sent_tokens, doc_indices)
                else:
                    # Single document: one 0 per token.
                    num_tokens_total = sum(
                        [len(sent) for sent in article_sent_tokens])
                    rel_sent_positions = list(range(len(raw_article_sents)))
                    doc_indices = [0] * num_tokens_total
            else:
                rel_sent_positions = None
                doc_indices = None
                doc_indices_str = None
            # NOTE(review): corefs / corefs_str are only assigned when the
            # example carries a 'corefs' feature; otherwise they keep the
            # previous example's value (or are unbound on the first one) but
            # are still read below -- confirm every example has corefs.
            if 'corefs' in example.features.feature and len(
                    example.features.feature['corefs'].bytes_list.value) > 0:
                corefs_str = example.features.feature[
                    'corefs'].bytes_list.value[0]
                corefs = json.loads(corefs_str)
            # summary_sent_tokens = limit_to_n_tokens(summary_sent_tokens, 100)

            similar_source_indices_list_plus_empty = []

            simple_similar_source_indices, lcs_paths_list, article_lcs_paths_list, smooth_article_paths_list = ssi_functions.get_simple_source_indices_list(
                summary_sent_tokens,
                article_sent_tokens,
                vocab,
                FLAGS.sentence_limit,
                FLAGS.min_matched_tokens,
                not FLAGS.consider_stopwords,
                lemmatize=FLAGS.lemmatize,
                multiple_ssi=FLAGS.multiple_ssi)

            # smart_tags takes precedence over tag_tokens; with neither flag
            # set the parameter is None.
            article_paths_parameter = article_lcs_paths_list if FLAGS.tag_tokens else None
            article_paths_parameter = smooth_article_paths_list if FLAGS.smart_tags else article_paths_parameter
            restricted_source_indices = util.enforce_sentence_limit(
                simple_similar_source_indices, FLAGS.sentence_limit)
            for summ_sent_idx, summ_sent in enumerate(summary_sent_tokens):
                if FLAGS.sent_dataset:
                    # Summary sentences without any matched source sentence
                    # produce no merge example.
                    if len(restricted_source_indices[summ_sent_idx]) == 0:
                        continue
                    # NOTE(review): article_paths_parameter is indexed here,
                    # which fails when it is None (neither tag_tokens nor
                    # smart_tags set).
                    merge_example = get_merge_example(
                        restricted_source_indices[summ_sent_idx],
                        article_sent_tokens, summ_sent, corefs,
                        article_paths_parameter[summ_sent_idx])
                    all_merge_examples.append(merge_example)

            simple_similar_source_indices_list_plus_empty.append(
                simple_similar_source_indices)
            if FLAGS.ssi_dataset:
                # NOTE(review): all_summary_texts is only assigned when
                # exp_name == 'reference', yet it is read here whenever the
                # dataset is 'duc_2004'.
                summary_text_to_save = [
                    s for s in all_summary_texts
                ] if FLAGS.dataset_name == 'duc_2004' else summary_text
                write_lambdamart_example(simple_similar_source_indices,
                                         raw_article_sents,
                                         summary_text_to_save, corefs_str,
                                         doc_indices_str,
                                         article_paths_parameter,
                                         lambdamart_writer)

            if FLAGS.highlight:
                highlight_article_lcs_paths_list = smooth_article_paths_list if FLAGS.smart_tags else article_lcs_paths_list
                # simple_ssi_plus_empty = [ [s[0] for s in sim_source_ind] for sim_source_ind in simple_similar_source_indices]
                extracted_sents_in_article_html = ssi_functions.html_highlight_sents_in_article(
                    summary_sent_tokens, simple_similar_source_indices,
                    article_sent_tokens, doc_indices, lcs_paths_list,
                    highlight_article_lcs_paths_list)
                extracted_sents_in_article_html_file.write(
                    extracted_sents_in_article_html.encode())
            # No-op assignment (looks like a leftover debugger anchor).
            a = 0

            instance_idx += 1

        if FLAGS.ssi_dataset:
            lambdamart_writer.close()
            # Large datasets are split into chunks of 1000 examples; others
            # get one example per chunk.
            if FLAGS.dataset_name == 'cnn_dm' or FLAGS.dataset_name == 'newsroom' or FLAGS.dataset_name == 'xsum':
                chunk_size = 1000
            else:
                chunk_size = 1
            util.chunk_file(dataset_split,
                            lambdamart_out_full_dir,
                            lambdamart_out_dir,
                            chunk_size=chunk_size)

        if FLAGS.sent_dataset:
            # The output directory name encodes every option that affects the
            # generated sentence dataset.
            with_coref_dir = data_dir + '_and_tag_tokens' if FLAGS.tag_tokens else data_dir
            out_dir = os.path.join(with_coref_dir,
                                   FLAGS.dataset_name + '_sent')
            if FLAGS.sentence_limit == 1:
                out_dir += '_singles'
            if FLAGS.consider_stopwords:
                out_dir += '_stopwords'
            if FLAGS.coreference_replacement:
                out_dir += '_coref'
            if FLAGS.top_n_sents != -1:
                out_dir += '_n=' + str(FLAGS.top_n_sents)
            util.create_dirs(out_dir)
            convert_data.write_with_generator(iter(all_merge_examples),
                                              len(all_merge_examples), out_dir,
                                              dataset_split)

        if FLAGS.print_output:
            # html_str = FLAGS.dataset + ' | ' + FLAGS.exp_name + '<br><br><br>' + html_str
            # save_fusions_to_file(html_str)
            ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name)
            if FLAGS.consider_stopwords:
                ssi_path += '_stopwords'
            util.create_dirs(ssi_path)
            # For duc_2004 a non-zero abstract index is appended to the
            # pickle's file name.
            if FLAGS.dataset_name == 'duc_2004' and FLAGS.abstract_idx != 0:
                abstract_idx_str = '_%d' % FLAGS.abstract_idx
            else:
                abstract_idx_str = ''
            with open(
                    os.path.join(
                        ssi_path,
                        dataset_split + '_ssi' + abstract_idx_str + '.pkl'),
                    'wb') as f:
                pickle.dump(simple_similar_source_indices_list_plus_empty, f)

        if FLAGS.kaiqiang:
            # kaiqiang_article_file.write('\n'.join(kaiqiang_article_texts))
            # kaiqiang_abstract_file.write('\n'.join(kaiqiang_abstract_texts))
            kaiqiang_article_file.close()
            kaiqiang_abstract_file.close()
        if FLAGS.highlight:
            extracted_sents_in_article_html_file.close()
    # No-op assignment (looks like a leftover debugger anchor).
    a = 0
def main(unused_argv):
    """Dump every article as one line of text per example.

    For each requested dataset and split, the sentences of every article are
    joined with single spaces and written, one article per line, to
    data/bert/<dataset_name>/article_embeddings/input_article/<split>.tsv.

    FLAGS.dataset_name == 'all' expands to cnn_dm, xsum and duc_2004, and
    FLAGS.dataset_split == 'all' expands to test/val/train (only 'test' for
    duc_2004). FLAGS.dataset_name is overwritten with the dataset currently
    being processed.

    Args:
        unused_argv: Leftover command-line arguments; must have length 1.

    Raises:
        Exception: If unexpected positional arguments were supplied.
    """
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:
        # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name
        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))
            # Only used as the progress-bar length; same estimate as the other
            # entry points (1000 examples per file for the chunked datasets).
            total = len(source_files) * 1000 if (
                'cnn' in dataset_name or 'newsroom' in dataset_name
                or 'xsum' in dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_dir = os.path.join('data', 'bert', dataset_name,
                                   'article_embeddings', 'input_article')
            util.create_dirs(out_dir)

            # The file is binary, so each line is encoded before writing;
            # writing a str to a 'wb' handle raises TypeError on Python 3.
            # `with` also closes the file, which the old code never did.
            with open(os.path.join(out_dir, dataset_split) + '.tsv',
                      'wb') as writer:
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    if (FLAGS.num_instances != -1
                            and example_idx >= FLAGS.num_instances):
                        break
                    # Only the article sentences are needed here.
                    raw_article_sents, _, _, _, _ = util.unpack_tf_example(
                        example, names_to_types)
                    article = ' '.join(raw_article_sents)
                    writer.write((article + '\n').encode())
def convertImagesMasterMAP(targetDir, targetMetaDir, imageMetaDir, jobs, img2bal, stopped, queue, result_queue, num_imgs2process, verbose=False, nProc=None, method_galign=GALIGN_NORMAL, method_lalign=LALIGN_NORMAL): """ Called by both single and multi-page elections. Performs Target Extraction. Input: str targetDir: Directory to dump extracted target images to. str targetMetaDir: Directory to store target metadata into. str imageMetaDir: Directory to store metadata for each Ballot, such as ballotpath, path to each extracted target, assoc'd blank ballot, isflipped. list jobs: [[tmppaths_i, bbs_i, imgpaths_i, targetDir_i, targetDiffDir_i, imageMetaDir_i, queue], ...] stopped: """ targetDiffDir = targetDir + '_diffs' print "...removing previous Target Extract results..." if os.path.exists(targetDir): shutil.rmtree(targetDir) if os.path.exists(targetDiffDir): shutil.rmtree(targetDiffDir) if os.path.exists(targetMetaDir): shutil.rmtree(targetMetaDir) if os.path.exists(imageMetaDir): shutil.rmtree(imageMetaDir) print "...Finished removing previous Target Extract results" create_dirs(targetDir) create_dirs(targetDiffDir) create_dirs(targetMetaDir) create_dirs(imageMetaDir) if nProc is None: nProc = sh.numProcs() # nProc = 1 num_jobs = len(jobs) if nProc < 2: print 'using only 1 processes' # default behavior for non multiproc machines for job in jobs: if stopped(): return False t0 = time.clock() convertImagesWorkerMAP(job) print time.clock() - t0 else: print 'using ', nProc, ' processes' pool = mp.Pool(processes=nProc) ''' it = [False] def imdone(x): it[0] = True print "I AM DONE NOW!" 
''' if wx.App.IsMainLoopRunning(): util.MyGauge.all_next_job(num_jobs) print "GOING UP TO", num_jobs # pool.map_async(convertImagesWorkerMAP,jobs,callback=lambda x: imdone(it)) pool.map_async(convertImagesWorkerMAP, jobs) cnt = 0 while cnt < len(jobs): val = queue.get(block=True) if val == True: if wx.App.IsMainLoopRunning(): util.MyGauge.all_tick() cnt += 1 elif type(val) in (str, unicode): # Something went wrong! print " WARNING: detected a failed extract job {0}.".format( cnt) cnt += 1 pool.close() pool.join() print " (Finished processing targetextract jobs)" cnt = 0 avg_intensities = [] # [(path, float avg_intensity), ...] # maps {int ballotid: {int page: [targetsdir, targetmetadir, diffmetadir, # imgmetadir]}} bal2targets = {} while cnt < num_imgs2process: (avg_intensities_cur, balP, page, target_rootdir, targetdiff_rootdir, imgmeta_rootdir) = result_queue.get(block=True) avg_intensities.extend(avg_intensities_cur) ballotid = img2bal[balP] # print "...finished ballotid {0}".format(ballotid) bal2targets.setdefault(ballotid, {})[page] = (target_rootdir, targetdiff_rootdir, imgmeta_rootdir) cnt += 1 print 'done.' return avg_intensities, bal2targets
# Module-level configuration: dataset-dependent parameter, working
# directories, and the fitted TF-IDF vectorizer.

# l_param is 40 for xsum and 100 for every other dataset.
l_param = 40 if FLAGS.dataset_name == 'xsum' else 100

# Scratch locations for LambdaMART input/output. The '.txt' paths share the
# directories' base names, and '_pca' is appended when PCA is enabled.
temp_in_dir = os.path.join(lambdamart_in_dir,
                           'lambdamart_' + FLAGS.singles_and_pairs)
temp_out_dir = os.path.join(lambdamart_out_dir,
                            'lambdamart_' + FLAGS.singles_and_pairs)
if FLAGS.pca:
    temp_in_dir, temp_out_dir = temp_in_dir + '_pca', temp_out_dir + '_pca'
temp_in_path, temp_out_path = temp_in_dir + '.txt', temp_out_dir + '.txt'
util.create_dirs(temp_in_dir)
util.create_dirs(temp_out_dir)

# Per-experiment log directory with its decoded / reference / HTML children.
my_log_dir = os.path.join(log_dir, exp_name)
dec_dir, ref_dir, html_dir = (os.path.join(my_log_dir, 'decoded'),
                              os.path.join(my_log_dir, 'reference'),
                              os.path.join(my_log_dir, 'hightlighted_html'))
util.create_dirs(dec_dir)
util.create_dirs(ref_dir)
util.create_dirs(html_dir)
util.create_dirs(temp_dir)

# Load the pickled TF-IDF vectorizer for the configured model.
tfidf_vec_path = 'data/tfidf/' + tfidf_model + '_tfidf_vec_5.pkl'
with open(tfidf_vec_path, 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

pca_vec_path = 'data/tfidf/' + 'all' + '_pca.pkl'
def main(unused_argv):
    """Convert every article of the requested splits to LambdaMART features.

    The single / pair feature-vector lengths are first measured on a tiny
    dummy article. Then, for each split, all examples are materialised,
    converted in parallel with futures.map, and every file found in the
    split's output directory is concatenated into '<out_path>.txt'.

    With FLAGS.lr set, the converted instances are additionally stacked into
    a (features | relevance) matrix and saved with np.save.

    Args:
        unused_argv: Leftover command-line arguments; must have length 1.

    Raises:
        Exception: If unexpected positional arguments were supplied.
    """
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:
        # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'
    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)

    # Measure the feature-vector lengths once on a two-sentence dummy article.
    ex_sents = ['single .', 'sentence .']
    ex_sent_tokens = [['single', '.'], ['sentence', '.']]
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix, ex_sent_tokens,
                                     [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix, ex_sent_tokens,
                                   [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))
        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))

        # Only used as the progress-bar length.
        total = len(source_files) * 1000 if (
            'cnn' in in_dataset or 'newsroom' in in_dataset
            or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(source_dir + '/' + split +
                                                   '*',
                                                   True,
                                                   False,
                                                   should_check_valid=False)
        ex_gen = example_generator_extended(example_generator, total,
                                            single_feat_len, pair_feat_len,
                                            FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = list(ex_gen)
        if FLAGS.num_instances != -1:
            ex_list = ex_list[:FLAGS.num_instances]

        print('Converting...')
        if FLAGS.lr:
            all_instances = list(
                futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = np.array([inst.features for inst in all_instances])
            y = np.expand_dims(
                np.array([inst.relevance for inst in all_instances]), 1)
            x_y = np.concatenate((x, y), 1)
            # The old code passed `writer` to np.save before it had been
            # assigned (UnboundLocalError on the first split, a closed file
            # afterwards), so open an explicit target instead.
            # NOTE(review): '<out_path>.npy' is an assumed destination --
            # confirm the path the downstream LR training expects.
            with open(out_path + '.npy', 'wb') as lr_writer:
                np.save(lr_writer, x_y)
        else:
            # Results are discarded here; list() only forces the map to run.
            # Presumably the workers write their own files under out_path,
            # which the merge below picks up -- verify against
            # convert_article_to_lambdamart_features.
            list(futures.map(convert_article_to_lambdamart_features, ex_list))

        # Concatenate every file in the split directory into one file. Both
        # sides are binary, so the bytes are copied verbatim; the old code
        # read str and wrote it to a 'wb' handle, which fails on Python 3.
        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        with open(final_out_path, 'wb') as writer:
            for file_name in tqdm(file_names):
                with open(file_name, 'rb') as f:
                    writer.write(f.read())

    util.print_execution_time(start_time)
def main(unused_argv):
    """Align system summaries with their source sentences for human evaluation.

    Two modes, selected by FLAGS.only_pairs:

    * Off: for every system, find the source sentences matching each summary
      sentence, write highlighted HTML per example, write the matched indices
      to <processed_root>/<system>/source_indices.txt ('-1' for sentences with
      no match), and print per-system percentages of compressed, fused,
      copied and unmatched sentences.
    * On: build the "pairs only" files under pairs_only_processed_root. For
      every article whose reference has at least one sentence fused from two
      sources, one random reference pair plus num_summ_sents_per_hit - 1
      random system pairs are shuffled together and written out with their
      source indices and system names.

    Args:
        unused_argv: Leftover command-line arguments; must have length 1.

    Raises:
        Exception: If unexpected positional arguments were supplied.
    """
    from contextlib import ExitStack

    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:
        # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    util.create_dirs(highlight_root)
    # Upper bound on the number of examples read from the processed files.
    max_examples = 11490

    if not FLAGS.only_pairs:
        stats = {}
        for system in systems:
            print('Processing ' + system + '...')
            num_compress = num_fuse = num_copy = num_fail = 0
            highlight_dir = os.path.join(highlight_root, system)
            processed_dir = os.path.join(processed_root, system)
            util.create_dirs(highlight_dir)

            with open(os.path.join(processed_dir, 'source_indices.txt'),
                      'w') as f_ssi, \
                    open(os.path.join(processed_dir,
                                      'summaries_tokenized.txt')) as f_summ, \
                    open(os.path.join(processed_root, 'article',
                                      'articles_tokenized.txt')) as f_article:
                for example_idx in tqdm(range(max_examples)):
                    if (FLAGS.num_instances != -1
                            and example_idx >= FLAGS.num_instances):
                        break
                    summ_line = f_summ.readline()
                    article_line = f_article.readline()
                    # readline() returns '' only at end of file; stop there
                    # rather than counting phantom empty examples as failures.
                    if not summ_line or not article_line:
                        break

                    summary_sent_tokens = [
                        sent.split() for sent in summ_line.strip().split('\t')
                    ]
                    article_sent_tokens = [
                        sent.split()
                        for sent in article_line.lower().strip().split('\t')
                    ]
                    (groundtruth_ssi_list, lcs_paths_list,
                     article_lcs_paths_list, smooth_article_paths_list
                     ) = get_simple_source_indices_list(
                         summary_sent_tokens, article_sent_tokens, None,
                         FLAGS.sentence_limit, min_matched_tokens)
                    groundtruth_highlighted_html = html_highlight_sents_in_article(
                        summary_sent_tokens,
                        groundtruth_ssi_list,
                        article_sent_tokens,
                        lcs_paths_list=lcs_paths_list,
                        article_lcs_paths_list=smooth_article_paths_list)
                    all_html = ('<u>System Summary</u><br><br>' +
                                groundtruth_highlighted_html)
                    write_highlighted_html(all_html, highlight_dir,
                                           example_idx)

                    # One tab-separated field per summary sentence: the
                    # comma-joined source indices, or '-1' when nothing
                    # matched.
                    f_ssi.write('\t'.join([
                        ','.join(str(idx) for idx in source_indices)
                        if len(source_indices) >= 1 else '-1'
                        for source_indices in groundtruth_ssi_list
                    ]) + '\n')

                    for ssi_idx, ssi in enumerate(groundtruth_ssi_list):
                        if len(ssi) >= 2:
                            num_fuse += 1
                        elif len(ssi) == 1:
                            source_sent = ' '.join(article_sent_tokens[ssi[0]])
                            summ_sent = ' '.join(summary_sent_tokens[ssi_idx])
                            if source_sent == summ_sent:
                                num_copy += 1
                            else:
                                num_compress += 1
                        else:
                            num_fail += 1
            stats[system] = (num_compress, num_fuse, num_copy, num_fail)

        print("num_compress, num_fuse, num_copy, num_fail")
        for system in systems:
            print(system)
            total = float(sum(stats[system]))
            # Guard against a system with no counted sentences.
            print('\t'.join([
                "%.2f" % (val * 100 / total if total else 0.0)
                for val in stats[system]
            ]))
    else:
        util.create_dirs(pairs_only_processed_root)
        # ExitStack closes every reader and writer, also on error; the old
        # code never closed any of them.
        with ExitStack() as stack:
            f_article = stack.enter_context(
                open(os.path.join(processed_root, 'article', 'articles.txt')))
            f_summs = []
            f_ssis = []
            for system in systems:
                processed_dir = os.path.join(processed_root, system)
                f_summs.append(
                    stack.enter_context(
                        open(os.path.join(processed_dir, 'summaries.txt'))))
                f_ssis.append(
                    stack.enter_context(
                        open(os.path.join(processed_dir,
                                          'source_indices.txt'))))
            w_article = stack.enter_context(
                open(os.path.join(pairs_only_processed_root, 'articles.txt'),
                     'w'))
            w_summ = stack.enter_context(
                open(os.path.join(pairs_only_processed_root, 'summaries.txt'),
                     'w'))
            w_ssi = stack.enter_context(
                open(
                    os.path.join(pairs_only_processed_root,
                                 'source_indices.txt'), 'w'))
            w_system = stack.enter_context(
                open(os.path.join(pairs_only_processed_root, 'systems.txt'),
                     'w'))

            systems_total = []
            for example_idx in tqdm(range(max_examples)):
                if (FLAGS.num_instances != -1
                        and example_idx >= FLAGS.num_instances):
                    break
                article_str = f_article.readline()
                # Always consume exactly one line from every system file so
                # that all files stay aligned with the articles. Previously
                # the early `break` below skipped the reads of the remaining
                # systems, shifting their later examples by one line.
                system_lines = [(f_summ.readline(), f_ssi.readline())
                                for f_summ, f_ssi in zip(f_summs, f_ssis)]
                if not article_str:
                    # End of the article file: nothing left to pair up.
                    break

                systems_summ_sents = []
                systems_ssis = []
                system_names = []
                no_reference_pairs = False
                ref_summ_sent = None
                ref_source_indices = None
                for system_name, (summ_line,
                                  ssi_line) in zip(systems, system_lines):
                    summary_sents = summ_line.strip().split('\t')
                    ssi = [
                        source_indices_str.split(',')
                        for source_indices_str in ssi_line.strip().split('\t')
                    ]
                    if system_name == 'reference':
                        ssi_pairs = []
                        summary_sents_pairs = []
                        for summ_sent_idx, source_indices in enumerate(ssi):
                            if len(source_indices) == 2:
                                ssi_pairs.append(source_indices)
                                summary_sents_pairs.append(
                                    summary_sents[summ_sent_idx])
                        if len(ssi_pairs) == 0:
                            # Without a fused reference sentence the article
                            # cannot be used at all.
                            no_reference_pairs = True
                            break
                        summary_sents_pairs, ssi_pairs = util.shuffle(
                            summary_sents_pairs, ssi_pairs)
                        ref_summ_sent = summary_sents_pairs[0]
                        ref_source_indices = ','.join(ssi_pairs[0])
                    else:
                        for summ_sent_idx, source_indices in enumerate(ssi):
                            if len(source_indices) != 2:
                                continue
                            try:
                                systems_summ_sents.append(
                                    summary_sents[summ_sent_idx])
                            except IndexError:
                                # The two files disagree on the number of
                                # sentences; show where before failing.
                                print(len(summary_sents), len(ssi),
                                      summ_sent_idx, system_name, example_idx)
                                raise
                            systems_ssis.append(','.join(ssi[summ_sent_idx]))
                            system_names.append(system_name)

                if no_reference_pairs:
                    continue
                if len(systems_summ_sents) < num_summ_sents_per_hit:
                    continue

                # Keep a random subset of system pairs, add the reference
                # pair, then shuffle again so the reference position is
                # random.
                systems_summ_sents, systems_ssis, system_names = util.shuffle(
                    systems_summ_sents, systems_ssis, system_names)
                num_system_sents = num_summ_sents_per_hit - 1
                systems_summ_sents = systems_summ_sents[:num_system_sents]
                systems_ssis = systems_ssis[:num_system_sents]
                system_names = system_names[:num_system_sents]
                systems_summ_sents.append(ref_summ_sent)
                systems_ssis.append(ref_source_indices)
                system_names.append('reference')
                systems_summ_sents, systems_ssis, system_names = util.shuffle(
                    systems_summ_sents, systems_ssis, system_names)

                w_article.write(article_str)
                w_summ.write('\t'.join(systems_summ_sents) + '\n')
                w_ssi.write('\t'.join(systems_ssis) + '\n')
                w_system.write('\t'.join(system_names) + '\n')
                systems_total.extend(system_names)
            print(Counter(systems_total))
def main(unused_argv):
    """Evaluate the selected baseline summarizers on the selected datasets.

    For every (summary method, dataset) pair, each example is summarized into
    5 sentences, the lower-cased summary is written for ROUGE alongside the
    ground-truth summaries, and the article sentences matched to each system
    sentence are collected for the sentence-selection evaluation. ROUGE
    results are logged per pair and all result strings are printed at the end.

    Args:
        unused_argv: Leftover command-line arguments; must contain exactly one
            element, otherwise the flags were entered incorrectly.

    Raises:
        Exception: If ``unused_argv`` does not have exactly one element.
    """
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.summarizer == 'all':
        summary_methods = list(summarizers.keys())
    else:
        summary_methods = [FLAGS.summarizer]
    if FLAGS.dataset_name == 'all':
        dataset_names = datasets
    else:
        dataset_names = [FLAGS.dataset_name]

    sheets_strs = []
    for summary_method in summary_methods:
        summary_fn = summarizers[summary_method]
        for dataset_name in dataset_names:
            # Written back to the flag; code outside this function may read it.
            FLAGS.dataset_name = dataset_name

            if 'xsum' in dataset_name:
                original_dataset_name = 'xsum'
            elif 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name:
                original_dataset_name = 'cnn_dm'
            else:
                original_dataset_name = ''
            vocab = Vocab('logs/vocab' + '_' + original_dataset_name,
                          50000)  # create a vocabulary

            source_dir = os.path.join(data_dir, dataset_name)
            source_pattern = source_dir + '/' + FLAGS.dataset_split + '*'
            source_files = sorted(glob.glob(source_pattern))
            # Progress-bar total: these datasets are scaled by 1000 per file.
            if ('cnn' in dataset_name or 'newsroom' in dataset_name
                    or 'xsum' in dataset_name):
                total = len(source_files) * 1000
            else:
                total = len(source_files)
            example_generator = data.example_generator(
                source_pattern, True, False, should_check_valid=False)

            if dataset_name == 'duc_2004':
                # duc_2004 abstracts are read from a second set of examples,
                # consumed in lockstep with the main generator below.
                abs_source_dir = os.path.join(
                    os.path.expanduser('~') + '/data/tf_data/with_coref',
                    dataset_name)
                abs_example_generator = data.example_generator(
                    abs_source_dir + '/' + FLAGS.dataset_split + '*',
                    True, False, should_check_valid=False)
                abs_names_to_types = [('abstract', 'string_list')]

            # Output locations depend only on (dataset, method), so they are
            # prepared once here instead of once per example. This also keeps
            # them defined when the generator yields no examples.
            log_dir = os.path.join(log_root,
                                   dataset_name + '_' + summary_method)
            dec_dir = os.path.join(log_dir, 'decoded')
            ref_dir = os.path.join(log_dir, 'reference')
            util.create_dirs(dec_dir)
            util.create_dirs(ref_dir)

            sentence_limit = 2
            triplet_ssi_list = []
            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                (raw_article_sents, groundtruth_similar_source_indices_list,
                 groundtruth_summary_text, corefs,
                 doc_indices) = util.unpack_tf_example(example, names_to_types)

                if dataset_name == 'duc_2004':
                    abs_example = next(abs_example_generator)
                    groundtruth_summary_texts = util.unpack_tf_example(
                        abs_example, abs_names_to_types)
                    groundtruth_summary_texts = groundtruth_summary_texts[0]
                    groundtruth_summ_sents_list = [
                        [sent.strip()
                         for sent in data.abstract2sents(abstract)]
                        for abstract in groundtruth_summary_texts
                    ]
                else:
                    groundtruth_summary_texts = [groundtruth_summary_text]
                    groundtruth_summ_sents_list = []
                    for groundtruth_summary_text in groundtruth_summary_texts:
                        groundtruth_summ_sents = [
                            sent.strip() for sent in
                            groundtruth_summary_text.strip().split('\n')
                        ]
                        groundtruth_summ_sents_list.append(
                            groundtruth_summ_sents)

                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
                if doc_indices is None:
                    doc_indices = [0] * len(
                        util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list,
                    FLAGS.sentence_limit)

                parser = PlaintextParser.from_string(
                    ' '.join(raw_article_sents), Tokenizer("english"))
                summarizer = summary_fn()
                # Summarize the document with 5 sentences
                summary = summarizer(parser.document, 5)
                summary_tokenized = [
                    str(sentence).lower() for sentence in summary
                ]

                rouge_functions.write_for_rouge(
                    groundtruth_summ_sents_list, summary_tokenized,
                    example_idx, ref_dir, dec_dir, log=False)

                decoded_sent_tokens = [
                    sent.split() for sent in summary_tokenized
                ]
                sys_ssi_list, _, _ = get_simple_source_indices_list(
                    decoded_sent_tokens, article_sent_tokens, vocab,
                    sentence_limit, min_matched_tokens)
                triplet_ssi_list.append(
                    (groundtruth_similar_source_indices_list, sys_ssi_list,
                     -1))

            print('Evaluating Lambdamart model F1 score...')
            suffix = util.all_sent_selection_eval(triplet_ssi_list)
            print(suffix)

            results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
            print(("Results_dict: ", results_dict))
            sheets_str = rouge_functions.rouge_log(results_dict, log_dir,
                                                   suffix=suffix)
            sheets_strs.append(dataset_name + '_' + summary_method + '\n' +
                               sheets_str)

    for sheets_str in sheets_strs:
        print(sheets_str + '\n')
def main():
    """Pre-process every image in the input folder and optionally save the result.

    Each image file is turned into a list of channels (raw channels and/or the
    enabled cell, vessel and white-matter detections), paired with a
    ``[slide, rid, label]`` row, and the collected data is written in
    compressed batches of 1000 samples when saving is enabled. Cell-detection
    over/under prediction medians are printed when evaluation is enabled.
    All behaviour switches come from module-level constants.
    """
    root_path = os.getcwd()
    util.create_dirs(OUTPUT_DIR)
    output_path = os.path.join(root_path, 'results', OUTPUT_DIR)
    input_path = os.path.join(root_path, *DATA_PATH)
    anot_path = os.path.join(root_path, *ANOT_PATH)
    label_path = os.path.join(root_path, *LABEL_PATH)
    img_util = util.ImgProcessor(OUTPUT_DIR, save_mode=False)

    labels = {}
    if NORMALIZE_LABEL:
        with open(os.path.join(root_path, *NORMALIZE_MAP_FILE)) as json_file:
            norm_label_map = json.load(json_file)

    if ADD_LABEL:
        print('Preparing Labels')
        with codecs.open(label_path, "r", encoding="utf-8") as csv_file:
            rows = csv.reader(csv_file)
            next(rows)  # skip the header row
            for row in rows:
                label_key = '{}_{}'.format(row[0], row[1])
                raw_value = row[2]
                labels[label_key] = (norm_label_map[raw_value]
                                     if NORMALIZE_LABEL else float(raw_value))

    print('Start Pre-processing: {}'.format(input_path))
    count = 0
    edge_over_pred = []
    edge_under_pred = []
    color_over_pred = []
    color_under_pred = []
    full_labels = []
    full_dataset = []

    for file_name in os.listdir(input_path):
        if COUNT_LIMIT != -1 and count >= COUNT_LIMIT:
            break
        file_path = os.path.join(input_path, file_name)
        if not os.path.isfile(file_path):
            continue

        print("\tProcessing: {}".format(file_name))
        img = cv2.imread(file_path)
        processed_img = []
        # File names are expected to look like "<slide>_<rid>[...].<ext>".
        name, _extension = os.path.splitext(file_name)
        name_parts = name.split('_')
        slide = int(name_parts[0])
        rid = int(name_parts[1])

        if BIN_DATASET:
            p = labels.get(name, -1)
            # Unlabelled images (-1) are kept; labelled ones must fall in the bin.
            if p != -1 and not (BIN_RANGE[0] <= p < BIN_RANGE[1]):
                continue

        if APPEND_RAW:
            processed_img.extend(img[:, :, channel] for channel in range(3))

        if APPEND_CELL_EDGE_DETECT:
            if EVALUATE_CELL_DETECTION:
                true_cell_count = get_cell_count(anot_path, file_name)
            for thr in CELL_EDGE_THRESHOLD:
                new_img, anot_size = img_util.cell_detection(
                    img, file_name, threshold=thr)
                processed_img.append(new_img)
                if EVALUATE_CELL_DETECTION:
                    edge_over_pred, edge_under_pred = evaluate_cell_detection(
                        true_cell_count, anot_size,
                        edge_over_pred, edge_under_pred)
                if DEV_MODE:
                    break

        if APPEND_CELL_COLOR_DETECT:
            for boundary in COLOR_BOUNDARIES:
                new_img, anot_size = img_util.cell_detection(
                    img, file_name, mode=1, boundaries=boundary)
                processed_img.append(new_img)
                if EVALUATE_CELL_DETECTION:
                    color_over_pred, color_under_pred = evaluate_cell_detection(
                        true_cell_count, anot_size,
                        color_over_pred, color_under_pred)
                if DEV_MODE:
                    break

        if APPEND_VESSEL_DETECT:
            for thr in VESSEL_THRESHOLD:
                new_img = img_util.detect_red_vessels(
                    img, file_name, bgr_base=VESSEL_BASE, threshold=thr)
                processed_img.append(new_img)
                if DEV_MODE:
                    break

        if APPEND_WHITE_DETECT:
            red_scale = util.grayscale_img('red', img)
            green_scale = util.grayscale_img('green', img)
            blue_scale = util.grayscale_img('blue', img)
            for thr in WHITE_THRESHOLD:
                temp_img = img.copy()
                new_img = img_util.detect_white_matter(
                    temp_img, file_name, red_scale, green_scale, blue_scale,
                    threshold=thr)
                processed_img.append(new_img)
                if DEV_MODE:
                    break

        full_dataset.append(processed_img)

        if not ADD_LABEL:
            label = 0
        elif CLASSIFY_LABELS:
            p = labels[name]
            # One-hot class: [0, 0.1) -> 0, [0.1, 0.45) -> 1, anything else -> 2.
            if 0 <= p < 0.1:
                hot_index = 0
            elif 0.1 <= p < 0.45:
                hot_index = 1
            else:
                hot_index = 2
            label = [0, 0, 0]
            label[hot_index] = 1
        else:
            label = labels[name]

        full_labels.append([slide, rid, label])
        count += 1

    print('Saving Dataset')
    if SAVE_PROCESSED_DATA:
        nb_batchs = int(len(full_dataset)/1000)
        prefix = OUTPUT_DATA_PREFIX
        if CLASSIFY_LABELS:
            prefix += '_class'
        if BIN_DATASET:
            prefix += '_bin({}-{})'.format(BIN_RANGE[0], BIN_RANGE[1])
        if NORMALIZE_LABEL:
            prefix = '{}_norm{}'.format(prefix, NORM_VERSION)

        # Full batches of 1000 samples each.
        for batch_idx in range(nb_batchs):
            lb = batch_idx * 1000
            ub = lb + 1000
            batch_file = '{}_{}'.format(prefix, str(batch_idx))
            np.savez_compressed(
                os.path.join(output_path, OUTPUT_DATA_PATH, batch_file),
                labels=full_labels[lb:ub], dataset=full_dataset[lb:ub])

        # Whatever is left over goes into one final, smaller batch.
        remainder = len(full_dataset) % 1000
        if remainder > 0:
            lb = nb_batchs * 1000
            ub = lb + remainder
            last_file = '{}_{}'.format(prefix, str(nb_batchs))
            np.savez_compressed(
                os.path.join(output_path, OUTPUT_DATA_PATH, last_file),
                labels=full_labels[lb:ub], dataset=full_dataset[lb:ub])

    # Calculate Cell Detection Performance
    if EVALUATE_CELL_DETECTION:
        edge_avg_over_pred = median(edge_over_pred) if len(edge_over_pred) > 0 else 0
        edge_avg_under_pred = median(edge_under_pred) if len(edge_under_pred) > 0 else 0
        color_avg_over_pred = median(color_over_pred) if len(color_over_pred) > 0 else 0
        color_avg_under_pred = median(color_under_pred) if len(color_under_pred) > 0 else 0

        print('Performance:')
        print('\tEdge Avg Under Pred = {}'.format(edge_avg_under_pred))
        print('\tEdge Avg Over Pred = {}'.format(edge_avg_over_pred))
        print('\tColor Avg Under Pred = {}'.format(color_avg_under_pred))
        print('\tColor Avg Over Pred = {}'.format(color_avg_over_pred))

    print('Done Pre-processing')
def convertImagesMasterMAP(targetDir, targetMetaDir, imageMetaDir, jobs,
                           img2bal, stopped, queue, result_queue,
                           num_imgs2process, verbose=False, nProc=None,
                           method_galign=GALIGN_NORMAL,
                           method_lalign=LALIGN_NORMAL):
    """ Called by both single and multi-page elections. Performs
    Target Extraction.

    Any previous output under the four output directories is deleted and
    the directories are recreated before the jobs run.

    Input:
        str targetDir: Directory to dump extracted target images to. A
            sibling directory targetDir + '_diffs' is also wiped/recreated.
        str targetMetaDir: Directory to store target metadata into.
        str imageMetaDir: Directory to store metadata for each Ballot,
            such as ballotpath, path to each extracted target, assoc'd
            blank ballot, isflipped.
        list jobs: [[tmppaths_i, bbs_i, imgpaths_i, targetDir_i,
            targetDiffDir_i, imageMetaDir_i, queue], ...]. Each entry is
            passed as-is to convertImagesWorkerMAP.
        dict img2bal: Indexed by the ballot path reported on result_queue
            to obtain the ballot id.
        fn stopped: Called with no arguments before each job in the
            single-process path; a true result aborts the run.
        queue: In the multi-process path, one item is read per job: True
            for a finished job, a str/unicode for a failed one.
        result_queue: Read num_imgs2process times; each item is a 6-tuple
            (avg_intensities_cur, balP, page, target_rootdir,
            targetdiff_rootdir, imgmeta_rootdir).
        int num_imgs2process: Number of items to read from result_queue.
        int nProc: Number of worker processes; defaults to sh.numProcs().
        verbose, method_galign, method_lalign: Not referenced in this
            function body.
    Output:
        False if stopped() fired in the single-process path; otherwise
        the tuple (avg_intensities, bal2targets).
    """
    targetDiffDir=targetDir+'_diffs'

    # Start from a clean slate: remove results of any previous extraction.
    print "...removing previous Target Extract results..."
    _t = time.time()
    if os.path.exists(targetDir): shutil.rmtree(targetDir)
    if os.path.exists(targetDiffDir): shutil.rmtree(targetDiffDir)
    if os.path.exists(targetMetaDir): shutil.rmtree(targetMetaDir)
    if os.path.exists(imageMetaDir): shutil.rmtree(imageMetaDir)
    dur = time.time() - _t
    print "...Finished removing previous Target Extract results ({0} s).".format(dur)

    create_dirs(targetDir)
    create_dirs(targetDiffDir)
    create_dirs(targetMetaDir)
    create_dirs(imageMetaDir)

    if nProc == None:
        nProc = sh.numProcs()
    #nProc = 1

    num_jobs = len(jobs)

    if nProc < 2:
        print 'using only 1 processes'
        # default behavior for non multiproc machines
        for job in jobs:
            if stopped():
                return False
            t0=time.clock();
            convertImagesWorkerMAP(job)
            print time.clock()-t0
    else:
        print 'using ', nProc, ' processes'
        pool=mp.Pool(processes=nProc)

        # Disabled completion-callback experiment, kept as an inert string.
        ''' it = [False] def imdone(x): it[0] = True print "I AM DONE NOW!"
        '''
        # Tell the GUI progress gauge how many jobs to expect (GUI runs only).
        if wx.App.IsMainLoopRunning():
            wx.CallAfter(Publisher().sendMessage, "signals.MyGauge.nextjob", num_jobs)
        print "GOING UP TO", num_jobs
        #pool.map_async(convertImagesWorkerMAP,jobs,callback=lambda x: imdone(it))
        pool.map_async(convertImagesWorkerMAP, jobs)
        # Block until every job has reported on the queue, either success
        # (True) or failure (a string).
        # NOTE(review): any other value is consumed without advancing cnt,
        # so the loop would then wait for an extra message -- confirm that
        # workers only ever post True or a string.
        cnt = 0
        while cnt < len(jobs):
            val = queue.get(block=True)
            if val == True:
                if wx.App.IsMainLoopRunning():
                    wx.CallAfter(Publisher().sendMessage, "signals.MyGauge.tick")
                cnt += 1
            elif type(val) in (str, unicode):
                # Something went wrong!
                print " WARNING: detected a failed extract job {0}.".format(cnt)
                cnt += 1
        pool.close()
        pool.join()

    print " (Finished processing targetextract jobs)"

    # Collect one result tuple per processed image.
    cnt = 0
    avg_intensities = [] # [(path, float avg_intensity), ...]
    bal2targets = {} # maps {int ballotid: {int page: (targetsdir, diffmetadir, imgmetadir)}}

    while cnt < num_imgs2process:
        (avg_intensities_cur, balP, page, target_rootdir,
         targetdiff_rootdir, imgmeta_rootdir) = result_queue.get(block=True)
        avg_intensities.extend(avg_intensities_cur)
        ballotid = img2bal[balP]
        #print "...finished ballotid {0}".format(ballotid)
        bal2targets.setdefault(ballotid, {})[page] = (target_rootdir,
                                                      targetdiff_rootdir, imgmeta_rootdir)
        cnt += 1
    print 'done.'
    return avg_intensities, bal2targets
def process_attn_selections(attn_dir, decode_dir, vocab, extraction_eval=False):
    """Render an HTML visualization for every attention dump in ``attn_dir``.

    Each JSON file is read, its article / abstract / decoded words are
    stripped of the ``'__'`` marker, every decoded word is aligned to the
    matching article word with the highest attention, and the resulting page
    is written to ``<decode_dir>/extr_vis/<index>.html``. With
    ``extraction_eval`` enabled, the system sentence selections are also
    scored against the pickled reference selections and the score string is
    written to ``<decode_dir>/extraction_results.txt``.

    Args:
        attn_dir: Directory containing the attention JSON files.
        decode_dir: Directory that receives the HTML pages and result file.
        vocab: Vocabulary object forwarded to
            ``get_simple_source_indices_list``.
        extraction_eval: Whether to run the sentence-selection evaluation.

    Raises:
        Exception: If evaluation is requested and the number of reference
            selections differs from the number of attention files.
    """
    html_dir = os.path.join(decode_dir, 'extr_vis')
    util.create_dirs(html_dir)
    file_names = sorted(glob.glob(os.path.join(attn_dir, '*')))

    if extraction_eval:
        ssi_dir = os.path.join('data/ssi', FLAGS.dataset_name, 'test_ssi.pkl')
        # Pickle data is binary; text mode breaks pickle.load on Python 3.
        with open(ssi_dir, 'rb') as f:
            ssi_list = pickle.load(f)
        if len(ssi_list) != len(file_names):
            raise Exception('len of ssi_list does not equal len file_names: ', len(ssi_list), len(file_names))

    def _sentence_tokens(words):
        # Re-split a flat word list into sentences, then tokenize each one.
        text = ' '.join(words)
        return [nltk.tokenize.word_tokenize(sent)
                for sent in nltk.tokenize.sent_tokenize(text)]

    min_matched_tokens = 2
    sentence_limit = 1 if 'singles' in FLAGS.exp_name else 2

    triplet_ssi_list = []
    for file_idx, file_name in enumerate(tqdm(file_names)):
        with open(file_name) as f:
            data = json.load(f)
        # 'p_gens' is deliberately not read: it is unused here and is absent
        # from files written without pointer-generator mode.
        attn_dists = np.array(data['attn_dists'])
        article_lst = [art_word.replace('__', '') for art_word in data['article_lst']]
        decoded_lst = [dec_word.replace('__', '') for dec_word in data['decoded_lst']]
        abstract_lst = [abs_word.replace('__', '')
                        for abs_word in data['abstract_str'].strip().split()]

        summary_sent_tokens = _sentence_tokens(abstract_lst)
        decoded_sent_tokens = _sentence_tokens(decoded_lst)
        article_sent_tokens = _sentence_tokens(article_lst)

        gt_ssi_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list(
            summary_sent_tokens, article_sent_tokens, vocab, sentence_limit, min_matched_tokens)
        sys_ssi_list, _, _ = get_simple_source_indices_list(
            decoded_sent_tokens, article_sent_tokens, vocab, sentence_limit, min_matched_tokens)

        # For every decoded word pick the identical article word that received
        # the most attention at that decoding step (None if it never occurs).
        match_indices = []
        for dec_idx, dec in enumerate(decoded_lst):
            art_match_indices = [art_idx for art_idx, art_word in enumerate(article_lst)
                                 if art_word.replace('__', '') == dec or art_word == dec]
            if not art_match_indices:
                match_indices.append(None)
            else:
                art_attns = [attn_dists[dec_idx, art_idx] for art_idx in art_match_indices]
                match_indices.append(art_match_indices[np.argmax(art_attns)])

        html = create_html(article_lst, match_indices, decoded_lst, [abstract_lst], file_idx,
                           gt_ssi_list, lcs_paths_list, article_lcs_paths_list,
                           summary_sent_tokens, article_sent_tokens)
        with open(os.path.join(html_dir, '%06d.html' % file_idx), 'wb') as f:
            f.write(html)

        if extraction_eval:
            triplet_ssi_list.append((ssi_list[file_idx], sys_ssi_list, -1))

    if extraction_eval:
        print('Evaluating Lambdamart model F1 score...')
        suffix = util.all_sent_selection_eval(triplet_ssi_list)
        print(suffix)
        with open(os.path.join(decode_dir, 'extraction_results.txt'), 'wb') as f:
            f.write(suffix)
import util
import os
from tqdm import tqdm
import glob
import json

# Load the XSum split file; it is indexed by split name below and each entry
# is iterated as a collection of BBC article ids.
# The file is opened in a context manager so the handle is closed right away.
with open(os.path.expanduser('~') + "/xsum/XSum/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json") as split_file:
    split_dict = json.load(split_file)
data_types = ["test", "validation", "train"]

article_dir = os.path.expanduser('~') + '/xsum/XSum/XSum-Dataset/xsum-preprocessed/document'
summary_dir = os.path.expanduser('~') + '/xsum/XSum/XSum-Dataset/xsum-preprocessed/summary'
out_dir = os.path.expanduser('~') + '/xsum/xsum-logan'
util.create_dirs(out_dir)

# NOTE(review): without a path separator before "*", these patterns match
# entries whose names start with "document"/"summary" next to the directory,
# not the files inside it -- confirm this is intended.
article_paths = sorted(glob.glob(article_dir + "*"))
summary_paths = sorted(glob.glob(summary_dir + "*"))

for data_type in data_types:
    bbcids = split_dict[data_type]
    # The split file says "validation"; the short name used here is "val".
    if data_type == 'validation':
        dtype = 'val'
    else:
        dtype = data_type
    for bbcid_idx, bbcid in enumerate(tqdm(bbcids)):
        article_path = os.path.join(article_dir, bbcid + '.document')
        summary_path = os.path.join(summary_dir, bbcid + '.summary')
        # Ids with no article file on disk are skipped.
        if not os.path.exists(article_path):
            continue
help="clear graphs and test fields in native dirs") parser.add_argument('-inf', '--iframe', default=-1, help="display frame during inference") parser.add_argument('-g', '--grid_path', default='', help='Path to grid dictionary') args = parser.parse_args() config.data_path = args.input_dir if os.path.isdir(config.data_path): util.create_dirs(args.clear) else: print('Input dir is not valid') if not os.path.isdir(config.output_dir): print('WARNING - output dir is not valid. Meta graphs are not saved') exit() config.resample = args.trilinear config.param_state_size = args.latent_state_size config.n_filters = args.filters config.output_dir = args.output_dir config.save_freq = args.graph_saving_freq config.f_tensorboard = args.tensorboard_saving_freq config.sb_blocks = args.small_blocks config.batch_size = args.batch_size
# Experiment name and per-configuration parameters derived from the flags.
if FLAGS.upper_bound:
    exp_name += '_upperbound'

sentence_limit = 1 if FLAGS.singles_and_pairs == 'singles' else 2
l_param = 40 if FLAGS.dataset_name == 'xsum' else 100
# l_param = 100

# Paths for the BERT input file and the BERT scores file.
temp_in_path = os.path.join(bert_in_dir, 'test.tsv')
temp_out_path = os.path.join(bert_scores_dir, 'test_results.tsv')
util.create_dirs(bert_scores_dir)

# Output folders for this experiment.
my_log_dir = os.path.join(log_dir, exp_name)
dec_dir = os.path.join(my_log_dir, 'decoded')
ref_dir = os.path.join(my_log_dir, 'reference')
html_dir = os.path.join(my_log_dir, 'hightlighted_html')
util.create_dirs(dec_dir)
util.create_dirs(ref_dir)
util.create_dirs(html_dir)
util.create_dirs(ssi_out_dir)


def read_bert_scores(file_path):
    """Read a tab-separated file of numbers into a NumPy array.

    Args:
        file_path: Path of the text file; every line holds tab-separated
            values that can be parsed by ``float``.

    Returns:
        ``np.array`` built from one list of floats per line of the file.
    """
    with open(file_path) as score_file:
        rows = [[float(cell) for cell in row.split('\t')]
                for row in score_file]
    return np.array(rows)
def main(unused_argv):
    """Fit a TF-IDF vectorizer on the requested datasets and save it to disk.

    Every example of every selected dataset/split is converted to text via
    ``save_as_txt_file`` and the collected articles are used to fit the
    vectorizer. Depending on the flags, the result is a plain vectorizer
    (pickled), a stemmed n-gram vectorizer for ``pg_mmr`` (saved with dill),
    and/or an additional 100-component truncated SVD model (``pca``).

    Args:
        unused_argv: Leftover command-line arguments; must contain exactly one
            element, otherwise the flags were entered incorrectly.

    Raises:
        Exception: If ``unused_argv`` does not have exactly one element.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    start_time = time.time()
    np.random.seed(random_seed)
    util.create_dirs(os.path.join(out_dir, FLAGS.input_dataset))

    if FLAGS.input_dataset == 'all':
        datasets = [
            'duc_2003', 'duc_2004', 'tac_2008', 'tac_2010', 'tac_2011',
            'cnn_dm', 'xsum'
        ]
    else:
        datasets = [FLAGS.input_dataset]
    if dataset_split == 'all':
        dataset_splits = ['train', 'val', 'test']
    else:
        dataset_splits = [dataset_split]

    all_articles = []
    for in_dataset in datasets:
        source_dir = os.path.join(data_dir, in_dataset)
        for split in dataset_splits:
            # split = dataset_split
            source_pattern = source_dir + '/' + split + '*'
            source_files = sorted(glob.glob(source_pattern))
            if not source_files:
                continue
            # The original test was `'cnn' or 'newsroom' in in_dataset`, which
            # is always true because the non-empty literal 'cnn' is truthy, so
            # every dataset got the x1000 estimate. Each name is now tested
            # explicitly.
            # NOTE(review): 'xsum' is included on the assumption that it is
            # sharded like cnn/newsroom (~1000 examples per file) -- confirm.
            if ('cnn' in in_dataset or 'newsroom' in in_dataset
                    or 'xsum' in in_dataset):
                total = len(source_files) * 1000
            else:
                total = len(source_files)
            example_generator = data.example_generator(
                source_pattern, True, False, should_check_valid=False)
            # for example in tqdm(example_generator, total=total):
            ex_gen = example_generator_extended(example_generator, total)
            print('Creating list')
            ex_list = list(ex_gen)
            print('Converting...')
            articles = list(futures.map(save_as_txt_file, ex_list))
            all_articles.extend(articles)

    vec = TfidfVectorizer(input='content',
                          ngram_range=(1, 1),
                          min_df=min_df,
                          max_df=0.5,
                          decode_error='ignore',
                          preprocessor=my_preprocessor,
                          tokenizer=my_tokenizer)

    # list(futures.map(save_as_txt_file, ex_list))
    # file_list = [os.path.join(out_dir, in_dataset, fname) for fname in os.listdir(os.path.join(out_dir, in_dataset))]
    # vec = TfidfVectorizer(input='filename', ngram_range=(1,1), min_df=min_df, max_df=0.5, decode_error='ignore')
    # vec.fit(file_list)

    if FLAGS.pca:
        # Keep the transformed matrix: the SVD model below is fit on it.
        X = vec.fit_transform(all_articles)
        suffix = '_pca'
    elif FLAGS.pg_mmr:
        stemmer = PorterStemmer()

        class StemmedTfidfVectorizer(TfidfVectorizer):
            """TF-IDF vectorizer whose analyzer stems every produced token."""

            def build_analyzer(self):
                analyzer = super(TfidfVectorizer, self).build_analyzer()
                return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

        # Replaces the default vectorizer built above.
        vec = StemmedTfidfVectorizer(analyzer='word',
                                     stop_words='english',
                                     ngram_range=(1, 3),
                                     max_df=0.7)
        vec.fit_transform(all_articles)
    else:
        vec.fit_transform(all_articles)
        suffix = ''
    print('Vocabulary size', len(vec.vocabulary_))

    if FLAGS.pg_mmr:
        util.create_dirs(os.path.join(log_dir, 'tfidf_vectorizer'))
        with open(
                os.path.join(log_dir, 'tfidf_vectorizer',
                             FLAGS.input_dataset + '.dill'), 'wb') as f:
            dill.dump(vec, f)
    else:
        with open(
                os.path.join(
                    out_dir, FLAGS.input_dataset + '_tfidf_vec_' +
                    str(min_df) + suffix + '.pkl'), 'wb') as f:
            pickle.dump(vec, f)

    if FLAGS.pca:
        print('Fitting LSA model...')
        from sklearn.decomposition import TruncatedSVD
        svd = TruncatedSVD(n_components=100)
        svd.fit(X)
        with open(os.path.join(out_dir, FLAGS.input_dataset + '_pca' + '.pkl'),
                  'wb') as f:
            pickle.dump(svd, f)

    util.print_execution_time(start_time)