def compress(read_file_path, write_file_path):
    num_of_chunks, chunk_size = util.chunk_file(read_file_path)
    with open(read_file_path, **file_access_modes.default_read_configuration) as read_stream, \
            open(write_file_path, **file_access_modes.write_bytes_configuration) as write_stream:
        dictionary = _generate_dictionary()
        read_limit = chunk_size
        initial_phrase = _empty_str
        compression_end = False
        compressed_rest = _empty_str
        for chunk_number in range(1, num_of_chunks + 1):
            if chunk_number == num_of_chunks:
                # Last chunk: read to end of file and let the compressor flush its state.
                read_limit = None
                compression_end = True
            data = read_stream.read(read_limit)
            compressed_data, initial_phrase = _compress_data(
                data, dictionary,
                initial_phrase=initial_phrase, compression_end=compression_end)
            # Prepend the bits left over from the previous chunk, write only whole
            # bytes, and carry the remainder into the next iteration.
            compressed_data = compressed_rest + compressed_data
            integer_num_of_bytes, compressed_rest = util.extract_integer_num_of_bytes(
                compressed_data)
            _write_bytes(write_stream, integer_num_of_bytes)
        # Flush whatever is left over after the final chunk.
        _write_bytes(write_stream, compressed_rest)
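# Illustration of the byte-boundary buffering that compress() above relies on. This is a
# self-contained sketch, not the project's util.extract_integer_num_of_bytes; it only
# assumes that helper splits a bit string into whole bytes plus the leftover (<8) bits,
# which are then carried into the next chunk.
def _split_whole_bytes_sketch(bit_string):
    # Keep the largest prefix whose length is a multiple of 8; the tail is the remainder.
    cut = len(bit_string) - len(bit_string) % 8
    return bit_string[:cut], bit_string[cut:]

# e.g. ten bits -> one whole byte plus a two-bit remainder for the next chunk
assert _split_whole_bytes_sketch('1010101011') == ('10101010', '11')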
def decompress(read_file_path, write_file_path, *, code_type):
    num_of_chunks, chunk_size = util.chunk_file(read_file_path)
    code_function = _code_functions[code_type]
    ending_bit = _ending_bits[code_type]
    read_code_function = _read_code_functions[code_type]
    characters_by_frequency = _characters_by_frequencies(read_file_path)
    codes = util.generate_codes(characters_by_frequency, code_function)
    reversed_codes = util.reverse_dictionary(codes)
    rest_bits = _empty_str
    read_limit = chunk_size
    compression_end = False
    with open(read_file_path, **file_access_modes.read_bytes_configuration) as read_stream, \
            open(write_file_path, **file_access_modes.default_write_configuration) as write_stream:
        # Skip the stored characters-by-frequency header before reading the encoded payload.
        read_stream.seek(len(characters_by_frequency) + 1)
        for chunk_number in range(1, num_of_chunks + 1):
            if chunk_number == num_of_chunks:
                read_limit = None
                compression_end = True
            binary_data = read_stream.read(read_limit)
            # Carry the undecoded tail bits from the previous chunk forward.
            bits = rest_bits + util.to_bits(binary_data)
            decompressed_data, rest_bits = _decompress_data(
                bits, reversed_codes,
                read_code_function=read_code_function,
                ending_bit=ending_bit,
                compression_end=compression_end)
            write_stream.write(decompressed_data)
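# Usage sketch for the code-table decompress() above (assumptions: this module is
# importable as `huffman`, 'article.huff' was produced by the matching compress(), and
# 'fixed' is a hypothetical key of _code_functions / _ending_bits / _read_code_functions):
#
#     import huffman
#     huffman.decompress('article.huff', 'article.txt', code_type='fixed')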
def convert_singpairmix_to_tf_examples(dataset_name, processed_data_dir, tf_example_dir, dataset_split='all'):
    out_dir = os.path.join(tf_example_dir, dataset_name)
    out_full_dir = os.path.join(out_dir, 'all')
    util.create_dirs(out_full_dir)
    if dataset_split == 'all':
        if dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [dataset_split]
    for dataset_split in dataset_splits:
        processed_data_path = os.path.join(processed_data_dir, dataset_name, dataset_split)
        articles_path = os.path.join(processed_data_path, 'articles.tsv')
        abstracts_path = os.path.join(processed_data_path, 'summaries.tsv')
        highlight_path = os.path.join(processed_data_path, 'highlight.tsv')
        f_art = open(articles_path)
        f_abs = open(abstracts_path)
        f_hl = open(highlight_path)
        writer = open(os.path.join(out_full_dir, dataset_split + '.bin'), 'wb')
        total = util.num_lines_in_file(articles_path)
        for example_idx in tqdm(range(total)):
            # One example per line; sentences within a line are tab-separated.
            raw_article_sents = f_art.readline().strip().split('\t')
            groundtruth_summ_sents = f_abs.readline().strip().split('\t')
            summary_text = '\n'.join(groundtruth_summ_sents)
            article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]
            # These are single-document examples, so every token gets document index 0.
            doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
            doc_indices_str = ' '.join([str(idx) for idx in doc_indices])
            similar_source_indices = [source_indices.split(',')
                                      for source_indices in f_hl.readline().split('\t')]
            write_bert_tf_example(similar_source_indices, raw_article_sents, summary_text, None,
                                  doc_indices_str, None, writer, dataset_name)
        writer.close()
        f_art.close()
        f_abs.close()
        f_hl.close()
        if dataset_name in ('cnn_dm', 'newsroom', 'xsum'):
            chunk_size = 1000
        else:
            chunk_size = 1
        util.chunk_file(dataset_split, out_full_dir, out_dir, chunk_size=chunk_size)
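# Usage sketch (hypothetical paths; assumes processed_data_dir/<dataset_name>/<split>/
# contains the articles.tsv, summaries.tsv and highlight.tsv files read above):
#
#     convert_singpairmix_to_tf_examples('cnn_dm', 'data/processed', 'data/tf_examples',
#                                        dataset_split='test')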
def decompress(read_file_path, write_file_path):
    num_of_chunks, chunk_size = util.chunk_file(read_file_path)
    with open(read_file_path, **file_access_modes.read_bytes_configuration) as read_stream, \
            open(write_file_path, **file_access_modes.default_write_configuration) as write_stream:
        dictionary = _generate_dictionary()
        reversed_dictionary = util.reverse_dictionary(dictionary)
        rest_bits = _empty_str
        initial_phrase = _empty_str
        read_limit = chunk_size
        for chunk_number in range(1, num_of_chunks + 1):
            if chunk_number == num_of_chunks:
                # Last chunk: read the remainder of the file.
                read_limit = None
            binary_data = read_stream.read(read_limit)
            # Prepend the bits that could not be decoded at the end of the previous chunk.
            bits = rest_bits + util.to_bits(binary_data)
            decompressed_data, rest_bits, initial_phrase = _decompress_data(
                bits, dictionary, reversed_dictionary, initial_phrase=initial_phrase)
            write_stream.write(decompressed_data)
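# Round-trip usage sketch for the dictionary-based compress()/decompress() pair above
# (assumption: the module is importable as `lzw`; the paths are placeholders):
#
#     import lzw
#     lzw.compress('notes.txt', 'notes.lzw')
#     lzw.decompress('notes.lzw', 'notes_restored.txt')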
def compress(read_stream_path, write_stream_path, *, code_type):
    code_function = _code_functions[code_type]
    ending_bit = _ending_bits[code_type]
    num_of_threads, thread_chunk = util.chunk_file(read_stream_path)
    results_queue = queue.PriorityQueue()
    threads = list()
    read_limit = thread_chunk
    for thread_number in range(1, num_of_threads + 1):
        if thread_number == num_of_threads:
            read_limit = None
        read_stream_start_position = thread_chunk * (thread_number - 1)
        thread_result_file_path = util.thread_result_file_path(
            write_stream_path, thread_number)
        threading_data = (results_queue, thread_number, read_stream_start_position, read_limit)
        thread = threading.Thread(
            target=_compress_file_content,
            args=(read_stream_path, thread_result_file_path),
            kwargs={
                _threading_data_parameter: threading_data,
                _code_function_parameter: code_function,
                _ending_bit_parameter: ending_bit
            })
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    _combine_threads_results(results_queue, write_stream_path, num_of_threads)
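# Self-contained sketch of the coordination pattern used by the threaded compress()
# above: each worker puts (thread_number, result) into a PriorityQueue, and the combiner
# drains the queue in thread-number order so partial outputs are concatenated in the
# original file order. The demo names below are illustrative only, not the module's
# real worker or combiner.
def _demo_thread_result_ordering():
    results_queue = queue.PriorityQueue()

    def worker(thread_number):
        # A real worker would compress its slice of the file; here we just label it.
        results_queue.put((thread_number, 'part-%d' % thread_number))

    workers = [threading.Thread(target=worker, args=(n,)) for n in range(1, 4)]
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()
    # PriorityQueue returns the smallest tuple first, i.e. thread 1, 2, 3 in order.
    return [results_queue.get()[1] for _ in range(len(workers))]

# _demo_thread_result_ordering() -> ['part-1', 'part-2', 'part-3']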
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.exp_name)
    if len(unused_argv) != 1:  # raise an error if command-line flags were entered incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.all_actions:
        FLAGS.sent_dataset = True
        FLAGS.ssi_dataset = True
        FLAGS.print_output = True
        FLAGS.highlight = True

    original_dataset_name = 'xsum' if 'xsum' in FLAGS.dataset_name else 'cnn_dm' if (
        'cnn_dm' in FLAGS.dataset_name or 'duc_2004' in FLAGS.dataset_name) else ''
    vocab = Vocab(FLAGS.vocab_path + '_' + original_dataset_name, FLAGS.vocab_size)  # create a vocabulary
    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    util.create_dirs(html_dir)

    if FLAGS.dataset_split == 'all':
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]

    for dataset_split in dataset_splits:
        # Collect the source examples and the matching system (or reference) summaries.
        source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
        if FLAGS.exp_name == 'reference':
            # summary_dir = log_dir + default_exp_name + '/decode_test_' + str(max_enc_steps) + \
            #               'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/reference'
            # summary_files = sorted(glob.glob(summary_dir + '/*_reference.A.txt'))
            summary_dir = source_dir
            summary_files = source_files
        else:
            if FLAGS.exp_name == 'cnn_dm':
                summary_dir = log_dir + FLAGS.exp_name + '/decode_test_400maxenc_4beam_35mindec_100maxdec_ckpt-238410/decoded'
            else:
                ckpt_folder = util.find_largest_ckpt_folder(log_dir + FLAGS.exp_name)
                summary_dir = log_dir + FLAGS.exp_name + '/' + ckpt_folder + '/decoded'
                # summary_dir = log_dir + FLAGS.exp_name + '/decode_test_' + str(max_enc_steps) + \
                #               'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/decoded'
            summary_files = sorted(glob.glob(summary_dir + '/*'))
        if len(summary_files) == 0:
            raise Exception('No files found in %s' % summary_dir)
        example_generator = data.example_generator(source_dir + '/' + dataset_split + '*',
                                                   True, False, is_original=True)
        pros = {'annotators': 'dcoref', 'outputFormat': 'json', 'timeout': '5000000'}

        all_merge_examples = []
        num_extracted_list = []
        distances = []
        relative_distances = []
        html_str = ''
        extracted_sents_in_article_html = ''
        name = FLAGS.dataset_name + '_' + FLAGS.exp_name
        if FLAGS.coreference_replacement:
            name += '_coref'
        highlight_file_name = os.path.join(html_dir, FLAGS.dataset_name + '_' + FLAGS.exp_name)
        if FLAGS.consider_stopwords:
            highlight_file_name += '_stopwords'
        if FLAGS.highlight:
            extracted_sents_in_article_html_file = open(
                highlight_file_name + '_extracted_sents.html', 'wb')
        if FLAGS.kaiqiang:
            kaiqiang_article_texts = []
            kaiqiang_abstract_texts = []
            util.create_dirs(kaiqiang_dir)
            kaiqiang_article_file = open(
                os.path.join(kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split + '_' +
                             str(FLAGS.min_matched_tokens) + '_articles.txt'), 'wb')
            kaiqiang_abstract_file = open(
                os.path.join(kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split + '_' +
                             str(FLAGS.min_matched_tokens) + '_abstracts.txt'), 'wb')
        if FLAGS.ssi_dataset:
            if FLAGS.tag_tokens:
                with_coref_and_ssi_dir = lambdamart_dir + '_and_tag_tokens'
            else:
                with_coref_and_ssi_dir = lambdamart_dir
            lambdamart_out_dir = os.path.join(with_coref_and_ssi_dir, FLAGS.dataset_name)
            if FLAGS.sentence_limit == 1:
                lambdamart_out_dir += '_singles'
            if FLAGS.consider_stopwords:
                lambdamart_out_dir += '_stopwords'
            lambdamart_out_full_dir = os.path.join(lambdamart_out_dir, 'all')
            util.create_dirs(lambdamart_out_full_dir)
            lambdamart_writer = open(
                os.path.join(lambdamart_out_full_dir, dataset_split + '.bin'), 'wb')

        simple_similar_source_indices_list_plus_empty = []
        example_idx = -1
        instance_idx = 0
        total = len(source_files) * 1000 if (
            'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
            or 'xsum' in FLAGS.dataset_name) else len(source_files)
        random_choices = None
        if FLAGS.randomize:
            if FLAGS.dataset_name == 'cnn_dm':
                list_order = np.random.permutation(11490)
                random_choices = list_order[:FLAGS.num_instances]
        for example in tqdm(example_generator, total=total):
            example_idx += 1
            if FLAGS.num_instances != -1 and instance_idx >= FLAGS.num_instances:
                break
            if random_choices is not None and example_idx not in random_choices:
                continue
            # for file_idx in tqdm(range(len(source_files))):
            #     example = get_tf_example(source_files[file_idx])
            article_text = example.features.feature['article'].bytes_list.value[0].decode().lower()
            if FLAGS.exp_name == 'reference':
                summary_text, all_summary_texts = get_summary_from_example(example)
            else:
                summary_text = get_summary_text(summary_files[example_idx])
            article_tokens = split_into_tokens(article_text)
            if 'raw_article_sents' in example.features.feature and len(
                    example.features.feature['raw_article_sents'].bytes_list.value) > 0:
                raw_article_sents = example.features.feature['raw_article_sents'].bytes_list.value
                raw_article_sents = [sent.decode() for sent in raw_article_sents
                                     if sent.decode().strip() != '']
                article_sent_tokens = [util.process_sent(sent, whitespace=True)
                                       for sent in raw_article_sents]
            else:
                # article_text = util.to_unicode(article_text)
                # sent_pros = {'annotators': 'ssplit', 'outputFormat': 'json', 'timeout': '5000000'}
                # sents_result_dict = nlp.annotate(str(article_text), properties=sent_pros)
                # article_sent_tokens = [[token['word'] for token in sent['tokens']] for sent in sents_result_dict['sentences']]
                raw_article_sents = nltk.tokenize.sent_tokenize(article_text)
                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
            if FLAGS.top_n_sents != -1:
                article_sent_tokens = article_sent_tokens[:FLAGS.top_n_sents]
                raw_article_sents = raw_article_sents[:FLAGS.top_n_sents]
            article_sents = [' '.join(sent) for sent in article_sent_tokens]
            try:
                article_tokens_string = str(' '.join(article_sents))
            except:
                try:
                    article_tokens_string = str(' '.join(
                        [sent.decode('latin-1') for sent in article_sents]))
                except:
                    raise
            if len(article_sent_tokens) == 0:
                continue

            summary_sent_tokens = split_into_sent_tokens(summary_text)
            if 'doc_indices' in example.features.feature and len(
                    example.features.feature['doc_indices'].bytes_list.value) > 0:
                doc_indices_str = example.features.feature['doc_indices'].bytes_list.value[0].decode()
                if '1' in doc_indices_str:
                    doc_indices = [int(x) for x in doc_indices_str.strip().split()]
                    rel_sent_positions = importance_features.get_sent_indices(
                        article_sent_tokens, doc_indices)
                else:
                    num_tokens_total = sum([len(sent) for sent in article_sent_tokens])
                    rel_sent_positions = list(range(len(raw_article_sents)))
                    doc_indices = [0] * num_tokens_total
            else:
                rel_sent_positions = None
                doc_indices = None
                doc_indices_str = None
            # Default to no coreference information when the example does not carry any.
            corefs = None
            corefs_str = None
            if 'corefs' in example.features.feature and len(
                    example.features.feature['corefs'].bytes_list.value) > 0:
                corefs_str = example.features.feature['corefs'].bytes_list.value[0]
                corefs = json.loads(corefs_str)
            # summary_sent_tokens = limit_to_n_tokens(summary_sent_tokens, 100)

            similar_source_indices_list_plus_empty = []
            # Align each summary sentence with the source sentences it was fused from.
            simple_similar_source_indices, lcs_paths_list, article_lcs_paths_list, smooth_article_paths_list = \
                ssi_functions.get_simple_source_indices_list(
                    summary_sent_tokens, article_sent_tokens, vocab, FLAGS.sentence_limit,
                    FLAGS.min_matched_tokens, not FLAGS.consider_stopwords,
                    lemmatize=FLAGS.lemmatize, multiple_ssi=FLAGS.multiple_ssi)
            article_paths_parameter = article_lcs_paths_list if FLAGS.tag_tokens else None
            article_paths_parameter = smooth_article_paths_list if FLAGS.smart_tags else article_paths_parameter
            restricted_source_indices = util.enforce_sentence_limit(
                simple_similar_source_indices, FLAGS.sentence_limit)
            for summ_sent_idx, summ_sent in enumerate(summary_sent_tokens):
                if FLAGS.sent_dataset:
                    if len(restricted_source_indices[summ_sent_idx]) == 0:
                        continue
                    merge_example = get_merge_example(
                        restricted_source_indices[summ_sent_idx], article_sent_tokens,
                        summ_sent, corefs, article_paths_parameter[summ_sent_idx])
                    all_merge_examples.append(merge_example)
            simple_similar_source_indices_list_plus_empty.append(simple_similar_source_indices)
            if FLAGS.ssi_dataset:
                summary_text_to_save = ([s for s in all_summary_texts]
                                        if FLAGS.dataset_name == 'duc_2004' else summary_text)
                write_lambdamart_example(simple_similar_source_indices, raw_article_sents,
                                         summary_text_to_save, corefs_str, doc_indices_str,
                                         article_paths_parameter, lambdamart_writer)
            if FLAGS.highlight:
                highlight_article_lcs_paths_list = (smooth_article_paths_list
                                                    if FLAGS.smart_tags else article_lcs_paths_list)
                # simple_ssi_plus_empty = [[s[0] for s in sim_source_ind] for sim_source_ind in simple_similar_source_indices]
                extracted_sents_in_article_html = ssi_functions.html_highlight_sents_in_article(
                    summary_sent_tokens, simple_similar_source_indices, article_sent_tokens,
                    doc_indices, lcs_paths_list, highlight_article_lcs_paths_list)
                extracted_sents_in_article_html_file.write(
                    extracted_sents_in_article_html.encode())
            a = 0
            instance_idx += 1

        if FLAGS.ssi_dataset:
            lambdamart_writer.close()
            if FLAGS.dataset_name == 'cnn_dm' or FLAGS.dataset_name == 'newsroom' or FLAGS.dataset_name == 'xsum':
                chunk_size = 1000
            else:
                chunk_size = 1
            util.chunk_file(dataset_split, lambdamart_out_full_dir, lambdamart_out_dir,
                            chunk_size=chunk_size)

        if FLAGS.sent_dataset:
            with_coref_dir = data_dir + '_and_tag_tokens' if FLAGS.tag_tokens else data_dir
            out_dir = os.path.join(with_coref_dir, FLAGS.dataset_name + '_sent')
            if FLAGS.sentence_limit == 1:
                out_dir += '_singles'
            if FLAGS.consider_stopwords:
                out_dir += '_stopwords'
            if FLAGS.coreference_replacement:
                out_dir += '_coref'
            if FLAGS.top_n_sents != -1:
                out_dir += '_n=' + str(FLAGS.top_n_sents)
            util.create_dirs(out_dir)
            convert_data.write_with_generator(iter(all_merge_examples), len(all_merge_examples),
                                              out_dir, dataset_split)

        if FLAGS.print_output:
            # html_str = FLAGS.dataset + ' | ' + FLAGS.exp_name + '<br><br><br>' + html_str
            # save_fusions_to_file(html_str)
            ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name)
            if FLAGS.consider_stopwords:
                ssi_path += '_stopwords'
            util.create_dirs(ssi_path)
            if FLAGS.dataset_name == 'duc_2004' and FLAGS.abstract_idx != 0:
                abstract_idx_str = '_%d' % FLAGS.abstract_idx
            else:
                abstract_idx_str = ''
            with open(os.path.join(ssi_path, dataset_split + '_ssi' + abstract_idx_str + '.pkl'),
                      'wb') as f:
                pickle.dump(simple_similar_source_indices_list_plus_empty, f)

        if FLAGS.kaiqiang:
            # kaiqiang_article_file.write('\n'.join(kaiqiang_article_texts))
            # kaiqiang_abstract_file.write('\n'.join(kaiqiang_abstract_texts))
            kaiqiang_article_file.close()
            kaiqiang_abstract_file.close()
        if FLAGS.highlight:
            extracted_sents_in_article_html_file.close()
        a = 0
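# Invocation sketch for main() (assumptions: the script file name and the flag values
# below are placeholders; the flag names are the ones referenced above):
#
#     python ssi_collection_script.py --dataset_name=cnn_dm --exp_name=reference \
#         --dataset_split=test --sent_dataset --ssi_dataset --highlight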