def write_for_attnvis(self,
                          article,
                          abstract,
                          decoded_words,
                          attn_dists,
                          p_gens,
                          ex_index,
                          ssi=None):
        """Write some data to json file, which can be read into the in-browser attention visualizer tool:
            https://github.com/abisee/attn_vis

        Args:
            article: The original article string.
            abstract: The human (correct) abstract string.
            attn_dists: List of arrays; the attention distributions.
            decoded_words: List of strings; the words of the generated summary.
            p_gens: List of scalars; the p_gen values. If not running in pointer-generator mode, list of None.
            ex_index: Index of the example; used to name the output JSON file.
            ssi: Optional similar source indices, written to the output if provided.
        """
        article_lst = article.split()  # list of words
        decoded_lst = decoded_words  # list of decoded words
        to_write = {
            'article_lst': [make_html_safe(t) for t in article_lst],
            'decoded_lst': [make_html_safe(t) for t in decoded_lst],
            'abstract_str': make_html_safe(abstract),
            'attn_dists': attn_dists
        }
        if FLAGS.pointer_gen:
            to_write['p_gens'] = p_gens
        if ssi is not None:
            to_write['ssi'] = ssi
        util.create_dirs(os.path.join(self._decode_dir, 'attn_vis_data'))
        output_fname = os.path.join(self._decode_dir, 'attn_vis_data',
                                    '%06d.json' % ex_index)
        with open(output_fname, 'w') as output_file:
            json.dump(to_write, output_file)
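A minimal usage sketch with toy data (hypothetical: `decoder` stands in for an instance of whatever decoder class defines this method, and FLAGS.pointer_gen is assumed to be set as in the pointer-generator codebase):

article = "the quick brown fox jumped over the lazy dog ."
abstract = "a fox jumped over a dog ."
decoded_words = ["fox", "jumps", "over", "the", "dog", "."]
attn_dists = [[1.0 / len(article.split())] * len(article.split()) for _ in decoded_words]  # one distribution per decoded word
p_gens = [0.9] * len(decoded_words)  # or a list of None outside pointer-generator mode
decoder.write_for_attnvis(article, abstract, decoded_words, attn_dists, p_gens, ex_index=0)
# Produces <decode_dir>/attn_vis_data/000000.json for https://github.com/abisee/attn_vis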
def main(unused_argv):

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    source_dir = os.path.join(data_dir, FLAGS.dataset)
    source_files = sorted(glob.glob(source_dir + '/*'))

    for i in range(4):
        ref_dir = os.path.join(log_dir, 'reference_' + str(i), 'reference')
        dec_dir = os.path.join(log_dir, 'reference_' + str(i), 'decoded')
        util.create_dirs(ref_dir)
        util.create_dirs(dec_dir)
        for source_idx, source_file in enumerate(source_files):
            human_summary_texts = get_human_summary_texts(source_file)
            summaries = []
            for summary_text in human_summary_texts:
                summary = data.abstract2sents(summary_text)
                summaries.append(summary)
            candidate = summaries[i]
            references = [
                summaries[idx] for idx in range(len(summaries)) if idx != i
            ]
            rouge_functions.write_for_rouge(references, candidate, source_idx,
                                            ref_dir, dec_dir)

        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
        # print("Results_dict: ", results_dict)
        rouge_functions.rouge_log(results_dict,
                                  os.path.join(log_dir, 'reference_' + str(i)))
Example #3
def load_cache_and_dataset(loader_name, cache_prefix, dataset_path,
                           check_cache=True):
    """"Finds a loader by name, builds a cache prefix from the loader name,
    loads a dataset and returns the set split into train, test and validate
    subsets.

    :param loader_name: Name of the dataset loader to use (e.g. LSP, FLIC).
    :param cache_prefix: Prefix of the cache directory.
    :param dataset_path: Path to dataset.
    :param check_cache: Should the cache be checked for pickled datasets?
    :returns: Tuple of ``(fresh_cache, train_set, validate_set, test_set,
        cache_path)``, where ``fresh_cache`` is a boolean indicating whether
        the cache is still appropriate to use in later stages of the training
        pipeline (will be false iff the cache was written to during loading),
        ``{train, validate, test}_set`` are the relevant chunks of the data
        set, and ``cache_path`` is a complete path to the cache directory
        (including the appropriate prefix)."""
    # Get loader
    if loader_name not in datasets.ALLOWED_LOADERS:
        print("'{}' is not a valid loader. Allowed loaders: {}".format(
            loader_name, ', '.join(datasets.ALLOWED_LOADERS)
        ), file=sys.stderr)
        sys.exit(1)
    loader = getattr(datasets, loader_name)

    # Caching
    cache_dir = path.join(args.cache, loader_name)
    logging.info("Checking cache directory '{}'".format(cache_dir))
    create_dirs(cache_dir)

    # Now actually load the dataset, if possible
    pickle_path = path.join(cache_dir, 'dataset_meta.pickle')

    if check_cache and path.exists(pickle_path):
        logging.info("Loading pickled dataset")
        with open(pickle_path, 'rb') as fp:  # pickles should be read in binary mode
            train_set, validate_set, test_set = pickle.load(fp)
    else:
        if check_cache:
            msg = "Pickled dataset not found"
        else:
            msg = "Ignoring cached dataset, if any"
        logging.info(msg + '; loading full dataset')

        whole_dataset = loader(dataset_path)
        # Training set will contain 1/2 of the set
        train_set, others = whole_dataset.split(2)
        # Validation and test sets will each contain 1/4 of the set
        validate_set, test_set = others.split(2)

        logging.info("Pickling dataset for future use")
        with open(pickle_path, 'wb') as fp:  # pickles should be written in binary mode
            pickle.dump((train_set, validate_set, test_set), fp)
        check_cache = False

    return check_cache, train_set, validate_set, test_set, cache_dir
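A hedged usage sketch of the returned 5-tuple (the loader name and dataset path below are placeholders; the loader must appear in datasets.ALLOWED_LOADERS):

fresh_cache, train_set, validate_set, test_set, cache_path = load_cache_and_dataset(
    loader_name='LSP', cache_prefix='lsp', dataset_path='/path/to/lsp', check_cache=True)
if not fresh_cache:
    # The dataset was (re)loaded and re-pickled during this call, so any caches
    # derived from an older pickle under cache_path should be rebuilt downstream.
    pass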
Example #4
def convert_to_importance_model():
    """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint"""
    logging.info("converting non-importance model to importance model..")

    new_log_root = FLAGS.log_root + '_imp' + str(FLAGS.imp_loss_wt)
    if FLAGS.imp_loss_oneminus:
        new_log_root += '_oneminus'

    print("copying models from %s to %s..." % (FLAGS.log_root, new_log_root))
    util.create_dirs(new_log_root)
    copy_tree(FLAGS.log_root, new_log_root)
    print("copied.")
Example #5
    def run(self, prediction_date: datetime):

        full_feature_path = self._get_cached_path()

        if os.path.exists(full_feature_path):
            df = pd.read_pickle(full_feature_path)
        else:
            df = self._feature_function(prediction_date, **self._params_dict)
            create_dirs(os.path.dirname(full_feature_path))  # create the parent directory of the cache file

            df.to_pickle(full_feature_path)

        return df
Example #6
    def run(self):

        full_feature_path = self._get_cached_path()

        if os.path.exists(full_feature_path):
            df = pd.read_pickle(full_feature_path)
        else:
            df = self._feature_function(**self._params_dict)
            create_dirs(os.path.dirname(full_feature_path))

            df.to_pickle(full_feature_path)

        return df
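The same compute-or-load-from-pickle-cache pattern as a self-contained sketch (function and path names here are hypothetical, not taken from the original class):

import os

import pandas as pd


def cached_feature(cache_path, feature_fn, **params):
    """Return the cached DataFrame at cache_path if it exists; otherwise compute, cache and return it."""
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)
    df = feature_fn(**params)
    parent = os.path.dirname(cache_path)
    if parent:
        os.makedirs(parent, exist_ok=True)  # stand-in for util's create_dirs
    df.to_pickle(cache_path)
    return df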
def main(unused_argv):

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    util.create_dirs(out_full_dir)
    util.create_dirs(out_dir)

    write_to_bin(all_test_urls, out_dir, 'test')
    write_to_bin(all_val_urls, out_dir, 'val')
    write_to_bin(all_train_urls, out_dir, 'train')

    # Chunk the data. This splits each of train.bin, val.bin and test.bin into smaller chunks, each containing e.g. 1000 examples, and saves them in finished_files/chunks
    chunk_all(out_full_dir, out_dir)
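A rough sketch of what the chunking step does, assuming the length-prefixed serialized-example framing used elsewhere in these files (a struct-packed 'q' length followed by the example bytes); the real chunk_all may differ in naming and chunk size:

import os
import struct

CHUNK_SIZE = 1000  # examples per chunk file


def chunk_file_sketch(set_name, finished_dir, chunks_dir):
    """Split <set_name>.bin into <set_name>_000.bin, <set_name>_001.bin, ... of CHUNK_SIZE examples."""
    reader = open(os.path.join(finished_dir, '%s.bin' % set_name), 'rb')
    chunk_idx, count, writer = 0, 0, None
    while True:
        len_bytes = reader.read(8)
        if not len_bytes:
            break  # reached the end of the input file
        if writer is None or count == CHUNK_SIZE:
            if writer is not None:
                writer.close()
            writer = open(os.path.join(chunks_dir, '%s_%03d.bin' % (set_name, chunk_idx)), 'wb')
            chunk_idx += 1
            count = 0
        str_len = struct.unpack('q', len_bytes)[0]
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
        writer.write(struct.pack('q', str_len))
        writer.write(struct.pack('%ds' % str_len, example_str))
        count += 1
    if writer is not None:
        writer.close()
    reader.close()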
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name


        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]


        for dataset_split in dataset_splits:

            source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

            total = len(source_files) * 1000
            example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False,
                                                       should_check_valid=False)

            out_dir = os.path.join('data', 'bert', dataset_name, 'article_embeddings', 'input_article')
            util.create_dirs(out_dir)

            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb')
            inst_id = 0
            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)

                article = ' '.join(raw_article_sents)
                writer.write((article + '\n').encode())
def main(unused_argv):

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_split == 'all':
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for dataset_split in dataset_splits:

        processed_data_path = os.path.join(processed_data_dir,
                                           FLAGS.dataset_name, dataset_split)
        articles_path = os.path.join(processed_data_path, 'articles.tsv')
        abstracts_path = os.path.join(processed_data_path, 'summaries.tsv')
        highlight_path = os.path.join(processed_data_path, 'highlight.tsv')
        out_full_dir = os.path.join(processed_data_path, 'temp_highlight')
        pretty_html_path = os.path.join(processed_data_path,
                                        'pretty_html.html')

        util.create_dirs(os.path.dirname(pretty_html_path))
        util.create_dirs(out_full_dir)
        f_art = open(articles_path)
        f_abs = open(abstracts_path)
        f_pretty_html = open(pretty_html_path, 'wb')
        total = util.num_lines_in_file(articles_path)

        ex_list = create_example_list(f_art, f_abs, pretty_html_path,
                                      out_full_dir, total)

        pool = mp.Pool(mp.cpu_count())
        _ = list(tqdm(pool.imap(process_one_example, ex_list), total=total))
        pool.close()

        f_pretty_html.close()
        file_list = sorted(glob.glob(os.path.join(out_full_dir, '*')))
        f_hl = open(highlight_path, 'wb')
        for file_name in tqdm(file_list):
            with open(file_name) as f_single_file_hl:
                highlights = f_single_file_hl.read()
            f_hl.write(highlights.encode())
        f_hl.close()
Example #10
def process_images(bot, update, user_data):
    bot.send_chat_action(update.message.chat.id, ChatAction.TYPING)

    logging.info('receipt received')
    payer = update.message.chat.first_name

    photo = None
    for p in update.message.photo:
        if photo is None:
            photo = p

        if p.file_size > photo.file_size:
            photo = p

    temp_dir, receipts_dir = cfg.temp_dir % payer, cfg.receipts_dir % payer

    create_dirs([temp_dir, receipts_dir])
    all_images_path = f'{temp_dir}/{photo.file_id}'
    receipts_path = f'{receipts_dir}/{photo.file_id}'

    photo.get_file().download(custom_path=all_images_path)
    logging.info(f'image downloaded: {photo.file_id}')

    logging.info(f'reading image text: {photo.file_id}')

    try:
        bill = rp.read(all_images_path, cfg.logos_path, payer)
        user_data['bill'] = bill
        if bill.name is None:
            cache_bill = bill
            # "I couldn't identify the name of this bill. Which bill is this?"
            update.message.reply_text(
                'Não consegui identificar o nome dessa conta. Que conta é essa?'
            )
            return SET_BILL_NAME
        else:
            os.rename(all_images_path, receipts_path)
            bill_confirmation_button(update.message, bill)
            return SUBMIT

    except rp.InvalidReceipt as e:
        logging.info(f'recibo invalido: {photo.file_id}')  # invalid receipt
        # "Invalid receipt. For now I only know how to register receipts from the Itaú bank :/"
        update.message.reply_text(
            'Recibo invalido\nPor enquanto só sei cadastrar recibos do banco Itaú :/'
        )

    return ConversationHandler.END
Example #11
def convert_singpairmix_to_tf_examples(dataset_name, processed_data_dir, tf_example_dir, dataset_split='all'):
    out_dir = os.path.join(tf_example_dir, dataset_name)
    out_full_dir = os.path.join(out_dir, 'all')
    util.create_dirs(out_full_dir)
    if dataset_split == 'all':
        if dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [dataset_split]
    for dataset_split in dataset_splits:
        processed_data_path = os.path.join(processed_data_dir, dataset_name, dataset_split)
        articles_path = os.path.join(processed_data_path,'articles.tsv')
        abstracts_path = os.path.join(processed_data_path,'summaries.tsv')
        highlight_path = os.path.join(processed_data_path,'highlight.tsv')

        f_art = open(articles_path)
        f_abs = open(abstracts_path)
        f_hl = open(highlight_path)
        writer = open(os.path.join(out_full_dir, dataset_split + '.bin'), 'wb')
        total = util.num_lines_in_file(articles_path)
        for example_idx in tqdm(range(total)):
            raw_article_sents = f_art.readline().strip().split('\t')
            groundtruth_summ_sents = f_abs.readline().strip().split('\t')
            summary_text = '\n'.join(groundtruth_summ_sents)
            article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]
            doc_indices = None
            if doc_indices is None or (dataset_name != 'duc_2004' and len(doc_indices) != len(
                    util.flatten_list_of_lists(article_sent_tokens))):
                doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
            doc_indices_str = ' '.join([str(idx) for idx in doc_indices])
            similar_source_indices = [source_indices.split(',') for source_indices in f_hl.readline().split('\t')]

            write_bert_tf_example(similar_source_indices, raw_article_sents, summary_text, None,
                                  doc_indices_str, None, writer, dataset_name)

        writer.close()
        if dataset_name == 'cnn_dm' or dataset_name == 'newsroom' or dataset_name == 'xsum':
            chunk_size = 1000
        else:
            chunk_size = 1
        util.chunk_file(dataset_split, out_full_dir, out_dir, chunk_size=chunk_size)
Example #12
    def __init__(self):
        
        # process managing classes
        self.rtpplay = rtp.RTPPlay()
        self.rtpdump = rtp.RTPDump()
        
        # rtpdump parameters
        self.dump_address = config.RTPDUMP_ADDRESS
        self.dump_port = config.RTPDUMP_PORT
        
        # rtpplay parameters
        self.preview_address = config.RTPDUMP_PREVIEW_ADDRESS
        self.preview_port = config.RTPDUMP_PREVIEW_PORT
        
        # directories used when saving/previewing files
        self.sync_dir = config.SYNC_DIR
        self.dump_dir = config.DUMP_DIR
        
        # name/file extension of recorded video files
        self.video_basename = config.VIDEO_BASENAME
        self.video_file_ext = config.VIDEO_FILE_EXT
        
        # file extension for commit files (they share the video's name)
        self.commit_file_ext = config.COMMIT_FILE_EXT
        
        # maximum time we will wait for a process to complete an action
        self.max_block_time = config.MAX_BLOCK_TIME
        
        # replaced with custom function in unit tests
        self.file_exists = os.path.exists

        # make sure we have the directory structure we'll need
        util.create_dirs(self.sync_dir, self.dump_dir)
        
        # make sure critical operations are atomic
        self.__lock = threading.Lock()
        
        # state variables
        self._commit_time = None
        self._start_time = None
        self._dump_file = None
Example #13
    def __init__(self):
        self.__lock = threading.Lock()
        
        self.rtpplay = rtp.RTPPlay()
        self.rtpplay_live = rtp.RTPPlay()
        self.play_address = config.RTPPLAY_ADDRESS
        self.play_port = config.RTPPLAY_PORT

        self.sync_dir = config.SYNC_DIR
        self.dump_dir = config.DUMP_DIR
        
        self.max_block_time = config.MAX_BLOCK_TIME
        
        util.create_dirs(self.sync_dir, self.dump_dir)

        self._is_playing = False
        self._armed_file = None
        self._is_live_playing = False

        # replaced with custom function in unit tests
        self.file_exists = os.path.exists
        self.file_getsize = os.path.getsize
        self.listdir = os.listdir
Example #14
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    out_dir = os.path.join(
        os.path.expanduser('~') + '/data/kaiqiang_data', FLAGS.dataset_name)
    if FLAGS.mode == 'write':
        util.create_dirs(out_dir)
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        elif FLAGS.dataset_split == 'all':
            dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:

            if dataset_split == 'test':
                ssi_data_path = os.path.join(
                    'logs/%s_bert_both_sentemb_artemb_plushidden' %
                    FLAGS.dataset_name, 'ssi.pkl')
                print(util.bcolors.OKGREEN +
                      "Loading SSI from BERT at %s" % ssi_data_path +
                      util.bcolors.ENDC)
                with open(ssi_data_path, 'rb') as f:  # pickles should be read in binary mode
                    ssi_triple_list = pickle.load(f)

            source_dir = os.path.join(data_dir, FLAGS.dataset_name)
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))

            total = len(source_files) * 1000 if (
                'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
                or 'xsum' in FLAGS.dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_document_path = os.path.join(out_dir,
                                             dataset_split + '.Ndocument')
            out_summary_path = os.path.join(out_dir,
                                            dataset_split + '.Nsummary')
            out_example_idx_path = os.path.join(out_dir,
                                                dataset_split + '.Nexampleidx')

            doc_writer = open(out_document_path, 'w')
            if dataset_split != 'test':
                sum_writer = open(out_summary_path, 'w')
            ex_idx_writer = open(out_example_idx_path, 'w')

            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
                if FLAGS.dataset_name == 'duc_2004':
                    groundtruth_summ_sents = [[
                        sent.strip()
                        for sent in gt_summ_text.strip().split('\n')
                    ] for gt_summ_text in groundtruth_summary_text]
                else:
                    groundtruth_summ_sents = [[
                        sent.strip() for sent in
                        groundtruth_summary_text.strip().split('\n')
                    ]]
                if doc_indices is None:
                    doc_indices = [0] * len(
                        util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                # rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(doc_indices, article_sent_tokens)

                if dataset_split == 'test':
                    if example_idx >= len(ssi_triple_list):
                        raise Exception(
                            'Len of ssi list (%d) is less than number of examples (>=%d)'
                            % (len(ssi_triple_list), example_idx))
                    ssi_length_extractive = ssi_triple_list[example_idx][2]
                    if ssi_length_extractive > 1:
                        a = 0
                    ssi = ssi_triple_list[example_idx][1]
                    ssi = ssi[:ssi_length_extractive]
                    groundtruth_similar_source_indices_list = ssi
                else:
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list,
                        FLAGS.sentence_limit)

                for ssi_idx, ssi in enumerate(
                        groundtruth_similar_source_indices_list):
                    if len(ssi) == 0:
                        continue
                    my_article = ' '.join(util.reorder(raw_article_sents, ssi))
                    doc_writer.write(my_article + '\n')
                    if dataset_split != 'test':
                        sum_writer.write(groundtruth_summ_sents[0][ssi_idx] +
                                         '\n')
                    ex_idx_writer.write(str(example_idx) + '\n')
    elif FLAGS.mode == 'evaluate':
        summary_dir = '/home/logan/data/kaiqiang_data/logan_ACL/trained_on_' + FLAGS.train_dataset + '/' + FLAGS.dataset_name
        out_summary_path = os.path.join(summary_dir, 'test' + 'Summary.txt')
        out_example_idx_path = os.path.join(out_dir, 'test' + '.Nexampleidx')
        decode_dir = 'logs/kaiqiang_%s_trainedon%s' % (FLAGS.dataset_name,
                                                       FLAGS.train_dataset)
        rouge_ref_dir = os.path.join(decode_dir, 'reference')
        rouge_dec_dir = os.path.join(decode_dir, 'decoded')
        util.create_dirs(rouge_ref_dir)
        util.create_dirs(rouge_dec_dir)

        def num_lines_in_file(file_path):
            with open(file_path) as f:
                num_lines = sum(1 for line in f)
            return num_lines

        def process_example(sents, ex_idx, groundtruth_summ_sents):
            final_decoded_words = []
            for sent in sents:
                final_decoded_words.extend(sent.split(' '))
            rouge_functions.write_for_rouge(groundtruth_summ_sents,
                                            None,
                                            ex_idx,
                                            rouge_ref_dir,
                                            rouge_dec_dir,
                                            decoded_words=final_decoded_words,
                                            log=False)

        num_lines_summary = num_lines_in_file(out_summary_path)
        num_lines_example_indices = num_lines_in_file(out_example_idx_path)
        if num_lines_summary != num_lines_example_indices:
            raise Exception(
                'Num lines summary != num lines example indices: (%d, %d)' %
                (num_lines_summary, num_lines_example_indices))

        source_dir = os.path.join(data_dir, FLAGS.dataset_name)
        example_generator = data.example_generator(source_dir + '/' + 'test' +
                                                   '*',
                                                   True,
                                                   False,
                                                   should_check_valid=False)

        sum_writer = open(out_summary_path)
        ex_idx_writer = open(out_example_idx_path)
        prev_ex_idx = 0
        sents = []

        for line_idx in tqdm(range(num_lines_summary)):
            line = sum_writer.readline()
            ex_idx = int(ex_idx_writer.readline())

            if ex_idx == prev_ex_idx:
                sents.append(line)
            else:
                example = example_generator.next()
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if FLAGS.dataset_name == 'duc_2004':
                    groundtruth_summ_sents = [[
                        sent.strip()
                        for sent in gt_summ_text.strip().split('\n')
                    ] for gt_summ_text in groundtruth_summary_text]
                else:
                    groundtruth_summ_sents = [[
                        sent.strip() for sent in
                        groundtruth_summary_text.strip().split('\n')
                    ]]
                process_example(sents, ex_idx, groundtruth_summ_sents)
                prev_ex_idx = ex_idx
                sents = [line]

        example = example_generator.next()
        raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
            example, names_to_types)
        if FLAGS.dataset_name == 'duc_2004':
            groundtruth_summ_sents = [[
                sent.strip() for sent in gt_summ_text.strip().split('\n')
            ] for gt_summ_text in groundtruth_summary_text]
        else:
            groundtruth_summ_sents = [[
                sent.strip()
                for sent in groundtruth_summary_text.strip().split('\n')
            ]]
        process_example(sents, ex_idx, groundtruth_summ_sents)

        print("Now starting ROUGE eval...")
        l_param = 100  # same ROUGE-L length parameter for every dataset
        results_dict = rouge_functions.rouge_eval(rouge_ref_dir,
                                                  rouge_dec_dir,
                                                  l_param=l_param)
        rouge_functions.rouge_log(results_dict, decode_dir)

    else:
        raise Exception('mode flag was not evaluate or write.')
Example #15
        self.terminal = sys.stdout
        self.log = open("temp/logfile.txt", "w")

    def write(self, message):
        self.terminal.write(message)
        if not "\r" in message:
            self.log.write(message)
            self.log.flush()


if __name__ == '__main__':
    if file_exists("mem/data_temp"):
        print("Danger: corrupted data file!")
    else:
        #Create directories
        create_dirs("mem/backup/")
        create_dirs("mem/important/")
        create_dirs("temp/")
        #Logging
        sys.stdout = Logger()
        #Check Python version
        print("Using Python version " +\
              str(sys.version_info.major) + "." +\
              str(sys.version_info.minor) + "." +\
              str(sys.version_info.micro) + " " +\
              sys.version_info.releaselevel + " " +\
              str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit")
        if sys.version_info.major != 2:
            print("Not supported; use Python 2")
        elif 0:
            print("")
Example #16
bin_values = [x / 100. for x in list(range(100))]
pretty_dataset_names = {
    'cnn_dm': 'CNN/DM',
    'xsum': 'XSum',
    'duc_2004': 'DUC-04'
}

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
# plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 20

util.create_dirs('stuff/plots')

plot_data_file = os.path.join(
    'stuff/plots', FLAGS.dataset_name + '_' + FLAGS.dataset_split + '.pkl')
plot_file = os.path.join(
    'stuff/plots', FLAGS.dataset_name + '_' + FLAGS.dataset_split + '.pdf')


def plot_histograms(all_list_of_hist_pairs):
    nrows = len(all_list_of_hist_pairs)
    ncols = len(all_list_of_hist_pairs[0])
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
    if axes.ndim == 1:
        axes = axes.reshape(1, -1)
    fig.set_size_inches(10, 5)
    fig.subplots_adjust(wspace=0.075, hspace=0.05)
def main(unused_argv):
    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    if FLAGS.dataset_name == 'all':
        datasets = dataset_names
    else:
        datasets = [FLAGS.dataset_name]

    for dataset in datasets:
        coref_dir = os.path.join(FLAGS.coref_root, dataset)
        to_coref_dir = os.path.join(coref_dir, 'to_coref')
        corenlp_lists_dir = os.path.join(coref_dir, 'corenlp_lists')
        data_coref_dir = os.path.join(FLAGS.data_root, 'with_coref', dataset)

        util.create_dirs(to_coref_dir)
        util.create_dirs(corenlp_lists_dir)
        util.create_dirs(data_coref_dir)

        source_dir = os.path.join(FLAGS.data_root, dataset)

        if FLAGS.dataset_split == 'all':
            dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]
        for dataset_split in dataset_splits:
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))

            total = len(source_files) * 1000 if (
                'cnn' in dataset or 'newsroom' in dataset
                or 'xsum' in dataset) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            if FLAGS.mode == 'prepare':
                corenlp_list = []
                out_idx = 0
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    raw_article_sents, article, abstract, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    if raw_article_sents is None:
                        continue
                    raw_article = ' '.join(raw_article_sents)
                    file_name = os.path.join(
                        to_coref_dir, '%s_%06d.bin' % (dataset_split, out_idx))
                    with open(file_name, 'wb') as f:
                        f.write(raw_article)
                    corenlp_list.append(file_name)
                    out_idx += 1  # one output file per example
                with open(
                        os.path.join(corenlp_lists_dir,
                                     'all_' + dataset_split + '.txt'),
                        'wb') as f:
                    f.write('\n'.join(corenlp_list))

            elif FLAGS.mode == 'create':
                process_coref_dir = os.path.join(coref_dir, 'processed')

                out_idx = 0
                out_file_name = os.path.join(
                    data_coref_dir,
                    dataset_split + '_{:05d}.bin'.format(out_idx // 1000))
                writer = open(os.path.join(out_file_name), 'wb')
                coref_files = sorted(
                    glob.glob(
                        os.path.join(process_coref_dir, dataset_split + '*')))
                coref_dict = {}
                for c in coref_files:
                    coref_dict[c.split('/')[-1].split('.json')[0]] = c
                print(len(coref_files), len(source_files))
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    raw_article_sents, article, abstract, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    if raw_article_sents is None:
                        continue
                    raw_article_sents = [
                        sent for sent in raw_article_sents
                        if sent.strip() != ''
                    ]
                    if out_idx % 1000 == 0 and out_idx != 0:
                        writer.close()
                        out_file_name = os.path.join(
                            data_coref_dir, dataset_split +
                            '_{:05d}.bin'.format(out_idx // 1000))
                        writer = open(os.path.join(out_file_name), 'wb')

                    # coref_file = os.path.join(process_coref_dir, 'test_%06d.bin.json' % example_idx)
                    # coref_file = coref_files[out_idx]
                    # matched_files = [name for name in coref_files if '%s_%06d.bin'%(dataset_split, out_idx) in name]
                    file_name = '%s_%06d.bin' % (dataset_split, out_idx)
                    if file_name in coref_dict:
                        file_path = coref_dict[file_name]
                        corefs = get_corefs(file_path)
                        fixed_corefs = fix_trailing_apostrophe_s(corefs)

                        corefs_relevant_info = remove_irrelevant(fixed_corefs)
                        corefs_json = json.dumps(corefs_relevant_info)
                    else:
                        corefs = []  # no coreference info available for this example
                        corefs_json = json.dumps([])

                    example.features.feature['corefs'].bytes_list.value.extend(
                        [corefs_json])

                    tf_example = convert_data.make_example(
                        article, abstract, doc_indices, raw_article_sents,
                        corefs)

                    convert_data.write_tf_example(example, writer)

                    out_idx += 1
                writer.close()

                # file_name = os.path.join(data_coref_dir, '%s_%06d.bin' % (dataset_split, example_idx))
                # writer = open(file_name, 'wb')
                # coref_file = os.path.join(process_coref_dir, 'test_%06d.bin.json'%example_idx)
                # corefs = get_corefs(coref_file)
                # fixed_corefs = fix_trailing_apostrophe_s(corefs)
                #
                # corefs_relevant_info = remove_irrelevant(fixed_corefs)
                # corefs_json = json.dumps(corefs_relevant_info)
                #
                # example.features.feature['corefs'].bytes_list.value.extend([corefs_json])
                # tf_example_str = example.SerializeToString()
                # str_len = len(tf_example_str)
                # writer.write(struct.pack('q', str_len))
                # writer.write(struct.pack('%ds' % str_len, tf_example_str))
                #
                # writer.close()

    util.print_execution_time(start_time)
Example #18
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'singles':
        FLAGS.sentence_limit = 1
    else:
        FLAGS.sentence_limit = 2

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name

        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                # dataset_splits = ['val_test', 'test', 'val', 'train']
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:
            if dataset_split == 'val_test':
                source_dataset_split = 'val'
            else:
                source_dataset_split = dataset_split

            source_files = sorted(
                glob.glob(source_dir + '/' + source_dataset_split + '*'))

            total = len(source_files) * 1000
            example_generator = data.example_generator(
                source_dir + '/' + source_dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_dir = os.path.join('data', 'bert', dataset_name,
                                   FLAGS.singles_and_pairs, 'input')
            util.create_dirs(out_dir)

            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb')
            header_list = [
                'should_merge', 'sent1', 'sent2', 'example_idx', 'inst_id',
                'ssi'
            ]
            writer.write(('\t'.join(header_list) + '\n').encode())
            inst_id = 0
            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [
                    util.process_sent(sent, whitespace=True)
                    for sent in raw_article_sents
                ]
                groundtruth_summ_sents = [[
                    sent.strip()
                    for sent in groundtruth_summary_text.strip().split('\n')
                ]]
                if dataset_name != 'duc_2004' or doc_indices is None or (
                        dataset_name != 'duc_2004' and len(doc_indices) != len(
                            util.flatten_list_of_lists(article_sent_tokens))):
                    doc_indices = [0] * len(
                        util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                rel_sent_indices, _, _ = ssi_functions.get_rel_sent_indices(
                    doc_indices, article_sent_tokens)
                similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list,
                    FLAGS.sentence_limit)

                possible_pairs = [
                    x for x in list(
                        itertools.combinations(
                            list(range(len(raw_article_sents))), 2))
                ]  # all pairs
                possible_pairs = filter_pairs_by_sent_position(
                    possible_pairs, rel_sent_indices=rel_sent_indices)
                possible_singles = [(i, )
                                    for i in range(len(raw_article_sents))]
                positives = [ssi for ssi in similar_source_indices_list]

                if dataset_split == 'test' or dataset_split == 'val_test':
                    if FLAGS.singles_and_pairs == 'singles':
                        possible_combinations = possible_singles
                    else:
                        possible_combinations = possible_pairs + possible_singles
                    negatives = [
                        ssi for ssi in possible_combinations
                        if not (ssi in positives or ssi[::-1] in positives)
                    ]

                    for ssi_idx, ssi in enumerate(positives):
                        if len(ssi) == 0:
                            continue
                        if chronological_ssi and len(ssi) >= 2:
                            if ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                        writer.write(
                            get_string_bert_example(raw_article_sents, ssi, 1,
                                                    example_idx,
                                                    inst_id).encode())
                        inst_id += 1
                    for ssi in negatives:
                        writer.write(
                            get_string_bert_example(raw_article_sents, ssi, 0,
                                                    example_idx,
                                                    inst_id).encode())
                        inst_id += 1

                else:
                    positive_sents = list(
                        set(util.flatten_list_of_lists(positives)))
                    negative_pairs = [
                        pair for pair in possible_pairs
                        if not any(i in positive_sents for i in pair)
                    ]
                    negative_singles = [
                        sing for sing in possible_singles
                        if not sing[0] in positive_sents
                    ]
                    random_negative_pairs = np.random.permutation(
                        len(negative_pairs)).tolist()
                    random_negative_singles = np.random.permutation(
                        len(negative_singles)).tolist()

                    for ssi in similar_source_indices_list:
                        if len(ssi) == 0:
                            continue
                        if chronological_ssi and len(ssi) >= 2:
                            if ssi[0] > ssi[1]:
                                ssi = (min(ssi), max(ssi))
                        is_pair = len(ssi) == 2
                        writer.write(
                            get_string_bert_example(raw_article_sents, ssi, 1,
                                                    example_idx,
                                                    inst_id).encode())
                        inst_id += 1

                        # False sentence single/pair
                        if is_pair:
                            if len(random_negative_pairs) == 0:
                                continue
                            negative_indices = negative_pairs[
                                random_negative_pairs.pop()]
                        else:
                            if len(random_negative_singles) == 0:
                                continue
                            negative_indices = negative_singles[
                                random_negative_singles.pop()]
                        article_lcs_paths = None
                        writer.write(
                            get_string_bert_example(raw_article_sents,
                                                    negative_indices, 0,
                                                    example_idx,
                                                    inst_id).encode())
                        inst_id += 1
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    util.create_dirs(processed_root)
    # if not os.path.exists(os.path.join(raw_root, 'reference', 'summaries.txt')):
    util.create_dirs(os.path.join(raw_root, 'reference'))
    util.create_dirs(os.path.join(processed_root, 'article'))
    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    source_files = sorted(
        glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))

    total = len(source_files) * 1000 if (
        'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
        or 'xsum' in FLAGS.dataset_name) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' +
                                               FLAGS.dataset_split + '*',
                                               True,
                                               False,
                                               should_check_valid=False)

    if preprocess_article_and_human_summaries:
        writer = open(os.path.join(raw_root, 'reference', 'summaries.txt'),
                      'w')
        writer_article = open(
            os.path.join(processed_root, 'article', 'articles.txt'), 'w')
        writer_tokenized_article = open(
            os.path.join(processed_root, 'article', 'articles_tokenized.txt'),
            'w')
        reference_articles = []
        for example_idx, example in enumerate(
                tqdm(example_generator, total=total)):
            if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                break
            raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                example, names_to_types)
            groundtruth_summ_sents = [
                util.unfix_bracket_tokens_in_sent(sent.strip())
                for sent in groundtruth_summary_text.strip().split('\n')
            ]
            writer.write('\t'.join(groundtruth_summ_sents) + '\n')
            reference_article = '\t'.join([
                util.unfix_bracket_tokens_in_sent(sent.strip())
                for sent in raw_article_sents
            ])
            reference_articles.append(reference_article)
            pretty_reference_article = fix_punctuations(reference_article)
            writer_article.write(pretty_reference_article + '\n')
            writer_tokenized_article.write(reference_article + '\n')
        writer.close()

    for system in systems:
        print('Processing ' + system + '...')
        raw_dir = os.path.join(raw_root, system)
        processed_dir = os.path.join(processed_root, system)
        util.create_dirs(processed_dir)
        if system == 'reference':
            with open(os.path.join(raw_dir, 'summaries.txt')) as f:
                with open(os.path.join(processed_dir, 'summaries.txt'),
                          'w') as writer:
                    text = f.read()
                    pretty_reference_summaries = fix_punctuations(text)
                    writer.write(pretty_reference_summaries)
                    reference_summaries = [
                        summ.strip() for summ in text.split('\n')
                        if summ.strip() != ''
                    ]
                with open(
                        os.path.join(processed_dir, 'summaries_tokenized.txt'),
                        'w') as writer_tokenized:
                    writer_tokenized.write(text + '\n')

        elif system == 'abs-rl-rerank':
            decoded_files = sorted(
                glob.glob(
                    os.path.join(raw_dir, 'rnn-ext_abs_rl_rerank', 'decoded',
                                 '*.dec')))
            sys_ref_files = sorted(
                glob.glob(os.path.join(raw_dir, 'reference', '*.ref')))
            summaries = []
            for file in decoded_files:
                with open(file) as f:
                    text = f.read()
                    text = util.unfix_bracket_tokens_in_sent(text)
                    summary_sents = text.split('\n')
                    summaries.append('\t'.join(summary_sents))
            sys_ref_summaries = []
            for file in sys_ref_files:
                with open(file) as f:
                    text = f.read()
                    text = util.unfix_bracket_tokens_in_sent(text)
                    summary_sents = text.split('\n')
                    sys_ref_summaries.append('\t'.join(summary_sents))
            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            with open(os.path.join(processed_dir, 'summaries.txt'),
                      'w') as writer:
                with open(
                        os.path.join(processed_dir, 'summaries_tokenized.txt'),
                        'w') as writer_tokenized:
                    for summ in reordered_summaries:
                        writer_tokenized.write(summ + '\n')
                        writer.write(fix_punctuations(summ) + '\n')
        elif system == 'pg':
            decoded_files = sorted(
                glob.glob(
                    os.path.join(raw_dir, 'pointer-gen-cov', '*_decoded.txt')))
            summaries = []
            for file in tqdm(decoded_files):
                with open(file) as f:
                    summary_sents = f.read().split('\n')
                    summaries.append('\t'.join(summary_sents))
            ref_files = sorted(
                glob.glob(os.path.join(raw_dir, 'reference',
                                       '*_reference.txt')))
            sys_ref_summaries = []
            for file in tqdm(ref_files):
                with open(file) as f:
                    summary_sents = f.read().split('\n')
                    sys_ref_summaries.append('\t'.join(summary_sents))

            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)

            with open(os.path.join(processed_dir, 'summaries.txt'),
                      'w') as writer:
                with open(
                        os.path.join(processed_dir, 'summaries_tokenized.txt'),
                        'w') as writer_tokenized:
                    for summ in reordered_summaries:
                        writer_tokenized.write(summ + '\n')
                        writer.write(fix_punctuations(summ) + '\n')
        elif system == 'bottom-up':
            with open(
                    os.path.join(raw_dir,
                                 'bottom_up_cnndm_015_threshold.out')) as f:
                text_with_slash_t = f.read()
                text_with_slash_t = util.unfix_bracket_tokens_in_sent(
                    text_with_slash_t)
                text_tab_separated = slash_t_to_tab_separated(
                    text_with_slash_t)
                summaries = [
                    summ.strip() for summ in text_tab_separated.split('\n')
                    if summ.strip() != ''
                ]
            with open(os.path.join(raw_dir,
                                   'test.txt.tgt.tagged.shuf.noslash')) as f:
                text_with_slash_t = f.read()
                text_tab_separated = slash_t_to_tab_separated(
                    text_with_slash_t)
                sys_ref_summaries = [
                    summ.strip() for summ in text_tab_separated.split('\n')
                    if summ.strip() != ''
                ]
            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            with open(os.path.join(processed_dir, 'summaries.txt'),
                      'w') as writer:
                with open(
                        os.path.join(processed_dir, 'summaries_tokenized.txt'),
                        'w') as writer_tokenized:
                    for summ in reordered_summaries:
                        writer_tokenized.write(summ + '\n')
                        writer.write(fix_punctuations(summ) + '\n')
        elif system == 'dca':
            with open(os.path.join(raw_dir, 'cnndm_m6_m7.txt')) as f:
                text = f.read()
            lines = text.split('\n')
            summary_texts = []
            sys_ref_summary_texts = []
            for line in tqdm(lines[1:]):
                if line.strip() == '':
                    continue
                if len(line.split('\t')) != 3:
                    a = 0
                sys_ref_summary, _, summary = line.split('\t')
                summary = summary.replace('u . s .', 'u.s.')
                sys_ref_summary = sys_ref_summary.replace('u . s .', 'u.s.')
                summary_texts.append(summary)
                sys_ref_summary_texts.append(sys_ref_summary)
            summaries = [get_sents(summary) for summary in tqdm(summary_texts)]
            sys_ref_summaries = [
                get_sents(sys_ref_summary)
                for sys_ref_summary in tqdm(sys_ref_summary_texts)
            ]
            reordered_summaries = reorder_list_like(summaries,
                                                    sys_ref_summaries,
                                                    reference_summaries)
            with open(os.path.join(processed_dir, 'summaries.txt'),
                      'w') as writer:
                with open(
                        os.path.join(processed_dir, 'summaries_tokenized.txt'),
                        'w') as writer_tokenized:
                    for summ in reordered_summaries:
                        writer_tokenized.write(summ + '\n')
                        writer.write(fix_punctuations(summ) + '\n')
        elif system == 'novel':
            with open(os.path.join(raw_dir, 'rl-novelty-lm.out')) as f:
                text = f.read()
            lines = text.split('\n')
            summaries = []
            sys_articles = []
            summary_texts = []
            sys_article_texts = []
            for line in tqdm(lines):
                if line.strip() == '':
                    continue
                obj = json.loads(line)
                article = obj['article']
                summary = obj['prediction']
                summary_texts.append(
                    util.unfix_bracket_tokens_in_sent(summary))
                sys_article_texts.append(
                    util.unfix_bracket_tokens_in_sent(article))
            # nlp_summaries = nlp.pipe(summary_texts)
            # nlp_sys_articles = nlp.pipe(sys_article_texts)
            summaries = [
                get_sents(summary)
                for summary in tqdm(summary_texts, total=11490)
            ]
            sys_articles = [
                get_sents(article)
                for article in tqdm(sys_article_texts, total=11490)
            ]
            reordered_summaries = reorder_list_like(summaries, sys_articles,
                                                    reference_articles)
            with open(os.path.join(processed_dir, 'summaries.txt'),
                      'w') as writer:
                with open(
                        os.path.join(processed_dir, 'summaries_tokenized.txt'),
                        'w') as writer_tokenized:
                    for summ in reordered_summaries:
                        writer_tokenized.write(summ + '\n')
                        writer.write(fix_punctuations(summ) + '\n')
            a = 0
Example #20
        if file_exists("temp/logfile.txt"):
            shutil.copyfile("temp/logfile.txt", "temp/logfile_old.txt")
        self.terminal = sys.stdout
        self.log = open("temp/logfile.txt", "w")
    def write(self, message):
        self.terminal.write(message)
        if not "\r" in message:
            self.log.write(message)
            self.log.flush()

if __name__ == '__main__':
    if file_exists("mem/data_temp"):
        print("Danger: corrupted data file!")
    else:
        #Create directories
        create_dirs("mem/backup/")
        create_dirs("mem/important/")
        create_dirs("temp/")
        #Logging
        sys.stdout = Logger()
        #Check Python version
        print("Using Python version " +\
              str(sys.version_info.major) + "." +\
              str(sys.version_info.minor) + "." +\
              str(sys.version_info.micro) + " " +\
              sys.version_info.releaselevel + " " +\
              str(int(round(log(sys.maxint * 2 + 2, 2)))) + "bit")
        if sys.version_info.major != 2:
            print("Not supported; use Python 2")
        elif 0:
            print("")
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.exp_name)

    if len(unused_argv) != 1:  # raise an exception if extra command-line flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.all_actions:
        FLAGS.sent_dataset = True
        FLAGS.ssi_dataset = True
        FLAGS.print_output = True
        FLAGS.highlight = True

    original_dataset_name = 'xsum' if 'xsum' in FLAGS.dataset_name else 'cnn_dm' if (
        'cnn_dm' in FLAGS.dataset_name
        or 'duc_2004' in FLAGS.dataset_name) else ''
    vocab = Vocab(FLAGS.vocab_path + '_' + original_dataset_name,
                  FLAGS.vocab_size)  # create a vocabulary

    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    util.create_dirs(html_dir)

    if FLAGS.dataset_split == 'all':
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for dataset_split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + dataset_split +
                                        '*'))
        if FLAGS.exp_name == 'reference':
            # summary_dir = log_dir + default_exp_name + '/decode_test_' + str(max_enc_steps) + \
            #                 'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/reference'
            # summary_files = sorted(glob.glob(summary_dir + '/*_reference.A.txt'))
            summary_dir = source_dir
            summary_files = source_files
        else:
            if FLAGS.exp_name == 'cnn_dm':
                summary_dir = log_dir + FLAGS.exp_name + '/decode_test_400maxenc_4beam_35mindec_100maxdec_ckpt-238410/decoded'
            else:
                ckpt_folder = util.find_largest_ckpt_folder(log_dir +
                                                            FLAGS.exp_name)
                summary_dir = log_dir + FLAGS.exp_name + '/' + ckpt_folder + '/decoded'
                # summary_dir = log_dir + FLAGS.exp_name + '/decode_test_' + str(max_enc_steps) + \
                #             'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/decoded'
            summary_files = sorted(glob.glob(summary_dir + '/*'))
        if len(summary_files) == 0:
            raise Exception('No files found in %s' % summary_dir)
        example_generator = data.example_generator(source_dir + '/' +
                                                   dataset_split + '*',
                                                   True,
                                                   False,
                                                   is_original=True)
        pros = {
            'annotators': 'dcoref',
            'outputFormat': 'json',
            'timeout': '5000000'
        }
        all_merge_examples = []
        num_extracted_list = []
        distances = []
        relative_distances = []
        html_str = ''
        extracted_sents_in_article_html = ''
        name = FLAGS.dataset_name + '_' + FLAGS.exp_name
        if FLAGS.coreference_replacement:
            name += '_coref'
        highlight_file_name = os.path.join(
            html_dir, FLAGS.dataset_name + '_' + FLAGS.exp_name)
        if FLAGS.consider_stopwords:
            highlight_file_name += '_stopwords'
        if FLAGS.highlight:
            extracted_sents_in_article_html_file = open(
                highlight_file_name + '_extracted_sents.html', 'wb')
        if FLAGS.kaiqiang:
            kaiqiang_article_texts = []
            kaiqiang_abstract_texts = []
            util.create_dirs(kaiqiang_dir)
            kaiqiang_article_file = open(
                os.path.join(
                    kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split +
                    '_' + str(FLAGS.min_matched_tokens) + '_articles.txt'),
                'wb')
            kaiqiang_abstract_file = open(
                os.path.join(
                    kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split +
                    '_' + str(FLAGS.min_matched_tokens) + '_abstracts.txt'),
                'wb')
        if FLAGS.ssi_dataset:
            if FLAGS.tag_tokens:
                with_coref_and_ssi_dir = lambdamart_dir + '_and_tag_tokens'
            else:
                with_coref_and_ssi_dir = lambdamart_dir
            lambdamart_out_dir = os.path.join(with_coref_and_ssi_dir,
                                              FLAGS.dataset_name)
            if FLAGS.sentence_limit == 1:
                lambdamart_out_dir += '_singles'
            if FLAGS.consider_stopwords:
                lambdamart_out_dir += '_stopwords'
            lambdamart_out_full_dir = os.path.join(lambdamart_out_dir, 'all')
            util.create_dirs(lambdamart_out_full_dir)
            lambdamart_writer = open(
                os.path.join(lambdamart_out_full_dir, dataset_split + '.bin'),
                'wb')

        simple_similar_source_indices_list_plus_empty = []
        example_idx = -1
        instance_idx = 0
        total = len(source_files) * 1000 if (
            'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
            or 'xsum' in FLAGS.dataset_name) else len(source_files)
        random_choices = None
        if FLAGS.randomize:
            if FLAGS.dataset_name == 'cnn_dm':
                list_order = np.random.permutation(11490)
                random_choices = list_order[:FLAGS.num_instances]
        for example in tqdm(example_generator, total=total):
            example_idx += 1
            if FLAGS.num_instances != -1 and instance_idx >= FLAGS.num_instances:
                break
            if random_choices is not None and example_idx not in random_choices:
                continue
        # for file_idx in tqdm(range(len(source_files))):
        #     example = get_tf_example(source_files[file_idx])
            article_text = example.features.feature[
                'article'].bytes_list.value[0].decode().lower()
            if FLAGS.exp_name == 'reference':
                summary_text, all_summary_texts = get_summary_from_example(
                    example)
            else:
                summary_text = get_summary_text(summary_files[example_idx])
            article_tokens = split_into_tokens(article_text)
            if 'raw_article_sents' in example.features.feature and len(
                    example.features.feature['raw_article_sents'].bytes_list.
                    value) > 0:
                raw_article_sents = example.features.feature[
                    'raw_article_sents'].bytes_list.value

                raw_article_sents = [
                    sent.decode() for sent in raw_article_sents
                    if sent.decode().strip() != ''
                ]
                article_sent_tokens = [
                    util.process_sent(sent, whitespace=True)
                    for sent in raw_article_sents
                ]
            else:
                # article_text = util.to_unicode(article_text)

                # sent_pros = {'annotators': 'ssplit', 'outputFormat': 'json', 'timeout': '5000000'}
                # sents_result_dict = nlp.annotate(str(article_text), properties=sent_pros)
                # article_sent_tokens = [[token['word'] for token in sent['tokens']] for sent in sents_result_dict['sentences']]

                raw_article_sents = nltk.tokenize.sent_tokenize(article_text)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
            if FLAGS.top_n_sents != -1:
                article_sent_tokens = article_sent_tokens[:FLAGS.top_n_sents]
                raw_article_sents = raw_article_sents[:FLAGS.top_n_sents]
            article_sents = [' '.join(sent) for sent in article_sent_tokens]
            try:
                article_tokens_string = str(' '.join(article_sents))
            except:
                try:
                    article_tokens_string = str(' '.join(
                        [sent.decode('latin-1') for sent in article_sents]))
                except:
                    raise

            if len(article_sent_tokens) == 0:
                continue

            summary_sent_tokens = split_into_sent_tokens(summary_text)
            if 'doc_indices' in example.features.feature and len(
                    example.features.feature['doc_indices'].bytes_list.value
            ) > 0:
                doc_indices_str = example.features.feature[
                    'doc_indices'].bytes_list.value[0].decode()
                if '1' in doc_indices_str:
                    doc_indices = [
                        int(x) for x in doc_indices_str.strip().split()
                    ]
                    rel_sent_positions = importance_features.get_sent_indices(
                        article_sent_tokens, doc_indices)
                else:
                    num_tokens_total = sum(
                        [len(sent) for sent in article_sent_tokens])
                    rel_sent_positions = list(range(len(raw_article_sents)))
                    doc_indices = [0] * num_tokens_total

            else:
                rel_sent_positions = None
                doc_indices = None
                doc_indices_str = None
            corefs = []  # defaults, used when the example has no coreference feature
            corefs_str = None
            if 'corefs' in example.features.feature and len(
                    example.features.feature['corefs'].bytes_list.value) > 0:
                corefs_str = example.features.feature[
                    'corefs'].bytes_list.value[0]
                corefs = json.loads(corefs_str)
            # summary_sent_tokens = limit_to_n_tokens(summary_sent_tokens, 100)

            similar_source_indices_list_plus_empty = []

            simple_similar_source_indices, lcs_paths_list, article_lcs_paths_list, smooth_article_paths_list = ssi_functions.get_simple_source_indices_list(
                summary_sent_tokens,
                article_sent_tokens,
                vocab,
                FLAGS.sentence_limit,
                FLAGS.min_matched_tokens,
                not FLAGS.consider_stopwords,
                lemmatize=FLAGS.lemmatize,
                multiple_ssi=FLAGS.multiple_ssi)

            article_paths_parameter = article_lcs_paths_list if FLAGS.tag_tokens else None
            article_paths_parameter = smooth_article_paths_list if FLAGS.smart_tags else article_paths_parameter
            restricted_source_indices = util.enforce_sentence_limit(
                simple_similar_source_indices, FLAGS.sentence_limit)
            for summ_sent_idx, summ_sent in enumerate(summary_sent_tokens):
                if FLAGS.sent_dataset:
                    if len(restricted_source_indices[summ_sent_idx]) == 0:
                        continue
                    merge_example = get_merge_example(
                        restricted_source_indices[summ_sent_idx],
                        article_sent_tokens, summ_sent, corefs,
                        article_paths_parameter[summ_sent_idx])
                    all_merge_examples.append(merge_example)

            simple_similar_source_indices_list_plus_empty.append(
                simple_similar_source_indices)
            if FLAGS.ssi_dataset:
                summary_text_to_save = [
                    s for s in all_summary_texts
                ] if FLAGS.dataset_name == 'duc_2004' else summary_text
                write_lambdamart_example(simple_similar_source_indices,
                                         raw_article_sents,
                                         summary_text_to_save, corefs_str,
                                         doc_indices_str,
                                         article_paths_parameter,
                                         lambdamart_writer)

            if FLAGS.highlight:
                highlight_article_lcs_paths_list = smooth_article_paths_list if FLAGS.smart_tags else article_lcs_paths_list
                # simple_ssi_plus_empty = [ [s[0] for s in sim_source_ind] for sim_source_ind in simple_similar_source_indices]
                extracted_sents_in_article_html = ssi_functions.html_highlight_sents_in_article(
                    summary_sent_tokens, simple_similar_source_indices,
                    article_sent_tokens, doc_indices, lcs_paths_list,
                    highlight_article_lcs_paths_list)
                extracted_sents_in_article_html_file.write(
                    extracted_sents_in_article_html.encode())
            a = 0

            instance_idx += 1

        if FLAGS.ssi_dataset:
            lambdamart_writer.close()
            if FLAGS.dataset_name == 'cnn_dm' or FLAGS.dataset_name == 'newsroom' or FLAGS.dataset_name == 'xsum':
                chunk_size = 1000
            else:
                chunk_size = 1
            util.chunk_file(dataset_split,
                            lambdamart_out_full_dir,
                            lambdamart_out_dir,
                            chunk_size=chunk_size)

        if FLAGS.sent_dataset:
            with_coref_dir = data_dir + '_and_tag_tokens' if FLAGS.tag_tokens else data_dir
            out_dir = os.path.join(with_coref_dir,
                                   FLAGS.dataset_name + '_sent')
            if FLAGS.sentence_limit == 1:
                out_dir += '_singles'
            if FLAGS.consider_stopwords:
                out_dir += '_stopwords'
            if FLAGS.coreference_replacement:
                out_dir += '_coref'
            if FLAGS.top_n_sents != -1:
                out_dir += '_n=' + str(FLAGS.top_n_sents)
            util.create_dirs(out_dir)
            convert_data.write_with_generator(iter(all_merge_examples),
                                              len(all_merge_examples), out_dir,
                                              dataset_split)

        if FLAGS.print_output:
            # html_str = FLAGS.dataset + ' | ' + FLAGS.exp_name + '<br><br><br>' + html_str
            # save_fusions_to_file(html_str)
            ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name)
            if FLAGS.consider_stopwords:
                ssi_path += '_stopwords'
            util.create_dirs(ssi_path)
            if FLAGS.dataset_name == 'duc_2004' and FLAGS.abstract_idx != 0:
                abstract_idx_str = '_%d' % FLAGS.abstract_idx
            else:
                abstract_idx_str = ''
            with open(
                    os.path.join(
                        ssi_path,
                        dataset_split + '_ssi' + abstract_idx_str + '.pkl'),
                    'wb') as f:
                pickle.dump(simple_similar_source_indices_list_plus_empty, f)

        if FLAGS.kaiqiang:
            # kaiqiang_article_file.write('\n'.join(kaiqiang_article_texts))
            # kaiqiang_abstract_file.write('\n'.join(kaiqiang_abstract_texts))
            kaiqiang_article_file.close()
            kaiqiang_abstract_file.close()
        if FLAGS.highlight:
            extracted_sents_in_article_html_file.close()
        a = 0
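# Hedged sketch (not original code): the pickle written above under
# FLAGS.print_output holds one entry per processed example, each entry being
# that example's simple_similar_source_indices (source-sentence indices per
# summary sentence). It can be read back like this:
def load_ssi_pickle_sketch(ssi_path, dataset_split, abstract_idx_str=''):
    import os
    import pickle
    pkl_path = os.path.join(ssi_path, dataset_split + '_ssi' + abstract_idx_str + '.pkl')
    with open(pkl_path, 'rb') as f:
        return pickle.load(f)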
Example #22
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    for dataset_name in dataset_names:
        FLAGS.dataset_name = dataset_name

        source_dir = os.path.join(data_dir, dataset_name)

        if FLAGS.dataset_split == 'all':
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            else:
                dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]

        for dataset_split in dataset_splits:

            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))

            total = len(source_files) * 1000
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            out_dir = os.path.join('data', 'bert', dataset_name,
                                   'article_embeddings', 'input_article')
            util.create_dirs(out_dir)

            writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'w')  # text mode: article strings are written below
            # writer.write('\t'.join(['should_merge', 'sent1', 'sent2', 'example_idx', 'ssi']) + '\n')
            inst_id = 0
            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                # article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                # groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
                # if doc_indices is None or (dataset_name != 'duc_2004' and len(doc_indices) != len(util.flatten_list_of_lists(article_sent_tokens))):
                #     doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                # doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                # rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(doc_indices, article_sent_tokens)
                # similar_source_indices_list = util.enforce_sentence_limit(groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                article = ' '.join(raw_article_sents)
                writer.write(article + '\n')
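# Hedged sketch (an assumption about downstream use, not original code): the
# .tsv written above holds one whole article per line (its sentences joined
# with spaces), presumably consumed line by line by a separate BERT
# article-embedding script.
def iter_articles_sketch(tsv_path):
    with open(tsv_path) as f:
        for line in f:
            yield line.rstrip('\n')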
Example #23
def convertImagesMasterMAP(targetDir,
                           targetMetaDir,
                           imageMetaDir,
                           jobs,
                           img2bal,
                           stopped,
                           queue,
                           result_queue,
                           num_imgs2process,
                           verbose=False,
                           nProc=None,
                           method_galign=GALIGN_NORMAL,
                           method_lalign=LALIGN_NORMAL):
    """ Called by both single and multi-page elections. Performs
    Target Extraction.
    Input:
        str targetDir: Directory to dump extracted target images to.
        str targetMetaDir: Directory to store target metadata into.
        str imageMetaDir: Directory to store metadata for each Ballot,
            such as ballotpath, path to each extracted target, assoc'd
            blank ballot, isflipped.
        list jobs: [[tmppaths_i, bbs_i, imgpaths_i, targetDir_i, targetDiffDir_i, imageMetaDir_i, queue], ...]
        stopped:
    """
    targetDiffDir = targetDir + '_diffs'

    print "...removing previous Target Extract results..."
    if os.path.exists(targetDir):
        shutil.rmtree(targetDir)
    if os.path.exists(targetDiffDir):
        shutil.rmtree(targetDiffDir)
    if os.path.exists(targetMetaDir):
        shutil.rmtree(targetMetaDir)
    if os.path.exists(imageMetaDir):
        shutil.rmtree(imageMetaDir)
    print "...Finished removing previous Target Extract results"

    create_dirs(targetDir)
    create_dirs(targetDiffDir)
    create_dirs(targetMetaDir)
    create_dirs(imageMetaDir)

    if nProc is None:
        nProc = sh.numProcs()
    # nProc = 1
    num_jobs = len(jobs)

    if nProc < 2:
        print 'using only 1 processes'
        # default behavior for non multiproc machines
        for job in jobs:
            if stopped():
                return False
            t0 = time.clock()
            convertImagesWorkerMAP(job)
            print time.clock() - t0
    else:
        print 'using ', nProc, ' processes'
        pool = mp.Pool(processes=nProc)
        '''
        it = [False]
        def imdone(x):
            it[0] = True
            print "I AM DONE NOW!"
        '''
        if wx.App.IsMainLoopRunning():
            util.MyGauge.all_next_job(num_jobs)
        print "GOING UP TO", num_jobs
        # pool.map_async(convertImagesWorkerMAP,jobs,callback=lambda x: imdone(it))
        pool.map_async(convertImagesWorkerMAP, jobs)
        cnt = 0
        while cnt < len(jobs):
            val = queue.get(block=True)
            if val == True:
                if wx.App.IsMainLoopRunning():
                    util.MyGauge.all_tick()
                cnt += 1
            elif type(val) in (str, unicode):
                # Something went wrong!
                print "    WARNING: detected a failed extract job {0}.".format(
                    cnt)
                cnt += 1
        pool.close()
        pool.join()

    print "    (Finished processing targetextract jobs)"

    cnt = 0
    avg_intensities = []  # [(path, float avg_intensity), ...]
    # maps {int ballotid: {int page: [targetsdir, targetmetadir, diffmetadir,
    # imgmetadir]}}
    bal2targets = {}

    while cnt < num_imgs2process:
        (avg_intensities_cur, balP, page, target_rootdir, targetdiff_rootdir,
         imgmeta_rootdir) = result_queue.get(block=True)
        avg_intensities.extend(avg_intensities_cur)
        ballotid = img2bal[balP]
        # print "...finished ballotid {0}".format(ballotid)
        bal2targets.setdefault(ballotid,
                               {})[page] = (target_rootdir, targetdiff_rootdir,
                                            imgmeta_rootdir)
        cnt += 1
    print 'done.'
    return avg_intensities, bal2targets
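# Hedged sketch of the worker-side protocol the loop above appears to expect
# (an assumption, not the original convertImagesWorkerMAP): each job signals
# completion by putting True on `queue`, or an error string on failure, while
# per-image results are pushed onto `result_queue` separately.
def convertImagesWorkerMAP_sketch(job):
    tmppaths, bbs, imgpaths, targetDir, targetDiffDir, imageMetaDir, queue = job
    try:
        # ... extract targets for each image in imgpaths ...
        queue.put(True)
    except Exception as e:
        queue.put(str(e))  # string values are treated as failed jobs by the master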
if FLAGS.dataset_name == 'xsum':
    l_param = 40
else:
    l_param = 100

temp_in_dir = os.path.join(lambdamart_in_dir,
                           'lambdamart_' + FLAGS.singles_and_pairs)
temp_out_dir = os.path.join(lambdamart_out_dir,
                            'lambdamart_' + FLAGS.singles_and_pairs)
if FLAGS.pca:
    temp_in_dir += '_pca'
    temp_out_dir += '_pca'
temp_in_path = temp_in_dir + '.txt'
temp_out_path = temp_out_dir + '.txt'
util.create_dirs(temp_in_dir)
util.create_dirs(temp_out_dir)
my_log_dir = os.path.join(log_dir, exp_name)
dec_dir = os.path.join(my_log_dir, 'decoded')
ref_dir = os.path.join(my_log_dir, 'reference')
html_dir = os.path.join(my_log_dir, 'hightlighted_html')
util.create_dirs(dec_dir)
util.create_dirs(ref_dir)
util.create_dirs(html_dir)
util.create_dirs(temp_dir)

tfidf_vec_path = 'data/tfidf/' + tfidf_model + '_tfidf_vec_5.pkl'
with open(tfidf_vec_path, 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

pca_vec_path = 'data/tfidf/' + 'all' + '_pca.pkl'
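# Hedged sketch (an assumption; the loading code is not shown in this
# excerpt): pca_vec_path is presumably unpickled the same way as the TF-IDF
# vectorizer above, gated on FLAGS.pca, and passed to
# util.get_doc_substituted_tfidf_matrix below as the `pca` argument.
# if FLAGS.pca:
#     with open(pca_vec_path, 'rb') as f:
#         pca = pickle.load(f)
# else:
#     pca = None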
def main(unused_argv):
    print('Running statistics on %s' % exp_name)

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'

    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']],
                                     [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']],
                                   [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))

        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))
        total = len(source_files) * 1000 if (
            'cnn' in in_dataset or 'newsroom' in in_dataset
            or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(source_dir + '/' + split +
                                                   '*',
                                                   True,
                                                   False,
                                                   should_check_valid=False)
        # for example in tqdm(example_generator, total=total):
        ex_gen = example_generator_extended(example_generator, total,
                                            single_feat_len, pair_feat_len,
                                            FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        if FLAGS.num_instances != -1:
            ex_list = ex_list[:FLAGS.num_instances]
        print('Converting...')
        # all_features = pool.map(convert_article_to_lambdamart_features, ex_list)

        # all_features = ray.get([convert_article_to_lambdamart_features.remote(ex) for ex in ex_list])

        if FLAGS.lr:
            all_instances = list(
                futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = [inst.features for inst in all_instances]
            x = np.array(x)
            y = [inst.relevance for inst in all_instances]
            y = np.expand_dims(np.array(y), 1)
            x_y = np.concatenate((x, y), 1)
            np.save(writer, x_y)
        else:
            list(futures.map(convert_article_to_lambdamart_features, ex_list))
            # writer.write(''.join(all_features))

        # all_features = []
        # for example  in tqdm(ex_gen, total=total):
        #     all_features.append(convert_article_to_lambdamart_features(example))

        # all_features = util.flatten_list_of_lists(all_features)
        # num1 = sum(x == 1 for x in all_features)
        # num2 = sum(x == 2 for x in all_features)
        # print 'Single sent: %d instances. Pair sent: %d instances.' % (num1, num2)

        # for example in tqdm(ex_gen, total=total):
        #     features = convert_article_to_lambdamart_features(example)
        #     writer.write(features)

        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        writer = open(final_out_path, 'w')  # text mode: concatenating text feature files
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                text = f.read()
            writer.write(text)
        writer.close()
    util.print_execution_time(start_time)
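# Hedged note (an assumption about the downstream format): the concatenated
# .txt written above is presumably in the RankLib / SVMlight-style format that
# LambdaMART consumes, one candidate per line:
#     <relevance> qid:<query_id> 1:<feat_1> 2:<feat_2> ...
# A line could be assembled from a (relevance, qid, features) instance like so:
def format_ranklib_line_sketch(relevance, qid, features):
    feats = ' '.join('%d:%f' % (i + 1, val) for i, val in enumerate(features))
    return '%d qid:%d %s' % (relevance, qid, feats)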
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    util.create_dirs(highlight_root)

    if not FLAGS.only_pairs:
        stats = {}
        for system in systems:
            print('Processing ' + system + '...')
            num_compress = 0
            num_fuse = 0
            num_copy = 0
            num_fail = 0
            highlight_dir = os.path.join(highlight_root, system)
            processed_dir = os.path.join(processed_root, system)
            util.create_dirs(highlight_dir)

            f_ssi = open(os.path.join(processed_dir, 'source_indices.txt'),
                         'w')
            f_summ = open(
                os.path.join(processed_dir, 'summaries_tokenized.txt'))
            f_article = open(
                os.path.join(processed_root, 'article',
                             'articles_tokenized.txt'))

            for example_idx in tqdm(range(11490)):
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                summary_sent_tokens = [
                    sent.split()
                    for sent in f_summ.readline().strip().split('\t')
                ]
                article_sent_tokens = [
                    sent.split() for sent in
                    f_article.readline().lower().strip().split('\t')
                ]

                groundtruth_ssi_list, lcs_paths_list, article_lcs_paths_list, smooth_article_paths_list = get_simple_source_indices_list(
                    summary_sent_tokens, article_sent_tokens, None,
                    FLAGS.sentence_limit, min_matched_tokens)
                groundtruth_highlighted_html = html_highlight_sents_in_article(
                    summary_sent_tokens,
                    groundtruth_ssi_list,
                    article_sent_tokens,
                    lcs_paths_list=lcs_paths_list,
                    article_lcs_paths_list=smooth_article_paths_list)
                all_html = '<u>System Summary</u><br><br>' + groundtruth_highlighted_html
                write_highlighted_html(all_html, highlight_dir, example_idx)
                f_ssi.write('\t'.join([
                    ','.join(str(idx) for idx in source_indices
                             ) if len(source_indices) >= 1 else '-1'
                    for source_indices in groundtruth_ssi_list
                ]) + '\n')
                for ssi_idx, ssi in enumerate(groundtruth_ssi_list):
                    if len(ssi) >= 2:
                        num_fuse += 1
                    elif len(ssi) == 1:
                        source_sent = ' '.join(article_sent_tokens[ssi[0]])
                        summ_sent = ' '.join(summary_sent_tokens[ssi_idx])
                        if source_sent == summ_sent:
                            num_copy += 1
                        else:
                            num_compress += 1
                            # tqdm.write(source_sent + '\n' + summ_sent + '\n\n')
                    else:
                        num_fail += 1
                a = 0
            stats[system] = (num_compress, num_fuse, num_copy, num_fail)
            f_summ.close()
            f_article.close()
            f_ssi.close()
        print("num_compress, num_fuse, num_copy, num_fail")
        for system in systems:
            print(system)
            total = sum(stats[system]) * 1.
            print('\t'.join(
                ["%.2f" % (val * 100 / total) for val in stats[system]]))

    else:
        util.create_dirs(pairs_only_processed_root)
        f_article = open(
            os.path.join(processed_root, 'article', 'articles.txt'))
        f_summs = []
        f_ssis = []
        for sys_idx, system in enumerate(systems):
            processed_dir = os.path.join(processed_root, system)
            f_summ = open(os.path.join(processed_dir, 'summaries.txt'))
            f_ssi = open(os.path.join(processed_dir, 'source_indices.txt'))
            f_summs.append(f_summ)
            f_ssis.append(f_ssi)

        w_article = open(
            os.path.join(pairs_only_processed_root, 'articles.txt'), 'w')
        w_summ = open(os.path.join(pairs_only_processed_root, 'summaries.txt'),
                      'w')
        w_ssi = open(
            os.path.join(pairs_only_processed_root, 'source_indices.txt'), 'w')
        w_system = open(os.path.join(pairs_only_processed_root, 'systems.txt'),
                        'w')

        systems_total = []

        for example_idx in tqdm(range(11490)):
            if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                break

            article_str = f_article.readline()

            systems_summ_sents = []
            systems_ssis = []
            system_names = []
            no_reference_pairs = False
            ref_summ_sent = None
            ref_source_indices = None
            for sys_idx, system in enumerate(systems):
                system_name = systems[sys_idx]
                f_summ = f_summs[sys_idx]
                f_ssi = f_ssis[sys_idx]
                summary_sents = f_summ.readline().strip().split('\t')
                ssi = [
                    source_indices_str.split(',') for source_indices_str in
                    f_ssi.readline().strip().split('\t')
                ]
                if system_name == 'reference':
                    ssi_pairs = []
                    summary_sents_pairs = []
                    for summ_sent_idx, source_indices in enumerate(ssi):
                        if len(source_indices) == 2:
                            ssi_pairs.append(source_indices)
                            summary_sents_pairs.append(
                                summary_sents[summ_sent_idx])
                    if len(ssi_pairs) == 0:
                        no_reference_pairs = True
                        break
                    summary_sents_pairs, ssi_pairs = util.shuffle(
                        summary_sents_pairs, ssi_pairs)
                    ref_summ_sent = summary_sents_pairs[0]
                    ref_source_indices = ','.join(ssi_pairs[0])
                else:
                    for summ_sent_idx, source_indices in enumerate(ssi):
                        if len(source_indices) == 2:
                            try:
                                systems_summ_sents.append(
                                    summary_sents[summ_sent_idx])
                            except:
                                print(len(summary_sents), len(ssi),
                                      summ_sent_idx, system, example_idx)
                                raise
                            systems_ssis.append(','.join(ssi[summ_sent_idx]))
                            system_names.append(system_name)
            if no_reference_pairs:
                continue
            if len(systems_summ_sents) < num_summ_sents_per_hit:
                continue
            systems_summ_sents, systems_ssis, system_names = util.shuffle(
                systems_summ_sents, systems_ssis, system_names)
            systems_summ_sents = systems_summ_sents[:num_summ_sents_per_hit - 1]
            systems_ssis = systems_ssis[:num_summ_sents_per_hit - 1]
            system_names = system_names[:num_summ_sents_per_hit - 1]

            systems_summ_sents.append(ref_summ_sent)
            systems_ssis.append(ref_source_indices)
            system_names.append('reference')
            systems_summ_sents, systems_ssis, system_names = util.shuffle(
                systems_summ_sents, systems_ssis, system_names)

            w_article.write(article_str)
            w_summ.write('\t'.join(systems_summ_sents) + '\n')
            w_ssi.write('\t'.join(systems_ssis) + '\n')
            w_system.write('\t'.join(system_names) + '\n')

            systems_total.extend(system_names)

        print(Counter(systems_total))
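# Hedged sketch (an assumption, not the original util.shuffle): the calls
# above appear to shuffle several parallel lists with one shared permutation,
# so summary sentences, source indices, and system names stay aligned.
def shuffle_in_unison_sketch(*lists):
    import random
    order = list(range(len(lists[0])))
    random.shuffle(order)
    return tuple([lst[i] for i in order] for lst in lists)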
Example #27
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.summarizer == 'all':
        summary_methods = list(summarizers.keys())
    else:
        summary_methods = [FLAGS.summarizer]
    if FLAGS.dataset_name == 'all':
        dataset_names = datasets
    else:
        dataset_names = [FLAGS.dataset_name]

    sheets_strs = []
    for summary_method in summary_methods:
        summary_fn = summarizers[summary_method]
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name

            original_dataset_name = 'xsum' if 'xsum' in dataset_name else 'cnn_dm' if 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name else ''
            vocab = Vocab('logs/vocab' + '_' + original_dataset_name,
                          50000)  # create a vocabulary

            source_dir = os.path.join(data_dir, dataset_name)
            source_files = sorted(
                glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))

            total = len(source_files) * 1000 if (
                'cnn' in dataset_name or 'newsroom' in dataset_name
                or 'xsum' in dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + FLAGS.dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            if dataset_name == 'duc_2004':
                abs_source_dir = os.path.join(
                    os.path.expanduser('~') + '/data/tf_data/with_coref',
                    dataset_name)
                abs_example_generator = data.example_generator(
                    abs_source_dir + '/' + FLAGS.dataset_split + '*',
                    True,
                    False,
                    should_check_valid=False)
                abs_names_to_types = [('abstract', 'string_list')]

            triplet_ssi_list = []
            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if dataset_name == 'duc_2004':
                    abs_example = next(abs_example_generator)
                    groundtruth_summary_texts = util.unpack_tf_example(
                        abs_example, abs_names_to_types)
                    groundtruth_summary_texts = groundtruth_summary_texts[0]
                    groundtruth_summ_sents_list = [[
                        sent.strip() for sent in data.abstract2sents(abstract)
                    ] for abstract in groundtruth_summary_texts]

                else:
                    groundtruth_summary_texts = [groundtruth_summary_text]
                    groundtruth_summ_sents_list = []
                    for groundtruth_summary_text in groundtruth_summary_texts:
                        groundtruth_summ_sents = [
                            sent.strip() for sent in
                            groundtruth_summary_text.strip().split('\n')
                        ]
                        groundtruth_summ_sents_list.append(
                            groundtruth_summ_sents)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
                if doc_indices is None:
                    doc_indices = [0] * len(
                        util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list,
                    FLAGS.sentence_limit)

                log_dir = os.path.join(log_root,
                                       dataset_name + '_' + summary_method)
                dec_dir = os.path.join(log_dir, 'decoded')
                ref_dir = os.path.join(log_dir, 'reference')
                util.create_dirs(dec_dir)
                util.create_dirs(ref_dir)

                parser = PlaintextParser.from_string(
                    ' '.join(raw_article_sents), Tokenizer("english"))
                summarizer = summary_fn()

                summary = summarizer(
                    parser.document,
                    5)  #Summarize the document with 5 sentences
                summary = [str(sentence) for sentence in summary]

                summary_tokenized = []
                for sent in summary:
                    summary_tokenized.append(sent.lower())

                rouge_functions.write_for_rouge(groundtruth_summ_sents_list,
                                                summary_tokenized,
                                                example_idx,
                                                ref_dir,
                                                dec_dir,
                                                log=False)

                decoded_sent_tokens = [
                    sent.split() for sent in summary_tokenized
                ]
                sentence_limit = 2
                sys_ssi_list, _, _ = get_simple_source_indices_list(
                    decoded_sent_tokens, article_sent_tokens, vocab,
                    sentence_limit, min_matched_tokens)
                triplet_ssi_list.append(
                    (groundtruth_similar_source_indices_list, sys_ssi_list,
                     -1))

            print('Evaluating Lambdamart model F1 score...')
            suffix = util.all_sent_selection_eval(triplet_ssi_list)
            print(suffix)

            results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
            print(("Results_dict: ", results_dict))
            sheets_str = rouge_functions.rouge_log(results_dict,
                                                   log_dir,
                                                   suffix=suffix)
            sheets_strs.append(dataset_name + '_' + summary_method + '\n' +
                               sheets_str)

    for sheets_str in sheets_strs:
        print(sheets_str + '\n')
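# Hedged sketch (an assumption about the module-level `summarizers` dict used
# above): it presumably maps method names to sumy summarizer classes, e.g.:
def build_summarizers_sketch():
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    return {
        'lexrank': LexRankSummarizer,
        'lsa': LsaSummarizer,
        'textrank': TextRankSummarizer,
    }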
Example #28
def main():
    root_path = os.getcwd()
    util.create_dirs(OUTPUT_DIR)

    output_path = os.path.join(root_path, 'results', OUTPUT_DIR)
    input_path = os.path.join(root_path, *DATA_PATH)
    anot_path = os.path.join(root_path, *ANOT_PATH)
    label_path = os.path.join(root_path, *LABEL_PATH)

    img_util = util.ImgProcessor(OUTPUT_DIR, save_mode=False)

    labels = {}

    if NORMALIZE_LABEL:
        with open(os.path.join(root_path, *NORMALIZE_MAP_FILE)) as json_file:
            norm_label_map = json.load(json_file)

    if ADD_LABEL:
        print('Preparing Labels')
        with codecs.open(label_path, "r", encoding="utf-8") as csv_file:
            reader = csv.reader(csv_file)
            next(reader)
            for row in reader:
                id = '{}_{}'.format(row[0], row[1])
                p = row[2]

                if NORMALIZE_LABEL:
                    labels[id] = norm_label_map[p]
                else:
                    labels[id] = float(p)


    print('Start Pre-processing: {}'.format(input_path))

    count = 0
    edge_over_pred = []
    edge_under_pred = []
    color_over_pred = []
    color_under_pred = []

    full_labels = []
    full_dataset = []

    for file_name in os.listdir(input_path):
        if COUNT_LIMIT != -1 and count >= COUNT_LIMIT:
            break

        file_path = os.path.join(input_path, file_name)
        if os.path.isfile(file_path):
            print("\tProcessing: {}".format(file_name))
            img = cv2.imread(file_path)
            processed_img = []

            name, type = os.path.splitext(file_name)
            info = name.split('_')
            slide = int(info[0])
            rid = int(info[1])

            if BIN_DATASET:
                p = labels.get(name, -1)
                if not (p == -1 or BIN_RANGE[0] <= p < BIN_RANGE[1]):
                    continue

            if APPEND_RAW:
                for i in range(3):
                    processed_img.append(img[:, :, i])

            if APPEND_CELL_EDGE_DETECT:
                if EVALUATE_CELL_DETECTION:
                    true_cell_count = get_cell_count(anot_path, file_name)

                for thr in CELL_EDGE_THRESHOLD:
                    new_img, anot_size = img_util.cell_detection(img, file_name, threshold=thr)
                    processed_img.append(new_img)
                    if EVALUATE_CELL_DETECTION:
                        edge_over_pred, edge_under_pred = evaluate_cell_detection(true_cell_count,
                                                                              anot_size, edge_over_pred, edge_under_pred)
                    if DEV_MODE:
                        break

            if APPEND_CELL_COLOR_DETECT:
                for boundary in COLOR_BOUNDARIES:
                    new_img, anot_size = img_util.cell_detection(img, file_name, mode=1, boundaries=boundary)
                    processed_img.append(new_img)
                    if EVALUATE_CELL_DETECTION:
                        color_over_pred, color_under_pred = evaluate_cell_detection(true_cell_count,
                                                                              anot_size, color_over_pred, color_under_pred)
                    if DEV_MODE:
                        break

            if APPEND_VESSEL_DETECT:
                for thr in VESSEL_THRESHOLD:
                    new_img = img_util.detect_red_vessels(img, file_name, bgr_base=VESSEL_BASE, threshold=thr)
                    processed_img.append(new_img)

                    if DEV_MODE:
                        break

            if APPEND_WHITE_DETECT:
                red_scale = util.grayscale_img('red', img)
                green_scale = util.grayscale_img('green', img)
                blue_scale = util.grayscale_img('blue', img)

                for thr in WHITE_THRESHOLD:
                    temp_img = img.copy()

                    new_img = img_util.detect_white_matter(temp_img, file_name, red_scale,
                                                           green_scale, blue_scale, threshold=thr)
                    processed_img.append(new_img)
                    if DEV_MODE:
                        break

            full_dataset.append(processed_img)

            if ADD_LABEL:
                if CLASSIFY_LABELS:
                    p = labels[name]
                    label = [0, 0, 0]
                    if 0 <= p < 0.1:
                        label[0] = 1
                    elif 0.1 <= p < 0.45:
                        label[1] = 1
                    else:
                        label[2] = 1
                else:
                    label = labels[name]
            else:
                label = 0
            data_row = [slide, rid, label]
            full_labels.append(data_row)

            count += 1

    print('Saving Dataset')

    if SAVE_PROCESSED_DATA:
        nb_batchs = int(len(full_dataset)/1000)
        prefix = OUTPUT_DATA_PREFIX

        if CLASSIFY_LABELS:
            prefix += '_class'

        if BIN_DATASET:
            prefix += '_bin({}-{})'.format(BIN_RANGE[0], BIN_RANGE[1])

        if NORMALIZE_LABEL:
            prefix = '{}_norm{}'.format(prefix, NORM_VERSION)

        for i in range(nb_batchs):
            output_file = '{}_{}'.format(prefix, str(i))
            lb = i*1000
            ub = lb+1000

            np.savez_compressed(os.path.join(output_path, OUTPUT_DATA_PATH, output_file), labels=full_labels[lb:ub],
                                dataset=full_dataset[lb:ub])

        remainder = len(full_dataset) % 1000

        output_file = '{}_{}'.format(prefix, str(nb_batchs))
        if remainder > 0:
            lb = nb_batchs * 1000
            ub = lb + remainder

            np.savez_compressed(os.path.join(output_path, OUTPUT_DATA_PATH, output_file), labels=full_labels[lb:ub],
                                    dataset=full_dataset[lb:ub])

    # Calculate Cell Detection Performance
    if EVALUATE_CELL_DETECTION:
        edge_avg_over_pred = 0
        edge_avg_under_pred = 0

        if len(edge_over_pred) > 0:
            edge_avg_over_pred = median(edge_over_pred)
        if len(edge_under_pred) > 0:
            edge_avg_under_pred = median(edge_under_pred)

        color_avg_over_pred = 0
        color_avg_under_pred = 0

        if len(color_over_pred) > 0:
            color_avg_over_pred = median(color_over_pred)
        if len(color_under_pred) > 0:
            color_avg_under_pred = median(color_under_pred)

        print('Performance:')
        print('\tEdge Avg Under Pred = {}'.format(edge_avg_under_pred))
        print('\tEdge Avg Over Pred = {}'.format(edge_avg_over_pred))
        print('\tColor Avg Under Pred = {}'.format(color_avg_under_pred))
        print('\tColor Avg Over Pred = {}'.format(color_avg_over_pred))

    print('Done Pre-processing')
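# Hedged sketch (not original code): each compressed batch saved above can be
# read back with numpy; `labels` holds the [slide, rid, label] rows and
# `dataset` holds the per-channel processed images for up to 1000 samples.
def load_batch_sketch(npz_path):
    import numpy as np
    batch = np.load(npz_path, allow_pickle=True)
    return batch['labels'], batch['dataset']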
Example #29
def convertImagesMasterMAP(targetDir, targetMetaDir, imageMetaDir, jobs, 
                           img2bal, stopped, queue, result_queue,
                           num_imgs2process,
                           verbose=False, nProc=None,
                           method_galign=GALIGN_NORMAL,
                           method_lalign=LALIGN_NORMAL):
    """ Called by both single and multi-page elections. Performs
    Target Extraction.
    Input:
        str targetDir: Directory to dump extracted target images to.
        str targetMetaDir: Directory to store target metadata into.
        str imageMetaDir: Directory to store metadata for each Ballot,
            such as ballotpath, path to each extracted target, assoc'd
            blank ballot, isflipped.
        list jobs: [[tmppaths_i, bbs_i, imgpaths_i, targetDir_i, targetDiffDir_i, imageMetaDir_i, queue], ...]
        stopped:
    """
    targetDiffDir=targetDir+'_diffs'

    print "...removing previous Target Extract results..."
    _t = time.time()
    if os.path.exists(targetDir): shutil.rmtree(targetDir)
    if os.path.exists(targetDiffDir): shutil.rmtree(targetDiffDir)
    if os.path.exists(targetMetaDir): shutil.rmtree(targetMetaDir)
    if os.path.exists(imageMetaDir): shutil.rmtree(imageMetaDir)
    dur = time.time() - _t
    print "...Finished removing previous Target Extract results ({0} s).".format(dur)

    create_dirs(targetDir)
    create_dirs(targetDiffDir)
    create_dirs(targetMetaDir)
    create_dirs(imageMetaDir)

    if nProc == None:
        nProc = sh.numProcs()
    #nProc = 1
    num_jobs = len(jobs)
    
    if nProc < 2:
        print 'using only 1 processes'
        # default behavior for non multiproc machines
        for job in jobs:
            if stopped():
                return False
            t0=time.clock();
            convertImagesWorkerMAP(job)
            print time.clock()-t0
    else:
        print 'using ', nProc, ' processes'
        pool=mp.Pool(processes=nProc)

        '''
        it = [False]
        def imdone(x):
            it[0] = True
            print "I AM DONE NOW!"
        '''
        if wx.App.IsMainLoopRunning():
            wx.CallAfter(Publisher().sendMessage, "signals.MyGauge.nextjob", num_jobs)
        print "GOING UP TO", num_jobs
        #pool.map_async(convertImagesWorkerMAP,jobs,callback=lambda x: imdone(it))
        pool.map_async(convertImagesWorkerMAP, jobs)
        cnt = 0
        while cnt < len(jobs):
            val = queue.get(block=True)
            if val == True:
                if wx.App.IsMainLoopRunning():
                    wx.CallAfter(Publisher().sendMessage, "signals.MyGauge.tick")
                cnt += 1
            elif type(val) in (str, unicode):
                # Something went wrong!
                print "    WARNING: detected a failed extract job {0}.".format(cnt)
                cnt += 1
        pool.close()
        pool.join()

    print "    (Finished processing targetextract jobs)"

    cnt = 0
    avg_intensities = [] # [(path, float avg_intensity), ...]
    bal2targets = {} # maps {int ballotid: {int page: [targetsdir, targetmetadir, diffmetadir, imgmetadir]}}

    while cnt < num_imgs2process:
        (avg_intensities_cur, balP, page, target_rootdir,
         targetdiff_rootdir, imgmeta_rootdir) = result_queue.get(block=True)
        avg_intensities.extend(avg_intensities_cur)
        ballotid = img2bal[balP]
        #print "...finished ballotid {0}".format(ballotid)
        bal2targets.setdefault(ballotid, {})[page] = (target_rootdir,
                                                      targetdiff_rootdir, imgmeta_rootdir)
        cnt += 1
    print 'done.'
    return avg_intensities, bal2targets
Example #30
def process_attn_selections(attn_dir, decode_dir, vocab, extraction_eval=False):

    html_dir = os.path.join(decode_dir, 'extr_vis')
    util.create_dirs(html_dir)
    file_names = sorted(glob.glob(os.path.join(attn_dir, '*')))

    if extraction_eval:
        ssi_dir = os.path.join('data/ssi', FLAGS.dataset_name, 'test_ssi.pkl')
        with open(ssi_dir, 'rb') as f:  # pickle files should be opened in binary mode
            ssi_list = pickle.load(f)
        if len(ssi_list) != len(file_names):
            raise Exception('len of ssi_list does not equal len file_names: ', len(ssi_list), len(file_names))
    triplet_ssi_list = []
    for file_idx, file_name in enumerate(tqdm(file_names)):
        with open(file_name) as f:
            data = json.load(f)
        p_gens = util.flatten_list_of_lists(data['p_gens'])
        article_lst = data['article_lst']
        abstract_lst = data['abstract_str'].strip().split()
        decoded_lst = data['decoded_lst']
        attn_dists = np.array(data['attn_dists'])

        article_lst = [art_word.replace('__', '') for art_word in article_lst]
        decoded_lst = [dec_word.replace('__', '') for dec_word in decoded_lst]
        abstract_lst = [abs_word.replace('__', '') for abs_word in abstract_lst]

        min_matched_tokens = 2
        if 'singles' in FLAGS.exp_name:
            sentence_limit = 1
        else:
            sentence_limit = 2
        summary_sent_tokens = [nltk.tokenize.word_tokenize(sent) for sent in nltk.tokenize.sent_tokenize(' '.join(abstract_lst))]
        decoded_sent_tokens = [nltk.tokenize.word_tokenize(sent) for sent in nltk.tokenize.sent_tokenize(' '.join(decoded_lst))]
        article_sent_tokens = [nltk.tokenize.word_tokenize(sent) for sent in nltk.tokenize.sent_tokenize(' '.join(article_lst))]
        gt_ssi_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list(summary_sent_tokens, article_sent_tokens, vocab, sentence_limit,
                                       min_matched_tokens)
        sys_ssi_list, _, _ = get_simple_source_indices_list(decoded_sent_tokens, article_sent_tokens, vocab, sentence_limit,
                                       min_matched_tokens)


        match_indices = []
        for dec_idx, dec in enumerate(decoded_lst):
            art_match_indices = [art_idx for art_idx, art_word in enumerate(article_lst) if art_word.replace('__', '') == dec or art_word == dec]
            if len(art_match_indices) == 0:
                match_indices.append(None)
            else:
                art_attns = [attn_dists[dec_idx, art_idx] for art_idx in art_match_indices]
                best_match_idx = art_match_indices[np.argmax(art_attns)]
                match_indices.append(best_match_idx)

        html = create_html(article_lst, match_indices, decoded_lst, [abstract_lst], file_idx, gt_ssi_list, lcs_paths_list, article_lcs_paths_list, summary_sent_tokens, article_sent_tokens)
        with open(os.path.join(html_dir, '%06d.html' % file_idx), 'wb') as f:
            f.write(html)

        if extraction_eval:
            triplet_ssi_list.append((ssi_list[file_idx], sys_ssi_list, -1))

    if extraction_eval:
        print('Evaluating Lambdamart model F1 score...')
        suffix = util.all_sent_selection_eval(triplet_ssi_list)
        print(suffix)
        with open(os.path.join(decode_dir, 'extraction_results.txt'), 'w') as f:
            f.write(suffix)


    a=0
Example #31
import util
import os
from tqdm import tqdm
import glob
import json

split_dict = json.loads(open(os.path.expanduser('~') + "/xsum/XSum/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json").read())
data_types = ["test", "validation", "train"]

article_dir = os.path.expanduser('~') + '/xsum/XSum/XSum-Dataset/xsum-preprocessed/document'
summary_dir = os.path.expanduser('~') + '/xsum/XSum/XSum-Dataset/xsum-preprocessed/summary'
out_dir = os.path.expanduser('~') + '/xsum/xsum-logan'
util.create_dirs(out_dir)

article_paths = sorted(glob.glob(os.path.join(article_dir, "*")))
summary_paths = sorted(glob.glob(os.path.join(summary_dir, "*")))


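# Copy each BBC article/summary pair into the test/validation/train split
# defined by the official XSum 90-5-5 split file.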
for data_type in data_types:
    bbcids = split_dict[data_type]

    if data_type == 'validation':
        dtype = 'val'
    else:
        dtype = data_type
    for bbcid_idx, bbcid in enumerate(tqdm(bbcids)):
        article_path = os.path.join(article_dir, bbcid + '.document')
        summary_path = os.path.join(summary_dir, bbcid + '.summary')
        if not os.path.exists(article_path):
            continue
Example #32
                    help="clear graphs and test fields in native dirs")
parser.add_argument('-inf',
                    '--iframe',
                    default=-1,
                    help="display frame during inference")

parser.add_argument('-g',
                    '--grid_path',
                    default='',
                    help='Path to grid dictionary')
args = parser.parse_args()

config.data_path = args.input_dir

if os.path.isdir(config.data_path):
    util.create_dirs(args.clear)
else:
    print('Input dir is not valid')

if not os.path.isdir(config.output_dir):
    print('WARNING - output dir is not valid. Meta graphs are not saved')
    exit()

config.resample = args.trilinear
config.param_state_size = args.latent_state_size
config.n_filters = args.filters
config.output_dir = args.output_dir
config.save_freq = args.graph_saving_freq
config.f_tensorboard = args.tensorboard_saving_freq
config.sb_blocks = args.small_blocks
config.batch_size = args.batch_size
Example #33
if FLAGS.upper_bound:
    exp_name = exp_name + '_upperbound'

if FLAGS.singles_and_pairs == 'singles':
    sentence_limit = 1
else:
    sentence_limit = 2

if FLAGS.dataset_name == 'xsum':
    l_param = 40
else:
    l_param = 100
# l_param = 100
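# Paths for the BERT scorer: the test input TSV it reads and the results TSV it writes.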
temp_in_path = os.path.join(bert_in_dir, 'test.tsv')
temp_out_path = os.path.join(bert_scores_dir, 'test_results.tsv')
util.create_dirs(bert_scores_dir)
my_log_dir = os.path.join(log_dir, exp_name)
dec_dir = os.path.join(my_log_dir, 'decoded')
ref_dir = os.path.join(my_log_dir, 'reference')
html_dir = os.path.join(my_log_dir, 'highlighted_html')
util.create_dirs(dec_dir)
util.create_dirs(ref_dir)
util.create_dirs(html_dir)
util.create_dirs(ssi_out_dir)

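# Parses BERT's tab-separated test_results.tsv output into a 2-D numpy array of scores.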
def read_bert_scores(file_path):
    with open(file_path) as f:
        lines = f.readlines()
    data = [[float(x) for x in line.split('\t')] for line in lines]
    data = np.array(data)
    return data
Example #34
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    start_time = time.time()
    np.random.seed(random_seed)
    util.create_dirs(os.path.join(out_dir, FLAGS.input_dataset))

    if FLAGS.input_dataset == 'all':
        datasets = [
            'duc_2003', 'duc_2004', 'tac_2008', 'tac_2010', 'tac_2011',
            'cnn_dm', 'xsum'
        ]
    else:
        datasets = [FLAGS.input_dataset]
    if dataset_split == 'all':
        dataset_splits = ['train', 'val', 'test']
    else:
        dataset_splits = [dataset_split]
    all_articles = []
    for in_dataset in datasets:
        source_dir = os.path.join(data_dir, in_dataset)

        for split in dataset_splits:
            # split = dataset_split
            source_files = sorted(glob.glob(source_dir + '/' + split + '*'))

            if len(source_files) == 0:
                continue

            total = len(source_files) * 1000 if ('cnn' in in_dataset or 'newsroom' in in_dataset) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + split + '*',
                True,
                False,
                should_check_valid=False)
            # for example in tqdm(example_generator, total=total):
            ex_gen = example_generator_extended(example_generator, total)
            print('Creating list')
            ex_list = [ex for ex in ex_gen]
            print('Converting...')

            articles = list(futures.map(save_as_txt_file, ex_list))
            all_articles.extend(articles)
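    # Default vectorizer: unigram TF-IDF over the collected article texts,
    # ignoring terms that appear in more than half of the documents.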
    vec = TfidfVectorizer(input='content',
                          ngram_range=(1, 1),
                          min_df=min_df,
                          max_df=0.5,
                          decode_error='ignore',
                          preprocessor=my_preprocessor,
                          tokenizer=my_tokenizer)

    # list(futures.map(save_as_txt_file, ex_list))
    # file_list = [os.path.join(out_dir, in_dataset, fname) for fname in os.listdir(os.path.join(out_dir, in_dataset))]
    # vec = TfidfVectorizer(input='filename', ngram_range=(1,1), min_df=min_df, max_df=0.5, decode_error='ignore')
    # vec.fit(file_list)

    if FLAGS.pca:
        X = vec.fit_transform(all_articles)
        suffix = '_pca'
    elif FLAGS.pg_mmr:
        stemmer = PorterStemmer()

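        # PG-MMR variant: stem each token before building TF-IDF features so that
        # inflected forms of a word map to the same feature.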
        class StemmedTfidfVectorizer(TfidfVectorizer):
            def build_analyzer(self):
                analyzer = super(TfidfVectorizer, self).build_analyzer()
                return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

        vec = StemmedTfidfVectorizer(analyzer='word',
                                     stop_words='english',
                                     ngram_range=(1, 3),
                                     max_df=0.7)
        vec.fit_transform(all_articles)
    else:
        vec.fit_transform(all_articles)
        suffix = ''
    print('Vocabulary size', len(list(vec.vocabulary_.keys())))
    if FLAGS.pg_mmr:
        util.create_dirs(os.path.join(log_dir, 'tfidf_vectorizer'))
        with open(
                os.path.join(log_dir, 'tfidf_vectorizer',
                             FLAGS.input_dataset + '.dill'), 'wb') as f:
            dill.dump(vec, f)
    else:
        with open(
                os.path.join(
                    out_dir, FLAGS.input_dataset + '_tfidf_vec_' +
                    str(min_df) + suffix + '.pkl'), 'wb') as f:
            pickle.dump(vec, f)

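    # If PCA is requested, reduce the TF-IDF matrix to 100 latent dimensions
    # with truncated SVD (LSA) and save the fitted model alongside the vectorizer.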
    if FLAGS.pca:
        print('Fitting LSA model...')
        from sklearn.decomposition import TruncatedSVD
        svd = TruncatedSVD(n_components=100)
        svd.fit(X)
        with open(os.path.join(out_dir, FLAGS.input_dataset + '_pca' + '.pkl'),
                  'wb') as f:
            pickle.dump(svd, f)

    util.print_execution_time(start_time)