Example #1
def create_dataset(in_paths, out_paths, tk_vocab, max_plan_length,
                   max_summary_length, max_table_length, logger):
    summary_path, json_path, cplan_path = in_paths

    tables = [
        m.records for m in extract_matches_from_json(
            json_path, word_dict=None, process_summary=False)
    ]

    # read the summaries, one summary per line
    with open(summary_path, 'r') as f:
        summaries = f.read().strip().split('\n')

    tk_to_ix = tk_vocab.to_dict()

    tp_vocab = create_tp_vocab()
    tp_to_ix = tp_vocab.to_dict()

    ha_vocab = create_ha_vocab()
    ha_to_ix = ha_vocab.to_dict()

    pad_value = tk_to_ix[tk_vocab.get_pad()]
    if pad_value != tp_to_ix[tp_vocab.get_pad()] or \
            pad_value != ha_to_ix[ha_vocab.get_pad()]:
        raise RuntimeError(
            "Different padding values in the token, type and home/away vocabs!")

    cplan_ids = None
    if cplan_path is not None:
        cplan_ids = create_content_plan_ids(cplan_path, max_plan_length,
                                            pad_value, tables, logger)

    np_in = np.full(shape=[len(tables), 4, max_table_length],
                    fill_value=pad_value,
                    dtype=np.int16)
    # add space for special tokens
    np_target = np.full(shape=[len(tables), max_summary_length + 2],
                        fill_value=pad_value,
                        dtype=np.int16)
    unk_stat = UnkStat(tk_vocab)

    for m_ix, (table, summary) in enumerate(zip(tables, summaries)):
        for t_ix, record in enumerate(table):
            np_in[m_ix, 0, t_ix] = tp_to_ix[record.type]
            np_in[m_ix, 1, t_ix] = assign_ix_or_unk(
                tk_to_ix, "_".join(record.entity.strip().split()), unk_stat)
            np_in[m_ix, 2, t_ix] = assign_ix_or_unk(tk_to_ix, record.value,
                                                    unk_stat)
            np_in[m_ix, 3, t_ix] = ha_to_ix[record.ha]
        # BOS/EOS strings are shared across vocabs, so the type vocab's
        # special tokens can be looked up in the token vocab
        np_target[m_ix, 0] = tk_to_ix[tp_vocab.get_bos()]
        summary_tokens = summary.strip().split()
        for s_ix, subword in enumerate(summary_tokens):
            np_target[m_ix, s_ix + 1] = assign_ix_or_unk(
                tk_to_ix, subword, unk_stat)
        np_target[m_ix, len(summary_tokens) + 1] = tk_to_ix[tp_vocab.get_eos()]

    logger(
        f"{out_paths[0]} : {unk_stat.get_unk_stat()} tokens assigned for OOV words"
    )

    extension = os.path.splitext(out_paths[0])[1]
    if extension == ".txt":
        save_np_to_txt(np_in, np_target, cplan_ids, out_paths, logger,
                       summary_path, json_path)
    elif extension == ".npy":
        logger(f"summaries {summary_path} -> {out_paths[1]}")
        logger(f"tables {json_path} -> {out_paths[0]}")
        logger("--- saving to .npy")
        np.save(out_paths[0], np_in)
        np.save(out_paths[1], np_target)
        if cplan_ids is not None:
            np.save(out_paths[2], cplan_ids)
    elif extension == ".tfrecord":
        save_np_to_tfrecord(np_in, np_target, cplan_ids, out_paths[0], logger,
                            summary_path, json_path)
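
A minimal usage sketch for this variant, assuming the function is importable from the project's preprocessing code; the file paths, length limits and the token-vocab loader below are illustrative placeholders, not values or helpers taken from the original project.

# Hypothetical usage sketch -- paths, limits and load_token_vocab are placeholders.
tk_vocab = load_token_vocab("token_vocab.txt")  # stand-in for however the project builds its token vocab
create_dataset(
    in_paths=("train_summaries.txt", "train.json", None),  # no content plan in this run
    out_paths=("train_in.npy", "train_target.npy"),        # .npy extension selects the np.save branch
    tk_vocab=tk_vocab,
    max_plan_length=0,        # unused when cplan_path is None
    max_summary_length=800,
    max_table_length=700,
    logger=print)             # any callable works as the logger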
Example #2
def create_dataset(
        input_paths,
        output_paths,
        tk_vocab,
        mlcp,  # max length content plan
        mls,  # max length summary
        mlt,  # max length table
        order_records,
        prun_records,
        logger):
    """ Create dataset from json file and preprocessed summary 
    Args:
        input_paths:    summary_path, json_path, content_plan_path
        output_paths:   show where to save the outputs
        tk_vocab:       token vocab
        mlcp:           maximum length content plan
        mls:            maximum length summary
        mlt:            maximum length table
        order_records:  order records so that first are team records followed by player records ordered by their point-total
        prun_record:    order records so that first are team records followed by player records of the top 10 players according
                        to their point total, which are further filtered
        logger:         callable, for logging important information
    """
    summary_path, json_path, cplan_path = input_paths

    tables = [
        m.records
        for m in extract_matches_from_json(json_path,
                                           word_dict=None,
                                           process_summary=False,
                                           order_records=order_records,
                                           prun_records=prun_records)
    ]
    # collect some statistics about the sequences of records
    lengths = [len(table) for table in tables]
    print(f"max table length: {max(lengths)}")
    print(f"min table length: {min(lengths)}")
    print(f"average table length: {sum(lengths) / len(lengths)}")

    # read the summaries
    with open(summary_path, 'r') as f:
        summaries = f.read().strip().split('\n')

    tk_to_ix = tk_vocab.to_dict()

    tp_vocab = create_tp_vocab()
    tp_to_ix = tp_vocab.to_dict()

    ha_vocab = create_ha_vocab()
    ha_to_ix = ha_vocab.to_dict()

    pad_value = tk_to_ix[tk_vocab.get_pad()]
    if pad_value != tp_to_ix[tp_vocab.get_pad()] or \
            pad_value != ha_to_ix[ha_vocab.get_pad()]:
        raise RuntimeError(
            "Different padding values in the token, type and home/away vocabs!")

    # extract content plan
    cplan_ids = None
    if cplan_path is not None:
        cplan_ids = create_content_plan_ids(cplan_path, mlcp, pad_value,
                                            tables, logger)

    np_in = np.full(shape=[len(tables), 4, mlt],
                    fill_value=pad_value,
                    dtype=np.int16)
    # add space for special tokens
    np_target = np.full(shape=[len(tables), mls + 2],
                        fill_value=pad_value,
                        dtype=np.int16)
    unk_stat = UnkStat(tk_vocab)

    # transform input tables and output summaries to np array
    for m_ix, (table, summary) in enumerate(zip(tables, summaries)):
        for t_ix, record in enumerate(table):
            np_in[m_ix, 0, t_ix] = tp_to_ix[record.type]
            np_in[m_ix, 1, t_ix] = assign_ix_or_unk(
                tk_to_ix, "_".join(record.entity.strip().split()), unk_stat)
            np_in[m_ix, 2, t_ix] = assign_ix_or_unk(tk_to_ix, record.value,
                                                    unk_stat)
            np_in[m_ix, 3, t_ix] = ha_to_ix[record.ha]
        # BOS/EOS strings are shared across vocabs, so the type vocab's
        # special tokens can be looked up in the token vocab
        np_target[m_ix, 0] = tk_to_ix[tp_vocab.get_bos()]
        summary_tokens = summary.strip().split()
        for s_ix, subword in enumerate(summary_tokens):
            np_target[m_ix, s_ix + 1] = assign_ix_or_unk(
                tk_to_ix, subword, unk_stat)
        np_target[m_ix, len(summary_tokens) + 1] = tk_to_ix[tp_vocab.get_eos()]

    logger(
        f"{output_paths[0]} : {unk_stat.get_unk_stat()} tokens assigned for OOV words"
    )

    # save the np arrays in described format
    extension = os.path.splitext(output_paths[0])[1]
    if extension == ".txt":
        save_np_to_txt(np_in, np_target, cplan_ids, output_paths, logger,
                       summary_path, json_path)
    elif extension == ".npy":
        logger(f"summaries {summary_path} -> {output_paths[1]}")
        logger(f"tables {json_path} -> {output_paths[0]}")
        logger("--- saving to .npy")
        np.save(output_paths[0], np_in)
        np.save(output_paths[1], np_target)
        if cplan_ids is not None:
            np.save(output_paths[2], cplan_ids)
    elif extension == ".tfrecord":
        save_np_to_tfrecord(np_in, np_target, cplan_ids, output_paths[0],
                            logger, summary_path, json_path)
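
To make the array layout above concrete, here is a small self-contained sketch of the same encoding: four feature rows per record (type, entity, value, home/away flag) padded to the table length, and a target row of BOS + summary tokens + EOS padded to mls + 2. The toy vocabularies and the Record tuple are illustrative stand-ins, not the project's real classes.

import numpy as np
from collections import namedtuple

# Toy stand-ins for the project's vocabularies and record class (illustrative only).
Record = namedtuple("Record", ["type", "entity", "value", "ha"])
tk_to_ix = {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3, "Boston_Celtics": 4, "98": 5, "won": 6}
tp_to_ix = {"<pad>": 0, "TEAM-PTS": 1}
ha_to_ix = {"<pad>": 0, "HOME": 1, "AWAY": 2}
pad_value = 0

table = [Record(type="TEAM-PTS", entity="Boston Celtics", value="98", ha="HOME")]
summary = "Boston_Celtics won"
mlt, mls = 4, 5

np_in = np.full((1, 4, mlt), pad_value, dtype=np.int16)       # one match, 4 feature rows
np_target = np.full((1, mls + 2), pad_value, dtype=np.int16)  # +2 for <bos>/<eos>

for t_ix, record in enumerate(table):
    np_in[0, 0, t_ix] = tp_to_ix[record.type]                      # row 0: record type
    np_in[0, 1, t_ix] = tk_to_ix["_".join(record.entity.split())]  # row 1: entity, joined by "_"
    np_in[0, 2, t_ix] = tk_to_ix[record.value]                     # row 2: value
    np_in[0, 3, t_ix] = ha_to_ix[record.ha]                        # row 3: home/away flag

tokens = summary.split()
np_target[0, 0] = tk_to_ix["<bos>"]
for s_ix, tok in enumerate(tokens):
    np_target[0, s_ix + 1] = tk_to_ix.get(tok, tk_to_ix["<unk>"])
np_target[0, len(tokens) + 1] = tk_to_ix["<eos>"]

print(np_in[0])      # [[1 0 0 0] [4 0 0 0] [5 0 0 0] [1 0 0 0]]
print(np_target[0])  # [1 4 6 2 0 0 0]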
Example #3
def extract_summaries_from_json(json_file_path,
                                output_path,
                                logger,
                                transform_player_names=False,
                                prepare_for_bpe_training=False,
                                prepare_for_bpe_application=False,
                                lowercase=False,
                                exception_cities=False,
                                exception_teams=False,
                                words_limit=None,
                                all_named_entities: OccurrenceDict = None,
                                cell_dict_overall: OccurrenceDict = None):
    word_dict = OccurrenceDict()
    player_dict = OccurrenceDict()
    team_name_dict = OccurrenceDict()
    city_dict = OccurrenceDict()
    cell_dict = OccurrenceDict()

    matches = extract_matches_from_json(json_file_path,
                                        player_dict=player_dict,
                                        city_dict=city_dict,
                                        team_name_dict=team_name_dict,
                                        cell_dict=cell_dict,
                                        word_dict=word_dict,
                                        words_limit=words_limit)
    max_table_length = 0
    for match in matches:
        if max_table_length < len(match.records):
            max_table_length = len(match.records)

    # tmp_dict stays empty unless one of the summary transformations is requested,
    # so the lookups further down never hit an undefined name
    tmp_dict = OccurrenceDict()
    if transform_player_names or prepare_for_bpe_training or prepare_for_bpe_application or lowercase:
        tmp_dict = extract_players_from_summaries(
            matches,
            player_dict,
            logger,
            transform_player_names=transform_player_names,
            prepare_for_bpe_training=prepare_for_bpe_training,
            prepare_for_bpe_application=prepare_for_bpe_application,
            lowercase=lowercase,
            exception_cities=exception_cities,
            exception_teams=exception_teams)

    count = 0
    # save named entities from the summaries and table
    if all_named_entities is not None:
        for key in tmp_dict.keys():
            if key not in all_named_entities:
                count += 1
            all_named_entities.add(key, tmp_dict[key].occurrences)
        for key in team_name_dict.keys():
            if key not in all_named_entities:
                count += 1
            # each city is mentioned 16 times in any vocab
            occurrences = team_name_dict[key].occurrences
            transformed = "_".join(key.strip().split())
            all_named_entities.add(transformed, occurrences)
        for key in player_dict.keys():
            transformed = "_".join(key.strip().split())
            if transformed not in all_named_entities:
                count += 1
                all_named_entities.add(transformed)
    logger(f"{count} new values introduced to all_named_entities")

    count = 0
    # save cell values from the table
    if cell_dict_overall is not None:
        for key in cell_dict.keys():
            if key not in cell_dict_overall:
                count += 1
            cell_dict_overall.add(key, cell_dict[key].occurrences)
    logger(f"{count} new values introduced to cell_dict_overall")

    with open(output_path, 'w') as f:
        for match in matches:
            print(" ".join(match.summary.get_words()), file=f)

    return max_table_length
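
A hedged usage sketch for the function above; the file names are placeholders, and it is assumed that OccurrenceDict and the function itself are importable from the project (module names omitted).

# Hypothetical usage sketch -- file names are placeholders.
all_named_entities = OccurrenceDict()
cell_dict_overall = OccurrenceDict()

max_table_len = extract_summaries_from_json(
    "train.json",                   # RotoWire match data
    "train_summaries.txt",          # extracted summaries, one per line
    print,                          # any callable works as the logger
    transform_player_names=True,    # collapse multi-token player names into single tokens
    lowercase=True,
    all_named_entities=all_named_entities,
    cell_dict_overall=cell_dict_overall)
print(f"longest record sequence: {max_table_len}")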
Example #4
def gather_json_stats(json_file_path,
                      logger,
                      train_word_dict=None,
                      transform_player_names: bool = False):
    """
    - traverse all the elements of the json,
    - extract all the match statistics and summaries
    - create dictionaries
    """
    word_dict = OccurrenceDict()
    player_dict = OccurrenceDict()
    team_name_dict = OccurrenceDict()
    city_dict = OccurrenceDict()
    cell_dict = OccurrenceDict()
    type_dict = create_tp_vocab()

    total_summary_length = 0
    max_summary_length = None
    min_summary_length = None

    total_table_length = 0
    max_table_length = None
    min_table_length = None

    matches = extract_matches_from_json(json_file_path,
                                        player_dict=player_dict,
                                        city_dict=city_dict,
                                        team_name_dict=team_name_dict,
                                        cell_dict=cell_dict,
                                        word_dict=word_dict)

    ll = Logger(log=False)
    player_in_summary_dict = extract_players_from_summaries(
        matches,
        player_dict,
        ll,
        transform_player_names=transform_player_names)

    for match in matches:
        # collect summary statistics
        sum_length = len(match.summary)
        total_summary_length += sum_length
        if min_summary_length is None or sum_length < min_summary_length:
            min_summary_length = sum_length
        if max_summary_length is None or sum_length > max_summary_length:
            max_summary_length = sum_length

        # collect table statistics
        table_length = len(match.records)
        total_table_length += table_length
        if min_table_length is None or table_length < min_table_length:
            min_table_length = table_length
        if max_table_length is None or table_length > max_table_length:
            max_table_length = table_length

    # print summary statistics
    logger("---")
    logger(f"total number of summaries : {len(matches)}")
    logger(f"max summary length : {max_summary_length}")
    logger(f"min summary length : {min_summary_length}")
    logger(f"average summary length : {total_summary_length / len(matches)}")
    logger("---")
    logger(f"number of different tokens in summaries: {len(word_dict.keys())}")
    logger(
        f"number of different tokens with more than 5 occurrences in summaries: {len(word_dict.sort(prun_occurrences=5).keys())}"
    )
    if train_word_dict is None:
        train_word_dict = word_dict
    count = 0
    for word in word_dict.keys():
        if word in train_word_dict:
            count += 1
    overlap = (count * 100.0) / len(word_dict.keys())
    logger(
        f"percent of tokens in the current dict that also appear in the train dict: {overlap}"
    )

    # print record statistics
    logger("---")
    logger(f"max number of records : {max_table_length}")
    logger(f"min number of records : {min_table_length}")
    logger(f"average records length : {total_table_length / len(matches)}")

    # logger other vocab statistics
    logger("---")
    logger(
        f"number of unique player names in summaries: {len(player_in_summary_dict.keys())}"
    )
    logger(
        f"number of unique player names in match stats: {len(player_dict.keys())}"
    )
    logger("---")
    more_than_five_1 = player_dict.sort(prun_occurrences=5)
    more_than_five_2 = player_in_summary_dict.sort(prun_occurrences=5)
    logger(
        f"number of unique player names with more than or equal to 5 occurrences in summaries: {len(more_than_five_2.keys())}"
    )
    logger(
        f"number of unique player names with more than or equal to 5 occurrences in tables: {len(more_than_five_1.keys())}"
    )
    logger("---")
    logger(
        f"number of different tokens in cell values : {len(cell_dict.keys())}")
    more_than_five_3 = cell_dict.sort(prun_occurrences=5)
    logger(
        f"number of different tokens in cell values with more than or equal to 5 occurrences in cell values : {len(more_than_five_3.keys())}"
    )
    logger("---")
    logger(
        f"number of unique city names in match stats : {len(city_dict.keys())}"
    )
    logger(
        f"number of unique team names in match stats : {len(team_name_dict.keys())}"
    )
    logger(
        f"number of different types of table cells : {len(type_dict.keys())}")

    # player statistics
    logger("---")
    logger("20 most mentioned players in the summaries")
    for player in player_in_summary_dict.sort(20).keys():
        logger(player)

    return word_dict
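
The train_word_dict argument exists so the token overlap between dataset splits can be measured; a short sketch of that pattern, with placeholder file names:

# Hypothetical usage sketch -- file names are placeholders.
train_dict = gather_json_stats("train.json", print)  # stats for the train split, returns its word dict
gather_json_stats("valid.json", print, train_word_dict=train_dict)  # reports overlap of valid tokens with train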
Example #5
def extract_summaries_from_json(json_file_path,
                                output_path,
                                logger,
                                transform_player_names=False,
                                prepare_for_bpe_training=False,
                                prepare_for_bpe_application=False,
                                lowercase=False,
                                exception_cities=False,
                                exception_teams=False,
                                words_limit=None,
                                all_named_entities: OccurrenceDict = None,
                                cell_dict_overall: OccurrenceDict = None,
                                order_records: bool = False,
                                prun_records: bool = False):
    """ Extract summaries from .json RotoWire dataset files and save them to output_path
    
    Args:
        json_file_path:                 path to .json file with RotoWire dataset
        output_path:                    path where to save the extracted summaries
        logger:                         callable for logging important events during the extraction
        transform_player_names:         transform each all the mentions of a unique player to a single token
        prepare_for_bpe_training:       remove all the player names from the summaries, so that BPE won't learn
                                        to divide player names
        prepare_for_bpe_application:    transform and enclose the player names to special <<<>>> brackets to allow easy regexp
                                        extraction from BPE application
        lowercase:                      lowercase all the tokens with exception of the player names
        exception_cities:               add city names to the same set as player names (e.g. excluded from lowercasing etc.)
        exception_teams:                add team names to the same set as player names (e.g. excluded from lowercasing etc. )
        words_limit:                     prun all the sentences from a summary that cause it to exceed the words_limit
        all_named_entities:             store all named entities from the tables and summaries to the all_named_entities dict if provided
        cell_dict_overall:              save all the values from the records to the all_named_entities dict if provided
        order_records:                  order records so that first are team records followed by player records ordered by their point-total
        prun_record:                    order records so that first are team records followed by player records of the top 10 players according
                                        to their point total, which are further filtered
    Returns:
        max length of a sequence of records    
    """
    word_dict = OccurrenceDict()
    player_dict = OccurrenceDict()
    team_name_dict = OccurrenceDict()
    city_dict = OccurrenceDict()
    cell_dict = OccurrenceDict()

    matches = extract_matches_from_json(json_file_path,
                                        player_dict=player_dict,
                                        city_dict=city_dict,
                                        team_name_dict=team_name_dict,
                                        cell_dict=cell_dict,
                                        word_dict=word_dict,
                                        words_limit=words_limit,
                                        order_records=order_records,
                                        prun_records=prun_records)
    max_table_length = 0
    for match in matches:
        if max_table_length < len(match.records):
            max_table_length = len(match.records)

    # tmp_dict stays empty unless one of the summary transformations is requested,
    # so the lookups further down never hit an undefined name
    tmp_dict = OccurrenceDict()
    if transform_player_names or prepare_for_bpe_training or prepare_for_bpe_application or lowercase:
        tmp_dict = extract_players_from_summaries(
            matches,
            player_dict,
            logger,
            transform_player_names=transform_player_names,
            prepare_for_bpe_training=prepare_for_bpe_training,
            prepare_for_bpe_application=prepare_for_bpe_application,
            lowercase=lowercase,
            exception_cities=exception_cities,
            exception_teams=exception_teams)

    count = 0
    # save named entities from the summaries and table
    if all_named_entities is not None:
        for key in tmp_dict.keys():
            if key not in all_named_entities:
                count += 1
            all_named_entities.add(key, tmp_dict[key].occurrences)
        for key in team_name_dict.keys():
            if key not in all_named_entities:
                count += 1
            # each city is mentioned 16 times in any vocab
            occurrences = team_name_dict[key].occurrences
            transformed = "_".join(key.strip().split())
            all_named_entities.add(transformed, occurrences)
        for key in player_dict.keys():
            transformed = "_".join(key.strip().split())
            if transformed not in all_named_entities:
                count += 1
                all_named_entities.add(transformed)
    logger(f"{count} new values introduced to all_named_entities")

    count = 0
    # save cell values from the table
    if cell_dict_overall is not None:
        for key in cell_dict.keys():
            if key not in cell_dict_overall:
                count += 1
            cell_dict_overall.add(key, cell_dict[key].occurrences)
    logger(f"{count} new values introduced to cell_dict_overall")

    with open(output_path, 'w') as f:
        for match in matches:
            print(" ".join(match.summary.get_words()), file=f)

    return max_table_length
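
The returned maximum record-sequence length is what the create_dataset variants above expect as their table-length limit; a sketch of that hand-off, where the paths, the summary-length limit and the token-vocab loader are placeholders rather than values from the original project:

# Hypothetical pipeline sketch -- paths, limits and load_token_vocab are placeholders.
mlt = extract_summaries_from_json("train.json",
                                  "train_summaries.txt",
                                  print,
                                  transform_player_names=True,
                                  order_records=True,
                                  prun_records=True)
tk_vocab = load_token_vocab("token_vocab.txt")  # stand-in for the project's real vocab loader
create_dataset(("train_summaries.txt", "train.json", None),  # no content plan in this run
               ("train_in.npy", "train_target.npy"),
               tk_vocab,
               mlcp=0,       # unused when no content plan is supplied
               mls=800,
               mlt=mlt,
               order_records=True,
               prun_records=True,
               logger=print)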