Example #1
from itertools import chain
from pathlib import Path
from typing import Dict, List, MutableSequence

import csv_functions  # project-local helper, as referenced in the snippet


def get_all_captions_from_files(files: MutableSequence[Path],
                                data_dir: Path) \
        -> List[Dict[str, str]]:
    all_entries = chain.from_iterable(
        [csv_functions.read_csv_file(a_f, data_dir) for a_f in files])

    return [{k: v
             for k, v in i.items() if not k.startswith('file')}
            for i in all_entries]
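A minimal usage sketch (an assumption: csv_functions.read_csv_file is taken to return an iterable of dicts keyed by the CSV columns, with the audio file name under keys starting with 'file'):

caption_files = [Path('clotho_captions_development.csv')]  # hypothetical input
captions = get_all_captions_from_files(caption_files, Path('data'))
# Each element keeps only the caption columns, e.g. caption_1 .. caption_5.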
Example #2
from collections import Counter
from itertools import chain
from pathlib import Path
import pickle

import numpy as np

# Project-local helpers used below (module names as referenced in the snippet).
import csv_functions
import captions_functions


def main():
    all_entries = csv_functions.read_csv_file(
        'clotho_captions_development.csv', 'data')

    all_words = []

    for entry in all_entries:
        entry_words = [
            captions_functions.get_sentence_words(v, unique=True)
            for k, v in entry.items() if not k.startswith('file')
        ]
        all_words.extend(set(chain.from_iterable(entry_words)))

    counter = Counter(all_words)

    results = []
    max_min = 0
    files_to_use = []
    max_files = 50

    for entry in all_entries:
        captions = [v for k, v in entry.items() if not k.startswith('file')]
        min_freq = 1e6
        for caption in captions:
            min_freq = min(
                min_freq,
                *[counter[word] for word in
                  captions_functions.get_sentence_words(caption, unique=True)])
        max_min = max(max_min, min_freq)
        results.append({'file': entry.get('file_name'), 'min_freq': min_freq})
        if 10 < min_freq < 20:
            files_to_use.append(entry.get('file_name'))

    print(f'Max minimum freq is {max_min}')
    print(f'Number of files that I can use is {len(files_to_use)}')
    # plt.hist([k['min_freq'] for k in results],
    #          bins=max_min,
    #          histtype='stepfilled')
    # plt.grid()
    # plt.show()

    x = np.arange(len(files_to_use))
    final_files = [
        files_to_use[i] for i in np.random.permutation(x)[:max_files]
    ]
    for i, f in enumerate(final_files):
        print(f'File {i + 1:02d} is {f}')

    p = Path('validation_file_names.pickle')

    print('Saving list of validation files...', end=' ')
    with p.open('wb') as f:
        pickle.dump(final_files, f)
    print('done.')
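For reference, the nested loop in main() computes, per file, the smallest corpus frequency over all words of all its captions. An equivalent single expression (same counter and helper as above; note it assumes every caption yields at least one word, where the original would keep its 1e6 sentinel) would be:

min_freq = min(
    counter[word]
    for caption in captions
    for word in captions_functions.get_sentence_words(caption, unique=True)
)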
Example #3
from itertools import chain
from pathlib import Path
from typing import Any, List, MutableMapping, Tuple

# Project-local helpers; the module paths are assumptions.
from csv_functions import read_csv_file
from captions_functions import clean_sentence


def get_annotations_files(settings_ann: MutableMapping[str, Any], dir_ann: Path) -> \
        Tuple[List[MutableMapping[str, Any]], List[MutableMapping[str, Any]]]:
    """Reads, process (if necessary), and returns tha annotations files.

    :param settings_ann: Settings to be used.
    :type settings_ann: dict
    :param dir_ann: Directory of the annotations files.
    :type dir_ann: pathlib.Path
    :return: Development and evaluation annotations files.
    :rtype: list[collections.OrderedDict], list[collections.OrderedDict]
    """
    field_caption = settings_ann['captions_fields_prefix']
    csv_development = read_csv_file(file_name=settings_ann['development_file'],
                                    base_dir=dir_ann)
    csv_evaluation = read_csv_file(file_name=settings_ann['evaluation_file'],
                                   base_dir=dir_ann)

    caption_fields = [field_caption.format(c_ind) for c_ind in range(1, 6)]

    for csv_entry in chain(csv_development, csv_evaluation):
        # Clean each sentence, removing any spaces before punctuation.

        captions = [
            clean_sentence(csv_entry.get(caption_field),
                           keep_case=True,
                           remove_punctuation=False,
                           remove_specials=False)
            for caption_field in caption_fields
        ]

        if settings_ann['use_special_tokens']:
            captions = [
                '<SOS> {} <EOS>'.format(caption) for caption in captions
            ]

        for caption_field, caption in zip(caption_fields, captions):
            csv_entry[caption_field] = caption

    return csv_development, csv_evaluation
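A sketch of a call (the keys are taken from the function body; the values, in particular the 'caption_{}' prefix format, are illustrative assumptions):

settings_ann = {
    'captions_fields_prefix': 'caption_{}',  # assumed format string
    'development_file': 'clotho_captions_development.csv',
    'evaluation_file': 'clotho_captions_evaluation.csv',
    'use_special_tokens': True,
}
csv_dev, csv_eval = get_annotations_files(settings_ann, Path('data'))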
Example #4
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB

from csv_functions import read_csv_file  # project-local helper; module path assumed


# The snippet begins mid-function; this opening is reconstructed from the
# truncated code below, and the function and argument names are assumptions.
def get_keyword_indices(keywords_per_file, words_list):
    keyword_indices = []
    for kws in keywords_per_file:
        ind_list = []
        for kw in kws:
            try:
                ind = words_list.index(kw)
            except ValueError:
                continue
            ind_list.append(ind)
        keyword_indices.append(ind_list)
    return keyword_indices
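
An illustrative call (recall that the function opening above is reconstructed, so its name is an assumption):

words_list = ['rain', 'falls', 'on', 'a', 'roof']
keywords_per_file = [['rain', 'roof'], ['thunder']]
print(get_keyword_indices(keywords_per_file, words_list))
# -> [[0, 4], []]  ('thunder' is not in words_list, so it is skipped)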


if __name__ == "__main__":

    # load metadata csv files
    dir_path = 'data/clotho_csv_files'
    meta_dev = read_csv_file('clotho_metadata_development.csv', dir_path)
    meta_eval = read_csv_file('clotho_metadata_evaluation.csv', dir_path)

    # load StanfordPOSTagger -> pos_tag is better
    # jar = "./stanford-postagger-full-2020-08-06/stanford-postagger.jar"
    # model = "./stanford-postagger-full-2020-08-06/models/english-bidirectional-distsim.tagger"
    # tagger = StanfordPOSTagger(model, jar, encoding="utf-8")


    lmt = WordNetLemmatizer()

    dict_pos_map = {
        'NN': NOUN,
        'NNS': NOUN,
        'MD': NOUN,
        'VBG': VERB,