def get_all_captions_from_files(files: MutableSequence[Path],
                                data_dir: Path) \
        -> List[Dict[str, str]]:
    """Reads the given CSV files and returns their caption entries.

    The ``file*`` fields (e.g. ``file_name``) of every entry are dropped,
    keeping only the caption fields.

    :param files: CSV files to read.
    :type files: list[pathlib.Path]
    :param data_dir: Base directory of the CSV files.
    :type data_dir: pathlib.Path
    :return: Caption entries from all files.
    :rtype: list[dict[str, str]]
    """
    # Generator instead of an intermediate list: chain.from_iterable
    # consumes it lazily, so there is no need to materialize all files'
    # entries up front.
    all_entries = chain.from_iterable(
        csv_functions.read_csv_file(a_f, data_dir) for a_f in files)
    return [{k: v for k, v in i.items() if not k.startswith('file')}
            for i in all_entries]
def main():
    """Analyzes word frequencies of the Clotho development captions and
    saves a random subset of candidate validation files.

    For every entry the frequency of its least frequent caption word is
    computed; entries whose minimum frequency lies strictly between 10 and
    20 become candidates, and up to ``max_files`` of them are sampled at
    random and pickled to ``validation_file_names.pickle``.
    """
    all_entries = csv_functions.read_csv_file(
        'clotho_captions_development.csv', 'data')

    # Per entry, count each word once (union of the unique words over all
    # of the entry's captions), so the counter reflects in how many entries
    # a word appears.
    all_words = []
    for entry in all_entries:
        entry_words = [
            captions_functions.get_sentence_words(v, unique=True)
            for k, v in entry.items()
            if not k.startswith('file')
        ]
        all_words.extend(set(chain.from_iterable(entry_words)))

    counter = Counter(all_words)

    results = []
    max_min = 0
    files_to_use = []
    max_files = 50

    for entry in all_entries:
        captions = [v for k, v in entry.items() if not k.startswith('file')]
        # Frequency of the rarest word across all captions of the entry.
        # counter[word] (not .get) guarantees an int, so min() cannot
        # receive a None.
        min_freq = min(
            counter[word]
            for caption in captions
            for word in captions_functions.get_sentence_words(
                caption, unique=True)
        )
        max_min = max(max_min, min_freq)
        results.append({'file': entry.get('file_name'),
                        'min_freq': min_freq})
        if 10 < min_freq < 20:
            files_to_use.append(entry.get('file_name'))

    print(f'Max minimum freq is {max_min}')
    print(f'Amount of files that I can use is {len(files_to_use)}')

    # Randomly sample up to `max_files` candidate files.
    indices = np.random.permutation(len(files_to_use))[:max_files]
    final_files = [files_to_use[i] for i in indices]

    # Plain loop for the side effect; a comprehension of print() calls
    # builds a useless list of Nones.
    for i, f_name in enumerate(final_files):
        print(f'File {i+1:02d} is {f_name}')

    p = Path('validation_file_names.pickle')
    print('Saving list of validation files...', end=' ')
    with p.open('wb') as f:
        pickle.dump(final_files, f)
    print('done.')
def get_annotations_files(settings_ann: MutableMapping[str, Any],
                          dir_ann: Path) \
        -> Tuple[List[MutableMapping[str, Any]],
                 List[MutableMapping[str, Any]]]:
    """Reads, processes (if necessary), and returns the annotations files.

    :param settings_ann: Settings to be used.
    :type settings_ann: dict
    :param dir_ann: Directory of the annotations files.
    :type dir_ann: pathlib.Path
    :return: Development and evaluation annotations files.
    :rtype: list[collections.OrderedDict], list[collections.OrderedDict]
    """
    field_caption = settings_ann['captions_fields_prefix']

    csv_development = read_csv_file(
        file_name=settings_ann['development_file'], base_dir=dir_ann)
    csv_evaluation = read_csv_file(
        file_name=settings_ann['evaluation_file'], base_dir=dir_ann)

    # Clotho entries carry five captions, named by the configured prefix
    # formatted with indices 1..5.
    caption_fields = [field_caption.format(c_ind) for c_ind in range(1, 6)]

    for csv_entry in chain(csv_development, csv_evaluation):
        # Clean sentence to remove any spaces before punctuations.
        captions = [
            clean_sentence(csv_entry.get(caption_field),
                           keep_case=True,
                           remove_punctuation=False,
                           remove_specials=False)
            for caption_field in caption_fields
        ]

        if settings_ann['use_special_tokens']:
            captions = ['<SOS> {} <EOS>'.format(caption)
                        for caption in captions]

        # Write the processed captions back in place; a plain loop instead
        # of a side-effect list comprehension of dict.update() calls.
        for caption_field, caption in zip(caption_fields, captions):
            csv_entry[caption_field] = caption

    return csv_development, csv_evaluation
ind_list = [] for kw in kws: try: ind = words_list.index(kw) except ValueError: continue ind_list.append(ind) keyword_indices.append(ind_list) return keyword_indices if __name__ == "__main__": # load metadata csv files dir_path = 'data/clotho_csv_files' meta_dev = read_csv_file('clotho_metadata_development.csv', dir_path) meta_eval = read_csv_file('clotho_metadata_evaluation.csv', dir_path) # load StanfordPOSTagger -> pos_tag is better # jar = "./stanford-postagger-full-2020-08-06/stanford-postagger.jar" # model = "./stanford-postagger-full-2020-08-06/models/english-bidirectional-distsim.tagger" # tagger = StanfordPOSTagger(model, jar, encoding="utf-8") lmt = WordNetLemmatizer() dict_pos_map = { 'NN': NOUN, 'NNS': NOUN, 'MD': NOUN, 'VBG': VERB,