def creating_alphabets(alphabet_path, alphabet_data_paths, wiki_alphabet_data_paths, word_dict):
    """Create the main and wiki alphabets and record the size of each one.

    Both alphabet sets are built with ``prepare_data.create_alphabets`` using
    the ``'train'`` entry of the respective path mapping as training data and
    every other entry as extra data.

    Args:
        alphabet_path: Passed through as the first argument of
            ``prepare_data.create_alphabets`` for the main alphabets.
        alphabet_data_paths: Mapping with a ``'train'`` key; all values under
            other keys are forwarded as ``extra_paths``.
        wiki_alphabet_data_paths: Same layout as ``alphabet_data_paths``, for
            the wiki data.
        word_dict: Forwarded as ``embedd_dict``.

    Returns:
        A tuple ``(alphabet_dict, wiki_alphabet_dict)`` (this used to return
        ``alphabet_dict`` only). Each dict holds the created alphabets under
        ``'alphabets'`` plus one ``'num_<prefix>'`` entry per alphabet, where
        ``<prefix>`` is the alphabet key up to its first underscore.

    Raises:
        KeyError: If either path mapping has no ``'train'`` entry.
    """
    train_paths = alphabet_data_paths['train']
    wiki_train_paths = wiki_alphabet_data_paths['train']
    extra_paths = [v for k, v in alphabet_data_paths.items() if k != 'train']
    # Fix: this previously iterated ``alphabet_data_paths`` (copy-paste slip),
    # so the wiki alphabets were built from the main corpus's extra paths.
    wiki_extra_paths = [
        v for k, v in wiki_alphabet_data_paths.items() if k != 'train'
    ]

    alphabet_dict = {}
    wiki_alphabet_dict = {}
    alphabet_dict['alphabets'] = prepare_data.create_alphabets(
        alphabet_path,
        train_paths,
        extra_paths=extra_paths,
        max_vocabulary_size=100000,
        embedd_dict=word_dict)
    # NOTE(review): ``wiki_alphabet_paths`` is neither a parameter nor a local
    # of this function. It has to resolve to a module-level name, otherwise
    # this call raises NameError -- confirm where it is defined.
    wiki_alphabet_dict['alphabets'] = prepare_data.create_alphabets(
        wiki_alphabet_paths,
        wiki_train_paths,
        extra_paths=wiki_extra_paths,
        max_vocabulary_size=100000,
        embedd_dict=word_dict)

    # Record and log sizes for the main alphabets first, then the wiki ones
    # (same order as before; the two identical loops are merged into one).
    for target in (alphabet_dict, wiki_alphabet_dict):
        for k, v in target['alphabets'].items():
            num_key = 'num_' + k.split('_')[0]
            target[num_key] = v.size()
            logger.info("%s : %d" % (num_key, target[num_key]))

    return alphabet_dict, wiki_alphabet_dict
def creating_alphabets(alphabet_path, alphabet_data_paths, word_dict):
    """Create the alphabets, dump them to a debug file and record their sizes.

    The alphabets are built with ``prepare_data.create_alphabets`` using the
    ``'train'`` entry of ``alphabet_data_paths`` as training data and every
    other entry as extra data. As a debugging aid the alphabets are printed
    and their contents are written to ``debug_alphabets.txt`` in the current
    working directory.

    Args:
        alphabet_path: Passed through as the first argument of
            ``prepare_data.create_alphabets``.
        alphabet_data_paths: Mapping with a ``'train'`` key; all values under
            other keys are forwarded as ``extra_paths``.
        word_dict: Forwarded as ``embedd_dict``.

    Returns:
        A dict holding the created alphabets under ``'alphabets'`` plus one
        ``'num_<prefix>'`` entry per alphabet, where ``<prefix>`` is the
        alphabet key up to its first underscore.

    Raises:
        KeyError: If ``alphabet_data_paths`` has no ``'train'`` entry.
    """
    train_paths = alphabet_data_paths['train']
    extra_paths = [v for k, v in alphabet_data_paths.items() if k != 'train']

    alphabet_dict = {}
    alphabet_dict['alphabets'] = prepare_data.create_alphabets(
        alphabet_path,
        train_paths,
        extra_paths=extra_paths,
        max_vocabulary_size=300000,  # raised from 100K (300K or 400K)
        embedd_dict=word_dict)

    print(alphabet_dict['alphabets'])  # debugging
    # The context manager guarantees the debug file is closed even if
    # get_content(), size() or logging raises midway through the loop.
    # UTF-8 is explicit so non-ASCII entries do not depend on the locale's
    # default encoding.
    with open('debug_alphabets.txt', 'w', encoding='utf-8') as fout:
        print('WRITING alphabets TO DEBUG_ALPHABETS.txt')  # debugging
        for k, v in alphabet_dict['alphabets'].items():
            fout.write(f'({k}, {v.get_content()})\n')  # debugging
            num_key = 'num_' + k.split('_')[0]
            alphabet_dict[num_key] = v.size()
            logger.info("%s : %d" % (num_key, alphabet_dict[num_key]))

    return alphabet_dict
def creating_alphabets(alphabet_path, alphabet_data_paths, word_dict):
    """Build the alphabets and collect the size of each one.

    ``prepare_data.create_alphabets`` is called with the ``'train'`` entry of
    ``alphabet_data_paths`` as training data and the values of all remaining
    entries as ``extra_paths``.

    Args:
        alphabet_path: Passed through as the first argument of
            ``prepare_data.create_alphabets``.
        alphabet_data_paths: Mapping that must contain a ``'train'`` key.
        word_dict: Forwarded as ``embedd_dict``.

    Returns:
        A dict with the created alphabets under ``'alphabets'`` and, for each
        alphabet, a ``'num_<prefix>'`` entry holding its ``size()``, where
        ``<prefix>`` is the alphabet key up to its first underscore.
    """
    train_paths = alphabet_data_paths['train']

    # Everything that is not the training split counts as extra data.
    extra_paths = []
    for split_name, split_path in alphabet_data_paths.items():
        if split_name != 'train':
            extra_paths.append(split_path)

    alphabets = prepare_data.create_alphabets(
        alphabet_path,
        train_paths,
        extra_paths=extra_paths,
        max_vocabulary_size=100000,
        embedd_dict=word_dict,
    )

    alphabet_dict = {'alphabets': alphabets}
    for name, alphabet in alphabets.items():
        num_key = 'num_' + name.split('_', 1)[0]
        size = alphabet.size()
        alphabet_dict[num_key] = size
        logger.info("%s : %d" % (num_key, size))

    return alphabet_dict