def count_data_and_build_dict(self, data_list, gene_dicts=True):
    """Scan *data_list*, collect length statistics, and optionally build vocabularies.

    Args:
        data_list: iterable of samples; each sample is an iterable of tree-node
            dicts carrying 'token_seq' (list of tokens) and 'char_seq'
            (list of per-token character lists).  # assumes this schema — TODO confirm against caller
        gene_dicts (bool): when True, build token/char/glove dictionaries;
            when False, return an empty dict for the dictionaries.

    Returns:
        tuple: (dicts, lengths) where
            dicts  -- {'token': ..., 'char': ..., 'glove': ...} or {} when gene_dicts is False
            lengths -- {'sent': max_sent_len, 'token': max_token_len}
    """
    def add_ept_and_unk(a_list):
        # Reserve index 0 for the padding/empty token and index 1 for unknown.
        a_list.insert(0, '@@@empty')
        a_list.insert(1, '@@@unk')
        return a_list

    _logger.add()
    _logger.add('counting and build dictionaries')

    token_collection = []
    char_collection = []
    sent_len_collection = []
    token_len_collection = []
    for sample in data_list:
        for tree_node in sample:
            token_collection += tree_node['token_seq']
            sent_len_collection.append(len(tree_node['token_seq']))
            for char_seq in tree_node['char_seq']:
                char_collection += char_seq
                token_len_collection.append(len(char_seq))

    # dynamic_length appears to return (cut-off length, ...); keep full coverage
    # for sentences (rate=1) and the 99th percentile for token lengths.
    max_sent_len = dynamic_length(sent_len_collection, 1, security=False)[0]
    max_token_len = dynamic_length(token_len_collection, 0.99, security=False)[0]

    if gene_dicts:
        # token & char
        tokenSet = dynamic_keep(token_collection, 1)
        charSet = dynamic_keep(char_collection, 1)
        if cfg.use_glove_unk_token:
            gloveData = load_glove(cfg.word_embedding_length)
            gloveTokenSet = list(gloveData.keys())
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))  ##!!!
                gloveTokenSet = list(set([token.lower() for token in gloveTokenSet]))  ##!!!

            # Drop every corpus token from the glove vocabulary in a single
            # O(n) pass.  (The previous try/list.remove loop was O(n^2) and
            # raised/caught ValueError per miss; since both lists hold unique
            # elements, this filter is exactly equivalent and order-preserving.)
            corpus_tokens = set(tokenSet)
            gloveTokenSet = [token for token in gloveTokenSet
                             if token not in corpus_tokens]
        else:
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))
            gloveTokenSet = []
        tokenSet = add_ept_and_unk(tokenSet)
        charSet = add_ept_and_unk(charSet)
        dicts = {'token': tokenSet, 'char': charSet, 'glove': gloveTokenSet}
    else:
        dicts = {}

    _logger.done()
    return dicts, {'sent': max_sent_len, 'token': max_token_len}
def count_data_and_build_dict(dataset, sent_len_rate, gene_dicts=True):
    """Scan a SQuAD-style *dataset*, collect length statistics, and optionally build vocabularies.

    Args:
        dataset: iterable of topic dicts, each with 'paragraphs'; each paragraph
            carries 'context_token' (list of sentence token lists) and 'qas'
            (each with 'question_token').  # assumes this schema — TODO confirm against caller
        sent_len_rate (float): coverage rate passed to dynamic_length for the
            sentence-length cut-off.
        gene_dicts (bool): when True, build token/glove dictionaries; when
            False, return an empty dict for the dictionaries.

    Returns:
        tuple: (dicts, lengths) where
            dicts  -- {'token': ..., 'glove': ...} or {} when gene_dicts is False
            lengths -- {'sent_num': ..., 'sent_len': ..., 'question': ...}
    """
    def add_ept_and_unk(a_list):
        # Reserve index 0 for the padding/empty token and index 1 for unknown.
        a_list.insert(0, '@@@empty')
        a_list.insert(1, '@@@unk')
        return a_list

    _logger.add()
    _logger.add('counting and build dictionaries')

    token_collection = []
    sent_num_collection = []
    sent_len_collection = []
    question_len_collection = []
    for topic in dataset:
        for paragraph in topic['paragraphs']:
            sent_num_collection.append(len(paragraph['context_token']))
            for sent_token in paragraph['context_token']:
                sent_len_collection.append(len(sent_token))
                token_collection += sent_token
            for qa in paragraph['qas']:
                question_len_collection.append(len(qa['question_token']))
                token_collection += qa['question_token']
    _logger.done()

    # dynamic_length appears to return (cut-off length, ...); full coverage
    # for sentence counts, caller-chosen rate for sentence lengths, and
    # 99.5th percentile for question lengths.
    max_sent_num, _ = dynamic_length(sent_num_collection, 1.)
    max_sent_len, _ = dynamic_length(sent_len_collection, sent_len_rate)
    max_question_len, _ = dynamic_length(question_len_collection, 0.995)

    if gene_dicts:
        tokenSet = dynamic_keep(token_collection, 0.995)
        if cfg.use_glove_unk_token:
            gloveData = load_glove(cfg.word_embedding_length)
            gloveTokenSet = list(gloveData.keys())
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))  ##!!!
                gloveTokenSet = list(
                    set([token.lower() for token in gloveTokenSet]))  ##!!!

            # Drop every corpus token from the glove vocabulary in a single
            # O(n) pass.  (The previous try/list.remove loop was O(n^2) and
            # raised/caught ValueError per miss; since both lists hold unique
            # elements, this filter is exactly equivalent and order-preserving.)
            corpus_tokens = set(tokenSet)
            gloveTokenSet = [token for token in gloveTokenSet
                             if token not in corpus_tokens]
        else:
            if cfg.lower_word:
                tokenSet = list(set([token.lower() for token in tokenSet]))
            gloveTokenSet = []
        tokenSet = add_ept_and_unk(tokenSet)
        dicts = {'token': tokenSet, 'glove': gloveTokenSet}
    else:
        dicts = {}

    _logger.done()
    return dicts, {
        'sent_num': max_sent_num,
        'sent_len': max_sent_len,
        'question': max_question_len
    }