def make_note_sound(self, freq, time, amplitude=1):
    """Generate the sample array for a single note.

    Args:
        freq: frequency of the note in Hz. Must be a value present in
            self.FREQUENCY_MAP (presumably note-name -> frequency; the map is
            reversed here to recover the name — TODO confirm map direction).
        time: duration of the note in seconds.
        amplitude: loudness scale applied to the normalized waveform
            (default 1, i.e. full scale).

    Returns:
        1-D numpy array of samples, normalized to [-1, 1] and then scaled
        by `amplitude`.
    """
    # linspace's `num` argument must be an integer; time * RATE may be float.
    num_samples = int(time * self.RATE)
    # BUG FIX: `amplitude` was previously multiplied inside the sine argument
    # (sin(2*pi*freq*amplitude*t)), which changed the *pitch* rather than the
    # loudness — and normalization then cancelled any amplitude effect anyway.
    # Generate a pure tone at `freq`, scale loudness on the way out.
    note = sin(2 * pi * freq * linspace(0, time, num_samples))
    # Reverse map frequency back to its note name for the debug printout.
    note_name = utils.reverse_dictionary(self.FREQUENCY_MAP)[freq]
    print(note_name, time, amplitude, len(note))
    # Normalize to [-1, 1] first so `amplitude` has a predictable effect.
    return amplitude * (note / np.max(np.abs(note), axis=0))
def main(config):
    """Train and evaluate the NER model described by `config`.

    Args:
        config: configuration object exposing vocabulary/data file paths
            (`words_filename`, `tags_filename`, ...), `max_iter` and `chars`.
    """
    # Load vocabularies mapping surface forms to integer ids.
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    id2type = reverse_dictionary(vocab_type)
    # FIX: was a Python-2 print statement (`print vocab_type`), a syntax error
    # under Python 3 which the rest of the file targets.
    print(vocab_type)

    # Processing functions that turn raw strings into id sequences.
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # Pre-trained (trimmed) GloVe embeddings.
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    def _make_dataset(filename):
        # All three splits are built identically except for the file path.
        return CoNLLDataset(filename, processing_word, processing_tag,
                            processing_iob, processing_type,
                            config.max_iter, config.chars)

    dev = _make_dataset(config.dev_filename)
    test = _make_dataset(config.test_filename)
    train = _make_dataset(config.train_filename)

    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=len(vocab_iob),
                     ntype=len(vocab_type), id2type=id2type)
    model.build()
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
def study_filtering():
    """ Temp to figure out what is going on.

    Debug/scratch routine: loads pickled dataframes and a study dictionary
    from the configured output directory, removes singleton experiment
    variants, writes the filtered rows to CSV, then drops into an IPython
    shell (embed()) for inspection.
    """
    config = ConfigParser.RawConfigParser()
    config.read('../example.cfg')
    gpath = config.get('output', 'output_dir')
    # NOTE(review): config.get returns a *string*; nstudies is later passed to
    # remove_singleton_exp_variants un-converted — confirm it expects a str,
    # otherwise this should be config.getint.
    nstudies = config.get('params', 'nstudies')
    df = pd.read_pickle(gpath + 'full_size.pkl')
    dfd = pd.read_pickle(gpath + 'drop_duplicats_size.pkl')
    # NOTE(review): nstudies is hard-coded to 2 here while the configured value
    # is used below — looks intentional for this temp script, but verify.
    df = generate_unique_mapping(dfd, df, nstudies=2)
    print('begin filtering by study')
    # Despite the .txt extension this is a pickle file.
    study_dict = pickle.load(
        open(gpath + 'dict_test.txt', 'rb'))
    sdict = reverse_dictionary(study_dict)
    print('**** study dict loaded ******')
    gs, sl = remove_singleton_exp_variants(df, sdict, nstudies)
    # NOTE(review): DataFrame.ix is deprecated/removed in modern pandas;
    # .loc (or .iloc) would be the replacement if this script is revived.
    filtered_data = dfd.ix[gs.values,:]
    filtered_data.to_csv(gpath + 'filtered_all.txt')
    # Drop into an interactive IPython session for manual inspection.
    embed()
def load_dataset(self, dataset_filepaths, parameters, token_to_vector=None):
    '''
    dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'

    Parses each split, builds global token/character/label counts, assigns
    integer indices (reserving index 0 for UNK tokens and the padding
    character), converts every split to index form via _convert_to_indices,
    and stores the resulting mappings/statistics on self.

    Returns the token -> embedding-vector mapping (loaded here if not
    supplied by the caller).
    '''
    start_time = time.time()
    print('Load dataset... ', end='', flush=True)
    # Load pretrained token embeddings unless the caller already supplied them.
    if parameters['token_pretrained_embedding_filepath'] != '':
        if token_to_vector == None:
            token_to_vector = utils_nlp.load_pretrained_token_embeddings(parameters)
    else:
        token_to_vector = {}
    if self.verbose: print("len(token_to_vector): {0}".format(len(token_to_vector)))

    # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
    # and that token embeddings that are learned in the pretrained model are loaded properly.
    all_tokens_in_pretraining_dataset = []
    all_characters_in_pretraining_dataset = []
    if parameters['use_pretrained_model']:
        pretraining_dataset = pickle.load(
            open(os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb'))
        all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()
        all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values()

    remap_to_unk_count_threshold = 1
    self.UNK_TOKEN_INDEX = 0
    self.PADDING_CHARACTER_INDEX = 0
    self.tokens_mapped_to_unk = []
    self.UNK = 'UNK'
    self.unique_labels = []
    labels = {}
    tokens = {}
    label_count = {}
    token_count = {}
    character_count = {}
    # Parse every split; missing splits are passed as None to _parse_dataset.
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
            = self._parse_dataset(dataset_filepaths.get(dataset_type, None))
        if self.verbose: print("dataset_type: {0}".format(dataset_type))
        if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    # Aggregate per-split token counts into an 'all' count.
    # NOTE(review): indexing token_count['train'][token] for tokens seen only
    # in other splits assumes _parse_dataset returns defaulting dicts
    # (0 for unseen keys) — TODO confirm.
    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token]

    # Optionally give every pretrained-embedding token an index, marking it
    # with a sentinel count of -1 so it is never treated as infrequent.
    if parameters['load_all_pretrained_token_embeddings']:
        for token in token_to_vector:
            if token not in token_count['all']:
                token_count['all'][token] = -1
                token_count['train'][token] = -1
        for token in all_tokens_in_pretraining_dataset:
            if token not in token_count['all']:
                token_count['all'][token] = -1
                token_count['train'][token] = -1

    # Aggregate character counts the same way, including pretraining characters.
    character_count['all'] = {}
    for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()):
        character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character]
    for character in all_characters_in_pretraining_dataset:
        if character not in character_count['all']:
            character_count['all'][character] = -1
            character_count['train'][character] = -1

    for dataset_type in dataset_filepaths.keys():
        if self.verbose: print("dataset_type: {0}".format(dataset_type))
        if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    # Aggregate label counts (loop variable is named `character` but iterates labels).
    label_count['all'] = {}
    for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()):
        label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character] + label_count['deploy'][character]

    # Deterministic orderings: tokens by count then key, labels by key,
    # characters by count.
    token_count['all'] = utils.order_dictionary(token_count['all'], 'value_key', reverse=True)
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)
    character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True)
    if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))

    # Assign token indices; index 0 is reserved for UNK. Tokens absent from
    # train and from the pretrained embeddings get remapped to UNK.
    token_to_index = {}
    token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk']))
    if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1
        if parameters['remap_unknown_tokens_to_unk'] == 1 and \
            (token_count['train'][token] == 0 or \
            parameters['load_only_pretrained_token_embeddings']) and \
            not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
            token not in all_tokens_in_pretraining_dataset:
            if self.verbose: print("token: {0}".format(token))
            if self.verbose: print("token.lower(): {0}".format(token.lower()))
            if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format(re.sub('\d', '0', token.lower())))
            token_to_index[token] = self.UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
            self.tokens_mapped_to_unk.append(token)
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose: print("number_of_unknown_tokens: {0}".format(number_of_unknown_tokens))

    # Tokens appearing at most `remap_to_unk_count_threshold` times in train
    # are candidates for UNK-remapping during training.
    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
    if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

    # Ensure that both B- and I- versions exist for each label
    # (and E-/S- for the BIOES tagging scheme), even if unseen in the data.
    labels_without_bio = set()
    for label in label_count['all'].keys():
        new_label = utils_nlp.remove_bio_from_label_name(label)
        labels_without_bio.add(new_label)
    for label in labels_without_bio:
        if label == 'O':
            continue
        if parameters['tagging_format'] == 'bioes':
            prefixes = ['B-', 'I-', 'E-', 'S-']
        else:
            prefixes = ['B-', 'I-']
        for prefix in prefixes:
            l = prefix + label
            if l not in label_count['all']:
                label_count['all'][l] = 0
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)

    if parameters['use_pretrained_model']:
        # Reuse the pretrained model's label indexing verbatim.
        self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys()))
        # Make sure labels are compatible with the pretraining dataset.
        for label in label_count['all']:
            if label not in pretraining_dataset.label_to_index:
                raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) +
                                     "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels)))
        label_to_index = pretraining_dataset.label_to_index.copy()
    else:
        # Fresh label indexing in key order.
        label_to_index = {}
        iteration_number = 0
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)

    if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels))

    # Assign character indices; index 0 is reserved for padding.
    character_to_index = {}
    iteration_number = 0
    for character, count in character_count['all'].items():
        if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1
        character_to_index[character] = iteration_number
        iteration_number += 1

    if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format(list(token_count['train'].items())[0:10]))
    # Order mappings by index and build the inverse lookups.
    token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False)
    if self.verbose: print('token_to_index: {0}'.format(token_to_index))
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
    if self.verbose: print('index_to_token: {0}'.format(index_to_token))

    if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train']))
    label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False)
    if self.verbose: print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose: print('index_to_label: {0}'.format(index_to_label))

    character_to_index = utils.order_dictionary(character_to_index, 'value', reverse=False)
    index_to_character = utils.reverse_dictionary(character_to_index)
    if self.verbose: print('character_to_index: {0}'.format(character_to_index))
    if self.verbose: print('index_to_character: {0}'.format(index_to_character))

    if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
    if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))
    if self.verbose:
        # Print sequences of length 1 in train set
        for token_sequence, label_sequence in zip(tokens['train'], labels['train']):
            if len(label_sequence) == 1 and label_sequence[0] != 'O':
                print("{0}\t{1}".format(token_sequence[0], label_sequence[0]))

    # Persist the mappings on the instance.
    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.index_to_character = index_to_character
    self.character_to_index = character_to_index
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    if self.verbose: print("len(self.token_to_index): {0}".format(len(self.token_to_index)))
    if self.verbose: print("len(self.index_to_token): {0}".format(len(self.index_to_token)))
    self.tokens = tokens
    self.labels = labels

    # Convert every split to index form (token/label/character indices, with
    # per-sequence character padding).
    token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_filepaths.keys())

    self.token_indices = token_indices
    self.label_indices = label_indices
    self.character_indices_padded = character_indices_padded
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.characters = characters
    self.label_vector_indices = label_vector_indices

    # Sizes derived from the highest index in each mapping.
    self.number_of_classes = max(self.index_to_label.keys()) + 1
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    self.alphabet_size = max(self.index_to_character.keys()) + 1
    if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes))
    if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size))
    if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size))

    # unique_labels_of_interest is used to compute F1-scores.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')

    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])

    self.infrequent_token_indices = infrequent_token_indices

    if self.verbose: print('self.unique_labels_of_interest: {0}'.format(self.unique_labels_of_interest))
    if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format(self.unique_label_indices_of_interest))

    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))

    return token_to_vector
def load_dataset(self, dataset_filepaths, parameters):
    '''
    args:
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test'

    http://stackoverflow.com/questions/27416164/what-is-conll-data-format

    Parses the three splits, builds token/character/label vocabularies with
    reserved indices (UNK token = 0, padding character = 0), maps every
    sequence to index form, one-hot encodes the labels, and stores the
    results on self.
    '''
    # Optionally load the vocabulary of pretrained token embeddings; tokens
    # found there are kept even if absent from the training split.
    all_pretrained_tokens = None
    if parameters['token_pretrained_embedding_filepath'] != '':
        all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(parameters)
    if self.verbose: print("len(all_pretrained_tokens): {0}".format(len(all_pretrained_tokens)))
    remap_to_unk_count_threshold = 1
    #if ['train'] not in dataset_filepaths.keys(): raise ValueError('')
    UNK_TOKEN_INDEX = 0
    PADDING_CHARACTER_INDEX = 0
    self.UNK = 'UNK'
    self.unique_labels = []
    labels = {}
    tokens = {}
    characters = {}
    token_lengths = {}
    label_count = {}
    token_count = {}
    character_count = {}
    for dataset_type in ['train', 'valid', 'test']:
        labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], \
            character_count[dataset_type] = self._parse_dataset(dataset_filepaths[dataset_type],dataset_type)#,all_pretrained_tokens,token_count)
        if self.verbose: print("dataset_type: {0}".format(dataset_type))
        if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    # Aggregate per-split counts into an 'all' count.
    # NOTE(review): indexing a split's count for tokens seen only in another
    # split assumes _parse_dataset returns defaulting dicts — TODO confirm.
    token_count['all'] = {}  # utils.merge_dictionaries()
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token]

    for dataset_type in ['train', 'valid', 'test']:
        if self.verbose: print("dataset_type: {0}".format(dataset_type))
        if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    character_count['all'] = {}  # utils.merge_dictionaries()
    for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()):
        character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character]

    # Aggregate label counts (loop variable is named `character` but iterates labels).
    label_count['all'] = {}  # utils.merge_dictionaries()
    for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()):
        label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character]

    # Deterministic orderings: tokens/characters by count, labels by key.
    token_count['all'] = utils.order_dictionary(token_count['all'], 'value', reverse=True)
    #label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse = False)
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)
    label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse=False)
    character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True)
    if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))

    # Assign token indices; index 0 is reserved for UNK. Tokens absent from
    # train and from the pretrained vocabulary (in any of their surface,
    # lowercase, or digit-normalized forms) are remapped to UNK.
    token_to_index = {}
    token_to_index[self.UNK] = UNK_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    # if self.verbose: print("parameters['remove_unknown_tokens']: {0}".format(parameters['remove_unknown_tokens']))
    # if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        if iteration_number == UNK_TOKEN_INDEX: iteration_number += 1
        if parameters['remove_unknown_tokens'] == 1 and \
            token_count['train'][token] == 0 and \
            (all_pretrained_tokens == None or \
            token not in all_pretrained_tokens and \
            token.lower() not in all_pretrained_tokens and \
            re.sub('\d', '0', token.lower()) not in all_pretrained_tokens):#all( [x not in all_pretrained_tokens for x in [ token, token.lower(), re.sub('\d', '0', token.lower()) ]]):
            # if self.verbose: print("token: {0}".format(token))
            # if self.verbose: print("token.lower(): {0}".format(token.lower()))
            # if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format(re.sub('\d', '0', token.lower())))
            # assert(token not in )
            # assert(token.lower() not in all_pretrained_tokens)
            # assert(re.sub('\d', '0', token.lower()) not in all_pretrained_tokens)
            token_to_index[token] = UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose: print("number_of_unknown_tokens: {0}".format(number_of_unknown_tokens))
    # 0/0

    # Tokens appearing at most `remap_to_unk_count_threshold` times in train
    # are candidates for UNK-remapping during training.
    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
    if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

    # Label indexing in key order, collecting the label inventory.
    label_to_index = {}
    iteration_number = 0
    #for label, count in label_count['train'].items():
    for label, count in label_count['all'].items():
        label_to_index[label] = iteration_number
        iteration_number += 1
        self.unique_labels.append(label)
    #for label, count in label_count['train'].items():
    #    self.unique_labels.append(label)
    if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels))

    # Assign character indices; index 0 is reserved for padding.
    character_to_index = {}
    iteration_number = 0
    for character, count in character_count['all'].items():
        if iteration_number == PADDING_CHARACTER_INDEX: iteration_number += 1
        character_to_index[character] = iteration_number
        iteration_number += 1

    if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format(list(token_count['train'].items())[0:10]))
    # Order mappings by index and build inverse lookups.
    token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False)
    #if self.verbose: print('token_to_index[0:10]: {0}'.format(token_to_index[0:10]))
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remove_unknown_tokens'] == 1: index_to_token[UNK_TOKEN_INDEX] = self.UNK
    #if self.verbose: print('index_to_token[0:10]: {0}'.format(index_to_token[0:10]))

    #if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train']))
    label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False)
    if self.verbose: print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose: print('index_to_label: {0}'.format(index_to_label))

    index_to_character = utils.reverse_dictionary(character_to_index)
    if self.verbose: print('character_to_index: {0}'.format(character_to_index))
    if self.verbose: print('index_to_character: {0}'.format(index_to_character))

    if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
    if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

    # Map tokens and labels to their indices
    token_indices = {}
    label_indices = {}
    character_indices = {}
    character_indices_padded = {}
    for dataset_type in ['train', 'valid', 'test']:
        token_indices[dataset_type] = []
        characters[dataset_type] = []
        character_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        character_indices_padded[dataset_type] = []
        for token_sequence in tokens[dataset_type]:
            token_indices[dataset_type].append([token_to_index[token] for token in token_sequence])
            characters[dataset_type].append([list(token) for token in token_sequence])
            character_indices[dataset_type].append([[character_to_index[character] for character in token] for token in token_sequence])
            token_lengths[dataset_type].append([len(token) for token in token_sequence])
            # Pad each token's character indices to the longest token in the
            # same sequence.
            longest_token_length_in_sequence = max(token_lengths[dataset_type][-1])
            character_indices_padded[dataset_type].append([utils.pad_list(temp_token_indices, longest_token_length_in_sequence, PADDING_CHARACTER_INDEX) for temp_token_indices in character_indices[dataset_type][-1]])

        label_indices[dataset_type] = []
        for label_sequence in labels[dataset_type]:
            label_indices[dataset_type].append([label_to_index[label] for label in label_sequence])

    if self.verbose: print('token_lengths[\'train\'][0][0:10]: {0}'.format(token_lengths['train'][0][0:10]))
    if self.verbose: print('characters[\'train\'][0][0:10]: {0}'.format(characters['train'][0][0:10]))
    if self.verbose: print('token_indices[\'train\'][0:10]: {0}'.format(token_indices['train'][0:10]))
    if self.verbose: print('label_indices[\'train\'][0:10]: {0}'.format(label_indices['train'][0:10]))
    if self.verbose: print('character_indices[\'train\'][0][0:10]: {0}'.format(character_indices['train'][0][0:10]))
    if self.verbose: print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(character_indices_padded['train'][0][0:10]))

    # Vectorize the labels
    # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(max(index_to_label.keys()) + 1))
    label_vector_indices = {}
    for dataset_type in ['train', 'valid', 'test']:
        label_vector_indices[dataset_type] = []
        for label_indices_sequence in label_indices[dataset_type]:
            label_vector_indices[dataset_type].append(label_binarizer.transform(label_indices_sequence))
    if self.verbose: print('label_vector_indices[\'train\'][0:2]: {0}'.format(label_vector_indices['train'][0:2]))
    if self.verbose: print('len(label_vector_indices[\'train\']): {0}'.format(len(label_vector_indices['train'])))

    # Persist the mappings and converted data on the instance.
    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.token_indices = token_indices
    self.label_indices = label_indices
    self.character_indices_padded = character_indices_padded
    self.index_to_character = index_to_character
    self.character_to_index = character_to_index
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.characters = characters
    self.tokens = tokens
    self.labels = labels
    self.label_vector_indices = label_vector_indices
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    if self.verbose: print("len(self.token_to_index): {0}".format(len(self.token_to_index)))
    if self.verbose: print("len(self.index_to_token): {0}".format(len(self.index_to_token)))
    # Sizes derived from the highest index in each mapping.
    self.number_of_classes = max(self.index_to_label.keys()) + 1
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    self.alphabet_size = max(self.index_to_character.keys()) + 1
    if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes))
    if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size))
    if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size))

    # unique_labels_of_interest is used to compute F1-scores.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')

    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])

    self.infrequent_token_indices = infrequent_token_indices

    if self.verbose: print('self.unique_labels_of_interest: {0}'.format(self.unique_labels_of_interest))
    if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format(self.unique_label_indices_of_interest))

    print('Dataset formatting completed')
def load_dataset(self, word_index=None, tag_index=None, char_index=None, ner_index=None, prefix_index=None, suffix_index=None, fgen=True):
    '''
    Build or adopt the word/tag/char/prefix/suffix index maps, then convert
    every parsed sentence to id form and record vocabulary sizes on self.

    Args:
        word_index, tag_index, char_index, ner_index, prefix_index,
        suffix_index: pre-built index dicts, used only when fgen is False
            (ner_index is accepted for signature compatibility but unused here).
        fgen: when True, generate fresh index maps from the parsed corpus and
            pickle each one to disk; when False, reuse the supplied maps.
    '''
    start_time = time.time()
    if fgen:
        self.tokens_mapped_to_unk = []
        prefix_count, suffix_count, word_count, tag_count, char_count, self.sentence_list = self._parse_dataset(fgen)
        # Deterministic orderings so generated indices are reproducible.
        word_count = utils.order_dictionary(word_count, 'value_key', reverse=True)
        tag_count = utils.order_dictionary(tag_count, 'key', reverse=False)
        char_count = utils.order_dictionary(char_count, 'value', reverse=True)
        prefix_count = utils.order_dictionary(prefix_count, 'value_key', reverse=True)
        suffix_count = utils.order_dictionary(suffix_count, 'value_key', reverse=True)

        # Each map reserves id 0 for "-padding-" and the last id for '-UNK-'
        # (except tags, which have neither).
        # FIX: the original passed `open(..., 'wb')` directly to pickle.dump,
        # leaking the file handle; use `with` so every file is closed.
        pid = 0
        self.prefix_index["-padding-"] = pid
        pid += 1
        for pre, count in prefix_count.items():
            self.prefix_index[pre] = pid
            pid += 1
        self.prefix_index['-UNK-'] = pid
        with open("prefix_index", 'wb') as f:
            pickle.dump(self.prefix_index, f)

        sid = 0
        self.suffix_index["-padding-"] = sid
        sid += 1
        for suf, count in suffix_count.items():
            self.suffix_index[suf] = sid
            sid += 1
        self.suffix_index['-UNK-'] = sid
        with open("suffix_index", 'wb') as f:
            pickle.dump(self.suffix_index, f)

        wid = 0
        self.word_index["-padding-"] = wid
        wid += 1
        for word, count in word_count.items():
            self.word_index[word] = wid
            wid += 1
        self.word_index['-UNK-'] = wid
        with open("word_index", 'wb') as f:
            pickle.dump(self.word_index, f)

        tid = 0
        #self.tag_index["-padding-"] = tid
        #tid += 1
        for tag, count in tag_count.items():
            self.tag_index[tag] = tid
            tid += 1
        with open("tag_index", 'wb') as f:
            pickle.dump(self.tag_index, f)

        cid = 0
        self.char_index["-padding-"] = cid
        cid += 1
        for char, count in char_count.items():
            self.char_index[char] = cid
            cid += 1
        self.char_index['-UNK-'] = cid
        with open("char_index", 'wb') as f:
            pickle.dump(self.char_index, f)
    else:
        # Adopt the caller-supplied index maps and just re-parse sentences.
        self.word_index = word_index
        self.tag_index = tag_index
        self.char_index = char_index
        self.prefix_index = prefix_index
        self.suffix_index = suffix_index
        _, _, _, _, _, self.sentence_list = self._parse_dataset(fgen)

    # Convert every sentence to id lists (and hand-crafted features when the
    # character model is disabled).
    for name, sent_list in self.sentence_list.items():
        for sent in sent_list:
            sent.gen_id_list(self.word_index, self.tag_index, self.char_index)
            if not self.use_char:
                sent.gen_sent_features(self.word_index, self.tag_index, self.prefix_index, self.suffix_index)

    self.number_of_classes = len(self.tag_index)
    self.vocabulary_size = len(self.word_index)
    if self.char_index is not None:
        self.alphabet_size = len(self.char_index)
    if not self.use_char:
        self.prefix_size = len(self.prefix_index)
        self.suffix_size = len(self.suffix_index)

    # Inverse lookups for decoding ids back to symbols.
    if self.char_index is not None:
        self.char_map = utils.reverse_dictionary(self.char_index)
    self.word_map = utils.reverse_dictionary(self.word_index)
    self.tag_map = utils.reverse_dictionary(self.tag_index)

    elapsed_time = time.time() - start_time
    print('loading dataset done ({0:.2f} seconds)'.format(elapsed_time))
def load_dataset(self, word_index=None, tag_index=None, char_index=None, ner_index=None, prefix_index=None, suffix_index=None, fgen=True):
    '''
    Build or adopt the word/char/ner/prefix/suffix index maps for the NER
    model, convert every parsed sentence to id form, and record vocabulary
    sizes on self.

    Args:
        word_index, tag_index, char_index, ner_index, prefix_index,
        suffix_index: pre-built index dicts, used only when fgen is False.
        fgen: when True, generate fresh index maps from the parsed corpus and
            pickle each one under "lstm_ner_models"; when False, reuse the
            supplied maps.
    '''
    start_time = time.time()
    if fgen:
        self.tokens_mapped_to_unk = []
        prefix_count, suffix_count, word_count, tag_count, char_count, ner_count, self.sentence_list = self._parse_dataset(fgen)
        # Deterministic orderings so generated indices are reproducible.
        word_count = utils.order_dictionary(word_count, 'value_key', reverse=True)
        tag_count = utils.order_dictionary(tag_count, 'key', reverse=False)
        char_count = utils.order_dictionary(char_count, 'value', reverse=True)
        prefix_count = utils.order_dictionary(prefix_count, 'value_key', reverse=True)
        suffix_count = utils.order_dictionary(suffix_count, 'value_key', reverse=True)

        # Each map reserves id 0 for "-padding-" and the last id for '-UNK-'.
        # FIX: the original passed `open(..., 'wb')` directly to pickle.dump,
        # leaking the file handle; use `with` so every file is closed.
        pid = 0
        self.prefix_index["-padding-"] = pid
        pid += 1
        for pre, count in prefix_count.items():
            self.prefix_index[pre] = pid
            pid += 1
        self.prefix_index['-UNK-'] = pid
        with open(os.path.join("lstm_ner_models", "ner_prefix_index"), 'wb') as f:
            pickle.dump(self.prefix_index, f)

        sid = 0
        self.suffix_index["-padding-"] = sid
        sid += 1
        for suf, count in suffix_count.items():
            self.suffix_index[suf] = sid
            sid += 1
        self.suffix_index['-UNK-'] = sid
        with open(os.path.join("lstm_ner_models", "ner_suffix_index"), 'wb') as f:
            pickle.dump(self.suffix_index, f)

        wid = 0
        self.word_index["-padding-"] = wid
        wid += 1
        for word, count in word_count.items():
            self.word_index[word] = wid
            wid += 1
            # Words seen at most once are tracked as rare for UNK handling.
            if count <= 1:
                self.rare_words.append(word)
        self.word_index['-UNK-'] = wid
        with open(os.path.join("lstm_ner_models", "ner_word_index"), 'wb') as f:
            pickle.dump(self.word_index, f)

        # Ensure that both B- and I- versions exist for each label,
        # even if one variant never occurs in the corpus.
        labels_without_bio = set()
        for label, count in ner_count.items():
            new_label = utils.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)
        prefixes = ['B-', 'I-']
        nid = 0
        self.ner_index['O'] = nid
        nid += 1
        for label in labels_without_bio:
            if label == 'O':
                continue
            for prefix in prefixes:
                l = prefix + label
                self.ner_index[l] = nid
                nid += 1
        with open(os.path.join("lstm_ner_models", "ner_index"), 'wb') as f:
            pickle.dump(self.ner_index, f)

        cid = 0
        self.char_index["-padding-"] = cid
        cid += 1
        for char, count in char_count.items():
            self.char_index[char] = cid
            cid += 1
        self.char_index['-UNK-'] = cid
        with open(os.path.join("lstm_ner_models", "ner_char_index"), 'wb') as f:
            pickle.dump(self.char_index, f)
    else:
        # Adopt the caller-supplied index maps and just re-parse sentences.
        self.word_index = word_index
        self.tag_index = tag_index
        self.char_index = char_index
        self.ner_index = ner_index
        self.prefix_index = prefix_index
        self.suffix_index = suffix_index
        _, _, _, _, _, _, self.sentence_list = self._parse_dataset(fgen)

    # Convert every sentence to id lists (and hand-crafted features when the
    # character model is disabled).
    for name, sent_list in self.sentence_list.items():
        for sent in sent_list:
            sent.gen_id_list(self.word_index, self.char_index, self.ner_index, None)
            if not self.use_char:
                sent.gen_sent_features(self.word_index, prefix_map=self.prefix_index, suffix_map=self.suffix_index)

    self.number_of_classes = len(self.ner_index)
    self.vocabulary_size = len(self.word_index)
    if self.char_index != None:
        self.alphabet_size = len(self.char_index)
    else:
        self.alphabet_size = 0
    #self.pos_classes = len(self.tag_index)
    # NOTE(review): fixed sizes for BIO-tag (O/B/I) and entity-type heads —
    # confirm these match the label inventory built above.
    self.number_of_boi = 3
    self.number_of_type = 4
    self.prefix_size = len(self.prefix_index)
    self.suffix_size = len(self.suffix_index)

    # Inverse lookups for decoding ids back to symbols.
    if self.char_index != None:
        self.char_map = utils.reverse_dictionary(self.char_index)
    self.word_map = utils.reverse_dictionary(self.word_index)
    #self.tag_map = utils.reverse_dictionary(self.tag_index)
    self.ner_map = utils.reverse_dictionary(self.ner_index)

    elapsed_time = time.time() - start_time
    print('loading dataset done ({0:.2f} seconds)'.format(elapsed_time))
def load_dataset(self, dataset_filepaths, parameters):
    '''
    Parse the train/valid/test CoNLL files, build token/character/label
    vocabularies, map every sequence to index lists, build one-hot label
    vectors, and store everything on self for the model.

    dataset_filepaths : dictionary with keys 'train', 'valid', 'test'
    parameters : configuration dict (use_pretrained_model,
        remap_unknown_tokens_to_unk, language, ...).
    '''
    start_time = time.time()
    pprint('Load dataset... ')
    # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
    # and that token embeddings that are learned in the pretrained model are loaded properly.
    all_tokens_in_pretraining_dataset = []
    if parameters['use_pretrained_model']:
        pretrained_model_folder = os.path.dirname(parameters['pretrained_model_checkpoint_filepath'])
        # NOTE(review): this open() has no matching close() — the pickle
        # file handle is leaked.
        pretraining_dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
        all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()
    self.vocab_embeddings = all_tokens_in_pretraining_dataset

    # Special index layout: 0 = UNK, 1 = padding (for both tokens and chars).
    remap_to_unk_count_threshold = 1
    self.PADDING_CHARACTER_INDEX = 1
    self.PADDING_TOKEN_INDEX = 1
    self.UNK_TOKEN_INDEX = 0
    self.UNK_CHARACTER_INDEX = 0
    self.tokens_mapped_to_unk = []
    self.UNK = '<UNK>'
    self.PAD = '<PAD>'
    self.unique_labels = []
    labels = {}
    tokens = {}
    characters = {}
    token_lengths = {}
    sequence_lengths = {}
    longest_token_length_in_sequence = {}
    label_count = {}
    token_count = {}
    character_count = {}

    # Parse each split; _parse_dataset returns per-split sequences and counts.
    for dataset_type in ['train', 'valid', 'test']:
        labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
            = self._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'],
                                  parameters['data_to_use'] if 'data_to_use' in parameters else None)
        if self.verbose: print("dataset_type: {0}".format(dataset_type))
        if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    # Aggregate counts over all splits.
    # NOTE(review): indexing token_count['train'][token] for tokens that only
    # appear in valid/test assumes the counts are defaultdicts — confirm in
    # _parse_dataset.
    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token]
    for dataset_type in dataset_filepaths.keys():
        if self.verbose: print("dataset_type: {0}".format(dataset_type))
        if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))
    character_count['all'] = {}
    for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()):
        character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character]
    label_count['all'] = {}
    for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()):
        label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character]

    # Deterministic ordering before index assignment.
    token_count['all'] = utils.order_dictionary(token_count['all'], 'value_key', reverse=True)
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)
    character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True)
    if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))

    # Build token -> index, skipping the reserved UNK/PAD slots.
    token_to_index = {}
    token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
    token_to_index[self.PAD] = self.PADDING_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk']))
    if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1
        if iteration_number == self.PADDING_TOKEN_INDEX: iteration_number += 1
        if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, self.vocab_embeddings, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
            # Token unseen in train and absent from pretrained embeddings —
            # it still gets its own index here (the remap-to-UNK code below
            # is commented out).
            if self.verbose: print("token: {0}".format(token))
            if self.verbose: print("token.lower(): {0}".format(token.lower()))
            if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format(re.sub('\d', '0', token.lower())))
            token_to_index[token] = iteration_number
            iteration_number += 1
            #if parameters['embedding_type'] == 'fasttext':
            #    token_to_index[token] = iteration_number
            #    iteration_number += 1
            #else:
            #    token_to_index[token] = self.UNK_TOKEN_INDEX
            #    number_of_unknown_tokens += 1
            #    self.tokens_mapped_to_unk.append(token)
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose: print("number_of_unknown_tokens: {0}".format(number_of_unknown_tokens))

    # Tokens seen at most remap_to_unk_count_threshold times in train.
    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
    if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)
    if parameters['use_pretrained_model']:
        self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys()))
        # Make sure labels are compatible with the pretraining dataset.
        for label in label_count['all']:
            if label not in pretraining_dataset.label_to_index:
                raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) +
                                     "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels)))
        label_to_index = pretraining_dataset.label_to_index.copy()
    else:
        label_to_index = {}
        iteration_number = 0
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)
    # Padding positions reuse the 'O' label index.
    self.PADDING_LABEL_INDEX = label_to_index['O']
    if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels))

    # Character -> index (pretrained model supplies its own mapping).
    character_to_index = {}
    character_to_index[self.UNK] = self.UNK_CHARACTER_INDEX
    if parameters['use_pretrained_model']:
        # TODO: initialize character_to_index from saved pickle
        character_to_index = pretraining_dataset.character_to_index.copy()
    else:
        character_to_index[self.PAD] = self.PADDING_CHARACTER_INDEX
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.UNK_CHARACTER_INDEX: iteration_number += 1
            if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

    if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format(list(token_count['train'].items())[0:10]))
    token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False)
    if self.verbose: print('token_to_index: {0}'.format(token_to_index))
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remap_unknown_tokens_to_unk'] == 1:
        index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
    if self.verbose: print('index_to_token: {0}'.format(index_to_token))
    if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train']))
    label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False)
    if self.verbose: print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose: print('index_to_label: {0}'.format(index_to_label))
    character_to_index = utils.order_dictionary(character_to_index, 'value', reverse=False)
    index_to_character = utils.reverse_dictionary(character_to_index)
    if self.verbose: print('character_to_index: {0}'.format(character_to_index))
    if self.verbose: print('index_to_character: {0}'.format(index_to_character))
    if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
    if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))
    if self.verbose:
        # Print sequences of length 1 in train set
        for token_sequence, label_sequence in zip(tokens['train'], labels['train']):
            if len(label_sequence) == 1 and label_sequence[0] != 'O':
                print("{0}\t{1}".format(token_sequence[0], label_sequence[0]))

    # Map tokens and labels to their indices
    token_indices = {}
    label_indices = {}
    character_indices = {}
    #character_indices_padded = {}
    for dataset_type in dataset_filepaths.keys():
        token_indices[dataset_type] = []
        characters[dataset_type] = []
        character_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        sequence_lengths[dataset_type] = []
        longest_token_length_in_sequence[dataset_type] = []
        #character_indices_padded[dataset_type] = []
        for token_sequence in tokens[dataset_type]:
            token_indices[dataset_type].append([token_to_index[token] for token in token_sequence])
            characters[dataset_type].append([list(token) for token in token_sequence])
            character_indices[dataset_type].append([[character_to_index[character] for character in token] for token in token_sequence])
            token_lengths[dataset_type].append([len(token) for token in token_sequence])
            sequence_lengths[dataset_type].append(len(token_sequence))
            longest_token_length_in_sequence[dataset_type].append(max(token_lengths[dataset_type][-1]))
            #character_indices_padded[dataset_type].append([utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
            #                                               for temp_token_indices in character_indices[dataset_type][-1]])
        label_indices[dataset_type] = []
        for label_sequence in labels[dataset_type]:
            label_indices[dataset_type].append([label_to_index[label] for label in label_sequence])

    if self.verbose: print('token_lengths[\'train\'][0][0:10]: {0}'.format(token_lengths['train'][0][0:10]))
    if self.verbose: print('characters[\'train\'][0][0:10]: {0}'.format(characters['train'][0][0:10]))
    if self.verbose: print('token_indices[\'train\'][0:10]: {0}'.format(token_indices['train'][0:10]))
    if self.verbose: print('label_indices[\'train\'][0:10]: {0}'.format(label_indices['train'][0:10]))
    if self.verbose: print('character_indices[\'train\'][0][0:10]: {0}'.format(character_indices['train'][0][0:10]))
    #if self.verbose: print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(character_indices_padded['train'][0][0:10]))

    # One-hot label vectors, built by hand; padding vector is one-hot at 'O'.
    label_vector_indices = {}
    tmp_vector = [0] * len(self.unique_labels)
    tmp_vector[label_to_index["O"]] = 1
    self.PADDING_LABEL_VECTOR = tmp_vector
    for dataset_type in dataset_filepaths.keys():
        label_vector_indices[dataset_type] = []
        for label_indices_sequence in label_indices[dataset_type]:
            vector_sequence = []
            for indice in label_indices_sequence:
                vector = [0] * len(self.unique_labels)
                vector[indice] = 1
                vector_sequence.append(vector)
            label_vector_indices[dataset_type].append(vector_sequence)
    if self.verbose: print('label_vector_indices[\'train\'][0:2]: {0}'.format(label_vector_indices['train'][0:2]))
    if self.verbose: print('len(label_vector_indices[\'train\']): {0}'.format(len(label_vector_indices['train'])))

    # Publish everything on self for the model.
    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.token_indices = token_indices
    self.label_indices = label_indices
    #self.character_indices_padded = character_indices_padded
    self.index_to_character = index_to_character
    self.character_to_index = character_to_index
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.sequence_lengths = sequence_lengths
    self.longest_token_length_in_sequence = longest_token_length_in_sequence
    self.characters = characters
    self.tokens = tokens
    self.labels = labels
    self.label_vector_indices = label_vector_indices
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    if self.verbose: print("len(self.token_to_index): {0}".format(len(self.token_to_index)))
    if self.verbose: print("len(self.index_to_token): {0}".format(len(self.index_to_token)))
    self.number_of_classes = len(self.unique_labels)
    # Vocabulary size is floored at 100000 — presumably to reserve room for
    # tokens added at deploy time; TODO confirm intent.
    self.vocabulary_size = len(self.index_to_token) if len(self.index_to_token) > 100000 else 100000
    self.alphabet_size = len(self.character_to_index)
    if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes))
    if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size))
    if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size))

    # unique_labels_of_interest is used to compute F1-scores.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')
    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])
    self.infrequent_token_indices = infrequent_token_indices
    if self.verbose: print('self.unique_labels_of_interest: {0}'.format(self.unique_labels_of_interest))
    if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format(self.unique_label_indices_of_interest))
    print(self.label_to_index)
    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))
def load_dataset(self, dataset_filepaths, parameters):
    '''
    Parse the train/valid/test/deploy CoNLL files, build token/character/label
    vocabularies (optionally with corrector and POS label channels), map every
    sequence to index lists, binarize labels, and store everything on self.

    dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
    parameters : configuration dict (use_pretrained_model, use_corrector,
        include_pos, add_class, tagging_format, ...).
    '''
    start_time = time.time()
    print('Load dataset... ', end='', flush=True)
    all_pretrained_tokens = []
    if parameters['token_pretrained_embedding_filepath'] != '':
        all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(parameters)
    if self.verbose: print("len(all_pretrained_tokens): {0}".format(len(all_pretrained_tokens)))
    all_tokens_in_pretraining_dataset = []
    if parameters['use_pretrained_model']:
        # NOTE(review): this open() has no matching close() — the pickle
        # file handle is leaked.
        pretraining_dataset = pickle.load(open(os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb'))
        all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()

    # Special index layout: token UNK = 0, character padding = 0.
    remap_to_unk_count_threshold = 1
    self.UNK_TOKEN_INDEX = 0
    self.PADDING_CHARACTER_INDEX = 0
    self.tokens_mapped_to_unk = []
    self.UNK = 'UNK'
    self.unique_labels = []
    labels = {}
    tokens = {}
    # POS structures only exist inside the corrector branch (the parser only
    # returns them when use_corrector is on) — NOTE(review): confirm nesting.
    if parameters['use_corrector']:
        labels_corrector = {}
        label_corrector_count = {}
        self.unique_labels_corrector = []
        if parameters['include_pos']:
            labels_pos = {}
            label_pos_count = {}
            self.unique_labels_pos = []
    characters = {}
    token_lengths = {}
    label_count = {}
    token_count = {}
    character_count = {}

    # Parse each split; the tuple returned by _parse_dataset grows with the
    # corrector/POS options.
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        # print("what am i getting?? {:s}".format(str(dataset_filepaths.get(dataset_type, None))))
        if parameters['use_corrector']:
            if parameters['include_pos']:
                labels_pos[dataset_type], labels_corrector[dataset_type], labels[dataset_type], \
                tokens[dataset_type], token_count[dataset_type], label_pos_count[dataset_type], \
                label_corrector_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                    = self._parse_dataset(dataset_filepaths.get(dataset_type, None), use_corrector=True, include_pos=True, tagging_format=parameters['tagging_format'])
            else:
                labels_corrector[dataset_type], labels[dataset_type], tokens[dataset_type], \
                token_count[dataset_type], label_corrector_count[dataset_type], label_count[dataset_type], \
                character_count[dataset_type] \
                    = self._parse_dataset(dataset_filepaths.get(dataset_type, None), use_corrector=True, tagging_format=parameters['tagging_format'])
        else:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], \
            character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None), tagging_format=parameters['tagging_format'])
        if self.verbose: print("len(token_count[{1}]): {0}".format(len(token_count[dataset_type]), dataset_type))
    # sys.exit(0)

    # Aggregate counts over the four splits.
    # NOTE(review): indexing e.g. token_count['train'][token] for tokens only
    # seen in other splits assumes the counts are defaultdicts — confirm.
    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token]
    if self.verbose: print("len(token_count[all]): {0}".format(len(token_count['all'])))
    for dataset_type in dataset_filepaths.keys():
        if self.verbose: print("len(token_count[{1}]): {0}".format(len(token_count[dataset_type]), dataset_type))
    character_count['all'] = {}
    for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()):
        character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character]
    label_count['all'] = {}
    for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()):
        label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + \
                                        label_count['test'][character] + label_count['deploy'][character]
    if parameters['use_corrector']:
        label_corrector_count['all'] = {}
        for label in list(label_corrector_count['train'].keys()) + list(label_corrector_count['valid'].keys()) + list(label_corrector_count['test'].keys()) + list(label_corrector_count['deploy'].keys()):
            label_corrector_count['all'][label] = label_corrector_count['train'][label] + label_corrector_count['valid'][label] + \
                                                  label_corrector_count['test'][label] + label_corrector_count['deploy'][label]
        label_corrector_count['all'] = utils.order_dictionary(label_corrector_count['all'], 'key', reverse=False)
        if parameters['include_pos']:
            label_pos_count['all'] = {}
            for label in list(label_pos_count['train'].keys()) + list(label_pos_count['valid'].keys()) + list(label_pos_count['test'].keys()) + list(label_pos_count['deploy'].keys()):
                label_pos_count['all'][label] = label_pos_count['train'][label] + label_pos_count['valid'][label] + \
                                                label_pos_count['test'][label] + label_pos_count['deploy'][label]
            label_pos_count['all'] = utils.order_dictionary(label_pos_count['all'], 'key', reverse=False)

    # Deterministic ordering before index assignment.
    token_count['all'] = utils.order_dictionary(token_count['all'], 'value_key', reverse=True)
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)
    character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True)
    if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))

    # Build token -> index; unknown-ish tokens are remapped to UNK here.
    token_to_index = {}
    token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk']))
    if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1
        if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, all_pretrained_tokens, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
            token_to_index[token] = self.UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
            self.tokens_mapped_to_unk.append(token)
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose: print("number_of_unknown_tokens: {0}".format(number_of_unknown_tokens))

    # Tokens seen at most remap_to_unk_count_threshold times in train.
    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
    if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

    # Ensure that both B- and I- versions exist for each label
    # (plus E-/S- under the bioes tagging format), even if unobserved.
    labels_without_bio = set()
    for label in label_count['all'].keys():
        new_label = utils_nlp.remove_bio_from_label_name(label)
        labels_without_bio.add(new_label)
    for label in labels_without_bio:
        if label == 'O':
            continue
        if parameters['tagging_format'] == 'bioes':
            prefixes = ['B-', 'I-', 'E-', 'S-']
        else:
            prefixes = ['B-', 'I-']
        for prefix in prefixes:
            l = prefix + label
            if l not in label_count['all']:
                label_count['all'][l] = 0
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)

    # Label -> index: reuse the pretraining mapping, extend it (add_class),
    # or build one from scratch.
    if parameters['use_pretrained_model'] and not parameters['add_class']:
        self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys()))
        # Make sure labels are compatible with the pretraining dataset.
        for label in label_count['all']:
            if label not in pretraining_dataset.label_to_index:
                raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) +
                                     "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels)))
        label_to_index = pretraining_dataset.label_to_index.copy()
    elif parameters['use_pretrained_model'] and parameters['add_class']:
        # make sure that the added labels are mapped to the end of the dectionary
        print('Adding new label-index pair to label_to_index dictionary')
        old_label_to_index = pretraining_dataset.label_to_index.copy()
        for label, count in label_count['all'].items():
            if label not in old_label_to_index.keys():
                old_label_to_index[label] = len(old_label_to_index.keys())
        label_to_index = old_label_to_index.copy()
        self.unique_labels = list(label_to_index.keys())
    else:
        label_to_index = {}
        iteration_number = 0
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)
    if parameters['use_corrector']:
        label_corrector_to_index = {}
        self.unique_labels_corrector = list(label_corrector_count['all'].keys())
        for n, label in enumerate(self.unique_labels_corrector):
            label_corrector_to_index[label] = n
        if parameters['include_pos']:
            label_pos_to_index = {}
            self.unique_labels_pos = list(label_pos_count['all'].keys())
            for n, pos in enumerate(self.unique_labels_pos):
                label_pos_to_index[pos] = n
    if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels))

    # Character -> index, skipping the reserved padding slot.
    character_to_index = {}
    iteration_number = 0
    for character, count in character_count['all'].items():
        if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1
        character_to_index[character] = iteration_number
        iteration_number += 1

    if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format(list(token_count['train'].items())[0:10]))
    token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False)
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remap_unknown_tokens_to_unk'] == 1:
        index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
    if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train']))
    label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False)
    if self.verbose: print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose: print('index_to_label: {0}'.format(index_to_label))
    if parameters['use_corrector']:
        label_corrector_to_index = utils.order_dictionary(label_corrector_to_index, 'value', reverse=False)
        index_to_label_corrector = utils.reverse_dictionary(label_corrector_to_index)
        if parameters['include_pos']:
            label_pos_to_index = utils.order_dictionary(label_pos_to_index, 'value', reverse=False)
            index_to_label_pos = utils.reverse_dictionary(label_pos_to_index)
    character_to_index = utils.order_dictionary(character_to_index, 'value', reverse=False)
    index_to_character = utils.reverse_dictionary(character_to_index)
    if self.verbose: print('character_to_index: {0}'.format(character_to_index))
    if self.verbose: print('index_to_character: {0}'.format(index_to_character))
    if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
    if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))
    if self.verbose:
        # Print sequences of length 1 in train set
        for token_sequence, label_sequence in zip(tokens['train'], labels['train']):
            if len(label_sequence) == 1 and label_sequence[0] != 'O':
                print("{0}\t{1}".format(token_sequence[0], label_sequence[0]))

    # Map tokens/labels/characters to index lists; character sequences are
    # padded per sentence to the longest token in that sentence.
    token_indices = {}
    label_indices = {}
    if parameters['use_corrector']:
        label_indices_corrector = {}
        if parameters['include_pos']:
            label_indices_pos = {}
    character_indices = {}
    character_indices_padded = {}
    for dataset_type in dataset_filepaths.keys():
        # print("dataset_type: {:s}".format(dataset_type))
        token_indices[dataset_type] = []
        characters[dataset_type] = []
        character_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        character_indices_padded[dataset_type] = []
        for token_sequence in tokens[dataset_type]:
            token_indices[dataset_type].append([token_to_index[token] for token in token_sequence])
            characters[dataset_type].append([list(token) for token in token_sequence])
            character_indices[dataset_type].append([[character_to_index[character] for character in token] for token in token_sequence])
            token_lengths[dataset_type].append([len(token) for token in token_sequence])
            longest_token_length_in_sequence = max(token_lengths[dataset_type][-1])
            character_indices_padded[dataset_type].append([utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
                                                           for temp_token_indices in character_indices[dataset_type][-1]])
        label_indices[dataset_type] = []
        for label_sequence in labels[dataset_type]:
            label_indices[dataset_type].append([label_to_index[label] for label in label_sequence])
        if parameters['use_corrector']:
            label_indices_corrector[dataset_type] = []
            for label_sequence_corrector in labels_corrector[dataset_type]:
                label_indices_corrector[dataset_type].append([label_corrector_to_index[label] for label in label_sequence_corrector])
            if parameters['include_pos']:
                label_indices_pos[dataset_type] = []
                for label_sequence_pos in labels_pos[dataset_type]:
                    label_indices_pos[dataset_type].append([label_pos_to_index[label] for label in label_sequence_pos])

    if self.verbose: print('token_lengths[\'train\'][0][0:10]: {0}'.format(token_lengths['train'][0][0:10]))
    if self.verbose: print('characters[\'train\'][0][0:10]: {0}'.format(characters['train'][0][0:10]))
    if self.verbose: print('token_indices[\'train\'][0:10]: {0}'.format(token_indices['train'][0:10]))
    if self.verbose: print('label_indices[\'train\'][0:10]: {0}'.format(label_indices['train'][0:10]))
    if self.verbose: print('character_indices[\'train\'][0][0:10]: {0}'.format(character_indices['train'][0][0:10]))
    if self.verbose: print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(character_indices_padded['train'][0][0:10]))

    # One-hot encode the label index sequences with sklearn.
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(max(index_to_label.keys()) + 1))
    label_vector_indices = {}
    for dataset_type in dataset_filepaths.keys():
        label_vector_indices[dataset_type] = []
        for label_indices_sequence in label_indices[dataset_type]:
            label_vector_indices[dataset_type].append(label_binarizer.transform(label_indices_sequence))
    if parameters['use_corrector']:
        label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer()
        label_binarizer_corrector.fit(range(max(index_to_label_corrector.keys()) + 1))
        label_corrector_vector_indices = {}
        for dataset_type in dataset_filepaths.keys():
            label_corrector_vector_indices[dataset_type] = []
            for label_indices_sequence in label_indices_corrector[dataset_type]:
                label_corrector_vector_indices[dataset_type].append(label_binarizer_corrector.transform(label_indices_sequence))
        if parameters['include_pos']:
            label_binarizer_pos = sklearn.preprocessing.LabelBinarizer()
            label_binarizer_pos.fit(range(max(index_to_label_pos.keys()) + 1))
            label_pos_vector_indices = {}
            for dataset_type in dataset_filepaths.keys():
                label_pos_vector_indices[dataset_type] = []
                for label_indices_sequence in label_indices_pos[dataset_type]:
                    label_pos_vector_indices[dataset_type].append(label_binarizer_pos.transform(label_indices_sequence))
    if self.verbose: print('label_vector_indices[\'train\'][0:2]: {0}'.format(label_vector_indices['train'][0:2]))
    if self.verbose: print('len(label_vector_indices[\'train\']): {0}'.format(len(label_vector_indices['train'])))

    # Publish everything on self for the model.
    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.token_indices = token_indices
    self.label_indices = label_indices
    self.character_indices_padded = character_indices_padded
    self.index_to_character = index_to_character
    self.character_to_index = character_to_index
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.characters = characters
    self.tokens = tokens
    self.labels = labels
    self.label_vector_indices = label_vector_indices
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    if parameters['use_corrector']:
        self.index_to_label_corrector = index_to_label_corrector
        self.label_corrector_to_index = label_corrector_to_index
        self.label_indices_corrector = label_indices_corrector
        self.label_corrector_vector_indices = label_corrector_vector_indices
        if parameters['include_pos']:
            self.index_to_label_pos = index_to_label_pos
            self.label_pos_to_index = label_pos_to_index
            self.label_indices_pos = label_indices_pos
            self.label_pos_vector_indices = label_pos_vector_indices
    if self.verbose: print("len(self.token_to_index): {0}".format(len(self.token_to_index)))
    if self.verbose: print("len(self.index_to_token): {0}".format(len(self.index_to_token)))

    # When classes were added on top of a pretrained model, shrink the class
    # count by the number of synthetic prefixed labels.
    # NOTE(review): the magic 8/4/2 offsets and the >100 threshold appear to
    # encode one added entity type per tagging scheme — TODO confirm.
    if parameters['add_class'] and parameters['tagging_format'] == 'bioes' and len(self.index_to_label) > 100:
        self.number_of_classes = max(self.index_to_label.keys()) + 1 - 8
    elif parameters['add_class'] and parameters['tagging_format'] == 'bioes':
        print('here')
        self.number_of_classes = max(self.index_to_label.keys()) + 1 - 4
    elif parameters['add_class'] and parameters['tagging_format'] == 'bio':
        print('here2')
        self.number_of_classes = max(self.index_to_label.keys()) + 1 - 2
    else:
        self.number_of_classes = max(self.index_to_label.keys()) + 1  # 1 is for O label
    print('max(self.index_to_label.keys()) : {:d}'.format(max(self.index_to_label.keys())))
    print(self.index_to_label.keys())
    print(self.number_of_classes)
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    self.alphabet_size = max(self.index_to_character.keys()) + 1
    if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes))
    if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size))
    if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size))

    # unique_labels_of_interest (every label except 'O') is used for F1.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')
    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])
    self.infrequent_token_indices = infrequent_token_indices
    if self.verbose: print('self.unique_labels_of_interest: {0}'.format(self.unique_labels_of_interest))
    if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format(self.unique_label_indices_of_interest))
    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))
def load_dataset(self, word2id=None, tag2id=None, prefix2id=None, suffix2id=None, fgen=True):
    '''
    Load the dataset and build (or reuse) the word/tag/prefix/suffix vocabularies.

    When ``fgen`` is True the vocabularies are generated from the parsed dataset
    and pickled into ``self.data_output``; otherwise the mappings passed as
    arguments are used as-is and the dataset is only re-parsed.

    word2id, tag2id, prefix2id, suffix2id : precomputed mappings, used only
        when ``fgen`` is False.
    fgen : whether to (re)generate and persist the vocabularies.
    '''
    start_time = time.time()
    if fgen:
        self.tokens_mapped_to_unk = []
        prefix_count, suffix_count, word_count, tag_count, self.sentence_list = \
            self._parse_dataset(fgen)
        # Frequency-descending order so low ids correspond to frequent items.
        word_count = utils.order_dictionary(word_count, 'value_key', reverse=True)
        tag_count = utils.order_dictionary(tag_count, 'key', reverse=False)
        prefix_count = utils.order_dictionary(prefix_count, 'value_key', reverse=True)
        suffix_count = utils.order_dictionary(suffix_count, 'value_key', reverse=True)

        # Each vocabulary: index 0 = '-padding-', then count order, last = '-UNK-'.
        self._build_vocab(self.prefix2id, prefix_count, "ner_prefix2id")
        self._build_vocab(self.suffix2id, suffix_count, "ner_suffix2id")
        self._build_vocab(self.word2id, word_count, "ner_word2id")

        # Ensure that both B- and I- versions exist for each label.
        labels_without_bio = set()
        for label in tag_count:
            labels_without_bio.add(utils.remove_bio_from_label_name(label))
        nid = 0
        self.tag2id['O'] = nid
        nid += 1
        for label in labels_without_bio:
            if label == 'O':
                continue
            for prefix in ['B-', 'I-']:
                self.tag2id[prefix + label] = nid
                nid += 1
        self._dump_mapping(self.tag2id, "ner_tag2id")
    else:
        # Reuse the vocabularies supplied by the caller.
        self.word2id = word2id
        self.tag2id = tag2id
        self.prefix2id = prefix2id
        self.suffix2id = suffix2id
        _, _, _, _, self.sentence_list = self._parse_dataset(fgen)
    # Convert every parsed sentence to id form and attach features.
    for name, sent_list in self.sentence_list.items():
        for sent in sent_list:
            sent.gen_id_list(self.word2id, self.tag2id)
            sent.gen_sent_features(self.word2id, self.prefix2id, self.suffix2id)
    self.number_of_classes = len(self.tag2id)
    self.vocabulary_size = len(self.word2id)
    self.prefix_size = len(self.prefix2id)
    self.suffix_size = len(self.suffix2id)
    self.id2word = utils.reverse_dictionary(self.word2id)
    self.id2tag = utils.reverse_dictionary(self.tag2id)
    elapsed_time = time.time() - start_time
    print('loading dataset done ({0:.2f} seconds)'.format(elapsed_time))

def _build_vocab(self, mapping, counts, filename):
    '''Fill *mapping* in place (0 = '-padding-', then *counts* iteration order,
    last index = '-UNK-') and pickle it to ``self.data_output/filename``.'''
    idx = 0
    mapping["-padding-"] = idx
    idx += 1
    for key in counts:
        mapping[key] = idx
        idx += 1
    mapping['-UNK-'] = idx
    self._dump_mapping(mapping, filename)

def _dump_mapping(self, mapping, filename):
    '''Pickle *mapping* into ``self.data_output``. Uses a context manager so
    the file handle is closed (the original left it dangling).'''
    with open(os.path.join(self.data_output, filename), 'wb') as handle:
        pickle.dump(mapping, handle)
def load_dataset(self, dataset_filepaths, parameters, token_to_vector=None):
    '''
    dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'

    Parse every split, build token/character/label vocabularies (optionally
    merged with a pretrained model's vocabularies), convert everything to
    index form via self._convert_to_indices, and store the results on self.
    Loads word vectors from the previously prepared embedding file and
    returns the token -> vector map so callers can reuse it.
    '''
    start_time = time.time()
    print('Load dataset... ', end='', flush=True)
    if parameters['token_pretrained_embedding_filepath'] != '':
        # Only load the (large) embedding file when the caller did not supply it.
        if token_to_vector == None:
            token_to_vector = utils_nlp.load_pretrained_token_embeddings(
                parameters)
    else:
        token_to_vector = {}
    if self.verbose:
        print("len(token_to_vector): {0}".format(len(token_to_vector)))
    # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
    # and that token embeddings that are learned in the pretrained model are loaded properly.
    all_tokens_in_pretraining_dataset = []
    all_characters_in_pretraining_dataset = []
    if parameters['use_pretrained_model']:
        # NOTE(review): the file handle passed to pickle.load is never closed.
        pretraining_dataset = pickle.load(
            open(
                os.path.join(parameters['pretrained_model_folder'],
                             'dataset.pickle'), 'rb'))
        all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values(
        )  # tokens saved by the previous training run
        all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values(
        )  # characters saved by the previous training run
    remap_to_unk_count_threshold = 1  # train frequency at or below which a token is "infrequent"
    self.UNK_TOKEN_INDEX = 0  # index reserved for unknown tokens
    self.PADDING_CHARACTER_INDEX = 0
    self.tokens_mapped_to_unk = []  # tokens that were remapped to UNK
    self.UNK = 'UNK'
    self.unique_labels = []  # labels present in the dataset
    labels = {}  # label sequences per split {all: ..., train: ..., test: ...}
    tokens = {}  # token sequences per split
    label_count = {}  # label frequencies per split
    token_count = {}  # token frequencies per split
    character_count = {}  # character frequencies per split
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
            = self._parse_dataset(dataset_filepaths.get(dataset_type, None))
        if self.verbose:
            print("dataset_type: {0}".format(dataset_type))
        if self.verbose:
            print("len(token_count[dataset_type]): {0}".format(
                len(token_count[dataset_type])))
    # Aggregate token counts over all splits.
    # NOTE(review): direct indexing assumes the per-split counts behave like
    # defaultdict(int) for missing keys — confirm against self._parse_dataset.
    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(
            token_count['valid'].keys()) + list(
                token_count['test'].keys()) + list(
                    token_count['deploy'].keys()):
        token_count['all'][token] = token_count['train'][
            token] + token_count['valid'][token] + token_count['test'][
                token] + token_count['deploy'][token]
    # Add tokens known only from the pretrained embeddings with count -1.
    if parameters['load_all_pretrained_token_embeddings']:
        for token in token_to_vector:
            if token not in token_count['all']:
                token_count['all'][token] = -1
                token_count['train'][token] = -1
    for token in all_tokens_in_pretraining_dataset:
        if token not in token_count['all']:
            token_count['all'][token] = -1
            token_count['train'][token] = -1
    # Aggregate character counts over all splits.
    character_count['all'] = {}
    for character in list(character_count['train'].keys()) + list(
            character_count['valid'].keys()) + list(
                character_count['test'].keys()) + list(
                    character_count['deploy'].keys()):
        character_count['all'][character] = character_count['train'][
            character] + character_count['valid'][
                character] + character_count['test'][
                    character] + character_count['deploy'][character]
    # Add characters known only from the pretrained model with count -1.
    for character in all_characters_in_pretraining_dataset:
        if character not in character_count['all']:
            character_count['all'][character] = -1
            character_count['train'][character] = -1
    for dataset_type in dataset_filepaths.keys():
        if self.verbose:
            print("dataset_type: {0}".format(dataset_type))
        if self.verbose:
            print("len(token_count[dataset_type]): {0}".format(
                len(token_count[dataset_type])))
    # Aggregate label counts over all splits (loop variable named 'character'
    # in the original; it iterates labels).
    label_count['all'] = {}
    for character in list(label_count['train'].keys()) + list(
            label_count['valid'].keys()) + list(
                label_count['test'].keys()) + list(
                    label_count['deploy'].keys()):
        label_count['all'][character] = label_count['train'][
            character] + label_count['valid'][character] + label_count[
                'test'][character] + label_count['deploy'][character]
    token_count['all'] = utils.order_dictionary(
        token_count['all'], 'value_key', reverse=True
    )  # sort tokens by frequency, highest first
    label_count['all'] = utils.order_dictionary(
        label_count['all'], 'key',
        reverse=False)  # sort labels alphabetically
    character_count['all'] = utils.order_dictionary(
        character_count['all'], 'value', reverse=True
    )  # sort characters by frequency, highest first
    if self.verbose:
        print('character_count[\'all\']: {0}'.format(
            character_count['all']))
    # Assign an index to every token; rare/unseen tokens collapse onto UNK.
    token_to_index = {}
    token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    if self.verbose:
        print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
            parameters['remap_unknown_tokens_to_unk']))
    if self.verbose:
        print("len(token_count['train'].keys()): {0}".format(
            len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        # Skip the index reserved for UNK.
        if iteration_number == self.UNK_TOKEN_INDEX:
            iteration_number += 1
        # A token maps to UNK when it never occurs in train (or only
        # pretrained embeddings are allowed) and it is absent from both the
        # word vectors and the pretraining dataset.
        '''
        UNK_TOKEN: token không xuất hiện trong pretraining_dataset và trong word vectors
        '''
        if parameters['remap_unknown_tokens_to_unk'] == 1 and \
            (token_count['train'][token] == 0 or \
            parameters['load_only_pretrained_token_embeddings']) and \
            not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
            token not in all_tokens_in_pretraining_dataset:
            if self.verbose:
                print("token: {0}".format(token))
            if self.verbose:
                print("token.lower(): {0}".format(token.lower()))
            if self.verbose:
                print("re.sub('\d', '0', token.lower()): {0}".format(
                    re.sub('\d', '0', token.lower())))
            token_to_index[token] = self.UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
            self.tokens_mapped_to_unk.append(token)
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose:
        print("number_of_unknown_tokens: {0}".format(
            number_of_unknown_tokens))
    infrequent_token_indices = [
    ]  # indices of tokens with low frequency in the train set
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose:
        print("len(token_count['train']): {0}".format(
            len(token_count['train'])))
    if self.verbose:
        print("len(infrequent_token_indices): {0}".format(
            len(infrequent_token_indices)))
    # Ensure that both B- and I- versions exist for each label
    # Strip the B-/I-/... prefixes to recover the bare entity names.
    labels_without_bio = set()
    for label in label_count['all'].keys():
        new_label = utils_nlp.remove_bio_from_label_name(label)
        labels_without_bio.add(new_label)
    # Recombine each entity with every tagging prefix and add any missing
    # combination to label_count with count 0.
    for label in labels_without_bio:
        if label == 'O':
            continue
        if parameters['tagging_format'] == 'bioes':
            prefixes = ['B-', 'I-', 'E-', 'S-']
        else:
            prefixes = ['B-', 'I-']
        for prefix in prefixes:
            l = prefix + label
            if l not in label_count['all']:
                label_count['all'][l] = 0
    # Re-sort label_count alphabetically after the additions.
    label_count['all'] = utils.order_dictionary(label_count['all'],
                                                'key',
                                                reverse=False)
    if parameters['use_pretrained_model']:
        self.unique_labels = sorted(
            list(pretraining_dataset.label_to_index.keys()))
        # Make sure labels are compatible with the pretraining dataset.
        for label in label_count['all']:
            if label not in pretraining_dataset.label_to_index:
                raise AssertionError(
                    "The label {0} does not exist in the pretraining dataset. "
                    .format(label) +
                    "Please ensure that only the following labels exist in the dataset: {0}"
                    .format(', '.join(self.unique_labels)))
        label_to_index = pretraining_dataset.label_to_index.copy()
    else:
        label_to_index = {}
        iteration_number = 0
        for label, count in label_count['all'].items():
            label_to_index[label] = iteration_number
            iteration_number += 1
            self.unique_labels.append(label)
    if self.verbose:
        print('self.unique_labels: {0}'.format(self.unique_labels))
    # Assign an index to every character; index 0 is reserved for padding.
    character_to_index = {}
    iteration_number = 0
    for character, count in character_count['all'].items():
        if iteration_number == self.PADDING_CHARACTER_INDEX:
            iteration_number += 1
        character_to_index[character] = iteration_number
        iteration_number += 1
    if self.verbose:
        print('token_count[\'train\'][0:10]: {0}'.format(
            list(token_count['train'].items())[0:10]))
    token_to_index = utils.order_dictionary(token_to_index,
                                            'value',
                                            reverse=False)
    if self.verbose:
        print('token_to_index: {0}'.format(token_to_index))
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remap_unknown_tokens_to_unk'] == 1:
        index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
    if self.verbose:
        print('index_to_token: {0}'.format(index_to_token))
    if self.verbose:
        print('label_count[\'train\']: {0}'.format(label_count['train']))
    label_to_index = utils.order_dictionary(label_to_index,
                                            'value',
                                            reverse=False)
    if self.verbose:
        print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose:
        print('index_to_label: {0}'.format(index_to_label))
    character_to_index = utils.order_dictionary(character_to_index,
                                                'value',
                                                reverse=False)
    index_to_character = utils.reverse_dictionary(character_to_index)
    if self.verbose:
        print('character_to_index: {0}'.format(character_to_index))
    if self.verbose:
        print('index_to_character: {0}'.format(index_to_character))
    if self.verbose:
        print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
    if self.verbose:
        print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))
    if self.verbose:
        # Print sequences of length 1 in train set
        for token_sequence, label_sequence in zip(tokens['train'],
                                                  labels['train']):
            if len(label_sequence) == 1 and label_sequence[0] != 'O':
                print("{0}\t{1}".format(token_sequence[0],
                                        label_sequence[0]))
    self.token_to_index = token_to_index  # {token: index after sorting by frequency descending; 0 for UNK tokens}
    self.index_to_token = index_to_token  # inverse of token_to_index
    self.index_to_character = index_to_character  # inverse of character_to_index
    self.character_to_index = character_to_index  # {character: index after sorting by frequency descending}
    self.index_to_label = index_to_label  # inverse of label_to_index
    self.label_to_index = label_to_index  # {label: index after sorting alphabetically}
    if self.verbose:
        print("len(self.token_to_index): {0}".format(
            len(self.token_to_index)))
    if self.verbose:
        print("len(self.index_to_token): {0}".format(
            len(self.index_to_token)))
    self.tokens = tokens
    self.labels = labels
    # Convert every split to index form (reads the mappings stored on self).
    token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(
        dataset_filepaths.keys())
    self.token_indices = token_indices
    self.label_indices = label_indices
    self.character_indices_padded = character_indices_padded
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.characters = characters
    self.label_vector_indices = label_vector_indices
    # Sizes are derived from the max index, not len(), since UNK/padding share index 0.
    self.number_of_classes = max(self.index_to_label.keys()) + 1
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    self.alphabet_size = max(self.index_to_character.keys()) + 1
    if self.verbose:
        print("self.number_of_classes: {0}".format(self.number_of_classes))
    if self.verbose:
        print("self.alphabet_size: {0}".format(self.alphabet_size))
    if self.verbose:
        print("self.vocabulary_size: {0}".format(self.vocabulary_size))
    # unique_labels_of_interest is used to compute F1-scores.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')
    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])
    self.infrequent_token_indices = infrequent_token_indices
    if self.verbose:
        print('self.unique_labels_of_interest: {0}'.format(
            self.unique_labels_of_interest))
    if self.verbose:
        print('self.unique_label_indices_of_interest: {0}'.format(
            self.unique_label_indices_of_interest))
    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))
    return token_to_vector
def load_dataset(self, dataset_filepaths, parameters, annotator):
    '''
    dataset_filepaths : dictionary with keys 'train', 'valid', 'test'

    Sentence-classification variant: parses each split, builds token and
    label vocabularies, maps sentences to padded index sequences of length
    self.max_tokens, one-hot encodes the labels, and stores everything on
    self. Uses index 0 for UNK and index 1 for padding.
    '''
    start_time = time.time()
    print('Load dataset... ', end='', flush=True)
    if parameters['do_split']:
        dataset_filepaths = self._do_split(parameters)
    all_pretrained_tokens = []
    if parameters['token_pretrained_embedding_filepath'] != '':
        all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(
            parameters)
    if self.verbose:
        print("len(all_pretrained_tokens): {0}".format(
            len(all_pretrained_tokens)))
    # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
    # and that token embeddings that are learned in the pretrained model are loaded properly.
    # NOTE(review): this list is never populated here — the "token not in
    # all_tokens_in_pretraining_dataset" check below is therefore always True.
    all_tokens_in_pretraining_dataset = []
    self.UNK_TOKEN_INDEX = 0  # index reserved for unknown tokens
    self.PADDING_TOKEN_INDEX = 1  # index reserved for padding
    self.tokens_mapped_to_unk = []
    self.UNK = '_UNK_'
    self.PAD = '_PAD_'
    self.unique_labels = []
    labels = {}  # label per sentence, keyed by split
    tokens = {}  # token sequences, keyed by split
    token_count = {}  # token frequencies, keyed by split
    label_count = {}  # label frequencies, keyed by split
    self.max_tokens = -1
    # Look for max length
    for dataset_type in ['train', 'valid', 'test']:
        max_tokens = self._find_max_length(
            dataset_filepaths.get(dataset_type, None),
            annotator,
            force_preprocessing=parameters['do_split'])
        if parameters['max_length_sentence'] == -1:
            # No cap configured: track the longest sentence seen so far.
            self.max_tokens = max(self.max_tokens, max_tokens)
        else:
            # Cap configured: never exceed parameters['max_length_sentence'].
            if self.max_tokens == -1:
                self.max_tokens = max_tokens
            self.max_tokens = min(parameters['max_length_sentence'],
                                  self.max_tokens)
    for dataset_type in ['train', 'valid', 'test']:
        labels[dataset_type], tokens[dataset_type], token_count[
            dataset_type], label_count[dataset_type] = self._parse_dataset(
                dataset_filepaths.get(dataset_type, None),
                annotator,
                force_preprocessing=parameters['do_split'],
                limit=self.max_tokens)
        if self.verbose:
            print("dataset_type: {0}".format(dataset_type))
        if self.verbose:
            print("len(token_count[dataset_type]): {0}".format(
                len(token_count[dataset_type])))
    # Aggregate token counts over all splits (missing keys default to 0).
    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(
            token_count['valid'].keys()) + list(
                token_count['test'].keys()):
        token_count['all'][token] = token_count['train'].get(
            token, 0) + token_count['valid'].get(
                token, 0) + token_count['test'].get(token, 0)
    for dataset_type in dataset_filepaths.keys():
        if self.verbose:
            print("dataset_type: {0}".format(dataset_type))
        if self.verbose:
            print("len(token_count[dataset_type]): {0}".format(
                len(token_count[dataset_type])))
    # Aggregate label counts over all splits (loop variable named 'character'
    # in the original; it iterates labels).
    label_count['all'] = {}
    for character in list(label_count['train'].keys()) + list(
            label_count['valid'].keys()) + list(
                label_count['test'].keys()):
        label_count['all'][character] = label_count['train'].get(
            character, 0) + label_count['valid'].get(
                character, 0) + label_count['test'].get(character, 0)
    token_count['all'] = utils.order_dictionary(token_count['all'],
                                                'value_key',
                                                reverse=True)  # tokens by frequency, highest first
    label_count['all'] = utils.order_dictionary(label_count['all'],
                                                'key',
                                                reverse=False)  # labels alphabetically
    # Assign token indices; 0 and 1 are reserved for UNK and padding.
    token_to_index = {}
    token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
    token_to_index[self.PAD] = self.PADDING_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    if self.verbose:
        print("parameters['remap_unknown_tokens_to_unk']: {0}".format(
            parameters['remap_unknown_tokens_to_unk']))
    if self.verbose:
        print("len(token_count['train'].keys()): {0}".format(
            len(token_count['train'].keys())))
    for token, count in token_count['all'].items():
        # Skip the two reserved indices.
        if iteration_number == self.UNK_TOKEN_INDEX:
            iteration_number += 1
        if iteration_number == self.PADDING_TOKEN_INDEX:
            iteration_number += 1
        # A token is remapped to UNK when it never occurs in train (or only
        # pretrained embeddings are allowed) and the embeddings don't know it.
        if parameters['remap_unknown_tokens_to_unk'] and (
                token_count['train'].get(token, 0) == 0
                or parameters['load_only_pretrained_token_embeddings']
        ) and not utils_nlp.is_token_in_pretrained_embeddings(
                token, all_pretrained_tokens, parameters
        ) and token not in all_tokens_in_pretraining_dataset:
            if self.verbose:
                print("token: {0}".format(token))
            if self.verbose:
                print("token.lower(): {0}".format(token.lower()))
            if self.verbose:
                print("re.sub('\d', '0', token.lower()): {0}".format(
                    re.sub('\d', '0', token.lower())))
            token_to_index[token] = self.UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
            self.tokens_mapped_to_unk.append(token)
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose:
        print("number_of_unknown_tokens: {0}".format(
            number_of_unknown_tokens))
    # Tokens that appear in train but at most remap_to_unk_count_threshold times.
    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= parameters['remap_to_unk_count_threshold']:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose:
        print("len(token_count['train']): {0}".format(
            len(token_count['train'])))
    if self.verbose:
        print("len(infrequent_token_indices): {0}".format(
            len(infrequent_token_indices)))
    # Assign label indices in alphabetical order.
    label_to_index = {}
    iteration_number = 0
    for label, count in label_count['all'].items():
        label_to_index[label] = iteration_number
        iteration_number += 1
        self.unique_labels.append(label)
    if self.verbose:
        print('self.unique_labels: {0}'.format(self.unique_labels))
    if self.verbose:
        print('token_count[\'train\'][0:10]: {0}'.format(
            list(token_count['train'].items())[0:10]))
    token_to_index = utils.order_dictionary(token_to_index,
                                            'value',
                                            reverse=False)
    if self.verbose:
        print('token_to_index: {0}'.format(token_to_index))
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remap_unknown_tokens_to_unk'] == 1:
        index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
    # NOTE(review): indentation ambiguous in the original source — the PAD
    # mapping is taken as unconditional here since the padding index exists
    # regardless of the remap option; confirm against the original file.
    index_to_token[self.PADDING_TOKEN_INDEX] = self.PAD
    if self.verbose:
        print('index_to_token: {0}'.format(index_to_token))
    if self.verbose:
        print('label_count[\'train\']: {0}'.format(label_count['train']))
    label_to_index = utils.order_dictionary(label_to_index,
                                            'value',
                                            reverse=False)
    if self.verbose:
        print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose:
        print('index_to_label: {0}'.format(index_to_label))
    if self.verbose:
        print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
    if self.verbose:
        print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))
    # Map tokens and labels to their indices
    token_indices = {}
    label_indices = {}
    token_lengths = {}
    token_indices_padded = {}
    for dataset_type in dataset_filepaths.keys():
        token_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        token_indices_padded[dataset_type] = []
        # Tokens
        for token_sequence in tokens[dataset_type]:
            token_indices[dataset_type].append(
                [token_to_index[token] for token in token_sequence])
            token_lengths[dataset_type].append(len(token_sequence))
        # Labels
        label_indices[dataset_type] = []
        for label in labels[dataset_type]:
            label_indices[dataset_type].append(label_to_index[label])
    # Pad tokens
    for dataset_type in dataset_filepaths.keys():
        token_indices_padded[dataset_type] = []
        token_indices_padded[dataset_type] = [
            utils.pad_list(temp_token_indices, self.max_tokens,
                           self.PADDING_TOKEN_INDEX)
            for temp_token_indices in token_indices[dataset_type]
        ]
    if self.verbose:
        print('token_lengths[\'train\'][0:10]: {0}'.format(
            token_lengths['train'][0:10]))
    if self.verbose:
        print('token_indices[\'train\'][0][0:10]: {0}'.format(
            token_indices['train'][0][0:10]))
    if self.verbose:
        print('token_indices_padded[\'train\'][0][0:10]: {0}'.format(
            token_indices_padded['train'][0][0:10]))
    if self.verbose:
        print('label_indices[\'train\'][0:10]: {0}'.format(
            label_indices['train'][0:10]))
    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.token_indices = token_indices
    self.label_indices = label_indices
    self.token_indices_padded = token_indices_padded
    self.token_lengths = token_lengths
    self.tokens = tokens
    self.labels = labels
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    if self.verbose:
        print("len(self.token_to_index): {0}".format(
            len(self.token_to_index)))
    if self.verbose:
        print("len(self.index_to_token): {0}".format(
            len(self.index_to_token)))
    # Sizes derived from max index since several tokens may share index 0 (UNK).
    self.number_of_classes = max(self.index_to_label.keys()) + 1
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    if self.verbose:
        print("self.number_of_classes: {0}".format(self.number_of_classes))
    if self.verbose:
        print("self.vocabulary_size: {0}".format(self.vocabulary_size))
    self.infrequent_token_indices = infrequent_token_indices
    # Binarize label
    label_vector_indices = {}
    for dataset_type, labels in label_indices.items():
        label_vector_indices[dataset_type] = []
        for label in labels:
            label_vector_indices[dataset_type].append(
                utils.convert_one_hot(label, self.number_of_classes))
    self.label_vector_indices = label_vector_indices
    elapsed_time = time.time() - start_time
    print('done ({0:.2f} seconds)'.format(elapsed_time))