def load_token_embeddings(sess, W, dataset, parameters):
    # Load embeddings
    # https://github.com/dennybritz/cnn-text-classification-tf/issues/17
    print('Load embeddings')
    # full_word_embeddings_folder = os.path.join('..', 'data', 'word_vectors')
    # full_word_embeddings_filepath = os.path.join(full_word_embeddings_folder, 'glove.6B.{0}d.txt'.format(token_embedding_size))
    file_input = codecs.open(parameters['token_pretrained_embedding_filepath'], 'r', 'UTF-8')
    count = -1
    # case_sensitive = False
    # initial_weights = np.random.uniform(-0.25, 0.25, (vocabulary_size, token_embedding_size))
    initial_weights = sess.run(W.read_value())
    token_to_vector = {}
    for cur_line in file_input:
        count += 1
        # if count > 1000: break
        cur_line = cur_line.strip()
        cur_line = cur_line.split(' ')
        if len(cur_line) == 0:
            continue
        token = cur_line[0]
        vector = cur_line[1:]
        token_to_vector[token] = vector

    number_of_loaded_word_vectors = 0
    number_of_token_original_case_found = 0
    number_of_token_lowercase_found = 0
    number_of_token_lowercase_normalized_found = 0
    for token in dataset.token_to_index.keys():
        # TODO: shouldn't it apply to token_to_index instead?
        # if not case_sensitive: token = token.lower()
        # For python 2.7
        # if token not in dataset.token_to_index.viewkeys(): continue
        # For python 3.5
        if token in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[token]
            number_of_token_original_case_found += 1
        elif token.lower() in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[token.lower()]
            number_of_token_lowercase_found += 1
        elif re.sub('\d', '0', token.lower()) in token_to_vector.keys():
            initial_weights[dataset.token_to_index[token]] = token_to_vector[re.sub('\d', '0', token.lower())]
            number_of_token_lowercase_normalized_found += 1
        else:
            continue
        number_of_loaded_word_vectors += 1
    file_input.close()

    print("number_of_token_original_case_found: {0}".format(number_of_token_original_case_found))
    print("number_of_token_lowercase_found: {0}".format(number_of_token_lowercase_found))
    print("number_of_token_lowercase_normalized_found: {0}".format(number_of_token_lowercase_normalized_found))
    print('number_of_loaded_word_vectors: {0}'.format(number_of_loaded_word_vectors))
    print("len(dataset.token_to_index): {0}".format(len(dataset.token_to_index)))
    print("len(dataset.index_to_token): {0}".format(len(dataset.index_to_token)))
    # sess.run(tf.global_variables_initializer())
    sess.run(W.assign(initial_weights))
    print('Load embeddings completed')
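# Illustrative sketch (not part of the original project): the three-step lookup
# order used in load_token_embeddings above -- exact case, then lowercase, then
# lowercase with digits collapsed to '0'. `toy_vectors` and `find_vector` are
# made-up names for demonstration only.
import re

toy_vectors = {
    'the': [0.1, 0.2],
    'paris': [0.3, 0.4],
    'itd-0000': [0.5, 0.6],
}

def find_vector(token, token_to_vector):
    if token in token_to_vector:
        return token_to_vector[token]                   # original case
    if token.lower() in token_to_vector:
        return token_to_vector[token.lower()]           # lowercased
    normalized = re.sub(r'\d', '0', token.lower())
    if normalized in token_to_vector:
        return token_to_vector[normalized]              # digits mapped to '0'
    return None                                         # out of vocabulary

assert find_vector('Paris', toy_vectors) == [0.3, 0.4]
assert find_vector('ITD-1234', toy_vectors) == [0.5, 0.6]
assert find_vector('unseen', toy_vectors) is None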
def printtoken(type, token, srow_scol, erow_ecol, line):
    # Tokenizer callback: append lowercased STRING/NAME/NUMBER tokens to `filename`.
    # Both `filename` and `tok_name` are expected to come from the enclosing scope
    # (tok_name presumably being the standard `token` module's mapping).
    tok = " "
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    with open(filename, "a+") as outfile:
        if tok_name[type] in ("NEWLINE", "NL", "DEDENT", "ENDMARKER", "INDENT", "COMMENT"):
            pass
        elif tok_name[type] in ("STRING", "NAME", "NUMBER"):
            outfile.write(token.lower() + '\n')
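# Illustrative sketch (assumptions noted): printtoken above has the signature of a
# Python-2-style tokenize callback and writes to a module-level `filename`. The
# snippet below shows roughly equivalent filtering with the Python 3 tokenize API,
# keeping only NAME/STRING/NUMBER tokens in lowercase.
import io
import tokenize
from token import tok_name

def collect_code_words(source):
    kept = []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok_name[tok.type] in ("STRING", "NAME", "NUMBER"):
            kept.append(tok.string.lower())
    return kept

print(collect_code_words("x = Foo(42) + 'Bar'\n"))  # ['x', 'foo', '42', "'bar'"]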
def load_dataset(self, dataset_filepaths, parameters, token_to_vector=None): ''' dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' ''' start_time = time.time() print('Load dataset... ', end='', flush=True) if parameters['token_pretrained_embedding_filepath'] != '': if token_to_vector == None: token_to_vector = utils_nlp.load_pretrained_token_embeddings( parameters) else: token_to_vector = {} if self.verbose: print("len(token_to_vector): {0}".format(len(token_to_vector))) # Load pretraining dataset to ensure that index to label is compatible to the pretrained model, # and that token embeddings that are learned in the pretrained model are loaded properly. all_tokens_in_pretraining_dataset = [] all_characters_in_pretraining_dataset = [] if parameters['use_pretrained_model']: try: pretraining_dataset = pickle.load( open( os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb')) except: pretraining_dataset = utils.renamed_load( open( os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb')) all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values( ) all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values( ) remap_to_unk_count_threshold = 1 self.UNK_TOKEN_INDEX = 0 self.PADDING_CHARACTER_INDEX = 0 self.tokens_mapped_to_unk = [] self.UNK = 'UNK' self.unique_labels = [] labels = {} tokens = {} label_count = {} token_count = {} character_count = {} for dataset_type in ['train', 'valid', 'test', 'deploy']: labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \ = self._parse_dataset(dataset_filepaths.get(dataset_type, None)) if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) token_count['all'] = {} for token in list(token_count['train'].keys()) + list( token_count['valid'].keys()) + list( token_count['test'].keys()) + list( token_count['deploy'].keys()): token_count['all'][token] = token_count['train'][ token] + token_count['valid'][token] + token_count['test'][ token] + token_count['deploy'][token] if parameters['load_all_pretrained_token_embeddings']: for token in token_to_vector: if token not in token_count['all']: token_count['all'][token] = -1 token_count['train'][token] = -1 for token in all_tokens_in_pretraining_dataset: if token not in token_count['all']: token_count['all'][token] = -1 token_count['train'][token] = -1 character_count['all'] = {} for character in list(character_count['train'].keys()) + list( character_count['valid'].keys()) + list( character_count['test'].keys()) + list( character_count['deploy'].keys()): character_count['all'][character] = character_count['train'][ character] + character_count['valid'][ character] + character_count['test'][ character] + character_count['deploy'][character] for character in all_characters_in_pretraining_dataset: if character not in character_count['all']: character_count['all'][character] = -1 character_count['train'][character] = -1 for dataset_type in dataset_filepaths.keys(): if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) label_count['all'] = {} for character in list(label_count['train'].keys()) + list( label_count['valid'].keys()) + list( label_count['test'].keys()) + list( label_count['deploy'].keys()): 
label_count['all'][character] = label_count['train'][ character] + label_count['valid'][character] + label_count[ 'test'][character] + label_count['deploy'][character] token_count['all'] = utils.order_dictionary(token_count['all'], 'value_key', reverse=True) label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False) character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True) if self.verbose: print('character_count[\'all\']: {0}'.format( character_count['all'])) token_to_index = {} token_to_index[self.UNK] = self.UNK_TOKEN_INDEX iteration_number = 0 number_of_unknown_tokens = 0 if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format( parameters['remap_unknown_tokens_to_unk'])) if self.verbose: print("len(token_count['train'].keys()): {0}".format( len(token_count['train'].keys()))) for token, count in token_count['all'].items(): if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1 if parameters['remap_unknown_tokens_to_unk'] == 1 and \ (token_count['train'][token] == 0 or \ parameters['load_only_pretrained_token_embeddings']) and \ not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \ token not in all_tokens_in_pretraining_dataset: if self.verbose: print("token: {0}".format(token)) if self.verbose: print("token.lower(): {0}".format(token.lower())) if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format( re.sub('\d', '0', token.lower()))) token_to_index[token] = self.UNK_TOKEN_INDEX number_of_unknown_tokens += 1 self.tokens_mapped_to_unk.append(token) else: token_to_index[token] = iteration_number iteration_number += 1 if self.verbose: print("number_of_unknown_tokens: {0}".format( number_of_unknown_tokens)) infrequent_token_indices = [] for token, count in token_count['train'].items(): if 0 < count <= remap_to_unk_count_threshold: infrequent_token_indices.append(token_to_index[token]) if self.verbose: print("len(token_count['train']): {0}".format( len(token_count['train']))) if self.verbose: print("len(infrequent_token_indices): {0}".format( len(infrequent_token_indices))) # Ensure that both B- and I- versions exist for each label labels_without_bio = set() for label in label_count['all'].keys(): new_label = utils_nlp.remove_bio_from_label_name(label) labels_without_bio.add(new_label) for label in labels_without_bio: if label == 'O': continue if parameters['tagging_format'] == 'bioes': prefixes = ['B-', 'I-', 'E-', 'S-'] else: prefixes = ['B-', 'I-'] for prefix in prefixes: l = prefix + label if l not in label_count['all']: label_count['all'][l] = 0 label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False) if parameters['use_pretrained_model']: self.unique_labels = sorted( list(pretraining_dataset.label_to_index.keys())) # Make sure labels are compatible with the pretraining dataset. for label in label_count['all']: if label not in pretraining_dataset.label_to_index: raise AssertionError( "The label {0} does not exist in the pretraining dataset. 
" .format(label) + "Please ensure that only the following labels exist in the dataset: {0}" .format(', '.join(self.unique_labels))) label_to_index = pretraining_dataset.label_to_index.copy() else: label_to_index = {} iteration_number = 0 for label, count in label_count['all'].items(): label_to_index[label] = iteration_number iteration_number += 1 self.unique_labels.append(label) if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels)) character_to_index = {} iteration_number = 0 for character, count in character_count['all'].items(): if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1 character_to_index[character] = iteration_number iteration_number += 1 if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format( list(token_count['train'].items())[0:10])) token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False) if self.verbose: print('token_to_index: {0}'.format(token_to_index)) index_to_token = utils.reverse_dictionary(token_to_index) if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK if self.verbose: print('index_to_token: {0}'.format(index_to_token)) if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train'])) label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False) if self.verbose: print('label_to_index: {0}'.format(label_to_index)) index_to_label = utils.reverse_dictionary(label_to_index) if self.verbose: print('index_to_label: {0}'.format(index_to_label)) character_to_index = utils.order_dictionary(character_to_index, 'value', reverse=False) index_to_character = utils.reverse_dictionary(character_to_index) if self.verbose: print('character_to_index: {0}'.format(character_to_index)) if self.verbose: print('index_to_character: {0}'.format(index_to_character)) if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10])) if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10])) if self.verbose: # Print sequences of length 1 in train set for token_sequence, label_sequence in zip(tokens['train'], labels['train']): if len(label_sequence) == 1 and label_sequence[0] != 'O': print("{0}\t{1}".format(token_sequence[0], label_sequence[0])) self.token_to_index = token_to_index self.index_to_token = index_to_token self.index_to_character = index_to_character self.character_to_index = character_to_index self.index_to_label = index_to_label self.label_to_index = label_to_index if self.verbose: print("len(self.token_to_index): {0}".format( len(self.token_to_index))) if self.verbose: print("len(self.index_to_token): {0}".format( len(self.index_to_token))) self.tokens = tokens self.labels = labels token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices( dataset_filepaths.keys()) self.token_indices = token_indices self.label_indices = label_indices self.character_indices_padded = character_indices_padded self.character_indices = character_indices self.token_lengths = token_lengths self.characters = characters self.label_vector_indices = label_vector_indices self.number_of_classes = max(self.index_to_label.keys()) + 1 self.vocabulary_size = max(self.index_to_token.keys()) + 1 self.alphabet_size = max(self.index_to_character.keys()) + 1 if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes)) if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size)) if 
self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size)) # unique_labels_of_interest is used to compute F1-scores. self.unique_labels_of_interest = list(self.unique_labels) self.unique_labels_of_interest.remove('O') self.unique_label_indices_of_interest = [] for lab in self.unique_labels_of_interest: self.unique_label_indices_of_interest.append(label_to_index[lab]) self.infrequent_token_indices = infrequent_token_indices if self.verbose: print('self.unique_labels_of_interest: {0}'.format( self.unique_labels_of_interest)) if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format( self.unique_label_indices_of_interest)) elapsed_time = time.time() - start_time print('done ({0:.2f} seconds)'.format(elapsed_time)) return token_to_vector
def s_other(scanner, token):
    return (Scanner.OTHER, token.lower().strip())
def s_suiteintro(scanner, token):
    return (Scanner.SUITEINTRO, token.lower().strip(',').strip())
def s_phrase(scanner, token):
    """A comma-delimited word phrase."""
    return (Scanner.PHRASE, token.lower().strip(',').strip())
def s_word(scanner, token):
    return (Scanner.WORD, token.lower().strip('.'))
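# Illustrative sketch: the s_* callbacks above use the (scanner, token) signature
# expected by re.Scanner, so they could plausibly be wired up as below. The regex
# patterns and the Scanner constants class are invented here for demonstration;
# the real project presumably defines its own Scanner with these attributes.
import re

class Scanner:
    WORD, PHRASE, SUITEINTRO, OTHER = range(4)

lexer = re.Scanner([
    (r'[A-Za-z]+,', s_phrase),    # comma-terminated word -> PHRASE
    (r'[A-Za-z]+\.?', s_word),    # plain word, optional trailing period -> WORD
    (r'\s+', None),               # skip whitespace
])

tokens, remainder = lexer.scan("Salt, pepper and basil.")
print(tokens)     # [(1, 'salt'), (0, 'pepper'), (0, 'and'), (0, 'basil')]
print(remainder)  # ''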
def _parse_dataset(self, dataset_filepath, parameters):
    token_count = collections.defaultdict(lambda: 0)
    label_count = collections.defaultdict(lambda: 0)
    character_count = collections.defaultdict(lambda: 0)
    if parameters['use_pos']:
        pos_tag_count = collections.defaultdict(lambda: 0)
    if parameters['use_gaz']:
        gaz_count = collections.defaultdict(lambda: 0)
        # self._parse_gaz(parameters['gaz_filepath'])
    if parameters['use_aff']:
        aff_count = collections.defaultdict(lambda: 0)

    line_count = -1
    tokens = []
    labels = []
    pos_tags = []
    new_token_sequence = []
    new_label_sequence = []
    if parameters['use_pos']:
        new_pos_tag_sequence = []
    if parameters['use_gaz']:
        new_gaz_sequence = []
        gazs = []
    if parameters['use_aff']:
        new_aff_sequence = []
        affs = []

    if dataset_filepath:
        f = codecs.open(dataset_filepath, 'r', 'UTF-8')
        for line in f:
            line_count += 1
            line = line.strip().split(' ')
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                if len(new_token_sequence) > 0:
                    labels.append(new_label_sequence)
                    tokens.append(new_token_sequence)
                    if parameters['use_pos']:
                        pos_tags.append(new_pos_tag_sequence)
                    if parameters['use_gaz']:
                        gazs.append(new_gaz_sequence)
                    if parameters['use_aff']:
                        affs.append(new_aff_sequence)
                    new_aff_sequence = []
                    new_token_sequence = []
                    new_label_sequence = []
                    new_pos_tag_sequence = []
                    new_gaz_sequence = []
                continue
            token = str(line[0])
            label = str(line[-1])

            # beware: in both cases we are assuming bioes
            if parameters['use_pos']:
                '''
                if parameters['tokenizer'] == 'pos':
                    pos_tag = str(line[-2])
                else:
                    pos_tag = str(line[-3])
                '''
                if parameters['tokenizer'] == 'pos':
                    pos_tag = str(line[-3])
                else:
                    pos_tag = str(line[-4])
                # print(pos_tag)

            if parameters['use_gaz']:
                gaz = token.lower() in self.gaz_set
                if gaz:
                    gaz = 1
                else:
                    gaz = 0

            if parameters['use_aff']:
                aff = 0
                # Check for prefix
                for pref in self.aff_set['prefix']:
                    pattern = '^' + re.escape(pref.lower())
                    result = re.match(pattern, token.lower())
                    if result:
                        aff = 1
                for suf in self.aff_set['suffix']:
                    pattern = re.escape(suf.lower()) + '$'
                    result = re.match(pattern, token.lower())
                    if result:
                        aff = 1
                for rot in self.aff_set['root']:
                    result = token.lower().find(rot)
                    if result > 1:
                        aff = 1

            token_count[token] += 1
            label_count[label] += 1
            if parameters['use_pos']:
                pos_tag_count[pos_tag] += 1
            if parameters['use_gaz']:
                gaz_count[gaz] += 1
            if parameters['use_aff']:
                aff_count[aff] += 1

            new_token_sequence.append(token)
            new_label_sequence.append(label)
            if parameters['use_pos']:
                new_pos_tag_sequence.append(pos_tag)
            if parameters['use_gaz']:
                new_gaz_sequence.append(gaz)
            if parameters['use_aff']:
                new_aff_sequence.append(aff)

            for character in token:
                character_count[character] += 1

            if self.debug and line_count > 200:
                break  # for debugging purposes

        if len(new_token_sequence) > 0:
            labels.append(new_label_sequence)
            tokens.append(new_token_sequence)
            if parameters['use_pos']:
                pos_tags.append(new_pos_tag_sequence)
            if parameters['use_gaz']:
                gazs.append(new_gaz_sequence)
            if parameters['use_aff']:
                affs.append(new_aff_sequence)
        f.close()

    if not parameters['use_pos']:
        pos_tags = None
        pos_tag_count = None
    if not parameters['use_gaz']:
        gazs = None
        gaz_count = None
    if not parameters['use_aff']:
        affs = None
        aff_count = None

    return labels, tokens, token_count, label_count, character_count, pos_tags, pos_tag_count, gazs, gaz_count, affs, aff_count
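# Illustrative sketch of the gazetteer and affix features computed per token in
# _parse_dataset above. `gaz_set` and `aff_set` are toy stand-ins for the instance
# attributes self.gaz_set / self.aff_set used by the real method. Note: the original
# suffix test uses re.match, which anchors at the start of the token; re.search below
# implements the presumably intended end-of-token check.
import re

gaz_set = {'aspirin', 'ibuprofen'}
aff_set = {'prefix': ['anti'], 'suffix': ['itis'], 'root': ['cardi']}

def gaz_feature(token):
    return 1 if token.lower() in gaz_set else 0

def aff_feature(token):
    word = token.lower()
    for pref in aff_set['prefix']:
        if re.match('^' + re.escape(pref.lower()), word):
            return 1
    for suf in aff_set['suffix']:
        if re.search(re.escape(suf.lower()) + '$', word):
            return 1
    for root in aff_set['root']:
        if word.find(root) > 1:   # mirrors the original: root must occur past position 1
            return 1
    return 0

print(gaz_feature('Aspirin'), aff_feature('Antibody'), aff_feature('myocarditis'))  # 1 1 1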
def load_dataset(self, dataset_filepaths, parameters, token_to_vector=None): ''' dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' Load word vectors từ file đã chuẩn bị sẵn ''' start_time = time.time() print('Load dataset... ', end='', flush=True) if parameters['token_pretrained_embedding_filepath'] != '': if token_to_vector == None: token_to_vector = utils_nlp.load_pretrained_token_embeddings( parameters) else: token_to_vector = {} if self.verbose: print("len(token_to_vector): {0}".format(len(token_to_vector))) # Load pretraining dataset to ensure that index to label is compatible to the pretrained model, # and that token embeddings that are learned in the pretrained model are loaded properly. all_tokens_in_pretraining_dataset = [] all_characters_in_pretraining_dataset = [] if parameters['use_pretrained_model']: pretraining_dataset = pickle.load( open( os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb')) all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values( ) # Những token lưu ở đợt train trước all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values( ) # Những character lưu ở đợt train trước remap_to_unk_count_threshold = 1 self.UNK_TOKEN_INDEX = 0 # Index của những unknow token self.PADDING_CHARACTER_INDEX = 0 self.tokens_mapped_to_unk = [] # những unknown token self.UNK = 'UNK' self.unique_labels = [] # Các nhãn tồn tại trong dataset labels = {} # nhãn {all: ...., train: ..., test: ...} tokens = {} # token {all: ...., train: ..., test: ...} label_count = {} # Đếm số nhãn {all: ...., train: ..., test: ...} token_count = {} # Đếm số token {all: ...., train: ..., test: ...} character_count = {} # Đếm số ký tự {all: ...., train: ..., test: ...} for dataset_type in ['train', 'valid', 'test', 'deploy']: labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \ = self._parse_dataset(dataset_filepaths.get(dataset_type, None)) if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) # Tính tổng hợp lại cho tất cả các dataset token_count['all'] = {} for token in list(token_count['train'].keys()) + list( token_count['valid'].keys()) + list( token_count['test'].keys()) + list( token_count['deploy'].keys()): token_count['all'][token] = token_count['train'][ token] + token_count['valid'][token] + token_count['test'][ token] + token_count['deploy'][token] # Thêm những token ở pretrained trước với giá trị -1 if parameters['load_all_pretrained_token_embeddings']: for token in token_to_vector: if token not in token_count['all']: token_count['all'][token] = -1 token_count['train'][token] = -1 for token in all_tokens_in_pretraining_dataset: if token not in token_count['all']: token_count['all'][token] = -1 token_count['train'][token] = -1 # Tính tổng hợp lại cho tất cả các dataset character_count['all'] = {} for character in list(character_count['train'].keys()) + list( character_count['valid'].keys()) + list( character_count['test'].keys()) + list( character_count['deploy'].keys()): character_count['all'][character] = character_count['train'][ character] + character_count['valid'][ character] + character_count['test'][ character] + character_count['deploy'][character] # Thêm những token ở pretrained trước với giá trị -1 for character in all_characters_in_pretraining_dataset: if character not in character_count['all']: 
character_count['all'][character] = -1 character_count['train'][character] = -1 for dataset_type in dataset_filepaths.keys(): if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) # Tính tổng hợp lại các nhãn ở đợt train trước label_count['all'] = {} for character in list(label_count['train'].keys()) + list( label_count['valid'].keys()) + list( label_count['test'].keys()) + list( label_count['deploy'].keys()): label_count['all'][character] = label_count['train'][ character] + label_count['valid'][character] + label_count[ 'test'][character] + label_count['deploy'][character] token_count['all'] = utils.order_dictionary( token_count['all'], 'value_key', reverse=True ) # Sort token count theo các token có freq cao đến thấp, token desc label_count['all'] = utils.order_dictionary( label_count['all'], 'key', reverse=False) # Sort label count theo label asc character_count['all'] = utils.order_dictionary( character_count['all'], 'value', reverse=True ) # Sort character count theo các character có freq cao đến thấp if self.verbose: print('character_count[\'all\']: {0}'.format( character_count['all'])) token_to_index = {} token_to_index[self.UNK] = self.UNK_TOKEN_INDEX iteration_number = 0 number_of_unknown_tokens = 0 if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format( parameters['remap_unknown_tokens_to_unk'])) if self.verbose: print("len(token_count['train'].keys()): {0}".format( len(token_count['train'].keys()))) for token, count in token_count['all'].items(): if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1 ''' UNK_TOKEN: token không xuất hiện trong pretraining_dataset và trong word vectors ''' if parameters['remap_unknown_tokens_to_unk'] == 1 and \ (token_count['train'][token] == 0 or \ parameters['load_only_pretrained_token_embeddings']) and \ not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \ token not in all_tokens_in_pretraining_dataset: if self.verbose: print("token: {0}".format(token)) if self.verbose: print("token.lower(): {0}".format(token.lower())) if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format( re.sub('\d', '0', token.lower()))) token_to_index[token] = self.UNK_TOKEN_INDEX number_of_unknown_tokens += 1 self.tokens_mapped_to_unk.append(token) else: token_to_index[token] = iteration_number iteration_number += 1 if self.verbose: print("number_of_unknown_tokens: {0}".format( number_of_unknown_tokens)) infrequent_token_indices = [ ] # Các token xuất hiện thấp trong train dataset for token, count in token_count['train'].items(): if 0 < count <= remap_to_unk_count_threshold: infrequent_token_indices.append(token_to_index[token]) if self.verbose: print("len(token_count['train']): {0}".format( len(token_count['train']))) if self.verbose: print("len(infrequent_token_indices): {0}".format( len(infrequent_token_indices))) # Ensure that both B- and I- versions exist for each label # Bỏ các tiền tố B-, O-, I-... labels_without_bio = set() for label in label_count['all'].keys(): new_label = utils_nlp.remove_bio_from_label_name(label) labels_without_bio.add(new_label) # Kết hợp các ENTITY vs các tiền tố B-, I-,... 
và thêm vào label count for label in labels_without_bio: if label == 'O': continue if parameters['tagging_format'] == 'bioes': prefixes = ['B-', 'I-', 'E-', 'S-'] else: prefixes = ['B-', 'I-'] for prefix in prefixes: l = prefix + label if l not in label_count['all']: label_count['all'][l] = 0 # Sắp xếp label_count theo label asc label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False) if parameters['use_pretrained_model']: self.unique_labels = sorted( list(pretraining_dataset.label_to_index.keys())) # Make sure labels are compatible with the pretraining dataset. for label in label_count['all']: if label not in pretraining_dataset.label_to_index: raise AssertionError( "The label {0} does not exist in the pretraining dataset. " .format(label) + "Please ensure that only the following labels exist in the dataset: {0}" .format(', '.join(self.unique_labels))) label_to_index = pretraining_dataset.label_to_index.copy() else: label_to_index = {} iteration_number = 0 for label, count in label_count['all'].items(): label_to_index[label] = iteration_number iteration_number += 1 self.unique_labels.append(label) if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels)) character_to_index = {} iteration_number = 0 for character, count in character_count['all'].items(): if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1 character_to_index[character] = iteration_number iteration_number += 1 if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format( list(token_count['train'].items())[0:10])) token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False) if self.verbose: print('token_to_index: {0}'.format(token_to_index)) index_to_token = utils.reverse_dictionary(token_to_index) if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK if self.verbose: print('index_to_token: {0}'.format(index_to_token)) if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train'])) label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False) if self.verbose: print('label_to_index: {0}'.format(label_to_index)) index_to_label = utils.reverse_dictionary(label_to_index) if self.verbose: print('index_to_label: {0}'.format(index_to_label)) character_to_index = utils.order_dictionary(character_to_index, 'value', reverse=False) index_to_character = utils.reverse_dictionary(character_to_index) if self.verbose: print('character_to_index: {0}'.format(character_to_index)) if self.verbose: print('index_to_character: {0}'.format(index_to_character)) if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10])) if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10])) if self.verbose: # Print sequences of length 1 in train set for token_sequence, label_sequence in zip(tokens['train'], labels['train']): if len(label_sequence) == 1 and label_sequence[0] != 'O': print("{0}\t{1}".format(token_sequence[0], label_sequence[0])) self.token_to_index = token_to_index # {token: index sau khi sắp xếp theo freq từ cao đến thấp, 0 nếu là unk token} self.index_to_token = index_to_token # Ngược token_to_index self.index_to_character = index_to_character # Ngược character_to_index self.character_to_index = character_to_index # { character: index sau khi sắp xếp freq từ cao đến thấp} self.index_to_label = index_to_label # Ngược label_to_index self.label_to_index = label_to_index # {label: index sau khi sắp xếp asc} if self.verbose: 
print("len(self.token_to_index): {0}".format( len(self.token_to_index))) if self.verbose: print("len(self.index_to_token): {0}".format( len(self.index_to_token))) self.tokens = tokens self.labels = labels token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices( dataset_filepaths.keys()) self.token_indices = token_indices self.label_indices = label_indices self.character_indices_padded = character_indices_padded self.character_indices = character_indices self.token_lengths = token_lengths self.characters = characters self.label_vector_indices = label_vector_indices self.number_of_classes = max(self.index_to_label.keys()) + 1 self.vocabulary_size = max(self.index_to_token.keys()) + 1 self.alphabet_size = max(self.index_to_character.keys()) + 1 if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes)) if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size)) if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size)) # unique_labels_of_interest is used to compute F1-scores. self.unique_labels_of_interest = list(self.unique_labels) self.unique_labels_of_interest.remove('O') self.unique_label_indices_of_interest = [] for lab in self.unique_labels_of_interest: self.unique_label_indices_of_interest.append(label_to_index[lab]) self.infrequent_token_indices = infrequent_token_indices if self.verbose: print('self.unique_labels_of_interest: {0}'.format( self.unique_labels_of_interest)) if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format( self.unique_label_indices_of_interest)) elapsed_time = time.time() - start_time print('done ({0:.2f} seconds)'.format(elapsed_time)) return token_to_vector
def get_inputs(dataset, token2idx, char2idx, label2idx, config):
    dataset_filepath = None
    if dataset == 'train':
        dataset_filepath = config.path_train
    elif dataset == 'eval':
        dataset_filepath = config.path_eval
    elif dataset == 'test':
        dataset_filepath = config.path_test
    else:
        print("unknown dataset: ", dataset)
    separator = config.separator
    lowercase = config.lowercase

    # collection per sentence
    # format [([char_idxs], word_idx), ...]
    sentence_token = []
    # format [label_idx, ...]
    sentence_label = []
    # format [[sentence1_token], [sentence2_token], ...]
    tokens = []
    # format [[sentence1_label], [sentence2_label], ...]
    labels = []

    # go through the whole CoNLL file
    f = codecs.open(dataset_filepath, 'r', 'UTF-8')
    for line in f:
        line = line.strip().split(separator)
        # encountered a new sentence
        if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
            if len(sentence_token) > 0:
                labels.append(sentence_label)
                tokens.append(sentence_token)
                sentence_label = []
                sentence_token = []
            continue
        token = str(line[0])
        label = str(line[-1])

        # 1. preprocess word
        if lowercase:
            word = token.lower()
        else:
            word = token
        # don't use NUM
        # if word.isdigit():
        #     word = NUM

        # char idxs
        char_idxs = []
        for char in word:
            if char in char2idx:
                char_idxs += [char2idx[char]]
            else:
                print("encountered UNK char:", char)

        # word idx
        if word in token2idx:
            word_idx = token2idx[word]
        else:
            word_idx = token2idx['$UNK$']

        # label idx
        if label in label2idx:
            label_idx = label2idx[label]
        else:
            # Guard: skip lines with an unmapped label instead of reusing a stale
            # (or undefined) label_idx from a previous iteration.
            print("encountered UNK label:", label)
            continue

        sentence_token.append((char_idxs, word_idx))
        sentence_label.append(label_idx)

    if len(sentence_token) > 0:
        tokens.append(sentence_token)
        labels.append(sentence_label)
    f.close()
    return tokens, labels
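# Hypothetical usage of get_inputs above. The config only needs the attributes the
# function actually reads (path_train/path_eval/path_test, separator, lowercase);
# the file paths and the toy vocabularies are invented for illustration.
from types import SimpleNamespace

config = SimpleNamespace(path_train='data/train.conll',   # hypothetical path
                         path_eval='data/eval.conll',
                         path_test='data/test.conll',
                         separator=' ',
                         lowercase=True)
token2idx = {'$UNK$': 0, 'john': 1, 'lives': 2}
char2idx = {c: i for i, c in enumerate('abcdefghijklmnopqrstuvwxyz')}
label2idx = {'O': 0, 'B-PER': 1}

# tokens, labels = get_inputs('train', token2idx, char2idx, label2idx, config)
# tokens[0] -> [([char indices of 'john'], 1), ([char indices of 'lives'], 2), ...]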
def load_dataset(self, dataset_filepaths, parameters): ''' dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy' ''' start_time = time.time() print('Load dataset... ', end='', flush=True) all_pretrained_tokens = [] if parameters['token_pretrained_embedding_filepath'] != '': all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings( parameters) if self.verbose: print("len(all_pretrained_tokens): {0}".format( len(all_pretrained_tokens))) # Load pretraining dataset to ensure that index to label is compatible to the pretrained model, # and that token embeddings that are learned in the pretrained model are loaded properly. all_tokens_in_pretraining_dataset = [] if parameters['use_pretrained_model']: pretrained_model_folder = os.path.dirname( parameters['pretrained_model_checkpoint_filepath']) pretraining_dataset = pickle.load( open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb')) all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values( ) remap_to_unk_count_threshold = 1 self.UNK_TOKEN_INDEX = 0 self.PADDING_CHARACTER_INDEX = 0 self.tokens_mapped_to_unk = [] self.UNK = 'UNK' self.unique_labels = [] labels = {} tokens = {} characters = {} token_lengths = {} label_count = {} token_count = {} character_count = {} for dataset_type in ['train', 'valid', 'test', 'deploy']: labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \ = self._parse_dataset(dataset_filepaths.get(dataset_type, None)) if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) token_count['all'] = {} for token in list(token_count['train'].keys()) + list( token_count['valid'].keys()) + list( token_count['test'].keys()) + list( token_count['deploy'].keys()): token_count['all'][token] = token_count['train'][ token] + token_count['valid'][token] + token_count['test'][ token] + token_count['deploy'][token] for dataset_type in dataset_filepaths.keys(): if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) character_count['all'] = {} for character in list(character_count['train'].keys()) + list( character_count['valid'].keys()) + list( character_count['test'].keys()) + list( character_count['deploy'].keys()): character_count['all'][character] = character_count['train'][ character] + character_count['valid'][ character] + character_count['test'][ character] + character_count['deploy'][character] label_count['all'] = {} for character in list(label_count['train'].keys()) + list( label_count['valid'].keys()) + list( label_count['test'].keys()) + list( label_count['deploy'].keys()): label_count['all'][character] = label_count['train'][ character] + label_count['valid'][character] + label_count[ 'test'][character] + label_count['deploy'][character] token_count['all'] = utils.order_dictionary(token_count['all'], 'value_key', reverse=True) label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False) character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True) if self.verbose: print('character_count[\'all\']: {0}'.format( character_count['all'])) token_to_index = {} token_to_index[self.UNK] = self.UNK_TOKEN_INDEX iteration_number = 0 number_of_unknown_tokens = 0 if self.verbose: 
print("parameters['remap_unknown_tokens_to_unk']: {0}".format( parameters['remap_unknown_tokens_to_unk'])) if self.verbose: print("len(token_count['train'].keys()): {0}".format( len(token_count['train'].keys()))) for token, count in token_count['all'].items(): if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1 if parameters['remap_unknown_tokens_to_unk'] == 1 and \ token_count['train'][token] == 0 and \ (token not in all_pretrained_tokens and \ token.lower() not in all_pretrained_tokens and \ re.sub('\d', '0', token.lower()) not in all_pretrained_tokens) and \ token not in all_tokens_in_pretraining_dataset: if self.verbose: print("token: {0}".format(token)) if self.verbose: print("token.lower(): {0}".format(token.lower())) if self.verbose: print("re.sub('\d', '0', token.lower()): {0}".format( re.sub('\d', '0', token.lower()))) token_to_index[token] = self.UNK_TOKEN_INDEX number_of_unknown_tokens += 1 self.tokens_mapped_to_unk.append(token) else: token_to_index[token] = iteration_number iteration_number += 1 if self.verbose: print("number_of_unknown_tokens: {0}".format( number_of_unknown_tokens)) infrequent_token_indices = [] for token, count in token_count['train'].items(): if 0 < count <= remap_to_unk_count_threshold: infrequent_token_indices.append(token_to_index[token]) if self.verbose: print("len(token_count['train']): {0}".format( len(token_count['train']))) if self.verbose: print("len(infrequent_token_indices): {0}".format( len(infrequent_token_indices))) # Ensure that both B- and I- versions exist for each label labels_without_bio = set() for label in label_count['all'].keys(): new_label = utils_nlp.remove_bio_from_label_name(label) labels_without_bio.add(new_label) for label in labels_without_bio: if label == 'O': continue begin_label = 'B-' + label inside_label = 'I-' + label for l in [begin_label, inside_label]: if l not in label_count['all']: label_count['all'][l] = 0 label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False) if parameters['use_pretrained_model']: self.unique_labels = sorted( list(pretraining_dataset.label_to_index.keys())) # Make sure labels are compatible with the pretraining dataset. for label in label_count['all']: if label not in pretraining_dataset.label_to_index: raise AssertionError( "The label {0} does not exist in the pretraining dataset. 
" .format(label) + "Please ensure that only the following labels exist in the dataset: {0}" .format(', '.join(self.unique_labels))) label_to_index = pretraining_dataset.label_to_index.copy() else: label_to_index = {} iteration_number = 0 for label, count in label_count['all'].items(): label_to_index[label] = iteration_number iteration_number += 1 self.unique_labels.append(label) if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels)) character_to_index = {} iteration_number = 0 for character, count in character_count['all'].items(): if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1 character_to_index[character] = iteration_number iteration_number += 1 if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format( list(token_count['train'].items())[0:10])) token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False) if self.verbose: print('token_to_index: {0}'.format(token_to_index)) index_to_token = utils.reverse_dictionary(token_to_index) if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK if self.verbose: print('index_to_token: {0}'.format(index_to_token)) if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train'])) label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False) if self.verbose: print('label_to_index: {0}'.format(label_to_index)) index_to_label = utils.reverse_dictionary(label_to_index) if self.verbose: print('index_to_label: {0}'.format(index_to_label)) character_to_index = utils.order_dictionary(character_to_index, 'value', reverse=False) index_to_character = utils.reverse_dictionary(character_to_index) if self.verbose: print('character_to_index: {0}'.format(character_to_index)) if self.verbose: print('index_to_character: {0}'.format(index_to_character)) if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10])) if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10])) if self.verbose: # Print sequences of length 1 in train set for token_sequence, label_sequence in zip(tokens['train'], labels['train']): if len(label_sequence) == 1 and label_sequence[0] != 'O': print("{0}\t{1}".format(token_sequence[0], label_sequence[0])) # Map tokens and labels to their indices token_indices = {} label_indices = {} character_indices = {} character_indices_padded = {} for dataset_type in dataset_filepaths.keys(): token_indices[dataset_type] = [] characters[dataset_type] = [] character_indices[dataset_type] = [] token_lengths[dataset_type] = [] character_indices_padded[dataset_type] = [] for token_sequence in tokens[dataset_type]: token_indices[dataset_type].append( [token_to_index[token] for token in token_sequence]) characters[dataset_type].append( [list(token) for token in token_sequence]) character_indices[dataset_type].append( [[character_to_index[character] for character in token] for token in token_sequence]) token_lengths[dataset_type].append( [len(token) for token in token_sequence]) longest_token_length_in_sequence = max( token_lengths[dataset_type][-1]) character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX) for temp_token_indices in character_indices[dataset_type][-1] ]) label_indices[dataset_type] = [] for label_sequence in labels[dataset_type]: label_indices[dataset_type].append( [label_to_index[label] for label in label_sequence]) if self.verbose: 
print('token_lengths[\'train\'][0][0:10]: {0}'.format( token_lengths['train'][0][0:10])) if self.verbose: print('characters[\'train\'][0][0:10]: {0}'.format( characters['train'][0][0:10])) if self.verbose: print('token_indices[\'train\'][0:10]: {0}'.format( token_indices['train'][0:10])) if self.verbose: print('label_indices[\'train\'][0:10]: {0}'.format( label_indices['train'][0:10])) if self.verbose: print('character_indices[\'train\'][0][0:10]: {0}'.format( character_indices['train'][0][0:10])) if self.verbose: print('character_indices_padded[\'train\'][0][0:10]: {0}'.format( character_indices_padded['train'][0][0:10])) # Vectorize the labels # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857) label_binarizer = sklearn.preprocessing.LabelBinarizer() label_binarizer.fit(range(max(index_to_label.keys()) + 1)) label_vector_indices = {} for dataset_type in dataset_filepaths.keys(): label_vector_indices[dataset_type] = [] for label_indices_sequence in label_indices[dataset_type]: label_vector_indices[dataset_type].append( label_binarizer.transform(label_indices_sequence)) if self.verbose: print('label_vector_indices[\'train\'][0:2]: {0}'.format( label_vector_indices['train'][0:2])) if self.verbose: print('len(label_vector_indices[\'train\']): {0}'.format( len(label_vector_indices['train']))) self.token_to_index = token_to_index self.index_to_token = index_to_token self.token_indices = token_indices self.label_indices = label_indices self.character_indices_padded = character_indices_padded self.index_to_character = index_to_character self.character_to_index = character_to_index self.character_indices = character_indices self.token_lengths = token_lengths self.characters = characters self.tokens = tokens self.labels = labels self.label_vector_indices = label_vector_indices self.index_to_label = index_to_label self.label_to_index = label_to_index if self.verbose: print("len(self.token_to_index): {0}".format( len(self.token_to_index))) if self.verbose: print("len(self.index_to_token): {0}".format( len(self.index_to_token))) self.number_of_classes = max(self.index_to_label.keys()) + 1 self.vocabulary_size = max(self.index_to_token.keys()) + 1 self.alphabet_size = max(self.index_to_character.keys()) + 1 if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes)) if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size)) if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size)) # unique_labels_of_interest is used to compute F1-scores. self.unique_labels_of_interest = list(self.unique_labels) self.unique_labels_of_interest.remove('O') self.unique_label_indices_of_interest = [] for lab in self.unique_labels_of_interest: self.unique_label_indices_of_interest.append(label_to_index[lab]) self.infrequent_token_indices = infrequent_token_indices if self.verbose: print('self.unique_labels_of_interest: {0}'.format( self.unique_labels_of_interest)) if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format( self.unique_label_indices_of_interest)) elapsed_time = time.time() - start_time print('done ({0:.2f} seconds)'.format(elapsed_time))
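# Illustrative sketch of the label vectorization step above: a LabelBinarizer fitted
# on the full index range turns a sequence of label indices into one-hot rows.
import sklearn.preprocessing

number_of_classes = 3
label_binarizer = sklearn.preprocessing.LabelBinarizer()
label_binarizer.fit(range(number_of_classes))

print(label_binarizer.transform([0, 2, 1]))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]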
def load_dataset(self, dataset_filepaths, parameters): ''' args: dataset_filepaths : dictionary with keys 'train', 'valid', 'test' http://stackoverflow.com/questions/27416164/what-is-conll-data-format ''' start_time = time.time() print('Load dataset... ', end='', flush=True) all_pretrained_tokens = None if parameters['token_pretrained_embedding_filepath'] != '': all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings( parameters) if self.verbose: print("len(all_pretrained_tokens): {0}".format( len(all_pretrained_tokens))) remap_to_unk_count_threshold = 1 #if ['train'] not in dataset_filepaths.keys(): raise ValueError('') self.UNK_TOKEN_INDEX = 0 self.PADDING_CHARACTER_INDEX = 0 self.tokens_mapped_to_unk = [] self.UNK = 'UNK' self.unique_labels = [] labels = {} tokens = {} characters = {} token_lengths = {} label_count = {} token_count = {} character_count = {} for dataset_type in ['train', 'valid', 'test']: labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], \ character_count[dataset_type] = self._parse_dataset(dataset_filepaths[dataset_type],dataset_type)#,all_pretrained_tokens,token_count) if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) token_count['all'] = {} # utils.merge_dictionaries() for token in list(token_count['train'].keys()) + list( token_count['valid'].keys()) + list( token_count['test'].keys()): token_count['all'][ token] = token_count['train'][token] + token_count['valid'][ token] + token_count['test'][token] for dataset_type in ['train', 'valid', 'test']: if self.verbose: print("dataset_type: {0}".format(dataset_type)) if self.verbose: print("len(token_count[dataset_type]): {0}".format( len(token_count[dataset_type]))) character_count['all'] = {} # utils.merge_dictionaries() for character in list(character_count['train'].keys()) + list( character_count['valid'].keys()) + list( character_count['test'].keys()): character_count['all'][character] = character_count['train'][ character] + character_count['valid'][ character] + character_count['test'][character] label_count['all'] = {} # utils.merge_dictionaries() for character in list(label_count['train'].keys()) + list( label_count['valid'].keys()) + list( label_count['test'].keys()): label_count['all'][ character] = label_count['train'][character] + label_count[ 'valid'][character] + label_count['test'][character] token_count['all'] = utils.order_dictionary(token_count['all'], 'value', reverse=True) #label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse = False) label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False) label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse=False) character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True) if self.verbose: print('character_count[\'all\']: {0}'.format( character_count['all'])) token_to_index = {} token_to_index[self.UNK] = self.UNK_TOKEN_INDEX iteration_number = 0 number_of_unknown_tokens = 0 for token, count in token_count['all'].items(): if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1 if parameters['remove_unknown_tokens'] == 1 and \ token_count['train'][token] == 0 and \ (all_pretrained_tokens == None or \ token not in all_pretrained_tokens and \ token.lower() not in all_pretrained_tokens and \ re.sub('\d', '0', token.lower()) not in 
all_pretrained_tokens): token_to_index[token] = self.UNK_TOKEN_INDEX number_of_unknown_tokens += 1 self.tokens_mapped_to_unk.append(token) else: token_to_index[token] = iteration_number iteration_number += 1 if self.verbose: print("number_of_unknown_tokens: {0}".format( number_of_unknown_tokens)) # 0/0 infrequent_token_indices = [] for token, count in token_count['train'].items(): if 0 < count <= remap_to_unk_count_threshold: infrequent_token_indices.append(token_to_index[token]) if self.verbose: print("len(token_count['train']): {0}".format( len(token_count['train']))) if self.verbose: print("len(infrequent_token_indices): {0}".format( len(infrequent_token_indices))) label_to_index = {} iteration_number = 0 #for label, count in label_count['train'].items(): for label, count in label_count['all'].items(): label_to_index[label] = iteration_number iteration_number += 1 self.unique_labels.append(label) #for label, count in label_count['train'].items(): # self.unique_labels.append(label) if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels)) character_to_index = {} iteration_number = 0 for character, count in character_count['all'].items(): if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1 character_to_index[character] = iteration_number iteration_number += 1 if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format( list(token_count['train'].items())[0:10])) token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False) index_to_token = utils.reverse_dictionary(token_to_index) if parameters['remove_unknown_tokens'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False) if self.verbose: print('label_to_index: {0}'.format(label_to_index)) index_to_label = utils.reverse_dictionary(label_to_index) if self.verbose: print('index_to_label: {0}'.format(index_to_label)) index_to_character = utils.reverse_dictionary(character_to_index) if self.verbose: print('character_to_index: {0}'.format(character_to_index)) if self.verbose: print('index_to_character: {0}'.format(index_to_character)) if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10])) if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10])) # Map tokens and labels to their indices token_indices = {} label_indices = {} character_indices = {} character_indices_padded = {} for dataset_type in ['train', 'valid', 'test']: token_indices[dataset_type] = [] characters[dataset_type] = [] character_indices[dataset_type] = [] token_lengths[dataset_type] = [] character_indices_padded[dataset_type] = [] for token_sequence in tokens[dataset_type]: token_indices[dataset_type].append( [token_to_index[token] for token in token_sequence]) characters[dataset_type].append( [list(token) for token in token_sequence]) character_indices[dataset_type].append( [[character_to_index[character] for character in token] for token in token_sequence]) token_lengths[dataset_type].append( [len(token) for token in token_sequence]) longest_token_length_in_sequence = max( token_lengths[dataset_type][-1]) character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX) for temp_token_indices in character_indices[dataset_type][-1] ]) label_indices[dataset_type] = [] for label_sequence in labels[dataset_type]: label_indices[dataset_type].append( [label_to_index[label] for label in label_sequence]) if 
self.verbose: print('token_lengths[\'train\'][0][0:10]: {0}'.format( token_lengths['train'][0][0:10])) if self.verbose: print('characters[\'train\'][0][0:10]: {0}'.format( characters['train'][0][0:10])) if self.verbose: print('token_indices[\'train\'][0:10]: {0}'.format( token_indices['train'][0:10])) if self.verbose: print('label_indices[\'train\'][0:10]: {0}'.format( label_indices['train'][0:10])) if self.verbose: print('character_indices[\'train\'][0][0:10]: {0}'.format( character_indices['train'][0][0:10])) if self.verbose: print('character_indices_padded[\'train\'][0][0:10]: {0}'.format( character_indices_padded['train'][0][0:10])) # Vectorize the labels # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857) label_binarizer = sklearn.preprocessing.LabelBinarizer() label_binarizer.fit(range(max(index_to_label.keys()) + 1)) label_vector_indices = {} for dataset_type in ['train', 'valid', 'test']: label_vector_indices[dataset_type] = [] for label_indices_sequence in label_indices[dataset_type]: label_vector_indices[dataset_type].append( label_binarizer.transform(label_indices_sequence)) if self.verbose: print('label_vector_indices[\'train\'][0:2]: {0}'.format( label_vector_indices['train'][0:2])) if self.verbose: print('len(label_vector_indices[\'train\']): {0}'.format( len(label_vector_indices['train']))) self.token_to_index = token_to_index self.index_to_token = index_to_token self.token_indices = token_indices self.label_indices = label_indices self.character_indices_padded = character_indices_padded self.index_to_character = index_to_character self.character_to_index = character_to_index self.character_indices = character_indices self.token_lengths = token_lengths self.characters = characters self.tokens = tokens self.labels = labels self.label_vector_indices = label_vector_indices self.index_to_label = index_to_label self.label_to_index = label_to_index if self.verbose: print("len(self.token_to_index): {0}".format( len(self.token_to_index))) if self.verbose: print("len(self.index_to_token): {0}".format( len(self.index_to_token))) self.number_of_classes = max(self.index_to_label.keys()) + 1 self.vocabulary_size = max(self.index_to_token.keys()) + 1 self.alphabet_size = max(self.index_to_character.keys()) + 1 if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes)) if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size)) if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size)) # unique_labels_of_interest is used to compute F1-scores. self.unique_labels_of_interest = list(self.unique_labels) self.unique_labels_of_interest.remove('O') self.unique_label_indices_of_interest = [] for lab in self.unique_labels_of_interest: self.unique_label_indices_of_interest.append(label_to_index[lab]) self.infrequent_token_indices = infrequent_token_indices if self.verbose: print('self.unique_labels_of_interest: {0}'.format( self.unique_labels_of_interest)) if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format( self.unique_label_indices_of_interest)) elapsed_time = time.time() - start_time print('done ({0:.2f} seconds)'.format(elapsed_time))
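# Illustrative sketch of the character-padding step used above: every token's
# character-index list is right-padded to the length of the longest token in its
# sentence. `pad_list` here is a stand-in for utils.pad_list.
def pad_list(values, target_length, padding_value):
    return values + [padding_value] * (target_length - len(values))

character_indices = [[3, 1], [5, 2, 9, 4], [7]]          # one sentence, three tokens
longest = max(len(t) for t in character_indices)
padded = [pad_list(t, longest, 0) for t in character_indices]
print(padded)  # [[3, 1, 0, 0], [5, 2, 9, 4], [7, 0, 0, 0]]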