def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=1, label_column=4,
                                   label_name='ner', oov='embedding', fine_tune=False, embedding="word2Vec",
                                   embedding_path=None, use_character=False):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of the word (starting from 0)
    :param label_column: the column index of the label (starting from 0)
    :param label_name: name of the label, such as pos or ner
    :param oov: embedding for OOV words, chosen from ['random', 'embedding']. If "embedding", words from the
                dev and test data are added to the alphabet; if "random", they are not.
    :param fine_tune: whether to fine-tune the word embeddings.
    :param embedding: embeddings for words, chosen from ['word2vec', 'senna'].
    :param embedding_path: path of the file storing word embeddings.
    :param use_character: whether to use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
             embedd_table (if fine-tuning, else None), label_alphabet,
             C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor_fine_tune(word_index_sentences, label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                X[i, j] = word_ids[j]
                # labels are 1-based in the alphabet, shift them to 0-based
                Y[i, j] = label_ids[j] - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def generate_dataset_fine_tune():
        """
        generate data tensors when fine-tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
                 embedd_table, label_alphabet, C_train, C_dev, C_test, char_embedd_table
        """
        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                           word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))
        # fill data tensors (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train,
                                                                  label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev, label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test,
                                                               label_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table, mask_c_train, mask_c_dev, mask_c_test = \
            generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length) \
            if use_character else (None, None, None, None, None, None, None)
        if use_character:
            # computed for side use only; the last-character indexes are not returned
            last_index_c_train, last_index_c_dev, last_index_c_test = generate_last_index(mask_c_train,
                                                                                          mask_c_dev, mask_c_test)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
            build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
            C_train, C_dev, C_test, char_embedd_table
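    # Illustration (toy values, not part of the pipeline): with max_length = 5,
    # a sentence with word ids [3, 7, 2] and 1-based label ids [1, 2, 1] becomes
    #     X[i]    = [3, 7, 2, 0, 0]   (word ids, zero-padded past the sequence end)
    #     Y[i]    = [0, 1, 0, 0, 0]   (labels shifted to 0-based; last label copied)
    #     mask[i] = [1, 1, 1, 0, 0]   (1.0 over real tokens, 0.0 over padding)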
    def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict,
                                       embedd_dim, caseless):
        X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                # look up the pre-trained vector, falling back to the shared OOV vector
                embedd = embedd_dict[word] if word in embedd_dict else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label_ids[j] - 1

            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def generate_dataset_not_fine_tune():
        """
        generate data tensors when not fine-tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
                 None, label_alphabet, C_train, C_dev, C_test, char_embedd_table
        """
        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                           word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

        # fill data tensors (X.shape = [#data, max_length, embedd_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                      label_index_sentences_train, unknown_embedd,
                                                                      embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev,
                                                                unknown_embedd, embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test,
                                                                   unknown_embedd, embedd_dict, embedd_dim, caseless)
        C_train, C_dev, C_test, char_embedd_table, mask_c_train, mask_c_dev, mask_c_test = \
            generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length) \
            if use_character else (None, None, None, None, None, None, None)
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
            None, label_alphabet, C_train, C_dev, C_test, char_embedd_table
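    # Illustration (toy values, not part of the pipeline): without fine-tuning,
    # X stores embedding vectors rather than word indices. For a 3-word sentence
    # with embedd_dim = 100 and max_length = 5, X[i] is a (5, 100) array whose
    # last two rows are zero vectors; Y[i] and mask[i] are padded exactly as in
    # the fine-tune case above. All OOV words share the single random
    # unknown_embedd vector drawn from U(-0.01, 0.01).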
    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling(
        train_path, word_alphabet, label_alphabet, word_column, label_column)

    # if oov is "random" and we do not fine-tune, close word_alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling(
        dev_path, word_alphabet, label_alphabet, word_column, label_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()
    label_alphabet.save('tmp', 'label_alphabet')

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))

    # get the maximum sentence length, capped at MAX_LENGTH
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    if fine_tune:
        logger.info("Generating data with fine tuning...")
        return generate_dataset_fine_tune()
    else:
        logger.info("Generating data without fine tuning...")
        return generate_dataset_not_fine_tune()
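# Example usage (a sketch: the data paths, column layout and embedding file below
# are assumptions for illustration, not files shipped with this module):
#
#     X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, \
#         X_test, Y_test, mask_test, embedd_table, label_alphabet, \
#         C_train, C_dev, C_test, char_embedd_table = load_dataset_sequence_labeling(
#             'data/eng.train.conll', 'data/eng.dev.conll', 'data/eng.test.conll',
#             word_column=0, label_column=3, label_name='ner', fine_tune=True,
#             embedding='word2vec', embedding_path='data/word2vec.bin', use_character=True)
#
# With fine_tune=True, X_* hold word indices and embedd_table (shape
# [word_alphabet.size(), embedd_dim]) initializes the trainable embedding layer;
# with fine_tune=False, X_* hold the embedding vectors directly and the table is None.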
def create_alphabets(alphabet_directory, data_paths, max_vocabulary_size, normalize_digits=True):
    """
    build (or load cached) word/POS/type alphabets from CoNLL-formatted files
    :param alphabet_directory: directory where alphabets are saved to / loaded from
    :param data_paths: list of CoNLL data files to scan
    :param max_vocabulary_size: maximum number of words kept in the word alphabet
    :param normalize_digits: if True, replace every digit with "0"
    :return: word_alphabet, pos_alphabet, type_alphabet (all closed)
    """
    logger = utils.get_logger("Create Alphabets")
    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not gfile.Exists(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        vocab = dict()
        for data_path in data_paths:
            logger.info("Processing data: %s" % data_path)
            with gfile.GFile(data_path, mode="r") as f:
                for line in f:
                    line = line.decode('utf-8').strip()
                    if len(line) == 0:
                        continue
                    tokens = line.split()
                    word = DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    typ = tokens[7]
                    pos_alphabet.add(pos)
                    type_alphabet.add(typ)
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
        logger.info("Type Alphabet Size: %d" % type_alphabet.size())
        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        for word in vocab_list:
            word_alphabet.add(word)
        word_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    return word_alphabet, pos_alphabet, type_alphabet
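# Example usage (a sketch; the alphabet directory and CoNLL file names are
# assumptions for illustration):
#
#     word_alphabet, pos_alphabet, type_alphabet = create_alphabets(
#         'data/alphabets/', ['data/train.conll', 'data/dev.conll', 'data/test.conll'],
#         max_vocabulary_size=50000)
#
# On the first call the alphabets are built from the CoNLL columns (word: 1,
# POS tag: 4, dependency type: 7), the vocabulary is truncated to the most
# frequent max_vocabulary_size words, and everything is saved under
# alphabet_directory; subsequent calls load the saved alphabets, so all splits
# share one consistent index space.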
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30):
    """
    generate data for characters
    :param sentences_train: training sentences as lists of word strings
    :param sentences_dev: dev sentences as lists of word strings
    :param sentences_test: test sentences as lists of word strings
    :param max_sent_length: maximum sentence length used for padding
    :param char_embedd_dim: dimension of character embeddings
    :return: C_train, C_dev, C_test, char_embedd_table, mask_c_train, mask_c_dev, mask_c_test
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)
                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)
                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)
        mask_c = np.ones([len(index_sentences), max_sent_length, max_char_length], dtype=theano.config.floatX)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    C[i, j, k] = chars[k]
                # fill index of word end after the end of the word
                C[i, j, char_length:] = word_end_id
                mask_c[i, j, char_length:] = 0
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
            mask_c[i, sent_length:, :] = 0
        return C, mask_c

    def build_char_embedd_table():
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
            theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(sentences_test)

    # close character alphabet
    char_alphabet.close()
    char_alphabet.save('tmp', 'char_alphabet')
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" % max_char_length_train)
    logger.info("Maximum character length of dev set is %d" % max_char_length_dev)
    logger.info("Maximum character length of test set is %d" % max_char_length_test)
    logger.info("Maximum character length used for training is %d" % max_char_length)

    # fill character tensors
    C_train, mask_c_train = construct_tensor_char(index_sentences_train)
    C_dev, mask_c_dev = construct_tensor_char(index_sentences_dev)
    C_test, mask_c_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table(), mask_c_train, mask_c_dev, mask_c_test
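# Example usage (a sketch; the sentence lists are assumptions and would normally
# come from read_conll_sequence_labeling):
#
#     sents_train = [['EU', 'rejects', 'German', 'call']]
#     sents_dev = [['Peter', 'Blackburn']]
#     sents_test = [['BRUSSELS', '1996-08-22']]
#     C_train, C_dev, C_test, char_table, mask_c_train, mask_c_dev, mask_c_test = \
#         generate_character_data(sents_train, sents_dev, sents_test, max_sent_length=120)
#
# C_train has shape (1, 120, max_char_length) with one row of character ids per
# token (padded with the word_end index past each word), and char_table has
# shape (char_alphabet.size(), 30), initialized uniformly with scale sqrt(3/30).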