def getNext(self, normalize_digits=True):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    # each instance is one line of the form: "<sentence text>\t<label>"
    line = line.strip().split('\t')
    label = line[1]
    words = line[0].strip().split(' ')
    word_ids = []
    for pos, word in enumerate(words):
        word = DIGIT_RE.sub("0", word) if normalize_digits else word
        if self.refiner is not None:
            word_id = self.__word_alphabet.get_index(word)
            if word_id < 0:
                # out-of-vocabulary word: replace it with its UNK signature
                unk_signature = self.refiner.refine(word, pos)
                self.__word_alphabet.add(unk_signature)
                word_ids.append(self.__word_alphabet.get_index(unk_signature))
            else:
                word_ids.append(word_id)
        else:
            word_ids.append(self.__word_alphabet.get_index(word))
    label_id = int(label)
    return NERInstance(Sentence(words, word_ids, None, None),
                       label, label_id, None, None, None, None)
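# Illustrative only (added; not from the original source): the UNKRefiner used above is
# not defined anywhere in this section. The sketch below shows one plausible
# Berkeley-parser-style signature scheme -- mapping an out-of-vocabulary word and its
# sentence position to a coarse "UNK-..." string -- purely to make refine(word, pos)
# concrete. Class name and behaviour are assumptions; the repository's UNKRefiner may differ.
class _UNKRefinerSketch(object):
    def __init__(self, level, alphabet):
        self.level = level          # how detailed the signature should be
        self.alphabet = alphabet    # word alphabet the signatures are added to

    def refine(self, word, position):
        # build a coarse orthographic signature for the unknown word
        signature = "UNK"
        if self.level > 0:
            if any(c.isdigit() for c in word):
                signature += "-NUM"
            if word[:1].isupper():
                # sentence-initial capitalization is less informative than mid-sentence
                signature += "-INITC" if position == 0 else "-CAP"
            if "-" in word:
                signature += "-DASH"
            for suffix in ("ing", "ed", "ly", "s"):
                if word.lower().endswith(suffix):
                    signature += "-" + suffix
                    break
        return signature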
def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        with open(data_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split('\t')[0].split(' ')
                for token in tokens:
                    word = DIGIT_RE.sub("0", token) if normalize_digits else token
                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)
def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        # logger.info("Processing data: %s" % data_path)
        with open(data_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                pos = tokens[1]
                chunk = tokens[2]
                ner = tokens[3]

                pos_alphabet.add(pos)
                chunk_alphabet.add(chunk)
                ner_alphabet.add(ner)

                if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                    vocab_set.add(word)
                    vocab_list.append(word)
def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        # logger.info("Processing data: %s" % data_path)
        with open(data_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split('\t')
                for char in tokens[1]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                pos = tokens[4]
                type = tokens[7]

                pos_alphabet.add(pos)
                type_alphabet.add(type)

                if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                    vocab_set.add(word)
                    vocab_list.append(word)
def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        line = line.strip()
        lines.append(line.split('\t'))
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    word_ids = []
    char_seqs = []
    char_id_seqs = []
    postags = []
    pos_ids = []
    types = []
    type_ids = []
    heads = []

    if symbolic_root:
        words.append(ROOT)
        word_ids.append(self.__word_alphabet.get_index(ROOT))
        char_seqs.append([ROOT_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
        postags.append(ROOT_POS)
        pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
        types.append(ROOT_TYPE)
        type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
        heads.append(0)

    # CoNLL-X columns: tokens[1] = word form, tokens[4] = POS tag,
    # tokens[6] = head index, tokens[7] = dependency relation
    for tokens in lines:
        chars = []
        char_ids = []
        for char in tokens[1]:
            chars.append(char)
            char_ids.append(self.__char_alphabet.get_index(char))
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
            char_ids = char_ids[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)
        char_id_seqs.append(char_ids)

        word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
        pos = tokens[4]
        head = int(tokens[6])
        type = tokens[7]

        words.append(word)
        word_ids.append(self.__word_alphabet.get_index(word))
        postags.append(pos)
        pos_ids.append(self.__pos_alphabet.get_index(pos))
        types.append(type)
        type_ids.append(self.__type_alphabet.get_index(type))
        heads.append(head)

    if symbolic_end:
        words.append(END)
        word_ids.append(self.__word_alphabet.get_index(END))
        char_seqs.append([END_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
        postags.append(END_POS)
        pos_ids.append(self.__pos_alphabet.get_index(END_POS))
        types.append(END_TYPE)
        type_ids.append(self.__type_alphabet.get_index(END_TYPE))
        heads.append(0)

    for position, word in enumerate(words):
        # TODO here the position is not correct
        if self.refine_unk:
            word_idx = self.__word_alphabet.get_index(word)
            if word_idx < 0:
                # out-of-vocabulary word: replace its id with the id of its UNK signature
                unk_signature = self.refiner.refine(word, position)
                word_ids[position] = self.__word_alphabet.get_index(unk_signature)

    return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs),
                              postags, pos_ids, heads, types, type_ids)
def getNext(self, normalize_digits=True):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        line = line.strip()
        lines.append(line.split(' '))
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    word_ids = []
    char_seqs = []
    char_id_seqs = []
    postags = []
    pos_ids = []
    chunk_tags = []
    chunk_ids = []
    ner_tags = []
    ner_ids = []

    # CoNLL-2003 columns (space separated): word, POS tag, chunk tag, NER tag
    for tokens in lines:
        chars = []
        char_ids = []
        for char in tokens[0]:
            chars.append(char)
            char_ids.append(self.__char_alphabet.get_index(char))
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
            char_ids = char_ids[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)
        char_id_seqs.append(char_ids)

        word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
        pos = tokens[1]
        chunk = tokens[2]
        ner = tokens[3]

        words.append(word)
        word_ids.append(self.__word_alphabet.get_index(word))
        postags.append(pos)
        pos_ids.append(self.__pos_alphabet.get_index(pos))
        chunk_tags.append(chunk)
        chunk_ids.append(self.__chunk_alphabet.get_index(chunk))
        ner_tags.append(ner)
        ner_ids.append(self.__ner_alphabet.get_index(ner))

    return NERInstance(Sentence(words, word_ids, char_seqs, char_id_seqs),
                       postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids)
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000,
                     embedd_dict=None, min_occurrence=1, normalize_digits=True):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split(' ')
                    word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                    pos = tokens[1]
                    chunk = tokens[2]
                    ner = tokens[3]

                    pos_alphabet.add(pos)
                    chunk_alphabet.add(chunk)
                    ner_alphabet.add(ner)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', default_value=True, singleton=True)
    char_alphabet = Alphabet('character', default_value=True)
    pos_alphabet = Alphabet('pos')
    chunk_alphabet = Alphabet('chunk')
    ner_alphabet = Alphabet('ner')

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        chunk_alphabet.add(PAD_CHUNK)
        ner_alphabet.add(PAD_NER)

        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                for char in tokens[0]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                vocab[word] += 1

                pos = tokens[1]
                pos_alphabet.add(pos)

                chunk = tokens[2]
                chunk_alphabet.add(chunk)

                ner = tokens[3]
                ner_alphabet.add(ner)

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a singleton is in pretrained embedding dict, set the count to min_occur + c
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))
        vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        chunk_alphabet.save(alphabet_directory)
        ner_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        chunk_alphabet.load(alphabet_directory)
        ner_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    chunk_alphabet.close()
    ner_alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet
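# Hedged usage sketch (added; not from the original source): one way the NER version of
# create_alphabets above might be driven together with load_embedding_dict (shown later
# in this section). All file paths and the 'glove' choice are placeholder assumptions.
if __name__ == '__main__':
    embedd_dict, embedd_dim = load_embedding_dict('glove', 'data/glove.6B.100d.txt.gz')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = \
        create_alphabets('data/alphabets/ner/', 'data/conll2003/train.txt',
                         data_paths=['data/conll2003/dev.txt', 'data/conll2003/test.txt'],
                         embedd_dict=embedd_dict, max_vocabulary_size=100000,
                         min_occurrence=1, normalize_digits=True)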
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000,
                     embedd_dict=None, min_occurrence=1, normalize_digits=True, unk_rank=5):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split('\t')
                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)
    char_alphabet = Alphabet('character')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    logger.info("Creating Alphabets: %s" % alphabet_directory)

    char_alphabet.add(PAD_CHAR)
    pos_alphabet.add(PAD_POS)
    type_alphabet.add(PAD_TYPE)

    char_alphabet.add(ROOT_CHAR)
    pos_alphabet.add(ROOT_POS)
    type_alphabet.add(ROOT_TYPE)

    char_alphabet.add(END_CHAR)
    pos_alphabet.add(END_POS)
    type_alphabet.add(END_TYPE)

    vocab = defaultdict(int)
    # here we use the list to save every word and position
    word_collect = []
    with open(train_path, 'r') as file:
        words = []
        position = 0
        for line in file:
            line = line.strip()
            if len(line) == 0:
                position = 0
                word_collect.append(words)
                words = []
                continue

            tokens = line.split('\t')
            for char in tokens[1]:
                char_alphabet.add(char)

            word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
            vocab[word] += 1
            words.append((word, position))
            position += 1

            pos = tokens[4]
            pos_alphabet.add(pos)

            type = tokens[7]
            type_alphabet.add(type)

    # collect singletons
    singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

    # if a singleton is in pretrained embedding dict, set the count to min_occur + c
    if embedd_dict is not None:
        assert isinstance(embedd_dict, OrderedDict)
        for word in vocab.keys():
            if word in embedd_dict or word.lower() in embedd_dict:
                vocab[word] += min_occurrence

    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    logger.info("Total Vocabulary Size: %d" % len(vocab_list))
    logger.info("Total Singleton Size: %d" % len(singletons))
    multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
    logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))

    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]

    if data_paths is not None and embedd_dict is not None:
        expand_vocab()

    for word in vocab_list:
        if word in multi_vocab:
            word_alphabet.add(word)
        elif word in singletons:
            word_alphabet.add_singleton(word_alphabet.get_index(word))
        else:
            raise ValueError("Error word: " + word)

    # unk refiner
    unk_refiner = UNKRefiner(level=unk_rank, alphabet=word_alphabet)
    for words in word_collect:
        for word, position in words:
            if word in singletons:
                unk_signature = unk_refiner.refine(word, position)
                word_alphabet.add(unk_signature)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
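# Hedged usage sketch (added; not from the original source): calling the dependency
# version of create_alphabets with an UNK signature rank, so that singleton training
# words contribute "UNK" signatures to the word alphabet. The paths, the 'sskip'
# embedding choice, and unk_rank=5 are placeholder assumptions.
if __name__ == '__main__':
    embedd_dict, embedd_dim = load_embedding_dict('sskip', 'data/sskip.100.vectors.gz')
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = create_alphabets(
        'data/alphabets/ptb/', 'data/ptb/train.conllx',
        data_paths=['data/ptb/dev.conllx', 'data/ptb/test.conllx'],
        embedd_dict=embedd_dict, max_vocabulary_size=100000,
        min_occurrence=1, normalize_digits=True, unk_rank=5)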
def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
    """
    Load word embeddings from file.
    :param embedding: embedding type, one of [word2vec, glove, senna, sskip, polyglot]
    :param embedding_path: path to the embedding file
    :return: embedding dict, embedding dimension
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if embedding == 'word2vec':
        # loading word2vec
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word2vec.vector_size
        return word2vec, embedd_dim
    elif embedding == 'glove':
        # loading GloVe
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt', encoding='utf8') as file:
            for line1 in file:
                line = line1.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                elif embedd_dim + 1 != len(tokens):
                    continue
                    # assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'senna':
        # loading Senna
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'sskip':
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            # skip the first line
            file.readline()
            for line in file:
                line = line.strip()
                try:
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    if len(tokens) < embedd_dim:
                        continue

                    if embedd_dim < 0:
                        embedd_dim = len(tokens) - 1

                    embedd = np.empty([1, embedd_dim], dtype=np.float32)
                    start = len(tokens) - embedd_dim
                    word = ' '.join(tokens[0:start])
                    embedd[:] = tokens[start:]
                    word = DIGIT_RE.sub("0", word) if normalize_digits else word
                    embedd_dict[word] = embedd
                except UnicodeDecodeError:
                    continue
        return embedd_dict, embedd_dim
    elif embedding == 'polyglot':
        words, embeddings = pickle.load(open(embedding_path, 'rb'), encoding='latin1')
        _, embedd_dim = embeddings.shape
        embedd_dict = OrderedDict()
        for i, word in enumerate(words):
            embedd = np.empty([1, embedd_dim], dtype=np.float32)
            embedd[:] = embeddings[i, :]
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    else:
        raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]")
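# Hedged usage sketch (added; not from the original source): loading a gzipped GloVe
# file. The path is a placeholder; note that digits are normalized to "0" by default,
# so a word such as "2016" is stored under the key "0000".
if __name__ == '__main__':
    glove_dict, glove_dim = load_embedding_dict('glove', 'data/glove.6B.100d.txt.gz')
    print("loaded %d vectors of dimension %d" % (len(glove_dict), glove_dim))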
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000,
                     embedd_dict=None, min_occurrence=1, normalize_digits=True):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split('\t')[0].split(' ')
                    for token in tokens:
                        word = DIGIT_RE.sub("0", token) if normalize_digits else token
                        if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                            vocab_set.add(word)
                            vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split('\t')[0].split(' ')
                for token in tokens:
                    word = DIGIT_RE.sub("0", token) if normalize_digits else token
                    vocab[word] += 1

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a singleton is in pretrained embedding dict, set the count to min_occur + c
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))
        multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            if word in multi_vocab:
                word_alphabet.add(word)
            elif word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))
            else:
                raise ValueError("Error word: " + word)

        refiner = UNKRefiner(0, word_alphabet)
        # TODO fix the pos here
        for word in singletons:
            unk_signature = refiner.refine(word, 0)
            word_alphabet.add(unk_signature)

        word_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)

    word_alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    return word_alphabet