def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        # logger.info("Processing data: %s" % data_path)
        with open(data_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0 or line.startswith('#'):  # conllu format. Attardi
                    continue

                tokens = line.split('\t')
                if '-' in tokens[0] or '.' in tokens[0]:  # conllu. Attardi
                    continue

                for char in tokens[1]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                pos = tokens[4]
                type = tokens[7]

                pos_alphabet.add(pos)
                type_alphabet.add(type)

                if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                    vocab_set.add(word)
                    vocab_list.append(word)

def getNext(self, normalize_digits=True):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        line = line.strip()
        lines.append(line.split(' '))
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    char_seqs = []
    postags = []
    chunk_tags = []
    ner_tags = []

    for tokens in lines:
        if '-' in tokens[0] or '.' in tokens[0]:  # conllu clitics. Attardi
            continue

        chars = []
        for char in tokens[1]:
            chars.append(char)
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)

        word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
        pos = tokens[2]
        chunk = tokens[3]
        ner = tokens[4]

        words.append(word)
        postags.append(pos)
        chunk_tags.append(chunk)
        ner_tags.append(ner)

    return NERInstance(Sentence(words, char_seqs), postags, chunk_tags, ner_tags)

def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        # logger.info("Processing data: %s" % data_path)
        sentences = parse(open(data_path, 'r').read())
        for sentence in sentences:
            for word in sentence:
                form = word['form']
                pos = word['upostag']
                type = word['deprel']

                real_word = form.split('_BERT_')[0]
                for char in real_word:
                    char_alphabet.add(char)

                form = DIGIT_RE.sub("0", form) if normalize_digits else form
                pos_alphabet.add(pos)
                type_alphabet.add(type)

                if form not in vocab_set and (form in embedd_dict or form.lower() in embedd_dict):
                    vocab_set.add(form)
                    vocab_list.append(form)

def expand_vocab():
    vocab_set = set(vocab_list)
    for data_path in data_paths:
        # logger.info("Processing data: %s" % data_path)
        with open(data_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                pos = tokens[2]
                chunk = tokens[3]
                ner = tokens[4]

                pos_alphabet.add(pos)
                chunk_alphabet.add(chunk)
                ner_alphabet.add(ner)

                if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                    vocab_set.add(word)
                    vocab_list.append(word)

def create_alphabets(alphabet_directory, train_path, data_paths=None,
                     max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=0, normalize_digits=False):

    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            sentences = parse(open(data_path, 'r').read())
            for sentence in sentences:
                for word in sentence:
                    form = word['form']
                    pos = word['upostag']
                    type = word['deprel']

                    real_word = form.split('_BERT_')[0]
                    for char in real_word:
                        char_alphabet.add(char)

                    form = DIGIT_RE.sub("0", form) if normalize_digits else form
                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if form not in vocab_set and (form in embedd_dict or form.lower() in embedd_dict):
                        vocab_set.add(form)
                        vocab_list.append(form)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', defualt_value=True, singleton=False)
    char_alphabet = Alphabet('character', defualt_value=True)
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        char_alphabet.add(ROOT_CHAR)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        char_alphabet.add(END_CHAR)
        pos_alphabet.add(END_POS)
        type_alphabet.add(END_TYPE)

        vocab = defaultdict(int)
        sentences = parse(open(train_path, 'r').read())
        for sentence in sentences:
            for word in sentence:
                form = word['form']
                pos = word['upostag']
                type = word['deprel']

                real_word = form.split('_BERT_')[0]
                for char in real_word:
                    char_alphabet.add(char)

                form = DIGIT_RE.sub("0", form) if normalize_digits else form
                vocab[form] += 1

                pos_alphabet.add(pos)
                type_alphabet.add(type)

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a singleton is in pretrained embedding dict, set the count to min_occur + c
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))

        vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet

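# Usage sketch (added for illustration, not part of the original module). It shows how
# create_alphabets above is typically driven together with load_embedding_dict defined
# later in this file; the file paths below are placeholder assumptions.
def _example_build_alphabets():
    embedd_dict, embedd_dim = load_embedding_dict('glove', 'data/glove.6B.100d.txt.gz')  # hypothetical path
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = create_alphabets(
        'data/alphabets/',                                   # hypothetical alphabet directory
        'data/train.conllu',                                 # hypothetical training file
        data_paths=['data/dev.conllu', 'data/test.conllu'],  # extra files consumed by expand_vocab
        max_vocabulary_size=100000,
        embedd_dict=embedd_dict)
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
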
def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        line = line.strip()
        lines.append(line.split('\t'))
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    word_ids = []
    char_seqs = []
    char_id_seqs = []
    postags = []
    pos_ids = []
    types = []
    type_ids = []
    heads = []

    if symbolic_root:
        words.append(ROOT)
        word_ids.append(self.__word_alphabet.get_index(ROOT))
        char_seqs.append([ROOT_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
        postags.append(ROOT_POS)
        pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
        types.append(ROOT_TYPE)
        type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
        heads.append(0)

    for tokens in lines:
        chars = []
        char_ids = []
        for char in tokens[1]:
            chars.append(char)
            char_ids.append(self.__char_alphabet.get_index(char))
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
            char_ids = char_ids[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)
        char_id_seqs.append(char_ids)

        word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
        pos = tokens[4]
        head = int(tokens[6])
        type = tokens[7]

        words.append(word)
        word_ids.append(self.__word_alphabet.get_index(word))
        postags.append(pos)
        pos_ids.append(self.__pos_alphabet.get_index(pos))
        types.append(type)
        type_ids.append(self.__type_alphabet.get_index(type))
        heads.append(head)

    if symbolic_end:
        words.append(END)
        word_ids.append(self.__word_alphabet.get_index(END))
        char_seqs.append([END_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
        postags.append(END_POS)
        pos_ids.append(self.__pos_alphabet.get_index(END_POS))
        types.append(END_TYPE)
        type_ids.append(self.__type_alphabet.get_index(END_TYPE))
        heads.append(0)

    bert_sent_token = []  # word pieces for bert
    one_subword_word_indicator_ids = []  # index of the first subword of each word
    for word in words:
        word_tokens = self.tokenizer.tokenize(word)
        one_subword_word_indicator_ids.append(len(bert_sent_token) + 1)
        bert_sent_token += word_tokens
    bert_sent_token_ids = self.tokenizer.convert_tokens_to_ids(['[CLS]'] + bert_sent_token + ['[SEP]'])

    return DependencyInstance(
        Sentence(bert_sent_token_ids, one_subword_word_indicator_ids, words, word_ids, char_seqs, char_id_seqs),
        postags, pos_ids, heads, types, type_ids)

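# Hedged sketch (added): what the bert_sent_token / one_subword_word_indicator_ids
# bookkeeping above produces on a toy input. The tokenizer here is assumed to be a
# HuggingFace BertTokenizer; the actual reader may wrap a different tokenizer object.
from transformers import BertTokenizer

def _toy_subword_alignment():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    words = ['unaffordable', 'housing']
    bert_sent_token = []
    indicator_ids = []
    for word in words:
        pieces = tokenizer.tokenize(word)
        indicator_ids.append(len(bert_sent_token) + 1)  # +1 skips the [CLS] slot
        bert_sent_token += pieces
    ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + bert_sent_token + ['[SEP]'])
    # indicator_ids[i] is the position within ids of the first word-piece of words[i]
    return ids, indicator_ids
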
def getNext(self, normalize_digits=True):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        line = line.strip()
        lines.append(line.split(' '))
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    word_ids = []
    char_seqs = []
    char_id_seqs = []
    postags = []
    pos_ids = []
    chunk_tags = []
    chunk_ids = []
    ner_tags = []
    ner_ids = []

    for tokens in lines:
        chars = []
        char_ids = []
        for char in tokens[1]:
            chars.append(char)
            char_ids.append(self.__char_alphabet.get_index(char))
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
            char_ids = char_ids[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)
        char_id_seqs.append(char_ids)

        word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
        pos = tokens[2]
        chunk = tokens[3]
        ner = tokens[4]

        words.append(word)
        word_ids.append(self.__word_alphabet.get_index(word))
        postags.append(pos)
        pos_ids.append(self.__pos_alphabet.get_index(pos))
        chunk_tags.append(chunk)
        chunk_ids.append(self.__chunk_alphabet.get_index(chunk))
        ner_tags.append(ner)
        ner_ids.append(self.__ner_alphabet.get_index(ner))

    return NERInstance(Sentence(words, word_ids, char_seqs, char_id_seqs),
                       postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids)

def getNext(self, normalize_digits=False, symbolic_root=False, symbolic_end=False):
    if len(self.__sentences) == self.__cur_idx:
        return None
    sentence = self.__sentences[self.__cur_idx]
    self.__cur_idx += 1

    words = []
    word_ids = []
    char_seqs = []
    char_id_seqs = []
    postags = []
    pos_ids = []
    types = []
    type_ids = []
    heads = []

    if symbolic_root:
        words.append(ROOT)
        word_ids.append(self.__word_alphabet.get_index(ROOT))
        char_seqs.append([ROOT_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
        postags.append(ROOT_POS)
        pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
        types.append(ROOT_TYPE)
        type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
        heads.append(0)

    for word in sentence:
        chars = []
        char_ids = []
        real_word = word['form'].split('_BERT_')[0]
        for char in real_word:
            chars.append(char)
            char_ids.append(self.__char_alphabet.get_index(char))
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
            char_ids = char_ids[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)
        char_id_seqs.append(char_ids)

        form = DIGIT_RE.sub("0", word['form']) if normalize_digits else word['form']
        pos = word['upostag']
        head = word['head']
        type = word['deprel']

        words.append(form)
        word_ids.append(self.__word_alphabet.get_index(form))
        postags.append(pos)
        pos_ids.append(self.__pos_alphabet.get_index(pos))
        types.append(type)
        type_ids.append(self.__type_alphabet.get_index(type))
        heads.append(head)

    if symbolic_end:
        words.append(END)
        word_ids.append(self.__word_alphabet.get_index(END))
        char_seqs.append([END_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
        postags.append(END_POS)
        pos_ids.append(self.__pos_alphabet.get_index(END_POS))
        types.append(END_TYPE)
        type_ids.append(self.__type_alphabet.get_index(END_TYPE))
        heads.append(0)

    return DependencyInstance(
        Sentence(words, word_ids, char_seqs, char_id_seqs),
        postags, pos_ids, heads, types, type_ids)

def create_alphabets(alphabet_directory, train_path, data_paths=None,
                     max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):

    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0 or line.startswith('#'):  # conllu format. Attardi
                        continue

                    tokens = line.split('\t')
                    if '-' in tokens[0] or '.' in tokens[0]:  # conllu. Attardi
                        continue

                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
    char_alphabet = Alphabet('character', defualt_value=True)
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        char_alphabet.add(ROOT_CHAR)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        char_alphabet.add(END_CHAR)
        pos_alphabet.add(END_POS)
        type_alphabet.add(END_TYPE)

        vocab = defaultdict(int)  # Attardi
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0 or line.startswith('#'):  # conllu. Attardi
                    continue

                tokens = line.split('\t')
                if '-' in tokens[0] or '.' in tokens[0]:  # conllu. Attardi
                    continue

                for char in tokens[1]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                vocab[word] += 1

                pos = tokens[4]
                pos_alphabet.add(pos)

                type = tokens[7]
                type_alphabet.add(type)

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a singleton is in pretrained embedding dict, set the count to min_occur + c
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size: %d" % len(singletons))

        vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet

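# Worked sketch (added for clarity, not in the original): how the singleton /
# min_occurrence logic above interacts with a pretrained embedding dict. Words seen
# at most min_occurrence times are singletons, but any word that also appears in the
# embedding dict gets its count boosted so it survives the rare-word filter.
from collections import OrderedDict, defaultdict

def _toy_singleton_filter(min_occurrence=1):
    vocab = defaultdict(int, {'the': 5, 'ubiquitous': 1, 'xyzzy': 1})
    embedd_dict = OrderedDict({'the': [0.0], 'ubiquitous': [0.0]})  # toy vectors
    singletons = set(w for w, c in vocab.items() if c <= min_occurrence)  # {'ubiquitous', 'xyzzy'}
    for w in vocab:
        if w in embedd_dict or w.lower() in embedd_dict:
            vocab[w] += min_occurrence
    kept = [w for w in sorted(vocab, key=vocab.get, reverse=True) if vocab[w] > min_occurrence]
    return singletons, kept  # 'ubiquitous' is kept (it has an embedding); 'xyzzy' is dropped
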
def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimension, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if embedding == 'word2vec':
        # loading word2vec
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word2vec.vector_size
        return word2vec, embedd_dim
    elif embedding == 'glove':
        # loading GloVe
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'senna':
        # loading Senna
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'sskip':
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:  # Attardi
            # skip the first line
            file.readline()
            for line in file:
                line = line.strip()
                try:
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    if len(tokens) < embedd_dim:
                        continue

                    if embedd_dim < 0:
                        embedd_dim = len(tokens) - 1

                    embedd = np.empty([1, embedd_dim], dtype=np.float32)
                    start = len(tokens) - embedd_dim
                    word = ' '.join(tokens[0:start])
                    embedd[:] = tokens[start:]
                    word = DIGIT_RE.sub("0", word) if normalize_digits else word
                    embedd_dict[word] = embedd
                except UnicodeDecodeError:
                    continue
        return embedd_dict, embedd_dim
    elif embedding == 'polyglot':
        words, embeddings = pickle.load(open(embedding_path, 'rb'), encoding='latin1')
        _, embedd_dim = embeddings.shape
        embedd_dict = OrderedDict()
        for i, word in enumerate(words):
            embedd = np.empty([1, embedd_dim], dtype=np.float32)
            embedd[:] = embeddings[i, :]
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    else:
        raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]")

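# Usage sketch (added; the path is a placeholder assumption): loading GloVe vectors
# with load_embedding_dict above and checking vocabulary coverage, assuming the
# gzipped text format the 'glove' branch expects.
def _toy_load_glove(vocab_words):
    embedd_dict, embedd_dim = load_embedding_dict('glove', 'data/glove.6B.100d.txt.gz')
    oov = [w for w in vocab_words if w not in embedd_dict and w.lower() not in embedd_dict]
    print('dim=%d, OOV=%d/%d' % (embedd_dim, len(oov), len(vocab_words)))
    return embedd_dict, embedd_dim
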
def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        if not line.startswith('#'):  # Attardi
            line = line.strip()
            tokens = line.split('\t')
            lines.append(tokens)
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    char_seqs = []
    lemmas = []
    cpostags = []
    postags = []
    featss = []
    heads = []
    types = []
    depss = []
    miscs = []

    if symbolic_root:
        words.append(ROOT)
        char_seqs.append([ROOT_CHAR])
        lemmas.append(ROOT_LEMMA)
        postags.append(ROOT_POS)
        cpostags.append(ROOT_XPOS)
        featss.append(ROOT_FEATS)
        types.append(ROOT_TYPE)
        heads.append(0)
        depss.append(ROOT_DEPS)
        miscs.append(ROOT_MISC)

    for tokens in lines:
        chars = tokens[1][:MAX_CHAR_LENGTH]
        char_seqs.append(chars)

        word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
        lemma = tokens[2]
        cpos = tokens[3]
        pos = tokens[4]
        feats = tokens[5]
        head = int(tokens[6])
        type = tokens[7]
        deps = tokens[8]
        misc = tokens[9]

        words.append(word)
        lemmas.append(lemma)
        postags.append(pos)
        cpostags.append(cpos)
        featss.append(feats)
        heads.append(head)
        types.append(type)
        depss.append(deps)
        miscs.append(misc)

    if symbolic_end:
        words.append(END)
        char_seqs.append([END_CHAR])
        lemmas.append(END_LEMMA)
        cpostags.append(END_XPOS)
        postags.append(END_POS)
        featss.append(END_FEATS)
        heads.append(0)
        types.append(END_TYPE)
        depss.append(END_DEPS)
        miscs.append(END_MISC)

    return SentenceTree(Sentence(words, char_seqs), lemmas, postags, cpostags,
                        featss, heads, types, depss, miscs)

def getNext(self):
    words = []
    char_seqs = []
    lemmas = []
    upostags = []
    xpostags = []
    featss = []
    heads = []
    types = []
    depss = []
    miscs = []

    for line in self.__source_file:
        if line.strip() == '':  # EOS
            break
        if line.startswith('#'):
            continue

        # strip the trailing newline so the last column (misc) is clean
        tokens = line.rstrip('\n').split('\t')
        if '-' in tokens[0] or '.' in tokens[0]:  # conllu clitics. Attardi
            continue

        word = DIGIT_RE.sub("0", tokens[1]) if self.normalize_digits else tokens[1]
        # trim to MAX_CHAR_LENGTH
        chars = tokens[1][:MAX_CHAR_LENGTH]
        lemma = tokens[2]
        upos = tokens[3]
        xpos = tokens[4]
        feats = tokens[5]
        head = int(tokens[6])
        type = tokens[7]
        deps = tokens[8]
        misc = tokens[9]

        words.append(word)
        char_seqs.append(chars)
        lemmas.append(lemma)
        upostags.append(upos)
        xpostags.append(xpos)
        featss.append(feats)
        heads.append(head)
        types.append(type)
        depss.append(deps)
        miscs.append(misc)

    if not words:
        return None

    if self.symbolic_root:
        words.insert(0, ROOT)
        char_seqs.insert(0, [ROOT_CHAR])
        lemmas.insert(0, ROOT_LEMMA)
        upostags.insert(0, ROOT_UPOS)
        xpostags.insert(0, ROOT_XPOS)
        featss.insert(0, ROOT_FEATS)  # keep featss aligned with the other columns
        heads.insert(0, 0)
        types.insert(0, ROOT_TYPE)
        depss.insert(0, ROOT_DEPS)
        miscs.insert(0, ROOT_MISC)

    if self.symbolic_end:
        words.append(END)
        char_seqs.append([END_CHAR])
        lemmas.append(END_LEMMA)
        upostags.append(END_UPOS)
        xpostags.append(END_XPOS)
        featss.append(END_FEATS)
        heads.append(0)
        types.append(END_TYPE)
        depss.append(END_DEPS)
        miscs.append(END_MISC)

    return SentenceTree(Sentence(words, char_seqs), lemmas, upostags, xpostags,
                        featss, heads, types, depss, miscs)

def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
    line = self.__source_file.readline()
    # skip multiple blank lines.
    while len(line) > 0 and len(line.strip()) == 0:
        line = self.__source_file.readline()
    if len(line) == 0:
        return None

    lines = []
    while len(line.strip()) > 0:
        if not line.startswith('#'):  # Attardi
            line = line.strip()
            tokens = line.split('\t')
            if not '-' in tokens[0] and not '.' in tokens[0]:  # conllu. Attardi
                lines.append(tokens)
        line = self.__source_file.readline()

    length = len(lines)
    if length == 0:
        return None

    words = []
    word_ids = []
    char_seqs = []
    char_id_seqs = []
    postags = []
    pos_ids = []
    types = []
    type_ids = []
    heads = []

    if symbolic_root:
        words.append(ROOT)
        word_ids.append(self.__word_alphabet.get_index(ROOT))
        char_seqs.append([ROOT_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
        postags.append(ROOT_POS)
        pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
        types.append(ROOT_TYPE)
        type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
        heads.append(0)

    for tokens in lines:
        chars = []
        char_ids = []
        for char in tokens[1]:
            chars.append(char)
            char_ids.append(self.__char_alphabet.get_index(char))
        if len(chars) > MAX_CHAR_LENGTH:
            chars = chars[:MAX_CHAR_LENGTH]
            char_ids = char_ids[:MAX_CHAR_LENGTH]
        char_seqs.append(chars)
        char_id_seqs.append(char_ids)

        word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
        pos = tokens[4]
        head = int(tokens[6])
        type = tokens[7]

        words.append(word)
        word_ids.append(self.__word_alphabet.get_index(word))
        postags.append(pos)
        pos_ids.append(self.__pos_alphabet.get_index(pos))
        types.append(type)
        type_ids.append(self.__type_alphabet.get_index(type))
        heads.append(head)

    if symbolic_end:
        words.append(END)
        word_ids.append(self.__word_alphabet.get_index(END))
        char_seqs.append([END_CHAR, ])
        char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
        postags.append(END_POS)
        pos_ids.append(self.__pos_alphabet.get_index(END_POS))
        types.append(END_TYPE)
        type_ids.append(self.__type_alphabet.get_index(END_TYPE))
        heads.append(0)

    return DependencyInstance(
        Sentence(words, word_ids, char_seqs, char_id_seqs),
        postags, pos_ids, heads, types, type_ids)

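# Hedged driver sketch (added): how a reader exposing the getNext method above is
# typically consumed, i.e. called until it returns None. The reader class itself is
# not shown in this file, so the argument here is just a placeholder object.
def _read_all(reader, symbolic_root=True):
    instances = []
    inst = reader.getNext(normalize_digits=True, symbolic_root=symbolic_root)
    while inst is not None:
        instances.append(inst)
        inst = reader.getNext(normalize_digits=True, symbolic_root=symbolic_root)
    return instances
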
def load_embedding_dict(embedding, embedding_path, normalize_digits=False, word2index_path=''):
    """
    load word embeddings from file
    :param embedding:
    :param embedding_path:
    :return: embedding dict, embedding dimension, caseless
    """
    print("loading embedding: %s from %s" % (embedding, embedding_path))
    if embedding == 'word2vec':
        # loading word2vec
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embedd_dim = word2vec.vector_size
        return word2vec, embedd_dim
    elif embedding == 'glove':
        # loading GloVe
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'senna':
        # loading Senna
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embedd_dim < 0:
                    embedd_dim = len(tokens) - 1
                else:
                    assert (embedd_dim + 1 == len(tokens))
                embedd = np.empty([1, embedd_dim], dtype=np.float32)
                embedd[:] = tokens[1:]
                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'sskip':
        embedd_dim = -1
        embedd_dict = OrderedDict()
        with gzip.open(embedding_path, 'rt') as file:
            # skip the first line
            file.readline()
            for line in file:
                line = line.strip()
                try:
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    if len(tokens) < embedd_dim:
                        continue

                    if embedd_dim < 0:
                        embedd_dim = len(tokens) - 1

                    embedd = np.empty([1, embedd_dim], dtype=np.float32)
                    start = len(tokens) - embedd_dim
                    word = ' '.join(tokens[0:start])
                    embedd[:] = tokens[start:]
                    word = DIGIT_RE.sub("0", word) if normalize_digits else word
                    embedd_dict[word] = embedd
                except UnicodeDecodeError:
                    continue
        return embedd_dict, embedd_dim
    elif embedding == 'polyglot':
        words, embeddings = pickle.load(open(embedding_path, 'rb'), encoding='latin1')
        _, embedd_dim = embeddings.shape
        embedd_dict = OrderedDict()
        for i, word in enumerate(words):
            embedd = np.empty([1, embedd_dim], dtype=np.float32)
            embedd[:] = embeddings[i, :]
            word = DIGIT_RE.sub("0", word) if normalize_digits else word
            embedd_dict[word] = embedd
        return embedd_dict, embedd_dim
    elif embedding == 'fasttext':
        fin = io.open(embedding_path, 'r', encoding='utf-8', newline='\n', errors='ignore')
        n, d = map(int, fin.readline().split())
        embedd_dict = OrderedDict()
        for line in fin:
            tokens = line.rstrip().split(' ')
            embedd_dict[tokens[0]] = list(map(float, tokens[1:]))
        return embedd_dict, 300
    elif embedding == 'bert':
        assert word2index_path != ''
        with open(word2index_path, 'r') as file:
            word2id = json.load(file)
        embedd_dict = OrderedDict()
        for key in word2id.keys():
            embedd_dict[key] = np.load(embedding_path + '/' + str(word2id[key]) + '.npy').tolist()
        print(len(embedd_dict))
        return embedd_dict, 768
    else:
        raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]")

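# Hedged sketch (added): the on-disk layout the 'bert' branch above expects. Each word
# in the word2index JSON maps to an integer i, and embedding_path/<i>.npy holds that
# word's precomputed vector. The file names below are illustrative assumptions.
def _toy_load_bert_embeddings():
    embedd_dict, embedd_dim = load_embedding_dict(
        'bert',
        'data/bert_vectors',                     # directory of <index>.npy files
        word2index_path='data/word2index.json')  # {"word": index, ...}
    some_word = next(iter(embedd_dict))
    assert len(embedd_dict[some_word]) == embedd_dim  # vectors stored as 768-dim lists
    return embedd_dict
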