import os
import logging

import numpy as np
from gensim.models import Word2Vec

# Project-internal helpers (Alphabet, padding_symbol, BaseLearner,
# get_model_directory) are assumed to be importable from the surrounding
# package.

logger = logging.getLogger(__name__)
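# `uniform_embedding` is a project helper that is not shown in this section.
# A minimal sketch of what it is assumed to do, namely uniform random
# initialization for out-of-vocabulary words, with the shape given as
# [rows, dimension] (the scale parameter is an assumption):
def uniform_embedding(shape, scale=0.05):
    # Draw each component uniformly from [-scale, scale).
    return np.random.uniform(-scale, scale, shape)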
def read_conll(path, label_alphabet=None):
    """
    Read a CoNLL-format file into parallel word and POS sentence lists.

    :param path: Path to the CoNLL file (word in column 2, POS tag in column 5).
    :param label_alphabet: An existing label alphabet to extend (e.g. when
        reading a dev set with the training labels); a fresh one is created
        if None.
    :return: (word_sentences, pos_sentences, word_alphabet, label_alphabet)
    """
    word_sentences = []
    pos_sentences = []
    words = []
    poses = []
    word_alphabet = Alphabet('word', (padding_symbol,))
    if label_alphabet is None:
        label_alphabet = Alphabet('label', (padding_symbol,))
    with open(path) as f:
        for l in f:
            if l.strip() == "":
                # A blank line marks a sentence boundary.
                if len(words) > 0:
                    word_sentences.append(words[:])
                    pos_sentences.append(poses[:])
                words = []
                poses = []
            else:
                parts = l.split()
                word = parts[1]
                pos = parts[4]
                words.append(word)
                poses.append(pos)
                word_alphabet.add(word)
                label_alphabet.add(pos)
    # Flush the last sentence if the file does not end with a blank line.
    if len(words) > 0:
        word_sentences.append(words[:])
        pos_sentences.append(poses[:])
    return word_sentences, pos_sentences, word_alphabet, label_alphabet
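# A minimal usage sketch (the file paths are hypothetical). Passing the
# training label alphabet back in for the dev split keeps tag indices
# consistent across both datasets:
def _demo_read_conll():
    train_words, train_tags, word_alphabet, label_alphabet = read_conll("data/train.conll")
    dev_words, dev_tags, _, _ = read_conll("data/dev.conll", label_alphabet=label_alphabet)
    print("%d training sentences, %d dev sentences" % (len(train_words), len(dev_words)))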
def read_conll(path):
    """
    Variant of read_conll that always builds fresh word and POS alphabets.

    :param path: Path to the CoNLL file (word in column 2, POS tag in column 5).
    :return: (word_sentences, pos_sentences, word_alphabet, pos_alphabet)
    """
    word_sentences = []
    pos_sentences = []
    words = []
    poses = []
    word_alphabet = Alphabet('word', (padding_symbol,))
    pos_alphabet = Alphabet('pos', (padding_symbol,))
    with open(path) as f:
        for l in f:
            if l.strip() == "":
                # A blank line marks a sentence boundary.
                if len(words) > 0:
                    word_sentences.append(words[:])
                    pos_sentences.append(poses[:])
                words = []
                poses = []
            else:
                parts = l.split()
                word = parts[1]
                pos = parts[4]
                words.append(word)
                poses.append(pos)
                word_alphabet.add(word)
                pos_alphabet.add(pos)
    # Flush the last sentence if the file does not end with a blank line.
    if len(words) > 0:
        word_sentences.append(words[:])
        pos_sentences.append(poses[:])
    return word_sentences, pos_sentences, word_alphabet, pos_alphabet
def read_models(model_base, data_name, model):
    """
    Load the trained learner and its alphabets for each requested model type.

    :param model_base: Base directory holding all saved models.
    :param data_name: Name of the dataset the models were trained on.
    :param model: 'auto', 'vanilla', or 'all' to load both.
    :return: A dict mapping model type to (learner, pos_alphabet, word_alphabet).
    """
    logger.info("Loading models from disk.")
    models = {}
    models_to_load = ['auto', 'vanilla'] if model == 'all' else [model]
    for t in models_to_load:
        # Use a distinct name for the loaded learner so it does not shadow
        # the `model` argument.
        learner = BaseLearner()
        model_dir = os.path.join(model_base, data_name, t)
        learner.load(model_dir)
        pos_alphabet = Alphabet('pos')
        word_alphabet = Alphabet('word')
        pos_alphabet.load(model_dir)
        word_alphabet.load(model_dir)
        models[t] = (learner, pos_alphabet, word_alphabet)
    logger.info("Loading done.")
    return models
def read_models(model_base, data_name, model):
    """
    Variant of read_models that resolves the model directory through
    get_model_directory instead of joining the path directly.

    :param model_base: Base directory holding all saved models.
    :param data_name: Name of the dataset the models were trained on.
    :param model: 'auto', 'vanilla', or 'all' to load both.
    :return: A dict mapping model type to (learner, pos_alphabet, word_alphabet).
    """
    logger.info("Loading models from disk.")
    models = {}
    models_to_load = ['auto', 'vanilla'] if model == 'all' else [model]
    for t in models_to_load:
        # Use a distinct name for the loaded learner so it does not shadow
        # the `model` argument.
        learner = BaseLearner()
        model_dir = get_model_directory(model_base, data_name, t)
        learner.load(model_dir)
        pos_alphabet = Alphabet('pos')
        word_alphabet = Alphabet('word')
        pos_alphabet.load(model_dir)
        word_alphabet.load(model_dir)
        models[t] = (learner, pos_alphabet, word_alphabet)
    logger.info("Loading done.")
    return models
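# A minimal usage sketch (the directory layout and dataset name are
# hypothetical; works with either read_models variant above):
def _demo_read_models():
    models = read_models("saved_models", "wsj", "all")
    for model_type, (learner, pos_alphabet, word_alphabet) in models.items():
        print("Loaded model type:", model_type)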
class Lookup:
    def __init__(self, config):
        """
        :param config: Configuration object with `word_vector` and
            `word_vector_path` attributes.
        """
        if config.word_vector == 'word2vec':
            logger.info("Loading word2vec from disk ...")
            self.model = Word2Vec.load_word2vec_format(config.word_vector_path, binary=True)
            logger.info("Loading done...")
        self.full_alphabet = Alphabet("full_lookup")

    def initial_lookup(self, alphabet):
        """
        Initialize the lookup table of the word vectors. This will create a
        full lookup table that contains all the vocabulary, and a table that
        contains only the given alphabet.

        :param alphabet: The alphabet that stores the words.
        :return: A numpy array of shape [vocabulary size, dimension], where
            each row is a word embedding.
        """
        embeddings = []
        if Alphabet.default_index == 0:
            # Reserve row 0 for the padding/default symbol.
            embeddings.append(uniform_embedding([1, self.model.vector_size]))
        else:
            raise ValueError("Default index is not the first one; the implementation here must be changed.")
        # Add words from the given alphabet to the embedding list and to the full alphabet.
        for w, index in alphabet.iteritems():
            if not self.full_alphabet.has_instance(w):
                embedding = self.model[w] if w in self.model else uniform_embedding([1, self.model.vector_size])
                embeddings.append(embedding)
                self.full_alphabet.add(w)
        # Store the embeddings of words that appear in the training data only.
        self.table = np.vstack(embeddings)
        for w in self.model.vocab.keys():
            if not alphabet.has_instance(w):
                embedding = self.model[w]
                self.full_alphabet.add(w)
                embeddings.append(embedding)
        # Store the embeddings of the full vocabulary.
        self.full_table = np.vstack(embeddings)
        logger.info("The training-only embedding table contains %d embeddings, each with a dimension of size %d." % (
            self.table.shape[0], self.table.shape[1]))
        logger.info("The full embedding table contains %d embeddings, each with a dimension of size %d." % (
            self.full_table.shape[0], self.full_table.shape[1]))

    def load_additional_embeddings(self, original_alphabet, new_alphabet):
        """
        Create an additional lookup table containing the words that are not
        in the original alphabet.

        :param original_alphabet: The alphabet of the original table.
        :param new_alphabet: The alphabet with the additional words.
        :return: A numpy array of the additional embeddings, or None if there
            are no new words.
        """
        embeddings = []
        for w, index in new_alphabet.iteritems():
            if not original_alphabet.has_instance(w):
                embedding = self.model[w] if w in self.model else uniform_embedding([1, self.model.vector_size])
                embeddings.append(embedding)
        if len(embeddings) > 0:
            return np.vstack(embeddings)
        return None
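# A minimal usage sketch (the config class and both file paths are
# hypothetical; assumes a binary word2vec file on disk and a word alphabet
# built by read_conll above):
def _demo_lookup():
    class _Config(object):
        word_vector = 'word2vec'
        word_vector_path = 'embeddings/GoogleNews-vectors-negative300.bin'

    _, _, word_alphabet, _ = read_conll("data/train.conll")
    lookup = Lookup(_Config())
    lookup.initial_lookup(word_alphabet)
    print("Training table shape:", lookup.table.shape)
    print("Full table shape:", lookup.full_table.shape)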