 def test_filter(self):
     embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
     path_vocab = "./tests/data/vocabs/plain"
     vocab = Vocabulary()
     vocab.load(path_vocab)
     embs.filter_by_vocab(["the", "apple"])
     embs.filter_by_vocab([])
Example #2
 def test_filter(self):
     embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
     path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain')
     vocab = Vocabulary()
     vocab.load(path_vocab)
     embs.filter_by_vocab(["the", "apple"])
     embs.filter_by_vocab([])
Example #3
 def load_hdf5(self, path):
     """loads embeddings from hdf5 format"""
     file_in = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
     self.matrix = file_in.root.vectors.read()
     self.vocabulary = Vocabulary()
     self.vocabulary.load(path)
     # self.name += os.path.basename(os.path.normpath(path))
     file_in.close()
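A brief usage sketch of load_hdf5, assuming a hypothetical directory that already contains a vectors.h5p file plus the vocabulary files that Vocabulary.load expects (the full class appears later on this page):

from vecto.embeddings.dense import WordEmbeddingsDense

embs = WordEmbeddingsDense()
embs.load_hdf5("path/to/embeddings_dir")  # hypothetical directory
print(embs.matrix.shape)                  # (number of words, vector size)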
Example #4
 def test_save_and_load(self):
     vocab = Vocabulary()
     vocab.load(path_vocab)
     cnt_1 = vocab.cnt_words
     vocab.save_to_dir("/tmp/vecto/vocab/save1")
     vocab.load("/tmp/vecto/vocab/save1")
     assert cnt_1 == vocab.cnt_words
Example #5
 def load_npy(self, path):
     """loads embeddings from numpy format"""
     self.matrix = np.load(os.path.join(path, "vectors.npy"))
     # self.load_with_alpha(0.6)
     self.vocabulary = Vocabulary_simple()
     self.vocabulary.load(path)
     self.name += os.path.basename(os.path.normpath(path))
Example #6
 def load_from_text(self, path):
     i = 0
     # self.name+="_"+os.path.basename(os.path.normpath(path))
     self.vocabulary = Vocabulary()
     rows = []
     header = False
     vec_size = -1
     with detect_archive_format_and_open(path) as file_in:
         for line_number, line in enumerate(file_in):
             tokens = line.split()
             if i == 0 and len(tokens) == 2:
                 header = True
                 cnt_words = int(tokens[0])
                 vec_size = int(tokens[1])
                 continue
             # word = tokens[0].decode('ascii',errors="ignore")
             # word = tokens[0].decode('UTF-8', errors="ignore")
             word = tokens[0]
             self.vocabulary.dic_words_ids[word] = i
             self.vocabulary.lst_words.append(word)
             str_vec = tokens[1:]
             if vec_size == -1:
                 vec_size = len(str_vec)
             if vec_size != len(str_vec):
                 warning_message = "input error in line {}, expected tokens: {}, read tokens: {}, line: {}  ".format(
                     line_number, vec_size, len(str_vec), line)
                 warnings.warn(warning_message)
                 continue
             row = np.zeros(len(str_vec), dtype=np.float32)
             for j in range(len(str_vec)):
                 row[j] = float(str_vec[j])
             rows.append(row)
             i += 1
     # if header:
     #     assert cnt_words == len(rows)
     self.matrix = np.vstack(rows)
     if header:
         assert vec_size == self.matrix.shape[1]
     self.vocabulary.lst_frequencies = np.zeros(
         len(self.vocabulary.lst_words), dtype=np.int32)
     self.name = os.path.basename(os.path.dirname(os.path.normpath(path)))
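load_from_text accepts the common word2vec-style plain-text layout: an optional header line giving the word count and vector size, then one word followed by its vector components per line; rows whose length disagrees with the detected vector size are skipped with a warning. A minimal sketch that writes such a file (the path and values are made up purely for illustration):

# hypothetical path and toy values, only to illustrate the expected layout
with open("/tmp/toy_vectors.txt", "w") as file_out:
    file_out.write("2 3\n")                # optional header: word count, vector size
    file_out.write("the 0.1 0.2 0.3\n")
    file_out.write("apple 0.4 0.5 0.6\n")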
Example #7
 def load_with_alpha(self, path, power=0.6):
     f = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
     #        left = np.nan_to_num(f.root.vectors.read())
     left = f.root.vectors.read()
     sigma = f.root.sigma.read()
     logger.info("loaded left singular vectors and sigma")
     sigma = np.power(sigma, power)
     self.matrix = np.dot(left, np.diag(sigma))
     logger.info("computed the product")
     self.metadata["pow_sigma"] = power
     self.metadata["size_dimensions"] = int(self.matrix.shape[1])
     f.close()
     self.vocabulary = Vocabulary_simple()
     self.vocabulary.load(path)
     self.name += os.path.basename(
         os.path.normpath(path)) + "_a" + str(power)
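load_with_alpha rebuilds dense vectors from a stored truncated SVD: the left singular vectors are rescaled by the singular values raised to power, so power=1 restores the full U·Σ product while smaller values flatten the spectrum. A standalone numpy sketch of that weighting, with made-up data:

import numpy as np

left = np.random.rand(5, 3)              # stand-in for the stored left singular vectors U
sigma = np.array([3.0, 2.0, 0.5])        # stand-in for the stored singular values
power = 0.6
matrix = np.dot(left, np.diag(sigma ** power))  # the same product computed above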
Example #8
 def load_from_file(self, filename):
     self.vocabulary = Vocabulary()
     f = open(filename, "rb")
     header = f.readline().split()
     cnt_rows = int(header[0])
     size_row = int(header[1])
     # self.name += "_{}".format(size_row)
     self.matrix = np.zeros((cnt_rows, size_row), dtype=np.float32)
     # logger.debug("cnt rows = {}, size row = {}".format(cnt_rows, size_row))
     for i in range(cnt_rows):
         word = ModelW2V._load_word(f).decode('UTF-8',
                                              errors="ignore").strip()
         self.vocabulary.dic_words_ids[word] = i
         self.vocabulary.lst_words.append(word)
         s_row = f.read(size_row * 4)
         row = np.frombuffer(s_row, dtype=np.float32)  # np.frombuffer replaces the deprecated np.fromstring
         # row = row / np.linalg.norm(row)
         self.matrix[i] = row
     f.close()
Example #9
 def test_text_to_ids(self):
     v = Vocabulary()
     v.load(path_vocab)
     doc = load_file_as_ids(path_text_file, v)
     assert doc.shape == (TEST_TEXT_LEN, )
     assert np.allclose(doc[:10], [-1, 40, -1, -1, -1, -1, -1, -1, 57, -1])
Example #10
def load_from_dir(path):
    """Automatically detects embeddings format and loads

    Args:
        path: directory where embeddings are stores

    Returns:
        Instance of appropriate Model-based class
    """
    #    if os.path.isfile(os.path.join(path, "cooccurrence_csr.h5p")):
    #        logger.info("detected as sparse explicit in hdf5")
    #        result = ModelSparse()
    #        result.load_from_hdf5(path)
    #        result.load_metadata(path)
    #        return result
    #    if os.path.isfile(os.path.join(path, "bigrams.data.bin")):
    #        logger.info("detected as sparse in vecto legacy format")
    #        result = ModelSparse()
    #        result.load(path)
    #        result.load_metadata(path)
    #        return result
    #     if os.path.isfile(os.path.join(path, "vectors.bin")):
    #         logger.info("this is w2v original binary format")
    #         result = ModelW2V()
    #         result.load_from_dir(path)
    #         result.load_metadata(path)
    #         return result
    #    if os.path.isfile(os.path.join(path, "sgns.words.npy")):
    #        result = ModelLevy()
    #        logger.info("this is Levi")
    #        result.load_from_dir(path)
    #        result.load_metadata(path)
    #        return result
    #     if os.path.isfile(os.path.join(path, "vectors.npy")):
    #         result = ModelNumbered()
    #         logger.info("detected as dense ")
    #         result.load_npy(path)
    #         result.load_metadata(path)
    #         return result
    if os.path.isfile(os.path.join(path, "vectors.h5p")):
        result = vecto.embeddings.dense.WordEmbeddingsDense()
        logger.info("detected as vecto format ")
        result.load_hdf5(path)
        result.load_metadata(path)
        return result

    result = vecto.embeddings.dense.WordEmbeddingsDense()
    files = os.listdir(path)
    for f in files:
        if f.endswith(".gz") or f.endswith(".bz") or f.endswith(
                ".txt") or f.endswith(".vec"):
            logger.info(path + "Detected VSM in plain text format")
            result.load_from_text(os.path.join(path, f))
            result.load_metadata(path)
            return result
        if f.endswith(".npy"):
            logger.info("Detected VSM in numpy format")
            result.matrix = np.load(os.path.join(path, f))
            result.vocabulary = Vocabulary()
            result.vocabulary.load(path)
            result.load_metadata(path)
            return result
        # if any(file.endswith('bin') for file in os.listdir(path)):
        #     result = ModelW2V()
        #     logger.info("Detected VSM in the w2v original binary format")
        #     result.load_from_dir(path)
        #     result.load_metadata(path)
        #     return result
#        if f.startswith("words") and f.endswith(".npy") \
#               and os.path.isfile(os.path.join(path, f.replace(".npy", ".vocab"))):
#            result = Model_Fun()
#            result = ModelLevy()
#            logger.info("Detected VSM in npy and vocab in plain text file format")
#            result.load_from_dir(path, f[: -4])
#            result.load_metadata(path)
#            return result

    raise RuntimeError("Cannot detect the format of this VSM")
Example #11
def train(args):
    time_start = timer()
    if args.subword == 'none':
        current_utils = utils.word
    else:
        current_utils = utils.subword
    current_utils.args = args

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    if args.path_vocab == '':
        vocab = create_from_dir(args.path_corpus, language=args.language)
    else:
        vocab = Vocabulary()
        vocab.load(args.path_vocab)
        logger.info("loaded vocabulary")

    if args.context_representation != 'word':  # for deps or ner context representation, we need a new context vocab for NS or HSM loss function.
        vocab_context = create_from_annotated_dir(
            args.path_corpus, representation=args.context_representation)
    else:
        vocab_context = vocab

    vocab_ngram_tokens = None
    if args.subword != 'none':
        if args.path_vocab_ngram_tokens == '':
            vocab_ngram_tokens = create_ngram_tokens_from_dir(
                args.path_corpus, args.min_gram, args.max_gram)
        else:
            vocab_ngram_tokens = Vocabulary()
            vocab_ngram_tokens.load(args.path_vocab_ngram_tokens)

        if args.path_word2chars == '':
            word2chars = None
        else:
            word2chars = get_word2chars(args.path_word2chars)

    loss_func = get_loss_func(args, vocab_context)
    model = get_model(args, loss_func, vocab, vocab_ngram_tokens,
                      current_utils)

    if args.gpu >= 0:
        model.to_gpu()
        logger.debug("model sent to gpu")

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    if os.path.isfile(args.path_corpus):
        # todo for file corpus
        pass
    else:
        if args.subword == 'none':
            train_iter = current_utils.DirWindowIterator(
                path=args.path_corpus,
                vocab=vocab,
                window_size=args.window,
                batch_size=args.batchsize,
                language=args.language)
        else:
            train_iter = current_utils.DirWindowIterator(
                path=args.path_corpus,
                vocab=vocab,
                vocab_ngram_tokens=vocab_ngram_tokens,
                word2chars=word2chars,
                window_size=args.window,
                batch_size=args.batchsize,
                language=args.language)
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=current_utils.convert,
                                       device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'),
                               out=args.path_out)

    if os.path.isfile(args.path_corpus):
        # todo for file corpus
        # trainer.extend(extensions.Evaluator(val_iter, model, converter=convert, device=args.gpu))
        # trainer.extend(extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
        pass
    else:
        trainer.extend(
            extensions.PrintReport(['epoch', 'main/loss', 'elapsed_time']))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.LogReport())
    trainer.run()
    model = create_model(args, model, vocab)
    time_end = timer()
    model.metadata["execution_time"] = time_end - time_start
    return model
Example #12
 def test_tokens_to_ids(self):
     vocab = Vocabulary()
     vocab.load(path_vocab)
     tokens = ["the", "apple"]
     ids = vocab.tokens_to_ids(tokens)
     print("ids:", ids)
Example #13
 def test_misc(self):
     vocab = Vocabulary()
     vocab.load(path_vocab)
     vocab.get_word_by_id(1)
     vocab.get_frequency("the")
     vocab.get_frequency("apple")
     vocab.lst_frequencies = []
     vocab.get_frequency("apple")
Example #14
 def test_filter(self):
     vocab = Vocabulary()
     vocab.load(path_vocab)
     vocab.filter_by_wordlist(["the"])
Example #15
class WordEmbeddingsDense(WordEmbeddings):
    """Stores dense embeddings.

    """
    def cmp_vectors(self, vec1, vec2):
        cos = normed(vec1) @ normed(vec2)
        if math.isnan(cos):
            return 0
        return (cos + 1) / 2

    def cmp_rows(self, id1, id2):
        vec1 = self.matrix[id1]
        vec2 = self.matrix[id2]
        return self.cmp_vectors(vec1, vec2)

    def cmp_words(self, word1, word2):
        id1 = self.vocabulary.get_id(word1)
        id2 = self.vocabulary.get_id(word2)
        if (id1 < 0) or (id2 < 0):
            return 0
        return self.cmp_rows(id1, id2)

    def save_matr_to_hdf5(self, path):
        file_out = tables.open_file(os.path.join(path, 'vectors.h5p'), 'w')
        atom = tables.Atom.from_dtype(self.matrix.dtype)
        ds = file_out.create_carray(file_out.root, 'vectors', atom,
                                    self.matrix.shape)
        ds[:] = self.matrix
        ds.flush()
        file_out.close()

    def load_hdf5(self, path):
        """loads embeddings from hdf5 format"""
        file_in = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
        self.matrix = file_in.root.vectors.read()
        self.vocabulary = Vocabulary()
        self.vocabulary.load(path)
        # self.name += os.path.basename(os.path.normpath(path))
        file_in.close()

    def load_npy(self, path):
        """loads embeddings from numpy format"""
        self.matrix = np.load(os.path.join(path, "vectors.npy"))
        # self.load_with_alpha(0.6)
        self.vocabulary = Vocabulary_simple()
        self.vocabulary.load(path)
        self.name += os.path.basename(os.path.normpath(path))

    def save_to_dir(self, path):
        os.makedirs(path, exist_ok=True)
        self.vocabulary.save_to_dir(path)
        # self.matrix.tofile(os.path.join(path,"vectors.bin"))
        # np.save(os.path.join(path, "vectors.npy"), self.matrix)
        self.save_matr_to_hdf5(path)
        save_json(self.metadata, os.path.join(path, "metadata.json"))

    def save_to_dir_plain_txt(self, path):
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, 'vectors.txt'), 'w') as output:
            for i, w in enumerate(self.vocabulary.lst_words):
                if len(w.strip()) == 0:
                    continue
                output.write(w + ' ')
                for j in range(self.matrix[i].shape[0]):
                    output.write(str(self.matrix[i][j]))
                    output.write(' ')
                output.write("\n")

    def load_with_alpha(self, path, power=0.6):
        f = tables.open_file(os.path.join(path, 'vectors.h5p'), 'r')
        #        left = np.nan_to_num(f.root.vectors.read())
        left = f.root.vectors.read()
        sigma = f.root.sigma.read()
        logger.info("loaded left singular vectors and sigma")
        sigma = np.power(sigma, power)
        self.matrix = np.dot(left, np.diag(sigma))
        logger.info("computed the product")
        self.metadata["pow_sigma"] = power
        self.metadata["size_dimensions"] = int(self.matrix.shape[1])
        f.close()
        self.vocabulary = Vocabulary_simple()
        self.vocabulary.load(path)
        self.name += os.path.basename(
            os.path.normpath(path)) + "_a" + str(power)

    def normalize(self):
        nrm = np.linalg.norm(self.matrix, axis=1)
        nrm[nrm == 0] = 1
        self.matrix /= nrm[:, np.newaxis]
        self._normalized_matrix = self.matrix
        self.metadata["normalized"] = True
        self.normalized = True

    def cache_normalized_copy(self):
        if getattr(self, 'normalized', False):
            self._normalized_matrix = self.matrix
        else:
            self._normalized_matrix = self.matrix.copy()
            self._normalized_matrix /= np.linalg.norm(self._normalized_matrix,
                                                      axis=1)[:, None]

    def load_from_text(self, path):
        i = 0
        # self.name+="_"+os.path.basename(os.path.normpath(path))
        self.vocabulary = Vocabulary()
        rows = []
        header = False
        vec_size = -1
        with detect_archive_format_and_open(path) as file_in:
            for line_number, line in enumerate(file_in):
                tokens = line.split()
                if i == 0 and len(tokens) == 2:
                    header = True
                    cnt_words = int(tokens[0])
                    vec_size = int(tokens[1])
                    continue
                # word = tokens[0].decode('ascii',errors="ignore")
                # word = tokens[0].decode('UTF-8', errors="ignore")
                word = tokens[0]
                self.vocabulary.dic_words_ids[word] = i
                self.vocabulary.lst_words.append(word)
                str_vec = tokens[1:]
                if vec_size == -1:
                    vec_size = len(str_vec)
                if vec_size != len(str_vec):
                    warning_message = "input error in line {}, expected tokens: {}, read tokens: {}, line: {}  ".format(
                        line_number, vec_size, len(str_vec), line)
                    warnings.warn(warning_message)
                    continue
                row = np.zeros(len(str_vec), dtype=np.float32)
                for j in range(len(str_vec)):
                    row[j] = float(str_vec[j])
                rows.append(row)
                i += 1
        # if header:
        #     assert cnt_words == len(rows)
        self.matrix = np.vstack(rows)
        if header:
            assert vec_size == self.matrix.shape[1]
        self.vocabulary.lst_frequencies = np.zeros(
            len(self.vocabulary.lst_words), dtype=np.int32)
        self.name = os.path.basename(os.path.dirname(os.path.normpath(path)))

    def _populate_from_source_and_wordlist(self, source, wordlist):
        self.metadata["class"] = "embeddings"
        self.metadata["source"] = source.metadata
        self.vocabulary = source.vocabulary.filter_by_wordlist(wordlist)
        self.metadata["vocabulary"] = self.vocabulary.metadata
        lst_new_vectors = []
        for w in self.vocabulary.lst_words:
            lst_new_vectors.append(source.get_vector(w))
        self.matrix = np.array(lst_new_vectors, dtype=np.float32)

    def filter_by_vocab(self, words):
        """reduced embeddings to the provided list of words

        Args:
            words: set or list of words to keep

        Returns:
            Instance of Dense class

        """
        if len(words) == 0:
            return self
        new_embds = WordEmbeddingsDense()
        new_embds._populate_from_source_and_wordlist(self, words)
        return new_embds

    def get_x_label(self, i):
        return i

    def viz_wordlist(self, wl, colored=False, show_legend=False):
        colors = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors
        cnt = 0
        for i in wl:
            row = self.get_vector(i)
            row = normed(row)
            if colored:
                plt.bar(range(0, len(row)),
                        row,
                        color=colors[cnt],
                        linewidth=0,
                        alpha=0.6,
                        label=i)
            else:
                plt.bar(range(0, len(row)),
                        row,
                        color="black",
                        linewidth=0,
                        alpha=1 / len(wl),
                        label=i)
            cnt += 1
        if show_legend:
            plt.legend()

    def get_most_similar_vectors(self, u, cnt=10):
        scores = np.zeros(self.matrix.shape[0], dtype=np.float32)
        if hasattr(self, "_normalized_matrix"):
            scores = normed(u) @ self._normalized_matrix.T
            scores = (scores + 1) / 2
        else:
            str_warn = "\n\tthis method executes slow if embeddings are not normalized."
            str_warn += "\n\tuse normalize() method to normalize your embeddings"
            str_warn += "\n\tif for whatever reasons you need your embeddings to be not normalized, you can use .cache_normalized_copy() method to cache normalized copy of embeddings"
            str_warn += "\n\tplease note that latter will consume additional memory\n"
            warnings.warn(str_warn, RuntimeWarning)
            for i in range(self.matrix.shape[0]):
                scores[i] = self.cmp_vectors(u, self.matrix[i])
        ids = np.argsort(scores)[::-1]
        ids = ids[:cnt]
        return zip(ids, scores[ids])

    def get_most_similar_words(self, w, cnt=10):
        """returns list of words sorted by cosine proximity to a target word

        Args:
            w: target word
            cnt: how many similar words are needed

        Returns:
            list of words and corresponding similarities
        """

        if isinstance(w, str):
            vec = self.matrix[self.vocabulary.get_id(w)]
        else:
            vec = w
        rows = self.get_most_similar_vectors(vec, cnt)
        results = []
        for i in rows:
            results.append([self.vocabulary.get_word_by_id(i[0]), i[1]])
        return results

    def get_vector(self, w):
        i = self.vocabulary.get_id(w)
        if i < 0:
            raise RuntimeError('word does not exist', w)
        row = self.matrix[i]
        return row

    def has_word(self, w):
        i = self.vocabulary.get_id(w)
        if i < 0:
            return False
        return True
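A hedged end-to-end sketch of WordEmbeddingsDense; it reuses the test directory from the snippets above, and the word choices are only illustrative:

from vecto.embeddings import load_from_dir

embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
embs.normalize()                                    # unit-length rows make similarity queries fast
print(embs.cmp_words("the", "apple"))               # cosine similarity mapped into [0, 1]
print(embs.get_most_similar_words("apple", cnt=5))  # [[word, score], ...]
small = embs.filter_by_vocab(["the", "apple"])      # new WordEmbeddingsDense keeping only these words
print(small.matrix.shape)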
Example #16
 def test_load_from_dir(self):
     vocab = Vocabulary()
     vocab.load(path_vocab)
     print("the:", vocab.get_id("the"))
     vocab.load(path_vocab_one)
     print("the:", vocab.get_id("the"))
Example #17
 def test_text_to_ids(self):
     v = Vocabulary()
     v.load(path_vocab)
     doc = load_path_as_ids(path_text_file, v)
     # assert doc.shape == (TEST_TEXT_LEN,)
     assert np.allclose(doc[:10], [0, 40, 0, 0, 0, 1, 0, 0, 0, 0])