Esempio n. 1
0
    def analogy(self, pos, neg, n=10):
        """
        Rank vocabulary entries against an analogy query.

        Each query word is fetched with `self.get_vector`, normalised with
        `unitvec` and multiplied by +1.0 (words in `pos`) or -1.0 (words in
        `neg`). The signed vectors are averaged and the average is dotted
        with `self.l2norm` to obtain one score per row.

        Parameters
        ----------
        pos : list
            Words that pull the query towards them.
        neg : list
            Words that push the query away from them.
        n : int
            Controls how many ranked indices are kept (10, by default).

        Returns
        -------
        Whatever `self.generate_response` builds from the selected indices
        and the score array (presumably a list of (word, similarity)
        tuples -- confirm against `generate_response`).

        Example
        -------
            `king - man + woman = queen` will be:
            `pos=['king', 'woman'], neg=['man']`

        NOTE(review): the slice below starts at 1, so the single best
        scoring index is dropped and at most `n + len(pos + neg) - 2`
        indices are forwarded -- confirm this is intended.
        """
        query_size = len(pos + neg)

        signed_words = [(term, 1.0) for term in pos] + \
                       [(term, -1.0) for term in neg]
        signed_vectors = [sign * unitvec(self.get_vector(term))
                          for term, sign in signed_words]
        centroid = np.array(signed_vectors).mean(axis=0)

        scores = np.dot(self.l2norm, centroid)
        descending = scores.argsort()[::-1]
        selected = descending[1:n + query_size - 1]
        return self.generate_response(selected, scores)
Esempio n. 2
0
    def from_text(cls, fname, vocabUnicodeSize=78, desired_vocab=None,
                  encoding='ISO-8859-1'):
        """
        Create a WordVectors class based on a word2vec text file

        The first line of the file holds `vocab_size vector_size`; every
        following line holds a word and its vector components separated by
        single spaces.

        Parameters
        ----------
        fname : path to file
        vocabUnicodeSize: the maximum string length (78, by default)
        desired_vocab: if set, this will ignore any word and vector that
                       doesn't fall inside desired_vocab.
        encoding: codec used to decode every line ('ISO-8859-1', by default)

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)
        """
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled instead of testing `vocab != ''`.
            loaded = np.zeros(vocab_size, dtype=bool)
            for i, line in enumerate(fin):
                # Split on a plain space only: bytes decoded as Latin-1 can
                # contain characters that str.split() treats as whitespace.
                parts = line.decode(encoding).strip().split(' ')
                word = parts[0]
                if not word:
                    # blank line (e.g. trailing newline at the end of file)
                    continue
                if desired_vocab is not None and word not in desired_vocab:
                    continue
                vectors[i] = unitvec(np.array(parts[1:], dtype=float))
                vocab[i] = word
                loaded[i] = True

        # Only copy when something was actually left out; a full load keeps
        # the original arrays and avoids doubling memory.
        if not loaded.all():
            vectors = vectors[loaded]
            vocab = vocab[loaded]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 3
0
    def __init__(self, vocab, vectors=None, l2norm=None, save_memory=True):
        """
        Initialize a WordVectors class based on vocabulary and vectors

        When `l2norm` is not supplied it is computed by applying `unitvec`
        to every row of `vectors`.

        Parameters
        ----------
        vocab : np.array
            1d array with the vocabulary
        vectors : np.array
            2d array with the vectors calculated by word2vec
        l2norm : np.array
            2d array with the calculated l2norm of the vectors; when given
            it is stored as is and `vectors` is not used
        save_memory : boolean
            when False (and `l2norm` is not given) the original vectors are
            also kept in `self.vectors`

        Raises
        ------
        ValueError
            if neither `vectors` nor `l2norm` is provided
        """
        if vectors is None and l2norm is None:
            raise ValueError('Need vectors OR l2norm arguments')

        self.vocab = vocab

        if l2norm is None:
            if not save_memory:
                self.vectors = vectors
            # np.vstack requires a real sequence; passing a generator is
            # rejected by current NumPy releases.
            self.l2norm = np.vstack([unitvec(vec) for vec in vectors])
        else:
            self.l2norm = l2norm
Esempio n. 4
0
    def analogy(self, pos, neg, n=10):
        '''
        Analogy similarity.

        The query vector is the mean of the unit vectors of the `pos` words
        and the negated unit vectors of the `neg` words. Every row of
        `self.l2norm` is scored against it with a dot product, and the
        query words themselves are removed from the answer.

        Parameters
        ----------
        pos : list
            Words added to the query.
        neg : list
            Words subtracted from the query.
        n : int
            Maximum number of results to return (10, by default).

        Returns
        -------
        List of at most `n` tuples (word, similarity), best match first.

        Example
        -------
            king - man + woman = queen | will be:
            pos=['king', 'woman'], neg=['man']
        '''
        exclude = pos + neg

        signed = [(word, 1.0) for word in pos] + \
                 [(word, -1.0) for word in neg]
        mean = np.array([direction * unitvec(self.get_vector(word))
                         for word, direction in signed]).mean(axis=0)

        similarities = np.dot(self.l2norm, mean)
        # Every query word may sit among the top hits, so fetch enough
        # candidates that n results remain once they are filtered out.
        best = np.argsort(similarities)[::-1][:n + len(exclude)]
        hits = [(_word, sim)
                for _word, sim in zip(self.vocab[best], similarities[best])
                if _word not in exclude]
        # Fewer query words than expected in the top hits would otherwise
        # leave more than n entries.
        return hits[:n]
Esempio n. 5
0
    def from_binary(
            cls,
            fname,
            vocab_unicode_size=78,
            desired_vocab=None,
            encoding="utf-8",
            new_lines=True,
    ):
        """
        Create a WordVectors class based on a word2vec binary file

        The file starts with a text line `vocab_size vector_size`; each
        record is then a space-terminated word followed by `vector_size`
        float32 values.

        Parameters
        ----------
        fname : path to file
        vocab_unicode_size: the maximum string length (78, by default)
        desired_vocab: if set any words that don't fall into this vocab will
                       be dropped; entries may be text or raw bytes
        encoding: codec used to decode each word ("utf-8", by default)
        new_lines: if true, one extra byte (the newline separating records)
                   is read and discarded after every vector

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

        Raises
        ------
        ValueError
            if the file ends before `vocab_size` records have been read
        """
        with open(fname, "rb") as fin:
            # The first line has the vocab_size and the vector_size as text
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype="<U%s" % vocab_unicode_size)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled instead of testing `vocab != ""`.
            kept = np.zeros(vocab_size, dtype=bool)
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for i in range(vocab_size):
                # read word (bytes up to the next space)
                chars = []
                while True:
                    ch = fin.read(1)
                    if not ch:
                        # read() returns b"" at EOF; without this check a
                        # truncated file would loop forever.
                        raise ValueError(
                            "unexpected end of file while reading word %d of %d"
                            % (i + 1, vocab_size))
                    if ch == b" ":
                        break
                    chars.append(ch)
                word = b"".join(chars)

                # read vector
                payload = fin.read(binary_len)
                if len(payload) != binary_len:
                    raise ValueError(
                        "unexpected end of file while reading vector %d of %d"
                        % (i + 1, vocab_size))
                if new_lines:
                    fin.read(1)    # newline char

                if desired_vocab is not None:
                    # Match on the decoded text (what callers normally
                    # pass); raw bytes are still accepted as before.
                    wanted = (word.decode(encoding, "replace") in desired_vocab
                              or word in desired_vocab)
                    if not wanted:
                        continue

                vocab[i] = word.decode(encoding)
                # frombuffer gives a read-only view; copy so unitvec receives
                # a writable array like np.fromstring used to return.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

        # Only copy when something was actually left out.
        if not kept.all():
            vectors = vectors[kept]
            vocab = vocab[kept]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 6
0
    def from_binary(cls,
                    fname,
                    vocabUnicodeSize=78,
                    desired_vocab=None,
                    encoding="utf-8"):
        """
        Create a WordVectors class based on a word2vec binary file

        The file starts with a text line `vocab_size vector_size`; each
        record is then a space-terminated word, `vector_size` float32
        values and one trailing byte (newline).

        Parameters
        ----------
        fname : path to file
        vocabUnicodeSize: the maximum string length (78, by default)
        desired_vocab: if set, this will ignore any word and vector that
                       doesn't fall inside desired_vocab; entries may be
                       text or raw bytes.
        encoding: codec used to decode each word ("utf-8", by default)

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

        Raises
        ------
        ValueError
            if the file ends before `vocab_size` records have been read
        """
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled instead of testing `vocab != ''`.
            kept = np.zeros(vocab_size, dtype=bool)
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for i in range(vocab_size):
                # read word (bytes up to the next space)
                chars = []
                while True:
                    ch = fin.read(1)
                    if not ch:
                        # read() returns b'' at EOF; without this check a
                        # truncated file would loop forever.
                        raise ValueError(
                            'unexpected end of file while reading word %d of %d'
                            % (i + 1, vocab_size))
                    if ch == b' ':
                        break
                    chars.append(ch)
                word = b''.join(chars)

                # read vector
                payload = fin.read(binary_len)
                if len(payload) != binary_len:
                    raise ValueError(
                        'unexpected end of file while reading vector %d of %d'
                        % (i + 1, vocab_size))
                fin.read(1)  # newline

                if desired_vocab is not None:
                    # Match on the decoded text (what callers normally
                    # pass); raw bytes are still accepted as before.
                    wanted = (word.decode(encoding, 'replace') in desired_vocab
                              or word in desired_vocab)
                    if not wanted:
                        continue

                vocab[i] = word.decode(encoding)
                # frombuffer gives a read-only view; copy so unitvec receives
                # a writable array like np.fromstring used to return.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

        # Only copy when something was actually left out.
        if not kept.all():
            vectors = vectors[kept]
            vocab = vocab[kept]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 7
0
    def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None):
        """
        Create a WordVectors class based on a word2vec binary file

        a version that can fit for utf8 text: every word is collected as
        raw bytes and decoded as a whole, because one UTF-8 character may
        span several bytes.

        Parameters
        ----------
        fname : path to file
        vocabUnicodeSize: the maximum string length (78, by default)
        desired_vocab: if set, this will ignore any word and vector that
                       doesn't fall inside desired_vocab; entries may be
                       text or raw bytes.

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

        Raises
        ------
        ValueError
            if the file ends before `vocab_size` records have been read
        """
        # The payload is binary, so the file must be opened in 'rb' mode;
        # bytes literals and range() below run on both Python 2.7 and 3.
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled instead of testing `vocab != u''`.
            kept = np.zeros(vocab_size, dtype=bool)
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for i in range(vocab_size):
                # read word (bytes up to the next space)
                chars = []
                while True:
                    ch = fin.read(1)
                    if not ch:
                        # read() returns an empty value at EOF; without this
                        # check a truncated file would loop forever.
                        raise ValueError(
                            'unexpected end of file while reading word %d of %d'
                            % (i + 1, vocab_size))
                    if ch == b' ':
                        break
                    chars.append(ch)
                word = b''.join(chars)

                # read vector
                payload = fin.read(binary_len)
                if len(payload) != binary_len:
                    raise ValueError(
                        'unexpected end of file while reading vector %d of %d'
                        % (i + 1, vocab_size))
                fin.read(1)  # newline

                if desired_vocab is not None:
                    # Match on the decoded text; raw bytes are still
                    # accepted as before.
                    wanted = (word.decode('utf-8', 'replace') in desired_vocab
                              or word in desired_vocab)
                    if not wanted:
                        continue

                vocab[i] = word.decode('utf-8')  # decode the whole word
                # frombuffer gives a read-only view; copy so unitvec receives
                # a writable array like np.fromstring used to return.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

        # Only copy when something was actually left out.
        if not kept.all():
            vectors = vectors[kept]
            vocab = vocab[kept]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 8
0
    def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None):
        """
        Create a WordVectors class based on a word2vec binary file

        a version that can fit for utf8 text: every word is collected as
        raw bytes and decoded as a whole, because one UTF-8 character may
        span several bytes.

        Parameters
        ----------
        fname : path to file
        vocabUnicodeSize: the maximum string length (78, by default)
        desired_vocab: if set, this will ignore any word and vector that
                       doesn't fall inside desired_vocab; entries may be
                       text or raw bytes.

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

        Raises
        ------
        ValueError
            if the file ends before `vocab_size` records have been read
        """
        # The payload is binary, so the file must be opened in 'rb' mode;
        # bytes literals and range() below run on both Python 2.7 and 3.
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled instead of testing `vocab != u''`.
            filled = np.zeros(vocab_size, dtype=bool)
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for i in range(vocab_size):
                # read word (bytes up to the next space)
                pieces = []
                while True:
                    ch = fin.read(1)
                    if not ch:
                        # read() returns an empty value at EOF; without this
                        # check a truncated file would loop forever.
                        raise ValueError(
                            'unexpected end of file while reading word %d of %d'
                            % (i + 1, vocab_size))
                    if ch == b' ':
                        break
                    pieces.append(ch)
                word = b''.join(pieces)

                # read vector
                raw_vector = fin.read(binary_len)
                if len(raw_vector) != binary_len:
                    raise ValueError(
                        'unexpected end of file while reading vector %d of %d'
                        % (i + 1, vocab_size))
                fin.read(1)  # newline

                if desired_vocab is not None:
                    # Match on the decoded text; raw bytes are still
                    # accepted as before.
                    wanted = (word.decode('utf-8', 'replace') in desired_vocab
                              or word in desired_vocab)
                    if not wanted:
                        continue

                vocab[i] = word.decode('utf-8')  # decode the whole word
                # frombuffer gives a read-only view; copy so unitvec receives
                # a writable array like np.fromstring used to return.
                vector = np.frombuffer(raw_vector, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                filled[i] = True

        # Only copy when something was actually left out.
        if not filled.all():
            vectors = vectors[filled]
            vocab = vocab[filled]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 9
0
    def from_binary(cls, fname, vocabUnicodeSize=78, desired_vocab=None, encoding="utf-8", newLines=True):
        """
        Create a WordVectors class based on a word2vec binary file

        The file starts with a text line `vocab_size vector_size`; each
        record is then a space-terminated word followed by `vector_size`
        float32 values.

        Parameters
        ----------
        fname : path to file
        vocabUnicodeSize: the maximum string length (78, by default)
        desired_vocab: if set, this will ignore any word and vector that
                       doesn't fall inside desired_vocab; entries may be
                       text or raw bytes.
        encoding: codec used to decode each word ("utf-8", by default)
        newLines: if true, one extra byte (the newline separating records)
                  is read and discarded after every vector

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)

        Raises
        ------
        ValueError
            if the file ends before `vocab_size` records have been read
        """
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled instead of testing `vocab != ''`.
            kept = np.zeros(vocab_size, dtype=bool)
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for i in range(vocab_size):
                # read word (bytes up to the next space)
                chars = []
                while True:
                    ch = fin.read(1)
                    if not ch:
                        # read() returns b'' at EOF; without this check a
                        # truncated file would loop forever.
                        raise ValueError(
                            'unexpected end of file while reading word %d of %d'
                            % (i + 1, vocab_size))
                    if ch == b' ':
                        break
                    chars.append(ch)
                word = b''.join(chars)

                # read vector
                payload = fin.read(binary_len)
                if len(payload) != binary_len:
                    raise ValueError(
                        'unexpected end of file while reading vector %d of %d'
                        % (i + 1, vocab_size))
                if newLines:
                    fin.read(1)  # newline

                if desired_vocab is not None:
                    # Match on the decoded text (what callers normally
                    # pass); raw bytes are still accepted as before.
                    wanted = (word.decode(encoding, 'replace') in desired_vocab
                              or word in desired_vocab)
                    if not wanted:
                        continue

                vocab[i] = word.decode(encoding)
                # frombuffer gives a read-only view; copy so unitvec receives
                # a writable array like np.fromstring used to return.
                vector = np.frombuffer(payload, dtype=np.float32).copy()
                vectors[i] = unitvec(vector)
                kept[i] = True

        # Only copy when something was actually left out.
        if not kept.all():
            vectors = vectors[kept]
            vocab = vocab[kept]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 10
0
    def from_text(cls,
                  fname,
                  vocabUnicodeSize=78,
                  desired_vocab=None,
                  encoding="utf-8"):
        """
        Create a WordVectors class based on a word2vec text file

        The first line of the file holds `vocab_size vector_size`; every
        following line holds a word and its vector components separated by
        single spaces. Loading is best-effort: lines that cannot be decoded
        or parsed are skipped and do not appear in the result.

        Parameters
        ----------
        fname : path to file
        vocabUnicodeSize: the maximum string length (78, by default)
        desired_vocab: if set, this will ignore any word and vector that
                       doesn't fall inside desired_vocab.
        encoding: codec used to decode every line ("utf-8", by default)

        Returns
        -------
        WordVectors instance (the result of `cls(vocab=..., vectors=...)`)
        """
        with open(fname, 'rb') as fin:
            header = fin.readline()
            vocab_size, vector_size = map(int, header.split())

            vocab = np.empty(vocab_size, dtype='<U%s' % vocabUnicodeSize)
            # `np.float` was removed from NumPy; the builtin is the same dtype.
            vectors = np.empty((vocab_size, vector_size), dtype=float)
            # np.empty leaves rows uninitialised, so track explicitly which
            # rows were filled; skipped lines must not leak garbage rows.
            loaded = np.zeros(vocab_size, dtype=bool)
            for i, line in enumerate(fin):
                try:
                    parts = line.decode(encoding).strip().split(' ')
                    word = parts[0]
                    if desired_vocab is not None and word not in desired_vocab:
                        continue
                    # Parse before storing so a bad line never leaves a word
                    # paired with an unset vector.
                    vectors[i] = unitvec(np.array(parts[1:], dtype=float))
                    vocab[i] = word
                except (ValueError, IndexError, ArithmeticError):
                    # Deliberate best-effort: undecodable bytes, non-numeric
                    # or wrong-length vectors and surplus lines are skipped,
                    # while KeyboardInterrupt and programming errors still
                    # propagate.
                    continue
                else:
                    loaded[i] = True

        # Only copy when something was actually left out.
        if not loaded.all():
            vectors = vectors[loaded]
            vocab = vocab[loaded]
        return cls(vocab=vocab, vectors=vectors)
Esempio n. 11
0
 def __init__(self, vocab=None, vectors=None, saveMemory=True):
     """
     Store the vocabulary and precompute the normalised vectors.

     Parameters
     ----------
     vocab :
         Vocabulary, stored unchanged in `self.vocab`.
     vectors :
         Iterable of vectors; each one is passed through `unitvec` and the
         results are stacked row-wise into `self.l2norm`. Required despite
         the `None` default.
     saveMemory : boolean
         When False the original `vectors` are also kept in `self.vectors`.

     Raises
     ------
     TypeError
         if `vectors` is None.
     """
     if vectors is None:
         # Iterating None used to fail with an opaque TypeError; keep the
         # exception type but say what is wrong.
         raise TypeError('vectors is required to compute l2norm')
     self.vocab = vocab
     if not saveMemory:
         self.vectors = vectors
     # np.vstack requires a real sequence; passing a generator is rejected
     # by current NumPy releases.
     self.l2norm = np.vstack([unitvec(vec) for vec in vectors])
Esempio n. 12
0
 def __init__(self, vocab=None, vectors=None, saveMemory=True):
     """
     Store the vocabulary and precompute the normalised vectors.

     Parameters
     ----------
     vocab :
         Vocabulary, stored unchanged in `self.vocab`.
     vectors :
         Iterable of vectors; each one is passed through `unitvec` and the
         results are stacked row-wise into `self.l2norm`. Required despite
         the `None` default.
     saveMemory : boolean
         When False the original `vectors` are also kept in `self.vectors`.

     Raises
     ------
     TypeError
         if `vectors` is None.
     """
     if vectors is None:
         # Iterating None used to fail with an opaque TypeError; keep the
         # exception type but say what is wrong.
         raise TypeError('vectors is required to compute l2norm')
     self.vocab = vocab
     if not saveMemory:
         self.vectors = vectors
     # np.vstack requires a real sequence; passing a generator is rejected
     # by current NumPy releases.
     normalised_rows = [unitvec(vec) for vec in vectors]
     self.l2norm = np.vstack(normalised_rows)