def test_damerau_levenshtein_distance_ndarray(self):
        assert damerau_levenshtein_distance_ndarray(
            'Saturday', np.array(['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
        ).tolist() == [3, 5, 5, 6, 4, 5, 0]

        assert damerau_levenshtein_distance_ndarray(
            'Sjöstedt', np.array(['Sjöstedt', 'Sjostedt', 'Söstedt', 'Sjöedt'])
        ).tolist() == [0, 1, 1, 2]
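For reference, the distances asserted above can be reproduced with a short pure-Python sketch of the restricted Damerau-Levenshtein (optimal string alignment) distance. This is an illustrative reimplementation, not the library's Cython code; dl_distance is a hypothetical name.

import numpy as np

def dl_distance(a, b):
    # Optimal string alignment: Levenshtein edits plus adjacent transpositions.
    d = np.zeros((len(a) + 1, len(b) + 1), dtype=int)
    d[:, 0] = np.arange(len(a) + 1)
    d[0, :] = np.arange(len(b) + 1)
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            d[i, j] = min(d[i - 1, j] + 1,         # deletion
                          d[i, j - 1] + 1,         # insertion
                          d[i - 1, j - 1] + cost)  # substitution
            if (i > 1 and j > 1 and a[i - 1] == b[j - 2]
                    and a[i - 2] == b[j - 1]):
                d[i, j] = min(d[i, j], d[i - 2, j - 2] + 1)  # transposition
    return int(d[len(a), len(b)])

days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday']
print([dl_distance('Saturday', day) for day in days])
# [3, 5, 5, 6, 4, 5, 0]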
Example #2
    def compact_word_vectors(self, vocab, filename=None, array=None,
                             top=20000):
        """ Retrieve pretrained word spectors for our vocabulary.
        The returned word array has row indices corresponding to the
        compact index of a word, and columns correponding to the word
        vector.

        Arguments
        ---------
        vocab : dict
            Dictionary where keys are the loose index, and values are
            the word string.

        filename : str
            Filename of word2vec-format vectors to load via gensim.

        array : numpy float array, optional
            Preallocated array to fill with the word vectors instead of
            a freshly initialized one.

        top : int
            Only look up vectors for the first `top` compact indices.

        Returns
        -------
        data : numpy float array
            Array such that data[compact_index, :] = word_vector

        Examples
        --------
        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
        >>> word_indices[40:46] = 7  # 'cold' appears 6 times
        >>> word_indices[46:] = 3  # 'hot' appears 4 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> v, s, f = corpus.compact_word_vectors(vocab)
        >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
        >>> vocab[corpus.compact_to_loose[2]]
        'shuttle'
        >>> vocab[corpus.compact_to_loose[3]]
        'astronomy'
        >>> vocab[corpus.compact_to_loose[4]]
        'cold'
        >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
        >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
        >>> sim_shuttle_astro > sim_shuttle_cold
        True
        """
        n_words = len(self.compact_to_loose)
        from gensim.models.word2vec import Word2Vec
        model = Word2Vec.load_word2vec_format(filename, binary=True)
        n_dim = model.syn0.shape[1]
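        # Start from random vectors shifted and scaled to roughly match the
        # mean and spread of the pretrained embeddings.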
        data = np.random.normal(size=(n_words, n_dim)).astype('float32')
        data -= data.mean()
        data += model.syn0.mean()
        data /= data.std()
        data *= model.syn0.std()
        if array is not None:
            data = array
            n_words = data.shape[0]
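        # Precompute the candidate strings and their lengths for the
        # fuzzy-matching fallback below.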
        keys_raw = model.vocab.keys()
        keys = [s.encode('ascii', 'ignore') for s in keys_raw]
        lens = [len(s) for s in model.vocab.keys()]
        choices = np.array(keys, dtype='S')
        lengths = np.array(lens, dtype='int32')
        s, f = 0, 0
        rep0 = lambda w: w
        rep1 = lambda w: w.replace(' ', '_')
        rep2 = lambda w: w.title().replace(' ', '_')
        reps = [rep0, rep1, rep2]
        for compact in np.arange(top):
            loose = self.compact_to_loose.get(compact, None)
            if loose is None:
                continue
            word = vocab.get(loose, None)
            if word is None:
                continue
            word = word.strip()
            vector = None
            for rep in reps:
                clean = rep(word)
                if clean in model.vocab:
                    vector = model[clean]
                    break
            if vector is None:
                try:
                    word = unicode(word)
                    idx = lengths >= len(word) - 3
                    idx &= lengths <= len(word) + 3
                    sel = choices[idx]
                    d = damerau_levenshtein_distance_ndarray(word, sel)
                    choice = np.array(keys_raw)[idx][np.argmin(d)]
                    # choice = difflib.get_close_matches(word, choices)[0]
                    vector = model[choice]
                    print compact, word, ' --> ', choice
                except IndexError:
                    pass
            if vector is None:
                f += 1
                continue
            s += 1
            data[compact, :] = vector[:]
        return data, s, f
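The three clean-up rules in the method above (rep0, rep1, rep2) are plain string rewrites aimed at word2vec-style phrase tokens. A quick standalone illustration, using 'new york' as a made-up input:

rep0 = lambda w: w                            # exact token
rep1 = lambda w: w.replace(' ', '_')          # underscore-joined phrase
rep2 = lambda w: w.title().replace(' ', '_')  # title-cased phrase

print([rep('new york') for rep in (rep0, rep1, rep2)])
# ['new york', 'new_york', 'New_York']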
Example #3
    def compact_word_vectors(self,
                             vocab,
                             filename=None,
                             array=None,
                             top=20000):
        """ Retrieve pretrained word spectors for our vocabulary.
        The returned word array has row indices corresponding to the
        compact index of a word, and columns correponding to the word
        vector.

        Arguments
        ---------
        vocab : dict
            Dictionary where keys are the loose index, and values are
            the word string.

        filename : str
            Filename of word2vec-format vectors to load via gensim.

        array : numpy float array, optional
            Preallocated array to fill with the word vectors instead of
            a freshly initialized one.

        top : int
            Only look up vectors for the first `top` compact indices.

        Returns
        -------
        data : numpy float array
            Array such that data[compact_index, :] = word_vector

        Examples
        --------
        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
        >>> word_indices[40:46] = 7  # 'cold' appears 6 times
        >>> word_indices[46:] = 3  # 'hot' appears 4 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> v, s, f = corpus.compact_word_vectors(vocab)
        >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
        >>> vocab[corpus.compact_to_loose[2]]
        'shuttle'
        >>> vocab[corpus.compact_to_loose[3]]
        'astronomy'
        >>> vocab[corpus.compact_to_loose[4]]
        'cold'
        >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
        >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
        >>> sim_shuttle_astro > sim_shuttle_cold
        True
        """
        n_words = len(self.compact_to_loose)
        import gensim
        model = gensim.models.KeyedVectors.load_word2vec_format(filename,
                                                                binary=True)
        n_dim = model.syn0.shape[1]
        data = np.random.normal(size=(n_words, n_dim)).astype('float32')
        data -= data.mean()
        data += model.syn0.mean()
        data /= data.std()
        data *= model.syn0.std()
        if array is not None:
            data = array
            n_words = data.shape[0]
        keys_raw = model.vocab.keys()
        keys = [s.encode('ascii', 'ignore') for s in keys_raw]
        lens = [len(s) for s in model.vocab.keys()]
        choices = np.array(keys, dtype='S')
        lengths = np.array(lens, dtype='int32')
        s, f = 0, 0

        def rep0(w):
            return w

        def rep1(w):
            return w.replace(' ', '_')

        def rep2(w):
            return w.title().replace(' ', '_')

        reps = [rep0, rep1, rep2]
        for compact in np.arange(min(top, n_words)):
            loose = self.compact_to_loose.get(compact, None)
            if loose is None:
                continue
            word = vocab.get(loose, None)
            if word is None:
                continue
            word = word.strip()
            vector = None
            for rep in reps:
                clean = rep(word)
                if clean in model.vocab:
                    vector = model[clean]
                    break
            if vector is None:
                try:
                    word = unicode(word)
                    idx = lengths >= len(word) - 3
                    idx &= lengths <= len(word) + 3
                    sel = choices[idx]
                    d = damerau_levenshtein_distance_ndarray(word, sel)
                    choice = np.array(keys_raw)[idx][np.argmin(d)]
                    # choice = difflib.get_close_matches(word, choices)[0]
                    vector = model[choice]
                    print compact, word, ' --> ', choice
                except IndexError:
                    pass
            if vector is None:
                f += 1
                continue
            s += 1
            data[compact, :] = vector[:]
        return data, s, f
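The length filter used in the fallback branch is an ordinary numpy boolean mask. A minimal sketch with a made-up candidate list, showing how it narrows the search before any distances are computed:

import numpy as np

candidates = np.array(['sun', 'sunday', 'saturday', 'saturn', 'satellite'])
lengths = np.array([len(s) for s in candidates], dtype='int32')

word = 'satunday'  # hypothetical misspelling
idx = lengths >= len(word) - 3   # keep words no more than 3 chars shorter
idx &= lengths <= len(word) + 3  # ... and no more than 3 chars longer
print(candidates[idx])
# ['sunday' 'saturday' 'saturn' 'satellite']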
Example #4
    def compact_word_vectors(self,
                             vocab,
                             filename=None,
                             array=None,
                             top=20000):
        """ Retrieve pretrained word spectors for our vocabulary.
        The returned word array has row indices corresponding to the
        compact index of a word, and columns correponding to the word
        vector.
        This is called by data/preprocess.py to map our corpus vocabulary to
        GoogleNews-based vector data.

        Arguments
        ---------
        vocab : dict
            Dictionary where keys are the loose index, and values are
            the word string.

        filename : str
            Filename of word2vec-format vectors to load via gensim.

        array : numpy float array, optional
            Preallocated array to fill with the word vectors instead of
            a freshly initialized one.

        top : int
            Only look up vectors for the first `top` compact indices.

        Returns
        -------
        data : numpy float array
            Array such that data[compact_index, :] = word_vector

        Examples
        --------
        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
        >>> word_indices[40:46] = 7  # 'cold' appears 6 times
        >>> word_indices[46:] = 3  # 'hot' appears 4 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> v, s, f = corpus.compact_word_vectors(vocab)
        >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
        >>> vocab[corpus.compact_to_loose[2]]
        'shuttle'
        >>> vocab[corpus.compact_to_loose[3]]
        'astronomy'
        >>> vocab[corpus.compact_to_loose[4]]
        'cold'
        >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
        >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
        >>> sim_shuttle_astro > sim_shuttle_cold
        True
        """
        n_words = len(self.compact_to_loose)

        # Deprecated in gensim 3.3.0
        # --------------------
        # from gensim.models.word2vec import Word2Vec
        # model = Word2Vec.load_word2vec_format(filename, binary=True)
        from gensim.models import KeyedVectors
        # Load word2vec data in the C binary format.
        logger.info('Loading word2vec data from {}'.format(filename))
        model = KeyedVectors.load_word2vec_format(filename, binary=True)

        n_dim = model.syn0.shape[1]
        data = np.random.normal(size=(n_words, n_dim)).astype('float32')
        data -= data.mean()
        data += model.syn0.mean()
        data /= data.std()
        data *= model.syn0.std()
        if array is not None:
            data = array
            n_words = data.shape[0]

        # model.vocab is the vocabulary of the loaded GoogleNews word2vec
        # data. Extract the word strings of the GoogleNews vocabulary.
        keys_raw = model.vocab.keys()

        # keys = [s.encode('ascii', 'ignore') for s in keys_raw]
        keys = [s for s in keys_raw]

        # Extract the string length of each word in the GoogleNews vocabulary.
        lens = [len(s) for s in model.vocab.keys()]

        # choices = np.array(keys, dtype='S')
        choices = np.array(keys)
        lengths = np.array(lens, dtype='int32')
        s, f = 0, 0

        # Some clean-up rules
        rep0 = lambda w: w
        rep1 = lambda w: w.replace(' ', '_')
        rep2 = lambda w: w.title().replace(' ', '_')
        reps = [rep0, rep1, rep2]

        # We only look up the first `top` words in the corpus, which are
        # ordered by descending term frequency.
        for compact in tqdm(np.arange(top)):
            # No vectors are needed for the special tokens at compact
            # indices 0 and 1.
            if compact == 0 or compact == 1:
                s += 1
                f += 1
                data[compact, :] = None
                continue

            # Skip compact indices with no associated loose index, and loose
            # indices not associated with any word in our corpus vocabulary.
            # Normally this should not happen.
            loose = self.compact_to_loose.get(compact, None)
            if loose is None:
                print(
                    'ATTN: skipping compact index {}: no loose index found'
                    .format(compact))
                continue
            word = vocab.get(loose, None)
            if word is None:
                print(
                    'ATTN: skipping loose index {}: no word found in corpus vocab'
                    .format(loose))
                continue

            word = word.strip()
            vector = None
            # Try all clean-up rules to see if we can find the word in the
            # GoogleNews word2vec vocabulary.
            for rep in reps:
                clean = rep(word)
                # Note that model is the gensim model loaded from GoogleNews
                # word2vec data, and model.vocab is its vocabulary.
                # TODO: if we lemmatize when constructing our own vocabulary,
                # the lemmas may or may not appear in the GoogleNews
                # vocabulary; without access to lemma data for GoogleNews we
                # currently rely on string similarity to find a replacement
                # word.
                if clean in model.vocab:
                    vector = model[clean]
                    break

            # Could not find a word from our corpus vocabulary in the
            # GoogleNews vocabulary? This can happen for reasons such as
            # typos. To recover as much as possible, we search the GoogleNews
            # vocabulary for the most similar word (similarity is measured by
            # damerau_levenshtein_distance).
            if vector is None:
                # logger.info('No match {} in GoogleNews - look for the most similar'.format(word))
                try:
                    # not required in Python3
                    # word = unicode(word)

                    # Select all words in the GoogleNews vocabulary whose
                    # length is within [len(word) - 3, len(word) + 3].
                    idx = lengths >= len(word) - 3
                    idx &= lengths <= len(word) + 3
                    sel = choices[idx]

                    # calculate distance between our word and all selected words
                    # in the GoogleNews vocabulary.
                    # d = damerau_levenshtein_distance_withNPArray(word, sel)
                    # choice = np.array(keys_raw)[idx][np.argmin(d)]
                    d = damerau_levenshtein_distance_ndarray(word, sel)

                    # pick the nearest word
                    choice = np.array(keys)[idx][np.argmin(d)]

                    # choice = difflib.get_close_matches(word, choices)[0]
                    vector = model[choice]
                    print(compact, word, ' --> ', choice)
                except IndexError:
                    pass
            if vector is None:
                f += 1
                continue
            s += 1
            data[compact, :] = vector[:]
        return data, s, f
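Pulled out of the method, the whole fallback is a nearest-neighbour lookup under Damerau-Levenshtein distance. A self-contained sketch, assuming the pyxdameraulevenshtein package used throughout this page; nearest_vocab_word and the toy vocabulary are hypothetical stand-ins for the GoogleNews data:

import numpy as np
from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray

def nearest_vocab_word(word, vocab_words, window=3):
    """Return the vocabulary word closest to `word`, or None if no
    candidate falls inside the length window."""
    choices = np.array(vocab_words)
    lengths = np.array([len(w) for w in vocab_words], dtype='int32')
    # Only compare against words of roughly the same length.
    idx = (lengths >= len(word) - window) & (lengths <= len(word) + window)
    sel = choices[idx]
    if sel.size == 0:
        return None
    d = damerau_levenshtein_distance_ndarray(word, sel)
    return sel[np.argmin(d)]

print(nearest_vocab_word('austronomy', ['shuttle', 'astronomy', 'cold', 'hot']))
# astronomy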
Example #5
print("normalized_damerau_levenshtein_distance('{}', '{}') = {}".format(
    'gifts', 'profit',
    normalized_damerau_levenshtein_distance('gifts', 'profit')))
print(
    "normalized_damerau_levenshtein_distance('{}', '{}') = {}  # unicode example\n"
    .format('Sjöstedt', 'Sjostedt',
            normalized_damerau_levenshtein_distance(
                'Sjöstedt', 'Sjostedt')))  # unicode example

print('# edit distances for a single sequence against an array of sequences')
array = np.array([
    'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
    'Saturday'
])
print("damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}".format(
    'Saturday', array, damerau_levenshtein_distance_ndarray('Saturday',
                                                            array)))
print(
    "normalized_damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}\n"
    .format('Saturday', array,
            normalized_damerau_levenshtein_distance_ndarray('Saturday',
                                                            array)))

print(
    '# normalized edit distances for a single sequence against an array of sequences - unicode'
)
array = np.array(['Sjöstedt', 'Sjostedt', 'Söstedt', 'Sjöedt'])
print("damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}".format(
    'Sjöstedt', array, damerau_levenshtein_distance_ndarray('Sjöstedt',
                                                            array)))
print(
    "normalized_damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}\n"
    .format('Sjöstedt', array,
            normalized_damerau_levenshtein_distance_ndarray('Sjöstedt',
                                                            array)))