Ejemplo n.º 1
0
Archivo: ex9.py Proyecto: mat-hek/pjn
def rem_add(x, rem, add, wv: KeyedVectors):
    """Query `wv` with the vector of `x`, minus that of `rem`, plus that of `add`.

    Each of `x`, `rem` and `add` is passed through `parse` before being used
    as a key into `wv`. The combined vector is handed to
    `wv.similar_by_vector` with `topn=5`, and that call's result is returned.
    """
    # Built step by step with non-in-place operators so the vectors stored
    # in `wv` are never modified.
    target = wv[parse(x)]
    target = target - wv[parse(rem)]
    target = target + wv[parse(add)]
    return wv.similar_by_vector(target, topn=5)
Ejemplo n.º 2
0
class VectorSpaceModel(object):

    """Base class for models that represent words as vectors.

    For now, this really is just a wrapper around the Gensim KeyedVectors / Word2Vec class.

    """

    def __init__(self, name=None):
        """Create an empty model with an optional display `name`."""
        # Label for the model; shown by __repr__.
        self.name = name
        # Underlying vector store; load_w2v() replaces it with the loaded vectors.
        self.m = KeyedVectors()

    @classmethod
    def load(cls, filename, modelname=None, **kwargs):
        """Load a model from `filename`, choosing the loader by extension.

        Paths ending in '.pkl' go to `load_pickle`; everything else goes to
        `load_w2v`. `modelname` and any extra keyword arguments are forwarded
        to the chosen loader.
        """
        if filename.endswith('.pkl'):
            return cls.load_pickle(filename, modelname=modelname, **kwargs)
        return cls.load_w2v(filename, modelname=modelname, **kwargs)

    @classmethod
    def load_pickle(cls, filename, **kwargs):
        """Return the object unpickled from `filename`.

        `filename` may be a path or an already-open binary file object.
        Extra keyword arguments (including `modelname`) are accepted for
        signature compatibility with `load` but are not used.

        NOTE: unpickling can execute arbitrary code; only load pickle files
        that come from a trusted source.
        """
        debug("Loading pickled model from file {:}".format(filename))
        # pickle.load needs a readable binary file object, not a path string.
        if hasattr(filename, 'read'):
            return pickle.load(filename)
        with open(filename, 'rb') as handle:
            return pickle.load(handle)

    @classmethod
    def load_w2v(cls, filename, modelname=None, **kwargs):
        """Load the model from a word2vec-format file on disk.

        Files ending in '.bin' are read with `binary=True`; all others use
        the loader's default. When `modelname` is None the name defaults to
        the file's base name with a trailing '.bin' removed.
        """
        debug("Loading word2vec model from file {:}".format(filename))
        if filename.endswith(".bin"):
            vectors = KeyedVectors.load_word2vec_format(filename, binary=True)
        else:
            vectors = KeyedVectors.load_word2vec_format(filename)
        model = cls()
        model.m = vectors
        if modelname is None:
            # Escape the dot and anchor at the end so only a real '.bin'
            # suffix is stripped (the unanchored '.bin' pattern also ate
            # e.g. the 'abin' in 'cabin.bin').
            modelname = re.sub(r'\.bin$', '', os.path.basename(filename))
        model.name = modelname
        return model

    def save_pickle(self, filename):
        """Pickle this model to `filename` (a path or an open binary file object)."""
        debug("Saving model {:} to pickle file {:}".format(self.name, filename))
        # pickle.dump needs a writable binary file object, not a path string.
        if hasattr(filename, 'write'):
            pickle.dump(self, filename)
            return
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle)

    def __getitem__(self, word):
        """Return whatever the wrapped vector store holds under `word`."""
        return self.m[word]

    def most_similar(self, query, k=5):
        """Return the most similar words to the query. `query` can be either a string or a
        vector. If it is a string, then its vector will be looked up in the current VSM.
        At most `k` results are requested from the underlying model.
        """
        # isinstance (rather than an exact type check) also routes str
        # subclasses through the word lookup.
        if isinstance(query, str):
            return self.m.most_similar(query, topn=k)
        return self.m.similar_by_vector(query, topn=k)

    def __repr__(self):
        """Return a short description: the model name and its vector count."""
        # NOTE(review): `syn0` looks like the legacy gensim attribute name for
        # the vector matrix; confirm it exists in the gensim version in use.
        return "<VectorSpaceModel {:} with {:,} vectors>".format(repr(self.name), self.m.syn0.shape[0])