Example #1
    def __init__(self, fname, index_fname=None):
        """
        Initialize this abstract base class, by loading a previously saved index
        from `index_fname` (or `fname.index` if `index_fname` is not set).
        This index will allow subclasses to support the `corpus[docno]` syntax
        (random access to document #`docno` in O(1)).

        >>> # save corpus in SvmLightCorpus format with an index
        >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
        >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
        >>> # load back as a document stream (*not* plain Python list)
        >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
        >>> print(corpus_with_random_access[1])
        [(0, 1.0), (1, 2.0)]

        """
        try:
            if index_fname is None:
                index_fname = utils.smart_extension(fname, '.index')
            self.index = utils.unpickle(index_fname)
            # change self.index into a numpy.ndarray to support fancy indexing
            self.index = numpy.asarray(self.index)
            logger.info("loaded corpus index from %s" % index_fname)
        except:
            self.index = None
        self.length = None
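The index loaded above is simply a sequence of byte offsets, one per document. A minimal sketch of how a subclass can use it to back the `corpus[docno]` syntax (the `docbyoffset` hook mirrors gensim's IndexedCorpus contract; treat the exact method name as an assumption here):

    def __getitem__(self, docno):
        # Sketch: translate the document number into a byte offset via the
        # preloaded index, then let the subclass parse the document found there.
        if self.index is None:
            raise RuntimeError("cannot use corpus[docno] without a loaded index")
        return self.docbyoffset(self.index[docno])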
Example #2
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """
        Load the word vectors into a matrix from the varembed output vector files.
        Using morphemes requires Python 2.7 or above.

        'vectors' is the pickle file containing the word vectors.
        'morfessor_model' is the path to the trained morfessor model;
        morpheme embeddings are added to the output only when it is provided.
        """
        result = cls()
        if vectors is None:
            raise Exception("Please provide vectors binary to load varembed model")
        d = utils.unpickle(vectors)
        word_to_ix = d['word_to_ix']
        morpho_to_ix = d['morpho_to_ix']
        word_embeddings = d['word_embeddings']
        morpho_embeddings = d['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if morfessor_model:
            try:
                import morfessor
                morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
                result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
            except ImportError:
                # Morfessor Package not found.
                logger.error('Could not import morfessor. Not using morpheme embeddings')
                raise ImportError('Could not import morfessor.')

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result
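A hedged usage sketch for the loader above (file names are placeholders, and it assumes a gensim version that still ships the VarEmbed wrapper under `gensim.models.wrappers`):

from gensim.models.wrappers import VarEmbed

# Placeholder paths: output of varembed training plus an optional trained morfessor model.
model = VarEmbed.load_varembed_format(vectors='varembed_vectors.pkl',
                                      morfessor_model='morfessor.bin')
print(model['walking'])  # look up the embedding for a word present in the vocabulary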
Example #3
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LdaModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)

        # check if `random_state` attribute has been set after main pickle load
        # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
        # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value
        if not hasattr(result, 'random_state'):
            result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
            logging.warning("random_state not set so using default value")

        state_fname = utils.smart_extension(fname, '.state')
        try:
            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)

        id2word_fname = utils.smart_extension(fname, '.id2word')
        # check if `id2word_fname` file is present on disk
        # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file
        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load
        if os.path.isfile(id2word_fname):
            try:
                result.id2word = utils.unpickle(id2word_fname)
            except Exception as e:
                logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
        return result
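For context, a short save/load round trip showing where the `mmap='r'` flag comes in (paths are placeholders; whether large arrays end up in separate `.npy` files depends on their size):

from gensim.models import LdaModel
from gensim.test.utils import common_corpus, common_dictionary

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)
lda.save('lda.model')                        # may also write lda.model.state, lda.model.id2word, ...
lda2 = LdaModel.load('lda.model', mmap='r')  # large arrays come back memory-mapped, read-only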
Example #4
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix +
                                                '_similarity.index',
                                                mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
def load_model(model, topn, positive=[], negative=[]):
    if model in ('glove', 'ppmi', 'svd'):
        model = utils.unpickle('./model/{}.model'.format(model))
        return most_similar_dist(model, positive=positive, negative=negative, topn=topn)
    else:
        model = Word2Vec.load('./model/{}.model'.format(model))
        return model.most_similar(positive=positive, negative=negative, topn=topn)
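A hedged usage sketch for `load_model` (it assumes the pre-trained model files exist under `./model/` and that `most_similar_dist` is defined elsewhere in the same module):

# Query a trained Word2Vec model stored as ./model/word2vec.model
print(load_model('word2vec', topn=5, positive=['king'], negative=['man']))

# The 'glove', 'ppmi' and 'svd' variants are unpickled and queried via most_similar_dist instead
print(load_model('svd', topn=5, positive=['king']))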
Example #6
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LdaModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)
        state_fname = utils.smart_extension(fname, '.state')
        try:
            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)
        id2word_fname = utils.smart_extension(fname, '.id2word')
        if os.path.isfile(id2word_fname):
            try:
                result.id2word = utils.unpickle(id2word_fname)
            except Exception as e:
                logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
        else:
            result.id2word = None
        return result
def main():
    parser = argparse.ArgumentParser(
        description=
        'converts a given .metadata.cpickle file (such as generated by gensim MmCorpus.serialize(..., metadata=True)) to a pickled frozenset of contained pageids',
        epilog=
        'Example: ./{} --metadata=enwiki-metadata.cpickle.bz2 --pageids=enwiki-pageids.cpickle.bz2'
        .format(sys.argv[0]))
    parser.add_argument(
        '--metadata',
        type=argparse.FileType('r'),
        help='path to input binary metadata file (.cpickle/.cpickle.bz2)',
        required=True)
    parser.add_argument(
        '--pageids',
        type=argparse.FileType('w'),
        help=
        'path to output binary frozenset of pageids file (.cpickle/.cpickle.bz2)',
        required=True)

    args = parser.parse_args()
    input_metadata_path = args.metadata.name
    output_pageids_path = args.pageids.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_metadata_path': input_metadata_path,
            'output_pageids_path': output_pageids_path
        })))

    metadata = unpickle(input_metadata_path)
    logger.debug('unpickled {}'.format(metadata))
    pageids = frozenset(int(md[0]) for md in metadata.values())
    logger.info('extracted {} pageids'.format(len(pageids)))
    logger.debug('created set {}'.format(pageids))
    pickle(pageids, output_pageids_path)
Example #8
    def __init__(self, fname, index_fname=None):
        """
        Initialize this abstract base class, by loading a previously saved index
        from `index_fname` (or `fname.index` if `index_fname` is not set).
        This index will allow subclasses to support the `corpus[docno]` syntax
        (random access to document #`docno` in O(1)).

        >>> # save corpus in SvmLightCorpus format with an index
        >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
        >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
        >>> # load back as a document stream (*not* plain Python list)
        >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
        >>> print(corpus_with_random_access[1])
        [(0, 1.0), (1, 2.0)]

        """
        try:
            if index_fname is None:
                index_fname = utils.smart_extension(fname, '.index')
            self.index = utils.unpickle(index_fname)
            # change self.index into a numpy.ndarray to support fancy indexing
            self.index = numpy.asarray(self.index)
            logger.info("loaded corpus index from %s", index_fname)
        except Exception:
            self.index = None
        self.length = None
Example #9
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LdaModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)
        state_fname = utils.smart_extension(fname, '.state')
        try:
            result.state = super(LdaModel, cls).load(state_fname, *args,
                                                     **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)
        id2word_fname = utils.smart_extension(fname, '.id2word')
        if os.path.isfile(id2word_fname):
            try:
                result.id2word = utils.unpickle(id2word_fname)
            except Exception as e:
                logging.warning(
                    "failed to load id2word dictionary from %s: %s",
                    id2word_fname, e)
        else:
            result.id2word = None
        return result
Example #10
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     result.index = numpy.load(fname + '.npy', mmap_mode='r') # load back as read-only
     return result
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.info("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     result.corpus = np.load(fname + '.npy', mmap_mode='r')  # load back as read-only
     return result
     #endclass EsaModel
Example #12
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     result.index = numpy.load(fname + '.npy',
                               mmap_mode='r')  # load back as read-only
     return result
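The loaders above all expect the large array to have been written next to the pickle as `fname + '.npy'`. A minimal sketch of the matching save (an assumption about the original code, not gensim's exact implementation, which routes this through SaveLoad.save):

    def save(self, fname):
        # Counterpart sketch to `load`: dump the large array separately so it can
        # be mmap'ed back, and keep it out of the pickled object itself.
        index, self.index = self.index, None
        try:
            numpy.save(fname + '.npy', index)
            utils.pickle(self, fname)
        finally:
            self.index = index  # restore the in-memory attribute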
def load_y(docid2path_path):
    docid2path = unpickle(docid2path_path)
    y = list()
    for path in docid2path.values():
        if 'ham' in path.split(os.sep):
            y.append(1)
        else:
            y.append(0)
    return np.asarray(y, dtype=int)
Example #14
 def __init__(self, fname):
     super(IdMmCorpus, self).__init__(fname)
     try:
         dockeys_fname = utils.smart_extension(fname, '.dockeys')
         self.dockeys = utils.unpickle(dockeys_fname)
         self.key_to_index = {k:n for (n, k) in enumerate(self.dockeys)}
         logger.info("loaded dockey index from %s" % dockeys_fname)
     except Exception:
         self.dockeys = None
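A hedged sketch of what the `key_to_index` map above enables: addressing documents by their stable dockey rather than by position (the corpus path is a placeholder, and the `.dockeys` pickle is assumed to sit next to the corpus file):

corpus = IdMmCorpus('corpus.mm')
if corpus.dockeys is not None:
    docno = corpus.key_to_index['doc-42']  # translate an external key into a position
    print(corpus[docno])                   # positional random access inherited from MmCorpus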
Example #15
 def __init__(self, fname):
     super(IdMmCorpus, self).__init__(fname)
     try:
         dockeys_fname = utils.smart_extension(fname, '.dockeys')
         self.dockeys = utils.unpickle(dockeys_fname)
         self.key_to_index = {k: n for (n, k) in enumerate(self.dockeys)}
         logger.info("loaded dockey index from %s" % dockeys_fname)
     except Exception:
         self.dockeys = None
def main():
    parser = argparse.ArgumentParser(
        description=
        'creates an id2author gensim dictionary mapping file which maps internal author ids to author names. pageids can be filtered against a whitelist',
        epilog=
        'Example: ./{} --history-dump=enwiki-pages-meta-history.xml.bz2 --id2author=enwiki-id2author.cpickle --whitelist=enwiki-pageids.cpickle.bz2'
        .format(sys.argv[0]))
    parser.add_argument(
        '--history-dump',
        type=argparse.FileType('r'),
        help=
        'path to input WikiMedia *-pages-meta-history file (.xml/.xml.bz2)',
        required=True)
    parser.add_argument(
        '--id2author',
        type=argparse.FileType('w'),
        help=
        'path to output binary id2author dictionary file (.cpickle/.cpickle.bz2)',
        required=True)
    parser.add_argument(
        '--pageids-whitelist',
        type=argparse.FileType('r'),
        help=
        'path to input pageids whitelist (binary pickled frozenset, .cpickle/.cpickle.bz2)'
    )

    args = parser.parse_args()
    input_history_dump_path = args.history_dump.name
    output_id2author_path = args.id2author.name
    input_pageids_whitelist_path = args.pageids_whitelist.name if args.pageids_whitelist else None

    program, logger = init_gensim_logger()
    logger.info('running {} with:\n{}'.format(
        program,
        pformat({
            'input_history_dump_path': input_history_dump_path,
            'output_id2author_path': output_id2author_path,
            'input_pageids_whitelist_path': input_pageids_whitelist_path
        })))

    dump = xml_dump.Iterator.from_file(smart_open(input_history_dump_path))
    if input_pageids_whitelist_path:
        whitelist = unpickle(input_pageids_whitelist_path)
        logger.info('loaded pageids whitelist of {} pages'.format(
            len(whitelist)))
        author_iter = ((revision.contributor.user_text for revision in page)
                       for page in dump if page.id in whitelist)
    else:
        logger.info('no pageids whitelist given')
        author_iter = ((revision.contributor.user_text for revision in page)
                       for page in dump)
    id2author = Dictionary(author_iter)
    id2author.save(output_id2author_path)
    logger.info('number of processed documents: {}'.format(
        id2author.num_docs))
    logger.info('number of found authors: {}'.format(len(id2author.token2id)))
    def __init__(self, model, topn, alpha, tagger, complex_freq, simple_freq,
                 freq_t, char_ngram):

        logger.info("Instatiating Simple Science Simplifier...")

        self.model = unpickle(model)
        logger.info("Loaded embeddings models from : `{}`".format(model))
        self.topn = topn
        self.alpha = alpha
        self.tagger = GeniaTagger(tagger)
        logger.info("Loaded Genia PoS tagger from : `{}`".format(tagger))
        self.complex_freq = unpickle(complex_freq)
        logger.info(
            "Loaded Complex Word Frequencies from : `{}`".format(complex_freq))
        self.simple_freq = unpickle(simple_freq)
        logger.info(
            "Loaded Simple Word Frequencies from : `{}`".format(simple_freq))
        self.freq_t = freq_t
        self.char_ngram = char_ngram
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.info("loading %s object from %s and %s" % (cls.__name__,
                                                       fname,
                                                       fname + ".index"))
     result = utils.unpickle(fname)
     result.similarity_index = MatrixSimilarity.load(fname + ".index")
     return result
Example #19
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s and %s.*.npy" % (cls.__name__, fname, fname))
     result = utils.unpickle(fname)
     data = numpy.load(fname + '.data.npy', mmap_mode='r') # load back as read-only
     indptr = numpy.load(fname + '.indptr.npy', mmap_mode='r')
     indices = numpy.load(fname + '.indices.npy', mmap_mode='r')
     result.index.data, result.index.indptr, result.index.indices = data, indptr, indices
     return result
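This loader assumes the index is a scipy CSR matrix whose three component arrays were dumped next to the pickle. A matching save could look roughly like the sketch below (an assumption about the original code; gensim's real save instead uses SaveLoad's separate-array machinery):

    def save(self, fname):
        # Sketch: write the CSR components where `load` expects them, then pickle
        # the object itself (nulling the arrays first would avoid duplicating them
        # inside the pickle; omitted here for brevity).
        numpy.save(fname + '.data.npy', self.index.data)
        numpy.save(fname + '.indptr.npy', self.index.indptr)
        numpy.save(fname + '.indices.npy', self.index.indices)
        utils.pickle(self, fname)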
Example #20
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     ufname = fname + '.npy'
     try:
         result.projection.u = numpy.load(ufname, mmap_mode='r') # load back as read-only
     except Exception:
         logger.debug("failed to load mmap'ed projection from %s" % ufname)
     return result
Example #21
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s and %s.*.npy" %
                  (cls.__name__, fname, fname))
     result = utils.unpickle(fname)
     data = numpy.load(fname + '.data.npy',
                       mmap_mode='r')  # load back as read-only
     indptr = numpy.load(fname + '.indptr.npy', mmap_mode='r')
     indices = numpy.load(fname + '.indices.npy', mmap_mode='r')
     result.index.data, result.index.indptr, result.index.indices = data, indptr, indices
     return result
Example #22
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     ufname = fname + '.npy'
     try:
         result.projection.u = numpy.load(ufname, mmap_mode='r') # load back as read-only
     except Exception:
         logger.debug("failed to load mmap'ed projection from %s" % ufname)
     result.dispatcher = None # TODO load back incl. distributed state? will require re-initialization of worker state
     return result
Example #23
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.debug("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     ufname = fname + '.npy'
     try:
         result.projection.u = numpy.load(
             ufname, mmap_mode='r')  # load back as read-only
     except Exception:
         logger.debug("failed to load mmap'ed projection from %s" % ufname)
     return result
Example #24
 def load(cls, fname):
     """
     Load a previously saved object from file (also see `save`).
     """
     logger.info("loading %s object from %s" % (cls.__name__, fname))
     result = utils.unpickle(fname)
     ufname = fname + '.npy'
     try:
         result.projection.u = numpy.load(
             ufname, mmap_mode='r')  # load back as read-only
     except Exception:
         logger.info("failed to load mmap'ed projection from %s" % ufname)
     result.dispatcher = None  # TODO load back incl. distributed state? will require re-initialization of worker state
     return result
Example #25
    def load(cls, fname, mmap=None):
        """
        Load a previously saved corpus index from file.
        """
        LOGGER.info("Loading %s object from %s", cls.__name__, fname)
        result = utils.unpickle(fname)
        LOGGER.info("Finished unpickling EsaModel")
        path = fname + '.npz'
        sc = np.load(path)
        result.sparse_corpus = sparse.coo_matrix(
            (sc['data'], (sc['row'], sc['col'])),
            shape=sc['shape'])
        LOGGER.info("Finished loading sparse corpus")

        return result
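A hedged sketch of the save side implied by this loader: the sparse corpus is stored in COO form under `fname + '.npz'` with exactly the keys read back above (this mirrors the load; the project's real save is not shown in the listing):

    def save(self, fname):
        # Sketch: dump the sparse corpus arrays, then pickle the rest of the model.
        coo = self.sparse_corpus.tocoo()
        np.savez(fname + '.npz', data=coo.data, row=coo.row, col=coo.col,
                 shape=np.asarray(coo.shape))
        sparse_corpus, self.sparse_corpus = self.sparse_corpus, None
        try:
            utils.pickle(self, fname)
        finally:
            self.sparse_corpus = sparse_corpus  # restore the in-memory attribute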
Example #26
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """
        Load the word vectors into a matrix from the varembed output vector files.
        Using morphemes requires Python 2.7 or above.

        'vectors' is the pickle file containing the word vectors.
        'morfessor_model' is the path to the trained morfessor model;
        morpheme embeddings are added to the output only when it is provided.
        """
        result = cls()
        if vectors is None:
            raise Exception(
                "Please provide vectors binary to load varembed model")
        D = utils.unpickle(vectors)
        word_to_ix = D['word_to_ix']
        morpho_to_ix = D['morpho_to_ix']
        word_embeddings = D['word_embeddings']
        morpho_embeddings = D['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if morfessor_model:
            if sys.version_info >= (2, 7):  # Morfessor is only supported for Python 2.7 and above.
                try:
                    import morfessor
                    morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
                    result.add_morphemes_to_embeddings(morfessor_model,
                                                       morpho_embeddings,
                                                       morpho_to_ix)
                except ImportError:
                    # Morfessor Package not found.
                    logger.error(
                        'Could not import morfessor. Not using morpheme embeddings'
                    )
                    raise ImportError('Could not import morfessor.')
            else:
                # Raise an exception on Python 2.6 or earlier.
                raise Exception(
                    'Using morphemes requires Python 2.7 or above. Morfessor is not supported in Python 2.6.'
                )

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """Load the word vectors into matrix from the varembed output vector files.

        Parameters
        ----------
        vectors : str
            Path to the pickle file containing the word vectors.
        morfessor_model : str, optional
            Path to the trained morfessor model.

        Returns
        -------
        :class:`~gensim.models.wrappers.varembed.VarEmbed`
            Ready to use instance.

        """
        result = cls()
        if vectors is None:
            raise Exception(
                "Please provide vectors binary to load varembed model")
        d = utils.unpickle(vectors)
        word_to_ix = d['word_to_ix']
        morpho_to_ix = d['morpho_to_ix']
        word_embeddings = d['word_embeddings']
        morpho_embeddings = d['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if morfessor_model:
            try:
                import morfessor
                morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
                result.add_morphemes_to_embeddings(morfessor_model,
                                                   morpho_embeddings,
                                                   morpho_to_ix)
            except ImportError:
                # Morfessor Package not found.
                logger.error(
                    'Could not import morfessor. Not using morpheme embeddings'
                )
                raise ImportError('Could not import morfessor.')

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result
Example #28
    def __init__(self, fname, index_fname=None):
        """

        Parameters
        ----------
        fname : str
            Path to corpus.
        index_fname : str, optional
            Path to the index; if not provided, `fname.index` is used.

        """
        try:
            if index_fname is None:
                index_fname = utils.smart_extension(fname, '.index')
            self.index = utils.unpickle(index_fname)
            # change self.index into a numpy.ndarray to support fancy indexing
            self.index = numpy.asarray(self.index)
            logger.info("loaded corpus index from %s", index_fname)
        except Exception:
            self.index = None
        self.length = None
Example #29
    def __init__(self, fname, index_fname=None):
        """

        Parameters
        ----------
        fname : str
            Path to corpus.
        index_fname : str, optional
            Path to the index; if not provided, `fname.index` is used.

        """
        try:
            if index_fname is None:
                index_fname = utils.smart_extension(fname, '.index')
            self.index = utils.unpickle(index_fname)
            # change self.index into a numpy.ndarray to support fancy indexing
            self.index = numpy.asarray(self.index)
            logger.info("loaded corpus index from %s", index_fname)
        except Exception:
            self.index = None
        self.length = None
Example #30
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #31
    def load_varembed_format(cls, vectors, morfessor_model=None):
        """Load the word vectors into matrix from the varembed output vector files.

        Parameters
        ----------
        vectors : str
            Path to the pickle file containing the word vectors.
        morfessor_model : str, optional
            Path to the trained morfessor model.

        Returns
        -------
        :class:`~gensim.models.wrappers.varembed.VarEmbed`
            Ready to use instance.

        """
        result = cls()
        if vectors is None:
            raise Exception("Please provide vectors binary to load varembed model")
        d = utils.unpickle(vectors)
        word_to_ix = d['word_to_ix']
        morpho_to_ix = d['morpho_to_ix']
        word_embeddings = d['word_embeddings']
        morpho_embeddings = d['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if morfessor_model:
            try:
                import morfessor
                morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
                result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
            except ImportError:
                # Morfessor Package not found.
                logger.error('Could not import morfessor. Not using morpheme embeddings')
                raise ImportError('Could not import morfessor.')

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result
Example #32
    logging.root.setLevel(level=logging.INFO)

    # check and process input arguments
    if len(sys.argv) < 2:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
    for doc_idx in range(0, len(similarity_index)):
        logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
        rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
        fwd_doc = similarity_index.vector_by_id(doc_idx)
    def __init__(self, model):

        self.model = unpickle(model)
Example #34
    logging.root.setLevel(level=logging.INFO)

    # check and process input arguments
    if len(sys.argv) < 3:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")

    logger.info("Processing input documents...")

    try:
        infile = open(input_file, 'r')
    # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
    logger.info("dictionary found, loading")
    with open(outf("pruned_vocab.csv")) as csvfile:
        reader = csv.reader(csvfile)
        word2id = dict((rows[0],rows[1]) for rows in reader)
        utils.pickle(word2id, outf('word2id'))
            
    id2word = gensim.utils.revdict(word2id)
    
    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())

    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
            logger.info("word2vec model found, loading")
            model = utils.unpickle(outf('w2v'))
        else:
            logger.info("word2vec model not found, creating")
            if NEGATIVE:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE)
            else:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS)
            model.build_vocab(corpus())
            model.train(corpus())  # train with 1 epoch
            model.init_sims(replace=True)
            model.word2id = dict((w, v.index) for w, v in model.vocab.iteritems())
            model.id2word = utils.revdict(model.word2id)
            model.word_vectors = model.syn0norm
            utils.pickle(model, outf('w2v'))

    logger.info("evaluating accuracy")
        sys.exit()

    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logger = logging.getLogger(program)
    logger.info("running %s" % " ".join(sys.argv))

    outf = lambda prefix: os.path.join(output_dir, prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = MyCorpus(corpus_path)

    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = corpora.Dictionary(sentences, prune_at=10000000)
        id2word.filter_extremes(
            keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.iteritems())
        utils.pickle(word2id, outf('word2id'))
    id2word = utils.revdict(word2id)

    # Filter all wiki documents to contain only those words.
    corpus = lambda: ([word for word in sentence if word in word2id]
                      for sentence in sentences)

    if os.path.exists(outf('kw2v_%s' % GAMMA)):
        logger.info("Kernel word2vec model found, loading")
Example #37
    if len(sys.argv) < 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    in_file = gensim.models.word2vec.LineSentence(sys.argv[1])
    # in_file = gensim.models.word2vec.Text8Corpus(sys.argv[1])
    q_file = sys.argv[2]
    outf = lambda prefix: os.path.join(sys.argv[3], prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

    # use only a small subset of all words; otherwise the methods based on matrix
    # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000)
        id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.iteritems())
        utils.pickle(word2id, outf('word2id'))
    id2word = gensim.utils.revdict(word2id)

    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())

    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
            logger.info("word2vec model found, loading")
            model = utils.unpickle(outf('w2v'))
 def __init__(self, filename):
     self.corpus = MmCorpus(filename)
     self.metadata = unpickle(filename + ".metadata.cpickle")
from gensim import utils
from gensim.corpora import Dictionary, MmCorpus
import time
import sqlite3 as sql
from wiki_to_esa_db.sql_statements import *

import pathlib as pl


del_articles_strings = ["list of", "liste der", "liste von"]

TF_IDF_THRESHOLD = 50
dp = pl.Path("data_de_snowball_stemmed")
tfidf_mat_path = dp / "corpus_tfidf.mm"

tfidf_corpus = MmCorpus(str(tfidf_mat_path))
id_to_titles = utils.unpickle(str(dp / "bow.mm.metadata.cpickle"))
titles_to_id = utils.unpickle(str(dp / "titles_to_id.pickle"))
dictionary = Dictionary.load_from_text(str(dp / "dictionary.txt.bz2"))

db_path = dp / "esa.db"
if db_path.exists():
    db_path.unlink()

conn = sql.connect(str(db_path))
with conn:
    cursor = conn.cursor()
    cursor.execute(term_table)
    cursor.execute(article_table)
    cursor.execute(term_article_table)
    cursor.execute(term_index)
    cursor.execute(term_article_index)
def createSearchObjs():
    """
    Creates the SimSearch and KeySearch objects using the data structures
    created in `make_wikicorpus.py`.
    Returns (simsearch, keysearch, titles_to_id)
    """
    
    # Load the article titles. These have the format (pageid, article title)
    fprint('Loading Wikipedia article titles...')
    t0 = time.time()
    
    id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')
    titles_to_id = utils.unpickle('./data/titles_to_id.pickle')

    # id_to_titles is actually a map of indices to (pageid, article title)
    # The 'pageid' property is unused.
    # Convert id_to_titles into a simple list of titles.
    titles = [item[1][1] for item in id_to_titles.items()]
    
    fprint('    Took %.2f seconds' % (time.time() - t0))        
    
    # Load the dictionary (830ms on my machine)
    fprint('\nLoading dictionary...')
    t0 = time.time()
    
    dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))    
    
    # Load tf-idf model (60ms on my machine).
    fprint('\nLoading tf-idf model...')
    t0 = time.time()
    
    tfidf_model = TfidfModel.load('./data/tfidf.tfidf_model')    
    
    fprint('    Took %.2f seconds' % (time.time() - t0))        
    
    # We must not use `load`--that would attempt to load the corpus into 
    # memory, and it's 16.7 GB!!
    #corpus_tfidf = MmCorpus.load('./data/corpus_tfidf.mm')
    
    fprint('\nCreating tf-idf corpus object (leaves the vectors on disk)...')
    t0 = time.time()
    
    corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))            
    
    # Create the KeySearch and SimSearch objects.    
    ksearch = KeySearch(dictionary, tfidf_model, corpus_tfidf, titles)
    simsearch = SimSearch(ksearch)
    
    # TODO - SimSearch doesn't currently have a clean way to provide the index
    # and model.
    
    fprint('\nLoading LSI model...')
    t0 = time.time()    
    simsearch.lsi = LsiModel.load('./data/lsi.lsi_model')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))        
    
    # Load the Wikipedia LSI vectors into memory.
    # The matrix is 4.69GB for me, and takes ~15 seconds on my machine to load.
    fprint('\nLoading Wikipedia LSI index...')
    t0 = time.time()
        
    simsearch.index = MatrixSimilarity.load('./data/lsi_index.mm')
    
    fprint('    Took %.2f seconds' % (time.time() - t0))    

    # TODO - It would be interesting to try the 'Similarity' class which 
    #       shards the dataset on disk for you...

    return (simsearch, ksearch, titles_to_id)
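A short usage sketch for the function above (the title used in the lookup is just an example article):

simsearch, ksearch, titles_to_id = createSearchObjs()

# Map a known article title to its internal index, e.g. as a starting point for similarity queries.
print(titles_to_id['Anarchism'])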
Example #41
        dictionary = gensim.corpora.dictionary.Dictionary.from_corpus(
            corpus, id2word=id2word)

        logger.info("calculating truncated SVD")
        lsi = gensim.models.LsiModel(corpus,
                                     id2word=dictionary,
                                     num_topics=DIM)
        self.singular_scaled = lsi.projection.s**s_exponent
        # embeddings = left singular vectors scaled by the (exponentiated) singular values
        self.word_vectors = lsi.projection.u * self.singular_scaled


if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)

    from svd import SvdModel

    word2id = utils.unpickle('./tmp/word2id')
    id2word = gensim.utils.revdict(word2id)

    logger.info("SVD model creating")
    corpus = gensim.corpora.MmCorpus('./tmp/pmi_matrix.mm')
    model = SvdModel(corpus, id2word, s_exponent=0.0)
    model.word2id = word2id
    model.id2word = id2word
    utils.pickle(model, './model/svd.model')

    logger.info("finished running svd")
Example #42
 def doc_with_meta(self, fname):
     self.doc_metadata = utils.unpickle(fname + '.metadata.cpickle')
Example #43
def recommend():
    """
    When a POST request with JSON data is made to this URI,
    read the example from the JSON, compute recommendations and
    send them back in the response.
    """
    # Get value for our example that came with the request
    data = request.json

    # prob_nmf = pickle.load(open('models/prob_nmf.pickle', 'rb'))
    # # prob_nmf = joblib.load('models/prob_nmf.pkl')
    # all_titles = pickle.load(open('models/all_titles.pkl', 'rb'))

    # f = 30
    # t = AnnoyIndex(f)  # Length of item vector that will be indexed
    # for i, row in enumerate(prob_nmf):
    #     v = row
    #     t.add_item(i, v)
    #
    # t.build(10) # 10 trees


    ###########
    title = data["example"].strip('\"')

    # clean_titles = [t[5:] for t in all_titles]
    #
    # title_id = clean_titles.index(title)
    # idx = t.get_nns_by_item(title_id, 1000)

    # tedx_list = []
    # for i in idx:
    #     if all_titles[i][:5] == 'TEDX_':
    #         tedx_list.append(all_titles[i][5:])
    #         if len(tedx_list) > 2:
    #             break

    w2vTITLE = utils.unpickle(modelpath+"w2vTitle_s410_minC40pcent_window7.model")
    # w2vTITLE = utils.unpickle(modelpath + "w2vTitle_s400_minC60pcent_window7.model")
    DF2 = pd.read_pickle(modelpath+'BBCgoodfood_TokensNLemms4word2vec.pkl')

    outlist = [[i, round(v * 1000) / 1000] for i, v in w2vTITLE.most_similar(positive=[title], topn=200)
               if i not in [n for m in DF2.ingredLems for n in m] and i not in ['BBC Children in Need cupcakes']
               and v > 0.76]
    outlist[:5]


    searchedTitle = [title]
    RECrecipes = outlist[:5] #['test rec 0','test rec 1','test rec 2']


    # blog_list = ["", ""]
    # count = 0
    # for i in idx:
    #     if all_titles[i][:5] == 'IDEA_':
    #         blog_list[count] = all_titles[i][5:]
    #         count += 1
    #         if count > 1:
    #             break

    # Put the result in a nice dict so we can send it as json
    # results = {"recommend_tedx": tedx_list,
    #            "recommend_blog": blog_list}
    results = {"searchedTitle": searchedTitle,
               "RECrecipes": RECrecipes}
    return jsonify(results)
        sys.stdout.flush()
        
        t0 = time.time()
    
        # Generate bag-of-words vectors (term-document frequency matrix) and 
        # write these directly to disk.
        # On my machine, this took 3.53 hrs. 
        # By setting metadata = True, this will also record all of the article
        # titles into a separate pickle file, 'bow.mm.metadata.cpickle'
        MmCorpus.serialize('./data/bow.mm', wiki, metadata=True, progress_cnt=10000)
        
        print('    Conversion to bag-of-words took %s' % formatTime(time.time() - t0))
        sys.stdout.flush()

        # Load the article titles back
        id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')
    
        # Create the reverse mapping, from article title to index.
        titles_to_id = {}

        # For each article...
        for at in id_to_titles.items():
            # `at` is (index, (pageid, article_title))  e.g., (0, ('12', 'Anarchism'))
            # at[1][1] is the article title.
            # The pageid property is unused.
            titles_to_id[at[1][1]] = at[0]
        
        # Store the resulting map.
        utils.pickle(titles_to_id, './data/titles_to_id.pickle')

        # We're done with the article titles so free up their memory.
Example #45
    if len(sys.argv) < 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    in_file = gensim.models.word2vec.LineSentence(sys.argv[1])
    # in_file = gensim.models.word2vec.Text8Corpus(sys.argv[1])
    q_file = sys.argv[2]
    outf = lambda prefix: os.path.join(sys.argv[3], prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

    # use only a small subset of all words; otherwise the methods based on matrix
    # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000)
        id2word.filter_extremes(
            keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.iteritems())
        utils.pickle(word2id, outf('word2id'))
    id2word = gensim.utils.revdict(word2id)

    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id]
                      for sentence in sentences())

    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
Example #46
import gensim
import os
from gensim import corpora
from gensim import utils

class DtmCorpus(corpora.textcorpus.TextCorpus):
    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)

corpus, time_seq = utils.unpickle('gensim/test/test_data/dtm_test')

dtm_home = os.environ.get('DTM_HOME', "C:/Users/Artyom/SkyDrive/TopicModels/dtm-master/")
dtm_path = os.path.join(dtm_home, 'bin', 'dtm') if dtm_home else None


model = gensim.models.DtmModel(dtm_path, corpus, time_seq, num_topics=2, id2word=corpus.dictionary)
topics = model.show_topics(topics=2, times=2, topn=10)