Example #1
	def handleWorker(self, queue, core_nr, dictionary, levThr):

		corpus_writer = matutils.MmWriter('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, core_nr))
		corpus_writer.write_headers(-1, -1, -1)
		nnz = 0
		nrDocs = 0
		
		poslast = 0
		offsets = []
		
		# dictionary lookup is very slow. Copy to own version.
		mydictionary = {}
		for key in dictionary.iterkeys():
			mydictionary[key] = (dictionary[key], len(dictionary[key]))
		dictionaryLen = len(dictionary)
		
		for item in iter(queue.get, 'STOP'):
			termIndex = item
			
			# Determine offsets for the index file, allowing O(1) access time of documents.
			posnow = corpus_writer.fout.tell()
			if posnow == poslast:
				offsets[-1] = -1
			offsets.append(posnow)
			poslast = posnow
			
			nnz += self.writeLevenshteinDistance(termIndex, mydictionary, dictionaryLen, levThr, corpus_writer)
			nrDocs += 1
		corpus_writer.fake_headers(nrDocs, dictionaryLen, nnz)
		corpus_writer.close()
		
		# Write index to file
		index_fname = corpus_writer.fname + '.index'
		utils.pickle(offsets, index_fname)
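The worker above writes one Matrix Market shard plus a pickled list of byte offsets. As a rough sketch of the reading side (the shard path below follows the '%s/%s/%s_%i.mm' layout used above and is purely illustrative), gensim's MmCorpus picks up the pickled .index file automatically and uses it for random access:

from gensim import corpora, utils

shard = 'data/save/termCorpus_0.mm'           # hypothetical path, built like in handleWorker
offsets = utils.unpickle(shard + '.index')    # the byte offsets pickled above
corpus = corpora.MmCorpus(shard)              # loads shard + '.index' if it exists
print(len(corpus), corpus[0])                 # O(1) access to any document via the offsets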
Example #2
def main():
    parser = argparse.ArgumentParser(
        description=
        'converts a given .metadata.cpickle file (such as generated by gensim MmCorpus.serialize(..., metadata=True)) to a pickled frozenset of contained pageids',
        epilog=
        'Example: ./{} --metadata=enwiki-metadata.cpickle.bz2 --pageids=enwiki-pageids.cpickle.bz2'
        .format(sys.argv[0]))
    parser.add_argument(
        '--metadata',
        type=argparse.FileType('r'),
        help='path to input binary metadata file (.cpickle/.cpickle.bz2)',
        required=True)
    parser.add_argument(
        '--pageids',
        type=argparse.FileType('w'),
        help=
        'path to output binary frozenset of pageids file (.cpickle/.cpickle.bz2)',
        required=True)

    args = parser.parse_args()
    input_metadata_path = args.metadata.name
    output_pageids_path = args.pageids.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_metadata_path': input_metadata_path,
            'output_pageids_path': output_pageids_path
        })))

    metadata = unpickle(input_metadata_path)
    logger.debug('unpickled {}'.format(metadata))
    pageids = frozenset(int(md[0]) for md in metadata.values())
    logger.info('extracted {} pageids'.format(len(pageids)))
    logger.debug('created set {}'.format(pageids))
    pickle(pageids, output_pageids_path)
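For context, the .metadata.cpickle file this script consumes is the side file gensim writes when a metadata-aware corpus is serialized with metadata=True. A hedged sketch of producing it (the dump path and output names are illustrative, and building a WikiCorpus is expensive):

from gensim.corpora import MmCorpus, WikiCorpus

wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')  # yields (bow, (pageid, title)) when metadata is requested
MmCorpus.serialize('enwiki.mm', wiki, metadata=True)
# writes enwiki.mm, enwiki.mm.index and enwiki.mm.metadata.cpickle, the input expected above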
Example #3
 def save(self, fname,
          separately=None, sep_limit=10485760, ignore=frozenset([])):
     """
     Save the corpus index for persistence.
     """
     self.document_titles.save()
     path = fname + '.npz'
     LOGGER.info("Storing %s object to %s and %s",
                 self.__class__.__name__, fname, path)
     # Remove the index from self.__dict__,
     # so it doesn't get pickled
     index = self.sparse_corpus
     del self.corpus
     del self.sparse_corpus
     try:
         utils.pickle(self, fname)
         LOGGER.info("Finished pickling EsaModel")
         np.savez(path, row=index.row, col=index.col,
                  data=index.data, shape=index.shape)
         LOGGER.info("Finished saving sparse corpus")
         # Not needed? Check if they are saved properly with the rest of the model
         # pickle.dump(self.document_titles, open(fname + '_doc_titles.pickle', 'w'))
         # LOGGER.info("Finished saving (reduced) doc titles")
     finally:
         self.corpus = index
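A possible load counterpart for the save above (a sketch only; the class's real loader may differ): unpickle the matrix-less object, then rebuild the sparse corpus from the .npz arrays written by np.savez.

import numpy as np
from scipy.sparse import coo_matrix
from gensim import utils

fname = 'esa.model'                            # hypothetical path
model = utils.unpickle(fname)                  # the pickled, matrix-less object
npz = np.load(fname + '.npz')
model.sparse_corpus = coo_matrix((npz['data'], (npz['row'], npz['col'])),
                                 shape=tuple(npz['shape']))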
Example #4
    def save_corpus(fname,
                    corpus,
                    id2word=None,
                    progress_cnt=1000,
                    metadata=False):
        """Store documents one on each line as plain text words."""
        logging.info("storing corpus in Line format to %s", fname)

        def word_id2word(word_id):
            try:
                return id2word[word_id]
            except KeyError:
                return ""

        with smart_open(fname, 'wb') as f:
            if metadata:
                docno2metadata = {}

            for docno, doc in enumerate(corpus):
                if metadata:
                    doc, data = doc
                    docno2metadata[docno] = data

                if docno % progress_cnt == 0:
                    logging.info("PROGRESS: saving document #%i", docno)

                fmt = ' '.join(map(word_id2word, doc))

                f.write(to_utf8("%s\n" % fmt))

            if metadata:
                utils.pickle(docno2metadata, fname + '.metadata.cpickle')
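A hedged usage sketch for the method above, assuming it lives on a corpus class called LineCorpus here (the actual class name is not shown) and that documents are lists of token ids:

from gensim import utils

id2word = {0: 'hello', 1: 'world'}
corpus = [([0, 1], 'doc-a'), ([1, 1, 0], 'doc-b')]     # (doc, metadata) pairs
LineCorpus.save_corpus('docs.txt', corpus, id2word=id2word, metadata=True)
meta = utils.unpickle('docs.txt.metadata.cpickle')     # -> {0: 'doc-a', 1: 'doc-b'}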
Example #5
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index if not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * `save_corpus` method that returns a sequence of byte offsets, one for
           each saved document,
        * the `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
        >>> print mm[42] # retrieve document no. 42, etc.
        """
        if index_fname is None:
            index_fname = fname + '.index'

        if progress_cnt is not None:
            offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt)
        else:
            offsets = serializer.save_corpus(fname, corpus, id2word)
        if offsets is None:
            raise NotImplementedError("called serialize on class %s which \
            doesn't support indexing!" % serializer.__name__)

        # store offsets persistently, using pickle
        logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
        utils.pickle(offsets, index_fname)
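The pickled offsets are what makes the random access in the docstring possible: on load, IndexedCorpus unpickles them and seeks straight to a document. A minimal sketch of that consumer side, reusing the 'test.mm' name from the example above:

from gensim import utils
from gensim.corpora import MmCorpus

offsets = utils.unpickle('test.mm.index')   # the list written by utils.pickle above
mm = MmCorpus('test.mm')
doc = mm.docbyoffset(offsets[42])           # what mm[42] does under the hood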
Example #6
    def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
        """
        Save the vector space representation of an entire corpus to disk.

        Note that the documents are processed one at a time, so the whole corpus
        is allowed to be larger than the available RAM.
        """
        mw = MmWriter(fname)

        # write empty headers to the file (with enough space to be overwritten later)
        mw.write_headers(-1, -1, -1) # will print 50 spaces followed by newline on the stats line

        # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
        _num_terms, num_nnz = 0, 0
        docno, poslast = -1, -1
        offsets = []
        if hasattr(corpus, 'metadata'):
            orig_metadata = corpus.metadata
            corpus.metadata = metadata
            if metadata:
                docno2metadata = {}
        else:
            metadata = False
        for docno, doc in enumerate(corpus):
            if metadata:
                bow, data = doc
                docno2metadata[docno] = data
            else:
                bow = doc
            if docno % progress_cnt == 0:
                logger.info("PROGRESS: saving document #%i" % docno)
            if index:
                posnow = mw.fout.tell()
                if posnow == poslast:
                    offsets[-1] = -1
                offsets.append(posnow)
                poslast = posnow
            max_id, veclen = mw.write_vector(docno, bow)
            _num_terms = max(_num_terms, 1 + max_id)
            num_nnz += veclen
        if metadata:
            utils.pickle(docno2metadata, fname + '.metadata.cpickle')
            corpus.metadata = orig_metadata

        num_docs = docno + 1
        num_terms = num_terms or _num_terms

        if num_docs * num_terms != 0:
            logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % (
                num_docs, num_terms,
                100.0 * num_nnz / (num_docs * num_terms),
                num_nnz,
                num_docs * num_terms))

        # now write proper headers, by seeking and overwriting the spaces written earlier
        mw.fake_headers(num_docs, num_terms, num_nnz)

        mw.close()
        if index:
            return offsets
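write_corpus is the writer half of the serialize methods shown earlier: with index=True it returns the byte offsets and leaves persisting them to the caller. A small sketch of driving it directly (file name and toy corpus are illustrative):

from gensim import utils
from gensim.matutils import MmWriter

bow_stream = [[(0, 1.0), (2, 2.0)], [(1, 1.0)]]          # toy corpus in BoW format
offsets = MmWriter.write_corpus('toy.mm', bow_stream, index=True)
utils.pickle(offsets, 'toy.mm.index')                    # the side file IndexedCorpus looks for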
Example #7
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index if not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * `save_corpus` method that returns a sequence of byte offsets, one for
           each saved document,
        * the `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
        >>> print mm[42] # retrieve document no. 42, etc.
        """
        if index_fname is None:
            index_fname = fname + '.index'

        if progress_cnt is not None:
            if labels is not None:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 labels=labels,
                                                 progress_cnt=progress_cnt)
            else:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 progress_cnt=progress_cnt)
        else:
            if labels is not None:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 labels=labels)
            else:
                offsets = serializer.save_corpus(fname, corpus, id2word)

        if offsets is None:
            raise NotImplementedError(
                "called serialize on class %s which doesn't support indexing!"
                % serializer.__name__)

        # store offsets persistently, using pickle
        logger.info("saving %s index to %s" %
                    (serializer.__name__, index_fname))
        utils.pickle(offsets, index_fname)
Example #8
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None,
                  metadata=False):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index if not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * `save_corpus` method that returns a sequence of byte offsets, one for
           each saved document,
        * the `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).
        * if `metadata` is set to True, `serialize` will also write out article titles to a pickle file.

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
        >>> print(mm[42]) # retrieve document no. 42, etc.
        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError(
                "identical input vs. output corpus filename, refusing to serialize: %s"
                % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        kwargs = {'metadata': metadata}
        if progress_cnt is not None:
            kwargs['progress_cnt'] = progress_cnt

        if labels is not None:
            kwargs['labels'] = labels

        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

        if offsets is None:
            raise NotImplementedError(
                "Called serialize on class %s which doesn't support indexing!"
                % serializer.__name__)

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s", serializer.__name__, index_fname)
        utils.pickle(offsets, index_fname)
Example #9
	def mergeIntoTermCorpus(self):
		corpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
		corpus_writer.write_headers(-1, -1, -1)
		
		num_nnz = 0
		poslast = 0
		offsets = []
		
		write_index = 0
		totalLen = 0
		corporaDict = MyDict()
		for i in range(self.numberOfProcesses):
			corpus = corpora.MmCorpus('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, i))
			#corporaList.append(corpus)
			# (current termId, current index in corpus, corpus)
			if len(corpus) > 0:
				termId = [id for (id, sim) in corpus[0] if sim == 1.0][0]
				corporaDict[i] = (termId, 0, corpus, len(corpus))
				totalLen += len(corpus)
			
		while 1:
			isDone = False
			for corpusId in corporaDict.keys():
				termId, index, corpus, len_corpus = corporaDict[corpusId] # Read all values for current corpus from MyDict.
				if termId == write_index: # We are writing to the merged corpus at index 'write_index'. Write it if it coincides with the column id of the current corpus.
				
					# Determine offsets for the index file, allowing O(1) access time of documents.
					posnow = corpus_writer.fout.tell()
					if posnow == poslast:
						offsets[-1] = -1
					offsets.append(posnow)
					poslast = posnow
				
					# Write current document
					max_id, veclen = corpus_writer.write_vector(write_index, corpus[index])
					num_nnz += veclen
					
					# Update values
					write_index += 1 #Update the write index of the merged corpus
					index += 1 #Update the index of the current corpus
					if index == len_corpus: #Reached the end of the current corpus. Set values to -1 so no more document will be grabbed from this corpus.
						corporaDict[corpusId] = (-1, -1, corpus, len_corpus) #Set index to -1. Corpus has been fully read.
					else:
						termId = [id for (id, sim) in corpus[index] if sim == 1.0][0] #Grab the next column id :: TODO -- CAN THIS BE DONE MORE EFFICIENTLY?
						corporaDict[corpusId] = (termId, index, corpus, len_corpus) #Update the MyDict with the new values of the current corpus
					
					if write_index == totalLen: # If all corpora have been fully read, exit the while loop.
						isDone = True
						
			if isDone:
				break
		corpus_writer.fake_headers(totalLen, totalLen, num_nnz)
		corpus_writer.close()
		
		# Write index to file
		index_fname = corpus_writer.fname + '.index'
		utils.pickle(offsets, index_fname)
Example #10
def main(data_dir, out_dir):

    docid2path = dict()

    # iterable of (doctext, docpath) tuple
    reader = TextsStreamReader(data_dir, as_lines=False)
    outfile = codecs.open(
        os.path.join(out_dir, 'processed_enron_docs_as_lines.txt'), 'w',
        'utf-8', 'ignore')

    docid = 0
    opts = dict(sents=False,
                lower=True,
                stem=False,
                min_token_len=3,
                min_sent_len=4,
                remove_stops=True,
                filters=[
                    'strip_multiple_whitespaces', 'strip_tags',
                    'strip_punctuation', 'split_alphanum', 'strip_numeric'
                ])

    for doctext, docpath in reader:
        doctext = preprocess_text(doctext, **opts)
        # generator to list
        doctext = list(doctext)
        if doctext:
            # when sents=False, each document is returned as single sentence (first element),
            # where every element is a list of tokens
            doctext = doctext[0]
            if doctext:
                docid2path[docid] = docpath
                outfile.write(" ".join(doctext) + '\n')
                docid += 1

    outfile.close()
    utils.pickle(docid2path, os.path.join(out_dir, 'docid2path.pkl'))

    # create another file to hold sentences (useful for word2vec)
    outfile = codecs.open(
        os.path.join(out_dir, 'processed_enron_sents_as_lines.txt'), 'w',
        'utf-8', 'ignore')
    opts['sents'] = True

    for doctext, _ in reader:
        docsents = preprocess_text(doctext, **opts)
        docsents = list(docsents)
        if docsents:
            for sent in docsents:
                if sent:
                    outfile.write(" ".join(sent) + '\n')

    outfile.close()
Example #11
 def save(self, fname):
     """
     See MatrixSimilarity.save()
     """
     logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, fname + '.npy'))
     # first, remove the index from self.__dict__, so it doesn't get pickled
     index = self.corpus
     del self.corpus
     try:
         utils.pickle(self, fname)  # store index-less object
         numpy.save(fname + '.npy', index)  # store index
     finally:
         self.corpus = index
Example #12
def train(word2id, id2word, corpus, win, dim):
    cooccur = glove.Corpus(dictionary=word2id)
    cooccur.fit(corpus(), window=win)

    logger.info("glove model creating")
    logger.info('Dict size: %s' % len(cooccur.dictionary))
    logger.info('Collocations: %s' % cooccur.matrix.nnz)
    model = glove.Glove(no_components=dim, learning_rate=0.05)
    model.fit(cooccur.matrix, epochs=10, no_threads=5, verbose=True)
    model.add_dictionary(cooccur.dictionary)
    model.word2id = dict(
        (utils.to_unicode(w), id) for w, id in model.dictionary.items())
    model.id2word = gensim.utils.revdict(model.word2id)
    utils.pickle(model, './model/glove.model')
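A possible follow-up, reloading the pickled model and querying it; this assumes glove-python's Glove.most_similar is available and that 'system' is a token present in the training dictionary:

from gensim import utils

model = utils.unpickle('./model/glove.model')
print(model.most_similar('system', number=5))   # nearest neighbours by the learned vectors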
Example #13
 def save(self, fname):
     """
     See MatrixSimilarity.save()
     """
     logger.info("storing %s object to %s and %s" % (self.__class__.__name__,
                                                     fname,
                                                     fname + '.index'))
     # first, remove the similarity index from self., so it doesn't get pickled
     sim = self.similarity_index
     del self.similarity_index
     try:
         sim.save(fname + ".index")
         utils.pickle(self, fname)  # store index-less object
     finally:
         self.similarity_index = sim
Example #14
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
                  progress_cnt=None, labels=None, metadata=False,
                  dockeys_fname=None):
        key_order = []

        def corpus_as_list():
            for k, c in corpus:
                key_order.append(k)
                yield c
        IndexedCorpus.serialize.im_func(
            serializer, fname, corpus_as_list(), id2word,
            index_fname, progress_cnt, labels, metadata)
        dockeys_fname = dockeys_fname or utils.smart_extension(
            fname, '.dockeys')
        utils.pickle(key_order, dockeys_fname)
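On the reading side, the pickled key order can map the original document keys back to row positions in the serialized corpus. A sketch under the same assumptions (fname is illustrative):

from gensim import utils
from gensim.corpora import MmCorpus

fname = 'corpus.mm'                                        # hypothetical, the fname passed to serialize
key_order = utils.unpickle(utils.smart_extension(fname, '.dockeys'))
key2row = {key: row for row, key in enumerate(key_order)}
mm = MmCorpus(fname)
doc = mm[key2row[key_order[0]]]                            # fetch a document by its original key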
Example #15
	def createLevenshteinCorpus(self):
		corpus = corpora.MmCorpus('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.originalCorpusFileName))
		termCorpus = corpora.MmCorpus('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
		
		levCorpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.levenshteinCorpusFileName))
		levCorpus_writer.write_headers(-1, -1, -1)
		num_nnz = 0
		poslast = 0
		offsets = []
		
		write_index = 0
		
		for doc in corpus:
			doc_ids = [id for (id, term) in doc]
			
			candidates = []
			levDoc = []
			
			for termId in doc_ids:
				# Create a list of candidates for this document
				for candidate in termCorpus[termId]:
					candidates.append(candidate)
					
			candidates = sorted(candidates, key=lambda tuple: tuple[0])
					
			it = itertools.groupby(candidates, key=lambda tuple: tuple[0]) #Group together similarity measures for the same term.
			for key, subiter in it:
				levDoc.append(max(subiter, key=lambda tuple: tuple[1])) #From the multiple similarity measures, grab the maximum.
			
			# Determine offsets for the index file, allowing O(1) access time of documents.
			posnow = levCorpus_writer.fout.tell()
			if posnow == poslast:
				offsets[-1] = -1
			offsets.append(posnow)
			poslast = posnow
		
			#Write the document to levCorpus in file
			max_id, veclen = levCorpus_writer.write_vector(write_index, levDoc)
			num_nnz += veclen
			
			write_index += 1
		
		levCorpus_writer.fake_headers(len(corpus), len(termCorpus), num_nnz)
		levCorpus_writer.close()
		
		# Write index to file
		index_fname = levCorpus_writer.fname + '.index'
		utils.pickle(offsets, index_fname)
Example #16
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index if not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * `save_corpus` method that returns a sequence of byte offsets, one for
           each saved document,
        * the `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
        >>> print(mm[42]) # retrieve document no. 42, etc.
        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        if progress_cnt is not None:
            if labels is not None:
                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
            else:
                offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
        else:
            if labels is not None:
                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
            else:
                offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)

        if offsets is None:
            raise NotImplementedError("called serialize on class %s which doesn't support indexing!" %
                serializer.__name__)

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
        utils.pickle(offsets, index_fname)
Example #17
 def save(self, fname):
     """
     Override the default `save` (which uses cPickle), because that's
     too inefficient and cPickle has bugs. Instead, single out the large index
     matrix and store that separately in binary format (that can be directly
     mmap'ed), under `fname.npy`. The rest of the object is pickled to `fname`.
     """
     logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, fname + '.npy'))
     # first, remove the index from self.__dict__, so it doesn't get pickled
     index = self.index
     del self.index
     try:
         utils.pickle(self, fname) # store index-less object
         numpy.save(fname + '.npy', index) # store index
     finally:
         self.index = index
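The matching load mirrors this split: unpickle the slim object, then memory-map the large array back onto it. A sketch of that counterpart (path is illustrative):

import numpy
from gensim import utils

fname = 'matrix.index'                                     # hypothetical path
result = utils.unpickle(fname)                             # the index-less object
result.index = numpy.load(fname + '.npy', mmap_mode='r')   # read-only mmap, no RAM copy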
Example #18
 def save(self, fname):
     """
     Override the default `save` (which uses cPickle), because that's
     too inefficient and cPickle has bugs. Instead, single out the large index
     matrix and store that separately in binary format (that can be directly
     mmap'ed), under `fname.npy`. The rest of the object is pickled to `fname`.
     """
     logger.info("storing %s object to %s and %s" %
                 (self.__class__.__name__, fname, fname + '.npy'))
     # first, remove the index from self.__dict__, so it doesn't get pickled
     index = self.index
     del self.index
     try:
         utils.pickle(self, fname)  # store index-less object
         numpy.save(fname + '.npy', index)  # store index
     finally:
         self.index = index
Example #19
 def save(self, fname):
     """
     Override the default `save` (which uses cPickle), because that's
     too inefficient and cPickle has bugs. Instead, single out the large internal
     arrays and store them separately in binary format (that can be directly
     mmap'ed), under `fname.array_name.npy`.
     """
     logger.info("storing %s object to %s and %s.npy" % (self.__class__.__name__, fname, fname))
     assert isinstance(self.index, scipy.sparse.csr_matrix)
     # first, remove the arrays from self.__dict__, so they don't get pickled
     data, indptr, indices = self.index.data, self.index.indptr, self.index.indices
     del self.index.data, self.index.indptr, self.index.indices
     try:
         utils.pickle(self, fname) # store array-less object
         # store arrays (.npy suffix is appended by numpy automatically)
         numpy.save(fname + '.data.npy', data)
         numpy.save(fname + '.indptr.npy', indptr)
         numpy.save(fname + '.indices.npy', indices)
     finally:
         self.index.data, self.index.indptr, self.index.indices = data, indptr, indices
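A matching load would unpickle the object and re-attach the three mmap'ed arrays to the CSR index; the matrix keeps its shape because only the arrays were stripped before pickling. A hedged sketch:

import numpy
from gensim import utils

fname = 'sparse.index'                                     # hypothetical path
result = utils.unpickle(fname)                             # array-less object, shape still intact
result.index.data = numpy.load(fname + '.data.npy', mmap_mode='r')
result.index.indptr = numpy.load(fname + '.indptr.npy', mmap_mode='r')
result.index.indices = numpy.load(fname + '.indices.npy', mmap_mode='r')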
Example #20
    def save(self, fname):
        """
        Override the default `save` (which uses cPickle), because that's
        too inefficient and cPickle has bugs. Instead, single out the large transformation
        matrix and store that separately in binary format (that can be directly
        mmap'ed back in `load()`), under `fname.npy`.
        """
        logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, fname + '.npy'))
        if self.projection.u is None:
            # model not initialized: there is no projection; just pickle the whole object and return
            utils.pickle(self, fname)
            return

        # first, remove the projection from self.__dict__, so it doesn't get pickled
        u = self.projection.u
        del self.projection.u
        try:
            utils.pickle(self, fname) # store projection-less object
            numpy.save(fname + '.npy', u) # store projection
        finally:
            self.projection.u = u
Example #21
 def save(self, fname):
     """
     Override the default `save` (which uses cPickle), because that's
     too inefficient and cPickle has bugs. Instead, single out the large internal
     arrays and store them separately in binary format (that can be directly
     mmap'ed), under `fname.array_name.npy`.
     """
     logger.info("storing %s object to %s and %s.npy" %
                 (self.__class__.__name__, fname, fname))
     assert isinstance(self.index, scipy.sparse.csr_matrix)
     # first, remove the arrays from self.__dict__, so they don't get pickled
     data, indptr, indices = self.index.data, self.index.indptr, self.index.indices
     del self.index.data, self.index.indptr, self.index.indices
     try:
         utils.pickle(self, fname)  # store array-less object
         # store arrays (.npy suffix is appended by numpy automatically)
         numpy.save(fname + '.data.npy', data)
         numpy.save(fname + '.indptr.npy', indptr)
         numpy.save(fname + '.indices.npy', indices)
     finally:
         self.index.data, self.index.indptr, self.index.indices = data, indptr, indices
Example #22
    def save(self, fname):
        """
        Override the default `save` (which uses cPickle), because that's
        too inefficient and cPickle has bugs. Instead, single out the large transformation
        matrix and store that separately in binary format (that can be directly
        mmap'ed back in `load()`), under `fname.npy`.
        """
        logger.info("storing %s object to %s and %s" %
                    (self.__class__.__name__, fname, fname + '.npy'))
        if self.projection.u is None:
            # model not initialized: there is no projection; just pickle the whole object and return
            utils.pickle(self, fname)
            return

        # first, remove the projection from self.__dict__, so it doesn't get pickled
        u = self.projection.u
        del self.projection.u
        try:
            utils.pickle(self, fname)  # store projection-less object
            numpy.save(fname + '.npy', ascarray(u))  # store projection
        finally:
            self.projection.u = u
Example #23
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None,
                  metadata=False,
                  dockeys_fname=None):
        key_order = []

        def corpus_as_list():
            for k, c in corpus:
                key_order.append(k)
                yield c

        IndexedCorpus.serialize.__func__(serializer, fname, corpus_as_list(),
                                         id2word, index_fname, progress_cnt,
                                         labels, metadata)
        dockeys_fname = dockeys_fname or utils.smart_extension(
            fname, '.dockeys')
        utils.pickle(key_order, dockeys_fname)
Example #24
if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)

    from ppmi import PpmiModel

    in_file = gensim.models.word2vec.LineSentence('./all_sentences.dat')

    ##
    sentences = lambda: itertools.islice(in_file, None)

    logger.info("dictionary creating")
    id2word = gensim.corpora.Dictionary(in_file, prune_at=UNIQUE_WORDS)
    id2word.filter_extremes(keep_n=TOKEN_LIMIT)

    word2id = dict((v, k) for k, v in id2word.items())
    utils.pickle(word2id, './tmp/word2id')

    id2word = gensim.utils.revdict(word2id)

    ## filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())


    logger.info("PMI matrix creating")

    logger.info("raw cooccurrence matrix creating")
    raw = get_cooccur(corpus(), word2id, window=WINDOW, dynamic_window=DYNAMIC_WINDOW)
    numpy.save('./tmp/cooccur.npy', raw)
    # store the SPPMI matrix in sparse Matrix Market format on disk
    gensim.corpora.MmCorpus.serialize('./tmp/pmi_matrix.mm', raw2ppmi(raw, word2id, k_shift=NEGATIVE or 1))
    del raw
Example #25
	program = os.path.basename(sys.argv[0])
	if len(sys.argv) < 3:
	    print(globals()['__doc__'] % locals())
	    sys.exit(1)
	in_file = gensim.models.word2vec.LineSentence(sys.argv[1])
	outf = lambda prefix: os.path.join(sys.argv[2], prefix)
	logger.info("output file template will be %s" % outf('PREFIX'))

	sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

	if os.path.exists(outf('word2id')):
	    logger.info("dictionary already exists")
	else:
	    logger.info("dictionary not found, creating")
	    id2word = gensim.corpora.Dictionary(sentences(), prune_at=PRUNE_AT)
	    
	    id2word.save_as_text(outf("full_vocab.txt"))

	    utils.pickle(id2word, outf('id2word'))

	    id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
	    word2id = dict((v, k) for k, v in id2word.iteritems())
	    
	    w = csv.writer(open(outf("TOKEN_LIMIT_vocab.txt"), "w"))
	    for key, val in word2id.items():
	        w.writerow([key.encode('utf-8').lower(), val])

	    utils.pickle(word2id, outf('word2id'))
	id2word = gensim.utils.revdict(word2id)

	logger.info("finished running %s" % program)
Example #26
    outf = lambda prefix: os.path.join(output_dir, prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = MyCorpus(corpus_path)

    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = corpora.Dictionary(sentences, prune_at=10000000)
        id2word.filter_extremes(
            keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.iteritems())
        utils.pickle(word2id, outf('word2id'))
    id2word = utils.revdict(word2id)

    # Filter all wiki documents to contain only those words.
    corpus = lambda: ([word for word in sentence if word in word2id]
                      for sentence in sentences)

    if os.path.exists(outf('kw2v_%s' % GAMMA)):
        logger.info("Kernel word2vec model found, loading")
        # model = utils.unpickle(outf('kw2v'))
        model = Word2Vec.load_word2vec_format(outf('kw2v_%s' % GAMMA),
                                              binary=True)
    else:
        logger.info("Kernel word2vec model not found, creating")
        if NEGATIVE:
            model = Word2Vec(size=DIM,
Example #27
        sys.exit(1)
    in_file = gensim.models.word2vec.LineSentence(sys.argv[1])
    # in_file = gensim.models.word2vec.Text8Corpus(sys.argv[1])
    q_file = sys.argv[2]
    outf = lambda prefix: os.path.join(sys.argv[3], prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

    # use only a small subset of all words; otherwise the methods based on matrix
    # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
    logger.info("dictionary found, loading")
    with open(outf("pruned_vocab.csv")) as csvfile:
        reader = csv.reader(csvfile)
        word2id = dict((rows[0],rows[1]) for rows in reader)
        utils.pickle(word2id, outf('word2id'))
            
    id2word = gensim.utils.revdict(word2id)
    
    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())

    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
            logger.info("word2vec model found, loading")
            model = utils.unpickle(outf('w2v'))
        else:
            logger.info("word2vec model not found, creating")
            if NEGATIVE:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE)
            else:
Example #28
        dictionary = gensim.corpora.dictionary.Dictionary.from_corpus(
            corpus, id2word=id2word)

        logger.info("calculating truncated SVD")
        lsi = gensim.models.LsiModel(corpus,
                                     id2word=dictionary,
                                     num_topics=DIM)
        self.singular_scaled = lsi.projection.s**s_exponent
        # embeddings = left singular vectors scaled by the (exponentiated) singular values
        self.word_vectors = lsi.projection.u * self.singular_scaled


if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)

    from svd import SvdModel

    word2id = utils.unpickle('./tmp/word2id')
    id2word = gensim.utils.revdict(word2id)

    logger.info("SVD model creating")
    corpus = gensim.corpora.MmCorpus('./tmp/pmi_matrix.mm')
    model = SvdModel(corpus, id2word, s_exponent=0.0)
    model.word2id = word2id
    model.id2word = id2word
    utils.pickle(model, './model/svd.model')

    logger.info("finished running svd")
Example #29
    outf = lambda prefix: os.path.join(sys.argv[3], prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

    # use only a small subset of all words; otherwise the methods based on matrix
    # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000)
        id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.iteritems())
        utils.pickle(word2id, outf('word2id'))
    id2word = gensim.utils.revdict(word2id)

    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())

    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
            logger.info("word2vec model found, loading")
            model = utils.unpickle(outf('w2v'))
        else:
            logger.info("word2vec model not found, creating")
            if NEGATIVE:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE)
            else:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS)
Example #30
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
                  progress_cnt=None, labels=None, metadata=False):
        """Serialize corpus with offset metadata, allows to use direct indexes after loading.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        id2word : dict of (str, str), optional
            Mapping id -> word.
        index_fname : str, optional
             Where to save resulting index, if None - store index to `fname`.index.
        progress_cnt : int, optional
            Number of documents after which progress info is printed.
        labels : bool, optional
             If True - ignore first column (class labels).
        metadata : bool, optional
            If True - ensure that serialize will write out article titles to a pickle file.

        Examples
        --------
        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
        >>> output_fname = get_tmpfile("test.mm")
        >>>
        >>> MmCorpus.serialize(output_fname, corpus)
        >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access
        >>> print(mm[1]) # retrieve document no. 1
        [(1, 0.1)]

        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        kwargs = {'metadata': metadata}
        if progress_cnt is not None:
            kwargs['progress_cnt'] = progress_cnt

        if labels is not None:
            kwargs['labels'] = labels

        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

        if offsets is None:
            raise NotImplementedError(
                "Called serialize on class %s which doesn't support indexing!" % serializer.__name__
            )

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s", serializer.__name__, index_fname)
        utils.pickle(offsets, index_fname)
Example #31
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None,
                  metadata=False):
        """Serialize corpus with offset metadata, allows to use direct indexes after loading.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        id2word : dict of (str, str), optional
            Mapping id -> word.
        index_fname : str, optional
             Where to save resulting index, if None - store index to `fname`.index.
        progress_cnt : int, optional
            Number of documents after which progress info is printed.
        labels : bool, optional
             If True - ignore first column (class labels).
        metadata : bool, optional
            If True - ensure that serialize will write out article titles to a pickle file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import MmCorpus
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
            >>> output_fname = get_tmpfile("test.mm")
            >>>
            >>> MmCorpus.serialize(output_fname, corpus)
            >>> mm = MmCorpus(output_fname)  # `mm` document stream now has random access
            >>> print(mm[1])  # retrieve document no. 1
            [(1, 0.1)]

        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError(
                "identical input vs. output corpus filename, refusing to serialize: %s"
                % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        kwargs = {'metadata': metadata}
        if progress_cnt is not None:
            kwargs['progress_cnt'] = progress_cnt

        if labels is not None:
            kwargs['labels'] = labels

        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

        if offsets is None:
            raise NotImplementedError(
                "Called serialize on class %s which doesn't support indexing!"
                % serializer.__name__)

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s", serializer.__name__, index_fname)
        utils.pickle(offsets, index_fname)
Example #32
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

    # use only a small subset of all words; otherwise the methods based on matrix
    # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000)
        id2word.filter_extremes(
            keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.iteritems())
        utils.pickle(word2id, outf('word2id'))
    id2word = gensim.utils.revdict(word2id)

    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id]
                      for sentence in sentences())

    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
            logger.info("word2vec model found, loading")
            model = utils.unpickle(outf('w2v'))
        else:
            logger.info("word2vec model not found, creating")
            if NEGATIVE:
                model = gensim.models.Word2Vec(size=DIM,
                                               min_count=0,
Example #33
        # Load the article titles back
        id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')
    
        # Create the reverse mapping, from article title to index.
        titles_to_id = {}

        # For each article...
        for at in id_to_titles.items():
            # `at` is (index, (pageid, article_title))  e.g., (0, ('12', 'Anarchism'))
            # at[1][1] is the article title.
            # The pageid property is unused.
            titles_to_id[at[1][1]] = at[0]
        
        # Store the resulting map.
        utils.pickle(titles_to_id, './data/titles_to_id.pickle')

        # We're done with the article titles so free up their memory.
        del id_to_titles
        del titles_to_id
    
    
        # To clean up some memory, we can delete our original dictionary and 
        # wiki objects, and load back the dictionary directly from the file.
        del dictionary
        del wiki  
        
        # Load the dictionary back from disk.
        # (0.86sec on my machine loading from an SSD)
        dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
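With the titles_to_id map built above, a document can later be fetched from the serialized corpus by article title instead of numeric index. A small sketch using the paths from the snippet above:

from gensim import corpora, utils

titles_to_id = utils.unpickle('./data/titles_to_id.pickle')
bow = corpora.MmCorpus('./data/bow.mm')
doc = bow[titles_to_id['Anarchism']]        # bag-of-words vector for that article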
    
Example #34
    def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
        """Save the corpus to disk in Matrix Market format.

        Parameters
        ----------
        fname : str
            Filename of the resulting file.
        corpus : iterable of list of (int, number)
            Corpus in Bow format.
        progress_cnt : int, optional
            Print progress for every `progress_cnt` number of documents.
        index : bool, optional
            If True, the offsets will be returned, otherwise None.
        num_terms : int, optional
            If provided, the `num_terms` attribute of the corpus will be ignored.
        metadata : bool, optional
            If True, a metadata file will be generated.

        Returns
        -------
        offsets : {list of int, None}
            List of offsets (if index=True) or nothing.

        Notes
        -----
        Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM.

        See Also
        --------
        :func:`~gensim.corpora.mmcorpus.MmCorpus.save_corpus`

        """
        mw = MmWriter(fname)

        # write empty headers to the file (with enough space to be overwritten later)
        mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line

        # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
        _num_terms, num_nnz = 0, 0
        docno, poslast = -1, -1
        offsets = []
        if hasattr(corpus, 'metadata'):
            orig_metadata = corpus.metadata
            corpus.metadata = metadata
            if metadata:
                docno2metadata = {}
        else:
            metadata = False
        for docno, doc in enumerate(corpus):
            if metadata:
                bow, data = doc
                docno2metadata[docno] = data
            else:
                bow = doc
            if docno % progress_cnt == 0:
                logger.info("PROGRESS: saving document #%i", docno)
            if index:
                posnow = mw.fout.tell()
                if posnow == poslast:
                    offsets[-1] = -1
                offsets.append(posnow)
                poslast = posnow
            max_id, veclen = mw.write_vector(docno, bow)
            _num_terms = max(_num_terms, 1 + max_id)
            num_nnz += veclen
        if metadata:
            utils.pickle(docno2metadata, fname + '.metadata.cpickle')
            corpus.metadata = orig_metadata

        num_docs = docno + 1
        num_terms = num_terms or _num_terms

        if num_docs * num_terms != 0:
            logger.info(
                "saved %ix%i matrix, density=%.3f%% (%i/%i)",
                num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms
            )

        # now write proper headers, by seeking and overwriting the spaces written earlier
        mw.fake_headers(num_docs, num_terms, num_nnz)

        mw.close()
        if index:
            return offsets
Example #35
    def save(self,
             fname,
             ignore=['state', 'dispatcher'],
             separately=None,
             *args,
             **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        `separately` can be used to define which arrays should be stored in separate files.

        `ignore` parameter can be used to define which variables should be ignored, i.e. left
        out from the pickled lda model. By default the internal `state` is ignored as it uses
        its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
        will be added to any ignore parameter defined.


        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        Note: If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

          1. The pickled Python dictionaries will not work across Python versions
          2. The `save` method does not automatically save all np arrays using np, only
             those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
             concern here is the `alpha` array if for instance using `alpha='auto'`.

        Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
        for an example on how to work around these issues.
        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args,
                            **kwargs)
        # Save the dictionary separately if not in 'ignore'.
        if 'id2word' not in ignore:
            utils.pickle(self.id2word,
                         utils.smart_extension(fname, '.id2word'))

        # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
        # someone sets the ignore list themselves
        if ignore is not None and ignore:
            if isinstance(ignore, six.string_types):
                ignore = [ignore]
            ignore = [e for e in ignore
                      if e]  # make sure None and '' are not in the list
            ignore = list(
                set(['state', 'dispatcher', 'id2word']) | set(ignore))
        else:
            ignore = ['state', 'dispatcher', 'id2word']

        # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
        # someone sets the separately list themselves.
        separately_explicit = ['expElogbeta', 'sstats']
        # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
        # array manually.
        if (isinstance(self.alpha, six.string_types)
                and self.alpha == 'auto') or len(self.alpha.shape) != 1:
            separately_explicit.append('alpha')
        if (isinstance(self.eta, six.string_types)
                and self.eta == 'auto') or len(self.eta.shape) != 1:
            separately_explicit.append('eta')
        # Merge separately_explicit with separately.
        if separately:
            if isinstance(separately, six.string_types):
                separately = [separately]
            separately = [e for e in separately
                          if e]  # make sure None and '' are not in the list
            separately = list(set(separately_explicit) | set(separately))
        else:
            separately = separately_explicit
        super(LdaModel, self).save(fname,
                                   ignore=ignore,
                                   separately=separately,
                                   *args,
                                   **kwargs)
Example #36
    def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        `separately` can be used to define which arrays should be stored in separate files.

        `ignore` parameter can be used to define which variables should be ignored, i.e. left
        out from the pickled lda model. By default the internal `state` is ignored as it uses
        its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
        will be added to any ignore parameter defined.


        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        Note: If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

          1. The pickled Python dictionaries will not work across Python versions
          2. The `save` method does not automatically save all np arrays using np, only
             those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
             concern here is the `alpha` array if for instance using `alpha='auto'`.

        Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
        for an example on how to work around these issues.
        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
        # Save the dictionary separately if not in 'ignore'.
        if 'id2word' not in ignore:
            utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

        # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
        # someone sets the ignore list themselves
        if ignore is not None and ignore:
            if isinstance(ignore, six.string_types):
                ignore = [ignore]
            ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
            ignore = list(set(['state', 'dispatcher', 'id2word']) | set(ignore))
        else:
            ignore = ['state', 'dispatcher', 'id2word']

        # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
        # someone sets the separately list themselves.
        separately_explicit = ['expElogbeta', 'sstats']
        # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
        # array manually.
        if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
            separately_explicit.append('alpha')
        if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
            separately_explicit.append('eta')
        # Merge separately_explicit with separately.
        if separately:
            if isinstance(separately, six.string_types):
                separately = [separately]
            separately = [e for e in separately if e]  # make sure None and '' are not in the list
            separately = list(set(separately_explicit) | set(separately))
        else:
            separately = separately_explicit
        super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)
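A typical round trip with this save override, using gensim's public API on a toy corpus (file name is illustrative); the large, separately stored arrays come back memory-mapped when mmap='r' is passed to load:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['human', 'computer', 'interface'], ['graph', 'trees', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus, id2word=dictionary, num_topics=2)

lda.save('lda.model')                        # also writes lda.model.state and lda.model.id2word
lda2 = LdaModel.load('lda.model', mmap='r')  # separately stored arrays are mmap'ed back in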