def handleWorker(self, queue, core_nr, dictionary, levThr):
    """Consume term indices from `queue` and write them to this worker's corpus file.

    Items are read from `queue` until the sentinel string 'STOP' arrives. Every
    item is handed to `self.writeLevenshteinDistance` together with the open
    writer; the value it returns is accumulated into the non-zero count that
    ends up in the file header. The writer position recorded before each item
    is pickled next to the corpus as `<corpus file>.index`.

    `core_nr` becomes the numeric suffix of the output file name and `levThr`
    is forwarded unchanged to `self.writeLevenshteinDistance`.
    """
    out_path = '%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, core_nr)
    corpus_writer = matutils.MmWriter(out_path)
    corpus_writer.write_headers(-1, -1, -1)

    # Dictionary lookup is very slow, so work on a plain dict holding
    # (entry, len(entry)) pairs instead.
    mydictionary = dict(
        (key, (dictionary[key], len(dictionary[key])))
        for key in dictionary.iterkeys())
    dictionaryLen = len(dictionary)

    offsets = []
    previous_pos = 0
    nnz = 0
    nrDocs = 0
    for termIndex in iter(queue.get, 'STOP'):
        # Record where this document starts so the index file allows O(1)
        # access. An unchanged position means the previous item wrote
        # nothing, so its offset is flagged with -1.
        current_pos = corpus_writer.fout.tell()
        if current_pos == previous_pos:
            offsets[-1] = -1
        offsets.append(current_pos)
        previous_pos = current_pos

        nnz += self.writeLevenshteinDistance(
            termIndex, mydictionary, dictionaryLen, levThr, corpus_writer)
        nrDocs += 1

    corpus_writer.fake_headers(nrDocs, dictionaryLen, nnz)
    corpus_writer.close()

    # Persist the offsets next to the corpus file.
    utils.pickle(offsets, corpus_writer.fname + '.index')
def main():
    """Extract the page ids from a pickled metadata mapping and pickle them as a frozenset.

    Command-line arguments:
      --metadata  path to the input metadata pickle (must be readable)
      --pageids   path to the output pickle (created/truncated at parse time)

    The first element of every value in the unpickled metadata mapping is
    converted to `int`; the resulting frozenset is pickled to `--pageids`.
    """
    parser = argparse.ArgumentParser(
        description='convertes a given .metadata.cpickle file (such as generated by gensim MmCorpus.serialize(..., metadata=True) to a pickled frozenset of contained pageids',
        epilog='Example: ./{} --metadata=enwiki-metadata.cpickle.bz2 --pageids=enwiki-pageids.cpickle.bz2'
        .format(sys.argv[0]))
    parser.add_argument(
        '--metadata',
        type=argparse.FileType('r'),
        help='path to input binary metadata file (.cpickle/.cpickle.bz2)',
        required=True)
    parser.add_argument(
        '--pageids',
        type=argparse.FileType('w'),
        help='path to output binary frozenset of pageids file (.cpickle/.cpickle.bz2)',
        required=True)

    args = parser.parse_args()
    input_metadata_path = args.metadata.name
    output_pageids_path = args.pageids.name

    # argparse.FileType has already opened both files, but only their names are
    # used below. Close the handles so they do not leak and so the output path
    # is not held open while it is rewritten. "-" maps to the standard streams,
    # which must stay open.
    for handle in (args.metadata, args.pageids):
        if handle not in (sys.stdin, sys.stdout):
            handle.close()

    logger.info('running with:\n{}'.format(
        pformat({
            'input_metadata_path': input_metadata_path,
            'output_pageids_path': output_pageids_path
        })))

    metadata = unpickle(input_metadata_path)
    logger.debug('unpickled {}'.format(metadata))

    pageids = frozenset(int(md[0]) for md in metadata.values())
    logger.info('extracted {} pageids'.format(len(pageids)))
    logger.debug('created set {}'.format(pageids))

    pickle(pageids, output_pageids_path)
def save(self, fname, separately=None, sep_limit=10485760, ignore=frozenset([])):
    """Persist the model to `fname` and its sparse corpus to `fname + '.npz'`.

    `self.document_titles.save()` is called first. `self.corpus` and
    `self.sparse_corpus` are then detached so they are not pickled with the
    object; the sparse corpus is written separately with `np.savez` (its
    `row`, `col`, `data` and `shape` members). Both attributes are put back
    afterwards, even if saving fails.

    `separately`, `sep_limit` and `ignore` are accepted but not used here.
    """
    self.document_titles.save()
    path = fname + '.npz'
    LOGGER.info("Storing %s object to %s and %s", self.__class__.__name__, fname, path)

    # Keep references to BOTH attributes before removing them. The previous
    # code restored only `self.corpus` (and set it to the sparse corpus), so
    # `self.sparse_corpus` was gone after the first save.
    corpus = self.corpus
    index = self.sparse_corpus
    del self.corpus
    del self.sparse_corpus
    try:
        utils.pickle(self, fname)
        LOGGER.info("Finished pickling EsaModel")
        np.savez(path, row=index.row, col=index.col, data=index.data, shape=index.shape)
        LOGGER.info("Finished saving sparse corpus")
        # TODO: check that the document titles are stored properly with the
        # rest of the model.
    finally:
        self.corpus = corpus
        self.sparse_corpus = index
def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
    """Write `corpus` to `fname` as plain text, one document per line.

    Every element of a document is looked up in `id2word`; ids missing from
    the mapping (`KeyError`) become empty strings. The words of a document are
    joined with single spaces.

    When `metadata` is true, each corpus item is unpacked as
    `(document, data)` and the collected `{docno: data}` mapping is pickled to
    `fname + '.metadata.cpickle'`. Progress is logged every `progress_cnt`
    documents.
    """
    logging.info("storing corpus in Line format to %s", fname)

    def lookup(word_id):
        # Unknown ids are written as empty words rather than aborting.
        try:
            return id2word[word_id]
        except KeyError:
            return ""

    if metadata:
        docno2metadata = {}

    with smart_open(fname, 'wb') as out:
        for docno, item in enumerate(corpus):
            if metadata:
                item, extra = item
                docno2metadata[docno] = extra

            if docno % progress_cnt == 0:
                logging.info("PROGRESS: saving document #%i", docno)

            words = [lookup(word_id) for word_id in item]
            out.write(to_utf8("%s\n" % ' '.join(words)))

    if metadata:
        utils.pickle(docno2metadata, fname + '.metadata.cpickle')
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None):
    """
    Iterate through the document stream `corpus`, saving the documents to `fname`
    and recording the byte offset of each document. Save the resulting index
    structure to file `index_fname` (or `fname`.index if it is not set).

    This relies on the underlying corpus class `serializer` providing (in
    addition to standard iteration):

    * a `save_corpus` method that returns a sequence of byte offsets, one for
      each saved document,
    * the `docbyoffset(offset)` method, which returns a document
      positioned at `offset` bytes within the persistent storage (file).

    `progress_cnt` is forwarded to `save_corpus` only when it is not None.
    Raises `NotImplementedError` when `save_corpus` returns None.

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
    >>> print(mm[42]) # retrieve document no. 42, etc.
    """
    if index_fname is None:
        index_fname = fname + '.index'

    # Only pass the optional argument along when the caller supplied it, so the
    # serializer's own default stays in effect otherwise.
    kwargs = {}
    if progress_cnt is not None:
        kwargs['progress_cnt'] = progress_cnt
    offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

    if offsets is None:
        # The message used to be split with a backslash inside the literal,
        # which leaked the continuation whitespace into the text.
        raise NotImplementedError(
            "called serialize on class %s which doesn't support indexing!" %
            serializer.__name__)

    # store offsets persistently, using pickle
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
    """
    Save the vector space representation of an entire corpus to disk.

    Note that the documents are processed one at a time, so the whole corpus
    is allowed to be larger than the available RAM.

    Placeholder headers are written first and overwritten with the real
    document/term/non-zero counts once all vectors have been written.

    * `progress_cnt`: log progress every this many documents.
    * `index`: when true, collect the writer position before each document and
      return the list of offsets; otherwise nothing is returned.
    * `num_terms`: overrides the number of terms derived from the largest id
      seen, when truthy.
    * `metadata`: when true and `corpus` has a `metadata` attribute, every
      corpus item is unpacked as `(bow, data)` and the collected mapping is
      pickled to `fname + '.metadata.cpickle'`. Ignored for corpora without
      that attribute.
    """
    mw = MmWriter(fname)
    try:
        # write empty headers to the file (with enough space to be overwritten later)
        mw.write_headers(-1, -1, -1)

        # calculate necessary header info (nnz elements, num terms, num docs)
        # while writing out vectors
        _num_terms, num_nnz = 0, 0
        docno, poslast = -1, -1
        offsets = []
        docno2metadata = {}

        has_metadata_flag = hasattr(corpus, 'metadata')
        if has_metadata_flag:
            orig_metadata = corpus.metadata
            corpus.metadata = metadata
        else:
            metadata = False

        try:
            for docno, doc in enumerate(corpus):
                if metadata:
                    bow, data = doc
                    docno2metadata[docno] = data
                else:
                    bow = doc

                if docno % progress_cnt == 0:
                    logger.info("PROGRESS: saving document #%i", docno)

                if index:
                    posnow = mw.fout.tell()
                    if posnow == poslast:
                        # the previous document wrote nothing: flag its offset
                        offsets[-1] = -1
                    offsets.append(posnow)
                    poslast = posnow

                max_id, veclen = mw.write_vector(docno, bow)
                _num_terms = max(_num_terms, 1 + max_id)
                num_nnz += veclen
        finally:
            # Always give the caller's corpus its own flag back. Previously
            # this happened only when `metadata` was true, so passing
            # metadata=False permanently switched the corpus flag off.
            if has_metadata_flag:
                corpus.metadata = orig_metadata

        if metadata:
            utils.pickle(docno2metadata, fname + '.metadata.cpickle')

        num_docs = docno + 1
        num_terms = num_terms or _num_terms

        if num_docs * num_terms != 0:
            logger.info(
                "saved %ix%i matrix, density=%.3f%% (%i/%i)",
                num_docs, num_terms,
                100.0 * num_nnz / (num_docs * num_terms),
                num_nnz, num_docs * num_terms)

        # now write proper headers, by seeking and overwriting the spaces written earlier
        mw.fake_headers(num_docs, num_terms, num_nnz)
    finally:
        # release the output file even when writing fails half-way
        mw.close()

    if index:
        return offsets
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None):
    """
    Write `corpus` to `fname` through `serializer.save_corpus` and pickle the
    byte offsets it returns to `index_fname` (default: `fname` + '.index').

    The corpus class `serializer` is expected to provide, besides iteration:

    * `save_corpus`, returning one byte offset per saved document,
    * `docbyoffset(offset)`, returning the document stored at `offset` bytes
      within the file.

    `progress_cnt` and `labels` are handed to `save_corpus` only when they are
    not None. `NotImplementedError` is raised if `save_corpus` returns None.

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
    >>> print(mm[42]) # retrieve document no. 42, etc.
    """
    target_index = fname + '.index' if index_fname is None else index_fname

    # Collect just the optional arguments the caller actually provided.
    optional = {}
    if labels is not None:
        optional['labels'] = labels
    if progress_cnt is not None:
        optional['progress_cnt'] = progress_cnt

    offsets = serializer.save_corpus(fname, corpus, id2word, **optional)
    if offsets is None:
        raise NotImplementedError(
            "called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # store offsets persistently, using pickle
    logger.info("saving %s index to %s" % (serializer.__name__, target_index))
    utils.pickle(offsets, target_index)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
    """
    Write `corpus` to `fname` through `serializer.save_corpus` and pickle the
    byte offsets it returns to `index_fname` (derived from `fname` with an
    '.index' extension when not given).

    The corpus class `serializer` is expected to provide, besides iteration:

    * `save_corpus`, returning one byte offset per saved document,
    * `docbyoffset(offset)`, returning the document stored at `offset` bytes
      within the file.

    `metadata` is always forwarded to `save_corpus`; `progress_cnt` and
    `labels` only when they are not None.

    Raises `ValueError` when `corpus.fname` equals `fname`, and
    `NotImplementedError` when `save_corpus` returns None.

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
    >>> print(mm[42]) # retrieve document no. 42, etc.
    """
    source_fname = getattr(corpus, 'fname', None)
    if source_fname == fname:
        raise ValueError(
            "identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    save_args = {'metadata': metadata}
    for arg_name, arg_value in (('progress_cnt', progress_cnt), ('labels', labels)):
        if arg_value is not None:
            save_args[arg_name] = arg_value

    offsets = serializer.save_corpus(fname, corpus, id2word, **save_args)
    if offsets is None:
        raise NotImplementedError(
            "Called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # Store the offsets persistently, using pickle. What gets stored is exactly
    # what the serializer returned; loading is responsible for converting it
    # back into whatever in-memory form it needs.
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
def mergeIntoTermCorpus(self):
    """Merge the per-process term corpora into a single corpus file plus offset index.

    One `<termCorpusFileName>_<i>.mm` file is opened for each of
    `self.numberOfProcesses` processes. Documents are copied to the merged
    `<termCorpusFileName>.mm` in increasing id order: a document is written
    when the id of its entry with value 1.0 equals the next position to be
    written. The writer positions are pickled to `<merged file>.index`.

    Raises `ValueError` if no input corpus provides the next expected id;
    before this check the loop would spin forever in that situation (and
    possibly when every input corpus was empty).

    NOTE(review): assumes every document contains an entry whose value is
    exactly 1.0 and that these ids cover 0..total-1 without gaps - confirm
    against the code that writes the per-process corpora.
    """
    def column_id(document):
        # Id of the first entry whose value is exactly 1.0.
        return [term_id for (term_id, sim) in document if sim == 1.0][0]

    corpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
    corpus_writer.write_headers(-1, -1, -1)

    num_nnz = 0
    poslast = 0
    offsets = []
    write_index = 0
    totalLen = 0

    # corpusId -> (id of its current document, position in corpus, corpus, length)
    corporaDict = MyDict()
    for i in range(self.numberOfProcesses):
        corpus = corpora.MmCorpus('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, i))
        if len(corpus) > 0:
            corporaDict[i] = (column_id(corpus[0]), 0, corpus, len(corpus))
            totalLen += len(corpus)

    while write_index < totalLen:
        progressed = False
        for corpusId in list(corporaDict.keys()):
            termId, index, corpus, len_corpus = corporaDict[corpusId]
            if termId != write_index:
                continue

            # Determine offsets for the index file, allowing O(1) access time
            # of documents.
            posnow = corpus_writer.fout.tell()
            if posnow == poslast:
                offsets[-1] = -1
            offsets.append(posnow)
            poslast = posnow

            # Write the current document at the merged position.
            max_id, veclen = corpus_writer.write_vector(write_index, corpus[index])
            num_nnz += veclen

            write_index += 1  # next position in the merged corpus
            index += 1        # next document of this input corpus
            if index == len_corpus:
                # Input corpus exhausted: -1 never matches a write position.
                corporaDict[corpusId] = (-1, -1, corpus, len_corpus)
            else:
                corporaDict[corpusId] = (column_id(corpus[index]), index, corpus, len_corpus)
            progressed = True

        if not progressed:
            corpus_writer.close()
            raise ValueError(
                "no input corpus provides term id %i; cannot finish merging" % write_index)

    corpus_writer.fake_headers(totalLen, totalLen, num_nnz)
    corpus_writer.close()

    # Write index to file
    index_fname = corpus_writer.fname + '.index'
    utils.pickle(offsets, index_fname)
def main(data_dir, out_dir):
    """Preprocess the texts under `data_dir` and write them as line files into `out_dir`.

    Two passes are made over the reader:

    1. Every non-empty preprocessed document is written as one line to
       `processed_enron_docs_as_lines.txt`; the mapping from line number
       (docid) to source path is pickled to `docid2path.pkl`.
    2. With sentence splitting enabled, every non-empty sentence is written as
       one line to `processed_enron_sents_as_lines.txt` (useful for word2vec).

    Both output files are opened through context managers so they are closed
    even if preprocessing raises.
    """
    docid2path = dict()
    # iterable of (doctext, docpath) tuples
    reader = TextsStreamReader(data_dir, as_lines=False)
    opts = dict(sents=False,
                lower=True,
                stem=False,
                min_token_len=3,
                min_sent_len=4,
                remove_stops=True,
                filters=[
                    'strip_multiple_whitespaces', 'strip_tags',
                    'strip_punctuation', 'split_alphanum', 'strip_numeric'
                ])

    docs_path = os.path.join(out_dir, 'processed_enron_docs_as_lines.txt')
    docid = 0
    with codecs.open(docs_path, 'w', 'utf-8', 'ignore') as outfile:
        for doctext, docpath in reader:
            # materialise the generator returned by the preprocessor
            doctext = list(preprocess_text(doctext, **opts))
            if not doctext:
                continue
            # when sents=False, each document is returned as a single sentence
            # (first element), which is itself a list of tokens
            doctext = doctext[0]
            if doctext:
                docid2path[docid] = docpath
                outfile.write(" ".join(doctext) + '\n')
                docid += 1

    utils.pickle(docid2path, os.path.join(out_dir, 'docid2path.pkl'))

    # create another file to hold sentences (useful for word2vec)
    sents_path = os.path.join(out_dir, 'processed_enron_sents_as_lines.txt')
    opts['sents'] = True
    with codecs.open(sents_path, 'w', 'utf-8', 'ignore') as outfile:
        for doctext, _ in reader:
            for sent in preprocess_text(doctext, **opts):
                if sent:
                    outfile.write(" ".join(sent) + '\n')
def save(self, fname):
    """
    Pickle this object to `fname` and store `self.corpus` separately as
    `fname + '.npy'`. See MatrixSimilarity.save().
    """
    npy_fname = fname + '.npy'
    logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, npy_fname))

    # Detach the index for the duration of the pickling so that it is not
    # serialized together with the rest of the object.
    detached = self.corpus
    del self.corpus
    try:
        utils.pickle(self, fname)       # object without its index
        numpy.save(npy_fname, detached)  # the index on its own
    finally:
        # re-attach the index whether or not saving succeeded
        self.corpus = detached
def train(word2id, id2word, corpus, win, dim):
    """Fit a GloVe model on `corpus` and pickle it to './model/glove.model'.

    `word2id` seeds the co-occurrence dictionary, `corpus` is called to obtain
    the sentence stream, `win` is the co-occurrence window and `dim` the
    number of embedding components. `id2word` is accepted but not used; the
    model's `word2id`/`id2word` attributes are rebuilt from its own dictionary.
    """
    cooccurrence = glove.Corpus(dictionary=word2id)
    cooccurrence.fit(corpus(), window=win)

    logger.info("glove model creating")
    logger.info('Dict size: %s' % len(cooccurrence.dictionary))
    logger.info('Collocations: %s' % cooccurrence.matrix.nnz)

    model = glove.Glove(no_components=dim, learning_rate=0.05)
    model.fit(cooccurrence.matrix, epochs=10, no_threads=5, verbose=True)
    model.add_dictionary(cooccurrence.dictionary)

    # Expose unicode-keyed lookup tables in both directions on the model.
    unicode_word2id = {}
    for word, word_index in model.dictionary.items():
        unicode_word2id[utils.to_unicode(word)] = word_index
    model.word2id = unicode_word2id
    model.id2word = gensim.utils.revdict(model.word2id)

    utils.pickle(model, './model/glove.model')
def save(self, fname):
    """
    Save the similarity index to `fname + '.index'` through its own `save`
    method and pickle the rest of this object to `fname`.
    See MatrixSimilarity.save().
    """
    index_fname = fname + ".index"
    logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, fname + '.index'))

    # Detach the similarity index so that it does not get pickled with self.
    detached = self.similarity_index
    del self.similarity_index
    try:
        detached.save(index_fname)
        utils.pickle(self, fname)  # object without its index
    finally:
        # re-attach the index whether or not saving succeeded
        self.similarity_index = detached
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, dockeys_fname=None):
    """Serialize a keyed corpus and store the document keys in iteration order.

    `corpus` yields `(key, document)` pairs. The documents are passed on to
    `IndexedCorpus.serialize` with `serializer` as the corpus class; the keys
    are collected in the order they are consumed and pickled to
    `dockeys_fname` (derived from `fname` with a '.dockeys' extension when
    not given).

    All other arguments are forwarded unchanged to `IndexedCorpus.serialize`.
    """
    key_order = []

    def corpus_as_list():
        # Strip the keys off while remembering their order.
        for k, c in corpus:
            key_order.append(k)
            yield c

    # `__func__` is the underlying function of the bound classmethod. Unlike
    # the Python-2-only `im_func` it exists on Python 2.6+ and Python 3, and it
    # lets us pass our own `serializer` as the class argument.
    IndexedCorpus.serialize.__func__(
        serializer, fname, corpus_as_list(), id2word, index_fname,
        progress_cnt, labels, metadata)

    dockeys_fname = dockeys_fname or utils.smart_extension(fname, '.dockeys')
    utils.pickle(key_order, dockeys_fname)
def createLevenshteinCorpus(self):
    """Build the Levenshtein corpus from the original corpus and the term corpus.

    For every document of the original corpus, the term-corpus rows of all its
    ids are pooled; for each candidate id only the entry with the highest
    value is kept. The resulting vectors are written to
    `<levenshteinCorpusFileName>.mm` and the writer positions are pickled to
    `<that file>.index`.
    """
    location = (self.dataDir, self.dataSaveDir)
    corpus = corpora.MmCorpus('%s/%s/%s.mm' % (location + (self.originalCorpusFileName,)))
    termCorpus = corpora.MmCorpus('%s/%s/%s.mm' % (location + (self.termCorpusFileName,)))
    levCorpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (location + (self.levenshteinCorpusFileName,)))
    levCorpus_writer.write_headers(-1, -1, -1)

    def entry_id(entry):
        return entry[0]

    def entry_value(entry):
        return entry[1]

    num_nnz = 0
    previous_pos = 0
    offsets = []
    for write_index, doc in enumerate(corpus):
        # Pool the candidate entries of every id occurring in this document.
        pooled = [candidate
                  for (term_id, _) in doc
                  for candidate in termCorpus[term_id]]
        pooled.sort(key=entry_id)

        # Group the entries that share an id and keep the maximum of each group.
        levDoc = [max(group, key=entry_value)
                  for _, group in itertools.groupby(pooled, key=entry_id)]

        # Determine offsets for the index file, allowing O(1) access time of
        # documents.
        current_pos = levCorpus_writer.fout.tell()
        if current_pos == previous_pos:
            offsets[-1] = -1
        offsets.append(current_pos)
        previous_pos = current_pos

        # Write the document to the Levenshtein corpus file.
        _, veclen = levCorpus_writer.write_vector(write_index, levDoc)
        num_nnz += veclen

    levCorpus_writer.fake_headers(len(corpus), len(termCorpus), num_nnz)
    levCorpus_writer.close()

    # Write index to file
    utils.pickle(offsets, levCorpus_writer.fname + '.index')
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
    """
    Write `corpus` to `fname` through `serializer.save_corpus` and pickle the
    byte offsets it returns to `index_fname` (derived from `fname` with an
    '.index' extension when not given).

    The corpus class `serializer` is expected to provide, besides iteration:

    * `save_corpus`, returning one byte offset per saved document,
    * `docbyoffset(offset)`, returning the document stored at `offset` bytes
      within the file.

    `metadata` is always forwarded to `save_corpus`; `progress_cnt` and
    `labels` only when they are not None.

    Raises `ValueError` when `corpus.fname` equals `fname`, and
    `NotImplementedError` when `save_corpus` returns None.

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
    >>> print(mm[42]) # retrieve document no. 42, etc.
    """
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    # One call instead of four branches: include an optional argument only
    # when the caller supplied it.
    optional = dict(metadata=metadata)
    if labels is not None:
        optional['labels'] = labels
    if progress_cnt is not None:
        optional['progress_cnt'] = progress_cnt
    offsets = serializer.save_corpus(fname, corpus, id2word, **optional)

    if offsets is None:
        raise NotImplementedError(
            "called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # Store the offsets persistently, using pickle. What gets stored is exactly
    # what the serializer returned; loading is responsible for converting it
    # back into whatever in-memory form it needs.
    logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
    utils.pickle(offsets, index_fname)
def save(self, fname):
    """
    Replacement for the default `save` (which uses cPickle): the large index
    matrix is written on its own in binary format (which can be directly
    mmap'ed) to `fname.npy`, and the remainder of the object is pickled to
    `fname`.
    """
    matrix_fname = fname + '.npy'
    logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, matrix_fname))

    # Take the index off the instance so the pickle below does not include it.
    matrix = self.index
    del self.index
    try:
        utils.pickle(self, fname)        # object without its index
        numpy.save(matrix_fname, matrix)  # the index on its own
    finally:
        # put the index back whether or not saving succeeded
        self.index = matrix
def save(self, fname):
    """
    Replacement for the default `save` (which uses cPickle): the large
    internal arrays of the CSR index are written separately in binary format
    (which can be directly mmap'ed) to `fname.data.npy`, `fname.indptr.npy`
    and `fname.indices.npy`; the array-less object is pickled to `fname`.
    """
    logger.info("storing %s object to %s and %s.npy" % (self.__class__.__name__, fname, fname))
    assert isinstance(self.index, scipy.sparse.csr_matrix)

    matrix = self.index
    array_names = ('data', 'indptr', 'indices')

    # Take the arrays off the matrix so they are not pickled with the object.
    detached = [getattr(matrix, name) for name in array_names]
    for name in array_names:
        delattr(matrix, name)
    try:
        utils.pickle(self, fname)  # store array-less object
        for name, array in zip(array_names, detached):
            numpy.save(fname + '.' + name + '.npy', array)
    finally:
        # put the arrays back whether or not saving succeeded
        for name, array in zip(array_names, detached):
            setattr(self.index, name, array)
def save(self, fname):
    """
    Override the default `save` (which uses cPickle), because that's
    too inefficient and cPickle has bugs. Instead, single out the large
    transformation matrix and store that separately in binary format (that
    can be directly mmap'ed back in `load()`), under `fname.npy`.

    When the model has no projection yet (`self.projection.u is None`), only
    the pickle is written and no `.npy` file is created.
    """
    logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, fname + '.npy'))

    if self.projection.u is None:
        # Model not initialized: there is no projection to store separately.
        # Return here; falling through used to pickle the object a second
        # time and write `None` to the .npy file.
        utils.pickle(self, fname)
        return

    # first, remove the projection from self.__dict__, so it doesn't get pickled
    u = self.projection.u
    del self.projection.u
    try:
        utils.pickle(self, fname)       # store projection-less object
        numpy.save(fname + '.npy', u)   # store projection
    finally:
        # restore the projection whether or not saving succeeded
        self.projection.u = u
def save(self, fname):
    """
    Override the default `save` (which uses cPickle), because that's
    too inefficient and cPickle has bugs. Instead, single out the large
    transformation matrix and store that separately in binary format (that
    can be directly mmap'ed back in `load()`), under `fname.npy`.

    When the model has no projection yet (`self.projection.u is None`), only
    the pickle is written and no `.npy` file is created.
    """
    logger.info("storing %s object to %s and %s" % (self.__class__.__name__, fname, fname + '.npy'))

    if self.projection.u is None:
        # Model not initialized: there is no projection to store separately.
        # Return here; falling through used to pickle the object a second
        # time and pass `None` on to the .npy writer.
        utils.pickle(self, fname)
        return

    # first, remove the projection from self.__dict__, so it doesn't get pickled
    u = self.projection.u
    del self.projection.u
    try:
        utils.pickle(self, fname)                # store projection-less object
        numpy.save(fname + '.npy', ascarray(u))  # store projection
    finally:
        # restore the projection whether or not saving succeeded
        self.projection.u = u
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False, dockeys_fname=None):
    """Serialize a keyed corpus and store the document keys in iteration order.

    `corpus` yields `(key, document)` pairs. The documents are passed on to
    `IndexedCorpus.serialize` with `serializer` as the corpus class; the keys
    are collected in the order they are consumed and pickled to
    `dockeys_fname` (derived from `fname` with a '.dockeys' extension when
    not given).

    All other arguments are forwarded unchanged to `IndexedCorpus.serialize`.
    """
    seen_keys = []

    def documents_only():
        # Hand out the documents while remembering the order of their keys.
        for doc_key, document in corpus:
            seen_keys.append(doc_key)
            yield document

    # Call the plain function behind the classmethod so that `serializer`,
    # not IndexedCorpus, is used as the corpus class.
    base_serialize = IndexedCorpus.serialize.__func__
    base_serialize(serializer, fname, documents_only(), id2word, index_fname,
                   progress_cnt, labels, metadata)

    if not dockeys_fname:
        dockeys_fname = utils.smart_extension(fname, '.dockeys')
    utils.pickle(seen_keys, dockeys_fname)
if __name__ == "__main__":
    # Script entry point: build the vocabulary from './all_sentences.dat',
    # compute the raw co-occurrence matrix and store the shifted PPMI matrix
    # in Matrix Market format under './tmp'.
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    from ppmi import PpmiModel

    in_file = gensim.models.word2vec.LineSentence('./all_sentences.dat')

    def sentences():
        # Fresh pass over the whole input on every call. The previous
        # definition was commented out while `corpus` below still called
        # `sentences()`, which raised NameError on first use.
        return iter(in_file)

    logger.info("dictionary creating")
    id2word = gensim.corpora.Dictionary(in_file, prune_at=UNIQUE_WORDS)
    id2word.filter_extremes(keep_n=TOKEN_LIMIT)
    word2id = dict((v, k) for k, v in id2word.items())
    utils.pickle(word2id, './tmp/word2id')
    id2word = gensim.utils.revdict(word2id)

    # filter sentences to contain only the dictionary words
    corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())

    logger.info("PMI matrix creating")
    logger.info("raw cooccurrence matrix creating")
    raw = get_cooccur(corpus(), word2id, window=WINDOW, dynamic_window=DYNAMIC_WINDOW)
    numpy.save('./tmp/cooccur.npy', raw)

    # store the SPPMI matrix in sparse Matrix Market format on disk
    gensim.corpora.MmCorpus.serialize('./tmp/pmi_matrix.mm', raw2ppmi(raw, word2id, k_shift=NEGATIVE or 1))
    del raw
# Script body: build the vocabulary files in the output directory given as the
# second command-line argument, unless a pickled 'word2id' already exists there.
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 3:
    # not enough arguments: show the module docstring as usage text
    print(globals()['__doc__'] % locals())
    sys.exit(1)

in_file = gensim.models.word2vec.LineSentence(sys.argv[1])
outf = lambda prefix: os.path.join(sys.argv[2], prefix)
logger.info("output file template will be %s" % outf('PREFIX'))

# at most DOC_LIMIT sentences, restarted from the beginning on every call
sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

if os.path.exists(outf('word2id')):
    logger.info("dictionary already exists")
else:
    logger.info("dictionary not found, creating")
    id2word = gensim.corpora.Dictionary(sentences(), prune_at=PRUNE_AT)
    id2word.save_as_text(outf("full_vocab.txt"))
    utils.pickle(id2word, outf('id2word'))

    id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
    word2id = dict((v, k) for k, v in id2word.iteritems())

    # Write the pruned vocabulary as CSV. The file is opened in a context
    # manager so it is flushed and closed; it used to be left open.
    with open(outf("TOKEN_LIMIT_vocab.txt"), "w") as vocab_file:
        w = csv.writer(vocab_file)
        for key, val in word2id.items():
            w.writerow([key.encode('utf-8').lower(), val])

    utils.pickle(word2id, outf('word2id'))
    id2word = gensim.utils.revdict(word2id)

logger.info("finished running %s" % program)
outf = lambda prefix: os.path.join(output_dir, prefix) logger.info("output file template will be %s" % outf('PREFIX')) sentences = MyCorpus(corpus_path) if os.path.exists(outf('word2id')): logger.info("dictionary found, loading") word2id = utils.unpickle(outf('word2id')) else: logger.info("dictionary not found, creating") id2word = corpora.Dictionary(sentences, prune_at=10000000) id2word.filter_extremes( keep_n=TOKEN_LIMIT) # filter out too freq/infreq words word2id = dict((v, k) for k, v in id2word.iteritems()) utils.pickle(word2id, outf('word2id')) id2word = utils.revdict(word2id) # Filter all wiki documents to contain only those words. corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences) if os.path.exists(outf('kw2v_%s' % GAMMA)): logger.info("Kernel word2vec model found, loading") # model = utils.unpickle(outf('kw2v')) model = Word2Vec.load_word2vec_format(outf('kw2v_%s' % GAMMA), binary=True) else: logger.info("Kernel word2vec model not found, creating") if NEGATIVE: model = Word2Vec(size=DIM,
sys.exit(1) in_file = gensim.models.word2vec.LineSentence(sys.argv[1]) # in_file = gensim.models.word2vec.Text8Corpus(sys.argv[1]) q_file = sys.argv[2] outf = lambda prefix: os.path.join(sys.argv[3], prefix) logger.info("output file template will be %s" % outf('PREFIX')) sentences = lambda: itertools.islice(in_file, DOC_LIMIT) # use only a small subset of all words; otherwise the methods based on matrix # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size). logger.info("dictionary found, loading") with open(outf("pruned_vocab.csv")) as csvfile: reader = csv.reader(csvfile) word2id = dict((rows[0],rows[1]) for rows in reader) utils.pickle(word2id, outf('word2id')) id2word = gensim.utils.revdict(word2id) # filter sentences to contain only the dictionary words corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences()) if 'word2vec' in program: if os.path.exists(outf('w2v')): logger.info("word2vec model found, loading") model = utils.unpickle(outf('w2v')) else: logger.info("word2vec model not found, creating") if NEGATIVE: model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE) else:
dictionary = gensim.corpora.dictionary.Dictionary.from_corpus( corpus, id2word=id2word) logger.info("calculating truncated SVD") lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=DIM) self.singular_scaled = lsi.projection.s**s_exponent # embeddings = left singular vectors scaled by the (exponentiated) singular values self.word_vectors = lsi.projection.u * self.singular_scaled if __name__ == "__main__": logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) from svd import SvdModel word2id = utils.unpickle('./tmp/word2id') id2word = gensim.utils.revdict(word2id) logger.info("SVD model creating") corpus = gensim.corpora.MmCorpus('./tmp/pmi_matrix.mm') model = SvdModel(corpus, id2word, s_exponent=0.0) model.word2id = word2id model.id2word = id2word utils.pickle(model, './model/svd.model') logger.info("finished running svd")
outf = lambda prefix: os.path.join(sys.argv[3], prefix) logger.info("output file template will be %s" % outf('PREFIX')) sentences = lambda: itertools.islice(in_file, DOC_LIMIT) # use only a small subset of all words; otherwise the methods based on matrix # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size). if os.path.exists(outf('word2id')): logger.info("dictionary found, loading") word2id = utils.unpickle(outf('word2id')) else: logger.info("dictionary not found, creating") id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000) id2word.filter_extremes(keep_n=TOKEN_LIMIT) # filter out too freq/infreq words word2id = dict((v, k) for k, v in id2word.iteritems()) utils.pickle(word2id, outf('word2id')) id2word = gensim.utils.revdict(word2id) # filter sentences to contain only the dictionary words corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences()) if 'word2vec' in program: if os.path.exists(outf('w2v')): logger.info("word2vec model found, loading") model = utils.unpickle(outf('w2v')) else: logger.info("word2vec model not found, creating") if NEGATIVE: model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE) else: model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
    """Write `corpus` to `fname` and pickle the per-document byte offsets next to it.

    The offsets returned by `serializer.save_corpus` are what make direct
    indexing possible after loading.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word.
    index_fname : str, optional
        Where to save the resulting index; when None, derived from `fname`
        with an '.index' extension.
    progress_cnt : int, optional
        Number of documents after which progress info is printed. Forwarded
        to `save_corpus` only when not None.
    labels : bool, optional
        If True - ignore first column (class labels). Forwarded to
        `save_corpus` only when not None.
    metadata : bool, optional
        If True - ensure that serialize will write out article titles to a
        pickle file. Always forwarded to `save_corpus`.

    Raises
    ------
    ValueError
        If `corpus.fname` equals `fname`.
    NotImplementedError
        If `serializer.save_corpus` returns None.

    Examples
    --------
    >>> from gensim.corpora import MmCorpus
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
    >>> output_fname = get_tmpfile("test.mm")
    >>>
    >>> MmCorpus.serialize(output_fname, corpus)
    >>> mm = MmCorpus(output_fname)  # `mm` document stream now has random access
    >>> print(mm[1])  # retrieve document no. 1
    [(1, 0.1)]

    """
    same_file = getattr(corpus, 'fname', None) == fname
    if same_file:
        raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    save_args = {'metadata': metadata}
    save_args.update(
        (arg_name, arg_value)
        for arg_name, arg_value in (('progress_cnt', progress_cnt), ('labels', labels))
        if arg_value is not None)

    offsets = serializer.save_corpus(fname, corpus, id2word, **save_args)
    if offsets is None:
        raise NotImplementedError(
            "Called serialize on class %s which doesn't support indexing!" % serializer.__name__
        )

    # Store the offsets persistently, using pickle. What gets stored is exactly
    # what the serializer returned; loading is responsible for converting it
    # back into whatever in-memory form it needs.
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False):
    """Serialize `corpus` through `serializer` and store the resulting offsets as a pickled index.

    Parameters
    ----------
    serializer : class
        Object providing ``save_corpus(fname, corpus, id2word, **kwargs)`` and a ``__name__``.
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Corpus in BoW format.
    id2word : dict of (int, str), optional
        Mapping id -> word, passed through to ``save_corpus``.
    index_fname : str, optional
        Where to save resulting index, if None - ``utils.smart_extension(fname, '.index')`` is used.
    progress_cnt : int, optional
        Number of documents after which progress info is printed (passed on only if not None).
    labels : bool, optional
        If True - ignore first column (class labels) (passed on only if not None).
    metadata : bool, optional
        If True - ensure that serialize will write out article titles to a pickle file.

    Raises
    ------
    ValueError
        If ``corpus.fname`` exists and equals `fname`.
    NotImplementedError
        If ``serializer.save_corpus`` returns None instead of offsets.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
        >>> output_fname = get_tmpfile("test.mm")
        >>>
        >>> MmCorpus.serialize(output_fname, corpus)
        >>> mm = MmCorpus(output_fname)  # `mm` document stream now has random access
        >>> print(mm[1])  # retrieve document no. 1, etc.
        [(1, 0.1)]

    """
    if getattr(corpus, 'fname', None) == fname:
        message = "identical input vs. output corpus filename, refusing to serialize: %s" % fname
        raise ValueError(message)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    # Only options the caller actually set are forwarded; `metadata` always is.
    optional = (('progress_cnt', progress_cnt), ('labels', labels))
    explicitly_set = {key: value for key, value in optional if value is not None}
    kwargs = dict(metadata=metadata, **explicitly_set)

    offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)
    if offsets is None:
        message = "Called serialize on class %s which doesn't support indexing!" % serializer.__name__
        raise NotImplementedError(message)

    # store offsets persistently, using pickle
    # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
    # the offsets that are actually stored on disk - we're not storing self.index in any case, the
    # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
    # backwards compatibility
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
# Log the output filename pattern, using 'PREFIX' as a placeholder argument to outf().
logger.info("output file template will be %s" % outf('PREFIX'))

# Callable returning an iterator over `in_file`, capped at DOC_LIMIT items.
sentences = lambda: itertools.islice(in_file, DOC_LIMIT)

# use only a small subset of all words; otherwise the methods based on matrix
# decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size).
if os.path.exists(outf('word2id')):
    # A pickled word -> id mapping already exists at outf('word2id'); reuse it.
    logger.info("dictionary found, loading")
    word2id = utils.unpickle(outf('word2id'))
else:
    logger.info("dictionary not found, creating")
    id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000)
    id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
    # Invert id -> word into word -> id and pickle it to outf('word2id').
    word2id = dict((v, k) for k, v in id2word.iteritems())
    utils.pickle(word2id, outf('word2id'))
# NOTE(review): placed after the if/else (so it runs on both branches) -- the original
# layout was lost in this excerpt; confirm against the full file.
id2word = gensim.utils.revdict(word2id)

# filter sentences to contain only the dictionary words
corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences())

if 'word2vec' in program:
    if os.path.exists(outf('w2v')):
        logger.info("word2vec model found, loading")
        model = utils.unpickle(outf('w2v'))
    else:
        logger.info("word2vec model not found, creating")
        if NEGATIVE:
            # NOTE(review): this call continues beyond the visible excerpt.
            model = gensim.models.Word2Vec(size=DIM, min_count=0,
# Read back the pickled mapping from document index to (pageid, article_title).
id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')

# Build the reverse mapping, from article title to document index.
# Each item is (index, (pageid, article_title)), e.g. (0, ('12', 'Anarchism'));
# position 1 of the value is the title. The pageid is not used.
titles_to_id = {meta[1]: doc_index for doc_index, meta in id_to_titles.items()}

# Persist the reverse mapping.
utils.pickle(titles_to_id, './data/titles_to_id.pickle')

# The title mappings are no longer needed in memory.
del id_to_titles, titles_to_id

# To reclaim more memory, drop the original dictionary and wiki objects too;
# the dictionary is reloaded from its file just below.
del dictionary, wiki

# Load the dictionary back from disk.
# (0.86sec on my machine loading from an SSD)
dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
    """Save the corpus to disk in Matrix Market format.

    Parameters
    ----------
    fname : str
        Filename of the resulting file.
    corpus : iterable of list of (int, number)
        Corpus in Bow format.
    progress_cnt : int, optional
        Print progress for every `progress_cnt` number of documents.
        A falsy value (0 or None) disables progress logging.
    index : bool, optional
        If True, the offsets will be return, otherwise return None.
    num_terms : int, optional
        If provided (and non-zero), written to the header instead of the number of
        terms derived from the largest term id seen while writing.
    metadata : bool, optional
        If True and `corpus` has a ``metadata`` attribute, each document is expected to be
        a ``(bow, data)`` pair and the collected data is pickled to ``fname + '.metadata.cpickle'``.
        If `corpus` has no ``metadata`` attribute, this flag is treated as False.

    Returns
    -------
    offsets : {list of int, None}
        List of offsets (if index=True) or nothing.

    Notes
    -----
    Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM.

    While writing, ``corpus.metadata`` (if present) is temporarily set to `metadata`; the original
    value is always put back afterwards, also when `metadata` is False or an error occurs.
    The writer is closed even if writing fails.

    See Also
    --------
    :func:`~gensim.corpora.mmcorpus.MmCorpus.save_corpus`

    """
    mw = MmWriter(fname)
    try:
        # write empty headers to the file (with enough space to be overwritten later)
        mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line

        # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
        _num_terms, num_nnz = 0, 0
        docno, poslast = -1, -1
        offsets = []
        docno2metadata = {}

        corpus_has_metadata = hasattr(corpus, 'metadata')
        if corpus_has_metadata:
            orig_metadata = corpus.metadata
            corpus.metadata = metadata
        else:
            # the corpus cannot yield metadata, so documents are plain BoW vectors
            metadata = False

        try:
            for docno, doc in enumerate(corpus):
                if metadata:
                    bow, data = doc
                    docno2metadata[docno] = data
                else:
                    bow = doc

                # guard against progress_cnt being 0/None, which would break the modulo
                if progress_cnt and docno % progress_cnt == 0:
                    logger.info("PROGRESS: saving document #%i", docno)

                if index:
                    posnow = mw.fout.tell()
                    if posnow == poslast:
                        # the file position did not move since the previous document,
                        # i.e. that document wrote nothing: flag its offset with -1
                        offsets[-1] = -1
                    offsets.append(posnow)
                    poslast = posnow

                max_id, veclen = mw.write_vector(docno, bow)
                _num_terms = max(_num_terms, 1 + max_id)
                num_nnz += veclen
        finally:
            # always hand the corpus back in the state the caller gave it to us
            if corpus_has_metadata:
                corpus.metadata = orig_metadata

        if metadata:
            utils.pickle(docno2metadata, fname + '.metadata.cpickle')

        num_docs = docno + 1
        num_terms = num_terms or _num_terms

        if num_docs * num_terms != 0:
            logger.info(
                "saved %ix%i matrix, density=%.3f%% (%i/%i)",
                num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms),
                num_nnz, num_docs * num_terms
            )

        # now write proper headers, by seeking and overwriting the spaces written earlier
        mw.fake_headers(num_docs, num_terms, num_nnz)
    finally:
        # release the output file even if iteration or writing raised
        mw.close()

    if index:
        return offsets
def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
    """Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    Parameters
    ----------
    fname : str
        Path to the output file. The state is saved to ``utils.smart_extension(fname, '.state')``
        and the dictionary is pickled to ``utils.smart_extension(fname, '.id2word')``.
    ignore : {list of str, str, None}, optional
        Variables which should be ignored, i.e. left out from the pickled lda model.
        By default the internal `state` is ignored as it uses its own serialisation,
        not the one provided by `LdaModel`. 'state', 'dispatcher' and 'id2word' are always
        added to whatever is passed here. The dictionary is pickled separately unless
        'id2word' is explicitly listed. None, '' and empty entries are accepted and dropped.
        The default list is never mutated; `ignore` is only rebound locally.
    separately : {list of str, str, None}, optional
        Arrays which should be stored in separate files. 'expElogbeta' and 'sstats' are always
        included; 'alpha' / 'eta' are included when they are the string 'auto' or an array
        whose shape is not one-dimensional.
    *args, **kwargs
        Forwarded to ``self.state.save`` and to the parent class ``save``.

    Notes
    -----
    Do not save as a compressed file if you intend to load the file back with `mmap`.

    If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all np arrays using np, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.

    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

    # Normalise `ignore` to a clean list *before* looking anything up in it: the membership
    # test below must not fail on None, nor do a substring match on a plain string.
    if ignore is None:
        requested_ignore = []
    else:
        if isinstance(ignore, six.string_types):
            ignore = [ignore]
        requested_ignore = [e for e in ignore if e]  # make sure None and '' are not in the list

    # Save the dictionary separately if not in 'ignore'.
    if 'id2word' not in requested_ignore:
        utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

    # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
    # someone sets the ignore list themselves
    if requested_ignore:
        ignore = list(set(['state', 'dispatcher', 'id2word']) | set(requested_ignore))
    else:
        ignore = ['state', 'dispatcher', 'id2word']

    # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
    # someone sets the separately list themselves.
    separately_explicit = ['expElogbeta', 'sstats']
    # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
    # array manually.
    for prior_name in ('alpha', 'eta'):
        prior = getattr(self, prior_name)
        if isinstance(prior, six.string_types):
            store_separately = prior == 'auto'
        else:
            # only objects exposing a `shape` can qualify; anything else is left inline
            # instead of raising AttributeError
            prior_shape = getattr(prior, 'shape', None)
            store_separately = prior_shape is not None and len(prior_shape) != 1
        if store_separately:
            separately_explicit.append(prior_name)

    # Merge separately_explicit with separately.
    if separately:
        if isinstance(separately, six.string_types):
            separately = [separately]
        separately = [e for e in separately if e]  # make sure None and '' are not in the list
        separately = list(set(separately_explicit) | set(separately))
    else:
        separately = separately_explicit
    super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)
def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
    """Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    Parameters
    ----------
    fname : str
        Path to the output file. The state is saved to ``utils.smart_extension(fname, '.state')``
        and the dictionary is pickled to ``utils.smart_extension(fname, '.id2word')``.
    ignore : {list of str, str, None}, optional
        Variables which should be ignored, i.e. left out from the pickled lda model.
        By default the internal `state` is ignored as it uses its own serialisation,
        not the one provided by `LdaModel`. 'state', 'dispatcher' and 'id2word' are always
        added to whatever is passed here. The dictionary is pickled separately unless
        'id2word' is explicitly listed. None, '' and empty entries are accepted and dropped.
        The default list is never mutated; `ignore` is only rebound locally.
    separately : {list of str, str, None}, optional
        Arrays which should be stored in separate files. 'expElogbeta' and 'sstats' are always
        included; 'alpha' / 'eta' are included when they are the string 'auto' or a
        ``np.ndarray`` whose shape is not one-dimensional.
    *args, **kwargs
        Forwarded to ``self.state.save`` and to the parent class ``save``.

    Notes
    -----
    Do not save as a compressed file if you intend to load the file back with `mmap`.

    If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all np arrays using np, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.

    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

    # Clean up the caller's `ignore` first, so that the 'id2word' lookup below works for
    # None (no TypeError) and for a single string (exact match, not a substring match).
    if ignore is None:
        user_ignore = []
    else:
        if isinstance(ignore, six.string_types):
            ignore = [ignore]
        user_ignore = [e for e in ignore if e]  # make sure None and '' are not in the list

    # Save the dictionary separately if not in 'ignore'.
    if 'id2word' not in user_ignore:
        utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

    # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
    # someone sets the ignore list themselves
    if user_ignore:
        ignore = list(set(['state', 'dispatcher', 'id2word']) | set(user_ignore))
    else:
        ignore = ['state', 'dispatcher', 'id2word']

    # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
    # someone sets the separately list themselves.
    separately_explicit = ['expElogbeta', 'sstats']
    # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
    # array manually.
    alpha_is_auto = isinstance(self.alpha, six.string_types) and self.alpha == 'auto'
    alpha_is_nd = isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1
    if alpha_is_auto or alpha_is_nd:
        separately_explicit.append('alpha')
    eta_is_auto = isinstance(self.eta, six.string_types) and self.eta == 'auto'
    eta_is_nd = isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1
    if eta_is_auto or eta_is_nd:
        separately_explicit.append('eta')

    # Merge separately_explicit with separately.
    if separately:
        if isinstance(separately, six.string_types):
            separately = [separately]
        separately = [e for e in separately if e]  # make sure None and '' are not in the list
        separately = list(set(separately_explicit) | set(separately))
    else:
        separately = separately_explicit
    super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)