def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunks=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunks = chunks

    if corpus is not None:
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            self.index[docno] = matutils.unitvec(matutils.sparse2full(vector, num_features))
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
    """
    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Corpus in streamed Gensim bag-of-words format.
    num_best : int, optional
        If set, return only the `num_best` most similar documents, always leaving out
        documents with similarity = 0. Otherwise, return a full vector with one float
        for every document in the index.
    num_features : int
        Size of the dictionary (number of features).
    corpus_len : int, optional
        Number of documents in `corpus`. If not specified, will scan the corpus to
        determine the matrix size.
    chunksize : int, optional
        Size of query chunks. Used internally when the query is an entire corpus.
    dtype : numpy.dtype, optional
        Datatype to store the internal matrix in.
    """
    if num_features is None:
        logger.warning(
            "scanning corpus to determine the number of features "
            "(consider setting `num_features` explicitly)"
        )
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    if corpus_len is None:
        corpus_len = len(corpus)

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError(
                "cannot index a corpus with zero features (you must specify either `num_features` "
                "or a non-empty corpus in the constructor)"
            )
        logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
        self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)

        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
            # individual documents may in fact be in numpy or scipy.sparse format as well;
            # this is not documented because it is not fully supported throughout
            # (no normalization, `num_features` must be supplied explicitly, etc.),
            # so the user had better know what they are doing.
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
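# A minimal usage sketch for the constructor above, assuming it is the
# __init__ of Gensim's MatrixSimilarity class (the dense cosine-similarity
# index also invoked in the last constructor below); the toy corpus and
# feature ids are invented for illustration.
from gensim import similarities

toy_corpus = [
    [(0, 1.0), (1, 2.0)],  # document 0, bag-of-words (feature_id, weight) pairs
    [(1, 1.0), (2, 1.0)],  # document 1
    [(0, 0.5), (2, 2.0)],  # document 2
]
# passing num_features explicitly avoids the corpus scan warned about above
index = similarities.MatrixSimilarity(toy_corpus, num_features=3)
# querying with a single document returns one cosine similarity per indexed document
sims = index[[(0, 1.0), (2, 1.0)]]
print(list(enumerate(sims)))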
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning(
            "scanning corpus to determine the number of features (consider setting `num_features` explicitly)"
        )
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    if corpus_len is None:
        corpus_len = len(corpus)

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError(
                "cannot index a corpus with zero features (you must specify either "
                "`num_features` or a non-empty corpus in the constructor)"
            )
        logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
        self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)

        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
            # individual documents may in fact be in numpy or scipy.sparse format as well;
            # this is not documented because it is not fully supported throughout
            # (no normalization, `num_features` must be supplied explicitly, etc.),
            # so the user had better know what they are doing.
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
def __init__(self, corpus, document_titles, num_clusters=None, num_features=None):
    """
    Computes the interpreter matrix by calculating the TF-IDF value of each
    token in each concept (doc) of the corpus.

    `document_titles` gives the name of each concept (doc) in the corpus and
    `num_features` the number of features of the corpus. If `num_clusters` is
    None, all documents are used as concepts.
    """
    if not num_clusters:
        self.num_clusters = len(document_titles)
    else:
        self.num_clusters = num_clusters

    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features

    # reduce column count by k-medoid clustering, keeping the medoid of each cluster
    # TODO: skip clustering when num_clusters is None
    clusterer = KMedoids(corpus=corpus, num_features=self.num_features,
                         num_clusters=self.num_clusters, max_iterations=10)
    clusters = clusterer.cluster()

    # set the corpus to the medoids: the corpus is our interpreter matrix
    # (not sparse); each column is a doc and is seen as a concept
    self.corpus = clusterer.get_medoids().T

    # reduce document titles
    self.document_titles = DocumentTitles()
    for cluster_id in clusters.iterkeys():
        self.document_titles.append(document_titles[cluster_id] or "no title")

    # print clusters with their members
    for cluster_id, members in clusters.iteritems():
        cluster_title = document_titles[cluster_id]
        member_titles = ", ".join(document_titles[member_id] for member_id in members)
        logger.debug("%s: %s", cluster_title, member_titles)
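# A hedged sketch of how the interpreter matrix built above could be applied,
# assuming `get_medoids()` returns a dense (num_clusters x num_features) array,
# so that after the transpose each column of `self.corpus` is one concept.
# The function name `interpret` is illustrative, not from the source.
import numpy

def interpret(interpreter_matrix, doc_vector):
    # doc_vector: dense vector of length num_features;
    # result: one score per concept, i.e. the document expressed in concept space
    return numpy.dot(doc_vector, interpreter_matrix)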
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    if corpus_len is None:
        corpus_len = len(corpus)

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError(
                "cannot index a corpus with zero features (you must specify either `num_features` "
                "or a non-empty corpus in the constructor)"
            )
        logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
        self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)

        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
            # individual documents may in fact be in numpy or scipy.sparse format as well;
            # this is not documented because it is not fully supported throughout
            # (no normalization, `num_features` must be supplied explicitly, etc.),
            # so the user had better know what they are doing.
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
def __init__(self, corpus, document_titles, test_corpus, test_corpus_targets, num_test_corpus,
             num_best_features=1000, num_features=None, tmp_path='complete_similarity'):
    """
    The features created are the similarities between a document and each
    document of the corpus. The corpus is filtered down to significant features.

    Parameters
    ----------
    corpus :
        The corpus which contains all concepts, e.g. the English Wikipedia in TF-IDF space.
    test_corpus :
        The test corpus used to select features. All documents in this corpus
        should be classified.
    test_corpus_targets :
        The target classes of each document in the test corpus.
    num_test_corpus :
        Number of documents in the test corpus.
    document_titles :
        The names of each concept (doc) in the corpus.
    num_features :
        The number of features of the corpus.
    num_best_features :
        Number of features to select for the cESA model. To use all concepts
        as features, set num_best_features to the size of the corpus.
    """
    if num_features is None:
        logger.info("scanning corpus to determine the number of features")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features

    # create similarity index of the complete corpus
    complete_similarity_index = Similarity(output_prefix=tmp_path, corpus=corpus,
                                           num_features=self.num_features)

    # reduce concept count by feature selection
    self.selector = iSelectKBest(if_classif, k=num_best_features)

    # transform each document of test_corpus
    logger.info("Test corpus of %d documents...", num_test_corpus)
    transformed_test_corpus = (complete_similarity_index[doc] for doc in test_corpus)

    logger.info("Selecting best features...")
    X_y = itertools.izip(transformed_test_corpus, test_corpus_targets)
    self.selector.fit(X_y, len(document_titles))
    logger.info("Done selecting.")

    # reduce similarity index
    selected_documents = [doc for doc, mask in itertools.izip(corpus, self.selector.get_support()) if mask]
    self.similarity_index = MatrixSimilarity(corpus=selected_documents, num_features=self.num_features)

    # reduce document titles
    self.document_titles = DocumentTitles()
    for doc_title, mask in itertools.izip(document_titles, self.selector.get_support()):
        if mask:
            self.document_titles.append(doc_title)

    # print doc titles
    for title in self.document_titles:
        logger.debug("%s", title)
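# For orientation: `iSelectKBest` and `if_classif` above look like incremental
# counterparts of scikit-learn's SelectKBest and f_classif. A minimal sketch of
# the analogous batch selection step in scikit-learn (the sample data is made up):
import numpy
from sklearn.feature_selection import SelectKBest, f_classif

X = numpy.random.rand(20, 100)      # 20 test docs x 100 concept-similarity features
y = numpy.random.randint(0, 2, 20)  # target class of each test doc
selector = SelectKBest(f_classif, k=10).fit(X, y)
mask = selector.get_support()       # boolean mask over the 100 concepts, as used above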
    corpora.MmCorpus.serialize('corpus/' + doc_file.split('/')[-1] + str(num_topics_lsi) +
                               '.lda.mm._f' + str(freq_th), corpus_lda)
    return corpus_lda


if __name__ == '__main__':
    if create_dictionary:
        dictionary = create_dict(dict_file)
        print("***Dictionary is created")
    else:
        dictionary = corpora.Dictionary.load('dictionary/' + dict_file.split('/')[-1] + '.gensim')
        print("***Dictionary is loaded")

    if create_corpus_bow:
        corpus_bow = create_corp_bow(doc_file)
        print("***BOW corpus is created")
        num_features = 1 + utils.get_max_id(corpus_bow)
        index_bow = similarities.Similarity(output_prefix=None, corpus=corpus_bow,
                                            num_features=num_features)
        print("***BOW corpus index is created")
        index_bow.save('index/' + doc_file.split('/')[-1] + '.bow.index._f' + str(freq_th))
    else:
        # load BOW corpus
        corpus_bow = corpora.MmCorpus('corpus/' + doc_file.split('/')[-1] + '.bow.mm._f' + str(freq_th))
        print("***BOW corpus is loaded")
        index_bow = similarities.Similarity.load('index/' + doc_file.split('/')[-1] +
                                                 '.bow.index._f' + str(freq_th))
        print("***BOW corpus index is loaded")
        dictionary = corpora.Dictionary.load('dictionary/' + doc_file.split('/')[-1] +
                                             '.dict._f' + str(freq_th))
        print("***Dictionary2 is loaded")

    if create_corpus_tfidf:
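# (The script is truncated above.) A hedged, standalone sketch of querying the
# BOW index built in the branches above; `dictionary` and `index_bow` refer to
# the objects created or loaded there, and the query text is invented.
query_bow = dictionary.doc2bow("example query text".lower().split())
sims = index_bow[query_bow]  # one cosine similarity per indexed document
print(list(enumerate(sims))[:10])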