import numpy as np
from numpy.linalg import LinAlgError
from threadpoolctl import threadpool_limits
from tqdm import tqdm

# `controlled_compute`, `fix_knn_query`, and `logger` are helpers provided by the
# surrounding package.


class AnnStream:
    def __init__(self, data, k: int, n_cluster: int, reduction_method: str,
                 dims: int, loadings: np.ndarray, use_for_pca: np.ndarray,
                 mu: np.ndarray, sigma: np.ndarray, ann_metric: str,
                 ann_efc: int, ann_ef: int, ann_m: int, nthreads: int,
                 ann_parallel: bool, rand_state: int, do_kmeans_fit: bool,
                 disable_scaling: bool, ann_idx):
        self.data = data
        self.k = k
        if self.k >= self.data.shape[0]:
            self.k = self.data.shape[0] - 1
        self.nClusters = max(n_cluster, 2)
        self.dims = dims
        self.loadings = loadings
        if self.dims is None and self.loadings is None:
            raise ValueError(
                "ERROR: Provide a value for at least one of: 'dims' or 'loadings'"
            )
        self.annMetric = ann_metric
        self.annEfc = ann_efc
        self.annEf = ann_ef
        self.annM = ann_m
        self.nthreads = nthreads
        if ann_parallel:
            self.annThreads = self.nthreads
        else:
            self.annThreads = 1
        self.randState = rand_state
        self.batchSize = self._handle_batch_size()
        self.method = reduction_method
        self.nCells, self.nFeats = self.data.shape
        self.clusterLabels: np.ndarray = np.repeat(-1, self.nCells)
        disable_reduction = False
        if self.dims < 1:
            disable_reduction = True
        with threadpool_limits(limits=self.nthreads):
            if self.method == 'pca':
                self.mu, self.sigma = mu, sigma
                if self.loadings is None or len(self.loadings) == 0:
                    if len(use_for_pca) != self.nCells:
                        raise ValueError(
                            "ERROR: `use_for_pca` does not have the same length as nCells"
                        )
                    if disable_reduction is False:
                        self._fit_pca(disable_scaling, use_for_pca)
                else:
                    # Even though dims might already have been adjusted according to the
                    # loadings before calling AnnStream, it could still be overwritten
                    # by _handle_batch_size. Hence it needs to be hard-set here. It is
                    # okay for dims to be larger than the batch size here because the
                    # PCA will not be fitted.
                    self.dims = self.loadings.shape[1]
                if disable_scaling:
                    if disable_reduction:
                        self.reducer = lambda x: x
                    else:
                        self.reducer = lambda x: x.dot(self.loadings)
                else:
                    if disable_reduction:
                        self.reducer = lambda x: self.transform_z(x)
                    else:
                        self.reducer = lambda x: self.transform_z(x).dot(
                            self.loadings)
            elif self.method == 'lsi':
                if self.loadings is None or len(self.loadings) == 0:
                    if disable_reduction is False:
                        self._fit_lsi()
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            elif self.method == 'custom':
                if self.loadings is None or len(self.loadings) == 0:
                    logger.warning(
                        "No loadings provided for manual dimension reduction")
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            else:
                raise ValueError(
                    f"ERROR: Unknown reduction method: {self.method}")
            if ann_idx is None:
                self.annIdx = self._fit_ann()
            else:
                self.annIdx = ann_idx
                self.annIdx.set_ef(self.annEf)
                self.annIdx.set_num_threads(1)
            self.kmeans = self._fit_kmeans(do_kmeans_fit)

    def _handle_batch_size(self):
        if self.dims is None:
            # `loadings` must have been provided (see the check in __init__)
            self.dims = self.loadings.shape[1]
        if self.dims > self.data.shape[0]:
            self.dims = self.data.shape[0]
        batch_size = self.data.chunksize[0]  # Assuming all chunks are the same size
        if self.dims >= batch_size:
            self.dims = batch_size - 1  # -1 because the PCA is fitted with dims + 1 components
            logger.info(
                f"Number of PCA/LSI components reduced to batch size of {batch_size}"
            )
        if self.nClusters > batch_size:
            self.nClusters = batch_size
            logger.info(f"Cluster number reduced to batch size of {batch_size}")
        return batch_size

    def iter_blocks(self, msg: str = '') -> np.ndarray:
        for i in tqdm(self.data.blocks, desc=msg,
                      total=self.data.numblocks[0]):
            yield controlled_compute(i, self.nthreads)

    def transform_z(self, a: np.ndarray) -> np.ndarray:
        return (a - self.mu) / self.sigma

    def transform_ann(self, a: np.ndarray, k: int = None,
                      self_indices: np.ndarray = None) -> tuple:
        if k is None:
            k = self.k
        if self_indices is None:
            i, d = self.annIdx.knn_query(a, k=k)
            return i, d
        else:
            # Add +1 to k because the first neighbour will be the query itself
            i, d = self.annIdx.knn_query(a, k=k + 1)
            return fix_knn_query(i, d, self_indices)

    def _fit_pca(self, disable_scaling, use_for_pca) -> None:
        from sklearn.decomposition import IncrementalPCA

        # We fit one extra PC than specified and then ignore the last PC.
        self._pca = IncrementalPCA(n_components=self.dims + 1,
                                   batch_size=self.batchSize)
        do_sample_subset = use_for_pca.sum() != self.nCells
        s, e = 0, 0
        # The first block of values is stored in `end_reservoir`. If a case arises
        # where fewer than dims+1 cells are left to fit, those cells can be added to
        # `end_reservoir` for fitting. If there are no such cells, the end reservoir
        # is fitted by itself after the rest of the cells. It may also be the case
        # that the first batch itself has fewer than dims+1 cells; in that case,
        # cells are added to the `carry_over` pile until it is big enough.
        end_reservoir = []
        # `carry_over` stores cells that cannot yet be added to `end_reservoir` or
        # be used for fitting the PCA directly.
        carry_over = []
        for i in self.iter_blocks(msg='Fitting PCA'):
            if do_sample_subset:
                e = s + i.shape[0]
                i = i[use_for_pca[s:e]]
                s = e
            if disable_scaling is False:
                i = self.transform_z(i)
            if len(carry_over) > 0:
                i = np.vstack((carry_over, i))
                carry_over = []
            if len(i) < (self.dims + 1):
                carry_over = i
                continue
            if len(end_reservoir) == 0:
                end_reservoir = i
                continue
            try:
                self._pca.partial_fit(i, check_input=False)
            except LinAlgError:
                # TODO: Add a retry counter to make sure memory consumption
                # doesn't escalate
                carry_over = i
        if len(carry_over) > 0:
            i = np.vstack((end_reservoir, carry_over))
        else:
            i = end_reservoir
        try:
            self._pca.partial_fit(i, check_input=False)
        except LinAlgError:
            logger.warning(
                f"{i.shape[0]} samples were not used in PCA fitting due to LinAlgError")
        self.loadings = self._pca.components_[:-1, :].T

    def _fit_lsi(self) -> None:
        from gensim.models import LsiModel
        from gensim.matutils import Dense2Corpus

        self._lsiModel = LsiModel(
            Dense2Corpus(
                controlled_compute(self.data.blocks[0], self.nthreads).T),
            num_topics=self.dims,
            chunksize=self.data.chunksize[0],
            id2word={x: x for x in range(self.data.shape[1])},
            extra_samples=0)
        for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")):
            if n == 0:
                continue
            self._lsiModel.add_documents(Dense2Corpus(i.T))
        self.loadings = self._lsiModel.get_topics().T

    def _fit_ann(self):
        import hnswlib

        dims = self.dims
        if dims < 1:
            dims = self.data.shape[1]
        ann_idx = hnswlib.Index(space=self.annMetric, dim=dims)
        ann_idx.init_index(max_elements=self.nCells,
                           ef_construction=self.annEfc,
                           M=self.annM,
                           random_seed=self.randState)
        ann_idx.set_ef(self.annEf)
        ann_idx.set_num_threads(self.annThreads)
        for i in self.iter_blocks(msg='Fitting ANN'):
            ann_idx.add_items(self.reducer(i))
        return ann_idx

    def _fit_kmeans(self, do_kmeans_fit):
        from sklearn.cluster import MiniBatchKMeans

        if do_kmeans_fit is False:
            return None
        kmeans = MiniBatchKMeans(n_clusters=self.nClusters,
                                 random_state=self.randState,
                                 batch_size=self.batchSize)
        with threadpool_limits(limits=self.nthreads):
            for i in self.iter_blocks(msg='Fitting kmeans'):
                kmeans.partial_fit(self.reducer(i))
            temp = []
            for i in self.iter_blocks(msg='Estimating seed partitions'):
                temp.extend(kmeans.predict(self.reducer(i)))
            self.clusterLabels = np.array(temp)
        return kmeans
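# A minimal usage sketch for AnnStream, assuming a Dask-backed dense matrix and
# the package-level helpers noted above. All parameter values here are
# illustrative, not defaults taken from the source.
import dask.array as da
import numpy as np

data = da.random.random((1000, 50), chunks=(250, 50))  # 1000 cells x 50 features
mu = data.mean(axis=0).compute()
sigma = data.std(axis=0).compute()

ann = AnnStream(
    data=data, k=11, n_cluster=10, reduction_method='pca', dims=15,
    loadings=None, use_for_pca=np.ones(1000, dtype=bool), mu=mu, sigma=sigma,
    ann_metric='l2', ann_efc=50, ann_ef=50, ann_m=21, nthreads=2,
    ann_parallel=False, rand_state=42, do_kmeans_fit=True,
    disable_scaling=False, ann_idx=None)

# Query the HNSW index with the reduced representation of the first block.
block = controlled_compute(data.blocks[0], 2)
indices, distances = ann.transform_ann(ann.reducer(block))
print(indices.shape)           # (250, 11)
print(ann.clusterLabels[:10])  # seed partition labels from MiniBatchKMeans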
import logging
import os

from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel

# `ModelABC` is the project's abstract base class for vector-space models.


class Lsa(ModelABC):
    """Represent news articles as vectors using Latent Semantic Indexing."""

    def __init__(self, dictionary: Dictionary, corpus=None, size: int = 200,
                 decay: float = 1.0, lsa_filename: str = None,
                 tfidf_filename: str = None):
        """
        :param dictionary: A dictionary
        :param corpus: A corpus for training
        :param size: The length of the feature vector
        :param decay: The decay parameter
        :param lsa_filename: File name of a previously trained model
        :param tfidf_filename: File name of a previously trained TF-IDF model
        """
        super().__init__(size)
        # Check whether a TF-IDF model has already been trained
        if tfidf_filename is not None and os.path.exists(tfidf_filename):
            self.tfidf = TfidfModel.load(tfidf_filename)
        else:
            self.tfidf = TfidfModel(dictionary=dictionary)
        # Check whether an LSA model has already been trained
        if lsa_filename is not None and os.path.exists(lsa_filename):
            self.lsa = LsiModel.load(lsa_filename)
            logging.info("LSA model loaded")
        else:
            if corpus is None:
                raise ValueError("Corpus must be provided to train LSI")
            # Transform the corpus to TF-IDF space before fitting the LSI model
            corpus_tfidf = self.tfidf[corpus]
            self.lsa = LsiModel(corpus=corpus_tfidf, id2word=dictionary,
                                num_topics=size, onepass=True, decay=decay)

    def update(self, documents):
        """
        Update the model with new documents.

        :param documents: The new documents used for the update
        """
        self.lsa.add_documents(documents)

    def save(self, filename: str):
        """
        Save the model to a file.

        :param filename: A model file name
        """
        self.lsa.save(filename)
        self.tfidf.save(filename + '.tfidf')

    def _get_vector_representation(self, items):
        """
        Represent documents as vectors.

        :param items: A list of documents
        :return: A list of feature vectors
        """
        return self.lsa[self.tfidf[items]]
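# A usage sketch for Lsa with a toy corpus. It assumes ModelABC's constructor
# only records the vector size, as the call to super().__init__(size) above
# suggests.
docs = [["machine", "learning", "news"],
        ["sports", "news", "today"],
        ["machine", "translation"]]
dictionary = Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

model = Lsa(dictionary, corpus=bow_corpus, size=2)
# Project a new article into the topic space (query only; the model is unchanged).
new_doc = dictionary.doc2bow(["machine", "news"])
for vec in model._get_vector_representation([new_doc]):
    print(vec)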
import numpy as np
from gensim.models import LsiModel

# `lookup_or_persist`, `ModelSet`, `filter_widgets`, and the session/config/log
# attributes are provided by the surrounding project; `main` is a method on one
# of its task classes.


def main(self):
    from sqlalchemy.sql.expression import select, func
    from ..schema import widget_feature as t_wf
    from ..schema import widget as t_w
    from ..schema import feature as t_f
    from ..schema import feature_set as t_fs
    from ..schema import datasource as t_ds

    with self.session_scope() as session:
        fs_words = session.query(t_fs).filter_by(
            name=self.config['input_feature_set'])
        fs_lsi = lookup_or_persist(session, t_fs,
                                   name=self.config['output_feature_set'],
                                   idmodel=None)
        q_words = session.query(t_f.idfeature) \
            .join(t_fs, t_fs.idfeature_set == t_f.idfeature_set) \
            .filter(t_fs.name == self.config['input_feature_set'])
        self.log.info("Getting word decoder ring from database")
        words = dict(enumerate(q_words))
        self.log.info("Number of words: {}".format(len(words)))
        self.log.info("Initializing model")
        model_set = ModelSet(session, name=self.config['model_name'])
        n_components = self.hyperparameters['n_components']
        lsi = LsiModel(num_topics=n_components,
                       id2word=words,
                       onepass=True,
                       extra_samples=n_components + 1,
                       dtype=np.float32)
        self.log.info("Num topics: {}".format(lsi.num_topics))
        model = model_set.new_model(lsi, q_words,
                                    params=self.hyperparameters,
                                    log=self.log)
        self.log.info("Creating output features")
        fs_lsi = lookup_or_persist(session, t_fs,
                                   name=self.config['output_feature_set'],
                                   idmodel=model.idmodel)
        output_features = []
        for it in range(n_components):  # range/str: ported from Python 2's xrange/unicode
            idf = lookup_or_persist(session, t_f,
                                    name=str(it),
                                    idfeature_set=fs_lsi.idfeature_set)
            output_features.append(idf)
        self.log.info("Storing model data")
        q_lsi = session.query(t_f.idfeature) \
            .join(t_fs, t_fs.idfeature_set == t_f.idfeature_set) \
            .filter(t_fs.name == self.config['output_feature_set']) \
            .filter(t_fs.idmodel == model.idmodel)
        model.select_predicts_features(q_lsi)
        all_widgets = session.query(t_w.idwidget)
        all_widgets = filter_widgets(
            all_widgets,
            min_idwidget=self.config['min_idwidget'],
            max_idwidget=self.config['max_idwidget'],
            datasources=self.config['datasources'])
        self.log.info("Counting stuff")
        num_widgets = all_widgets.count()
        self.log.info("Widget count: {}".format(num_widgets))
        num_features = q_words.count()
        self.log.info("Input feature count: {}".format(num_features))
        self.log.info("Output feature count: {}".format(q_lsi.count()))
        chunk_size = self.config['chunk_size']
        # Train the LSI model incrementally on chunks of the sparse feature matrix
        for w_t, X in model.get_training_data(all_widgets,
                                              sparse_inputs=True,
                                              supervised=False,
                                              batch_size=chunk_size):
            self.log.info(
                "Training on feature matrix X with shape=({},{}) and nnz={}"
                .format(X.shape[0], X.shape[1], X.nnz))
            lsi.add_documents(X.T)
        model.set_trained(lsi)
        # Project each document into the LSI topic space and store the predictions
        for w_t, X in model.get_predict_data(all_widgets,
                                             sparse_inputs=True,
                                             supervised=False,
                                             batch_size=chunk_size):
            Y_hat = []
            for spdoc in X:
                # list() because gensim may need to inspect the document twice
                doc = list(zip(spdoc.indices, spdoc.data))
                spvec = lsi[doc]
                vec = np.zeros(shape=(lsi.num_topics, ), dtype=np.float32)
                for t, v in spvec:
                    vec[t] = v
                Y_hat.append(vec)
            self.log.info("Y_hat[-1] = {}".format(Y_hat[-1]))
            model.update_predictions(w_t, np.array(Y_hat))
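# Side note: the zeros-and-fill loop in main() that densifies each LSI output
# vector can be replaced by gensim's matutils.sparse2full helper. A standalone
# sketch with toy data, not the project's ModelSet pipeline:
from gensim import matutils

toy_corpus = [[(0, 1), (1, 2)], [(1, 1), (2, 3)], [(0, 2), (3, 1)]]
toy_lsi = LsiModel(toy_corpus, num_topics=2)
for doc in toy_corpus:
    # Dense 2-component vector; same result as the manual loop above
    vec = matutils.sparse2full(toy_lsi[doc], toy_lsi.num_topics).astype(np.float32)
    print(vec)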
import time
from pathlib import Path

import numpy as np
import pandas as pd
from gensim.models import LsiModel

# `GetBow` (an indexable bag-of-words stream over a dataset) is provided by the
# surrounding project.


class LSAModel:
    """
    Base class for an LSA model.
    """

    def __init__(self, vector_length):
        """
        Initialize the model with its parameters. The model is fitted if this
        has not been done before.

        :param vector_length: Number of topics in the model.
        """
        self.shortname = 'LSA'
        self.name = 'LSAmodel' + str(vector_length)
        self.vector_length = vector_length
        self.remove_stopwords = None
        self.word_dict = None
        self.path = None
        self.model = None
        self.doc_vecs = None

    def set_dict(self, data, remove_stopwords=False, no_below=1, no_above=1,
                 filter_most_frequent=0):
        """
        Set/make the dictionary to be used for bow representations.

        :param data: Which data to use for making the dictionary.
        :param remove_stopwords: Whether to remove stopwords.
        :param no_below: Minimum number of documents a word has to appear in to be included.
        :param no_above: Maximum fraction of documents a word can appear in to be included.
        :param filter_most_frequent: Remove the most frequent words.
        """
        if self.word_dict is not None:
            print("Model already has a dictionary! This function call does nothing.")
            return
        self.name = '%s_%sdict_rs%s_nb%s_na%s_fmf%s' % (
            self.name, data.name, str(remove_stopwords), str(no_below),
            str(no_above), str(filter_most_frequent))
        self.remove_stopwords = remove_stopwords
        self.word_dict = data.get_dictionary(remove_stopwords, no_below,
                                             no_above, filter_most_frequent)

    def train(self, data):
        """
        Fit the LSA model to the data, set document topic vectors, and
        calculate distances.

        :param data: Data to fit the model on.
        """
        if self.word_dict is None:
            print("A dictionary must be assigned to the model before training. "
                  "This function call does nothing.")
            return
        if self.model is None:
            self.model = LsiModel(num_topics=self.vector_length,
                                  id2word=self.word_dict)
        self.name = '%s_%strain' % (self.name, data.name)
        self.path = Path('modelfiles/%s/%s' % (data.name, self.name))
        try:
            # Reuse a previously trained model if one exists on disk
            self.model = LsiModel.load(str(self.path / '.model'))
        except Exception:
            self.path.mkdir(parents=True, exist_ok=True)
            print("Training model...", end='')
            time.sleep(0.1)
            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            self.model.add_documents(datastream)
            self.model.save(str(self.path / '.model'))

    def fit(self, data):
        """
        Fit the LSA model to the data, set document topic vectors, and
        calculate distances.
        """
        if self.model is None:
            print("Model must be trained first. This function call does nothing.")
            return
        try:
            # Reuse previously computed document vectors if they exist on disk
            self.doc_vecs = pd.read_csv(
                self.path / str('document_vectors_%s.csv' % data.name),
                index_col=0)
        except FileNotFoundError:
            print("Fitting model...", end='')
            time.sleep(0.1)
            # Container for document topic vectors, initialized with zeros
            doc_vecs = np.zeros((len(data.ids), self.vector_length))
            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            for i in range(len(datastream)):
                # Each element is an (index, value) tuple for a nonzero vector element
                for element in self.model[datastream[i]]:
                    doc_vecs[i][element[0]] = element[1]
            # Store the document topic vectors as a pandas DataFrame
            self.doc_vecs = pd.DataFrame(doc_vecs, index=data.ids)
            self.doc_vecs.to_csv(self.path /
                                 str('document_vectors_%s.csv' % data.name))
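# A usage sketch for LSAModel. The `data` object and the GetBow stream are
# project-specific, so this uses hypothetical stand-ins that satisfy the
# interface used above; it assumes LSAModel and GetBow live in the same module
# so that train() can resolve the name.
from gensim.corpora import Dictionary


class ToyData:
    """Hypothetical stand-in for the project's data wrapper."""
    name = 'toy'

    def __init__(self, docs):
        self.docs = docs
        self.ids = list(range(len(docs)))

    def get_dictionary(self, *args):
        return Dictionary(self.docs)


class GetBow:
    """Hypothetical stand-in for the project's GetBow stream (indexable bow corpus)."""

    def __init__(self, data, remove_stopwords, word_dict):
        self.bows = [word_dict.doc2bow(doc) for doc in data.docs]

    def __len__(self):
        return len(self.bows)

    def __getitem__(self, i):
        return self.bows[i]


data = ToyData([["apple", "banana"], ["banana", "cherry"], ["apple", "cherry"]])
lsa = LSAModel(vector_length=2)
lsa.set_dict(data)  # build the dictionary and extend the model name
lsa.train(data)     # fit (or load) the LsiModel under modelfiles/
lsa.fit(data)       # compute and cache the document topic vectors
print(lsa.doc_vecs)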
from gensim.models import LsiModel
from gensim import corpora, models
import jieba

file_dir = "../corpora/test1"
documents = []
with open(file_dir, "r", encoding='utf-8') as f:
    for line in f.readlines():
        # Segment each line into words with jieba
        seg_list = jieba.cut(line, cut_all=False)
        documents.append([word for word in seg_list])

Dict = corpora.Dictionary(documents)
corpus = [Dict.doc2bow(doc) for doc in documents]
tf_idf = models.TfidfModel(corpus)
lsimodel = LsiModel(corpus=tf_idf[corpus], id2word=Dict, num_topics=4)

# for i in lsimodel[tf_idf[corpus]]:
#     print(i)
for i in lsimodel.show_topics():
    print(i)

# Add documents (fold a new bag-of-words document into the model)
lsimodel.add_documents([[(1, 2), (2, 1)]])
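# To query the model rather than update it, a new sentence can be projected
# through the same TF-IDF and LSI transforms. A short sketch continuing the
# script above; the sentence is an arbitrary example.
new_text = "这是一个新的测试句子"  # arbitrary example sentence
new_doc = [word for word in jieba.cut(new_text, cut_all=False)]
bow = Dict.doc2bow(new_doc)
# Unlike add_documents, indexing does not change the model.
print(lsimodel[tf_idf[bow]])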