Example #1
class AnnStream:
    def __init__(self, data, k: int, n_cluster: int, reduction_method: str,
                 dims: int, loadings: np.ndarray, use_for_pca: np.ndarray,
                 mu: np.ndarray, sigma: np.ndarray, ann_metric: str,
                 ann_efc: int, ann_ef: int, ann_m: int, nthreads: int,
                 ann_parallel: bool, rand_state: int, do_kmeans_fit: bool,
                 disable_scaling: bool, ann_idx):
        self.data = data
        self.k = k
        if self.k >= self.data.shape[0]:
            self.k = self.data.shape[0] - 1
        self.nClusters = max(n_cluster, 2)
        self.dims = dims
        self.loadings = loadings
        if self.dims is None and self.loadings is None:
            raise ValueError(
                "ERROR: Provide either value for atleast one: 'dims' or 'loadings'"
            )
        self.annMetric = ann_metric
        self.annEfc = ann_efc
        self.annEf = ann_ef
        self.annM = ann_m
        self.nthreads = nthreads
        if ann_parallel:
            self.annThreads = self.nthreads
        else:
            self.annThreads = 1
        self.randState = rand_state
        self.batchSize = self._handle_batch_size()
        self.method = reduction_method
        self.nCells, self.nFeats = self.data.shape
        self.clusterLabels: np.ndarray = np.repeat(-1, self.nCells)
        disable_reduction = False
        if self.dims < 1:
            disable_reduction = True
        with threadpool_limits(limits=self.nthreads):
            if self.method == 'pca':
                self.mu, self.sigma = mu, sigma
                if self.loadings is None or len(self.loadings) == 0:
                    if len(use_for_pca) != self.nCells:
                        raise ValueError(
                            "ERROR: `use_for_pca` does not have sample length as nCells"
                        )
                    if disable_reduction is False:
                        self._fit_pca(disable_scaling, use_for_pca)
                else:
                    # Even though dims may already have been adjusted to match the loadings before AnnStream was
                    # called, it can still be overwritten by _handle_batch_size, so it has to be hard-set here.
                    self.dims = self.loadings.shape[1]
                    # it is okay for dimensions to be larger than batch size here because we will not fit the PCA
                if disable_scaling:
                    if disable_reduction:
                        self.reducer = lambda x: x
                    else:
                        self.reducer = lambda x: x.dot(self.loadings)
                else:
                    if disable_reduction:
                        self.reducer = lambda x: self.transform_z(x)
                    else:
                        self.reducer = lambda x: self.transform_z(x).dot(
                            self.loadings)
            elif self.method == 'lsi':
                if self.loadings is None or len(self.loadings) == 0:
                    if disable_reduction is False:
                        self._fit_lsi()
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            elif self.method == 'custom':
                if self.loadings is None or len(self.loadings) == 0:
                    logger.warning(
                        "No loadings provided for manual dimension reduction")
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            else:
                raise ValueError(
                    f"ERROR: Unknown reduction method: {self.method}")
            if ann_idx is None:
                self.annIdx = self._fit_ann()
            else:
                self.annIdx = ann_idx
                self.annIdx.set_ef(self.annEf)
                self.annIdx.set_num_threads(1)
            self.kmeans = self._fit_kmeans(do_kmeans_fit)

    def _handle_batch_size(self):
        if self.dims > self.data.shape[0]:
            self.dims = self.data.shape[0]
        batch_size = self.data.chunksize[
            0]  # Assuming all chunks are same size
        if self.dims >= batch_size:
            self.dims = batch_size - 1  # -1 because the PCA is fitted with dims+1 components
            logger.info(
                f"Number of PCA/LSI components reduced to {self.dims} to fit within the batch size of {batch_size}"
            )
        if self.nClusters > batch_size:
            self.nClusters = batch_size
            logger.info(
                f"Cluster number reduced to batch size of {batch_size}")
        return batch_size

    def iter_blocks(self, msg: str = ''):
        # Generator: yields each block of the data array as a computed np.ndarray
        for i in tqdm(self.data.blocks, desc=msg,
                      total=self.data.numblocks[0]):
            yield controlled_compute(i, self.nthreads)

    def transform_z(self, a: np.ndarray) -> np.ndarray:
        return (a - self.mu) / self.sigma

    def transform_ann(self,
                      a: np.ndarray,
                      k: int = None,
                      self_indices: np.ndarray = None) -> tuple:
        if k is None:
            k = self.k
        if self_indices is None:
            i, d = self.annIdx.knn_query(a, k=k)
            return i, d
        else:
            # Query k+1 neighbours because the first neighbour will be the query point itself
            i, d = self.annIdx.knn_query(a, k=k + 1)
            return fix_knn_query(i, d, self_indices)

    def _fit_pca(self, disable_scaling, use_for_pca) -> None:
        from sklearn.decomposition import IncrementalPCA
        # We fit one more principal component than requested and later drop the last one.
        self._pca = IncrementalPCA(n_components=self.dims + 1,
                                   batch_size=self.batchSize)
        do_sample_subset = use_for_pca.sum() != self.nCells
        s, e = 0, 0
        # end_reservoir holds the first usable block of cells. It is kept aside so that, at the end, any leftover
        # cells that are too few (fewer than dims+1) to be fitted on their own can be stacked onto it and fitted
        # together.
        end_reservoir = []
        # carry_over holds cells that cannot yet be used for fitting the PCA directly nor added to end_reservoir.
        # They are accumulated until the pile is large enough; this also covers the case where the very first
        # block has fewer than dims+1 cells.
        carry_over = []
        for i in self.iter_blocks(msg='Fitting PCA'):
            if do_sample_subset:
                e = s + i.shape[0]
                i = i[use_for_pca[s:e]]
                s = e
            if disable_scaling is False:
                i = self.transform_z(i)
            if len(carry_over) > 0:
                i = np.vstack((carry_over, i))
                carry_over = []
            if len(i) < (self.dims + 1):
                carry_over = i
                continue
            if len(end_reservoir) == 0:
                end_reservoir = i
                continue
            try:
                self._pca.partial_fit(i, check_input=False)
            except LinAlgError:
                # TODO: add a retry counter to ensure memory consumption does not escalate
                carry_over = i
        if len(carry_over) > 0:
            i = np.vstack((end_reservoir, carry_over))
        else:
            i = end_reservoir
        try:
            self._pca.partial_fit(i, check_input=False)
        except LinAlgError:
            logger.warning(
                f"{i.shape[0]} samples were not used in PCA fitting due to LinAlgError"
            )
        self.loadings = self._pca.components_[:-1, :].T

    def _fit_lsi(self) -> None:
        from gensim.models import LsiModel
        from gensim.matutils import Dense2Corpus

        self._lsiModel = LsiModel(
            Dense2Corpus(
                controlled_compute(self.data.blocks[0], self.nthreads).T),
            num_topics=self.dims,
            chunksize=self.data.chunksize[0],
            id2word={x: x
                     for x in range(self.data.shape[1])},
            extra_samples=0)
        for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")):
            if n == 0:
                continue
            self._lsiModel.add_documents(Dense2Corpus(i.T))
        self.loadings = self._lsiModel.get_topics().T

    def _fit_ann(self):
        import hnswlib

        dims = self.dims
        if dims < 1:
            dims = self.data.shape[1]
        ann_idx = hnswlib.Index(space=self.annMetric, dim=dims)
        ann_idx.init_index(max_elements=self.nCells,
                           ef_construction=self.annEfc,
                           M=self.annM,
                           random_seed=self.randState)
        ann_idx.set_ef(self.annEf)
        ann_idx.set_num_threads(self.annThreads)
        for i in self.iter_blocks(msg='Fitting ANN'):
            ann_idx.add_items(self.reducer(i))
        return ann_idx

    def _fit_kmeans(self, do_kmeans_fit):
        from sklearn.cluster import MiniBatchKMeans

        if do_kmeans_fit is False:
            return None
        kmeans = MiniBatchKMeans(n_clusters=self.nClusters,
                                 random_state=self.randState,
                                 batch_size=self.batchSize)
        with threadpool_limits(limits=self.nthreads):
            for i in self.iter_blocks(msg='Fitting kmeans'):
                kmeans.partial_fit(self.reducer(i))
        temp = []
        for i in self.iter_blocks(msg='Estimating seed partitions'):
            temp.extend(kmeans.predict(self.reducer(i)))
        self.clusterLabels = np.array(temp)
        return kmeans
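
The carry-over / end-reservoir bookkeeping in `_fit_pca` above exists because scikit-learn's `IncrementalPCA.partial_fit` needs every batch to contain at least `n_components` samples. A minimal, self-contained sketch of the same idea (the block sizes, feature count, and random data below are made up for illustration):

import numpy as np
from sklearn.decomposition import IncrementalPCA

# A stream of blocks with uneven, hypothetical sizes; 50 features per sample.
rng = np.random.default_rng(0)
blocks = [rng.normal(size=(n, 50)) for n in (3, 120, 2, 200, 4)]

n_components = 20
ipca = IncrementalPCA(n_components=n_components)

carry_over = np.empty((0, 50))      # rows still too few to fit on their own
end_reservoir = np.empty((0, 50))   # first usable block, held back for the final fit

for block in blocks:
    block = np.vstack((carry_over, block))
    carry_over = np.empty((0, 50))
    if block.shape[0] < n_components:   # partial_fit needs >= n_components rows per batch
        carry_over = block              # not enough rows yet; keep accumulating
        continue
    if end_reservoir.shape[0] == 0:
        end_reservoir = block           # reserve the first usable block
        continue
    ipca.partial_fit(block)

# Stack any leftovers onto the reserved block so the last batch is never too small.
ipca.partial_fit(np.vstack((end_reservoir, carry_over)))
print(ipca.components_.shape)           # (20, 50)

Reserving the first usable block guarantees the final call always has enough rows, no matter how small the trailing blocks turn out to be.
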
Example #2
class Lsa(ModelABC):
    """Represent news articles as vectors using Latent Semantic Indexing."""
    def __init__(self,
                 dictionary: Dictionary,
                 corpus=None,
                 size: int = 200,
                 decay: float = 1.0,
                 lsa_filename: str = None,
                 tfidf_filename: str = None):
        """
        :param dictionary: A dictionary
        :param corpus: A corpus for training
        :param size: The length of feature vector
        :param decay: The decay parameter
        :param lsa_filename: File name of a previously trained model
        :param tfidf_filename: File name of a previously trained TF-IDF model
        """
        super().__init__(size)

        # Check if we have already trained the Tfidf model
        if tfidf_filename is not None and os.path.exists(tfidf_filename):
            self.tfidf = TfidfModel.load(tfidf_filename)
        else:
            self.tfidf = TfidfModel(dictionary=dictionary)

        # Check if we have already trained the Lsa model
        if lsa_filename is not None and os.path.exists(lsa_filename):
            self.lsa = LsiModel.load(lsa_filename)
            logging.info("LSA model loaded")
        else:
            if corpus is None:
                raise ValueError("Corpus must be provided to train LSI")

            # Process the corpus
            corpus_tfidf = self.tfidf[corpus]

            self.lsa = LsiModel(corpus=corpus_tfidf,
                                id2word=dictionary,
                                num_topics=size,
                                onepass=True,
                                decay=decay)

    def update(self, documents):
        """
        Update model using documents.

        :param documents: The new documents used for update
        """
        self.lsa.add_documents(documents)

    def save(self, filename: str):
        """
        Save model to a file.

        :param filename: A model file name
        """
        self.lsa.save(filename)
        self.tfidf.save(filename + '.tfidf')

    def _get_vector_representation(self, items):
        """
        Represent documents as vectors.

        :param items: A list of documents
        :return: A list of feature vectors.
        """
        return self.lsa[self.tfidf[items]]
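
Example #2 wraps a standard gensim TF-IDF → LSI pipeline behind `ModelABC`, which is not shown here. A self-contained sketch of the underlying pipeline the class encapsulates (the toy `texts` below are made up; real input would be tokenized news articles):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel

texts = [["stock", "market", "falls"],
         ["market", "rally", "continues"],
         ["new", "stock", "listing"]]

dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]

tfidf = TfidfModel(dictionary=dictionary)          # same construction as in __init__
lsa = LsiModel(corpus=tfidf[bow_corpus], id2word=dictionary,
               num_topics=2, onepass=True, decay=1.0)

# Equivalent of _get_vector_representation: one sparse (topic, weight) vector per document.
for vec in lsa[tfidf[bow_corpus]]:
    print(vec)
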
Example #3
    def main(self):
        from sqlalchemy.sql.expression import select, func
        from ..schema import widget_feature as t_wf
        from ..schema import widget as t_w
        from ..schema import feature as t_f
        from ..schema import feature_set as t_fs
        from ..schema import datasource as t_ds

        with self.session_scope() as session:
            fs_words = session.query(t_fs).filter_by(
                name=self.config['input_feature_set'])
            fs_lsi = lookup_or_persist(session,
                                       t_fs,
                                       name=self.config['output_feature_set'],
                                       idmodel=None)

            q_words = session.query(t_f.idfeature) \
                .join(t_fs, t_fs.idfeature_set == t_f.idfeature_set) \
                .filter(t_fs.name == self.config['input_feature_set'])

            self.log.info("Getting word decoder ring from database")
            words = dict(enumerate(q_words))
            self.log.info("number of words: {}".format(len(words)))

            self.log.info("Initializing model")
            model_set = ModelSet(session, name=self.config['model_name'])
            n_components = self.hyperparameters['n_components']
            lsi = LsiModel(num_topics=self.hyperparameters['n_components'],
                           id2word=words,
                           onepass=True,
                           extra_samples=n_components + 1,
                           dtype=np.float32)
            self.log.info("Num Topics {}".format(lsi.num_topics))
            model = model_set.new_model(lsi,
                                        q_words,
                                        params=self.hyperparameters,
                                        log=self.log)

            self.log.info("Creating output features")
            fs_lsi = lookup_or_persist(session,
                                       t_fs,
                                       name=self.config['output_feature_set'],
                                       idmodel=model.idmodel)
            output_features = []
            for it in range(self.hyperparameters["n_components"]):
                idf = lookup_or_persist(session,
                                        t_f,
                                        name=str(it),
                                        idfeature_set=fs_lsi.idfeature_set)
                output_features.append(idf)

            self.log.info("Storing model data")
            q_lsi = session.query(t_f.idfeature) \
                    .join(t_fs, t_fs.idfeature_set == t_f.idfeature_set) \
                    .filter(t_fs.name == self.config['output_feature_set']) \
                    .filter(t_fs.idmodel == model.idmodel)

            model.select_predicts_features(q_lsi)

            all_widgets = session.query(t_w.idwidget)
            all_widgets = filter_widgets(
                all_widgets,
                min_idwidget=self.config['min_idwidget'],
                max_idwidget=self.config['max_idwidget'],
                datasources=self.config['datasources'])

            self.log.info("Counting stuff")
            num_widgets = all_widgets.count()
            self.log.info("Widget count: {}".format(num_widgets))
            num_features = q_words.count()
            self.log.info("Input feature count: {}".format(num_features))
            self.log.info("Output feature count: {}".format(q_lsi.count()))

            chunk_size = self.config['chunk_size']
            for w_t, X in model.get_training_data(all_widgets,
                                                  sparse_inputs=True,
                                                  supervised=False,
                                                  batch_size=chunk_size):
                self.log.info(
                    "Training on feature matrix X with shape=({},{}) and nnz={}"
                    .format(X.shape[0], X.shape[1], X.nnz))
                lsi.add_documents(X.T)

            model.set_trained(lsi)

            for w_t, X in model.get_predict_data(all_widgets,
                                                 sparse_inputs=True,
                                                 supervised=False,
                                                 batch_size=chunk_size):
                Y_hat = []
                for spdoc in X:
                    doc = list(zip(spdoc.indices, spdoc.data))  # gensim-style sparse document: (feature_id, value) pairs
                    spvec = lsi[doc]
                    vec = np.zeros(shape=(lsi.num_topics, ), dtype=np.float32)
                    for t, v in spvec:
                        vec[t] = v
                    Y_hat.append(vec)

                    self.log.info("Y_hat[-1] = {}".format(Y_hat[-1]))
                model.update_predictions(w_t, np.array(Y_hat))
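
The prediction loop at the end of Example #3 converts each scipy sparse row into a gensim-style `(id, value)` document, projects it with `lsi`, and densifies the result by hand. gensim ships helpers that do the same conversions; a hedged, self-contained sketch using random stand-in data (`X`, `num_topics`, and the `id2word` mapping below are illustrative, not taken from the original job):

import numpy as np
from scipy.sparse import csr_matrix
from gensim.models import LsiModel
from gensim.matutils import Sparse2Corpus, sparse2full

# Hypothetical stand-in for the widget/feature matrix (10 documents, 50 features).
X = csr_matrix(np.random.RandomState(0).poisson(0.5, size=(10, 50)))

lsi = LsiModel(num_topics=5, id2word={i: str(i) for i in range(X.shape[1])})
lsi.add_documents(Sparse2Corpus(X, documents_columns=False))  # rows are documents

# Same dense projection as the Y_hat loop, via gensim helpers.
Y_hat = np.vstack([sparse2full(lsi[list(zip(row.indices, row.data))],
                               length=lsi.num_topics)
                   for row in X])
print(Y_hat.shape)  # (10, 5)

Here `Sparse2Corpus(X, documents_columns=False)` plays the role of `X.T` in the original `add_documents` call, and `sparse2full` replaces the manual zero-filled `vec` loop.
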
Example #4
class LSAModel:
    """
    Base class for LSA model.
    """
    def __init__(self, vector_length):
        """
        Initialize model with parameters. Model is fit if it has not been done before.

        :param vector_length: Number of topics in model.
        """

        self.shortname = 'LSA'
        self.name = 'LSAmodel' + str(vector_length)
        self.vector_length = vector_length
        self.remove_stopwords = None
        self.word_dict = None
        self.path = None
        self.model = None
        self.doc_vecs = None

    def set_dict(self,
                 data,
                 remove_stopwords=False,
                 no_below=1,
                 no_above=1,
                 filter_most_frequent=0):
        """
        Set/make dictionary to be used for bow representations.

        :param data: Which data to use for making dictionary.
        :param remove_stopwords: Whether to remove stopwords.
        :param no_below: Minimum number of documents a word has to appear in to be included.
        :param no_above: Maximum fraction of documents a word can appear in to be included.
        :param filter_most_frequent: Remove the most frequent words.
        """

        if self.word_dict is not None:
            print(
                "Model already has a dictionary! This function call does nothing."
            )
            return

        self.name = '%s_%sdict_rs%s_nb%s_na%s_fmf%s' % (
            self.name, data.name, str(remove_stopwords), str(no_below),
            str(no_above), str(filter_most_frequent))

        self.remove_stopwords = remove_stopwords
        self.word_dict = data.get_dictionary(remove_stopwords, no_below,
                                             no_above, filter_most_frequent)

    def train(self, data):
        """
        Fit LSA model to the data, set document topic vectors and calculate distances.

        :param data: Data to fit model on
        """

        if self.word_dict is None:
            print(
                "A dictionary must be assigned to the model before training. This function call does nothing."
            )
            return
        if self.model is None:
            self.model = LsiModel(num_topics=self.vector_length,
                                  id2word=self.word_dict)

        self.name = '%s_%strain' % (self.name, data.name)
        self.path = Path('modelfiles/%s/%s' % (data.name, self.name))

        try:
            self.model = LsiModel.load(str(self.path / '.model'))
        except FileNotFoundError:
            # No previously saved model; train one from scratch and cache it.
            self.path.mkdir(parents=True, exist_ok=True)

            print("Training model...", end='')
            time.sleep(0.1)

            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            self.model.add_documents(datastream)

            self.model.save(str(self.path / '.model'))

    def fit(self, data):
        """
        Transform the data with the trained LSA model and store the document topic vectors.

        :param data: Data to transform
        """

        if self.model is None:
            print(
                "Model must be trained first. This function call does nothing.")
            return

        try:
            self.doc_vecs = pd.read_csv(
                self.path / str('document_vectors_%s.csv' % data.name),
                index_col=0)
        except FileNotFoundError:
            # No cached document vectors; compute them and write the CSV.
            print("Fitting model...", end='')
            time.sleep(0.1)

            # Container for document topic vectors with zeros
            doc_vecs = np.zeros((len(data.ids), self.vector_length))

            # For each document
            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            for i in range(len(datastream)):

                # element is now a tuple with index and value for nonzero vector elements
                for element in self.model[datastream[i]]:

                    # Set nonzero elements in container
                    doc_vecs[i][element[0]] = element[1]

            # Set document topic vectors as pandas dataframe
            self.doc_vecs = pd.DataFrame(doc_vecs, index=data.ids)
            self.doc_vecs.to_csv(self.path /
                                 str('document_vectors_%s.csv' % data.name))
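
`set_dict` in Example #4 delegates to a project-specific `data.get_dictionary(...)`. In plain gensim terms, its `no_below`, `no_above`, and `filter_most_frequent` knobs most likely map onto `Dictionary.filter_extremes` and `filter_n_most_frequent`; a small sketch under that assumption (the toy `docs` below are made up):

from gensim.corpora import Dictionary

# Toy tokenized documents standing in for the project-specific `data` object.
docs = [["cat", "sat", "mat"],
        ["dog", "sat", "log"],
        ["cat", "dog", "park"]]

word_dict = Dictionary(docs)
# Keep tokens appearing in at least `no_below` documents and in at most a
# `no_above` fraction of documents, then drop the N most frequent tokens.
word_dict.filter_extremes(no_below=1, no_above=0.8)
word_dict.filter_n_most_frequent(0)

bow = [word_dict.doc2bow(d) for d in docs]
print(bow)
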
Example #5

from gensim.models import LsiModel
from gensim import corpora, models
import jieba
file_dir = "../corpora/test1"
documents = []
with open(file_dir, "r", encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        seg_list = jieba.cut(line, cut_all=False)
        sentence = list(seg_list)
        documents.append(sentence)

Dict = corpora.Dictionary(documents)

corpus = [Dict.doc2bow(doc) for doc in documents]

tf_idf = models.TfidfModel(corpus)

lsimodel = LsiModel(corpus=tf_idf[corpus], id2word=Dict, num_topics=4)

# for i in lsimodel[tf_idf[corpus]]:
#     print(i)
for i in lsimodel.show_topics():
    print(i)

# Add more documents to the trained model
lsimodel.add_documents([[(1, 2), (2, 1)]])
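
Continuing the script above (same `Dict`, `tf_idf`, and `lsimodel` objects), a new document can either be folded into the LSI space as a query or added to the model itself. Since the model was trained on TF-IDF-weighted vectors, it is more consistent to feed `add_documents` TF-IDF-weighted documents as well; the sample sentence below is made up:

# Project an unseen sentence into the trained LSI space
# (tokens missing from Dict are silently dropped by doc2bow).
new_text = "这是一个新的测试文档"  # "This is a new test document"
new_bow = Dict.doc2bow(list(jieba.cut(new_text, cut_all=False)))
print(lsimodel[tf_idf[new_bow]])      # list of (topic_id, weight) pairs

# Fold the document into the model itself, keeping the TF-IDF weighting
# consistent with how the model was originally trained.
lsimodel.add_documents([tf_idf[new_bow]])
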