def fit(self, X, Y=None, embedding_matrix=None, steps_per_epoch=None):
    if isinstance(X, list):
        X = np.array(X)
    if isinstance(Y, list):
        Y = np.array(Y)
    if not (self.sequence_length and self.vocab_size
            and self.nb_outputs and steps_per_epoch):
        steps_per_epoch = self._init_from_data(X, Y)

    if isinstance(X, np.ndarray):
        data = self._prepare_data(X, Y, shuffle=True)
    else:  # tensorflow dataset
        data = X.batch(self.batch_size)

    train_steps_per_epoch = int((1 - self.validation_split) * steps_per_epoch)
    if train_steps_per_epoch == 0:
        logger.warning(
            "Not enough data for validation. Consider decreasing "
            "batch_size or validation_split. Some features that "
            "rely on validation metrics, like early stopping, "
            "might not work.")
    else:
        steps_per_epoch = train_steps_per_epoch
    train_data = data.take(steps_per_epoch)
    val_data = data.skip(steps_per_epoch)

    strategy = self._get_distributed_strategy()
    with strategy.scope():
        self.model = self._build_model(self.sequence_length, self.vocab_size,
                                       self.nb_outputs, steps_per_epoch,
                                       embedding_matrix)

    callbacks = [CALLBACK_DICT.get(c, c) for c in self.callbacks]
    if self.early_stopping:
        early_stopping = tf.keras.callbacks.EarlyStopping(
            patience=5, restore_best_weights=True)
        callbacks.append(early_stopping)

    self.model.fit(train_data, validation_data=val_data,
                   epochs=self.nb_epochs, callbacks=callbacks)
    return self
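# Usage sketch for the fit method above (assuming it belongs to one of the
# Keras-based classifiers in this package, e.g. CNNClassifier, and that
# hyperparameters such as batch_size, nb_epochs, validation_split and
# early_stopping are set in __init__ with sensible defaults):
#
#   import numpy as np
#   from wellcomeml.ml import CNNClassifier
#
#   X = np.random.randint(1, 1000, size=(128, 50))  # tokenised sequences
#   Y = np.random.randint(0, 2, size=(128, 1))      # binary labels
#   clf = CNNClassifier(batch_size=32, nb_epochs=2)
#   clf.fit(X, Y)  # returns self, so calls can be chained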
__all__ = [
        'WellcomeTfidf', 'Doc2VecVectorizer', 'Sent2VecVectorizer',
        'WellcomeVotingClassifier'
    ]

    try:
        from .vectorizer import Vectorizer
        from .clustering import TextClustering
        from .spacy_ner import SpacyNER
        from .spacy_classifier import SpacyClassifier
        from .bert_classifier import BertClassifier
        from .bert_vectorizer import BertVectorizer
        from .spacy_knowledge_base import SpacyKnowledgeBase
        from .spacy_entity_linking import SpacyEntityLinker
        from .similarity_entity_linking import SimilarityEntityLinker
        from .cnn import CNNClassifier
        from .bilstm import BiLSTMClassifier
        from .keras_vectorizer import KerasVectorizer
        from .bert_semantic_equivalence import SemanticEquivalenceClassifier
        from .transformers_tokenizer import TransformersTokenizer

        __all__ += [
            'Vectorizer', 'TextClustering', 'SpacyNER', 'SpacyClassifier',
            'BertClassifier', 'BertVectorizer', 'SpacyKnowledgeBase',
            'SpacyEntityLinker', 'SimilarityEntityLinker',
            'SemanticEquivalenceClassifier', 'CNNClassifier',
            'BiLSTMClassifier', 'KerasVectorizer', 'TransformersTokenizer'
        ]
    except ImportError as e:
        logger.error(e)
        logger.warning("Using WellcomeML without extras (transformers & torch).")
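# Sketch of how the optional extras degrade at import time (assumed
# behaviour of the try/except above, taking this to be wellcomeml.ml's
# __init__):
#
#   from wellcomeml.ml import WellcomeTfidf    # core, always importable
#   from wellcomeml.ml import BertClassifier   # without the transformers/torch
#                                              # extras this name is never
#                                              # defined, so the import raises
#                                              # ImportError after the warning
#                                              # above has been logged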
def optimise(self, X, param_grid, n_cluster_range=None, max_noise=0.2,
             verbose=False):
    """
    Optimises the clustering silhouette based on a parameter grid, a range
    on the number of clusters and a maximum level of noise. It is
    customised to avoid re-fitting intermediate steps (vectorizer and
    reducer) more than necessary.

    Args:
        X (iterable[str]): A list of texts to be clustered.
        param_grid (dict): A parameter grid, for example:

            param_grid = {
                'reducer': {'min_dist': [0.0, 0.2],
                            'n_neighbors': [2, 3, 5],
                            'metric': ['cosine', 'euclidean']},
                'clustering': {'min_samples': [2, 5],
                               'eps': [0.5, 1, 1.5]}
            }

        n_cluster_range (2-tuple of ints): The min and max number of
            clusters (e.g. (10, 20)). If unset, the parameters with the
            best silhouette are chosen regardless of the number of
            clusters.
        max_noise (float in [0, 1]): The maximum fraction of points left
            unclustered. Default: 0.2.

    Returns:
        dict: A dictionary of results containing "params_list",
            "silhouette" (the silhouette for each parameter combination)
            and "best_clustering" (the best clustering parameters).
    """
    min_n_clusters = (n_cluster_range[0] if n_cluster_range else 0)
    max_n_clusters = (n_cluster_range[1] if n_cluster_range else 10**5)

    # X might be transformed into vectors below, so save the input texts
    X_text = X

    # Linearise the parameter dictionary to be compatible with grid
    # search: it becomes one dictionary keyed by 'step__parameter'
    if self.reducer == 'tsne':
        logger.warning("TSNE is not suitable for predicting on new data. "
                       "Skipping Vectoriser/TSNE optimisation parameters")
        self.fit(X)
        X = self.reduced_points
        pipeline = Pipeline([('clustering', self.clustering_class)],
                            memory=CACHE_DIR)
        params = {}
    elif self.cluster_reduced:
        # You cannot pickle sparse UMAP with more than 4096 points (see
        # https://github.com/lmcinnes/umap/issues/674). Until that issue
        # is fixed, we need to convert everything to dense, or we cannot
        # cache the transformations of the pipeline.
        memory = (CACHE_DIR
                  if len(X) < 4096 or self.embedding != 'tf-idf'
                  else None)
        pipeline = Pipeline([('vectorizer', self.vectorizer),
                             ('reducer', self.reducer_class),
                             ('clustering', self.clustering_class)],
                            memory=memory)
        params = {
            f'reducer__{key}': value
            for key, value in param_grid.get('reducer', {}).items()
        }
        params.update({
            f'vectorizer__{key}': value
            for key, value in param_grid.get('vectorizer', {}).items()
        })
    else:
        self.vectorizer.cache_transformed = True
        pipeline = Pipeline([('vectorizer', self.vectorizer),
                             ('clustering', self.clustering_class)],
                            memory=CACHE_DIR)
        params = {
            f'vectorizer__{key}': value
            for key, value in param_grid.get('vectorizer', {}).items()
        }

    params.update({
        f'clustering__{key}': value
        for key, value in param_grid.get('clustering', {}).items()
    })

    grid = GridSearchCV(estimator=pipeline,
                        param_grid=params,
                        scoring={
                            'silhouette': _clustering_score,
                            'noise': _clustering_noise,
                            'n_clusters': _number_of_clusters
                        },
                        refit='silhouette')

    logging_level = logger.level
    if verbose <= 1:
        # Temporarily disable logging so the progress bar can run
        # uninterrupted. Reset after fitting.
        logging.getLogger().setLevel(logging.WARNING)
        logger.setLevel(logging.WARNING)

    # Prune the results to optimise under the given constraints
    best_silhouette = 0
    best_params = {}

    grid.fit(X, y=None)
    for params, silhouette, noise, n_clusters in zip(
            grid.cv_results_['params'],
            grid.cv_results_['mean_test_silhouette'],
            grid.cv_results_['mean_test_noise'],
            grid.cv_results_['mean_test_n_clusters']):
        if (min_n_clusters <= n_clusters <= max_n_clusters
                and noise <= max_noise
                and silhouette > best_silhouette):
            best_silhouette = silhouette
            best_params = params

    if not best_params:
        logger.warning("Could not find any clustering model with the "
                       "specified number of clusters and noise")

    self.silhouette = best_silhouette
    self.optimise_results = {
        key: value
        for key, value in grid.cv_results_.items()
        if not key.startswith('split')  # we don't need per-split results
    }
    self.set_params(best_params, from_parameter_grid=True)

    # Fit the pipeline again with the best parameters
    self.fit(X_text)

    logger.setLevel(logging_level)
    return best_params
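# Usage sketch for optimise above, following the param_grid format shown in
# its docstring (texts, ranges and thresholds are illustrative):
#
#   clustering = TextClustering()
#   param_grid = {
#       'reducer': {'min_dist': [0.0, 0.2], 'n_neighbors': [2, 3, 5]},
#       'clustering': {'min_samples': [2, 5], 'eps': [0.5, 1, 1.5]}
#   }
#   best_params = clustering.optimise(texts, param_grid,
#                                     n_cluster_range=(10, 20),
#                                     max_noise=0.2)
#   print(clustering.silhouette)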
    from sklearn.manifold import TSNE
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import silhouette_score
    from sklearn.pipeline import Pipeline
except ImportError as e:
    throw_extra_import_message(error=e, required_modules=required_modules,
                               extras=extras)

try:
    from hdbscan import HDBSCAN
    HDBSCAN_INSTALLED = True
except (ValueError, ModuleNotFoundError):
    HDBSCAN_INSTALLED = False
    logger.warning(
        "If you want to use hdbscan you need to run "
        "'pip3 install hdbscan --no-cache-dir --no-binary :all: "
        "--no-build-isolation'. "
        "Read more at https://github.com/wellcometrust/WellcomeML/issues/197")

CACHE_DIR = os.path.expanduser("~/.cache/wellcomeml")


class TextClustering(object):
    """
    Basic class for clustering pipelines.

    Attributes:
        vectorizer: The embedding Vectorizer object
        reducer: A dimensionality reduction object
        clustering: A clustering model object
        cluster_ids: Ids of the clusters
        cluster_names: Names of the clusters
import os

from wellcomeml.logger import logger

# A development_transformers environment variable allows disabling the
# functions that use spacy.
development_transformers_mode = (os.environ.get(
    "WELLCOMEML_ENV", "") == "development_transformers")

if development_transformers_mode:
    logger.warning("Running in development mode. Only loading modules that"
                   " use the new version of transformers.")
    from .bert_semantic_equivalence import SemanticEquivalenceClassifier
    __all__ = ['SemanticEquivalenceClassifier']
else:
    from .frequency_vectorizer import WellcomeTfidf
    from .doc2vec_vectorizer import Doc2VecVectorizer
    from .sent2vec_vectorizer import Sent2VecVectorizer
    from .voting_classifier import WellcomeVotingClassifier

    __all__ = [
        'WellcomeTfidf', 'Doc2VecVectorizer', 'Sent2VecVectorizer',
        'WellcomeVotingClassifier'
    ]

    try:
        from .vectorizer import Vectorizer
        from .clustering import TextClustering
        from .spacy_ner import SpacyNER
        from .spacy_classifier import SpacyClassifier
        from .bert_classifier import BertClassifier
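# Sketch: opting into the development mode handled above. The environment
# variable must be set before wellcomeml.ml is first imported:
#
#   import os
#   os.environ["WELLCOMEML_ENV"] = "development_transformers"
#   import wellcomeml.ml  # only SemanticEquivalenceClassifier is exported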