def generate(
    self, words: List[str], callback: Callable = dummy_callback
) -> Dict:
    if len(words) == 0:
        return {}
    if len(words) == 1:
        return {words[0]: {}}
    if len(words) == 2:
        return {sorted(words)[0]: {sorted(words)[1]: {}}}
    sims = self._get_similarities(
        words,
        self._get_embeddings(words, wrap_callback(callback, end=0.1)),
        wrap_callback(callback, start=0.1, end=0.2)
    )
    if len(words) == 3:
        root = np.argmin(np.sum(sims, axis=1))
        rest = sorted([words[i] for i in range(3) if i != root])
        return {words[root]: {rest[0]: {}, rest[1]: {}}}
    ontology, root = generate_ontology(
        words, sims, callback=wrap_callback(callback, start=0.2)
    )
    return Tree.from_prufer_sequence(ontology, words, root).to_dict()
def run(data: Table, learner: Learner, state: TaskState) -> Results:
    results = Results()
    if not data:
        return results

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Initializing...")
    model = learner(data, wrap_callback(callback, end=0.6))
    pred = model(data, wrap_callback(callback, start=0.6, end=0.99))

    col = pred.get_column_view(model.outlier_var)[0]
    inliers_ind = np.where(col == 1)[0]
    outliers_ind = np.where(col == 0)[0]

    results.inliers = data[inliers_ind]
    results.outliers = data[outliers_ind]
    results.annotated_data = pred

    callback(1)
    return results
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    if callback is None:
        callback = dummy_callback
    corpus = TokenizedPreprocessor.__call__(
        self, corpus, wrap_callback(callback, end=0.2))
    callback(0.2, "Fitting filter...")
    self._fit(corpus)
    return self._filter_tokens(corpus, wrap_callback(callback, start=0.6))
def embedding_keywords(
    corpus: Corpus,
    language: str = "English",
    progress_callback: Callable = None,
) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using Embeddings.

    Parameters
    ----------
    corpus
        Corpus of documents whose ngrams are used as tokens
    language
        Language of documents
    progress_callback
        Function for reporting progress.

    Returns
    -------
    Keywords with scores
    """
    if len(corpus) == 0:
        return []
    if progress_callback is None:
        progress_callback = dummy_callback

    tokens = list(corpus.ngrams)

    # prepare structures
    language = EMBEDDING_LANGUAGE_MAPPING[language]
    doc_embs, word_embs, word2doc = _embedd_tokens(
        tokens, language, wrap_callback(progress_callback, 0, 0.7))
    doc2word = [set(t) for t in tokens]
    word2ind = {w: i for i, w in enumerate(word2doc)}

    # Many of these distance pairs will never be used, since each document
    # does not contain all words, but computing them all at once is still
    # much faster thanks to matrix operations.
    distances = cos_dist(doc_embs, word_embs)
    # the sum of document embeddings for each word
    dist_sums = {
        w: distances[list(dcs), i].sum()
        for i, (w, dcs) in enumerate(word2doc.items())
    }

    cb = wrap_callback(progress_callback, 0.7, 1)
    # compute keyword scores
    doc_desc = []
    for j in range(doc_embs.shape[0]):
        scores = []
        for word in doc2word[j]:
            l_ = len(word2doc[word])
            dist = distances[j, word2ind[word]]
            mean_distance = ((dist_sums[word] - dist) / (l_ - 1)) if l_ > 1 else 0
            scores.append((word, dist - mean_distance))
        doc_desc.append(sorted(scores, key=itemgetter(1)))
        cb((j + 1) / len(doc_embs))
    return doc_desc
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    corpus = super().__call__(corpus)
    if callback is None:
        callback = dummy_callback
    callback(0, "Transforming...")
    corpus = self._store_documents(corpus, wrap_callback(callback, end=0.5))
    return self._store_tokens(corpus, wrap_callback(callback, start=0.5)) \
        if corpus.has_tokens() else corpus
def score(self, tree: Dict, callback: Callable = dummy_callback) -> float:
    tree = Tree.from_dict(tree)
    sims = self._get_similarities(
        tree.labels,
        self._get_embeddings(tree.labels, wrap_callback(callback, end=0.7)),
        wrap_callback(callback, start=0.7, end=0.8)
    )
    callback(0.9)
    fitness_function = FitnessFunction(tree.labels, sims).fitness
    return fitness_function(tree, tree.root)[0]
def _run(
    corpus: Corpus,
    words: List[str],
    scoring_methods: List[str],
    aggregation: str,
    additional_params: dict,
    state: TaskState,
) -> None:
    """
    Perform word scoring with the selected scoring methods.

    Parameters
    ----------
    corpus
        Corpus of documents
    words
        List of words used for scoring
    scoring_methods
        Methods to score documents with
    aggregation
        Aggregation applied for each document on word scores
    additional_params
        Additional parameters for scores (e.g. embedding needs text language)
    state
        TaskState for reporting the task status and giving partial results
    """
    def callback(i: float) -> None:
        state.set_progress_value(i * 100)
        if state.is_interruption_requested():
            raise Exception

    cb_part = 1 / (len(scoring_methods) + 1)  # +1 for preprocessing

    words = _preprocess_words(corpus, words, wrap_callback(callback, end=cb_part))
    if len(words) == 0:
        raise Exception(
            "Empty word list after preprocessing. Please provide a valid set of words."
        )
    for i, sm in enumerate(scoring_methods):
        scoring_method = SCORING_METHODS[sm][1]
        sig = signature(scoring_method)
        add_params = {
            k: v for k, v in additional_params.items() if k in sig.parameters
        }
        scs = scoring_method(
            corpus,
            words,
            wrap_callback(callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part),
            **add_params)
        scs = AGGREGATIONS[aggregation](scs, axis=1)
        state.set_partial_result((sm, aggregation, scs))
def test_wrap_callback(self):
    def func(i):
        return i

    f = wrap_callback(func, start=0, end=0.8)
    self.assertEqual(f(0), 0)
    self.assertEqual(round(f(0.1), 2), 0.08)
    self.assertEqual(f(1), 0.8)

    f = wrap_callback(func, start=0.1, end=0.8)
    self.assertEqual(f(0), 0.1)
    self.assertEqual(f(0.1), 0.17)
    self.assertEqual(f(1), 0.8)
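# The test above pins down the behaviour all of these snippets rely on:
# wrap_callback linearly rescales a progress value from [0, 1] into
# [start, end], and dummy_callback is a no-op default. The sketch below is
# only an assumption consistent with that test, for readability; the actual
# Orange implementation may differ in details (e.g. functools.wraps).
from typing import Callable


def dummy_callback(progress: float, status: str = "") -> None:
    """Default callback that simply ignores progress reports."""


def wrap_callback(callback: Callable, start: float = 0, end: float = 1) -> Callable:
    """Return a callback that maps progress in [0, 1] onto [start, end]."""
    def wrapped(progress: float, *args, **kwargs):
        return callback(start + progress * (end - start), *args, **kwargs)
    return wrapped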
def _get_embeddings(
    self, words: List[str], callback: Callable = dummy_callback
) -> np.array:
    embeddings = np.zeros((len(words), EMB_DIM))
    missing, missing_idx = list(), list()
    ticks = iter(np.linspace(0.0, 0.6, len(words)))
    for i, word in enumerate(words):
        callback(next(ticks))
        emb = self.storage.get_embedding(word)
        if emb is None:
            missing.append(word)
            missing_idx.append(i)
        else:
            embeddings[i, :] = emb

    if len(missing_idx) > 0:
        embs = self.embedder(missing,
                             callback=wrap_callback(callback, start=0.6, end=0.9))
        if None in embs:
            raise RuntimeError("Couldn't obtain embeddings.")
        embeddings[missing_idx, :] = np.array(embs)
        for i in missing_idx:
            self.storage.save_embedding(words[i], embeddings[i, :])

    return embeddings
def _run(
    data: Table,
    group_by_attrs: List[Variable],
    aggregations: Dict[Variable, Set[str]],
    result: Result,
    state: TaskState,
) -> Result:
    def progress(part):
        state.set_progress_value(part * 100)
        if state.is_interruption_requested():
            raise Exception

    state.set_status("Aggregating")
    # group table rows
    if result.group_by is None:
        result.group_by = data.groupby(group_by_attrs)
    state.set_partial_result(result)

    aggregations = {
        var: [
            (agg, AGGREGATIONS[agg].function)
            for agg in sorted(aggs, key=AGGREGATIONS_ORD.index)
        ]
        for var, aggs in aggregations.items()
    }
    result.result_table = result.group_by.aggregate(
        aggregations, wrap_callback(progress, 0.2, 1))
    return result
def insert(
    self, tree: Dict, words: List[str], callback: Callable = dummy_callback
) -> Dict:
    tree = Tree.from_dict(tree)
    self._get_embeddings(words, wrap_callback(callback, end=0.3))
    ticks = iter(np.linspace(0.3, 0.9, len(words)))
    for word in words:
        tick = next(ticks)
        tree.adj_list.append(set())
        tree.labels.append(word)
        sims = self._get_similarities(
            tree.labels,
            self._get_embeddings(tree.labels, lambda x: callback(tick)),
            lambda x: callback(tick)
        )
        idx = len(tree.adj_list) - 1
        fitness_function = FitnessFunction(tree.labels, sims).fitness
        scores = list()
        for i in range(idx):
            tree.adj_list[i].add(idx)
            tree.adj_list[idx].add(i)
            scores.append(fitness_function(tree, tree.root)[0])
            tree.adj_list[i].remove(idx)
            tree.adj_list[idx].remove(i)
        best = np.argmax(scores)
        tree.adj_list[best].add(idx)
        tree.adj_list[idx].add(best)
        callback(tick)
    return tree.to_dict()
def __call__(self, data, progress_callback=None):
    for cls in type(self).mro():
        if 'incompatibility_reason' in cls.__dict__:
            incompatibility_reason = \
                self.incompatibility_reason(data.domain)  # pylint: disable=assignment-from-none
            if incompatibility_reason is not None:
                raise ValueError(incompatibility_reason)
            break
        if 'check_learner_adequacy' in cls.__dict__:
            warnings.warn(
                "check_learner_adequacy is deprecated and will be removed "
                "in upcoming releases. Learners should instead implement "
                "the incompatibility_reason method.",
                OrangeDeprecationWarning)
            if not self.check_learner_adequacy(data.domain):
                raise ValueError(self.learner_adequacy_err_msg)
            break

    origdomain = data.domain

    if isinstance(data, Instance):
        data = Table(data.domain, [data])
    origdata = data

    if progress_callback is None:
        progress_callback = dummy_callback
    progress_callback(0, "Preprocessing...")
    try:
        cb = wrap_callback(progress_callback, end=0.1)
        data = self.preprocess(data, progress_callback=cb)
    except TypeError:
        data = self.preprocess(data)
        warnings.warn("A keyword argument 'progress_callback' has been "
                      "added to the preprocess() signature. Implementing "
                      "the method without the argument is deprecated and "
                      "will result in an error in the future.",
                      OrangeDeprecationWarning)

    if len(data.domain.class_vars) > 1 and not self.supports_multiclass:
        raise TypeError("%s doesn't support multiple class variables" %
                        self.__class__.__name__)

    progress_callback(0.1, "Fitting...")
    model = self._fit_model(data)
    model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
    if not hasattr(model, "domain") or model.domain is None:
        # some models set the domain themselves and it should be respected,
        # e.g. calibration learners set the base_learner's domain, which
        # would be wrongly overwritten if we set it here for any model
        model.domain = data.domain
    model.supports_multiclass = self.supports_multiclass
    model.name = self.name
    model.original_domain = origdomain
    model.original_data = origdata
    progress_callback(1)
    return model
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)

    # share of progress spent on documents; the rest goes to the words
    cb_part = len(corpus) / (len(corpus) + len(words))

    document_embeddings, skipped = emb.transform(
        corpus, wrap_callback(callback, 0, cb_part))
    assert skipped is None
    words = [[w] for w in words]
    word_embeddings = np.array(
        emb.transform(words, wrap_callback(callback, cb_part, 1)))
    return cosine_similarity(document_embeddings.X, word_embeddings)
def permutation_feature_importance(
    model: Model,
    data: Table,
    score: Score,
    n_repeats: int = 5,
    progress_callback: Callable = None
):
    """
    Calculate the feature importance of a model for the given data.

    Parameters
    ----------
    model : Model
        Fitted Orange model.
    data : Table
        Data to calculate the feature importance for.
    score : Score
        Score to use for model evaluation.
    n_repeats : int, optional, default 5
        Number of times a feature is randomly shuffled.
    progress_callback : callable
        The callback for reporting the progress.

    Returns
    -------
    np.ndarray
        Feature importance.
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    data = data.copy()
    _check_data(data)
    needs_pp = _check_model(model, data)

    scorer = _wrap_score(score, needs_pp)
    baseline_score = scorer(model, data)

    n_features = data.X.shape[1]
    step = 1 / n_features
    with data.unlocked():
        perm_scores = [
            _calculate_permutation_scores(
                model, data, i, n_repeats, scorer,
                wrap_callback(progress_callback,
                              start=i * step, end=(i + 1) * step)
            ) for i in range(n_features)
        ]

    names = [attr.name for attr in data.domain.attributes]
    scores = baseline_score - np.array(perm_scores)
    if isinstance(score, RegressionScore) and not isinstance(score, R2):
        scores = -scores
    return scores, names
def apply_preprocessor(self, data: Optional[Corpus],
                       preprocessor: Optional[PreprocessorList],
                       state: TaskState) -> Result:
    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    pp_data = None
    msgs = []
    if data and preprocessor is not None:
        pp_data = preprocessor(data, wrap_callback(callback, end=0.9))
        if not pp_data.has_tokens():
            pp_data = BASE_TOKENIZER(pp_data, wrap_callback(callback, start=0.9))
        if pp_data is not None and len(pp_data.dictionary) == 0:
            msgs.append(self.Warning.no_token_left)
            pp_data = None
    return Result(corpus=pp_data, msgs=msgs)
def get_shap_values_and_colors(
    model: Model, data: Table, progress_callback: Callable = None
) -> Tuple[List[np.ndarray], List[str], np.ndarray, np.ndarray]:
    """
    Compute SHAP values and colors that represent how high a feature's value
    is compared to other values of the same feature. This function provides
    all components required by the explain model widget.

    Parameters
    ----------
    model
        Model whose predictions are explained.
    data
        Data whose predictions are explained.
    progress_callback
        The callback for reporting the progress.

    Returns
    -------
    shap_values
        Shapley values for each data item computed by the SHAP library. The
        result is a list of SHAP values for each class - the class order is
        taken from values in the class_var. Each array in the list has shape
        (num cases x num attributes) - explanation for the contribution of
        each attribute to the final prediction.
    attributes
        The attributes from the table on which the explanation was made:
        the table preprocessed by the model's preprocessors.
    sample_mask
        SHAP values are computed just for a sample of the data. This is a
        boolean mask that tells which rows in data are explained.
    colors
        Colors for each data instance and each feature. The shape of the
        matrix is M x N x C, where M is the number of instances, N is the
        number of features, and C is 3 (one value for each RGB channel).
    """
    if progress_callback is None:
        progress_callback = dummy_callback
    cb = wrap_callback(progress_callback, end=0.9)

    shap_values, transformed_data, sample_mask, _ = compute_shap_values(
        model, data, data, progress_callback=cb)
    colors = compute_colors(transformed_data[sample_mask])
    attributes = [t.name for t in transformed_data.domain.attributes]
    progress_callback(1)
    return shap_values, attributes, sample_mask, colors
def __call__(self, corpus: Corpus, callback: Callable = None, **kw) -> Corpus:
    """
    Marks tokens of a corpus with POS tags.
    """
    if callback is None:
        callback = dummy_callback
    corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
    assert corpus.has_tokens()
    callback(0.2, "POS Tagging...")
    tags = np.array(self._preprocess(corpus.tokens, **kw), dtype=object)
    corpus.pos_tags = tags
    return corpus
def __call__(self, data: Table, progress_callback: Callable = None) \
        -> Table:
    assert isinstance(data, Table)
    assert self.outlier_var is not None

    domain = Domain(data.domain.attributes, data.domain.class_vars,
                    data.domain.metas + (self.outlier_var, ))
    if progress_callback is None:
        progress_callback = dummy_callback
    progress_callback(0, "Preprocessing...")
    self._cached_data = self.data_to_model_domain(
        data, wrap_callback(progress_callback, end=0.1))
    progress_callback(0.1, "Predicting...")
    metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
    progress_callback(1)
    return Table.from_numpy(domain, data.X, data.Y, metas)
def search_authors(
    self,
    authors: List[str],
    *,
    max_tweets: Optional[int] = MAX_TWEETS,
    collecting: bool = False,
    callback: Callable = dummy_callback,
) -> Optional[Corpus]:
    """
    Search recent tweets by authors.

    Parameters
    ----------
    authors
        A list of authors to search for.
    max_tweets
        Limits the number of downloaded tweets. If None, use the API's
        maximum.
    collecting
        Whether to collect results across multiple search calls.
    callback
        Function to report the progress.

    Returns
    -------
    Corpus with tweets
    """
    if not collecting:
        self.reset()

    count_sum = 0
    n = len(authors)
    for i, author in enumerate(authors):
        author_ = self.api.get_user(username=author)
        if author_.data is None:
            raise NoAuthorError(author)
        paginator = tweepy.Paginator(
            self.api.get_users_tweets, author_.data.id, **request_settings
        )
        count_sum += self._fetch(
            paginator,
            max_tweets,
            callback=wrap_callback(callback, i / n, (i + 1) / n),
        )
    self.append_history("Author", authors, None, None, count_sum)
    return self._create_corpus()
def __call__(self, corpus: Corpus, callback: Callable = None) \
        -> Corpus:
    """
    Applies a list of preprocessors to the corpus.

    :param corpus: Corpus
    :param callback: progress callback function
    :return: Corpus
        Preprocessed corpus.
    """
    if callback is None:
        callback = dummy_callback
    n_pps = len(list(self.preprocessors))
    for i, pp in enumerate(self.preprocessors):
        start = i / n_pps
        cb = wrap_callback(callback, start=start, end=start + 1 / n_pps)
        corpus = pp(corpus, cb)
    callback(1)
    return corpus
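# A hypothetical composition example (assuming the linear rescaling sketched
# after the test above): the preprocessor list gives each of n preprocessors
# an equal slot of the overall progress, and a preprocessor that wraps its
# callback again reports into a slice of that slot.
outer = wrap_callback(print, start=0.5, end=1.0)  # second of two preprocessors
inner = wrap_callback(outer, end=0.2)             # its first internal phase
inner(0.0)  # prints 0.5 - start of the second preprocessor's slot
inner(1.0)  # prints 0.6 - 20% into that slot, i.e. overall progress 0.6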
def __call__(self, data, progress_callback=None):
    if not self.check_learner_adequacy(data.domain):
        raise ValueError(self.learner_adequacy_err_msg)

    origdomain = data.domain

    if isinstance(data, Instance):
        data = Table(data.domain, [data])
    origdata = data

    if progress_callback is None:
        progress_callback = dummy_callback
    progress_callback(0, "Preprocessing...")
    try:
        cb = wrap_callback(progress_callback, end=0.1)
        data = self.preprocess(data, progress_callback=cb)
    except TypeError:
        data = self.preprocess(data)
        warnings.warn(
            "A keyword argument 'progress_callback' has been "
            "added to the preprocess() signature. Implementing "
            "the method without the argument is deprecated and "
            "will result in an error in the future.",
            OrangeDeprecationWarning)

    if len(data.domain.class_vars) > 1 and not self.supports_multiclass:
        raise TypeError("%s doesn't support multiple class variables" %
                        self.__class__.__name__)

    progress_callback(0.1, "Fitting...")
    model = self._fit_model(data)
    model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
    model.domain = data.domain
    model.supports_multiclass = self.supports_multiclass
    model.name = self.name
    model.original_domain = origdomain
    model.original_data = origdata
    progress_callback(1)
    return model
def annotate_documents(
    corpus: Corpus,
    embedding: np.ndarray,
    clustering_method: int,
    n_components: Optional[int] = None,
    epsilon: Optional[float] = None,
    cluster_labels: Optional[np.ndarray] = None,
    fdr_threshold: float = 0.05,
    n_words_in_cluster: int = 10,
    progress_callback: Optional[Callable] = None
) -> Tuple[np.ndarray, Dict[int, ClusterType], int, float, ScoresType]:
    """
    Annotate documents in corpus, by performing clustering on the corpus and
    assigning characteristic terms to each cluster using the Hypergeometric
    distribution.

    Return annotated clusters - for each cluster return a list of keywords
    with scores, cluster center coordinates and concave_hulls coordinates.
    Also return optimal values for n_components/epsilon if calculated and
    scores data (p-values and counts for all keywords).

    Parameters
    ----------
    corpus : Corpus
        Corpus to be annotated.
    embedding : np.ndarray of size len(corpus) × 2
        Usually tSNE projection of BoW of corpus.
    clustering_method : int
        0 for DBSCAN
        1 for Gaussian mixture models
        2 for custom clustering where cluster_labels are used
    n_components : int, optional, default = None
        Number of clusters for Gaussian mixture models. If None, set to the
        number of clusters with maximal silhouette.
    epsilon : float, optional, default = None
        epsilon for DBSCAN. If None, optimal value is computed.
    cluster_labels : np.ndarray, optional
        Custom cluster labels. Usually included in corpus.
    fdr_threshold : float, optional, default = 0.05
        hypergeom_p_values threshold
    n_words_in_cluster : int, optional, default = 10
        Number of characteristic terms in each cluster.
    progress_callback : callable, optional
        Progress callback.

    Returns
    -------
    cluster_labels : np.ndarray of size len(corpus)
        An array of floats (i.e. 0, 1, np.nan) that represent cluster labels
        for all documents in the corpus.
    clusters : dict
        Dictionary of keywords with scores, centroids and concave hulls
        for each cluster.
    n_components : int
        Optimal number of clusters for Gaussian mixture models, if the
        n_components is None, and clustering_method is
        ClusterDocuments.GAUSSIAN_MIXTURE. n_components otherwise.
    epsilon : float
        Optimal value for epsilon for DBSCAN, if the epsilon is None, and
        clustering_method is ClusterDocuments.DBSCAN. epsilon otherwise.
    scores : tuple
        Tuple of all keywords with p-values and counts.

    Raises
    ------
    ValueError
        When there are no clusters in the embedding.
""" if progress_callback is None: progress_callback = dummy_callback if clustering_method == ClusterDocuments.GAUSSIAN_MIXTURE: if n_components is None: n_components = ClusterDocuments.gmm_compute_n_components( embedding, wrap_callback(progress_callback, end=0.3)) n_components = min([n_components, len(embedding)]) cluster_labels = ClusterDocuments.gmm(embedding, n_components=n_components, threshold=0.6) elif clustering_method == ClusterDocuments.DBSCAN: if epsilon is None: epsilon = ClusterDocuments.dbscan_compute_epsilon(embedding) cluster_labels = ClusterDocuments.dbscan(embedding, eps=epsilon) else: assert cluster_labels is not None cluster_labels[np.isnan(cluster_labels)] = -1 if len(set(cluster_labels) - {-1}) == 0: raise ValueError("There are no clusters using current settings.") keywords = _get_characteristic_terms(corpus, n_keywords=20, progress_callback=wrap_callback( progress_callback, start=0.5)) clusters_keywords, all_keywords, scores, p_values = \ _hypergeom_clusters(cluster_labels, keywords, fdr_threshold, n_words_in_cluster) concave_hulls = compute_concave_hulls(embedding, cluster_labels, epsilon) centroids = { c: tuple(np.mean(concave_hulls[c], axis=0)) for c in set(cluster_labels) - {-1} } clusters = { int(key): (clusters_keywords[key], centroids[key], concave_hulls[key]) for key in clusters_keywords } cluster_labels = cluster_labels.astype(float) cluster_labels[cluster_labels == -1] = np.nan scores = (all_keywords, scores, p_values) return cluster_labels, clusters, n_components, epsilon, scores
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    if callback is None:
        callback = dummy_callback
    corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
    callback(0.2, "Normalizing...")
    return self._store_tokens(corpus, wrap_callback(callback, start=0.2))
def runner(self, state: TaskState) -> Table:
    exp_type = self.data_output_options.expression_type[self.exp_type].type
    exp_source = self.data_output_options.expression_sources[self.exp_source]
    proc_slug = self.data_output_options.process[self.proc_slug].slug
    collection_id = self.selected_collection_id

    table = self.data_table
    progress_steps_download = iter(np.linspace(0, 50, 2))

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    if not table:
        collection = self.res.get_collection_by_id(collection_id)
        coll_table = resdk.tables.RNATables(
            collection,
            expression_source=exp_source,
            expression_process_slug=proc_slug,
            progress_callable=wrap_callback(callback, end=0.5),
        )
        species = coll_table._data[0].output['species']
        sample = coll_table._samples[0]

        state.set_status('Downloading ...')
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
        df_exp = df_exp.rename(index=coll_table.readable_index)
        df_metas = coll_table.meta
        df_metas = df_metas.rename(index=coll_table.readable_index)
        df_qc = None
        if self.append_qc_data:
            # TODO: check if there is a way to detect if collection
            #       table contains QC data
            try:
                df_qc = coll_table.qc
                df_qc = df_qc.rename(index=coll_table.readable_index)
            except ValueError:
                pass
        loop.close()

        state.set_status('To data table ...')

        duplicates = {
            item
            for item, count in Counter([
                label.split('.')[1]
                for label in df_metas.columns.to_list()
                if '.' in label
            ]).items()
            if count > 1
        }

        # what happens if there are more nested sections?
        section_name_to_label = {
            section['name']: section['label']
            for section in sample.descriptor_schema.schema
        }

        column_labels = {}
        for field_schema, fields, path in iterate_schema(
                sample.descriptor, sample.descriptor_schema.schema, path=''):
            path = path[1:]  # this is ugly, but can't go around it
            if path not in df_metas.columns:
                continue
            label = field_schema['label']
            section_name, field_name = path.split('.')
            column_labels[path] = (
                label
                if field_name not in duplicates
                else f'{section_name_to_label[section_name]} - {label}'
            )

        df_exp = df_exp.reset_index(drop=True)
        df_metas = df_metas.astype('object')
        df_metas = df_metas.fillna(np.nan)
        df_metas = df_metas.replace('nan', np.nan)
        df_metas = df_metas.rename(columns=column_labels)
        if df_qc is not None:
            df_metas = pd.merge(df_metas, df_qc,
                                left_index=True, right_index=True)

        xym, domain_metas = vars_from_df(df_metas)
        x, _, m = xym
        x_metas = np.hstack((x, m))
        attrs = [ContinuousVariable(col) for col in df_exp.columns]
        metas = domain_metas.attributes + domain_metas.metas
        domain = Domain(attrs, metas=metas)
        table = Table(domain, df_exp.to_numpy(), metas=x_metas)
        state.set_progress_value(next(progress_steps_download))

        state.set_status('Matching genes ...')
        progress_steps_gm = iter(
            np.linspace(50, 99, len(coll_table.gene_ids)))

        def gm_callback():
            state.set_progress_value(next(progress_steps_gm))

        tax_id = species_name_to_taxid(species)
        gm = GeneMatcher(tax_id, progress_callback=gm_callback)
        table = gm.match_table_attributes(table, rename=True)
        table.attributes[TableAnnotation.tax_id] = tax_id
        table.attributes[TableAnnotation.gene_as_attr_name] = True
        table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
        self.data_table = table

    state.set_status('Normalizing ...')
    table = self.normalize(table)
    state.set_progress_value(100)

    return table
def run(corpus: Optional[Corpus],
        words: Optional[List],
        cached_keywords: Dict,
        scoring_methods: Set,
        scoring_methods_kwargs: Dict,
        agg_method: int,
        state: TaskState) -> Results:
    results = Results(scores=[], labels=[], all_keywords={})
    if not corpus:
        return results

    # passed by reference (and not copied) - to save partial results
    results.all_keywords = cached_keywords
    if not scoring_methods:
        return results

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Calculating...")
    scores = {}
    tokens = corpus.tokens
    documents = corpus.documents
    step = 1 / len(scoring_methods)
    for method_name, func in ScoringMethods.ITEMS:
        if method_name in scoring_methods:
            if method_name not in results.all_keywords:
                i = len(results.labels)
                cb = wrap_callback(callback, start=i * step,
                                   end=(i + 1) * step)

                needs_tokens = method_name in ScoringMethods.TOKEN_METHODS
                kw = {"progress_callback": cb}
                kw.update(scoring_methods_kwargs.get(method_name, {}))

                keywords = func(tokens if needs_tokens else documents, **kw)
                results.all_keywords[method_name] = keywords

            keywords = results.all_keywords[method_name]
            scores[method_name] = \
                dict(AggregationMethods.aggregate(keywords, agg_method))

            results.labels.append(method_name)

    scores = pd.DataFrame(scores)
    if words:
        # Normalize words
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                words = [preprocessor.normalizer(w) for w in words]

        # Filter scores using words
        existing_words = [w for w in set(words) if w in scores.index]
        scores = scores.loc[existing_words] if existing_words \
            else scores.iloc[:0]

    results.scores = scores.reset_index().sort_values(
        by=[results.labels[0], "index"],
        ascending=[False, True]
    ).values.tolist()

    return results
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    if callback is None:
        callback = dummy_callback
    corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
    return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))