Example 1
 def generate(
     self,
     words: List[str],
     callback: Callable = dummy_callback
 ) -> Dict:
     if len(words) == 0:
         return {}
     if len(words) == 1:
         return {words[0]: {}}
     if len(words) == 2:
         return {sorted(words)[0]: {sorted(words)[1]: {}}}
     sims = self._get_similarities(
         words,
         self._get_embeddings(words, wrap_callback(callback, end=0.1)),
         wrap_callback(callback, start=0.1, end=0.2)
     )
     if len(words) == 3:
         root = np.argmin(np.sum(sims, axis=1))
         rest = sorted([words[i] for i in range(3) if i != root])
         return {words[root]: {rest[0]: {}, rest[1]: {}}}
     ontology, root = generate_ontology(
         words,
         sims,
         callback=wrap_callback(callback, start=0.2)
     )
     return Tree.from_prufer_sequence(ontology, words, root).to_dict()
Example 2
def run(data: Table, learner: Learner, state: TaskState) -> Results:
    results = Results()
    if not data:
        return results

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Initializing...")
    model = learner(data, wrap_callback(callback, end=0.6))
    pred = model(data, wrap_callback(callback, start=0.6, end=0.99))

    col = pred.get_column_view(model.outlier_var)[0]
    inliers_ind = np.where(col == 1)[0]
    outliers_ind = np.where(col == 0)[0]

    results.inliers = data[inliers_ind]
    results.outliers = data[outliers_ind]
    results.annotated_data = pred
    callback(1)
    return results
Example 3
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     if callback is None:
         callback = dummy_callback
     corpus = TokenizedPreprocessor.__call__(
         self, corpus, wrap_callback(callback, end=0.2))
     callback(0.2, "Fitting filter...")
     self._fit(corpus)
     return self._filter_tokens(corpus, wrap_callback(callback, start=0.6))
Example 4
def embedding_keywords(
    corpus: Corpus,
    language: str = "English",
    progress_callback: Callable = None,
) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using Embeddings.

    Parameters
    ----------
    corpus
        Lists of tokens
    language
        Language of documents
    progress_callback
        Function for reporting progress.

    Returns
    -------
    Keywords with scores
    """
    if len(corpus) == 0:
        return []
    if progress_callback is None:
        progress_callback = dummy_callback

    tokens = list(corpus.ngrams)
    # prepare structures
    language = EMBEDDING_LANGUAGE_MAPPING[language]
    doc_embs, word_embs, word2doc = _embedd_tokens(
        tokens, language, wrap_callback(progress_callback, 0, 0.7))
    doc2word = [set(t) for t in tokens]
    word2ind = {w: i for i, w in enumerate(word2doc)}

    # Many combinations of distances will not be used since each document does
    # not include all words. It is still much faster to compute all distance
    # pairs at once thanks to matrix calculations.
    distances = cos_dist(doc_embs, word_embs)
    # the sum of document embeddings for each word
    dist_sums = {
        w: distances[list(dcs), i].sum()
        for i, (w, dcs) in enumerate(word2doc.items())
    }

    cb = wrap_callback(progress_callback, 0.7, 1)
    # compute keywords scores
    doc_desc = []
    for j in range(doc_embs.shape[0]):
        scores = []
        for word in doc2word[j]:
            l_ = len(word2doc[word])
            dist = distances[j, word2ind[word]]
            mean_distance = ((dist_sums[word] - dist) /
                             (l_ - 1)) if l_ > 1 else 0
            scores.append((word, dist - mean_distance))
        doc_desc.append(sorted(scores, key=itemgetter(1)))
        cb((j + 1) / len(doc_embs))
    return doc_desc
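A hedged usage sketch of embedding_keywords follows; the import paths, the bundled corpus name, and the use of BASE_TOKENIZER to guarantee tokens are assumptions for illustration only:

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import BASE_TOKENIZER

# Load a small corpus and make sure it is tokenized before keyword extraction.
corpus = BASE_TOKENIZER(Corpus.from_file("book-excerpts"))
keywords = embedding_keywords(corpus, language="English")
# keywords[i] holds (word, score) pairs for document i, sorted by score.
print(keywords[0][:5])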
Example 5
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     corpus = super().__call__(corpus)
     if callback is None:
         callback = dummy_callback
     callback(0, "Transforming...")
     corpus = self._store_documents(corpus, wrap_callback(callback,
                                                          end=0.5))
     return self._store_tokens(corpus, wrap_callback(callback, start=0.5)) \
         if corpus.has_tokens() else corpus
Example 6
 def score(self, tree: Dict, callback: Callable = dummy_callback) -> float:
     tree = Tree.from_dict(tree)
     sims = self._get_similarities(
         tree.labels,
         self._get_embeddings(tree.labels, wrap_callback(callback, end=0.7)),
         wrap_callback(callback, start=0.7, end=0.8)
     )
     callback(0.9)
     fitness_function = FitnessFunction(tree.labels, sims).fitness
     return fitness_function(tree, tree.root)[0]
Example 7
def _run(
    corpus: Corpus,
    words: List[str],
    scoring_methods: List[str],
    aggregation: str,
    additional_params: dict,
    state: TaskState,
) -> None:
    """
    Perform word scoring with selected scoring methods

    Parameters
    ----------
    corpus
        Corpus of documents
    words
        List of words used for scoring
    scoring_methods
        Methods to score documents with
    aggregation
        Aggregation applied to word scores for each document
    additional_params
        Additional parameters for scoring methods (e.g. embedding needs the
        text language)
    state
        TaskState for reporting the task status and giving partial results
    """
    def callback(i: float) -> None:
        state.set_progress_value(i * 100)
        if state.is_interruption_requested():
            raise Exception

    cb_part = 1 / (len(scoring_methods) + 1)  # +1 for preprocessing

    words = _preprocess_words(corpus, words,
                              wrap_callback(callback, end=cb_part))
    if len(words) == 0:
        raise Exception(
            "Empty word list after preprocessing. Please provide a valid set of words."
        )
    for i, sm in enumerate(scoring_methods):
        scoring_method = SCORING_METHODS[sm][1]
        sig = signature(scoring_method)
        add_params = {
            k: v
            for k, v in additional_params.items() if k in sig.parameters
        }
        scs = scoring_method(
            corpus, words,
            wrap_callback(callback,
                          start=(i + 1) * cb_part,
                          end=(i + 2) * cb_part), **add_params)
        scs = AGGREGATIONS[aggregation](scs, axis=1)
        state.set_partial_result((sm, aggregation, scs))
Example 8
    def test_wrap_callback(self):
        def func(i):
            return i

        f = wrap_callback(func, start=0, end=0.8)
        self.assertEqual(f(0), 0)
        self.assertEqual(round(f(0.1), 2), 0.08)
        self.assertEqual(f(1), 0.8)

        f = wrap_callback(func, start=0.1, end=0.8)
        self.assertEqual(f(0), 0.1)
        self.assertEqual(f(0.1), 0.17)
        self.assertEqual(f(1), 0.8)
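The test above pins down the semantics of wrap_callback: the wrapped callback rescales a progress value from [0, 1] linearly onto [start, end]. Neither helper appears in these examples, so here is a minimal sketch consistent with that test (an assumption; the actual Orange implementation may differ in details such as validation):

from typing import Callable


def dummy_callback(*args, **kwargs) -> bool:
    # Default no-op progress callback: accepts and ignores any arguments.
    return True


def wrap_callback(callback: Callable, start: float = 0, end: float = 1) -> Callable:
    # Rescale a progress value from [0, 1] onto the sub-interval [start, end],
    # so a nested task reports progress within its share of the total work.
    def wrapped(progress: float, *args, **kwargs):
        return callback(start + progress * (end - start), *args, **kwargs)

    return wrapped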
Example 9
    def _get_embeddings(
        self,
        words: List[str],
        callback: Callable = dummy_callback
    ) -> np.array:
        embeddings = np.zeros((len(words), EMB_DIM))
        missing, missing_idx = list(), list()
        ticks = iter(np.linspace(0.0, 0.6, len(words)))
        for i, word in enumerate(words):
            callback(next(ticks))
            emb = self.storage.get_embedding(word)
            if emb is None:
                missing.append(word)
                missing_idx.append(i)
            else:
                embeddings[i, :] = emb
        if len(missing_idx) > 0:
            embs = self.embedder(missing, callback=wrap_callback(callback, start=0.6, end=0.9))
            if None in embs:
                raise RuntimeError("Couldn't obtain embeddings.")
            embeddings[missing_idx, :] = np.array(embs)
        for i in missing_idx:
            self.storage.save_embedding(words[i], embeddings[i, :])

        return embeddings
Example 10
def _run(
    data: Table,
    group_by_attrs: List[Variable],
    aggregations: Dict[Variable, Set[str]],
    result: Result,
    state: TaskState,
) -> Result:
    def progress(part):
        state.set_progress_value(part * 100)
        if state.is_interruption_requested():
            raise Exception

    state.set_status("Aggregating")
    # group table rows
    if result.group_by is None:
        result.group_by = data.groupby(group_by_attrs)
    state.set_partial_result(result)

    aggregations = {
        var: [(agg, AGGREGATIONS[agg].function)
              for agg in sorted(aggs, key=AGGREGATIONS_ORD.index)]
        for var, aggs in aggregations.items()
    }
    result.result_table = result.group_by.aggregate(
        aggregations, wrap_callback(progress, 0.2, 1))
    return result
Example 11
    def insert(
        self,
        tree: Dict,
        words: List[str],
        callback: Callable = dummy_callback
    ) -> Dict:
        tree = Tree.from_dict(tree)
        self._get_embeddings(words, wrap_callback(callback, end=0.3))
        ticks = iter(np.linspace(0.3, 0.9, len(words)))

        for word in words:
            tick = next(ticks)
            tree.adj_list.append(set())
            tree.labels.append(word)
            sims = self._get_similarities(
                tree.labels,
                self._get_embeddings(tree.labels, lambda x: callback(tick)),
                lambda x: callback(tick)
            )
            idx = len(tree.adj_list) - 1
            fitness_function = FitnessFunction(tree.labels, sims).fitness
            scores = list()
            for i in range(idx):
                tree.adj_list[i].add(idx)
                tree.adj_list[idx].add(i)
                scores.append(fitness_function(tree, tree.root)[0])
                tree.adj_list[i].remove(idx)
                tree.adj_list[idx].remove(i)
            best = np.argmax(scores)
            tree.adj_list[best].add(idx)
            tree.adj_list[idx].add(best)
            callback(tick)

        return tree.to_dict()
Example 12
    def __call__(self, data, progress_callback=None):

        for cls in type(self).mro():
            if 'incompatibility_reason' in cls.__dict__:
                incompatibility_reason = \
                    self.incompatibility_reason(data.domain)  # pylint: disable=assignment-from-none
                if incompatibility_reason is not None:
                    raise ValueError(incompatibility_reason)
                break
            if 'check_learner_adequacy' in cls.__dict__:
                warnings.warn(
                    "check_learner_adequacy is deprecated and will be removed "
                    "in upcoming releases. Learners should instead implement "
                    "the incompatibility_reason method.",
                    OrangeDeprecationWarning)
                if not self.check_learner_adequacy(data.domain):
                    raise ValueError(self.learner_adequacy_err_msg)
                break

        origdomain = data.domain

        if isinstance(data, Instance):
            data = Table(data.domain, [data])
        origdata = data

        if progress_callback is None:
            progress_callback = dummy_callback
        progress_callback(0, "Preprocessing...")
        try:
            cb = wrap_callback(progress_callback, end=0.1)
            data = self.preprocess(data, progress_callback=cb)
        except TypeError:
            data = self.preprocess(data)
            warnings.warn("A keyword argument 'progress_callback' has been "
                          "added to the preprocess() signature. Implementing "
                          "the method without the argument is deprecated and "
                          "will result in an error in the future.",
                          OrangeDeprecationWarning)

        if len(data.domain.class_vars) > 1 and not self.supports_multiclass:
            raise TypeError("%s doesn't support multiple class variables" %
                            self.__class__.__name__)

        progress_callback(0.1, "Fitting...")
        model = self._fit_model(data)
        model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
        if not hasattr(model, "domain") or model.domain is None:
            # Some models set the domain themselves and it should be respected,
            # e.g. calibration learners set the base_learner's domain, which
            # would be wrongly overwritten if we set it here for every model.
            model.domain = data.domain
        model.supports_multiclass = self.supports_multiclass
        model.name = self.name
        model.original_domain = origdomain
        model.original_data = origdata
        progress_callback(1)
        return model
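Because progress_callback is part of the public __call__ signature, a caller can supply its own reporter. A minimal hedged sketch (the learner and dataset are illustrative assumptions):

from Orange.data import Table
from Orange.classification import LogisticRegressionLearner


def report(progress, status=""):
    # Textual reporter matching the (value, status) call pattern used above.
    print(f"{progress:.0%} {status}".strip())


data = Table("iris")
model = LogisticRegressionLearner()(data, progress_callback=report)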
Example 13
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)

    cb_part = len(corpus) / (len(corpus) + len(words))
    document_embeddings, skipped = emb.transform(
        corpus, wrap_callback(callback, 0, cb_part))
    assert skipped is None

    words = [[w] for w in words]
    word_embeddings = np.array(
        emb.transform(words, wrap_callback(callback, cb_part, 1)))
    return cosine_similarity(document_embeddings.X, word_embeddings)
Example 14
def permutation_feature_importance(
        model: Model,
        data: Table,
        score: Score,
        n_repeats: int = 5,
        progress_callback: Callable = None
):
    """
    Calculate the feature importance of a model on the given data.

    Parameters
    ----------
    model : Model
        Fitted Orange model.
    data : Table
        Data to calculate the feature importance for.
    score : Score
        Score to use for model evaluation.
    n_repeats : int, optional, default 5
        Number of times a feature is randomly shuffled.
    progress_callback : callable
        The callback for reporting the progress.

    Returns
    -------
    scores : np.ndarray
        Feature importance scores.
    names : List[str]
        Names of the corresponding attributes.

    """
    if progress_callback is None:
        progress_callback = dummy_callback

    data = data.copy()
    _check_data(data)
    needs_pp = _check_model(model, data)

    scorer = _wrap_score(score, needs_pp)
    baseline_score = scorer(model, data)

    n_features = data.X.shape[1]
    step = 1 / n_features
    with data.unlocked():
        perm_scores = [_calculate_permutation_scores(
            model, data, i, n_repeats, scorer,
            wrap_callback(progress_callback, start=i * step,
                          end=(i + 1) * step)
        ) for i in range(n_features)]

    names = [attr.name for attr in data.domain.attributes]
    scores = baseline_score - np.array(perm_scores)
    if isinstance(score, RegressionScore) and not isinstance(score, R2):
        scores = -scores
    return scores, names
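A hedged usage sketch of permutation_feature_importance; the dataset, learner, and score are illustrative assumptions:

import numpy as np

from Orange.classification import RandomForestLearner
from Orange.data import Table
from Orange.evaluation import CA

data = Table("iris")
model = RandomForestLearner()(data)
scores, names = permutation_feature_importance(model, data, CA(), n_repeats=3)
# scores holds one row per feature; average over the repeated shuffles.
for name, feature_scores in zip(names, scores):
    print(name, float(np.mean(feature_scores)))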
Example 15
    def apply_preprocessor(self, data: Optional[Corpus],
                           preprocessor: Optional[PreprocessorList],
                           state: TaskState) -> Result:
        def callback(i: float, status=""):
            state.set_progress_value(i * 100)
            if status:
                state.set_status(status)
            if state.is_interruption_requested():
                raise Exception

        pp_data = None
        msgs = []
        if data and preprocessor is not None:
            pp_data = preprocessor(data, wrap_callback(callback, end=0.9))
            if not pp_data.has_tokens():
                pp_data = BASE_TOKENIZER(pp_data,
                                         wrap_callback(callback, start=0.9))
            if pp_data is not None and len(pp_data.dictionary) == 0:
                msgs.append(self.Warning.no_token_left)
                pp_data = None
        return Result(corpus=pp_data, msgs=msgs)
Example 16
def get_shap_values_and_colors(
    model: Model,
    data: Table,
    progress_callback: Callable = None
) -> Tuple[List[np.ndarray], List[str], np.ndarray, np.ndarray]:
    """
    Compute SHAP values and colors that represent how high a feature's value is
    compared to other values of the same feature. This function provides all
    components required by the Explain Model widget.

    Parameters
    ----------
    model
        Model whose predictions are explained.
    data
        Data whose predictions are explained.
    progress_callback
        The callback for reporting the progress.

    Returns
    -------
    shap_values
        Shapley values for each data item, computed by the SHAP library. The
        result is a list with one array of SHAP values per class - the class
        order is taken from the values in the class_var. Each array in the list
        has shape (num cases x num attributes) - an explanation of each
        attribute's contribution to the final prediction.
    attributes
        The attributes of the table on which the explanation was made: the
        table preprocessed by the model's preprocessors.
    sample_mask
        SHAP values are computed only for a sample of the data. This is a
        boolean mask that tells which rows of data are explained.
    colors
        Colors for each data instance and each feature. The shape of the matrix
        is M x N x C, where M is the number of instances, N is the number of
        features, and C is 3 (one value per RGB channel).
    """
    if progress_callback is None:
        progress_callback = dummy_callback
    cb = wrap_callback(progress_callback, end=0.9)

    shap_values, transformed_data, sample_mask, _ = compute_shap_values(
        model, data, data, progress_callback=cb)

    colors = compute_colors(transformed_data[sample_mask])
    attributes = [t.name for t in transformed_data.domain.attributes]
    progress_callback(1)

    return shap_values, attributes, sample_mask, colors
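A hedged usage sketch of get_shap_values_and_colors; the dataset and learner are illustrative assumptions:

from Orange.classification import RandomForestLearner
from Orange.data import Table

data = Table("heart_disease")
model = RandomForestLearner()(data)
shap_values, attributes, sample_mask, colors = get_shap_values_and_colors(
    model, data)
# One array of SHAP values per class; rows correspond to the sampled instances.
print(len(shap_values), shap_values[0].shape, len(attributes), colors.shape)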
Example 17
    def __call__(self,
                 corpus: Corpus,
                 callback: Callable = None,
                 **kw) -> Corpus:
        """ Marks tokens of a corpus with POS tags. """
        if callback is None:
            callback = dummy_callback
        corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))

        assert corpus.has_tokens()
        callback(0.2, "POS Tagging...")
        tags = np.array(self._preprocess(corpus.tokens, **kw), dtype=object)
        corpus.pos_tags = tags
        return corpus
Example 18
    def __call__(self, data: Table, progress_callback: Callable = None) \
            -> Table:
        assert isinstance(data, Table)
        assert self.outlier_var is not None

        domain = Domain(data.domain.attributes, data.domain.class_vars,
                        data.domain.metas + (self.outlier_var, ))
        if progress_callback is None:
            progress_callback = dummy_callback
        progress_callback(0, "Preprocessing...")
        self._cached_data = self.data_to_model_domain(
            data, wrap_callback(progress_callback, end=0.1))
        progress_callback(0.1, "Predicting...")
        metas = np.hstack((data.metas, self.predict(self._cached_data.X)))
        progress_callback(1)
        return Table.from_numpy(domain, data.X, data.Y, metas)
Example 19
    def search_authors(
        self,
        authors: List[str],
        *,
        max_tweets: Optional[int] = MAX_TWEETS,
        collecting: bool = False,
        callback: Callable = dummy_callback,
    ) -> Optional[Corpus]:
        """
        Search recent tweets by authors.

        Parameters
        ----------
        authors
            A list of authors to search for.
        max_tweets
            Limits the number of downloaded tweets. If None, the API's maximum is used.
        collecting
            Whether to collect results across multiple search calls.
        callback
            Function to report the progress

        Returns
        -------
        Corpus with tweets
        """
        if not collecting:
            self.reset()

        count_sum = 0
        n = len(authors)
        for i, author in enumerate(authors):
            author_ = self.api.get_user(username=author)
            if author_.data is None:
                raise NoAuthorError(author)
            paginator = tweepy.Paginator(
                self.api.get_users_tweets, author_.data.id, **request_settings
            )
            count_sum += self._fetch(
                paginator,
                max_tweets,
                callback=wrap_callback(callback, i / n, (i + 1) / n),
            )
        self.append_history("Author", authors, None, None, count_sum)
        return self._create_corpus()
Example 20
    def __call__(self, corpus: Corpus, callback: Callable = None) \
            -> Corpus:
        """
        Applies a list of preprocessors to the corpus.

        :param corpus: Corpus
        :param callback: progress callback function
        :return: Corpus
            Preprocessed corpus.
        """
        if callback is None:
            callback = dummy_callback
        n_pps = len(list(self.preprocessors))
        for i, pp in enumerate(self.preprocessors):
            start = i / n_pps
            cb = wrap_callback(callback, start=start, end=start + 1 / n_pps)
            corpus = pp(corpus, cb)
        callback(1)
        return corpus
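A hedged usage sketch of chaining preprocessors through this __call__; the specific preprocessor classes, the PreprocessorList constructor, and the corpus name are assumptions for illustration:

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import (
    LowercaseTransformer, PreprocessorList, RegexpTokenizer
)

# Lowercase the documents, then tokenize them, reporting progress via print.
pp = PreprocessorList([LowercaseTransformer(), RegexpTokenizer(r"\w+")])
corpus = pp(Corpus.from_file("book-excerpts"), callback=print)
print(corpus.tokens[0][:10])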
Example 21
    def __call__(self, data, progress_callback=None):
        if not self.check_learner_adequacy(data.domain):
            raise ValueError(self.learner_adequacy_err_msg)

        origdomain = data.domain

        if isinstance(data, Instance):
            data = Table(data.domain, [data])
        origdata = data

        if progress_callback is None:
            progress_callback = dummy_callback
        progress_callback(0, "Preprocessing...")
        try:
            cb = wrap_callback(progress_callback, end=0.1)
            data = self.preprocess(data, progress_callback=cb)
        except TypeError:
            data = self.preprocess(data)
            warnings.warn(
                "A keyword argument 'progress_callback' has been "
                "added to the preprocess() signature. Implementing "
                "the method without the argument is deprecated and "
                "will result in an error in the future.",
                OrangeDeprecationWarning)

        if len(data.domain.class_vars) > 1 and not self.supports_multiclass:
            raise TypeError("%s doesn't support multiple class variables" %
                            self.__class__.__name__)

        progress_callback(0.1, "Fitting...")
        model = self._fit_model(data)
        model.used_vals = [np.unique(y).astype(int) for y in data.Y[:, None].T]
        model.domain = data.domain
        model.supports_multiclass = self.supports_multiclass
        model.name = self.name
        model.original_domain = origdomain
        model.original_data = origdata
        progress_callback(1)
        return model
Example 22
def annotate_documents(
    corpus: Corpus,
    embedding: np.ndarray,
    clustering_method: int,
    n_components: Optional[int] = None,
    epsilon: Optional[float] = None,
    cluster_labels: Optional[np.ndarray] = None,
    fdr_threshold: float = 0.05,
    n_words_in_cluster: int = 10,
    progress_callback: Optional[Callable] = None
) -> Tuple[np.ndarray, Dict[int, ClusterType], int, float, ScoresType]:
    """
    Annotate documents in the corpus by clustering them and assigning
    characteristic terms to each cluster using the hypergeometric
    distribution.

    Return annotated clusters - for each cluster return a list of keywords
    with scores, cluster center coordinates and concave_hulls coordinates.
    Also return the optimal values for n_components/epsilon, if they were
    computed, and the score data (p-values and counts for all keywords).

    Parameters
    ----------
    corpus : Corpus
        Corpus to be annotated.
    embedding : np.ndarray of size len(corpus) × 2
        Usually a t-SNE projection of the corpus' bag of words (BoW).
    clustering_method : int
        0 for DBSCAN
        1 for Gaussian mixture models
        2 for custom clustering where cluster_labels are used
    n_components: int, optional, default = None
        Number of clusters for Gaussian mixture models. If None, set to the
        number of clusters with maximal silhouette.
    epsilon : float, optional, default = None
        epsilon for DBSCAN. If None, optimal value is computed.
    cluster_labels : np.ndarray, optional
        Custom cluster labels. Usually included in corpus.
    fdr_threshold : float, optional, default = 0.05
        FDR threshold applied to the hypergeometric p-values.
    n_words_in_cluster : int, optional, default = 10
        Number of characteristic terms in each cluster.
    progress_callback : callable, optional
        Progress callback.

    Returns
    -------
    cluster_labels : np.ndarray of size len(corpus)
        An array of floats (e.g. 0, 1, np.nan) representing the cluster label
        of each document in the corpus.
    clusters : dict
        Dictionary of keywords with scores, centroids and concave hulls
        for each cluster.
    n_components : int
        Optimal number of clusters for Gaussian mixture models, if the
        n_components is None, and clustering_method is
        ClusterDocuments.GAUSSIAN_MIXTURE. n_components otherwise.
    epsilon : float
        Optimal value for epsilon for DBSCAN, if the epsilon is None, and
        clustering_method is ClusterDocuments.DBSCAN. epsilon otherwise.
    scores : tuple
        Tuple of all keywords with p-values and counts.

    Raises
    ------
    ValueError when there are no clusters in the embedding.

    """
    if progress_callback is None:
        progress_callback = dummy_callback

    if clustering_method == ClusterDocuments.GAUSSIAN_MIXTURE:
        if n_components is None:
            n_components = ClusterDocuments.gmm_compute_n_components(
                embedding, wrap_callback(progress_callback, end=0.3))
        n_components = min([n_components, len(embedding)])
        cluster_labels = ClusterDocuments.gmm(embedding,
                                              n_components=n_components,
                                              threshold=0.6)

    elif clustering_method == ClusterDocuments.DBSCAN:
        if epsilon is None:
            epsilon = ClusterDocuments.dbscan_compute_epsilon(embedding)
        cluster_labels = ClusterDocuments.dbscan(embedding, eps=epsilon)

    else:
        assert cluster_labels is not None
        cluster_labels[np.isnan(cluster_labels)] = -1

    if len(set(cluster_labels) - {-1}) == 0:
        raise ValueError("There are no clusters using current settings.")

    keywords = _get_characteristic_terms(corpus,
                                         n_keywords=20,
                                         progress_callback=wrap_callback(
                                             progress_callback, start=0.5))
    clusters_keywords, all_keywords, scores, p_values = \
        _hypergeom_clusters(cluster_labels, keywords,
                            fdr_threshold, n_words_in_cluster)

    concave_hulls = compute_concave_hulls(embedding, cluster_labels, epsilon)

    centroids = {
        c: tuple(np.mean(concave_hulls[c], axis=0))
        for c in set(cluster_labels) - {-1}
    }

    clusters = {
        int(key): (clusters_keywords[key], centroids[key], concave_hulls[key])
        for key in clusters_keywords
    }

    cluster_labels = cluster_labels.astype(float)
    cluster_labels[cluster_labels == -1] = np.nan

    scores = (all_keywords, scores, p_values)

    return cluster_labels, clusters, n_components, epsilon, scores
Example 23
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     if callback is None:
         callback = dummy_callback
     corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
     callback(0.2, "Normalizing...")
     return self._store_tokens(corpus, wrap_callback(callback, start=0.2))
Example 24
    def runner(self, state: TaskState) -> Table:
        exp_type = self.data_output_options.expression_type[self.exp_type].type
        exp_source = self.data_output_options.expression_sources[
            self.exp_source]
        proc_slug = self.data_output_options.process[self.proc_slug].slug
        collection_id = self.selected_collection_id

        table = self.data_table
        progress_steps_download = iter(np.linspace(0, 50, 2))

        def callback(i: float, status=""):
            state.set_progress_value(i * 100)
            if status:
                state.set_status(status)
            if state.is_interruption_requested():
                raise Exception

        if not table:
            collection = self.res.get_collection_by_id(collection_id)
            coll_table = resdk.tables.RNATables(
                collection,
                expression_source=exp_source,
                expression_process_slug=proc_slug,
                progress_callable=wrap_callback(callback, end=0.5),
            )
            species = coll_table._data[0].output['species']
            sample = coll_table._samples[0]

            state.set_status('Downloading ...')
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            df_exp = coll_table.exp if exp_type != 'rc' else coll_table.rc
            df_exp = df_exp.rename(index=coll_table.readable_index)
            df_metas = coll_table.meta
            df_metas = df_metas.rename(index=coll_table.readable_index)
            df_qc = None
            if self.append_qc_data:
                # TODO: check if there is a way to detect if collection
                #       table contains QC data
                try:
                    df_qc = coll_table.qc
                    df_qc = df_qc.rename(index=coll_table.readable_index)
                except ValueError:
                    pass
            loop.close()

            state.set_status('To data table ...')

            duplicates = {
                item
                for item, count in Counter([
                    label.split('.')[1]
                    for label in df_metas.columns.to_list() if '.' in label
                ]).items() if count > 1
            }

            # what happens if there are more nested sections?
            section_name_to_label = {
                section['name']: section['label']
                for section in sample.descriptor_schema.schema
            }

            column_labels = {}
            for field_schema, fields, path in iterate_schema(
                    sample.descriptor, sample.descriptor_schema.schema,
                    path=''):
                path = path[1:]  # this is ugly, but we can't get around it
                if path not in df_metas.columns:
                    continue
                label = field_schema['label']
                section_name, field_name = path.split('.')
                column_labels[path] = (
                    label if field_name not in duplicates else
                    f'{section_name_to_label[section_name]} - {label}')

            df_exp = df_exp.reset_index(drop=True)
            df_metas = df_metas.astype('object')
            df_metas = df_metas.fillna(np.nan)
            df_metas = df_metas.replace('nan', np.nan)
            df_metas = df_metas.rename(columns=column_labels)
            if df_qc is not None:
                df_metas = pd.merge(df_metas,
                                    df_qc,
                                    left_index=True,
                                    right_index=True)

            xym, domain_metas = vars_from_df(df_metas)
            x, _, m = xym
            x_metas = np.hstack((x, m))
            attrs = [ContinuousVariable(col) for col in df_exp.columns]
            metas = domain_metas.attributes + domain_metas.metas
            domain = Domain(attrs, metas=metas)
            table = Table(domain, df_exp.to_numpy(), metas=x_metas)
            state.set_progress_value(next(progress_steps_download))

            state.set_status('Matching genes ...')
            progress_steps_gm = iter(
                np.linspace(50, 99, len(coll_table.gene_ids)))

            def gm_callback():
                state.set_progress_value(next(progress_steps_gm))

            tax_id = species_name_to_taxid(species)
            gm = GeneMatcher(tax_id, progress_callback=gm_callback)
            table = gm.match_table_attributes(table, rename=True)
            table.attributes[TableAnnotation.tax_id] = tax_id
            table.attributes[TableAnnotation.gene_as_attr_name] = True
            table.attributes[TableAnnotation.gene_id_attribute] = 'Entrez ID'
            self.data_table = table

        state.set_status('Normalizing ...')
        table = self.normalize(table)
        state.set_progress_value(100)

        return table
Example 25
def run(corpus: Optional[Corpus], words: Optional[List], cached_keywords: Dict,
        scoring_methods: Set, scoring_methods_kwargs: Dict, agg_method: int,
        state: TaskState) -> Results:
    results = Results(scores=[], labels=[], all_keywords={})
    if not corpus:
        return results

    # passed by reference (and not copied) - to save partial results
    results.all_keywords = cached_keywords
    if not scoring_methods:
        return results

    def callback(i: float, status=""):
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Calculating...")
    scores = {}
    tokens = corpus.tokens
    documents = corpus.documents
    step = 1 / len(scoring_methods)
    for method_name, func in ScoringMethods.ITEMS:
        if method_name in scoring_methods:
            if method_name not in results.all_keywords:
                i = len(results.labels)
                cb = wrap_callback(callback,
                                   start=i * step,
                                   end=(i + 1) * step)

                needs_tokens = method_name in ScoringMethods.TOKEN_METHODS
                kw = {"progress_callback": cb}
                kw.update(scoring_methods_kwargs.get(method_name, {}))

                keywords = func(tokens if needs_tokens else documents, **kw)
                results.all_keywords[method_name] = keywords

            keywords = results.all_keywords[method_name]
            scores[method_name] = \
                dict(AggregationMethods.aggregate(keywords, agg_method))

            results.labels.append(method_name)

    scores = pd.DataFrame(scores)
    if words:

        # Normalize words
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                words = [preprocessor.normalizer(w) for w in words]

        # Filter scores using words
        existing_words = [w for w in set(words) if w in scores.index]
        scores = scores.loc[existing_words] if existing_words \
            else scores.iloc[:0]

    results.scores = scores.reset_index().sort_values(
        by=[results.labels[0], "index"], ascending=[False,
                                                    True]).values.tolist()

    return results
Example 26
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     if callback is None:
         callback = dummy_callback
     corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
     return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))