Code example #1
    def fit_transform(self, corpus: Corpus) -> Corpus:
        """
        Groups threads together into communities.

        :param corpus: the Corpus to use

        :return: Modifies and returns the Corpus with a new meta key: "communityEmbedder", value: Dict,
                containing "pts": an array with rows corresponding to embedded communities,
                and "labels": an array whose ith entry is the community label of the ith row of "pts".
        """
        if self.community_key is None:
            raise RuntimeError(
                "Must specify community_key to retrieve label information from utterance"
            )

        corpus_meta = corpus.get_meta()
        if "threadEmbedder" not in corpus_meta:
            raise RuntimeError(
                "Missing threadEmbedder metadata: "
                "threadEmbedder.fit_transform() must be run on the Corpus first"
            )

        thread_embed_data = corpus_meta["threadEmbedder"]

        X_mid = thread_embed_data["X"]
        roots = thread_embed_data["roots"]

        if self.method.lower() == "svd":
            f = TruncatedSVD
        elif self.method.lower() == "tsne":
            f = TSNE
        elif self.method.lower() == "none":
            f = None
        else:
            raise ValueError("Invalid embed_communities embedding method")

        if f is not None:
            X_embedded = f(n_components=self.n_components).fit_transform(X_mid)
        else:
            X_embedded = X_mid

        labels = [
            corpus.get_utterance(root).meta[self.community_key]
            for root in roots
        ]
        # Group embedded threads by community label, normalizing each embedding
        # to unit length before averaging each group into a single point.
        subs = defaultdict(list)
        for x, label in zip(X_embedded, labels):
            subs[label].append(x / np.linalg.norm(x))

        labels, subs = zip(*subs.items())
        pts = [np.mean(sub, axis=0) for sub in subs]

        retval = {"pts": pts, "labels": labels}
        corpus.add_meta("communityEmbedder", retval)

        return corpus
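
For context, here is a minimal end-to-end sketch of how this transformer might be driven. It assumes ConvoKit's HyperConvo and ThreadEmbedder transformers and constructor parameters mirroring the attributes used above (community_key, method, n_components); the corpus name and keyword arguments are illustrative assumptions, not a confirmed API.

# Hypothetical pipeline: HyperConvo -> ThreadEmbedder -> CommunityEmbedder.
# Constructor arguments are assumptions inferred from the attributes used above.
from convokit import Corpus, download, HyperConvo, ThreadEmbedder, CommunityEmbedder

corpus = Corpus(filename=download("reddit-corpus-small"))
corpus = HyperConvo().fit_transform(corpus)
corpus = ThreadEmbedder(n_components=7).fit_transform(corpus)
corpus = CommunityEmbedder(community_key="subreddit", method="tsne").fit_transform(corpus)

embedding = corpus.get_meta()["communityEmbedder"]
for label, pt in zip(embedding["labels"], embedding["pts"]):
    print(label, pt[:3])  # community label and first few embedding dimensions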
Code example #2
    def fit_transform(self, corpus: Corpus) -> Corpus:
        """
        :param corpus: the Corpus to use

        :return: Modifies and returns corpus with new meta key: "threadEmbedder",
             value: Dict, containing "X": an array with rows corresponding
             to embedded threads, "roots": an array whose ith entry is the
             thread root id of the ith row of X. If return_components is True,
             then the Dict contains a third key "components": the SVD components array
        """
        # Materialize the conversations so the sanity check below does not
        # consume the first element of the iterator before the main loop.
        convos = list(corpus.iter_conversations())
        if "hyperconvo" not in convos[0].meta:
            raise RuntimeError(
                "Missing thread statistics: HyperConvo.fit_transform() must be run on the Corpus first"
            )

        thread_stats = dict()

        for convo in convos:
            thread_stats.update(convo.meta["hyperconvo"])

        X = []
        roots = []
        for root, feats in thread_stats.items():
            roots.append(root)
            # Replace non-finite (NaN/Inf) feature values with 0, sorting by
            # feature name so every row has a consistent column order.
            row = np.array([
                v[1] if np.isfinite(v[1]) else 0
                for v in sorted(feats.items())
            ])
            X.append(row)
        X = np.array(X)

        if self.norm_method.lower() == "standard":
            X = StandardScaler().fit_transform(X)
        elif self.norm_method.lower() == "none":
            pass
        else:
            raise ValueError("Invalid embed_feats normalization method")

        if self.method.lower() == "svd":
            f = TruncatedSVD
        elif self.method.lower() == "tsne":
            f = TSNE
        else:
            raise ValueError("Invalid embed_feats embedding method")

        emb = f(n_components=self.n_components)
        X_mid = emb.fit_transform(X)
        if self.method.lower() == "svd":
            # Only TruncatedSVD exposes singular_values_; TSNE would crash here.
            X_mid = X_mid / emb.singular_values_

        retval = {"X": X_mid, "roots": roots}
        if self.return_components and hasattr(emb, "components_"):
            # components_ likewise exists only for the SVD embedding.
            retval["components"] = emb.components_

        corpus.add_meta("threadEmbedder", retval)
        return corpus
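
The metadata written here is what CommunityEmbedder in code example #1 consumes. A short sketch of reading it back, with key names taken from the docstring above:

# Read back the thread embedding stored by fit_transform.
te = corpus.get_meta()["threadEmbedder"]
X, roots = te["X"], te["roots"]  # rows of X line up with entries of roots
print(len(roots), X.shape)
if "components" in te:           # present when return_components=True (SVD only)
    print(te["components"].shape)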
Code example #3
    def test_corpus_metadata(self):
        """
        Merge with overlap in corpus metadata

        Expect second corpus metadata to override if keys are the same
        """
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
            Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
            Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
        ])

        corpus2 = Corpus(utterances=[
            Utterance(id="3", text="i like pie", speaker=Speaker(id="delta")),
            Utterance(id="4", text="this is a sentence", speaker=Speaker(id="echo")),
            Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
        ])

        corpus1.add_meta('politeness', 0.95)
        corpus1.add_meta('toxicity', 0.8)

        corpus2.add_meta('toxicity', 0.9)
        corpus2.add_meta('paggro', 1.0)

        merged = corpus1.merge(corpus2)
        self.assertEqual(len(merged.meta), 3)
        self.assertEqual(merged.meta['toxicity'], 0.9)
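
Spelled out, the assertions imply the following merged metadata (a sketch, assuming merged.meta converts to a plain dict):

# Expected merged metadata: distinct keys are unioned, and corpus2's
# 'toxicity' (0.9) overrides corpus1's (0.8).
expected = {"politeness": 0.95, "toxicity": 0.9, "paggro": 1.0}
assert dict(merged.meta) == expected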
Code example #4
    def transform(self,
                  corpus: Corpus,
                  obj_type: str,
                  group_and_models: Callable[[Utterance],
                                             Tuple[str, List[str]]] = None,
                  group_model_attr_key: Callable[[str, str], str] = None,
                  selector: Callable[[CorpusComponent], bool] = lambda _: True,
                  target_text_func: Callable[[Utterance], List[str]] = None):
        """
    Annotates `obj_type` components in a corpus with surprise scores. Should be 
    called after fit().

    :param corpus: corpus to compute surprise for.
    :param obj_type: the type of corpus components to annotate. Should be either 
        'utterance', 'speaker', 'conversation', or 'corpus'. 
    :param group_and_models: optional function that defines how an utterance should 
        be grouped to form a target text and what models (contexts) the group should 
        be compared to when calculating surprise. Takes in an utterance and returns 
        a tuple containing the name of the group the utterance belongs to and a 
        list of models to calculate how surprising that group is against. Objects 
        will be annotated with a metadata field `self.surprise_attr_name` that is 
        maps a key corresponding to the `groupname` and `modelkey` to the surprise 
        score for utterances in the group when compared to the model. The key used 
        is defined by the `group_model_attr_key` parameter.
        If `group_and_models` is `None`, `self.model_key_selector` will be used 
        to select the group that an utterance belongs to. The surprise score will 
        be calculated for each group of utterances compared to the model in 
        `self.models` corresponding to the group.
    :param group_model_attr_key: optional function to define what key should be used 
        for a given `groupname` and `modelkey`. 
        If `group_model_attr_key` is `None`, the default key used will be 
        "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal 
        in which case just "modelkey" will be used as the key.
    :param selector: function to select objects to annotate. if function returns true, object will be annotated.
    :param target_text_func: optional function to define what the target text corresponding to an utterance should be. 
        takes in an utterance and returns a list of string tokens
    """
        if obj_type == 'corpus':
            utt_groups = defaultdict(list)
            group_models = defaultdict(set)
            for utt in corpus.iter_utterances():
                if group_and_models:
                    group_name, models = group_and_models(utt)
                else:
                    group_name = self.model_key_selector(utt)
                    models = {group_name}
                if target_text_func:
                    # Note: only one target text is stored per group, derived from
                    # the first utterance seen in that group.
                    if group_name not in utt_groups:
                        utt_groups[group_name] = [target_text_func(utt)]
                else:
                    utt_groups[group_name].append(self.tokenizer(utt.text))
                group_models[group_name].update(models)
            surprise_scores = {}
            for group_name in tqdm(utt_groups, desc='transform'):
                for model_key in group_models[group_name]:
                    context = self.model_groups[model_key]
                    target = list(chain(*utt_groups[group_name]))
                    attr_key = Surprise._format_attr_key(group_name, model_key,
                                                         group_model_attr_key)
                    surprise_scores[attr_key] = self._compute_surprise(target, context)
            corpus.add_meta(self.surprise_attr_name, surprise_scores)
        elif obj_type == 'utterance':
            for utt in tqdm(corpus.iter_utterances(selector=selector),
                            desc='transform'):
                if group_and_models:
                    group_name, models = group_and_models(utt)
                    surprise_scores = {}
                    for model_key in models:
                        context = self.model_groups[model_key]
                        target = (target_text_func(utt) if target_text_func
                                  else self.tokenizer(utt.text))
                        attr_key = Surprise._format_attr_key(group_name, model_key,
                                                             group_model_attr_key)
                        surprise_scores[attr_key] = self._compute_surprise(target, context)
                    utt.add_meta(self.surprise_attr_name, surprise_scores)
                else:
                    group_name = self.model_key_selector(utt)
                    context = self.model_groups[group_name]
                    target = (target_text_func(utt) if target_text_func
                              else self.tokenizer(utt.text))
                    utt.add_meta(self.surprise_attr_name,
                                 self._compute_surprise(target, context))
        else:
            for obj in tqdm(corpus.iter_objs(obj_type, selector=selector),
                            desc='transform'):
                utt_groups = defaultdict(list)
                group_models = defaultdict(set)
                for utt in obj.iter_utterances():
                    if group_and_models:
                        group_name, models = group_and_models(utt)
                    else:
                        group_name = self.model_key_selector(utt)
                        models = {group_name}
                    if target_text_func:
                        # As above, only one target text is stored per group.
                        if group_name not in utt_groups:
                            utt_groups[group_name] = [target_text_func(utt)]
                    else:
                        utt_groups[group_name].append(self.tokenizer(utt.text))
                    group_models[group_name].update(models)
                surprise_scores = {}
                for group_name in utt_groups:
                    for model_key in group_models[group_name]:
                        assert model_key in self.model_groups, 'invalid model key'
                        # Skip models with no fitted context.
                        if not self.model_groups[model_key]:
                            continue
                        context = self.model_groups[model_key]
                        target = list(chain(*utt_groups[group_name]))
                        attr_key = Surprise._format_attr_key(group_name, model_key,
                                                             group_model_attr_key)
                        surprise_scores[attr_key] = self._compute_surprise(target, context)
                obj.add_meta(self.surprise_attr_name, surprise_scores)
        return corpus
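
A minimal usage sketch for this transformer. The constructor arguments mirror the attributes referenced above (model_key_selector, surprise_attr_name) and the corpus name is illustrative; treat both as assumptions rather than the confirmed signature.

# Hypothetical usage: one model per speaker, annotating speaker objects.
from convokit import Corpus, download, Surprise

corpus = Corpus(filename=download("subreddit-Cornell"))
surp = Surprise(model_key_selector=lambda utt: utt.speaker.id,
                surprise_attr_name="surprise")
surp.fit(corpus)
corpus = surp.transform(corpus, obj_type="speaker")

for spk in corpus.iter_speakers():
    print(spk.id, spk.meta.get("surprise"))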