def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    Groups threads together into communities.

    :param corpus: the Corpus to use

    :return: Modifies and returns the Corpus with a new meta key: "communityEmbedder",
        value: Dict, containing "pts": an array with rows corresponding to embedded
        communities, and "labels": an array whose ith entry is the community of the
        ith row of pts.
    """
    if self.community_key is None:
        raise RuntimeError(
            "Must specify community_key to retrieve label information from utterance"
        )

    corpus_meta = corpus.get_meta()
    if "threadEmbedder" not in corpus_meta:
        raise RuntimeError(
            "Missing threadEmbedder metadata: "
            "ThreadEmbedder.fit_transform() must be run on the Corpus first"
        )

    thread_embed_data = corpus_meta["threadEmbedder"]
    X_mid = thread_embed_data["X"]
    roots = thread_embed_data["roots"]

    if self.method.lower() == "svd":
        f = TruncatedSVD
    elif self.method.lower() == "tsne":
        f = TSNE
    elif self.method.lower() == "none":
        f = None
    else:
        raise Exception("Invalid embed_communities embedding method")

    if f is not None:
        X_embedded = f(n_components=self.n_components).fit_transform(X_mid)
    else:
        X_embedded = X_mid

    labels = [
        corpus.get_utterance(root).meta[self.community_key] for root in roots
    ]

    # group the embedded threads by community label, normalizing each row
    subs = defaultdict(list)
    for x, label in zip(X_embedded, labels):
        subs[label].append(x / np.linalg.norm(x))
    labels, subs = zip(*subs.items())
    # represent each community by the mean of its normalized thread embeddings
    pts = [np.mean(sub, axis=0) for sub in subs]

    retval = {"pts": pts, "labels": labels}
    corpus.add_meta("communityEmbedder", retval)
    return corpus
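# --- Usage sketch (illustrative, not part of the library source) ---
# A minimal end-to-end pipeline for the transformer above. It assumes HyperConvo,
# ThreadEmbedder, and CommunityEmbedder are exported at the convokit top level, and
# that root utterances carry a "subreddit" metadata field; "reddit-corpus-small" is
# used purely as an example dataset name.
from convokit import Corpus, download, HyperConvo, ThreadEmbedder, CommunityEmbedder

corpus = Corpus(filename=download("reddit-corpus-small"))
HyperConvo().fit_transform(corpus)                    # per-thread hypergraph features
ThreadEmbedder(n_components=7).fit_transform(corpus)  # low-dimensional thread embeddings
CommunityEmbedder(community_key="subreddit", method="tsne").fit_transform(corpus)

community_embeddings = corpus.meta["communityEmbedder"]
print(community_embeddings["labels"])  # one label per community
print(community_embeddings["pts"])     # one mean embedding per community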
def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    :param corpus: the Corpus to use

    :return: Modifies and returns the corpus with a new meta key: "threadEmbedder",
        value: Dict, containing "X": an array with rows corresponding to embedded
        threads, and "roots": an array whose ith entry is the thread root id of the
        ith row of X. If return_components is True, then the Dict contains a third
        key "components": the SVD components array.
    """
    convos = list(corpus.iter_conversations())
    sample_convo = convos[0]
    if "hyperconvo" not in sample_convo.meta:
        raise RuntimeError(
            "Missing thread statistics: HyperConvo.fit_transform() "
            "must be run on the Corpus first"
        )

    thread_stats = dict()
    for convo in convos:
        thread_stats.update(convo.meta["hyperconvo"])

    X = []
    roots = []
    for root, feats in thread_stats.items():
        roots.append(root)
        # flatten each thread's features into a fixed-order vector,
        # replacing NaN / inf values with 0
        row = np.array([
            v[1] if not (np.isnan(v[1]) or np.isinf(v[1])) else 0
            for v in sorted(feats.items())
        ])
        X.append(row)
    X = np.array(X)

    if self.norm_method.lower() == "standard":
        X = StandardScaler().fit_transform(X)
    elif self.norm_method.lower() == "none":
        pass
    else:
        raise Exception("Invalid embed_feats normalization method")

    if self.method.lower() == "svd":
        f = TruncatedSVD
    elif self.method.lower() == "tsne":
        f = TSNE
    else:
        raise Exception("Invalid embed_feats embedding method")

    emb = f(n_components=self.n_components)
    X_mid = emb.fit_transform(X)
    # only TruncatedSVD exposes singular_values_; t-SNE embeddings are left unscaled
    if hasattr(emb, "singular_values_"):
        X_mid = X_mid / emb.singular_values_

    retval = {"X": X_mid, "roots": roots}
    if self.return_components:
        retval["components"] = emb.components_
    corpus.add_meta("threadEmbedder", retval)
    return corpus
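# --- Usage sketch (illustrative, not part of the library source) ---
# Assuming HyperConvo has already been applied to `corpus` (so each conversation
# carries a "hyperconvo" meta entry). The constructor argument names below mirror
# the attributes used in the method above (n_components, method, norm_method,
# return_components) and should be adjusted if the actual signature differs.
te = ThreadEmbedder(n_components=5, method="svd",
                    norm_method="standard", return_components=True)
te.fit_transform(corpus)

thread_embeddings = corpus.meta["threadEmbedder"]
X = thread_embeddings["X"]                      # one row per thread
roots = thread_embeddings["roots"]              # root utterance id for each row of X
components = thread_embeddings["components"]    # SVD components (return_components=True)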
def test_corpus_metadata(self):
    """
    Merge with overlap in corpus metadata

    Expect second corpus metadata to override if keys are the same
    """
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])
    corpus2 = Corpus(utterances=[
        Utterance(id="3", text="i like pie", speaker=Speaker(id="delta")),
        Utterance(id="4", text="this is a sentence", speaker=Speaker(id="echo")),
        Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
    ])

    corpus1.add_meta('politeness', 0.95)
    corpus1.add_meta('toxicity', 0.8)
    corpus2.add_meta('toxicity', 0.9)
    corpus2.add_meta('paggro', 1.0)

    merged = corpus1.merge(corpus2)
    self.assertEqual(len(merged.meta), 3)
    self.assertEqual(merged.meta['toxicity'], 0.9)
def transform(self, corpus: Corpus,
              obj_type: str,
              group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None,
              group_model_attr_key: Callable[[str, str], str] = None,
              selector: Callable[[CorpusComponent], bool] = lambda _: True,
              target_text_func: Callable[[Utterance], List[str]] = None):
    """
    Annotates `obj_type` components in a corpus with surprise scores. Should be
    called after fit().

    :param corpus: corpus to compute surprise for.
    :param obj_type: the type of corpus components to annotate. Should be either
        'utterance', 'speaker', 'conversation', or 'corpus'.
    :param group_and_models: optional function that defines how an utterance should
        be grouped to form a target text and what models (contexts) the group should
        be compared to when calculating surprise. Takes in an utterance and returns
        a tuple containing the name of the group the utterance belongs to and a list
        of models to calculate how surprising that group is against. Objects will be
        annotated with a metadata field `self.surprise_attr_name` that maps a key
        corresponding to the `groupname` and `modelkey` to the surprise score for
        utterances in the group when compared to the model. The key used is defined
        by the `group_model_attr_key` parameter. If `group_and_models` is `None`,
        `self.model_key_selector` will be used to select the group that an utterance
        belongs to. The surprise score will be calculated for each group of
        utterances compared to the model in `self.models` corresponding to the group.
    :param group_model_attr_key: optional function to define what key should be used
        for a given `groupname` and `modelkey`. If `group_model_attr_key` is `None`,
        the default key used will be "GROUP_groupname_MODEL_modelkey" unless
        `groupname` and `modelkey` are equal, in which case just "modelkey" will be
        used as the key.
    :param selector: function to select objects to annotate. If the function returns
        True, the object will be annotated.
    :param target_text_func: optional function to define what the target text
        corresponding to an utterance should be. Takes in an utterance and returns
        a list of string tokens.
    """
    if obj_type == 'corpus':
        utt_groups = defaultdict(list)
        group_models = defaultdict(set)
        for utt in corpus.iter_utterances():
            if group_and_models:
                group_name, models = group_and_models(utt)
            else:
                group_name = self.model_key_selector(utt)
                models = {group_name}
            # with a target_text_func, each group's target text is set once;
            # otherwise every utterance's tokens are appended to its group
            if target_text_func:
                if group_name not in utt_groups:
                    utt_groups[group_name] = [target_text_func(utt)]
            else:
                utt_groups[group_name].append(self.tokenizer(utt.text))
            group_models[group_name].update(models)
        surprise_scores = {}
        for group_name in tqdm(utt_groups, desc='transform'):
            for model_key in group_models[group_name]:
                context = self.model_groups[model_key]
                target = list(chain(*utt_groups[group_name]))
                surprise_scores[
                    Surprise._format_attr_key(group_name, model_key, group_model_attr_key)
                ] = self._compute_surprise(target, context)
        corpus.add_meta(self.surprise_attr_name, surprise_scores)
    elif obj_type == 'utterance':
        for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'):
            if group_and_models:
                group_name, models = group_and_models(utt)
                surprise_scores = {}
                for model_key in models:
                    context = self.model_groups[model_key]
                    target = target_text_func(utt) if target_text_func else self.tokenizer(utt.text)
                    surprise_scores[
                        Surprise._format_attr_key(group_name, model_key, group_model_attr_key)
                    ] = self._compute_surprise(target, context)
                utt.add_meta(self.surprise_attr_name, surprise_scores)
            else:
                group_name = self.model_key_selector(utt)
                context = self.model_groups[group_name]
                target = target_text_func(utt) if target_text_func else self.tokenizer(utt.text)
                utt.add_meta(self.surprise_attr_name, self._compute_surprise(target, context))
    else:
        for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'):
            utt_groups = defaultdict(list)
            group_models = defaultdict(set)
            for utt in obj.iter_utterances():
                if group_and_models:
                    group_name, models = group_and_models(utt)
                else:
                    group_name = self.model_key_selector(utt)
                    models = {group_name}
                if target_text_func:
                    if group_name not in utt_groups:
                        utt_groups[group_name] = [target_text_func(utt)]
                else:
                    utt_groups[group_name].append(self.tokenizer(utt.text))
                group_models[group_name].update(models)
            surprise_scores = {}
            for group_name in utt_groups:
                for model_key in group_models[group_name]:
                    assert model_key in self.model_groups, 'invalid model key'
                    if not self.model_groups[model_key]:
                        continue
                    context = self.model_groups[model_key]
                    target = list(chain(*utt_groups[group_name]))
                    surprise_scores[
                        Surprise._format_attr_key(group_name, model_key, group_model_attr_key)
                    ] = self._compute_surprise(target, context)
            obj.add_meta(self.surprise_attr_name, surprise_scores)
    return corpus
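# --- Usage sketch (illustrative, not part of the library source) ---
# A minimal fit()/transform() sketch for the Surprise transformer above. Only
# constructor arguments that the method references via `self` are used here
# (model_key_selector, surprise_attr_name); grouping by speaker id and the
# pre-existing `corpus` variable are illustrative assumptions.
surp = Surprise(
    model_key_selector=lambda utt: utt.speaker.id,  # one model per speaker
    surprise_attr_name="surprise",
)
surp.fit(corpus)                            # build the per-group context models
surp.transform(corpus, obj_type="speaker")  # annotate each speaker with surprise scores

for speaker in corpus.iter_speakers():
    print(speaker.id, speaker.meta["surprise"])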