def test_with_overlap(self):
    """
    Basic merge: with overlap in utterance id (but utterance has same data & metadata)
    """
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])
    corpus2 = Corpus(utterances=[
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
        Utterance(id="4", text="this is a sentence", speaker=Speaker(id="echo")),
        Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
    ])

    merged = corpus1.merge(corpus2)
    self.assertEqual(len(list(merged.iter_utterances())), 5)
    self.assertEqual(len(list(merged.iter_speakers())), 5)

    # the source corpora should be unchanged by the merge
    self.assertEqual(len(list(corpus1.iter_utterances())), 3)
    self.assertEqual(len(list(corpus2.iter_utterances())), 3)
def transform(self, corpus: Corpus):
    """Adds metadata about politicization to each utterance.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    """
    # assumes the corpus has already been tokenized and stemmed
    assert 'stem_tokens' in next(corpus.iter_utterances()).meta
    for utt in corpus.iter_utterances():
        if utt.meta['valid']:
            utt.meta['num_pol_words'] = len(
                self.key_words.intersection(utt.meta['stem_tokens']))
            utt.meta['political'] = int(utt.meta['num_pol_words'] > 0)
        else:
            utt.meta['num_pol_words'] = None
            utt.meta['political'] = None
    return corpus
def test_overlap_diff_data(self):
    """
    Merge with overlap in utterance id, where the utterance has different
    data but the same metadata.

    A warning should be printed, and the original utterance data should be
    preserved.
    """
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])
    corpus2 = Corpus(utterances=[
        Utterance(id="2", text="this is a test2", speaker=Speaker(id="candace")),
        Utterance(id="4", text="this is a sentence", speaker=Speaker(id="echo")),
        Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
    ])

    merged = corpus1.merge(corpus2)
    self.assertEqual(len(list(merged.iter_utterances())), 5)
    self.assertEqual(len(list(merged.iter_speakers())), 5)
    self.assertEqual(len(list(corpus1.iter_utterances())), 3)
    self.assertEqual(len(list(corpus2.iter_utterances())), 3)

    # corpus1's version of utterance "2" wins the conflict
    self.assertEqual(merged.get_utterance("2").text, "this is a test")
    self.assertEqual(merged.get_utterance("2").speaker, Speaker(id="charlie"))
def transform(self, corpus: Corpus) -> Corpus:
    super().transform(corpus)
    if self.replace_text:
        selector = lambda utt_: self.input_filter(utt_, None)
        for utt in corpus.iter_utterances(selector):
            cleaned_text = utt.get_info(self.output_field)
            if self.save_original:
                # keep the pre-cleaning text under the output field
                utt.set_info(self.output_field, utt.text)
            utt.text = cleaned_text
        if not self.save_original:
            # calling del_info on one utterance deletes the field for all utterances
            next(corpus.iter_utterances(selector)).del_info(self.output_field)
    return corpus
def transform(
        self,
        corpus: Corpus,
        selector: Callable[[Conversation], bool] = lambda convo: True,
        ignore_utterances: Callable[[Utterance], bool] = lambda utt: False
) -> Corpus:
    """
    Annotate the corpus utterances with forecast and forecast score information.

    :param corpus: target Corpus
    :param selector: a (lambda) function that takes a Conversation and returns
        a bool: True if the Conversation is to be included in the
        transformation step. By default, includes all Conversations.
    :param ignore_utterances: a (lambda) function that takes an Utterance and
        returns a bool: True if the Utterance should be excluded from the
        Conversation in the transformation step. By default, all Utterances
        are included.
    :return: annotated Corpus
    """
    id_to_context_reply_label = self._get_context_reply_label_dict(
        corpus, selector, ignore_utterances, include_label=False)
    forecast_df = self.forecaster_model.forecast(id_to_context_reply_label)

    for utt in corpus.iter_utterances():
        if utt.id in forecast_df.index:
            utt.add_meta(self.forecast_feat_name,
                         forecast_df.loc[utt.id][self.forecast_feat_name])
            utt.add_meta(self.forecast_prob_feat_name,
                         forecast_df.loc[utt.id][self.forecast_prob_feat_name])
        else:
            utt.add_meta(self.forecast_feat_name, None)
            utt.add_meta(self.forecast_prob_feat_name, None)
    return corpus
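# A minimal usage sketch for the transform above, assuming `forecaster` is an
# already-fitted instance of this transformer and that Conversations expose
# get_utterance_ids(). The selector and ignore_utterances arguments are
# illustrative: forecast only multi-utterance conversations and skip deleted
# comments.
corpus = forecaster.transform(
    corpus,
    selector=lambda convo: len(convo.get_utterance_ids()) > 1,
    ignore_utterances=lambda utt: utt.text == '[deleted]')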
def _get_scores(self,
                corpus: Corpus,
                selector: Optional[Callable[[Utterance], bool]] = lambda utt: True):
    """
    Calculates the average number of occurrences per utterance. Used in summarize().

    :param corpus: the target Corpus
    :param selector: (lambda) function specifying whether the utterance should be included
    """
    utts = list(corpus.iter_utterances(selector))

    if self.marker_attribute_name not in utts[0].meta:
        print("Could not find politeness markers metadata. "
              "Running transform() on corpus first...", end="")
        self.transform(corpus, markers=True)
        print("Done.")

    # marker keys look like "feature_politeness_==<NAME>==": the slice strips
    # the 21-character prefix and the trailing "==" to recover <NAME>
    counts = {
        k[21:len(k) - 2]: 0
        for k in utts[0].meta[self.marker_attribute_name].keys()
    }
    for utt in utts:
        for k, v in utt.meta[self.marker_attribute_name].items():
            counts[k[21:len(k) - 2]] += len(v)

    scores = {k: v / len(utts) for k, v in counts.items()}
    return scores
def transform(self,
              corpus: Corpus,
              selector: Optional[Callable[[Utterance], bool]] = lambda utt: True,
              markers: bool = False):
    """
    Extract politeness strategies from each utterance in the corpus and
    annotate the utterances with the extracted strategies. Requires that the
    corpus has previously been transformed by a Parser, such that each
    utterance has dependency parse info in its metadata table.

    :param corpus: the corpus to compute features for.
    :param selector: a (lambda) function that takes an Utterance and returns
        a bool indicating whether the utterance should be included in this
        annotation step.
    :param markers: whether or not to add politeness occurrence markers
    """
    for utt in corpus.iter_utterances():
        if selector(utt):
            for i, sent in enumerate(utt.meta["parsed"]):
                for p in sent["toks"]:
                    # normalize tokens: lowercase, keep only letters and
                    # basic punctuation
                    p["tok"] = re.sub("[^a-z,.:;]", "", p["tok"].lower())
            utt.meta[self.ATTR_NAME], marks = get_politeness_strategy_features(utt)
            if markers:
                utt.meta[self.MRKR_NAME] = marks
        else:
            utt.meta[self.ATTR_NAME] = None
            utt.meta[self.MRKR_NAME] = None
    return corpus
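# Typical pipeline sketch for the transform above: parse first so each
# utterance has the "parsed" metadata this method expects, then extract
# strategies with markers. `parser` stands in for a dependency-parsing
# transformer, and PolitenessStrategies is used here as an illustrative name
# for the class this method belongs to.
corpus = parser.transform(corpus)
ps = PolitenessStrategies()
corpus = ps.transform(corpus, markers=True)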
def transform(self,
              corpus: Corpus,
              selector: Callable[[Utterance], bool] = lambda x: True) -> Corpus:
    """
    Annotates the corpus utterances with the lists of fighting words that
    each utterance contains. The relevant fighting words to use are specified
    by FightingWords.top_k or FightingWords.threshold, with
    FightingWords.annot_method indicating which criterion to use.

    Lists are stored under the metadata keys 'fighting_words_class1' and
    'fighting_words_class2'.

    :param corpus: corpus to annotate
    :param selector: a (lambda) function that takes an Utterance and returns
        True/False; this selects the utterances that should be annotated with
        the fighting words
    :return: annotated corpus
    """
    class1_ngrams, class2_ngrams = self.get_top_k_ngrams() if self.annot_method == "top_k" \
        else self.get_ngrams_past_threshold()

    # TODO: improve the efficiency of this; tricky because of ngrams
    for utt in corpus.iter_utterances():
        if selector(utt):
            utt.meta['fighting_words_class1'] = [ngram for ngram in class1_ngrams
                                                 if ngram in utt.text]
            utt.meta['fighting_words_class2'] = [ngram for ngram in class2_ngrams
                                                 if ngram in utt.text]
        else:
            utt.meta['fighting_words_class1'] = None
            utt.meta['fighting_words_class2'] = None
    return corpus
def fit(self,
        corpus: Corpus,
        class1_func: Callable[[Utterance], bool],
        class2_func: Callable[[Utterance], bool],
        y=None,
        selector: Callable[[Utterance], bool] = lambda utt: True):
    """
    Learn the fighting words from a corpus, with an optional selector that
    filters utterances prior to grouping them into class1 / class2.

    :param corpus: target Corpus
    :param class1_func: selector function for identifying utterances that belong to class 1
    :param class2_func: selector function for identifying utterances that belong to class 2
    :param selector: a (lambda) function that takes an Utterance and returns
        True/False; this selects the utterances that should be included in
        this fitting step
    :return: fitted FightingWords Transformer
    """
    class1, class2 = [], []
    for utt in corpus.iter_utterances(selector):
        if class1_func(utt):
            class1.append(utt)
        elif class2_func(utt):
            class2.append(utt)

    if len(class1) == 0:
        raise ValueError("class1_func returned 0 valid utterances.")
    if len(class2) == 0:
        raise ValueError("class2_func returned 0 valid utterances.")

    print("class1_func returned {} valid utterances. "
          "class2_func returned {} valid utterances.".format(len(class1), len(class2)))

    self.ngram_zscores = self._bayes_compare_language(class1, class2)
    print("ngram zscores computed.")
    return self
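# A minimal usage sketch for the fit()/transform() pair above. The corpus and
# the 'subreddit' metadata key are hypothetical placeholders; only the
# FightingWords name and the fit/transform signatures come from the code above.
fw = FightingWords()
fw.fit(corpus,
       class1_func=lambda utt: utt.meta['subreddit'] == 'atheism',
       class2_func=lambda utt: utt.meta['subreddit'] == 'christianity')
corpus = fw.transform(corpus)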
def summarize(self, corpus: Corpus, use_selector=True, exclude_na=True):
    """
    Returns a DataFrame of utterances and their forecasts (and forecast probabilities).

    :param corpus: target Corpus
    :param use_selector: whether to use the Forecaster's conversation and
        utterance selector functions
    :param exclude_na: whether to drop NaN results
    :return: a pandas DataFrame
    """
    utt_forecast_prob = []
    if use_selector:
        for convo in corpus.iter_conversations(self.convo_selector_func):
            for utt in convo.iter_utterances(self.utt_selector_func):
                utt_forecast_prob.append((utt.id,
                                          utt.meta[self.forecast_feat_name],
                                          utt.meta[self.forecast_prob_feat_name]))
    else:
        for utt in corpus.iter_utterances():
            utt_forecast_prob.append((utt.id,
                                      utt.meta[self.forecast_feat_name],
                                      utt.meta[self.forecast_prob_feat_name]))

    forecast_df = pd.DataFrame(utt_forecast_prob,
                               columns=["utt_id", self.forecast_feat_name,
                                        self.forecast_prob_feat_name]) \
        .set_index('utt_id').sort_values(self.forecast_prob_feat_name, ascending=False)
    if exclude_na:
        forecast_df = forecast_df.dropna()
    return forecast_df
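# Sketch: after transform(), the summary is just a sorted DataFrame, so the
# ten utterances with the highest forecast probability can be pulled with
# head(). `forecaster` is assumed to be the fitted instance from above.
top10 = forecaster.summarize(corpus).head(10)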
def fit(self,
        corpus: Corpus,
        text_func: Callable[[Utterance], List[str]] = None,
        selector: Callable[[Utterance], bool] = lambda utt: True):
    """
    Fits a model for each group of utterances in a corpus. The group that an
    utterance belongs to is determined by the `model_key_selector` parameter
    in the transformer's constructor.

    :param corpus: corpus to create models from.
    :param text_func: optional function defining how the text a model is
        trained on should be selected. Takes an utterance as input and
        returns a list of strings to train the model corresponding to that
        utterance on. The model corresponding to the utterance is determined
        by `self.model_key_selector`. For every utterance corresponding to
        the same model key, this function should return the same result. If
        `text_func` is `None`, a model will be trained on the text from all
        the utterances that belong to its group.
    :param selector: determines which utterances in the corpus to train models for.
    """
    self.model_groups = defaultdict(list)
    for utt in tqdm(corpus.iter_utterances(selector=selector), desc='fit1'):
        key = self.model_key_selector(utt)
        if text_func:
            # text_func returns the same result for every utterance with the
            # same key, so it only needs to be evaluated once per group
            if key not in self.model_groups:
                self.model_groups[key] = text_func(utt)
        else:
            self.model_groups[key].append(utt.text)

    for key in tqdm(self.model_groups, desc='fit2'):
        if not text_func:
            self.model_groups[key] = [' '.join(self.model_groups[key])]
        self.model_groups[key] = list(map(lambda x: self.tokenizer(x),
                                          self.model_groups[key]))
    return self
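# Sketch of fitting one language model per speaker with the fit() above. The
# `model_key_selector` constructor argument is named in the docstring; the
# Surprise class name and `utt.speaker.id` access are assumptions.
surp = Surprise(model_key_selector=lambda utt: utt.speaker.id)
surp.fit(corpus)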
def _preprocess_utterances(self, corpus: Corpus) -> Tuple[List[Hashable], List[Dict]]:
    """Convert each Utterance in the given Corpus into the representation
    expected by the politeness API. Assumes that the Corpus has already been
    parsed, so that each Utterance contains the `parsed` metadata entry.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    """
    # keep track of the order in which we process the utterances,
    # so we can join with the corpus at the end
    utt_ids = []
    documents = []
    for i, utterance in enumerate(corpus.iter_utterances()):
        if self.verbose and i > 0 and (i % self.verbose) == 0:
            print("\t%03d" % i)
        utt_ids.append(utterance.id)
        doc = {"text": utterance.text, "sentences": [], "parses": []}
        # the politeness API goes sentence-by-sentence
        for sent in utterance.meta["parsed"].sents:
            doc["sentences"].append(sent.text)
            sent_parses = []
            pos = sent.start
            for tok in sent:
                # the politeness API does not know how to handle punctuation in parses
                if tok.dep_ != "punct":
                    # format each dependency as "rel(head-i, child-j)" with
                    # 1-based, sentence-relative token indices
                    ele = "%s(%s-%d, %s-%d)" % (tok.dep_,
                                                tok.head.text, tok.head.i + 1 - pos,
                                                tok.text, tok.i + 1 - pos)
                    sent_parses.append(ele)
            doc["parses"].append(sent_parses)
        doc["unigrams"], doc["bigrams"] = get_unigrams_and_bigrams(doc)
        documents.append(doc)
    if self.verbose:
        print("Done!")
    return utt_ids, documents
def transform(self, corpus: Corpus):
    '''
    Compiles a list of all utterances by each user, organized by
    conversation; also annotates each user with summary statistics.

    :param corpus: the Corpus to transform.
    :type corpus: Corpus
    '''
    user_to_convo_utts = defaultdict(lambda: defaultdict(list))
    for utterance in corpus.iter_utterances():
        if not self.utterance_filter(utterance):
            continue
        user_to_convo_utts[utterance.user.name][utterance.root].append(
            (utterance.id, utterance.timestamp))

    for user, convo_utts in user_to_convo_utts.items():
        user_convos = {}
        for convo, utts in convo_utts.items():
            sorted_utts = sorted(utts, key=lambda x: x[1])
            user_convos[convo] = {
                'utterance_ids': [x[0] for x in sorted_utts],
                'start_time': sorted_utts[0][1],
                'n_utterances': len(sorted_utts)
            }
        corpus.get_user(user).add_meta('conversations', user_convos)

    for user in corpus.iter_users():
        if 'conversations' not in user.meta:
            continue
        user.add_meta('n_convos', len(user.meta['conversations']))
        sorted_convos = sorted(user.meta['conversations'].items(),
                               key=lambda x: x[1]['start_time'])
        user.add_meta('start_time', sorted_convos[0][1]['start_time'])
        # record each conversation's rank in the user's activity timeline
        for idx, (convo_id, _) in enumerate(sorted_convos):
            user.meta['conversations'][convo_id]['idx'] = idx
    return corpus
def test_no_overlap(self):
    """
    Basic merge: no overlap in utterance id
    """
    corpus1 = Corpus(utterances=[
        Utterance(id=0, text="hello world", user=User(name="alice")),
        Utterance(id=1, text="my name is bob", user=User(name="bob")),
        Utterance(id=2, text="this is a test", user=User(name="charlie")),
    ])
    corpus2 = Corpus(utterances=[
        Utterance(id=3, text="i like pie", user=User(name="delta")),
        Utterance(id=4, text="this is a sentence", user=User(name="echo")),
        Utterance(id=5, text="goodbye", user=User(name="foxtrot")),
    ])

    merged = corpus1.merge(corpus2)
    self.assertEqual(len(list(merged.iter_utterances())), 6)
    self.assertEqual(len(list(merged.iter_users())), 6)
    self.assertEqual(len(list(corpus1.iter_utterances())), 3)
    self.assertEqual(len(list(corpus2.iter_utterances())), 3)
def transform(self, corpus: Corpus):
    """Adds metadata about politicization to each utterance.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    """
    assert 'stem_tokens' in next(corpus.iter_utterances()).meta
    counter = 1
    for utt in corpus.iter_utterances():
        if utt.meta['valid']:
            utt.meta['analysis'] = lexicon.analyze(utt.text,
                                                   categories=self.categories)
            # binarize the lexicon scores: any nonzero value counts as a hit
            for k in utt.meta['analysis'].keys():
                if utt.meta['analysis'][k] != 0.0:
                    utt.meta['analysis'][k] = 1
        else:
            utt.meta['analysis'] = None
        counter = counter + 1
        if counter % 10000 == 0:
            print("processed", counter, "utterances")
    return corpus
def transform(self, corpus: Corpus):
    '''
    Tokenizes each utterance, and stores the tokens as a space-separated
    string entry in the utterance metadata.

    :param corpus: the Corpus to tokenize utterances for.
    :type corpus: Corpus
    '''
    for idx, utterance in enumerate(corpus.iter_utterances()):
        if self._print_output(idx):
            print(idx, 'utterances tokenized')
        utterance.add_meta('tokens',
                           ' '.join(nltk.word_tokenize(utterance.text)))
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    super().transform(corpus)
    if self.replace_text:
        selector = lambda utt_: self.input_filter(utt_, None)
        for utt in corpus.iter_utterances(selector):
            cleaned_text = utt.retrieve_meta(self.output_field)
            if self.save_original:
                # keep the pre-cleaning text under the output field
                utt.add_meta(self.output_field, utt.text)
            utt.text = cleaned_text
        if not self.save_original:
            corpus.delete_metadata('utterance', self.output_field)
    return corpus
def transform(self, corpus: Corpus):
    '''
    Computes wordcount per utterance.

    :param corpus: the Corpus to compute wordcount for.
    :type corpus: Corpus
    '''
    for utterance in corpus.iter_utterances():
        if self.use_tokenized:
            utterance.add_meta('wordcount',
                               len(utterance.meta['tokens'].split()))
        else:
            utterance.add_meta('wordcount', len(utterance.text.split()))
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Annotates the corpus utterances with the lists of fighting words that
    each utterance contains. The relevant fighting words to use are specified
    by FightingWords.top_k or FightingWords.threshold, with
    FightingWords.annot_method indicating which criterion to use.

    Lists are stored under the metadata keys 'fighting_words_class1' and
    'fighting_words_class2'.

    :param corpus: corpus to annotate
    :return: annotated corpus
    """
    class1_ngrams, class2_ngrams = self.get_top_k_ngrams() if self.annot_method == "top_k" \
        else self.get_ngrams_past_threshold()

    # TODO: improve the efficiency of this; tricky because of ngrams
    for utt in corpus.iter_utterances():
        utt.meta['fighting_words_class1'] = [ngram for ngram in class1_ngrams
                                             if ngram in utt.text]
        utt.meta['fighting_words_class2'] = [ngram for ngram in class2_ngrams
                                             if ngram in utt.text]
    return corpus
def _get_feat_df(self,
                 corpus: Corpus,
                 selector: Optional[Callable[[Utterance], bool]] = lambda utt: True):
    """
    Construct a binary feature DataFrame. Used in summarize().

    :param corpus: the target Corpus
    :param selector: (lambda) function specifying whether the utterance should be included
    """
    utts = list(corpus.iter_utterances(selector))
    if self.strategy_attribute_name not in utts[0].meta:
        print("Could not find politeness strategies metadata. "
              "Running transform() on corpus first...", end="")
        self.transform(corpus)
        print("Done.")

    # use the configured attribute name rather than a hardcoded metadata key
    df_feat = pd.DataFrame.from_dict(
        {utt.id: utt.meta[self.strategy_attribute_name] for utt in utts},
        orient='index')
    return df_feat
def transform(self, corpus: Corpus) -> Corpus:
    """
    Computes per-utterance attributes for each utterance in the Corpus,
    storing these values in the `output_field` of each utterance as specified
    in the constructor. For utterances which do not contain all of the
    `input_field` attributes as specified in the constructor, or for
    utterances which return `False` on `input_filter`, this call will not
    annotate the utterance.

    :param corpus: Corpus
    :return: the corpus
    """
    total_utts = len(corpus.utterances)

    for idx, utterance in enumerate(corpus.iter_utterances()):
        if self._print_output(idx):
            print('%03d/%03d utterances processed' % (idx, total_utts))
        if not self.input_filter(utterance, self.aux_input):
            continue

        # gather the input: the raw text, a single field, or a dict of fields
        if self.input_field is None:
            text_entry = utterance.text
        elif isinstance(self.input_field, str):
            text_entry = utterance.get_info(self.input_field)
        elif isinstance(self.input_field, list):
            text_entry = {field: utterance.get_info(field)
                          for field in self.input_field}
            if sum(x is None for x in text_entry.values()) > 0:
                text_entry = None
        if text_entry is None:
            continue

        if len(self.aux_input) == 0:
            result = self.proc_fn(text_entry)
        else:
            result = self.proc_fn(text_entry, self.aux_input)

        if self.multi_outputs:
            for res, out in zip(result, self.output_field):
                utterance.set_info(out, res)
        else:
            utterance.set_info(self.output_field, result)

    if self.verbosity > 0:
        print('%03d/%03d utterances processed' % (total_utts, total_utts))
    return corpus
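# Illustrative use of the generic transform above, assuming the constructor
# accepts proc_fn and output_field as the docstring suggests. The class name
# TextProcessor and the 'shout' field are placeholders, not part of the
# original code.
shout = TextProcessor(proc_fn=lambda text: text.upper(), output_field='shout')
corpus = shout.transform(corpus)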
def transform(self, corpus: Corpus):
    name_to_gender = self.genderDictionary(corpus)
    ps = PorterStemmer()
    for utt in corpus.iter_utterances():
        speaker_name = utt.user.name
        speaker_gender = utt.user.meta['sex']
        gender_is_female = speaker_gender == 'FEMALE'

        # does the utterance use any romantic vocabulary?
        contains_romantic = any(word.lower() in utt.text.lower()
                                for word in Genderromantic2.romantic_words)

        # attribute romantic language by speaker gender: a romantic utterance
        # by a male speaker counts as "male about female", and vice versa
        male_about_female = contains_romantic and not gender_is_female
        female_about_male = contains_romantic and gender_is_female

        utt.add_meta('female_about_male', female_about_male)
        utt.add_meta('male_about_female', male_about_female)
        utt.add_meta('contains_romantic', contains_romantic)
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Annotate the corpus utterances with forecast and forecast probability information.

    :param corpus: target Corpus
    :return: annotated Corpus
    """
    id_to_context_reply_label = self._get_context_reply_label_dict(
        corpus, include_label=False)
    forecast_df = self.forecaster_model.forecast(id_to_context_reply_label)

    for utt in corpus.iter_utterances():
        if utt.id in forecast_df.index:
            utt.add_meta(self.forecast_feat_name,
                         forecast_df.loc[utt.id][self.forecast_feat_name])
            utt.add_meta(self.forecast_prob_feat_name,
                         forecast_df.loc[utt.id][self.forecast_prob_feat_name])
        else:
            utt.add_meta(self.forecast_feat_name, None)
            utt.add_meta(self.forecast_prob_feat_name, None)
    return corpus
def transform(self,
              corpus: Corpus,
              selector: Optional[Callable[[Utterance], bool]] = lambda utt: True,
              markers: bool = False):
    """
    Extract politeness strategies from each utterance in the corpus and
    annotate the utterances with the extracted strategies. Requires that the
    corpus has previously been transformed by a Parser, such that each
    utterance has dependency parse info in its metadata table.

    :param corpus: the corpus to compute features for.
    :param selector: a (lambda) function that takes an Utterance and returns
        a bool indicating whether the utterance should be included in this
        annotation step.
    :param markers: whether or not to add politeness occurrence markers
    """
    total_utts = len(corpus.utterances)

    for idx, utt in enumerate(corpus.iter_utterances()):
        if self.verbose > 0 and idx > 0 and idx % self.verbose == 0:
            print('%03d/%03d utterances processed' % (idx, total_utts))
        if selector(utt):
            parsed = utt.retrieve_meta(self.parse_attribute_name)
            for i, sent in enumerate(parsed):
                for p in sent["toks"]:
                    # p["tok"] = re.sub("[^a-z,.:;]", "", p["tok"].lower())
                    p["tok"] = p['tok'].lower()
            parses = [x["toks"] for x in parsed]
            utt.meta[self.strategy_attribute_name], marks = \
                self._extractor_lookup[self.strategy_collection](parses)
            if markers:
                utt.meta[self.marker_attribute_name] = marks
        else:
            utt.meta[self.strategy_attribute_name] = None
            utt.meta[self.marker_attribute_name] = None
    return corpus
def fit(self, corpus: Corpus, y=None):
    """
    Learn the fighting words from a corpus.

    :param corpus: target Corpus
    :return: fitted Transformer
    """
    class1, class2 = [], []
    for utt in corpus.iter_utterances():
        if self.class1_selector(utt):
            class1.append(utt)
        elif self.class2_selector(utt):
            class2.append(utt)

    if len(class1) == 0:
        raise ValueError("class1_selector returned 0 valid utterances.")
    if len(class2) == 0:
        raise ValueError("class2_selector returned 0 valid utterances.")

    print("class1_selector returned {} valid utterances. "
          "class2_selector returned {} valid utterances.".format(len(class1), len(class2)))

    self.ngram_zscores = self._bayes_compare_language(class1, class2)
    print("ngram zscores computed.")
    return self
def transform(self,
              corpus: Corpus,
              obj_type: str,
              group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None,
              group_model_attr_key: Callable[[str, str], str] = None,
              selector: Callable[[CorpusComponent], bool] = lambda _: True,
              target_text_func: Callable[[Utterance], List[str]] = None):
    """
    Annotates `obj_type` components in a corpus with surprise scores. Should
    be called after fit().

    :param corpus: corpus to compute surprise for.
    :param obj_type: the type of corpus component to annotate. Should be
        'utterance', 'speaker', 'conversation', or 'corpus'.
    :param group_and_models: optional function that defines how an utterance
        should be grouped to form a target text and which models (contexts)
        the group should be compared to when calculating surprise. Takes in
        an utterance and returns a tuple containing the name of the group the
        utterance belongs to and a list of models to calculate how surprising
        that group is against. Objects will be annotated with a metadata
        field `self.surprise_attr_name` that maps a key corresponding to the
        `groupname` and `modelkey` to the surprise score for utterances in
        the group when compared to the model. The key used is defined by the
        `group_model_attr_key` parameter. If `group_and_models` is `None`,
        `self.model_key_selector` will be used to select the group that an
        utterance belongs to. The surprise score will be calculated for each
        group of utterances compared to the model in `self.model_groups`
        corresponding to the group.
    :param group_model_attr_key: optional function to define what key should
        be used for a given `groupname` and `modelkey`. If
        `group_model_attr_key` is `None`, the default key used will be
        "GROUP_groupname_MODEL_modelkey", unless `groupname` and `modelkey`
        are equal, in which case just "modelkey" will be used as the key.
    :param selector: function to select objects to annotate. If the function
        returns True, the object will be annotated.
    :param target_text_func: optional function to define what the target text
        corresponding to an utterance should be. Takes in an utterance and
        returns a list of string tokens.
    """
    if obj_type == 'corpus':
        utt_groups = defaultdict(list)
        group_models = defaultdict(set)
        for utt in corpus.iter_utterances():
            if group_and_models:
                group_name, models = group_and_models(utt)
            else:
                group_name = self.model_key_selector(utt)
                models = {group_name}
            if target_text_func:
                # target_text_func returns the same target for every
                # utterance in a group, so compute it only once per group
                if group_name not in utt_groups:
                    utt_groups[group_name] = [target_text_func(utt)]
            else:
                utt_groups[group_name].append(self.tokenizer(utt.text))
            group_models[group_name].update(models)

        surprise_scores = {}
        for group_name in tqdm(utt_groups, desc='transform'):
            for model_key in group_models[group_name]:
                context = self.model_groups[model_key]
                target = list(chain(*utt_groups[group_name]))
                surprise_scores[Surprise._format_attr_key(
                    group_name, model_key, group_model_attr_key)] = \
                    self._compute_surprise(target, context)
        corpus.add_meta(self.surprise_attr_name, surprise_scores)

    elif obj_type == 'utterance':
        for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'):
            if group_and_models:
                group_name, models = group_and_models(utt)
                surprise_scores = {}
                for model_key in models:
                    context = self.model_groups[model_key]
                    target = target_text_func(utt) if target_text_func \
                        else self.tokenizer(utt.text)
                    surprise_scores[Surprise._format_attr_key(
                        group_name, model_key, group_model_attr_key)] = \
                        self._compute_surprise(target, context)
                utt.add_meta(self.surprise_attr_name, surprise_scores)
            else:
                group_name = self.model_key_selector(utt)
                context = self.model_groups[group_name]
                target = target_text_func(utt) if target_text_func \
                    else self.tokenizer(utt.text)
                utt.add_meta(self.surprise_attr_name,
                             self._compute_surprise(target, context))

    else:
        for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'):
            utt_groups = defaultdict(list)
            group_models = defaultdict(set)
            for utt in obj.iter_utterances():
                if group_and_models:
                    group_name, models = group_and_models(utt)
                else:
                    group_name = self.model_key_selector(utt)
                    models = {group_name}
                if target_text_func:
                    if group_name not in utt_groups:
                        utt_groups[group_name] = [target_text_func(utt)]
                else:
                    utt_groups[group_name].append(self.tokenizer(utt.text))
                group_models[group_name].update(models)

            surprise_scores = {}
            for group_name in utt_groups:
                for model_key in group_models[group_name]:
                    assert model_key in self.model_groups, 'invalid model key'
                    if not self.model_groups[model_key]:
                        continue
                    context = self.model_groups[model_key]
                    target = list(chain(*utt_groups[group_name]))
                    surprise_scores[Surprise._format_attr_key(
                        group_name, model_key, group_model_attr_key)] = \
                        self._compute_surprise(target, context)
            obj.add_meta(self.surprise_attr_name, surprise_scores)
    return corpus
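# Sketch of annotating at the speaker level with the transform above, reusing
# the per-speaker models fitted earlier: with the default grouping, each
# speaker's utterances are scored against that speaker's own model. `surp` is
# assumed to be the fitted instance from the fit() sketch.
corpus = surp.transform(corpus, obj_type='speaker')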
def score(self, corpus: Corpus,
          speakers: Collection[Union[User, str]],
          group: Collection[Union[User, str]],
          focus: str = "speakers",
          speaker_thresh: int = 0,
          target_thresh: int = 3,
          utterances_thresh: int = 0,
          speaker_thresh_indiv: int = 0,
          target_thresh_indiv: int = 0,
          utterances_thresh_indiv: int = 0,
          utterance_thresh_func: Optional[Callable[[Tuple[Utterance, Utterance]], bool]] = None,
          split_by_attribs: Optional[List[str]] = None,
          speaker_attribs: Optional[Dict] = None,
          target_attribs: Optional[Dict] = None) -> CoordinationScore:
    """Computes the coordination scores for each speaker, given a set of
    speakers and a group of targets.

    :param corpus: Corpus to compute scores on
    :param speakers: A collection of user ids or user objects corresponding
        to the speakers we want to compute scores for.
    :param group: A collection of user ids or user objects corresponding to
        the group of targets.
    :param focus: Either "speakers" or "targets". If "speakers", treat the
        set of targets for a particular speaker as a single person (i.e.
        concatenate all of their utterances); the returned dictionary will
        have speakers as keys. If "targets", treat the set of speakers for a
        particular target as a single person; the returned dictionary will
        have targets as keys.
    :param speaker_thresh: Thresholds based on the minimum number of times
        the speaker uses each coordination marker.
    :param target_thresh: Thresholds based on the minimum number of times the
        target uses each coordination marker.
    :param utterances_thresh: Thresholds based on the minimum number of
        utterances for each speaker.
    :param speaker_thresh_indiv: Like `speaker_thresh` but only considers the
        utterances between a speaker and a single target; thresholds whether
        the utterances for a single target should be considered for a
        particular speaker.
    :param target_thresh_indiv: Like `target_thresh` but thresholds whether a
        single target's utterances should be considered for a particular speaker.
    :param utterances_thresh_indiv: Like `utterances_thresh` but thresholds
        whether a single target's utterances should be considered for a
        particular speaker.
    :param utterance_thresh_func: Optional utterance-level threshold function
        that takes in a speaker `Utterance` and the `Utterance` the speaker
        replied to, and returns a `bool` corresponding to whether or not to
        include the utterance in scoring.
    :param split_by_attribs: Utterance meta attributes to split users by when
        tallying coordination (e.g. in supreme court transcripts, you may
        want to treat the same lawyer as a different person across different
        cases --- see the coordination examples)
    :param speaker_attribs: attribute names and values the speaker must have
    :param target_attribs: attribute names and values the target must have

    :return: A :class:`CoordinationScore` object corresponding to the
        coordination scores for each speaker.
    """
    if corpus != self.corpus:
        raise Exception("Coordination: must fit and score on same corpus")
    if not self.precomputed:
        raise Exception("Must fit before calling score")

    if split_by_attribs is None:
        split_by_attribs = []
    if speaker_attribs is None:
        speaker_attribs = dict()
    if target_attribs is None:
        target_attribs = dict()

    speakers = set(speakers)
    group = set(group)

    # collect the utterances in which a speaker of interest replies to a
    # member of the target group
    utterances = []
    for utt in corpus.iter_utterances():
        speaker = utt.user
        if speaker in speakers and utt.reply_to is not None:
            reply_to = corpus.get_utterance(utt.reply_to)
            target = reply_to.user
            if target in group:
                utterances.append(utt)

    return self.scores_over_utterances(
        corpus, speakers, utterances,
        speaker_thresh, target_thresh, utterances_thresh,
        speaker_thresh_indiv, target_thresh_indiv, utterances_thresh_indiv,
        utterance_thresh_func, focus,
        split_by_attribs, speaker_attribs, target_attribs)