def summarize(self, corpus: Corpus, cv=LeaveOneOut()):
    """
    Fit the classifier on the paired objects in the corpus and report cross-validated accuracy.

    Objects are read with ``self.selector``; those whose pair orientation is None are skipped.
    Only pair ids that have both a 'pos' and a 'neg' object are used. The number of such
    pairs is printed, and ``self.clf`` is fitted on the full data before cross-validation runs.

    :param corpus: target Corpus (must be annotated with pair information using
        PairedPrediction.transform())
    :param cv: optional CV model: default is LOOCV
    :return: mean of the cross-validation scores
    :raises ValueError: if the first object of the corpus lacks any required pair metadata
    """
    # Peek at a single object to find out whether transform() has been run yet.
    first_obj = next(corpus.iter_objs(self.obj_type))
    missing_keys = {
        self.pair_orientation_feat_name,
        self.pair_id_feat_name,
        self.label_feat_name,
    }
    missing_keys.difference_update(set(first_obj.meta))
    if missing_keys:
        raise ValueError("Some metadata features required for paired prediction are missing: {}. "
                         "You may need to run transform() first.".format(missing_keys))

    # label -> {pair id -> object}
    objs_by_label = {'pos': dict(), 'neg': dict()}
    for obj in corpus.iter_objs(self.obj_type, self.selector):
        meta = obj.meta
        if meta[self.pair_orientation_feat_name] is not None:
            objs_by_label[meta[self.label_feat_name]][meta[self.pair_id_feat_name]] = obj

    # A pair is usable only when both of its sides were collected.
    complete_ids = set(objs_by_label['pos']) & set(objs_by_label['neg'])
    print("Found {} valid pairs.".format(len(complete_ids)))

    pair_id_to_objs = {
        pair_id: (objs_by_label['pos'][pair_id], objs_by_label['neg'][pair_id])
        for pair_id in complete_ids
    }

    X, y = self._generate_paired_X_y(pair_id_to_objs)
    self.clf.fit(X, y)
    fold_scores = cross_val_score(self.clf, X, y, cv=cv, error_score='raise')
    return np.mean(fold_scores)
def transform(self, corpus: Corpus, y=None,
              selector: Callable[[CorpusObject], bool] = lambda obj: True) -> Corpus:
    """
    Annotate corpus objects with scores and rankings.

    Every selected object is scored with ``self.score_func``; rank 1 goes to the highest
    score. The score is stored under ``self.score_feat_name`` and the rank (a plain ``int``)
    under ``self.rank_feat_name``. Objects that the selector rejects get None for both.

    :param corpus: target corpus
    :param y: unused by this method
    :param selector: (lambda) function taking in a Corpus object and returning True / False; selects for Corpus
        objects to annotate.
    :return: annotated corpus
    """
    obj_scores = [(obj.id, self.score_func(obj))
                  for obj in corpus.iter_objs(self.obj_type, selector)]
    df = pd.DataFrame(obj_scores, columns=["id", self.score_feat_name]) \
        .set_index('id').sort_values(self.score_feat_name, ascending=False)

    # Read values per column instead of per row: a row Series holding a float score and an
    # integer rank is upcast to a single float dtype, which turned ranks into floats.
    score_by_id = df[self.score_feat_name].to_dict()
    rank_by_id = {obj_id: rank for rank, obj_id in enumerate(df.index, start=1)}

    for obj in corpus.iter_objs(obj_type=self.obj_type):
        # .get() yields None for objects that were not selected for scoring.
        obj.add_meta(self.score_feat_name, score_by_id.get(obj.id))
        obj.add_meta(self.rank_feat_name, rank_by_id.get(obj.id))
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Annotate corpus objects with pair information (label, pair_id, pair_orientation)

    Objects that end up in a pair get the label "pos" or "neg", their pair id and the pair's
    orientation. Every other object of ``self.obj_type`` gets None for all three features,
    including objects that still carry pair annotations from an earlier run.

    :param corpus: target Corpus
    :return: annotated Corpus
    """
    pos_objs, neg_objs = self._get_pos_neg_objects(corpus)
    obj_pairs = self._pair_objs(pos_objs, neg_objs)
    pair_orientations = self._assign_pair_orientations(obj_pairs)

    paired_obj_ids = set()
    for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
        orientation = pair_orientations[pair_id]
        for obj, label in ((pos_obj, "pos"), (neg_obj, "neg")):
            obj.add_meta(self.label_feat_name, label)
            obj.add_meta(self.pair_id_feat_name, pair_id)
            obj.add_meta(self.pair_orientation_feat_name, orientation)
            paired_obj_ids.add(obj.id)

    for obj in corpus.iter_objs(self.obj_type):
        # unlabelled objects include both objects that did not pass the selector
        # and objects that were not selected in the pairing step. Membership in this run's
        # pairs is checked (rather than presence of the label key) so that annotations left
        # by a previous transform() call are cleared instead of being kept.
        if obj.id not in paired_obj_ids:
            obj.add_meta(self.label_feat_name, None)
            obj.add_meta(self.pair_id_feat_name, None)
            obj.add_meta(self.pair_orientation_feat_name, None)

    return corpus
def extract_feats_dict(corpus: Corpus, obj_type: str, pred_feats: List[str],
                       selector: Callable[[CorpusObject], bool] = lambda x: True):
    """
    Build a mapping from object id to that object's predictive features.

    :param corpus: target corpus
    :param obj_type: Corpus object type
    :param pred_feats: list of features to extract metadata for
    :param selector: function to select for Corpus objects to extract features from
    :return: dictionary mapping object id to the result of extract_feats_from_obj for that object
    """
    feats_by_id = {}
    for obj in corpus.iter_objs(obj_type, selector):
        feats_by_id[obj.id] = extract_feats_from_obj(obj, pred_feats)
    return feats_by_id
def _get_pos_neg_objects(self, corpus: Corpus):
    """
    Get positively-labelled and negatively-labelled lists of objects

    Only objects passing ``self.selector`` are considered. An object for which
    ``self.pos_label_func`` is truthy counts as positive; otherwise it counts as negative if
    ``self.neg_label_func`` is truthy; objects matching neither are left out.

    :param corpus: target Corpus
    :return: list of positive objects, list of negative objects
    """
    pos_objects = []
    neg_objects = []
    # The selector is handed to iter_objs, which does the filtering; it is not re-applied
    # here so that it is evaluated only once per object.
    for obj in corpus.iter_objs(self.obj_type, self.selector):
        if self.pos_label_func(obj):
            pos_objects.append(obj)
        elif self.neg_label_func(obj):
            neg_objects.append(obj)
    return pos_objects, neg_objects
def extract_label_dict(corpus: Corpus, obj_type: str, labeller: Callable[[CorpusObject], bool],
                       selector: Callable[[CorpusObject], bool] = lambda x: True):
    """
    Map each selected Corpus object's id to a one-entry label dictionary.

    :param corpus: target corpus
    :param obj_type: Corpus object type
    :param labeller: function that takes a Corpus object as input and outputs its label
    :param selector: function to select for Corpus objects to extract features from
    :return: dictionary mapping Corpus object id to {'y': 1} when the labeller's output is
        truthy and {'y': 0} otherwise
    """
    return {
        obj.id: {'y': 1 if labeller(obj) else 0}
        for obj in corpus.iter_objs(obj_type, selector)
    }
def transform(self, corpus: Corpus, selector: Callable[[CorpusComponent], bool] = lambda x: True,
              config=None) -> Corpus:
    """
    Annotate corpus components with the fighting words found in their text.

    For every component of ``self.obj_type`` that passes the selector, two lists are stored in
    its metadata (under ``self.class1_attribute_name`` and ``self.class2_attribute_name``):
    the class 1 / class 2 ngrams for which ``ngram in self.text_func(obj)`` holds. Components
    rejected by the selector get None under both attributes.

    :param corpus: corpus to annotate
    :param selector: a (lambda) function that takes a CorpusComponent and returns True/False; this selects for
        corpus components that should be annotated with the fighting words
    :param config: a dictionary of configuration parameters for setting which fighting words are significant
        enough to annotate. The dictionary should hold the keys: annot_method ('top_k' or 'threshold'), and
        either 'threshold' (a float for the min absolute z-score to be considered significant) or 'top_k'
        (an int to set the value of k). By default, config is {'annot_method': 'top_k', 'top_k': 10}.
        Any annot_method other than 'top_k' is handled as 'threshold'.
    :return: annotated corpus
    """
    if config is None:
        config = {'top_k': 10, 'annot_method': 'top_k'}

    if config['annot_method'] == "top_k":
        class1_ngrams, class2_ngrams = self.get_top_k_ngrams(top_k=config['top_k'])
    else:
        class1_ngrams, class2_ngrams = self.get_ngrams_past_threshold(threshold=config['threshold'])

    # TODO: improve the efficiency of this; tricky because ngrams
    for obj in corpus.iter_objs(self.obj_type):
        if not selector(obj):
            obj.meta[self.class1_attribute_name] = None
            obj.meta[self.class2_attribute_name] = None
            continue
        text = self.text_func(obj)
        obj.meta[self.class1_attribute_name] = [gram for gram in class1_ngrams if gram in text]
        obj.meta[self.class2_attribute_name] = [gram for gram in class2_ngrams if gram in text]

    return corpus
def fit(self, corpus: Corpus, class1_func: Callable[[CorpusComponent], bool],
        class2_func: Callable[[CorpusComponent], bool], y=None,
        selector: Callable[[CorpusComponent], bool] = lambda utt: True):
    """
    Learn the fighting words from a corpus.

    Components passing the selector are split into two classes: class 1 if ``class1_func`` is
    truthy, otherwise class 2 if ``class2_func`` is truthy (class 1 wins when both match).
    The class sizes are printed and the result of comparing the two classes is stored in
    ``self.ngram_zscores``.

    :param corpus: target Corpus
    :param class1_func: selector function for identifying corpus components that belong to class 1
    :param class2_func: selector function for identifying corpus components that belong to class 2
    :param y: unused by this method
    :param selector: a (lambda) function that takes a CorpusComponent and returns True/False; this selects for
        Corpus components that should be considered in this fitting step
    :return: fitted FightingWords Transformer
    :raises ValueError: if either class ends up with no corpus components
    """
    first_class, second_class = [], []
    for component in corpus.iter_objs(self.obj_type, selector):
        if class1_func(component):
            first_class.append(component)
            continue
        if class2_func(component):
            second_class.append(component)

    if not first_class:
        raise ValueError("class1_func returned 0 valid corpus components.")
    if not second_class:
        raise ValueError("class2_func returned 0 valid corpus components.")

    summary = ("class1_func returned {} valid corpus components. "
               "class2_func returned {} valid corpus components.")
    print(summary.format(len(first_class), len(second_class)))

    self.ngram_zscores = self._bayes_compare_language(first_class, second_class)
    print("ngram zscores computed.")
    return self
def transform(self, corpus: Corpus, obj_type: str,
              group_and_models: Callable[[Utterance], Tuple[str, List[str]]] = None,
              group_model_attr_key: Callable[[str, str], str] = None,
              selector: Callable[[CorpusComponent], bool] = lambda _: True,
              target_text_func: Callable[[Utterance], List[str]] = None):
    """
    Annotates `obj_type` components in a corpus with surprise scores. Should be called after fit().

    :param corpus: corpus to compute surprise for.
    :param obj_type: the type of corpus components to annotate. Should be either 'utterance', 'speaker',
        'conversation', or 'corpus'.
    :param group_and_models: optional function that defines how an utterance should be grouped to form a
        target text and what models (contexts) the group should be compared to when calculating surprise.
        Takes in an utterance and returns a tuple containing the name of the group the utterance belongs to
        and a list of models to calculate how surprising that group is against. Objects will be annotated
        with a metadata field `self.surprise_attr_name` that maps a key corresponding to the `groupname`
        and `modelkey` to the surprise score for utterances in the group when compared to the model. The
        key used is defined by the `group_model_attr_key` parameter.
        If `group_and_models` is `None`, `self.model_key_selector` will be used to select the group that an
        utterance belongs to. The surprise score will be calculated for each group of utterances compared to
        the model in `self.models` corresponding to the group.
    :param group_model_attr_key: optional function to define what key should be used for a given `groupname`
        and `modelkey`. If `group_model_attr_key` is `None`, the default key used will be
        "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal in which case just
        "modelkey" will be used as the key.
    :param selector: function to select objects to annotate. if function returns true, object will be
        annotated. Not applied when `obj_type` is 'corpus'.
    :param target_text_func: optional function to define what the target text corresponding to an utterance
        should be. takes in an utterance and returns a list of string tokens. When utterances are grouped
        (any `obj_type` other than 'utterance'), only the first utterance seen for a group contributes its
        target text.
    :return: the annotated corpus
    """

    def _target_tokens(utt):
        # A caller-supplied target text takes precedence over tokenizing the raw text.
        if target_text_func:
            return target_text_func(utt)
        return self.tokenizer(utt.text)

    def _group_utterances(utterances):
        # Returns (group name -> list of token lists, group name -> set of model keys).
        texts_by_group = defaultdict(list)
        models_by_group = defaultdict(set)
        for utt in utterances:
            if group_and_models:
                group_name, models = group_and_models(utt)
            else:
                group_name = self.model_key_selector(utt)
                models = {group_name}
            if not target_text_func:
                texts_by_group[group_name].append(self.tokenizer(utt.text))
            elif group_name not in texts_by_group:
                # Only the first utterance of a group supplies the target text.
                texts_by_group[group_name] = [target_text_func(utt)]
            models_by_group[group_name].update(models)
        return texts_by_group, models_by_group

    if obj_type == 'corpus':
        texts_by_group, models_by_group = _group_utterances(corpus.iter_utterances())
        corpus_scores = {}
        for group_name in tqdm(texts_by_group, desc='transform'):
            for model_key in models_by_group[group_name]:
                context = self.model_groups[model_key]
                target = list(chain.from_iterable(texts_by_group[group_name]))
                score = self._compute_surprise(target, context)
                attr_key = Surprise._format_attr_key(group_name, model_key, group_model_attr_key)
                corpus_scores[attr_key] = score
        corpus.add_meta(self.surprise_attr_name, corpus_scores)
    elif obj_type == 'utterance':
        for utt in tqdm(corpus.iter_utterances(selector=selector), desc='transform'):
            if not group_and_models:
                # No grouping function: the utterance gets a single score, not a dictionary.
                group_name = self.model_key_selector(utt)
                context = self.model_groups[group_name]
                utt.add_meta(self.surprise_attr_name,
                             self._compute_surprise(_target_tokens(utt), context))
                continue
            group_name, models = group_and_models(utt)
            utt_scores = {}
            for model_key in models:
                context = self.model_groups[model_key]
                score = self._compute_surprise(_target_tokens(utt), context)
                attr_key = Surprise._format_attr_key(group_name, model_key, group_model_attr_key)
                utt_scores[attr_key] = score
            utt.add_meta(self.surprise_attr_name, utt_scores)
    else:
        for obj in tqdm(corpus.iter_objs(obj_type, selector=selector), desc='transform'):
            texts_by_group, models_by_group = _group_utterances(obj.iter_utterances())
            obj_scores = {}
            for group_name in texts_by_group:
                for model_key in models_by_group[group_name]:
                    assert (model_key in self.model_groups), 'invalid model key'
                    context = self.model_groups[model_key]
                    if not context:
                        # Nothing to compare against for an empty model; leave this key out.
                        continue
                    target = list(chain.from_iterable(texts_by_group[group_name]))
                    score = self._compute_surprise(target, context)
                    attr_key = Surprise._format_attr_key(group_name, model_key, group_model_attr_key)
                    obj_scores[attr_key] = score
            obj.add_meta(self.surprise_attr_name, obj_scores)
    return corpus