    def summarize(self, corpus: Corpus, cv=LeaveOneOut()):
        """
        Run PairedPrediction on the corpus with cross-validation
        :param corpus: target Corpus (must be annotated with pair information using PairedPrediction.transform())
        :param cv: optional CV model: default is LOOCV
        :return: cross-validation accuracy score
        """
        # Check if transform() needs to be run first
        sample_obj = next(corpus.iter_objs(self.obj_type))
        meta_keys = set(sample_obj.meta)
        required_keys = {self.pair_orientation_feat_name, self.pair_id_feat_name, self.label_feat_name}
        missing_keys = required_keys - meta_keys
        if len(missing_keys) > 0:
            raise ValueError("Some metadata features required for paired prediction are missing: {}. "
                             "You may need to run transform() first.".format(missing_keys))

        pair_id_to_obj = {'pos': dict(), 'neg': dict()}
        for obj in corpus.iter_objs(self.obj_type, self.selector):
            if obj.meta[self.pair_orientation_feat_name] is None: continue
            pair_id_to_obj[obj.meta[self.label_feat_name]][obj.meta[self.pair_id_feat_name]] = obj

        pair_ids = set(pair_id_to_obj['pos'].keys()).intersection(set(pair_id_to_obj['neg'].keys()))

        print("Found {} valid pairs.".format(len(pair_ids)))
        pair_id_to_objs = dict()
        for pair_id in pair_ids:
            pair_id_to_objs[pair_id] = (pair_id_to_obj['pos'][pair_id], pair_id_to_obj['neg'][pair_id])

        X, y = self._generate_paired_X_y(pair_id_to_objs)
        self.clf.fit(X, y)
        return np.mean(cross_val_score(self.clf, X, y, cv=cv, error_score='raise'))
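# --- Hedged usage sketch (not part of the original source) ---------------------
# summarize() assumes transform() has already annotated the corpus with pair
# metadata. The constructor arguments, predictive feature name, and corpus name
# below are illustrative and may differ across ConvoKit versions.
from convokit import Corpus, PairedPrediction, download

corpus = Corpus(filename=download("conversations-gone-awry-cmv-corpus"))
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["num_comments"])  # hypothetical predictive metadata field
pp.transform(corpus)             # annotates label / pair_id / pair_orientation metadata
accuracy = pp.summarize(corpus)  # mean leave-one-out cross-validation accuracy
print(accuracy)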
Example #2
    def transform(
            self,
            corpus: Corpus,
            y=None,
            selector: Callable[[CorpusObject],
                               bool] = lambda obj: True) -> Corpus:
        """
        Annotate corpus objects with scores and rankings.

        :param corpus: target corpus
        :param selector: (lambda) function taking in a Corpus object and returning True / False; selects for Corpus objects to annotate.
        :return: annotated corpus
        """
        obj_iters = {
            "conversation": corpus.iter_conversations,
            "user": corpus.iter_users,
            "utterance": corpus.iter_utterances
        }
        obj_scores = [(obj.id, self.score_func(obj))
                      for obj in obj_iters[self.obj_type](selector)]
        df = pd.DataFrame(obj_scores, columns=["id", self.score_feat_name]) \
            .set_index('id').sort_values(self.score_feat_name, ascending=False)
        df[self.rank_feat_name] = [idx + 1 for idx, _ in enumerate(df.index)]

        for obj in corpus.iter_objs(obj_type=self.obj_type):
            if obj.id in df.index:
                obj.add_meta(self.score_feat_name,
                             df.loc[obj.id][self.score_feat_name])
                obj.add_meta(self.rank_feat_name,
                             df.loc[obj.id][self.rank_feat_name])
            else:
                obj.add_meta(self.score_feat_name, None)
                obj.add_meta(self.rank_feat_name, None)
        return corpus
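# --- Hedged usage sketch (not part of the original source) ---------------------
# Ranking users by how many utterances they contributed. The constructor
# arguments are illustrative and may differ across ConvoKit versions; the
# score/rank metadata names correspond to self.score_feat_name /
# self.rank_feat_name used above, and the score_func assumes the object
# exposes iter_utterances().
from convokit import Corpus, Ranker, download

corpus = Corpus(filename=download("subreddit-Cornell"))
ranker = Ranker(obj_type="user",
                score_func=lambda user: len(list(user.iter_utterances())))
corpus = ranker.transform(corpus)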
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Annotate corpus objects with pair information (label, pair_id, pair_orientation)
        :param corpus: target Corpus
        :return: annotated Corpus
        """
        pos_objs, neg_objs = self._get_pos_neg_objects(corpus)
        obj_pairs = self._pair_objs(pos_objs, neg_objs)
        pair_orientations = self._assign_pair_orientations(obj_pairs)

        for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
            pos_obj.add_meta(self.label_feat_name, "pos")
            neg_obj.add_meta(self.label_feat_name, "neg")
            pos_obj.add_meta(self.pair_id_feat_name, pair_id)
            neg_obj.add_meta(self.pair_id_feat_name, pair_id)
            pos_obj.add_meta(self.pair_orientation_feat_name, pair_orientations[pair_id])
            neg_obj.add_meta(self.pair_orientation_feat_name, pair_orientations[pair_id])

        for obj in corpus.iter_objs(self.obj_type):
            # unlabelled objects include both objects that did not pass the selector
            # and objects that were not selected in the pairing step
            if self.label_feat_name not in obj.meta:
                obj.add_meta(self.label_feat_name, None)
                obj.add_meta(self.pair_id_feat_name, None)
                obj.add_meta(self.pair_orientation_feat_name, None)

        return corpus
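# --- Hedged sketch (not part of the original source) ---------------------------
# After transform(), every paired object carries label / pair_id / pair_orientation
# metadata and unpaired objects carry None. `pp` refers to the PairedPrediction
# instance from the sketch above; attribute names mirror the fields used in transform().
paired = [obj for obj in corpus.iter_objs(pp.obj_type)
          if obj.meta[pp.pair_id_feat_name] is not None]
print("{} objects were assigned to a pair.".format(len(paired)))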
Example #4
def extract_feats_dict(corpus: Corpus, obj_type: str, pred_feats: List[str],
                       selector: Callable[[CorpusObject], bool] = lambda x: True):
    """
    Extract features dictionary from a corpus
    :param corpus: target corpus
    :param obj_type: Corpus object type
    :param pred_feats: list of features to extract metadata for
    :param selector: function to select for Corpus objects to extract features from
    :return: dictionary mapping object id to a dictionary of predictive features
    """
    obj_id_to_feats = {obj.id: extract_feats_from_obj(obj, pred_feats) for obj in corpus.iter_objs(obj_type, selector)}

    return obj_id_to_feats
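# --- Hedged usage sketch (not part of the original source) ---------------------
# Pulling a couple of metadata fields for every utterance. Assumes `corpus` is an
# existing ConvoKit Corpus; "score" and "top_level_comment" are hypothetical
# metadata keys to be replaced with fields present in your corpus.
utt_feats = extract_feats_dict(
    corpus,
    obj_type="utterance",
    pred_feats=["score", "top_level_comment"],
    selector=lambda utt: utt.text is not None,
)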
    def _get_pos_neg_objects(self, corpus: Corpus):
        """
        Get positively-labelled and negatively-labelled lists of objects
        :param corpus: target Corpus
        :return: list of positive objects, list of negative objects
        """
        pos_objects = []
        neg_objects = []
        for obj in corpus.iter_objs(self.obj_type, self.selector):
            if self.pos_label_func(obj):
                pos_objects.append(obj)
            elif self.neg_label_func(obj):
                neg_objects.append(obj)
        return pos_objects, neg_objects
Example #6
def extract_label_dict(corpus: Corpus, obj_type: str, labeller: Callable[[CorpusObject], bool],
                       selector: Callable[[CorpusObject], bool] = lambda x: True):
    """
    Generate dictionary mapping Corpus object id to label from corpus
    :param corpus: target corpus
    :param obj_type: Corpus object type
    :param labeller: function that takes a Corpus object as input and outputs its label
    :param selector: function to select for Corpus objects to extract features from
    :return: dictionary mapping Corpus object id to label
    """
    obj_id_to_label = dict()
    for obj in corpus.iter_objs(obj_type, selector):
        obj_id_to_label[obj.id] = {'y': 1} if labeller(obj) else {'y': 0}

    return obj_id_to_label
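# --- Hedged usage sketch (not part of the original source) ---------------------
# Labelling conversations by a hypothetical metadata flag. Assumes `corpus` is an
# existing ConvoKit Corpus and "has_removed_comment" is a metadata key in it.
labels = extract_label_dict(
    corpus,
    obj_type="conversation",
    labeller=lambda convo: bool(convo.meta.get("has_removed_comment", False)),
    selector=lambda convo: len(convo.get_utterance_ids()) > 1,
)
# labels maps conversation id -> {'y': 1} or {'y': 0}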
    def transform(self,
                  corpus: Corpus,
                  selector: Callable[[CorpusComponent], bool] = lambda x: True,
                  config=None) -> Corpus:
        """
        Annotates the corpus component objects with the lists of fighting words that the object contains.

        The relevant fighting words to use are specified by the config parameter. By default, the annotation method
        is to annotate the corpus components with the top 10 fighting words of each class.

        Lists are stored under the metadata attributes defined when initializing the FightingWords Transformer.

        :param corpus: corpus to annotate
        :param selector: a (lambda) function that takes a CorpusComponent and returns True/False; this selects for
            corpus components that should be annotated with the fighting words
        :param config: a dictionary of configuration parameters for setting which fighting words are significant enough
            to annotate. The dictionary should hold the keys: annot_method ('top_k' or 'threshold'), and either
            'threshold' (a float for the min absolute z-score to be considered significant) or 'top_k' (an int to set
            the value of k). By default, config is {'annot_method': 'top_k', 'top_k': 10}.

        :return: annotated corpus
        """
        config = {
            'top_k': 10,
            'annot_method': 'top_k'
        } if config is None else config

        class1_ngrams, class2_ngrams = self.get_top_k_ngrams(top_k=config['top_k']) if \
            config['annot_method'] == "top_k" else self.get_ngrams_past_threshold(threshold=config['threshold'])

        # TODO: improve the efficiency of this; tricky because of ngrams
        for obj in corpus.iter_objs(self.obj_type):
            if selector(obj):
                obj_text = self.text_func(obj)
                obj.meta[self.class1_attribute_name] = [
                    ngram for ngram in class1_ngrams if ngram in obj_text
                ]
                obj.meta[self.class2_attribute_name] = [
                    ngram for ngram in class2_ngrams if ngram in obj_text
                ]
            else:
                obj.meta[self.class1_attribute_name] = None
                obj.meta[self.class2_attribute_name] = None

        return corpus
    def fit(self,
            corpus: Corpus,
            class1_func: Callable[[CorpusComponent], bool],
            class2_func: Callable[[CorpusComponent], bool],
            y=None,
            selector: Callable[[CorpusComponent], bool] = lambda utt: True):
        """
        Learn the fighting words from a corpus, with an optional selector that selects for corpus components prior to
            grouping the corpus components into class1 / class2.

        :param corpus: target Corpus
        :param class1_func: selector function for identifying corpus components that belong to class 1
        :param class2_func: selector function for identifying corpus components that belong to class 2
        :param selector: a (lambda) function that takes a CorpusComponent and returns True/False; this selects for
            Corpus components that should be considered in this fitting step
        :return: fitted FightingWords Transformer

        """
        class1, class2 = [], []
        for obj in corpus.iter_objs(self.obj_type, selector):
            if class1_func(obj):
                class1.append(obj)
            elif class2_func(obj):
                class2.append(obj)

        if len(class1) == 0:
            raise ValueError("class1_func returned 0 valid corpus components.")
        if len(class2) == 0:
            raise ValueError("class2_func returned 0 valid corpus components.")

        print("class1_func returned {} valid corpus components. "
              "class2_func returned {} valid corpus components.".format(
                  len(class1), len(class2)))

        self.ngram_zscores = self._bayes_compare_language(class1, class2)
        print("ngram zscores computed.")
        return self
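# --- Hedged usage sketch (not part of the original source) ---------------------
# Fitting fighting words on two hypothetical utterance classes, then annotating
# the corpus with the config variants described in transform()'s docstring. The
# constructor arguments and the "success" metadata field are illustrative and
# may differ across ConvoKit versions.
from convokit import Corpus, FightingWords, download

corpus = Corpus(filename=download("subreddit-Cornell"))
fw = FightingWords(obj_type="utterance")
fw.fit(corpus,
       class1_func=lambda utt: utt.meta.get("success") == 1,   # hypothetical label field
       class2_func=lambda utt: utt.meta.get("success") == 0)
corpus = fw.transform(corpus)  # default: annotate with the top 10 ngrams per class
corpus = fw.transform(corpus,
                      config={'annot_method': 'threshold', 'threshold': 2.0})  # only ngrams with |z-score| >= 2.0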
    def transform(self,
                  corpus: Corpus,
                  obj_type: str,
                  group_and_models: Callable[[Utterance],
                                             Tuple[str, List[str]]] = None,
                  group_model_attr_key: Callable[[str, str], str] = None,
                  selector: Callable[[CorpusComponent], bool] = lambda _: True,
                  target_text_func: Callable[[Utterance], List[str]] = None):
        """
    Annotates `obj_type` components in a corpus with surprise scores. Should be 
    called after fit().

    :param corpus: corpus to compute surprise for.
    :param obj_type: the type of corpus components to annotate. Should be either 
        'utterance', 'speaker', 'conversation', or 'corpus'. 
    :param group_and_models: optional function that defines how an utterance should 
        be grouped to form a target text and what models (contexts) the group should 
        be compared to when calculating surprise. Takes in an utterance and returns 
        a tuple containing the name of the group the utterance belongs to and a 
        list of models to calculate how surprising that group is against. Objects 
        will be annotated with a metadata field `self.surprise_attr_name` that is 
        maps a key corresponding to the `groupname` and `modelkey` to the surprise 
        score for utterances in the group when compared to the model. The key used 
        is defined by the `group_model_attr_key` parameter.
        If `group_and_models` is `None`, `self.model_key_selector` will be used 
        to select the group that an utterance belongs to. The surprise score will 
        be calculated for each group of utterances compared to the model in 
        `self.models` corresponding to the group.
    :param group_model_attr_key: optional function to define what key should be used 
        for a given `groupname` and `modelkey`. 
        If `group_model_attr_key` is `None`, the default key used will be 
        "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal 
        in which case just "modelkey" will be used as the key.
    :param selector: function to select objects to annotate. if function returns true, object will be annotated.
    :param target_text_func: optional function to define what the target text corresponding to an utterance should be. 
        takes in an utterance and returns a list of string tokens
    """
        if obj_type == 'corpus':
            utt_groups = defaultdict(list)
            group_models = defaultdict(set)
            for utt in corpus.iter_utterances():
                if group_and_models:
                    group_name, models = group_and_models(utt)
                else:
                    group_name = self.model_key_selector(utt)
                    models = {group_name}
                if target_text_func:
                    if group_name not in utt_groups:
                        utt_groups[group_name] = [target_text_func(utt)]
                else:
                    utt_groups[group_name].append(self.tokenizer(utt.text))
                group_models[group_name].update(models)
            surprise_scores = {}
            for group_name in tqdm(utt_groups, desc='transform'):
                for model_key in group_models[group_name]:
                    context = self.model_groups[model_key]
                    target = list(chain(*utt_groups[group_name]))
                    surprise_scores[Surprise._format_attr_key(
                        group_name, model_key,
                        group_model_attr_key)] = self._compute_surprise(
                            target, context)
            corpus.add_meta(self.surprise_attr_name, surprise_scores)
        elif obj_type == 'utterance':
            for utt in tqdm(corpus.iter_utterances(selector=selector),
                            desc='transform'):
                if group_and_models:
                    group_name, models = group_and_models(utt)
                    surprise_scores = {}
                    for model_key in models:
                        context = self.model_groups[model_key]
                        target = target_text_func(
                            utt) if target_text_func else self.tokenizer(
                                utt.text)
                        surprise_scores[Surprise._format_attr_key(
                            group_name, model_key,
                            group_model_attr_key)] = self._compute_surprise(
                                target, context)
                    utt.add_meta(self.surprise_attr_name, surprise_scores)
                else:
                    group_name = self.model_key_selector(utt)
                    context = self.model_groups[group_name]
                    target = target_text_func(
                        utt) if target_text_func else self.tokenizer(utt.text)
                    utt.add_meta(self.surprise_attr_name,
                                 self._compute_surprise(target, context))
        else:
            for obj in tqdm(corpus.iter_objs(obj_type, selector=selector),
                            desc='transform'):
                utt_groups = defaultdict(list)
                group_models = defaultdict(set)
                for utt in obj.iter_utterances():
                    if group_and_models:
                        group_name, models = group_and_models(utt)
                    else:
                        group_name = self.model_key_selector(utt)
                        models = {group_name}
                    if target_text_func:
                        if group_name not in utt_groups:
                            utt_groups[group_name] = [target_text_func(utt)]
                    else:
                        utt_groups[group_name].append(self.tokenizer(utt.text))
                    group_models[group_name].update(models)
                surprise_scores = {}
                for group_name in utt_groups:
                    for model_key in group_models[group_name]:
                        assert (model_key
                                in self.model_groups), 'invalid model key'
                        if not self.model_groups[model_key]: continue
                        context = self.model_groups[model_key]
                        target = list(chain(*utt_groups[group_name]))
                        surprise_scores[Surprise._format_attr_key(
                            group_name, model_key,
                            group_model_attr_key)] = self._compute_surprise(
                                target, context)
                obj.add_meta(self.surprise_attr_name, surprise_scores)
        return corpus
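# --- Hedged usage sketch (not part of the original source) ---------------------
# Scoring how surprising each speaker's language is relative to a per-speaker
# model. Constructor and fit arguments are illustrative and may differ across
# ConvoKit versions.
from convokit import Corpus, Surprise, download

corpus = Corpus(filename=download("subreddit-Cornell"))
surp = Surprise(model_key_selector=lambda utt: utt.speaker.id)
surp.fit(corpus)
corpus = surp.transform(corpus, obj_type="speaker")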