Example #1
    def test_with_overlap(self):
        """
        Basic merge: overlap in utterance ids, where the overlapping utterance has the same data and metadata
        """
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
            Utterance(id="1", text="my name is bob", speaker=Speaker(
                id="bob")),
            Utterance(
                id="2", text="this is a test", speaker=Speaker(id="charlie")),
        ])

        corpus2 = Corpus(utterances=[
            Utterance(
                id="2", text="this is a test", speaker=Speaker(id="charlie")),
            Utterance(
                id="4", text="this is a sentence", speaker=Speaker(id="echo")),
            Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
        ])

        merged = corpus1.merge(corpus2)
        self.assertEqual(len(list(merged.iter_utterances())), 5)
        self.assertEqual(len(list(merged.iter_speakers())), 5)
        self.assertEqual(len(list(corpus1.iter_utterances())), 3)
        self.assertEqual(len(list(corpus2.iter_utterances())), 3)
Example #2
    def transform(self, corpus: Corpus):
        """Adds metadata about politicization to each utterance.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        """
        assert 'stem_tokens' in next(corpus.iter_utterances()).meta, \
            "expected 'stem_tokens' in utterance metadata"
        for utt in corpus.iter_utterances():
            if utt.meta['valid']:
                utt.meta['num_pol_words'] = len(
                    self.key_words.intersection(utt.meta['stem_tokens']))
                utt.meta['political'] = int(utt.meta['num_pol_words'] > 0)
            else:
                utt.meta['num_pol_words'] = None
                utt.meta['political'] = None

        # for conv_id in corpus.conversations:
        #     conv = corpus.get_conversation(conv_id)
        #     for utt in conv.iter_utterances():
        #         if utt.text != None:
        #             tokenized = word_tokenize(utt.text.lower())
        #             invocations = 0
        #             length = len(tokenized)
        #             pol_words = []
        #             for token in tokenized:
        #                 if token in self.key_words:
        #                     invocations += 1
        #                     pol_words.append(token)
        #             utt.meta["num_pol_refs"] = invocations
        #             if (length > 0):
        #                 utt.meta["num_pol_refs_incidence"] = (invocations/length)
        #             else:
        #                 utt.meta["num_pol_refs_incidence"] = 0
        #             utt.meta["pol_words"] = pol_words
        return corpus
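A quick worked example of the intersection logic above; the key words and stem tokens here are made-up values, not taken from the transformer.

# Illustrative sketch of the num_pol_words / political computation (made-up values):
key_words = {"senat", "elect", "vote"}            # hypothetical stemmed keywords
stem_tokens = {"we", "vote", "today"}             # hypothetical utt.meta['stem_tokens']
num_pol_words = len(key_words.intersection(stem_tokens))
political = int(num_pol_words > 0)
print(num_pol_words, political)                   # -> 1 1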
Example #3
    def test_overlap_diff_data(self):
        """
        Merge with overlap in utterance ids, where the overlapping utterance has different data but the same metadata

        Warning should be printed. Original utterance data should be preserved.
        """
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
            Utterance(id="1", text="my name is bob", speaker=Speaker(
                id="bob")),
            Utterance(
                id="2", text="this is a test", speaker=Speaker(id="charlie")),
        ])

        corpus2 = Corpus(utterances=[
            Utterance(
                id="2", text="this is a test2", speaker=Speaker(id="candace")),
            Utterance(
                id="4", text="this is a sentence", speaker=Speaker(id="echo")),
            Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
        ])

        merged = corpus1.merge(corpus2)
        self.assertEqual(len(list(merged.iter_utterances())), 5)
        self.assertEqual(len(list(merged.iter_speakers())), 5)
        self.assertEqual(len(list(corpus1.iter_utterances())), 3)
        self.assertEqual(len(list(corpus2.iter_utterances())), 3)

        self.assertEqual(merged.get_utterance("2").text, "this is a test")
        self.assertEqual(
            merged.get_utterance("2").speaker, Speaker(id="charlie"))
Example #4
    def transform(self, corpus: Corpus) -> Corpus:
        super().transform(corpus)
        if self.replace_text:
            selector = lambda utt_: self.input_filter(utt_, None)
            for utt in corpus.iter_utterances(selector):
                cleaned_text = utt.get_info(self.output_field)
                if self.save_original:
                    utt.set_info(self.output_field, utt.text)
                utt.text = cleaned_text

            if not self.save_original:
                next(corpus.iter_utterances(selector)).del_info(
                    self.output_field)  # deleting the field from one utterance removes it for all
        return corpus
Example #5
    def transform(
        self,
        corpus: Corpus,
        selector: Callable[[Conversation], bool] = lambda convo: True,
        ignore_utterances: Callable[[Utterance], bool] = lambda utt: False
    ) -> Corpus:
        """
        Annotate the corpus utterances with forecast and forecast score information

        :param corpus: target Corpus
        :param selector: a (lambda) function that takes a Conversation and returns a bool: True if the Conversation is to be included in the transformation step. By default, includes all Conversations.
        :param ignore_utterances: a (lambda) function that takes an Utterance and returns a bool: True if the Utterance should be excluded from the Conversation in the transformation step. By default, all Utterances are included.
        :return: annotated Corpus
        """
        id_to_context_reply_label = self._get_context_reply_label_dict(
            corpus, selector, ignore_utterances, include_label=False)
        forecast_df = self.forecaster_model.forecast(id_to_context_reply_label)

        for utt in corpus.iter_utterances():
            if utt.id in forecast_df.index:
                utt.add_meta(self.forecast_feat_name,
                             forecast_df.loc[utt.id][self.forecast_feat_name])
                utt.add_meta(
                    self.forecast_prob_feat_name,
                    forecast_df.loc[utt.id][self.forecast_prob_feat_name])
            else:
                utt.add_meta(self.forecast_feat_name, None)
                utt.add_meta(self.forecast_prob_feat_name, None)

        return corpus
Example #6
    def _get_scores(self,
                    corpus: Corpus,
                    selector: Optional[Callable[[Utterance],
                                                bool]] = lambda utt: True):
        """
        Calculates average occurrence per utterance. Used in summarize()

        :param corpus: the target Corpus
        :param selector: (lambda) function specifying whether the utterance should be included
        """

        utts = list(corpus.iter_utterances(selector))
        if self.marker_attribute_name not in utts[0].meta:
            print(
                "Could not find politeness markers metadata. Running transform() on corpus first...",
                end="")
            self.transform(corpus, markers=True)
            print("Done.")

        # marker keys carry a fixed 21-character prefix (e.g. "feature_politeness_==")
        # and a trailing "=="; slice both off to recover the bare marker name
        counts = {
            k[21:len(k) - 2]: 0
            for k in utts[0].meta[self.marker_attribute_name].keys()
        }

        for utt in utts:
            for k, v in utt.meta[self.marker_attribute_name].items():
                counts[k[21:len(k) - 2]] += len(v)
        scores = {k: v / len(utts) for k, v in counts.items()}
        return scores
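A sanity check on the magic numbers in the slice above, assuming marker keys follow the "feature_politeness_==NAME==" naming convention:

# Worked example of the k[21:len(k) - 2] slice (the key format is an assumption):
k = "feature_politeness_==HASHEDGE=="
assert len("feature_politeness_==") == 21         # the fixed prefix is 21 characters
print(k[21:len(k) - 2])                           # -> HASHEDGE (trailing "==" stripped)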
Example #7
    def transform(self, corpus: Corpus, selector: Optional[Callable[[Utterance], bool]] = lambda utt: True,
                  markers: bool = False):
        """
        Extract politeness strategies from each utterance in the corpus and annotate
        the utterances with the extracted strategies. Requires that the corpus has previously
        been transformed by a Parser, such that each utterance has dependency parse info in
        its metadata table.

        :param corpus: the corpus to compute features for.
        :param selector: a (lambda) function that takes an Utterance and returns a bool indicating whether the utterance should be included in this annotation step.
        :param markers: whether or not to add politeness occurrence markers
        """
        for utt in corpus.iter_utterances():
            if selector(utt):
                for i, sent in enumerate(utt.meta["parsed"]):
                    for p in sent["toks"]:
                        p["tok"] = re.sub("[^a-z,.:;]", "", p["tok"].lower())
                utt.meta[self.ATTR_NAME], marks = get_politeness_strategy_features(utt)

                if markers:
                    utt.meta[self.MRKR_NAME] = marks
            else:
                utt.meta[self.ATTR_NAME] = None
                utt.meta[self.MRKR_NAME] = None

        return corpus
Example #8
    def transform(
            self,
            corpus: Corpus,
            selector: Callable[[Utterance], bool] = lambda x: True) -> Corpus:
        """
        Annotates the corpus utterances with the lists of fighting words that the utterance contains.

        The relevant fighting words to use are specified by FightingWords.top_k or FightingWords.threshold,
            with FightingWords.annot_method indicating which criterion to use.

        Lists are stored under metadata keys 'fighting_words_class1', 'fighting_words_class2'

        :param corpus: corpus to annotate
        :param selector: a (lambda) function that takes an Utterance and returns True/False; this selects for utterances
            that should be annotated with the fighting words

        :return: annotated corpus
        """
        class1_ngrams, class2_ngrams = self.get_top_k_ngrams() if self.annot_method == "top_k" \
                                else self.get_ngrams_past_threshold()

        # TODO: improve the efficiency of this; tricky because of ngram matching
        for utt in corpus.iter_utterances():
            if selector(utt):
                utt.meta['fighting_words_class1'] = [
                    ngram for ngram in class1_ngrams if ngram in utt.text
                ]
                utt.meta['fighting_words_class2'] = [
                    ngram for ngram in class2_ngrams if ngram in utt.text
                ]
            else:
                utt.meta['fighting_words_class1'] = None
                utt.meta['fighting_words_class2'] = None

        return corpus
Example #9
    def fit(self,
            corpus: Corpus,
            class1_func: Callable[[Utterance], bool],
            class2_func: Callable[[Utterance], bool],
            y=None,
            selector: Callable[[Utterance], bool] = lambda utt: True):
        """
        Learn the fighting words from a corpus, with an optional selector that selects for utterances prior to grouping the utterances into class1 / class2.

        :param corpus: target Corpus
        :param class1_func: selector function for identifying utterances that belong to class 1
        :param class2_func: selector function for identifying utterances that belong to class 2
        :param selector: a (lambda) function that takes an Utterance and returns True/False; this selects for utterances that should be included in this fitting step
        :return: fitted FightingWords Transformer

        """
        class1, class2 = [], []
        for utt in corpus.iter_utterances(selector):
            if class1_func(utt):
                class1.append(utt)
            elif class2_func(utt):
                class2.append(utt)

        if len(class1) == 0:
            raise ValueError("class1_func returned 0 valid utterances.")
        if len(class2) == 0:
            raise ValueError("class2_func returned 0 valid utterances.")

        print(
            "class1_func returned {} valid utterances. class2_func returned {} valid utterances."
            .format(len(class1), len(class2)))

        self.ngram_zscores = self._bayes_compare_language(class1, class2)
        print("ngram zscores computed.")
        return self
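A minimal end-to-end sketch of the fit/transform pair above; the default constructor and the 'speaker_group' metadata field are assumptions for illustration.

# Hypothetical usage sketch ('speaker_group' is a made-up metadata field):
fw = FightingWords()
fw.fit(corpus,
       class1_func=lambda utt: utt.meta['speaker_group'] == 'A',
       class2_func=lambda utt: utt.meta['speaker_group'] == 'B')
corpus = fw.transform(corpus)
# each utterance now carries 'fighting_words_class1' / 'fighting_words_class2'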
Example #10
    def summarize(self, corpus: Corpus, use_selector=True, exclude_na=True):
        """
        Returns a DataFrame of utterances and their forecasts (and forecast probabilities)

        :param corpus: target Corpus
        :param use_selector: whether to use Forecaster's convo and utterance selector functions
        :param exclude_na: whether to drop NaN results
        :return: a pandas DataFrame
        """
        utt_forecast_prob = []
        if use_selector:
            for convo in corpus.iter_conversations(self.convo_selector_func):
                for utt in convo.iter_utterances(self.utt_selector_func):
                    utt_forecast_prob.append(
                        (utt.id, utt.meta[self.forecast_feat_name],
                         utt.meta[self.forecast_prob_feat_name]))
        else:
            for utt in corpus.iter_utterances():
                utt_forecast_prob.append(
                    (utt.id, utt.meta[self.forecast_feat_name],
                     utt.meta[self.forecast_prob_feat_name]))
        forecast_df = pd.DataFrame(utt_forecast_prob, columns=["utt_id", self.forecast_feat_name, self.forecast_prob_feat_name]) \
            .set_index('utt_id').sort_values(self.forecast_prob_feat_name, ascending=False)
        if exclude_na:
            forecast_df = forecast_df.dropna()
        return forecast_df
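A hedged sketch of how the Forecaster's transform() and summarize() above might be chained; the constructor arguments and my_model are assumptions, not a documented API.

# Hypothetical usage sketch (constructor arguments and my_model are assumptions):
forecaster = Forecaster(forecaster_model=my_model,
                        forecast_feat_name='forecast',
                        forecast_prob_feat_name='forecast_prob')
corpus = forecaster.transform(corpus)
print(forecaster.summarize(corpus).head(10))      # highest forecast probabilities first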
Example #11
    def fit(self,
            corpus: Corpus,
            text_func: Callable[[Utterance], List[str]] = None,
            selector: Callable[[Utterance], bool] = lambda utt: True):
        """
    Fits a model for each group of utterances in a corpus. The group that an 
    utterance belongs to is determined by the `model_key_selector` parameter in 
    the transformer's constructor.

    :param corpus: corpus to create models from.
    :param text_func: optional function to define how the text a model is trained 
        on should be selected. Takes an utterance as input and returns a list of 
        strings to train the model corresponding to that utterance on. The model 
        corresponding to the utterance is determined by `self.model_key_selector`. 
        For every utterance corresponding to the same model key, this function 
        should return the same result.
        If `text_func` is `None`, a model will be trained on the text from all 
        the utterances that belong to its group.
    :param selector: determines which utterances in the corpus to train models for.
    """
        self.model_groups = defaultdict(list)
        for utt in tqdm(corpus.iter_utterances(selector=selector),
                        desc='fit1'):
            key = self.model_key_selector(utt)
            if text_func:
                if key not in self.model_groups:
                    self.model_groups[key] = text_func(utt)
            else:
                self.model_groups[key].append(utt.text)
        for key in tqdm(self.model_groups, desc='fit2'):
            if not text_func:
                self.model_groups[key] = [' '.join(self.model_groups[key])]
            self.model_groups[key] = list(
                map(self.tokenizer, self.model_groups[key]))
        return self
Example #12
    def _preprocess_utterances(self, corpus: Corpus) -> Tuple[List[Hashable], List[Dict]]:
        """Convert each Utterance in the given Corpus into the representation expected
        by the politeness API. Assumes that the Corpus has already been parsed, so that
        each Utterance contains the `parsed` metadata entry
        
        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        """

        utt_ids = [] # keep track of the order in which we process the utterances, so we can join with the corpus at the end
        documents = []
        for i, utterance in enumerate(corpus.iter_utterances()):
            if self.verbose and i > 0 and (i % self.verbose) == 0:
                print("\t%03d" % i)
            utt_ids.append(utterance.id)
            doc = {"text": utterance.text, "sentences": [], "parses": []}
            # the politeness API goes sentence-by-sentence
            for sent in utterance.meta["parsed"].sents:
                doc["sentences"].append(sent.text)
                sent_parses = []
                pos = sent.start
                for tok in sent:
                    if tok.dep_ != "punct": # the politeness API does not know how to handle punctuation in parses
                        ele = "%s(%s-%d, %s-%d)"%(tok.dep_, tok.head.text, tok.head.i + 1 - pos, tok.text, tok.i + 1 - pos)
                        sent_parses.append(ele)
                doc["parses"].append(sent_parses)
            doc["unigrams"], doc["bigrams"] = get_unigrams_and_bigrams(doc)
            documents.append(doc)
        if self.verbose:
            print("Done!")
        return utt_ids, documents
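To make the parse string format concrete, a worked instance of the "%s(%s-%d, %s-%d)" template above; the dependency values assume a typical parse of "I like pie" (tokens are 1-indexed within the sentence, punctuation skipped).

# Worked instance of the parse template (values assume a parse of "I like pie"):
dep, head_text, head_i, tok_text, tok_i = "nsubj", "like", 2, "I", 1
print("%s(%s-%d, %s-%d)" % (dep, head_text, head_i, tok_text, tok_i))
# -> nsubj(like-2, I-1)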
Example #13
    def transform(self, corpus: Corpus):
        '''
        Compiles a list of all utterances by each user, organized by conversation; also annotates each user with summary statistics.

        :param corpus: the Corpus to transform.
        :type corpus: Corpus
        '''

        user_to_convo_utts = defaultdict(lambda: defaultdict(list))
        for utterance in corpus.iter_utterances():
            if not self.utterance_filter(utterance): continue
            user_to_convo_utts[utterance.user.name][utterance.root].append(
                (utterance.id, utterance.timestamp))
        for user, convo_utts in user_to_convo_utts.items():
            user_convos = {}
            for convo, utts in convo_utts.items():
                sorted_utts = sorted(utts, key=lambda x: x[1])
                user_convos[convo] = {
                    'utterance_ids': [x[0] for x in sorted_utts],
                    'start_time': sorted_utts[0][1],
                    'n_utterances': len(sorted_utts)
                }
            corpus.get_user(user).add_meta('conversations', user_convos)

        for user in corpus.iter_users():
            if 'conversations' not in user.meta: continue
            user.add_meta('n_convos', len(user.meta['conversations']))

            sorted_convos = sorted(user.meta['conversations'].items(),
                                   key=lambda x: x[1]['start_time'])
            user.add_meta('start_time', sorted_convos[0][1]['start_time'])
            for idx, (convo_id, _) in enumerate(sorted_convos):
                user.meta['conversations'][convo_id]['idx'] = idx
        return corpus
Example #14
    def test_no_overlap(self):
        """
        Basic merge: no overlap in utterance id
        """
        corpus1 = Corpus(utterances=[
            Utterance(id=0, text="hello world", user=User(name="alice")),
            Utterance(id=1, text="my name is bob", user=User(name="bob")),
            Utterance(id=2, text="this is a test", user=User(name="charlie")),
        ])

        corpus2 = Corpus(utterances=[
            Utterance(id=3, text="i like pie", user=User(name="delta")),
            Utterance(id=4, text="this is a sentence", user=User(name="echo")),
            Utterance(id=5, text="goodbye", user=User(name="foxtrot")),
        ])

        merged = corpus1.merge(corpus2)
        self.assertEqual(len(list(merged.iter_utterances())), 6)
        self.assertEqual(len(list(merged.iter_users())), 6)
        self.assertEqual(len(list(corpus1.iter_utterances())), 3)
        self.assertEqual(len(list(corpus2.iter_utterances())), 3)
Example #15
    def transform(self, corpus: Corpus):
        """Adds metadata about politicization to each utterance.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        """
        assert 'stem_tokens' in next(corpus.iter_utterances()).meta
        counter = 0
        for utt in corpus.iter_utterances():
            if utt.meta['valid']:
                utt.meta['analysis'] = lexicon.analyze(
                    utt.text, categories=self.categories)
                for k in utt.meta['analysis'].keys():
                    if utt.meta['analysis'][k] != 0.0:
                        utt.meta['analysis'][k] = 1
            else:
                utt.meta['analysis'] = None

            counter += 1
            if counter % 10000 == 0:
                print("processed", counter, "utterances")
        return corpus
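A worked example of the binarization step above; the analysis dict is a made-up stand-in for what lexicon.analyze might return.

# Illustrative binarization (made-up category scores):
analysis = {"politics": 0.25, "sports": 0.0}
for k in analysis.keys():
    if analysis[k] != 0.0:
        analysis[k] = 1
print(analysis)   # -> {'politics': 1, 'sports': 0.0}; only nonzero scores become 1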
Example #16
    def transform(self, corpus: Corpus):
        '''
        Tokenizes each utterance and stores the tokens as a space-separated string entry in the utterance metadata.

        :param corpus: the Corpus to tokenize utterances for.
        :type corpus: Corpus
        '''
        for idx, utterance in enumerate(corpus.iter_utterances()):
            if self._print_output(idx):
                print(idx, 'utterances tokenized')
            utterance.add_meta('tokens',
                               ' '.join(nltk.word_tokenize(utterance.text)))
        return corpus
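The stored metadata entry is just the NLTK tokenization re-joined with spaces; a quick standalone check (nltk and its 'punkt' tokenizer data are assumed to be installed):

import nltk
# nltk.download('punkt')  # one-time download if the tokenizer data is missing
text = "Hello, world! This is a test."
print(' '.join(nltk.word_tokenize(text)))
# -> Hello , world ! This is a test .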
Example #17
    def transform(self, corpus: Corpus) -> Corpus:
        super().transform(corpus)
        if self.replace_text:
            selector = lambda utt_: self.input_filter(utt_, None)
            for utt in corpus.iter_utterances(selector):
                cleaned_text = utt.retrieve_meta(self.output_field)
                if self.save_original:
                    utt.add_meta(self.output_field, utt.text)
                utt.text = cleaned_text

            if not self.save_original:
                corpus.delete_metadata('utterance', self.output_field)
        return corpus
Example #18
    def transform(self, corpus: Corpus):
        '''
        Computes wordcount per utterance.

        :param corpus: the Corpus to compute wordcount for.
        :type corpus: Corpus
        '''
        for utterance in corpus.iter_utterances():
            if self.use_tokenized:
                utterance.add_meta('wordcount',
                                   len(utterance.meta['tokens'].split()))
            else:
                utterance.add_meta('wordcount', len(utterance.text.split()))
        return corpus
Example #19
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Annotates the corpus utterances with the lists of fighting words that the utterance contains.
        The relevant fighting words to use are specified by FightingWords.top_k or FightingWords.threshold,
        with FightingWords.annot_method indicating which criterion to use.

        Lists are stored under metadata keys 'fighting_words_class1', 'fighting_words_class2'.

        :param corpus: corpus to annotate
        :return: annotated corpus
        """
        class1_ngrams, class2_ngrams = self.get_top_k_ngrams() if self.annot_method == "top_k" \
                                else self.get_ngrams_past_threshold()

        # TODO: improve the efficiency of this; tricky because of ngram matching
        for utt in corpus.iter_utterances():
            utt.meta['fighting_words_class1'] = [ngram for ngram in class1_ngrams if ngram in utt.text]
            utt.meta['fighting_words_class2'] = [ngram for ngram in class2_ngrams if ngram in utt.text]
        return corpus
Example #20
    def _get_feat_df(self, corpus: Corpus, selector: Optional[Callable[[Utterance], bool]] = lambda utt: True):
        """
        Construct binary feature dataframe. Used in summarize()

        :param corpus: the target Corpus
        :param selector: (lambda) function specifying whether the utterance should be included
        """

        utts = list(corpus.iter_utterances(selector))

        if self.strategy_attribute_name not in utts[0].meta:
            print("Could not find politeness strategies metadata. Running transform() on corpus first...", end="")
            self.transform(corpus)
            print("Done.")

        df_feat = pd.DataFrame.from_dict(
            {utt.id: utt.meta[self.strategy_attribute_name] for utt in utts},
            orient='index')

        return df_feat
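The resulting df_feat is a binary utterance-by-strategy matrix; an illustrative miniature (strategy names and values are made up):

import pandas as pd

# Illustrative miniature of df_feat (made-up strategies and values):
df_feat = pd.DataFrame.from_dict(
    {"utt_1": {"Please": 0, "HASHEDGE": 1},
     "utt_2": {"Please": 1, "HASHEDGE": 0}},
    orient='index')
print(df_feat.mean())   # per-strategy usage rate across utterances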
Example #21
    def transform(self, corpus: Corpus) -> Corpus:
        """
            Computes per-utterance attributes for each utterance in the Corpus, storing these values in the `output_field` of each utterance as specified in the constructor. For utterances which do not contain all of the `input_field` attributes as specified in the constructor, or for utterances which return `False` on `input_filter`, this call will not annotate the utterance. 

            :param corpus: Corpus
            :return: the corpus
        """

        total_utts = len(corpus.utterances)

        for idx, utterance in enumerate(corpus.iter_utterances()):

            if self._print_output(idx):
                print('%03d/%03d utterances processed' % (idx, total_utts))
            if not self.input_filter(utterance, self.aux_input): continue
            if self.input_field is None:
                text_entry = utterance.text
            elif isinstance(self.input_field, str):
                text_entry = utterance.get_info(self.input_field)

            elif isinstance(self.input_field, list):
                text_entry = {
                    field: utterance.get_info(field)
                    for field in self.input_field
                }
                if sum(x is None for x in text_entry.values()) > 0:
                    text_entry = None
            if text_entry is None:
                continue
            if len(self.aux_input) == 0:
                result = self.proc_fn(text_entry)
            else:
                result = self.proc_fn(text_entry, self.aux_input)
            if self.multi_outputs:
                for res, out in zip(result, self.output_field):
                    utterance.set_info(out, res)
            else:
                utterance.set_info(self.output_field, result)
        if self.verbosity > 0:
            print('%03d/%03d utterances processed' % (total_utts, total_utts))
        return corpus
Example #22
    def transform(self, corpus: Corpus):
        name_to_gender = self.genderDictionary(corpus)  # computed but currently unused
        ps = PorterStemmer()  # currently unused
        for utt in corpus.iter_utterances():
            speaker_name = utt.user.name  # currently unused
            speaker_gender = utt.user.meta['sex']
            gender_is_female = speaker_gender == 'FEMALE'  # currently unused

            contains_romantic = False
            # NOTE: these two flags are never updated below, so the metadata
            # written at the end is always False; the cross-gender detection
            # logic appears to be unimplemented
            male_about_female = False
            female_about_male = False

            for romantic_word in Genderromantic2.romantic_words:
                if romantic_word.lower() in utt.text.lower():
                    contains_romantic = True
                    break

            utt.add_meta('female_about_male', female_about_male)
            utt.add_meta('male_about_female', male_about_female)
            utt.add_meta('contains_romantic', contains_romantic)
        return corpus
Example #23
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Annotate the corpus utterances with forecast and forecast probability information
        :param corpus: target Corpus
        :return: annotated Corpus
        """
        id_to_context_reply_label = self._get_context_reply_label_dict(
            corpus, include_label=False)
        forecast_df = self.forecaster_model.forecast(id_to_context_reply_label)

        for utt in corpus.iter_utterances():
            if utt.id in forecast_df.index:
                utt.add_meta(self.forecast_feat_name,
                             forecast_df.loc[utt.id][self.forecast_feat_name])
                utt.add_meta(
                    self.forecast_prob_feat_name,
                    forecast_df.loc[utt.id][self.forecast_prob_feat_name])
            else:
                utt.add_meta(self.forecast_feat_name, None)
                utt.add_meta(self.forecast_prob_feat_name, None)

        return corpus
Example #24
    def transform(self, corpus: Corpus, selector: Optional[Callable[[Utterance], bool]] = lambda utt: True,
                  markers: bool = False):
        """
        Extract politeness strategies from each utterance in the corpus and annotate
        the utterances with the extracted strategies. Requires that the corpus has previously
        been transformed by a Parser, such that each utterance has dependency parse info in
        its metadata table.

        :param corpus: the corpus to compute features for.
        :param selector: a (lambda) function that takes an Utterance and returns a bool indicating whether the utterance should be included in this annotation step.
        :param markers: whether or not to add politeness occurrence markers
        """
    
        total_utts = len(corpus.utterances)
        
        for idx, utt in enumerate(corpus.iter_utterances()):
            
            if self.verbose > 0 and idx > 0 and idx % self.verbose == 0:
                print('%03d/%03d utterances processed' % (idx, total_utts))
            
            if selector(utt):    
                parsed = utt.retrieve_meta(self.parse_attribute_name)
                for i, sent in enumerate(parsed):
                    for p in sent["toks"]:
                        # p["tok"] = re.sub("[^a-z,.:;]", "", p["tok"].lower())
                        p["tok"] = p['tok'].lower()
                
                parses = [x["toks"] for x in parsed]
            
                utt.meta[self.strategy_attribute_name], marks = self._extractor_lookup[self.strategy_collection](parses)

                if markers:
                    utt.meta[self.marker_attribute_name] = marks
            else:
                utt.meta[self.strategy_attribute_name] = None
                utt.meta[self.marker_attribute_name] = None
            
        return corpus
Example #25
    def fit(self, corpus: Corpus, y=None):
        """
        Learn the fighting words from a corpus.

        :param corpus: target Corpus
        :return: fitted Transformer
        """
        class1, class2 = [], []
        for utt in corpus.iter_utterances():
            if self.class1_selector(utt):
                class1.append(utt)
            elif self.class2_selector(utt):
                class2.append(utt)

        if len(class1) == 0:
            raise ValueError("class1_selector returned 0 valid utterances.")
        if len(class2) == 0:
            raise ValueError("class2_selector returned 0 valid utterances.")

        print("class1_selector returned {} valid utterances. class2_selector returned {} valid utterances.".format(len(class1), len(class2)))

        self.ngram_zscores = self._bayes_compare_language(class1, class2)
        print("ngram zscores computed.")
        return self
Example #26
    def transform(self,
                  corpus: Corpus,
                  obj_type: str,
                  group_and_models: Callable[[Utterance],
                                             Tuple[str, List[str]]] = None,
                  group_model_attr_key: Callable[[str, str], str] = None,
                  selector: Callable[[CorpusComponent], bool] = lambda _: True,
                  target_text_func: Callable[[Utterance], List[str]] = None):
        """
    Annotates `obj_type` components in a corpus with surprise scores. Should be 
    called after fit().

    :param corpus: corpus to compute surprise for.
    :param obj_type: the type of corpus components to annotate. Should be either 
        'utterance', 'speaker', 'conversation', or 'corpus'. 
    :param group_and_models: optional function that defines how an utterance should 
        be grouped to form a target text and what models (contexts) the group should 
        be compared to when calculating surprise. Takes in an utterance and returns 
        a tuple containing the name of the group the utterance belongs to and a 
        list of models to calculate how surprising that group is against. Objects 
        will be annotated with a metadata field `self.surprise_attr_name` that is 
        maps a key corresponding to the `groupname` and `modelkey` to the surprise 
        score for utterances in the group when compared to the model. The key used 
        is defined by the `group_model_attr_key` parameter.
        If `group_and_models` is `None`, `self.model_key_selector` will be used 
        to select the group that an utterance belongs to. The surprise score will 
        be calculated for each group of utterances compared to the model in 
        `self.models` corresponding to the group.
    :param group_model_attr_key: optional function to define what key should be used 
        for a given `groupname` and `modelkey`. 
        If `group_model_attr_key` is `None`, the default key used will be 
        "GROUP_groupname_MODEL_modelkey" unless `groupname` and `modelkey` are equal 
        in which case just "modelkey" will be used as the key.
    :param selector: function to select objects to annotate. if function returns true, object will be annotated.
    :param target_text_func: optional function to define what the target text corresponding to an utterance should be. 
        takes in an utterance and returns a list of string tokens
    """
        if obj_type == 'corpus':
            utt_groups = defaultdict(list)
            group_models = defaultdict(set)
            for utt in corpus.iter_utterances():
                if group_and_models:
                    group_name, models = group_and_models(utt)
                else:
                    group_name = self.model_key_selector(utt)
                    models = {group_name}
                if target_text_func:
                    if group_name not in utt_groups:
                        utt_groups[group_name] = [target_text_func(utt)]
                else:
                    utt_groups[group_name].append(self.tokenizer(utt.text))
                group_models[group_name].update(models)
            surprise_scores = {}
            for group_name in tqdm(utt_groups, desc='transform'):
                for model_key in group_models[group_name]:
                    context = self.model_groups[model_key]
                    target = list(chain(*utt_groups[group_name]))
                    surprise_scores[Surprise._format_attr_key(
                        group_name, model_key,
                        group_model_attr_key)] = self._compute_surprise(
                            target, context)
            corpus.add_meta(self.surprise_attr_name, surprise_scores)
        elif obj_type == 'utterance':
            for utt in tqdm(corpus.iter_utterances(selector=selector),
                            desc='transform'):
                if group_and_models:
                    group_name, models = group_and_models(utt)
                    surprise_scores = {}
                    for model_key in models:
                        context = self.model_groups[model_key]
                        target = target_text_func(
                            utt) if target_text_func else self.tokenizer(
                                utt.text)
                        surprise_scores[Surprise._format_attr_key(
                            group_name, model_key,
                            group_model_attr_key)] = self._compute_surprise(
                                target, context)
                    utt.add_meta(self.surprise_attr_name, surprise_scores)
                else:
                    group_name = self.model_key_selector(utt)
                    context = self.model_groups[group_name]
                    target = target_text_func(
                        utt) if target_text_func else self.tokenizer(utt.text)
                    utt.add_meta(self.surprise_attr_name,
                                 self._compute_surprise(target, context))
        else:
            for obj in tqdm(corpus.iter_objs(obj_type, selector=selector),
                            desc='transform'):
                utt_groups = defaultdict(list)
                group_models = defaultdict(set)
                for utt in obj.iter_utterances():
                    if group_and_models:
                        group_name, models = group_and_models(utt)
                    else:
                        group_name = self.model_key_selector(utt)
                        models = {group_name}
                    if target_text_func:
                        if group_name not in utt_groups:
                            utt_groups[group_name] = [target_text_func(utt)]
                    else:
                        utt_groups[group_name].append(self.tokenizer(utt.text))
                    group_models[group_name].update(models)
                surprise_scores = {}
                for group_name in utt_groups:
                    for model_key in group_models[group_name]:
                        assert (model_key
                                in self.model_groups), 'invalid model key'
                        if not self.model_groups[model_key]: continue
                        context = self.model_groups[model_key]
                        target = list(chain(*utt_groups[group_name]))
                        surprise_scores[Surprise._format_attr_key(
                            group_name, model_key,
                            group_model_attr_key)] = self._compute_surprise(
                                target, context)
                obj.add_meta(self.surprise_attr_name, surprise_scores)
        return corpus
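A minimal fit/transform sketch for the Surprise transformer above; the constructor argument shown is an assumption based on the attributes the code references (model_key_selector, tokenizer, surprise_attr_name).

# Hypothetical usage sketch (constructor arguments are assumptions):
surp = Surprise(model_key_selector=lambda utt: utt.speaker.id)
surp.fit(corpus)                                       # builds surp.model_groups
corpus = surp.transform(corpus, obj_type='utterance')  # annotates each utterance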
Example #27
    def score(self,
              corpus: Corpus,
              speakers: Collection[Union[User, str]],
              group: Collection[Union[User, str]],
              focus: str = "speakers",
              speaker_thresh: int = 0,
              target_thresh: int = 3,
              utterances_thresh: int = 0,
              speaker_thresh_indiv: int = 0,
              target_thresh_indiv: int = 0,
              utterances_thresh_indiv: int = 0,
              utterance_thresh_func: Optional[Callable[
                  [Tuple[Utterance, Utterance]], bool]] = None,
              split_by_attribs: Optional[List[str]] = None,
              speaker_attribs: Optional[Dict] = None,
              target_attribs: Optional[Dict] = None) -> CoordinationScore:
        """Computes the coordination scores for each speaker, given a set of
        speakers and a group of targets.

        :param corpus: Corpus to compute scores on
        :param speakers: A collection of user ids or user objects corresponding
            to the speakers we want to compute scores for.
        :param group: A collection of user ids or user objects corresponding to
            the group of targets.
        :param focus: Either "speakers" or "targets". If "speakers", treat the
            set of targets for a particular speaker as a single person (i.e.
            concatenate all of their utterances); the returned dictionary will
            have speakers as keys. If "targets", treat the set of
            speakers for a particular target as a single person; the returned
            dictionary will have targets as keys.
        :param speaker_thresh: Thresholds based on the minimum number of times the speaker uses each coordination marker.
        :param target_thresh: Thresholds based on the minimum number of times the target uses each coordination marker.
        :param utterances_thresh: Thresholds based on the minimum number of utterances for each speaker.
        :param speaker_thresh_indiv: Like `speaker_thresh` but only considers the utterances between a speaker and a
            single target; thresholds whether the utterances for a single target should be considered for a particular speaker.
        :param target_thresh_indiv: Like `target_thresh` but thresholds whether a single target's utterances should be
            considered for a particular speaker.
        :param utterances_thresh_indiv: Like `utterances_thresh` but thresholds whether a single target's
            utterances should be considered for a particular speaker.
        :param utterance_thresh_func: Optional utterance-level threshold function that takes in a speaker `Utterance`
            and the `Utterance` the speaker replied to, and returns a `bool` corresponding to whether or not to include
            the utterance in scoring.
        :param split_by_attribs: Utterance meta attributes to split users by when tallying coordination
            (e.g. in supreme court transcripts, you may want to treat the same lawyer as a different person across
            different cases --- see coordination examples).
        :param speaker_attribs: attribute names and values the speaker must have
        :param target_attribs: attribute names and values the target must have

        :return: A :class:`CoordinationScore` object corresponding to the
            coordination scores for each speaker.
        """
        if corpus != self.corpus:
            raise Exception("Coordination: must fit and score on same corpus")
        if not self.precomputed:
            raise Exception("Must fit before calling score")

        if split_by_attribs is None: split_by_attribs = []
        if speaker_attribs is None: speaker_attribs = dict()
        if target_attribs is None: target_attribs = dict()

        #self.precompute()
        speakers = set(speakers)
        group = set(group)

        utterances = []
        for utt in corpus.iter_utterances():
            speaker = utt.user
            if speaker in speakers:
                if utt.reply_to is not None:
                    reply_to = corpus.get_utterance(utt.reply_to)
                    target = reply_to.user
                    if target in group:
                        utterances.append(utt)
        return self.scores_over_utterances(
            corpus, speakers, utterances, speaker_thresh, target_thresh,
            utterances_thresh, speaker_thresh_indiv, target_thresh_indiv,
            utterances_thresh_indiv, utterance_thresh_func, focus,
            split_by_attribs, speaker_attribs, target_attribs)
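A hedged sketch of how score() might be called after fitting, per the precomputed check at the top of the method; the speaker and target ids are placeholders.

# Hypothetical usage sketch (speaker/target ids are placeholders):
coord = Coordination()
coord.fit(corpus)
scores = coord.score(corpus,
                     speakers={"speaker_a", "speaker_b"},
                     group={"target_c"},
                     focus="speakers")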