Esempio n. 1
0
    def transform(self, corpus: Corpus):
        """Adds metadata about politicization to each utterance.

        For each utterance marked 'valid', counts how many of its stemmed
        tokens appear in the political keyword set (``self.key_words``) and
        flags the utterance as political when at least one keyword occurs.
        Invalid utterances receive ``None`` for both fields.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        :return: the corpus, with 'num_pol_words' and 'political' metadata
            set on every utterance.
        """
        # A prior tokenization/stemming step must have populated the
        # 'stem_tokens' metadata field.
        assert 'stem_tokens' in next(corpus.iter_utterances()).meta
        for utt in corpus.iter_utterances():
            if utt.meta['valid']:
                utt.meta['num_pol_words'] = len(
                    self.key_words.intersection(utt.meta['stem_tokens']))
                utt.meta['political'] = int(utt.meta['num_pol_words'] > 0)
            else:
                utt.meta['num_pol_words'] = None
                utt.meta['political'] = None
        return corpus
Esempio n. 2
0
    def transform(
        self,
        corpus: Corpus,
        selector: Optional[Callable[[Conversation], bool]] = lambda convo: True
    ) -> Corpus:
        """
        Computes hypergraph features for the selected Conversations via
        retrieve_feats(), stores them as a corpus-level vector matrix under
        ``self.vector_name``, and attaches that vector to each selected
        Conversation.

        :param corpus: Corpus object to retrieve feature information from
        :param selector: a (lambda) function that takes a Conversation and returns
            True / False; selects the conversations to annotate with hypergraph
            features. By default, all conversations are annotated.
        :return: corpus whose selected conversations carry the new feature vector.
        """
        feats_by_convo = self.retrieve_feats(corpus, selector)
        # rows = conversation ids, columns = feature names
        feature_df = pd.DataFrame(feats_by_convo).T
        matrix = csr_matrix(feature_df.values.astype('float64'))
        corpus.set_vector_matrix(name=self.vector_name,
                                 ids=list(feature_df.index),
                                 columns=list(feature_df.columns),
                                 matrix=matrix)

        for conversation in corpus.iter_conversations(selector):
            conversation.add_vector(self.vector_name)
        return corpus
Esempio n. 3
0
    def transform(self, corpus: Corpus):
        """Annotates each conversation with 'reciprocity': among user pairs
        with at least one reply between them, the fraction whose replies go
        in both directions (0 when no pair interacted)."""
        for convo in corpus.iter_conversations():
            # Map each user name to the set of user names they replied to
            # within this conversation.
            reply_targets = {}
            for user in convo.iter_users():
                targets = set()
                for utt in user.iter_utterances():
                    if utt.reply_to is not None:
                        targets.add(corpus.get_utterance(utt.reply_to).user.name)
                reply_targets[user.name] = targets

            mutual, oneway = 0, 0
            for u1, u2 in combinations(convo.iter_users(), 2):
                forward = u2.name in reply_targets[u1.name]
                backward = u1.name in reply_targets[u2.name]
                if forward and backward:
                    mutual += 1
                elif forward or backward:
                    oneway += 1

            interacting = mutual + oneway
            convo.add_meta('reciprocity',
                           mutual / interacting if interacting else 0)
        return corpus
    def test_add_utterance(self):
        """Adding utterances should merge metadata for duplicate ids and
        append genuinely new utterances."""
        base_utts = [
            Utterance(id=0, text="hello world", user=User(name="alice")),
            Utterance(id=1, text="my name is bob", user=User(name="bob")),
            Utterance(id=2, text="this is a test", user=User(name="charlie"),
                      meta={'hey': 'jude', 'hello': 'world'}),
        ]
        corpus1 = Corpus(utterances=base_utts)

        new_utts = [
            Utterance(id=1, text="i like pie", user=User(name="delta")),
            Utterance(id=2, text="this is a test", user=User(name="charlie"),
                      meta={'hello': 'food', 'what': 'a mood'}),
            Utterance(id=5, text="goodbye", user=User(name="foxtrot")),
        ]
        added = corpus1.add_utterances(new_utts)

        # ids 0, 1, 2 plus the new id 5 -> 4 utterances total
        self.assertEqual(len(list(added.iter_utterances())), 4)
        # utterance 2's meta is merged: 'hey', 'hello', 'what'
        self.assertEqual(len(added.get_utterance(2).meta), 3)
        # for a duplicate key, the added utterance's value wins
        self.assertEqual(added.get_utterance(2).meta['hello'], 'food')
    def summarize(self, corpus: Corpus, cv=LeaveOneOut()):
        """
        Run PairedPrediction on the corpus with cross-validation
        :param corpus: target Corpus (must be annotated with pair information using PairedPrediction.transform())
        :param cv: optional CV model: default is LOOCV
        :return: cross-validation accuracy score
        :raises ValueError: if the pairing metadata fields are absent
            (i.e. transform() has not been run).
        """
        # Check if transform() needs to be run first: all pairing metadata
        # fields must be present on the objects being predicted over.
        sample_obj = next(corpus.iter_objs(self.obj_type))
        meta_keys = set(sample_obj.meta)
        required_keys = {self.pair_orientation_feat_name, self.pair_id_feat_name, self.label_feat_name}
        missing_keys = required_keys - meta_keys
        if missing_keys:
            raise ValueError("Some metadata features required for paired prediction are missing: {}. "
                             "You may need to run transform() first.".format(missing_keys))

        # Group objects by label ('pos' / 'neg'), keyed by their pair id.
        pair_id_to_obj = {'pos': dict(), 'neg': dict()}
        for obj in corpus.iter_objs(self.obj_type, self.selector):
            if obj.meta[self.pair_orientation_feat_name] is None: continue
            pair_id_to_obj[obj.meta[self.label_feat_name]][obj.meta[self.pair_id_feat_name]] = obj

        # Only pairs with both a positive and a negative member are usable.
        pair_ids = set(pair_id_to_obj['pos'].keys()).intersection(set(pair_id_to_obj['neg'].keys()))
        print("Found {} valid pairs.".format(len(pair_ids)))
        pair_id_to_objs = dict()
        for pair_id in pair_ids:
            pair_id_to_objs[pair_id] = (pair_id_to_obj['pos'][pair_id], pair_id_to_obj['neg'][pair_id])

        X, y = self._generate_paired_X_y(pair_id_to_objs)
        # Fit on all data so the classifier is usable after summarize();
        # cross_val_score clones the estimator, so this fit does not leak
        # into the reported score.
        self.clf.fit(X, y)
        return np.mean(cross_val_score(self.clf, X, y, cv=cv, error_score='raise'))
Esempio n. 6
0
    def test_basic_functions(self):
        """
        Test basic meta functions
        """
        utterances = [
            Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
            Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
            Utterance(id="2", text="this is a test",
                      speaker=Speaker(id="charlie")),
        ]
        corpus1 = Corpus(utterances=utterances)

        first_utt = corpus1.get_utterance("0")
        first_utt.meta['hey'] = 9

        # correct class type stored
        self.assertEqual(corpus1.meta_index.utterances_index['hey'],
                         repr(type(9)))

        # keyErrors result in None output
        self.assertRaises(KeyError, lambda: first_utt.meta['nonexistent key'])

        # test that setting a custom get still works
        self.assertEqual(first_utt.meta.get('nonexistent_key', {}), {})
Esempio n. 7
0
 def summarize(self, corpus: Corpus, use_selector=True, exclude_na=True):
     """
     Returns a DataFrame of utterances and their forecasts (and forecast probabilities)

     :param corpus: target Corpus
     :param use_selector: whether to use Forecaster's convo and utterance selector functions
     :param exclude_na: whether to drop NaN results
     :return: a pandas DataFrame indexed by utterance id, sorted by
         forecast probability in descending order
     """
     # Pick the utterances to report on: either those passing the
     # Forecaster's selectors, or every utterance in the corpus.
     # (Deduplicates the previously copy-pasted tuple-building loops.)
     if use_selector:
         utts = (utt
                 for convo in corpus.iter_conversations(self.convo_selector_func)
                 for utt in convo.iter_utterances(self.utt_selector_func))
     else:
         utts = corpus.iter_utterances()

     utt_forecast_prob = [(utt.id, utt.meta[self.forecast_feat_name],
                           utt.meta[self.forecast_prob_feat_name])
                          for utt in utts]
     forecast_df = pd.DataFrame(utt_forecast_prob, columns=["utt_id", self.forecast_feat_name, self.forecast_prob_feat_name]) \
         .set_index('utt_id').sort_values(self.forecast_prob_feat_name, ascending=False)
     if exclude_na:
         forecast_df = forecast_df.dropna()
     return forecast_df
 def setUp(self) -> None:
     """Download a small test corpus, export it to DataFrames, and rebuild
     a corpus from those frames for round-trip comparison tests."""
     # Downloads (or reuses a cached copy of) the 'subreddit-hey' corpus.
     self.corpus = Corpus(download('subreddit-hey'))
     self.utt_df = self.corpus.get_utterances_dataframe()
     self.convo_df = self.corpus.get_conversations_dataframe()
     self.speaker_df = self.corpus.get_speakers_dataframe()
     # Corpus reconstructed from the exported frames; tests compare it
     # against self.corpus.
     self.new_corpus = Corpus.from_pandas(self.utt_df, self.speaker_df,
                                          self.convo_df)
    def transform(self, corpus: Corpus):
        '''
            compiles a list of all utterances by each user, organized by conversation; also annotates user with summary statistics.

            :param corpus: the Corpus to transform.
            :type corpus: Corpus
        '''
        # Bucket (utterance id, timestamp) pairs per user, per conversation.
        per_user = defaultdict(lambda: defaultdict(list))
        for utt in corpus.iter_utterances():
            if not self.utterance_filter(utt): continue
            per_user[utt.user.name][utt.root].append((utt.id, utt.timestamp))

        # Summarize each user's conversations in chronological order.
        for name, convo_map in per_user.items():
            summary = {}
            for convo_id, entries in convo_map.items():
                entries = sorted(entries, key=lambda pair: pair[1])
                summary[convo_id] = {
                    'utterance_ids': [uid for uid, _ in entries],
                    'start_time': entries[0][1],
                    'n_utterances': len(entries)
                }
            corpus.get_user(name).add_meta('conversations', summary)

        # Annotate each user with conversation counts and ordering.
        for user in corpus.iter_users():
            if 'conversations' not in user.meta: continue
            user.add_meta('n_convos', len(user.meta['conversations']))

            ordered = sorted(user.meta['conversations'].items(),
                             key=lambda item: item[1]['start_time'])
            user.add_meta('start_time', ordered[0][1]['start_time'])
            for idx, (convo_id, _) in enumerate(ordered):
                user.meta['conversations'][convo_id]['idx'] = idx
        return corpus
Esempio n. 10
0
    def get_scores(self,
                   corpus: Corpus,
                   selector: Optional[Callable[[], bool]] = None):
        """
        Calculates average occurance per utterance. Used in summarize()

        :param corpus: the corpus used to compute averages
        :param selector: lambda function which takes in meta data and returns a boolean.
        :return: dict mapping marker name to its average count per utterance
        :raises Exception: if the selector matches no utterances
        """
        utts = [corpus.get_utterance(x) for x in corpus.get_utterance_ids()]

        # Ensure marker annotations exist; run transform() on demand.
        if self.MRKR_NAME not in utts[0].meta:
            corpus = self.transform(corpus, markers=True)

        if selector is not None:
            utts = [utt for utt in utts if selector(utt.meta)]
            if not utts:
                raise Exception("No query matches")

        # Marker keys carry a fixed 21-character prefix and a 2-character
        # suffix; strip both to recover the bare marker name.
        counts = {
            k[21:-2]: 0
            for k in utts[0].meta[self.MRKR_NAME].keys()
        }

        for utt in utts:
            for k, v in utt.meta[self.MRKR_NAME].items():
                counts[k[21:-2]] += len(v)
        return {k: v / len(utts) for k, v in counts.items()}
    def fit_transform(self, corpus: Corpus) -> Corpus:
        """
        fit_transform() retrieves features from the corpus conversational
        threads using retrieve_feats()

        :param corpus: Corpus object to retrieve feature information from

        :return: corpus with conversations having a new meta field "hyperconvo" containing the stats generated by retrieve_feats(). Each conversation's metadata then contains the stats for the thread(s) it contains.
        """
        feats = HyperConvo.retrieve_feats(corpus,
                                          prefix_len=self.prefix_len,
                                          min_thread_len=self.min_thread_len,
                                          include_root=self.include_root)
        if self.include_root:  # threads start at root (post)
            for root_id in feats.keys():
                convo = corpus.get_conversation(root_id)
                convo.add_meta("hyperconvo", {root_id: feats[root_id]})
        else:  # threads start at top-level-comment
            # Group thread features by the root conversation that each
            # top-level comment (tlc) belongs to.
            # (Removed an unused `tlc_to_root_mapping` dict that was
            # created here but never populated or read.)
            threads = corpus.utterance_threads(prefix_len=self.prefix_len, include_root=False)
            root_to_tlc = dict()
            for tlc_id, utts in threads.items():
                # Skip threads too short for features to have been computed.
                if len(utts) < self.min_thread_len: continue
                thread_root = threads[tlc_id][tlc_id].root
                if thread_root in root_to_tlc:
                    root_to_tlc[thread_root][tlc_id] = feats[tlc_id]
                else:
                    root_to_tlc[thread_root] = {tlc_id: feats[tlc_id]}

            for root_id in root_to_tlc:
                convo = corpus.get_conversation(root_id)
                convo.add_meta("hyperconvo", root_to_tlc[root_id])

        return corpus
Esempio n. 12
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Computes the average number of questions asked in a conversation

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        :return: corpus with per-conversation average question counts
            stored under ``self.ATTR_NAME``.
        """
        if self.verbose: print("Finding questions per utterance")

        # Count '?'-runs per utterance, in get_utterance_ids() order.
        # (Replaces the index-based `for i in range(len(...))` loops.)
        question_counts = np.asarray([
            len(re.findall(r'\?+', corpus.get_utterance(utt_id).text))
            for utt_id in corpus.get_utterance_ids()
        ])

        if self.verbose: print("Finding questions per conversation")
        for convo_id in corpus.get_conversation_ids():
            convo = corpus.get_conversation(convo_id)
            # NOTE(review): indexing by utterance ids assumes ids are
            # integer positions into get_utterance_ids() order — confirm
            # this holds for the corpora this transformer is used with.
            avgquestion = np.mean(
                question_counts[np.asarray(convo._utterance_ids)])
            # adds average questions per conversation to conversation metadata
            convo._meta[self.ATTR_NAME] = avgquestion

        return corpus
Esempio n. 13
0
    def fit_transform(self, corpus: Corpus) -> Corpus:
        """
        Groups threads together into communities.

        :param corpus: the Corpus to use

        :return: Modifies and returns Corpus with new meta key: "communityEmbedder", value: Dict,
                containing "pts": an array with rows corresponding to embedded communities,
                and "labels": an array whose ith entry is the community of the ith row of X.

        :raises RuntimeError: if ``community_key`` is unset, or if
            threadEmbedder output is missing from the corpus metadata.
        """
        if self.community_key is None:
            raise RuntimeError(
                "Must specify community_key to retrieve label information from utterance"
            )

        # ThreadEmbedder.fit_transform() must have produced thread vectors.
        corpus_meta = corpus.get_meta()
        if "threadEmbedder" not in corpus_meta:
            raise RuntimeError(
                "Missing threadEmbedder metadata: "
                "threadEmbedder.fit_transform() must be run on the Corpus first"
            )

        thread_embed_data = corpus_meta["threadEmbedder"]

        X_mid = thread_embed_data["X"]  # thread embedding matrix
        roots = thread_embed_data["roots"]  # thread root ids, row-aligned with X_mid

        # Select dimensionality-reduction method ("none" keeps X_mid as-is).
        if self.method.lower() == "svd":
            f = TruncatedSVD
        elif self.method.lower() == "tsne":
            f = TSNE
        elif self.method.lower() == "none":
            f = None
        else:
            raise Exception("Invalid embed_communities embedding method")

        if f is not None:
            X_embedded = f(n_components=self.n_components).fit_transform(X_mid)
        else:
            X_embedded = X_mid

        # Community label of each thread, read from its root utterance.
        labels = [
            corpus.get_utterance(root).get("meta")[self.community_key]
            for root in roots
        ]
        # label_counts = Counter(labels)
        # Bucket L2-normalized embedding rows by community label.
        subs = defaultdict(list)
        for x, label in zip(X_embedded, labels):
            subs[label].append(x / np.linalg.norm(x))

        # NOTE: rebinds `labels` to the distinct community keys, aligned
        # with `subs`; each community point is the mean of its normalized rows.
        labels, subs = zip(*subs.items())
        pts = [np.mean(sub, axis=0) for sub in subs]

        retval = {"pts": pts, "labels": labels}
        corpus.add_meta("communityEmbedder", retval)

        return corpus
Esempio n. 14
0
    def test_corpus_dump(self):
        """Dumping a corpus to disk and reloading it should preserve every
        meta index table unchanged."""
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
            Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
            Utterance(id="2", text="this is a test",
                      speaker=Speaker(id="charlie")),
        ])

        corpus1.get_utterance("0").meta['foo'] = 'bar'
        corpus1.get_utterance("1").meta['foo'] = 'bar2'
        corpus1.get_utterance("2").meta['hey'] = 'jude'
        corpus1.get_conversation(None).meta['convo_meta'] = 1
        corpus1.get_speaker("alice").meta['surname'] = 1.0

        # round-trip through disk
        corpus1.dump('test_index_meta_corpus', base_path="./")
        corpus2 = Corpus(filename="test_index_meta_corpus")

        # every index table must survive the round trip unchanged
        for index_attr in ('utterances_index', 'speakers_index',
                           'conversations_index', 'overall_index'):
            self.assertEqual(getattr(corpus1.meta_index, index_attr),
                             getattr(corpus2.meta_index, index_attr))
    def _get_context_reply_label_dict(self,
                                      corpus: Corpus,
                                      convo_selector,
                                      utt_excluder,
                                      include_label=True):
        """
        Returns a dict mapping reply id to (context, reply, label).

        If self.forecast_mode == 'future': return a dict mapping the leaf utt id to the path from root utt to leaf utt

        :param corpus: corpus to extract dialogs from.
        :param convo_selector: selects which conversations to process.
        :param utt_excluder: predicate marking utterances to drop from dialogs.
        :param include_label: whether to attach labels via self.label_func.
        """
        # Build the list of dialogs (utterance sequences) to process,
        # depending on whether conversations are treated as trees or lines.
        dialogs = []
        if self.convo_structure == "branched":
            for convo in corpus.iter_conversations(convo_selector):
                try:
                    for path in convo.get_root_to_leaf_paths():
                        path = [utt for utt in path if not utt_excluder(utt)]
                        # a single-utterance path has no (context, reply) pair
                        if len(path) == 1: continue
                        dialogs.append(path)
                except ValueError as e:
                    # malformed conversation trees are skipped only if allowed
                    if not self.skip_broken_convos:
                        raise e

        elif self.convo_structure == "linear":
            for convo in corpus.iter_conversations(convo_selector):
                utts = convo.get_chronological_utterance_list(
                    selector=lambda x: not utt_excluder(x))
                if len(utts) == 1: continue
                dialogs.append(utts)

        id_to_context_reply_label = dict()

        # NOTE(review): in 'future' mode these raw-utterance entries can be
        # overwritten by the processed entries built in the loop below when
        # the same leaf id is produced there — confirm this is intended.
        if self.forecast_mode == 'future':
            for dialog in dialogs:
                id_to_context_reply_label[dialog[-1].id] = (dialog, dialog[-1],
                                                            None)

        for dialog in dialogs:
            if self.use_last_only:
                # only the final utterance of each dialog becomes a reply
                reply = self.text_func(dialog[-1])
                context = [self.text_func(utt) for utt in dialog[:-1]]
                label = self.label_func(dialog[-1]) if include_label else None
                id_to_context_reply_label[dialog[-1].id] = (context, reply,
                                                            label)
            else:
                # every utterance after the first becomes a reply, with all
                # preceding utterances in the dialog as its context
                for idx in range(1, len(dialog)):
                    reply = self.text_func(dialog[idx])
                    label = self.label_func(
                        dialog[idx]) if include_label else None
                    reply_id = dialog[idx].id
                    context = [self.text_func(utt) for utt in dialog[:idx]]
                    id_to_context_reply_label[reply_id] = (
                        context, reply,
                        label) if include_label else (context, reply, None)

        return id_to_context_reply_label
Esempio n. 16
0
    def test_key_insertion_deletion(self):
        """Inserting meta keys registers their value types in the index;
        deleting a key purges it from the index and from sibling objects."""
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
            Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
            Utterance(id="2", text="this is a test",
                      speaker=Speaker(id="charlie")),
        ])

        corpus1.get_utterance("0").meta['foo'] = 'bar'
        corpus1.get_utterance("1").meta['foo'] = 'bar2'
        corpus1.get_utterance("2").meta['hey'] = 'jude'
        corpus1.get_conversation(None).meta['convo_meta'] = 1
        corpus1.get_speaker("alice").meta['surname'] = 1.0

        # each insertion records str(type(value)) in the matching index
        expectations = [
            (corpus1.meta_index.utterances_index, 'foo', 'bar'),
            (corpus1.meta_index.conversations_index, 'convo_meta', 1),
            (corpus1.meta_index.speakers_index, 'surname', 1.0),
        ]
        for index, key, value in expectations:
            self.assertEqual(index[key], str(type(value)))

        # deleting a key from an utterance removes it from the index
        del corpus1.get_utterance("2").meta['hey']
        self.assertRaises(KeyError,
                          lambda: corpus1.meta_index.utterances_index['hey'])

        # deleting a key also removes it from the index and from all other
        # objects of the same type
        del corpus1.get_utterance("1").meta['foo']
        self.assertRaises(KeyError,
                          lambda: corpus1.meta_index.utterances_index['foo'])
        self.assertRaises(KeyError,
                          lambda: corpus1.get_utterance("0").meta["foo"])
Esempio n. 17
0
    def fit_transform(self, corpus: Corpus) -> Corpus:
        """
        :param corpus: the Corpus to use

        :return: Modifies and returns corpus with new meta key: "threadEmbedder",
             value: Dict, containing "X": an array with rows corresponding
             to embedded threads, "roots": an array whose ith entry is the
             thread root id of the ith row of X. If return_components is True,
             then the Dict contains a third key "components": the SVD components array

        :raises RuntimeError: if HyperConvo.fit_transform() has not been run
            on the corpus first.
        """
        # Materialize the conversations: iter_conversations() yields a
        # generator, and the previous peek with next() consumed its first
        # element, so the first conversation's stats were silently skipped
        # by the aggregation loop below.
        convos = list(corpus.iter_conversations())
        sample_convo_meta = next(iter(convos))
        # NOTE(review): this membership test is on the Conversation object
        # itself, not on its .meta — confirm it inspects metadata as the
        # error message implies.
        if "hyperconvo" not in sample_convo_meta:
            raise RuntimeError(
                "Missing thread statistics: HyperConvo.fit_transform() must be run on the Corpus first"
            )

        # Merge per-thread stats across all conversations.
        thread_stats = dict()
        for convo in convos:
            thread_stats.update(convo.meta["hyperconvo"])

        # Build the feature matrix: one row per thread, features in sorted
        # key order, with NaN/inf values zeroed out.
        X = []
        roots = []
        for root, feats in thread_stats.items():
            roots.append(root)
            row = np.array([
                v[1] if not (np.isnan(v[1]) or np.isinf(v[1])) else 0
                for v in sorted(feats.items())
            ])
            X.append(row)
        X = np.array(X)

        if self.norm_method.lower() == "standard":
            X = StandardScaler().fit_transform(X)
        elif self.norm_method.lower() == "none":
            pass
        else:
            raise Exception("Invalid embed_feats normalization method")

        if self.method.lower() == "svd":
            f = TruncatedSVD
        elif self.method.lower() == "tsne":
            f = TSNE
        else:
            raise Exception("Invalid embed_feats embedding method")

        emb = f(n_components=self.n_components)
        # NOTE(review): singular_values_ is a TruncatedSVD attribute; TSNE
        # does not expose it, so the "tsne" path presumably fails here —
        # confirm whether tsne is actually used.
        X_mid = emb.fit_transform(X) / emb.singular_values_

        retval = {"X": X_mid, "roots": roots}
        if self.return_components: retval["components"] = emb.components_

        corpus.add_meta("threadEmbedder", retval)
        return corpus
Esempio n. 18
0
 def transform(self, corpus: Corpus):
     """Annotates each user with politeness complexity, utterances-per-
     conversation, first/last-word participation, and raw utterance-count
     statistics, using 20 bootstrap samples of 25 utterances for the
     politeness estimate.

     :param corpus: the Corpus to transform; utterances must carry
         'politeness_strategies' metadata.
     :return: the corpus, with the four per-user meta fields added.
     """
     for character in corpus.get_usernames():
         user1 = corpus.get_user(character)
         utterances = user1.get_utterance_ids()
         utterances_per_conversation = []
         conversations = []
         for uid in utterances:
             utterance = corpus.get_utterance(uid)
             conversation = corpus.get_conversation(utterance.root)
             conversations.append(utterance.root)
             # (conversation id, #users in convo, #utterances in convo)
             utterances_per_conversation.append(
                 (utterance.root, len(conversation.get_usernames()),
                  len(conversation.get_utterance_ids())))
             # NOTE(review): first_last is reset to 0 on every iteration,
             # so after the loop it reflects only the final utterance —
             # confirm whether it was meant to accumulate across all
             # utterances (i.e. be initialized before the loop).
             first_last = 0
             if uid in (utterance.root,
                        list(conversation.get_utterance_ids())[-1]):
                 first_last += 1
         # user's share of all utterances in the corpus
         raw_count = len(utterances) / len(list(corpus.utterances.values()))
         total_conversations = len(set(conversations))
         #bootstrapping
         iterations = 0
         for i in range(20):
             # sample 25 utterance ids with replacement
             samples = random.choices(utterances, k=25)
             #for politeness complexity#
             politeness_rows = []
             #many operations#
             for uid in samples:
                 politeness_rows.append(
                     list(
                         corpus.get_utterance(
                             uid).meta["politeness_strategies"].values()))
         #politeness#
             # fraction of politeness strategies used at least once in
             # this bootstrap sample
             politeness_results = np.sum(politeness_rows, 0)
             politeness_results_count = len([
                 i / len(politeness_rows)
                 for i in politeness_results if i != 0.0
             ]) / len(politeness_rows)
             iterations += politeness_results_count
         #politness_final#
         politeness_final = iterations / 20
         #first/last#
         first_last_count = first_last / total_conversations
         #utterances_per_conversation#
         # weight each conversation's count by its utterances-per-user ratio
         utterances_per_conversations = Counter(utterances_per_conversation)
         upc_final = []
         for k, v in utterances_per_conversations.items():
             average = k[2] / k[1]
             upc_final.append(v / average)
         upc_count = sum(upc_final) / len(utterances_per_conversations)
         user1.add_meta('politeness_complexity', politeness_final)
         user1.add_meta('utterance_per_conversation', upc_count)
         user1.add_meta('first_last_word', first_last_count)
         user1.add_meta('raw_count', raw_count)
     return (corpus)
Esempio n. 19
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Adds the ARI score to the metadata table of each utterance in the corpus.

        :param corpus: the corpus to annotate.
        :return: corpus, modified with ARI and Flesch-Kincaid grade level scores assigned to each utterance
        """
        for utt_id in corpus.get_utterance_ids():
            # look up each utterance once instead of four times
            utt = corpus.get_utterance(utt_id)
            utt.meta['ARI'] = ARI(utt.text)
            utt.meta['Flesch-Kincaid'] = Flesch_Kincaid(utt.text)
        return corpus
Esempio n. 20
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Run the parent text-processing step, then optionally swap the
        cleaned text into each selected utterance's ``text`` field,
        dropping the intermediate metadata unless the original is kept."""
        super().transform(corpus)
        if not self.replace_text:
            return corpus

        is_target = lambda utt_: self.input_filter(utt_, None)
        for utt in corpus.iter_utterances(is_target):
            cleaned = utt.retrieve_meta(self.output_field)
            if self.save_original:
                # stash the raw text under the output field for recovery
                utt.add_meta(self.output_field, utt.text)
            utt.text = cleaned

        if not self.save_original:
            corpus.delete_metadata('utterance', self.output_field)
        return corpus
Esempio n. 21
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Run the parent text-processing step, then optionally replace each
        selected utterance's ``text`` with its cleaned version, removing the
        intermediate field unless the original is preserved."""
        super().transform(corpus)
        if self.replace_text:
            matches = lambda utt_: self.input_filter(utt_, None)
            for utt in corpus.iter_utterances(matches):
                new_text = utt.get_info(self.output_field)
                if self.save_original:
                    utt.set_info(self.output_field, utt.text)
                utt.text = new_text

            if not self.save_original:
                # del_info on one utterance deletes the field for all
                next(corpus.iter_utterances(matches)).del_info(
                    self.output_field)
        return corpus
    def transform(self, corpus: Corpus) -> Corpus:
        """
        Annotate corpus objects with pair information (label, pair_id, pair_orientation)
        :param corpus: target Corpus
        :return: annotated Corpus
        """
        pos_objs, neg_objs = self._get_pos_neg_objects(corpus)
        obj_pairs = self._pair_objs(pos_objs, neg_objs)
        orientations = self._assign_pair_orientations(obj_pairs)

        for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
            orientation = orientations[pair_id]
            for member, label in ((pos_obj, "pos"), (neg_obj, "neg")):
                member.add_meta(self.label_feat_name, label)
                member.add_meta(self.pair_id_feat_name, pair_id)
                member.add_meta(self.pair_orientation_feat_name, orientation)

        for obj in corpus.iter_objs(self.obj_type):
            # unlabelled objects include both objects that did not pass the selector
            # and objects that were not selected in the pairing step
            if self.label_feat_name not in obj.meta:
                obj.add_meta(self.label_feat_name, None)
                obj.add_meta(self.pair_id_feat_name, None)
                obj.add_meta(self.pair_orientation_feat_name, None)

        return corpus
    def fit(self,
            corpus: Corpus,
            text_func: Callable[[Utterance], List[str]] = None,
            selector: Callable[[Utterance], bool] = lambda utt: True):
        """
        Fits a model for each group of utterances in a corpus. The group that an
        utterance belongs to is determined by the `model_key_selector` parameter in
        the transformer's constructor.

        :param corpus: corpus to create models from.
        :param text_func: optional function to define how the text a model is trained
            on should be selected. Takes an utterance as input and returns a list of
            strings to train the model corresponding to that utterance on. The model
            corresponding to the utterance is determined by `self.model_key_selector`.
            For every utterance corresponding to the same model key, this function
            should return the same result.
            If `text_func` is `None`, a model will be trained on the text from all
            the utterances that belong to its group.
        :param selector: determines which utterances in the corpus to train models for.
        """
        self.model_groups = defaultdict(list)
        for utt in tqdm(corpus.iter_utterances(selector=selector),
                        desc='fit1'):
            key = self.model_key_selector(utt)
            if text_func:
                # text_func yields the full training text for the group, so
                # only the first utterance seen per key needs evaluating
                if key not in self.model_groups:
                    self.model_groups[key] = text_func(utt)
            else:
                self.model_groups[key].append(utt.text)
        for key in tqdm(self.model_groups, desc='fit2'):
            if not text_func:
                # collapse the group's utterances into a single document
                self.model_groups[key] = [' '.join(self.model_groups[key])]
            # tokenize each of the group's documents
            self.model_groups[key] = list(
                map(self.tokenizer, self.model_groups[key]))
        return self
Esempio n. 24
0
    def transform(self, corpus: Corpus, selector: Optional[Callable[[Utterance], bool]] = lambda utt: True,
                  markers: bool = False):
        """
        Extract politeness strategies from each utterance in the corpus and
        annotate the utterances with the extracted strategies. Requires that
        the corpus has previously been transformed by a Parser, such that each
        utterance has dependency parse info in its metadata table.

        :param corpus: the corpus to compute features for.
        :param selector: a (lambda) function that takes an Utterance and returns a bool indicating whether the utterance should be included in this annotation step.
        :param markers: whether or not to add politeness occurrence markers
        :return: the annotated corpus.
        """
        for utt in corpus.iter_utterances():
            if not selector(utt):
                # Excluded utterances still receive (null) entries so the
                # metadata keys exist uniformly across the corpus.
                utt.meta[self.ATTR_NAME] = None
                utt.meta[self.MRKR_NAME] = None
                continue
            # Normalize parse tokens in place: lowercase and strip every
            # character except letters and the punctuation the strategy
            # extractor recognizes.
            for sent in utt.meta["parsed"]:
                for tok in sent["toks"]:
                    tok["tok"] = re.sub("[^a-z,.:;]", "", tok["tok"].lower())
            strategies, marks = get_politeness_strategy_features(utt)
            utt.meta[self.ATTR_NAME] = strategies
            if markers:
                utt.meta[self.MRKR_NAME] = marks

        return corpus
Esempio n. 25
0
    def transform(
            self,
            corpus: Corpus,
            y=None,
            selector: Callable[[CorpusObject],
                               bool] = lambda obj: True) -> Corpus:
        """
        Annotate corpus objects with scores and rankings.

        :param corpus: target corpus
        :param y: unused; accepted for sklearn-style API compatibility
        :param selector: (lambda) function taking in a Corpus object and returning True / False; selects for Corpus objects to annotate.
        :return: annotated corpus
        """
        obj_iters = {
            "conversation": corpus.iter_conversations,
            "user": corpus.iter_users,
            "utterance": corpus.iter_utterances
        }
        # Score the selected objects, then sort descending so that row order
        # is rank order.
        obj_scores = [(obj.id, self.score_func(obj))
                      for obj in obj_iters[self.obj_type](selector)]
        df = pd.DataFrame(obj_scores, columns=["id", self.score_feat_name]) \
            .set_index('id').sort_values(self.score_feat_name, ascending=False)
        # Ranks are 1-based positions in the sorted order.
        df[self.rank_feat_name] = range(1, len(df) + 1)

        for obj in corpus.iter_objs(obj_type=self.obj_type):
            if obj.id in df.index:
                # Single-step .loc lookup avoids pandas chained indexing.
                obj.add_meta(self.score_feat_name,
                             df.loc[obj.id, self.score_feat_name])
                obj.add_meta(self.rank_feat_name,
                             df.loc[obj.id, self.rank_feat_name])
            else:
                # Objects that were not selected for scoring get null
                # annotations so the metadata keys exist everywhere.
                obj.add_meta(self.score_feat_name, None)
                obj.add_meta(self.rank_feat_name, None)
        return corpus
    def retrieve_feats(corpus: Corpus, prefix_len: int=10,
                       min_thread_len: int=10,
                       include_root: bool=True) -> Dict[Hashable, Dict]:
        """
        Retrieve all hypergraph features for a given corpus (viewed as a set
        of conversation threads).

        See init() for further documentation.

        :return: A dictionary from a thread root id to its stats dictionary,
            which is a dictionary from feature names to feature values. For degree-related
            features specifically.
        """

        threads_stats = dict()
        threads = corpus.utterance_threads(prefix_len=prefix_len,
                                           include_root=include_root)
        for root, thread in threads.items():
            # Threads below the minimum length are skipped entirely.
            if len(thread) < min_thread_len:
                continue
            full_graph = HyperConvo._make_hypergraph(uts=thread)
            # A second hypergraph without the root isolates mid-thread
            # dynamics from root-driven structure.
            mid_graph = HyperConvo._make_hypergraph(uts=thread,
                                                    exclude_id=root)
            stats = {}
            stats.update(HyperConvo._degree_feats(G=full_graph))
            stats.update(HyperConvo._motif_feats(G=full_graph))
            stats.update(HyperConvo._degree_feats(G=mid_graph,
                                                  name_ext="mid-thread "))
            stats.update(HyperConvo._motif_feats(G=mid_graph,
                                                 name_ext=" over mid-thread"))
            threads_stats[root] = stats
        return threads_stats
Esempio n. 27
0
def transform(self, corpus: Corpus):
    """Adds keyword-reference metadata to each utterance.

    For every utterance whose text is not None, annotates:
    - "num_pol_refs": how many tokens match `self.key_words`
    - "num_pol_refs_incidence": that count divided by the token count
      (0 for empty token lists)
    - "pol_words": the list of matching tokens, in order of occurrence

    Utterances with text of None are left unannotated.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    """
    for conv_id in corpus.conversations:
        conv = corpus.get_conversation(conv_id)
        for utt in conv.iter_utterances():
            # Identity comparison with None instead of `!= None`.
            if utt.text is None:
                continue
            tokenized = word_tokenize(utt.text.lower())
            # Collect every token that appears in the keyword set.
            pol_words = [tok for tok in tokenized if tok in self.key_words]
            invocations = len(pol_words)
            utt.meta["num_pol_refs"] = invocations
            # Guard against division by zero on empty token lists.
            utt.meta["num_pol_refs_incidence"] = (
                invocations / len(tokenized) if tokenized else 0)
            utt.meta["pol_words"] = pol_words
    return corpus
    def _preprocess_utterances(self, corpus: Corpus) -> Tuple[List[Hashable], List[Dict]]:
        """Convert each Utterance in the given Corpus into the representation expected
        by the politeness API. Assumes that the Corpus has already been parsed, so that
        each Utterance contains the `parsed` metadata entry.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        :return: a pair (utterance ids, documents); ids record processing
            order so results can be joined back onto the corpus.
        """

        utt_ids = []
        documents = []
        for idx, utt in enumerate(corpus.iter_utterances()):
            # Periodic progress output when verbosity is enabled.
            if self.verbose and idx > 0 and (idx % self.verbose) == 0:
                print("\t%03d" % idx)
            utt_ids.append(utt.id)
            doc = {"text": utt.text, "sentences": [], "parses": []}
            # The politeness API operates sentence-by-sentence.
            for sent in utt.meta["parsed"].sents:
                doc["sentences"].append(sent.text)
                offset = sent.start
                # Punctuation tokens are dropped: the politeness API does not
                # know how to handle them in dependency parses. Token indices
                # are rebased to be 1-indexed within the sentence.
                doc["parses"].append([
                    "%s(%s-%d, %s-%d)" % (tok.dep_, tok.head.text,
                                          tok.head.i + 1 - offset,
                                          tok.text, tok.i + 1 - offset)
                    for tok in sent if tok.dep_ != "punct"
                ])
            doc["unigrams"], doc["bigrams"] = get_unigrams_and_bigrams(doc)
            documents.append(doc)
        if self.verbose:
            print("Done!")
        return utt_ids, documents
Esempio n. 29
0
    def transform(
            self,
            corpus: Corpus,
            selector: Callable[[Utterance], bool] = lambda x: True) -> Corpus:
        """
        Annotates the corpus utterances with the lists of fighting words that the utterance contains.

        The relevant fighting words to use are specified by FightingWords.top_k or FightingWords.threshold,
            with FightingWords.annot_method indicating which criterion to use.

        Lists are stored under metadata keys 'fighting_words_class1', 'fighting_words_class2'

        :param corpus: corpus to annotate
        :param selector: a (lambda) function that takes an Utterance and returns True/False; this selects for utterances
            that should be annotated with the fighting words

        :return: annotated corpus
        """
        # Pick the relevant ngram lists according to the configured criterion.
        if self.annot_method == "top_k":
            class1_ngrams, class2_ngrams = self.get_top_k_ngrams()
        else:
            class1_ngrams, class2_ngrams = self.get_ngrams_past_threshold()

        # improve the efficiency of this; tricky because ngrams #TODO
        for utt in corpus.iter_utterances():
            if not selector(utt):
                # Excluded utterances get null entries so the keys exist
                # uniformly across the corpus.
                utt.meta['fighting_words_class1'] = None
                utt.meta['fighting_words_class2'] = None
                continue
            # Substring containment check against the raw utterance text.
            utt.meta['fighting_words_class1'] = [
                ng for ng in class1_ngrams if ng in utt.text
            ]
            utt.meta['fighting_words_class2'] = [
                ng for ng in class2_ngrams if ng in utt.text
            ]

        return corpus
    def retrieve_feats(self, corpus: Corpus) -> Dict[str, Dict]:
        """
        Retrieve all hypergraph features for a given corpus (viewed as a set
        of conversation threads).

        See init() for further documentation.

        :return: A dictionary from a thread root id to its stats dictionary,
            which is a dictionary from feature names to feature values. For degree-related
            features specifically.
        """

        threads_stats = dict()

        for convo in corpus.iter_conversations():
            chrono_utts = convo.get_chronological_utterance_list()
            # Conversations below the minimum length yield no features.
            if len(chrono_utts) < self.min_thread_len:
                continue
            prefix = chrono_utts[:self.prefix_len]
            full_graph = Hypergraph.init_from_utterances(utterances=prefix)
            # A second hypergraph without the root utterance isolates
            # mid-thread dynamics.
            mid_graph = Hypergraph.init_from_utterances(
                utterances=prefix[1:])  # exclude root
            stats = {}
            stats.update(HyperConvo._degree_feats(graph=full_graph))
            stats.update(HyperConvo._motif_feats(graph=full_graph))
            stats.update(HyperConvo._degree_feats(graph=mid_graph,
                                                  name_ext="mid-thread "))
            stats.update(HyperConvo._motif_feats(graph=mid_graph,
                                                 name_ext=" over mid-thread"))
            threads_stats[convo.id] = stats
        return threads_stats