# assumes: from collections import defaultdict
def transform(self, corpus: Corpus):
        '''
            Compiles a list of all utterances by each user, organized by
            conversation; also annotates each user with summary statistics.

            :param corpus: the Corpus to transform.
            :type corpus: Corpus
        '''

        user_to_convo_utts = defaultdict(lambda: defaultdict(list))
        for utterance in corpus.iter_utterances():
            if not self.utterance_filter(utterance):
                continue
            user_to_convo_utts[utterance.user.name][utterance.root].append(
                (utterance.id, utterance.timestamp))
        for user, convo_utts in user_to_convo_utts.items():
            user_convos = {}
            for convo, utts in convo_utts.items():
                sorted_utts = sorted(utts, key=lambda x: x[1])
                user_convos[convo] = {
                    'utterance_ids': [x[0] for x in sorted_utts],
                    'start_time': sorted_utts[0][1],
                    'n_utterances': len(sorted_utts)
                }
            corpus.get_user(user).add_meta('conversations', user_convos)

        for user in corpus.iter_users():
            if 'conversations' not in user.meta:
                continue
            user.add_meta('n_convos', len(user.meta['conversations']))

            sorted_convos = sorted(user.meta['conversations'].items(),
                                   key=lambda x: x[1]['start_time'])
            user.add_meta('start_time', sorted_convos[0][1]['start_time'])
            for idx, (convo_id, _) in enumerate(sorted_convos):
                user.meta['conversations'][convo_id]['idx'] = idx
        return corpus
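
A minimal usage sketch for the transform above, assuming it is a method of a
ConvoKit-style Transformer (the class name UserConvoHistory is hypothetical):

# Hypothetical usage sketch: `UserConvoHistory` stands in for whatever class
# defines the transform above; `corpus` is an already-loaded convokit Corpus.
history = UserConvoHistory()
corpus = history.transform(corpus)

# each user that passed the filter now carries per-conversation metadata
for user in corpus.iter_users():
    if 'conversations' in user.meta:
        print(user.name, user.meta['n_convos'], user.meta['start_time'])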
Code example #2
    def test_corpus_dump(self):
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", user=User(name="alice")),
            Utterance(id="1", text="my name is bob", user=User(name="bob")),
            Utterance(id="2", text="this is a test", user=User(
                name="charlie")),
        ])

        corpus1.get_utterance("0").meta['foo'] = 'bar'
        corpus1.get_utterance("1").meta['foo'] = 'bar2'
        corpus1.get_utterance("2").meta['hey'] = 'jude'

        corpus1.get_conversation(None).meta['convo_meta'] = 1

        corpus1.get_user("alice").meta['surname'] = 1.0
        corpus1.dump('test_index_meta_corpus', base_path="./")
        corpus2 = Corpus(filename="test_index_meta_corpus")

        self.assertEqual(corpus1.meta_index.utterances_index,
                         corpus2.meta_index.utterances_index)
        self.assertEqual(corpus1.meta_index.users_index,
                         corpus2.meta_index.users_index)
        self.assertEqual(corpus1.meta_index.conversations_index,
                         corpus2.meta_index.conversations_index)
        self.assertEqual(corpus1.meta_index.overall_index,
                         corpus2.meta_index.overall_index)
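
The round trip these assertions exercise can be sketched more compactly
(assuming the same convokit Corpus API; the dump directory name is arbitrary):

# Sketch of the dump/load round trip tested above.
corpus1.dump('my_corpus', base_path="./")
corpus2 = Corpus(filename="my_corpus")
assert corpus1.meta_index.utterances_index == corpus2.meta_index.utterances_index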
Code example #3
    def test_key_insertion_deletion(self):
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", user=User(name="alice")),
            Utterance(id="1", text="my name is bob", user=User(name="bob")),
            Utterance(id="2", text="this is a test", user=User(
                name="charlie")),
        ])

        corpus1.get_utterance("0").meta['foo'] = 'bar'
        corpus1.get_utterance("1").meta['foo'] = 'bar2'
        corpus1.get_utterance("2").meta['hey'] = 'jude'

        corpus1.get_conversation(None).meta['convo_meta'] = 1

        corpus1.get_user("alice").meta['surname'] = 1.0

        self.assertEqual(corpus1.meta_index.utterances_index['foo'],
                         str(type('bar')))
        self.assertEqual(corpus1.meta_index.conversations_index['convo_meta'],
                         str(type(1)))
        self.assertEqual(corpus1.meta_index.users_index['surname'],
                         str(type(1.0)))

        # test that deleting a key from an utterance removes it from the index
        del corpus1.get_utterance("2").meta['hey']
        self.assertRaises(KeyError,
                          lambda: corpus1.meta_index.utterances_index['hey'])

        # test that deleting a key from an utterance removes it from the index and from all other objects of same type
        del corpus1.get_utterance("1").meta['foo']
        self.assertRaises(KeyError,
                          lambda: corpus1.meta_index.utterances_index['foo'])
        self.assertRaises(KeyError,
                          lambda: corpus1.get_utterance("0").meta["foo"])
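
A condensed sketch of the index bookkeeping these assertions exercise (same
convokit imports as above); the index records each metadata key's value type:

corpus = Corpus(utterances=[
    Utterance(id="0", text="hi", user=User(name="alice")),
])
corpus.get_utterance("0").meta['foo'] = 'bar'
print(corpus.meta_index.utterances_index['foo'])  # "<class 'str'>"
del corpus.get_utterance("0").meta['foo']  # 'foo' is dropped from the index too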
Code example #4
 # assumes: import random; import numpy as np; from collections import Counter
 def transform(self, corpus: Corpus):
     for character in corpus.get_usernames():
         user1 = corpus.get_user(character)
         utterances = user1.get_utterance_ids()
         utterances_per_conversation = []
         conversations = []
         first_last = 0  # utterances that open or close their conversation
         for uid in utterances:
             utterance = corpus.get_utterance(uid)
             conversation = corpus.get_conversation(utterance.root)
             conversations.append(utterance.root)
             utterances_per_conversation.append(
                 (utterance.root, len(conversation.get_usernames()),
                  len(conversation.get_utterance_ids())))
             # the root is the first utterance; compare against the last id too
             if uid in (utterance.root,
                        list(conversation.get_utterance_ids())[-1]):
                 first_last += 1
         # the user's share of all utterances in the corpus
         raw_count = len(utterances) / len(corpus.utterances)
         total_conversations = len(set(conversations))
         # bootstrap the politeness-complexity estimate over 20 resamples
         politeness_sum = 0
         for i in range(20):
             samples = random.choices(utterances, k=25)
             # gather the politeness-strategy vectors of the sampled utterances
             politeness_rows = []
             for uid in samples:
                 politeness_rows.append(
                     list(
                         corpus.get_utterance(
                             uid).meta["politeness_strategies"].values()))
             # fraction of strategies used at least once, normalized by sample size
             politeness_results = np.sum(politeness_rows, 0)
             politeness_results_count = len([
                 i / len(politeness_rows)
                 for i in politeness_results if i != 0.0
             ]) / len(politeness_rows)
             politeness_sum += politeness_results_count
         # average over the bootstrap iterations
         politeness_final = politeness_sum / 20
         # fraction of the user's conversations they open or close
         first_last_count = first_last / total_conversations
         # utterances per conversation, normalized by the conversation's
         # average utterances per speaker
         utterances_per_conversations = Counter(utterances_per_conversation)
         upc_final = []
         for k, v in utterances_per_conversations.items():
             average = k[2] / k[1]  # total utterances / number of speakers
             upc_final.append(v / average)
         upc_count = sum(upc_final) / len(utterances_per_conversations)
         user1.add_meta('politeness_complexity', politeness_final)
         user1.add_meta('utterance_per_conversation', upc_count)
         user1.add_meta('first_last_word', first_last_count)
         user1.add_meta('raw_count', raw_count)
     return corpus
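
As with the transform above, usage is a single call; the class name here is
hypothetical, and the corpus is assumed to already carry
'politeness_strategies' utterance metadata (e.g. from convokit's
PolitenessStrategies transformer):

# Hypothetical usage sketch: `UserActivityStats` stands in for the class
# defining the transform above.
corpus = UserActivityStats().transform(corpus)
for name in corpus.get_usernames():
    user = corpus.get_user(name)
    print(name, user.meta['raw_count'], user.meta['politeness_complexity'])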
Code example #5
    # assumes: import numpy as np; UserConvoAttrs and
    # get_user_convo_attribute_table from the surrounding module
    def transform(self, corpus: Corpus):
        '''
            Computes diversity measures for each user's conversations in the
            corpus, writing them into the user's 'conversations' metadata.

            :param corpus: the Corpus to compute features for.
            :type corpus: Corpus
        '''

        convos_per_split = self.stage_size // 2
        ref_sample_size = self.sample_size * convos_per_split

        if self.verbosity is not None:
            print('preparing corpus')
        join_tokens = UserConvoAttrs(attr_name='tokens',
                                     agg_fn=lambda x: ' '.join(x))
        corpus = join_tokens.fit_transform(corpus)
        text_df = get_user_convo_attribute_table(corpus,
                                                 ['tokens', 'n_utterances'],
                                                 min_n_convos=self.stage_size,
                                                 max_convo_idx=self.max_exp)
        text_df['stage_idx'] = (text_df.convo_idx // self.stage_size).map(int)

        user_df = text_df.drop_duplicates('user')\
            .set_index('user')[['user_start_time', 'user_n_convos']]

        text_df['tokenized'] = text_df.tokens.apply(
            lambda x: x.lower().split())
        text_df['wordcount'] = text_df.tokenized.apply(lambda x: len(x))

        ref_groups = text_df[text_df.convo_idx % 2 == 0].groupby(
            ['user', 'stage_idx'])
        ref_df = ref_groups.tokenized.agg(self._chain_tokens).to_frame()\
            .join(ref_groups.wordcount.agg(sum))\
            .reset_index().join(user_df, on='user')
        ref_df = ref_df[ref_df.wordcount >= ref_sample_size]

        cmp_df = text_df[(text_df.convo_idx % 2 == 1)
                         & (text_df.n_utterances >= self.min_convo_len)
                         & (text_df.wordcount >= self.sample_size)
                         & (text_df.user_n_convos >= self.max_exp)]

        if self.test:
            cmp_df = cmp_df.head(1000)
        if self.verbosity is not None:
            print('computing diversities')
        for idx, (user_convo_id, row) in enumerate(cmp_df.iterrows()):
            if (self.verbosity is not None) \
                    and (idx % self.verbosity == 0) and (idx > 0):
                print(idx, '/', len(cmp_df))

            cmp_samples = np.random.choice(row.tokenized,
                                           (self.n_iters, self.sample_size))
            self_cmp = ref_df[(ref_df.user == row.user)
                              & (ref_df.stage_idx == row.stage_idx)]
            if len(self_cmp) > 0:
                self_samples = np.random.choice(
                    self_cmp.tokenized.values[0],
                    (self.n_iters, ref_sample_size))
            else:
                self_samples = [[]] * self.n_iters

            other_cmp = ref_df[(ref_df.user != row.user)
                               & (ref_df.stage_idx == row.stage_idx)
                               & (ref_df.user_n_convos >=
                                  (row.stage_idx + 1) * self.stage_size)]
            if self.cohort_delta is not None:
                other_cmp = other_cmp[other_cmp.user_start_time.between(
                    row.user_start_time - self.cohort_delta,
                    row.user_start_time + self.cohort_delta)]
            if len(other_cmp) > 0:
                other_samples = [
                    np.random.choice(tokens, ref_sample_size)
                    for tokens in other_cmp.tokenized.sample(self.n_iters,
                                                             replace=True)
                ]
            else:
                other_samples = [[]] * self.n_iters

            for k, v in self._compute_divergences(cmp_samples, self_samples,
                                                  other_samples).items():
                corpus.get_user(
                    row.user).meta['conversations'][row.convo_id][k] = v
        return corpus
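
A hedged usage sketch: the class name is hypothetical and the constructor
arguments simply mirror the attributes the transform reads (self.stage_size,
self.sample_size, self.n_iters, ...), so the real signature may differ:

# Hypothetical usage sketch for the diversity transform above.
div = UserConvoDiversity(stage_size=20, sample_size=200, n_iters=50,
                         min_convo_len=1, max_exp=120, cohort_delta=None,
                         verbosity=100, test=False)
corpus = div.transform(corpus)
# results land in each user's per-conversation metadata, under whatever
# keys self._compute_divergences produces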