def transform(self, corpus: Corpus):
    '''
    Compiles a list of all utterances by each user, organized by
    conversation; also annotates each user with summary statistics.

    :param corpus: the Corpus to transform.
    :type corpus: Corpus
    '''
    # group (utterance id, timestamp) pairs per user, per conversation,
    # keeping only utterances that pass the configured filter
    per_user = defaultdict(lambda: defaultdict(list))
    for utt in corpus.iter_utterances():
        if not self.utterance_filter(utt):
            continue
        per_user[utt.user.name][utt.root].append((utt.id, utt.timestamp))

    # per-conversation records: chronologically ordered utterance ids,
    # conversation start time, and utterance count
    for name, convo_map in per_user.items():
        convo_records = {}
        for convo_id, entries in convo_map.items():
            entries.sort(key=lambda pair: pair[1])
            convo_records[convo_id] = {
                'utterance_ids': [uid for uid, _ in entries],
                'start_time': entries[0][1],
                'n_utterances': len(entries),
            }
        corpus.get_user(name).add_meta('conversations', convo_records)

    # user-level summaries: conversation count, first activity time, and
    # each conversation's chronological rank ('idx')
    for user in corpus.iter_users():
        if 'conversations' not in user.meta:
            continue
        convos = user.meta['conversations']
        user.add_meta('n_convos', len(convos))
        by_start = sorted(convos.items(), key=lambda kv: kv[1]['start_time'])
        user.add_meta('start_time', by_start[0][1]['start_time'])
        for rank, (convo_id, _) in enumerate(by_start):
            convos[convo_id]['idx'] = rank
    return corpus
def test_corpus_dump(self):
    '''
    Dumping a corpus to disk and re-loading it should preserve every
    metadata index (utterance, user, conversation, and overall).
    '''
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", user=User(name="alice")),
        Utterance(id="1", text="my name is bob", user=User(name="bob")),
        Utterance(id="2", text="this is a test", user=User(name="charlie")),
    ])
    # attach metadata at every level so each index gets populated
    corpus1.get_utterance("0").meta['foo'] = 'bar'
    corpus1.get_utterance("1").meta['foo'] = 'bar2'
    corpus1.get_utterance("2").meta['hey'] = 'jude'
    corpus1.get_conversation(None).meta['convo_meta'] = 1
    corpus1.get_user("alice").meta['surname'] = 1.0

    # round-trip through disk
    corpus1.dump('test_index_meta_corpus', base_path="./")
    corpus2 = Corpus(filename="test_index_meta_corpus")

    # every index must survive the round trip unchanged
    for attr in ('utterances_index', 'users_index',
                 'conversations_index', 'overall_index'):
        self.assertEqual(getattr(corpus1.meta_index, attr),
                         getattr(corpus2.meta_index, attr))
def test_key_insertion_deletion(self):
    '''
    Inserting metadata keys should register their value types in the
    corpus meta index; deleting a key should drop it from the index and
    from every other object of the same type.
    '''
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", user=User(name="alice")),
        Utterance(id="1", text="my name is bob", user=User(name="bob")),
        Utterance(id="2", text="this is a test", user=User(name="charlie")),
    ])
    corpus1.get_utterance("0").meta['foo'] = 'bar'
    corpus1.get_utterance("1").meta['foo'] = 'bar2'
    corpus1.get_utterance("2").meta['hey'] = 'jude'
    corpus1.get_conversation(None).meta['convo_meta'] = 1
    corpus1.get_user("alice").meta['surname'] = 1.0

    # insertion records the stringified value type per key
    index = corpus1.meta_index
    self.assertEqual(index.utterances_index['foo'], str(type('bar')))
    self.assertEqual(index.conversations_index['convo_meta'], str(type(1)))
    self.assertEqual(index.users_index['surname'], str(type(1.0)))

    # deleting a key from an utterance removes it from the index
    del corpus1.get_utterance("2").meta['hey']
    with self.assertRaises(KeyError):
        index.utterances_index['hey']

    # deletion also purges the key from the index and from all other
    # objects of the same type
    del corpus1.get_utterance("1").meta['foo']
    with self.assertRaises(KeyError):
        index.utterances_index['foo']
    with self.assertRaises(KeyError):
        corpus1.get_utterance("0").meta["foo"]
def transform(self, corpus: Corpus):
    '''
    Annotates every user in the corpus with four engagement statistics:
    bootstrapped politeness-strategy coverage ('politeness_complexity'),
    a normalized utterances-per-conversation score
    ('utterance_per_conversation'), the fraction of their conversations
    they opened or closed ('first_last_word'), and their share of all
    corpus utterances ('raw_count').

    :param corpus: the Corpus to transform; utterances are expected to
        carry a "politeness_strategies" metadata entry.
    :type corpus: Corpus
    '''
    for character in corpus.get_usernames():
        user1 = corpus.get_user(character)
        utterances = user1.get_utterance_ids()
        utterances_per_conversation = []
        conversations = []
        # BUG FIX: this counter was previously reset to 0 inside the loop
        # below, so only the user's final utterance could ever be counted;
        # it must accumulate across all of the user's utterances.
        first_last = 0
        for uid in utterances:
            utterance = corpus.get_utterance(uid)
            conversation = corpus.get_conversation(utterance.root)
            conversations.append(utterance.root)
            utterances_per_conversation.append(
                (utterance.root,
                 len(conversation.get_usernames()),
                 len(conversation.get_utterance_ids())))
            # count utterances that open (id == root) or close
            # (last id in the conversation) their conversation
            if uid in (utterance.root,
                       list(conversation.get_utterance_ids())[-1]):
                first_last += 1
        raw_count = len(utterances) / len(list(corpus.utterances.values()))
        total_conversations = len(set(conversations))
        # bootstrapping: average politeness-strategy coverage over 20
        # random samples of 25 utterances each
        iterations = 0
        for i in range(20):
            samples = random.choices(utterances, k=25)
            politeness_rows = []
            for uid in samples:
                politeness_rows.append(list(
                    corpus.get_utterance(
                        uid).meta["politeness_strategies"].values()))
            # fraction of politeness strategies with a nonzero total
            politeness_results = np.sum(politeness_rows, 0)
            politeness_results_count = len([
                i / len(politeness_rows)
                for i in politeness_results if i != 0.0
            ]) / len(politeness_rows)
            iterations += politeness_results_count
        politeness_final = iterations / 20
        first_last_count = first_last / total_conversations
        # weight each conversation's utterance count by the average
        # utterances-per-participant in that conversation
        utterances_per_conversations = Counter(utterances_per_conversation)
        upc_final = []
        for k, v in utterances_per_conversations.items():
            average = k[2] / k[1]
            upc_final.append(v / average)
        upc_count = sum(upc_final) / len(utterances_per_conversations)
        user1.add_meta('politeness_complexity', politeness_final)
        user1.add_meta('utterance_per_conversation', upc_count)
        user1.add_meta('first_last_word', first_last_count)
        user1.add_meta('raw_count', raw_count)
    return corpus
def transform(self, corpus: Corpus):
    '''
    computes diversity measures for each utterance in the corpus.

    :param corpus: the Corpus to compute features for.
    :type corpus: Corpus
    '''
    # Each stage of a user's lifetime is split into a reference half
    # (even convo indices) and a comparison half (odd indices); the
    # reference pool draws sample_size words per reference conversation.
    convos_per_split = self.stage_size // 2
    ref_sample_size = self.sample_size * convos_per_split
    if self.verbosity is not None:
        print('preparing corpus')
    # collapse each user-conversation's token list into one string
    join_tokens = UserConvoAttrs(attr_name='tokens',
                                 agg_fn=lambda x: ' '.join(x))
    corpus = join_tokens.fit_transform(corpus)
    text_df = get_user_convo_attribute_table(corpus,
                                             ['tokens', 'n_utterances'],
                                             min_n_convos=self.stage_size,
                                             max_convo_idx=self.max_exp)
    # bucket conversations into lifetime stages of stage_size convos each
    text_df['stage_idx'] = (text_df.convo_idx // self.stage_size).map(int)
    # one row per user: start time and total conversation count
    user_df = text_df.drop_duplicates('user')\
        .set_index('user')[['user_start_time', 'user_n_convos']]
    text_df['tokenized'] = text_df.tokens.apply(
        lambda x: x.lower().split())
    text_df['wordcount'] = text_df.tokenized.apply(lambda x: len(x))
    # reference pool: even-indexed conversations aggregated per
    # (user, stage); drop pools too small to sample ref_sample_size words
    ref_groups = text_df[text_df.convo_idx % 2 == 0].groupby(
        ['user', 'stage_idx'])
    ref_df = ref_groups.tokenized.agg(self._chain_tokens).to_frame()\
        .join(ref_groups.wordcount.agg(sum))\
        .reset_index().join(user_df, on='user')
    ref_df = ref_df[ref_df.wordcount >= ref_sample_size]
    # comparison set: odd-indexed conversations that are long enough
    # (in utterances and words) from sufficiently active users
    cmp_df = text_df[(text_df.convo_idx % 2 == 1)
                     & (text_df.n_utterances >= self.min_convo_len)
                     & (text_df.wordcount >= self.sample_size)
                     & (text_df.user_n_convos >= self.max_exp)]
    if self.test:
        # test mode: cap the workload
        cmp_df = cmp_df.head(1000)
    if self.verbosity is not None:
        print('computing diversities')
    for idx, (user_convo_id, row) in enumerate(cmp_df.iterrows()):
        if (self.verbosity is not None) \
                and (idx % self.verbosity == 0) and (idx > 0):
            print(idx, '/', len(cmp_df))
        # n_iters bootstrap word samples from this comparison conversation
        cmp_samples = np.random.choice(row.tokenized,
                                       (self.n_iters, self.sample_size))
        # self-reference: this user's own reference pool at the same stage
        self_cmp = ref_df[(ref_df.user == row.user)
                          & (ref_df.stage_idx == row.stage_idx)]
        if len(self_cmp) > 0:
            self_samples = np.random.choice(
                self_cmp.tokenized.values[0],
                (self.n_iters, ref_sample_size))
        else:
            # no usable self pool: empty samples stand in
            self_samples = [[]] * self.n_iters
        # other-reference: other users at the same stage who have enough
        # conversations to have actually reached this stage
        other_cmp = ref_df[(ref_df.user != row.user)
                           & (ref_df.stage_idx == row.stage_idx)
                           & (ref_df.user_n_convos >=
                              (row.stage_idx + 1) * self.stage_size)]
        if self.cohort_delta is not None:
            # restrict "others" to users whose start time lies within
            # cohort_delta of this user's start time
            other_cmp = other_cmp[other_cmp.user_start_time.between(
                row.user_start_time - self.cohort_delta,
                row.user_start_time + self.cohort_delta)]
        if len(other_cmp) > 0:
            # one word sample per bootstrap iteration, drawn from
            # n_iters reference pools resampled with replacement
            other_samples = [
                np.random.choice(tokens, ref_sample_size)
                for tokens in other_cmp.tokenized.sample(self.n_iters,
                                                         replace=True)
            ]
        else:
            other_samples = [[]] * self.n_iters
        # write each divergence statistic into the user's
        # per-conversation metadata for this conversation
        for k, v in self._compute_divergences(cmp_samples, self_samples,
                                              other_samples).items():
            corpus.get_user(
                row.user).meta['conversations'][row.convo_id][k] = v
    return corpus