def test_partial_load_start_idx_specified_only(self):
    speaker_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    speaker_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  speaker=Speaker(id="alice",
                                  meta={'speaker_binary_data': speaker_byte_arr1}),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  speaker=Speaker(id="bob",
                                  meta={'speaker_binary_data': speaker_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])
    corpus1.dump('test_corpus', './')

    # Reload the dumped corpus, skipping utterances before index 1
    corpus2 = Corpus(filename="test_corpus", utterance_start_index=1)

    self.assertEqual(len(list(corpus2.iter_utterances())), 2)
    self.assertEqual(corpus1.get_utterance("1"), corpus2.get_utterance("1"))
    self.assertEqual(corpus1.get_utterance("2"), corpus2.get_utterance("2"))
def test_dump_and_load_with_binary(self):
    """
    Dump a corpus containing speakers with binary metadata and utterances
    with binary metadata.
    Check that the dumped corpus is successfully loaded with the same data.
    """
    speaker_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    speaker_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  speaker=Speaker(id="alice", meta={
                      'speaker_binary_data': speaker_byte_arr1,
                      'index': 99
                  }),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  speaker=Speaker(id="bob",
                                  meta={'speaker_binary_data': speaker_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])

    alice = corpus1.get_speaker("alice")
    bob = corpus1.get_speaker("bob")

    corpus1.dump('test_corpus', './')
    corpus2 = Corpus(filename="test_corpus")

    alice2 = corpus2.get_speaker("alice")
    bob2 = corpus2.get_speaker("bob")

    self.assertEqual(alice.meta, alice2.meta)
    self.assertEqual(corpus1.get_utterance('0').meta,
                     corpus2.get_utterance('0').meta)
    self.assertEqual(bob.meta, bob2.meta)
    self.assertEqual(corpus1.get_utterance('1').meta,
                     corpus2.get_utterance('1').meta)
from typing import List

from pandas import DataFrame
from tqdm import tqdm

from convokit import Corpus, Speaker, Utterance


def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """
    Helper function to convert a pandas DataFrame to Corpus format.

    Arguments:
        df {DataFrame} -- the actual data, in a pandas DataFrame
        id_col {str} -- name of the column that holds the utterance ids
        text_col {str} -- name of the column that holds the utterance texts
        meta_cols {List[str]} -- names of the columns that hold relevant metadata

    Returns:
        Corpus -- the converted corpus
    """
    # In this particular case, speaker, reply_to, and timestamp information are
    # all inapplicable, so we either create a placeholder entry or leave it as None.
    generic_speaker = Speaker(id="speaker")
    time = "NOT_RECORDED"

    utterance_list = []
    for _, row in tqdm(df.iterrows()):
        # extract the metadata for this row
        metadata = {meta_col: row[meta_col] for meta_col in meta_cols}

        # each utterance is its own conversation, so conversation_id == utterance id
        utterance_list.append(Utterance(id=str(row[id_col]),
                                        speaker=generic_speaker,
                                        conversation_id=str(row[id_col]),
                                        reply_to=None,
                                        timestamp=time,
                                        text=row[text_col],
                                        meta=metadata))
    return Corpus(utterances=utterance_list)
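# A minimal usage sketch of convert_df_to_corpus. The DataFrame and its column
# names ('comment_id', 'body', 'score') are hypothetical stand-ins, not taken
# from any real dataset.
import pandas as pd

df = pd.DataFrame({
    'comment_id': ['a1', 'a2'],
    'body': ['first comment', 'second comment'],
    'score': [10, 3],
})

corpus = convert_df_to_corpus(df, id_col='comment_id', text_col='body',
                              meta_cols=['score'])
print(corpus.get_utterance('a1').meta)  # expect {'score': 10}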
import os

import yaml

from convokit import Corpus, Speaker, Utterance


def build_manual_corpus() -> Corpus:
    print('Building corpus from manually created yml files...')
    manual_files = []
    for root, _dirs, files in os.walk('data/manual'):
        manual_files.extend([os.path.join(root, f) for f in files])

    conversations = []
    for path in manual_files:
        with open(path) as f:
            # safe_load avoids PyYAML's unsafe default loader (calling
            # yaml.load without a Loader argument is an error in PyYAML >= 6)
            cs = yaml.safe_load(f)['conversations']
            for c in cs:
                conversations.append((c[0], c[1]))

    speakers = {'0': Speaker(id='0'), '_analysis': Speaker(id='_analysis')}
    utterances = []
    i = 0
    # duplicate each conversation 10 times; every copy gets fresh utterance ids
    for _ in range(10):
        for prompt, response in conversations:
            id_1 = "M" + str(i)
            id_2 = "M" + str(i + 1)
            utts = [
                Utterance(id=id_1, text=prompt, speaker=speakers["_analysis"],
                          root=id_1, reply_to=None),
                Utterance(id=id_2, text=response, speaker=speakers["0"],
                          root=id_1, reply_to=id_1),
            ]
            i = i + 2
            utterances.extend(utts)
    return Corpus(utterances=utterances)
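# A sketch of the yml shape the loop above expects: a top-level 'conversations'
# key holding (prompt, response) pairs. The file contents below are a
# hypothetical example, not taken from the actual data/manual files.
import yaml

example = yaml.safe_load("""
conversations:
  - ["How are you?", "Doing well, thanks."]
  - ["What time is it?", "Half past nine."]
""")
prompt, response = example['conversations'][0]
print(prompt)  # How are you?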
def transform_utterance(self, utt):
    """
    Computes representations and statistics for a single utterance, which can
    be a ConvoKit Utterance or a string. Will return an Utterance object and
    write all of these characterizations (including vectors) to the
    utterance's metadata; attribute names are prefixed with the
    `output_prefix` constructor argument.

    :param utt: Utterance or string
    :return: the utterance, with per-utterance representation, range and cluster assignments.
    """
    if isinstance(utt, str):
        # wrap raw text in a temporary Utterance with an anonymous Speaker
        utt = Utterance(text=utt, speaker=Speaker())
    self.text_pipe.transform_utterance(utt)
    self.tfidf_model.transform_utterance(utt)
    return self.dualmodel.transform_utterance(utt)
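# A usage sketch, assuming `pipeline` is an already-fitted instance of the
# enclosing class (the variable name is hypothetical).
utt = pipeline.transform_utterance("could you please take another look?")

# Per the docstring above, the characterizations now live in utt.meta under
# keys prefixed with the pipeline's output_prefix.
print(list(utt.meta.keys()))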
def create_speakers(conversations):
    """Creates ConvoKit Speaker objects, keyed by speaker id."""
    speaker_meta = {}
    for conv in conversations:
        for speaker in conv.speakers:
            speaker_meta[speaker.id] = {
                "age": speaker.age,
                "gender": speaker.gender
            }
    corpus_speakers = {
        k: Speaker(id=k, meta=v) for k, v in speaker_meta.items()
    }
    return corpus_speakers
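# A sketch of the input shape create_speakers expects, using namedtuple
# stand-ins (hypothetical types; the real objects just need .speakers on each
# conversation and .id, .age, .gender on each speaker record).
from collections import namedtuple

SpeakerRecord = namedtuple('SpeakerRecord', ['id', 'age', 'gender'])
ConvRecord = namedtuple('ConvRecord', ['speakers'])

convs = [ConvRecord(speakers=[SpeakerRecord('s1', 34, 'female'),
                              SpeakerRecord('s2', 29, 'male')])]
speakers = create_speakers(convs)
print(speakers['s1'].meta)  # expect {'age': 34, 'gender': 'female'}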
def format_as_corpus(self, conv):
    # Note: this snippet uses older ConvoKit argument names
    # (Speaker(name=...), Utterance(user=..., root=...)); newer releases
    # use id=, speaker=, and conversation_id= instead.
    users = np.unique([utt['user'] for utt in conv])
    users_dict = {user: Speaker(name=user) for user in users}

    utterances = []
    for utt in conv:
        user = users_dict[utt['user']]
        utt_obj = Utterance(id=utt['utt_id'], user=user, text=utt['text'],
                            root=str(utt['conv_id']))
        utt_obj.add_meta('reply_depth', utt['indent_depth'])
        utterances.append(utt_obj)

    corpus = Corpus(utterances=utterances)
    return corpus
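# The `conv` argument is a list of dicts; the keys below are taken from the
# function body, while the values and the `formatter` instance are hypothetical.
conv = [
    {'utt_id': 'u0', 'user': 'alice', 'text': 'hello there',
     'conv_id': 7, 'indent_depth': 0},
    {'utt_id': 'u1', 'user': 'bob', 'text': 'hi back',
     'conv_id': 7, 'indent_depth': 1},
]
corpus = formatter.format_as_corpus(conv)
print(corpus.get_utterance('u1').meta['reply_depth'])  # 1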
import os
import sqlite3

import pandas as pd

from convokit import Corpus, Speaker, Utterance


def build_imessage_corpus() -> Corpus:
    print('Building corpus from iMessages...')
    conn = sqlite3.connect(os.path.expanduser('~/Library/Messages/chat.db'))
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    # Handles (AKA Speakers)
    cur.execute("select ROWID as handle_id, id as phone_number from handle")
    handles = [dict(x) for x in cur.fetchall()]
    speakers = {
        str(h['handle_id']): Speaker(id=str(h['handle_id']), meta=h)
        for h in handles
    }
    speakers.update({
        '0': Speaker(id='0', meta={'phone_number': '+12155889243'})
    })  # don't call me unless you want

    # Chats
    chats = pd.read_sql_query("select * from chat", conn)
    chats.rename(columns={'ROWID': 'chat_id',
                          'chat_identifier': 'chat_name'}, inplace=True)
    chat_cols = list(chats)
    chats[chat_cols] = chats[chat_cols].astype(str)

    # Messages
    messages = pd.read_sql_query("select * from message", conn)
    messages.rename(columns={'ROWID': 'message_id'}, inplace=True)
    messages = messages[['message_id', 'text', 'handle_id', 'date', 'is_from_me']]
    # messages sent by the local user (is_from_me == 1) get sender id '0'
    messages['sender_id'] = messages.apply(
        lambda r: r['handle_id'] if r['is_from_me'] == 0 else '0', axis=1)

    # Add chat data to messages
    chat_message_joins = pd.read_sql_query("select * from chat_message_join", conn)
    messages = pd.merge(messages, chat_message_joins[['chat_id', 'message_id']],
                        on='message_id', how='left').dropna()
    messages['chat_id'] = messages['chat_id'].astype(int)
    cols = list(messages)
    messages[cols] = messages[cols].astype(str)

    utterances = []
    for _, chat in chats.iterrows():
        chat_messages = messages.loc[
            messages['chat_id'] == chat['chat_id']].sort_values(by=['date'])
        num_messages = len(chat_messages.index)
        if num_messages == 0:
            print("Warning: chat '%s' has no messages" % chat['chat_name'])
            continue

        root_msg = chat_messages.iloc[0]
        for i in range(num_messages):
            msg = chat_messages.iloc[i]
            last_msg = chat_messages.iloc[i - 1] if i != 0 else None
            last_msg_id = chat_messages.iloc[i - 1]['message_id'] if i != 0 else None

            # Start a new conversation if more than an hour has passed between
            # messages (chat.db dates are in nanoseconds on modern macOS, so
            # 3.6e12 ns == 1 hour)
            if last_msg is not None and int(msg['date']) - int(
                    chat_messages.iloc[i - 1]['date']) > 3.6e12:
                root_msg = chat_messages.iloc[i]
                last_msg_id = None

            msg_utt = Utterance(id=msg['message_id'],
                                text=msg['text'],
                                speaker=speakers[msg['sender_id']],
                                root=root_msg['message_id'],
                                reply_to=last_msg_id,
                                meta=msg)
            utterances.append(msg_utt)

    return Corpus(utterances=utterances)
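# A minimal driver sketch: assumes this runs on macOS with disk access to
# ~/Library/Messages/chat.db; the output corpus name is arbitrary.
if __name__ == '__main__':
    corpus = build_imessage_corpus()
    print(len(list(corpus.iter_utterances())), 'utterances built')
    # persist next to the script; reload later with Corpus(filename='imessage_corpus')
    corpus.dump('imessage_corpus', './')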