def test_partial_load_start_idx_specified_only(self):
        """
        Dump a three-utterance corpus, reload it with only ``utterance_start_index=1``
        given, and check that the reloaded corpus holds two utterances and that
        utterances "1" and "2" compare equal to the ones in the dumped corpus.
        """
        alice_blob = bytearray([120, 3, 255, 0, 100])
        bob_blob = bytearray([110, 3, 255, 90])
        first_utt_blob = bytearray([99, 44, 33])
        second_utt_blob = bytearray([110, 200, 220, 28])

        source_utterances = [
            Utterance(
                id="0",
                text="hello world",
                speaker=Speaker(id="alice",
                                meta={'speaker_binary_data': alice_blob}),
                meta={'utt_binary_data': first_utt_blob},
            ),
            Utterance(
                id="1",
                text="my name is bob",
                speaker=Speaker(id="bob",
                                meta={'speaker_binary_data': bob_blob}),
                meta={'utt_binary_data': second_utt_blob},
            ),
            # The third utterance carries no metadata at all.
            Utterance(id="2",
                      text="this is a test",
                      speaker=Speaker(id="charlie")),
        ]
        corpus1 = Corpus(utterances=source_utterances)

        corpus1.dump('test_corpus', './')

        corpus2 = Corpus(filename="test_corpus", utterance_start_index=1)

        loaded_utterances = list(corpus2.iter_utterances())
        self.assertEqual(len(loaded_utterances), 2)
        for utt_id in ("1", "2"):
            self.assertEqual(corpus1.get_utterance(utt_id),
                             corpus2.get_utterance(utt_id))
    def test_dump_and_load_with_binary(self):
        """
        Round-trip a corpus whose speakers and utterances carry binary metadata.

        The corpus is dumped to disk and loaded back; the metadata of speakers
        "alice" and "bob" and of utterances "0" and "1" must be equal before and
        after the round trip.
        """

        alice_blob = bytearray([120, 3, 255, 0, 100])
        bob_blob = bytearray([110, 3, 255, 90])
        first_utt_blob = bytearray([99, 44, 33])
        second_utt_blob = bytearray([110, 200, 220, 28])

        source_utterances = [
            Utterance(
                id="0",
                text="hello world",
                # alice mixes binary and plain (integer) metadata
                speaker=Speaker(id="alice",
                                meta={
                                    'speaker_binary_data': alice_blob,
                                    'index': 99
                                }),
                meta={'utt_binary_data': first_utt_blob},
            ),
            Utterance(
                id="1",
                text="my name is bob",
                speaker=Speaker(id="bob",
                                meta={'speaker_binary_data': bob_blob}),
                meta={'utt_binary_data': second_utt_blob},
            ),
            Utterance(id="2",
                      text="this is a test",
                      speaker=Speaker(id="charlie")),
        ]
        corpus1 = Corpus(utterances=source_utterances)

        checked_speakers = ("alice", "bob")
        # Grab the original speakers before the corpus is written out.
        speakers_before = {
            name: corpus1.get_speaker(name)
            for name in checked_speakers
        }

        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus")

        speakers_after = {
            name: corpus2.get_speaker(name)
            for name in checked_speakers
        }

        for name, utt_id in (("alice", '0'), ("bob", '1')):
            self.assertEqual(speakers_before[name].meta,
                             speakers_after[name].meta)
            self.assertEqual(
                corpus1.get_utterance(utt_id).meta,
                corpus2.get_utterance(utt_id).meta)
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """Helper function to convert data to Corpus format.

    Every row of ``df`` becomes one Utterance that is its own conversation
    (``conversation_id`` equals the utterance id, ``reply_to`` is None).

    Arguments:
        df {DataFrame} -- actual data, in a pandas DataFrame
        id_col {str} -- name of the column that holds the utterance ids
        text_col {str} -- name of the column that holds the utterance texts
        meta_cols {List[str]} -- columns copied into each utterance's metadata

    Returns:
        Corpus -- the converted corpus
    """

    # In this particular case speaker, reply_to and timestamp information are
    # all not applicable, so every utterance shares one placeholder speaker
    # and a placeholder timestamp.
    generic_speaker = Speaker(id="speaker")
    placeholder_timestamp = "NOT_RECORDED"

    utterance_list = []
    # iterrows() is a generator without a length, so tqdm is told the total
    # explicitly; otherwise it can only show a running count.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        metadata = {meta_col: row[meta_col] for meta_col in meta_cols}

        # The same string serves as utterance id and conversation id.
        utt_id = str(row[id_col])
        utterance_list.append(
            Utterance(id=utt_id,
                      speaker=generic_speaker,
                      conversation_id=utt_id,
                      reply_to=None,
                      timestamp=placeholder_timestamp,
                      text=row[text_col],
                      meta=metadata))

    return Corpus(utterances=utterance_list)
Ejemplo n.º 4
0
def build_manual_corpus() -> Corpus:
    """Build a Corpus from the hand-written YAML files under ``data/manual``.

    Every file found (recursively) is parsed as YAML and must have a top-level
    ``conversations`` key holding a sequence of entries; the first two items
    of each entry are used as (prompt, response). Each pair becomes two
    utterances: the prompt by speaker ``_analysis`` and the response by
    speaker ``0`` replying to it, with the prompt as conversation root.

    Returns:
        Corpus -- corpus in which the full list of pairs is emitted 10 times,
        each copy under fresh ids "M0", "M1", ...
    """
    print('Building corpus from manually created yml files...')

    manual_files = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk('data/manual')
        for name in files
    ]

    conversations = []
    for path in manual_files:
        with open(path) as f:
            # safe_load instead of yaml.load: calling yaml.load without a
            # Loader is deprecated since PyYAML 5.1 and raises TypeError in
            # 6.0, and the full loader can construct arbitrary Python objects.
            cs = yaml.safe_load(f)['conversations']
        conversations.extend((c[0], c[1]) for c in cs)

    speakers = {'0': Speaker(id='0'), '_analysis': Speaker(id='_analysis')}

    utterances = []
    i = 0
    # NOTE(review): the pairs are repeated 10 times, presumably to give the
    # manual data more weight -- confirm the intent.
    for _ in range(10):
        for prompt, response in conversations:
            id_1 = "M" + str(i)
            id_2 = "M" + str(i + 1)
            utterances.append(
                Utterance(id=id_1,
                          text=prompt,
                          speaker=speakers["_analysis"],
                          root=id_1,
                          reply_to=None))
            utterances.append(
                Utterance(id=id_2,
                          text=response,
                          speaker=speakers["0"],
                          root=id_1,
                          reply_to=id_1))
            # Two ids are consumed per prompt/response pair.
            i += 2

    return Corpus(utterances=utterances)
    def transform_utterance(self, utt):
        """
        Computes representations and statistics for a single utterance, given either as a ConvoKit Utterance or as a plain string.
        A string is first wrapped in a new Utterance with an empty Speaker. The utterance is then passed through the text pipeline, the tf-idf model and finally the dual model. The characterizations (including vectors) are written to the utterance's metadata; attribute names are prefixed with the `output_prefix` constructor argument.

        :param utt: Utterance or string
        :return: the utterance, with per-utterance representation, range and cluster assignments.
        """
        if isinstance(utt, str):
            utt = Utterance(text=utt, speaker=Speaker())
        # The first two stages are run for their effect on the utterance only;
        # their return values are discarded.
        for stage in (self.text_pipe, self.tfidf_model):
            stage.transform_utterance(utt)
        result = self.dualmodel.transform_utterance(utt)
        return result
def create_speakers(conversations):
    """Build a dict mapping speaker id to a convokit Speaker.

    Every speaker of every conversation is visited and its ``age`` and
    ``gender`` attributes are stored as the Speaker's metadata. When the same
    id occurs more than once, the values from the last occurrence are kept.
    """
    meta_by_id = {
        person.id: {
            "age": person.age,
            "gender": person.gender
        }
        for conversation in conversations
        for person in conversation.speakers
    }

    corpus_speakers = {}
    for speaker_id, meta in meta_by_id.items():
        corpus_speakers[speaker_id] = Speaker(id=speaker_id, meta=meta)
    return corpus_speakers
Ejemplo n.º 7
0
    def format_as_corpus(self, conv):
        """
        Turn a sequence of utterance records into a Corpus.

        Each record in `conv` is read through the keys 'user', 'utt_id',
        'text', 'conv_id' and 'indent_depth'. One Speaker is created per
        distinct user; every record becomes an Utterance rooted at its
        stringified 'conv_id', with 'indent_depth' stored as the
        'reply_depth' metadata entry.
        """
        distinct_users = np.unique([record['user'] for record in conv])
        speaker_for = {}
        for name in distinct_users:
            speaker_for[name] = Speaker(name=name)

        utterances = []
        for record in conv:
            author = speaker_for[record['user']]
            utterance = Utterance(id=record['utt_id'],
                                  user=author,
                                  text=record['text'],
                                  root=str(record['conv_id']))
            utterance.add_meta('reply_depth', record['indent_depth'])
            utterances.append(utterance)

        return Corpus(utterances=utterances)
Ejemplo n.º 8
0
def build_imessage_corpus() -> Corpus:
    """Build a Corpus from the local iMessage database.

    Reads the ``handle``, ``chat``, ``message`` and ``chat_message_join``
    tables from ``~/Library/Messages/chat.db``. Every handle becomes a Speaker
    keyed by its stringified ROWID; the extra speaker '0' is used for messages
    whose ``is_from_me`` flag is not 0. Within each chat the messages are
    ordered by ``date`` and chained through ``reply_to``; a gap larger than
    the one-hour threshold starts a new conversation root.

    Returns:
        Corpus -- one Utterance per message that belongs to a chat
    """
    print('Building corpus from iMessages...')
    conn = sqlite3.connect(os.path.expanduser('~/Library/Messages/chat.db'))
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()
        cur.execute("select ROWID as handle_id, id as phone_number from handle")
        handles = [dict(x) for x in cur.fetchall()]
        chats = pd.read_sql_query("select * from chat", conn)
        messages = pd.read_sql_query("select * from message", conn)
        chat_message_joins = pd.read_sql_query(
            "select * from chat_message_join", conn)
    finally:
        # Everything needed is in memory now; release the database handle
        # even when one of the queries fails.
        conn.close()

    # Handles (AKA Speakers)
    speakers = {
        str(h['handle_id']): Speaker(id=str(h['handle_id']), meta=h)
        for h in handles
    }
    # Speaker '0' represents the owner of the database.
    speakers.update({
        '0': Speaker(id='0', meta={'phone_number': '+12155889243'})
    })

    # Chats
    chats.rename(columns={
        'ROWID': 'chat_id',
        'chat_identifier': 'chat_name'
    },
                 inplace=True)
    chat_cols = list(chats)
    chats[chat_cols] = chats[chat_cols].astype(str)

    # Messages
    messages.rename(columns={'ROWID': 'message_id'}, inplace=True)
    messages = messages[[
        'message_id', 'text', 'handle_id', 'date', 'is_from_me'
    ]]
    messages['sender_id'] = messages.apply(lambda r: r['handle_id']
                                           if r['is_from_me'] == 0 else '0',
                                           axis=1)

    # Add chat data to messages; messages without a chat (or with any other
    # missing value) are dropped by dropna().
    messages = pd.merge(messages,
                        chat_message_joins[['chat_id', 'message_id']],
                        on='message_id',
                        how='left').dropna()
    # The left join can leave chat_id as float; go through int so that the
    # string form matches the stringified chats['chat_id'].
    messages['chat_id'] = messages['chat_id'].astype(int)
    cols = list(messages)
    messages[cols] = messages[cols].astype(str)

    # Gap between two messages that starts a new conversation (one hour).
    # NOTE(review): presumably `date` is in nanoseconds -- confirm.
    new_conversation_gap = 3.6e12

    utterances = []
    for _, chat in chats.iterrows():
        # NOTE(review): `date` is a string column at this point, so this sort
        # is lexicographic; it matches numeric order only while all values
        # have the same number of digits -- verify.
        chat_messages = messages.loc[messages['chat_id'] ==
                                     chat['chat_id']].sort_values(by=['date'])
        num_messages = len(chat_messages.index)

        if num_messages == 0:
            print("Warning: chat '%s' has no messages" % chat['chat_name'])
            continue

        root_msg = chat_messages.iloc[0]
        for i in range(num_messages):
            msg = chat_messages.iloc[i]
            prev_msg = chat_messages.iloc[i - 1] if i != 0 else None
            last_msg_id = prev_msg['message_id'] if prev_msg is not None else None

            # Make a new conversation if more than an hour has passed between messages
            if prev_msg is not None and int(msg['date']) - int(
                    prev_msg['date']) > new_conversation_gap:
                root_msg = msg
                last_msg_id = None

            msg_utt = Utterance(id=msg['message_id'],
                                text=msg['text'],
                                speaker=speakers[msg['sender_id']],
                                root=root_msg['message_id'],
                                reply_to=last_msg_id,
                                meta=msg)
            utterances.append(msg_utt)

    return Corpus(utterances=utterances)