コード例 #1
0
    def test_partial_load_start_idx_specified_only(self):
        """
        Dump a three-utterance corpus and reload it passing only utterance_start_index=1
        Check that the reloaded corpus has two utterances and that utterances "1" and "2" equal the originals
        """
        alice = User(name="alice",
                     meta={'user_binary_data': bytearray([120, 3, 255, 0, 100])})
        bob = User(name="bob",
                   meta={'user_binary_data': bytearray([110, 3, 255, 90])})
        charlie = User(name="charlie")

        source_utterances = [
            Utterance(id="0", text="hello world", user=alice,
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1", text="my name is bob", user=bob,
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id="2", text="this is a test", user=charlie),
        ]
        corpus1 = Corpus(utterances=source_utterances)
        corpus1.dump('test_corpus', './')

        # No end index is supplied; only the start index restricts the load.
        corpus2 = Corpus(filename="test_corpus", utterance_start_index=1)

        self.assertEqual(len(list(corpus2.iter_utterances())), 2)
        for utt_id in ("1", "2"):
            self.assertEqual(corpus1.get_utterance(utt_id),
                             corpus2.get_utterance(utt_id))
    def test_dump_and_load_with_binary(self):
        """
        Round-trip a corpus whose speakers and utterances carry bytearray metadata
        Check that speaker and utterance metadata read back from disk equal what was dumped
        """
        alice_speaker = Speaker(id="alice",
                                meta={
                                    'speaker_binary_data':
                                    bytearray([120, 3, 255, 0, 100]),
                                    'index': 99
                                })
        bob_speaker = Speaker(
            id="bob",
            meta={'speaker_binary_data': bytearray([110, 3, 255, 90])})

        source_utterances = [
            Utterance(id="0", text="hello world", speaker=alice_speaker,
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1", text="my name is bob", speaker=bob_speaker,
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id="2", text="this is a test",
                      speaker=Speaker(id="charlie")),
        ]
        corpus1 = Corpus(utterances=source_utterances)

        # Grab the in-memory speakers before dumping, as the reference values.
        speaker_ids = ("alice", "bob")
        before = {sid: corpus1.get_speaker(sid) for sid in speaker_ids}

        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus")

        after = {sid: corpus2.get_speaker(sid) for sid in speaker_ids}

        # Each speaker is paired with the id of the utterance they authored.
        for sid, utt_id in (("alice", '0'), ("bob", '1')):
            self.assertEqual(before[sid].meta, after[sid].meta)
            self.assertEqual(
                corpus1.get_utterance(utt_id).meta,
                corpus2.get_utterance(utt_id).meta)
コード例 #3
0
def print_corpus(c: Corpus) -> None:
    """Print one indented reply chain per id returned by get_corpus_leaf_ids.

    For each such id the chain is built by following ``reply_to`` links until
    an utterance has a falsy ``reply_to``. The chain is then printed from the
    last utterance reached back down to the starting one, each line prefixed
    with one more "--> " than the previous, with newlines in the text replaced
    by spaces.
    """
    for leaf_id in get_corpus_leaf_ids(c):
        current = c.get_utterance(leaf_id)
        thread = [current]
        # Climb towards the start of the thread, collecting every ancestor.
        while current.reply_to:
            current = c.get_utterance(current.reply_to)
            thread.append(current)

        print("this conversation is", len(thread), "utterances long.")
        # The thread was collected leaf-first, so reverse it for printing.
        for level, utterance in enumerate(reversed(thread)):
            print("--> " * level + utterance.text.replace("\n", " "))
        print("\n")
コード例 #4
0
ファイル: data.py プロジェクト: rlaboulaye/turn-of-phrase
def add_title_to_root(corpus: Corpus):
    """Prepend each conversation's 'title' metadata to the utterance whose id is the conversation id.

    A missing title (``None``) is treated as the empty string. When the
    utterance has no text, the text becomes the title alone; otherwise it
    becomes the title, a single space, and the existing text. Utterances are
    modified in place and nothing is returned.
    """
    for convo in corpus.iter_conversations():
        root = corpus.get_utterance(convo.id)
        raw_title = convo.retrieve_meta('title')
        heading = '' if raw_title is None else raw_title
        root.text = heading if root.text is None else heading + ' ' + root.text
コード例 #5
0
ファイル: Rank.py プロジェクト: jschluger/first-convokit
 def transform(self, corpus: Corpus) -> Corpus:
     """Return a deep copy of the corpus with a 'rank' entry added to every conversation's meta.

     The rank is the sum of ``len(text)`` over the utterances listed in the
     conversation's ``_utterance_ids``. The input corpus is not modified.

     Raises:
         Exception: if a conversation's meta already has a 'rank' key.
     """
     ranked = copy.deepcopy(corpus)
     for convo in ranked.iter_conversations():
         if 'rank' in convo.meta:
             raise Exception(
                 'rank is already a key in this conversations meta! aborting'
             )
         convo.meta['rank'] = sum(
             len(ranked.get_utterance(utt_id).text)
             for utt_id in convo._utterance_ids)
     return ranked
コード例 #6
0
ファイル: w2v.py プロジェクト: LilianGong/AmazonAlexaAnalysis
from gensim.models import Word2Vec
from convokit import Corpus, Speaker, Utterance
from tqdm import tqdm

tartan_corpus = Corpus(filename="../tartan_corpus")

# gather all user utterances as training data
# 360,345 utterances
utt_id = tartan_corpus.get_utterance_ids()
# Keep only utterances whose id contains 'user'; each becomes a list of
# space-separated tokens. tqdm wraps the id list to show progress.
utterances = [
    utt.text.split(" ")
    for utt in map(tartan_corpus.get_utterance, tqdm(utt_id))
    if 'user' in utt._id
]

# w2v model training
model = Word2Vec(utterances, min_count=1)
model.save("model/w2v_all.model")
コード例 #7
0
def run_stats(transformed_corpus: Corpus):
    """Count and print gender / romance statistics over every utterance in the corpus.

    For each utterance this reads:
      * the speaker's gender from ``utt.user.meta['gender']``, falling back to
        ``utt.user.meta['sex']`` (compared case-insensitively to "male" /
        "female");
      * the utterance flags ``utt.meta['male_about_female']``,
        ``utt.meta['female_about_male']`` and ``utt.meta['contains_romantic']``.

    The counters and four derived percentages are printed to stdout; nothing
    is returned.

    Args:
        transformed_corpus: corpus whose utterances already carry the three
            metadata flags listed above.

    Raises:
        KeyError: if an utterance lacks one of the three metadata flags.
    """

    def percentage(part, whole):
        # A zero denominator (e.g. no male speakers at all) is reported as
        # 0.0 instead of crashing the whole report with ZeroDivisionError.
        if not whole:
            return 0.0
        return (float(part) / float(whole)) * 100

    male_speaking = 0
    male_speaking_about_female = 0
    male_speaking_about_female_romantic = 0
    male_speaking_not_about_female = 0

    female_speaking = 0
    female_speaking_about_male = 0
    female_speaking_about_male_romantic = 0
    female_speaking_not_about_male = 0

    romantic = 0
    not_romantic = 0

    for uid in transformed_corpus.get_utterance_ids():
        utt = transformed_corpus.get_utterance(uid)

        # First get whether it's a male or female speaker. The value is
        # recomputed for every utterance so that a speaker without gender
        # information is never counted under the previous speaker's gender.
        speaker_meta = utt.user.meta
        if 'gender' in speaker_meta:
            speaker_gender = speaker_meta['gender'].lower()
        elif 'sex' in speaker_meta:
            speaker_gender = speaker_meta['sex'].lower()
        else:
            speaker_gender = None

        if speaker_gender == "male":
            male_speaking += 1
        elif speaker_gender == "female":
            female_speaking += 1

        is_romantic = utt.meta["contains_romantic"]

        # Then get whether the utterance is a male speaking about a female,
        # and whether it was romantic.
        # NOTE(review): the "not about" counters are incremented for every
        # utterance lacking the flag, regardless of speaker gender - confirm
        # this is the intended definition.
        if utt.meta["male_about_female"]:
            male_speaking_about_female += 1
            if is_romantic:
                male_speaking_about_female_romantic += 1
        else:
            male_speaking_not_about_female += 1

        # Then get whether the utterance is a female speaking about a male,
        # and whether it was romantic.
        if utt.meta["female_about_male"]:
            female_speaking_about_male += 1
            if is_romantic:
                female_speaking_about_male_romantic += 1
        else:
            female_speaking_not_about_male += 1

        # Then register whether the utt is romantic, period.
        if is_romantic:
            romantic += 1
        else:
            not_romantic += 1

    # Creating percentages - help with graphs later
    perc_male_about_female = percentage(male_speaking_about_female,
                                        male_speaking)
    perc_male_about_female_rom = percentage(
        male_speaking_about_female_romantic, male_speaking_about_female)
    perc_female_about_male = percentage(female_speaking_about_male,
                                        female_speaking)
    perc_female_about_male_rom = percentage(
        female_speaking_about_male_romantic, female_speaking_about_male)

    print('male_speaking: ', male_speaking)
    print('male_speaking_about_female: ', male_speaking_about_female)
    print('male_speaking_about_female_romantic: ',
          male_speaking_about_female_romantic)
    print('pct male utterances about females', perc_male_about_female)
    print('pct male utterances about females that are romantic',
          perc_male_about_female_rom)
    print('male_speaking_not_about_female: ', male_speaking_not_about_female)
    print('\n')
    print('female_speaking: ', female_speaking)
    print('female_speaking_about_male: ', female_speaking_about_male)
    print('female_speaking_about_male_romantic: ',
          female_speaking_about_male_romantic)
    print('pct female utterances about males', perc_female_about_male)
    print('pct female utterances about males that are romantic',
          perc_female_about_male_rom)
    print('female_speaking_not_about_male: ', female_speaking_not_about_male)
    print('\n')
    print('romantic: ', romantic)
    print('not_romantic: ', not_romantic)
コード例 #8
0
ファイル: data.py プロジェクト: rlaboulaye/turn-of-phrase
def filter_winning_arguments_corpus(corpus: Corpus):
    """Build a new corpus holding only original posts, challenger comments and OP replies.

    An utterance is kept when it is one of:
      * an original post: its id equals its conversation id;
      * a challenger comment: its speaker differs from the speaker of the
        conversation's original post and ``meta['success']`` is 0 or 1;
      * an OP reply: it is not the original post, its speaker is the
        original post's speaker, and ``meta['success']`` is 0 or 1.

    Every other utterance (``meta['success']`` neither 0 nor 1) is dropped.
    The new corpus receives the utterances grouped in that order: original
    posts, then challenger comments, then OP replies, each group in the
    order given by ``corpus.get_utterance_ids()``.

    Args:
        corpus: source corpus; it is only read, never modified here.

    Returns:
        A new ``Corpus`` constructed from the selected utterances.

    Raises:
        KeyError: if a non-root utterance has no 'success' entry in its meta.
    """
    op_posts = []
    challenger_comments = []
    op_replies = []

    for utt_id in corpus.get_utterance_ids():
        # Look the utterance up once instead of on every comparison.
        utt = corpus.get_utterance(utt_id)

        if utt.id == utt.conversation_id:
            # The original post is kept unconditionally; its 'success'
            # metadata is never consulted.
            op_posts.append(utt)
            continue

        # Successful (1) and unsuccessful (0) arguments are both kept.
        if utt.meta['success'] not in (0, 1):
            continue

        root = corpus.get_utterance(utt.conversation_id)
        if utt.speaker.id != root.speaker.id:
            challenger_comments.append(utt)
        else:
            op_replies.append(utt)

    # this subset separates OP comments and challenger utterances from all
    # other comments in every conversation (thread)
    return Corpus(utterances=op_posts + challenger_comments + op_replies)
コード例 #9
0
# In 18

prompt_type_assignment_df.head()

# In 19

# noinspection PyTypeChecker
ps = PolitenessStrategies(verbose=1000)
awry_corpus = ps.transform(awry_corpus)

# In 20

# One row per utterance, taken from its "politeness_strategies" metadata and
# indexed by utterance id.
utterance_ids = awry_corpus.get_utterance_ids()
rows = [
    awry_corpus.get_utterance(uid).meta["politeness_strategies"]
    for uid in utterance_ids
]
politeness_strategies = pd.DataFrame(rows, index=utterance_ids)

# In 21

politeness_strategies.head(10)

# In 22

# first, we need to directly map comment IDs to their conversations. We'll build a DataFrame to do this
comment_ids = []
convo_ids = []
timestamps = []
page_ids = []
for conversation in awry_corpus.iter_conversations():
    for comment in conversation.iter_utterances():
コード例 #10
0
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

nltk.download('punkt')
corpus = Corpus(filename=download("subreddit-creepypasta"))

utter_ids = corpus.get_utterance_ids()
length = len(utter_ids)

# Write the text of the first two utterances to corpus.txt. The context
# manager guarantees the handle is flushed and closed even if a write fails.
with open("corpus.txt", "w") as corpusTXT:
    i = 0
    while i < 2:
        corpusTXT.write(corpus.get_utterance(utter_ids[i]).text)
        i += 1

# Read the dumped text back in one piece; the handle is closed on exit.
with open("corpus.txt", "r") as corpusTXT_2:
    text = corpusTXT_2.read()

# Tokenize all of corpus.txt
nltk_sentences = sent_tokenize(text)
tokenized_sents = [word_tokenize(i) for i in nltk_sentences]
new_sents = []
stop_words = set(stopwords.words('english'))
whitespace = ' '
punctuation = string.punctuation
コード例 #11
0
import pickle

corpus = Corpus(filename=download("friends-corpus"))

# Record for a single captioned utterance.
Caption = namedtuple(
    'Caption', 'character message startTime endTime comments')

captions = []

# Visit conversations s08_e14_c01, c02, ... in order. Any KeyError raised
# inside the try body ends the walk.
i = 1
while True:
    convoNumber = f"{i:0>2}"
    try:
        convo = corpus.get_conversation(f"s08_e14_c{convoNumber}_u001")
        for utterance_id in convo.get_utterance_ids():
            utterance = corpus.get_utterance(utterance_id)
            caption_meta = utterance.retrieve_meta("caption")
            if caption_meta is not None:
                # The third element of the caption metadata is unused here.
                startTime, endTime, _ = caption_meta
                # presumably milliseconds -> whole seconds; TODO confirm units
                captions.append(
                    Caption(character=utterance.speaker.id,
                            message=utterance.text,
                            startTime=startTime // 1000,
                            endTime=endTime // 1000,
                            comments=None))
    except KeyError:
        break  # there are no more conversations
    else:
        i += 1

captionsPath = "./data/friends/captions.pkl"

with open(captionsPath, 'wb') as captionsFile:
    pickle.dump(captions, captionsFile)
コード例 #12
0
ファイル: Rank.py プロジェクト: jschluger/first-convokit
 def convo_length(self, corpus: Corpus, convo):
     """Return the summed ``len(text)`` of the utterances listed in ``convo._utterance_ids``.

     Each id is resolved through ``corpus.get_utterance``. A conversation
     with no utterance ids yields 0.
     """
     return sum(
         len(corpus.get_utterance(utt_id).text)
         for utt_id in convo._utterance_ids)