Code Example #1
File: w2v.py  Project: LilianGong/AmazonAlexaAnalysis
from gensim.models import Word2Vec
from convokit import Corpus, Speaker, Utterance
from tqdm import tqdm

tartan_corpus = Corpus(filename="../tartan_corpus")

# gather all user utterances as training data
# 360,345 utterances
utterances = []
utt_id = tartan_corpus.get_utterance_ids()
for _id in tqdm(utt_id):
    utt = tartan_corpus.get_utterance(_id)
    if 'user' in utt.id:  # keep only user-side utterances (their ids contain 'user')
        utterances.append(utt.text.split(" "))

# w2v model training
model = Word2Vec(utterances, min_count=1)
model.save("model/w2v_all.model")
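
As a quick sanity check (not part of the original file), the saved model can be reloaded and queried for nearest neighbours; "alexa" below is only an assumed example token from this corpus.

from gensim.models import Word2Vec

# reload the model saved above
w2v = Word2Vec.load("model/w2v_all.model")
# any word seen during training works here; 'alexa' is an assumed example
print(w2v.wv.most_similar("alexa", topn=5))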
Code Example #2
import praw

# 'reddit' is assumed to be an already-authenticated praw.Reddit instance
top_6_subs = list(reddit.subreddits.popular(limit=6))

reddit.subreddits.recommended(top_6_subs)    

widgets = reddit.subreddit('askouija').widgets
for widget in widgets.sidebar:
    if isinstance(widget, praw.models.CommunityList):
        print(widget)
    

#####################

from convokit import Corpus, download

smalsubs = Corpus(filename=download('reddit-corpus-small')) # will not download twice if it already exists

ut_ids = smalsubs.get_utterance_ids()

len(ut_ids)

uid = ut_ids[0]

c_ids = smalsubs.get_conversation_ids()
cid = c_ids[0]

convo = smalsubs.get_conversation(cid)
convo.get_utterance_ids()
convo.get_utterance(uid)

# top-level comments are utterances with no parent
top_level = [uid for uid in ut_ids if smalsubs.get_utterance(uid).reply_to is None]
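
A minimal follow-up sketch showing how the collected top-level utterances might be inspected (variable names continue from the snippet above):

# number of top-level (root) utterances in the small corpus
print(len(top_level))

# look at the first top-level utterance and the conversation it starts
root_utt = smalsubs.get_utterance(top_level[0])
print(root_utt.conversation_id, root_utt.text[:100])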
Code Example #3
from convokit import Corpus


def run_stats(transformed_corpus: Corpus):
    male_speaking = 0
    male_speaking_about_female = 0
    male_speaking_about_female_romantic = 0

    male_speaking_not_about_female = 0

    female_speaking = 0
    female_speaking_about_male = 0
    female_speaking_about_male_romantic = 0

    female_speaking_not_about_male = 0

    romantic = 0
    not_romantic = 0

    utterance_ids = transformed_corpus.get_utterance_ids()

    for uid in utterance_ids:
        utt = transformed_corpus.get_utterance(uid)

        # First get whether it's a male or female speaker
        speaker_gender = None  # avoid carrying over the previous utterance's value
        if 'gender' in utt.user.meta:
            speaker_gender = utt.user.meta['gender'].lower()
        elif 'sex' in utt.user.meta:
            speaker_gender = utt.user.meta['sex'].lower()

        if speaker_gender == "male":
            male_speaking += 1
        elif speaker_gender == "female":
            female_speaking += 1

        # Then get whether the utterance is a male speaking about a female:
        if utt.meta["male_about_female"]:
            male_speaking_about_female += 1
            # And whether it was romantic
            if utt.meta["contains_romantic"]:
                male_speaking_about_female_romantic += 1
        else:
            male_speaking_not_about_female += 1

        # Then get whether the utterance is a female speaking about a male:
        if utt.meta["female_about_male"]:
            female_speaking_about_male += 1
            # And whether it was romantic
            if utt.meta["contains_romantic"]:
                female_speaking_about_male_romantic += 1
        else:
            female_speaking_not_about_male += 1

        # Then register whether the utt is romantic, period.
        rom = utt.meta["contains_romantic"]
        if rom:
            romantic += 1
        else:
            not_romantic += 1

    #Creating Percentages - help with graphs later
    perc_male_about_female = (float(male_speaking_about_female) /
                              float(male_speaking)) * 100
    perc_male_about_female_rom = (float(male_speaking_about_female_romantic) /
                                  float(male_speaking_about_female)) * 100
    perc_female_about_male = (float(female_speaking_about_male) /
                              float(female_speaking)) * 100
    perc_female_about_male_rom = (float(female_speaking_about_male_romantic) /
                                  float(female_speaking_about_male)) * 100

    print('male_speaking: ', male_speaking)
    print('male_speaking_about_female: ', male_speaking_about_female)
    print('male_speaking_about_female_romantic: ',
          male_speaking_about_female_romantic)
    print('pct male utterances about females', perc_male_about_female)
    print('pct male utterances about females that are romantic',
          perc_male_about_female_rom)
    print('male_speaking_not_about_female: ', male_speaking_not_about_female)
    print('\n')
    print('female_speaking: ', female_speaking)
    print('female_speaking_about_male: ', female_speaking_about_male)
    print('female_speaking_about_male_romantic: ',
          female_speaking_about_male_romantic)
    print('pct female utterances about males', perc_female_about_male)
    print('pct female utterances about males that are romantic',
          perc_female_about_male_rom)
    print('female_speaking_not_about_male: ', female_speaking_not_about_male)
    print('\n')
    print('romantic: ', romantic)
    print('not_romantic: ', not_romantic)
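
A hedged usage note: run_stats assumes each utterance already carries the 'male_about_female', 'female_about_male', and 'contains_romantic' metadata fields, added by an earlier transformation step not shown here; 'annotated_corpus' below is a placeholder name.

# 'annotated_corpus' is a placeholder for a corpus with the metadata fields above
run_stats(annotated_corpus)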
Code Example #4
File: data.py  Project: rlaboulaye/turn-of-phrase
from convokit import Corpus


def filter_winning_arguments_corpus(corpus: Corpus):
    utterance_ids = corpus.get_utterance_ids()

    # we want the original post made by OP, the challengers' comments,
    # and all of OP's responses to the challengers
    # these three lists hold utterance ids for the original post,
    # challenger comments, and OP replies respectively

    opPost = []
    challengerComments = []
    opReplies = []
    for iD in utterance_ids:
        utt = corpus.get_utterance(iD)
        root = corpus.get_utterance(utt.conversation_id)

        # the original post is the root of its conversation
        if utt.id == utt.conversation_id:
            opPost.append(iD)

        # challenger comments come from a speaker other than OP
        # (both unsuccessful, success == 0, and successful, success == 1)
        if utt.speaker.id != root.speaker.id and utt.meta['success'] in (0, 1):
            challengerComments.append(iD)

        # OP replies are non-root utterances made by the OP
        if utt.id != utt.conversation_id and utt.speaker.id == root.speaker.id \
                and utt.meta['success'] in (0, 1):
            opReplies.append(iD)

    #subset challenger and op replies for later use (into successful and unsuccessful arguments)
    challengerPos = []
    challengerNeg = []
    for iD in challengerComments:
        if corpus.get_utterance(iD).meta['success'] == 1:
            challengerPos.append(iD)
        if corpus.get_utterance(iD).meta['success'] == 0:
            challengerNeg.append(iD)

    #these are OP's replies to successful and unsuccessful challengers
    opReplyPos = []
    opReplyNeg = []
    for iD in opReplies:
        if corpus.get_utterance(iD).meta['success'] == 1:
            opReplyPos.append(iD)
        if corpus.get_utterance(iD).meta['success'] == 0:
            opReplyNeg.append(iD)

    subset = opPost + challengerComments + opReplies

    #collect utterance dict given the subset of ids
    utterance_list = []
    for iD in subset:
        utterance_list.append(corpus.get_utterance(iD))

    #this subset separates OP comments and challenger utterances from all other comments in every conversation (thread)
    corpus = Corpus(utterances=utterance_list)

    return corpus
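
A hedged usage sketch: the function appears to be written for ConvoKit's Change My View winning arguments data, where utterances carry a 'success' metadata flag; the corpus name below follows ConvoKit's download catalogue but should be treated as an assumption.

from convokit import Corpus, download

# 'winning-args-corpus' is assumed to be the downloadable Change My View corpus
full_corpus = Corpus(filename=download('winning-args-corpus'))
filtered = filter_winning_arguments_corpus(full_corpus)
print(len(filtered.get_utterance_ids()))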
Code Example #5
import pandas as pd
from convokit import PolitenessStrategies

prompt_type_assignment_df.columns = TYPE_NAMES

# In 18

prompt_type_assignment_df.head()

# In 19

# noinspection PyTypeChecker
ps = PolitenessStrategies(verbose=1000)
awry_corpus = ps.transform(awry_corpus)

# In 20

utterance_ids = awry_corpus.get_utterance_ids()
rows = []
for uid in utterance_ids:
    rows.append(awry_corpus.get_utterance(uid).meta["politeness_strategies"])
politeness_strategies = pd.DataFrame(rows, index=utterance_ids)

# In 21

politeness_strategies.head(10)

# In 22

# first, we need to directly map comment IDs to their conversations. We'll build a DataFrame to do this
comment_ids = []
convo_ids = []
timestamps = []
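
The excerpt cuts off before the mapping loop itself; below is a minimal sketch of how the three lists might be filled and combined into a comment-to-conversation DataFrame (the DataFrame name is a placeholder; conversation_id and timestamp are the standard ConvoKit utterance attributes).

# fill the lists from the corpus (sketch; continues the cell above)
for uid in utterance_ids:
    utt = awry_corpus.get_utterance(uid)
    comment_ids.append(uid)
    convo_ids.append(utt.conversation_id)
    timestamps.append(utt.timestamp)

# hypothetical name for the resulting mapping table
comment_to_convo_df = pd.DataFrame(
    {"conversation_id": convo_ids, "timestamp": timestamps},
    index=comment_ids,
)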
Code Example #6
import convokit
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from convokit import Corpus, download
from gensim.models import Word2Vec

nltk.download('punkt')
corpus = Corpus(filename=download("subreddit-creepypasta"))

corpusTXT = open("corpus.txt", "w")

utter_ids = corpus.get_utterance_ids()
length = len(utter_ids)

# Write the first two posts from the subreddit to the file
for i in range(2):
    corpusTXT.write(corpus.get_utterance(utter_ids[i]).text)

corpusTXT.close()

corpusTXT_2 = open("corpus.txt", "r")

text = corpusTXT_2.read()

# Tokenize all of corpus.txt
nltk_sentences = sent_tokenize(text)
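
The snippet ends after sentence tokenization, but the otherwise unused imports (word_tokenize, stopwords, Word2Vec) suggest the intended next steps. A hedged sketch of how they might be used; the output file name and min_count value are assumptions.

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# lowercase and tokenize each sentence, dropping stopwords and punctuation
tokenized_sentences = []
for sentence in nltk_sentences:
    words = [w.lower() for w in word_tokenize(sentence)
             if w.lower() not in stop_words and w not in string.punctuation]
    tokenized_sentences.append(words)

# train word embeddings on the cleaned sentences (min_count=1 is an assumed choice)
model = Word2Vec(tokenized_sentences, min_count=1)
model.save("w2v_creepypasta.model")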