from gensim.models import Word2Vec
from convokit import Corpus, Speaker, Utterance
from tqdm import tqdm

# Load the preprocessed Tartan corpus from disk.
tartan_corpus = Corpus(filename="../tartan_corpus")

# Gather all user utterances as training data (~360,345 utterances).
# Utterances are kept only when 'user' appears in the utterance id
# (i.e. system/bot turns are skipped).
utterances = []
for _id in tqdm(tartan_corpus.get_utterance_ids()):
    utt = tartan_corpus.get_utterance(_id)
    # FIX: use the public `id` attribute instead of the private `_id`.
    if 'user' in utt.id:
        # Naive whitespace tokenization; Word2Vec expects a list of token lists.
        utterances.append(utt.text.split(" "))

# Train a Word2Vec model; min_count=1 keeps even single-occurrence tokens
# in the vocabulary.
model = Word2Vec(utterances, min_count=1)
model.save("model/w2v_all.model")
# Fetch the 6 most popular subreddits and ask for recommendations based on them.
top_6_subs = list(reddit.subreddits.popular(limit=6))
reddit.subreddits.recommended(top_6_subs)

# Print any community-list widgets in r/askouija's sidebar.
widgets = reddit.subreddit('askouija').widgets
for widget in widgets.sidebar:
    if isinstance(widget, praw.models.CommunityList):
        print(widget)

#####################

from convokit import Corpus, download

# will not download twice if it already exists
smalsubs = Corpus(filename=download('reddit-corpus-small'))

ut_ids = smalsubs.get_utterance_ids()
len(ut_ids)
uid = ut_ids[0]

c_ids = smalsubs.get_conversation_ids()
cid = c_ids[0]
convo = smalsubs.get_conversation(cid)
convo.get_utterance_ids()
convo.get_utterance(uid)

# FIX: the original line referenced an undefined name (`sub_corn.get`), which
# raised a NameError. Top-level utterances are those with no parent comment,
# i.e. reply_to is None in ConvoKit.
top_level = [uid for uid in ut_ids
             if smalsubs.get_utterance(uid).reply_to is None]
def run_stats(transformed_corpus: Corpus):
    """Print counts and percentages of cross-gender (and romantic) utterances.

    For every utterance in *transformed_corpus*, tallies by speaker gender
    (read from the speaker's ``gender``/``sex`` metadata) how many utterances
    are male-about-female / female-about-male, how many of those are marked
    romantic, and the overall romantic vs. non-romantic split.  Results are
    printed; nothing is returned.
    """
    male_speaking = 0
    male_speaking_about_female = 0
    male_speaking_about_female_romantic = 0
    male_speaking_not_about_female = 0
    female_speaking = 0
    female_speaking_about_male = 0
    female_speaking_about_male_romantic = 0
    female_speaking_not_about_male = 0
    romantic = 0
    not_romantic = 0

    for uid in transformed_corpus.get_utterance_ids():
        utt = transformed_corpus.get_utterance(uid)

        # First get whether it's a male or female speaker.
        # FIX: the original never reset `speaker_gender`, so it was unbound on
        # the first utterance lacking gender metadata and *stale* (carried over
        # from the previous speaker) afterwards, corrupting the tallies.
        speaker_gender = None
        if 'gender' in utt.user.meta:
            speaker_gender = utt.user.meta['gender'].lower()
        elif 'sex' in utt.user.meta:
            speaker_gender = utt.user.meta['sex'].lower()

        if speaker_gender == "male":
            male_speaking += 1
        if speaker_gender == "female":
            female_speaking += 1

        # Then get whether the utterance is a male speaking about a female:
        if utt.meta["male_about_female"]:
            male_speaking_about_female += 1
            # And whether it was romantic
            if utt.meta["contains_romantic"]:
                male_speaking_about_female_romantic += 1
        else:
            male_speaking_not_about_female += 1

        # Then get whether the utterance is a female speaking about a male:
        if utt.meta["female_about_male"]:
            female_speaking_about_male += 1
            # And whether it was romantic
            if utt.meta["contains_romantic"]:
                female_speaking_about_male_romantic += 1
        else:
            female_speaking_not_about_male += 1

        # Then register whether the utt is romantic, period.
        if utt.meta["contains_romantic"]:
            romantic += 1
        else:
            not_romantic += 1

    def _pct(part, whole):
        # FIX: guard against ZeroDivisionError when a group has no utterances.
        return (part / whole) * 100 if whole else 0.0

    # Creating Percentages - help with graphs later
    perc_male_about_female = _pct(male_speaking_about_female, male_speaking)
    perc_male_about_female_rom = _pct(male_speaking_about_female_romantic,
                                      male_speaking_about_female)
    perc_female_about_male = _pct(female_speaking_about_male, female_speaking)
    perc_female_about_male_rom = _pct(female_speaking_about_male_romantic,
                                      female_speaking_about_male)

    print('male_speaking: ', male_speaking)
    print('male_speaking_about_female: ', male_speaking_about_female)
    print('male_speaking_about_female_romantic: ', male_speaking_about_female_romantic)
    print('pct male utterances about females', perc_male_about_female)
    print('pct male utterances about females that are romantic', perc_male_about_female_rom)
    print('male_speaking_not_about_female: ', male_speaking_not_about_female)
    print('\n')
    print('female_speaking: ', female_speaking)
    print('female_speaking_about_male: ', female_speaking_about_male)
    print('female_speaking_about_male_romantic: ', female_speaking_about_male_romantic)
    print('pct female utterances about males', perc_female_about_male)
    print('pct female utterances about males that are romantic', perc_female_about_male_rom)
    print('female_speaking_not_about_male: ', female_speaking_not_about_male)
    print('\n')
    print('romantic: ', romantic)
    print('not_romantic: ', not_romantic)
def filter_winning_arguments_corpus(corpus: Corpus):
    """Build a new Corpus keeping only the OP's original post, challenger
    comments, and OP's replies in every conversation (thread).

    Classification per utterance:
      * opPost             -- the utterance *is* its conversation root.
      * challengerComments -- spoken by someone other than the OP, with a
                              binary ``success`` label (0 or 1).
      * opReplies          -- spoken by the OP but not the root itself, with a
                              binary ``success`` label (0 or 1).
    """
    utterance_ids = corpus.get_utterance_ids()

    # we want the original post made by op, the challenger's comments and all
    # of OP's responses to the challengers
    opPost = []
    challengerComments = []
    opReplies = []
    for iD in utterance_ids:
        # FIX: hoist the repeated lookups — the original re-fetched the same
        # utterance (and the thread root) up to ~14 times per iteration.
        utt = corpus.get_utterance(iD)
        root = corpus.get_utterance(utt.conversation_id)
        is_root = utt.id == utt.conversation_id
        is_op = utt.speaker.id == root.speaker.id

        if is_root:
            opPost.append(iD)
        # FIX: the original had separate, otherwise-identical branches for
        # success == 0 and success == 1; a single membership test is equivalent.
        if not is_op and utt.meta['success'] in (0, 1):
            challengerComments.append(iD)
        if not is_root and is_op and utt.meta['success'] in (0, 1):
            opReplies.append(iD)

    # subset challenger and op replies for later use
    # (into successful and unsuccessful arguments)
    challengerPos = [iD for iD in challengerComments
                     if corpus.get_utterance(iD).meta['success'] == 1]
    challengerNeg = [iD for iD in challengerComments
                     if corpus.get_utterance(iD).meta['success'] == 0]
    # these are OP's replies to successful and unsuccessful challengers
    opReplyPos = [iD for iD in opReplies
                  if corpus.get_utterance(iD).meta['success'] == 1]
    opReplyNeg = [iD for iD in opReplies
                  if corpus.get_utterance(iD).meta['success'] == 0]

    # this subset separates OP comments and challenger utterances from all
    # other comments in every conversation (thread)
    subset = opPost + challengerComments + opReplies
    utterance_list = [corpus.get_utterance(iD) for iD in subset]
    return Corpus(utterances=utterance_list)
prompt_type_assignment_df.columns = TYPE_NAMES

# In 18
prompt_type_assignment_df.head()

# In 19
# noinspection PyTypeChecker
# Annotate every utterance in the corpus with politeness-strategy markers.
ps = PolitenessStrategies(verbose=1000)
awry_corpus = ps.transform(awry_corpus)

# In 20
# Collect the per-utterance politeness-strategy annotations into a DataFrame
# indexed by utterance id.
utterance_ids = awry_corpus.get_utterance_ids()
# FIX (idiom): build the rows with a comprehension instead of a manual
# append loop.
rows = [awry_corpus.get_utterance(uid).meta["politeness_strategies"]
        for uid in utterance_ids]
politeness_strategies = pd.DataFrame(rows, index=utterance_ids)

# In 21
politeness_strategies.head(10)

# In 22
# first, we need to directly map comment IDs to their conversations.
# We'll build a DataFrame to do this
comment_ids = []
convo_ids = []
timestamps = []
import convokit
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from convokit import Corpus, download
from gensim.models import Word2Vec

nltk.download('punkt')

# Load the creepypasta subreddit corpus (download is cached on disk).
corpus = Corpus(filename=download("subreddit-creepypasta"))

utter_ids = corpus.get_utterance_ids()
length = len(utter_ids)

# Print the posts from subreddit to a file.
# NOTE(review): only the first 2 utterances are written even though `length`
# is computed above — presumably a debugging limit; confirm whether the loop
# bound should be `length` instead of 2.
# FIX: use context managers so the file handles are always closed, and
# iterate a slice instead of a manual `while` counter.
with open("corpus.txt", "w") as corpusTXT:
    for uid in utter_ids[:2]:
        corpusTXT.write(corpus.get_utterance(uid).text)

with open("corpus.txt", "r") as corpusTXT_2:
    text = corpusTXT_2.read()

# Tokenize all of corpus.txt
nltk_sentences = sent_tokenize(text)