import csv
import sys
from itertools import izip

import numpy as np
import pandas

import LIWCMeta


def main(in_filename):
    doc = []
    keyword = "post"
    # The post and comment CSV files have different column layouts, so
    # pick the column names based on the input filename.
    if in_filename.find(keyword) >= 0:
        colnames = ['postid', 'time', 'user', 'no1', 'no2', 'no3', 'no4',
                    'title', 'post', 'now']
        posts = pandas.read_csv(in_filename, names=colnames)
        num = posts.now.tolist()
        doc = posts.post.tolist()
    else:
        colnames = ['postid', 'commentid', 'time', 'user', 'no1', 'no2',
                    'no3', 'comment', 'now']
        comments = pandas.read_csv(in_filename, names=colnames)
        num = comments.now.tolist()
        doc = comments.comment.tolist()

    all_positive_liwc_measures = []
    all_negative_liwc_measures = []
    print "Getting LIWC measures for", in_filename
    liwc_lexicon = LIWCMeta.extract_liwc_features()
    index = 0
    for item in doc:
        if type(item) == float and np.isnan(item):
            item = ""  # treat missing text as an empty document
        outCountDict = LIWCMeta.getLex(item, liwc_lexicon)
        # Sum the function-word category counts for each document,
        # normalize by its word count, and cap the score at 1.0.
        positive_score = (outCountDict['article'] + outCountDict['preposition'] +
                          outCountDict['pronoun'] + outCountDict['conjunction'] +
                          outCountDict['adverbs'] + outCountDict['negation'] +
                          outCountDict['auxiliary_verbs']) / float(num[index])
        all_positive_liwc_measures.append(min(positive_score, 1.0))
        # The negative measure is the normalized pronoun count.
        all_negative_liwc_measures.append(
            outCountDict['pronoun'] / float(num[index]))
        index = index + 1

    if in_filename.find(keyword) >= 0:
        # Posts: write the postid plus the positive and negative score of
        # each post.
        colnames = ['postid', 'Time', 'Author', 'Nocomments', 'upvotes',
                    'downvotes', 'updown', 'title', 'commenttext']
        data = pandas.read_csv('combined_posts.csv', names=colnames)
        postid = list(data.postid)
        with open("combined_emotions_posts.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(izip(postid, all_positive_liwc_measures,
                                  all_negative_liwc_measures))
    else:
        # Comments: write the parent postid plus the positive and negative
        # score of each comment.
        colnames = ['postid', 'commentid', 'Time', 'Author', 'upvotes',
                    'downvotes', 'updown', 'commenttext']
        data = pandas.read_csv('combined_comments.csv', names=colnames)
        postid = list(data.postid)
        with open("combined_emotions_comments.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(izip(postid, all_positive_liwc_measures,
                                  all_negative_liwc_measures))

        # Read the per-comment scores back and average them over all the
        # comments that belong to the same post.
        colnames = ['postid', 'positive', 'negative']
        data = pandas.read_csv("combined_emotions_comments.csv", names=colnames)
        postid = list(data.postid)
        positive = list(data.positive)
        negative = list(data.negative)
        unique_postid = []
        positive_unique_postid = []
        negative_unique_postid = []
        p1 = 0
        while p1 < len(postid):
            # Find the end of the run of rows sharing postid[p1].
            p2 = p1
            while p2 < len(postid) and postid[p2] == postid[p1]:
                p2 += 1
            unique_postid.append(postid[p1])
            positive_unique_postid.append(sum(positive[p1:p2]) / float(p2 - p1))
            negative_unique_postid.append(sum(negative[p1:p2]) / float(p2 - p1))
            p1 = p2

        # Write the average positive and negative LIWC values for all
        # comments of a post into a CSV file.
        with open("combined_emotions_comments_unique.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(izip(unique_postid, positive_unique_postid,
                                  negative_unique_postid))
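# A minimal sketch of an entry point, assuming (as the code above does) that
# the path to the posts or comments CSV is passed as the first command-line
# argument. The script name in the usage comment is hypothetical.
if __name__ == "__main__":
    # e.g. python reddit_liwc_emotions.py combined_comments.csv
    main(sys.argv[1])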
from itertools import izip
import itertools
import sys

import numpy as np
import pandas
import pylab as pl

import LIWCMeta

# Get the body, word count, and posting hour of every post.
colnames = ['hour', 'post', 'num_words']
posts = pandas.read_csv('combined_post_hourly.csv', names=colnames)
post_body = posts.post.tolist()
num = posts.num_words.tolist()
hour = posts.hour.tolist()

# The LIWC category is given as the first command-line argument; compute
# its word-count-normalized value for each post.
category1 = []
index = 0
liwc_lexicon = LIWCMeta.extract_liwc_features()
for item in post_body:
    if type(item) == float and np.isnan(item):
        item = ""  # treat missing post bodies as empty strings
    outCountDict = LIWCMeta.getLex(item, liwc_lexicon)
    category1.append(outCountDict[sys.argv[1]] / float(num[index]))
    index = index + 1

# Aggregate the LIWC values based on the day of the week.
one = []
count_one = 0
two = []
count_two = 0
three = []
count_three = 0
four = []
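# A sketch of the aggregation step, assuming `hour` and `category1` are the
# parallel lists built above: accumulate a running sum and count per bucket
# in dictionaries, then average. This is an illustrative alternative to the
# per-bucket lists, not the original code.
from collections import defaultdict

bucket_sum = defaultdict(float)
bucket_count = defaultdict(int)
for h, score in izip(hour, category1):
    bucket_sum[h] += score
    bucket_count[h] += 1
# average normalized LIWC value per time bucket
bucket_average = dict((h, bucket_sum[h] / bucket_count[h]) for h in bucket_sum)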