import tweetproc
import os
import io


def process(inputdir, outfilename, brtime1, brtime2):
    # Empty dictionary to store the behavior response counts for each user
    userlist = {}
    # Process all the files in the input directory
    filelist = tweetproc.jsonindir(inputdir)
    for file in filelist:
        # Returns a dictionary of user ids with the counts of each behavior response for that user
        userbrcounts = tweetproc.tweetcodingbresponse(os.path.join(inputdir, file), brtime1, brtime2)
        # Loop over all of the users in the dictionary returned by tweetcodingbresponse.
        # If the user is already in userlist, add the behavior response counts; otherwise set the counts.
        for user in userbrcounts:
            if user in userlist:
                # User is already in the user list
                for br in userbrcounts[user]:  # Loop over all the behavior responses
                    userlist[user][br] += userbrcounts[user][br]  # Add each to the existing count
            else:
                userlist[user] = userbrcounts[user]  # Just copy the behavior count dictionary over
    print "Processing", outfilename
    # pprint.pprint(userlist)
    # Save the results
    with io.open(outfilename, 'w', encoding="utf-8", errors='ignore') as outfile:
        # Loop over all users in the userlist and save the values to the CSV file
        for user, brcounts in sorted(userlist.items()):
            userstr = user
            for br in brcounts:
                userstr = userstr + "," + str(brcounts[br])
            userstr += "\n"
            outfile.write(unicode(userstr))
        brsummary = {'handwash': 0,
                     'handsanitize': 0,
                     'cough': 0,
                     'avoidgathering': 0,
                     'avoidschool': 0,
                     'total': 0}
        # Loop over all users in the userlist and summarize the number of behavior responses
        for user, brcounts in sorted(userlist.items()):
            for br in brcounts:
                print "user", user, "br", br, "brcounts[br]", brcounts[br], "brsum[br]", brsummary[br]
                brsummary[br] += brcounts[br]
        numusers = len(userlist)
        # For convenience calculate the ratio of users for each category
        # Write out each category count and the percentage of users
        for br in brsummary:
            if numusers > 0:
                percent = float(brsummary[br] * 100) / float(numusers)
            else:
                percent = 0.0
            outfile.write(unicode(str(br) + "," + str(brsummary[br]) + "," + str(round(percent, 2)) + "\n"))
        # Then write out the sum (total number of users)
        outfile.write(unicode("sum," + str(numusers) + "\n"))
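# A minimal, self-contained sketch of the per-user merge performed above, written
# with collections.Counter instead of the explicit if/else update. It illustrates
# the same accumulation pattern, not the code used in process(); the sample
# dictionaries below are made up.
from collections import Counter

def merge_behavior_counts(per_file_results):
    # per_file_results: list of {user: {behavior: count}} dictionaries, one per file
    merged = {}
    for result in per_file_results:
        for user, brcounts in result.items():
            merged.setdefault(user, Counter()).update(brcounts)
    return merged

# Example with two made-up per-file results:
#   merge_behavior_counts([{"u1": {"handwash": 1, "cough": 0}},
#                          {"u1": {"handwash": 2, "cough": 1}, "u2": {"handwash": 1}}])
#   -> {"u1": Counter({"handwash": 3, "cough": 1}), "u2": Counter({"handwash": 1})}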
import tweetproc
import os
import io


def process(inputdir, outfilename, tictime1, tictime2):
    # Empty dictionary to store the tweet counts for each user
    userlist = {}
    # Process all the files in the input directory
    filelist = tweetproc.jsonindir(inputdir)
    for file in filelist:
        usercounts = tweetproc.tic(os.path.join(inputdir, file), tictime1, tictime2)
        # Loop over all of the users in the dictionary returned by the tic method.
        # If the user is already in userlist, add the count; otherwise set the count.
        for user in usercounts:
            if user in userlist:
                userlist[user] += usercounts[user]
            else:
                userlist[user] = usercounts[user]
    print "Processing", outfilename
    with io.open(outfilename, 'w', encoding="utf-8", errors='ignore') as outfile:
        # Loop over all users in the userlist and save the values to the CSV file
        for user, value in sorted(userlist.items()):
            userstr = user + "," + str(value) + "\n"
            outfile.write(unicode(userstr))
        # Loop over all users in the userlist and classify each user by number of tweets.
        # Buckets run from 1 to 5, where the last bucket holds users with 5 or more tweets.
        counts = [0, 0, 0, 0, 0, 0]
        for user, value in sorted(userlist.items()):
            if value >= 5:
                counts[5] += 1
            else:
                counts[value] += 1
        numusers = len(userlist)
        # For convenience calculate the ratio of users in each category
        # Write out the percentage of users with 1, 2, 3, 4, 5+ tweets
        for i in xrange(0, len(counts)):
            if numusers > 0:
                percent = float(counts[i] * 100) / float(numusers)
            else:
                percent = 0.0
            outfile.write(unicode(str(i) + "," + str(counts[i]) + "," + str(round(percent, 2)) + "\n"))
        # Then write out the sum (total number of users)
        outfile.write(unicode("sum," + str(numusers) + "\n"))
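# A minimal usage sketch of process() as defined above, assuming it lives in a module
# next to tweetproc. The directory, output path, and the two time-window values are
# placeholders: tictime1 and tictime2 are passed straight through to tweetproc.tic,
# so their expected format (e.g. epoch seconds or date strings) depends on that
# module and is not shown here.
if __name__ == "__main__":
    # Placeholder arguments -- adjust the paths and the time window to match the
    # data set and whatever time representation tweetproc.tic expects.
    process("data/geoebola", "csv/out.tweetcounts.csv", "2014-10-01", "2014-10-31")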
#!/usr/bin/python
import tweetproc
import os
import io

# Alternative input directory (overridden by the assignment below):
# inputdir = "data/geoebola-sites-Kent-25000"
inputdir = "data/geoebola"
outfilename = "csv/out.geoebola.csv"

# Start with an empty string
outputstring = ""
filelist = tweetproc.jsonindir(inputdir)
for file in filelist:
    outputstring = outputstring + tweetproc.geocsv(os.path.join(inputdir, file))

with io.open(outfilename, 'w', encoding="utf-8", errors='ignore') as outfile:
    outfile.write(outputstring)
import tweetproc
import os
import io


def process(inputdir, outfilename, tctime1, tctime2):
    # Empty dictionary to store the tweet coding counts for each user
    userlist = {}
    # Process all the files in the input directory
    filelist = tweetproc.jsonindir(inputdir)
    for file in filelist:
        # Returns a dictionary of user ids with the counts of each coding category for that user
        usertccounts = tweetproc.tweetcoding(os.path.join(inputdir, file), tctime1, tctime2)
        print usertccounts
        # Loop over all of the users in the dictionary returned by the tweetcoding method.
        # If the user is already in userlist, add the coding counts; otherwise set the counts.
        for user in usertccounts:
            if user in userlist:
                # User is already in the user list
                for tc in usertccounts[user]:  # Loop over all the coding categories
                    userlist[user][tc] += usertccounts[user][tc]  # Add each to the existing count
            else:
                userlist[user] = usertccounts[user]  # Just copy the coding count dictionary over
    print "Processing", outfilename
    # pprint.pprint(userlist)
    # Save the results
    with io.open(outfilename, 'w', encoding="utf-8", errors='ignore') as outfile:
        # Loop over all users in the userlist and save the values to the CSV file
        for user, tccounts in sorted(userlist.items()):
            userstr = user
            for tc in tccounts:
                userstr = userstr + "," + str(tccounts[tc])
            userstr += "\n"
            outfile.write(unicode(userstr))
        tcsummary = {'concern': 0,
                     'experience': 0,
                     'opinion': 0,
                     'sarcasm': 0,
                     'relief': 0,
                     'downplay': 0,
                     'frustration': 0,
                     'total': 0}
        # Loop over all users in the userlist and summarize the counts in each coding category
        for user, tccounts in sorted(userlist.items()):
            for tc in tccounts:
                print "user", user, "tc", tc, "tccounts[tc]", tccounts[tc], "tcsum[tc]", tcsummary[tc]
                tcsummary[tc] += tccounts[tc]
        numusers = len(userlist)
        # For convenience calculate the ratio of users for each category
        # Write out each category count and the percentage of users
        for tc in tcsummary:
            if numusers > 0:
                percent = float(tcsummary[tc] * 100) / float(numusers)
            else:
                percent = 0.0
            outfile.write(unicode(str(tc) + "," + str(tcsummary[tc]) + "," + str(round(percent, 2)) + "\n"))
        # Then write out the sum (total number of users)
        outfile.write(unicode("sum," + str(numusers) + "\n"))
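# A self-contained sketch of the summary step above: collapse per-user coding counts
# into one overall tally and express each tally as a percentage of the number of
# users. The category names and the sample user dictionary are made up for
# illustration; process() keeps its own hard-coded tcsummary keys.
def summarize(userlist, categories):
    # Overall tally per category
    summary = dict((c, 0) for c in categories)
    for counts in userlist.values():
        for c in counts:
            summary[c] += counts[c]
    numusers = len(userlist)
    # Percentage of users, guarding against an empty user list
    percents = {}
    for c in summary:
        percents[c] = round(float(summary[c] * 100) / numusers, 2) if numusers else 0.0
    return summary, percents

# Example with made-up data:
#   summarize({"u1": {"concern": 1, "opinion": 0},
#              "u2": {"concern": 1, "opinion": 1}},
#             ["concern", "opinion"])
#   -> ({"concern": 2, "opinion": 1}, {"concern": 100.0, "opinion": 50.0})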