def main(): """Generate stats on localness metric performance. Expects tables in a PostgreSQL database containing n-day information for each user. Plurality can be generated from the n-day data. Expects a CSV file with geometric median results for all of the users. Expects a CSV file with locaiton field results for all of the users. :return: prints out a lot of different stats about the localness metrics and their overlap """ conn = psycopg2.connect("dbname={0}".format(DBNAME)) cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json) results = [['repository','Users (K)','% 5-D','%10-D', '%30-D', '%60-D', '% Med', '% Loc', '', '# VGI (M)', '% 5-D','%10-D','%30-D','%60-D', '% Plu', '% Med', '% Loc']] twitter_bots = bots.build_bots_filter() for repository in VGI_REPOSITORIES: cur.execute("SELECT uid, ntime, count, fips FROM {0}{1};".format(NDAY_TABLE_BASENAME, repository)) users = {} # keep track of the users who have been processed nday_localized_users_60 = set() # number of users who are local to at least one county when n=60 days nday_localized_users_30 = set() nday_localized_users_10 = set() nday_localized_users_5 = set() nday_local_content_60 = 0 # number of VGI that are local when n=60 days nday_local_content_30 = 0 nday_local_content_10 = 0 nday_local_content_5 = 0 nday_not_potentially_local_content = 0 # number of VGI that couldn't be local under n-days because user only contributed to once total_content = 0 # total number of VGI geolocated to counties users_plurality = {} # track county with most contributions for each user users_content = {} # track total amount of content per user for later stats in geometric median and location field for row in cur: uid = str(row[0]) # user ID if uid in twitter_bots: continue # ntime is a PostgreSQL interval converted to datetime.timedelta by psycopg2 # it's the time between first and last contributions to this county by the user ntime = row[1] cnt = row[2] # number of VGI contributed to the county by the user county_fip = row[3] # FIPS code for the county if county_fip: users[uid] = True total_content += cnt if ntime.days >= 60: nday_localized_users_60.add(uid) nday_local_content_60 += cnt if ntime.days >= 30: nday_localized_users_30.add(uid) nday_local_content_30 += cnt if ntime.days >= 10: nday_localized_users_10.add(uid) nday_local_content_10 += cnt if ntime.days >= 5: nday_localized_users_5.add(uid) nday_local_content_5 += cnt if uid in users_plurality: if cnt > users_plurality[uid]['plurality_count']: # new plurality county users_plurality[uid]['plurality'] = [county_fip] users_plurality[uid]['plurality_count'] = cnt elif cnt == users_plurality[uid]['plurality_count']: # tie: plurality assigned to all counties with that # of VGI users_plurality[uid]['plurality'].append(county_fip) users_content[uid][county_fip] = cnt else: users_plurality[uid] = {'plurality':[county_fip], 'plurality_count':cnt} users_content[uid] = {county_fip:cnt} plurality_local_content = 0 for uid in users_plurality: # only one contribution for a user = couldn't be n-day and I want to track that if users_plurality[uid]['plurality_count'] == 1 and len(users_plurality[uid]['plurality']) == 1: nday_not_potentially_local_content += 1 plurality_local_content += users_plurality[uid]['plurality_count'] * len(users_plurality[uid]['plurality']) print("{0}: {1} Total VGI and {2} users who only had one piece of VGI.".format(repository, total_content, nday_not_potentially_local_content)) print("60-day: {0} users processed and {1} had at least one county determined to be local for {2}.".format(len(users), len(nday_localized_users_60), repository)) print("30-day: {0} users processed and {1} had at least one county determined to be local for {2}.".format(len(users), len(nday_localized_users_30), repository)) print("10-day: {0} users processed and {1} had at least one county determined to be local for {2}.".format(len(users), len(nday_localized_users_10), repository)) print(" 5-day: {0} users processed and {1} had at least one county determined to be local for {2}.".format(len(users), len(nday_localized_users_5), repository)) vgi_median_fn = '{0}/{1}/{2}'.format(GEOMETRIC_MEDIAN_FOLDER, repository, GEOMETRIC_MEDIAN_FILENAME) median_localized_users = 0 # number of users who were assigned a county per geometric median median_users_in_nday = 0 # number of users who overlap with ndays userlist (geometric median might include users without US points median_additional_users = 0 # number of users in geometric median who aren't in n-days median_local_content = 0 # number of VGI that were local to the geometric median county median_potentially_local_content = 0 # number of VGI produced by the localized users median_users_nocontent = 0 # number of users who didn't contribute to their geometric median county with open(vgi_median_fn, 'r') as fin: csvreader = csv.reader(fin) #assert next(csvreader) == ['uid','county'] for line in csvreader: uid = line[0] if uid in twitter_bots: continue county = line[1] if uid in users: median_users_in_nday += 1 if county: for fips in users_content[uid]: median_potentially_local_content += users_content[uid][fips] median_localized_users += 1 if county in users_content[uid]: median_local_content += users_content[uid][county] else: median_users_nocontent += 1 else: median_additional_users += 1 print("VGI Median: {0} users found from nday of which {1} were localized and {2} additional users not considered for {3}.".format(median_users_in_nday, median_localized_users, median_additional_users, repository)) print("{0} VGI Median users with no content in the county where they are local.".format(median_users_nocontent)) print("Out of {0} VGI that could have been declared local (i.e. vgi median was successful for that user), {1} was local.".format(median_potentially_local_content, median_local_content)) locfield_fn = '{0}/{1}/'.format(LOCATION_FIELD_FOLDER, repository, LOCATION_FIELD_FILENAME) locfield_localized_users = set() locfield_users_in_nday = 0 locfield_additional_users = 0 locfield_local_content = 0 locfield_potentially_local_content = 0 locfield_users_nocontent = set() with open(locfield_fn, 'r') as fin: csvreader = csv.reader(fin) assert next(csvreader) == ['uid','loc_field','county'] for line in csvreader: uid = line[0] if uid in twitter_bots: continue counties = line[2].split(';') # usually one county, but sometimes multiple as for NYC or Twin Cities if uid in users: locfield_users_in_nday += 1 for county in counties: if county: for fips in users_content[uid]: locfield_potentially_local_content += users_content[uid][fips] locfield_localized_users.add(uid) if county in users_content[uid]: locfield_local_content += users_content[uid][county] else: locfield_users_nocontent.add(uid) else: locfield_additional_users += 1 print("LocField: {0} users found from nday of which {1} were localized and {2} additional users not considered for {3}.".format(locfield_users_in_nday, len(locfield_localized_users), locfield_additional_users, repository)) print("{0} LocField users with no content in the county where they are local.".format(len(locfield_users_nocontent))) print("Out of {0} VGI that could have been declared local (i.e. loc field was successful for that user), {1} was local.".format(locfield_potentially_local_content, locfield_local_content)) if repository == 'swarm': repository = 'swarm\t' # prints out prettier in standard out results.append([repository, round(len(users) / 1000.0, 1), '', # placeholder for better spacing of print round(float(len(nday_localized_users_5)) / (len(users) - nday_not_potentially_local_content), 3), round(float(len(nday_localized_users_10)) / (len(users) - nday_not_potentially_local_content), 3), round(float(len(nday_localized_users_30)) / (len(users) - nday_not_potentially_local_content), 3), round(float(len(nday_localized_users_60)) / (len(users) - nday_not_potentially_local_content), 3), round(float(median_localized_users) / median_users_in_nday, 3), round(float(len(locfield_localized_users)) / locfield_users_in_nday, 3), '', round(float(total_content) / 1000000.0, 1), '', round(float(nday_local_content_5) / (total_content - nday_not_potentially_local_content), 3), round(float(nday_local_content_10) / (total_content - nday_not_potentially_local_content), 3), round(float(nday_local_content_30) / (total_content - nday_not_potentially_local_content), 3), round(float(nday_local_content_60) / (total_content - nday_not_potentially_local_content), 3), round(float(plurality_local_content) / total_content, 3), round(float(median_local_content) / median_potentially_local_content, 3), round(float(locfield_local_content) / locfield_potentially_local_content, 3)]) print('\n') for result in results: print('\t'.join(str(r) for r in result))
def compute_happiness(scale='counties'): """Compute happiness by county based on localness-processed CSV from localness.py.""" # generate word -> happiness dictionary happy_dict = build_happiness_dict() bots_filter = bots.build_bots_filter() # directory containing all of the tweets sorted by state or county depending on scale - one file for each region tweets_dir = './{0}'.format(scale) tweets_fns = os.listdir(tweets_dir) output_fn = "./raw_happiness_results_{0}.csv".format(scale) with open(output_fn, "w") as fout: csvwriter = csv.writer(fout) for localness in LOCALNESS_METRICS: csvwriter.writerow([ '{0}_fips'.format(scale), '{0}_med_h'.format(localness), '{0}_avg_h'.format(localness), 'nonlocal_med_h', 'nonlocal_avg_h', 'unfiltered_med_h', 'unfiltered_avg_h', 'total_local', 'total_nonlocal', 'local_excluded', 'nonlocal_excluded' ]) local_filtered_out = 0 nonlocal_filtered_out = 0 for file in tweets_fns: with open(os.path.join(tweets_dir, file), 'r') as fin: fips = os.path.splitext(file)[ 0] # files named by <FIPS-CODE>.csv csvreader = csv.reader(fin) header = ['text', 'uid', 'nday', 'plurality'] txt_idx = header.index('text') uid_idx = header.index('uid') localness_idx = header.index(localness) assert next(csvreader) == header local_tweets = [] lt_no_happy_words = 0 non_local = [] nl_no_happy_words = 0 for line in csvreader: txt = line[txt_idx] uid = line[uid_idx] if not line[localness_idx]: continue local = (line[localness_idx] == 'True') if uid in bots_filter: if local: local_filtered_out += 1 else: nonlocal_filtered_out += 1 continue total_happ = 0.0 count_words = 0 for word in txt.split(): cleaned = word.lower().strip('?!.,;:()[]{}"\'') if cleaned in happy_dict: count_words += 1 total_happ += happy_dict[cleaned] if count_words > 0: h_avg_txt = total_happ / count_words if local: local_tweets.append(h_avg_txt) else: non_local.append(h_avg_txt) else: if local: lt_no_happy_words += 1 else: nl_no_happy_words += 1 local_med_h = numpy.median(local_tweets) local_avg_h = numpy.average(local_tweets) nonlocal_med_h = numpy.median(non_local) nonlocal_avg_h = numpy.average(non_local) unfiltered_med_h = numpy.median(local_tweets + non_local) unfiltered_avg_h = numpy.average(local_tweets + non_local) csvwriter.writerow([ fips, local_med_h, local_avg_h, nonlocal_med_h, nonlocal_avg_h, unfiltered_med_h, unfiltered_avg_h, len(local_tweets), len(non_local), lt_no_happy_words, nl_no_happy_words ]) print( "{0} 'local' tweets and {1} 'nonlocal' tweets filtered out from organizations for {2}." .format(local_filtered_out, nonlocal_filtered_out, localness)) process_happiness_results(scale, output_fn)
def compute_happiness(scale='counties'): """Compute happiness by county based on localness-processed CSV from localness.py.""" # generate word -> happiness dictionary happy_dict = build_happiness_dict() bots_filter = bots.build_bots_filter() # directory containing all of the tweets sorted by state or county depending on scale - one file for each region tweets_dir = './{0}'.format(scale) tweets_fns = os.listdir(tweets_dir) output_fn = "./raw_happiness_results_{0}.csv".format(scale) with open(output_fn, "w") as fout: csvwriter = csv.writer(fout) for localness in LOCALNESS_METRICS: csvwriter.writerow(['{0}_fips'.format(scale), '{0}_med_h'.format(localness), '{0}_avg_h'.format(localness), 'nonlocal_med_h', 'nonlocal_avg_h', 'unfiltered_med_h', 'unfiltered_avg_h', 'total_local', 'total_nonlocal', 'local_excluded', 'nonlocal_excluded']) local_filtered_out = 0 nonlocal_filtered_out = 0 for file in tweets_fns: with open(os.path.join(tweets_dir, file), 'r') as fin: fips = os.path.splitext(file)[0] # files named by <FIPS-CODE>.csv csvreader = csv.reader(fin) header = ['text','uid','nday','plurality'] txt_idx = header.index('text') uid_idx = header.index('uid') localness_idx = header.index(localness) assert next(csvreader) == header local_tweets = [] lt_no_happy_words = 0 non_local = [] nl_no_happy_words = 0 for line in csvreader: txt = line[txt_idx] uid = line[uid_idx] if not line[localness_idx]: continue local = (line[localness_idx] == 'True') if uid in bots_filter: if local: local_filtered_out += 1 else: nonlocal_filtered_out += 1 continue total_happ = 0.0 count_words = 0 for word in txt.split(): cleaned = word.lower().strip('?!.,;:()[]{}"\'') if cleaned in happy_dict: count_words += 1 total_happ += happy_dict[cleaned] if count_words > 0: h_avg_txt = total_happ / count_words if local: local_tweets.append(h_avg_txt) else: non_local.append(h_avg_txt) else: if local: lt_no_happy_words += 1 else: nl_no_happy_words += 1 local_med_h = numpy.median(local_tweets) local_avg_h = numpy.average(local_tweets) nonlocal_med_h = numpy.median(non_local) nonlocal_avg_h = numpy.average(non_local) unfiltered_med_h = numpy.median(local_tweets + non_local) unfiltered_avg_h = numpy.average(local_tweets + non_local) csvwriter.writerow([fips, local_med_h, local_avg_h, nonlocal_med_h, nonlocal_avg_h, unfiltered_med_h, unfiltered_avg_h, len(local_tweets), len(non_local), lt_no_happy_words, nl_no_happy_words]) print("{0} 'local' tweets and {1} 'nonlocal' tweets filtered out from organizations for {2}.".format(local_filtered_out, nonlocal_filtered_out, localness)) process_happiness_results(scale, output_fn)
def main(): parser = argparse.ArgumentParser() parser.add_argument('localness_fn', help='CSV output from localness.py script') parser.add_argument('output_stats_fn', help="Path to CSV file output containing the localness stats by county") parser.add_argument('--filter_bots', default=True) args = parser.parse_args() localness_fn = args.localness_fn output_fn = args.output_stats_fn county_idx = INPUT_HEADER.index('county') uid_idx = INPUT_HEADER.index('uid') nday_idx = INPUT_HEADER.index("nday") plur_idx = INPUT_HEADER.index("plurality") geomed_idx = INPUT_HEADER.index("geomed") locfield_idx = INPUT_HEADER.index("locfield") twitter_bots = {} if args.filter_bots: twitter_bots = bots.build_bots_filter() print("Processing {0} and outputting localness results to {1}.".format(localness_fn, output_fn)) output_header = ['fips','all','none','nday','plur','geomed','locfield','npg','ngl','npl','pgl','np','ng','nl','pg','pl','gl','bots'] tracking = {'fips' : ""} for i in range(1, len(output_header)): tracking[output_header[i]] = 0 county_stats = {} with open("resources/USCounties_bare.geojson",'r') as fin: counties = json.load(fin) for county in counties['features']: fips = str(county['properties']["FIPS"]) county_stats[fips] = tracking.copy() county_stats[fips]['fips'] = fips with open(localness_fn, 'r') as fin: csvreader = csv.reader(fin) assert next(csvreader) == INPUT_HEADER line_no = 0 for line in csvreader: line_no += 1 fips = line[county_idx] uid = line[uid_idx] n, p, g, l = False, False, False, False if fips: if uid in twitter_bots: county_stats[fips]['bots'] += 1 continue if line[nday_idx] == 'True': n = True if line[plur_idx] == "True": p = True if line[geomed_idx] == "True": g = True if line[locfield_idx] == "True": l = True if n and p and g and l: county_stats[fips]['all'] += 1 elif not n and not p and not g and not l: county_stats[fips]['none'] += 1 elif n and p and g: county_stats[fips]['npg'] += 1 elif n and g and l: county_stats[fips]['ngl'] += 1 elif n and p and l: county_stats[fips]['npl'] += 1 elif p and g and l: county_stats[fips]['pgl'] += 1 elif n and p: county_stats[fips]['np'] += 1 elif n and g: county_stats[fips]['ng'] += 1 elif n and l: county_stats[fips]['nl'] += 1 elif p and g: county_stats[fips]['pg'] += 1 elif p and l: county_stats[fips]['pl'] += 1 elif g and l: county_stats[fips]['gl'] += 1 elif n: county_stats[fips]['nday'] += 1 elif p: county_stats[fips]['plur'] += 1 elif g: county_stats[fips]['geomed'] += 1 elif l: county_stats[fips]['locfield'] += 1 if line_no % 100000 == 0: print('{0} lines processed.'.format(line_no)) print('{0} total lines processed.'.format(line_no)) with open(output_fn, "w") as fout: csvwriter = csv.DictWriter(fout, fieldnames=output_header) csvwriter.writeheader() for county in county_stats.values(): csvwriter.writerow(county)
def main(): parser = argparse.ArgumentParser() parser.add_argument('localness_fn', help='CSV output from localness.py script') parser.add_argument( 'output_stats_fn', help="Path to CSV file output containing the localness stats by county" ) parser.add_argument('--filter_bots', default=True) args = parser.parse_args() localness_fn = args.localness_fn output_fn = args.output_stats_fn county_idx = INPUT_HEADER.index('county') uid_idx = INPUT_HEADER.index('uid') nday_idx = INPUT_HEADER.index("nday") plur_idx = INPUT_HEADER.index("plurality") geomed_idx = INPUT_HEADER.index("geomed") locfield_idx = INPUT_HEADER.index("locfield") twitter_bots = {} if args.filter_bots: twitter_bots = bots.build_bots_filter() print("Processing {0} and outputting localness results to {1}.".format( localness_fn, output_fn)) output_header = [ 'fips', 'all', 'none', 'nday', 'plur', 'geomed', 'locfield', 'npg', 'ngl', 'npl', 'pgl', 'np', 'ng', 'nl', 'pg', 'pl', 'gl', 'bots' ] tracking = {'fips': ""} for i in range(1, len(output_header)): tracking[output_header[i]] = 0 county_stats = {} with open("resources/USCounties_bare.geojson", 'r') as fin: counties = json.load(fin) for county in counties['features']: fips = str(county['properties']["FIPS"]) county_stats[fips] = tracking.copy() county_stats[fips]['fips'] = fips with open(localness_fn, 'r') as fin: csvreader = csv.reader(fin) assert next(csvreader) == INPUT_HEADER line_no = 0 for line in csvreader: line_no += 1 fips = line[county_idx] uid = line[uid_idx] n, p, g, l = False, False, False, False if fips: if uid in twitter_bots: county_stats[fips]['bots'] += 1 continue if line[nday_idx] == 'True': n = True if line[plur_idx] == "True": p = True if line[geomed_idx] == "True": g = True if line[locfield_idx] == "True": l = True if n and p and g and l: county_stats[fips]['all'] += 1 elif not n and not p and not g and not l: county_stats[fips]['none'] += 1 elif n and p and g: county_stats[fips]['npg'] += 1 elif n and g and l: county_stats[fips]['ngl'] += 1 elif n and p and l: county_stats[fips]['npl'] += 1 elif p and g and l: county_stats[fips]['pgl'] += 1 elif n and p: county_stats[fips]['np'] += 1 elif n and g: county_stats[fips]['ng'] += 1 elif n and l: county_stats[fips]['nl'] += 1 elif p and g: county_stats[fips]['pg'] += 1 elif p and l: county_stats[fips]['pl'] += 1 elif g and l: county_stats[fips]['gl'] += 1 elif n: county_stats[fips]['nday'] += 1 elif p: county_stats[fips]['plur'] += 1 elif g: county_stats[fips]['geomed'] += 1 elif l: county_stats[fips]['locfield'] += 1 if line_no % 100000 == 0: print('{0} lines processed.'.format(line_no)) print('{0} total lines processed.'.format(line_no)) with open(output_fn, "w") as fout: csvwriter = csv.DictWriter(fout, fieldnames=output_header) csvwriter.writeheader() for county in county_stats.values(): csvwriter.writerow(county)