def trim_low_tweet_gangs():
    """Rewrite the tweet file without users of low non-home-tweeting gangs.

    For each gang, counts its members' tweets that fall inside any *other*
    gang's territory polygon.  Users of gangs whose count is below
    ``my.MIN_NON_HOME_TWEETS`` are removed, and the surviving tweets
    overwrite ``my.HBK_TWEET_LOC_FILE`` in place.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    print('Trimming tweets by low tweeting gangs...')

    # Per gang: number of member tweets located in foreign territories.
    non_home_counts = {}
    for gang_id in gang_members:
        member_tweets = keepUserIds(all_tweets, gang_members[gang_id])
        non_home_counts[gang_id] = sum(
            len(keepPolygon(member_tweets, territory_polys[other_id]))
            for other_id in my.HBK_GANG_ID_LIST
            if other_id != gang_id
        )
    print('Each gang\'s tweet count: %s' % non_home_counts)

    low_gangs = [
        gang_id for gang_id in non_home_counts
        if non_home_counts[gang_id] < my.MIN_NON_HOME_TWEETS
    ]
    print('Removing users from gangs: %s' % low_gangs)

    users_to_drop = []
    for gang_id in low_gangs:
        users_to_drop.extend(gang_members[gang_id])
    kept_tweets = removeUserIds(all_tweets, users_to_drop)

    # Overwrite the old tweet list with the trimmed one.
    print('Replacing old set of tweets...')
    out_path = 'data/' + my.DATA_FOLDER + my.HBK_TWEET_LOC_FILE
    with open(out_path, 'wb') as fp1:
        csv.writer(fp1, delimiter=',').writerows(kept_tweets)
    print(str(len(kept_tweets)) + ' total instances written.')
def generate_gang_tweet_counts():
    """Write each gang's foreign-territory tweet count to a JSON file.

    A gang's count is the number of its members' tweets that fall inside
    the territory polygon of any other gang in ``my.HBK_GANG_ID_LIST``.
    The result is stored as ``json/gang_tweet_counts.json`` under the
    data folder, which is created if missing.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    counts = {}
    print('Finding tweet count by each gang...')
    for gang_id in gang_members:
        member_tweets = prep.keepUserIds(all_tweets, gang_members[gang_id])
        # Only tweets made inside *other* gangs' polygons are counted.
        counts[gang_id] = sum(
            len(prep.keepPolygon(member_tweets, territory_polys[other_id]))
            for other_id in my.HBK_GANG_ID_LIST
            if other_id != gang_id
        )
    print('Each gang\'s tweet count: %s' % counts)

    json_dir = 'data/' + my.DATA_FOLDER + 'json/'
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    with open(json_dir + 'gang_tweet_counts.json', 'wb') as fp1:
        fp1.write(anyjson.dumps(counts))
def trim_non_gang_users():
    """Rewrite the tweet file keeping only tweets by users assigned to a gang.

    The union of all per-gang user lists is built, tweets by anyone else
    are discarded, and the result overwrites ``my.HBK_TWEET_LOC_FILE``.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    print('Removing non gang user tweets...')
    gang_users = []
    for gang_id in gang_members:
        gang_users.extend(gang_members[gang_id])
    kept_tweets = keepUserIds(all_tweets, gang_users)
    print(str(len(kept_tweets)) + ' instances after non gang users removed.')

    # Overwrite the old tweet list with the trimmed one.
    print('Replacing old set of tweets...')
    out_path = 'data/' + my.DATA_FOLDER + my.HBK_TWEET_LOC_FILE
    with open(out_path, 'wb') as fp1:
        csv.writer(fp1, delimiter=',').writerows(kept_tweets)
    print(str(len(kept_tweets)) + ' total instances written.')
def calc_visit_sets():
    """Compute and store visit sets under every normalisation variant.

    Two visitation matrices are built (plain and distance-normalised) and
    each is passed through the non-home, tweet-frequency and rivals
    normalisers, singly and in combination.  Every resulting matrix is
    converted to visit sets and stored under a label naming the
    normalisation applied.  The steps run in a fixed order, which is kept
    exactly.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    def publish(matrix, label):
        # Derive the visit sets from `matrix` and store them under `label`.
        store_visit_set_output(calc_visit_sets_from_visit_mat(matrix), label)

    # Plain visit matrix and the norm vectors derived from it.
    raw_mat = calcVisitationMat(all_tweets, territory_polys, gang_members)
    non_home_norm = calcNonHomeNorm(raw_mat)
    tw_freq_norm = calcTwFreqNorm(raw_mat)
    # calcDistNorm() is the alternative distance norm; the CDF variant is used.
    dist_norm = calcDistNormCDF()

    # No normalisation (absolute fractions).
    publish(raw_mat, 'no_norm')
    # Non-home normalised (TO-territory normalised).
    publish(apply_non_home_norm(raw_mat, non_home_norm), 'non_home_norm')
    # Tweet-frequency normalised (FROM-territory normalised).
    publish(apply_tw_freq_norm(raw_mat, tw_freq_norm), 'tw_freq_norm')

    # Distance normalised.
    dist_mat = calcVisitationMat(all_tweets, territory_polys, gang_members,
                                 dist_norm, home_locs)
    publish(dist_mat, 'dist_norm')

    # Rival-count normalised.
    publish(apply_rivals_norm(raw_mat), 'rivals_norm')

    # Distance combined with one further norm.
    publish(apply_non_home_norm(dist_mat, non_home_norm), 'dist__non_home')
    publish(apply_tw_freq_norm(dist_mat, tw_freq_norm), 'dist__tw_freq')
    publish(apply_rivals_norm(dist_mat), 'dist__rivals')

    # Distance + tweet freq + non-home, then additionally rivals on top.
    stacked = apply_tw_freq_norm(dist_mat, tw_freq_norm)
    stacked = apply_non_home_norm(stacked, non_home_norm)
    publish(stacked, 'dist__tw_freq__non_home')
    stacked = apply_rivals_norm(stacked)
    publish(stacked, 'dist__tw_freq__non_home__rivals')
def generate_visit_mat():
    """Compute the plain visitation matrix and dump it as JSON.

    Output goes to ``json/visit_matrix.json`` under the data folder, which
    is created if it does not exist yet.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    matrix = calc.calcVisitationMat(all_tweets, territory_polys, gang_members)

    json_dir = 'data/' + my.DATA_FOLDER + 'json/'
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    with open(json_dir + 'visit_matrix.json', 'wb') as fp1:
        fp1.write(anyjson.dumps(matrix))
def test():
    """Write every gang-assigned user id to ``hbk_final_users.csv``.

    The per-gang user lists are flattened into one list (gangs in the
    mapping's iteration order, users in their stored order) and written
    one id per CSV row under the data folder.  An empty gang mapping
    produces an empty file instead of raising.
    """
    tty_polys, hbk_poly = load.loadLocPoly()
    # The loader is still called although its result is not used here,
    # so any side effects it has are kept.
    load.loadAllTweets()
    hbk_user_home_loc = load.loadAllHomeLoc(hbk_poly)
    hbk_users_in_gang_t = load.loadUsersInGangTty(tty_polys, hbk_user_home_loc)

    # Flatten without reduce(): reduce() with no initial value raises
    # TypeError on an empty mapping, and repeated `+` is quadratic.
    user_list = [
        user
        for gang_users in hbk_users_in_gang_t.values()
        for user in gang_users
    ]

    with open('data/' + my.DATA_FOLDER + 'hbk_final_users.csv', 'wb') as fp1:
        csv_writer = csv.writer(fp1, delimiter=',')
        for user in user_list:
            csv_writer.writerow([user])
def calc_rival_nonrival_matrics_dist_norm():
    """Compute rival / non-rival visit measures on the distance-normalised matrix.

    For every gang in ``my.HBK_GANG_AND_RIVAL_IDS`` two lists are collected,
    one for its rivals and one for all other gangs:

    * ``measure1``: the visit value towards that territory divided by the
      gang's row total excluding its own entry, rounded to 5 places.
    * ``measure2``: that same fraction divided by ``norm[target]``, rounded
      to 5 places, where ``norm`` comes from the plain (not
      distance-normalised) visit matrix.

    Targets with a zero visit value or a zero norm are skipped.  Results
    are written to ``metrics_dist-norm/measure1.json`` and
    ``measure2.json`` under the data folder.
    """
    tty_polys, hbk_poly = load.loadLocPoly()
    hbk_all_tweets = load.loadAllTweets()
    hbk_user_home_loc = load.loadAllHomeLoc(hbk_poly)
    hbk_users_in_gang_t = load.loadUsersInGangTty(tty_polys, hbk_user_home_loc)

    # calcDistNorm() is the alternative distance norm; the CDF variant is used.
    dist_norm = calcDistNormCDF()
    visit_mat = calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t,
                                  dist_norm, hbk_user_home_loc)
    norm = calcNorm(calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t))

    measure1 = {}
    measure2 = {}
    for gang_id in my.HBK_GANG_AND_RIVAL_IDS:
        rival_ids = my.HBK_GANG_AND_RIVAL_IDS[gang_id]
        row = visit_mat[gang_id]
        measure1[gang_id] = {'rival': [], 'nonrival': []}
        measure2[gang_id] = {'rival': [], 'nonrival': []}
        # Row total without the gang's own (home) entry.
        non_home_sum = sum(row.values()) - row[gang_id]

        for rival_id in rival_ids:
            if gang_id == rival_id or row[rival_id] == 0:
                continue
            # Same zero-norm guard as the non-rival branch; without it the
            # division below could raise ZeroDivisionError.
            if norm[rival_id] == 0:
                continue
            frac = row[rival_id] / float(non_home_sum)
            measure1[gang_id]['rival'].append(round(frac, 5))
            measure2[gang_id]['rival'].append(round(frac / norm[rival_id], 5))

        for non_rival_id in my.HBK_GANG_ID_LIST:
            if gang_id == non_rival_id or non_rival_id in rival_ids:
                continue
            if row[non_rival_id] == 0 or norm[non_rival_id] == 0:
                continue
            frac = row[non_rival_id] / float(non_home_sum)
            measure1[gang_id]['nonrival'].append(round(frac, 5))
            measure2[gang_id]['nonrival'].append(round(frac / norm[non_rival_id], 5))

    # Store metrics.
    out_dir = 'data/' + my.DATA_FOLDER + 'metrics_dist-norm/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(out_dir + 'measure1.json', 'wb') as fp2:
        fp2.write(anyjson.serialize(measure1))
    with open(out_dir + 'measure2.json', 'wb') as fp2:
        fp2.write(anyjson.serialize(measure2))
def see_visit_mat():
    """Print the plain visit matrix as an aligned text table.

    One row is printed per gang (ascending id) and one cell per target
    territory (ascending id).  The gang's own entry and zero entries are
    shown as ``.``; non-zero entries towards a rival listed in
    ``my.HBK_GANG_AND_RIVAL_IDS`` get an ``r`` suffix.

    The header is hard-coded to territory ids 23..54, so each matrix row is
    expected to hold exactly those 32 targets; a different column count
    makes the row formatting raise ``TypeError``.
    """
    tty_polys, hbk_poly = load.loadLocPoly()
    hbk_all_tweets = load.loadAllTweets()
    hbk_user_home_loc = load.loadAllHomeLoc(hbk_poly)
    hbk_users_in_gang_t = load.loadUsersInGangTty(tty_polys, hbk_user_home_loc)
    tty_names = load.loadLocNames()

    visit_mat = calc.calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t)

    header_ids = list(range(23, 55))
    # One '%4s' per column, generated instead of spelling out 32 specifiers.
    row_fmt = '%20s - %2s: ' + ' '.join(['%4s'] * len(header_ids))
    print(row_fmt % tuple(['GANG NAME', '#'] + header_ids))

    for gang_id in sorted(visit_mat):
        row = visit_mat[gang_id]
        cells = []
        # Sort explicitly so cells line up with the ascending header rather
        # than relying on dict iteration order.
        for to_id in sorted(row):
            count = 0 if to_id == gang_id else row[to_id]
            is_rival = to_id in my.HBK_GANG_AND_RIVAL_IDS[gang_id]
            if count == 0:
                cells.append('.')
            elif is_rival:
                cells.append(str(count) + 'r')
            else:
                cells.append(count)
        print(row_fmt % tuple([tty_names[gang_id], gang_id] + cells))
def generate_gang_locs_json():
    """Dump, per gang, the location pair of every tweet by its members.

    Every id in ``my.HBK_GANG_ID_LIST`` gets an entry; gangs with no known
    users get an empty list.  Each location is ``[tweet[1], tweet[2]]``
    (presumably the tweet's coordinates -- confirm against the loader).
    Output goes to ``json/gang_tweet_locs.json`` under the data folder.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    locs_by_gang = {}
    print('Finding tweets by each gang...')
    for gang_id in my.HBK_GANG_ID_LIST:
        if gang_id in gang_members:
            member_tweets = prep.keepUserIds(all_tweets, gang_members[gang_id])
        else:
            member_tweets = []
        locs_by_gang[gang_id] = [[tweet[1], tweet[2]] for tweet in member_tweets]

    sizes = dict((gang_id, len(locs_by_gang[gang_id])) for gang_id in locs_by_gang)
    print('Each gang\'s tweet count: %s' % sizes)
    print('Total tweets = %s' % (sum([len(locs_by_gang[gang_id]) for gang_id in locs_by_gang])))

    json_dir = 'data/' + my.DATA_FOLDER + 'json/'
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    with open(json_dir + 'gang_tweet_locs.json', 'wb') as fp1:
        fp1.write(anyjson.dumps(locs_by_gang))
def trim_home_clusters():
    """Rewrite the tweet file without tweets near their author's home.

    A ``{user_home[0]: [user_home[1], user_home[2]]}`` mapping is built from
    the home-location records and handed to ``removeNearPoints`` together
    with ``my.HOME_RADIUS``; the surviving tweets overwrite
    ``my.HBK_TWEET_LOC_FILE`` in place.
    """
    _, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)

    print('Removing home clusters...')
    homes_by_user = dict(
        (record[0], [record[1], record[2]]) for record in home_locs
    )
    kept_tweets = removeNearPoints(all_tweets, homes_by_user, my.HOME_RADIUS)
    print(str(len(kept_tweets)) + ' instances after home clusters removed.')

    # Overwrite the old tweet list with the trimmed one.
    print('Replacing old set of tweets...')
    out_path = 'data/' + my.DATA_FOLDER + my.HBK_TWEET_LOC_FILE
    with open(out_path, 'wb') as fp1:
        csv.writer(fp1, delimiter=',').writerows(kept_tweets)
    print(str(len(kept_tweets)) + ' total instances written.')
def see_gang_tweet_counts():
    """Print a per-gang summary table of tweet and user counts.

    For each gang with at least one tweet the table shows id, name, tweet
    count, user count, number of entries in its rival list and the
    truncated tweets-per-user ratio, followed by overall user and tweet
    totals.
    """
    territory_polys, region_poly = load.loadLocPoly()
    tty_names = load.loadLocNames()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    counts = {}
    print('Finding tweet count by each gang...')
    for gang_id in gang_members:
        counts[gang_id] = len(prep.keepUserIds(all_tweets, gang_members[gang_id]))
    print('Each gang\'s tweet count: %s' % counts)

    row_fmt = '%2s %15s %5s %5s %8s %6s'
    print(row_fmt % ('ID', 'NAME', '#TWs', '#USERs', '#RIVALs', 'TW/USR'))

    # Gangs without any tweet are left out of the table and the user total.
    active = [gang_id for gang_id in counts if counts[gang_id] != 0]
    for gang_id in active:
        n_tweets = counts[gang_id]
        n_users = len(gang_members[gang_id])
        n_rivals = len(my.HBK_GANG_AND_RIVAL_IDS[gang_id])
        print(row_fmt % (gang_id, tty_names[gang_id], n_tweets, n_users,
                         n_rivals, int(n_tweets / float(n_users))))

    print('Total number of users: %s' % sum([len(gang_members[gang_id]) for gang_id in active]))
    print('Total tweets from all users: %s' % sum([counts[gang_id] for gang_id in counts]))
def see_rivalry_list():
    """Print a table of gang pairs with notable mutual visits.

    A pair (A, B) is listed once, the first time the plain visit matrix
    shows at least 5 visits from A to B.  Columns: both ids and names, the
    distance-normalised visit values A>B and B>A (truncated to int),
    whether B is in A's rival list, an "affinity" of ``1 / |A>B - B>A|``
    (0 when both directions are equal) rounded to 3 places, and the
    truncated mean of the two directions.  Rival rows are printed before
    non-rival rows.
    """
    tty_polys, hbk_poly = load.loadLocPoly()
    hbk_all_tweets = load.loadAllTweets()
    hbk_user_home_loc = load.loadAllHomeLoc(hbk_poly)
    hbk_users_in_gang_t = load.loadUsersInGangTty(tty_polys, hbk_user_home_loc)
    tty_names = load.loadLocNames()

    # The plain matrix decides which pairs qualify; the distance-normalised
    # one supplies the values that are shown.
    visit_mat_1 = calc.calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t)
    dist_norm = calc.calcDistNormCDF()
    visit_mat = calc.calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t,
                                       dist_norm, hbk_user_home_loc)

    # Unordered-pair keys replace str(a)+str(b) concatenation, which is
    # ambiguous when ids have different digit counts (2+34 vs 23+4).
    seen_pairs = set()
    rows = []
    # Sorted outer loop plus a list keeps the output order deterministic.
    for gang_id in sorted(my.HBK_GANG_AND_RIVAL_IDS):
        for rival_id in my.HBK_GANG_ID_LIST:
            if rival_id == gang_id:
                continue
            if not visit_mat_1[gang_id][rival_id] >= 5:
                continue
            pair = frozenset((gang_id, rival_id))
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)

            a_to_b = visit_mat[gang_id][rival_id]
            b_to_a = visit_mat[rival_id][gang_id]
            if rival_id in my.HBK_GANG_AND_RIVAL_IDS[gang_id]:
                relation = 'rival'
            else:
                relation = 'nonrival'
            if a_to_b != b_to_a:
                affinity = round(1.0 / abs(a_to_b - b_to_a), 3)
            else:
                affinity = 0
            rows.append([
                gang_id, tty_names[gang_id],
                rival_id, tty_names[rival_id],
                int(a_to_b), int(b_to_a),
                relation, affinity,
                int((a_to_b + b_to_a) / 2),
            ])

    ordered_rows = ([row for row in rows if row[6] == 'rival'] +
                    [row for row in rows if row[6] == 'nonrival'])

    row_fmt = '%2s %20s => %2s %20s \t %4s \t %4s \t %8s \t %8s \t %5s'
    header = ['A#', 'GANG A', 'B#', 'GANG B', 'A>B', 'B>A', 'RnR', 'Affinity', 'AvgTw']
    print(row_fmt % tuple(header))
    for row in ordered_rows:
        print(row_fmt % tuple(row))
def trim_low_user_gangs():
    """Rewrite the tweet file without users of gangs that have too few members.

    Gangs with fewer than ``my.MIN_GANG_USERS`` assigned users have all
    their users' tweets removed; the remaining tweets overwrite
    ``my.HBK_TWEET_LOC_FILE`` in place.
    """
    territory_polys, region_poly = load.loadLocPoly()
    all_tweets = load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    gang_members = load.loadUsersInGangTty(territory_polys, home_locs)

    member_counts = dict(
        (gang_id, len(gang_members[gang_id])) for gang_id in gang_members
    )

    print('Trimming tweets by low user gangs...')
    small_gangs = [
        gang_id for gang_id in member_counts
        if member_counts[gang_id] < my.MIN_GANG_USERS
    ]
    print('Removing users from gangs: %s' % small_gangs)

    users_to_drop = []
    for gang_id in small_gangs:
        users_to_drop.extend(gang_members[gang_id])
    kept_tweets = removeUserIds(all_tweets, users_to_drop)

    # Overwrite the old tweet list with the trimmed one.
    print('Replacing old set of tweets...')
    out_path = 'data/' + my.DATA_FOLDER + my.HBK_TWEET_LOC_FILE
    with open(out_path, 'wb') as fp1:
        csv.writer(fp1, delimiter=',').writerows(kept_tweets)
    print(str(len(kept_tweets)) + ' total instances written.')
def calcTweetDistances():
    """Print the first field of every home-location record.

    NOTE(review): despite the name, no distance is computed here -- the
    function only loads the data and prints ``record[0]`` for each home
    record (presumably the user id; confirm against the loader).
    """
    print('Calculating tweeting distances...')
    _, region_poly = load.loadLocPoly()
    # The result is unused; the call is kept for any side effects the
    # loader may have.
    load.loadAllTweets()
    home_locs = load.loadAllHomeLoc(region_poly)
    print([record[0] for record in home_locs])
def calc_rival_nonrival_matrics():
    """Compute rival / non-rival visit measures on the plain visit matrix.

    Prints every non-zero matrix entry and every norm value, then for each
    gang in ``my.HBK_GANG_AND_RIVAL_IDS`` collects, separately for rivals
    and for all other gangs:

    * ``measure1``: the visit count towards that territory divided by the
      gang's row total excluding its own entry, rounded to 5 places.
    * ``measure2``: that fraction divided by ``norm[target]``, rounded to
      5 places.
    * ``measure3``: the unrounded ``[fraction, norm[target]]`` pair.

    Targets with a zero visit count or a zero norm are skipped.  Results
    are written to ``metrics/measure1.json``, ``measure2.json`` and
    ``measure3.json`` under the data folder.
    """
    tty_polys, hbk_poly = load.loadLocPoly()
    hbk_all_tweets = load.loadAllTweets()
    hbk_user_home_loc = load.loadAllHomeLoc(hbk_poly)
    hbk_users_in_gang_t = load.loadUsersInGangTty(tty_polys, hbk_user_home_loc)

    visit_mat = calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t)
    for gang_id in visit_mat:
        for to_id in visit_mat[gang_id]:
            if visit_mat[gang_id][to_id] != 0:
                print(str(gang_id) + ' => ' + str(to_id) + ' : ' + str(visit_mat[gang_id][to_id]))

    norm = calcNorm(visit_mat)
    for gang_id in norm:
        print(str(gang_id) + ' : ' + str(norm[gang_id]))

    measure1 = {}
    measure2 = {}
    # measure3 stores [frac, norm] instead of absolute values.
    measure3 = {}
    for gang_id in my.HBK_GANG_AND_RIVAL_IDS:
        rival_ids = my.HBK_GANG_AND_RIVAL_IDS[gang_id]
        row = visit_mat[gang_id]
        measure1[gang_id] = {'rival': [], 'nonrival': []}
        measure2[gang_id] = {'rival': [], 'nonrival': []}
        measure3[gang_id] = {'rival': [], 'nonrival': []}
        # Row total without the gang's own (home) entry.
        non_home_sum = sum(row.values()) - row[gang_id]

        for rival_id in rival_ids:
            if gang_id == rival_id or row[rival_id] == 0:
                continue
            # Same zero-norm guard as the non-rival branch; without it the
            # division below could raise ZeroDivisionError.
            if norm[rival_id] == 0:
                continue
            frac = row[rival_id] / float(non_home_sum)
            measure1[gang_id]['rival'].append(round(frac, 5))
            measure2[gang_id]['rival'].append(round(frac / norm[rival_id], 5))
            measure3[gang_id]['rival'].append([frac, norm[rival_id]])

        for non_rival_id in my.HBK_GANG_ID_LIST:
            if gang_id == non_rival_id or non_rival_id in rival_ids:
                continue
            if row[non_rival_id] == 0 or norm[non_rival_id] == 0:
                continue
            frac = row[non_rival_id] / float(non_home_sum)
            measure1[gang_id]['nonrival'].append(round(frac, 5))
            measure2[gang_id]['nonrival'].append(round(frac / norm[non_rival_id], 5))
            measure3[gang_id]['nonrival'].append([frac, norm[non_rival_id]])

    # Store metrics.
    out_dir = 'data/' + my.DATA_FOLDER + 'metrics/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(out_dir + 'measure1.json', 'wb') as fp2:
        fp2.write(anyjson.serialize(measure1))
    with open(out_dir + 'measure2.json', 'wb') as fp2:
        fp2.write(anyjson.serialize(measure2))
    with open(out_dir + 'measure3.json', 'wb') as fp2:
        fp2.write(anyjson.serialize(measure3))