def _calc_visits(user_id):
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    SQL = '''SELECT ST_X(geo), ST_Y(geo) \
        FROM {rel_tweet} \
        WHERE user_id = %s \
        '''.format(rel_tweet=my.REL_TWEET)
    cur.execute(SQL, (user_id, ))
    recs = cur.fetchall()
    con.close()
    home = homes[str(user_id)]
    visits = [0] * 6
    legend = {'w': 0, 'b': 1, 'a': 2, 'h': 3, 'o': 4}
    for rec in recs:
        lat, lng = rec
        dist = int(round(geo.distance(geo.xyz(home[0], home[1]), geo.xyz(lat, lng))))
        if dist > my.MIN_DIST:
            race = _find_race([user_id, [lat, lng]])
            if race:
                visits[legend[race[1]]] += 1
            else:
                visits[5] += 1  # race unknown
    print [user_id, visits]
    return [user_id, visits]
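# The module-level `homes` dict used above (user_id string -> [lat, lng]) is
# assumed to be loaded elsewhere in this module. A minimal sketch of one
# plausible initialization (file name hypothetical):
#
#     with open('data/' + my.DATA_FOLDER + 'user_homes.json', 'rb') as fp:
#         homes = anyjson.loads(fp.read())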
def _find_dir_list():
    '''Find the list of directions for all users in region. [user_directions.csv]'''
    with open('data/' + my.DATA_FOLDER + 'user_list.json', 'rb') as fpr:
        user_ids = anyjson.loads(fpr.read())
    user_ids = [int(user_id) for user_id in user_ids]
    print 'Read {0} user_ids'.format(len(user_ids))
    user_directions = []
    user_dir_trimmed = []
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    for user_id in user_ids:
        SQL = 'SELECT ST_X(geo), ST_Y(geo) \
            FROM {rel_home} \
            WHERE user_id = %s'.format(rel_home=my.REL_HOME)
        cur.execute(SQL, (user_id,))
        records = cur.fetchall()
        if len(records) > 0:
            home = records[0]
            hx, hy = home[1], home[0]
            SQL = 'SELECT ST_X(geo), ST_Y(geo) \
                FROM {rel_tweet} \
                WHERE user_id = %s'.format(rel_tweet=my.REL_TWEET) \
                + my.QUERY_CONSTRAINT
            cur.execute(SQL, (user_id,))
            records = cur.fetchall()
            for rec in records:
                lat, lng = rec
                x, y = lng - hx, lat - hy
                if x != 0 and y != 0:
                    deg = int(round(_calc_angle(x, y)))
                    user_directions.append([user_id, deg])
                    try:
                        dist = int(round(geo.distance(geo.xyz(hy, hx), geo.xyz(lat, lng))))
                    except Exception:
                        dist = 0
                    if dist > my.MIN_DIR_DIST:
                        user_dir_trimmed.append([user_id, deg])
        else:
            print 'Missed 1 user_id!'
    con.close()
    with open('data/' + my.DATA_FOLDER + 'displacement/' + 'user_directions.csv', 'wb') as fpw:
        cw = csv.writer(fpw, delimiter=',')
        for row in user_directions:
            cw.writerow(row)
    with open('data/' + my.DATA_FOLDER + 'displacement/' + 'user_dir_trimmed.csv', 'wb') as fpw:
        cw = csv.writer(fpw, delimiter=',')
        for row in user_dir_trimmed:
            cw.writerow(row)
    # Statistics: count, min, max, mean of direction angles
    x = [d[1] for d in user_directions]
    print len(x), min(x), max(x), sum(x) / len(x)
    x = [d[1] for d in user_dir_trimmed]
    print len(x), min(x), max(x), sum(x) / len(x)
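# `_calc_angle` is used above but not defined in this file. A minimal sketch
# consistent with its use, assuming it returns a bearing in degrees
# normalized to [0, 360):
#
#     import math
#     def _calc_angle(x, y):
#         return math.degrees(math.atan2(y, x)) % 360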
def _calc_distToHoods(from_id, centroids):
    # Calculate distances from the centroid of from_id to all other centroids
    dists = {}
    for to_id in centroids:
        if to_id != from_id:
            dists[to_id] = int(geo.distance(geo.xyz(centroids[from_id][0], centroids[from_id][1]),
                                            geo.xyz(centroids[to_id][0], centroids[to_id][1])))
    return dists
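# Usage sketch (hypothetical centroid dict keyed by neighborhood id,
# values as [lat, lng]):
#
#     centroids = {1: [34.05, -118.24], 2: [34.10, -118.20]}
#     _calc_distToHoods(1, centroids)  # -> {2: <distance in meters>}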
def calcVisitationMat(hbk_all_tweets, tty_polys, hbk_users_in_gang_t, dist_norm=None, hbk_user_home_loc=None):
    # visit_mat[i][j] = #tw(i) in j
    print 'Calculating visitation matrix...'
    # Load visit matrix .pickle if it exists
    if not dist_norm and os.path.exists('data/' + my.DATA_FOLDER + 'json/visit_mat.pickle'):
        with open('data/' + my.DATA_FOLDER + 'json/' + 'visit_mat.pickle', 'rb') as fp1:
            visit_mat = pickle.load(fp1)
    elif dist_norm and os.path.exists('data/' + my.DATA_FOLDER + 'json/visit_mat__dist_norm.pickle'):
        with open('data/' + my.DATA_FOLDER + 'json/' + 'visit_mat__dist_norm.pickle', 'rb') as fp1:
            visit_mat = pickle.load(fp1)
    # Calculate visit matrix if .pickle doesn't exist
    else:
        hbk_home_list = {}
        if dist_norm:
            print '...for distance norm.'
            for user_home in hbk_user_home_loc:
                hbk_home_list[user_home[0]] = [user_home[1], user_home[2]]
        visit_mat = {}
        for gang_id in my.HBK_GANG_ID_LIST:
            visit_mat[gang_id] = {}
        for gang_id in my.HBK_GANG_ID_LIST:
            if gang_id not in hbk_users_in_gang_t:
                for to_id in my.HBK_GANG_ID_LIST:
                    visit_mat[gang_id][to_id] = 0
            else:
                this_gang_tweets = prep.keepUserIds(hbk_all_tweets, hbk_users_in_gang_t[gang_id])
                for to_id in my.HBK_GANG_ID_LIST:
                    this_tty_tweets = prep.keepPolygon(this_gang_tweets, tty_polys[to_id])
                    if dist_norm is None:
                        visit_mat[gang_id][to_id] = len(this_tty_tweets)
                    else:
                        # Weight each tweet by the inverse of the distance-norm
                        # bucket for its displacement from home (100 m buckets)
                        visit_val = 0
                        for tweet in this_tty_tweets:
                            dist = geo.distance(geo.xyz(tweet[1], tweet[2]),
                                                geo.xyz(hbk_home_list[tweet[0]][0], hbk_home_list[tweet[0]][1]))
                            dist_i = int(round(dist / 100 + 1))
                            visit_val += 1 / dist_norm[dist_i]
                        visit_mat[gang_id][to_id] = round(visit_val, 5)
    print 'Done calculating visitation matrix...'
    # Store visit matrix .pickle
    if not os.path.exists('data/' + my.DATA_FOLDER + 'json/'):
        os.makedirs('data/' + my.DATA_FOLDER + 'json/')
    if not dist_norm:
        with open('data/' + my.DATA_FOLDER + 'json/' + 'visit_mat.pickle', 'wb') as fp1:
            pickle.dump(visit_mat, fp1)
    else:
        with open('data/' + my.DATA_FOLDER + 'json/' + 'visit_mat__dist_norm.pickle', 'wb') as fp1:
            pickle.dump(visit_mat, fp1)
    return visit_mat
def _closest_dist(pol_a, pol_b):
    '''Find the closest distance between pol_a and pol_b.
    Closest among the set of end points of line segments in pol_a and pol_b.'''
    min_dist = 15000  # sentinel: larger than any expected distance (meters)
    for a in pol_a:
        for b in pol_b:
            try:
                dist = int(geo.distance(geo.xyz(a[0], a[1]), geo.xyz(b[0], b[1])))
                min_dist = min(min_dist, dist)
            except Exception:
                print 'Error calculating distance!'
    return min_dist
def _territory_span(pol):
    '''Find the spanning distance of the territory, i.e. the maximum distance
    between any two end points of line segments in pol.'''
    max_dist = 0
    for a in pol:
        for b in pol:
            try:
                dist = int(geo.distance(geo.xyz(a[0], a[1]), geo.xyz(b[0], b[1])))
                max_dist = max(max_dist, dist)
            except Exception:
                print 'Error calculating distance!'
    return max_dist
def _is_interaction(tw1, tw2):
    # Two tweets interact if they are close in both space and time
    lat1, lng1, _, ts1, _ = tw1
    lat2, lng2, _, ts2, _ = tw2
    dist = geo.distance(geo.xyz(lat1, lng1), geo.xyz(lat2, lng2))
    tds = abs((ts1 - ts2).total_seconds())
    return dist <= my.MAX_INTERACTION_DIST and tds <= my.MAX_INTERACTION_TIME
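# Hedged example of the tweet tuple layout this expects,
# (lat, lng, <unused>, timestamp, <unused>), with made-up values:
#
#     from datetime import datetime
#     tw1 = (34.0500, -118.2400, None, datetime(2013, 1, 1, 12, 0), None)
#     tw2 = (34.0501, -118.2401, None, datetime(2013, 1, 1, 12, 5), None)
#     _is_interaction(tw1, tw2)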
def get_bounding_box(center, miles):
    # Walk outward from center in 0.0001-degree steps along each axis
    # until the great-circle distance exceeds the requested radius
    radius = miles * my.CONST_MILE_TO_METER
    this_point = [center[0], center[1]]
    while geo.distance(geo.xyz(center[0], center[1]), geo.xyz(this_point[0], this_point[1])) <= radius:
        this_point[0] += 0.0001  # lat
    lat_hi = this_point[0]
    this_point = [center[0], center[1]]
    while geo.distance(geo.xyz(center[0], center[1]), geo.xyz(this_point[0], this_point[1])) <= radius:
        this_point[0] -= 0.0001  # lat
    lat_lo = this_point[0]
    if lat_lo > lat_hi:
        lat_hi, lat_lo = lat_lo, lat_hi
    this_point = [center[0], center[1]]
    while geo.distance(geo.xyz(center[0], center[1]), geo.xyz(this_point[0], this_point[1])) <= radius:
        this_point[1] += 0.0001  # lng
    lng_hi = this_point[1]
    this_point = [center[0], center[1]]
    while geo.distance(geo.xyz(center[0], center[1]), geo.xyz(this_point[0], this_point[1])) <= radius:
        this_point[1] -= 0.0001  # lng
    lng_lo = this_point[1]
    if lng_lo > lng_hi:
        lng_hi, lng_lo = lng_lo, lng_hi
    #return [[lat_lo, lng_lo], [lat_hi, lng_lo], [lat_hi, lng_hi], [lat_lo, lng_hi]]  # polygon
    return [[lat_lo, lng_lo], [lat_hi, lng_hi]]  # bbox
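# Usage sketch (center point hypothetical); assumes my.CONST_MILE_TO_METER
# is roughly 1609.34:
#
#     bbox = get_bounding_box([34.0522, -118.2437], 1.0)
#     # -> [[lat_lo, lng_lo], [lat_hi, lng_hi]]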
def removeNearPoints(tweets, points, radius):
    # Keep tweets with no point for their user, or farther than `radius`
    # from that user's point
    print points
    new_tweets = []
    for tweet in tweets:
        if (tweet[0] not in points) or \
                geo.distance(geo.xyz(tweet[1], tweet[2]),
                             geo.xyz(points[tweet[0]][0], points[tweet[0]][1])) > radius:
            new_tweets.append(tweet)
    return new_tweets
def isNear(point, line):
    [[x1, y1], [x2, y2]] = line
    [px, py] = point
    if ((py <= y1 and py >= y2) or (py >= y1 and py <= y2)) and \
            ((px <= x1 and px >= x2) or (px >= x1 and px <= x2)):
        # Inside the segment's bounding box - calc perpendicular distance
        if x1 == x2:
            # vertical line
            x = x1
            if (py <= y1 and py >= y2) or (py >= y1 and py <= y2):
                y = py
            else:
                y = y1 if abs(py - y1) < abs(py - y2) else y2
        elif y1 == y2:
            # horizontal line
            y = y1
            if (px <= x1 and px >= x2) or (px >= x1 and px <= x2):
                x = px
            else:
                x = x1 if abs(px - x1) < abs(px - x2) else x2
        else:
            # usual line: foot of the perpendicular from (px, py) onto y = m*x + c
            m = (y2 - y1) / (x2 - x1)
            c = y1 - m * x1
            x = (m * py + px - m * c) / (m * m + 1)
            y = (m * m * py + m * px + c) / (m * m + 1)
        try:
            return geo.distance(geo.xyz(px, py), geo.xyz(x, y)) < my.BORDER_LINE_SPAN
        except Exception:
            print 'Couldn\'t calculate geo.distance'
            return False
    else:
        # Outside the bounding box - calc dist from end points of line
        return geo.distance(geo.xyz(px, py), geo.xyz(x1, y1)) < my.BORDER_LINE_SPAN or \
               geo.distance(geo.xyz(px, py), geo.xyz(x2, y2)) < my.BORDER_LINE_SPAN
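# A minimal self-check (not part of the pipeline) that the projection
# formulas above place (x, y) on the line y = m*x + c and make the segment
# from (px, py) to (x, y) perpendicular to it:
def _check_perpendicular_foot(px, py, x1, y1, x2, y2):
    m = (y2 - y1) / float(x2 - x1)
    c = y1 - m * x1
    x = (m * py + px - m * c) / (m * m + 1)
    y = (m * m * py + m * px + c) / (m * m + 1)
    on_line = abs(y - (m * x + c)) < 1e-9
    # dot product of (foot - point) with the line direction should vanish
    perp = abs((x - px) * (x2 - x1) + (y - py) * (y2 - y1)) < 1e-9
    return on_line and perp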
def find_most_visited_loc():
    points = []
    results = []
    clusters = []
    with open(my.DATA_FOLDER + '/' + my.HBK_HOME_LOC_FILE, 'rb') as fp1:
        csv_reader = csv.reader(fp1, delimiter=',')
        for row in csv_reader:
            current_user = row[0]
            current_user_home = geo.xyz(float(row[1].strip()), float(row[2].strip()))
            try:
                with open(my.DATA_FOLDER + '/' + my.USER_TWEET_LOC_FOLDER + '/' +
                          str(my.HBK_LOCATION_ID) + '/' + str(current_user) + '.csv', 'rb') as fp2:
                    csv_reader2 = csv.reader(fp2, delimiter=',')
                    for row2 in csv_reader2:
                        if len(row2[0].strip()) != 0 and len(row2[1].strip()) != 0:
                            # Keep only points farther than 100 m from the user's home
                            this_point = geo.xyz(float(row2[0].strip()), float(row2[1].strip()))
                            if int(round(geo.distance(current_user_home, this_point))) > 100:
                                points.append([float(row2[0].strip()), float(row2[1].strip())])
            except IOError:
                print 'No file found for user... ' + str(current_user)
    print 'Total latlng pairs read: ' + str(len(points))
    if len(points) != 0:
        print 'Running DBScan... '
        results = dbscan(points, my.DBSCAN_EPSILON, my.DBSCAN_MIN_POINTS)
        print 'Run complete... Number of clusters = ' + str(len(results))
        for key in results:
            if key != -1:  # -1 holds DBSCAN noise points
                center = calc_center(results[key])
                clusters.append([center[0], center[1], len(results[key])])
        with open(my.DATA_FOLDER + '/' + my.MOST_VISITED_LOC_FILE, 'wb') as fp3:
            csv_writer = csv.writer(fp3, delimiter=',')
            for row in clusters:
                csv_writer.writerow(row)
        with open(my.DATA_FOLDER + '/' + my.MOST_VISITED_LOC_FILE_json, 'wb') as fp3:
            fp3.write(anyjson.serialize(clusters))
def _get_points(user_id):
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    SQL = '''SELECT ST_X(geo), ST_Y(geo) \
        FROM {rel_tweet} \
        WHERE user_id = %s \
        '''.format(rel_tweet=my.REL_TWEET)
    cur.execute(SQL, (user_id, ))
    recs = cur.fetchall()
    con.close()
    home = homes[str(user_id)]
    points = []
    for rec in recs:
        lat, lng = rec
        dist = int(round(geo.distance(geo.xyz(home[0], home[1]), geo.xyz(lat, lng))))
        if dist > my.MIN_DIST:
            points.append([round(lat, 5), round(lng, 5)])
    return [race_lookup[user_id], points]
def _get_dist(user_id):
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    SQL = '''SELECT ST_X(geo), ST_Y(geo) \
        FROM {rel_tweet} \
        WHERE user_id = %s \
        '''.format(rel_tweet=my.REL_TWEET)
    cur.execute(SQL, (user_id, ))
    recs = cur.fetchall()
    con.close()
    home = homes[str(user_id)]
    disp = []
    for rec in recs:
        lat, lng = rec
        dist = int(round(geo.distance(geo.xyz(home[0], home[1]), geo.xyz(lat, lng))))
        if dist > 100:  # ignore displacements within 100 m of home
            disp.append(dist)
    if len(disp) > 0:
        return disp
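# Hedged usage sketch (user ids hypothetical): pool per-user displacement
# samples, e.g. as input to the power-law fit consumed by
# _calc_visits_dist_norm() below:
#
#     all_disp = []
#     for uid in [101, 102, 103]:
#         d = _get_dist(uid)
#         if d:
#             all_disp.extend(d)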
def make_feature_mat(mat, links, folder, file_name):
    X = []
    y = []
    centers = _load_nhood_centers()
    pols = _load_nhood_polygons()
    mat_f = _calc_mat_frac(mat)
    for a, b, label in links:
        instance = [
            int(geo.distance(geo.xyz(centers[a][0], centers[a][1]),
                             geo.xyz(centers[b][0], centers[b][1]))),  # CENTROID_DIST
            _closest_dist(pols[a], pols[b]),  # CLOSEST_DIST
            max(_territory_span(pols[a]), _territory_span(pols[b])),  # MAX_TTY_SPAN
            abs(_territory_span(pols[a]) - _territory_span(pols[b])),  # TTY_SPAN_DIFF
            pow(max(_territory_span(pols[a]), _territory_span(pols[b])), 2),  # SPAN_SQ
            mat_f[a][b] + mat_f[b][a],  # TOTAL_VISITS
            (mat_f[a][b] + mat_f[b][a]) / 2,  # AVG_VISITS
            _in_density(a, mat_f) + _in_density(b, mat_f),  # IN_DENSITY_ApB
            abs(_in_density(a, mat_f) - _in_density(b, mat_f)),  # IN_DENSITY_AmB
            _out_density(a, mat_f) + _out_density(b, mat_f),  # OUT_DENSITY_ApB
            abs(_out_density(a, mat_f) - _out_density(b, mat_f)),  # OUT_DENSITY_AmB
            _in_entropy(a, mat_f) + _in_entropy(b, mat_f),  # sum of in-entropies
            _out_entropy(a, mat_f) + _out_entropy(b, mat_f),  # sum of out-entropies
            _in_cross_entropy(a, b, mat_f),  # IN_CROSS_ENTROPY
            _out_cross_entropy(a, b, mat_f),  # OUT_CROSS_ENTROPY
        ]
        X.append(instance)
        y.append(label)
    Xy = {'X': X, 'y': y}
    with open('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + 'Xy.pickle', 'wb') as fp1:
        pickle.dump(Xy, fp1)
    return Xy
def _calc_visits_dist_norm(user_id):
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    SQL = '''SELECT ST_X(geo), ST_Y(geo) \
        FROM {rel_tweet} \
        WHERE user_id = %s \
        '''.format(rel_tweet=my.REL_TWEET)
    cur.execute(SQL, (user_id, ))
    recs = cur.fetchall()
    con.close()
    home = homes[str(user_id)]
    visits = [0.0] * 6
    legend = {'w': 0, 'b': 1, 'a': 2, 'h': 3, 'o': 4}
    with open('data/' + my.DATA_FOLDER + 'user_disp_param.json', 'rb') as fp:
        param = anyjson.loads(fp.read())
    amp = param['amp']
    index = param['index']
    powerlaw = lambda x: amp * (x ** index)
    for rec in recs:
        lat, lng = rec
        dist = int(round(geo.distance(geo.xyz(home[0], home[1]), geo.xyz(lat, lng))))
        if dist > my.MIN_DIST:
            weight = 1 - powerlaw(dist)
            race = _find_race([user_id, [lat, lng]])
            if race:
                visits[legend[race[1]]] += weight
            else:
                visits[5] += weight  # race unknown
    visits = [round(i, 4) for i in visits]
    print [user_id, visits]
    return [user_id, visits]
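# Sketch of the weighting intent above, assuming the fitted power law
# P(d) = amp * d**index decays with distance (index < 0): a visit at
# displacement d counts with weight 1 - P(d), so a priori likely short
# trips are discounted and rare long trips count nearly fully.
#
#     weight_at = lambda d: 1 - amp * (d ** index)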
def plot_rivalry(folder, file_name):
    with open('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + 'links.pickle', 'rb') as fp1:
        links = pickle.load(fp1)
    with open('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + 'predicted_links.pickle', 'rb') as fp1:
        links_p = pickle.load(fp1)
    with open('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + 'info.txt', 'rb') as fp1:
        info = fp1.read()
    centers = _load_nhood_centers()
    names = _load_nhood_names()
    with open('data/' + my.DATA_FOLDER + 'rivalry_baseline.pickle', 'rb') as fp:
        rivalry_baseline = pickle.load(fp)
    actual = [(np.array([centers[a], centers[b]]), y) for a, b, y in links]
    baseline = []
    predicted = []
    for i in range(len(links)):
        a, b, y = links_p[i]
        if y != links[i][2]:
            y += 2  # offset the class id to mark a misprediction
        predicted.append((np.array([centers[a], centers[b]]), y))
        y = rivalry_baseline[a][b]
        if y != links[i][2]:
            y += 2
        baseline.append((np.array([centers[a], centers[b]]), y))
    y_ = [y for v, y in actual]
    y_pred = [y for v, y in baseline]
    true = len([1 for i in range(len(y_)) if y_[i] == y_pred[i]])
    true_r = len([1 for i in range(len(y_)) if y_[i] == 1 and y_[i] == y_pred[i]])
    miss = len(y_) - true
    acc = true / float(len(y_pred))
    acc_r = true_r / float(len([1 for i in range(len(y_)) if y_[i] == 1]))
    base_info = 'Links: ' + '{0}'.ljust(10).format(str(len(y_))) + '\n'
    base_info += 'Network acc.: ' + '{0}'.format(str(round(acc * 100, 2)) + '%') + '\n'
    base_info += 'Rivalry acc.: ' + '{0}'.format(str(round(acc_r * 100, 2)) + '%') + '\n'
    pols = []
    for pol in _load_nhood_polygons().values():
        pol = [[ll[1], ll[0]] for ll in pol]
        pols.append(pol)
    lngs = [ll[0] for pol in pols for ll in pol]
    lats = [ll[1] for pol in pols for ll in pol]
    print max(lngs), min(lngs), max(lats), min(lats)
    ## MIGHT NEED TO SWAP x_dist and y_dist
    y_dist = geo.distance(geo.xyz(max(lats), max(lngs)), geo.xyz(max(lats), min(lngs)))
    x_dist = geo.distance(geo.xyz(max(lats), max(lngs)), geo.xyz(min(lats), max(lngs)))
    print x_dist, y_dist
    heat = [1] * 10 + [2] * 10 + [3] * 11
    shuffle(heat)
    heat = np.array(heat)
    #
    # Map (disabled)
    #
    '''
    markers = {29: (-118.21769714355469, 34.074559310537),
        25: (-118.20585250854492, 34.08948780782094),
        33: (-118.17117691040039, 34.08692882376708),
        42: (-118.15933227539062, 34.097306446504355),
        37: (-118.18439483642578, 34.08394324461533),
        47: (-118.19400787353516, 34.08422759002247),
        32: (-118.19211959838867, 34.080246667433315),
        36: (-118.20293426513672, 34.081099738028236),
        31: (-118.20653915405273, 34.0729952204399),
        41: (-118.20121765136719, 34.07143110146333),
        44: (-118.16946029663086, 34.06787617820785),
        26: (-118.19709777832031, 34.059628181822184),
        46: (-118.22164535522461, 34.05102381295824),
        50: (-118.2227611541748, 34.045476732062944),
        30: (-118.22190284729004, 34.041138377469416),
        49: (-118.21074485778809, 34.05130826886282),
        39: (-118.20259094238281, 34.0488192473379),
        52: (-118.19701194763184, 34.05166383740143),
        48: (-118.19486618041992, 34.050028209776336),
        40: (-118.1960678100586, 34.04327202221684),
        43: (-118.20293426513672, 34.043556504127444),
        35: (-118.2030200958252, 34.03843568373248),
        54: (-118.20405006408691, 34.03139405087606),
        45: (-118.20379257202148, 34.022786817002),
        23: (-118.2143497467041, 34.02392501371833),
        53: (-118.20671081542969, 34.02051037777654),
        38: (-118.19520950317383, 34.018803008289744),
        28: (-118.21610927581787, 34.04700577135851),
        51: (-118.2134485244751, 34.047432475078324),
        27: (-118.21108818054199, 34.04618791656029),
        34: (-118.2070541381836, 34.044658862517366)}
    fig = plt.figure(figsize=(1.5* 4, 1.5* 6))
    plt.subplots_adjust(left=0.01, right=0.99, top=0.99, bottom=0.0)
    ax = fig.add_subplot(111, aspect=1.2)
    coll = PolyCollection(pols, array=heat, cmap=mpl.cm.Accent, edgecolors='#111111', alpha=0.75)
    ax.add_collection(coll)
    ax.autoscale_view()
    ax.get_xaxis().set_ticklabels([])
    ax.get_yaxis().set_ticklabels([])
    count = 0
    id_map = {}
    for h_id in markers:
        count += 1
        id_map[h_id] = count
        x, y = markers[h_id]
        ax.text(x, y, str(count), backgroundcolor='#dddddd', color='#000000', fontsize=10,
                alpha=0.8, fontproperties=FontProperties(weight='bold'))
    ids = markers.keys()
    info1 = '\n'.join([str(str(id_map[i]) + ' : ' + names[i]) for i in ids[:8]])
    info2 = '\n'.join([str(str(id_map[i]) + ' : ' + names[i]) for i in ids[8:]])
    ax.text(0.05, 0.99, info1, ha='left', va='top', transform=ax.transAxes, fontsize=12)
    ax.text(0.6, 0.01, info2, ha='left', va='bottom', transform=ax.transAxes, fontsize=12)
    plt.savefig('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + '_map' + '.pdf')
    '''
    #
    # Actual
    #
    fig = plt.figure(figsize=(1.5 * 4, 1.5 * 6))
    plt.subplots_adjust(left=0.01, right=0.99, top=0.99, bottom=0.0)
    ax = fig.add_subplot(111, aspect=1.2)
    coll = PolyCollection(pols, facecolors='none', edgecolors='k', linewidths=1, alpha=0.3)
    ax.add_collection(coll)
    ax.autoscale_view()
    ax.get_xaxis().set_ticklabels([])
    ax.get_yaxis().set_ticklabels([])
    for vertices, y in actual:
        if y == 1:
            ax.plot(vertices[:, 0], vertices[:, 1], color=my.CMAP[y], alpha=0.95, linewidth=my.LINEWIDTH[y])
        else:
            ax.plot(vertices[:, 0], vertices[:, 1], color=my.CMAP[y], alpha=0.75, linewidth=1, linestyle=my.LINESTYLE[y])
    plt.savefig('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + '_actual' + '.pdf')
    #
    # Baseline
    #
    fig = plt.figure(figsize=(1.5 * 4, 1.5 * 6))
    plt.subplots_adjust(left=0.01, right=0.99, top=0.99, bottom=0.0)
    ax = fig.add_subplot(111, aspect=1.2)
    coll = PolyCollection(pols, facecolors='none', edgecolors='k', linewidths=1, alpha=0.3)
    ax.add_collection(coll)
    ax.autoscale_view()
    ax.get_xaxis().set_ticklabels([])
    ax.get_yaxis().set_ticklabels([])
    ax.text(0.98, 0.05, base_info, horizontalalignment='right', verticalalignment='bottom',
            transform=ax.transAxes, fontsize=19)
    for vertices, y in baseline:
        if y == 1:
            ax.plot(vertices[:, 0], vertices[:, 1], color=my.CMAP[y], alpha=0.95, linewidth=my.LINEWIDTH[y])
        else:
            ax.plot(vertices[:, 0], vertices[:, 1], color=my.CMAP[y], alpha=0.9, linewidth=my.LINEWIDTH[y], linestyle=my.LINESTYLE[y])
    plt.savefig('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + '_baseline' + '.pdf')
    #
    # Predicted
    #
    fig = plt.figure(figsize=(1.5 * 4, 1.5 * 6))
    plt.subplots_adjust(left=0.01, right=0.99, top=0.99, bottom=0.0)
    ax = fig.add_subplot(111, aspect=1.2)
    coll = PolyCollection(pols, facecolors='none', edgecolors='k', linewidths=1, alpha=0.3)
    ax.add_collection(coll)
    ax.autoscale_view()
    ax.get_xaxis().set_ticklabels([])
    ax.get_yaxis().set_ticklabels([])
    ax.text(0.98, 0.05, info, horizontalalignment='right', verticalalignment='bottom',
            transform=ax.transAxes, fontsize=19)
    for vertices, y in predicted:
        if y == 1:
            ax.plot(vertices[:, 0], vertices[:, 1], color=my.CMAP[y], alpha=0.95, linewidth=my.LINEWIDTH[y])
        else:
            ax.plot(vertices[:, 0], vertices[:, 1], color=my.CMAP[y], alpha=0.9, linewidth=my.LINEWIDTH[y], linestyle=my.LINESTYLE[y])
    plt.savefig('data/' + my.DATA_FOLDER + 'predict_rivalry/' + folder + file_name + '/' + '_predicted' + '.pdf')
def make_grid():
    path = 'data/' + my.DATA_FOLDER + 'artificial/'
    if not os.path.exists(path):
        os.makedirs(path)
    lat1 = (my.LAT_RANGE[0] / my.DELTA_METERS) * my.LAT_DELTA
    lat2 = (my.LAT_RANGE[1] / my.DELTA_METERS) * my.LAT_DELTA
    lng1 = (my.LNG_RANGE[0] / my.DELTA_METERS) * my.LNG_DELTA
    lng2 = (my.LNG_RANGE[1] / my.DELTA_METERS) * my.LNG_DELTA
    print lng1, lng2, lat1, lat2
    X = np.arange(lng1, lng2, my.LNG_DELTA)
    Y = np.arange(lat1, lat2, my.LAT_DELTA)
    with open('data/' + my.DATA_FOLDER + 'user_disp_param.json', 'rb') as fp:
        disp_param = anyjson.loads(fp.read())
    amp = disp_param['amp']
    index = disp_param['index']
    # guard x == 0: a negative index would make 0**index blow up
    powerlaw = lambda x: (amp * (x ** index)) if x != 0 else 0
    delta = []
    theta = []
    prob = []
    points = []  # stored as (y, x), i.e. (lat, lng)
    for y in Y:
        this_delta = []
        this_theta = []
        this_prob = []
        for x in X:
            dist = int(round(geo.distance(geo.xyz(0, 0), geo.xyz(y, x))))
            this_delta.append(powerlaw(dist))
            this_theta.append(1.0 / 360)
            this_prob.append(powerlaw(dist) * 1.0 / 360)
            points.append((y, x, powerlaw(dist) * 1.0 / 360))
        delta.append(this_delta)
        theta.append(this_theta)
        prob.append(this_prob)
    all_prob = list(itertools.chain(*prob))
    print min(all_prob), max(all_prob)
    mn = min(tuple(i for i in all_prob if i != 0))
    print sum(int(round(i / mn)) for i in all_prob)
    # replicate each grid point in proportion to its probability
    points_ = []
    for p in points:
        for i in range(int(round(p[2] / mn))):
            points_.append((round(p[0], 4), round(p[1], 4)))
    print len(points), len(points_)
    with open(path + 'artificial_points.json', 'wb') as fp:
        fp.write(anyjson.dumps(points_))
    with open(path + 'grid_delta.json', 'wb') as fp:
        fp.write(anyjson.dumps(delta))
    with open(path + 'grid_theta.json', 'wb') as fp:
        fp.write(anyjson.dumps(theta))
    with open(path + 'grid_prob.json', 'wb') as fp:
        fp.write(anyjson.dumps(prob))
    fig = plt.figure(figsize=(10, 10))
    fig.set_tight_layout(True)
    ax = fig.add_subplot(111)
    ax.set_xlim(lng1, lng2)
    ax.set_ylim(lat1, lat2)
    ax.grid()
    ax.set_title('Single User Sample Space')
    plt.savefig(path + 'artificial_grid' + '.png')
def calcTweetDistances():
    print 'Calculating tweeting distances...'
    _, hbk_poly = load.loadLocPoly()
    hbk_all_tweets = load.loadAllTweets()
    hbk_user_home_loc = load.loadAllHomeLoc(hbk_poly)
    print [i[0] for i in hbk_user_home_loc]
    hbk_home_list = {}
    for user_home in hbk_user_home_loc:
        hbk_home_list[user_home[0]] = [user_home[1], user_home[2]]
    with open('data/' + my.DATA_FOLDER + my.HBK_TWEET_DIST_FILE, 'wb') as fp:
        csv_writer = csv.writer(fp, delimiter=',')
        for tweet in hbk_all_tweets:
            user_id = tweet[0]
            dist = int(round(geo.distance(geo.xyz(tweet[1], tweet[2]),
                                          geo.xyz(hbk_home_list[user_id][0], hbk_home_list[user_id][1]))))
            csv_writer.writerow([user_id, dist])
    print 'Done calculating tweeting distances...'


# Apply normalizing vector to visit matrix
def apply_non_home_norm(visit_mat, norm):
    print 'Applying Non-Home tweet count normalization to visit matrix...'
    for from_id in my.HBK_GANG_ID_LIST:
        for to_id in my.HBK_GANG_ID_LIST:
            if norm[to_id] != 0:
                visit_mat[from_id][to_id] /= norm[to_id]
            else:
                visit_mat[from_id][to_id] = 0
    return visit_mat
def _find_daily_disp(user_id):
    '''Find daily max displacements for user_id and generate a scatter plot'''
    # Displacement csv
    SQL = 'SELECT ST_X(geo), ST_Y(geo) \
        FROM {rel_home} \
        WHERE user_id = %s '.format(rel_home=my.REL_HOME)
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    cur.execute(SQL, (user_id,))
    records = cur.fetchall()
    if len(records) > 0:
        home = records[0]
        user_disp = {}
        x, y = [], []
        with open('data/' + my.DATA_FOLDER + 'city_bound_pol.txt', 'rb') as fpr:
            bound_pol = fpr.read().strip()
        SQL = 'SELECT ST_X(geo), ST_Y(geo), (timestamp AT TIME ZONE \'{timezone}\')::date \
            FROM {rel_tweet} \
            WHERE user_id = %s \
            AND geo && ST_GeomFromGeoJSON(%s) '.format(rel_tweet=my.REL_TWEET, timezone=my.TIMEZONE) \
            + my.QUERY_CONSTRAINT \
            + 'ORDER BY timestamp'
        cur.execute(SQL, (user_id, bound_pol))
        records = cur.fetchall()
        con.close()
        for rec in records:
            lat, lng, ds = rec
            x.append(lng - home[1])
            y.append(lat - home[0])
            if ds not in user_disp:
                user_disp[ds] = 0
            try:
                dist = int(round(geo.distance(geo.xyz(home[0], home[1]), geo.xyz(lat, lng))))
            except Exception:
                dist = 0
            if dist > user_disp[ds]:
                user_disp[ds] = dist  # keep the day's maximum displacement
        if not os.path.exists('data/' + my.DATA_FOLDER + 'displacement/' + 'user_disp/'):
            os.makedirs('data/' + my.DATA_FOLDER + 'displacement/' + 'user_disp/')
        with open('data/' + my.DATA_FOLDER + 'displacement/' + 'user_disp/' + str(user_id) + '.csv', 'wb') as fpw:
            cw = csv.writer(fpw, delimiter=',')
            for ds in user_disp:
                cw.writerow([user_id, user_disp[ds], ds])
        # Displacement plot
        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(111)
        plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
        ax.set_autoscaley_on(False)
        ax.set_ylim([-0.5, 0.5])
        ax.set_xlim([-0.5, 0.5])
        ax.set_yticks([0.0])
        ax.set_xticks([0.0])
        ax.set_yticklabels([])
        ax.set_xticklabels([])
        ax.grid(True)
        ax.plot(x, y, 'b+')
        ax.plot([0], [0], 'r^')
        ax.text(-0.45, -0.45, str(user_id), fontsize=10)
        if not os.path.exists('data/' + my.DATA_FOLDER + 'displacement/' + 'plot_disp/'):
            os.makedirs('data/' + my.DATA_FOLDER + 'displacement/' + 'plot_disp/')
        plt.savefig('data/' + my.DATA_FOLDER + 'displacement/' + 'plot_disp/' + str(user_id) + '.png')
    else:
        con.close()
        print 'Missed 1 user_id!'
def calc_featAndPlot(folder='visits', file_name='visit_mat'):
    # Calculate features for each neighborhood and plot rankings
    hood_ids = _load_nhoodIDs()
    hood_info = _load_hoodInfo()
    visit_mat = _load_visitMat(folder, file_name)
    visit_mat_frac = _calc_visitMatFrac(visit_mat)
    inn = dict([(to_id, len([1 for from_id in hood_ids
                             if from_id != to_id and visit_mat_frac[from_id][to_id] != 0]))
                for to_id in hood_ids])
    outn = dict([(from_id, len([1 for to_id in hood_ids
                                if to_id != from_id and visit_mat_frac[from_id][to_id] != 0]))
                 for from_id in hood_ids])
    inn = [i for i in inn if inn[i] > my.MIN_LINKS_FRAC * max(inn.values())]
    outn = [i for i in outn if outn[i] > my.MIN_LINKS_FRAC * max(outn.values())]
    print hood_ids
    print inn
    print outn
    # Calculate each feature
    OUTFLOW_INFLOW = dict([(h_id, _calc_inflowVsOutflow(h_id, visit_mat_frac)) for h_id in hood_ids])
    IN_DENSITY = dict([(h_id, _calc_inDensity(h_id, visit_mat_frac)) for h_id in hood_ids])
    OUT_DENSITY = dict([(h_id, _calc_outDensity(h_id, visit_mat_frac)) for h_id in hood_ids])
    POPULARITY = dict([(h_id, _calc_Popularity(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_OUT = dict([(h_id, _calc_EntropyOut(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_OUT_BYN = dict([(h_id, _calc_EntropyOut_byN(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_OUT_ALL = dict([(h_id, _calc_EntropyOutAll(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_IN = dict([(h_id, _calc_EntropyIn(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_IN_BYN = dict([(h_id, _calc_EntropyIn_byN(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_IN_ALL = dict([(h_id, _calc_EntropyInAll(h_id, visit_mat_frac)) for h_id in hood_ids])
    KL_DIVERGENCE = dict([(h_id, _calc_KLDivergence(h_id, visit_mat_frac)) for h_id in hood_ids])
    ENTROPY_OUT = _trim_ids(ENTROPY_OUT, outn)
    ENTROPY_IN = _trim_ids(ENTROPY_IN, inn)
    # Initialize features for plot
    features = {'OUTFLOW_INFLOW': OUTFLOW_INFLOW,
                'IN_DENSITY': IN_DENSITY,
                'OUT_DENSITY': OUT_DENSITY,
                'POPULARITY': POPULARITY,
                'ENTROPY_OUT': ENTROPY_OUT,
                'ENTROPY_OUT_(/N)': ENTROPY_OUT_BYN,
                'ENTROPY_OUT_ALL': ENTROPY_OUT_ALL,
                'ENTROPY_IN': ENTROPY_IN,
                'ENTROPY_IN_(/N)': ENTROPY_IN_BYN,
                'ENTROPY_IN_ALL': ENTROPY_IN_ALL,
                'KL_DIVERGENCE': KL_DIVERGENCE}
    #with open('data/' + my.DATA_FOLDER + 'features_' + folder + '.pickle', 'wb') as fp1:
    #    pickle.dump(features, fp1)
    #feature_names = ['OUTFLOW_INFLOW', 'IN_DENSITY', 'OUT_DENSITY', 'POPULARITY', 'ENTROPY_OUT', 'ENTROPY_OUT_ALL', 'ENTROPY_IN', 'ENTROPY_IN_ALL']
    # Six features mapped below (3 x 2 subplot grid); this list was commented
    # out but is needed by the loops that follow
    feature_names = ['OUTFLOW_INFLOW', 'POPULARITY', 'ENTROPY_OUT', 'ENTROPY_OUT_(/N)', 'ENTROPY_IN', 'ENTROPY_IN_(/N)']
    colors = ["#4DAF4A", "#3B3B3B", "#984EA3", "#E41A1C", "#A65628", "#FA71AF", "#FF7F00", "#377EB8"]
    # Plot all feature ranks (disabled)
    '''width = 6
    ind = np.arange(len(hood_ids)) * 10
    count = 0
    fig = plt.figure(figsize=(len(features)*2.5, len(hood_ids)*0.75))
    plt.subplots_adjust(left=0.02, right=0.96, top=0.88, bottom=0.02)
    for name in feature_names:
        x = [hood_info[h_id]['name'] for h_id in sorted(features[name], key=features[name].get)]
        y = [features[name][h_id] for h_id in sorted(features[name], key=features[name].get)]
        count += 2
        color = colors.pop()
        ax = fig.add_subplot(1, len(features)*2, count)
        ax.set_yticks(ind+(width/2))
        plt.setp(ax, xticklabels=[])
        plt.setp(ax, yticklabels=_conv_SplitLabels(x))
        ax.barh(ind, y, width, color=color, alpha=0.75, edgecolor=color)
        ax.set_title(name + '\n\n')
    fig.suptitle('Neighborhood ranks: ' + folder.upper() + ' (' + my.DATA_FOLDER[:-1].upper() + ')', fontsize=18)
    plt.savefig('data/' + my.DATA_FOLDER + folder + '/' + 'hood_ranks__' + my.DATA_FOLDER[:-1] + '.png')
    '''
    # Plot map: polygons init
    pols = []
    pol_seq = []
    for h_id in hood_info:
        pol = hood_info[h_id]['polygon'][:-1]
        pol = [[ll[1], ll[0]] for ll in pol]
        pols.append(pol)
        pol_seq.append(h_id)
    lngs = [ll[0] for pol in pols for ll in pol]
    lats = [ll[1] for pol in pols for ll in pol]
    ## MIGHT NEED TO SWAP x_dist and y_dist
    y_dist = geo.distance(geo.xyz(max(lats), max(lngs)), geo.xyz(max(lats), min(lngs)))
    x_dist = geo.distance(geo.xyz(max(lats), max(lngs)), geo.xyz(min(lats), max(lngs)))
    # Plot map: each feature
    fig = plt.figure(figsize=(2 * 6, 3 * y_dist * 6 / x_dist))
    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
    count = 0
    for name in feature_names:
        count += 1
        heat = np.array([features[name][h_id] for h_id in pol_seq])
        ax = fig.add_subplot(3, 2, count, aspect='equal')
        # see mpl.cm.datad for the list of colormaps
        coll = PolyCollection(pols, array=heat, cmap=mpl.cm.OrRd, edgecolors='k', alpha=0.75)
        ax.add_collection(coll)
        ax.autoscale_view()
        ax.get_xaxis().set_ticklabels([])
        ax.get_yaxis().set_ticklabels([])
        fig.colorbar(coll, ax=ax)
        ax.set_title(name)
    fig.suptitle('Neighborhood ranks: ' + folder.upper() + ' (' + my.DATA_FOLDER[:-1].upper() + ')', fontsize=18)
    if not os.path.exists('data/' + my.DATA_FOLDER + 'nhood_rank/'):
        os.makedirs('data/' + my.DATA_FOLDER + 'nhood_rank/')
    plt.savefig('data/' + my.DATA_FOLDER + 'nhood_rank/' + file_name + '.png')
    # Plot separate plots WEST LA (truncated)
    '''feature_names = ['ENTROPY_OUT', 'ENTROPY_IN']