def pickleTrainingCorpus(filename):
    """Build per-token location statistics from the TRAIN corpus and pickle them.

    Every tweet's (lon, lat) is converted to Cartesian coordinates; each token
    co-occurrence accumulates the coordinates of the tweets it appears in.
    For tokens seen more than COUNT_THRESHOLD times we store:

        token_to_data[token] = ((lon, lat), variance, count)

    where (lon, lat) is the component-wise median position converted back to
    geographic coordinates, and variance is the mean squared Euclidean
    distance of the token's points from their centroid.

    The mapping is pickled to `filename`; the list of (lon, lat) tuples of
    all tweets seen is returned.
    """
    token_to_data = {}
    COUNT_THRESHOLD = 0

    # Make connection to the training corpus.
    database = MySQLConnection.MySQLConnectionWrapper(
        basedir=os.getcwd() + "/", corpus="TRAIN")

    # Iterate over all tweets and split the tokenised texts.
    # Each token maps to a list of Cartesian coordinate tuples.
    token_distribution_cart = {}
    tweet_coordinates = []
    for tokens, lat, lon in database.getRows("`tokenised_low`, `lat`, `long`"):
        tweet_coordinates.append((lon, lat))
        cartesian = EvaluationFunctions.convertLatLongToCartesian(lon, lat)
        for token in EvaluationFunctions.getCoOccurrences(tokens.split()):
            token_distribution_cart.setdefault(token, []).append(cartesian)

    for token, coordinates in token_distribution_cart.items():
        count = len(coordinates)
        if count > COUNT_THRESHOLD:
            # Convert the coordinate list to a (count, 3) numpy array.
            np_list = np.asarray(coordinates, dtype=float)
            # Sum of the per-axis population variances equals the mean
            # squared distance of the points from their centroid — exactly
            # what the original explicit loop computed.
            variance = float(np.var(np_list, axis=0).sum())
            # Use the component-wise median as the representative position
            # (named med_* here; the original confusingly reused mean_*).
            med_x, med_y, med_z = np.median(np_list, axis=0)
            token_to_data[token] = (
                EvaluationFunctions.convertCartesianToLatLong(
                    med_x, med_y, med_z),
                variance, count)

    # 'with' guarantees the file is flushed and closed; the original leaked
    # the handle returned by open().
    with open(filename, 'wb') as out_file:
        pickle.dump(token_to_data, out_file)
    return tweet_coordinates
def pickleTrainingCorpus(filename):
    """Collect per-token location statistics from the TRAIN corpus.

    token_to_data maps a token to a tuple of its coordinates, variance and
    its count: ((lon, lat), variance, count).  The (lon, lat) stored is the
    per-axis median of the token's Cartesian point cloud converted back to
    geographic coordinates; variance is the mean squared distance of the
    points from their mean.

    The resulting dict is pickled to `filename`.  Returns the (lon, lat)
    tuples of all tweets read, for later use by the caller.
    """
    token_to_data = {}
    COUNT_THRESHOLD = 0

    # Make connection.
    database = MySQLConnection.MySQLConnectionWrapper(
        basedir=os.getcwd() + "/", corpus="TRAIN")

    # Each token maps to the list of Cartesian coordinates of the tweets
    # in which it occurs.
    token_distribution_cart = {}
    tweet_coordinates = []
    for tokens, lat, lon in database.getRows("`tokenised_low`, `lat`, `long`"):
        tweet_coordinates.append((lon, lat))
        cartesian = EvaluationFunctions.convertLatLongToCartesian(lon, lat)
        for token in EvaluationFunctions.getCoOccurrences(tokens.split()):
            token_distribution_cart.setdefault(token, []).append(cartesian)

    for token, coordinates_of_tuple in token_distribution_cart.items():
        count = len(coordinates_of_tuple)
        if count > COUNT_THRESHOLD:
            np_list = np.asarray(coordinates_of_tuple, dtype=float)
            centroid = np.mean(np_list, axis=0)
            # Vectorised replacement of the original Python loop: mean
            # squared Euclidean distance from the centroid.
            variance = float(np.sum((np_list - centroid) ** 2) / count)
            # Median position, converted back to lon/lat for storage.
            med_x, med_y, med_z = tuple(np.median(np_list, axis=0))
            token_to_data[token] = (
                EvaluationFunctions.convertCartesianToLatLong(
                    med_x, med_y, med_z),
                variance, count)

    # Close the pickle file deterministically (the original left the
    # open() handle dangling).
    with open(filename, 'wb') as handle:
        pickle.dump(token_to_data, handle)
    return tweet_coordinates
def evaluateTweet(self, tokens, location, user):
    """Estimate a tweet's location from its tokens' stored statistics.

    For each token co-occurrence, looks up median position, variance and
    count in self.token_data; tokens passing the variance threshold are
    collected and fed to self.evaluator to compute a weighted midpoint.

    :param tokens:   tokenised tweet text
    :param location: (lon, lat) gold location of the tweet
    :param user:     tweet author (only used by the commented-out fallback)
    :return: (lon_score, lat_score, gold_lon, gold_lat, distance) or None
             when no token yields usable data.  distance is presumably in
             km (the drawing code labels it 'km') — TODO confirm.

    NOTE(review): indentation reconstructed from a whitespace-mangled
    original; statement order preserved exactly.
    """
    token_data_here = []
    valid = 0
    if self.draw:
        basemap = MapFunctions.prepareMap()
    # Vertical start position for per-token debug labels on the map.
    text_pos = 1890000

    # Look up the data for each token in the tweet
    for token in EvaluationFunctions.getCoOccurrences(tokens):
        token_id = self.signature.add(token)
        if token_id not in self.token_data:
            # Debug label for unknown tokens; deliberately disabled
            # (original condition was self.draw).
            if False: #self.draw:
                plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | (fail)', color='grey', fontsize=6)
                text_pos -= 42000
            continue
        data = self.token_data[token_id]
        variance = data['variance']
        count = data['count']
        x, y, z = data["median"]
        lon, lat = EvaluationFunctions.convertCartesianToLatLong(x, y, z)
        if self.checkVarianceThreshold(token_id):
            valid += 1
            # 0-hypothese: replace the token's data by a randomly drawn one.
            # NOTE(review): if randint here is random.randint (inclusive
            # upper bound) this can raise IndexError — confirm import.
            # NOTE(review): this unpacks a 3-tuple while the code above
            # reads dict keys from self.token_data values — verify shape.
            if self.null:
                token = self.token_data.keys()[randint( 0, len(self.token_data.keys()))]
                coordinates, variance, count = self.token_data[token]
            if self.draw:
                #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count), color='black', fontsize=6)
                text_pos -= 42000
                current_color = EvaluationFunctions.getColorForValue( variance)
                # Accepted token: coloured marker, size scaled by count.
                basemap.plot( lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.7)
            token_data_here.append((token, variance, count, data["median"], data["variances"]))
        else:
            if self.draw:
                #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count),color='grey', fontsize=6)
                text_pos -= 40000
                current_color = 'gray'
                # Rejected token: faint grey marker for reference.
                basemap.plot( lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.1)

    if valid == 0:
        # use fallback
        #if user in self.fallback:
        #    token_data_here = self.fallback[user]
        #else:
        #    print user , " not in " , self.fallback.keys()
        if len(token_data_here) == 0:
            # NOTE(review): plt.clf() is called even when self.draw is
            # False — harmless but possibly unintended.
            plt.clf()
            return None
        #else:
        #    print "!"

    # Generate the data for the weighted midpoint
    coordinate_list, weight_list = self.evaluator.evaluate(token_data_here)
    # Calculate the midpoint
    lon_score, lat_score = EvaluationFunctions.getWeightedMidpointXYZ( coordinate_list, weight_list)
    distance = EvaluationFunctions.getDistance(lon_score, lat_score, location[0], location[1])
    #print " ".join(tokens)
    #print distance
    #print valid
    #print ""
    if self.draw:
        # Gold location (^) vs. estimate (v), then save and reset the figure.
        basemap.plot(location[0], location[1], '^', mfc='none', markeredgecolor='black', latlon=True, alpha=1)
        basemap.plot(lon_score, lat_score, 'v', mfc='none', markeredgecolor='black', latlon=True, alpha=1)
        plt.text(10000, 10000, 'Distance: ' + str(round(distance, 1)) + 'km')
        plt.text(10000, 80000, 'Threshold: ' + str(self.variance_threshold))
        plt.savefig('img/tweet_' + str(self.variance_threshold) + "_" + str(self.i) + ".png", format='png')
        plt.clf()
    return (lon_score, lat_score, location[0], location[1], distance)
import cPickle as pickle from Wrapper import MySQLConnection import os from Evaluation import EvaluationFunctions """ Print the most regional tokens for a given cluster. Usage: python PrintRegionalTokens.py Signature Lon Lat range """ if len(sys.argv) < 4: print "1. TokenData, 2. ClusterData, 3. Cluster to analyse" sys.exit(1) signature = pickle.load(open(sys.argv[1], 'rb')) lon = sys.argv[2] lat = sys.argv[2] rang = 50 #km token_to_data = {} token_db = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TOKENDATA") for tid, count, medx, medy, medz, varx, vary, varz in token_db.getTokenInfo(ids=None, columns="`id`, `count`, `median_x`, `median_y`, `median_z`, `variance_x`, `variance_y`, `variance_z`"): lon_, lat_ = EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz) distance = EvaluationFunctions.getDistance(lon, lat, lon_, lat_) if distance < rang and count > 20: print signature.get(tid), ",", (varx,vary,varz), ",", count
# NOTE(review): this chunk starts mid-function — the statements below are
# the tail of a threshold-lookup helper (getThreshold?) whose header is not
# visible in this chunk; indentation is reconstructed.
    pos -= 1
    x = variances_x[pos]
    y = variances_y[pos]
    z = variances_z[pos]
    return (x,y,z)

# Python 2 tuple-parameter syntax: the single tuple argument is unpacked
# into x, y, z.  NOTE(review): reads the module-level VARIANCE_THRESHOLD
# reassigned in the loop below on every iteration.
def checkVarianceThreshold((x,y,z)):
    (tx,ty,tz) = VARIANCE_THRESHOLD
    return x < tx and y < ty and z < tz

""" EVALUATE """
# Sort by variance in the token data
# Sweep the variance-threshold quantile l over (0, 1] and dump, for each
# step, the lon/lat of every token passing both thresholds.
for i in range (1,100):
    # NOTE(review): this does not affect the loop variable's iteration in
    # Python; it only shifts l to the range 0.02..1.00.
    i += 1
    l = i / 100.0
    COUNT_THRESHOLD = 10
    VARIANCE_THRESHOLD = getThreshold(l)
    # Collect data
    # NOTE(review): token_to_data values here are flat 5-tuples
    # (medx, medy, medz, variances, count); `vars` shadows the builtin.
    # NOTE(review): coordinates_to_draw is not (re)initialised in this
    # chunk, so results accumulate across threshold steps — confirm intent.
    for tid, (medx, medy, medz, vars, count) in token_to_data.iteritems():
        if count > COUNT_THRESHOLD and checkVarianceThreshold(vars):
            coordinates_to_draw.append(EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz))
    # NOTE(review): the open() handle is never closed — consider `with`.
    pickle.dump(coordinates_to_draw, open(sys.argv[1] + "_" + str(l) + ".pickle", 'wb'))
#
#
# Draw coordinates to the map:
# for lon, lat in coordinates_to_draw:
#     basemap.plot(lon, lat, '.r', markeredgecolor='r', markersize=1,latlon=True)
#
# plt.savefig(sys.argv[1], format='png', bbox_inches='tight', dpi=900)
def evaluateTweet(self, tokens, location, user):
    """Locate a single tweet from per-token statistics.

    Collects (token, variance, count, median, variances) for every token
    whose id passes self.checkVarianceThreshold, then asks self.evaluator
    for coordinates/weights and computes a weighted midpoint.

    :param tokens:   tokenised tweet text
    :param location: (lon, lat) gold location
    :param user:     author id (only referenced by commented-out fallback)
    :return: (lon_score, lat_score, gold_lon, gold_lat, distance), or None
             if no usable token data was found.  The distance unit appears
             to be km (labelled 'km' when drawing) — TODO confirm.

    NOTE(review): formatting reconstructed from a collapsed original;
    statement order is unchanged.
    """
    token_data_here = []
    valid = 0
    if self.draw:
        basemap = MapFunctions.prepareMap()
    # y-offset for per-token debug labels when drawing.
    text_pos = 1890000

    # Look up the data for each token in the tweet
    for token in EvaluationFunctions.getCoOccurrences(tokens):
        token_id = self.signature.add(token)
        if token_id not in self.token_data:
            # Unknown token: optional (currently disabled) debug label.
            if False: #self.draw:
                plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | (fail)', color='grey', fontsize=6)
                text_pos -= 42000
            continue
        data = self.token_data[token_id]
        variance = data['variance']
        count = data['count']
        x,y,z = data["median"]
        lon, lat = EvaluationFunctions.convertCartesianToLatLong(x,y,z)
        if self.checkVarianceThreshold(token_id):
            valid += 1
            # 0-hypothese: swap in data of a random token.
            # NOTE(review): possible off-by-one if randint is
            # random.randint (inclusive bounds); also unpacks a 3-tuple
            # whereas entries above are accessed as dicts — verify.
            if self.null:
                token = self.token_data.keys()[randint(0,len(self.token_data.keys()))]
                coordinates, variance, count = self.token_data[token]
            if self.draw:
                #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count), color='black', fontsize=6)
                text_pos -= 42000
                current_color = EvaluationFunctions.getColorForValue(variance)
                # Accepted token: colour by variance, size by count.
                basemap.plot(lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.7)
            token_data_here.append((token, variance, count, data["median"], data["variances"]))
        else:
            if self.draw:
                #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count),color='grey', fontsize=6)
                text_pos -= 40000
                current_color = 'gray'
                # Rejected token: faint grey marker.
                basemap.plot(lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.1)

    if valid == 0:
        # use fallback
        #if user in self.fallback:
        #    token_data_here = self.fallback[user]
        #else:
        #    print user , " not in " , self.fallback.keys()
        if len(token_data_here) == 0:
            # NOTE(review): clears the figure even when self.draw is False.
            plt.clf()
            return None
        #else:
        #    print "!"

    # Generate the data for the weighted midpoint
    coordinate_list, weight_list = self.evaluator.evaluate(token_data_here)
    # Calculate the midpoint
    lon_score, lat_score = EvaluationFunctions.getWeightedMidpointXYZ(coordinate_list, weight_list)
    distance = EvaluationFunctions.getDistance(lon_score, lat_score, location[0], location[1])
    #print " ".join(tokens)
    #print distance
    #print valid
    #print ""
    if self.draw:
        # Gold (^) and estimated (v) positions, annotate, save, reset.
        basemap.plot(location[0], location[1], '^', mfc='none' , markeredgecolor='black', latlon=True, alpha=1)
        basemap.plot(lon_score, lat_score, 'v', mfc='none', markeredgecolor='black', latlon=True, alpha=1)
        plt.text(10000,10000,'Distance: '+ str(round(distance,1)) + 'km')
        plt.text(10000,80000, 'Threshold: ' + str(self.variance_threshold))
        plt.savefig('img/tweet_' + str(self.variance_threshold) + "_" + str(self.i) + ".png", format='png')
        plt.clf()
    return (lon_score, lat_score, location[0], location[1], distance)