def evaluateCorpus(self, printmsg=False):
    distances = []
    valids = 0
    invalids = 0
    distance_matches = 0
    distance_mismatches = 0
    cluster_matches = 0
    cluster_mismatches = 0

    # Confusion matrix of real vs. calculated cluster; the first column
    # holds the row's cluster id.
    n = len(self.clusters)
    real_to_calc_matches = [[0 for x in range(n + 1)] for x in range(n)]
    for i in range(n):
        real_to_calc_matches[i][0] = i

    for self.i in range(0, self.n):
        values = self.evaluateTweet(self.tweets[self.i], self.location[self.i], self.users[self.i])
        if values is None:
            invalids += 1
        else:
            lon_calculated, lat_calculated, lon_real, lat_real, distance = values
            distances.append(distance)
            if EvaluationFunctions.evaluateDistance(distance, self.distance_threshold):
                distance_matches += 1
            else:
                distance_mismatches += 1
            if EvaluationFunctions.evaluateCluster(lon_calculated, lat_calculated, lon_real, lat_real, self.clusters, real_to_calc_matches):
                cluster_matches += 1
            else:
                cluster_mismatches += 1
            valids += 1

    distances_np = np.asarray(distances, dtype=float)
    if printmsg:
        print 'valid: ', valids, 'invalid: ', invalids
        print 'distance_matches: ', distance_matches, 'distance_mismatches: ', distance_mismatches
        if distance_matches + distance_mismatches > 0:
            print 'distance_ratio: ', str(float(distance_matches) / (distance_matches + distance_mismatches))
        print 'cluster_matches: ', cluster_matches, 'cluster_mismatches: ', cluster_mismatches
        if cluster_matches + cluster_mismatches > 0:
            print 'cluster_ratio: ', str(float(cluster_matches) / (cluster_matches + cluster_mismatches))
        #print tabulate(real_to_calc_matches, tablefmt="latex", headers=range(n))
        #print tabulate(EvaluationFunctions.transformStatistice(real_to_calc_matches), tablefmt="latex", headers=range(n))

    # Share of tweets for which a location could be calculated at all.
    valid_ratio = valids / float(valids + invalids)
    if valids > 0:
        return (np.mean(distances_np), np.median(distances_np), valid_ratio)
    else:
        return (float('inf'), float('inf'), valid_ratio)
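# A minimal driver sketch (hypothetical: the class name CorpusEvaluator and
# the threshold values below are assumptions, not part of this module). It
# shows the order in which the attributes used above need to be populated.
def _example_evaluate(signature, clusters, evaluator):
    corpus = CorpusEvaluator(signature=signature, clusters=clusters, corpus='DEV')
    corpus.evaluator = evaluator       # weighting strategy for the tokens of a tweet
    corpus.variance_threshold = 5.0    # assumed cut-off used by checkVarianceThreshold
    corpus.distance_threshold = 100.0  # km, compared by EvaluationFunctions.evaluateDistance
    corpus.createFallback()
    mean_km, median_km, valid_ratio = corpus.evaluateCorpus(printmsg=True)
    return mean_km, median_km, valid_ratio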
def createFallback(self):
    # Create fall-back tokens for all users: for each user, keep the (at
    # most five) most frequently used tokens that pass the variance threshold.
    for user, tweets in self.user_to_tweets.iteritems():
        tid_to_count = {}
        for tweet in tweets:
            for token in EvaluationFunctions.getCoOccurrences(tweet):
                tid = self.signature.add(token)
                if tid in self.token_data:
                    if self.checkVarianceThreshold(tid):
                        tid_to_count.setdefault(tid, 0)
                        tid_to_count[tid] += 1

        amount = min(5, len(tid_to_count))
        token_data = []
        for tid, count in sorted(tid_to_count.iteritems(), key=itemgetter(1), reverse=True)[:amount]:
            data = self.token_data[tid]
            variance = data['variance']
            count = data['count']  # note: replaces the per-user count with the token's global count
            token_data.append((self.signature.get(tid), variance, count, data["median"], data["variances"]))
        self.fallback[user] = token_data
def pickleTrainingCorpus(filename):
    token_to_data = {}    #< maps a token to ((lon, lat), variance, count)
    COUNT_THRESHOLD = 0

    # Make connection
    database = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TRAIN")

    # Iterate over all tweets and split the tokenised texts.
    # Each token maps to a list of Cartesian (x, y, z) coordinates.
    token_distribution_cart = {}
    tweet_coordinates = []
    for tokens, lat, lon in database.getRows("`tokenised_low`, `lat`, `long`"):
        tweet_coordinates.append((lon, lat))
        cartesian = EvaluationFunctions.convertLatLongToCartesian(lon, lat)
        for token in EvaluationFunctions.getCoOccurrences(tokens.split()):
            token_distribution_cart.setdefault(token, []).append(cartesian)

    for token, coordinates_of_tuple in token_distribution_cart.iteritems():
        count = len(coordinates_of_tuple)
        if count > COUNT_THRESHOLD:
            # Convert the coordinate list to a numpy array
            np_list = np.asarray(coordinates_of_tuple, dtype=float)
            # Calculate the mean values
            (mean_x, mean_y, mean_z) = tuple(np.mean(np_list, axis=0))

            # Calculate the variance around the mean
            variance_num = 0
            for (x, y, z) in coordinates_of_tuple:
                variance_num += (x - mean_x) ** 2 + (y - mean_y) ** 2 + (z - mean_z) ** 2
            variance = variance_num / count

            # Calculate the median and store it as the token's location
            (median_x, median_y, median_z) = tuple(np.median(np_list, axis=0))
            token_to_data[token] = (EvaluationFunctions.convertCartesianToLatLong(median_x, median_y, median_z), variance, count)

    with open(filename, 'wb') as f:
        pickle.dump(token_to_data, f)
    return tweet_coordinates
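# Quick sanity check (sketch): reload the pickled token statistics and print
# one entry. The value layout follows the comment in pickleTrainingCorpus:
# ((lon, lat), variance, count). The token passed in is only an example.
def _inspect_token_data(filename, token):
    with open(filename, 'rb') as f:
        token_to_data = pickle.load(f)
    (lon, lat), variance, count = token_to_data[token]
    print token, '-> median (lon, lat):', (lon, lat), 'variance:', variance, 'count:', count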
def __init__(self, signature=None, clusters=None, corpus='DEV'):
    self.tweets = []        # list of tokenised tweets
    self.location = []      # list of (lon, lat) tuples
    self.n = 0              # the size of the corpus
    self.variance_threshold = 0
    self.distance_threshold = 0
    self.draw = False       # toggle whether each tweet should be saved to a PNG file
    self.evaluator = None   # creates the weights for the tokens in a tweet
    self.null = False       # test the null hypothesis
    self.signature = signature
    self.clusters = clusters  # list of centroid coordinates
    self.users = []
    self.fallback = {}

    # Load corpus from database:
    database = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus=corpus)
    self.user_to_tweets = {}
    for tokens, lat, lon, user in database.getRows("`tokenised_low`, `lat`, `long`, `user_id`"):
        self.tweets.append(tokens.split())
        self.users.append(user)
        self.user_to_tweets.setdefault(user, []).append(tokens.split())
        self.location.append((lon, lat))
    self.n = len(self.tweets)
    assert len(self.tweets) == len(self.location)

    # Look up token data
    self.token_data = {}

    # Collect the ids of all tokens occurring in the corpus
    ids = []
    for tweet in self.tweets:
        for token in EvaluationFunctions.getCoOccurrences(tweet):
            ids.append(self.signature.add(token))
    ids = set(ids)

    # Fetch the statistics for these tokens from the database
    token_db = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TOKENDATA")
    for token_id, medianx, mediany, medianz, variance, variancex, variancey, variancez, count \
            in token_db.getTokenInfo(ids, columns="`id`, `median_x`, `median_y`, `median_z`, `variance`, `variance_x`, `variance_y`, `variance_z`, `count`"):
        self.token_data[token_id] = {
            "median": (medianx, mediany, medianz),
            "variance": variance,
            "count": count,
            "variances": (variancex, variancey, variancez)}
cluster = int(sys.argv[3])

# Filter by count
token_to_data_filtered = {}
for token, items in token_to_data.iteritems():
    if items[2] >= COUNT_THRESHOLD:
        token_to_data_filtered[token] = items

# Restrict to tokens whose median lies in the requested cluster;
# cluster == -1 keeps the words common to all of DE, AT and CH.
if cluster != -1:
    clusters = pickle.load(open(sys.argv[2], 'rb'))
    token_to_data_filtered_cluster = {}
    for token, items in token_to_data_filtered.iteritems():
        lon, lat = items[0]
        c = EvaluationFunctions.getCluster(lon, lat, clusters)
        if c == cluster:
            token_to_data_filtered_cluster[token] = items
    token_to_data_filtered = token_to_data_filtered_cluster

# Sort by variance, highest first, and print each token with its count
sorted_by_variance = sorted(token_to_data_filtered.iteritems(), key=lambda x: x[1][1], reverse=True)
num_tokens = len(sorted_by_variance)
for token in sorted_by_variance:
    print token[0] + " (" + str(token[1][2]) + ")"
print num_tokens
def evaluateTweet(self, tokens, location, user):
    token_data_here = []
    valid = 0

    if self.draw:
        basemap = MapFunctions.prepareMap()

    text_pos = 1890000

    # Look up the data for each token in the tweet
    for token in EvaluationFunctions.getCoOccurrences(tokens):
        token_id = self.signature.add(token)
        if token_id not in self.token_data:
            continue

        data = self.token_data[token_id]
        variance = data['variance']
        count = data['count']
        x, y, z = data["median"]
        lon, lat = EvaluationFunctions.convertCartesianToLatLong(x, y, z)

        if self.checkVarianceThreshold(token_id):
            valid += 1
            # Null hypothesis: replace the token's statistics with those of
            # a randomly drawn token.
            if self.null:
                token_id = self.token_data.keys()[randint(0, len(self.token_data) - 1)]
                data = self.token_data[token_id]
                variance = data['variance']
                count = data['count']
            if self.draw:
                text_pos -= 42000
                current_color = EvaluationFunctions.getColorForValue(variance)
                basemap.plot(lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.7)
            token_data_here.append((token, variance, count, data["median"], data["variances"]))
        else:
            if self.draw:
                text_pos -= 40000
                current_color = 'gray'
                basemap.plot(lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.1)

    if valid == 0:
        # Fallback (currently disabled): back off to the user's most
        # frequent low-variance tokens.
        #if user in self.fallback:
        #    token_data_here = self.fallback[user]
        if len(token_data_here) == 0:
            plt.clf()
            return None

    # Generate the data for the weighted midpoint
    coordinate_list, weight_list = self.evaluator.evaluate(token_data_here)
    # Calculate the midpoint
    lon_score, lat_score = EvaluationFunctions.getWeightedMidpointXYZ(coordinate_list, weight_list)
    distance = EvaluationFunctions.getDistance(lon_score, lat_score, location[0], location[1])

    if self.draw:
        basemap.plot(location[0], location[1], '^', mfc='none', markeredgecolor='black', latlon=True, alpha=1)
        basemap.plot(lon_score, lat_score, 'v', mfc='none', markeredgecolor='black', latlon=True, alpha=1)
        plt.text(10000, 10000, 'Distance: ' + str(round(distance, 1)) + 'km')
        plt.text(10000, 80000, 'Threshold: ' + str(self.variance_threshold))
        plt.savefig('img/tweet_' + str(self.variance_threshold) + "_" + str(self.i) + ".png", format='png')
        plt.clf()

    return (lon_score, lat_score, location[0], location[1], distance)
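# Sketch of the weighted-midpoint computation used above (an assumption
# about EvaluationFunctions.getWeightedMidpointXYZ, not its actual source):
# average the Cartesian vectors with their weights, then project the mean
# back to lon/lat.
def _weighted_midpoint_sketch(coordinate_list, weight_list):
    coords = np.asarray(coordinate_list, dtype=float)   # shape (n, 3)
    weights = np.asarray(weight_list, dtype=float)      # one weight per token
    mean_x, mean_y, mean_z = np.average(coords, axis=0, weights=weights)
    return EvaluationFunctions.convertCartesianToLatLong(mean_x, mean_y, mean_z)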
import sys
import os
import cPickle as pickle
from Wrapper import MySQLConnection
from Evaluation import EvaluationFunctions

"""
Print the most regional tokens around a given coordinate.
Usage: python PrintRegionalTokens.py Signature Lon Lat
"""
if len(sys.argv) < 4:
    print "1. Signature, 2. Lon, 3. Lat"
    sys.exit(1)

signature = pickle.load(open(sys.argv[1], 'rb'))
lon = float(sys.argv[2])
lat = float(sys.argv[3])
rang = 50  # km

token_db = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TOKENDATA")
for tid, count, medx, medy, medz, varx, vary, varz in token_db.getTokenInfo(ids=None, columns="`id`, `count`, `median_x`, `median_y`, `median_z`, `variance_x`, `variance_y`, `variance_z`"):
    lon_, lat_ = EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz)
    distance = EvaluationFunctions.getDistance(lon, lat, lon_, lat_)
    if distance < rang and count > 20:
        print signature.get(tid), ",", (varx, vary, varz), ",", count
    pos -= 1
    x = variances_x[pos]
    y = variances_y[pos]
    z = variances_z[pos]
    return (x, y, z)

def checkVarianceThreshold((x, y, z)):
    (tx, ty, tz) = VARIANCE_THRESHOLD
    return x < tx and y < ty and z < tz

""" EVALUATE """
# Sweep the variance threshold over the quantiles 0.02 .. 1.00 and dump the
# coordinates of all tokens that survive each threshold.
for i in range(1, 100):
    l = (i + 1) / 100.0
    COUNT_THRESHOLD = 10
    VARIANCE_THRESHOLD = getThreshold(l)

    # Collect the coordinates of the tokens passing both thresholds
    coordinates_to_draw = []
    for tid, (medx, medy, medz, vars, count) in token_to_data.iteritems():
        if count > COUNT_THRESHOLD and checkVarianceThreshold(vars):
            coordinates_to_draw.append(EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz))

    pickle.dump(coordinates_to_draw, open(sys.argv[1] + "_" + str(l) + ".pickle", 'wb'))

# Draw coordinates to the map:
# for lon, lat in coordinates_to_draw:
#     basemap.plot(lon, lat, '.r', markeredgecolor='r', markersize=1, latlon=True)
# plt.savefig(sys.argv[1], format='png', bbox_inches='tight', dpi=900)
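# A minimal sketch (not the original getThreshold) of how a per-axis variance
# threshold at quantile l could be computed with numpy; it assumes
# variances_x/y/z hold one variance per token along each Cartesian axis.
import numpy as np

def quantile_threshold(l, variances_x, variances_y, variances_z):
    # np.percentile expects the quantile as a value in [0, 100]
    return (np.percentile(variances_x, l * 100.0),
            np.percentile(variances_y, l * 100.0),
            np.percentile(variances_z, l * 100.0))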