Example #1
import os
import cPickle as pickle
import numpy as np
from Wrapper import MySQLConnection
from Evaluation import EvaluationFunctions


def pickleTrainingCorpus(filename):
    token_to_data = {}    #< maps a token to a tuple of its coordinates, variance and count:
                          #< ((lon, lat), variance, count)
    COUNT_THRESHOLD = 0

    # Make connection
    database = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TRAIN")

    # Iterate over all tweets and split the tokenised texts.
    # Each token maps to a list of Cartesian coordinate tuples.
    token_distribution_cart = {}
    tweet_coordinates = []
    for tokens, lat, lon in database.getRows("`tokenised_low`, `lat`, `long`"):
        tweet_coordinates.append((lon, lat))
        cartesian = EvaluationFunctions.convertLatLongToCartesian(lon, lat)
        for token in EvaluationFunctions.getCoOccurrences(tokens.split()):
            token_distribution_cart.setdefault(token, []).append(cartesian)

    for token, coordinate_tuples in token_distribution_cart.iteritems():
        count = len(coordinate_tuples)
        if count > COUNT_THRESHOLD:
            # Convert the coordinate list to a numpy array
            np_list = np.asarray(coordinate_tuples, dtype=float)

            # Calculate the mean value for each axis
            (mean_x, mean_y, mean_z) = tuple(np.mean(np_list, axis=0))

            # Variance: mean squared distance of each occurrence to the mean point
            variance_num = 0
            for (x, y, z) in coordinate_tuples:
                variance_num += (x - mean_x)**2 + (y - mean_y)**2 + (z - mean_z)**2
            variance = variance_num / count

            # The stored coordinates are the per-axis median, not the mean
            (median_x, median_y, median_z) = tuple(np.median(np_list, axis=0))

            token_to_data[token] = (EvaluationFunctions.convertCartesianToLatLong(median_x, median_y, median_z), variance, count)

    with open(filename, 'wb') as out:
        pickle.dump(token_to_data, out)
    return tweet_coordinates
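
The coordinate helpers used above are defined elsewhere in the project. As a minimal sketch, assuming a simple spherical Earth model (the radius, units and exact formulas are assumptions, not the project's confirmed code), `convertLatLongToCartesian` and `convertCartesianToLatLong` could look like this:

import math

EARTH_RADIUS_KM = 6371.0  # assumed; the real EvaluationFunctions may use a unit sphere

def convertLatLongToCartesian(lon, lat):
    # Project a (lon, lat) pair in degrees onto a sphere of radius EARTH_RADIUS_KM.
    lon_r, lat_r = math.radians(lon), math.radians(lat)
    x = EARTH_RADIUS_KM * math.cos(lat_r) * math.cos(lon_r)
    y = EARTH_RADIUS_KM * math.cos(lat_r) * math.sin(lon_r)
    z = EARTH_RADIUS_KM * math.sin(lat_r)
    return (x, y, z)

def convertCartesianToLatLong(x, y, z):
    # Invert the projection; the input need not lie exactly on the sphere.
    lon = math.degrees(math.atan2(y, x))
    lat = math.degrees(math.atan2(z, math.sqrt(x * x + y * y)))
    return (lon, lat)

Under this model, the per-token variance computed above is the mean squared chord distance between each occurrence and the mean point, so small values indicate a regionally concentrated token.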
Example #3
    def evaluateTweet(self, tokens, location, user):
        token_data_here = []

        valid = 0
        if self.draw:
            basemap = MapFunctions.prepareMap()

        text_pos = 1890000

        # Look up the data for each token in the tweet
        for token in EvaluationFunctions.getCoOccurrences(tokens):
            token_id = self.signature.add(token)
            if token_id not in self.token_data:
                if False:  # set to self.draw to label unknown tokens on the map
                    plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | (fail)', color='grey', fontsize=6)
                    text_pos -= 42000
                continue

            data = self.token_data[token_id]
            variance = data['variance']
            count = data['count']
            x, y, z = data["median"]
            lon, lat = EvaluationFunctions.convertCartesianToLatLong(x, y, z)
            if self.checkVarianceThreshold(token_id):
                valid += 1
                # Null hypothesis: replace the token's data with that of a
                # randomly drawn token (randint is inclusive, hence len - 1)
                if self.null:
                    token_id = self.token_data.keys()[randint(0, len(self.token_data) - 1)]
                    data = self.token_data[token_id]
                    variance = data['variance']
                    count = data['count']

                if self.draw:
                    #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count), color='black', fontsize=6)
                    text_pos -= 42000
                    current_color = EvaluationFunctions.getColorForValue(variance)
                    basemap.plot(lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.7)

                token_data_here.append((token, variance, count, data["median"], data["variances"]))

            else:
                if self.draw:
                    #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count), color='grey', fontsize=6)
                    text_pos -= 40000
                    current_color = 'gray'
                    basemap.plot(lon, lat, 'o', latlon=True, markeredgecolor=current_color, color=current_color, markersize=EvaluationFunctions.getSizeForValue(count), alpha=0.1)

        if valid == 0:
            # Use the per-user fallback if no token passed the variance threshold
            #if user in self.fallback:
            #    token_data_here = self.fallback[user]
            #else:
            #    print user, " not in ", self.fallback.keys()
            if len(token_data_here) == 0:
                plt.clf()
                return None

        # Generate the data for the weighted midpoint
        coordinate_list, weight_list = self.evaluator.evaluate(token_data_here)

        # Calculate the midpoint
        lon_score, lat_score = EvaluationFunctions.getWeightedMidpointXYZ(coordinate_list, weight_list)

        distance = EvaluationFunctions.getDistance(lon_score, lat_score, location[0], location[1])

        if self.draw:
            # Triangle up: gold location; triangle down: predicted location
            basemap.plot(location[0], location[1], '^', mfc='none', markeredgecolor='black', latlon=True, alpha=1)
            basemap.plot(lon_score, lat_score, 'v', mfc='none', markeredgecolor='black', latlon=True, alpha=1)

            plt.text(10000, 10000, 'Distance: ' + str(round(distance, 1)) + 'km')
            plt.text(10000, 80000, 'Threshold: ' + str(self.variance_threshold))
            plt.savefig('img/tweet_' + str(self.variance_threshold) + "_" + str(self.i) + ".png", format='png')
            plt.clf()

        return (lon_score, lat_score, location[0], location[1], distance)
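
`EvaluationFunctions.getWeightedMidpointXYZ` is not shown on this page. A plausible minimal sketch, assuming it averages the Cartesian coordinates weighted by the evaluator's weight list and converts the result back to (lon, lat) (the function name comes from the source; the body is an assumption):

def getWeightedMidpointXYZ(coordinate_list, weight_list):
    # Assumed reconstruction: weighted mean of (x, y, z) points,
    # converted back to geographic coordinates.
    total = float(sum(weight_list))
    x = sum(w * c[0] for c, w in zip(coordinate_list, weight_list)) / total
    y = sum(w * c[1] for c, w in zip(coordinate_list, weight_list)) / total
    z = sum(w * c[2] for c, w in zip(coordinate_list, weight_list)) / total
    return convertCartesianToLatLong(x, y, z)

Averaging in Cartesian space avoids the wrap-around problems of averaging longitudes directly, which is presumably why the code works with (x, y, z) triples throughout.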
Example #4
import sys
import os
import cPickle as pickle
from Wrapper import MySQLConnection
from Evaluation import EvaluationFunctions
"""
Print the most regional tokens for a given cluster.

Usage:
python PrintRegionalTokens.py Signature Lon Lat range

"""

if len(sys.argv) < 4:
    print "Usage: python PrintRegionalTokens.py Signature Lon Lat"
    sys.exit(1)

signature = pickle.load(open(sys.argv[1], 'rb'))
lon = float(sys.argv[2])
lat = float(sys.argv[3])
rang = 50  # search radius in km

token_to_data = {}  # read in the EVALUATE section below; the code that fills it is truncated in this excerpt
token_db = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TOKENDATA")

for tid, count, medx, medy, medz, varx, vary, varz in token_db.getTokenInfo(ids=None, columns="`id`, `count`, `median_x`, `median_y`, `median_z`, `variance_x`, `variance_y`, `variance_z`"):
    lon_, lat_ = EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz)
    distance = EvaluationFunctions.getDistance(lon, lat, lon_, lat_)
    if distance < rang and count > 20:
        print signature.get(tid), ",", (varx,vary,varz), ",", count

# NOTE: the head of getThreshold was truncated in the source; everything up to
# `pos -= 1` is an assumed reconstruction that picks the value at quantile l
# from the pre-sorted per-axis variance lists.
def getThreshold(l):
    pos = int(len(variances_x) * l)
    if pos >= len(variances_x):
        pos -= 1
    x = variances_x[pos]
    y = variances_y[pos]
    z = variances_z[pos]
    return (x, y, z)

def checkVarianceThreshold((x, y, z)):
    (tx, ty, tz) = VARIANCE_THRESHOLD
    return x < tx and y < ty and z < tz

""" EVALUATE """
# Sort by variance in the token data
for i in range (1,100):
    i += 1
    l = i / 100.0
    COUNT_THRESHOLD = 10
    VARIANCE_THRESHOLD = getThreshold(l)

    # Collect data
    for tid, (medx, medy, medz, vars, count) in token_to_data.iteritems():
        if count > COUNT_THRESHOLD and checkVarianceThreshold(vars):
            coordinates_to_draw.append(EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz))

    pickle.dump(coordinates_to_draw, open(sys.argv[1] + "_" + str(l) + ".pickle", 'wb'))
#
# # Draw coordinates to the map:
# for lon, lat in coordinates_to_draw:
#     basemap.plot(lon, lat, '.r', markeredgecolor='r', markersize=1,latlon=True)
#
# plt.savefig(sys.argv[1], format='png', bbox_inches='tight', dpi=900)
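
`EvaluationFunctions.getDistance` is used in Examples #3 and #4 but not shown. A minimal sketch, assuming a great-circle (haversine) distance in kilometres; the project's actual formula may differ:

import math

def getDistance(lon1, lat1, lon2, lat2):
    # Haversine great-circle distance in km (assumed implementation).
    R = 6371.0
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) ** 2
         + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2)
    return 2 * R * math.asin(math.sqrt(a))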