Esempio n. 1
0
def add_numbers():
    """Return the KDE score for every hour of the day at one location, as JSON.

    Query-string parameters (read from ``request.args``):
        lat, lon:       point to score (default 0).
        lat_c, lon_c:   reference point handed to ``latlon_to_dist`` together
                        with the point (default 0).
        kde_score_max:  raw KDE score that is mapped to the top rating 5.0;
                        must be a positive number.
        tempfile:       path of the joblib-dumped KDE model to load.

    Returns:
        ``jsonify(result=...)`` where the payload is
        ``pd.DataFrame(scores).to_dict()`` for the 24 scores (hours 0..23),
        each scaled so that ``kde_score_max`` maps to 5.0, capped at 5.0 and
        rounded to one decimal.
        On failure a JSON body with ``result=None`` and an ``error`` message
        is returned with status 400 (bad ``kde_score_max``) or 404 (model
        cannot be loaded).
    """
    lat = request.args.get('lat', 0, type=float)
    lon = request.args.get('lon', 0, type=float)
    lat_c = request.args.get('lat_c', 0, type=float)
    lon_c = request.args.get('lon_c', 0, type=float)
    kde_score_max = request.args.get('kde_score_max', 0, type=float)
    # Named model_path (not "tempfile") so the stdlib module name is not shadowed.
    model_path = request.args.get('tempfile', '')

    # The scores are divided by kde_score_max below; zero, negative or NaN
    # values would produce inf/NaN, which is not valid JSON.
    if not kde_score_max > 0:
        return jsonify(result=None,
                       error='kde_score_max must be a positive number'), 400

    # NOTE(security): model_path comes straight from the request and joblib
    # unpickles whatever file it points to. Unpickling untrusted data can
    # execute arbitrary code -- the path should be validated or replaced by a
    # server-side lookup key.
    try:
        kde = joblib.load(model_path)
    except Exception:
        # A Flask view must return a response, never None.
        print('kde does not exist!')
        return jsonify(result=None, error='kde does not exist!'), 404

    xy = latlon_to_dist((lat, lon), (lat_c, lon_c))

    # One sample per hour of the day at the same (x, y): rows are (x, y, hour).
    samples = np.array([np.ones(24) * xy[0],
                        np.ones(24) * xy[1],
                        np.arange(0, 24)]).T

    # score_samples result is exponentiated to get the density itself.
    kde_score = np.exp(kde.score_samples(samples))

    # Rescale so that kde_score_max corresponds to 5.0, then cap and round.
    kde_score /= (kde_score_max / 5.0)
    kde_score[kde_score > 5.0] = 5.0
    kde_score = np.around(kde_score, 1)

    return jsonify(result=pd.DataFrame(kde_score).to_dict())
Esempio n. 2
0
def map_output():
    """Render the dog-photo hotspot map for the requested place and hour.

    Query-string parameters (read from ``request.args``):
        address:   free-text address; an empty string selects
                   ``default_address``. Only geocoded when ``lat``/``lon``
                   are missing or not numeric.
        time:      hour of day, clamped to 0..23.
        distance:  search distance handed to ``get_bbox``; falls back to 5.0
                   when missing or not numeric.
        lat, lon:  optional explicit centre; takes precedence over geocoding.

    Steps:
        1. Resolve the centre and load the photos inside the bounding box.
        2. Keep photos with ``dog_proba`` > 0.85 and convert their position
           to (x, y) relative to the centre via ``latlon_to_dist``.
        3. Cluster on (x, y, hour) with DBSCAN and drop noise points.
        4. Fit a KDE on the clustered points and dump it to a temporary file
           whose path is handed to the template as ``tempfile``.
        5. Keep only photos taken in the requested hour, drop clusters with
           ``min_cluster`` or fewer photos left, re-score, and describe each
           remaining cluster (centroid, covariance ellipse, photo count).

    Returns:
        The rendered ``map.html`` page, or ``redirect.html`` with an
        ``error_msg`` when the time is invalid, the address cannot be
        geocoded, or no photos remain after filtering.
    """
    query_address = request.args.get('address')

    # int(None) raises TypeError and int('abc') raises ValueError; both mean
    # the request carried no usable hour.
    try:
        query_time = int(request.args.get('time'))
    except (TypeError, ValueError):
        return render_template("redirect.html",
                               error_msg="The requested time is not valid.")
    # Clamp on both sides: a negative hour would make strptime("%H") fail
    # when the page is rendered.
    query_time = min(max(query_time, 0), 23)

    # TypeError covers a missing parameter (float(None)).
    try:
        query_distance = float(request.args.get('distance'))
    except (TypeError, ValueError):
        query_distance = 5.0

    try:
        query_latlon = (float(request.args.get('lat')),
                        float(request.args.get('lon')))
    except (TypeError, ValueError):
        query_latlon = None

    if query_address == '':
        query_address = default_address

    if query_latlon is None:
        query_latlon = _geocode_address(query_address)
        if query_latlon is None:
            return render_template("redirect.html",
                                   error_msg="The querried address does not exist.")

    # Debug output, one line per value.
    print('latlon =  %s' % (query_latlon,))
    print('time =  %s' % (query_time,))
    print('address =  %s' % (query_address,))
    print('distance =  %s' % (query_distance,))

    sbox = get_bbox(query_latlon, query_distance)

    # NOTE(review): the bounds are interpolated into the SQL text. They come
    # from get_bbox (presumably plain numbers -- confirm); switching to the
    # driver's bound parameters would be safer.
    sql_query = """
SELECT DISTINCT photo_data_table.id,latitude,longitude,datetaken,
description,tags,url_t,url_m,dog_proba
FROM dog_proba_table 
INNER JOIN photo_data_table 
ON (dog_proba_table.index = photo_data_table.id)
WHERE photo_data_table.latitude > {lat_min} 
AND photo_data_table.latitude < {lat_max} 
AND photo_data_table.longitude > {lon_min} 
AND photo_data_table.longitude < {lon_max};
"""\
    .format(lat_min=sbox[1], lat_max=sbox[3],
            lon_min=sbox[0], lon_max=sbox[2])

    query_results = pd.read_sql_query(sql_query, con)

    # filter non-dogs; copy so that the column assignments below act on an
    # independent frame rather than on a slice of the SQL result
    query_results = query_results[query_results['dog_proba'] > 0.85].copy()

    # no photos around the center: stop before any per-row computation
    if query_results.empty:
        return _render_no_dogs()

    # convert latlon to xy coordinate in km
    xy = query_results[['latitude', 'longitude']]\
         .apply(lambda x: latlon_to_dist(x, query_latlon), axis=1)
    xy = pd.DataFrame(xy, columns=['xy'])
    for n, col in enumerate(['x', 'y']):
        xy[col] = xy['xy'].apply(lambda location: location[n])

    query_results['x'] = xy['x']
    query_results['y'] = xy['y']

    # convert datetaken to hour taken
    # scale: 1.0 means that 1 hour corresponds to 1 km
    scale = 1.0
    hours = query_results['datetaken'].apply(lambda x: x.hour + x.minute / 60.0)
    xyh = pd.concat([xy[['x', 'y']], hours * scale], axis=1)

    query_results['hour'] = hours

    # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
    # Cluster in (x, y, hour) space; label -1 marks noise points.
    labels = DBSCAN(eps=0.3, metric='euclidean', min_samples=5)\
        .fit_predict(xyh)

    # add labels to dataframe
    query_results = pd.concat(
        [query_results,
         pd.DataFrame(labels, columns=['label'], index=query_results.index)],
        axis=1)

    # drop -1 clusters
    query_results = query_results[query_results['label'] != -1]

    if query_results.empty:
        return _render_no_dogs()

    # KDE over position and hour of all clustered photos
    kde = KernelDensity(bandwidth=0.4,
                        kernel='gaussian', algorithm='ball_tree')
    kde.fit(query_results[['x', 'y', 'hour']])

    kde_score = np.exp(kde.score_samples(query_results[['x', 'y', 'hour']]))

    # The score one fifth of the way down the descending ranking is mapped to
    # the top rating 5.0. Floor division keeps the index an integer.
    kde_score_max = np.sort(kde_score)[::-1][len(kde_score) // 5]
    kde_score /= (kde_score_max / 5.0)
    kde_score[kde_score > 5.0] = 5.0

    query_results = pd.concat(
        [query_results,
         pd.DataFrame(kde_score, index=query_results.index,
                      columns=['kde_score'])], axis=1)

    # save kde model to a temporary file; only the name is needed, so the
    # handle is closed right away instead of being leaked.
    # NOTE: delete=False means the file stays on disk after this request.
    model_file = NamedTemporaryFile(delete=False)
    model_file.close()
    model_path = model_file.name
    joblib.dump(kde, model_path)

    # return only for the specified hour
    hours = query_results['datetaken'].apply(lambda x: x.hour)
    query_results = query_results[hours == query_time]

    # drop small-element cluster after sliced by an hour
    min_cluster = 5
    cluster_sizes = query_results.groupby('label')['label'].count()
    kept_labels = cluster_sizes[cluster_sizes > min_cluster].index
    query_results = query_results[query_results['label'].isin(kept_labels)].copy()

    # Nothing left for this hour: stop before scoring or concatenating an
    # empty selection, both of which would raise.
    if query_results.empty:
        return _render_no_dogs()

    # re-calucalte the kde score exactly at the querried hour
    qu_re = query_results[['x', 'y', 'hour']].copy()
    qu_re['hour'] = qu_re['hour'].apply(np.floor)
    kde_score_2 = np.exp(kde.score_samples(qu_re))

    kde_score_2 /= (kde_score_max / 5.0)
    kde_score_2[kde_score_2 > 5.0] = 5.0
    query_results['kde_score_2'] = kde_score_2

    # take top 3 clusters, ranked by number of photos in the requested hour
    n_tops = 3
    label_groups = query_results[['kde_score_2', 'label']].groupby('label')
    label_measure = label_groups.count()

    top3_labels = label_measure.sort_values('kde_score_2', ascending=False)[:n_tops]

    # each top cluster is represented by its highest-scoring photo
    top3_repr = []
    for idx in top3_labels.index:
        idx_max = query_results[query_results['label'] == idx]['kde_score_2'].idxmax()
        top3_repr.append(query_results.loc[idx_max])

    top3_repr = pd.concat(top3_repr, axis=1)

    # gather cluster characteristics
    # The counts come from the DBSCAN labels, i.e. from all hours, not only
    # the requested one.
    lb_unique, lb_counts = np.unique(labels, return_counts=True)
    dbscan_counts = dict(zip(lb_unique, lb_counts))
    centroids = query_results.groupby('label').mean().transpose().to_dict()
    for key, value in centroids.items():
        # square root of the count is stored (presumably a display size --
        # confirm against map.html)
        value['num_pics'] = np.sqrt(dbscan_counts[key])

    # get mean and covariance of the groups
    by_label = query_results.groupby('label')
    covs = by_label[['latitude', 'longitude']].cov()
    means = by_label[['latitude', 'longitude']].mean()
    num_pics = by_label[['label']].count()
    num_pics.columns = ['num_pics']

    labels_multi = covs.index.get_level_values('label').unique()

    cluster_shape = {}
    for lb in labels_multi:
        # eigh returns the eigenvalues in ascending order with the matching
        # eigenvectors as columns.
        eigs = np.linalg.eigh(covs.loc[lb])
        # Round-off can make an eigenvalue slightly negative; floor it at 0
        # so the radius cannot become NaN.
        radii = list(np.sqrt(np.maximum(eigs[0], 0)) * 2)
        pvec = eigs[1][:, 0]  # direction of the 1st eigenvector (lat, lng)
        pvec = np.array([pvec[1], pvec[0]])  # switch to make it (lng, lat)
        pdir = [np.arctan(pvec[1] / pvec[0])]  # get angle from x-axis (lng-direction)
        center = list(means.loc[lb])

        cluster_shape[lb] = center + radii + pdir

    cluster_shape = pd.DataFrame(
        cluster_shape, index=['lat_c', 'lon_c', 'radii_x', 'radii_y', 'orientation'])

    cluster_shape = pd.concat([cluster_shape, num_pics.transpose()])

    print('# clusters to show: %s' % (len(set(query_results['label'])),))

    hour_of_day = datetime.strptime(str(query_time), "%H")

    return render_template("map.html",
                           photos=query_results.to_dict(orient='index'),
                           max_label=query_results['label'].max(),
                           address=query_address,
                           hour=hour_of_day.strftime("%-I %p"),
                           hour_24=hour_of_day.strftime("%-H"),
                           distance=query_distance,
                           clusters=centroids,
                           cluster_shape=cluster_shape.to_dict(),
                           kde_score_max=kde_score_max,
                           top3=top3_repr.to_dict(),
                           tempfile=model_path,
                           center=query_latlon)


def _geocode_address(address):
    """Return ``(latitude, longitude)`` for *address*, or None if every geocoder fails."""
    # NOTE(security): the Google API key is hard-coded in the source; it
    # should be loaded from configuration or the environment instead.
    geocoder_factories = (
        # try openmap geocoder first
        Nominatim,
        lambda: GoogleV3(api_key='AIzaSyB7LvwvLJN0l04rFfHbIyUBsqi61vP6qWA'),
    )
    for make_geocoder in geocoder_factories:
        try:
            location = make_geocoder().geocode(address)
            return (location.latitude, location.longitude)
        except Exception:
            # Any failure (service error, or a result without coordinates)
            # falls through to the next geocoder.
            continue
    return None


def _render_no_dogs():
    """Render the error page shown when no dog photos match the query."""
    return render_template("redirect.html",
                           error_msg="There is no dogs around the querried area and time.")