Esempio n. 1
0
def plot_clusters(path_to_cluster_results,
                  file_name,
                  plot_medoid_summary=True,
                  plot_each_cluster=True,
                  n_trajectories_per_cluster=4,
                  plot_heatmap=True,
                  save_outputs=True):
    """
    Plot the results of the sequence analysis with the old matplotlib plots.

    Deprecated: these are the old matplotlib plots, do not use them for new work.

    Parameters:
        path_to_cluster_results: path to the cluster results; it is concatenated
            directly with file_name, so it must end with a path separator
        file_name: name of the cluster results file (without the ".csv" extension)
        plot_medoid_summary: Boolean flag to plot the medoids of all clusters
        plot_each_cluster: Boolean flag to plot sample trajectories of each cluster
        n_trajectories_per_cluster: Number of sample trajectories to plot in the clusterwise sample
        plot_heatmap: Boolean flag to plot the Tuscany heatmap for each cluster
        save_outputs: Boolean flag to save the results; forwarded as the `save`
            argument of every plotting call

    Returns:
        None. The selected plots are produced by TrajectoryClustermap, which is
        handed path_to_cluster_results and file_name as its save location.
    """
    # Load the cluster data. The return value of str_to_list is not used, so it
    # presumably converts the stringified list columns in place -- TODO confirm.
    cluster_results = pd.read_csv(path_to_cluster_results + file_name + ".csv")
    gp.str_to_list(cluster_results)

    # Load the shape files; their config is located relative to this module.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    path_shapefiles, regions, _provinces, _territories, municipalities, crs = read_files.read_shapefile_data(
        module_dir + '/../read_shapefiles/', 'shape_files_path.json')
    # NOTE(review): df_mun is named like a municipality frame but is read from
    # the regions layer -- confirm this is intended.
    df_mun = read_files.read_shapefiles_in(False, path_shapefiles, regions,
                                           crs)
    df_mun_tus = read_files.read_shapefiles_in(True, path_shapefiles,
                                               municipalities, crs)

    # Initialize the plot. Unlike the shapefile path above, the centroid file is
    # resolved relative to the current working directory.
    viz_dir = "../viz/"
    t = TrajectoryClustermap(df_mun,
                             path_to_centroids=viz_dir + "comune_centroids.csv")

    # Plot the requested figures, honoring save_outputs (previously the plots
    # were always saved regardless of the flag).
    if plot_medoid_summary:
        t.plot_medoids(cluster_results,
                       path_to_save=path_to_cluster_results,
                       file_name=file_name,
                       save=save_outputs)
    if plot_each_cluster:
        t.plot_samples(cluster_results,
                       n_trajectories_per_cluster,
                       path_to_save=path_to_cluster_results,
                       file_name=file_name,
                       save=save_outputs)
    if plot_heatmap:
        t.plot_trajectories_heatmap(cluster_results,
                                    df_mun_tus,
                                    path_to_save=path_to_cluster_results,
                                    file_name=file_name,
                                    save=save_outputs)
Esempio n. 2
0
def read_tusc(path):
    """Helper function to read in the Tuscany shapefile.

    Reads the shapefile configuration 'shape_files_path.json' found under
    `path` and returns whatever read_files.read_shapefiles_in yields for the
    regions layer with its first argument set to True.
    """
    # Only the shapefile directory, the regions entry and the CRS are needed;
    # the remaining configuration values are unpacked and ignored.
    shapefile_dir, regions, _provinces, _territories, _municipalities, crs = (
        read_files.read_shapefile_data(path, 'shape_files_path.json'))
    return read_files.read_shapefiles_in(True, shapefile_dir, regions, crs)
Esempio n. 3
0
def plot_cluster_heatmaps(path_to_cluster_results, file_name):
    """
    Plot the interactive heatmap results of the sequence analysis.

    For every distinct value of the 'cluster' column in the results file, count
    in how many trips of that cluster each municipality code appears (a
    municipality is counted at most once per trip) and draw the municipality
    polygons coloured by that count.

    Parameters:
        path_to_cluster_results: path to the cluster results; it is concatenated
            directly with file_name, so it must end with a path separator
        file_name: name of the cluster results file (without the ".csv" extension)

    Returns:
        None. One HTML file per cluster is saved under
        path_to_cluster_results + "/clusterwise_heatmaps/", which is created
        if it does not exist yet.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    path_shapefiles, _regions, _provinces, _territories, municipalities, crs = read_files.read_shapefile_data(
        module_dir + '/../read_shapefiles/', 'shape_files_path.json')
    # Only the municipality layer is drawn, so no other layer is loaded.
    df_mun_tus = read_files.read_shapefiles_in(True, path_shapefiles,
                                               municipalities, crs)

    # expand multipolygons: one row per polygon part, all other municipality
    # columns re-attached through PRO_COM
    shp_expanded = df_mun_tus.set_index(['PRO_COM'])['geometry'].apply(
        pd.Series).stack().reset_index()
    shp_expanded.rename(columns={0: 'geometry'}, inplace=True)
    df_mun_tus_exp = shp_expanded.merge(df_mun_tus.drop(columns='geometry'),
                                        on='PRO_COM',
                                        how='left')

    # The return value of str_to_list is not used, so it presumably adds the
    # 'locations_list' column in place -- TODO confirm.
    df_clusters = pd.read_csv(path_to_cluster_results + file_name + ".csv")
    gp.str_to_list(df_clusters)

    # Loop invariants: the output directory (created up front so saving cannot
    # fail on a missing folder) and the colour palette.
    output_dir = path_to_cluster_results + "/clusterwise_heatmaps/"
    os.makedirs(output_dir, exist_ok=True)
    colors = brewer['Reds'][9][::-1]

    # create heatmaps gdfs
    for c in df_clusters['cluster'].unique():

        df_clus = df_clusters[df_clusters['cluster'] == c]

        # create list of municipalities for all the trips in a single cluster;
        # np.unique keeps each municipality once per trip
        trips = []
        for locations in df_clus['locations_list']:
            trips.extend(int(code) for code in np.unique(locations))

        df_trips = pd.DataFrame(data={'pro_com': trips})
        df_counts = df_trips['pro_com'].value_counts().rename_axis(
            'pro_com').reset_index(name='counts')

        # counts for each municipality; those absent from the cluster get 0
        heatmap_gdf = df_mun_tus_exp.merge(df_counts,
                                           how='left',
                                           left_on='PRO_COM',
                                           right_on='pro_com').fillna(0)

        # Get lat lon from geometry to plot
        heatmap_toplot = heatmap_gdf.drop('geometry', axis=1).copy()

        heatmap_toplot['x'] = heatmap_gdf.apply(getGeometryCoords,
                                                geom='geometry',
                                                coord_type='x',
                                                shape_type='polygon',
                                                axis=1)

        heatmap_toplot['y'] = heatmap_gdf.apply(getGeometryCoords,
                                                geom='geometry',
                                                coord_type='y',
                                                shape_type='polygon',
                                                axis=1)

        # Make heatmap: the colour scale spans this cluster's own count range
        mapper = LinearColorMapper(palette=colors,
                                   high=heatmap_toplot['counts'].max(),
                                   low=heatmap_toplot['counts'].min())
        source = ColumnDataSource(data=dict(x=heatmap_toplot['x'],
                                            y=heatmap_toplot['y'],
                                            name=heatmap_toplot['COMUNE'],
                                            count=heatmap_toplot['counts']))

        p = figure(x_axis_location=None,
                   y_axis_location=None,
                   plot_width=800,
                   plot_height=700)
        p.grid.grid_line_color = None
        p.outline_line_color = None
        p.title.align = "center"
        p.title.text_font_size = "40px"

        p.patches('x',
                  'y',
                  source=source,
                  fill_color=transform('count', mapper),
                  fill_alpha=0.8,
                  line_color="gray",
                  line_width=0.3)

        color_bar = ColorBar(color_mapper=mapper,
                             major_label_text_font_size='10pt',
                             label_standoff=12,
                             border_line_color=None,
                             location=(0, 0))
        p.add_layout(color_bar, 'right')

        # Add tools
        hover = HoverTool(tooltips=[("Comune",
                                     "@name"), ("Number of visitors",
                                                "@count")])

        p.add_tools(PanTool(), WheelZoomTool(), hover)
        output_file(output_dir + file_name + "_heatmap_cluster_" + str(c) +
                    ".html")
        save(p)
Esempio n. 4
0
def run(params):
    """
    Runs the Geo2vec model based on the params input

    `params` is indexed with the keys 'username', 'season', 'country',
    'EMB_SIZE', 'WINDOW_SIZE', 'N_EPOCHS', 'N_CLUSTERS', 'MIN_LENGTH',
    'MIN_COUNT', 'train_model', 'apply_tsne', 'plot_clusters_italy' and
    'plot_clusters_tuscany'; a missing key raises KeyError.

    The pipeline loads and filters the trips, reads the shapefiles, builds
    (and optionally trains) the Geo2vec model, clusters it and, depending on
    the flags, writes the Italy / Tuscany cluster maps and the t-SNE plot
    under "../results/geo2vec/<country>_<season>/".
    """
    # ---- run configuration -------------------------------------------------
    username, season, country = params['username'], params['season'], params['country']
    emb_size = params['EMB_SIZE']
    window_size = params['WINDOW_SIZE']
    n_epochs = params['N_EPOCHS']
    n_clusters = params['N_CLUSTERS']
    min_length = params['MIN_LENGTH']
    min_count = params['MIN_COUNT']
    train_model = params['train_model']
    apply_tsne = params['apply_tsne']
    plot_clusters_italy = params['plot_clusters_italy']
    plot_clusters_tuscany = params['plot_clusters_tuscany']
    path_to_shapefiles = "../src/utils/read_shapefiles/"
    path_to_final_plots = "../results/geo2vec/"

    # ---- preprocessing -----------------------------------------------------
    # load locations data, either for every country or for the selected one
    if country != 'all':
        raw_trips = load_df.get_geo2vec_data(username, season, country)
    else:
        raw_trips = load_df.get_geo2vec_data_all_country(username, season)
    df_trips = raw_trips.rename(columns={'com_locs_trunc': 'locations'})

    preprocessing.str_to_list(df_trips)
    df_trips_red = preprocessing.filter_short_trips(df_trips, min_length=min_length)
    preprocessing.descriptive_sanity_check(df_trips_red)

    # ---- shapefiles --------------------------------------------------------
    path_shapefiles, _regions, _provinces, _territories, municipalities, crs = (
        read_files.read_shapefile_data(path_to_shapefiles, 'shape_files_path.json'))
    df_mun = read_files.read_shapefiles_in(False, path_shapefiles, municipalities, crs)
    df_mun_tus = read_files.read_shapefiles_in(True, path_shapefiles, municipalities, crs)

    # ---- Geo2vec model -----------------------------------------------------
    sequences = df_trips_red['locations_list']
    g2v = Geo2vec(emb_size, window_size, season, country)
    g2v.initialize(sequences=sequences, min_count=min_count)
    if train_model:
        g2v.train(sequences, n_epochs=n_epochs)
    g2v.print_params()
    g2v.create_clusters(n_clusters=n_clusters)

    # ---- outputs -----------------------------------------------------------
    # Every output of this run lands in the same <country>_<season>/ folder and
    # the map file names embed the same hyper-parameters.
    output_dir = path_to_final_plots + '{}_{}/'.format(country, season)
    hyper_params = (emb_size, window_size, n_epochs, n_clusters, min_length, min_count)

    # (enabled, console banner, pickle tag, geodataframe, file-name template)
    # careful here: plotting Italy generates a HUGE html file
    cluster_maps = (
        (plot_clusters_italy, '\nITALY -- plotting...', 'Italy', df_mun,
         'Location_cluster_Italy_EMB{}_WIN{}_EPO{}_CLU{}_MinL{}_MinC{}.html'),
        (plot_clusters_tuscany, '\nTUSCANY -- plotting...\n\n', 'Tuscany', df_mun_tus,
         'Location_cluster_Tuscany_EMB{}_WIN{}_EPO{}_CLU{}_MinL{}_MinC{}.html'),
    )
    for enabled, banner, tag, gdf, name_template in cluster_maps:
        if not enabled:
            continue
        print(banner)
        g2v.merge_gdf(gdf)
        g2v.get_most_similar()
        g2v.pickle_cluster_labels(tag=tag)
        plt_loc(g2v.gdf_clusters,
                path_to_file=output_dir,
                file_name=name_template.format(*hyper_params))

    # Apply t-SNE to visualize the clusters
    if apply_tsne:
        g2v.apply_tsne_2D()
        g2v.plot_tsne_2D(path_to_save=output_dir)