def test_aggplot(self):
    """Run ``aggplot`` under a PlateCarree projection, once without and once with ``by``."""
    # Extra keyword arguments for each call; everything else is shared.
    per_call_extras = ({}, {'by': 'mock_category'})
    try:
        for extras in per_call_extras:
            gplt.aggplot(dataframe_gaussian_points, hue='mock_category',
                         projection=gcrs.PlateCarree(), **extras)
    finally:
        # Release the current figure even if a call above raised.
        plt.close()
def test_aggplot(self, projection, sankey_hue, legend_vars, sankey_data_inputs):
    """Call ``aggplot`` on ``agg_data`` with keyword arguments assembled from the parameters."""
    # Later mappings win on key collisions, so the fixture-provided mappings
    # may override 'projection' and 'hue'.
    call_kwargs = {
        'projection': projection,
        'hue': sankey_hue,
        **legend_vars,
        **sankey_data_inputs,
    }
    try:
        gplt.aggplot(agg_data, **call_kwargs)
    finally:
        # Release the current figure even if the call above raised.
        plt.close()
def _version_tuple(version):
    """
    Return the leading numeric components of a dotted version string as a tuple of ints.

    Parsing stops at the first component that is not purely numeric, keeping any leading digits
    of that component: ``"0.10.0"`` gives ``(0, 10, 0)`` and ``"0.3.0rc1"`` gives ``(0, 3, 0)``.
    A string with no leading numeric component gives ``()``.
    """
    components = []
    for piece in str(version).split('.'):
        digits = ''
        for char in piece:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        components.append(int(digits))
        if digits != piece:
            # A suffix such as "rc1" ends the numeric part of the version.
            break
    return tuple(components)


def geoplot(df, filter=None, n=0, p=0, x=None, y=None, figsize=(25, 10), inline=False,
            by=None, cmap='YlGn', **kwargs):
    """
    Generates a geographical data nullity heatmap, which shows the distribution of missing data across geographic
    regions.

    Every record is turned into a point built from its `x` and `y` columns and is given a `nullity` score: the
    fraction of its columns that are not null. The points are then handed to `geoplot.aggplot`, which averages the
    score per region. If `by` is not specified, `aggplot` chooses the regions itself (quadtree mode); if `by` names
    a column of geographies (region, borough, ZIP code, etc.), the points are grouped by that column (convex hull
    mode).

    This function is deprecated and only works with version 0.2.4 or lower of the `geoplot` package.

    :param df: The DataFrame whose completeness is being geoplotted.
    :param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
    :param n: The cap on the number of columns to include in the filtered DataFrame.
    :param p: The cap on the percentage fill of the columns in the filtered DataFrame.
    :param x: The variable in the dataset containing the x-coordinates of the dataset. Required.
    :param y: The variable in the dataset containing the y-coordinates of the dataset. Required.
    :param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to `(25, 10)`.
    :param inline: Deprecated. If True, the plot is shown with `plt.show()` and nothing is returned.
    :param by: If specified, plot in convex hull mode, using the given column to cluster points in the same area.
        Records with a null value in this column, and groups with fewer than three points, are dropped with a
        warning. If not specified, plot in quadtree mode.
    :param cmap: The colormap to display the data with. Defaults to `YlGn`.
    :param kwargs: Additional keyword arguments are passed to the underlying `geoplot.aggplot` call.
    :return: If `inline` is False, the current `matplotlib` axes (as returned by `plt.gca()`). Else, nothing.
    :raises ImportError: If the `geoplot` package is not installed, or if its version is 0.3.0 or higher.
    :raises ValueError: If `x` or `y` is not specified.
    """
    warnings.warn(
        "The 'geoplot' function has been deprecated, and will be removed in a future version "
        "of missingno. The 'geoplot' package has an example recipe for a more full-featured "
        "geospatial nullity plot: "
        "https://residentmario.github.io/geoplot/gallery/plot_san_francisco_trees.html"
    )
    try:
        import geoplot as gplt
    except ImportError as err:
        raise ImportError(
            "Install geoplot <= 0.2.4 (the package) for geoplot function support"
        ) from err

    # Compare versions numerically: as plain strings, "0.10.0" sorts before "0.3.0" and would
    # slip through the guard.
    if _version_tuple(gplt.__version__) >= (0, 3):
        raise ImportError(
            "The missingno geoplot function requires geoplot package version 0.2.4 or lower. "
            "To use the geoplot function, downgrade to an older version of the geoplot package."
        )

    import geopandas as gpd
    from shapely.geometry import Point

    df = nullity_filter(df, filter=filter, n=n, p=p)

    # Per-record share of non-null columns.
    nullity = df.notnull().sum(axis='columns') / df.shape[1]
    if x and y:
        gdf = gpd.GeoDataFrame(
            nullity, columns=['nullity'],
            geometry=df.apply(lambda srs: Point(srs[x], srs[y]), axis='columns')
        )
    else:
        raise ValueError("The 'x' and 'y' parameters must be specified.")

    if by:
        if df[by].isnull().any():
            warnings.warn(
                'The "{0}" column included null values. The offending records were dropped'
                .format(by))
            df = df.dropna(subset=[by])
            gdf = gdf.loc[df.index]

        vc = df[by].value_counts()
        if (vc < 3).any():
            warnings.warn(
                'Grouping by "{0}" included clusters with fewer than three points, which cannot be made '
                'polygonal. The offending records were dropped.'.format(by))
            # Keep only the records whose group has at least three members.
            where = df[by].isin((vc > 2).where(lambda b: b).dropna().index.values)
            gdf = gdf.loc[where]
        gdf[by] = df[by]

    gplt.aggplot(gdf, figsize=figsize, hue='nullity', agg=np.average, cmap=cmap, by=by,
                 edgecolor='None', **kwargs)
    ax = plt.gca()

    if inline:
        warnings.warn(
            "The 'inline' argument has been deprecated, and will be removed in a future version "
            "of missingno.")
        plt.show()
    else:
        return ax
def test_aggplot(self):
    """Run ``aggplot`` with several forms of ``hue`` and ``by``, without and with explicit ``geometry``."""
    points = dataframe_gaussian_points

    # Factories rather than values: each ``by`` argument is built right before the call that
    # uses it, and the map variant is a one-shot iterator that must be fresh every time.
    by_factories = (
        lambda: 'mock_category',
        lambda: points['mock_category'],  # series
        lambda: list(points['mock_category']),  # list
        lambda: map(lambda v: v, list(points['mock_category'])),  # map
    )

    try:
        # Different forms of the hue argument.
        gplt.aggplot(series_gaussian_points, hue=list_hue_values)
        gplt.aggplot(points, hue=list_hue_values)
        gplt.aggplot(points, hue=list_hue_values)
        gplt.aggplot(points, hue=series_hue_values)
        gplt.aggplot(points, hue=map_hue_values())
        gplt.aggplot(points, hue='hue_var')

        # Every form of ``by``: first without a geometry, then with one.
        for extras in ({}, {'geometry': aggplot_geometries}):
            for make_by in by_factories:
                gplt.aggplot(points, hue=list_hue_values, by=make_by(), **extras)
    finally:
        # Release every figure opened above, even if a call raised.
        plt.close('all')
# Reproject to EPSG:4326, discard the existing index, then expose the fresh
# positional index as a column named 'n'.
manhattan = (
    manhattan.to_crs(epsg=4326)
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'n'})
)


# Plot the data.

# This plot demonstrates an extremely useful trick. When used with a provided geometry, the aggplot plot type
# expects an iterable of geometries to be used for binning observations. The idea is that, in general, we have
# n observations and some smaller number k of locations containing them, and we will match observations within
# the same bin, average them in some way, and plot the result.
#
# Of course, what if n == k? In other words, what if every observation comes with its own location? In that case
# we can pass those locations to the ``geometry`` parameter and pass the data's index to the ``by`` parameter,
# and ``aggplot`` will plot all of our records one at a time!
#
# This is a nice feature to have, and very useful for a wide variety of datasets. In this case we are plotting
# building ages in Manhattan using data taken from MapPLUTO
# (http://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page).
#
# Note that this plot is for the purposes of example only: it contains 40,000 geometries (far more than
# palatable) and so takes a long time to render. To explore the data for real take a look at this all-NYC webmap:
# http://pureinformation.net/building-age-nyc/.
record_ids = pd.Series(manhattan.index)  # one "bin" per record
ax = gplt.aggplot(
    manhattan,
    projection=gcrs.PlateCarree(),
    geometry=manhattan.geometry,
    by=record_ids,
    hue='YearBuilt',
    linewidth=0,
)
ax.set_title("Buildings in Manhattan by Year Built")
plt.savefig("aggplot-singular.png", bbox_inches='tight', pad_inches=0.1)
def geoplot(df, filter=None, n=0, p=0, sort=None, x=None, y=None, figsize=(25, 10), inline=False,
            by=None, cmap='YlGn', **kwargs):
    """
    Plots a geographical nullity heatmap showing how missing data is distributed across geographic regions.

    Each record becomes a point built from its `x` and `y` columns, scored by the fraction of its columns that are
    not null. The scored points are passed to `geoplot.aggplot`, which averages the score per region. Without `by`,
    the choice of regions is left to `aggplot` (quadtree mode); with `by`, points are grouped by that column of
    geographies (region, borough, ZIP code, etc.) and drawn in convex hull mode.

    :param df: The DataFrame whose completeness is being geoplotted.
    :param filter: The filter to apply to the heatmap. Should be one of "top", "bottom", or None (default).
    :param n: The cap on the number of columns to include in the filtered DataFrame.
    :param p: The cap on the percentage fill of the columns in the filtered DataFrame.
    :param sort: The sort to apply to the heatmap. Should be one of "ascending", "descending", or None.
    :param x: The variable in the dataset containing the x-coordinates of the dataset. Required.
    :param y: The variable in the dataset containing the y-coordinates of the dataset. Required.
    :param figsize: The size of the figure to display. This is a `matplotlib` parameter which defaults to `(25, 10)`.
    :param inline: If True, the plot is shown with `plt.show()` and nothing is returned.
    :param by: If specified, plot in convex hull mode, using the given column to cluster points in the same area.
        Records with a null value in this column, and groups with fewer than three points, are dropped with a
        warning. If not specified, plot in quadtree mode.
    :param cmap: The colormap to display the data with. Defaults to `YlGn`.
    :param kwargs: Additional keyword arguments are passed to the underlying `geoplot.aggplot` call.
    :return: If `inline` is False, the current `matplotlib` axes (as returned by `plt.gca()`). Else, nothing.
    """
    import geoplot as gplt
    import geopandas as gpd
    from shapely.geometry import Point

    filtered = nullity_filter(df, filter=filter, n=n, p=p)
    df = nullity_sort(filtered, sort=sort)

    # Per-record share of non-null columns.
    non_null_counts = df.notnull().sum(axis='columns')
    nullity = non_null_counts / df.shape[1]

    if not (x and y):
        raise ValueError("The 'x' and 'y' parameters must be specified.")

    locations = df.apply(lambda srs: Point(srs[x], srs[y]), axis='columns')
    gdf = gpd.GeoDataFrame(nullity, columns=['nullity'], geometry=locations)

    if by:
        # Records without a group cannot be placed in any region.
        if df[by].isnull().any():
            warnings.warn('The "{0}" column included null values. The offending records were dropped'.format(by))
            df = df.dropna(subset=[by])
            gdf = gdf.loc[df.index]

        vc = df[by].value_counts()
        if (vc < 3).any():
            warnings.warn('Grouping by "{0}" included clusters with fewer than three points, which cannot be made '
                          'polygonal. The offending records were dropped.'.format(by))
            # Labels of the groups that have at least three members.
            large_enough = (vc > 2).where(lambda b: b).dropna().index.values
            where = df[by].isin(large_enough)
            gdf = gdf.loc[where]
        gdf[by] = df[by]

    gplt.aggplot(gdf, figsize=figsize, hue='nullity', agg=np.average, cmap=cmap, by=by,
                 edgecolor='None', **kwargs)
    ax = plt.gca()

    if not inline:
        return ax
    plt.show()
# Load the data (uses the `quilt` package).
from quilt.data.ResidentMario import geoplot_data
import geopandas as gpd

# Zip code polygons, indexed by their id cast to float.
boston_zip_codes = gpd.read_file(geoplot_data.boston_zip_codes())
zip_code_ids = boston_zip_codes.id.astype(float)
boston_zip_codes = boston_zip_codes.assign(id=zip_code_ids).set_index('id')

# Listings, with the zipcode column cast to float as well
# (presumably so it lines up with the float zip code ids above).
listings = gpd.read_file(geoplot_data.boston_airbnb_listings())
listing_zip_codes = listings.zipcode.astype(float)
listings = listings.assign(zipcode=listing_zip_codes)


# Plot the data.
import geoplot as gplt
import geoplot.crs as gcrs
import numpy as np
import matplotlib.pyplot as plt

# Backdrop: the zip code polygons in flat gray.
ax = gplt.polyplot(
    boston_zip_codes.geometry,
    projection=gcrs.AlbersEqualArea(),
    facecolor='lightgray',
    edgecolor='gray',
    linewidth=0,
)

# Overlay: the median listing price per zip code, drawn onto the same axes.
gplt.aggplot(
    listings,
    projection=gcrs.AlbersEqualArea(),
    hue='price',
    by='zipcode',
    geometry=boston_zip_codes.geometry,
    agg=np.median,
    ax=ax,
    linewidth=0,
)

ax.set_title("Median AirBnB Price by Boston Zip Code, 2016")
plt.savefig("boston-airbnb-aggplot.png", bbox_inches='tight', pad_inches=0.1)
# Plot the data. import geoplot as gplt import geoplot.crs as gcrs import numpy as np import matplotlib.pyplot as plt f, axarr = plt.subplots(3, 1, figsize=(12, 12), subplot_kw={ 'projection': gcrs.AlbersEqualArea(central_latitude=40.7128, central_longitude=-74.0059) }) plt.suptitle('Max(Injuries) in Collision by Area, 2016', fontsize=16) plt.subplots_adjust(top=0.95) ax1 = gplt.aggplot(collisions, projection=gcrs.AlbersEqualArea(), hue='NUMBER OF PERSONS INJURED', agg=np.max, cmap='Reds', nmin=100, nmax=500, linewidth=0.5, edgecolor='white', ax=axarr[0]) ax1.set_title("No Geometry (Quadtree)") ax2 = gplt.aggplot(collisions, projection=gcrs.AlbersEqualArea(), hue='NUMBER OF PERSONS INJURED', agg=np.max, cmap='Reds', by='ZIP CODE', linewidth=0.5, edgecolor='white', ax=axarr[1]) ax2.set_title("Categorical Geometry (Convex Hull)") zip_codes = gplt.datasets.load('nyc-zip-codes') ax3 = gplt.aggplot(collisions, projection=gcrs.AlbersEqualArea(), hue='NUMBER OF PERSONS INJURED', agg=np.max, by='ZIP CODE', geometry=zip_codes.geometry,
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import numpy as np
import matplotlib.pyplot as plt

# Load the data: zip code polygons (indexed by their id cast to float) and the listings.
zip_codes_path = gplt.datasets.get_path('boston_zip_codes')
boston_zip_codes = gpd.read_file(zip_codes_path)
float_ids = boston_zip_codes.id.astype(float)
boston_zip_codes = boston_zip_codes.assign(id=float_ids).set_index('id')

listings_path = gplt.datasets.get_path('boston_airbnb_listings')
boston_airbnb_listings = gpd.read_file(listings_path)

# One projection object shared by both layers.
proj = gcrs.AlbersEqualArea()

# Backdrop: the zip code polygons in flat gray.
ax = gplt.polyplot(
    boston_zip_codes,
    projection=proj,
    facecolor='lightgray',
    edgecolor='gray',
    linewidth=0,
)

# Overlay: the median listing price per zip code, drawn onto the same axes.
gplt.aggplot(
    boston_airbnb_listings,
    projection=proj,
    hue='price',
    by='zipcode',
    geometry=boston_zip_codes,
    agg=np.median,
    ax=ax,
    linewidth=0,
)

ax.set_title("Median AirBnB Price by Boston Zip Code, 2016")
plt.savefig("boston-airbnb-aggplot.png", bbox_inches='tight', pad_inches=0.1)
}) plt.suptitle('Max(Injuries) in Collision by Area, 2016', fontsize=16) plt.subplots_adjust(top=0.95) # In the first plot we do not provide any geographic data at all as input. In this case aggplot takes the centroids # of whatever it is that we are throwing at it and uses them to decompose the boundaries of our data into squares, # with a certain user-specified minimum (nmin) and maximum (nmax) number of observations per square. This is known in # the literature as a QuadTree. An additional parameter, nsig, controls how many observations have to be made in a # square for that square to be considered significant (insignificant and empty squares are not colored in). The agg # parameter controls the method by which the observations are aggregated---in the default case np.mean is used, # in this case we have specified a maximum (np.max) instead. ax1 = gplt.aggplot(collisions, projection=gcrs.AlbersEqualArea(), hue='NUMBER OF PERSONS INJURED', agg=np.max, nmin=100, nmax=500, cmap='Reds', linewidth=0.5, edgecolor='white', ax=axarr[0]) ax1.set_title("No Geometry (Quadtree)") # In the second plot we provide more information than the first, by specifying a categorical column of data in the # dataset corresponding to some sort of encoded geography---in this example, the postal zip code. Aggplot computes the # geometries it needs itself, using a simple convex hull around the observations' point cloud. Albeit not elegant, # the resulting geometry is functional---and, again, spares us the task of having to find our own. ax2 = gplt.aggplot(collisions, projection=gcrs.AlbersEqualArea(), hue='NUMBER OF PERSONS INJURED', agg=np.max, by='ZIP CODE',
def test_aggplot(self):
    """Run ``aggplot`` over several ``hue`` forms, then over each ``by`` form without and with ``geometry``."""
    frame = dataframe_gaussian_points

    def by_variants():
        # A generator, so each variant is built only when the loop asks for it; the map
        # variant is a one-shot iterator and has to be recreated for every pass.
        yield 'mock_category'
        yield frame['mock_category']  # Series
        yield list(frame['mock_category'])  # List
        yield map(lambda v: v, list(frame['mock_category']))  # Map

    try:
        # hue given as a list, a Series, a map and a column name.
        gplt.aggplot(series_gaussian_points, hue=list_hue_values)
        gplt.aggplot(frame, hue=list_hue_values)
        gplt.aggplot(frame, hue=list_hue_values)
        gplt.aggplot(frame, hue=series_hue_values)
        gplt.aggplot(frame, hue=map_hue_values())
        gplt.aggplot(frame, hue='hue_var')

        # by alone.
        for by_value in by_variants():
            gplt.aggplot(frame, hue=list_hue_values, by=by_value)

        # by together with explicit geometries.
        for by_value in by_variants():
            gplt.aggplot(frame, hue=list_hue_values, by=by_value, geometry=aggplot_geometries)
    finally:
        # Release every figure opened above, even if a call raised.
        plt.close('all')
# Expose the current index as a regular column named 'n'.
manhattan = manhattan.reset_index()
manhattan = manhattan.rename(columns={'index': 'n'})


# Plot the data.

# This plot demonstrates an extremely useful trick. When used with a provided geometry, the aggplot plot type
# expects an iterable of geometries to be used for binning observations. The idea is that, in general, we have
# n observations and some smaller number k of locations containing them, and we will match observations within
# the same bin, average them in some way, and plot the result.
#
# Of course, what if n == k? In other words, what if every observation comes with its own location? In that case
# we can pass those locations to the ``geometry`` parameter and pass the data's index to the ``by`` parameter,
# and ``aggplot`` will plot all of our records one at a time!
#
# This is a nice feature to have, and very useful for a wide variety of datasets. In this case we are plotting
# building ages in Manhattan using data taken from MapPLUTO
# (http://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page).
#
# Note that this plot is for the purposes of example only: it contains 40,000 geometries (far more than
# palatable) and so takes a long time to render. To explore the data for real take a look at this all-NYC webmap:
# http://pureinformation.net/building-age-nyc/.
plot_options = {
    'projection': gcrs.PlateCarree(),
    'geometry': manhattan.geometry,
    'by': pd.Series(manhattan.index),  # one "bin" per record
    'hue': 'YearBuilt',
    'linewidth': 0,
}
ax = gplt.aggplot(manhattan, **plot_options)
ax.set_title("Buildings in Manhattan by Year Built")
plt.savefig("aggplot-singular.png", bbox_inches='tight', pad_inches=0.1)