def prepare_stats_data(route_name=None, weather_category=None, update=False, verbose=True):
    """
    Prepare data of statistics.

    :param route_name: name of a Route, defaults to ``None``
    :type route_name: str or None
    :param weather_category: weather category to which an incident is attributed,
        defaults to ``None``
    :type weather_category: str or None
    :param update: whether to retrieve the source data (in case it has been updated),
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information to the console, defaults to ``True``
    :type verbose: bool

    **Example**::

        route_name = None
        weather_category = None
        update = False
        verbose = True

        prepare_stats_data(route_name, weather_category, update, verbose)
    """

    metex = METExLite()

    # Get data of Schedule 8 incident locations
    incident_locations = metex.view_schedule8_costs_by_location(
        route_name=route_name, weather_category=weather_category, update=update,
        verbose=verbose)

    # Find the "midpoint" of each incident location
    incident_locations = find_midpoint_of_each_incident_location(incident_locations)

    # Split the data by "region"
    for region in incident_locations.Region.unique():
        region_data = incident_locations[incident_locations.Region == region].copy()

        # Sort data by (frequency of incident occurrences, delay minutes, delay cost)
        sort_by_cols = ['WeatherCategory', 'IncidentCount', 'DelayMinutes', 'DelayCost']
        region_data.sort_values(sort_by_cols, ascending=False, inplace=True)
        region_data.index = range(len(region_data))

        export_path = cdd_exploration("NC", "01", region.replace(" ", "-").lower() + ".csv")

        save(region_data, export_path, verbose=verbose)

    print("\nCompleted.")
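
# Note: find_midpoint_of_each_incident_location() is defined elsewhere in the
# project. Below is only a minimal sketch of the behaviour the code above relies
# on, assuming the location data carries start/end coordinate columns; the column
# names used here are assumptions for illustration, not the project's actual API.
def _sketch_find_midpoint_of_each_incident_location(incident_locations):
    dat = incident_locations.copy()
    # Midpoint as the arithmetic mean of each section's start/end coordinates
    dat['MidLongitude'] = (dat['StartLongitude'] + dat['EndLongitude']) / 2
    dat['MidLatitude'] = (dat['StartLatitude'] + dat['EndLatitude']) / 2
    return dat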
def read_table_by_name(database_name, table_name, schema_name='dbo', col_names=None,
                       chunk_size=None, index_col=None, save_as=None, data_dir=None,
                       **kwargs):
    """
    Get all data from a given table in a specific database.

    :param database_name: name of a database
    :type database_name: str
    :param table_name: name of a queried table from the given database
    :type table_name: str
    :param schema_name: name of a schema, defaults to ``'dbo'``
    :type schema_name: str
    :param col_names: column names to retrieve; ``None`` (default) for all columns
    :type col_names: list or None
    :param chunk_size: number of rows to include in each chunk, defaults to ``None``
    :type chunk_size: int or None
    :param index_col: column(s) to set as index, defaults to ``None``
    :type index_col: str or list or None
    :param save_as: file extension for saving the data, defaults to ``None``
    :type save_as: str or None
    :param data_dir: directory where the data is saved, defaults to ``None``
    :type data_dir: str or None
    :param kwargs: optional parameters of `pandas.read_sql_table <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql_table.html>`_
    :return: data of the queried table
    :rtype: pandas.DataFrame
    """

    # Connect to the queried database
    db_conn = establish_mssql_connection(database_name)

    # Create a pandas.DataFrame of the queried table
    table_data = pd.read_sql_table(
        table_name=table_name, con=db_conn, schema=schema_name, columns=col_names,
        index_col=index_col, chunksize=chunk_size, **kwargs)

    # Disconnect from the database
    db_conn.close()

    if save_as:
        path_to_file = os.path.join(
            os.path.realpath(data_dir if data_dir else ''), table_name + save_as)
        save(table_data, path_to_file)

    # Return the data frame of the queried table
    return table_data
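
# A hedged usage sketch of read_table_by_name(); the database and table names
# below are assumptions for illustration, not names the project necessarily defines.
def _demo_read_table_by_name():
    imdm = read_table_by_name(
        database_name='NR_METEX_20190203',  # assumed database name
        table_name='Imdm',                  # assumed table name
        save_as='.pickle',                  # pickle the result after reading
        data_dir='data')                    # save under ./data
    print(imdm.head())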
def save_table_by_chunk(database_name, table_name, schema_name='dbo', col_names=None,
                        index_col=None, chunk_size=1000000, save_as=".pickle",
                        data_dir=None, **kwargs):
    """
    Save a table chunk-wise from a database.

    :param database_name: name of a database to query
    :type database_name: str
    :param table_name: name of a queried table from the given database
    :type table_name: str
    :param schema_name: name of a schema, defaults to ``'dbo'``
    :type schema_name: str
    :param col_names: e.g. a list of column names to retrieve; ``None`` (default) for all columns
    :type col_names: list or None
    :param index_col: column(s) to set as index, defaults to ``None``
    :type index_col: str or list or None
    :param chunk_size: number of rows to include in each chunk, defaults to ``1000000``
    :type chunk_size: int or None
    :param save_as: file extension of the saved chunks, defaults to ``".pickle"``
    :type save_as: str
    :param data_dir: directory where the data chunks are saved, defaults to ``None``
    :type data_dir: str or None
    :param kwargs: optional parameters of `pandas.read_sql <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html>`_
    """

    assert isinstance(save_as, str) and save_as in (".pickle", ".csv", ".xlsx", ".txt")
    if col_names is not None:
        assert isinstance(col_names, (list, tuple)) and all(isinstance(x, str) for x in col_names)
    if data_dir:
        assert isinstance(data_dir, str)

    # Connect to the queried database
    assert isinstance(database_name, str)
    db_conn = establish_mssql_connection(database_name)

    # Check if there is any column of 'geometry' type
    assert isinstance(table_name, str)
    sql_query_geom_col = \
        "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS " \
        "WHERE TABLE_NAME='{}' AND DATA_TYPE='geometry'".format(table_name)
    geom_col_res = db_conn.execute(sql_query_geom_col).fetchall()
    geom_col_names = list(itertools.chain.from_iterable(geom_col_res)) if geom_col_res else []

    # Get a list of column names, excluding the 'geometry' ones if they exist
    table_col_names = [x for x in get_table_column_names(database_name, table_name)
                       if x not in geom_col_names]

    # Specify a SQL query - read all the selected non-geometry columns
    selected_col_names = [x for x in col_names if x not in geom_col_names] if col_names \
        else table_col_names
    sql_query = 'SELECT {} FROM {}."{}"'.format(
        ', '.join('"' + tbl_col_name + '"' for tbl_col_name in selected_col_names),
        schema_name, table_name)

    dat_dir = os.path.realpath(data_dir if data_dir else 'temp_dat')

    if not geom_col_names:
        # Read the queried table into chunks of pandas.DataFrame and save each chunk
        table_data = pd.read_sql(sql=sql_query, con=db_conn, columns=col_names,
                                 index_col=index_col, chunksize=chunk_size, **kwargs)
        for tbl_id, tbl_dat in enumerate(table_data):
            path_to_file = os.path.join(
                dat_dir, table_name + "_{}".format(tbl_id + 1) + save_as)
            save(tbl_dat, path_to_file, sheet_name="Sheet_{}".format(tbl_id + 1))

    else:
        # Read the non-geometry columns into chunks of pandas.DataFrame
        table_data = pd.read_sql(sql=sql_query, con=db_conn, columns=col_names,
                                 index_col=index_col, chunksize=chunk_size, **kwargs)
        tbl_chunks = [tbl_dat for tbl_dat in table_data]

        # Read the geometry column(s) as well-known text (WKT)
        if len(geom_col_names) == 1:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                '"{}".STAsText()'.format(geom_col_names[0]), schema_name, table_name)
        else:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                ', '.join('"' + x + '".STAsText()' for x in geom_col_names),
                schema_name, table_name)

        geom_data = pd.read_sql(geom_sql_query, db_conn, chunksize=chunk_size, **kwargs)
        geom_chunks = [geom_dat.applymap(shapely.wkt.loads) for geom_dat in geom_data]

        # Join each data chunk with its geometry chunk and save the result
        counter = 0
        # noinspection PyTypeChecker
        for tbl_dat, geom_dat in zip(tbl_chunks, geom_chunks):
            path_to_file = os.path.join(
                dat_dir, table_name + "_{}".format(counter + 1) + save_as)
            save(tbl_dat.join(geom_dat), path_to_file,
                 sheet_name="Sheet_{}".format(counter + 1))
            counter += 1

    # Disconnect from the database
    db_conn.close()
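
# A hedged usage sketch of save_table_by_chunk(); the database and table names
# below are assumptions. With chunk_size=500000, a table of ~1.2m rows would be
# written out as three numbered files under ./data, e.g. "StationLocation_1.pickle".
def _demo_save_table_by_chunk():
    save_table_by_chunk(
        database_name='NR_METEX_20190203',  # assumed database name
        table_name='StationLocation',       # assumed table name
        chunk_size=500000, save_as='.pickle', data_dir='data')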
def read_table_by_query(database_name, table_name, schema_name='dbo', col_names=None,
                        index_col=None, chunk_size=None, save_as=None, data_dir=None,
                        **kwargs):
    """
    Get data from a table in a specific database by SQL query.

    :param database_name: name of a database
    :type database_name: str
    :param table_name: name of a queried table from the given database
    :type table_name: str
    :param schema_name: name of a schema, defaults to ``'dbo'``
    :type schema_name: str
    :param col_names: column names to retrieve; ``None`` (default) for all columns
    :type col_names: list or None
    :param index_col: column(s) to set as index, defaults to ``None``
    :type index_col: str or list or None
    :param chunk_size: number of rows to include in each chunk, defaults to ``None``
    :type chunk_size: int or None
    :param save_as: file extension for saving the data, defaults to ``None``
    :type save_as: str or None
    :param data_dir: directory where the data is saved, defaults to ``None``
    :type data_dir: str or None
    :param kwargs: optional parameters of `pandas.read_sql <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html>`_
    :return: data of the queried table
    :rtype: pandas.DataFrame
    """

    if col_names is not None:
        assert isinstance(col_names, (list, tuple)) and all(isinstance(x, str) for x in col_names)
    if save_as:
        assert isinstance(save_as, str) and save_as in (".pickle", ".csv", ".xlsx", ".txt")
    if data_dir:
        assert isinstance(data_dir, str)

    # Connect to the queried database
    assert isinstance(database_name, str)
    db_conn = establish_mssql_connection(database_name)

    # Check if there is any column of 'geometry' type
    assert isinstance(table_name, str)
    sql_query_geom_col = \
        "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS " \
        "WHERE TABLE_NAME='{}' AND DATA_TYPE='geometry'".format(table_name)
    geom_col_res = db_conn.execute(sql_query_geom_col).fetchall()
    geom_col_names = list(itertools.chain.from_iterable(geom_col_res)) if geom_col_res else []

    # Get a list of column names, excluding the 'geometry' ones if they exist
    table_col_names = [x for x in get_table_column_names(database_name, table_name)
                       if x not in geom_col_names]

    # Specify a SQL query - read all non-geometry columns
    selected_col_names = [x for x in col_names if x not in geom_col_names] if col_names \
        else table_col_names
    sql_query = 'SELECT {} FROM {}."{}"'.format(
        ', '.join('"' + tbl_col_name + '"' for tbl_col_name in selected_col_names),
        schema_name, table_name)

    # Read the queried table into a pandas.DataFrame
    table_data = pd.read_sql(sql=sql_query, con=db_conn, columns=col_names,
                             index_col=index_col, chunksize=chunk_size, **kwargs)
    if chunk_size:
        table_data = pd.concat([pd.DataFrame(tbl_dat) for tbl_dat in table_data],
                               ignore_index=True)

    # Read the geometry column(s), if any, as well-known text (WKT)
    if geom_col_names:
        if len(geom_col_names) == 1:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                '"{}".STAsText()'.format(geom_col_names[0]), schema_name, table_name)
        else:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                ', '.join('"' + x + '".STAsText()' for x in geom_col_names),
                schema_name, table_name)

        # Read the geometry data into a pandas.DataFrame
        geom_data = pd.read_sql(geom_sql_query, db_conn, chunksize=chunk_size, **kwargs)
        if chunk_size:
            geom_data = pd.concat([pd.DataFrame(geom_dat) for geom_dat in geom_data],
                                  ignore_index=True)

        # Parse the WKT strings into geometry objects and join them back on
        geom_data = geom_data.applymap(shapely.wkt.loads)
        geom_data.columns = geom_col_names
        table_data = table_data.join(geom_data)

    # Disconnect from the database
    db_conn.close()

    if save_as:
        path_to_file = os.path.join(
            os.path.realpath(data_dir if data_dir else ''), table_name + save_as)
        save(table_data, path_to_file)

    return table_data
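
# The geometry round trip above works because SQL Server's STAsText() serialises
# each geometry value to well-known text (WKT), which shapely.wkt.loads parses
# back into a geometry object. A minimal standalone demonstration:
def _demo_wkt_round_trip():
    point = shapely.wkt.loads('POINT (530034 180381)')
    print(point.x, point.y)  # -> 530034.0 180381.0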
def prepare_monthly_stats_data(route_name=None, weather_category=None, update=False,
                               verbose=True):
    """
    Prepare data of monthly statistics.

    :param route_name: name of a Route, defaults to ``None``
    :type route_name: str or None
    :param weather_category: weather category to which an incident is attributed,
        defaults to ``None``
    :type weather_category: str or None
    :param update: whether to retrieve the source data (in case it has been updated),
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information to the console, defaults to ``True``
    :type verbose: bool

    **Example**::

        route_name = None
        weather_category = None
        update = False
        verbose = True

        prepare_monthly_stats_data(route_name, weather_category, update, verbose)
    """

    metex = METExLite()

    # Get data of Schedule 8 incidents by datetime and location
    dat = metex.view_schedule8_costs_by_datetime_location(
        route_name=route_name, weather_category=weather_category, update=update,
        verbose=verbose)

    print("Cleaning data ... ", end="")

    # dat = metex.view_schedule8_data()
    dat.insert(dat.columns.get_loc('EndDateTime') + 1, 'StartYear', dat.StartDateTime.dt.year)
    dat.insert(dat.columns.get_loc('StartYear') + 1, 'StartMonth', dat.StartDateTime.dt.month)

    # Aggregate the data: count incident records and sum delay minutes/costs per group
    stats_calc = {
        'IncidentCount': np.count_nonzero,
        'DelayMinutes': np.sum,
        'DelayCost': np.sum,
    }
    stats = dat.groupby(list(dat.columns[3:-3])).aggregate(stats_calc)
    stats.reset_index(inplace=True)

    # Find the "midpoint" of each incident location
    data = find_midpoint_of_each_incident_location(stats)

    print("Done.\n")

    sort_by_cols = ['WeatherCategory', 'IncidentCount', 'DelayMinutes', 'DelayCost']

    print("Processing monthly statistics ... ")
    for m in data.StartMonth.unique():
        m_ = str(m).zfill(2)  # zero-pad single-digit months, e.g. 3 -> "03"
        print(" \"{}\" ... ".format(m_), end="")
        dat1 = data[data.StartMonth == m].copy()
        if not dat1.empty:
            dat1.sort_values(sort_by_cols, ascending=False, na_position='last', inplace=True)
            dat1.index = range(len(dat1))
            export_path = cdd_exploration("NC", "02", "GB", "Month", "{}.csv".format(m_))
            save(dat1, export_path, sep=',', verbose=False)
        print("Done.")
    print("Completed.\n")

    print("Processing monthly statistics of GB ... ")
    years, months = data.StartYear.unique(), data.StartMonth.unique()
    for y, m in itertools.product(years, months):
        period = "{}_{}".format(y, str(m).zfill(2))
        print(" \"{}\" ... ".format(period), end="")
        dat2 = data[(data.StartYear == y) & (data.StartMonth == m)].copy()
        if not dat2.empty:
            dat2.sort_values(sort_by_cols, ascending=False, na_position='last', inplace=True)
            dat2.index = range(len(dat2))
            export_path = cdd_exploration("NC", "02", "GB", "Year_Month", "{}.csv".format(period))
            save(dat2, export_path, sep=',', verbose=False)
        print("Done.")
    print("Completed.\n")

    # Split the data by "region"
    print("Processing monthly statistics for each region ... ")
    for region in data.Region.unique():
        region_data = data[data.Region == region]
        years, months = region_data.StartYear.unique(), region_data.StartMonth.unique()
        print(" \"{}\" ... ".format(region), end="")
        for y, m in itertools.product(years, months):
            dat_ = region_data[
                (region_data.StartYear == y) & (region_data.StartMonth == m)].copy()
            if not dat_.empty:
                dat_.sort_values(sort_by_cols, ascending=False, na_position='last', inplace=True)
                dat_.index = range(len(dat_))
                export_path = cdd_exploration(
                    "NC", "02", "Region", region.replace(" ", "-").lower(),
                    "{}_{}.csv".format(y, str(m).zfill(2)))
                save(dat_, export_path, sep=',', verbose=False)
        print("Done.")
    print("Completed.\n")
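
# The aggregation step in prepare_monthly_stats_data() counts incident records and
# sums delay minutes/costs per group. A self-contained toy example of the same
# groupby-aggregate pattern (the data values are made up for illustration):
def _demo_groupby_aggregate():
    toy = pd.DataFrame({
        'Region': ['Scotland', 'Scotland', 'Wales'],
        'IncidentCount': [1, 1, 1],
        'DelayMinutes': [12.0, 30.5, 7.0],
        'DelayCost': [100.0, 250.0, 60.0],
    })
    stats = toy.groupby('Region').aggregate({
        'IncidentCount': np.count_nonzero,  # number of incident records per group
        'DelayMinutes': np.sum,             # total delay minutes per group
        'DelayCost': np.sum,                # total delay cost per group
    })
    print(stats)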