def assert_results(result: DataFrame) -> None:
    """Shared asserts for the different formats of CSV file, all of which
    contain the same data.

    :param result: The dataframe produced by reading one of the CSV variants.
    """
    # Collect once: each .count()/.collect() call is a separate Spark action
    # that re-executes the job, so the original's repeated calls did the
    # same work four times over.
    rows = result.collect()
    assert len(rows) == 3
    assert rows[1][0] == "2"
    assert rows[1][1] == "bar"
    assert rows[1][2] == "bar2"
def insert_df_xy(df: DataFrame, name: str, ws: str = "memory", spatial_reference: int = 3857) -> None:
    """Create an ephemeral point feature class from the given dataframe.

    Note - it is assumed that the first two data fields are the point x/y values.

    :param df: A dataframe.
    :param name: The name of the feature class.
    :param ws: The feature class workspace.
    :param spatial_reference: The feature class spatial reference.
    """
    collected = df.collect()
    insert_rows_xy(collected, name, _df_to_fields(df, 2), ws, spatial_reference)
def naaccr_read_fwf(flat_file: DataFrame, record_layout: DataFrame,
                    value_col: str = 'value',
                    exclude_pfx: str = 'reserved') -> DataFrame:
    """Slice a fixed-width flat file into one named column per layout field.

    @param flat_file: as from spark.read.text() typically with .value
    @param record_layout: as from http://datadictionary.naaccr.org/?c=7
                          with .start, .length, .xmlId
    """
    columns = []  # type: List[Union[Column, str]]
    for item in record_layout.collect():
        # Skip filler fields (e.g. names prefixed with 'reserved').
        if item.xmlId.startswith(exclude_pfx):
            continue
        column = func.substring(flat_file[value_col], item.start, item.length)
        columns.append(column.alias(item.xmlId))
    return flat_file.select(columns)
def insert_df_hex(df: DataFrame, name: str, size: float, ws: str = "memory") -> None:
    """Create an ephemeral polygon feature class from the given dataframe.

    Note - it is assumed that the first field is the hex nume value.

    :param df: A dataframe.
    :param name: The name of the feature class.
    :param size: The hex size in meters.
    :param ws: The feature class workspace.
    """
    records = df.collect()
    layout = Layout(size)
    with insert_cursor(name, _df_to_fields(df, 1), ws=ws, shape_format="") as cursor:
        for record in records:
            # First field is the hex nume; the rest pass through unchanged.
            coords = Hex.from_nume(record[0]).to_coords(layout)
            cursor.insertRow((coords, *record[1:]))
def insert_df(df: DataFrame,
              name: str,
              ws: str = "memory",
              spatial_reference: int = 3857,
              shape_type: str = "POLYGON",
              shape_format: str = "WKB") -> None:
    """Create an ephemeral feature class given a dataframe.

    Note - it is assumed that the first data field is the shape field.

    :param df: A dataframe.
    :param name: The name of the feature class.
    :param ws: The output workspace. Default="memory".
    :param spatial_reference: The spatial reference id. Default=3857.
    :param shape_type: The feature class shape type (POINT,POLYGON,POLYLINE,MULTIPOINT). Default="POLYGON".
    :param shape_format: The shape format (WKB, WKT, ''). Default="WKB".
    """
    # Docstring previously claimed Default=2857 for spatial_reference; the
    # actual default is 3857 (web mercator), matching the signature.
    fields = _df_to_fields(df, 1)
    rows = df.collect()
    insert_rows(rows, name, fields, ws, spatial_reference, shape_type, shape_format)
def synthesize_data(self, stats_nom: DataFrame, record_layout: DataFrame,
                    qty: int = 100) -> pd.DataFrame:
    """Build a synthetic wide table of nominal NAACCR records.

    Registers the concept/transform views, publishes ``stats_nom`` and a
    ``qty``-row entity table as temp views, materializes the simulation
    views, then pivots the simulated (case_index, xmlId, value) rows into
    one categorical column per xmlId, ordered by each field's start
    position in ``record_layout``.
    """
    spark = self.__spark
    create_object(self.t_item_view, self.concepts_script, spark)
    create_object(self.txform_view, self.txform_script, spark)
    stats_nom.createOrReplaceTempView(self.agg_view)

    cases = spark.createDataFrame([(ix, ) for ix in range(0, qty)], ['case_index'])
    cases.createOrReplaceTempView(self.entity_view)
    # simulated_entity.limit(5).toPandas()

    for view in self.views:
        create_object(view, self.script, spark)
    spark.catalog.cacheTable(self.views[-1])

    # ISSUE: SQL goes in .sql files
    tall = spark.sql('''
    select data.case_index, data.xmlId, data.value
    from simulated_naaccr_nom data
    join record_layout rl on rl.xmlId = data.xmlId
    join section on rl.section = section.section
    order by case_index, rl.start
    ''').toPandas()

    wide = tall.pivot(index='case_index', columns='xmlId', values='value')
    for column in wide.columns:
        wide[column] = wide[column].astype('category')

    # Order the columns by layout start position.
    start_by_id = {row.xmlId: row.start for row in record_layout.collect()}
    return wide[sorted(wide.columns, key=lambda xid: start_by_id[xid])]