Example #1
def get_parcels_to_spark(parcels_filepath='data/EXTR_Parcel.csv'):
    # Commented out so that the initial SparkSession (`spark`), defined elsewhere, is reused
    # spark = SparkSession\
    #     .builder\
    #     .master('local[4]')\
    #     .appName("Get_Parcel_Data")\
    #     .config("spark.master", "local")\
    #     .getOrCreate()

    # Read the pre-cleaned Pandas DataFrame into a Spark DataFrame
    parcel_pd = get_parcels(parcels_filepath)
    parcel = spark.createDataFrame(parcel_pd)

    # Normalize numerical data
    numerical_cols = [
        'PcntUnusable',
        'WfntFootage',
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='num_features')
    parcel = numerical_assembler.transform(parcel)

    parcel = StandardScaler(
        inputCol='num_features',
        outputCol='num_features_std').fit(parcel).transform(parcel)

    # Build index and dummy_vector column names for the categorical columns; the categorical and index columns are dropped later
    cat_cols = [
        'Range',
        'Township',
        'Section',
        'QuarterSection',
        'Area',
        'SubArea',
        'LevyCode',
        'CurrentZoning',
        'PresentUse',
        'SqFtLot',
        'WaterSystem',
        'SewerSystem',
        'Access',
        'Topography',
        'StreetSurface',
        'InadequateParking',
        'MtRainier',
        'Olympics',
        'Cascades',
        'Territorial',
        'SeattleSkyline',
        'PugetSound',
        'LakeWashington',
        'SmallLakeRiverCreek',
        'OtherView',
        'WfntLocation',
        'WfntBank',
        'WfntPoorQuality',
        'WfntRestrictedAccess',
        'WfntAccessRights',
        'TidelandShoreland',
        'LotDepthFactor',
        'TrafficNoise',
        'NbrBldgSites',
        'Contamination',
    ]

    cat_index = []
    dummies = []
    for col in cat_cols:
        cat_index.append(col + '_index')
        dummies.append(col + '_dummy_vector')

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(parcel)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    parcel = cat_pipeline.fit(parcel).transform(parcel)

    # Encode dummy_vector columns from the categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(parcel)
    parcel = model.transform(parcel)

    # Drop categorical and index columns
    parcel = parcel.drop(*cat_cols)
    parcel = parcel.drop(*cat_index)
    parcel = parcel.drop(*numerical_cols)

    # Combine all features into single vector
    ignore = ['PIN']
    assembler = VectorAssembler(
        inputCols=[col for col in parcel.columns if col not in ignore],
        outputCol='parcel_features')
    parcel = assembler.transform(parcel)

    # Drop all columns that are now in the features column
    ignore.append('parcel_features')
    parcel = parcel.drop(*[col for col in parcel.columns if col not in ignore])

    # Optionally write to parquet so the features can be reloaded later
    # parcel.write.parquet('data/parcel_parquet', mode='overwrite')

    return parcel
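A minimal sketch of the module-level setup this example appears to assume: a shared `spark` session and the PySpark ML imports used inside the function (Spark 2.x, where OneHotEncoderEstimator is available). The helper `get_parcels`, which returns the pre-cleaned Pandas DataFrame, belongs to the original project and is not reproduced here.

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (VectorAssembler, StandardScaler,
                                StringIndexer, OneHotEncoderEstimator)

# Shared SparkSession reused by get_parcels_to_spark
spark = SparkSession.builder \
    .master('local[4]') \
    .appName('Get_Parcel_Data') \
    .getOrCreate()

# Example call: returns a Spark DataFrame with 'PIN' plus a single
# 'parcel_features' vector column
parcel_df = get_parcels_to_spark('data/EXTR_Parcel.csv')
parcel_df.printSchema()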
Example #2
def gis_data_to_spark(
    numFolds,
    gis_filepath='data/Parcels_for_King_County_with_Address_with_Property_Information__parcel_address_area.csv'
):

    # Read the pre-cleaned Pandas DataFrame into a Spark DataFrame
    gis_pd = get_gis_data(gis_filepath)

    gis_pd['fold'] = np.random.randint(0, numFolds, gis_pd.shape[0])
    gis = spark.createDataFrame(gis_pd)

    # Normalize numerical data
    numerical_cols = [
        'LAT', 'LON', 'LOTSQFT', 'APPRLNDVAL', 'APPR_IMPR', 'TAX_LNDVAL',
        'TAX_IMPR', 'Shape_Length', 'Shape_Area', 'value_per_area',
        'improvement_over_land'
    ]
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='num_features')
    gis = numerical_assembler.transform(gis)

    gis = StandardScaler(inputCol='num_features',
                         outputCol='num_features_std').fit(gis).transform(gis)

    # Build index and dummy_vector column names for the categorical columns; the categorical and index columns are dropped later
    cat_cols = [
        'KCTP_STATE', 'SITETYPE', 'LEVYCODE', 'NEW_CONSTR', 'TAXVAL_RSN',
        'QTS', 'SEC', 'TWP', 'RNG', 'KCA_ZONING', 'PROPTYPE', 'PREUSE_DESC'
    ]
    cat_index = []
    dummies = []
    for col in cat_cols:
        cat_index.append(col + '_index')
        dummies.append(col + '_dummy_vector')

    # Create and populate categorical index columns
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(gis)
        for column in cat_cols
    ]
    cat_pipeline = Pipeline(stages=indexers)
    gis = cat_pipeline.fit(gis).transform(gis)

    # Encode dummy_vector columns from the categorical indices
    encoder = OneHotEncoderEstimator(inputCols=cat_index, outputCols=dummies)
    model = encoder.fit(gis)
    gis = model.transform(gis)

    # Drop categorical and index columns
    gis = gis.drop(*cat_cols)
    gis = gis.drop(*cat_index)
    gis = gis.drop(*numerical_cols)

    # Combine all features into single vector
    ignore = ['PIN', 'MAJOR', 'MINOR', 'ADDR_FULL', 'TARGET', 'fold']
    assembler = VectorAssembler(
        inputCols=[col for col in gis.columns if col not in ignore],
        outputCol='gis_features')
    gis = assembler.transform(gis)

    # Drop all columns that are now in the features column
    ignore.append('gis_features')
    gis = gis.drop(*[col for col in gis.columns if col not in ignore])

    # Optionally write to parquet so the features can be reloaded later
    # gis.write.parquet('data/gis_parquet',mode='overwrite')

    return gis
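A minimal sketch of how this function and its 'fold' column might be used, assuming the same module-level `spark` session and imports as above plus numpy (the function calls np.random.randint); `get_gis_data` is another project helper not shown here. Holding out one fold for evaluation is an assumption about the intended use of numFolds, not something stated in the source.

import numpy as np

# Example call: assign each row to one of 5 random folds
gis_df = gis_data_to_spark(numFolds=5)

# Manual cross-validation-style split on the 'fold' column (assumed usage):
# hold out fold 0, train on the remaining folds
test_df = gis_df.filter(gis_df.fold == 0)
train_df = gis_df.filter(gis_df.fold != 0)
train_df.groupBy('fold').count().show()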