def _convert_vector(df, dtype):
    from pyspark.ml.functions import vector_to_array
    from pyspark.ml.linalg import VectorUDT
    from pyspark.mllib.linalg import VectorUDT as OldVectorUDT

    # Cast every ml/mllib vector column to array<dtype>; all other columns pass through.
    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, (VectorUDT, OldVectorUDT)):
            df = df.withColumn(col_name, vector_to_array(df[col_name], dtype))
    return df
Example 2
def cast_spark_df_with_vector_to_array(input_spark_df):
    """
    Finds columns of vector type in a spark dataframe and
    casts them to array<double> type.

    :param input_spark_df: the input spark dataframe
    :return: a spark dataframe with vector columns transformed to array<double> type
    """
    from functools import reduce

    from pyspark.ml.functions import vector_to_array
    from pyspark.ml.linalg import VectorUDT

    vector_type_columns = [
        _field.name for _field in input_spark_df.schema
        if isinstance(_field.dataType, VectorUDT)
    ]
    return reduce(
        lambda df, vector_col: df.withColumn(vector_col,
                                             vector_to_array(vector_col)),
        vector_type_columns,
        input_spark_df,
    )
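
# A minimal usage sketch (not part of the original snippet): build a toy
# DataFrame with a VectorUDT column and cast it with the helper above; the
# resulting schema shows `features` as array<double>.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo_df = spark.createDataFrame(
    [(1, Vectors.dense([0.5, 1.5])), (2, Vectors.dense([2.0, 3.0]))],
    ["id", "features"],
)
cast_spark_df_with_vector_to_array(demo_df).printSchema()  # features: array<double>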
Example 3
def _convert_vector(df, precision):
    from pyspark.ml.linalg import VectorUDT
    from pyspark.mllib.linalg import VectorUDT as OldVectorUDT

    # Skip the conversion entirely if the dataframe has no vector columns.
    found_vectors = any(
        isinstance(struct_field.dataType, (VectorUDT, OldVectorUDT))
        for struct_field in df.schema
    )
    if not found_vectors:
        return df

    import pyspark
    from distutils.version import LooseVersion
    if LooseVersion(pyspark.__version__) < LooseVersion('3.0'):
        raise ValueError("Vector columns are not supported for pyspark<3.0.0.")
    # pylint: disable=import-error
    from pyspark.ml.functions import vector_to_array
    # pylint: enable=import-error

    for struct_field in df.schema:
        col_name = struct_field.name
        if isinstance(struct_field.dataType, (VectorUDT, OldVectorUDT)):
            df = df.withColumn(col_name,
                               vector_to_array(df[col_name], precision))
    return df
Example 4
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import col
from pyspark.sql.types import (ArrayType, DoubleType, FloatType, IntegerType,
                               LongType, ShortType)


def _validate_and_convert_feature_col_as_array_col(dataset, features_col_name):
    features_col_datatype = dataset.schema[features_col_name].dataType
    features_col = col(features_col_name)
    if isinstance(features_col_datatype, ArrayType):
        if not isinstance(
            features_col_datatype.elementType,
            (DoubleType, FloatType, LongType, IntegerType, ShortType),
        ):
            raise ValueError(
                "If feature column is array type, its elements must be number type."
            )
        features_array_col = features_col.cast(ArrayType(
            FloatType())).alias("values")
    elif isinstance(features_col_datatype, VectorUDT):
        features_array_col = vector_to_array(features_col,
                                             dtype="float32").alias("values")
    else:
        raise ValueError(
            "feature column must be array type or `pyspark.ml.linalg.Vector` type, "
            "if you want to use multiple numeric columns as features, please use "
            "`pyspark.ml.feature.VectorAssembler` to assemble them into a vector "
            "type column first.")
    return features_array_col
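
# A minimal usage sketch (not part of the original snippet): several numeric
# columns are assembled into one vector column with VectorAssembler, and the
# validator above converts it to an array<float> column aliased "values".
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([(1.0, 2.0, 0), (3.0, 4.0, 1)], ["x1", "x2", "label"])
assembled = VectorAssembler(inputCols=["x1", "x2"], outputCol="features").transform(raw)
values_col = _validate_and_convert_feature_col_as_array_col(assembled, "features")
assembled.select("label", values_col).show()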
Example 5
from pyspark.ml.feature import CountVectorizer, PCA, VectorAssembler
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import VectorUDT, _convert_to_vector  # _convert_to_vector is a private pyspark helper
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, BooleanType, IntegerType, StringType
from scipy.sparse import csc_matrix
import numpy as np


def process_bus_data(bus_df):
    """ Method to process raw business data from Yelp."""
    def select_eligible_bus(row):
        """ Select businesses which fall into selected categories."""

        # `categories` is a module-level collection of category names defined elsewhere.
        global categories
        try:
            # Return true if business falls into category list, else false.
            row_cats = row.split(',')
            for cat in row_cats:
                if cat.strip() in categories:
                    return True
            return False
        except (TypeError, AttributeError):
            # Returns false if business has no defined categories.
            return False

    def unpack_bus_attributes(row):
        """ Unpacks business attributes and encodes each as a numeric value."""

        # List to store encoded business attributes.
        unpacked = list()
        # Unpack the pre-selected attributes (PriceRange, Parking and WiFi are handled separately).
        temp = [row[s] for s in bus_attributes]

        # Process PriceRange
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range specified - default=2
            priceRange = 2

        # Process Parking
        try:
            parking = 1 if row["BusinessParking"].find("True") != -1 else -1
        except AttributeError:
            parking = 0

        # Process WiFi
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] == None:
            wifi = 0
        else:
            wifi = 1

        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)
        # Append the WiFi, Parking and PriceRange attributes
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)

        # Print any arrays that are not of desired length (=30).
        if len(unpacked) != 30:
            print(unpacked)
        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)

    def unpack_bus_categories(row):
        """Unpacks all business cattegories."""

        # List to store business categories.
        unpacked = list()
        # Split the raw category string and strip whitespace from each name.
        for cat in row.split(','):
            unpacked.append(cat.strip())
        return unpacked

    def unpack_price_range(row):
        """ Returns price range."""
        return int(row[-1])

    # Package the functions above into Spark SQL user-defined functions
    udf_select_eligible_bus = udf(select_eligible_bus, BooleanType())
    udf_unpack_bus_attributes = udf(unpack_bus_attributes, VectorUDT())
    udf_unpack_bus_categories = udf(unpack_bus_categories,
                                    ArrayType(StringType()))
    udf_unpack_price_range = udf(unpack_price_range, IntegerType())

    # Find businesses to include.
    eligible_bus = bus_df.withColumn("include", udf_select_eligible_bus(col("categories"))) \
        .filter(col("include") == True)

    # Process business attributes feature.
    all_bus_attributes = set(
        bus_df.select("attributes").take(1)[0].attributes.asDict().keys())
    bus_attributes_to_exclude = {
        'AcceptsInsurance', 'AgesAllowed', 'ByAppointmentOnly', 'Caters',
        'Corkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours',
        'RestaurantsAttire', 'RestaurantsPriceRange2', 'BusinessParking',
        'WiFi'
    }
    bus_attributes = list(all_bus_attributes - bus_attributes_to_exclude)
    bus_attributes.sort()
    eligible_attr = eligible_bus.withColumn(
        "unpackedAttr", udf_unpack_bus_attributes(col("attributes")))

    # Process business categories feature.
    eligible_cats = eligible_attr.withColumn(
        "unpackedCats", udf_unpack_bus_categories(col("categories")))
    cv = CountVectorizer(inputCol="unpackedCats", outputCol="vectorizedCats")
    vectorized_cats = cv.fit(eligible_cats).transform(eligible_cats)

    # Un-bundle price range from all other attributes.
    unpacked_pr = vectorized_cats.withColumn(
        "priceRange", udf_unpack_price_range(col("unpackedAttr")))
    unpacked_pr.take(1)

    # Reduce dimensions of attributes and categories features, respectively.
    pca_attr = PCA(k=3, inputCol="unpackedAttr",
                   outputCol="pcaAttr").fit(unpacked_pr)
    temp = pca_attr.transform(unpacked_pr)
    temp.show()
    pca_cats = PCA(k=1, inputCol="vectorizedCats",
                   outputCol="pcaCats").fit(temp)
    temp2 = pca_cats.transform(temp)
    temp2.show()

    # Assemble into final feature vector.
    va = VectorAssembler(
        inputCols=["stars", "priceRange", "pcaAttr", "pcaCats"],
        outputCol="featureVec")
    features = va.transform(temp2).select("business_id", "stars", "categories",
                                          "featureVec")
    features.take(1)

    # Unpack the assembled feature vector into individual scalar columns.
    n_features = len(features.select("featureVec").take(1)[0].featureVec)
    final = features.withColumn("f", vector_to_array(col("featureVec"))) \
        .select(["business_id", "stars", "categories"] + [col("f")[i] for i in range(n_features)])

    return final, n_features
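
# A standalone sketch (not part of the original pipeline) of the flattening
# pattern used above: vector_to_array() turns the vector column into an array,
# and indexing the array splits it into one scalar column per element.
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([("b1", Vectors.dense([0.1, 0.2, 0.3]))],
                            ["business_id", "featureVec"])
n = len(toy.select("featureVec").take(1)[0].featureVec)
toy.withColumn("f", vector_to_array(col("featureVec"))) \
    .select(["business_id"] + [col("f")[i] for i in range(n)]) \
    .show()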
Example 6
#     url=db_url,
#     table="text_data_train_nm_tfidf",
#     mode="overwrite",
#     properties=db_properties,
# )
# test_finished.write.jdbc(
#     url=db_url,
#     table="text_data_test_nm_tfidf",
#     mode="overwrite",
#     properties=db_properties,
# )

# SVM

# Convert the SVM rawPrediction vector into an array so its elements can be indexed in SQL.
train_pred = train_pred.withColumn("rawPrediction",
                                   vector_to_array("rawPrediction"))
test_pred = test_pred.withColumn("rawPrediction",
                                 vector_to_array("rawPrediction"))

train_pred.createOrReplaceTempView("train_pred")
test_pred.createOrReplaceTempView("test_pred")

train_finished = spark.sql("""
        SELECT review_id,
            ROUND(rawPrediction[1], 3) AS svm_pred
        FROM train_pred
    """)

test_finished = spark.sql("""
        SELECT review_id,
            ROUND(rawPrediction[1], 3) AS svm_pred
        FROM test_pred
    """)
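
# An equivalent DataFrame-API formulation (a sketch, not from the original
# script) of the SQL above, assuming train_pred has a review_id column and the
# array-typed rawPrediction column produced earlier.
import pyspark.sql.functions as sf

train_finished_df = train_pred.select(
    "review_id",
    sf.round(sf.col("rawPrediction")[1], 3).alias("svm_pred"),
)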
Example 7
    )

    if scaler_model is None:
        scaler = StandardScaler(
            inputCol=temp_vector_col,
            outputCol=temp_normalized_vector_col,
            withStd=parse_parameter(bool, scale, "scale", True),
            withMean=parse_parameter(bool, center, "center", False),
        )
        scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

    output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

    # convert the resulting vector back to numeric
    temp_flattened_vector_col = temp_col_name(output_df)
    output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

    # keep only the final scaled column.
    output_column = input_column if output_column is None or not output_column else output_column
    output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
    output_df = output_df.withColumn(output_column, output_column_value)
    final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
    output_df = output_df.select(final_columns)

    return default_spark_with_trained_parameters(output_df, trained_parameters)
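
# A self-contained sketch of the same scaling pattern using plain PySpark,
# independent of the project helpers above (temp_col_name, fit_and_save_model,
# transform_using_trained_model, etc. are assumed to be defined elsewhere):
# assemble the column into a vector, fit StandardScaler, then flatten the
# scaled vector back to a numeric column with vector_to_array.
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ["x"])
assembled_demo = VectorAssembler(inputCols=["x"], outputCol="x_vec").transform(demo)
scaler_demo = StandardScaler(inputCol="x_vec", outputCol="x_scaled_vec",
                             withStd=True, withMean=False).fit(assembled_demo)
scaler_demo.transform(assembled_demo) \
    .withColumn("x_scaled", vector_to_array("x_scaled_vec")[0]) \
    .select("x", "x_scaled") \
    .show()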


def process_numeric_robust_scaler(
    df,
    input_column=None,
    lower_quantile=None,