Example #1
    def cluster(self, df, session, repartition_num=8):
        n = df.count()
        # index rows
        df_index = df.select((row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
                              1).alias('id'), "*")
        df_features = df_index.select('id', self.featureCol)

        # prep for joining
        df_features = df_features.repartitionByRange(repartition_num, 'id')

        left_df = df_features.select(
            df_features['id'].alias('left_id'),
            df_features[self.featureCol].alias('left_features'))
        right_df = df_features.select(
            df_features['id'].alias('right_id'),
            df_features[self.featureCol].alias('right_features'))

        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,
                                 left_df['left_id'] != right_df['right_id'])

        # compute cosine similarity between vectors
        joined_df = joined_df.select(
            'left_id', 'right_id',
            cosine_similarity_udf(
                array(joined_df['left_features'],
                      joined_df['right_features'])).alias('norm'))
        ranked = joined_df.select(
            'left_id', 'right_id',
            rank().over(
                Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
        knn = ranked.where(ranked['rank'] <= 5)
        knn_grouped = knn.groupBy('left_id').agg(
            f.collect_list('right_id').alias('nn'))

        # generate laplacian
        laplacian = knn_grouped.select(
            knn_grouped['left_id'].alias('id'),
            toVector_udf(
                laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                     lit(n),
                                     lit(self.k_nearest))).alias('lap_vector'))

        pca = PCA(k=self.num_eigenvectors,
                  inputCol='lap_vector',
                  outputCol='features').fit(laplacian)
        eigenvectors = pca.transform(laplacian).select('id', 'features')
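        # 'features' now holds the num_eigenvectors-dimensional spectral
        # embedding of each row, which the k-means step below clusters.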

        model = KMeans(featuresCol='features',
                       predictionCol=self.predictionCol,
                       k=self.k).fit(eigenvectors)
        predictions = model.transform(eigenvectors).join(df_index, on='id')
        return predictions
Example #2
def pca(dataset, inputCol, k=3):
    """Fit PCA on inputCol; return the transformed dataset and the fitted model."""
    from pyspark.ml.feature import PCA
    model = PCA(k=k, inputCol=inputCol,
                outputCol=inputCol + '_pca').fit(dataset)
    return model.transform(dataset), model
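
A minimal usage sketch for the helper above, assuming an existing SparkSession named spark and made-up numeric columns x1, x2, x3:

from pyspark.ml.feature import VectorAssembler

# Assemble the raw numeric columns into the single vector column PCA expects.
df = spark.createDataFrame(
    [(1.0, 0.0, 7.0), (2.0, 1.0, 5.0), (4.0, 10.0, 7.0)],
    ["x1", "x2", "x3"])
assembled = VectorAssembler(
    inputCols=["x1", "x2", "x3"], outputCol="features").transform(df)

# Reduce to two components; the helper names the output column 'features_pca'.
reduced, pca_model = pca(assembled, inputCol="features", k=2)
reduced.select("features_pca").show(truncate=False)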
Example #3
def process_bus_data(bus_df):
    """ Method to process raw business data from Yelp."""
    def select_eligible_bus(row):
        """ Select businesses which fall into the selected categories."""

        global categories
        try:
            # Return true if business falls into category list, else false.
            row_cats = row.split(',')
            for cat in row_cats:
                if cat.strip() in categories:
                    return True
            return False
        except (TypeError, AttributeError):
            # Returns false if business has no defined categories.
            return False

    def unpack_bus_attributes(row):
        """ Unpacks Business attributes and assigns them an index value."""

        # List to store business attributes.
        unpacked = list()
        # Unpack the retained attributes; PriceRange, Parking and WiFi
        # are handled separately below.
        temp = [row[s] for s in bus_attributes]

        # Process PriceRange
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range specified - default=2
            priceRange = 2

        # Process Parking
        try:
            parking = 1 if (row["BusinessParking"].find("True")) != -1 else -1
        except AttributeError:
            parking = 0

        # Process WiFi
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] == None:
            wifi = 0
        else:
            wifi = 1

        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)
        # Append the WiFi, Parking and PriceRange attributes
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)

        # Print any arrays that are not of desired length (=30).
        if len(unpacked) != 30:
            print(unpacked)
        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)

    def unpack_bus_categories(row):
        """Unpacks all business cattegories."""

        # List to store business categories.
        unpacked = list()
        # Split the comma-separated categories string and strip whitespace
        for cat in row.split(','):
            unpacked.append(cat.strip())
        return unpacked

    def unpack_price_range(row):
        """ Returns price range."""
        return int(row[-1])

    # Package the functions above into Spark SQL user-defined functions
    udf_select_eligible_bus = udf(select_eligible_bus, BooleanType())
    udf_unpack_bus_attributes = udf(unpack_bus_attributes, VectorUDT())
    udf_unpack_bus_categories = udf(unpack_bus_categories,
                                    ArrayType(StringType()))
    udf_unpack_price_range = udf(unpack_price_range, IntegerType())

    # Find businesses to include.
    eligible_bus = bus_df.withColumn("include", udf_select_eligible_bus(col("categories"))) \
        .filter(col("include"))

    # Process business attributes feature.
    all_bus_attributes = set(
        bus_df.select("attributes").take(1)[0].attributes.asDict().keys())
    bus_attributes_to_exclude = {
        'AcceptsInsurance', 'AgesAllowed', 'ByAppointmentOnly', 'Caters',
        'Corkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours',
        'RestaurantsAttire', 'RestaurantsPriceRange2', 'BusinessParking',
        'WiFi'
    }
    bus_attributes = list(all_bus_attributes - bus_attributes_to_exclude)
    bus_attributes.sort()
    eligible_attr = eligible_bus.withColumn(
        "unpackedAttr", udf_unpack_bus_attributes(col("attributes")))

    # Process business categories feature.
    eligible_cats = eligible_attr.withColumn(
        "unpackedCats", udf_unpack_bus_categories(col("categories")))
    cv = CountVectorizer(inputCol="unpackedCats", outputCol="vectorizedCats")
    vectorized_cats = cv.fit(eligible_cats).transform(eligible_cats)

    # Un-bundle price range from all other attributes.
    unpacked_pr = vectorized_cats.withColumn(
        "priceRange", udf_unpack_price_range(col("unpackedAttr")))
    unpacked_pr.take(1)

    # Reduce dimensions of attributes and categories features, respectively.
    pca_attr = PCA(k=3, inputCol="unpackedAttr",
                   outputCol="pcaAttr").fit(unpacked_pr)
    temp = pca_attr.transform(unpacked_pr)
    temp.show()
    pca_cats = PCA(k=1, inputCol="vectorizedCats",
                   outputCol="pcaCats").fit(temp)
    temp2 = pca_cats.transform(temp)
    temp2.show()

    # Assemble into final feature vector.
    va = VectorAssembler(
        inputCols=["stars", "priceRange", "pcaAttr", "pcaCats"],
        outputCol="featureVec")
    features = va.transform(temp2).select("business_id", "stars", "categories",
                                          "featureVec")
    features.take(1)

    # Unpack
    n_features = len(features.select("featureVec").take(1)[0].featureVec)
    final = features.withColumn("f", vector_to_array(col("featureVec"))) \
        .select(["business_id", "stars", "categories"] + [col("f")[i] for i in range(n_features)])

    return final, n_features
Example #4
def apply_pca(training, testing):
    pca = PCA(k=3, inputCol="features", outputCol="pca_features").fit(training)
    training = pca.transform(training).cache()
    testing = pca.transform(testing).cache()
    return training, testing
Example #5
def preprocessing(df, num_pca=10):
    argo_df_og = df

    # Cast temp as DoubleType()
    argo_df_og = argo_df_og.withColumn("tempTmp", argo_df_og['temp'].cast(DoubleType()))\
                       .drop("temp")\
                       .withColumnRenamed("tempTmp", "temp")\
                       .select("profile_id", "pres", "temp", "lat", "lon", "psal", "date")\
                       .persist()


    argo_filterby = argo_df_og.groupBy("profile_id") \
                          .agg(min("pres").alias("min_pres"),
                               max("pres").alias("max_pres"),
                               count("profile_id").alias("count_profile_id"))

    # Now, here are the profile_ids we want to keep, to be inner joined with original argo_df_og
    argo_keep_ids = argo_filterby.filter("count_profile_id >= 50 and min_pres <= 25 and max_pres >= 999") \
                             .select("profile_id")

    # Inner join the profile_ids to keep with original argo_df_og to filter and keep only desired IDs
    argo_df_keep = argo_keep_ids.join(argo_df_og, "profile_id",
                                      "inner").persist()

    # Final filtered df after pressure cleaning
    argo_df = argo_df_keep.select("profile_id", "pres", "temp", "lat", "lon", "psal", "date",
                              month("date").alias("month"), year("date").alias("year")) \
                      .persist()

    # INTERPOLATION

    # Create arrays pairing each temperature with its corresponding pressure
    argo_df_listed = argo_df.select('profile_id', 'lat', 'lon', array(argo_df['temp'], argo_df['pres']).alias('temp_pres'))\
                        .groupBy('profile_id').agg(collect_list('temp_pres').alias('temp_pres_list'),
                                                   fn.min(argo_df['lat']).alias('lat'),
                                                   fn.min(argo_df['lon']).alias('lon'))
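    # min() is used only to carry lat/lon (effectively constant within a
    # profile) through the groupBy.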

    # Ordering by pressure
    argo_df_listed = argo_df_listed.select(
        'profile_id', 'lat', 'lon',
        insane_sort(argo_df_listed['temp_pres_list']).alias('temp_pres_list'))

    # Interpolating missing temps at specified grid points
    pres = argo_df_listed.select(
        'profile_id', 'lat', 'lon',
        interp_udf('temp_pres_list').alias('temp_interp'))

    # Finding profiles with temps as nans
    check_pres = pres.select(
        "profile_id", "temp_interp", 'lat', 'lon',
        null_udf("temp_interp").alias("temp_interp_hasNA"),
        lenarray_udf("temp_interp").alias("temp_interp_len199"))

    # Filtering profiles with temps as nans
    filtered_pres = check_pres.filter("temp_interp_hasNA == False").select(
        "profile_id", "temp_interp", 'lat', 'lon')

    # Finding profiles with temps < -5
    check_pres = filtered_pres.select(
        "profile_id", "temp_interp", 'lat', 'lon',
        neg_udf("temp_interp").alias("temp_interp_hasNeg5s"))
    # Filtering profiles with temps < -5
    argo_df_clean = check_pres.filter("temp_interp_hasNeg5s == False").select(
        "profile_id", "temp_interp", 'lat', 'lon')

    argo_df_clean = argo_df_clean.select(
        'profile_id',
        toVector_udf(argo_df_clean['temp_interp']).alias('features'), 'lat',
        'lon')

    pca = PCA(k=num_pca, inputCol='features',
              outputCol='features_pca').fit(argo_df_clean)
    argo_df_clean = pca.transform(argo_df_clean)
    argo_df_clean = argo_df_clean.select(
        'profile_id', argo_df_clean['features_pca'].alias('features'), 'lat',
        'lon')

    return argo_df_clean
label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key


# ## Dimensionality reduction
#
# Feature selection is not really supported in MLlib yet, so we just applied dimensionality reduction using PCA.

# In[509]:

pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)

train_df = pca.transform(train_df)
test_df = pca.transform(test_df)
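
# One could sanity-check the choice of k=15 here: PCAModel exposes
# explainedVariance, a vector of per-component variance ratios.
print(pca.explainedVariance)
print(sum(pca.explainedVariance.toArray()))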


# ## Classification algorithms

# In[ ]:

rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca", numTrees=5000)
#rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000)
model = rf.fit(train_df)


# ## Evaluation & results

# In[ ]:
test_df = labelIndexer.transform(test_df)

Example #8
def f(x):
    rel = {}
    try:
        # Assumption: features are the six continuous adult.data columns
        # (age, fnlwgt, education-num, capital-gain, capital-loss,
        # hours-per-week).
        rel['features'] = Vectors.dense(float(x[0]), float(x[2]), float(x[4]),
                                        float(x[10]), float(x[11]),
                                        float(x[12]))
    except:
        rel['features'] = Vectors.dense(0, 0, 0, 0, 0, 0)
    rel['label'] = str(x[14].strip('.'))
    return rel


df = spark.sparkContext.textFile('./adult/adult.data').map(lambda line: line.split(',')).\
    map(lambda x: Row(**f(x))).toDF()

test = spark.sparkContext.textFile('./adult/adult.test').map(lambda line: line.split(',')).\
    map(lambda x: Row(**f(x))).toDF()

# Build the PCA model
pca = PCA(k=3, inputCol='features', outputCol='pcaFeatures').fit(df)

result = pca.transform(df)

test_data = pca.transform(test)

result.show(truncate=False)
test_data.show(truncate=False)

# Logistic regression on top of the principal components
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(result)
for label in labelIndexer.labels:
    print(label)

featureIndexer = VectorIndexer(inputCol='pcaFeatures', outputCol='indexedFeatures').fit(result)
print(featureIndexer.numFeatures)

labelConverter = IndexToString(inputCol='prediction', outputCol='predictedLabel', labels=labelIndexer.labels)
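
# A sketch of the logistic-regression step that the indexers above prepare for
# (the classifier settings below are assumptions, not part of the snippet):
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='indexedLabel', featuresCol='indexedFeatures',
                        maxIter=100, regParam=0.3)
lrPipeline = Pipeline(stages=[labelIndexer, featureIndexer, lr, labelConverter])
lrPipelineModel = lrPipeline.fit(result)
lrPredictions = lrPipelineModel.transform(test_data)
lrPredictions.select('predictedLabel', 'label', 'pcaFeatures').show(10)
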
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(filteredData)

idf = IDF(inputCol="rawFeatures", outputCol="tfidffeatures")
idfModel = idf.fit(featurizedData)

tfidfData = idfModel.transform(featurizedData)

# COMMAND ----------

from pyspark.ml.feature import PCA

pca = PCA(k=5, inputCol='tfidffeatures', outputCol='features').fit(tfidfData)

data_pca = pca.transform(tfidfData)

data_pca.select("book_id", "features").show(truncate=False)

# COMMAND ----------

# DBTITLE 1,K-means
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans().setK(20).setSeed(1)
model = kmeans.fit(data_pca)

predictions = model.transform(data_pca)
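
# The ClusteringEvaluator imported above can score the clustering; silhouette
# with squared Euclidean distance is its default metric. A minimal follow-up:
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance =", silhouette)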