def normalize(self):
        from pyspark.ml.feature import Normalizer
        from pyspark.ml.linalg import Vectors, VectorUDT
        from pyspark.sql import functions as F

        df = self.session.createDataFrame([(0, [1.0, 0.5, -1.0]),
                                           (1, [2.0, 1.0, 1.0]),
                                           (2, [4.0, 10.0, 2.0])],
                                          ["id", "features"])

        # Vector concept explanation: convert the array column into an ML DenseVector
        @F.udf(returnType=VectorUDT())
        def vectorize_from_array(a):
            return Vectors.dense(a)

        df = df.withColumn("features", vectorize_from_array(F.col("features")))
        # Normalize each Vector using $L^1$ norm.
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        l1NormData = normalizer.transform(df)
        print("Normalized using L^1 norm")
        l1NormData.show()

        # Normalize each Vector using $L^\infty$ norm.
        lInfNormData = normalizer.transform(df, {normalizer.p: float("inf")})
        print("Normalized using L^inf norm")
        lInfNormData.show()
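A side note, not taken from the snippet: on Spark 3.1+ the UDF above can be replaced with the built-in pyspark.ml.functions.array_to_vector. A minimal sketch under that assumption:

        # Requires Spark >= 3.1; avoids the Python UDF round-trip.
        from pyspark.ml.functions import array_to_vector

        df = df.withColumn("features", array_to_vector(F.col("features")))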
Example #2
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VECTOR_SIZE = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
               SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
               SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    word2Vec = Word2Vec(vectorSize=VECTOR_SIZE, minCount=0, inputCol="filtered", outputCol="vectors")
    model = word2Vec.fit(removed)
    resultDF = model.transform(removed)

    normalizer = Normalizer(inputCol="vectors", outputCol="result", p=2)
    l1NormData = normalizer.transform(resultDF)

    l1NormData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT result AS jobsVec, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT result AS cvsVec, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT result AS categoriesVec, cat.id, cat.skillName, category FROM resultTable AS rt\
    LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    #Calculate job-cv similarity START
    crossJoined_job_cv = jobs.crossJoin(cvs)
    calculated_job_cv = crossJoined_job_cv.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.jobsVec, x.cvsVec)))\
    .toDF(["jobid", "cvid", "distance"]).orderBy(asc("jobid")).coalesce(2)
    calculated_job_cv.write.csv('Calculated/word2vec2/job-cv')
    #Calculate job-cv similarity END

    #Calculate cv-category similarity START
    crossJoined_cv_cat = cvs.crossJoin(categories)
    calculated_cv_cat = crossJoined_cv_cat.rdd.map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.cvsVec, x.categoriesVec)))\
    .toDF(["cvid", "category_id", "skillName", "category", "distance"]).orderBy(asc("cvid"), asc("distance")).coalesce(2)
    calculated_cv_cat.write.csv('Calculated/word2vec2/cv-category')
    #Calculate cv-category similarity END

    #Job-category START
    crossJoined_job_cat = jobs.select("jobId", "jobsVec").crossJoin(categories.select("id", "skillName", "category", "categoriesVec"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
    .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.jobsVec, x.categoriesVec)))\
    .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy( asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/word2vec2/job-category')
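calculate_distance is not defined anywhere in this snippet. Since the vectors are L2-normalized just above (p=2), a plausible, purely hypothetical implementation is cosine similarity, which for unit vectors reduces to a dot product:

def calculate_distance(v1, v2):
    # Hypothetical helper assumed by the lambdas above: for L2-normalized
    # vectors the dot product equals the cosine similarity.
    return float(v1.dot(v2))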
Example #3
    def get_product_similarity(self):
        """
        Calculate the similarity between items/users
        """
        product_taxonomy = self.data.select(self.productCol,
                                            self.taxonomyCol).distinct()
        product_taxonomy = self.__data_manipulation(product_taxonomy)

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(product_taxonomy)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        col1 = "i." + self.productCol
        col2 = "j." + self.productCol

        dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
        result = norma_data.alias("i").crossJoin(norma_data.alias("j"))\
            .select(
                col(col1).alias("i"),
                col(col2).alias("j"),
                dot_udf("i.norm", "j.norm").alias("dot"))\
            .sort("i", "j")

        result = result.filter((result.i < result.j) & (result.dot > 0.5))

        return result
Example #4
    def __data_manipulation(self, col):

        data = self.data.select(col, self.taxonomyCol).distinct()
        data = data.withColumn(self.taxonomyCol,
                               data[self.taxonomyCol].cast(StringType()))

        concat_list = udf(lambda lst: ", ".join(lst), StringType())
        data = data.groupby(col).agg(
            collect_list(self.taxonomyCol).alias(self.taxonomyCol))

        data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
        data = data.withColumn(
            self.taxonomyCol,
            split(regexp_replace(self.taxonomyCol, " ", ""), ','))

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(data)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        return norma_data
Example #5
    def test_model_normalizer_2(self):
        data = self.spark.createDataFrame([(0, Vectors.dense(1.0, 0.5, -1.0)),
                                           (1, Vectors.dense(2.0, 1.0, 1.0)),
                                           (2, Vectors.dense(4.0, 10.0, 2.0))
                                           ]).toDF("id", "features")
        model = Normalizer(inputCol='features',
                           outputCol='norm_feature',
                           p=2.0)

        model_onnx = convert_sparkml(model, 'Sparkml Normalizer',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().norm_feature.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlNormalizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['norm_feature'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("The p_norm has to be float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembler = VectorAssembler(inputCols=inputColNames, \
                                    outputCol="features")
        assembledDF = assembler.transform(dataFrame)
        normalizer=Normalizer(inputCol="features", \
                              outputCol=outputColName, \
                              p = p_norm)
        normalizedDF = normalizer.transform(assembledDF)
        colList = ", ".join("'{}'".format(name) for name in inputColNames)
        if p_norm == float('inf'):
            print("Successfully assembled the columns {0:s} into a feature vector, normalized it with the L^inf norm, and created two new columns 'features' and 'normalized features'.".format(colList))
        else:
            print("Successfully assembled the columns {0:s} into a feature vector, normalized it with the L^{1:g} norm, and created two new columns 'features' and 'normalized features'.".format(colList, p_norm))
        return normalizedDF
    else:
        raise ValueError("The inputColNames has to be a list of columns to generate a feature vector and then do normalization.")
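A minimal usage sketch for the helper above, assuming an active SparkSession named spark and two made-up numeric columns:

df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["x", "y"])

# Default p_norm=2.0: assemble [x, y] and L2-normalize each row vector.
normalize(df, ["x", "y"]).show()

# The string form selects the L^inf branch.
normalize(df, ["x", "y"], "inf").show()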
Example #7
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):

    tokenized = Tokenizer(inputCol="text",
                          outputCol="words").transoform(sentenceDataFrame)

    ngramDataFrame = NGram(n=ngrams, inputCol="words",
                           outputCol="ngrams").transform(tokenized)

    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")

    countVectModel = countVect.fit(ngramDataFrame)

    featurizedData = countVectModel.transform(ngramDataFrame)

    idf = IDF(minDocFreq=minDocFreq,
              inputCol="rawFeatures",
              outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features")

    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)

    return X
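A small usage sketch, assuming a SparkSession named spark and a DataFrame carrying the "label" and "text" columns the function expects:

sentences = spark.createDataFrame(
    [(0, "spark is fast"), (1, "spark runs on a cluster")],
    ["label", "text"])

scored = create_tfidf_model(sentences, ngrams=2, minDocFreq=1)
scored.select("ngrams", "scores").show(truncate=False)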
def get_sim(tfidf, threshold, save_dir):
    normalizer = Normalizer(inputCol='features', outputCol='norm')
    data = normalizer.transform(tfidf)
    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
    sim_df = (
        data.alias('i').join(data.alias('j'),
                             col('i.id') < col('j.id')).select(
                                 col('i.id').alias('i'),
                                 col('j.id').alias('j'),
                                 dot_udf('i.norm',
                                         'j.norm').alias('similarity'))
        # .sort('i', 'j')
    )
    sim_df_filtered = sim_df.filter(col('similarity').between(threshold, 1.0))

    edges = [(row.i, row.j, row.similarity)
             for row in sim_df_filtered.collect()]
    print('Edges: {}'.format(len(edges)))
    vertices = set()
    for e in edges:
        vertices.add(e[0])
        vertices.add(e[1])
    vertices = [(v, ) for v in list(vertices)]
    doc_sim = {'edges': edges, 'vertices': vertices}

    pkl.dump(doc_sim, open(os.path.join(save_dir, 'doc_sim.pkl'), 'wb'))
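The pickled edge/vertex lists suggest a graph is built downstream; a minimal sketch of reloading doc_sim.pkl, assuming the networkx package (not used in the snippet itself):

import pickle as pkl
import networkx as nx

with open('doc_sim.pkl', 'rb') as f:   # adjust to the save_dir used above
    doc_sim = pkl.load(f)

G = nx.Graph()
G.add_nodes_from(v[0] for v in doc_sim['vertices'])   # vertices are 1-tuples
G.add_weighted_edges_from(doc_sim['edges'])           # (i, j, similarity)
print(G.number_of_nodes(), G.number_of_edges())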
Example #9
 def getNormalizer(self, dataFrameFeatures, outputColName):
     # Define a Normalizer to produce normalized features
     normalized = Normalizer(inputCol="features",
                             outputCol=outputColName,
                             p=2.0)
     # Get the normalized features
     normData = normalized.transform(dataFrameFeatures)
     return normData
Example #10
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state','categories','stars',input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(predicted).transform(predicted).cache()
    return df
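The cluster count n is passed in blindly; a hypothetical helper for picking it via the silhouette score, assuming the normalized DataFrame produced by norm.transform above (with its "features" column):

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def pick_k(df, k_values=(2, 3, 4, 5, 6)):
    # Score a handful of k values and return the best one with all scores.
    evaluator = ClusteringEvaluator(featuresCol="features")
    scores = {}
    for k in k_values:
        model = KMeans(k=k, seed=2).fit(df)
        scores[k] = evaluator.evaluate(model.transform(df))
    return max(scores, key=scores.get), scores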
def get_feature_vector(input_df):
    assembler_1 = VectorAssembler(inputCols=["Open", "Close"],
                                  outputCol="stock_features")
    scaler = Normalizer(inputCol="stock_features",
                        outputCol="scaled_stock_features")
    assembled_df = assembler_1.transform(input_df)
    scaled_stock = scaler.transform(assembled_df).drop('stock_features')
    assembler_2 = VectorAssembler(
        inputCols=["scaled_stock_features", "Sentiment"], outputCol="features")
    final_df = assembler_2.transform(scaled_stock)
    return final_df.drop('scaled_stock_features')
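Note that Normalizer rescales each row by its own norm; if the intent was to bring Open and Close onto a comparable scale across rows, a column-wise scaler is the usual choice. A sketch under that assumption, swapping in StandardScaler:

from pyspark.ml.feature import StandardScaler, VectorAssembler

def get_feature_vector_scaled(input_df):
    # Same assembly step as above, but with a fitted, column-wise scaler.
    assembler = VectorAssembler(inputCols=["Open", "Close"],
                                outputCol="stock_features")
    scaler = StandardScaler(inputCol="stock_features",
                            outputCol="scaled_stock_features",
                            withMean=True, withStd=True)
    assembled = assembler.transform(input_df)
    return scaler.fit(assembled).transform(assembled).drop("stock_features")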
Example #12
def termFrequency(table):

    #calculates the term frequency of attributes
    hashingTF = HashingTF(inputCol='key_words', outputCol='hashing')
    tf = hashingTF.transform(table)
    tf.cache()

    #normalises the term frequency data
    normalizer = Normalizer(inputCol='hashing', outputCol='norm')
    term = normalizer.transform(tf)

    return term
Example #13
def getNormalizerTest(dataFrameFeatures, outputColName):

    print("inside normalizer test")
    print(dataFrameFeatures)
    # Define a Normalizer to produce normalized features
    normalized = Normalizer(inputCol="features",
                            outputCol=outputColName,
                            p=2.0)
    # Get the normalized features
    normData = normalized.transform(dataFrameFeatures)
    #print normData.show()
    return normData
 def normalizedDF(self):
     from pyspark.ml.feature import Normalizer
     # NOTE: assumes `varNames` holds the input column names and `vecdf` the input DataFrame
     assembler = VectorAssembler(inputCols=varNames, outputCol="features")
     normalizer = Normalizer(inputCol="features",
                             outputCol="normFeatures",
                             p=2)  # p is the order of the norm
     pipeline = Pipeline(stages=[assembler, normalizer])
     self.df_norm = pipeline.fit(vecdf).transform(vecdf)
     # Normalize each Vector using the $L^\infty$ norm instead, working
     # from the assembled (not yet normalized) features.
     assembled = assembler.transform(vecdf)
     lInfNormData = normalizer.transform(assembled,
                                         {normalizer.p: float("inf")})
     return lInfNormData
Example #15
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("The p_norm has to be float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
        normalizer=Normalizer(inputCol="features", \
                              outputCol=outputColName, \
                              p = p_norm)
        normalizedDF = normalizer.transform(assembledDF).drop("features")
        return normalizedDF
    else:
        raise ValueError(
            "The inputColNames has to be a list of columns to generate a feature vector and then do normalization."
        )
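getAssembledDataFrame is referenced but not shown in this snippet; a plausible, hypothetical implementation would simply wrap VectorAssembler:

from pyspark.ml.feature import VectorAssembler

def getAssembledDataFrame(dataFrame, inputColNames):
    # Hypothetical helper: assemble the given columns into a 'features' vector.
    assembler = VectorAssembler(inputCols=inputColNames, outputCol="features")
    return assembler.transform(dataFrame)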
Example #16
def train_and_save_model_df(sc_local):
    trainingData = sc_local.textFile(FILE_TRAINING_DATA) \
        .flatMap(lambda line: parse_apache_log_line(line))
    data = trainingData.toDF()

    indexers = [
        StringIndexer(inputCol=c,
                      outputCol="{0}_indexed".format(c),
                      handleInvalid="keep") for c in ['endpoint', 'method']
    ]
    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]
    assembler = VectorAssembler(
        inputCols=['response_code'] +
        [encoder.getOutputCol() for encoder in encoders],
        outputCol='features')
    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    transform_model = pipeline.fit(data)
    output = transform_model.transform(data)

    remove_existing_model(TRANSFORM_MODEL_LOCATION)
    transform_model.save(TRANSFORM_MODEL_LOCATION)

    normalizer = Normalizer(inputCol="features",
                            outputCol="normFeatures",
                            p=1.0)
    output = normalizer.transform(output)

    kmeans = pyspark.ml.clustering.KMeans().setK(2).setSeed(1)
    model = kmeans.fit(output)

    remove_existing_model(MODEL_LOCATION)
    model.save(MODEL_LOCATION)

    predictions = model.transform(output)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)

    print('Silhouette: ', silhouette)
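    # NOTE: KMeansModel.computeCost is deprecated since Spark 3.0; ClusteringEvaluator
    # (above) or model.summary.trainingCost are the suggested replacements.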
    costs = model.computeCost(output)
    print('Costs: ', costs)
Example #17
    def term_frequency(self):
        # TODO: save vocabulary to firebase
        beers = self.beer_reviews
        cv = CountVectorizer(inputCol='lemmatized_tokens',
                             outputCol='features_tf',
                             vocabSize=7500)
        # cv_model = cv.fit(self.beer_reviews)
        # self.beer_reviews = cv_model.transform(self.beer_reviews)
        cv_model = cv.fit(beers)
        # self.beer_reviews = cv_model.transform(beers)
        beers = cv_model.transform(beers)
        self.vocabulary = {
            idx: val.encode('utf-8')
            for idx, val in enumerate(cv_model.vocabulary)
        }

        normalizer = Normalizer(inputCol='features_tf',
                                outputCol='features_normalized')
        # self.beer_reviews = normalizer.transform(self.beer_reviews)
        self.beer_reviews = normalizer.transform(beers)
Example #18
    def run_tune(self, importance_feature, data):
        sample_ratio = self.sample_ratio
        df = data.sample(fraction=sample_ratio, seed=1688)
        df = df.na.fill(0)

        assembler = VectorAssembler(inputCols=importance_feature,
                                    outputCol="non_norm_features")
        output = assembler.transform(df)

        normalizer = Normalizer(inputCol='non_norm_features',
                                outputCol="features")
        l1NormData = normalizer.transform(output, {normalizer.p: float(2)})
        features_label = l1NormData.select("features", "label")
        # split the dataset random
        train_data, test_data = features_label.randomSplit([0.8, 0.2])

        # train the model
        lr_params = ({
            'regParam': 0.00
        }, {
            'fitIntercept': True
        }, {
            'elasticNetParam': 0.5
        })
        lr = LinearRegression(maxIter=100, regParam=lr_params[0]['regParam'], \
                              fitIntercept=lr_params[1]['fitIntercept'], \
                              elasticNetParam=lr_params[2]['elasticNetParam'])

        model = lr.fit(train_data)
        pred = model.evaluate(test_data)

        # model of evaluate
        eval = RegressionEvaluator(labelCol="label",
                                   predictionCol="prediction",
                                   metricName="mae")
        bef_mae = eval.evaluate(pred.predictions, {eval.metricName: "mae"})
        r2 = eval.evaluate(pred.predictions, {eval.metricName: "r2"})
        print('r2....' + str(r2))
        print('mae....' + str(bef_mae))

        lrParamGrid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.005, 0.01, 0.1, 0.5]) \
            .addGrid(lr.fitIntercept, [False, True]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5, 1.0]) \
            .build()

        #build model Estimator such as LinearRegression
        #set RegressionEvaluator
        #fit model use k-fold,calculate avg of evaluate
        #save the best param

        if self.is_cv:
            train_valid = CrossValidator(estimator=lr,
                                         estimatorParamMaps=lrParamGrid,
                                         evaluator=eval,
                                         numFolds=5)
            tune_model = train_valid.fit(train_data)
            best_parameters = [(
                [{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric) \
                for params, metric in zip(
                    tune_model.getEstimatorParamMaps(),
                    tune_model.avgMetrics)]

        else:
            train_valid = TrainValidationSplit(estimator=lr,
                                               estimatorParamMaps=lrParamGrid,
                                               evaluator=eval)
            tune_model = train_valid.fit(train_data)
            best_parameters = [(
                [{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric) \
                for params, metric in zip(
                    tune_model.getEstimatorParamMaps(),
                    tune_model.validationMetrics)]

        lr_best_params = sorted(best_parameters,
                                key=lambda el: el[1],
                                reverse=True)[0][0]
        regParam_ky = [i for i in lr_best_params if 'regParam' in i][0]
        elasticNetParam_ky = [
            i for i in lr_best_params if 'elasticNetParam' in i
        ][0]
        fitIntercept_candidates = [i for i in lr_best_params if 'fitIntercept' in i]
        if fitIntercept_candidates:
            fitIntercept_ky = fitIntercept_candidates[0]
        else:
            fitIntercept_ky = {'fitIntercept': False}

        pd_best_params = pd.DataFrame({
            'regParam': [regParam_ky['regParam']],
            'elasticNetParam': [elasticNetParam_ky['elasticNetParam']],
            'fitIntercept': [fitIntercept_ky['fitIntercept']]
        })

        pd_best_params['update_date'] = self.today
        pd_best_params['update_time'] = self.update_time
        pd_best_params['model_type'] = 'linear'

        # use the best param to predict
        lr = LinearRegression(
            maxIter=100,
            regParam=float(regParam_ky['regParam']),
            elasticNetParam=float(elasticNetParam_ky['elasticNetParam']),
            fitIntercept=bool(fitIntercept_ky['fitIntercept']))

        model = lr.fit(train_data)
        print('....intercept....' + str(model.intercept))
        print('....coefficients....' + str(model.coefficients))
        pred = model.evaluate(test_data)
        # evaluation model
        eval = RegressionEvaluator(labelCol="label",
                                   predictionCol="prediction",
                                   metricName="mae")

        r2_tune = eval.evaluate(pred.predictions, {eval.metricName: "r2"})
        tune_mae = eval.evaluate(pred.predictions, {eval.metricName: "mae"})
        pd_best_params['bef_mae'] = str(bef_mae)
        pd_best_params['tune_mae'] = str(tune_mae)
        pd_best_params['tune_r2'] = str(r2_tune)
        pd_best_params = pd_best_params[[
            'regParam', 'fitIntercept', 'elasticNetParam', 'model_type',
            'bef_mae', 'tune_mae', 'tune_r2', 'update_date', 'update_time'
        ]]
        if pd_best_params.shape[0] < 1:
            raise ValueError("tune the best param is wrong at {},{}".format(
                self.today, self.update_time))
        pd_best_params = spark.createDataFrame(pd_best_params)
        try:
            pd_best_params.write.mode("append").format('hive').saveAsTable(
                'temp.regression_model_best_param')
        except:
            pd_best_params.createOrReplaceTempView('pd_best_params')
            spark.sql(
                """drop table if exists temp.regression_model_best_param""")
            spark.sql(
                """create table temp.regression_model_best_param as select * from pd_best_params"""
            )
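On Spark 3.x the tuned values can also be read straight off the winning model, which sidesteps the param-map bookkeeping above; a sketch assuming the same tune_model object:

        # Sketch: CrossValidatorModel / TrainValidationSplitModel expose bestModel,
        # and on Spark 3.x the fitted model carries its training hyper-parameters.
        best = tune_model.bestModel
        best_params = {
            'regParam': best.getOrDefault('regParam'),
            'elasticNetParam': best.getOrDefault('elasticNetParam'),
            'fitIntercept': best.getOrDefault('fitIntercept'),
        }
        print(best_params)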
# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import Normalizer
manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import StringIndexer
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()


# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()
Example #20
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

# Generate a vector with the label column and the assembled features column
ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in df.columns if x not in ignore],
    outputCol='features_without_norm')
df = assembler.transform(df).select(
    ["MONTH", 'label', 'features_without_norm'])

# COMMAND ----------

# Normalize the data
normalizer = Normalizer(inputCol="features_without_norm", outputCol="features")
df_normalized = normalizer.transform(df).select(["MONTH", 'label', 'features'])

# COMMAND ----------

# Define the model training data ("train") and the prediction data ("test")
train = df_normalized.where(df.MONTH != 12)
test = df_normalized.where(df.MONTH == 12)

# Split the train set further into train (90%) and evaluation (10%)
(train, evaluation) = train.randomSplit((0.9, 0.1))

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
Example #21
from pyspark.ml.feature import Normalizer
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
data = sqlContext.read.format("libsvm").load(r"D:\Spark\spark-1.6.1-bin-hadoop2.6\data\mllib\sample_libsvm_data.txt")
indexer = Normalizer(p=1.0, inputCol="features", outputCol="normFeatures")
indexedData = indexer.transform(data)
indexedData.show()
lInfNormData = indexer.transform(data, {indexer.p: float("inf")})
lInfNormData.show()
"""Output
+-----+--------------------+--------------------+
|label|            features|        normFeatures|
+-----+--------------------+--------------------+
|  0.0|(692,[127,128,129...|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|(692,[154,155,156...|
... (output truncated)
"""
Example #22
        dataset.append((cont, f, data))

rdd = sc.parallelize(dataset)
schemaData = rdd.map(lambda x: Row(num=x[0], title=x[1], text=x[2]))
dataFrame = sqlContext.createDataFrame(schemaData)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()
# Normalization and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computation using the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")\
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")\
Example #23
stopWordsRemover = StopWordsRemover(inputCol="words",
                                    outputCol="filtered_words")
filtered_data = stopWordsRemover.transform(tokenized_data)
hashingTF = HashingTF(inputCol="filtered_words",
                      outputCol="raw_features",
                      numFeatures=20)
featurizedData = hashingTF.transform(filtered_data)
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
featurized_data = idfModel.transform(featurizedData)

# In[14]:

from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(featurized_data)

# In[15]:

import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
s=data.alias("i").join(data.alias("j"), psf.col("i.id") < psf.col("j.id"))      .select(
          psf.col("i.id").alias("src"),
          psf.col("j.id").alias("dst"),
          dot_udf("i.norm", "j.norm").alias("relationship"))\
      .sort("src", "dst")\

# In[ ]:
    if pred[i] == list(Y_test)[i]:
        corr+=1
    else :
        pass
print('Accuracy: ' + str(corr * 1.0 / len(pred)))

# Build a BP (multilayer perceptron) neural network model on Spark
from pyspark.sql import Row
from pyspark.ml.feature import Normalizer  
lines = sc.textFile("hdfs:///lushun/a.txt")
parts = lines.map(lambda l: l.split(" "))
df = parts.map(lambda p: Row(features=p[:-1], label=int(p[-1])))
df = spark.createDataFrame(df)
df.createOrReplaceTempView("df")
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)  
l1NormData = normalizer.transform(df)
l1NormData = l1NormData.select("label", "normFeatures")
l1NormData.show()       
from pyspark.ml.classification import MultilayerPerceptronClassifier  
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
splits = l1NormData.randomSplit([0.7, 0.3])
train = splits[0]  
test = splits[1]
layers = [36300, 200, 200, 6] 
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,seed=1234)
model = trainer.fit(train)  
# compute accuracy on the test set  
result = model.transform(test)  
predictionAndLabels = result.select("prediction", "label")  
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")  
print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))  
Example #25
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import Normalizer

manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show(5)

# COMMAND ----------
rddTrain = rddTrain.mapPartitions(lambda x: csv.reader(x))
Result_train = rddTrain.map(
    lambda x: (x[0], x[1], x[2], DenseVector((x[3]).split(',')))).toDF(
        ["index", "url", "productId", "features"])

DataFile_test = spark.textFile('new_test.csv')
Result_test = DataFile_test.map(lambda line : line.split("\n")).flatMap(lambda words : (word.split(",") for word in words))\
.zipWithIndex().map(lambda x: (DenseVector(x[0]), x[1])).toDF(["features_test", "index"])
#
# print Result_train.show()
# print Result_test.show()

normalizer = Normalizer(inputCol="features",
                        outputCol="normFeatures_train",
                        p=2.0)
l1NormData = normalizer.transform(Result_train)

normalizer = Normalizer(inputCol="features_test",
                        outputCol="normFeatures_test",
                        p=2.0)
l1NormData1 = normalizer.transform(Result_test)

l1NormData.show()
l1NormData1.show()

Train_DF = l1NormData.createOrReplaceTempView("Result_train1")
# sqlDF_train = sqlContext.sql("select * from Result_train1 limit 1").show()

Test_DF = l1NormData1.createOrReplaceTempView("Result_test1")
# sqlDF_test = sqlContext.sql("select * from Result_test1 limit 1").show()
    tags_users_df=sqlContext.createDataFrame(tags_users)
    print(tags_users_df.take(2))
    #
    #
    # print('Indexing strings')
    cVec = CountVectorizer(inputCol='tags', outputCol="tag_features",minDF=10.)
    model=cVec.fit(tags_users_df)
    td=model.transform(tags_users_df)

    with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl',mode='wb') as ff:
        pkl.dump(model.vocabulary,ff)



    normalizer=Normalizer(p=1.,inputCol='tag_features',outputCol='tags_normalized')
    tdNorm=normalizer.transform(td)
    print(tdNorm.take(5))

    tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')

    samples=tdNorm.filter(tdNorm.posts_with_tags>10).take(10)
    #pprint(samples)




    # stringIndexer = StringIndexer(inputCol="tags", outputCol="indexed_tags")
    # model=stringIndexer.fit(tags_users_df)
    # td=model.transform(tags_users_df)
    # print('Retrieving indices')
    #
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.5, -1.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, 1.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 2.0]),
)], ["id", "features"])

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show()
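For reference, a quick hand check (plain arithmetic, not captured snippet output) of what the two transforms produce:

# [1.0, 0.5, -1.0]: L1 norm = 2.5   -> normFeatures = [0.4, 0.2, -0.4]
#                   Linf norm = 1.0 -> the row is unchanged
# [4.0, 10.0, 2.0]: L1 norm = 16.0  -> normFeatures = [0.25, 0.625, 0.125]
#                   Linf norm = 10.0 -> normFeatures = [0.4, 1.0, 0.2]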

# COMMAND ----------

###MinMaxScaler (0, 1)
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([(
Example #29
    df = df.withColumn('scores',
                       udf(lambda x: list(x.values()), ArrayType(FloatType()))(df.kws))

    keywords_length = len(df.select("scores").take(1)[0]["scores"])
    assembler = VectorAssembler(
        inputCols=["scores[{}]".format(i) for i in range(keywords_length)],
        outputCol="user_score")
    ddf = assembler.transform(
        df.select("*",
                  *(df["scores"].getItem(i)
                    for i in range(keywords_length)))).select("user_score")

    normalizer = Normalizer(inputCol="user_score",
                            outputCol="normFeatures",
                            p=2.0)
    extended_user_df = normalizer.transform(ddf)
    extended_user_df.cache()

    seed_user_df = extended_user_df

    # LSH Algorithm
    brp = BucketedRandomProjectionLSH(inputCol="normFeatures",
                                      outputCol="hashes",
                                      bucketLength=bucketLength,
                                      numHashTables=numHashTables)
    lsh = brp.fit(extended_user_df)
    df_users = lsh.approxSimilarityJoin(seed_user_df,
                                        extended_user_df,
                                        1 - minimum_similarity_score,
                                        distCol="EuclideanDistance")
    df_users = df_users.withColumn(
        mx = type_mapping[4]['max']
        suggested_type = suggest_data_type(mean, mx)

        if suggested_type == 'FLOAT':
            d_type = 1
        elif suggested_type == 'INT':
            d_type = 2
        elif suggested_type == 'DATE':
            d_type = 3
        else:
            d_type = 4

        stats_feat = assembler.transform(features)
        stats_feat_results = stats_feat.select("features")

        l1NormData = normalizer.transform(stats_feat_results, {normalizer.p: float("inf")})
        normfeatures = l1NormData.select("normFeatures").rdd.flatMap(list).collect()[0]

        features_combined = [(normfeatures, d_type)]

        final_feat_frame = spark.createDataFrame(features_combined, ["meta_features", "data_type"])
        final_results = final_assembler.transform(final_feat_frame)
        combined_features = final_results.select("final_features").rdd.flatMap(list).collect()

        features2 = \
            spark.sql(
                """
                select 
                    case 
                    when count({0}) > 0 then
                    cast(count(distinct {0}) as int) else 0 end as dist_cnt,
Example #31
def getTestData(imgUrl):
    testDataFeature = requests.post(
        'http://ares.styfi.in:81/vs/getproductfeatures/',
        data=json.dumps({'imgUrl': imgUrl})).content

    testDataFeature = json.loads(testDataFeature)
    testData = {}
    testData['testDataImgFeatures'] = (testDataFeature['imgFeatures'])
    testData['imgUrl'] = imgUrl
    a = [json.dumps(testData)]
    jsonRDD = spark.parallelize(a)
    df = sparkSess.read.json(jsonRDD)
    df.show()
    df.printSchema()
    to_vector = udf(lambda a: DenseVector(a), VectorUDT())
    data = df.select("imgUrl",
                     to_vector("testDataImgFeatures").alias("features"))
    data.printSchema()
    data.show()
    #stestDataFeature = testDataFeature.items()

    normalizer = Normalizer(inputCol="features",
                            outputCol="normFeatures_test",
                            p=2.0)
    l1NormData1 = normalizer.transform(data)
    l1NormData1.show()

    # rddTrain = spark.textFile("/home/leena/dev/visualsearch/testFinal.csv")
    # #rddTrain = rddTrain.mapPartitions(lambda x: csv.reader(x))
    # #print rddTrain.collect()
    # Result_train = rddTrain.map(lambda x: x[0])

    #Result_test = rddTrain.map(lambda line : line.split("\n")).flatMap(lambda words : [word.split(",") for word in words]).zipWithIndex();
    #print Result_test.show()
    #print Result_train.collect()

    # df = sqlContext.createDataFrame([json.loads(y) for line in y.iter_lines()])
    # df.show()
    j = {'abc': 1, 'def': 2, 'ghi': 3}
    a = [json.dumps(j)]
    jsonRDD = spark.parallelize(a)
    df = sparkSess.read.json(jsonRDD)
    df.show()

    #writeToCsv("/home/leena/dev/visualsearch/testFinal",testDataFeature)

    #
    # schema =[
    #         StructField("URL", StringType(), True),
    #         StructField("Features", DoubleType(), True)]
    #
    #     df = sparkSess.createDataFrame(jsonRDD, schema)
    #
    #     df.show()

    # #print y
    # df = spark.parallelize(y).toDF()
    # df.show()
    # otherPeople = sparkSess.read.json(y)
    # otherPeople.show()

    return ""
Example #32
	return wordbag
	

documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'], x['no'], create_wordbag(x), x['professor'],
                   x['lec_code'][:4], x['lec_code'][5], x['eval_total'], x['eval_id']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade', 'eval_total', 'eval_id'])

#users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x : (x['mb_no'],x['lec_code'][:4])),['user','department']).orderBy('department')
#for u in users.select('department','user').take(10000):
#	print u
'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()
#grade	1/2/3/4
eval_total = documents.select('eval_total').distinct() # 1/2/3/4/5

for e in eval_total.collect():
	print e
'''



htf = HashingTF(inputCol= 'words',outputCol = 'rawFeatures')
featured = htf.transform(documents)
idf = IDF(inputCol = 'rawFeatures',outputCol = 'idf')
idfModel = idf.fit(featured)
tf_idf = idfModel.transform(featured)
normalizer = Normalizer(inputCol = 'idf', outputCol = 'idf_norm', p = 2.0)
normData = normalizer.transform(tf_idf)

normData.rdd.saveAsPickleFile('idf_normalized')

Example #33
    " ")
typedData = csvData

for colName in columnsToKeep:
    typedData = typedData.withColumn(
        colName, typedData[colName].cast(IntegerType()).alias(colName))

typedData = typedData.na.drop()
print(typedData.schema)

assembler = VectorAssembler().setInputCols(columnsToKeep).setOutputCol(
    "features")
dataWithFeatures = assembler.transform(typedData)
dataWithFeatures.show()

normalizer = Normalizer().setInputCol("features").setOutputCol("normFeatures")
normData = normalizer.transform(dataWithFeatures)

kmeans = KMeans().setK(5).setFeaturesCol("normFeatures")
model = kmeans.fit(normData)

predictions = model.transform(normData)
predictions.select("features", "prediction").show()

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

spark.stop()
Example #34
# Transforing the data using Tokenizer (No fitting because nothing to train)
token_df = tokenizer.transform(df)

# Transforming the data using Stopword Removal (No fitting because nothing to train)
stop_df = stopwordsRemover.transform(token_df)

# If you apply Count Vectorizer, here you will have to fit and then transform
# If you apply hashingtf, here you can use direct transformation on the data
#tf_df = hashtf.transform(stop_df)
tf_df = hashtf.fit(stop_df).transform(stop_df)

# Fit and Transforing the data using Tokenizer IDF
idf_df = idf.fit(tf_df).transform(tf_df)

# Important: L2 normalisation is not part of the PySpark IDF implementation, so it is applied explicitly here.
norm_df = norm.transform(idf_df)

# Clean data into features, this has to be done before you send the data for model training
raw_dataset = clean_data.transform(norm_df)

# This step will have a drastic change in training for naive bayes and logistic regression.
dataset = raw_dataset.withColumn("label", raw_dataset["stars"] - 1)

# Have a look at the preprocessed data
print((dataset.count(), len(dataset.columns)))

dataset.printSchema()

dataset.show()

# Split the dataset into Training and Testing using random split
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NormalizerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.5, -1.0]),),
        (1, Vectors.dense([2.0, 1.0, 1.0]),),
        (2, Vectors.dense([4.0, 10.0, 2.0]),)
    ], ["id", "features"])

    # Normalize each Vector using $L^1$ norm.
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    l1NormData = normalizer.transform(dataFrame)
    print("Normalized using L^1 norm")
    l1NormData.show()

    # Normalize each Vector using $L^\infty$ norm.
    lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
    print("Normalized using L^inf norm")
    lInfNormData.show()
    # $example off$

    spark.stop()
Example #36
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
			data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
			return data


		def QueryXXXXX(query, file=None):
			session = Session()
			response = session.post(data={'tq': query}, url='https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'})
			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
			rowlist = [d['v'] for d in row]
			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
			for punc in punctuation:
				words = words.replace(punc, "")
			for hword in hebrew_stopwords:
				words = words.replace(hword, " ")
			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler 
vectorassembler = VectorAssembler(
							inputCols = ['x','y', 'z'],
							outputCol = 'features'
							)
features_vectorized = vectorassembler.transform(encoded)
features_vectorized.show()


# In[12]:


from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol = 'features', outputCol='features_norm', p=1.0)
normalized_data = normalizer.transform(features_vectorized)
normalized_data.show()


# In[17]:


from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [
					indexer, encoder, vectorassembler, normalizer
					])
					
					
model = pipeline.fit(df)					
prediction = model.transform(df)
prediction.show()
# The words have now been replaced here by sparse vectors
print("DataFrame(1-gram): the words have now been replaced by sparse vectors")
dfVect.show()


udfVectorizeBi=UserDefinedFunction(lambda x : vectorizeBi(x),VectorUDT())
dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print "DataFrame(bi-gram): On a bien remplacé ici du coup les mots par les vecteurs sparse"
dfVect2.show()

# For natural-language processing it is customary to L2-normalize
# the feature vectors: that is apparently what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words',outputCol='normWords',p=2.0)
normalizerBi = Normalizer(inputCol="bigrams",outputCol='normBigrams',p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words','normWords').show()
# The difference does not show up in the table, since there is only room to display the indices
# of the non-zero elements, not their values.
# Moving on to TF-IDF.
# Obviously, by picking the right DataFrame among those above, these computations can be applied
# to any column (bigrams, with or without stop words...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words',outputCol='wordsTF',numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(),outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)