def normalize(self):
    from pyspark.ml.feature import Normalizer
    from pyspark.ml.linalg import Vectors, VectorUDT
    from pyspark.sql import functions as F

    df = self.session.createDataFrame(
        [(0, [1.0, 0.5, -1.0]), (1, [2.0, 1.0, 1.0]), (2, [4.0, 10.0, 2.0])],
        ["id", "features"])

    # Vector concept: Normalizer expects an ML Vector column, so convert the array column with a UDF.
    @F.udf(returnType=VectorUDT())
    def vectorize_from_array(a):
        return Vectors.dense(a)

    df = df.withColumn("features", vectorize_from_array(F.col("features")))

    # Normalize each Vector using $L^1$ norm.
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    l1NormData = normalizer.transform(df)
    print("Normalized using L^1 norm")
    l1NormData.show()

    # Normalize each Vector using $L^\infty$ norm.
    lInfNormData = normalizer.transform(df, {normalizer.p: float("inf")})
    print("Normalized using L^inf norm")
    lInfNormData.show()
def main(): spark = SparkSession.builder \ .appName("Spark CV-job ad matching") \ .config("spark.some.config.option", "some-value") \ .master("local[*]") \ .getOrCreate() VECTOR_SIZE = 50 df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache() df_jobs.registerTempTable("jobs") df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache() df_cvs.registerTempTable("cvs") df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache() df_categories.registerTempTable("categories") joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \ SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \ SELECT skillText AS text, id AS id, 'categories' AS type FROM categories") tokenizer = Tokenizer(inputCol="text", outputCol="words") tokenized = tokenizer.transform(joined) remover = StopWordsRemover(inputCol="words", outputCol="filtered") removed = remover.transform(tokenized) word2Vec = Word2Vec(vectorSize=VECTOR_SIZE, minCount=0, inputCol="filtered", outputCol="vectors") model = word2Vec.fit(removed) resultDF = model.transform(removed) normalizer = Normalizer(inputCol="vectors", outputCol="result", p=2) l1NormData = normalizer.transform(resultDF) l1NormData.registerTempTable("resultTable") jobs = spark.sql("SELECT result AS jobsVec, id AS jobId FROM resultTable WHERE type = 'job'") cvs = spark.sql("SELECT result AS cvsVec, id AS cvid FROM resultTable WHERE type = 'cv'") categories = spark.sql("SELECT result AS categoriesVec, cat.id, cat.skillName, category FROM resultTable AS rt\ LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'") #Calculate job-cv similarity START crossJoined_job_cv = jobs.crossJoin(cvs) calculated_job_cv = crossJoined_job_cv.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.jobsVec, x.cvsVec)))\ .toDF(["jobid", "cvid", "distance"]).orderBy(asc("jobid")).coalesce(2) calculated_job_cv.write.csv('Calculated/word2vec2/job-cv') #Calculate job-cv similarity END #Calculate cv-category similarity START crossJoined_cv_cat = cvs.crossJoin(categories) calculated_cv_cat = crossJoined_cv_cat.rdd.map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.cvsVec, x.categoriesVec)))\ .toDF(["cvid", "category_id", "skillName", "category", "distance"]).orderBy(asc("cvid"), asc("distance")).coalesce(2) calculated_cv_cat.write.csv('Calculated/word2vec2/cv-category') #Calculate cv-category similarity END #Job-category START crossJoined_job_cat = jobs.select("jobId", "jobsVec").crossJoin(categories.select("id", "skillName", "category", "categoriesVec")) calculatedDF_job_cat = crossJoined_job_cat.rdd\ .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.jobsVec, x.categoriesVec)))\ .toDF(["jobid", "catid", "skillName", "category", "distance"]) ordered_job_cat = calculatedDF_job_cat.orderBy( asc("distance")).coalesce(2) ordered_job_cat.write.csv('Calculated/word2vec2/job-category')
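# The main() above calls a calculate_distance helper that is not shown. A minimal
# sketch consistent with the L2-normalized Word2Vec vectors it compares (an assumption,
# not the original implementation) is a plain cosine distance:
import numpy as np

def calculate_distance(vec_a, vec_b):
    """Cosine distance between two pyspark.ml.linalg vectors."""
    a, b = vec_a.toArray(), vec_b.toArray()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(1.0 - np.dot(a, b) / denom) if denom else 1.0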
def get_product_similarity(self):
    """ Calculate the similarity between items/users """
    product_taxonomy = self.data.select(self.productCol, self.taxonomyCol).distinct()
    product_taxonomy = self.__data_manipulation(product_taxonomy)
    hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
    tf = hashingTF.transform(product_taxonomy)
    idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
    tfidf = idf.transform(tf)
    normalizer = Normalizer(inputCol="feature", outputCol="norm")
    norma_data = normalizer.transform(tfidf)
    col1 = "i." + self.productCol
    col2 = "j." + self.productCol
    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
    result = norma_data.alias("i").crossJoin(norma_data.alias("j"))\
        .select(
            col(col1).alias("i"),
            col(col2).alias("j"),
            dot_udf("i.norm", "j.norm").alias("dot"))\
        .sort("i", "j")
    # Parenthesize the comparisons: '&' binds tighter than '<' / '>' on Columns.
    result = result.filter((result.i < result.j) & (result.dot > 0.5))
    return result
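# Because the Normalizer above keeps its default p=2.0, the "norm" column holds
# unit-length vectors, so the dot product of two rows equals their cosine similarity.
# A minimal, self-contained check (assumes a running SparkSession named `spark`;
# the column names here are illustrative):
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

demo = spark.createDataFrame(
    [(0, Vectors.dense([1.0, 2.0, 3.0])), (1, Vectors.dense([2.0, 4.0, 6.0]))],
    ["id", "feature"])
unit = Normalizer(inputCol="feature", outputCol="norm").transform(demo)
vecs = [row.norm for row in unit.collect()]
print(float(vecs[0].dot(vecs[1])))  # ~1.0: the two vectors are parallel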
def __data_manipulation(self, col): data = self.data.select(col, self.taxonomyCol).distinct() data = data.withColumn(self.taxonomyCol, data[self.taxonomyCol].cast(StringType())) concat_list = udf(lambda lst: ", ".join(lst), StringType()) data = data.groupby(col).agg( collect_list(self.taxonomyCol).alias(self.taxonomyCol)) data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol)) data = data.withColumn( self.taxonomyCol, split(regexp_replace(self.taxonomyCol, " ", ""), ',')) hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf") tf = hashingTF.transform(data) idf = IDF(inputCol="tf", outputCol="feature").fit(tf) tfidf = idf.transform(tf) normalizer = Normalizer(inputCol="feature", outputCol="norm") norma_data = normalizer.transform(tfidf) return norma_data
def test_model_normalizer_2(self): data = self.spark.createDataFrame([(0, Vectors.dense(1.0, 0.5, -1.0)), (1, Vectors.dense(2.0, 1.0, 1.0)), (2, Vectors.dense(4.0, 10.0, 2.0)) ]).toDF("id", "features") model = Normalizer(inputCol='features', outputCol='norm_feature', p=2.0) model_onnx = convert_sparkml(model, 'Sparkml Normalizer', [('features', FloatTensorType([1, 3]))]) self.assertTrue(model_onnx is not None) # run the model predicted = model.transform(data) expected = predicted.toPandas().norm_feature.apply( lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32) data_np = data.toPandas().features.apply( lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32) paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlNormalizer") onnx_model_path = paths[3] output, output_shapes = run_onnx_model(['norm_feature'], data_np, onnx_model_path) compare_results(expected, output, decimal=5)
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("p_norm has to be a float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembler = VectorAssembler(inputCols=inputColNames,
                                    outputCol="features")
        assembledDF = assembler.transform(dataFrame)
        normalizer = Normalizer(inputCol="features",
                                outputCol=outputColName,
                                p=p_norm)
        normalizedDF = normalizer.transform(assembledDF)
        colList = " ".join("'" + name + "'" for name in inputColNames)
        if p_norm == float('inf'):
            print("Assembled the columns {0:s} into a feature vector and normalized them "
                  "with the L^inf norm, adding the columns 'features' and "
                  "'normalized features'.".format(colList))
        else:
            print("Assembled the columns {0:s} into a feature vector and normalized them "
                  "with the L^{1:f} norm, adding the columns 'features' and "
                  "'normalized features'.".format(colList, p_norm))
        return normalizedDF
    else:
        raise ValueError("inputColNames has to be a list of columns from which to build "
                         "a feature vector before normalization.")
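# A quick usage sketch for the helper above, with hypothetical column names and
# assuming a running SparkSession named `spark` (the function itself relies on
# VectorAssembler and Normalizer being imported):
from pyspark.ml.feature import Normalizer, VectorAssembler

demo_df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["height", "weight"])
# L^inf normalization over both columns; adds 'features' and 'normalized features'.
normalize(demo_df, ["height", "weight"], "inf").show(truncate=False)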
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):
    tokenized = Tokenizer(inputCol="text", outputCol="words").transform(sentenceDataFrame)
    ngramDataFrame = NGram(n=ngrams, inputCol="words", outputCol="ngrams").transform(tokenized)
    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")
    countVectModel = countVect.fit(ngramDataFrame)
    featurizedData = countVectModel.transform(ngramDataFrame)
    idf = IDF(minDocFreq=minDocFreq, inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features")
    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)
    return X
def get_sim(tfidf, threshold, save_dir): normalizer = Normalizer(inputCol='features', outputCol='norm') data = normalizer.transform(tfidf) dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType()) sim_df = ( data.alias('i').join(data.alias('j'), col('i.id') < col('j.id')).select( col('i.id').alias('i'), col('j.id').alias('j'), dot_udf('i.norm', 'j.norm').alias('similarity')) # .sort('i', 'j') ) sim_df_filtered = sim_df.filter(col('similarity').between(threshold, 1.0)) edges = [(row.i, row.j, row.similarity) for row in sim_df_filtered.collect()] print('Edges: {}'.format(len(edges))) vertices = set() for e in edges: vertices.add(e[0]) vertices.add(e[1]) vertices = [(v, ) for v in list(vertices)] doc_sim = {'edges': edges, 'vertices': vertices} pkl.dump(doc_sim, open(os.path.join(save_dir, 'doc_sim.pkl'), 'wb'))
def getNormalizer(self, dataFrameFeatures, outputColName):
    # Define a Normalizer to produce normalized features
    normalized = Normalizer(inputCol="features", outputCol=outputColName, p=2.0)
    # Get the normalized features
    normData = normalized.transform(dataFrameFeatures)
    return normData
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    # Project the clustered data onto two principal components.
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(predicted).transform(predicted).cache()
    return df
def get_feature_vector(input_df): assembler_1 = VectorAssembler(inputCols=["Open", "Close"], outputCol="stock_features") scaler = Normalizer(inputCol="stock_features", outputCol="scaled_stock_features") assembled_df = assembler_1.transform(input_df) scaled_stock = scaler.transform(assembled_df).drop('stock_features') assembler_2 = VectorAssembler( inputCols=["scaled_stock_features", "Sentiment"], outputCol="features") final_df = assembler_2.transform(scaled_stock) return final_df.drop('scaled_stock_features')
def termFrequency(table): #calculates the term frequency of attributes hashingTF = HashingTF(inputCol='key_words', outputCol='hashing') tf = hashingTF.transform(table) tf.cache() #normalises the term frequency data normalizer = Normalizer(inputCol='hashing', outputCol='norm') term = normalizer.transform(tf) return term
def getNormalizerTest(dataFrameFeatures, outputColName):
    print("inside normalizer test")
    print(dataFrameFeatures)
    # Define a Normalizer to produce normalized features
    normalized = Normalizer(inputCol="features", outputCol=outputColName, p=2.0)
    # Get the normalized features
    normData = normalized.transform(dataFrameFeatures)
    # normData.show()
    return normData
def normalizedDF(self):
    from pyspark.ml.feature import Normalizer
    # varNames (the list of input columns) and vecdf (the input DataFrame) are
    # assumed to be defined elsewhere.
    assembler = VectorAssembler(inputCols=varNames, outputCol="features")
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2)  # p is the order of the norm
    pipeline = Pipeline(stages=[assembler, normalizer])
    # Pipeline.fit returns a PipelineModel; transform to obtain the normalized DataFrame.
    self.df_norm = pipeline.fit(vecdf).transform(vecdf)
    # To normalize each Vector using the $L^\infty$ norm instead, override p at transform
    # time on a DataFrame that already has the "features" column:
    # lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
    return (0)
def normalize(dataFrame, inputColNames, p_norm=2.0): if type(p_norm) is str: if p_norm.lower() == "inf": p_norm = float('inf') else: raise ValueError("The p_norm has to be float or 'inf'.") if type(inputColNames) is list: outputColName = "normalized features" assembledDF = getAssembledDataFrame(dataFrame, inputColNames) normalizer=Normalizer(inputCol="features", \ outputCol=outputColName, \ p = p_norm) normalizedDF = normalizer.transform(assembledDF).drop("features") return normalizedDF else: raise ValueError( "The inputColNames has to be a list of columns to generate a feature vector and then do normalization." )
def train_and_save_model_df(sc_local): trainingData = sc_local.textFile(FILE_TRAINING_DATA) \ .flatMap(lambda line: parse_apache_log_line(line)) data = trainingData.toDF() indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid="keep") for c in ['endpoint', 'method'] ] encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ] assembler = VectorAssembler( inputCols=['response_code'] + [encoder.getOutputCol() for encoder in encoders], outputCol='features') pipeline = Pipeline(stages=indexers + encoders + [assembler]) transform_model = pipeline.fit(data) output = transform_model.transform(data) remove_existing_model(TRANSFORM_MODEL_LOCATION) transform_model.save(TRANSFORM_MODEL_LOCATION) normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) output = normalizer.transform(output) kmeans = pyspark.ml.clustering.KMeans().setK(2).setSeed(1) model = kmeans.fit(output) remove_existing_model(MODEL_LOCATION) model.save(MODEL_LOCATION) predictions = model.transform(output) evaluator = ClusteringEvaluator() silhouette = evaluator.evaluate(predictions) print('Silhouette: ', silhouette) costs = model.computeCost(output) print('Costs: ', costs)
def term_frequency(self): # TODO: save vocabulary to firebase beers = self.beer_reviews cv = CountVectorizer(inputCol='lemmatized_tokens', outputCol='features_tf', vocabSize=7500) # cv_model = cv.fit(self.beer_reviews) # self.beer_reviews = cv_model.transform(self.beer_reviews) cv_model = cv.fit(beers) # self.beer_reviews = cv_model.transform(beers) beers = cv_model.transform(beers) self.vocabulary = { idx: val.encode('utf-8') for idx, val in enumerate(cv_model.vocabulary) } normalizer = Normalizer(inputCol='features_tf', outputCol='features_normalized') # self.beer_reviews = normalizer.transform(self.beer_reviews) self.beer_reviews = normalizer.transform(beers)
def run_tune(self, importance_feature, data):
    sample_ratio = self.sample_ratio
    df = data.sample(fraction=sample_ratio, seed=1688)
    df = df.na.fill(0)
    assembler = VectorAssembler(inputCols=importance_feature, outputCol="non_norm_features")
    output = assembler.transform(df)
    normalizer = Normalizer(inputCol='non_norm_features', outputCol="features")
    l1NormData = normalizer.transform(output, {normalizer.p: float(2)})
    features_label = l1NormData.select("features", "label")

    # Randomly split the dataset
    train_data, test_data = features_label.randomSplit([0.8, 0.2])

    # Train the baseline model
    lr_params = ({'regParam': 0.00}, {'fitIntercept': True}, {'elasticNetParam': 0.5})
    lr = LinearRegression(maxIter=100, regParam=lr_params[0]['regParam'],
                          fitIntercept=lr_params[1]['fitIntercept'],
                          elasticNetParam=lr_params[2]['elasticNetParam'])
    model = lr.fit(train_data)
    pred = model.evaluate(test_data)

    # Evaluate the baseline model
    eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
    bef_mae = eval.evaluate(pred.predictions, {eval.metricName: "mae"})
    r2 = eval.evaluate(pred.predictions, {eval.metricName: "r2"})
    print('r2....' + str(r2))
    print('mae....' + str(bef_mae))

    lrParamGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.005, 0.01, 0.1, 0.5]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5, 1.0]) \
        .build()

    # Build the estimator (LinearRegression), set the RegressionEvaluator,
    # fit with k-fold cross-validation, average the evaluation metric,
    # and save the best params.
    if self.is_cv:
        train_valid = CrossValidator(estimator=lr, estimatorParamMaps=lrParamGrid,
                                     evaluator=eval, numFolds=5)
        tune_model = train_valid.fit(train_data)
        best_parameters = [([{key.name: paramValue} for key, paramValue
                             in zip(params.keys(), params.values())], metric)
                           for params, metric in zip(tune_model.getEstimatorParamMaps(),
                                                     tune_model.avgMetrics)]
    else:
        train_valid = TrainValidationSplit(estimator=lr, estimatorParamMaps=lrParamGrid,
                                           evaluator=eval)
        tune_model = train_valid.fit(train_data)
        best_parameters = [([{key.name: paramValue} for key, paramValue
                             in zip(params.keys(), params.values())], metric)
                           for params, metric in zip(tune_model.getEstimatorParamMaps(),
                                                     tune_model.validationMetrics)]

    lr_best_params = sorted(best_parameters, key=lambda el: el[1], reverse=True)[0][0]
    # Look the params up by key name ('key in i'), not by value: values such as
    # 0.0 or False are falsy and would otherwise be skipped.
    regParam_ky = [i for i in lr_best_params if 'regParam' in i][0]
    elasticNetParam_ky = [i for i in lr_best_params if 'elasticNetParam' in i][0]
    fitIntercept_hits = [i for i in lr_best_params if 'fitIntercept' in i]
    fitIntercept_ky = fitIntercept_hits[0] if fitIntercept_hits else {'fitIntercept': False}

    pd_best_params = pd.DataFrame({
        'regParam': [regParam_ky['regParam']],
        'elasticNetParam': [elasticNetParam_ky['elasticNetParam']],
        'fitIntercept': [fitIntercept_ky['fitIntercept']]
    })
    pd_best_params['update_date'] = self.today
    pd_best_params['update_time'] = self.update_time
    pd_best_params['model_type'] = 'linear'

    # Use the best params to retrain and predict
    lr = LinearRegression(
        maxIter=100,
        regParam=float(regParam_ky['regParam']),
        elasticNetParam=float(elasticNetParam_ky['elasticNetParam']),
        fitIntercept=bool(fitIntercept_ky['fitIntercept']))
    model = lr.fit(train_data)
    print('....intercept....' + str(model.intercept))
    print('....coefficients....' + str(model.coefficients))
    pred = model.evaluate(test_data)

    # Evaluate the tuned model
    eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
    r2_tune = eval.evaluate(pred.predictions, {eval.metricName: "r2"})
    tune_mae = eval.evaluate(pred.predictions, {eval.metricName: "mae"})
    pd_best_params['bef_mae'] = str(bef_mae)
    pd_best_params['tune_mae'] = str(tune_mae)
    pd_best_params['tune_r2'] = str(r2_tune)
    pd_best_params = pd_best_params[[
        'regParam', 'fitIntercept', 'elasticNetParam', 'model_type',
        'bef_mae', 'tune_mae', 'tune_r2', 'update_date', 'update_time'
    ]]
    if pd_best_params.shape[0] < 1:
        raise ValueError("tuning produced no best params at {},{}".format(
            self.today, self.update_time))

    pd_best_params = spark.createDataFrame(pd_best_params)
    try:
        pd_best_params.write.mode("append").format('hive').saveAsTable(
            'temp.regression_model_best_param')
    except Exception:
        pd_best_params.createOrReplaceTempView('pd_best_params')
        spark.sql("""drop table if exists temp.regression_model_best_param""")
        spark.sql("""create table temp.regression_model_best_param as select * from pd_best_params""")
# COMMAND ---------- from pyspark.ml.feature import ElementwiseProduct from pyspark.ml.linalg import Vectors scaleUpVec = Vectors.dense(10.0, 15.0, 20.0) scalingUp = ElementwiseProduct()\ .setScalingVec(scaleUpVec)\ .setInputCol("features") scalingUp.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import Normalizer manhattanDistance = Normalizer().setP(1).setInputCol("features") manhattanDistance.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import StringIndexer lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd") idxRes = lblIndxr.fit(simpleDF).transform(simpleDF) idxRes.show() # COMMAND ---------- valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd") valIndexer.fit(simpleDF).transform(simpleDF).show()
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

# Build a feature vector, keeping the label column alongside the assembled features column
ignore = ['label']
assembler = VectorAssembler(
    inputCols=[x for x in df.columns if x not in ignore],
    outputCol='features_without_norm')
df = assembler.transform(df).select(
    ["MONTH", 'label', 'features_without_norm'])

# COMMAND ----------

# Normalize the data
normalizer = Normalizer(inputCol="features_without_norm", outputCol="features")
df_normalized = normalizer.transform(df).select(["MONTH", 'label', 'features'])

# COMMAND ----------

# Define the model's training data ("train") and the prediction data ("test")
train = df_normalized.where(df.MONTH != 12)
test = df_normalized.where(df.MONTH == 12)

# Split the training set further into train (90%) and evaluation (10%)
(train, evaluation) = train.randomSplit((0.9, 0.1))

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer from pyspark import SparkContext from pyspark.sql import SQLContext sc = SparkContext("local", "samp") sqlContext = SQLContext(sc) data = sqlContext.read.format("libsvm").load("D:\Spark\spark-1.6.1-bin-hadoop2.6\data\mllib\sample_libsvm_data.txt") indexer = Normalizer(p=1.0, inputCol="features", outputCol="normFeatures") indexedData = indexer.transform(data) indexedData.show() lInfNormData = indexer.transform(data, {indexer.p: float("inf")}) lInfNormData.show() """Output +-----+--------------------+--------------------+ |label| features| normFeatures| +-----+--------------------+--------------------+ | 0.0|(692,[127,128,129...|(692,[127,128,129...| | 1.0|(692,[158,159,160...|(692,[158,159,160...| | 1.0|(692,[124,125,126...|(692,[124,125,126...| | 1.0|(692,[152,153,154...|(692,[152,153,154...| | 1.0|(692,[151,152,153...|(692,[151,152,153...| | 0.0|(692,[129,130,131...|(692,[129,130,131...| | 1.0|(692,[158,159,160...|(692,[158,159,160...| | 1.0|(692,[99,100,101,...|(692,[99,100,101,...| | 0.0|(692,[154,155,156...|(692,[154,155,156...| | 0.0|(692,[127,128,129...|(692,[127,128,129...| | 1.0|(692,[154,155,156...|(692,[154,155,156...| | 0.0|(692,[153,154,155...|(692,[153,154,155...| | 0.0|(692,[151,152,153...|(692,[151,152,153...| | 1.0|(692,[129,130,131...|(692,[129,130,131...| | 0.0|(692,[154,155,156...|(692,[154,155,156...|
dataset.append((cont, f, data))

rdd = sc.parallelize(dataset)
schemaData = rdd.map(lambda x: Row(num=x[0], title=x[1], text=x[2]))
dataFrame = sqlContext.createDataFrame(schemaData)

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()

# Normalization and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computed from the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")\
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = stopWordsRemover.transform(tokenized_data)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filtered_data)
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
featurized_data = idfModel.transform(featurizedData)

# In[14]:

from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(featurized_data)

# In[15]:

import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
s = data.alias("i").join(data.alias("j"), psf.col("i.id") < psf.col("j.id"))\
    .select(
        psf.col("i.id").alias("src"),
        psf.col("j.id").alias("dst"),
        dot_udf("i.norm", "j.norm").alias("relationship"))\
    .sort("src", "dst")

# In[ ]:
    if pred[i] == list(Y_test)[i]:
        corr += 1
    else:
        pass
print('Accuracy: ' + str(float(corr) / len(pred)))

# Build a BP (multilayer perceptron) neural network model on Spark
from pyspark.sql import Row
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

lines = sc.textFile("hdfs:///lushun/a.txt")
parts = lines.map(lambda l: l.split(" "))
# Normalizer and the MLP classifier both expect a Vector column, so convert the
# string fields into a DenseVector of floats.
df = parts.map(lambda p: Row(features=Vectors.dense([float(x) for x in p[:-1]]),
                             label=int(p[-1])))
df = spark.createDataFrame(df)
df.createOrReplaceTempView("df")

normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(df)
l1NormData.createOrReplaceTempView("l1NormData")
l1NormData = spark.sql("SELECT label, normFeatures FROM l1NormData")
l1NormData.show()

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

splits = l1NormData.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
layers = [36300, 200, 200, 6]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, seed=1234,
                                         featuresCol="normFeatures")
model = trainer.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
from pyspark.ml.feature import ElementwiseProduct from pyspark.ml.linalg import Vectors scaleUpVec = Vectors.dense(10.0, 15.0, 20.0) scalingUp = ElementwiseProduct()\ .setScalingVec(scaleUpVec)\ .setInputCol("features") scalingUp.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import Normalizer manhattanDistance = Normalizer().setP(1).setInputCol("features") manhattanDistance.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import StringIndexer lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd") idxRes = lblIndxr.fit(simpleDF).transform(simpleDF) idxRes.show() # COMMAND ---------- valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd") valIndexer.fit(simpleDF).transform(simpleDF).show(5) # COMMAND ----------
rddTrain = rddTrain.mapPartitions(lambda x: csv.reader(x))
Result_train = rddTrain.map(
    lambda x: (x[0], x[1], x[2], DenseVector((x[3]).split(',')))).toDF(
        ["index", "url", "productId", "features"])

DataFile_test = spark.textFile('new_test.csv')
Result_test = DataFile_test.map(lambda line: line.split("\n"))\
    .flatMap(lambda words: (word.split(",") for word in words))\
    .zipWithIndex().map(lambda x: (DenseVector(x[0]), x[1])).toDF(["features_test", "index"])

# print Result_train.show()
# print Result_test.show()

normalizer = Normalizer(inputCol="features", outputCol="normFeatures_train", p=2.0)
l1NormData = normalizer.transform(Result_train)
normalizer = Normalizer(inputCol="features_test", outputCol="normFeatures_test", p=2.0)
l1NormData1 = normalizer.transform(Result_test)
# DataFrame.show() prints and returns None, so call it directly.
l1NormData.show()
l1NormData1.show()

Train_DF = l1NormData.createOrReplaceTempView("Result_train1")
# sqlDF_train = sqlContext.sql("select * from Result_train1 limit 1").show()
Test_DF = l1NormData1.createOrReplaceTempView("Result_test1")
# sqlDF_test = sqlContext.sql("select * from Result_test1 limit 1").show()
tags_users_df=sqlContext.createDataFrame(tags_users) print(tags_users_df.take(2)) # # # print('Indexing strings') cVec = CountVectorizer(inputCol='tags', outputCol="tag_features",minDF=10.) model=cVec.fit(tags_users_df) td=model.transform(tags_users_df) with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl',mode='wb') as ff: pkl.dump(model.vocabulary,ff) normalizer=Normalizer(p=1.,inputCol='tag_features',outputCol='tags_normalized') tdNorm=normalizer.transform(td) print(tdNorm.take(5)) tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet') samples=tdNorm.filter(tdNorm.posts_with_tags>10).take(10) #pprint(samples) # stringIndexer = StringIndexer(inputCol="tags", outputCol="indexed_tags") # model=stringIndexer.fit(tags_users_df) # td=model.transform(tags_users_df) # print('Retrieving indices') #
from pyspark.ml.linalg import Vectors dataFrame = spark.createDataFrame([( 0, Vectors.dense([1.0, 0.5, -1.0]), ), ( 1, Vectors.dense([2.0, 1.0, 1.0]), ), ( 2, Vectors.dense([4.0, 10.0, 2.0]), )], ["id", "features"]) # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) l1NormData = normalizer.transform(dataFrame) print("Normalized using L^1 norm") l1NormData.show() # Normalize each Vector using $L^\infty$ norm. lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")}) print("Normalized using L^inf norm") lInfNormData.show() # COMMAND ---------- ###MinMaxScaler (0, 1) from pyspark.ml.feature import MinMaxScaler from pyspark.ml.linalg import Vectors dataFrame = spark.createDataFrame([(
# FloatType must be instantiated, and the udf should return a plain list.
df = df.withColumn('scores', udf(lambda x: list(x.values()), ArrayType(FloatType()))(df.kws))
keywords_length = len(df.select("scores").take(1)[0]["scores"])
assembler = VectorAssembler(
    inputCols=["scores[{}]".format(i) for i in range(keywords_length)],
    outputCol="user_score")
ddf = assembler.transform(
    df.select("*", *(df["scores"].getItem(i) for i in range(keywords_length)))).select("user_score")
normalizer = Normalizer(inputCol="user_score", outputCol="normFeatures", p=2.0)
extended_user_df = normalizer.transform(ddf)
extended_user_df.cache()
seed_user_df = extended_user_df

# LSH Algorithm
brp = BucketedRandomProjectionLSH(inputCol="normFeatures", outputCol="hashes",
                                  bucketLength=bucketLength, numHashTables=numHashTables)
lsh = brp.fit(extended_user_df)
df_users = lsh.approxSimilarityJoin(seed_user_df, extended_user_df,
                                    1 - minimum_similarity_score,
                                    distCol="EuclideanDistance")
df_users = df_users.withColumn(
mx = type_mapping[4]['max'] suggested_type = suggest_data_type(mean, mx) if suggested_type == 'FLOAT': d_type = 1 elif suggested_type == 'INT': d_type = 2 elif suggested_type == 'DATE': d_type = 3 else: d_type = 4 stats_feat = assembler.transform(features) stats_feat_results = stats_feat.select("features") l1NormData = normalizer.transform(stats_feat_results, {normalizer.p: float("inf")}) normfeatures = l1NormData.select("normFeatures").rdd.flatMap(list).collect()[0] features_combined = [(normfeatures, d_type)] final_feat_frame = spark.createDataFrame(features_combined, ["meta_features", "data_type"]) final_results = final_assembler.transform(final_feat_frame) combined_features = final_results.select("final_features").rdd.flatMap(list).collect() features2 = \ spark.sql( """ select case when count({0}) > 0 then cast(count(distinct {0}) as int) else 0 end as dist_cnt,
def getTestData(imgUrl): testDataFeature = requests.post( 'http://ares.styfi.in:81/vs/getproductfeatures/', data=json.dumps({'imgUrl': imgUrl})).content testDataFeature = json.loads(testDataFeature) testData = {} testData['testDataImgFeatures'] = (testDataFeature['imgFeatures']) testData['imgUrl'] = imgUrl a = [json.dumps(testData)] jsonRDD = spark.parallelize(a) df = sparkSess.read.json(jsonRDD) df.show() df.printSchema() to_vector = udf(lambda a: DenseVector(a), VectorUDT()) data = df.select("imgUrl", to_vector("testDataImgFeatures").alias("features")) data.printSchema() data.show() #stestDataFeature = testDataFeature.items() normalizer = Normalizer(inputCol="features", outputCol="normFeatures_test", p=2.0) l1NormData1 = normalizer.transform(data) l1NormData1.show() # rddTrain = spark.textFile("/home/leena/dev/visualsearch/testFinal.csv") # #rddTrain = rddTrain.mapPartitions(lambda x: csv.reader(x)) # #print rddTrain.collect() # Result_train = rddTrain.map(lambda x: x[0]) #Result_test = rddTrain.map(lambda line : line.split("\n")).flatMap(lambda words : [word.split(",") for word in words]).zipWithIndex(); #print Result_test.show() #print Result_train.collect() # df = sqlContext.createDataFrame([json.loads(y) for line in y.iter_lines()]) # df.show() j = {'abc': 1, 'def': 2, 'ghi': 3} a = [json.dumps(j)] jsonRDD = spark.parallelize(a) df = sparkSess.read.json(jsonRDD) df.show() #writeToCsv("/home/leena/dev/visualsearch/testFinal",testDataFeature) # # schema =[ # StructField("URL", StringType(), True), # StructField("Features", DoubleType(), True)] # # df = sparkSess.createDataFrame(jsonRDD, schema) # # df.show() # #print y # df = spark.parallelize(y).toDF() # df.show() # otherPeople = sparkSess.read.json(y) # otherPeople.show() return ""
return wordbag documents = sqlContext.createDataFrame(sc.pickleFile('merged_file/part-00000').map(lambda x : [x['eval_id'],x['no'],create_wordbag(x),x['professor'],x['lec_code'][:4],x['lec_code'][5],x['eval_total'],x['eval_id']]),['eval_id','no','words','prof_name','department','grade','eval_total','eval_id']) #users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x : (x['mb_no'],x['lec_code'][:4])),['user','department']).orderBy('department') #for u in users.select('department','user').take(10000): # print u ''' professors = documents.select('prof_name').distinct() department = documents.select('department').distinct() #grade 1/2/3/4 eval_total = documents.select('eval_total').distinct() # 1/2/3/4/5 for e in eval_total.collect(): print e ''' htf = HashingTF(inputCol= 'words',outputCol = 'rawFeatures') featured = htf.transform(documents) idf = IDF(inputCol = 'rawFeatures',outputCol = 'idf') idfModel = idf.fit(featured) tf_idf = idfModel.transform(featured) normalizer = Normalizer(inputCol = 'idf', outputCol = 'idf_norm', p = 2.0) normData = normalizer.transform(tf_idf) normData.rdd.saveAsPickleFile('idf_normalized')
" ") typedData = csvData for colName in columnsToKeep: typedData = csvData.withColumn( colName, typedData[colName].cast(IntegerType()).alias(colName)) typedData = typedData.na.drop() print(typedData.schema) assembler = VectorAssembler().setInputCols(columnsToKeep).setOutputCol( "features") dataWithFeatures = assembler.transform(typedData) dataWithFeatures.show() normalizer = Normalizer().setInputCol("features").setOutputCol("normFeatures") normData = normalizer.transform(dataWithFeatures) kmeans = KMeans().setK(5).setFeaturesCol("normFeatures") model = kmeans.fit(normData) predictions = model.transform(normData) predictions.select("features", "prediction").show() evaluator = ClusteringEvaluator() silhouette = evaluator.evaluate(predictions) print("Silhouette with squared euclidean distance = " + str(silhouette)) spark.stop()
# Transforming the data using Tokenizer (no fitting because there is nothing to train)
token_df = tokenizer.transform(df)

# Transforming the data using StopWordsRemover (no fitting because there is nothing to train)
stop_df = stopwordsRemover.transform(token_df)

# If you apply CountVectorizer, you have to fit and then transform here;
# if you apply HashingTF, you can transform the data directly.
#tf_df = hashtf.transform(stop_df)
tf_df = hashtf.fit(stop_df).transform(stop_df)

# Fit and transform the data using IDF
idf_df = idf.fit(tf_df).transform(tf_df)

# Important: L2 normalisation is not part of the PySpark IDF output, so we apply it
# explicitly with a Normalizer (see the sketch below).
norm_df = norm.transform(idf_df)

# Clean the data into features; this has to be done before the data is sent for model training
raw_dataset = clean_data.transform(norm_df)

# This step has a drastic effect on training for naive Bayes and logistic regression.
dataset = raw_dataset.withColumn("label", raw_dataset["stars"] - 1)

# Have a look at the preprocessed data
print((dataset.count(), len(dataset.columns)))
dataset.printSchema()
dataset.show()

# Split the dataset into training and testing using a random split
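# The pipeline above uses a `norm` transformer that is defined elsewhere. A minimal
# sketch of how it could be declared (the column names "rawFeatures" and "features"
# are assumptions, not taken from the original):
from pyspark.ml.feature import Normalizer

norm = Normalizer(inputCol="rawFeatures", outputCol="features", p=2.0)  # p=2.0 is also the default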
from pyspark.ml.linalg import Vectors # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("NormalizerExample")\ .getOrCreate() # $example on$ dataFrame = spark.createDataFrame([ (0, Vectors.dense([1.0, 0.5, -1.0]),), (1, Vectors.dense([2.0, 1.0, 1.0]),), (2, Vectors.dense([4.0, 10.0, 2.0]),) ], ["id", "features"]) # Normalize each Vector using $L^1$ norm. normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0) l1NormData = normalizer.transform(dataFrame) print("Normalized using L^1 norm") l1NormData.show() # Normalize each Vector using $L^\infty$ norm. lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")}) print("Normalized using L^inf norm") lInfNormData.show() # $example off$ spark.stop()
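# For reference, with order p the Normalizer rescales each row as
#   x_norm = x / ||x||_p,  where ||x||_p = (sum_i |x_i|^p)^(1/p)  and  ||x||_inf = max_i |x_i|.
# Worked check against the first row above, [1.0, 0.5, -1.0]:
#   L^1 norm = 2.5, so the normalized row is [0.4, 0.2, -0.4];
#   L^inf norm = 1.0, so the row is unchanged.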
def trainModel(self):
    logger.info("Training the model...")
    query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

    def SQLtoURL(query):
        # Collapse newlines, tabs, and runs of spaces so the query can be sent as one line.
        data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(data={'tq': query, },
                                url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
                                headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")
from pyspark.ml.linalg import Vectors from pyspark.ml.feature import VectorAssembler vectorassembler = VectorAssembler( inputCols = ['x','y', 'z'], outputCol = 'features' ) features_vectorized = vectorassembler.transform(encoded) features_vectorized.show() # In[12]: from pyspark.ml.feature import Normalizer normalizer = Normalizer(inputCol = 'features', outputCol='features_norm', p=1.0) normalized_data = normalizer.transform(features_vectorized) normalized_data.show() # In[17]: from pyspark.ml import Pipeline pipeline = Pipeline(stages = [ indexer, encoder, vectorassembler, normalizer ]) model = pipeline.fit(df) prediction = model.transform(df) prediction.show()
# Here the words have indeed been replaced by sparse vectors
print "DataFrame(1-gram): the words have been replaced here by sparse vectors"
dfVect.show()

udfVectorizeBi = UserDefinedFunction(lambda x: vectorizeBi(x), VectorUDT())
dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print "DataFrame(bi-gram): the words have been replaced here by sparse vectors"
dfVect2.show()

# For natural language processing it is customary to L2-normalize the feature
# vectors: that is apparently what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalized"
dfNorm2.select('words', 'normWords').show()

# The difference does not show in the table, because there is only room to display
# the indices of the non-zero elements, not their values.

# Moving on to TF-IDF.
# Of course, by picking the right DataFrame among those above, these computations can
# be applied to any column (bigrams, with or without stop words, ...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
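# Because the truncated display hides the vector values, a quick way to confirm the
# normalization (a small sketch reusing dfNorm2 from above and assuming numpy is
# available) is to compare the L2 norms before and after:
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

l2_norm = udf(lambda v: float(np.linalg.norm(v.toArray())), DoubleType())
dfNorm2.select(l2_norm('words').alias('raw_norm'),
               l2_norm('normWords').alias('normalized_norm')).show(5)
# Every non-empty document should report normalized_norm = 1.0.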