def normalize(self):
    from pyspark.ml.feature import Normalizer
    from pyspark.ml.linalg import Vectors, VectorUDT
    from pyspark.sql import functions as F

    df = self.session.createDataFrame(
        [(0, [1.0, 0.5, -1.0]),
         (1, [2.0, 1.0, 1.0]),
         (2, [4.0, 10.0, 2.0])],
        ["id", "features"])

    # Normalizer works on Vector columns, so convert the array column to a DenseVector first.
    @F.udf(returnType=VectorUDT())
    def vectorize_from_array(a):
        return Vectors.dense(a)

    df = df.withColumn("features", vectorize_from_array(F.col("features")))

    # Normalize each Vector using the L^1 norm.
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    l1NormData = normalizer.transform(df)
    print("Normalized using L^1 norm")
    l1NormData.show()

    # Normalize each Vector using the L^inf norm.
    lInfNormData = normalizer.transform(df, {normalizer.p: float("inf")})
    print("Normalized using L^inf norm")
    lInfNormData.show()
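# Note (sketch, not part of the original snippet): on Spark 3.1+ the array-to-vector
# conversion above can be done without a Python UDF via pyspark.ml.functions.array_to_vector.
# Assumes the same DataFrame `df` with an array<double> column named "features".
#
# from pyspark.ml.functions import array_to_vector  # available in Spark 3.1+
# df = df.withColumn("features", array_to_vector("features"))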
def get_sim(tfidf, threshold, save_dir):
    # L2-normalize the TF-IDF vectors so that a dot product equals cosine similarity.
    normalizer = Normalizer(inputCol='features', outputCol='norm')
    data = normalizer.transform(tfidf)

    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
    sim_df = (
        data.alias('i')
        .join(data.alias('j'), col('i.id') < col('j.id'))
        .select(
            col('i.id').alias('i'),
            col('j.id').alias('j'),
            dot_udf('i.norm', 'j.norm').alias('similarity'))
        # .sort('i', 'j')
    )
    sim_df_filtered = sim_df.filter(col('similarity').between(threshold, 1.0))

    edges = [(row.i, row.j, row.similarity) for row in sim_df_filtered.collect()]
    print('Edges: {}'.format(len(edges)))

    vertices = set()
    for e in edges:
        vertices.add(e[0])
        vertices.add(e[1])
    vertices = [(v,) for v in list(vertices)]

    doc_sim = {'edges': edges, 'vertices': vertices}
    pkl.dump(doc_sim, open(os.path.join(save_dir, 'doc_sim.pkl'), 'wb'))
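# Why the dot product works here (illustrative check, not from the original code): once the
# vectors have unit L2 norm, their dot product is exactly cosine similarity. The vectors below
# are made up for the example.
from pyspark.ml.linalg import Vectors

a = Vectors.dense([3.0, 4.0])
b = Vectors.dense([4.0, 3.0])

# Usual cosine formula.
cos = float(a.dot(b)) / (a.norm(2) * b.norm(2))
# Dot product of the hand-normalized vectors.
a_unit = Vectors.dense([v / a.norm(2) for v in a])
b_unit = Vectors.dense([v / b.norm(2) for v in b])
assert abs(cos - float(a_unit.dot(b_unit))) < 1e-9  # both equal 24/25 = 0.96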
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder

    return model, words_prediction
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VECTOR_SIZE = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
                        SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
                        SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    word2Vec = Word2Vec(vectorSize=VECTOR_SIZE, minCount=0, inputCol="filtered", outputCol="vectors")
    model = word2Vec.fit(removed)
    resultDF = model.transform(removed)

    # L2-normalize the Word2Vec document vectors.
    normalizer = Normalizer(inputCol="vectors", outputCol="result", p=2)
    normData = normalizer.transform(resultDF)
    normData.registerTempTable("resultTable")

    jobs = spark.sql("SELECT result AS jobsVec, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT result AS cvsVec, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT result AS categoriesVec, cat.id, cat.skillName, category FROM resultTable AS rt \
                            LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    # Calculate job-cv similarity START
    crossJoined_job_cv = jobs.crossJoin(cvs)
    calculated_job_cv = crossJoined_job_cv.rdd \
        .map(lambda x: (x.jobId, x.cvid, calculate_distance(x.jobsVec, x.cvsVec))) \
        .toDF(["jobid", "cvid", "distance"]).orderBy(asc("jobid")).coalesce(2)
    calculated_job_cv.write.csv('Calculated/word2vec2/job-cv')
    # Calculate job-cv similarity END

    # Calculate cv-category similarity START
    crossJoined_cv_cat = cvs.crossJoin(categories)
    calculated_cv_cat = crossJoined_cv_cat.rdd \
        .map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.cvsVec, x.categoriesVec))) \
        .toDF(["cvid", "category_id", "skillName", "category", "distance"]) \
        .orderBy(asc("cvid"), asc("distance")).coalesce(2)
    calculated_cv_cat.write.csv('Calculated/word2vec2/cv-category')
    # Calculate cv-category similarity END

    # Job-category START
    crossJoined_job_cat = jobs.select("jobId", "jobsVec") \
        .crossJoin(categories.select("id", "skillName", "category", "categoriesVec"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd \
        .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.jobsVec, x.categoriesVec))) \
        .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy(asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/word2vec2/job-category')
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):
    tokenized = Tokenizer(inputCol="text", outputCol="words").transform(sentenceDataFrame)
    ngramDataFrame = NGram(n=ngrams, inputCol="words", outputCol="ngrams").transform(tokenized)

    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")
    countVectModel = countVect.fit(ngramDataFrame)
    featurizedData = countVectModel.transform(ngramDataFrame)

    idf = IDF(minDocFreq=minDocFreq, inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features")

    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)
    return X
def test_model_normalizer_2(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense(1.0, 0.5, -1.0)),
        (1, Vectors.dense(2.0, 1.0, 1.0)),
        (2, Vectors.dense(4.0, 10.0, 2.0))
    ]).toDF("id", "features")
    model = Normalizer(inputCol='features', outputCol='norm_feature', p=2.0)

    model_onnx = convert_sparkml(model, 'Sparkml Normalizer', [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().norm_feature.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlNormalizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['norm_feature'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def get_product_similarity(self):
    """
    Calculate the similarity between items/users
    """
    product_taxonomy = self.data.select(self.productCol, self.taxonomyCol).distinct()
    product_taxonomy = self.__data_manipulation(product_taxonomy)

    hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
    tf = hashingTF.transform(product_taxonomy)

    idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
    tfidf = idf.transform(tf)

    normalizer = Normalizer(inputCol="feature", outputCol="norm")
    norma_data = normalizer.transform(tfidf)

    col1 = "i." + self.productCol
    col2 = "j." + self.productCol
    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())

    result = norma_data.alias("i").crossJoin(norma_data.alias("j")) \
        .select(
            col(col1).alias("i"),
            col(col2).alias("j"),
            dot_udf("i.norm", "j.norm").alias("dot")) \
        .sort("i", "j")

    # '&' binds tighter than the comparisons, so each condition needs its own parentheses.
    result = result.filter((result.i < result.j) & (result.dot > 0.5))
    return result
def __data_manipulation(self, col):
    data = self.data.select(col, self.taxonomyCol).distinct()
    data = data.withColumn(self.taxonomyCol, data[self.taxonomyCol].cast(StringType()))

    concat_list = udf(lambda lst: ", ".join(lst), StringType())
    data = data.groupby(col).agg(collect_list(self.taxonomyCol).alias(self.taxonomyCol))
    data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
    data = data.withColumn(self.taxonomyCol, split(regexp_replace(self.taxonomyCol, " ", ""), ','))

    hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
    tf = hashingTF.transform(data)

    idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
    tfidf = idf.transform(tf)

    normalizer = Normalizer(inputCol="feature", outputCol="norm")
    norma_data = normalizer.transform(tfidf)
    return norma_data
def execute():
    data = spark.read.csv('/Users/brillap/downloads/kc_house_data.csv', header=True, inferSchema=True)
    # data = spark.read.csv('hdfs://hadoop-master:9000/user/root/kc_house_data.csv', header=True, inferSchema=True)

    assembler = VectorAssembler() \
        .setInputCols(["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors"]) \
        .setOutputCol("features") \
        .transform(data)
    # assembler.show()

    normalizer = Normalizer() \
        .setInputCol("features") \
        .setOutputCol("normFeatures") \
        .setP(2.0) \
        .transform(assembler)
    # normalizer.show()

    linear_regression = LinearRegression() \
        .setLabelCol("price") \
        .setFeaturesCol("normFeatures") \
        .setMaxIter(10) \
        .setRegParam(1.0) \
        .setElasticNetParam(1.0)

    result_array = normalizer.randomSplit([0.7, 0.3])
    lr_model = linear_regression.fit(result_array[0])

    training_summary = lr_model.summary
    print("RMSE: %f" % training_summary.rootMeanSquaredError)

    predicted_data = lr_model.transform(result_array[1]).select("features", "normFeatures", "price", "prediction")
    predicted_data.show()
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("The p_norm has to be a float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembler = VectorAssembler(inputCols=inputColNames,
                                    outputCol="features")
        assembledDF = assembler.transform(dataFrame)
        normalizer = Normalizer(inputCol="features",
                                outputCol=outputColName,
                                p=p_norm)
        normalizedDF = normalizer.transform(assembledDF)
        colList = ""
        for inputColName in inputColNames:
            colList += " '" + inputColName + "' "
        if p_norm == float('inf'):
            print("Successfully assembled the columns{0:s}into a feature vector, normalized it using the L^inf norm, "
                  "and created two new columns 'features' and 'normalized features'.".format(colList))
        else:
            print("Successfully assembled the columns{0:s}into a feature vector, normalized it using the L^{1:f} norm, "
                  "and created two new columns 'features' and 'normalized features'.".format(colList, p_norm))
        return normalizedDF
    else:
        raise ValueError("inputColNames has to be a list of columns to assemble into a feature vector before normalization.")
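# Illustrative call of the helper above (the DataFrame and column names are made up for the example):
#
# df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["height", "weight"])
# l2_df = normalize(df, ["height", "weight"])            # default L^2 norm
# linf_df = normalize(df, ["height", "weight"], "inf")   # max norm via the string "inf"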
def getNormalizer(self, dataFrameFeatures, outputColName):
    # define a Normalizer to produce normalized features
    normalized = Normalizer(inputCol="features", outputCol=outputColName, p=2.0)
    # get the normalized features
    normData = normalized.transform(dataFrameFeatures)
    return normData
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)

    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()

    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(df).transform(df).cache()
    return df
def get_feature_vector(input_df):
    assembler_1 = VectorAssembler(inputCols=["Open", "Close"], outputCol="stock_features")
    # Normalizer defaults to p=2.0, i.e. each [Open, Close] vector is scaled to unit L2 norm.
    scaler = Normalizer(inputCol="stock_features", outputCol="scaled_stock_features")
    assembled_df = assembler_1.transform(input_df)
    scaled_stock = scaler.transform(assembled_df).drop('stock_features')

    assembler_2 = VectorAssembler(inputCols=["scaled_stock_features", "Sentiment"], outputCol="features")
    final_df = assembler_2.transform(scaled_stock)
    return final_df.drop('scaled_stock_features')
def termFrequency(table):
    # calculates the term frequency of attributes
    hashingTF = HashingTF(inputCol='key_words', outputCol='hashing')
    tf = hashingTF.transform(table)
    tf.cache()

    # normalises the term frequency data
    normalizer = Normalizer(inputCol='hashing', outputCol='norm')
    term = normalizer.transform(tf)
    return term
def normalizedDF(self):
    from pyspark.ml.feature import Normalizer

    assembler = VectorAssembler(inputCols=varNames, outputCol="features")
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2)  # p is the order of the norm
    pipeline = Pipeline(stages=[assembler, normalizer])
    # fit() alone only returns the PipelineModel; transform() is needed to get the normalized DataFrame
    self.df_norm = pipeline.fit(vecdf).transform(vecdf)

    # Normalize each Vector using the L^inf norm by overriding p at transform time.
    lInfNormData = normalizer.transform(assembler.transform(vecdf), {normalizer.p: float("inf")})
    return 0
def getNormalizerTest(dataFrameFeatures, outputColName):
    print("inside normalizer test")
    print(dataFrameFeatures)
    # define a Normalizer to produce normalized features
    normalized = Normalizer(inputCol="features", outputCol=outputColName, p=2.0)
    # get the normalized features
    normData = normalized.transform(dataFrameFeatures)
    # normData.show()
    return normData
def transform_input(input_text):
    '''Tokenize and clean a single text string, then vectorize it with the fitted Word2Vec pipeline.'''
    lines = [(input_text, )]
    df = spark.createDataFrame(lines, ['text'])

    def removePunctuation(text):
        text = text.lower().strip()
        text = re.sub('[^0-9a-zA-Z ]', '', text)
        return text

    remove_punt_udf = udf(removePunctuation, StringType())
    tokenizer = Tokenizer(inputCol='text_noPunct', outputCol='token_text')
    df_new = df.withColumn('text_noPunct', remove_punt_udf('text'))
    df_new = tokenizer.transform(df_new)

    def remove_blank_token(text):
        text = list(filter(lambda x: x != '', text))
        return text

    remove_blank_token_udf = udf(remove_blank_token, ArrayType(StringType()))
    df_new = df_new.withColumn('token_text', remove_blank_token_udf('token_text'))

    sw_remover = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
    normalizer = Normalizer(inputCol='w2v', outputCol='w2v_norm')
    # w2v_model is a pre-fitted Word2VecModel that produces the 'w2v' column
    pipe = PipelineModel(stages=[sw_remover, w2v_model, normalizer])
    df_final = pipe.transform(df_new)
    return df_final
def normalizer(df, input_cols, p=2.0): """ Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which specifies the p-norm used for normalization. (p=2) by default. :param df: Dataframe to be transformed :param input_cols: Columns to be normalized. :param p: p-norm used for normalization. :return: Dataframe with normalized columns. """ # Check if columns argument must be a string or list datatype: if is_(input_cols, [str, list]): RaiseIt.type_error(input_cols, [str, list]) if is_str(input_cols): input_cols = [input_cols] if is_(input_cols, [float, int]): RaiseIt.type_error(input_cols, [float, int]) df = df.cols.cast(input_cols, "vector") normal = [ Normalizer(inputCol=column, outputCol=column + "_normalized", p=p) for column in list(set(input_cols)) ] pipeline = Pipeline(stages=normal) df = pipeline.fit(df).transform(df) return df
def tfidf_top_tokens(df, token_cols, min_freq=1):
    output = df
    for c in token_cols:
        pre = c
        cv = CountVectorizer(inputCol=pre, outputCol=pre + '_rawFeatures', minDF=min_freq)
        idf = IDF(inputCol=pre + "_rawFeatures", outputCol=pre + "_features", minDocFreq=min_freq)
        normalizer = Normalizer(p=2.0, inputCol=pre + "_features", outputCol=pre + '_tfidf')

        stages = [cv, idf, normalizer]
        pipeline = Pipeline(stages=stages)
        model = pipeline.fit(output)
        output = model.transform(output) \
            .drop(pre + '_rawFeatures', pre + '_features')

        cvModel = model.stages[0]
        vocab = spark.sparkContext.broadcast(cvModel.vocabulary)
        output = output.withColumn(
            pre + '_top_tokens',
            top_kw_from_tfidf(vocab, n=5)(f.col(pre + "_tfidf")))
    return output
def normalizer(df, input_cols, p=2.0): """ Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which specifies the p-norm used for normalization. (p=2) by default. :param df: Dataframe to be transformed :param input_cols: Columns to be normalized. :param p: p-norm used for normalization. :return: Dataframe with normalized columns. """ # Check if columns argument must be a string or list datatype: if is_(input_cols, [str, list]): RaiseIt.type_error(input_cols, [str, list]) if is_str(input_cols): input_cols = [input_cols] if is_(input_cols, [float, int]): RaiseIt.type_error(input_cols, [float, int]) df = df.cols.cast(input_cols, "vector") # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/ normal = [ Normalizer(inputCol=col_name, outputCol=name_col(col_name, "normalized"), p=p) for col_name in list(set(input_cols)) ] pipeline = Pipeline(stages=normal) df = pipeline.fit(df).transform(df) return df
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("The p_norm has to be a float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
        normalizer = Normalizer(inputCol="features",
                                outputCol=outputColName,
                                p=p_norm)
        normalizedDF = normalizer.transform(assembledDF).drop("features")
        return normalizedDF
    else:
        raise ValueError(
            "inputColNames has to be a list of columns to assemble into a feature vector before normalization.")
def train_and_save_model_df(sc_local):
    trainingData = sc_local.textFile(FILE_TRAINING_DATA) \
        .flatMap(lambda line: parse_apache_log_line(line))
    data = trainingData.toDF()

    indexers = [
        StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid="keep")
        for c in ['endpoint', 'method']
    ]
    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]
    assembler = VectorAssembler(
        inputCols=['response_code'] + [encoder.getOutputCol() for encoder in encoders],
        outputCol='features')

    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    transform_model = pipeline.fit(data)
    output = transform_model.transform(data)

    remove_existing_model(TRANSFORM_MODEL_LOCATION)
    transform_model.save(TRANSFORM_MODEL_LOCATION)

    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    output = normalizer.transform(output)

    kmeans = pyspark.ml.clustering.KMeans().setK(2).setSeed(1)
    model = kmeans.fit(output)

    remove_existing_model(MODEL_LOCATION)
    model.save(MODEL_LOCATION)

    predictions = model.transform(output)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print('Silhouette: ', silhouette)
    costs = model.computeCost(output)
    print('Costs: ', costs)
def term_frequency(self):
    # TODO: save vocabulary to firebase
    beers = self.beer_reviews
    cv = CountVectorizer(inputCol='lemmatized_tokens', outputCol='features_tf', vocabSize=7500)
    # cv_model = cv.fit(self.beer_reviews)
    # self.beer_reviews = cv_model.transform(self.beer_reviews)
    cv_model = cv.fit(beers)
    # self.beer_reviews = cv_model.transform(beers)
    beers = cv_model.transform(beers)

    self.vocabulary = {
        idx: val.encode('utf-8')
        for idx, val in enumerate(cv_model.vocabulary)
    }

    normalizer = Normalizer(inputCol='features_tf', outputCol='features_normalized')
    # self.beer_reviews = normalizer.transform(self.beer_reviews)
    self.beer_reviews = normalizer.transform(beers)
def execute():
    input_data = spark.read.csv('hdfs://hadoop-master:9000/kc_house_data.csv', header=True, inferSchema=True)
    # input_data = spark.read.csv('/Users/krithikab/Desktop/PRACT/SparkML/kc_house_data.csv', header=True, inferSchema=True)

    data = input_data \
        .filter(input_data.price > 0) \
        .withColumn("age", 2020 - input_data.yr_built) \
        .drop_duplicates()

    assembler = VectorAssembler() \
        .setInputCols(["bedrooms", "bathrooms", "sqft_living", "floors", "condition", "sqft_lot",
                       "waterfront", "view", "grade", "sqft_above", "sqft_basement", "age",
                       "zipcode", "lat", "long", "sqft_living15", "sqft_lot15"]) \
        .setOutputCol("features") \
        .transform(data)

    normalizer = Normalizer() \
        .setInputCol("features") \
        .setOutputCol("normFeatures") \
        .transform(assembler)

    linear_regression = LinearRegression() \
        .setLabelCol("price") \
        .setFeaturesCol("normFeatures") \
        .setMaxIter(10) \
        .setRegParam(1.0) \
        .setElasticNetParam(1.0)

    result_array = normalizer.randomSplit([0.7, 0.3])
    lr_model = linear_regression.fit(result_array[0])

    predicted_data = lr_model.transform(result_array[1]).select(
        "features", "normFeatures", "price", "prediction")
    # predicted_data.select("price", "prediction").write.csv("result.csv")
    predicted_data.select("price", "prediction").write.csv("hdfs://hadoop-master:9000/prediction.csv")
def predict(team_a, team_b):
    col = ['player_name', 'Usg%', 'Per', 'time']
    dataA = []
    dataB = []
    for player in team_a:
        playerInfo = team_a[player]
        name = playerInfo[0]
        time = int(playerInfo[1])
        Usgp = pd_df.loc[pd_df['player_name'] == name, 'Usg%'].values[0]
        Per = pd_df.loc[pd_df['player_name'] == name, 'Per'].values[0]
        dataA.append((name, Usgp, Per, time))
    # print(dataA)

    for player in team_b:
        playerInfo = team_b[player]
        name = playerInfo[0]
        # print(name)
        time = int(playerInfo[1])
        Usgp = pd_df.loc[pd_df['player_name'] == name, 'Usg%'].values[0]
        Per = pd_df.loc[pd_df['player_name'] == name, 'Per'].values[0]
        dataB.append((name, Usgp, Per, time))
    # print(dataB)

    col = ['player_name', 'Usg%', 'Per', 'time']
    pd_tmp = pandas.DataFrame(dataA, columns=col)
    df_teamA = sqlContext.createDataFrame(pd_tmp)
    pd_tmp = pandas.DataFrame(dataB, columns=col)
    df_teamB = sqlContext.createDataFrame(pd_tmp)
    # df_teamA.show()
    # df_teamB.show()

    vector = VectorAssembler(inputCols=['Usg%', 'Per', 'time'], outputCol='features')
    normalizer = Normalizer(p=2.0, inputCol="features", outputCol="norm_test")
    pipeline = Pipeline(stages=[vector, normalizer])

    pipeline_fit = pipeline.fit(df_teamA)
    df_A = pipeline_fit.transform(df_teamA)
    # df_A.show()
    pipeline_fit = pipeline.fit(df_teamB)
    df_B = pipeline_fit.transform(df_teamB)
    # df_B.show()

    model = cl.RandomForestClassificationModel.load('Model_v3_0')
    predictions_A = model.transform(df_A)
    # predictions_A.show()
    predictions_B = model.transform(df_B)
    # predictions_B.show()

    percentageA = 100 / predictions_A.count()
    percentageB = 100 / predictions_B.count()
    a = predictions_A.where(predictions_A['prediction'] == 1).count()
    b = predictions_B.where(predictions_B['prediction'] == 0).count()
    percentage = (a * percentageA + b * percentageB) / 2
    return percentage
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes a parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """
    # The columns argument must be a string or a list, and p must be numeric:
    assert isinstance(input_cols, (str, list)), \
        "Error: %s argument must be a string or a list." % "input_cols"

    if isinstance(input_cols, str):
        input_cols = [input_cols]

    assert isinstance(p, (float, int)), "Error: p argument must be a numeric value."

    # Convert ArrayType() column to DenseVector
    def arr_to_vec(arr_column):
        """
        :param arr_column: Column name
        :return: Returns DenseVector by converting an ArrayType() column
        """
        return DenseVector(arr_column)

    # User-Defined function
    # TODO: use apply() to use Pyarrow
    udf_arr_to_vec = F.udf(arr_to_vec, VectorUDT())

    # Check for columns which are not DenseVector types and convert them into DenseVector
    for col in input_cols:
        if not is_(df[col], DenseVector):
            df = df.withColumn(col, udf_arr_to_vec(df[col]))

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
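# Sketch of how a helper like the one above might be called, assuming a DataFrame with an
# array<double> column named "features" (the data is made up for illustration):
#
# df = spark.createDataFrame([([1.0, 0.5, -1.0],), ([2.0, 1.0, 1.0],)], ["features"])
# out = normalizer(df, "features", p=2.0)   # adds a 'features_normalized' unit-norm column
# out.show(truncate=False)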
def get_feature_eng_stages(categoricalColumns, label="has_heart_disease"):
    stages = []  # stages in our Pipeline
    for categoricalCol in categoricalColumns:
        # Category Indexing with StringIndexer
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
        # Use OneHotEncoder to convert categorical variables into binary SparseVectors
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                         outputCols=[categoricalCol + "classVec"])
        # Add stages. These are not run here, but will run all at once later on.
        stages += [stringIndexer, encoder]

    label_stringIdx = StringIndexer(inputCol=label, outputCol="label")
    stages += [label_stringIdx]

    numericCols = ["age", "is_smoker"]
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="raw_features")
    normalizer = Normalizer(inputCol="raw_features", outputCol="features", p=1.0)
    stages += [assembler, normalizer]
    return stages
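# The function above only returns transformer stages; they still have to be wrapped in a
# Pipeline and fitted. A hedged sketch of that wiring (the categorical column names, the
# choice of LogisticRegression, and train_df/test_df are assumptions, not from the original):
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

stages = get_feature_eng_stages(["sex", "chest_pain_type"])  # hypothetical categorical columns
lr = LogisticRegression(featuresCol="features", labelCol="label")

pipeline = Pipeline(stages=stages + [lr])
model = pipeline.fit(train_df)          # train_df is assumed to exist
predictions = model.transform(test_df)  # test_df is assumed to exist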
df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv', header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])
training = df.selectExpr("cast(itemID as int) id", "SentimentText", "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features", p=1.0)
# lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)

pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NormalizerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.5, -1.0]),),
        (1, Vectors.dense([2.0, 1.0, 1.0]),),
        (2, Vectors.dense([4.0, 10.0, 2.0]),)
    ], ["id", "features"])

    # Normalize each Vector using $L^1$ norm.
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    l1NormData = normalizer.transform(dataFrame)
    print("Normalized using L^1 norm")
    l1NormData.show()

    # Normalize each Vector using $L^\infty$ norm.
    lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
    print("Normalized using L^inf norm")
    lInfNormData.show()
    # $example off$

    spark.stop()
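# For reference, the arithmetic behind the example above: with p=1 each vector is divided by the
# sum of absolute values, and with p=inf by the maximum absolute value, so normFeatures becomes:
#
# L^1:   [1.0, 0.5, -1.0] / 2.5  -> [0.4, 0.2, -0.4]
#        [2.0, 1.0,  1.0] / 4.0  -> [0.5, 0.25, 0.25]
#        [4.0, 10.0, 2.0] / 16.0 -> [0.25, 0.625, 0.125]
# L^inf: [1.0, 0.5, -1.0] / 1.0  -> [1.0, 0.5, -1.0]
#        [2.0, 1.0,  1.0] / 2.0  -> [1.0, 0.5, 0.5]
#        [4.0, 10.0, 2.0] / 10.0 -> [0.4, 1.0, 0.2]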
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.5, -1.0]),),
    (1, Vectors.dense([2.0, 1.0, 1.0]),),
    (2, Vectors.dense([4.0, 10.0, 2.0]),)
], ["id", "features"])

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show()

# COMMAND ----------

### MinMaxScaler (0, 1)
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
df_energy.createOrReplaceTempView('df_energy')

# In[6]:

df_join = spark.sql("""
    select * from df
    inner join df_energy on df.class = df_energy.class
""")
df_join.show()

# In[7]:

from pyspark.ml.feature import VectorAssembler, Normalizer

vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"], outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

# In[8]:

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# In[9]:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vectorAssembler, normalizer, lr])

# In[10]:

model = pipeline.fit(df_join)
# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
    .setScalingVec(scaleUpVec)\
    .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import Normalizer

manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()
tags_users_df = sqlContext.createDataFrame(tags_users)
print(tags_users_df.take(2))

# print('Indexing strings')
cVec = CountVectorizer(inputCol='tags', outputCol="tag_features", minDF=10.)
model = cVec.fit(tags_users_df)
td = model.transform(tags_users_df)

with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl', mode='wb') as ff:
    pkl.dump(model.vocabulary, ff)

normalizer = Normalizer(p=1., inputCol='tag_features', outputCol='tags_normalized')
tdNorm = normalizer.transform(td)
print(tdNorm.take(5))

tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')

samples = tdNorm.filter(tdNorm.posts_with_tags > 10).take(10)
# pprint(samples)

# stringIndexer = StringIndexer(inputCol="tags", outputCol="indexed_tags")
# model = stringIndexer.fit(tags_users_df)
# td = model.transform(tags_users_df)
# print('Retrieving indices')
def build_model(df_ml):
    '''
    Function builds machine learning model to predict churn

    INPUT:
    df_ml - dataset which contains user features to predict customer churn

    OUTPUT:
    model - model which predicts customer churn
    '''
    # split into train, test and validation sets (60% - 20% - 20%)
    df_ml = df_ml.withColumnRenamed("churn", "label")
    train, test_valid = df_ml.randomSplit([0.6, 0.4], seed=42)
    test, validation = test_valid.randomSplit([0.5, 0.5], seed=42)

    # index and encode categorical features gender, level and state
    stringIndexerGender = StringIndexer(inputCol="gender", outputCol="genderIndex", handleInvalid='skip')
    stringIndexerLevel = StringIndexer(inputCol="last_level", outputCol="levelIndex", handleInvalid='skip')
    stringIndexerState = StringIndexer(inputCol="last_state", outputCol="stateIndex", handleInvalid='skip')
    encoder = OneHotEncoderEstimator(
        inputCols=["genderIndex", "levelIndex", "stateIndex"],
        outputCols=["genderVec", "levelVec", "stateVec"],
        handleInvalid='keep')

    # create vector for features
    features = ['genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs',
                'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend']
    assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")

    # normalize features
    normalizer = Normalizer(inputCol="rawFeatures", outputCol="features", p=1.0)

    # initialize random forest classifier with tuned hyperparameters
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100,
                                impurity='gini', maxDepth=5, featureSubsetStrategy='sqrt')

    # assemble pipeline
    pipeline = Pipeline(stages=[stringIndexerGender, stringIndexerLevel, stringIndexerState,
                                encoder, assembler, normalizer, rf])

    # fit model
    model = pipeline.fit(train)

    # predict churn
    pred_train = model.transform(train)
    pred_test = model.transform(test)
    pred_valid = model.transform(validation)

    # evaluate results
    predictionAndLabels = pred_train.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # print F1-score
    print("F1 score on train dataset is %s" % metrics.fMeasure())

    predictionAndLabels = pred_test.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # F1 score
    print("F1 score on test dataset is %s" % metrics.fMeasure())

    predictionAndLabels = pred_valid.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # F1 score
    print("F1 score on validation dataset is %s" % metrics.fMeasure())

    return model
    return wordbag

documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'], x['no'], create_wordbag(x), x['professor'],
                   x['lec_code'][:4], x['lec_code'][5], x['eval_total'], x['eval_id']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade', 'eval_total', 'eval_id'])

# users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x: (x['mb_no'], x['lec_code'][:4])),
#                                    ['user', 'department']).orderBy('department')
# for u in users.select('department', 'user').take(10000):
#     print(u)
'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()   # grade 1/2/3/4
eval_total = documents.select('eval_total').distinct()    # 1/2/3/4/5
for e in eval_total.collect():
    print(e)
'''

htf = HashingTF(inputCol='words', outputCol='rawFeatures')
featured = htf.transform(documents)

idf = IDF(inputCol='rawFeatures', outputCol='idf')
idfModel = idf.fit(featured)
tf_idf = idfModel.transform(featured)

normalizer = Normalizer(inputCol='idf', outputCol='idf_norm', p=2.0)
normData = normalizer.transform(tf_idf)

normData.rdd.saveAsPickleFile('idf_normalized')
for i in range(len(pred)):
    if pred[i] == list(Y_test)[i]:
        corr += 1
    else:
        pass
print('Accuracy: ' + str(corr * 1.01 / len(pred) / 1.01))

# Build a BP (multilayer perceptron) neural network model on Spark
from pyspark.sql import Row
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

lines = sc.textFile("hdfs:///lushun/a.txt")
parts = lines.map(lambda l: l.split(" "))
# Normalizer expects a Vector column, so build a DenseVector from the numeric fields.
df = parts.map(lambda p: Row(features=Vectors.dense([float(x) for x in p[:-1]]), label=int(p[-1])))
df = spark.createDataFrame(df)
df.createOrReplaceTempView("df")

normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(df)
l1NormData.createOrReplaceTempView("l1NormData")
l1NormData = spark.sql("SELECT label, normFeatures FROM l1NormData")
l1NormData.show()

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

splits = l1NormData.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
layers = [36300, 200, 200, 6]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, seed=1234,
                                         featuresCol="normFeatures", labelCol="label")
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
def trainModel(self):
    logger.info("Training the model...")
    query = '''select page_id, max(page_title) as page_title
               from cooladata
               where date_range(all) and page_id is not null
               group by page_id;'''

    def SQLtoURL(query):
        # flatten the query into a single line and collapse repeated whitespace
        data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query},
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'})
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

vectorassembler = VectorAssembler(
    inputCols=['x', 'y', 'z'],
    outputCol='features'
)

features_vectorized = vectorassembler.transform(encoded)
features_vectorized.show()

# In[12]:

from pyspark.ml.feature import Normalizer

normalizer = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)
normalized_data = normalizer.transform(features_vectorized)
normalized_data.show()

# In[17]:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    indexer,
    encoder,
    vectorassembler,
    normalizer
])

model = pipeline.fit(df)
prediction = model.transform(df)
# Instead, we create a second DataFrame to which we add the column we want.
dfVect = dfBigram.withColumn("words", udfVectorizeUni("words"))

# Here the words have indeed been replaced by their sparse vectors.
print("DataFrame (1-gram): words replaced by their sparse vectors")
dfVect.show()

udfVectorizeBi = UserDefinedFunction(lambda x: vectorizeBi(x), VectorUDT())
dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print("DataFrame (bi-gram): words replaced by their sparse vectors")
dfVect2.show()

# For natural-language processing it is customary to L2-normalize the feature vectors:
# this is apparently what works best.
from pyspark.ml.feature import Normalizer

normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print("DataFrame (bi-gram): normalized")
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, because only the indices of the
# non-zero elements are displayed, not their values.

# On to TF-IDF.
# By choosing the right DataFrame among those above, these computations can of course be
# applied to any column (bigrams, with or without stop words, ...).
from pyspark.ml.feature import HashingTF

htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF