def main(spark, data_file, model_file, user_file, track_file):
    df = spark.read.parquet(data_file)

    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_idx", handleInvalid="keep")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_idx", handleInvalid="keep")
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df)
    df = mapping.transform(df)

    # Create and fit an ALS model on the implicit feedback counts
    als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
              ratingCol="count", userCol="user_idx", itemCol="track_idx")
    als_model = als.fit(df)

    # Save the trained ALS model
    als_model.write().overwrite().save(model_file)
    print("Model successfully saved to HDFS")

    # Save the fitted StringIndexer models (the fitted pipeline stages hold the
    # string-to-index mappings needed again at inference time)
    mapping.stages[0].write().overwrite().save(user_file)
    mapping.stages[1].write().overwrite().save(track_file)
    print("String indexers successfully saved to HDFS")
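# A minimal follow-up sketch (the function name and paths are assumptions, not
# part of the original script): reload the artifacts that main() wrote and
# generate recommendations. ALSModel and StringIndexerModel are the standard
# pyspark.ml loaders for the objects saved above.
from pyspark.ml.feature import StringIndexerModel
from pyspark.ml.recommendation import ALSModel

def load_and_recommend(model_file, track_file, n=10):
    als_model = ALSModel.load(model_file)
    track_index = StringIndexerModel.load(track_file)
    # Recommendations come back in the indexed id space; track_index.labels
    # maps each track_idx back to the original track_id string.
    recs = als_model.recommendForAllUsers(n)
    return recs, track_index.labels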
def encode_strings(df, cols, fname) -> pyspark.sql.DataFrame:
    """Fit a StringIndexer for each column in `cols`, transform `df`, and
    persist both the indexer and its fitted model."""
    for col in cols:
        indexer = StringIndexer(inputCol=f"{col}",
                                outputCol=f"encoded_{col}",
                                stringOrderType="alphabetAsc")
        model = indexer.fit(df)
        df = model.transform(df)

        # Write the indexer and model to a TEMP directory first, then move
        # them into our project's subdirectory.
        indexer.write().overwrite().save(
            f"{config.TEMP}/{fname}_stringindexer_{col}")
        model.write().overwrite().save(
            f"{config.TEMP}/{fname}_stringindexer_model_{col}")
        shutil.move(
            f"{config.TEMP}/{fname}_stringindexer_{col}",
            f"{config.SPARK_MODELS}/{fname}/stringindexer_{col}",
        )
        shutil.move(
            f"{config.TEMP}/{fname}_stringindexer_model_{col}",
            f"{config.SPARK_MODELS}/{fname}/stringindexer_model_{col}",
        )
        # indexer.save(f"{config.SPARK_MODELS}/stringindexer_{col}")
        # model.save(f"{config.SPARK_MODELS}/stringindexer_model_{col}")
    return df
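# A usage sketch for encode_strings, assuming a live SparkSession and the
# project's `config` module (TEMP / SPARK_MODELS) referenced above; the demo
# data, column names, and fname below are illustrative only.
from pyspark.sql import SparkSession

def encode_strings_demo():
    spark = SparkSession.builder.appName("encode_strings_demo").getOrCreate()
    demo_df = spark.createDataFrame(
        [("u1", "t1"), ("u2", "t2"), ("u1", "t2")],
        ["user_id", "track_id"],
    )
    # Adds encoded_user_id / encoded_track_id columns and persists the fitted
    # indexer models under f"{config.SPARK_MODELS}/demo/".
    return encode_strings(demo_df, cols=["user_id", "track_id"], fname="demo")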
def main(spark, data_file, model_file, user_file, track_file, model_formulation=None):
    df = spark.read.parquet(data_file)

    if model_formulation == 'log':
        # Log compression of the training counts
        df = df.withColumn('count', F.log(F.col('count')))
        print("log")
    elif model_formulation == 'ct1':
        # Keep only training rows with count > 1
        df.createOrReplaceTempView('df')
        df = spark.sql('SELECT * FROM df WHERE count > 1')
        print("ct1")
    elif model_formulation == 'ct2':
        # Keep only training rows with count > 2
        df.createOrReplaceTempView('df')
        df = spark.sql('SELECT * FROM df WHERE count > 2')
        print("ct2")
    else:
        # No model formulation specified: use the counts as-is
        print("default")

    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_idx", handleInvalid="keep")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_idx", handleInvalid="keep")
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df)
    df = mapping.transform(df)

    # Create and fit an ALS model on the implicit feedback counts
    als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
              ratingCol="count", userCol="user_idx", itemCol="track_idx")
    als_model = als.fit(df)

    # Save the trained ALS model
    als_model.write().overwrite().save(model_file)
    print("Model successfully saved to HDFS")

    # Save the fitted StringIndexer models so the id mappings can be reused
    mapping.stages[0].write().overwrite().save(user_file)
    mapping.stages[1].write().overwrite().save(track_file)
    print("String indexers successfully saved to HDFS")
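# A hypothetical driver loop for the variant above (paths and formulation tags
# are illustrative only): train one model per formulation supported by main().
def train_all_formulations(spark):
    for tag in (None, "log", "ct1", "ct2"):
        suffix = tag or "default"
        main(spark,
             "hdfs:///data/train.parquet",            # assumed input path
             f"hdfs:///models/als_{suffix}",          # assumed output paths
             f"hdfs:///models/user_idx_{suffix}",
             f"hdfs:///models/track_idx_{suffix}",
             model_formulation=tag)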
def review_ids_to_number(dataframe):
    # Build and save the fitted StringIndexer model for user_id
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_num").fit(dataframe)
    indexer_user_save = os.path.join('model', 'user_ind_model')
    indexer_user.write().overwrite().save(indexer_user_save)

    # Build and save the fitted StringIndexer model for business_id
    indexer_business = StringIndexer(inputCol="business_id", outputCol="business_id_num",
                                     handleInvalid="skip").fit(dataframe)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)

    # Transform the string id columns into numeric index columns
    indexed = indexer_user.transform(dataframe)
    final_indexed = indexer_business.transform(indexed)
    final_indexed.show(20)

    # Save the indexed DataFrame for ALS training
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(final_indexed_save)
    logger.error('Indexed dataframe for ALS training saved to review_vegas_als.parquet')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
def stringIndexer(infoData):
    colmToIndex = infoData.get(mc.COLMTOINDEX)
    dataset = infoData.get(mc.DATASET)
    indexedColm = infoData.get(mc.INDEXEDCOLM)
    storageLocation = infoData.get(mc.STORAGELOCATION)
    indexerName = colmToIndex + mc.INDEXER
    file = storageLocation + indexerName

    # TODO: check whether the column's datatype is already numeric
    # (integer/float/double); if so, indexing is unnecessary. -- sahil
    # For now, cast every datatype to string and then index it.
    dataset = dataset.withColumn(colmToIndex, dataset[colmToIndex].cast(StringType()))
    stringIndexer = StringIndexer(inputCol=colmToIndex, outputCol=indexedColm,
                                  handleInvalid="keep").fit(dataset)
    dataset = stringIndexer.transform(dataset)
    stringIndexer.write().overwrite().save(file)  # will update this later

    indexerPathMapping = infoData.get(mc.INDEXERPATHMAPPING)
    indexerPathMapping.update({colmToIndex: file})
    infoData.update({
        mc.INDEXERPATHMAPPING: indexerPathMapping,
        mc.DATASET: dataset
    })
    return infoData
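# A hypothetical infoData payload for stringIndexer() above. The mc.* keys are
# exactly the ones read by the function; the concrete values (column names and
# storage path) are illustrative only.
def string_indexer_example(df):
    infoData = {
        mc.COLMTOINDEX: "country",               # column to index (assumed)
        mc.INDEXEDCOLM: "indexed_country",       # output column (assumed)
        mc.DATASET: df,
        mc.STORAGELOCATION: "/tmp/indexers/",    # assumed storage root
        mc.INDEXERPATHMAPPING: {},               # filled in by stringIndexer()
    }
    return stringIndexer(infoData)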
def prepare_data_ml3(spark, jenkins_builds, sonar_issues, sonar_analyses,
                     spark_artefacts_dir, run_mode):
    # Collapse the build result to SUCCESS/FAIL for binary classification
    modify_result = udf(lambda x: "SUCCESS" if x == "SUCCESS" else "FAIL", StringType())
    spark.udf.register("modify_result", modify_result)

    if jenkins_builds is not None:
        jenkins_builds = jenkins_builds.withColumn("result", modify_result("result"))

    pipeline_path = Path(spark_artefacts_dir).joinpath("pipeline_3")
    label_idx_model_path = Path(spark_artefacts_dir).joinpath("label_indexer_3")

    # Getting pipeline and label indexer models
    if run_mode == "first":
        pipeline_model = get_ml3_pipeline().fit(sonar_issues)
        pipeline_model.write().overwrite().save(str(pipeline_path.absolute()))
        label_idx_model = StringIndexer(inputCol="result", outputCol="label",
                                        handleInvalid="skip").fit(jenkins_builds)
        label_idx_model.write().overwrite().save(str(label_idx_model_path.absolute()))
    elif run_mode == "incremental":
        pipeline_model = PipelineModel.load(str(pipeline_path.absolute()))
        label_idx_model = StringIndexerModel.load(str(label_idx_model_path.absolute()))

    # Columns to return
    rules = pipeline_model.stages[0].labels
    columns = list(map(lambda x: "removed_" + x, rules)) + list(
        map(lambda x: "introduced_" + x, rules))

    # Preparing the removed-issue rule vectors
    removed_rules_df = sonar_issues.filter(
        "status IN ('RESOLVED', 'CLOSED', 'REVIEWED')").select(
            "current_analysis_key", "rule")
    df1 = pipeline_model.transform(removed_rules_df)
    rdd1 = df1.rdd.map(lambda x: (x[0], x[3])) \
        .reduceByKey(lambda v1, v2: sum_sparse_vectors(v1, v2)) \
        .map(lambda x: Row(current_analysis_key=x[0], removed_rule_vec=x[1]))
    if rdd1.count() == 0:
        return None, columns
    removed_issues_rule_vec_df = spark.createDataFrame(rdd1)

    # Preparing the introduced-issue rule vectors
    introduced_rules_df = sonar_issues.filter(
        "status IN ('OPEN', 'REOPENED', 'CONFIRMED', 'TO_REVIEW')").select(
            "creation_analysis_key", "rule")
    df2 = pipeline_model.transform(introduced_rules_df)
    rdd2 = df2.rdd.map(lambda x: (x[0], x[3])) \
        .reduceByKey(lambda v1, v2: sum_sparse_vectors(v1, v2)) \
        .map(lambda x: Row(creation_analysis_key=x[0], introduced_rule_vec=x[1]))
    if rdd2.count() == 0:
        return None, columns
    introduced_issues_rule_vec_df = spark.createDataFrame(rdd2)

    joined_sonar_rules_df = removed_issues_rule_vec_df.join(
        introduced_issues_rule_vec_df,
        removed_issues_rule_vec_df.current_analysis_key ==
        introduced_issues_rule_vec_df.creation_analysis_key,
        how="outer")
    joined_sonar_rules_df.createOrReplaceTempView("sonar_rules")
    joined_sonar_rules_df = spark.sql("""SELECT
        coalesce(current_analysis_key, creation_analysis_key) AS analysis_key,
        introduced_rule_vec,
        removed_rule_vec
        FROM sonar_rules
        """)

    # Impute empty sparse vectors where an analysis had no removed or introduced issues
    num_rules = len(pipeline_model.stages[0].labels)
    imputed_sonar_rules_rdd = joined_sonar_rules_df.rdd.map(
        lambda row: Row(
            analysis_key=row[0],
            introduced_rule_vec=SparseVector(num_rules, {}) if row[1] is None else row[1],
            removed_rule_vec=SparseVector(num_rules, {}) if row[2] is None else row[2]))
    imputed_sonar_rules_df = spark.createDataFrame(imputed_sonar_rules_rdd)

    v_assembler = VectorAssembler(
        inputCols=["removed_rule_vec", "introduced_rule_vec"],
        outputCol="features")
    sonar_issues_df = v_assembler.transform(imputed_sonar_rules_df).select(
        "analysis_key", "features")

    sonar_df = sonar_issues_df.join(
        sonar_analyses,
        sonar_issues_df.analysis_key == sonar_analyses.analysis_key,
        how="inner")
    df = sonar_df.join(
        jenkins_builds,
        sonar_df.revision == jenkins_builds.revision_number,
        how="inner").select("result", "features")
    ml_df = label_idx_model.transform(df).select("label", "features")
    return ml_df, columns
def dataTranform(self, dataInfo):
    featuresColm = dataInfo.get(PredictiveConstants.FEATURESCOLM)  # featureColmList - replaced
    labelColm = dataInfo.get(PredictiveConstants.LABELCOLM)
    modelSheetName = dataInfo.get(PredictiveConstants.MODELSHEETNAME)
    modelId = dataInfo.get(PredictiveConstants.MODELID)
    storageLocation = dataInfo.get(PredictiveConstants.LOCATIONADDRESS)

    indexerPathMapping = {}
    oneHotEncoderPathMapping = {}

    self.labelColm = None if labelColm == None else labelColm
    self.featuresColm = None if featuresColm == None else featuresColm
    dataset = self.dataset

    vectorizedFeaturescolmName = modelSheetName + PredictiveConstants.DMXFEATURE
    dataset = dataset.drop(vectorizedFeaturescolmName)
    schemaData = dataset.schema

    if self.labelColm is not None:
        for labelName in self.labelColm:
            label = labelName
    else:
        label = self.labelColm

    nonNumericData = self.nonNumericToString(schemaData=schemaData, dataset=dataset)
    categoricalFeatures = nonNumericData.get(PredictiveConstants.CATEGORICALFEATURES)
    numericalFeatures = nonNumericData.get(PredictiveConstants.NUMERICALFEATURES)
    dataset = nonNumericData.get(PredictiveConstants.DATASET)
    schemaData = dataset.schema

    # Indexing of the label column
    isLabelIndexed = "no"
    if self.labelColm is not None:
        labelIndexedInfo = self.isLabelIndexed(schemaData=schemaData, label=label,
                                               dataset=dataset)
        dataset = labelIndexedInfo.get(PredictiveConstants.DATASET)
        isLabelIndexed = labelIndexedInfo.get(PredictiveConstants.ISLABELINDEXED)
        labelIndexer = labelIndexedInfo.get(PredictiveConstants.LABELINDEXER)
        # Store the label indexer here
        if labelIndexer is not None:
            labelIndexerStoragepath = storageLocation + modelId.upper() + label.upper() \
                + PredictiveConstants.INDEXER.upper() + PredictiveConstants.PARQUETEXTENSION
            labelIndexer.save(labelIndexerStoragepath)  # correct this
            indexerPathMapping.update({label: labelIndexerStoragepath})

    oneHotEncodedFeaturesList = []
    indexedFeatures = []
    nonOneHotEncoded = []
    for colm in categoricalFeatures:
        indexedColmName = PredictiveConstants.INDEXED_ + colm
        oneHotEncodedColName = PredictiveConstants.ONEHOTENCODED_ + colm
        indexer = StringIndexer(inputCol=colm, outputCol=indexedColmName,
                                handleInvalid="skip").fit(dataset)
        dataset = indexer.transform(dataset)
        # Store the indexer here; the saving path is modelId + colmName + indexer.parquet.
        # The feature indexers are not used for now but are kept for future use.
        featuresIndexerPath = storageLocation + modelId.upper() + colm.upper() \
            + PredictiveConstants.INDEXER.upper() + PredictiveConstants.PARQUETEXTENSION
        indexer.write().overwrite().save(featuresIndexerPath)
        indexerPathMapping.update({colm: featuresIndexerPath})

        rowNo = dataset.select(indexedColmName).distinct().count()
        # In case the column has only one category, or differs between training
        # and prediction, keep it under the one-hot-encoded column name to avoid
        # that uncertainty.
        if (rowNo == 1):
            # Avoids the problem when a single-value column is passed at prediction time.
            nonOneHotEncoded.append(oneHotEncodedColName)
        else:
            indexedFeatures.append(indexedColmName)
            oneHotEncodedFeaturesList.append(oneHotEncodedColName)

    oneHotEncoder = OneHotEncoderEstimator(inputCols=indexedFeatures,
                                           outputCols=oneHotEncodedFeaturesList,
                                           handleInvalid="error")
    oneHotEncoderPath = storageLocation + modelId.upper() \
        + PredictiveConstants.ONEHOTENCODED.upper() + PredictiveConstants.PARQUETEXTENSION
    oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
    oneHotEncoderPathMapping.update({PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath})
    oneHotEncoderFit = oneHotEncoder.fit(dataset)
    dataset = oneHotEncoderFit.transform(dataset)

    combinedFeatures = oneHotEncodedFeaturesList + numericalFeatures + nonOneHotEncoded

    categoryColmListDict = {}
    countOfCategoricalColmList = []
    for value in categoricalFeatures:
        listValue = []
        categoryColm = dataset.groupby(value).count()
        countOfCategoricalColmList.append(categoryColm.count())
        categoryColmJson = categoryColm.toJSON()
        for row in categoryColmJson.collect():
            categoryColmSummary = json.loads(row)
            listValue.append(categoryColmSummary)
        categoryColmListDict[value] = listValue

    self.numericalFeatures = numericalFeatures
    self.categoricalFeatures = categoricalFeatures

    if not categoricalFeatures:
        maxCategories = 5
    else:
        maxCategories = max(countOfCategoricalColmList)

    dataset = dataset.drop(vectorizedFeaturescolmName)
    featureassembler = VectorAssembler(inputCols=combinedFeatures,
                                       outputCol=vectorizedFeaturescolmName,
                                       handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    # Retrieve the feature column names after one-hot encoding
    indexOfFeatures = dataset.schema.names.index(vectorizedFeaturescolmName)
    oneHotEncodedFeaturesDict = dataset.schema.fields[indexOfFeatures].metadata['ml_attr']['attrs']
    idNameFeatures = {}

    if not oneHotEncodedFeaturesDict:
        idNameFeaturesOrderedTemp = None
    else:
        for type, value in oneHotEncodedFeaturesDict.items():
            for subKey in value:
                idNameFeatures[subKey.get("idx")] = subKey.get("name")
        idNameFeaturesOrderedTemp = {}
        for key in sorted(idNameFeatures):
            idNameFeaturesOrderedTemp[key] = idNameFeatures[key].replace(
                PredictiveConstants.ONEHOTENCODED_, "")
    idNameFeaturesOrdered = None if idNameFeaturesOrderedTemp == None else idNameFeaturesOrderedTemp

    # Retrieve the label column values only after label encoding
    indexedLabelNameDict = {}
    if isLabelIndexed == "yes":
        indexOfLabel = dataset.schema.names.index(label)
        indexedLabel = dataset.schema.fields[indexOfLabel].metadata["ml_attr"]["vals"]
        for value in indexedLabel:
            indexedLabelNameDict[indexedLabel.index(value)] = value

    # The code below was for VectorIndexer; since it is not stable for now on the
    # Spark side, it is kept commented out for future use if needed.
    '''
    vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
                                maxCategories=maxCategories,
                                handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Choose %d categorical features: %s" %
          (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)
    '''

    result = {
        PredictiveConstants.DATASET: dataset,
        PredictiveConstants.CATEGORICALFEATURES: categoricalFeatures,
        PredictiveConstants.NUMERICALFEATURES: numericalFeatures,
        PredictiveConstants.MAXCATEGORIES: maxCategories,
        PredictiveConstants.CATEGORYCOLMSTATS: categoryColmListDict,
        PredictiveConstants.INDEXEDFEATURES: indexedFeatures,
        PredictiveConstants.LABEL: label,
        PredictiveConstants.ONEHOTENCODEDFEATURESLIST: oneHotEncodedFeaturesList,
        PredictiveConstants.INDEXEDLABELNAMEDICT: indexedLabelNameDict,
        PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
        PredictiveConstants.VECTORFEATURES: vectorizedFeaturescolmName,
        PredictiveConstants.IDNAMEFEATURESORDERED: idNameFeaturesOrdered,
        PredictiveConstants.INDEXERPATHMAPPING: indexerPathMapping,
        PredictiveConstants.ONEHOTENCODERPATHMAPPING: oneHotEncoderPathMapping
    }
    return result
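# A hypothetical dataInfo payload for dataTranform() above. The
# PredictiveConstants keys are the ones read by the method; the concrete values
# (columns, model id, sheet name, storage path) are illustrative only, and the
# caller is assumed to have set self.dataset beforehand.
example_data_info = {
    PredictiveConstants.FEATURESCOLM: ["age", "country", "plan_type"],  # assumed feature columns
    PredictiveConstants.LABELCOLM: ["churned"],                         # assumed label column
    PredictiveConstants.MODELSHEETNAME: "churn_model",
    PredictiveConstants.MODELID: "model123",
    PredictiveConstants.LOCATIONADDRESS: "/tmp/predictive/",
}
# result = transformer.dataTranform(example_data_info)  # transformer: assumed class instance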
def business_ids_to_number(dataframe):
    indexer_business = StringIndexer(inputCol="business_id",
                                     outputCol="business_id_num").fit(dataframe)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)
def main():
    spark = SparkSession \
        .builder \
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0') \
        .getOrCreate()

    # Use for only one file:
    # filename = 'chicago_taxi_trips_2016_01.csv'
    # Use for reading all files:
    filename = '*'

    df = spark.read \
        .format('csv') \
        .options(header=True, inferSchema=True) \
        .load(os.path.join(etl_conf['s3_taxi_dir_path'], filename))

    # df.printSchema()
    # Take a look at the top rows
    # df.limit(5).toPandas()
    # Check initial number of records
    # df.count()

    df_with_hour = df.withColumn('year', year(df.trip_start_timestamp)) \
        .withColumn('month', month(df.trip_start_timestamp)) \
        .withColumn('day', dayofmonth(df.trip_start_timestamp)) \
        .withColumn('hour', hour(df.trip_start_timestamp))

    df_features = df_with_hour.select('year', 'month', 'day', 'hour',
                                      'pickup_community_area',
                                      'dropoff_community_area')
    df_no_nulls = df_features.dropna()
    # df_no_nulls.count()

    # Create StringIndexer and fit + transform pickup data
    pickup_indexer = StringIndexer(inputCol='pickup_community_area',
                                   outputCol='pickup_community_area_indexed')
    pickup_indexer_model = pickup_indexer.fit(df_no_nulls)
    df_pickup_indexed = pickup_indexer_model.transform(df_no_nulls)

    # Create StringIndexer and fit + transform dropoff data
    dropoff_indexer = StringIndexer(inputCol='dropoff_community_area',
                                    outputCol='dropoff_community_area_indexed')
    dropoff_indexer_model = dropoff_indexer.fit(df_pickup_indexed)
    df_dropoff_indexed = dropoff_indexer_model.transform(df_pickup_indexed)

    # Create OneHotEncoder and fit + transform pickup & dropoff data
    encoder = OneHotEncoderEstimator() \
        .setInputCols(['hour',
                       'pickup_community_area_indexed',
                       'dropoff_community_area_indexed']) \
        .setOutputCols(['hour_encoded',
                        'pickup_community_area_encoded',
                        'dropoff_community_area_encoded'])
    encoder_model = encoder.fit(df_dropoff_indexed)
    df_encoded = encoder_model.transform(df_dropoff_indexed)
    # df_encoded.printSchema()

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_model_key']

    # Save the pickup StringIndexer and its fitted model
    pickup_indexer_name = 'pickup_indexer_name'
    pickup_indexer_path = os.path.join(bucket, key, pickup_indexer_name)
    pickup_indexer.write().overwrite().save(pickup_indexer_path)

    pickup_indexer_model_name = 'pickup_indexer_model_name'
    pickup_indexer_model_name_path = os.path.join(bucket, key, pickup_indexer_model_name)
    pickup_indexer_model \
        .write() \
        .overwrite() \
        .save(pickup_indexer_model_name_path)

    # Save the dropoff StringIndexer and its fitted model
    dropoff_indexer_name = 'dropoff_indexer_name'
    dropoff_indexer_path = os.path.join(bucket, key, dropoff_indexer_name)
    dropoff_indexer.write().overwrite().save(dropoff_indexer_path)

    dropoff_indexer_model_name = 'dropoff_indexer_model_name'
    dropoff_indexer_model_name_path = os.path.join(bucket, key, dropoff_indexer_model_name)
    dropoff_indexer_model \
        .write() \
        .overwrite() \
        .save(dropoff_indexer_model_name_path)

    # Save the one-hot encoder and its fitted model
    encoder_name = 'encoder_name'
    encoder_name_path = os.path.join(bucket, key, encoder_name)
    encoder.write().overwrite().save(encoder_name_path)

    encoder_model_name = 'encoder_model_name'
    encoder_model_name_path = os.path.join(bucket, key, encoder_model_name)
    encoder_model.write().overwrite().save(encoder_model_name_path)

    # Build the final DataFrame and store it back to S3
    df_final = df_encoded.select('year', 'month', 'day', 'hour_encoded',
                                 'pickup_community_area_encoded',
                                 'dropoff_community_area_encoded')

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_data_key']
    output_path = os.path.join(bucket, key)

    df_final.write.partitionBy('year', 'month', 'day') \
        .parquet(output_path, mode='overwrite')
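# etl_conf and output_conf are referenced by main() above but defined elsewhere
# (likely loaded from a configuration file). A hypothetical shape, with the
# keys read above and illustrative bucket/prefix values only:
etl_conf = {
    's3_taxi_dir_path': 's3a://example-bucket/chicago-taxi/raw/',  # assumed
}
output_conf = {
    's3_bucket': 's3a://example-bucket',                           # assumed
    's3_model_key': 'chicago-taxi/models',                         # assumed
    's3_data_key': 'chicago-taxi/features',                        # assumed
}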
    return ss


if __name__ == '__main__':
    start_time = time.time()
    ss = init_spark_session()  # initialize the Spark session

    reviewDF_vegas_save = os.path.join('dataset', 'review_vegas.parquet')
    reviewDF_vegas = ss.read.parquet(reviewDF_vegas_save).cache()

    # Convert user_id and business_id from string to int and save the fitted
    # StringIndexer models
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_int").fit(reviewDF_vegas)
    indexer_user_save = os.path.join('model', 'user_ind_model')
    indexer_user.write().overwrite().save(indexer_user_save)

    indexer_business = StringIndexer(inputCol="business_id",
                                     outputCol="business_id_int").fit(reviewDF_vegas)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)

    # Transform the string id columns into numeric index columns
    indexed = indexer_user.transform(reviewDF_vegas)
    final_indexed = indexer_business.transform(indexed)
    final_indexed.show(20)

    # Save the indexed DataFrame for ALS training
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(final_indexed_save)
    logger.error('index completed, saved to file')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
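# A follow-up sketch (not part of the original script): reload the fitted
# indexer models saved above and map indexed ids back to the original strings,
# e.g. after ALS produces predictions over user_id_int / business_id_int.
# The `predictions` DataFrame is an assumed input.
from pyspark.ml.feature import IndexToString, StringIndexerModel

def decode_ids(predictions):
    user_model = StringIndexerModel.load(os.path.join('model', 'user_ind_model'))
    business_model = StringIndexerModel.load(os.path.join('model', 'bus_ind_model'))
    to_user = IndexToString(inputCol='user_id_int', outputCol='user_id_str',
                            labels=user_model.labels)
    to_business = IndexToString(inputCol='business_id_int', outputCol='business_id_str',
                                labels=business_model.labels)
    return to_business.transform(to_user.transform(predictions))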