def __init__(self, args, args2):
    """
    Spark version for normalizing data, including minmax and zscore.

    @param args: dict
        columns: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.transformedDF = None
    self.parameterDF = None
    self.inputUrl1 = args["input"][0]["value"]
    # The second input is optional; fall back to an empty string when absent.
    try:
        self.inputUrl2 = args["input"][1]["value"]
    except IndexError:
        self.inputUrl2 = ""
    self.outputUrl1 = args["output"][0]["value"]
    self.outputUrl2 = args["output"][1]["value"]
    # "columns" is optional: dict.get replaces the try/except that bound
    # an unused exception variable, with the same None default.
    self.columns = args["param"].get("columns")
    self.param = args["param"]
    self.logger.info("initializing SparkSession")
    self.spark = utils.init_spark()
def generate_match_accident_road_of_one_month(year, month):
    """Match one month of accidents against roads and persist it as parquet.

    Does nothing when the output parquet directory already exists.
    """
    filepath = workdir + f'data/match_accident-road_{year}-{month}.parquet'
    # Already generated on a previous run: skip the expensive matching.
    if isdir(filepath):
        return
    print(f'Generating {year}-{month}')
    spark = init_spark()
    road_df = get_road_df(spark, use_cache=True)
    accident_df = preprocess_accidents(get_accident_df(spark))
    # Month window is [first day of this month, first day of next month).
    start_day = datetime.datetime.fromisoformat(f'{year}-{month:02}-01')
    end_year, end_month = (year + 1, 1) if month == 12 else (year, month + 1)
    end_day = datetime.datetime.fromisoformat(f'{end_year}-{end_month:02}-01')
    accident_df = accident_df.filter(
        (col('date') >= start_day) & (col('date') < end_day))
    match_accident_road = match_accidents_with_roads(
        spark, road_df, accident_df, use_cache=False)
    match_accident_road.write.parquet(filepath)
    # Stop the session between months so temp resources can be reclaimed.
    spark.stop()
def getIn(self):
    """Read the input dataset from Hive and load the fitted PipelineModel."""
    # Spark model loading
    self.logger.debug("using PySpark")
    from pyspark.ml import PipelineModel
    self.logger.info("initialize SparkSession")
    session = utils.init_spark()
    self.spark = session
    self.originalDF = utils.dataUtil.SparkReadHive(self.inputUrl2, session)
    self.model = PipelineModel.load(self.inputUrl1)
def __init__(self, args, args2):
    """
    Spark version for reading data from HDFS and then writing it to HIVE.

    @param args: dict
        inputUrl: String
        outputUrl: String
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.DF = None
    param = args["param"]
    self.outputUrl1 = args["output"][0]["value"]
    self.type = param["type"]
    self.inputUrl1 = param["path"]
    self.logger.info("initializing SparkSession")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Spark version for initializing a RandomForest binary classifier.

    @param args: dict
        n_estimators: int
        criterion: string, one of "gini" and "entropy"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = None
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.logger.info("initializing spark")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Standalone version for type transformation.

    @param args: dict
        toDoubleColumns: list
        defaultDoubleValue: double
        toIntColumns: list
        defaultIntValue: int
        toCategoricalColumns: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.transformedDF = None
    self.paramDF = None
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.outputUrl2 = self.outputUrl1 + "toCategorical"
    self.param = args["param"]
    # Optional parameters: dict.get replaces the five repetitive
    # try/except KeyError blocks while keeping the identical defaults.
    self.toDoubleColumns = self.param.get("toDouble")
    self.defaultDoubleValue = self.param.get("defaultDoubleValue", 0.0)
    self.toIntColumns = self.param.get("toInt")
    self.defaultIntValue = self.param.get("defaultIntValue", 0)
    self.toCategoricalColumns = self.param.get("toCategoricalColumns")
    # "mode" is required; a missing key must still raise KeyError as before.
    self.mode = self.param["mode"]
    self.logger.info("initializing SparkSession")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Spark version for training a clustering model.

    @param args: dict
        featureCols: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.transformDF = None
    self.model = None
    self.pipelineModel = None
    inputs = args["input"]
    outputs = args["output"]
    self.inputUrl1 = inputs[0]["value"]
    self.inputUrl2 = inputs[1]["value"]
    self.outputUrl1 = outputs[0]["value"]
    self.outputUrl2 = outputs[1]["value"]
    self.featureCols = args["param"]["features"]
    self.logger.debug("initializing SparkSession")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Spark version for evaluating a binary classifier.

    @param args: dict
        featureCols: list
        labelCol: string
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    inputs = args["input"]
    self.inputUrl1 = inputs[0]["value"]
    self.inputUrl2 = inputs[1]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.featureCols = args["param"]["features"]
    self.originalDF = None
    self.transformDF = None
    self.model = None
    self.result = None
    self.logger.debug("initializing SparkSession")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Standalone version for initializing KMeans clustering.

    @param args: dict
        K: int
        init: string, one of "k-means++" and "random"
        n_init: int
        max_iter: int
        tol: float
    """
    # Logger keyed on the concrete class name.
    self.logger = logging.getLogger(self.__class__.__name__)
    # Parameters and placeholders.
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.model = None
    self.logger.info("initializing SparkSession")
    # Spark session used by the component.
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Spark version for conducting PCA on the input dataset.

    @param args: dict
        inputUrl: String
        outputUrl: String
        columns: list
        k: int
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.transformDF = None
    self.pipelineModel = None
    param = args["param"]
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.columns = param["columns"]
    self.k = param["k"]
    self.logger.info("initializing SparkSession")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Standalone version for splitting data, including byRatio and byThreshold.

    @param args: dict
        splitBy: string, one of "byRatio" and "byThreshold"
        ratio: double
        thresholdColumn: string
        threshold: double
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.DF1 = None
    self.DF2 = None
    outputs = args["output"]
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = outputs[0]["value"]
    self.outputUrl2 = outputs[1]["value"]
    self.param = args["param"]
    self.logger.info("initializing SparkSession")
    self.spark = utils.init_spark()
#!/usr/bin/env python import accident_prediction_montreal from pyspark.sql.functions import udf, min, max, col from pyspark.sql.types import FloatType from pyspark.ml.feature import VectorAssembler from preprocess import get_negative_samples, get_positive_samples from utils import init_spark from preprocess import get_dataset_df from export_results import * result_dir = create_result_dir('base') spark = init_spark() neg_samples = get_negative_samples(spark).sample(0.5) pos_samples = get_positive_samples(spark) imbalance_ratio = (neg_samples.count() / pos_samples.count()) train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples) train_set, test_set = train_set.persist(), test_set.persist() get_accidents_count = udf(lambda v: float(v[7]), FloatType()) def fit(train_set): accidents_count = train_set.select( get_accidents_count('features').alias('accidents_count'), 'label') accidents_count_to_proba = [] for i in range(377): accidents_count_higher = \ accidents_count.filter(col('accidents_count') >= i) proba = (accidents_count_higher.filter(col('label') == 1.0).count() /