def spark_transform(self):
    """
    Transforms a Spark RDD with raw data into an RDD that contains
    the top-n pickup spots for each block and time slot.
    """
    BatchTransformer.spark_transform(self)
    n = self.psql_config["topntosave"]

    # calculation of top-n spots for each block and time slot
    self.data = (self.data
                 .map(lambda x: ((x["block_id"], x["time_slot"], x["sub_block_id"]),
                                 x["passengers"]))
                 .reduceByKey(lambda x, y: x + y)
                 .map(lambda x: ((x[0][0], x[0][1]), [(x[0][2], x[1])]))
                 .reduceByKey(lambda x, y: x + y)
                 .mapValues(lambda vals: heapq.nlargest(n, vals, key=lambda x: x[1]))
                 .map(lambda x: {"block_id": x[0][0],
                                 "time_slot": x[0][1],
                                 "subblocks_psgcnt": x[1]}))
    self.data.persist(pyspark.StorageLevel(True, True, False, False, 3)).count()  # MEMORY_AND_DISK_3

    # recalculation of top-n, where for each key=(block_id, time_slot) top-n is calculated
    # based on the top-n of (block_id, time_slot) and the top-ns of (adjacent_block, time_slot+1)
    # from all adjacent blocks
    maxval = self.psql_config["upperBound"]
    self.data = (self.data
                 .map(lambda x: ((x["block_id"], x["time_slot"]), x["subblocks_psgcnt"]))
                 .flatMap(lambda x: [x] + [((bl, (x[0][1] - 1) % maxval), x[1])
                                           for bl in helpers.get_neighboring_blocks(x[0][0])])
                 .reduceByKey(lambda x, y: x + y)
                 .mapValues(lambda vals: heapq.nlargest(n, vals, key=lambda x: x[1]))
                 .map(lambda x: {
                     "block_latid": x[0][0][0],
                     "block_lonid": x[0][0][1],
                     "time_slot": x[0][1],
                     "longitude": [helpers.determine_subblock_lonlat(el[0])[0] for el in x[1]],
                     "latitude": [helpers.determine_subblock_lonlat(el[0])[1] for el in x[1]],
                     "passengers": [el[1] for el in x[1]]}))
    self.data.persist(pyspark.StorageLevel(True, True, False, False, 3)).count()  # MEMORY_AND_DISK_3
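# Added sketch (not part of the original pipeline): the positional arguments to
# pyspark.StorageLevel are (useDisk, useMemory, useOffHeap, deserialized,
# replication). PySpark only ships replication-1 and replication-2 constants,
# which is presumably why the 3-replica level above is built by hand. An
# equivalent, assuming a standard PySpark install:
import pyspark

MEMORY_AND_DISK_3 = pyspark.StorageLevel(True, True, False, False, 3)
assert MEMORY_AND_DISK_3.useDisk and MEMORY_AND_DISK_3.useMemory
assert MEMORY_AND_DISK_3.replication == 3
# For two replicas the built-in constant can be used instead:
two_replica_level = pyspark.StorageLevel.MEMORY_AND_DISK_2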
def generate_digraph(edge_count=40, batch_size=50, steps_to_python_gc=50, domain_size=50):
    sc = pyspark.context.SparkContext.getOrCreate()
    sc.setCheckpointDir("~/.transitive_closure")
    spark = pyspark.sql.SparkSession(sc)
    # Translation to Spark format is ludicrously slow without PyArrow
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    batches_to_python_gc = steps_to_python_gc // batch_size
    batch_count = int(np.ceil(edge_count / batch_size))
    digraph = []
    for i in range(batch_count):
        if i % batches_to_python_gc == 0:
            gc.collect()
        if i == batch_count - 1 and edge_count % batch_size > 0:
            batch_size = edge_count % batch_size
        new_origins = np.random.randint(0, domain_size, dtype=np.int32,
                                        size=(batch_size, 1))
        new_termini = np.random.randint(0, domain_size, dtype=np.int32,
                                        size=(batch_size, 1))
        partial_digraph = pd.DataFrame(
            np.concatenate([new_origins, new_termini], 1),
            columns=("origin", "terminus"))
        partial_digraph = spark.createDataFrame(partial_digraph).distinct()
        if digraph != []:
            digraph[0] = digraph[1].union(digraph[0])\
                .orderBy(["origin", "terminus"], ascending=[True, True])\
                .distinct()\
                .persist(pyspark.StorageLevel(True, False, False, True, 1))\
                .checkpoint()
            digraph[1] = partial_digraph
        else:
            digraph = [partial_digraph, partial_digraph]
        if i == batch_count - 1:
            gc.collect()
            digraph = digraph[1].union(digraph[0])\
                .orderBy(["origin", "terminus"], ascending=[True, True])\
                .distinct()\
                .persist(pyspark.StorageLevel(True, False, False, True, 1))\
                .checkpoint()
            df = digraph.toPandas()
    sc.stop()
    return df
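# Hypothetical usage sketch for generate_digraph; the parameter values here are
# illustrative, not taken from the original code:
edges = generate_digraph(edge_count=200, batch_size=50,
                         steps_to_python_gc=100, domain_size=30)
print(edges.shape)  # pandas DataFrame with integer 'origin' and 'terminus' columns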
def create_df_from_generator(gen, names):
    a = sc.parallelize(gen, 20)
    a.persist(pyspark.StorageLevel(True, True, False, True, 1))
    df = sqlContext.createDataFrame(a, schema=names, samplingRatio=None).repartition(20)
    # df.persist(pyspark.StorageLevel(True, True, False, True, 1))
    return df
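# Usage sketch for create_df_from_generator: the function relies on module-level
# `sc` and `sqlContext`, so this sketch (an assumption, not original code)
# creates them first under those exact names:
import pyspark

sc = pyspark.SparkContext.getOrCreate()
sqlContext = pyspark.sql.SQLContext(sc)
rows = ((i, i * i) for i in range(1000))
squares_df = create_df_from_generator(rows, ["n", "n_squared"])
squares_df.show(5)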
def transitive_closure_from_dataframe(digraph):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    save_file = os.path.join(dir_path, 'solution.pickle')
    try:
        shutil.rmtree(save_file)
    except OSError:
        pass

    sc = pyspark.context.SparkContext.getOrCreate()
    sc.setCheckpointDir("~/.transitive_closure")
    spark = pyspark.sql.SparkSession(sc)
    # Translation to Spark format is ludicrously slow without PyArrow
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    start_time = time.time()
    orig_digraph = spark.createDataFrame(digraph.copy())
    new_edges = spark.createDataFrame(digraph.copy()).checkpoint()
    new_edges_mem = digraph
    closed_digraph = spark.createDataFrame(digraph).checkpoint()
    while not new_edges_mem.empty:
        new_edges = spark.createDataFrame(new_edges_mem).persist(
            pyspark.StorageLevel(False, True, False, False, 1)).checkpoint()
        new_edges = new_edges.join(orig_digraph, (new_edges.terminus == orig_digraph.origin))\
            .select(new_edges.origin, orig_digraph.terminus)\
            .union(orig_digraph.join(new_edges, (new_edges.origin == orig_digraph.terminus))
                   .select(orig_digraph.origin, new_edges.terminus))\
            .distinct()\
            .exceptAll(closed_digraph)
        # I don't see any copy method, and PyArrow is nearly
        # instantaneous, so I'm going to use pandas to copy the new
        # edges to memory.
        new_edges_mem = new_edges.toPandas().copy()
        closed_digraph = closed_digraph.union(new_edges.persist(
            pyspark.StorageLevel(True, False, False, True, 1)).checkpoint())
        # Putting this in just because of how much trouble I had with the
        # generate_digraph function
        gc.collect()

    # Ideally, we would be able to use the fact that closed_digraph is
    # already stored on the disk, but I'm not quite sure how to do it.
    # Also, for clusters with separate hard drives,
    # we would want to mount it somewhere with HDFS or similar.
    closed_digraph.rdd.saveAsPickleFile(save_file)
    df = closed_digraph.toPandas()
    sc.stop()
    return df
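# Hedged end-to-end sketch, assuming generate_digraph and
# transitive_closure_from_dataframe live in the same module (each function
# creates and stops its own SparkContext, so they can simply be run back to back):
input_edges = generate_digraph(edge_count=100, domain_size=25)
closure = transitive_closure_from_dataframe(input_edges)
print(len(input_edges), "input edges ->", len(closure), "edges in the closure")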
def create_dataframe(spark_context, sql_context, table, column_names):
    data = spark_context.parallelize(table, 20)
    data.persist(pyspark.StorageLevel(True, True, False, True, 1))
    df = sql_context.createDataFrame(data, schema=column_names, samplingRatio=None).repartition(20)
    # df.persist(pyspark.StorageLevel(True, True, False, True, 1))
    return df
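# Usage sketch for create_dataframe; the context objects and sample rows are
# illustrative assumptions, not part of the original snippet:
import pyspark

sc = pyspark.SparkContext.getOrCreate()
sql_context = pyspark.sql.SQLContext(sc)
people = create_dataframe(sc, sql_context,
                          [("alice", 34), ("bob", 29), ("carol", 41)],
                          ["name", "age"])
people.show()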
    return result
    # print(personName + "," + changeManager + "," + promote + "," + str(ManagerCata.index(lastManager)))


def getCataDict():
    return {0: 2, 1: 2, 2: len(ManagerCata)}


couchbase_host = '10.1.193.189'
couchbase_bucket = 'persona'
couchbucket = Couchbase.connect(bucket=couchbase_bucket, host=couchbase_host)

# reference http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.tree
pyspark.StorageLevel(True, False, False, False, 1)
sc = SparkContext("local", "ad RD leave predict")

if __name__ == '__main__':
    prepareLeaverList('../var/all2015rdleaves')
    data = []
    l = 500
    for i in range(6):
        data.extend(buildDataFromCouchbase(l, l * i))
    cataDict = getCataDict()
    model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), cataDict,
                                                 numIterations=10, maxBins=500)
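    # Hedged follow-up sketch (not in the original): once trained, the
    # GradientBoostedTrees model can score new points and be saved. The save
    # path is illustrative, and treating `data` as LabeledPoint records is an
    # assumption based on what trainClassifier expects.
    predictions = model.predict(sc.parallelize([p.features for p in data]))
    print(predictions.take(10))
    model.save(sc, '../var/rd_leave_gbt_model')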
def __init__(self):
    pyspark.StorageLevel(True, True, False, False, 2)
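# Variant sketch (class and attribute names are assumptions, not from the
# original snippet): the StorageLevel constructed above is discarded as soon as
# it is built; keeping it on the instance lets later persist() calls reuse it.
import pyspark

class CachedJob:
    def __init__(self):
        # same flags as pyspark.StorageLevel.MEMORY_AND_DISK_2
        self.storage_level = pyspark.StorageLevel(True, True, False, False, 2)

    def persist_rdd(self, rdd):
        return rdd.persist(self.storage_level)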
def main(result_dir_master, result_dir_s3):

    CON_CONFIGS = {}
    CON_CONFIGS["result_dir_master"] = result_dir_master
    CON_CONFIGS["result_dir_s3"] = result_dir_s3

    #
    ## user to specify: hyper-params

    # clustering
    CON_CONFIGS["n_clusters"] = 3
    CON_CONFIGS["warn_threshold_np_ratio"] = 1

    # classification
    CON_CONFIGS["n_eval_folds"] = 5
    CON_CONFIGS["n_cv_folds"] = 5
    CON_CONFIGS["lambdas"] = list(10.0 ** numpy.arange(-2, 2, 1.0))
    CON_CONFIGS["alphas"] = list(numpy.linspace(0, 1, 3))

    CON_CONFIGS["desired_recalls"] = [0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25]
    # CON_CONFIGS["desired_recalls"] = [0.05, 0.10]

    #
    ## read data and some meta stuff

    # user to specify: seed in Random Forest model
    CON_CONFIGS["seed"] = 42
    CON_CONFIGS["data_path"] = "s3://emr-rwes-pa-spark-dev-datastore/lichao.test/data/BI/smaller_data/"
    CON_CONFIGS["pos_file"] = "pos_70.0pct.csv"
    CON_CONFIGS["neg_file"] = "neg_70.0pct.csv"
    CON_CONFIGS["ss_file"] = "ss_70.0pct.csv"

    # reading in the data from S3
    spark = SparkSession.builder.appName(os.path.basename(__file__)).getOrCreate()
    org_pos_data = spark.read.option("header", "true")\
        .option("inferSchema", "true")\
        .csv(CON_CONFIGS["data_path"] + CON_CONFIGS["pos_file"])
    org_neg_data = spark.read.option("header", "true")\
        .option("inferSchema", "true")\
        .csv(CON_CONFIGS["data_path"] + CON_CONFIGS["neg_file"])\
        .select(org_pos_data.columns)
    org_ss_data = spark.read.option("header", "true")\
        .option("inferSchema", "true")\
        .csv(CON_CONFIGS["data_path"] + CON_CONFIGS["ss_file"])\
        .select(org_pos_data.columns)

    # user to specify: original column names for predictors and output in data
    orgOutputCol = "label"
    matchCol = "matched_positive_id"
    patIDCol = "patid"
    nonFeatureCols = [matchCol, orgOutputCol, patIDCol]
    # orgPredictorCols = ["PATIENT_AGE", "LOOKBACK_DAYS", "LVL3_CHRN_ISCH_HD_FLAG", "LVL3_ABN_CHST_XRAY_FLAG"]
    # org_pos_data = org_pos_data.select(nonFeatureCols + orgPredictorCols)
    # org_neg_data = org_neg_data.select(nonFeatureCols + orgPredictorCols)
    # org_ss_data = org_ss_data.select(nonFeatureCols + orgPredictorCols)

    # sanity check
    if type(org_pos_data.select(orgOutputCol).schema.fields[0].dataType) not in (DoubleType, IntegerType):
        raise TypeError("The output column is not of type integer or double. ")
    org_pos_data = org_pos_data.withColumn(orgOutputCol, org_pos_data[orgOutputCol].cast("double"))
    orgPredictorCols = [x for x in org_pos_data.columns if x not in nonFeatureCols]
    orgPredictorCols4Clustering = [x for x in orgPredictorCols if "FLAG" in x]
    if type(org_neg_data.select(orgOutputCol).schema.fields[0].dataType) not in (DoubleType, IntegerType):
        raise TypeError("The output column is not of type integer or double. ")
    org_neg_data = org_neg_data.withColumn(orgOutputCol, org_neg_data[orgOutputCol].cast("double"))
    if type(org_ss_data.select(orgOutputCol).schema.fields[0].dataType) not in (DoubleType, IntegerType):
        raise TypeError("The output column is not of type integer or double. ")
    org_ss_data = org_ss_data.withColumn(orgOutputCol, org_ss_data[orgOutputCol].cast("double"))

    #
    clusterFeatureCol = "cluster_features"
    clusterCol = "cluster_id"

    # user to specify: the collective column name for all predictors
    collectivePredictorCol = "features"
    # in-cluster distance
    distCol = "dist"
    # user to specify: the column name for prediction
    predictionCol = "probability"

    CON_CONFIGS["orgPredictorCols"] = orgPredictorCols
    CON_CONFIGS["orgPredictorCols4Clustering"] = orgPredictorCols4Clustering
    CON_CONFIGS["n_predictors_classification"] = len(orgPredictorCols)
    CON_CONFIGS["n_rows_pos"] = org_pos_data.count()
    CON_CONFIGS["n_rows_neg"] = org_neg_data.count()
    CON_CONFIGS["n_rows_ss"] = org_ss_data.count()
    save_analysis_info(\
        result_dir_master,
        "analysis_info.txt",
        CON_CONFIGS
        )

    # convert to ml-compatible format
    assembler = VectorAssembler(inputCols=orgPredictorCols, outputCol=collectivePredictorCol)
    posFeatureAssembledData = assembler.transform(org_pos_data)\
        .select(nonFeatureCols + [collectivePredictorCol])
    negFeatureAssembledData = assembler.transform(org_neg_data)\
        .select(nonFeatureCols + [collectivePredictorCol])

    #
    evalIDCol = "evalFoldID"
    cvIDCol = "cvFoldID"
    pos_neg_data = posFeatureAssembledData.union(negFeatureAssembledData)
    pos_neg_data_with_eval_ids = AppendDataMatchingFoldIDs(pos_neg_data, CON_CONFIGS["n_eval_folds"],
                                                           matchCol, foldCol=evalIDCol)

    # the model (pipeline)
    classifier_spec = LogisticRegression(maxIter=1e5,
                                         featuresCol=collectivePredictorCol,
                                         labelCol=orgOutputCol,
                                         standardization=True)
    evaluator = BinaryClassificationEvaluatorWithPrecisionAtRecall(\
        rawPredictionCol=predictionCol,
        labelCol=orgOutputCol,
        metricName="precisionAtGivenRecall",
        metricParams={"recallValue": 0.05}\
    )
    paramGrid = ParamGridBuilder()\
        .addGrid(classifier_spec.regParam, CON_CONFIGS["lambdas"])\
        .addGrid(classifier_spec.elasticNetParam, CON_CONFIGS["alphas"])\
        .build()

    # cross-evaluation
    predictionsAllData = None

    kmeans = KMeans(featuresCol=clusterFeatureCol, predictionCol=clusterCol).setK(CON_CONFIGS["n_clusters"])
    cluster_assembler = VectorAssembler(inputCols=orgPredictorCols4Clustering, outputCol=clusterFeatureCol)

    metricSets = [{"metricName": "precisionAtGivenRecall", "metricParams": {"recallValue": x}}
                  for x in CON_CONFIGS["desired_recalls"]]

    filename_loop_info = result_dir_master + "loop_info.txt"
    file_loop_info = open(filename_loop_info, "w")

    inputTrivialNegPredCols = ["_pos_prob", "_neg_prob"]
    trivial_neg_pred_assembler = VectorAssembler(inputCols=inputTrivialNegPredCols, outputCol=predictionCol)

    for iFold in range(CON_CONFIGS["n_eval_folds"]):
        condition = pos_neg_data_with_eval_ids[evalIDCol] == iFold
        leftoutFold = pos_neg_data_with_eval_ids.filter(condition).drop(evalIDCol)
        trainFolds = pos_neg_data_with_eval_ids.filter(~condition).drop(evalIDCol)

        file_loop_info.write("####################################################################\n\n".format(iFold))
        file_loop_info.write("iFold: {}\n\n".format(iFold))
        file_loop_info.write("n_rows of leftoutFold: {}\n".format(leftoutFold.count()))
        file_loop_info.write("n_rows of trainFolds: {}\n".format(trainFolds.count()))

        #
        ## clustering to be done here
        pos_data_4_clustering = trainFolds\
            .filter(F.col(orgOutputCol) == 1)\
            .select(patIDCol)\
            .join(org_pos_data, patIDCol)
        pos_data_4_clustering_assembled = cluster_assembler.transform(pos_data_4_clustering)\
            .select([patIDCol, matchCol] + [clusterFeatureCol])
        cluster_model, clustered_pos = clustering(pos_data_4_clustering_assembled, kmeans,
                                                  clusterFeatureCol, clusterCol, distCol)
        nPosesAllClusters = clustered_pos.count()
        predictionsOneFold = None
        file_loop_info.write("nPosesAllClusters: {}\n".format(nPosesAllClusters))

        for i_cluster in range(CON_CONFIGS["n_clusters"]):

            file_loop_info.write("i_cluster: {}\n\n".format(i_cluster))

            # the positive data for training the classifier
            train_pos = clustered_pos\
                .filter(clustered_pos[clusterCol] == i_cluster)\
                .select(patIDCol)\
                .join(trainFolds, patIDCol)
            file_loop_info.write("n_rows of train_pos: {}\n".format(train_pos.count()))
            posPctThisClusterVSAllClusters = float(train_pos.count()) / nPosesAllClusters
            file_loop_info.write("posPctThisClusterVSAllClusters: {}\n".format(posPctThisClusterVSAllClusters))

            # select negative training data based on the clustering result
            corresponding_neg = train_pos\
                .select(matchCol)\
                .join(org_neg_data, matchCol)
            corresponding_neg_4_clustering_assembled = cluster_assembler.transform(corresponding_neg)\
                .select([patIDCol, matchCol] + [clusterFeatureCol])
            similar_neg_ids = select_certain_pct_ids_per_positive_closest_to_cluster_centre(\
                corresponding_neg_4_clustering_assembled,
                clusterFeatureCol,
                cluster_model.clusterCenters()[i_cluster],
                posPctThisClusterVSAllClusters,
                patIDCol,
                matchCol
            )
            train_data = similar_neg_ids\
                .join(trainFolds, patIDCol)\
                .select(train_pos.columns)\
                .union(train_pos)
            file_loop_info.write("n_rows of train_data: {}\n".format(train_data.count()))

            trainDataWithCVFoldID = AppendDataMatchingFoldIDs(train_data, CON_CONFIGS["n_cv_folds"],
                                                              matchCol, foldCol=cvIDCol)
            trainDataWithCVFoldID = trainDataWithCVFoldID.coalesce(
                int(trainFolds.rdd.getNumPartitions() * posPctThisClusterVSAllClusters) + 1)

            # sanity check: if there are too few negatives for any positive
            # thresh_n_neg_per_fold = round(train_pos.count() / float(CON_CONFIGS["n_cv_folds"])) * CON_CONFIGS["warn_threshold_np_ratio"]
            # neg_counts_all_cv_folds = trainDataWithCVFoldID\
            #     .filter(F.col(orgOutputCol)==0)\
            #     .groupBy(cvIDCol)\
            #     .agg(F.count(orgOutputCol).alias("_tmp"))\
            #     .select("_tmp")\
            #     .collect()
            # if any(map(lambda x: x["_tmp"] < thresh_n_neg_per_fold, neg_counts_all_cv_folds)):
            #     raise ValueError("Insufficient number of negative data in at least one cv fold.")

            #
            ## train the classifier
            validator = CrossValidatorWithStratificationID(\
                estimator=classifier_spec,
                estimatorParamMaps=paramGrid,
                evaluator=evaluator,
                stratifyCol=cvIDCol\
            )
            cvModel = validator.fit(trainDataWithCVFoldID)

            #
            ## test data
            entireTestData = org_ss_data\
                .join(leftoutFold.filter(F.col(orgOutputCol) == 1).select(matchCol), matchCol)\
                .select(org_pos_data.columns)\
                .union(org_pos_data.join(leftoutFold.select(patIDCol), patIDCol).select(org_pos_data.columns))\
                .union(org_neg_data.join(leftoutFold.select(patIDCol), patIDCol).select(org_pos_data.columns))
            entireTestDataAssembled4Clustering = cluster_assembler.transform(entireTestData)\
                .select([patIDCol, matchCol] + [clusterFeatureCol])
            file_loop_info.write("n_rows of entireTestData: {}\n".format(entireTestData.count()))

            filteredTestData = select_certain_pct_overall_ids_closest_to_cluster_centre(\
                entireTestDataAssembled4Clustering,
                clusterFeatureCol,
                cluster_model.clusterCenters()[i_cluster],
                posPctThisClusterVSAllClusters,
                patIDCol
            ).join(entireTestData, patIDCol)
            file_loop_info.write("n_rows of filteredTestData: {}\n".format(filteredTestData.count()))

            filteredTestDataAssembled = assembler.transform(filteredTestData)\
                .select(nonFeatureCols + [collectivePredictorCol])

            # testing
            predictions = cvModel\
                .transform(filteredTestDataAssembled)\
                .select(nonFeatureCols + [collectivePredictorCol, predictionCol])

            # need to union the test data filtered away (all classified as negative)
            discarded_test_ids = entireTestData\
                .select(patIDCol)\
                .subtract(filteredTestData.select(patIDCol))
            discardedTestData = discarded_test_ids\
                .join(entireTestData, patIDCol)
            discardedTestDataAssembled = assembler.transform(discardedTestData)\
                .select(nonFeatureCols + [collectivePredictorCol])
            predictionsDiscardedTestData = discardedTestDataAssembled\
                .withColumn(inputTrivialNegPredCols[0], F.lit(0.0))\
                .withColumn(inputTrivialNegPredCols[1], F.lit(1.0))
            predictionsDiscardedTestDataAssembled = trivial_neg_pred_assembler\
                .transform(predictionsDiscardedTestData)\
                .select(predictions.columns)
            predictionsEntireTestData = predictions.union(predictionsDiscardedTestDataAssembled)

            metricValuesOneCluster = evaluator\
                .evaluateWithSeveralMetrics(predictionsEntireTestData, metricSets=metricSets)
            file_name_metrics_one_cluster = result_dir_master + "metrics_cluster_" + str(i_cluster) + "fold_" + str(iFold) + "_.csv"
            save_metrics(file_name_metrics_one_cluster, metricValuesOneCluster)
            predictionsEntireTestData.write.csv(result_dir_s3 + "predictions_fold_" + str(iFold) + "_cluster_" + str(i_cluster) + ".csv")
            predictionsEntireTestData.persist(pyspark.StorageLevel(True, False, False, False, 1))

            if predictionsOneFold is not None:
                predictionsOneFold = predictionsOneFold.union(predictionsEntireTestData)
            else:
                predictionsOneFold = predictionsEntireTestData

            # save the metrics for all hyper-parameter sets in cv
            cvMetrics = cvModel.avgMetrics
            cvMetricsFileName = result_dir_s3 + "cvMetrics_cluster_" + str(i_cluster) + "_fold_" + str(iFold)
            cvMetrics.coalesce(4).write.csv(cvMetricsFileName, header="true")

            # save the hyper-parameters of the best model
            bestParams = validator.getBestModelParams()
            file_best_params = result_dir_master + "bestParams_cluster_" + str(i_cluster) + "_fold_" + str(iFold) + ".txt"
            with open(file_best_params, "w") as fileBestParams:
                fileBestParams.write(str(bestParams))
            os.chmod(file_best_params, 0o777)

        # summarise all clusters from the fold
        metricValuesOneFold = evaluator\
            .evaluateWithSeveralMetrics(predictionsOneFold, metricSets=metricSets)
        file_name_metrics_one_fold = result_dir_master + "metrics_fold_" + str(iFold) + "_.csv"
        save_metrics(file_name_metrics_one_fold, metricValuesOneFold)

        if predictionsAllData is not None:
            predictionsAllData = predictionsAllData.union(predictionsOneFold)
        else:
            predictionsAllData = predictionsOneFold

    # save all predictions
    predictionsFileName = result_dir_s3 + "predictionsAllData"
    predictionsAllData.select(orgOutputCol, getitem(1)(predictionCol).alias('prob_1'))\
        .write.csv(predictionsFileName, header="true")
    # metrics of predictions on the entire dataset
    metricValues = evaluator\
        .evaluateWithSeveralMetrics(predictionsAllData, metricSets=metricSets)
    save_metrics(result_dir_master + "metricValuesEntireData.csv", metricValues)

    file_loop_info.close()
    os.chmod(filename_loop_info, 0o777)

    spark.stop()
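# The getitem helper used above is not defined in this snippet. A minimal
# sketch of what such a helper could look like, assuming the prediction column
# holds an ML probability Vector whose element 1 is the positive-class
# probability:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

def getitem(i):
    """Return a UDF that extracts element i of a vector-valued column as a double."""
    def extract(v):
        return float(v[i])
    return F.udf(extract, DoubleType())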