def test_do_cartesian(self):
    spark_session = sql.SparkSession(self.sc)
    string_rdd = self.sc.parallelize(self.test_data).map(
        lambda x: pyspark.Row(id=x[0], label=x[1],
                              vector=ml_linalg.DenseVector(x[2])))
    string_df = string_rdd.toDF()
    test_demon = do_cartesian(sc=self.sc, df=string_df,
                              id_col='id', feature_col='vector')
    check_diagonal = (test_demon
                      .filter(lambda x: x.i == x.j)
                      .map(lambda x: x.value)
                      .collect())
    for diag in check_diagonal:
        self.assertEqual(1.0, diag)
def connect_spark_sql(app_name='JUPYTER'):
    spark_conf = SparkConf().setAppName(app_name)
    # for k, v in crd.spark_cassandra.items():
    #     spark_conf.set(k, v)
    sc = SparkContext(conf=spark_conf)
    spark = sql.SparkSession(sc)
    return sc, spark
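# Hedged usage sketch (not in the original source): one way the (sc, spark)
# pair returned by connect_spark_sql above might be used. The parquet path
# is hypothetical.
def example_connect_and_read(path='/tmp/example.parquet'):
    sc, spark = connect_spark_sql(app_name='JUPYTER')
    df = spark.read.parquet(path)
    df.printSchema()
    row_count = df.count()
    sc.stop()
    return row_count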
def save(acquired, bounds, products, product_dates, spark_context, clip=False,
         specs_fn=chip_specs.get, chips_fn=chips.get):
    """Saves requested products to iwds

    Args:
        acquired (str): / separated datestrings in iso8601 format.  Used to
            determine the daterange of input data.
        bounds (str): sequence of points ((x1, y1), (x2, y2), ...).  Bounds
            are minboxed and then corresponding chip ids are determined from
            the result.
        products (sequence): products to save
        product_dates (sequence): product dates to produce and save
        spark_context: a spark cluster connection
        clip (bool): If True any points not falling within the minbox of
            bounds are filtered out.

    Returns:
        generator: {product: dataframe}
    """
    ss = sql.SparkSession(spark_context)
    queries = fb.chip_spec_queries(fb.SPECS_URL)
    spec = first(specs_fn(queries[first(queries)]))
    coordinates = chips.bounds_to_coordinates(tuple(bounds), spec)

    job, jobconf = init(acquired=acquired,
                        chip_ids=coordinates,
                        products=products,
                        product_dates=product_dates,
                        specs_fn=specs_fn,
                        refspec=spec,
                        chips_fn=chips_fn,
                        spark_context=spark_context,
                        clip_box=f.minbox(bounds) if clip else None)

    # first, save the jobconf used to generate the products
    md5, cfg = f.serialize({k: f.represent(v.value) for k, v in jobconf.items()})
    write(table='jobconf',
          dataframe=ss.createDataFrame([[md5, cfg]], jobconf_schema()))

    for p in products:
        df = ss.createDataFrame(
            job[p].map(lambda x: (float(x[0][0]), float(x[0][1]),
                                  float(x[0][2]), float(x[0][3]),
                                  str(x[0][5]),
                                  second(f.serialize(x[1])),
                                  second(f.serialize(x[2])),
                                  str(md5)))
                  .repartition(fb.STORAGE_PARTITION_COUNT),
            schema=result_schema())
        yield {p: write(table=f.cqlstr(job[p].name()), dataframe=df)}
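# Hedged usage sketch (not in the original source): driving the save()
# generator defined above. Every argument value below is a hypothetical
# placeholder; the real products, dates, and bounds come from the calling
# application.
def example_save(spark_context):
    results = save(acquired='1982-01-01/2017-12-31',
                   bounds=((-1821585, 2891595), (-1821285, 2891295)),
                   products=['example_product'],
                   product_dates=['2014-07-01'],
                   spark_context=spark_context,
                   clip=True)
    # save() is a generator; iterating it forces each product to be written.
    return [result for result in results]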
def run(sc, args):
    sc.setLogLevel('FATAL')

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('year', help='Year of prediction, in format YYYY.',
                            type=int)
    arg_parser.add_argument('month', help='Month of prediction, in format MM.',
                            type=int)
    arg_parser.add_argument('day', help='Day of prediction, in format DD.',
                            type=int)
    args = arg_parser.parse_args(args)

    ss = sql.SparkSession(sc)

    latlongrid = grid.LatLonGrid(
        lat_min=40.488320,
        lat_max=40.957189,
        lon_min=-74.290739,
        lon_max=-73.635679,
        lat_step=grid.get_lon_delta(1000, (40.957189 - 40.488320) / 2.0),
        lon_step=grid.get_lat_delta(1000))

    tweets_df = import_twitter_data(ss, 'tweets2.csv')

    prediction_date = datetime.date(args.year, args.month, args.day)
    NUM_DAYS_IN_HISTORY = 31
    history_cutoff = prediction_date - datetime.timedelta(
        days=NUM_DAYS_IN_HISTORY)

    filtered_tweets_df = filter_by_dates(ss, tweets_df, history_cutoff,
                                         prediction_date)
    tokens_df = group_by_grid_square_and_tokenize(ss, latlongrid,
                                                  filtered_tweets_df)

    # 2 ** 18 (not 2 ^ 18, which is bitwise XOR) gives the intended number
    # of hash buckets.
    hashing_tf = feature.HashingTF(numFeatures=(2 ** 18) - 1,
                                   inputCol='tokens',
                                   outputCol='token_frequencies')
    lda = (clustering.LDA()
           .setFeaturesCol('token_frequencies')
           .setK(10)
           .setTopicDistributionCol('topic_distribution'))
    topic_distribution_pipeline = ml.Pipeline(stages=[hashing_tf, lda])
    lda_model = topic_distribution_pipeline.fit(tokens_df)
    topic_distributions = (lda_model.transform(tokens_df)
                           .select(['grid_square', 'topic_distribution']))

    complaints_df = load_filter_format_valid_complaints(
        ss, 'crime_complaints_with_header.csv')
    complaints_df.show()
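# Aside (not in the original source): in Python '^' is bitwise XOR, not
# exponentiation, so the HashingTF feature count above must use '**'.
assert 2 ^ 18 == 16        # XOR of the bit patterns 0b00010 and 0b10010
assert 2 ** 18 == 262144   # the intended number of hash buckets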
def load_mnist(sc, n_samples=None, **kwargs):
    """Creates train/test DataFrames with MNIST data.

    :param sc: SparkContext, only used when package='spark'
    :param n_samples: optional number of extra digits to load (not used in
        the current implementation)
    :param path: keyword argument, directory holding train.csv and test.csv
    :param package: keyword argument, 'pandas' (default) or 'spark'
    :return: (train_df, test_df)
    """
    path = kwargs.get('path', '/home/svanhmic/workspace/data/DABAI/mnist')
    package = kwargs.get('package', 'pandas')
    if package == 'pandas':
        train_df = pd.read_csv(path + '/train.csv', header=0)
        test_df = pd.read_csv(path + '/test.csv', header=0)
    else:
        spark_session = sql.SparkSession(sparkContext=sc)
        train_df = spark_session.read.csv(
            path=path + '/train.csv',
            header=True,
            inferSchema=True,
            mode='PERMISSIVE')
        test_df = spark_session.read.csv(
            path=path + '/test.csv',
            header=True,
            inferSchema=True,
            mode='PERMISSIVE')
    return train_df, test_df
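# Hedged usage sketch (not in the original source): loading the MNIST csv
# files through Spark rather than pandas. The path defaults to the directory
# hard-coded above and is assumed to exist.
def example_load_mnist_spark(sc):
    train_df, test_df = load_mnist(sc, package='spark')
    # inferSchema=True makes Spark type the pixel columns numerically.
    print(train_df.count(), len(train_df.columns))
    return train_df, test_df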
import os

from pyspark import SparkConf, SparkContext, sql

if __name__ == '__main__':
    conf = SparkConf().setAppName("Banner-S3-Test")
    sc = SparkContext(conf=conf)
    spark = sql.SparkSession(sc)

    connectionProperties = {
        "user": os.environ.get('BANNER_USERNAME'),
        "password": os.environ.get('BANNER_PASSWORD'),
        "driver": "oracle.jdbc.driver.OracleDriver"
    }

    jdbcUrl = (f"jdbc:oracle:thin:@{os.environ.get('BANNER_HOST')}:"
               f"{os.environ.get('BANNER_PORT')}:"
               f"{os.environ.get('BANNER_DATABASE')}")
    query_table = os.environ.get('BANNER_TABLE')
    s3_uri = 's3a://{}'.format(os.environ.get('S3_FOLDER_NAME'))

    jdbc_query = os.environ.get('BANNER_QUERY')
    jdbc_table = "(" + jdbc_query + ") base_tables_alias"

    df = spark.read.jdbc(
        url=jdbcUrl,
        table=jdbc_table,
        numPartitions=1,
        properties=connectionProperties)

    df.write.parquet(s3_uri)
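# Hedged variant (not in the original source): with numPartitions=1 the whole
# Banner extract is pulled through a single JDBC connection. If the query
# exposes a numeric key, Spark can split the read across executors. The
# partition column name and bounds below are hypothetical.
def example_partitioned_jdbc_read(spark, jdbcUrl, jdbc_table, connectionProperties):
    return spark.read.jdbc(
        url=jdbcUrl,
        table=jdbc_table,
        column='pidm',          # hypothetical numeric partition column
        lowerBound=1,
        upperBound=1000000,
        numPartitions=8,
        properties=connectionProperties)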
def create_spark_data(sc, func, **kwargs):
    spark = sql.SparkSession(sparkContext=sc)
    spark.conf.set("spark.sql.crossJoin.enabled", "true")
    return spark.createDataFrame(func(**kwargs))
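# Hedged usage sketch (not in the original source): create_spark_data expects
# func to return something createDataFrame can consume, e.g. a list of Rows.
# The generator below is hypothetical.
def example_rows(n=3):
    return [sql.Row(id=i, value=float(i) ** 2) for i in range(n)]

# df = create_spark_data(sc, example_rows, n=5)
# df.show()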
def setUpClass(cls):
    # cls is already the test class here, so cls.__name__ (not
    # cls.__class__.__name__, which would be the metaclass name) gives a
    # meaningful Spark application name.
    cls.sc = pyspark.SparkContext(appName=cls.__name__)
    cls.sc.setLogLevel('FATAL')
    cls.ss = sql.SparkSession(cls.sc)
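# Hedged sketch (not in the original source): how the setUpClass above is
# typically paired with a tearDownClass inside a unittest.TestCase, so the
# SparkContext is created once per class and shut down after all its tests.
import unittest

import pyspark
from pyspark import sql


class ExampleSparkTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.sc = pyspark.SparkContext(appName=cls.__name__)
        cls.sc.setLogLevel('FATAL')
        cls.ss = sql.SparkSession(cls.sc)

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()

    def test_session_exists(self):
        self.assertIsNotNone(self.ss)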