def test_do_cartesian(self):
    spark_session = sql.SparkSession(self.sc)  # an active session is required for RDD.toDF()
    string_rdd = self.sc.parallelize(self.test_data).map(
        lambda x: pyspark.Row(id=x[0], label=x[1], vector=ml_linalg.DenseVector(x[2])))
    string_df = string_rdd.toDF()
    test_demon = do_cartesian(sc=self.sc, df=string_df, id_col='id', feature_col='vector')
    # Every self-pair (i == j) in the cartesian result should have value 1.0.
    check_diagonal = test_demon.filter(lambda x: x.i == x.j).map(lambda x: x.value).collect()
    for diag in check_diagonal:
        self.assertEqual(1.0, diag)
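
# Minimal sketch of the contract the test above assumes for do_cartesian (an
# assumption for illustration, not the project's implementation): an RDD of
# rows (i, j, value) holding pairwise cosine similarities, so that the
# diagonal entries are 1.0.
def do_cartesian_sketch(sc, df, id_col, feature_col):
    pairs = df.rdd.map(lambda r: (r[id_col], r[feature_col]))
    return (pairs.cartesian(pairs)
                 .map(lambda p: pyspark.Row(
                     i=p[0][0],
                     j=p[1][0],
                     value=float(p[0][1].dot(p[1][1]) /
                                 (p[0][1].norm(2) * p[1][1].norm(2))))))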
Example #2
def connect_spark_sql(app_name='JUPYTER'):
    """Creates a SparkContext and a SparkSession and returns both."""
    spark_conf = SparkConf().setAppName(app_name)

    # for k, v in crd.spark_cassandra.items():
    #     spark_conf.set(k, v)

    sc = SparkContext(conf=spark_conf)
    spark = sql.SparkSession(sc)

    return sc, spark
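
# Usage sketch for connect_spark_sql; the app name here is illustrative, and the
# helper assumes the usual `from pyspark import SparkConf, SparkContext, sql` imports.
def demo_connect_spark_sql():
    sc, spark = connect_spark_sql(app_name='notebook_session')
    spark.range(5).show()  # quick check that the session is usable
    sc.stop()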
def save(acquired, bounds, products, product_dates, spark_context, clip=False,
         specs_fn=chip_specs.get, chips_fn=chips.get):
    """Saves requested products to iwds

    Args:
        acquired (str): / separated datestrings in iso8601 format.  Used to
                        determine the daterange of input data.
        bounds (str): sequence of points ((x1, y1), (x2, y2), ...).  Bounds are
                      minboxed and then corresponding chip ids are determined
                      from the result.
        products (sequence): products to save
        product_dates (sequence): product dates to produce and save
        spark_context: a spark cluster connection
        clip (bool): If True any points not falling within the minbox of bounds
                     are filtered out.

    Returns:
        generator: {product: dataframe}
    """

    ss = sql.SparkSession(spark_context)
    queries = fb.chip_spec_queries(fb.SPECS_URL)
    spec = first(specs_fn(queries[first(queries)]))
    coordinates = chips.bounds_to_coordinates(tuple(bounds), spec)

    job, jobconf = init(acquired=acquired,
                        chip_ids=coordinates,
                        products=products,
                        product_dates=product_dates,
                        specs_fn=specs_fn,
                        refspec=spec,
                        chips_fn=chips_fn,
                        spark_context=spark_context,
                        clip_box=f.minbox(bounds) if clip else None)

    # first, save the jobconf used to generate the products
    md5, cfg = f.serialize({k: f.represent(v.value)
                            for k, v in jobconf.items()})

    write(table='jobconf',
          dataframe=ss.createDataFrame([[md5, cfg]], jobconf_schema()))

    for p in products:
        df = ss.createDataFrame(
            job[p].map(lambda x: (float(x[0][0]), float(x[0][1]),
                                  float(x[0][2]), float(x[0][3]),
                                  str(x[0][5]),
                                  second(f.serialize(x[1])),
                                  second(f.serialize(x[2])),
                                  str(md5)))
                  .repartition(fb.STORAGE_PARTITION_COUNT),
            schema=result_schema())

        yield {p: write(table=f.cqlstr(job[p].name()), dataframe=df)}
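
# Usage sketch for save(): it returns a generator, so nothing is computed or
# written until the generator is consumed. Every argument value below is
# illustrative, not taken from the original project.
def demo_save(spark_context):
    results = save(acquired='1982-01-01/2017-12-31',                   # hypothetical date range
                   bounds=((-1821585, 2891595), (-1821285, 2891295)),  # hypothetical points
                   products=['ccd'],                                   # hypothetical product name
                   product_dates=['2014-07-01'],
                   spark_context=spark_context)
    return list(results)  # forces every {product: dataframe} to be produced and written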
def run(sc, args):
    sc.setLogLevel('FATAL')
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('year',
                            help='Year of prediction, in format YYYY.',
                            type=int)
    arg_parser.add_argument('month',
                            help='Month of prediction, in format MM.',
                            type=int)
    arg_parser.add_argument('day',
                            help='Day of prediction, in format DD.',
                            type=int)
    args = arg_parser.parse_args(args)

    ss = sql.SparkSession(sc)

    latlongrid = grid.LatLonGrid(lat_min=40.488320,
                                 lat_max=40.957189,
                                 lon_min=-74.290739,
                                 lon_max=-73.635679,
                                 lat_step=grid.get_lon_delta(
                                     1000, (40.957189 - 40.488320) / 2.0),
                                 lon_step=grid.get_lat_delta(1000))

    tweets_df = import_twitter_data(ss, 'tweets2.csv')

    prediction_date = datetime.date(args.year, args.month, args.day)
    NUM_DAYS_IN_HISTORY = 31
    history_cutoff = prediction_date - datetime.timedelta(
        days=NUM_DAYS_IN_HISTORY)
    filtered_tweets_df = filter_by_dates(ss, tweets_df, history_cutoff,
                                         prediction_date)

    tokens_df = group_by_grid_square_and_tokenize(ss, latlongrid,
                                                  filtered_tweets_df)

    hashing_tf = feature.HashingTF(numFeatures=(2 ** 18) - 1,
                                   inputCol='tokens',
                                   outputCol='token_frequencies')
    lda = (clustering.LDA()
           .setFeaturesCol('token_frequencies')
           .setK(10)
           .setTopicDistributionCol('topic_distribution'))
    topic_distribution_pipeline = ml.Pipeline(stages=[hashing_tf, lda])
    lda_model = topic_distribution_pipeline.fit(tokens_df)
    topic_distributions = (lda_model.transform(tokens_df)
                           .select(['grid_square', 'topic_distribution']))

    complaints_df = load_filter_format_valid_complaints(
        ss, 'crime_complaints_with_header.csv')

    complaints_df.show()
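
# Standalone sketch of the topic-modelling step in run() above. The toy rows
# and the smaller feature space are illustrative; only the column names
# 'grid_square' and 'tokens' mirror the dataframe run() builds.
def lda_pipeline_sketch(ss):
    toy_tokens_df = ss.createDataFrame(
        [(0, ['robbery', 'subway', 'late']),
         (1, ['parade', 'music', 'crowd'])],
        ['grid_square', 'tokens'])
    hashing_tf = feature.HashingTF(numFeatures=1024,  # small hash space for the toy data
                                   inputCol='tokens',
                                   outputCol='token_frequencies')
    lda = clustering.LDA(k=2,
                         featuresCol='token_frequencies',
                         topicDistributionCol='topic_distribution')
    model = ml.Pipeline(stages=[hashing_tf, lda]).fit(toy_tokens_df)
    return model.transform(toy_tokens_df).select('grid_square', 'topic_distribution')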
def load_mnist(sc, n_samples=None, **kwargs):
    """
    Loads the MNIST train and test CSV files into dataframes.
    :param sc: SparkContext, only used when package is not 'pandas'
    :param n_samples: optional sample limit (unused in the version shown here)
    :param path: directory containing train.csv and test.csv
    :param package: 'pandas' (default) for pandas dataframes, anything else for Spark
    :return: (train_df, test_df)
    """
    path = kwargs.get('path', '/home/svanhmic/workspace/data/DABAI/mnist')
    package = kwargs.get('package', 'pandas')
    if package == 'pandas':
        train_df = pd.read_csv(path+'/train.csv', header=0)
        test_df = pd.read_csv(path+'/test.csv', header=0)
    else:
        spark_session = sql.SparkSession(sparkContext=sc)
        train_df = spark_session.read.csv(
            path=path+'/train.csv', header=True,
            inferSchema=True, mode='PERMISSIVE'
        )
        test_df = spark_session.read.csv(
            path=path+'/test.csv', header=True,
            inferSchema=True, mode='PERMISSIVE'
        )
    return train_df, test_df
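
# Usage sketch for load_mnist; the path below is illustrative and would point
# at a directory containing train.csv and test.csv.
def demo_load_mnist(sc):
    train_pdf, test_pdf = load_mnist(sc, path='/tmp/mnist')                   # pandas dataframes
    train_sdf, test_sdf = load_mnist(sc, path='/tmp/mnist', package='spark')  # Spark dataframes
    return train_pdf.shape, train_sdf.count()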
Example #6
import os
from pyspark import SparkConf, SparkContext, sql

if __name__ == '__main__':

    conf = SparkConf().setAppName("Banner-S3-Test")

    sc = SparkContext(conf=conf)
    spark = sql.SparkSession(sc)

    connectionProperties = {
        "user": os.environ.get('BANNER_USERNAME'),
        "password": os.environ.get('BANNER_PASSWORD'),
        "driver": "oracle.jdbc.driver.OracleDriver"
    }
    jdbcUrl = f"jdbc:oracle:thin:@{os.environ.get('BANNER_HOST')}:{os.environ.get('BANNER_PORT')}:{os.environ.get('BANNER_DATABASE')}"
    query_table = os.environ.get('BANNER_TABLE')

    s3_uri = 's3a://{}'.format(os.environ.get('S3_FOLDER_NAME'))

    jdbc_query = os.environ.get('BANNER_QUERY')
    jdbc_table = "(" + jdbc_query + ") base_tables_alias"

    df = spark.read.jdbc(url=jdbcUrl,
                         table=jdbc_table,
                         numPartitions=1,
                         properties=connectionProperties)

    df.write.parquet(s3_uri)
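
    # Note (an assumption, not part of the original script): the s3a:// write and
    # the Oracle JDBC read both need extra jars (hadoop-aws, ojdbc) on the
    # classpath, plus S3 credentials on the Spark/Hadoop configuration, e.g.:
    #
    #     conf = (SparkConf()
    #             .setAppName("Banner-S3-Test")
    #             .set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    #             .set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"]))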
def create_spark_data(sc, func, **kwargs):
    """Builds a Spark dataframe from whatever func(**kwargs) returns."""
    spark = sql.SparkSession(sparkContext=sc)
    spark.conf.set("spark.sql.crossJoin.enabled", "true")
    return spark.createDataFrame(func(**kwargs))
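
# Usage sketch for create_spark_data: func can be any callable whose return
# value spark.createDataFrame accepts (a list of tuples or Rows, a pandas
# dataframe, ...). The point generator below is illustrative.
def demo_create_spark_data(sc):
    def make_points(n=10):
        return [(i, float(i) ** 2) for i in range(n)]
    return create_spark_data(sc, make_points, n=100)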
Example #8
@classmethod
def setUpClass(cls):
    cls.sc = pyspark.SparkContext(appName=cls.__name__)  # name the app after the test class
    cls.sc.setLogLevel('FATAL')
    cls.ss = sql.SparkSession(cls.sc)
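
# A matching teardown for the fixture above would normally stop the context
# (a sketch; not shown in the original snippet):
@classmethod
def tearDownClass(cls):
    cls.sc.stop()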