Example #1
class MLlibTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext('local[4]', "MLlib tests")
        self.spark = SparkSession(self.sc)

    def tearDown(self):
        self.spark.stop()
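A test in this suite can reuse the fixture above through `self.spark`. A minimal sketch, assuming a hypothetical subclass name and illustrative data:

class SimpleDataFrameTest(MLlibTestCase):
    def test_count(self):
        # Build a tiny DataFrame on the shared session and check a basic action.
        df = self.spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
        self.assertEqual(df.count(), 2)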
Example #2
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
    finally:
        spark.stop()

    return times
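`convert_sparse_to_dataframe` is a project helper that is not shown in the source. A hedged sketch of one plausible implementation, assuming `ratings` is a scipy.sparse matrix, so that the resulting DataFrame exposes the "row", "col" and "data" columns the ALS estimator above expects:

def convert_sparse_to_dataframe(spark, context, sparse_matrix):
    # Assumption: sparse_matrix is a scipy.sparse matrix of implicit feedback.
    coo = sparse_matrix.tocoo()
    rows = [(int(r), int(c), float(d))
            for r, c, d in zip(coo.row, coo.col, coo.data)]
    # Distribute the triples and build a DataFrame with the expected column names.
    return spark.createDataFrame(context.parallelize(rows), ["row", "col", "data"])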
Example #3
 def test_get_active_session_when_no_active_session(self):
     active = SparkSession.getActiveSession()
     self.assertEqual(active, None)
     spark = SparkSession.builder \
         .master("local") \
         .getOrCreate()
     active = SparkSession.getActiveSession()
     self.assertEqual(active, spark)
     spark.stop()
     active = SparkSession.getActiveSession()
     self.assertEqual(active, None)
Example #4
def _test():
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        spark = SparkSession.withHiveSupport(sc)
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
    globs['sdf'] = \
        spark.read.format('text').stream('python/test_support/sql/streaming')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    sc.stop()
    if failure_count:
        exit(-1)
Example #5
 def test_active_session(self):
     spark = SparkSession.builder \
         .master("local") \
         .getOrCreate()
     try:
         activeSession = SparkSession.getActiveSession()
         df = activeSession.createDataFrame([(1, 'Alice')], ['age', 'name'])
         self.assertEqual(df.collect(), [Row(age=1, name=u'Alice')])
     finally:
         spark.stop()
Example #6
 def test_get_active_session_after_create_dataframe(self):
     session2 = None
     try:
         activeSession1 = SparkSession.getActiveSession()
         session1 = self.spark
         self.assertEqual(session1, activeSession1)
         session2 = self.spark.newSession()
         activeSession2 = SparkSession.getActiveSession()
         self.assertEqual(session1, activeSession2)
         self.assertNotEqual(session2, activeSession2)
         session2.createDataFrame([(1, 'Alice')], ['age', 'name'])
         activeSession3 = SparkSession.getActiveSession()
         self.assertEqual(session2, activeSession3)
         session1.createDataFrame([(1, 'Alice')], ['age', 'name'])
         activeSession4 = SparkSession.getActiveSession()
         self.assertEqual(session1, activeSession4)
     finally:
         if session2 is not None:
             session2.stop()
Example #7
 def test_active_session_with_None_and_not_None_context(self):
     from pyspark.context import SparkContext
     from pyspark.conf import SparkConf
     sc = None
     session = None
     try:
         sc = SparkContext._active_spark_context
         self.assertEqual(sc, None)
         activeSession = SparkSession.getActiveSession()
         self.assertEqual(activeSession, None)
         sparkConf = SparkConf()
         sc = SparkContext.getOrCreate(sparkConf)
         activeSession = sc._jvm.SparkSession.getActiveSession()
         self.assertFalse(activeSession.isDefined())
         session = SparkSession(sc)
         activeSession = sc._jvm.SparkSession.getActiveSession()
         self.assertTrue(activeSession.isDefined())
         activeSession2 = SparkSession.getActiveSession()
         self.assertNotEqual(activeSession2, None)
     finally:
         if session is not None:
             session.stop()
         if sc is not None:
             sc.stop()
Example #8
    def __init__(self, sc, moldb_name, isocalc):
        self._sc = sc
        self._moldb_name = moldb_name
        self._isocalc = isocalc
        self._sm_config = SMConfig.get_conf()
        self._parquet_chunks_n = 64
        self._iso_gen_part_n = 512

        self._spark_session = SparkSession(self._sc)
        self._ion_centroids_path = '{}/{}/{}/{}'.format(self._sm_config['isotope_storage']['path'],
                                                        self._moldb_name,
                                                        self._isocalc.sigma,
                                                        self._isocalc.charge)
        self.ion_df = None
        self.ion_centroids_df = None
Example #9
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.Builder().appName('Example').getOrCreate()

sales_df = spark.read \
     .option("inferSchema", "true") \
     .option("header", "true") \
     .csv("sales.csv")

result = sales_df.groupBy("COUNTRY_CODE")\
                 .sum("AMOUNT")\
                 .orderBy(desc("sum(AMOUNT)"))

result.show()
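The orderBy(desc("sum(AMOUNT)")) call relies on the auto-generated column name. An equivalent sketch that gives the aggregate an explicit alias instead:

from pyspark.sql.functions import desc, sum as sum_

result = (sales_df.groupBy("COUNTRY_CODE")
          .agg(sum_("AMOUNT").alias("TOTAL_AMOUNT"))
          .orderBy(desc("TOTAL_AMOUNT")))
result.show()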
Example #10
# Load Amazon metadata
path = '../data/Amazon/amazon_meta_p2.json'
records = []
for line in open(path, 'r'):
    item = json.loads(line.strip())
    record = (item['asin'], html.unescape(item['title']))
    records.append(record)
amazon_df = pd.DataFrame.from_records(records, columns=['asin', 'title'])

sc = SparkContext()
sc.setLogLevel('ERROR')
imdb_df = sc.broadcast(imdb_df).value
amazon_df = sc.broadcast(amazon_df).value

spark = SparkSession(sc)

imdb_df = spark.createDataFrame(imdb_df)
amazon_df = spark.createDataFrame(amazon_df)

k = 1
print(f'Performing matching round {k}...')
src_encode = udf(lambda x: encode(x, lower=False, replace=False))
desc_encode = udf(
    lambda x: encode(x,
                     end_words=['['],
                     skip_words=['vhs', 'dvd', 'anglais', 'italien'],
                     lower=False,
                     replace=False))
matched, amazon_df = match(imdb_df, amazon_df, 'title', 'imdb_id', src_encode,
                           desc_encode)
Example #11
import atexit
import os
import platform
import warnings

from pyspark.context import SparkContext
from pyspark.sql import SparkSession

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()  # type: ignore

try:
    spark = SparkSession._create_shell_session()  # type: ignore
except Exception:
    import sys
    import traceback

    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())

# for compatibility
sqlContext = spark._wrapped
sqlCtx = sqlContext
Example #12
def test(spark: SparkSession):
    spark.sql("use aijiami")
    spark.sql("desc dw_opera_detail").show(100)
Example #13
    except TypeError:
        pass

    return dictX


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <file> <hdfs-files> <zk> <topic> ",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Kafka with DT")
    sc.setLogLevel("ERROR")

    spark = SparkSession(sc)

    #Create model
    a = 0
    orig = sys.argv[1]
    path = 'hdfs://master:9000/user/app/'
    file = orig.split('app/')[1]
    features = sc.textFile(path + 'features-des.txt').collect()
    feat = []
    for i in features:
        #feat.append(i.split('-')[0].split(' ')[0])
        feat.append(i.split(',')[1])

    [model, index] = getModel(path, file)

    if path_exist(
Example #14
# -*- coding: UTF-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time
from pyspark.ml import Pipeline

conf = SparkConf().setAppName("dt")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
df = spark.read.csv('hdfs://192.168.100.6:9000/user/ubuntu/Dataset75.csv',
                    header=True)

data = df.rdd.map(list)
print(data.first())
score = data.map(lambda s: 1.0
                 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(lambda p: (p[0], p[1]))

sentenceData = spark.createDataFrame(tranform_data,
                                     ["label", "sentence"])  #转化DataFrame
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
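The excerpt stops right after the Tokenizer. A hedged sketch of how the already-imported HashingTF, IDF and DecisionTreeClassifier stages could be wired into the Pipeline; the parameter values and split ratio are illustrative, not the original code:

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, dtc])

# Illustrative train/test split and accuracy check.
train, test = sentenceData.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)
predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("accuracy: %.3f" % evaluator.evaluate(predictions))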
Example #15
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a Coco Dataset into Rikai dataset.

    This function expects the COCO datasets are stored in directory with the
    following structure:

    - dataset
        - annotations
          - captions_train2017.json
          - instances_train2017.json
          - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The directory of dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images, can be a s3 directory.

    Return
    ------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations",
                              "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations",
                            "instances_val2017.json")

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # Coco has native dependencies, so we do not distribute them
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append({
                    "category_id": ann["category_id"],
                    "category_text": categories[ann["category_id"]]["name"],
                    "bbox": bbox,
                    "area": float(ann["area"]),
                })
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            os.curdir,
                            "dataset",
                            "{}2017".format(split),
                            image_payload["file_name"],
                        ))),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
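A hedged usage sketch for convert(); the application name and dataset path are placeholders, and the directory is expected to follow the COCO layout described in the docstring:

if __name__ == "__main__":
    # Placeholder values for illustration only.
    spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()
    df = convert(spark, dataset_root="/path/to/coco", limit=100)
    df.printSchema()
    spark.stop()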
Example #16
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel

sc = SparkContext(appName="MyFirstApp3_Task_task2")
spark = SparkSession(sc)

df_node16 = spark.read.format("parquet").load(
    path="hdfs://namenode:9000/example3/test.parquet")
model_node17 = PipelineModel.load("hdfs://namenode:9000/example3/model/")
df_node18 = model_node17.transform(df_node16)

evaluator_node19 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
score_node19 = evaluator_node19.evaluate(df_node18)
df_node19 = spark.createDataFrame([(score_node19, )], ["score"])

df_node19.write.format("csv").save(
    path="hdfs://namenode:9000/example3/EvalResult3.csv",
    quote="\"",
    header=True,
    sep=",")
Example #17
from pyspark.sql.functions import struct, array, lit, monotonically_increasing_id, col, expr, when, concat, udf, split, size, lag, count, isnull
from pyspark.sql import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GBTRegressor, LinearRegression, GeneralizedLinearRegression, RandomForestRegressor
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.feature import VectorIndexer, VectorAssembler, StringIndexer, IndexToString
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

# In[2]:

from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
#sc = SparkContext()
sparkSession = SparkSession.builder.getOrCreate()

# # 1. Data Ingestion

# ### Peeking into data

# In[4]:

get_ipython().system(
    u'curl -i -L "http://edwdemo0.field.hortonworks.com:50070/webhdfs/v1/data/NFLPlaybyPlay2015.csv?op=OPEN" | tail -n 5'
)

# ### Load Data from Remote HDP Cluster (from HDFS)

# In[5]:
Example #18
#chapter 12 - rdd, spark definitive guide
#refer chapter 3, programming with rdds, learning spark

from pyspark.sql import SparkSession
spark = SparkSession.Builder().appName("rdd").master("local[3]").getOrCreate()

sc = spark.sparkContext

r1 = spark.range(10).rdd

#below statement would fail if file does not exist
r2 = sc.textFile(
    "/home/user/workarea/projects/learn-pyspark/data/sampledata/sample01.txt")

#to handle the situation, use below code:

from py4j.protocol import Py4JJavaError


def try_read(path):
    rdd = sc.textFile(path)
    try:
        rdd.first()
        return rdd
    except Py4JJavaError as e:
        print("file does not exist, returning empty rdd")
        return sc.emptyRDD()


#now, passing a non-existent file path
rdd = try_read("/home/user/workarea/projects/learn-pyspark/data/sample01.txt")
Example #19
def main():

    # Configure argparse
    parser = argparse.ArgumentParser(description='NetLytics Job')

    parser.add_argument('--connector',
                        metavar='connector',
                        type=str,
                        help='Connector class name')
    parser.add_argument('--input_path',
                        metavar='input_path',
                        type=str,
                        help='Base Log Files Input Path')
    parser.add_argument('--start_day',
                        metavar='start_day',
                        type=str,
                        help='Start day for analysis, format YYYY_MM_DD')
    parser.add_argument('--end_day',
                        metavar='end_day',
                        type=str,
                        help='End day for analysis, format YYYY_MM_DD')
    parser.add_argument(
        '--output_path',
        metavar='output_path',
        type=str,
        help='Path where to store resulting labeled Data Table')
    parser.add_argument('--algo',
                        metavar='algo',
                        type=str,
                        help='Clustering Algorithm to run')
    parser.add_argument(
        '--params',
        metavar='params',
        type=str,
        default="{}",
        help='Parameters to be given to the Clustering Algorithm, in Json')
    parser.add_argument(
        '--query',
        metavar='query',
        type=str,
        default=None,
        help='Eventual SQL query to execute to preprocess the dataset')
    parser.add_argument(
        '--numerical_features',
        metavar='numerical_features',
        type=str,
        default="",
        help='Columns to use as numerical features, separated by comma')
    parser.add_argument(
        '--categorical_features',
        metavar='categorical_features',
        type=str,
        default="",
        help='Columns to use as categorical features, separated by comma')
    parser.add_argument("--normalize",
                        action="store_true",
                        help="Normalize data before clustering")

    # Get parameters
    args = vars(parser.parse_args())
    input_path = args["input_path"]
    output_path = args["output_path"]
    connector = args["connector"]
    algo = args["algo"]
    params = args["params"]
    start_day = args["start_day"]
    end_day = args["end_day"]
    query = args["query"]
    numerical_features = args["numerical_features"].split(
        ",") if args["numerical_features"] != "" else []
    categorical_features = args["categorical_features"].split(
        ",") if args["categorical_features"] != "" else []
    normalize = args["normalize"]

    # Get path of NetLytics
    base_path = os.path.dirname(os.path.realpath(__file__))

    # Create Spark Context
    conf = (SparkConf().setAppName("NetLytics Job"))
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    # Create the dataframe
    dataset = core.utils.get_dataset(sc, spark, base_path, connector,
                                     input_path, start_day, end_day)

    # Pre process the dataframe
    manipulated_dataset = core.utils.transform(dataset, spark,
                                               sql_query=query,
                                               numerical_features=numerical_features,
                                               categorical_features=categorical_features,
                                               normalize=normalize)
    # Run Clustering
    clustering_algo_module = my_import(algo, sc)
    clustering_algo_instance = clustering_algo_module(json.loads(params))
    prediction = clustering_algo_instance.run(manipulated_dataset)

    # Save Output in CSV
    rdd = prediction.rdd.map(RowToStr)
    rdd.saveAsTextFile(output_path)
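RowToStr and my_import are project helpers that are not shown here. A minimal sketch of what a CSV-style RowToStr might look like, purely as an assumption:

def RowToStr(row):
    # Assumed helper: serialize each Row as a comma-separated line so that
    # rdd.saveAsTextFile() writes CSV-like output.
    return ",".join("" if v is None else str(v) for v in row)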
Example #20
 def setUp(self):
     self.sc = SparkContext.getOrCreate(SparkConf())
     self.spark = SparkSession(self.sc)
     self.obj = MotelsHomeRecommendation('', '', '', '')
Example #21
 def test_sqlcontext_with_stopped_sparkcontext(self):
     # SPARK-30856: test initialization via SparkSession when only the SparkContext is stopped
     self.sc.stop()
     self.sc = SparkContext('local[4]', self.sc.appName)
     self.spark = SparkSession(self.sc)
     self.assertIs(SQLContext.getOrCreate(self.sc).sparkSession, self.spark)
Example #22
    print("Could not convert datatype to an Float.")

# print(df)

""" Checking datatype of each column """
# print('Close ',df['Close'].dtype)
# print('Volume ',df['Volume'].dtype)
# print('Low ',df['Low'].dtype)
# print('Open ',df['Open'].dtype)
# print('High ',df['High'].dtype)

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
sparkSession = SparkSession(sc)
stockData = sparkSession.createDataFrame(df)

# print(stock_price_data)
# print(stock_price_data.printSchema())
# print(stock_price_data.describe().toPandas().transpose())

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

print("Seperating the Open, High and Low:")
featureAssembler = VectorAssembler(inputCols=["Open", "High", "Low"], outputCol="Independent Columns")
output = featureAssembler.transform(stockData)
# print(output.show())
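The snippet imports LinearRegression and RegressionEvaluator but is cut off before using them. A hedged sketch of a likely continuation, assuming the label is the "Close" column and an illustrative train/test split:

# Hedged continuation: column choice and split ratio are assumptions, not the original code.
finalData = output.select("Independent Columns", "Close")
trainData, testData = finalData.randomSplit([0.75, 0.25])
regressor = LinearRegression(featuresCol="Independent Columns", labelCol="Close")
model = regressor.fit(trainData)
predictions = model.transform(testData)
evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction",
                                metricName="rmse")
print("RMSE:", evaluator.evaluate(predictions))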
Example #23
    def transform_data(self, sc, args):
        """This method gets datajet files as input and prepare them on a daily
        intermediary basis for Marreco's main algorithm DIMSUM.

        :type sc: spark context
        :param sc: spark context for running jobs.

        :param args:
          
          :type days_init: int
          :param days_init: How many days to scan through the files to be used
                       in the transformation phase. If this value is, say,
                       ``5``, then Marreco will take today's date and come
                       back 5 days in time from where it will start reading
                       input files.

          :type days_end: int
          :param days_end: Similar to ``days_init`` but tells where the end
                           of scanning should be. If set to, say, ``3``, then
                           it scans back in time until 3 days ago, counting
                           from today.

          :type w_browse: float
          :param w_browse: Weight associated to browsing events on skus.

          :type w_purchase: float
          :param w_purchase: Weight associated to purchasing events on skus.

          :type force: str
          :param force: Either ``yes``, in which case forces recreation of
                        files, or ``no``, in which case if files already
                        exist then do nothing.

          :type source_uri: str
          :param source_uri: URI from where to read input data from.

          :type inter_uri: str
          :param inter_uri: URI to save intermediate results.

          :type neighbor_uri: str
          :param neighbor_uri: URI for where to save similarity matrix result.

          :type threshold: float
          :param threshold: This should be converted to float. It asserts how
                            much quality we should sacrifice in order to gain
                            performance.

          :type decay: float
          :param decay: How much less of an influence a score has given how
                        long ago it happened. The further back in time, the
                        more this ``decay`` factor diminishes the value.
        """
        spark = SparkSession(sc)
        for day in range(args.days_init, args.days_end - 1, -1):
            formatted_day = self.get_formatted_date(day)
            source_uri = args.source_uri.format(formatted_day)
            inter_uri = args.inter_uri.format(formatted_day)
            try:
                inter_data = spark.read.json(
                    inter_uri,
                    schema=self._load_users_matrix_schema()).first()

                if args.force == 'yes' or not inter_data:
                    self._process_datajet_day(sc,
                                              source_uri,
                                              inter_uri,
                                              args,
                                              mode='overwrite')
            except (Py4JJavaError, AnalysisException):
                self._process_datajet_day(sc, source_uri, inter_uri, args)
Example #24
class DeltaTableTests(PySparkTestCase):
    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_delete(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # delete with condition as str
        dt.delete("key = 'a'")
        self.__checkAnswer(dt.toDF(), [('b', 2), ('c', 3), ('d', 4)])

        # delete with condition as Column
        dt.delete(col("key") == lit("b"))
        self.__checkAnswer(dt.toDF(), [('c', 3), ('d', 4)])

        # delete without condition
        dt.delete()
        self.__checkAnswer(dt.toDF(), [])

        # bad args
        with self.assertRaises(TypeError):
            dt.delete(condition=1)

    def test_update(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # update with condition as str and with set exprs as str
        dt.update("key = 'a' or key = 'b'", {"value": "1"})
        self.__checkAnswer(dt.toDF(), [('a', 1), ('b', 1), ('c', 3), ('d', 4)])

        # update with condition as Column and with set exprs as Columns
        dt.update(expr("key = 'a' or key = 'b'"), {"value": expr("0")})
        self.__checkAnswer(dt.toDF(), [('a', 0), ('b', 0), ('c', 3), ('d', 4)])

        # update without condition
        dt.update(set={"value": "200"})
        self.__checkAnswer(dt.toDF(), [('a', 200), ('b', 200), ('c', 200),
                                       ('d', 200)])

        # bad args
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update({"value": "200"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update(condition='a')

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.update(set=1)

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.update(1, {})

        with self.assertRaisesRegex(TypeError,
                                    "Values of dict in .* must contain only"):
            dt.update(set={"value": 1})

        with self.assertRaisesRegex(TypeError,
                                    "Keys of dict in .* must contain only"):
            dt.update(set={1: ""})

        with self.assertRaises(TypeError):
            dt.update(set=1)

    def test_merge(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        source = self.spark.createDataFrame([('a', -1), ('b', 0), ('e', -5),
                                             ('f', -6)], ["k", "v"])

        def reset_table():
            self.__overwriteDeltaTable([('a', 1), ('b', 2), ('c', 3),
                                        ('d', 4)])

        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # ============== Test basic syntax ==============

        # String expressions in merge condition and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(set={"value": "v + 0"}) \
            .whenNotMatchedInsert(values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 0), ('c', 3),
                                        ('d', 4), ('e', -5), ('f', -6)]))

        # Column expressions in merge condition and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(set={"value": col("v") + 0}) \
            .whenNotMatchedInsert(values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 0), ('c', 3),
                                        ('d', 4), ('e', -5), ('f', -6)]))

        # ============== Test clause conditions ==============

        # String expressions in all conditions and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(condition="k = 'a'", set={"value": "v + 0"}) \
            .whenMatchedDelete(condition="k = 'b'") \
            .whenNotMatchedInsert(condition="k = 'e'", values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4),
                                        ('e', -5)]))

        # Column expressions in all conditions and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(
                condition=expr("k = 'a'"),
                set={"value": col("v") + 0}) \
            .whenMatchedDelete(condition=expr("k = 'b'")) \
            .whenNotMatchedInsert(
                condition=expr("k = 'e'"),
                values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4),
                                        ('e', -5)]))

        # Positional arguments
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate("k = 'a'", {"value": "v + 0"}) \
            .whenMatchedDelete("k = 'b'") \
            .whenNotMatchedInsert("k = 'e'", {"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4),
                                        ('e', -5)]))

        # ============== Test updateAll/insertAll ==============

        # No clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll() \
            .whenNotMatchedInsertAll() \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 0), ('c', 3),
                                        ('d', 4), ('e', -5), ('f', -6)]))

        # String expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), "s.key = t.key") \
            .whenMatchedUpdateAll("s.key = 'a'") \
            .whenNotMatchedInsertAll("s.key = 'e'") \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 2), ('c', 3),
                                        ('d', 4), ('e', -5)]))

        # Column expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll(expr("s.key = 'a'")) \
            .whenNotMatchedInsertAll(expr("s.key = 'e'")) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 2), ('c', 3),
                                        ('d', 4), ('e', -5)]))

        # ============== Test bad args ==============
        # ---- bad args in merge()
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            dt.merge(1, "key = k")

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, 1)

        # ---- bad args in whenMatchedUpdate()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate({"value": "v"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate(1)

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source,
                     "key = k").whenMatchedUpdate(condition="key = 'a'")

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedUpdate(1, {"value": "v"})

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenMatchedUpdate("k = 'a'", 1)

        with self.assertRaisesRegex(TypeError,
                                    "Values of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={"value": 1})

        with self.assertRaisesRegex(TypeError,
                                    "Keys of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={1: ""})

        with self.assertRaises(TypeError):
            dt.merge(source,
                     "key = k").whenMatchedUpdate(set="k = 'a'",
                                                  condition={"value": 1})

        # bad args in whenMatchedDelete()
        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedDelete(1)

        # ---- bad args in whenNotMatchedInsert()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert({"value": "v"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1)

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(condition="key = 'a'")

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1, {"value": "v"})

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenNotMatchedInsert("k = 'a'", 1)

        with self.assertRaisesRegex(TypeError,
                                    "Values of dict in .* must contain only"):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(values={"value": 1})

        with self.assertRaisesRegex(TypeError,
                                    "Keys of dict in .* must contain only"):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(values={1: "value"})

        with self.assertRaises(TypeError):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(values="k = 'a'",
                                                     condition={"value": 1})

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations, [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode, [Row("Overwrite")],
            StructType(
                [StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.
        dt.vacuum(1000)  # test whether integers work

        self.assertEqual(True, self.__checkFileExists('bac.txt'))
        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`%s`" % self.tempFile)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile), [('a', 1),
                                                                  ('b', 2),
                                                                  ('c', 3)])

        # test if convert to delta with partition columns works
        tempFile2 = self.tempFile + "_2"
        df.write.partitionBy("value").format("parquet").save(tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile2,
                                       schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])

        # convert to delta with partition column provided as a string
        tempFile3 = self.tempFile + "_3"
        df.write.partitionBy("value").format("parquet").save(tempFile3)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile3,
                                       "value int")
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile3),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_isDeltaTable(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        tempFile2 = self.tempFile + '_2'
        df.write.format("delta").save(tempFile2)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile),
                         False)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        try:
            self.assertEqual(df.count(), expectedDF.count())
            self.assertEqual(len(df.columns), len(expectedDF.columns))
            self.assertEqual([], df.subtract(expectedDF).take(1))
            self.assertEqual([], expectedDF.subtract(df).take(1))
        except AssertionError:
            print("Expected:")
            expectedDF.show()
            print("Found:")
            df.show()
            raise

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
Example #25
import atexit
import os
import platform
import warnings

import py4j

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    spark = SparkSession._create_shell_session()
except Exception:
    import sys
    import traceback
    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())

# for compatibility
sqlContext = spark._wrapped
sqlCtx = sqlContext
Example #26
 def setUp(self):
     super(DeltaTableTests, self).setUp()
     self.sqlContext = SQLContext(self.sc)
     self.spark = SparkSession(self.sc)
     self.tempPath = tempfile.mkdtemp()
     self.tempFile = os.path.join(self.tempPath, "tempFile")
Example #27
from py4j.java_gateway import java_import, JavaGateway, GatewayClient

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SQLContext

# start JVM gateway
client = GatewayClient(port=${JVM_GATEWAY_PORT})
gateway = JavaGateway(client, auto_convert=True)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")

intp = gateway.entry_point
jsc = intp.getJavaSparkContext()

java_import(gateway.jvm, "org.apache.spark.sql.*")
java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
java_import(gateway.jvm, "scala.Tuple2")

jconf = jsc.getConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = _zsc_ = SparkContext(jsc=jsc, gateway=gateway, conf=conf)

if intp.isSpark2():
    from pyspark.sql import SparkSession

    spark = __zSpark__ = SparkSession(sc, intp.getSparkSession())
    sqlContext = sqlc = __zSqlc__ = __zSpark__._wrapped
else:
    sqlContext = sqlc = __zSqlc__ = SQLContext(sparkContext=sc, sqlContext=intp.getSQLContext())
Example #28
 def setUpClass(cls):
     PySparkTestCase.setUpClass()
     cls.spark = SparkSession(cls.sc)
Example #29
def main(spark) -> None:
    """ run example """
    PG_CONFIG = get_pg_config()

    print('TRY: create df')
    df = spark.range(1, 20, 1, 4).withColumn('mono_id',
                                             F.monotonically_increasing_id())
    print('OK: create df')
    df.show()

    print('')

    print('TRY: write_to_pg')
    write_to_pg(df=df, config=PG_CONFIG, table='test_table')
    print('OK: write_to_pg')

    print('')

    print('TRY: read_from_pg')
    df_loaded = read_from_pg(config=PG_CONFIG, sql='test_table', sc=sc)
    print('OK: read_from_pg')
    df_loaded.show()


if __name__ == '__main__':
    sc = init_spark_context('app')
    spark = SparkSession(sc)
    main(spark)
    spark.stop()
Example #30
def run_spark(output_file):
    sc = SparkContext()
    spark = SparkSession(sc)

    violations_df = get_violations_df(violations, spark)
    streets_df = get_streets_df(streets, spark)

    streets_dict = streets_df.rdd.flatMap(mapper).reduceByKey(
        lambda x, y: x + y).collectAsMap()
    streets_dict_bc = sc.broadcast(streets_dict)

    def get_val(borocode, street, num0, num1):
        res = None
        housenum = (num0, num1)
        if num0 != 0 and num1 == 0:
            housenum = (num1, num0)
        candidates = streets_dict_bc.value.get((borocode, street))
        if candidates:
            res = search_candidates(candidates, housenum)
            if res is None and num0 > 1000 and num1 == 0:
                housenum = (int(num0 / 100), num0 % 100)
                res = search_candidates(candidates, housenum)
            if res is None and num0 != 0 and num1 != 0:
                housenum = (0, (num0 * 100) + num1)
                res = search_candidates(candidates, housenum)
            if res is None and num0 != 0 and num1 != 0:
                housenum = (0, num1)
                res = search_candidates(candidates, housenum)
        return res

    get_val_udf = udf(get_val)
    matched_violations = violations_df.withColumn(
        'PHYSICALID',
        get_val_udf(violations_df['v.COUNTY'], violations_df['v.STREETNAME'],
                    violations_df['v.NUM0'], violations_df['v.NUM1']))
    matched_violations = matched_violations.filter(
        matched_violations['PHYSICALID'].isNotNull())
    matched_violations = matched_violations.withColumn(
        "PHYSICALID", matched_violations["PHYSICALID"].cast("integer"))
    matched_violations = matched_violations.orderBy("PHYSICALID")
    matched_violations = matched_violations.groupBy("PHYSICALID", "YEAR").agg(
        count("*").alias("YEAR_COUNT"))
    matched_violations.createOrReplaceTempView("matched_violations")

    summaries = spark.sql(
        "select PHYSICALID, " +
        "MAX(CASE WHEN (YEAR = 2015) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2015, "
        +
        "MAX(CASE WHEN (YEAR = 2016) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2016, "
        +
        "MAX(CASE WHEN (YEAR = 2017) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2017, "
        +
        "MAX(CASE WHEN (YEAR = 2018) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2018, "
        +
        "MAX(CASE WHEN (YEAR = 2019) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2019  "
        + "from matched_violations " + "group by PHYSICALID " +
        "order by PHYSICALID ")

    getOLS_udf = udf(getOLS)
    summaries = summaries.withColumn(
        'OLS_COEF',
        getOLS_udf(
            array('COUNT_2015', 'COUNT_2016', 'COUNT_2017', 'COUNT_2018',
                  'COUNT_2019')))

    streets_df = streets_df.select(col("s.PHYSICALID")) \
                        .join(summaries, "PHYSICALID", how='left') \
                        .distinct() \
                        .orderBy("PHYSICALID") \

    streets_df = streets_df.withColumn("COUNT_2015",
                                       coalesce("COUNT_2015", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2016",
                                       coalesce("COUNT_2016", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2017",
                                       coalesce("COUNT_2017", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2018",
                                       coalesce("COUNT_2018", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2019",
                                       coalesce("COUNT_2019", lit(0)))
    streets_df = streets_df.withColumn("OLS_COEF",
                                       coalesce("OLS_COEF", lit(0.0)))

    streets_df.write.csv(output_file, header=False)
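get_violations_df, get_streets_df, search_candidates and getOLS are project helpers that are not included here. A hedged sketch of one way search_candidates could work, assuming each candidate is a (low, high, physical_id) tuple and house numbers compare as tuples:

def search_candidates(candidates, housenum):
    # Assumed helper: return the PHYSICALID of the first candidate whose
    # (low, high) house-number range contains the requested house number.
    for low, high, physical_id in candidates:
        if low <= housenum <= high:
            return physical_id
    return None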
Example #31
local_data_directory = '/home/brendan/FastData/pubmed/gz/'

################################################
print('initializing spark')
# init spark
conf = SparkConf()
conf = (
    conf.setMaster('local[*]').set('spark.driver.memory', '96G')  # 40
    .set('spark.driver.maxResultSize', '500M'))
#.set('spark.storage.memoryFraction',0))  # this setting is now a legacy option
#.set('spark.executor.memory','1G')  # 20
#.set('spark.python.worker.reuse', 'false')
#.set('spark.python.worker.memory','512m')
#.set('spark.executor.cores','1'))
sc = SparkContext(conf=conf)
spark = SparkSession(sc)  # don't need this for vanilla RDDs

print(sc._conf.getAll())

###############################################


def ftp_helper(namestr):
    # local save file for ftp'd xml
    target = target_brdcst.value + namestr.split(
        '.')[0] + '.xml.gz'  # pubmed year chunk# .xml
    try:
        with FTP(ftp_dir_broadcast.value) as ftp:
            ftp.login()

            r = BytesIO()
Example #32
def init():
    global sc
    sc = SparkContext()
    sc.setLogLevel("ALL")
    # sc.addPyFile("load.py")
    spark = SparkSession(sc)
Example #33
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from tableFunctions import *

sc = SparkContext()
spark = SparkSession(sc)

#start SparkSQL session
sqlContext = SQLContext(sc)

#read the file with its header
df = sqlContext.read.option("header",True).csv("*.csv")
#count the number of rows (it is 4820022)
df.count()

#count the unique values in recipient_name (it is 153449)
df.select('recipient_name').distinct().count()

#delete periods, commas, and spaces from recipient names
df = df.withColumn('recipient_name', regexp_replace('recipient_name', ' AND ', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', '&', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', '\.', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', ',', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', ' ', ''))

#count again (it is now 149582)
df.select('recipient_name').distinct().count()
Example #34
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell")

import pyspark
from pyspark.sql import SparkSession
from App.utils import *
from graphframes import *

hdfs_host = "hdfs://localhost:9000"
hdfs_root_path = "/SafeEntry_Analytics/"

conf = pyspark.SparkConf().setAppName("Process Entry Record Graph").setMaster(
    "local[*]")
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)

resident_file_dest = "resident.parquet"
safe_entry_daily_file_dest = "entry_record.parquet"
contact_graph_edge_file_dest = "contact_graph_edge.parquet"
contact_graph_vertex_file_dest = "contact_graph_vertex.parquet"

# Step 1: read resident parquet file
resident_df = read_parquet_file(
    spark, hdfs_host + hdfs_root_path + resident_file_dest)
resident_df.cache()


def lookup_nric(resident_id):
    return resident_df.filter(
        resident_df['resident_id'] == resident_id).collect()[0]['nric']
Example #35
    def setUpClass(cls):
        super(ReusedSQLTestCase, cls).setUpClass()
        cls.spark = SparkSession(cls.sc)

        cls.spark.conf.set('spark.sql.execution.arrow.enabled', True)
Example #36
class SparkConfig:
    sc = pyspark.SparkContext(master=os.environ['SPARK_MASTER'],
                              appName='anomaly_detector')
    ss = SparkSession(sc)
Example #37
class IonCentroidsGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    moldb_name : str
    isocalc: IsocalcWrapper
    """
    def __init__(self, sc, moldb_name, isocalc):
        self._sc = sc
        self._moldb_name = moldb_name
        self._isocalc = isocalc
        self._sm_config = SMConfig.get_conf()
        self._parquet_chunks_n = 64
        self._iso_gen_part_n = 512

        self._spark_session = SparkSession(self._sc)
        self._ion_centroids_path = '{}/{}/{}/{}'.format(self._sm_config['isotope_storage']['path'],
                                                        self._moldb_name,
                                                        self._isocalc.sigma,
                                                        self._isocalc.charge)
        self.ion_df = None
        self.ion_centroids_df = None

    def exists(self):
        """ Check if ion centroids saved to parquet
        """
        if self._ion_centroids_path.startswith('s3a://'):
            cred_dict = dict(aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                             aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'])
            bucket, key = split_s3_path(self._ion_centroids_path)
            s3 = boto3.client('s3', **cred_dict)
            try:
                s3.head_object(Bucket=bucket, Key=key + '/ions/_SUCCESS')
            except ClientError:
                return False
            else:
                return True
        else:
            return Path(self._ion_centroids_path + '/ions/_SUCCESS').exists()

    def generate(self, isocalc, sfs, adducts):
        """ Generate isotopic peaks

        Args
        ---
        isocalc: IsocalcWrapper
            Cannot be a class field as Spark doesn't allow passing 'self' to functions
        adducts: list
        """
        logger.info('Generating molecular isotopic peaks')

        def calc_centroids(args):
            ion_i, sf, adduct = args
            mzs, ints = isocalc.ion_centroids(sf, adduct)
            if mzs is not None:
                return zip(repeat(ion_i),
                           range(0, len(mzs)),
                           map(float, mzs),
                           map(float, ints))
            else:
                return []

        ion_df = pd.DataFrame([(i, sf, adduct) for i, (sf, adduct) in
                               enumerate(sorted(product(sfs, adducts)))],
                              columns=['ion_i', 'sf', 'adduct']).set_index('ion_i')

        ion_centroids_rdd = (self._sc.parallelize(ion_df.reset_index().values,
                                                  numSlices=self._iso_gen_part_n)
                             .flatMap(calc_centroids))
        self.ion_centroids_df = (pd.DataFrame(data=ion_centroids_rdd.collect(),
                                              columns=['ion_i', 'peak_i', 'mz', 'int'])
                                 .sort_values(by='mz')
                                 .set_index('ion_i'))

        self.ion_df = ion_df.loc[self.ion_centroids_df.index.unique()]

        # Use when pandas DataFrames get way too big
        # ion_centroids_df = self._spark_session.createDataFrame(data=ion_centroids_rdd,
        #                                                        schema=self.ion_centroids_df_fields)
        # self.ion_centroids_df = (ion_centroids_df
        #                          .sort(ion_centroids_df.mz.asc())
        #                          .coalesce(self._parquet_chunks_n))

    def save(self):
        """ Save isotopic peaks
        """
        logger.info('Saving peaks')

        centr_spark_df = self._spark_session.createDataFrame(self.ion_centroids_df.reset_index())
        centr_spark_df.write.parquet(self._ion_centroids_path + '/ion_centroids', mode='overwrite')
        ion_spark_df = self._spark_session.createDataFrame(self.ion_df.reset_index())
        ion_spark_df.write.parquet(self._ion_centroids_path + '/ions', mode='overwrite')

    def restore(self):
        logger.info('Restoring peaks')

        self.ion_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ions').toPandas().set_index('ion_i')
        self.ion_centroids_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ion_centroids').toPandas().set_index('ion_i')

    def sf_adduct_centroids_df(self):
        return self.ion_df.join(self.ion_centroids_df).set_index(['sf', 'adduct'])

    def centroids_subset(self, ions):
        """ Restore isotopic peaks dataframe only for the 'ions'

        Args
        ---
        ions: list of tuples

        Returns
        ---
        : pandas.DataFrame
        """
        assert self.ion_df is not None

        ion_map = self.ion_df.reset_index().set_index(['sf', 'adduct']).ion_i
        ion_ids = ion_map.loc[ions].values
        return self.ion_centroids_df.loc[ion_ids].sort_values(by='mz')

    def generate_if_not_exist(self, isocalc, sfs, adducts):
        if not self.exists():
            self.generate(isocalc=isocalc, sfs=sfs, adducts=adducts)
            self.save()
        else:
            self.restore()

    def ions(self, adducts):
        return (self.ion_df[self.ion_df.adduct.isin(adducts)]
                .sort_values(by=['sf', 'adduct'])
                .to_records(index=False))
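A hedged usage sketch for the class above; the molecular database name, formulas and adducts are placeholders, and sc and isocalc are assumed to come from the surrounding application:

# Placeholder inputs for illustration only.
centr_gen = IonCentroidsGenerator(sc=sc, moldb_name='HMDB', isocalc=isocalc)
centr_gen.generate_if_not_exist(isocalc=isocalc,
                                sfs=['C6H12O6', 'C8H10N4O2'],
                                adducts=['+H', '+Na'])
peaks_df = centr_gen.centroids_subset(ions=[('C6H12O6', '+H')])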
Example #38
 def setUp(self):
     self.sc = SparkContext('local[4]', "MLlib tests")
     self.spark = SparkSession(self.sc)
Example #39
        if data['BOT'] == 'False':
            return [(('Domain', data['Domain']), 1), (('Bot', data['BOT']), 1),
                    (('User', data['User']), 1), (('Class', data['Class']), 1)]
        else:
            return [(('Domain', data['Domain']), 1), (('Bot', data['BOT']), 1),
                    (('Class', data['Class']), 1)]
    except:
        return []


if __name__ == '__main__':

    print('Starting Spark Context and Session')
    #Initialize Spark
    sc = SparkContext(appName='WikiStats')
    sparksql = SparkSession(sc)
    sc.setLogLevel("ERROR")

    #Streaming and window variables
    window_slide = 6
    window_duration = 40
    batch_time = 2

    #Read streams from Kafka Topic 'wikipedia' as dstreams
    ssc = StreamingContext(sc, batch_time)
    ssc.checkpoint("checkpoint_wiki")

    print('Building process pipeline')
    ##Stream Pipeline Design
    wikiKafkaStream = KafkaUtils.createDirectStream(
        ssc, [KAFKA_TOPIC_read], {"metadata.broker.list": KAFKA_BROKER_read})
Example #40
def init_opaque_sql(testing=False):
    sc = SparkContext.getOrCreate()
    sc._jvm.edu.berkeley.cs.rise.opaque.Utils.initOpaqueSQL(
        SparkSession(sc)._jsparkSession, testing)
Example #41
        class SparkWithCustomGateway:
            def __init__(self):
                spark_conf = SparkConf()
                spark_conf.setAppName(spark_nlp_config.app_name)
                spark_conf.setMaster(spark_nlp_config.master)
                spark_conf.set("spark.driver.memory", memory)
                spark_conf.set("spark.serializer", spark_nlp_config.serializer)
                spark_conf.set("spark.kryoserializer.buffer.max",
                               spark_nlp_config.serializer_max_buffer)
                spark_conf.set("spark.driver.maxResultSize",
                               spark_nlp_config.driver_max_result_size)

                if gpu:
                    spark_conf.set("spark.jars.packages",
                                   spark_nlp_config.maven_gpu_spark)
                else:
                    spark_conf.set("spark.jars.packages",
                                   spark_nlp_config.maven_spark)

                # Make the py4j JVM stdout and stderr available without buffering
                popen_kwargs = {
                    'stdout': subprocess.PIPE,
                    'stderr': subprocess.PIPE,
                    'bufsize': 0
                }

                # Launch the gateway with our custom settings
                self.gateway = launch_gateway(conf=spark_conf,
                                              popen_kwargs=popen_kwargs)
                self.process = self.gateway.proc
                # Use the gateway we launched
                spark_context = SparkContext(gateway=self.gateway)
                self.spark_session = SparkSession(spark_context)

                self.out_thread = threading.Thread(target=self.output_reader)
                self.error_thread = threading.Thread(target=self.error_reader)
                self.std_background_listeners()

            def std_background_listeners(self):
                self.out_thread.start()
                self.error_thread.start()

            def output_reader(self):
                for line in iter(self.process.stdout.readline, b''):
                    print('{0}'.format(line.decode('utf-8')), end='')

            def error_reader(self):
                RED = '\033[91m'
                RESET = '\033[0m'
                for line in iter(self.process.stderr.readline, b''):
                    if output_level == 0:
                        print(RED + '{0}'.format(line.decode('utf-8')) + RESET,
                              end='')
                    else:
                        # output just info
                        pass

            def shutdown(self):
                self.spark_session.stop()
                self.gateway.shutdown()
                self.process.communicate()

                self.out_thread.join()
                self.error_thread.join()
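A hedged usage sketch for SparkWithCustomGateway; the surrounding spark_nlp_config, memory, gpu and output_level names are assumed to be in scope, as in the original snippet:

session_wrapper = SparkWithCustomGateway()
session_wrapper.spark_session.range(5).show()  # any Spark work goes here
session_wrapper.shutdown()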
Example #42
import atexit
import os

import py4j
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

sc = SparkContext()
atexit.register(lambda: sc.stop())

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.withHiveSupport(sc)
except py4j.protocol.Py4JError:
    spark = SparkSession(sc)
except TypeError:
    spark = SparkSession(sc)

# for compatibility
sqlContext = spark._wrapped
sqlCtx = sqlContext

print("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version %s
      /_/