def test_group_transactions(spark):
    sql = SQLContext(spark)
    create_spark_views(spark, customers_location, products_location,
                       transactions_location)
    group_transactions(spark)
    columns = spark.sql("""SELECT * FROM transactions_grouped""").columns
    assert "transactions_grouped" in sql.tableNames()
    assert columns == ["customer_id", "product_id", "count"]
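
The test above depends on a spark pytest fixture that is not shown here. A minimal sketch of what such a fixture could look like, assuming a local session shared across the test run (the fixture body is an illustration, not taken from the original project):

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    # One local SparkSession reused by every test in the session.
    session = (SparkSession.builder
               .master("local[2]")
               .appName("tests")
               .getOrCreate())
    yield session
    session.stop()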
Example 2
 def __init__(self, sparkContext, snappyContext=None):
     self._sc = sparkContext
     self._jsc = self._sc._jsc
     self._jvm = self._sc._jvm
     snappySession = SnappySession(sparkContext)
     SQLContext.__init__(self, sparkContext, snappySession)
     if snappyContext:
         self._scala_SnappyContext = snappyContext
Example 4
 def init(self):
     os.environ[
         "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local")
     conf.setAppName("PySparkShell")
     conf.set("spark.executor.memory", "2g")
     # conf.set("spark.driver.memory", "1g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)
Example 5
def main():

    sc = initializeSpark()

    spark = SparkSession(sc)

    directory, post_id = parse_pls()
    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]

    string = make_stripped_string(post_rdd, post_id)

    print("\n Body from post_id: " + str(post_id) +
          ", stripped of shitespaces and special characters:\n")
    print("'" + string + "'\n")

    # Tokenize the string
    tokens = tokenize(string)
    # remove duplicate entries
    tokens_unique = remove_dupes(tokens)

    # Assign id to the unique tokens
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these ids to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)

    print("\nTokens retrieved from the body with their respective id's: \n")
    for i in token_id_all:
        print(i)

    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])

    # Create edges on a window size of 5, using the ids of the tokens
    edges = create_edges(ids, 5)
    # Removes duplicate edges from list
    edges = remove_dupe_tuples(edges)
    print(edges)
    print("\n\nPageRank:")

    sqlContext = SQLContext(sc)

    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])

    e = sqlContext.createDataFrame(edges, ["src", "dst"])

    g = graphframes.GraphFrame(v, e)

    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
Example 6
 def __init__(self, sparkContext: SparkContext, jsparkSession: Optional[JavaObject] = None):
     from pyspark.sql.context import SQLContext
     self._sc = sparkContext
     self._jsc = self._sc._jsc  # type: ignore[attr-defined]
     self._jvm = self._sc._jvm  # type: ignore[attr-defined]
     if jsparkSession is None:
         if self._jvm.SparkSession.getDefaultSession().isDefined() \
                 and not self._jvm.SparkSession.getDefaultSession().get() \
                     .sparkContext().isStopped():
             jsparkSession = self._jvm.SparkSession.getDefaultSession().get()
         else:
             jsparkSession = self._jvm.SparkSession(self._jsc.sc())
     self._jsparkSession = jsparkSession
     self._jwrapped = self._jsparkSession.sqlContext()
     self._wrapped = SQLContext(self._sc, self, self._jwrapped)
     _monkey_patch_RDD(self)
     install_exception_handler()
     # If we had an instantiated SparkSession attached with a SparkContext
     # which is stopped now, we need to renew the instantiated SparkSession.
     # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
     if SparkSession._instantiatedSession is None \
             or SparkSession._instantiatedSession._sc._jsc is None:  # type: ignore[attr-defined]
         SparkSession._instantiatedSession = self
         SparkSession._activeSession = self
         self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
         self._jvm.SparkSession.setActiveSession(self._jsparkSession)
Example 7
    def __init__(self, sparkContext, jsparkSession=None):
        """Creates a new SparkSession.

        >>> from datetime import datetime
        >>> spark = SparkSession(sc)
        >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
        ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
        ...     time=datetime(2014, 8, 1, 14, 1, 5))])
        >>> df = allTypes.toDF()
        >>> df.createOrReplaceTempView("allTypes")
        >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
        ...            'from allTypes where b and i > 0').collect()
        [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
            dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
        >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
        [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
        """
        from pyspark.sql.context import SQLContext
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        if jsparkSession is None:
            jsparkSession = self._jvm.SparkSession.builder().getOrCreate()
        self._jsparkSession = jsparkSession
        self._jwrapped = self._jsparkSession.sqlContext()
        self._wrapped = SQLContext(self._sc, self, self._jwrapped)
        _monkey_patch_RDD(self)
        install_exception_handler()
        # If we had an instantiated SparkSession attached with a SparkContext
        # which is stopped now, we need to renew the instantiated SparkSession.
        # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
        if SparkSession._instantiatedSession is None \
                or SparkSession._instantiatedSession._sc._jsc is None:
            SparkSession._instantiatedSession = self
    def test_estimator_graph_dataframe(self):
        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

        est = Estimator.from_graph(
            inputs=[model.user, model.item],
            labels=[model.label],
            outputs=[model.logits],
            loss=model.loss,
            optimizer=tf.train.AdamOptimizer(),
            metrics={"loss": model.loss})

        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                label_cols=['label'],
                validation_data=df)

        result = est.evaluate(df, batch_size=4, feature_cols=['user', 'item'],
                              label_cols=['label'])
        print(result)

        prediction_df = est.predict(df, batch_size=4, feature_cols=['user', 'item'])
        assert 'prediction' in prediction_df.columns
        predictions = prediction_df.collect()
        assert len(predictions) == 48
Example 9
    def __init__(self, sparkContext, jsparkSession=None):
        """Creates a new SparkSession.

        >>> from datetime import datetime
        >>> spark = SparkSession(sc)
        >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
        ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
        ...     time=datetime(2014, 8, 1, 14, 1, 5))])
        >>> df = allTypes.toDF()
        >>> df.registerTempTable("allTypes")
        >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
        ...            'from allTypes where b and i > 0').collect()
        [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
            dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
        >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
        [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
        """
        from pyspark.sql.context import SQLContext
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        if jsparkSession is None:
            jsparkSession = self._jvm.SparkSession(self._jsc.sc())
        self._jsparkSession = jsparkSession
        self._jwrapped = self._jsparkSession.wrapped()
        self._wrapped = SQLContext(self._sc, self, self._jwrapped)
        _monkey_patch_RDD(self)
        install_exception_handler()
        if SparkSession._instantiatedContext is None:
            SparkSession._instantiatedContext = self
Example 10
    def __init__(self, session: SparkSession, jsparkSession=None):
        """Creates a new SequilaSession.

        """
        ss = session._jvm.org.apache.spark.sql.SequilaSession(
            session._jsparkSession)
        session._jvm.org.biodatageeks.utils.SequilaRegister.register(ss)
        session._jvm.org.biodatageeks.utils.UDFRegister.register(ss)
        session._jvm.SequilaSession.setDefaultSession(ss)
        sequilaSession = SequilaSession._instantiatedSession
        from pyspark.sql.context import SQLContext
        self._sc = sequilaSession._sc
        self._jsc = self._sc._jsc
        self._jvm = session._jvm
        if jsparkSession is None:
            if self._jvm.SequilaSession.getDefaultSession().isDefined() \
                    and not self._jvm.SequilaSession.getDefaultSession().get() \
                    .sparkContext().isStopped():
                jsparkSession = self._jvm.SequilaSession.getDefaultSession(
                ).get()
            else:
                jsparkSession = self._jvm.SequilaSession(self._jsc.sc())
        self._jsparkSession = jsparkSession
        self._jwrapped = self._jsparkSession.sqlContext()
        self._wrapped = SQLContext(self._sc, self, self._jwrapped)
        if SequilaSession._instantiatedSession is None \
                or SequilaSession._instantiatedSession._sc._jsc is None:
            SequilaSession._instantiatedSession = self
            self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
Example 11
 def init(self):
     os.environ[
         "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local")
     conf.setAppName("My application")
     conf.set("spark.executor.memory", "2g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)
     self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
     self.df_review = self.sqlContext.read.json(
         "dataset/review.json").cache()
     self.df_business = self.sqlContext.read.json(
         "dataset/business.json").cache()
     self.df_user.registerTempTable("user")
Example 12
 def __init__(self, sc, spark):
     self.sc = sc
     self.spark = spark
     self.sql_context = SQLContext(self.sc)
     self.schema = StructType([
         StructField("sensor_id", StringType(), False),
         StructField("currentTemperature", IntegerType(), False),
         StructField("status", StringType(), False)
     ])
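
A self-contained sketch showing how a StructType schema like the one above is typically applied when building a DataFrame (the session setup and sample rows below are illustrative assumptions, not part of the original class):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master("local[1]").appName("schema-demo").getOrCreate()
schema = StructType([
    StructField("sensor_id", StringType(), False),
    StructField("currentTemperature", IntegerType(), False),
    StructField("status", StringType(), False)
])
# Rows must match the declared field order and types.
df = spark.createDataFrame([("sensor-1", 21, "OK"), ("sensor-2", 35, "WARN")], schema)
df.show()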
Example 13
 def init(self):
     os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local[10]")
     conf.setAppName("PySparkShell")
     conf.set("spark.executor.memory", "2g")
     conf.set("spark.driver.memory", "1g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)        
Example 14
class MainApp(object):
    def __init__(self):
        pass
    
    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)        

    def loadData(self):
        category_list = self.sc.textFile("/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt").map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]), float(line.split(',')[2]), long(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql("SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        #category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
def Compute_Average(rinput, output):

    ## ratings data
    ### Creating the ratings RDD
    my_RDD_strings = sc.textFile(rinput + '/' + 'ratings.csv')
    data = my_RDD_strings.map(lambda line: readline(line))

    ## Extracting the header row

    header_info = data.first()

    ### Selects all the rows except the header row
    data_mr = data.filter(lambda ratings: ratings != header_info)

    data_mr = data_mr.map(lambda ratings: string_to_float(ratings))
    data_mr_sum_count = data_mr.aggregateByKey(
        (0, 0), lambda U, s: (U[0] + s, U[1] + 1), lambda U, V:
        (U[0] + V[0], U[1] + V[1]))
    ### format: (movieID, (sum, count))
    avg_ratings = data_mr_sum_count.map(lambda (a, (b, c)): (a, float(b) / c))
    ### here a=movie,b=sum,c=count

    sorted_avg_ratings = avg_ratings.sortByKey()  ## sorting by movieID in ascending order

    ## Creating output csv file based on the dataset-folder

    if 'ml-20m' in rinput:
        result_csv = 'Prashanth_Manja_result_task1_big.csv'
    else:
        result_csv = 'Prashanth_Manja_result_task1_small.csv'

    sqlContext = SQLContext(sc)
    data_frame = sqlContext.createDataFrame(sorted_avg_ratings)
    panda_data_frame = data_frame.toPandas()

    ## Output as csv file
    panda_data_frame.to_csv(output + '/' + result_csv,
                            encoding='utf-8',
                            header=['movieID', 'rating_avg'],
                            index=False)
Example 16
def pytest_sessionstart(session):
    print('Starting spark context')

    pysparkrpc.clear()

    sc = SparkContext()
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.getOrCreate()

    pytest.spark = spark
    pytest.sc = sc
    pytest.sqlcontext = sqlContext
def test_da_resampler_resample_dataframe_with_correct_number_of_rows(
        spark, df):
    uat = da.Resampler(SQLContext.getOrCreate(spark.sparkContext))
    result = uat.resample(df,
                          time_col='time',
                          timezone='Europe/Vienna',
                          step_size='500ms',
                          join_tolerance='180ms')

    df.show()
    result.show()

    assert result.count() == ((df.count()) * 2) - 1
Example 18
 def getOrCreate(self):
     """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a new
     one based on the options set in this builder.
     """
     with self._lock:
         from pyspark.conf import SparkConf
         from pyspark.context import SparkContext
         from pyspark.sql.context import SQLContext
         sparkConf = SparkConf()
         for key, value in self._options.items():
             sparkConf.set(key, value)
         sparkContext = SparkContext.getOrCreate(sparkConf)
         return SQLContext.getOrCreate(sparkContext).sparkSession
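
For context, the getOrCreate above is normally reached through SparkSession.builder; a typical call chain using the standard PySpark API (the app name and config value here are arbitrary):

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local[2]")
         .appName("builder-demo")
         .config("spark.sql.shuffle.partitions", "4")
         .getOrCreate())
print(spark.sparkContext.appName)
spark.stop()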
def read_existing_parquet():
    sc = get_configured_context()
    sql_context = SQLContext(sparkContext=sc)

    # Loads parquet file located in AWS S3 / minio into RDD Data Frame
    parquet_file = sql_context.read.parquet("s3a://testparquet/nation.parquet")
    parquet_file.registerTempTable("parquet_file")

    # Run standard SQL queries against temporary table
    nations_all_sql = sql_context.sql("SELECT * FROM parquet_file")
    nations_all = nations_all_sql.rdd.map(
        lambda p: "Country: {0:15} Ipsum Comment: {1}".format(
            p.N_NAME, p.N_COMMENT))

    for idx, nation in enumerate(nations_all.collect()):
        if idx == 0:
            print("All Nations and Comments -- `SELECT * FROM parquet_file`")
            print_horizontal()
        print(nation)
    else:
        print_horizontal()

    # Use standard SQL to filter
    nations_filtered_sql = sql_context.sql(
        "SELECT N_NAME FROM parquet_file WHERE N_NAME LIKE '%IND%'")
    nations_filtered = nations_filtered_sql.rdd.map(
        lambda p: "Country: {0:20}".format(p.N_NAME))

    for idx, nation in enumerate(nations_filtered.collect()):
        if idx == 0:
            print(
                "Nations Filtered -- `SELECT name FROM parquet_file WHERE name LIKE '%IND%'`"
            )
            print_horizontal()
        print(nation)
    else:
        print_horizontal()
Example 21
    def test_estimator_graph_dataframe_exception(self):

        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

        est = Estimator.from_graph(inputs=[model.user, model.item],
                                   labels=[model.label],
                                   outputs=[model.logits],
                                   loss=model.loss,
                                   optimizer=tf.train.AdamOptimizer(),
                                   metrics={"loss": model.loss})

        with self.assertRaises(Exception) as context:
            est.fit(data=df,
                    batch_size=8,
                    epochs=10,
                    feature_cols=['user', 'item'],
                    validation_data=df)
        self.assertTrue(
            'label columns is None; it should not be None in training' in str(
                context.exception))

        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                labels_cols=['label'])
        with self.assertRaises(Exception) as context:
            predictions = est.predict(df, batch_size=4).collect()
        self.assertTrue(
            'feature columns is None; it should not be None in prediction' in
            str(context.exception))

        with self.assertRaises(Exception) as context:
            est.fit(data=df,
                    batch_size=8,
                    epochs=10,
                    feature_cols=['user', 'item'],
                    labels_cols=['label'],
                    validation_data=[1, 2, 3])
        self.assertTrue(
            'train data and validation data should be both Spark DataFrame' in
            str(context.exception))
Example 22
    def __init__(
        self,
        sparkContext: SparkContext,
        jsparkSession: Optional[JavaObject] = None,
        options: Dict[str, Any] = {},
    ):
        from pyspark.sql.context import SQLContext

        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm

        assert self._jvm is not None

        if jsparkSession is None:
            if (
                self._jvm.SparkSession.getDefaultSession().isDefined()
                and not self._jvm.SparkSession.getDefaultSession().get().sparkContext().isStopped()
            ):
                jsparkSession = self._jvm.SparkSession.getDefaultSession().get()
                getattr(getattr(self._jvm, "SparkSession$"), "MODULE$").applyModifiableSettings(
                    jsparkSession, options
                )
            else:
                jsparkSession = self._jvm.SparkSession(self._jsc.sc(), options)
        else:
            getattr(getattr(self._jvm, "SparkSession$"), "MODULE$").applyModifiableSettings(
                jsparkSession, options
            )
        self._jsparkSession = jsparkSession
        self._jwrapped = self._jsparkSession.sqlContext()
        self._wrapped = SQLContext(self._sc, self, self._jwrapped)
        _monkey_patch_RDD(self)
        install_exception_handler()
        # If we had an instantiated SparkSession attached with a SparkContext
        # which is stopped now, we need to renew the instantiated SparkSession.
        # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
        if (
            SparkSession._instantiatedSession is None
            or SparkSession._instantiatedSession._sc._jsc is None  # type: ignore[attr-defined]
        ):
            SparkSession._instantiatedSession = self
            SparkSession._activeSession = self
            assert self._jvm is not None
            self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
            self._jvm.SparkSession.setActiveSession(self._jsparkSession)
Example 23
def main():
    #main function to execute code
    sqlContext = SQLContext(sc)
    zk_host = zk_ip + ":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions = {topic: 1}
    #create kafka stream
    kvs = KafkaUtils.createStream(ssc,
                                  zk_host,
                                  consumer_group,
                                  kafka_partitions,
                                  valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    readings = lines.map(lambda x: Row(device_id=x["device_id"],\
        metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),\
        metric_name=x["metric_name"],\
        metric_value=float(x["metric_value"])))
    readings.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
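
The stream above hands every micro-batch to a process callback that is not included in the snippet. One plausible shape for it, sketched under the assumption that each batch of Row objects is turned into a DataFrame (the body is a guess, not the original implementation):

def process(time, rdd):
    # Skip empty micro-batches, then build a DataFrame from the Row objects.
    if rdd.isEmpty():
        return
    sql_ctx = SQLContext.getOrCreate(rdd.context)
    readings_df = sql_ctx.createDataFrame(rdd)
    readings_df.show(5)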
Example 24
def run(sc, args):
    min_tokens = 70
    min_length = 20
    output_path = 'text-reuse/pipeline/wiki_preprocessed'
    input_file = 'text-reuse/wiki_00'

    sqlC = SQLContext(sc)

    df = sqlC.read.format('com.databricks.spark.xml').options(
        charset="UTF-8", nullValue="", rowTag='doc',
        mode='DROPMALFORMED').load(input_file)
    df = df.selectExpr("_id as id", "_title as title", "content as content")
    df.show()

    #Preprocess Wikipedia

    #generate paragraphs
    df = generate_text_paragraphs(df, 'content', 'paragraphs')
    df.show()

    #Filter out empty paragraphs
    df = df.filter(size(col('paragraphs')) > 0)

    #normalize paragraphs
    df = normalize_paragraphs(df, 'paragraphs', 'paragraphs', min_tokens)
    df.show()

    #flatten docs into paragraphs
    df = flatten_paragraphs(df, 'paragraphs', 'paragraph')
    df.show()

    #clean-up the text
    df = clean_up_text(df, 'paragraph', 'paragraph')
    df.show()

    #filter only paragraphs that are longer than min_tokens
    df = filter_paras_by_length(df, 'paragraph', min_length)

    #rename columns and save the df
    df = df.selectExpr('para_id as p_id', 'id as d_id', 'paragraph')
    df.rdd.saveAsPickleFile(output_path)
Example 25
        def getOrCreate(self):
            """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a
            new one based on the options set in this builder.

            This method first checks whether there is a valid thread-local SparkSession,
            and if yes, return that one. It then checks whether there is a valid global
            default SparkSession, and if yes, return that one. If no valid global default
            SparkSession exists, the method creates a new SparkSession and assigns the
            newly created SparkSession as the global default.

            In case an existing SparkSession is returned, the config options specified
            in this builder will be applied to the existing SparkSession.
            """
            with self._lock:
                from pyspark.conf import SparkConf
                from pyspark.context import SparkContext
                from pyspark.sql.context import SQLContext
                sparkConf = SparkConf()
                for key, value in self._options.items():
                    sparkConf.set(key, value)
                sparkContext = SparkContext.getOrCreate(sparkConf)
                return SQLContext.getOrCreate(sparkContext).sparkSession
Example 27
    def test_pyspark_gateway(self):
        pg = PysparkGateway()

        import pyspark
        from pyspark import SparkContext, SparkConf
        from pyspark.sql.context import SQLContext
        from pyspark.sql.functions import udf

        conf = SparkConf().set('spark.io.encryption.enabled', 'true')
        sc = SparkContext(gateway=pg.gateway, conf=conf)
        sqlContext = SQLContext.getOrCreate(sc)

        self.assertEqual(type(sc), SparkContext)

        df = sqlContext.createDataFrame([(1,2,'value 1')], ['id1', 'id2', 'val'])
        self.assertEqual(df.count(), 1)

        rows = df.collect()
        self.assertEqual(rows[0].id1, 1)

        pd = df.toPandas()
        self.assertEqual(type(pd), pandas.core.frame.DataFrame)

        data = [(1,2,'a'),(3,4,'b'),(5,6,'c')]
        df = sqlContext.createDataFrame(data, ['foo', 'bar', 'baz'])
        df.createOrReplaceTempView('foo_table')

        def squared(v):
            return v * v

        sqlContext.udf.register('squared', squared)

        squared_df = sqlContext.sql('select squared(foo) AS val from foo_table')
        rows = squared_df.collect()

        self.assertEqual(rows[2].val, '25')

        sc.stop()
        pg.gateway.shutdown()
Example 28
def main(input_data_directory):

    # parser = argparse.ArgumentParser(description='Process html files')
    #
    # parser.add_argument('--input_location',
    #                     dest='input_location',
    #                     type=str,
    #                     default=os.path.join(__WORKDIR__, "data", "clean"),
    #                     help='Input location for the html files')
    #
    # parser.add_argument('--spark_context_name',
    #                     dest='spark_context_name',
    #                     type=str,
    #                     default="dudewhat",
    #                     help='Name of the Spark context')
    #
    # args = parser.parse_args()

    conf = SparkConf().setAppName("Reporter Review.")
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)

    df = clean_data(sc, sqlContext, input_data_directory)
    generate_insights(sc, sqlContext, df)
Example 29

def isNull(column_value):
    if column_value is None:
        return ''
    else:
        return column_value


if __name__ == '__main__':

    conf = SparkConf().setAppName('Type2_Example').setMaster('local[*]')

    # create spark context and sql context
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)

    # read the input data file and create a Spark DataFrame
    type_2_dataframe = sql_context.read.format("com.databricks.spark.csv")\
    .option("header", "false") \
    .option("inferschema", "true") \
    .option("delimiter", "|") \
    .option("mode", "DROPMALFORMED") \
    .load("/home/mandar/ProjectWorkspace/Example/com/spark/example/type2_data")

    type_2_dataframe.printSchema()

    # print "--------------------------Total number of record -------------------------"
    # print type_2_dataframe.count()

    type_2_dataframe.show(10)
Example 30
try:
    spark = SparkSession._create_shell_session()
except Exception:
    import sys
    import traceback

    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register((lambda sc: lambda: sc.stop())(sc))

# for compatibility
sqlContext = SQLContext._get_or_create(sc)
sqlCtx = sqlContext

print(
    r"""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version %s
      /_/
"""
    % sc.version
)
print(
    "Using Python version %s (%s, %s)"
    % (platform.python_version(), platform.python_build()[0], platform.python_build()[1])
)
Example 31
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext

sc = SparkContext(master='local[*]')
sql_context = SQLContext(sc)
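
A short follow-on sketch (assumed, not part of the original two-line snippet) exercising the sql_context created above:

df = sql_context.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.createOrReplaceTempView("demo")
sql_context.sql("SELECT id FROM demo WHERE label = 'b'").show()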
Example 32
import json
import time
import csv
import cryptography

from cryptography.fernet import Fernet
from pyspark.sql.context import SQLContext
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import lit

# Create a local StreamingContext with two working thread and batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 5)

sqlContext = SQLContext(sc)
players_filepath = '/project/input/players.csv'
df_players = sqlContext.read.load(players_filepath,
                                  format='com.databricks.spark.csv',
                                  header='true',
                                  inferSchema='true')
df_players = df_players.withColumn('PlayerContribution', lit(0))
df_players = df_players.withColumn('PlayerPerformance',
                                   df_players.PlayerContribution)
df_players = df_players.withColumn('PlayerRanking', lit(0.5))
df_players = df_players.withColumn('Chemisty', lit(0.5))
df_players.show()
df_players.printSchema()

teams_filepath = '/project/input/teams.csv'
df_teams = sqlContext.read.load(teams_filepath,
import sys
import os
import json
from pyspark.sql.types import StructField, StructType, IntegerType

ascontext=None
if len(sys.argv) < 2 or sys.argv[1] != "-test":
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    sqlCtx = ascontext.getSparkSQLContext()
    df = ascontext.getSparkInputData()
    schema = ascontext.getSparkInputSchema()
else:
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # get an input dataframe with sample data by looking in working directory for file DRUG1N.json
    wd = os.getcwd()
    df = sqlCtx.read.json(sys.argv[2]).repartition(4) # argv[2] of form file://DRUG1N.json
    schema = df.schema
    modelpath_base = "/tmp/model1234"
    modelpath = "file://"+modelpath_base+"/model"
    metadatapath = modelpath_base+"/metadata"
    model_metadata = json.loads(open(metadatapath,"r").read())


prediction_field = "$K-cluster"
prediction_type = IntegerType()

output_schema = StructType(schema.fields + [StructField(prediction_field, prediction_type, nullable=True)])
Example 34
def convert_to_line(json_list):
    json_string = ""
    for line in json_list:
        json_string += json.dumps(line) + "\n"
        print(json_string)
    return json_string


def parse_json(json_data, sc):
    r = convert_to_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = sc.parallelize(mylist, 8)
    df = sqlContext.read.json(rdd)
    return df


if __name__ == '__main__':
    sprk = Spark_Session()
    conn = sprk.Spark_Context()
    sql_conn = sprk.Spark_Connect()
    sqlContext = SQLContext(conn)

##https://api.github.com/users?since=100
with urllib.request.urlopen("https://api.github.com/users?since=100") as url:
    data = parse_json(parse(url.read().decode("utf-8")), conn)

data.show()
Example 35
__author__ = 'hanhanw'

import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

conf = SparkConf().setAppName("shortest path")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
assert sc.version >= '1.5.1'

inputs = sys.argv[1]
output = sys.argv[2]
source_node = sys.argv[3]
dest_node = sys.argv[4]

textinput = sc.textFile(inputs)

def get_graphedges(line):
    list1 = line.split(':')
    if list1[1] == '':
        return None
    list2 = list1[1].split(' ')
    list2 = filter(None, list2)
    results = []
    s = list1[0]
    for d in list2:
        results.append((s, d))
    return results
import os
import json
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

ascontext=None
try:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    sqlCtx = ascontext.getSparkSQLContext()
    df = ascontext.getSparkInputData()
    target = '%%target_field%%'
    schema = ascontext.getSparkInputSchema()
except:
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # get an input dataframe with sample data by looking in working directory for file DRUG1N.json
    wd = os.getcwd()
    df = sqlCtx.load("file://"+wd+"/DRUG1N.json","json").repartition(4)
    schema = df.schema
    modelpath_base = "/tmp/model1234"
    modelpath = "file://"+modelpath_base+"/model"
    metadatapath = modelpath_base+"/metadata"
    model_metadata = json.loads(open(metadatapath,"r").read())
    target = model_metadata["target"]


prediction_field = "$T-" + target
prediction_type = StringType()

output_schema = StructType(schema.fields + [StructField(prediction_field, prediction_type, nullable=True)])
Example 37
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ[
            "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("My application")
        conf.set("spark.executor.memory", "2g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)
        self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
        self.df_review = self.sqlContext.read.json(
            "dataset/review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "dataset/business.json").cache()
        self.df_user.registerTempTable("user")

    def getS3File(self, s3FilePath, destinationPathOnLocal):
        r = requests.get(s3FilePath)
        fileOb = open(destinationPathOnLocal, 'w')
        fileOb.write(r.text)
        fileOb.close()

    def writeToS3File(self, s3FilePath, sourcePathOnLocal):
        fileOb = open(sourcePathOnLocal, 'r')
        payload = fileOb.read()
        fileOb.close()

        headers = {"x-amz-acl": "public-read-write"}
        return requests.put(s3FilePath, headers=headers, data=payload)

    def reads3spark(self, path):
        # path = "s3n://b-datasets/flight_data/*"
        x = self.sc.textFile(path)  # we can just specify all the files.
        return x

    def writes3spark(self, x, path):
        x.saveAsTextFile(path)

    def createFeatures(self):
        userData = self.sqlContext.sql(
            "SELECT user_id, name, review_count, votes, fans, yelping_since, elite FROM user"
        )
        userData = userData.map(mapUsers).coalesce(1)
        res = self.sqlContext.createDataFrame(userData)

        review_user = self.df_review.select(self.df_review.business_id,
                                            self.df_review.user_id)
        business_loc = self.df_business.select(self.df_business.business_id,
                                               self.df_business.city,
                                               self.df_business.state)
        df_join_reviewAndBusiness = review_user.join(
            business_loc,
            review_user.business_id == business_loc.business_id).select(
                "user_id", "city", "state")
        df_grouped = df_join_reviewAndBusiness.groupBy(
            ["user_id", "city", "state"]).count()
        df_panda = res.toPandas()
        for name, group in df_grouped:
            if (group['city'] > 10):
                user_id = df_grouped.get_group(name)[0]['user_id']
                df_panda[user_id]['k'] = df_panda[user_id]['k'] + 1

        res = self.sqlContext.createDataFrame(df_panda)
        res.toJSON().saveAsTextFile('user_features.json')
Example 38
 def __init__(self, sparkContext, snappyContext=None):
     SQLContext.__init__(self, sparkContext)
     if snappyContext:
         self._scala_SnappyContext = snappyContext
Example 39
# -*- coding: utf-8 -*-
'''
Created on January 5, 2018

@author: root
'''
from com.bjsxt.python.test import SparkUtil
from pyspark.sql.context import SQLContext
conf = SparkUtil.initSparkConf(True, "DataFrameOpsFromFile")
sc = SparkUtil.initSparkContext(conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("../data/people.json")
df.registerTempTable("people")
sqlContext.sql("select * from people where age > 20").show()
sc.stop()
Example 40
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark.sql.context import SQLContext
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("sparkDataFrame")
sc = SparkContext(conf = conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')
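
The comment above mentions correlation as well as covariance; the matching calls use the same DataFrame.stat API:

# Pearson correlation of the two uniform columns, and of a column with itself (always 1.0).
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')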
ascontext=None
try:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    df = ascontext.getSparkInputData()
    model_type = '%%model_type%%'
    target = '%%target_field%%'
    lambda_param = float('%%lambda%%')
    predictors = map(lambda x: x.strip(),"%%predictor_fields%%".split(","))

    modelpath = ascontext.createTemporaryFolder()
except:
    import os
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # get an input dataframe with sample data by looking in working directory for file DRUG1N.json
    wd = os.getcwd()
    df = sqlCtx.load("file://"+wd+"/DRUG1N.json","json").repartition(4)
    # specify predictors and target
    predictors = ["Drug","BP", "Sex", "Age"]
    target = "Cholesterol"
    lambda_param = 1.0

    modelpath_base = "/tmp/model1234"
    import shutil
    try:
        shutil.rmtree(modelpath_base)
    except:
        pass
    modelpath = "file://"+modelpath_base+"/model"
Example 42
import sys
import re
import datetime

from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext


inputs1 = sys.argv[1]
inputs2 = sys.argv[2]
output = sys.argv[3]

conf = SparkConf().setAppName("load logs")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
assert sc.version >= '1.5.1'

text = sc.textFile(inputs1)+sc.textFile(inputs2)

def parseline(line):
    linere = re.compile('^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$')
    match = re.search(linere, line)
    if match:
        m = re.match(linere, line)
        host = m.group(1)
        dt = datetime.datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S')
        path = m.group(3)
        bys = float(m.group(4))
        dct = {"host": host, "datetime": dt, "path": path, "bys": bys}
        return dct
Example 43
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
        ).cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
        ).cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")

        self.df_unique_users = self.sqlContext.sql(
            'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"'
        )
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)

        schema = StructType([StructField("latitude", FloatType()), StructField("longitude", FloatType())])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)

        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations", self.get_locations(self.df_unique_users["user_id"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")

        print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers", self.get_centers(self.df_unique_users["user_locations"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
        ).cache()
        print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))