Example #1
import unittest

from pyspark import SparkContext
from pyspark.sql import SparkSession


class MLlibTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext('local[4]', "MLlib tests")
        self.spark = SparkSession(self.sc)

    def tearDown(self):
        self.spark.stop()
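For comparison, the builder API creates the same local session without constructing a SparkContext by hand; a minimal sketch (the MLlibBuilderTestCase name is ours):

import unittest

from pyspark.sql import SparkSession


class MLlibBuilderTestCase(unittest.TestCase):
    def setUp(self):
        # getOrCreate() builds the underlying SparkContext and the session together
        self.spark = SparkSession.builder \
            .master("local[4]") \
            .appName("MLlib tests") \
            .getOrCreate()

    def tearDown(self):
        self.spark.stop()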
Example #2
import time

from pyspark import SparkConf, SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession


def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
    finally:
        spark.stop()

    return times
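The convert_sparse_to_dataframe helper is not part of the snippet; a plausible sketch, assuming ratings arrives as a scipy.sparse matrix and must become a DataFrame with the row/col/data columns the ALS estimator above expects:

def convert_sparse_to_dataframe(spark, context, sparse_matrix):
    # Hypothetical reconstruction: one (row, col, data) record per
    # non-zero entry of the sparse matrix.
    coo = sparse_matrix.tocoo()
    records = [(int(row), int(col), float(val))
               for row, col, val in zip(coo.row, coo.col, coo.data)]
    return spark.createDataFrame(records, ["row", "col", "data"])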
Example #3
def test_active_session_with_None_and_not_None_context(self):
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession
    sc = None
    session = None
    try:
        sc = SparkContext._active_spark_context
        self.assertEqual(sc, None)
        activeSession = SparkSession.getActiveSession()
        self.assertEqual(activeSession, None)
        sparkConf = SparkConf()
        sc = SparkContext.getOrCreate(sparkConf)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertFalse(activeSession.isDefined())
        session = SparkSession(sc)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertTrue(activeSession.isDefined())
        activeSession2 = SparkSession.getActiveSession()
        self.assertNotEqual(activeSession2, None)
    finally:
        if session is not None:
            session.stop()
        if sc is not None:
            sc.stop()
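The test goes through sc._jvm because it exercises the JVM-side state directly; in recent PySpark versions the same round trip can be checked from the public Python API alone. A minimal sketch:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("active-demo").getOrCreate()
assert SparkSession.getActiveSession() is spark  # the new session becomes the active one
spark.stop()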
Example #4
# (excerpt: `spark` is an existing SparkSession and `connections` a registered temp view)
connection_counts = spark.sql('''
SELECT id,
       COUNT(*) AS n_connections
FROM (
    SELECT id_1 AS id
    FROM connections
    UNION ALL
    SELECT id_2 AS id
    FROM connections
) AS ids
GROUP BY 1
ORDER BY 2 DESC
''')

connection_counts.show(20)
connection_counts.createOrReplaceTempView('connection_counts')

avg_connections = spark.sql(
    'SELECT COUNT(*), AVG(n_connections) FROM connection_counts').collect()[0]

msg = '''
{0} ids in the dataset,
with an average connection count of {1}.
'''.format(*avg_connections)

print(msg)

spark.stop()
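For reference, the same counting can be expressed with the DataFrame API instead of SQL; a sketch assuming connections is also available as a DataFrame:

from pyspark.sql import functions as F

ids = (connections.select(F.col("id_1").alias("id"))
       .union(connections.select(F.col("id_2").alias("id"))))  # union keeps duplicates, i.e. UNION ALL semantics
connection_counts = (ids.groupBy("id")
                     .agg(F.count("*").alias("n_connections"))
                     .orderBy(F.desc("n_connections")))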
Example #5
import os
import shutil
import tempfile

from delta.tables import DeltaTable
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType


# PySparkTestCase is the pyspark test base class provided by the surrounding suite.
class DeltaTableTests(PySparkTestCase):
    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations, [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode, [Row("Overwrite")],
            StructType(
                [StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertTrue(self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.

        self.assertTrue(self.__checkFileExists('bac.txt'))
        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertFalse(self.__checkFileExists('bac.txt'))
        self.assertFalse(self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        self.tempFile2 = self.tempFile + "_"
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile), [('a', 1),
                                                                  ('b', 2),
                                                                  ('c', 3)])

        # test whether convertToDelta works with partition columns
        df.write.partitionBy("value").format("parquet").save(self.tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile2 + "`",
                                       schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile2), [('a', 1),
                                                                   ('b', 2),
                                                                   ('c', 3)])

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        self.assertEqual(df.count(), expectedDF.count())
        self.assertEqual(len(df.columns), len(expectedDF.columns))
        self.assertEqual([], df.subtract(expectedDF).take(1))
        self.assertEqual([], expectedDF.subtract(df).take(1))

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
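Outside the test harness, the round trip these tests exercise reduces to a few lines; a sketch assuming the delta-spark package is installed, spark is an existing Delta-enabled SparkSession, and the /tmp/delta-demo path is ours:

from delta.tables import DeltaTable

path = "/tmp/delta-demo"  # hypothetical location
spark.createDataFrame([("a", 1), ("b", 2)], ["key", "value"]) \
    .write.format("delta").mode("overwrite").save(path)
DeltaTable.forPath(spark, path).toDF().show()
DeltaTable.forPath(spark, path).history().select("operation").show()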
Example #6
import os
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession

# StockCustReturnByPrdInd, _travel_row and testBase come from the module
# under test.


class SCDHTest(testBase):

    def setUp(self):
        StockCustReturnByPrdInd.logLevel = 'debug'
        self.scdh = StockCustReturnByPrdInd(None)
        os.environ['SPARK_HOME'] = "/usr/local/Cellar/apache-spark/2.2.0/libexec"
        sys.path.append("/usr/local/Cellar/apache-spark/2.2.0/libexec/python")
        conf = SparkConf().setMaster("local").setAppName("hello")
        self.spark = SparkSession(SparkContext(conf=conf))

    def tearDown(self):
        self.spark.stop()

    def test_local_spark(self):
        doc = self.spark.createDataFrame([['a', 'b', 'c'], ['b', 'd', 'd']])
        doc.show()
        print("successful!")

    def test_get_base_data(self):
        self.scdh._get_base_data("2017-03-16", "2017-03-18", 1, 5)

    def test_init_data(self):
        self.scdh.init_data()

    def test_daily_compute(self):
        self.scdh.daily_compute("2017-03-16", "2017-03-16")

    def test_check_1(self):
        sql = """
            SELECT * from adatatest.stock_cust_daily_return
            where short_return_rate>1 or long_return_rate>1 or total_return_rate>1
        """
        self.spark.sql(sql)

    def test_travel_row(self):
        # """
        # stock_cust_return_by_prd_ind.prd_ind	unknown
        # stock_cust_return_by_prd_ind.return	-44623.789999999964
        # stock_cust_return_by_prd_ind.return_rate	-0.006018969744297111
        # stock_cust_return_by_prd_ind.trade_id	12466
        # stock_cust_return_by_prd_ind.return_ratio	0.4610100448676952
        # stock_cust_return_by_prd_ind.return_rank	2
        # stock_cust_return_by_prd_ind.return_rate_rank	1
        # stock_cust_return_by_prd_ind.busi_date	2017-03-23
        # stock_cust_return_by_prd_ind.compute	7
        # """
        # spark.sql("""
        #   select  trade_id,prd_ind,collect_list(detail_item) detail_list from (
        #     select trade_id,trim(prd_ind) prd_ind,
        #            (str_to_map(concat(
        #                 'pre_mkt_val:',pre_mkt_val,
        #                 ',now_mkt_val:',now_mkt_val,
        #                 ',pos_cash_flow:',pos_cash_flow,
        #                 ',neg_cash_flow:',pos_cash_flow,
        #                 ',exception_label:',exception_label,
        #                 ',trd_type:',trd_type,
        #                 ',return:',return,
        #                 ',busi_date:',busi_date),",",":")) detail_item
        #     from adatatest.stock_cust_daily_holding
        #     where  busi_date<='2017-03-23' and trade_id='12466' and prd_ind='unknown'
        #   ) a
        #   GROUP  by trade_id,prd_ind
        # """)
        r = Row(trade_id=u'12466', prd_ind=u'unknown', detail_list=[
            {u'return': u'-13008.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1263402.0', u'now_mkt_val': u'1250394.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'6344.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'135176.0', u'now_mkt_val': u'141520.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12803.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1308384.0', u'now_mkt_val': u'1295581.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-4.229999999999563', u'trd_type': u'long_related',
             u'pos_cash_flow': u'16940.23', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'16936.0', u'busi_date': u'2017-03-23',
             u'neg_cash_flow': u'16940.23'},
            {u'return': u'1612.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'208052.0', u'now_mkt_val': u'209664.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'35466.53', u'now_mkt_val': u'18526.3',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'4730.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'679400.0', u'now_mkt_val': u'684130.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-1662.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'271183.0', u'now_mkt_val': u'269521.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-693.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-130207.0', u'now_mkt_val': u'-130900.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-21138.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1284540.0', u'now_mkt_val': u'1263402.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'2079.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-132286.0', u'now_mkt_val': u'-130207.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'6771.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'128405.0', u'now_mkt_val': u'135176.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'306163.19', u'now_mkt_val': u'35466.53',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12470.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'691870.0', u'now_mkt_val': u'679400.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-122.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'128527.0', u'now_mkt_val': u'128405.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'11.429999999999836', u'trd_type': u'long_related',
             u'pos_cash_flow': u'2273.57', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'2285.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'2273.57'},
            {u'return': u'539.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-132825.0', u'now_mkt_val': u'-132286.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'8673.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1327382.0', u'now_mkt_val': u'1336055.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'15399.439999999944', u'trd_type': u'long_related',
             u'pos_cash_flow': u'1274560.56', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'1289960.0', u'busi_date': u'2017-03-20',
             u'neg_cash_flow': u'1274560.56'},
            {u'return': u'197.7399999999907', u'trd_type': u'short_related',
             u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'-132825.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'1510820.28',
             u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'12845.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1320466.0',
             u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'3497135.97',
             u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'2177000.0', u'now_mkt_val': u'3497135.97',
             u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'},
            {u'return': u'-17.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'2247.0', u'now_mkt_val': u'2230.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1317897.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-38.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'2285.0', u'now_mkt_val': u'2247.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'5138.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1338449.0', u'now_mkt_val': u'-1333311.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-27671.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1336055.0', u'now_mkt_val': u'1308384.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2808.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'210860.0', u'now_mkt_val': u'208052.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'486.3400000000256', u'trd_type': u'long_related',
             u'pos_cash_flow': u'270696.66', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'271183.0', u'busi_date': u'2017-03-22',
             u'neg_cash_flow': u'270696.66'},
            {u'return': u'-2753.609999999986', u'trd_type': u'long_related',
             u'pos_cash_flow': u'694623.61', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'691870.0', u'busi_date': u'2017-03-21',
             u'neg_cash_flow': u'694623.61'},
            {u'return': u'-5420.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1289960.0', u'now_mkt_val': u'1284540.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2569.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1335880.0', u'now_mkt_val': u'-1338449.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1510820.28', u'now_mkt_val': u'306163.19',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'1299.6199999999953', u'trd_type': u'long_related',
             u'pos_cash_flow': u'209560.38', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'210860.0', u'busi_date': u'2017-03-21',
             u'neg_cash_flow': u'209560.38'},
            {u'return': u'-15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1320466.0', u'now_mkt_val': u'-1335880.0',
             u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'5451.600000000093', u'trd_type': u'long_related',
             u'pos_cash_flow': u'1321930.4', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'1327382.0', u'busi_date': u'2017-03-20',
             u'neg_cash_flow': u'1321930.4'},
            {u'return': u'150.9100000000035', u'trd_type': u'long_related',
             u'pos_cash_flow': u'128376.09', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'128527.0', u'busi_date': u'2017-03-20',
             u'neg_cash_flow': u'128376.09'},
            {u'return': u'-13175.030000000028', u'trd_type': u'short_related',
             u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'-1333311.0', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'}])
        r2 = _travel_row(r, '2017-03-23')
        self.assertEqual(int(r2.get("return")), -44623)
Example #7
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext(appName="Convert CSVs to DataFrame")
ss = SparkSession(sc)

# Convert street files
filesPath = "hdfs://namenode:9000/csvfiles/street/*.csv"
df = ss.read.csv(filesPath)
df.printSchema()
print("Number of rows: " + str(df.count()))

dfPath = "hdfs://namenode:9000/dataframes/street.csv"
df.write.csv(dfPath)

ss.stop()
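Note that read.csv with no options treats every line as data and every column as a string; if the source files carry a header row, a hedged variant is:

df = ss.read.csv(filesPath, header=True, inferSchema=True)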
Example #8
import subprocess
import threading

from pyspark import SparkConf, SparkContext
from pyspark.java_gateway import launch_gateway
from pyspark.sql import SparkSession


class SparkWithCustomGateway:
    # spark_nlp_config, memory, gpu, cache_folder, log_folder,
    # cluster_tmp_dir and output_level are free variables bound in the
    # enclosing scope of the original source.

    def __init__(self):
        spark_conf = SparkConf()
        spark_conf.setAppName(spark_nlp_config.app_name)
        spark_conf.setMaster(spark_nlp_config.master)
        spark_conf.set("spark.driver.memory", memory)
        spark_conf.set("spark.serializer", spark_nlp_config.serializer)
        spark_conf.set("spark.kryoserializer.buffer.max", spark_nlp_config.serializer_max_buffer)
        spark_conf.set("spark.driver.maxResultSize", spark_nlp_config.driver_max_result_size)

        if gpu:
            spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark)
        else:
            spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark)

        # SparkConf has no .config() method; these settings use .set() as well.
        if cache_folder != '':
            spark_conf.set("spark.jsl.settings.pretrained.cache_folder", cache_folder)
        if log_folder != '':
            spark_conf.set("spark.jsl.settings.annotator.log_folder", log_folder)
        if cluster_tmp_dir != '':
            spark_conf.set("spark.jsl.settings.storage.cluster_tmp_dir", cluster_tmp_dir)

        # Make the py4j JVM stdout and stderr available without buffering
        popen_kwargs = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
            'bufsize': 0
        }

        # Launch the gateway with our custom settings
        self.gateway = launch_gateway(conf=spark_conf, popen_kwargs=popen_kwargs)
        self.process = self.gateway.proc
        # Use the gateway we launched
        spark_context = SparkContext(gateway=self.gateway)
        self.spark_session = SparkSession(spark_context)

        self.out_thread = threading.Thread(target=self.output_reader)
        self.error_thread = threading.Thread(target=self.error_reader)
        self.std_background_listeners()

    def std_background_listeners(self):
        self.out_thread.start()
        self.error_thread.start()

    def output_reader(self):
        for line in iter(self.process.stdout.readline, b''):
            print(line.decode('utf-8'), end='')

    def error_reader(self):
        RED = '\033[91m'
        RESET = '\033[0m'
        for line in iter(self.process.stderr.readline, b''):
            if output_level == 0:
                print(RED + line.decode('utf-8') + RESET, end='')
            else:
                # output_level > 0 suppresses raw stderr
                pass

    def shutdown(self):
        self.spark_session.stop()
        self.gateway.shutdown()
        self.process.communicate()

        self.out_thread.join()
        self.error_thread.join()
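A hedged usage sketch for the class above, assuming the free variables (spark_nlp_config, memory, gpu, output_level and the folder settings) are bound as in the original enclosing scope:

custom = SparkWithCustomGateway()         # launches the gateway JVM and the session
try:
    custom.spark_session.range(5).show()  # any Spark work goes here
finally:
    custom.shutdown()                     # stops session, gateway and reader threads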
Example #9
import os
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F

# StockCustReturnByPrd, _travel_row and testBase come from the module
# under test.


class SCDHTest(testBase):
    def setUp(self):
        StockCustReturnByPrd.logLevel = 'debug'
        os.environ['SPARK_HOME'] = "/usr/local/Cellar/apache-spark/2.2.0/libexec"
        sys.path.append("/usr/local/Cellar/apache-spark/2.2.0/libexec/python")
        conf = SparkConf().setMaster("local").setAppName("hello")
        self.spark = SparkSession(SparkContext(conf=conf))
        self.scdh = StockCustReturnByPrd(self.spark)

    def tearDown(self):
        self.spark.stop()

    def test_mapPartition(self):
        df = self.spark.createDataFrame([{
            u"a": "1",
            u"b": "2"
        }, {
            u"a": "3",
            u"b": "4"
        }])
        df.persist()

        def t(ite, s):
            for ss in ite:
                yield {"t": ss.a, "s1": s}

        d = self.spark.createDataFrame(
            df.rdd.mapPartitions(lambda x: t(x, 1), 2))
        d.withColumn("month", F.lit("hh")).show()

    def test_local_spark(self):
        doc = self.spark.createDataFrame([['a', 'b', 'c'], ['b', 'd', 'd']])
        doc.show()
        print("successful!")

    def test_get_base_data(self):
        self.scdh._get_base_data("2017-03-16", "2017-03-18", 1, 5)

    def test_init_data(self):
        self.scdh.init_data()

    def test_daily_compute(self):
        self.scdh.daily_compute("2017-03-16", "2017-03-16")

    def test_check_1(self):
        sql = """
            SELECT * from adatatest.stock_cust_daily_return
            where short_return_rate>1 or long_return_rate>1 or total_return_rate>1
        """
        self.spark.sql(sql)

    # unit tests for return and return rate
    def test_travel_row(self):
        row1 = Row(trade_id=u'10036',
                   prd_no=u'2.300262',
                   prd_ind=u'\u73af\u5883\u4e0e\u8bbe\u65bd\u670d\u52a1',
                   detail_list=[{
                       u'return': u'5058',
                       u'trd_type': u'long_related',
                       u'pos_cash_flow': u'3297265',
                       u'exception_label': u'0',
                       u'pre_mkt_val': u'0.0',
                       u'now_mkt_val': u'3302324.0',
                       u'busi_date': u'2017-03-17',
                       u'neg_cash_flow': u'3297265'
                   }])

        row2 = Row(trade_id=u'10178',
                   prd_no=u'1.600729',
                   prd_ind=u'unknown',
                   detail_list=[
                       {
                           u'return': u'-18',
                           u'trd_type': u'long_related',
                           u'pos_cash_flow': u'13508',
                           u'exception_label': u'0',
                           u'pre_mkt_val': u'12000',
                           u'now_mkt_val': u'13490.0',
                           u'busi_date': u'2017-03-16',
                           u'neg_cash_flow': u'13508'
                       },
                       {
                           u'return': u'3816.0',
                           u'trd_type': u'short_related',
                           u'pos_cash_flow': u'0.0',
                           u'exception_label': u'0',
                           u'pre_mkt_val': u'-255036.0',
                           u'now_mkt_val': u'-251220.0',
                           u'busi_date': u'2017-03-17',
                           u'neg_cash_flow': u'0.0'
                       },
                       {
                           u'return': u'-381',
                           u'trd_type': u'short_related',
                           u'pos_cash_flow': u'45000',
                           u'exception_label': u'0',
                           u'pre_mkt_val': u'3121',
                           u'now_mkt_val': u'-255036.0',
                           u'busi_date': u'2017-03-16',
                           u'neg_cash_flow': u'0.0'
                       },
                       {
                           u'return': u'-400.0',
                           u'trd_type': u'long_related',
                           u'pos_cash_flow': u'0.0',
                           u'exception_label': u'0',
                           u'pre_mkt_val': u'13490.0',
                           u'now_mkt_val': u'13090.0',
                           u'busi_date': u'2017-03-17',
                           u'neg_cash_flow': u'0.0'
                       },
                   ])
        rowDict = _travel_row(row1, '2017-03-18')
        # return computed from a single record
        self.assertEqual(rowDict.get("return"), 5058, "return error!!!")
        # return rate computed from a single record
        return_rate = 5058 * 1.0 / (3297265 + 3297265 + 0.0)
        print(rowDict.get("return_rate"), return_rate)
        self.assertEqual(
            rowDict.get("return_rate"), return_rate,
            "first[{}],second[{}],msg[{}]".format(
                rowDict.get("return_rate"), return_rate,
                "return rate isn't expected"))
        # simulate all record types and compute return
        rowDict2 = _travel_row(row2, '2017-03-18')
        self.assertEqual(rowDict2.get("return"), -18 - 381 - 400.0 + 3816.0,
                         "return isn't expected")
        # simulate all record types and compute return_rate
        print(rowDict2.get("return_rate"), (-18 - 381 - 400.0 + 3816.0) / (
            12000 + 13508 + 13508 + 251220.0 + 45000))
        self.assertEqual(rowDict2.get("return_rate"),
                         (-18 - 381 - 400.0 + 3816.0) /
                         (12000 + 13508 + 13508 + 251220.0 + 45000),
                         "return rate isn't expected")

    def test_check_date_detail(self):
        # test trade_id=1987
        """
        stock_cust_return_by_prd.prd_ind	建筑机械与重型卡车
        stock_cust_return_by_prd.prd_no	2.000816
        stock_cust_return_by_prd.return	-115226.41999999993
        stock_cust_return_by_prd.return_rate	-0.009502134028067698
        stock_cust_return_by_prd.trade_id	1987
        stock_cust_return_by_prd.return_ratio	0.6923033340521149
        stock_cust_return_by_prd.return_rank	4
        stock_cust_return_by_prd.return_rate_rank	2
        stock_cust_return_by_prd.busi_date	2017-03-23
        stock_cust_return_by_prd.compute	7
        """
        r = Row(
            trade_id=u'1987',
            prd_no=u'2.000816',
            prd_ind=u'\u5efa\u7b51\u673a\u68b0\u4e0e\u91cd\u578b\u5361\u8f66',
            detail_list=[{
                u'return': u'-115226.41999999993',
                u'trd_type': u'long_related',
                u'pos_cash_flow': u'6063186.42',
                u'exception_label': u'0',
                u'pre_mkt_val': u'0.0',
                u'now_mkt_val': u'5947960.0',
                u'busi_date': u'2017-03-23',
                u'neg_cash_flow': u'6063186.42'
            }])
        rowDict2 = _travel_row(r, '2017-03-18').get("return")
        self.assertTrue(int(rowDict2) == -115226)
        # """
        # stock_cust_return_by_prd.prd_ind	基础化工
        # stock_cust_return_by_prd.prd_no	1.600301
        # stock_cust_return_by_prd.return	-5924.530000000028
        # stock_cust_return_by_prd.return_rate	-0.009933835241121989
        # stock_cust_return_by_prd.trade_id	12466
        # stock_cust_return_by_prd.return_ratio	0.051785613453405335
        # stock_cust_return_by_prd.return_rank	4
        # stock_cust_return_by_prd.return_rate_rank	6
        # stock_cust_return_by_prd.busi_date	2017-03-23
        # stock_cust_return_by_prd.compute	7
        # """
        r2 = Row(trade_id=u'12466',
                 prd_no=u'1.600301',
                 prd_ind=u'\u57fa\u7840\u5316\u5de5',
                 detail_list=[{
                     u'return': u'225.0',
                     u'trd_type': u'long_related',
                     u'pos_cash_flow': u'0.0',
                     u'exception_label': u'0',
                     u'pre_mkt_val': u'296100.0',
                     u'now_mkt_val': u'296325.0',
                     u'busi_date': u'2017-03-22',
                     u'neg_cash_flow': u'0.0'
                 }, {
                     u'return': u'-4050.0',
                     u'trd_type': u'long_related',
                     u'pos_cash_flow': u'0.0',
                     u'exception_label': u'0',
                     u'pre_mkt_val': u'296325.0',
                     u'now_mkt_val': u'292275.0',
                     u'busi_date': u'2017-03-23',
                     u'neg_cash_flow': u'0.0'
                 }, {
                     u'return': u'-2099.530000000028',
                     u'trd_type': u'long_related',
                     u'pos_cash_flow': u'298199.53',
                     u'exception_label': u'0',
                     u'pre_mkt_val': u'0.0',
                     u'now_mkt_val': u'296100.0',
                     u'busi_date': u'2017-03-21',
                     u'neg_cash_flow': u'298199.53'
                 }])
        rd = _travel_row(r2, "2017-03-18").get("return")
        self.assertEqual(int(rd), -5924)
        rd_rate = _travel_row(r2, "2017-03-18").get("return_rate")
        print(rd_rate)

    def test_checkdata_sql(self):
        df = self.spark.sql("""
        select df.trade_id,df.prd_no,df.return,df.return_rate,df.return_ratio,df.return_rank,
        df.return_rate_rank,busi_date from adatatest.stock_cust_return_by_prd df
        where return>0
         """)

        df.where(df.trade_id == '17898').where(
            df.busi_date == '2017-03-31').where(
                df.prd_no == '2.002763').select(
                    df.trade_id, df.prd_no, "return", df.return_rate,
                    df.return_ratio, df.return_rank, df.return_rate_rank,
                    df.busi_date).orderBy("return_rank", "busi_date").show()

        df.where(df.trade_id == '17898').where(
            df.busi_date == '2017-03-31').where(
                df.prd_no == '2.002763').select("return_ratio").orderBy(
                    "return_rank", "busi_date").show()