Example #1
 def test_DateString_Ymd(self):
     self.assertEqual(
         wd.dateCleanup('20200205').strftime("%Y%m%d%H%M%S"),
         '20200205000000')
Example #2
 def test_DateWithColonTimeNoSeparationFormat(self):
     self.assertEqual(
         wd.dateCleanup('2020:02:2812:30:00').strftime("%Y%m%d%H%M%S"),
         '20200228123000')
Example #3
 def test_DayLessThanTwelve(self):
     self.assertEqual(
         wd.dateCleanup('2020-02-05').strftime("%Y%m%d%H%M%S"),
         '20200205000000')
Example #4
 def test_alphaDateShortFormat(self):
     self.assertEqual(
         wd.dateCleanup('MAR 25 2020').strftime("%Y%m%d%H%M%S"),
         '20200325000000')
Example #5
 def test_DateWithTimeFormat(self):
     self.assertEqual(
         wd.dateCleanup('2020-02-28 12:30:00').strftime("%Y%m%d%H%M%S"),
         '20200228123000')
Example #6
 def test_germanDateFormatWithTime(self):
     self.assertEqual(
         wd.dateCleanup('07.04.2020 12:12:12').strftime("%Y%m%d%H%M%S"),
         '20200407121212')
Example #7
 def test_alphaDateFormat(self):
     self.assertEqual(
         wd.dateCleanup('march 25, 2020').strftime("%Y%m%d%H%M%S"),
         '20200325000000')
Example #8
 def test_germanDateFormat(self):
     self.assertEqual(
         wd.dateCleanup('07.04.2020').strftime("%Y%m%d%H%M%S"),
         '20200407000000')
Example #9
 def test_TZFormat(self):
     self.assertEqual(
         wd.dateCleanup('2015-03-26T10:58:51Z').strftime("%Y%m%d%H%M%S"),
         '20150326105851')
Example #10
 def test_epochFormat(self):
     self.assertEqual(
         wd.dateCleanup(1571824800000, epoch=True).strftime("%Y%m%d%H%M%S"),
         '20191023100000')
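
Taken together, the tests above pin down the behaviour expected of wd.dateCleanup: it accepts a wide range of date strings (compact YYYYMMDD, ISO 8601 with a trailing Z, German DD.MM.YYYY, month names, and a colon-separated form with no date/time separator) plus epoch milliseconds when epoch=True, and returns a datetime. Below is a minimal sketch of such a parser, assuming python-dateutil is available; the helper name dateCleanupSketch is hypothetical and the real wd.dateCleanup may work differently.

 from datetime import datetime, timezone
 from dateutil import parser as dateparser

 def dateCleanupSketch(val, epoch=False):
     # Hypothetical re-implementation inferred only from the tests above.
     if epoch:
         # The epoch test passes milliseconds: 1571824800000 -> 2019-10-23 10:00:00 UTC.
         return datetime.fromtimestamp(val / 1000, tz=timezone.utc)
     text = str(val)
     # Normalise the colon-only form '2020:02:2812:30:00' into '2020-02-28 12:30:00'.
     if len(text) == 18 and text.count(':') == 4:
         text = "{}-{}-{} {}".format(text[0:4], text[5:7], text[8:10], text[10:])
     # dateutil copes with 'YYYYMMDD', ISO 8601, 'MAR 25 2020', 'march 25, 2020', etc.;
     # dayfirst is enabled for the dotted German format so '07.04.2020' parses as 7 April.
     return dateparser.parse(text, dayfirst='.' in text)
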
Example #11
 def dc(val):
     result = wd.dateCleanup(val, epoch=True)
     return result
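
In a Spark job (see Example #12 below) a wrapper like dc is typically registered as a UDF and applied to a raw timestamp column. A minimal usage sketch, assuming pyspark and the wd module from the examples above are importable on the executors; the column name ts and the sample value are assumptions.

 from pyspark.sql import SparkSession, functions as F
 from pyspark.sql.types import TimestampType

 spark = SparkSession.builder.getOrCreate()

 # Register the wrapper as a UDF that yields a Spark timestamp.
 dateCleanup = F.udf(dc, TimestampType())

 # Apply it to an epoch-millisecond column named 'ts' (column name assumed).
 df = spark.createDataFrame([(1571824800000,)], ["ts"])
 df = df.withColumn("epoch_ts", dateCleanup(F.col("ts")))
 df.show(truncate=False)
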
Example #12
    def __init__(self, spark):
        # ---------------------------------------------------------------------
        # set config attributes
        # ---------------------------------------------------------------------
        if len(sys.argv) > 1:
            with open(sys.argv[1]) as dependencyFile:
                conf = json.load(dependencyFile)
            for x in conf:
                setattr(parent, x, conf.get(x, ""))
        # ---------------------------------------------------------------------
        # set the process state
        # ---------------------------------------------------------------------
        processStart = wd.dateCleanup(
            datetime.utcnow().strftime("%Y%m%d%H%M%S"))

        # ---------------------------------------------------------------------
        # create udf dateCleanup
        # ---------------------------------------------------------------------

        def dc(val):
            result = wd.dateCleanup(val, epoch=True)
            return result

        dateCleanup = F.udf(dc, TimestampType())
        # ---------------------------------------------------------------------
        # set audit
        # ---------------------------------------------------------------------
        tableId = self.name
        columnAudit = "tableid,ins_gmt_ts,process_timestamp"
        namespaceAudit = "ea_sc_kif"
        tableAudit = "batch_process_times"
        cfAudit = "o"
        dfAudit = DataFrame(
            spark.sparkContext._jvm.com.hpe.hbase.HbaseManager.getDF(
                columnAudit, namespaceAudit, tableAudit, cfAudit),
            spark).where(F.col("tableid") == tableId)
        # ---------------------------------------------------------------------
        # get last processed time
        # ---------------------------------------------------------------------
        col = "ins_gmt_ts"
        lastProcessedDate = dfAudit.select(col).collect()
        if len(lastProcessedDate) > 0:
            DTS = str([ele[col] for ele in lastProcessedDate][0])
            deltaFilter = (F.col('epoch_ts') > DTS)
        else:
            DTS = ""
            deltaFilter = ""
        # ---------------------------------------------------------------------
        # get mapper attributes
        # ---------------------------------------------------------------------
        with open(self.app["mapper-properties"]) as mapperFile:
            topMap = json.load(mapperFile)
        # ---------------------------------------------------------------------
        # set hbase table properties
        # ---------------------------------------------------------------------
        hbase = topMap["hbase"]
        table = hbase.get("table", "none")
        cf = hbase.get("cf", "none")
        namespace = hbase.get("namespace", "none")
        column = hbase.get("column", "none")
        latestVersionMapped = hbase.get("latestVersion", "none")
        if latestVersionMapped == "True":
            latestVersion = True
        else:
            latestVersion = False
        # ---------------------------------------------------------------------
        # create data frame
        # ---------------------------------------------------------------------
        df = spark.sparkContext \
            ._jvm.com.hpe.hbase \
            .HbaseManager.getDF(column,
                                namespace,
                                table,
                                cf,
                                latestVersion)
        pyDF = DataFrame(df, spark)
        pyDF = pyDF.withColumn(
            "epoch_ts",
            F.to_timestamp(dateCleanup(pyDF["ts"]), 'yyyy-MM-dd HH:mm:ss'))
        sqlDF = pyDF
        if type(deltaFilter) is not str:
            sqlDF = sqlDF.where(deltaFilter)
        # ---------------------------------------------------------------------
        # max date
        # ---------------------------------------------------------------------
        maxDate = dfColumnToString(
            sqlDF.agg(F.max("epoch_ts")).select("max(epoch_ts)"),
            "max(epoch_ts)")
        # ---------------------------------------------------------------------
        # if no delta results available then end else process delta
        # ---------------------------------------------------------------------
        if maxDate:
            printInc = ("Found delta maxdate of '{}'." +
                        " Starting incremental update on process of " +
                        "delta records.")
            print(printInc.format(maxDate))
            # -----------------------------------------------------------------
            # set checkpoint dir
            # -----------------------------------------------------------------
            spark \
                .sparkContext \
                .setCheckpointDir(self.app["checkpoint"])
            sqlDF.persist(StorageLevel.MEMORY_AND_DISK)
            sqlDF.take(1)
            # -----------------------------------------------------------------
            # set recursive table properties
            # -----------------------------------------------------------------
            recursive = topMap["recursive"]
            id = str(recursive.get("id", "none"))
            parentid = str(recursive.get("parentid", "none"))
            parentLookupCol = str(recursive.get("parentLookupCol", "none"))
            lookupCol = str(recursive.get("lookupCol", "none"))
            levels = recursive.get("levels", "2")
            # -----------------------------------------------------------------
            # set output DF columns
            # -----------------------------------------------------------------
            outCols = stringToList(column)
            outCols.insert(0, id)
            outCols.append(lookupCol)
            # -----------------------------------------------------------------
            # run recursiveLookup
            # -----------------------------------------------------------------
            recursiveDF = recursiveLookup(sqlDF, id, parentid, parentLookupCol,
                                          lookupCol, levels).select(outCols)
            recursiveDF = recursiveDF.withColumn(
                "parentlevel",
                F.col("parentlevel").cast(StringType()))
            recursiveDF = recursiveDF.withColumn("key", F.col(id))
            # -----------------------------------------------------------------
            # store df to hbase
            # -----------------------------------------------------------------
            outputCols = appendString("key", "parentlevel", lookupCol)
            print("writing recursive output to Hbase . . .")
            spark.sparkContext \
                ._jvm.com.hpe.hbase \
                .HbaseManager.setDF(recursiveDF._jdf,
                                    outputCols,
                                    "key",
                                    namespace,
                                    table,
                                    cf)
            # -----------------------------------------------------------------
            # close process state
            # -----------------------------------------------------------------
            processEnd = wd.dateCleanup(
                datetime.utcnow().strftime("%Y%m%d%H%M%S"))
            dfAuditWrite = sqlDF.agg(F.max("epoch_ts")
                                     .alias("ins_gmt_ts")) \
                .select(F.col("ins_gmt_ts").cast(StringType())) \
                .withColumn("tableid", F.lit(tableId)) \
                .withColumn("process_start_ts",
                            F.lit(processStart).cast(StringType())) \
                .withColumn("process_end_ts", F.lit(processEnd
                                                    ).cast(StringType()))
            auditWriteCols = "tableid,ins_gmt_ts,process_start_ts" \
                + ",process_end_ts"
            # -----------------------------------------------------------------
            # write to the audit log
            # -----------------------------------------------------------------
            print("updating audit log . . .")
            spark.sparkContext \
                ._jvm.com.hpe.hbase \
                .HbaseManager \
                .setDF(dfAuditWrite._jdf,
                       auditWriteCols,
                       "tableid",
                       namespaceAudit,
                       tableAudit,
                       cfAudit)
            # -----------------------------------------------------------------
            # add final df to class (if run outside of __main__)
            # -----------------------------------------------------------------
            self.df = recursiveDF
        else:
            # -----------------------------------------------------------------
            # No delta results
            # -----------------------------------------------------------------
            print("No delta available.")