# Example #1
def mergeFiles(config, vendorDS):
    """Merge the freshly written temp serving CSV with the previous serving
    data and write the result to a date-stamped serving path.

    Rows in the old serving file whose InvoiceID appears in ``vendorDS`` are
    dropped first, so the temp file's versions of those invoices win.

    :param vendorDS: incoming vendor DataFrame carrying _InvoiceNumber.
    :return: None
    """
    paths = config['InvoicePath']
    servPath = paths['servingPath']
    tempPath = paths['servingTempPath']
    oldServPath = paths['oldServingPath']

    # The temp file is needed on both branches; read it once.
    tempDS = createDatasetFromCSVFile(config, tempPath)

    if os.path.exists(oldServPath):
        # Invoice ids present in the incoming vendor data — these rows in
        # the old serving file are superseded by the temp file.
        filterID = vendorDS.select(collect_set(
            col("_InvoiceNumber"))).first()[0]

        oldServDS = createDatasetFromCSVFile(config, oldServPath)

        # Keep every old invoice except the ones being replaced.
        keptDS = oldServDS.filter(~(col("InvoiceID").isin(*filterID)))

        DS = keptDS.union(tempDS)
    else:
        DS = tempDS

    # Write the merged result into a date-stamped serving directory.
    writeIntoServingLayer(DS, servPath + "_" + str(datetime.date.today()),
                          "Overwrite")

    # Clean up the temporary path.
    shutil.rmtree(tempPath)
    return None
    # NOTE(review): this def sits inside mergeFiles *after* its `return None`,
    # so the statement is never executed and the name is never bound.  It
    # looks like it was meant to be a module-level function — confirm intent
    # and dedent if so.
    def transfLineitems(ds, config):
        """Explode each invoice's _LineItems into one row per line item and
        map each description to a Genesis type-of-service label.

        Produces/overwrites columns: TypeOfService, ServiceAmount,
        ServiceNRate (intermediate zip of the two).

        :param ds: invoice DataFrame with a _LineItems struct/array column.
        :param config: config mapping providing ["Master"]["billPath"].
        :return: DataFrame with one row per line item.
        """
        invoiceDS: DataFrame = ds

        # Pull the line-item descriptions and net totals out of _LineItems.
        invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
            .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

        # When _LineItems held a single struct (not an array), the extracted
        # columns are plain strings — wrap them in single-element arrays so
        # the arrays_zip/explode below works uniformly.
        if invoiceDS.schema["TypeOfService"].dataType == StringType():
            invoiceDS = invoiceDS.withColumn("TypeOfService", array(col("_LineItems._Description"))) \
                .withColumn("ServiceAmount", array(col("_LineItems._TotalPriceNetto")))

        # Pair each description with its amount, then explode to one row per
        # line item (explode_outer keeps invoices with no line items).
        invoiceDS = invoiceDS.withColumn("ServiceNRate", arrays_zip(col("TypeOfService"), col("ServiceAmount")))\
                    .withColumn("ServiceNRate", explode_outer(col("ServiceNRate")))

        # udf_service = udf(get_genesis_TOS, StringType()) # same code of line 56, remove costTypeList

        # Collect the known Genesis cost line-item types from the bill master
        # CSV; this list is closed over by the UDF below.
        billPath = config["Master"]["billPath"]
        CostTypeList = createDatasetFromCSVFile("", billPath).filter(col("VENDOR_NAME").rlike("GENESIS MARINE, LLC"))\
            .rdd.map(lambda x: x.COST_LINE_ITEM_TYPE).collect()

        # Factory so the UDF captures the cost-type list at definition time.
        def udf_service(costType):
            return udf(lambda l: get_genesis_TOS_search(l, costType),
                       StringType())

        # Normalise newlines in the description, then map it to a Genesis
        # type-of-service via the UDF; expose the paired amount alongside.
        invoiceDS = invoiceDS.withColumn("TypeOfService", udf_service(CostTypeList)(regexp_replace(
                                                        col("ServiceNRate.TypeOfService"), "\n", " ")))\
            .withColumn("ServiceAmount", col("ServiceNRate.ServiceAmount")) \

        return invoiceDS
def getServingData(config, primary_key_column, fromDate, toDate):
    """Read the serving-layer CSV, optionally restrict it to a date range,
    and aggregate line-item amounts per invoice.

    :param config: config mapping providing the serving path and the
        Ecap column-name lists.
    :param primary_key_column: list of column names identifying one invoice.
    :param fromDate: inclusive lower bound on invoice_date ('' = unbounded).
    :param toDate: inclusive upper bound on invoice_date ('' = unbounded).
    :return: one distinct row per invoice with total_amount_lineitem
        (rounded sum) and t_lineitem_amount collected into a list.
    """
    servPath = config['InvoicePath']['servingPath']

    servingLayerColumns = config['Ecap']['servingLayerColumns']
    constServingLayerColumns = config['Ecap']['constServingLayerColumns']

    # reading the serving data
    serDS = createDatasetFromCSVFile(config,
                                     servPath).select(*servingLayerColumns)

    # renaming the columns, so downstream code is unaffected if the column
    # names in the serving layer change.
    serDS = serDS.toDF(*constServingLayerColumns)

    # Apply each date bound independently.  The previous version applied
    # BOTH bounds whenever EITHER was set, so a missing bound ('') made the
    # filter compare against lit('') and silently drop every row.
    if fromDate != '':
        serDS = serDS.filter(col('invoice_date') >= lit(fromDate))
    if toDate != '':
        serDS = serDS.filter(col('invoice_date') <= lit(toDate))

    # summing the TOS amount per invoice (window over the primary key)
    invoiceWindow = Window.partitionBy(*primary_key_column)
    serDS = serDS.withColumn('total_amount_lineitem',
                             sum(col('t_lineitem_amount')).over(invoiceWindow)) \
        .withColumn('total_amount_lineitem', round(col('total_amount_lineitem'), 2).cast(DoubleType())) \
        .withColumn('t_lineitem_amount',
                    collect_list(col('t_lineitem_amount')).over(invoiceWindow)) \
        .distinct()

    return serDS
def getEcapData(config, primary_key_column, fromDate, toDate):
    """Read the ECAP CSV, optionally restrict it to a date range, and sum
    the invoice amounts per invoice id.

    :param config: config mapping providing the ecap path and the
        Ecap column-name lists.
    :param primary_key_column: list of column names identifying one invoice.
    :param fromDate: inclusive lower bound on invoice_date ('' = unbounded).
    :param toDate: inclusive upper bound on invoice_date ('' = unbounded).
    :return: one row per invoice (dropDuplicates on the primary key) with
        total_amount_ecap summed/rounded and the raw per-line value kept in
        total_amount_lineitem_ecap.
    """
    ecapPath = config["Master"]["ecapPath"]

    ecapColumns = config['Ecap']['ecapColumns']
    constEcapColumns = config['Ecap']['constEcapColumns']

    # reading the ecap data
    ecapDS = createDatasetFromCSVFile(config, ecapPath).select(*ecapColumns)

    # renaming the columns, so downstream code is unaffected if the column
    # names in the ecap file change.
    ecapDS = ecapDS.toDF(*constEcapColumns)

    # Apply each date bound independently.  The previous version applied
    # BOTH bounds whenever EITHER was set, so a missing bound ('') made the
    # filter compare against lit('') and silently drop every row.
    if fromDate != '':
        ecapDS = ecapDS.filter(col('invoice_date') >= lit(fromDate))
    if toDate != '':
        ecapDS = ecapDS.filter(col('invoice_date') <= lit(toDate))

    # Strip thousands separators, cast to double, keep the per-line value,
    # then sum per invoice and deduplicate on the primary key.
    ecapDS = ecapDS.withColumn('total_amount_ecap',
                               regexp_replace(col('total_amount_ecap'), ',', '').cast(DoubleType())) \
        .withColumn('total_amount_lineitem_ecap', col('total_amount_ecap')) \
        .withColumn('total_amount_ecap', round(
        sum(col('total_amount_ecap')).over(Window.partitionBy(*primary_key_column)), 2)) \
        .dropDuplicates(primary_key_column)

    return ecapDS
def bookInfo(ds, bookPath):
    """Attach a BOOKS column to the invoice dataset.

    The master CSV may map one TRIP to several BOOK values; they are
    collected into a set per TRIP and joined as a comma-separated string
    (sets are stringified so the result can be saved as CSV).

    :param ds: invoice DataFrame carrying a _ShellTripID column.
    :param bookPath: path of the trip/book master CSV.
    :return: ds left-joined with the per-trip BOOKS column.
    """
    tripWindow = Window.partitionBy(col("TRIP"))

    # One distinct (TRIP, BOOKS) row per trip from the master CSV.
    masterBooks = (
        createDatasetFromCSVFile("", path=bookPath)
        .withColumn("BOOKS",
                    concat_ws(",", collect_set(col("BOOK")).over(tripWindow)))
        .select("TRIP", "BOOKS")
        .distinct()
    )

    # Left-join so invoices without a matching trip are kept, then drop the
    # master's join/raw columns.
    joined = ds.join(masterBooks, col("_ShellTripID") == col("TRIP"),
                     "left_outer")
    return joined.drop("TRIP", "BOOK")
def billTypeInfo(ds, billPath):
    """Attach the BillType column by joining the invoice dataset with the
    bill-type master CSV on vendor name and cost line-item type
    (both compared case-insensitively).

    :param ds: invoice DataFrame carrying _Name and TypeOfService columns.
    :param billPath: path of the bill-type master CSV.
    :return: left-joined DataFrame with the master's join columns dropped.
    """
    # reading billType CSV
    billDS = createDatasetFromCSVFile("", billPath)

    # joining invoice DS with billing DS to get the BillType column.
    # Fixed: the drop previously used the misspelled "VEDNOR_NAME"; Spark's
    # drop() silently ignores unknown columns, so the master's VENDOR_NAME
    # column leaked into the output.
    DS = ds.join(billDS,
                 (lower(ds._Name) == lower(billDS.VENDOR_NAME)) &
                 (lower(ds.TypeOfService) == lower(billDS.COST_LINE_ITEM_TYPE)),
                 "left_outer") \
        .drop("VENDOR_NAME", "COST_LINE_ITEM_TYPE")

    return DS
# Example #7
def updateFiles(config, vendorDS):
    """Merge the temp serving CSV with the previous serving data and rewrite
    the serving, old-serving, and parquet serving outputs.

    Like mergeFiles, but the old-serving existence check goes through
    `hadoop fs -test -e` (HDFS) instead of os.path.exists, and the function
    also refreshes the old-serving copy and a parquet mirror.  The trailing
    `testNeed`/`trail` section tries several alternative HDFS-deletion
    mechanisms and appears to be experimental cleanup code — confirm before
    relying on it.

    :param vendorDS: incoming vendor DataFrame carrying _InvoiceNumber.
    :return: None
    """
    servPath = config['InvoicePath']['servingPath']
    tempPath = config['InvoicePath']['servingTempPath']
    oldServPath = config['InvoicePath']['oldServingPath']
    newServingName = config['InvoicePath']['newServingName']
    parquetServingPath = config['InvoicePath']['parquetServingPath']

    # `hadoop fs -test -e` exits 0 when the path exists on HDFS.
    check_dir = subprocess.Popen(["hadoop", "fs", "-test", "-e", oldServPath],
                                 stdout=subprocess.PIPE)
    wait = check_dir.wait()  # wait() already returns the exit code; kept for parity
    check = check_dir.returncode

    # if os.path.exists(oldServPath):
    if check == 0:
        # Invoice ids present in the incoming vendor data — these rows in
        # the old serving file are superseded by the temp file.
        filterID = vendorDS.select(collect_set(
            col("_InvoiceNumber"))).first()[0]

        # read the temp file
        tempDS = createDatasetFromCSVFile(config, tempPath)

        # read the older serving file
        oldServDS = createDatasetFromCSVFile(config, oldServPath)

        # get all the invoice except the invoice need to update
        oldServDS = oldServDS.filter(~(col("InvoiceID").isin(*filterID)))

        DS = oldServDS.union(tempDS)
    else:
        DS = createDatasetFromCSVFile(config, tempPath)

    # writing into serving csv
    writeIntoServingLayer(DS, servPath, "Overwrite")

    # NOTE(review): this branch checks the LOCAL filesystem while the branch
    # above checked HDFS — presumably intentional for a local run, but the
    # two can disagree; confirm which filesystem the paths live on.
    if os.path.exists(oldServPath):
        # Re-read what was just written and mirror it to the old-serving and
        # parquet locations.
        serDS = createDatasetFromCSVFile(config, servPath)
        writeIntoServingLayer(serDS, oldServPath, "Overwrite")
        writeIntoParquetServingLayer(serDS, parquetServingPath, "Overwrite")
    else:
        writeIntoServingLayer(DS, oldServPath, "Overwrite")
        writeIntoParquetServingLayer(DS, parquetServingPath, "Overwrite")

    # # deleting temporary path
    # shutil.rmtree(tempPath)
    # # if os.path.exists(oldServPath):
    # #     shutil.rmtree(oldServPath)
    #
    # # renaming the serving csv file
    # file = os.listdir(servPath)
    # for files in file:
    #     if files.startswith("part"):
    #         os.rename(os.path.join(servPath,files), os.path.join(servPath,'InvoiceSummary.csv'))

    # Experimental cleanup variants, selected by config["test"]["trail"].
    # NOTE(review): trail 1 is an `if` while 2-5 are elif — harmless since
    # the values are mutually exclusive, but likely unintended.
    testNeed = config["test"]["testNeed"]
    if testNeed == 'yes':
        trail = config["test"]["trail"]
        uri = config["test"]["uri"]

        sc = singleSession().getSparkSession(config).sparkContext

        if trail == 1:
            # Shell out to the hadoop CLI: remove old serving, rename the
            # part file of the new serving output.
            subprocess.call(["hadoop", "fs", "-rm", "-r", oldServPath])
            subprocess.call(
                ["hadoop", "fs", "-mv", servPath + "/part*", newServingName])
        if trail == 2:
            subprocess.call(["hadoop", "fs", "-rm", "-f", oldServPath])
        elif trail == 3:
            # Use the JVM Hadoop FileSystem bound to the current Spark conf.
            fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
                sc._jsc.hadoopConfiguration())
            fs.delete(sc._jvm.org.apache.hadoop.fs.Path(oldServPath), True)
        elif trail == 4:
            # Same, but resolve the FileSystem from an explicit URI.
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(uri), sc._jsc.hadoopConfiguration())
            fs.delete(Path(oldServPath))
        elif trail == 5:
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(uri), sc._jsc.hadoopConfiguration())
            # We can now use the Hadoop FileSystem API (https://hadoop.apache.org/docs/current/api/org/apache/hadoop/fs/FileSystem.html)
            fs.listStatus(Path(oldServPath))

    return None