def mainOps(spark = None, baseSource = 'Not_ODS', snapshotsNeedread = False, todaysDate = 0):
    quiet_logs(sc_ = spark)
    #print("Start reading text data")
    hc, sqlc = hiveInit(sc_ = spark)
    groupedDict = globalMain(sc_ = spark)
    print("start reading dictionary base")
    dictionaryBase = None

    if baseSource == 'ODS':
        # Build the base dictionary from the ODS day groups.
        final_list = groupedDict['20184521'] + groupedDict['20184522'] + groupedDict['20184523']
        print len(final_list)
        dictionaryBase = transferSnapshots(sqlc, final_list, spark)  #'/npd/s_test2/dictionaryBase/')
        print "size of base: " + str(dictionaryBase.count())
        print("start writing down dictionary base")
        deletePath('/npd/s_test2/dictionaryBase/', sc_ = spark)
        writeDown(dictionaryBase, '/npd/s_test2/dictionaryBase/')
        print("end of writing down")

    if baseSource == 'ODS_BASE':
        # Re-read the previously written base and copy it to a second location.
        dictionaryBase = startReadingfromhdfs(sqlc = sqlc, listOffiles = '/npd/s_test2/dictionaryBase/', multi = 1, spark = spark)
        deletePath('/npd/s_test2/dictionaryBase1/', sc_ = spark)
        writeDown(dictionaryBase, '/npd/s_test2/dictionaryBase1/')

    if snapshotsNeedread:
        deletePath('/npd/s_test2/snapshotFilestemp/', sc_ = spark)
        snapshotIndex = 1
        totalsnapShots = sqlc.createDataFrame(spark.emptyRDD(), StructType([]))
        fileList = []
        for each in groupedDict.iteritems():
            if str(each[0]).find('201845') != -1:
                print "Base files written on that day"
                continue
            print "snapshot id: " + str(each[0])
            #print "start reading " + str(snapshotIndex) + " snapshot"
            fileList = fileList + each[1]
        print fileList

        snapshotRdd = startReadingfromhdfs(sqlc = sqlc, listOffiles = fileList, spark = spark)
        print "start writing snapshot files"
        writeDown(snapshotRdd, '/npd/s_test2/snapshotFilestemp/')

        nondupSnapshotrdd = startOverlapdetector(snapshotRdd, ['./createExternalTable.sql'], sqlc, hc, spark)
        # NOTE: the original left this call commented out, which left nondupBaserdd
        # undefined below; dictionaryBase is assumed to be the intended input here.
        nondupBaserdd = startOverlapdetector(dictionaryBase, ['./createExttabledictbase.sql'], sqlc, hc, spark)
        #colNames = columnRenaming(listNames)
        #snapshotBase = snapshotBase.selectExpr(colNames)

        # Drop base rows whose poi_id already appears in the snapshot, then append the snapshot rows.
        array = nondupSnapshotrdd.select(['poi_id'])
        array = array.rdd.map(lambda x: x.poi_id).collect()
        #array = [lit(poi_id).alias("poi_id").cast("long") for poi_id in array]
        base = nondupBaserdd.where(~col("poi_id").isin(array))
        #base = startFilteringfrombase(nondupBaserdd,nondupSnapshotrdd)
        base = base.unionAll(nondupSnapshotrdd)
        base = base.withColumn("updated", base["updated"].cast("string"))
        base = base.withColumn("added", base["added"].cast("string"))
        print "dictionary base size: " + str(base.count())

        deletePath('/npd/s_test2/uniqueBasedictionary', sc_ = spark)
        writeDown(base, '/npd/s_test2/uniqueBasedictionary')
        upc_map = generateUPCmap(rdddata = base)
        sku_map = generateSKUmap(rdddata = base)
        model_map = generateMODELmap(rdddata = base)
    return
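# ---------------------------------------------------------------------------
# Hypothetical driver sketch (not part of the original module). It assumes,
# as the calls above suggest, that quiet_logs/hiveInit take a SparkContext via
# their sc_ argument and that no other SparkContext is already running; the
# application name is illustrative only.
def _example_run_main_ops():
    from pyspark import SparkContext
    sc = SparkContext(appName = 'dictionaryMainOps')
    # Rebuild the dictionary base from the ODS day groups, then fold in the
    # remaining snapshot files and regenerate the maps.
    mainOps(spark = sc, baseSource = 'ODS', snapshotsNeedread = True)
    sc.stop()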
def generateMaps(baseDict = None, spark = None, upcM = 1, skuM = 1, modelM = 0,
                 root_directory1 = '/npd/test/maps/dictionary/itemid_maps/',
                 root_directory2 = '/npd/test/maps/dictionary/itemid_maps2/',
                 businessId = None, sku_map_dir = None, upc_map_dir = None, mod_map_dir = None):
    # Guard against a missing or empty base dictionary before building any map.
    if baseDict is None or baseDict.rdd.isEmpty():
        print "No data to generate the mappings"
        print "System is exiting and returning"
        return "No map has been generated"

    # UPC map
    upc_map = generateUPCmap(rdddata = baseDict) if upcM == 1 else None
    if upc_map is not None:
        if businessId is None:
            deletePath(upc_map_dir, sc_ = spark)
        else:
            deletePath(root_directory2 + 'upc_map/' + businessId, sc_ = spark)
        path_name = root_directory2 + 'upc_map' if businessId is None else root_directory2 + 'upc_map/' + businessId
        writeDown(upc_map, upc_map_dir, partitions = 400)
        upc_map.unpersist()

    # SKU map
    sku_map = generateSKUmap(rdddata = baseDict) if skuM == 1 else None
    if sku_map is not None:
        if businessId is None:
            deletePath(sku_map_dir, sc_ = spark)
        else:
            deletePath(root_directory2 + 'sku_map/' + businessId, sc_ = spark)
        path_name = root_directory2 + 'sku_map/' if businessId is None else root_directory2 + 'sku_map/' + businessId
        writeDown(sku_map, sku_map_dir, partitions = 400)
        sku_map.unpersist()

    # Model map (written under root_directory2, unlike the UPC/SKU maps, which
    # go to the explicit *_dir arguments above).
    path_name = root_directory2 + 'model_map/' if businessId is None else root_directory2 + 'model_map/' + businessId
    model_map = generateMODELmap(rdddata = baseDict) if modelM == 1 else None
    if model_map is not None:
        if businessId is None:
            deletePath(root_directory2 + 'model_map/', sc_ = spark)
        else:
            deletePath(root_directory2 + 'model_map/' + businessId, sc_ = spark)
        writeDown(model_map, path_name, partitions = 400)
        model_map.unpersist()
    #rdd = testing_readData(spark , hc)
    return "End of Successful generation and write down of maps"
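# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): generateMaps
# writes the UPC and SKU maps to the explicit *_dir arguments, so those must
# be supplied whenever upcM/skuM are left at 1. The directories and the `base`
# DataFrame (e.g. the uniqueBasedictionary output of mainOps above) are
# illustrative only.
def _example_generate_maps(base, sc):
    return generateMaps(baseDict = base,
                        spark = sc,
                        upcM = 1, skuM = 1, modelM = 0,
                        upc_map_dir = '/npd/test/maps/dictionary/itemid_maps2/upc_map/',
                        sku_map_dir = '/npd/test/maps/dictionary/itemid_maps2/sku_map/')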
def startFilteringstep(spark):
    hc, sqlc = hiveInit(sc_ = spark)

    # Read the unique base dictionary and rename its columns.
    dictionaryBase = startReadingfromhdfs(sqlc = sqlc, listOffiles = '/npd/s_test2/uniqueBasedictionary', multi = 1, spark = spark)
    dictionaryBase.show()
    listNames = ["poi_id", "business_id", "posoutlet", "outletdivision", "outletdepartment",
                 "outletsubdepartment", "outletclass", "outletsubclass", "outletbrand",
                 "outletitemnumber", "outletdescription", "outletbrandmatch", "outletitemnumbermatch",
                 "outletdescriptionmatch", "sku", "manufacturercodetype", "manufacturercode",
                 "zzzppmonthfrom", "zzzppmonthto", "zzzppmonthlastused", "itemid", "itemtype",
                 "price", "manufacturercodestatus", "loadid", "status", "added", "updated",
                 "ppweekfrom", "ppweekto", "ppweeklastused", "matched_country_code",
                 "previous_poiid", "include_data_ppmonthfrom", "include_data_ppweekfrom",
                 "manufacturercodematch", "skumatch", "unitofmeasure", "packsize",
                 "manufacturername", "manufacturernamematch", "privatelabel",
                 "outletdescriptionsupplement", "total_confidence_score",
                 "parent_poiid", "parent_poiid_status"]
    colNames = columnRenaming(listNames)
    dictionaryBase = dictionaryBase.selectExpr(colNames)
    dictionaryBase.show()

    # Read the unique snapshot files and rename their columns.
    snapshotBase = startReadingfromhdfs(sqlc = sqlc, listOffiles = '/npd/s_test2/uniqueSnapshotFilestemp', multi = 1, spark = spark)
    snapshotBase.show()
    listNames1 = [name + "1" for name in listNames]
    #ilistNames1 = ["poi_id1","business_id1","posoutlet1","outletdivision1", "outletdepartment1","outletclass1","outletbrand1","outletitemnumber1" ,"outletdescription1","outletbrandmatch1","manufacturercode1", "sku1", "itemid1", "itemtype1" , "price1" ,"manufacturercodestatus1" ,"loadid1", "status1", "added1", "updated1" , "matched_country_code1", "previous_poiid1", "parent_poiid1" ," parent_poiid_status1"]
    colNames = columnRenaming(listNames)  # NOTE: listNames1 above is never used; this call may have been intended to take listNames1.
    snapshotBase = snapshotBase.selectExpr(colNames)

    # Collect the snapshot poi_ids, drop matching rows from the base, then append the snapshot rows.
    array = snapshotBase.select(['poi_id'])
    array = array.rdd.map(lambda x: x.poi_id).collect()
    #array = [lit(poi_id).alias("poi_id").cast("long") for poi_id in array]
    base = dictionaryBase.where(~col("poi_id").isin(array))
    #startFilteringfrombase(dictionaryBase,snapshotBase)
    base = base.unionAll(snapshotBase)
    writeDown(base, '/npd/s_test2/finalResults/')
    return
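# ---------------------------------------------------------------------------
# Hypothetical sketch (not part of the original module): the collect()+isin()
# step above pulls every snapshot poi_id to the driver, which can be costly
# for large snapshots. A left anti join keeps the same "base rows whose poi_id
# is absent from the snapshot" semantics entirely on the executors; the
# "leftanti" join type is already used elsewhere in this module. The helper
# name and its arguments are illustrative only.
def _example_filter_base_with_anti_join(dictionaryBase, snapshotBase):
    # Rows of dictionaryBase whose poi_id does not appear in snapshotBase ...
    filtered = dictionaryBase.alias("b").join(
        snapshotBase.select("poi_id").alias("s"),
        psf.col("b.poi_id") == psf.col("s.poi_id"),
        "leftanti")
    # ... followed by the snapshot rows themselves, mirroring the unionAll above.
    return filtered.unionAll(snapshotBase)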
def updateOdsposoutlet(snapshotRdd, baseDict = None, itemidRdd = None, process_dict = 0, spark = None,
                       ranges = 2, readHdfs = 1, repartBase = 0, appendMode = 0, addpartitionCol = 0,
                       process_zero = 0, listOffiles = None, fileList = None, rddwithPartition = None,
                       lastFilenumber = None, configOb = None, table_name = None,
                       hdfs_output = '/npd/s_test2/uniqueBasedictionary/', debug = 0, writeTohdfs = 0):
    hc, sqlc = hiveInit(sc_ = spark)

    if configOb is None:
        print "configuration object cannot be None"
        print "system exiting"
        sys.exit(0)
    if fileList is None or len(fileList) == 0:
        print "fileList is empty, exiting"
        sys.exit(0)

    _, last_file_num, _ = getLastfilenumber(fileList)
    table_name = configOb.hivedbOb.get_dbName(index = 0) + "." + configOb.hivedbOb.get_tabNames(dbName_ = configOb.hivedbOb.get_dbName(index = 0), index = 2) if table_name is None else table_name

    if baseDict is None:
        print "the base to be updated cannot be None"
        print "End of the operation, no update operation for base dictionary"
        sys.exit(0)

    print("add partitions to snapshot data")
    snapshotRdd, itemidRdd, tracker = automation(rdd = snapshotRdd, joinedRdd = rddwithPartition, itemidRdd = itemidRdd, spark = spark, hc = hc, sqlc = sqlc)

    print("Start detecting overlap data and return uniques")
    nondupSnapshotrdd = startOverlapdetector(snapshotRdd, ['src/main/python/dictionary/fileSource/hivecreateScripts/createExternalTable.sql'], sqlc, hc, spark)
    baseDict = startOverlapdetector(baseDict, ['src/main/python/dictionary/fileSource/hivecreateScripts/createExternalTable.sql'], sqlc, hc, spark) if process_dict == 1 else baseDict
    print "End of detecting the overlap data and returning the uniques"

    print "start separating the zero itemids"
    zeroRdd = nondupSnapshotrdd.where(nondupSnapshotrdd.itemid == 0) if process_zero == 1 else sqlc.createDataFrame(spark.emptyRDD(), StructType([]))
    nondupSnapshotrdd = nondupSnapshotrdd.filter(nondupSnapshotrdd.itemid != 0) if not zeroRdd.rdd.isEmpty() else nondupSnapshotrdd
    zerobaseRdd = baseDict.where(baseDict.itemid == 0) if process_zero == 1 else sqlc.createDataFrame(spark.emptyRDD(), StructType([]))
    baseDict = baseDict.filter(baseDict.itemid != 0) if not zerobaseRdd.rdd.isEmpty() else baseDict
    #print "size of base dictionary after filtering 0 itemid: " + str(baseDict.count())
    zeroRdd = zeroRdd.withColumn("poi_id", zeroRdd["poi_id"].cast("long")) if not zeroRdd.rdd.isEmpty() else zeroRdd
    zeroRdd = addPartitionColumn(zeroRdd) if not zeroRdd.rdd.isEmpty() else zeroRdd
    # (the original guarded this cast with `if zeroRdd is False`, which never fires;
    #  corrected to the empty check used on the surrounding lines)
    zeroRdd = zeroRdd.withColumn("partitioner", zeroRdd["partitioner"].cast("string")) if not zeroRdd.rdd.isEmpty() else zeroRdd
    #print "size of non zero snapshot itemid: " + str(nondupSnapshotrdd.count())

    print "start separating the zero itemid for base dictionary"
    zerobaseRdd = zerobaseRdd.repartition(baseDict.rdd.getNumPartitions()) if not zerobaseRdd.rdd.isEmpty() else zerobaseRdd
    zerobaseRdd = zerobaseRdd.withColumn("poi_id", zerobaseRdd["poi_id"].cast("long")) if not zerobaseRdd.rdd.isEmpty() else zerobaseRdd
    zerobaseRdd = addPartitionColumn(zerobaseRdd) if not zerobaseRdd.rdd.isEmpty() else zerobaseRdd
    zerobaseRdd = zerobaseRdd.withColumn("partitioner", zerobaseRdd["partitioner"].cast("string")) if not zerobaseRdd.rdd.isEmpty() else zerobaseRdd
    #print "size of non zero itemid: " + str(zerobaseRdd.count())

    if nondupSnapshotrdd.rdd.isEmpty():
        print "snapshot rdd is empty"
        print "if non-dup snapshots are empty we can avoid overwriting the database and hdfs"
        print "system is exiting"
        sys.exit(0)
    print "End of separating the zero itemids for base dictionary"

    #print ("Find the partition for each itemid")
    final_rdd = nondupSnapshotrdd
    #final_rdd, itemidRdd, tracker = automation(rdd = nondupSnapshotrdd,joinedRdd = rddwithPartition, itemidRdd = itemidRdd,spark = spark, hc = hc, sqlc = sqlc)
    final_rdd = final_rdd.unionAll(zeroRdd) if not zeroRdd.rdd.isEmpty() else final_rdd

    print("type cast the updated and added date column")
    final_rdd = final_rdd.withColumn("updated", final_rdd["updated"].cast("string"))
    final_rdd = final_rdd.withColumn("added", final_rdd["added"].cast("string"))

    print("read the base dictionary")
    ########### Reading unique base dictionary using spark csv reader ########################
    ################# reading unique base dictionary using hive external table ###############
    print("perform left anti join on poi_id to retrieve unique poi_id based records")
    #print ("final_rdd size before left anti join " + str(final_rdd.count()))
    baseDict = baseDict.withColumn("itemid", baseDict["itemid"].cast("long"))
    baseDict = baseDict.withColumn("poi_id", baseDict["poi_id"].cast("long"))
    final_rdd = final_rdd.withColumn("poi_id", final_rdd["poi_id"].cast("long"))
    itemidRdd = itemidRdd.withColumn("vitemid", itemidRdd["vitemid"].cast("long"))
    condition_list = [psf.col("basetemp.itemid") == psf.col("finalrddtemp.itemid"), psf.col("basetemp.poi_id") == psf.col("finalrddtemp.poi_id")]

    if readHdfs == 1:
        baseDict.persist()
        itemidRdd.persist()

    # Attach the partitioner column by joining the itemid -> partition lookup, if requested.
    baseDict = baseDict.alias('basedict').join(itemidRdd.alias('itemidrdd'), (psf.col("basedict.itemid") == psf.col("itemidrdd.vitemid")), 'inner') if addpartitionCol == 1 else baseDict
    #print "size of the base dictionary after adding the partitioner column: " + str(baseDict.count())
    baseDict = baseDict.drop("vitemid") if addpartitionCol == 1 else baseDict
    #baseDict = baseDict.select([column for column in baseDict.columns if column not in droplist])
    #final_rdd = createBroadcast(final_rdd, spark)

    # Keep only base rows whose poi_id is not present in the new snapshot data, then append the new rows.
    baseDict = baseDict.alias("basetemp").join(final_rdd.alias("finalrddtemp"), (psf.col("basetemp.poi_id") == psf.col("finalrddtemp.poi_id")), "leftanti")
    #if appendMode is 0 else final_rdd.alias("finalrddtemp").join(baseDict.alias("basetemp"), (psf.col("finalrddtemp.poi_id") == psf.col("basetemp.poi_id")),"leftanti")
    baseDict = baseDict.unionAll(final_rdd) if configOb.append_in_hive == 0 else baseDict
    baseDict = baseDict.unionAll(zerobaseRdd) if not zerobaseRdd.rdd.isEmpty() else baseDict
    #if debug is True :
    #    print ("final base dict size after left anti join and union of new add all " + str(baseDict.count()))
    #baseDict = baseDict.unionAll(final_rdd)

    print "Repartition the base dictionary data before writing"
    baseDict = baseDict.repartition(400)
    #print "count of basedictionary data: " + str(baseDict.count())
    #listOfdata = baseDict.groupBy("partitioner").count().select("partitioner", psf.col("count").alias("counting")).rdd.map(lambda x: (x.partitioner, x.counting)).collect()
    #for each in listOfdata:
    #    print "partitioner: " + str(each[0]) + " count: " + str(each[1])

    if writeTohdfs == 1:
        print "delete the " + hdfs_output + " path before writing it back"
        deletePath(hdfs_output, sc_ = spark)
        print "writing down the unique snapshots retrieved"
        writeDown(baseDict, hdfs_output)
        print "End of writing down the unique table into hdfs"

    print "Start writing back to hive table"
    #print "get a hive write back object"
    #hivewritingback = hivewriteback(spark = spark)
    #hivewritingback.insertIntopartitiontable(partitionFiles = listOffiles , dictRdd = baseDict, append = 0)
    #writebackTohive(baseDict,append = 0 if appendMode is 0 else 1 )
    #print "End of hive transfer"

    print "Update lastfile read number in configuration file"
    getLastfilenumber(rw = 1, file_num_ = last_file_num, ft = 0)

    print("updating itemIdWithPartition file with new information")
    if len(tracker) != 0:
        print "start updating the itemidpartition.txt file with newly added itemids"
        with open('src/main/python/dictionary/maps/itemIdWithPartition08.txt', 'a+') as dataWriter:
            for key, value in tracker.iteritems():
                dataWriter.write("{}\n".format(str(1) + '\t' + str(key) + '\t' + value.strip()))
        print "End of updating the itemidpartition file with newly added itemids"
    #executeScripts('src/main/python/dictionary/fileSource/hivecreateScripts/createFinaldatatransfer.sql',hc, sqlc,spark)

    print "start type casting for date columns"
    baseDict = stringTotimestamp(baseDict, cols = ['updated', 'added'], formats = 'yyyy_MM_dd_hh_mm_ss', types = 'timestamp')
    print "end of type casting for date columns"

    if configOb.stage['updatehivetable'] == 1:
        print "Get a hive write back object to write back to hive"
        hivewritingback = hivewriteback(spark = spark)
        hivewritingback.setTablename(tableName = table_name)
        #if configOb.append_in_hive is 0:
        #    hivewritingback.insertIntopartitiontable(partitionFiles = listOffiles , dictRdd = baseDict, append = 0, table_name = table_name)
        #else:
        hivewritingback.insertIntobucketedtable(partitionFiles = listOffiles, dictRdd = baseDict,
                                                append = 0 if configOb.append_in_hive == 0 else 1,
                                                table_name = table_name, numberOfbus = 4,
                                                cols = ["business_id", "partitioner"], hc = hc)
        #writebackTohive(baseDict,append = 0 if appendMode is 0 else 1 )
        print "End of transfer of data into hive internal table"

    print "Table has been updated and written back successfully into hive"
    return "Successful completion of posoutlet table update and written back to hdfs"
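# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): a minimal call
# into updateOdsposoutlet. It assumes the caller has already loaded the new
# snapshot data, the current base dictionary, the itemid -> partition lookup
# and a configuration object from the surrounding pipeline; every keyword
# simply mirrors a parameter defined above.
def _example_update_odsposoutlet(snapshot_df, base_df, itemid_df, files, config, sc):
    return updateOdsposoutlet(snapshotRdd = snapshot_df,
                              baseDict = base_df,
                              itemidRdd = itemid_df,
                              fileList = files,
                              listOffiles = files,
                              configOb = config,
                              spark = sc,
                              writeTohdfs = 1)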
def startWorkingsnapshots(snapshotRdd = None, baseDict = None, spark = None, ranges = 2, process_dict = 1,
                          dict_hdfs = 0, dict_hive = 1, writebackType = 0, debug = 0, fileList = None,
                          lastFilenumber = None, table_name = None, hdfs_output = '/npd/s_test2/uniqueOdsitems/',
                          writeTohdfs = 0, append_in_hive = 0, updatehivetable = 0):
    #quiet_logs(sc_ = spark)
    hc, sqlc = hiveInit(sc_ = spark)
    snapshotIndex = 1
    totalsnapShots = sqlc.createDataFrame(spark.emptyRDD(), StructType([]))

    if fileList is None or len(fileList) == 0:
        print "snapshot file list is empty, exiting"
        sys.exit(0)

    print("Get the filtered files to read")
    needToread, last_file_num, fn = getLastfilenumber(fileList)

    if debug == 1:
        snapshot_size = snapshotRdd.count()
        print("Total data inside the snapshot rdd: " + str(snapshot_size))
        print "Total data inside base dict: " + str(baseDict.count())

    if snapshotRdd.rdd.isEmpty():
        print("Calling off the operation: the snapshot is empty, nothing to work on")
        print("check the last-file number; all the files might have been processed already")
        print "application is exiting gracefully ....."
        sys.exit(0)

    print("Start detecting overlap data and return uniques")
    final_rdd = startOverlapdetector(snapshotRdd, ['src/main/python/dictionary/fileSource/hivecreateScripts/createExternalTable.sql'], sqlc, hc, spark)
    baseDict = startOverlapdetector(baseDict, ['src/main/python/dictionary/fileSource/hivecreateScripts/createExttabledictbase.sql'], sqlc, hc, spark) if process_dict == 1 else baseDict
    print "End of overlap detection of the given snapshot rdd"

    if final_rdd.rdd.isEmpty():
        print "snapshot rdd is empty"
        sys.exit(0)

    print("Type cast the updated and added date column")
    final_rdd = final_rdd.withColumn("updated", final_rdd["updated"].cast("string"))
    final_rdd = final_rdd.withColumn("added", final_rdd["added"].cast("string"))

    print("read the base dictionary")
    ########### Reading unique base dictionary using spark csv reader ########################
    ################# reading unique base dictionary using hive external table ###############
    #baseDict = hiveExecutecommands(line = " select * from dqdictionaryhivedb.mainTemptableextpersist",hive_context = hc )
    if debug == 1:
        print("base dictionary size " + str(baseDict.count()))
        print("new add size: " + str(final_rdd.count()))
        print("final_rdd size before left anti join " + str(final_rdd.count()))

    print("perform left anti join on itemid to retrieve unique itemid based records")
    baseDict = baseDict.withColumn("itemid", baseDict["itemid"].cast("long"))
    final_rdd = final_rdd.withColumn("itemid", final_rdd["itemid"].cast("long"))
    #final_rdd = final_rdd.withColumn("added",to_timestamp("added","yyyy_MM_dd hh_mm_ss"))
    #final_rdd = final_rdd.withColumn("updated",to_timestamp("updated","yyyy_MM_dd hh_mm_ss"))
    #final_rdd = final_rdd.withColumn("outletitem_map_change_date",to_timestamp("outletitem_map_change_date","yyyy_MM_dd hh_mm_ss"))
    baseDict.persist()
    final_rdd.persist()

    if debug == 1:
        print "size of the base dictionary after adding the partitioner column: " + str(baseDict.count())

    # Keep only base rows whose itemid is not present in the new snapshot data.
    baseDict = baseDict.alias("basetemp").join(final_rdd.alias("finalrddtemp"), (psf.col("basetemp.itemid") == psf.col("finalrddtemp.itemid")), "leftanti")
    #baseDict = baseDict.alias("basetemp").join(final_rdd.alias("finalrddtemp"),condition_list,"leftanti")
    #droplist = ['vitemid']
    #baseDict = baseDict.select([column for column in baseDict.columns if column not in droplist])
    if debug == 1:
        print("final base dict size after left anti join " + str(baseDict.count()))

    baseDict = baseDict.unionAll(final_rdd) if append_in_hive == 0 else baseDict
    if debug == 1:
        print("final base dict size after left anti join and union of new add all " + str(baseDict.count()))
    #baseDict = baseDict.unionAll(final_rdd)

    if writeTohdfs == 1:
        deletePath(hdfs_output, sc_ = spark)
        writeDown(baseDict, hdfs_output)

    print "Update lastfile read number"
    getLastfilenumber(rw = 1, file_num_ = last_file_num)
    #hiveExecutecommands(line = "drop table dqdictionaryhivedb.uniqueodsitems_int",hive_context = hc)

    print "Start writing back to hive table"
    baseDict = stringTotimestamp(baseDict, columns = ['added', 'updated'], types = 'timestamp')
    #if append_in_hive is 0 :
    #    writebackTohive(baseDict, writebackType = writebackType, table_name = table_name )
    if updatehivetable == 1:
        print "Get a hive write back object to write back to hive"
        hivewritingback = hivewriteback(spark = spark)
        hivewritingback.setTablename(tableName = table_name)
        #hivewritingback.insertIntopartitiontable(partitionFiles = listOffiles , dictRdd = baseDict, table_name = table_name)
        # NOTE: the original passed an undefined name `listOffiles` here; fileList is assumed.
        hivewritingback.insertIntobucketedtable(partitionFiles = fileList, dictRdd = baseDict,
                                                append = 0 if append_in_hive == 0 else 1,
                                                table_name = table_name, numberOfbus = 4,
                                                cols = ["businessid"], hc = hc)
        # print "End of hive transfer"
    return "Successful update of odspositem table"
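# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): driving
# startWorkingsnapshots for the ODS items update. The fully qualified Hive
# table name and the flag values are illustrative and would normally come
# from the surrounding configuration, as in updateOdsposoutlet above.
def _example_start_working_snapshots(snapshot_df, base_df, files, table, sc):
    return startWorkingsnapshots(snapshotRdd = snapshot_df,
                                 baseDict = base_df,
                                 fileList = files,
                                 table_name = table,
                                 spark = sc,
                                 writeTohdfs = 1,
                                 updatehivetable = 1)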