def gngrph_srch_metarepo_qry_fetch_api(gnsrch_ops, gnp_spark, srchfilter):
    sqlst = "select * from gnmetanodes WHERE gnnodetype='GNMetaNode' OR gnnodetype='GNMetaNodeAttr'"
    if (gnsrch_ops.srch_init_meta_status() == 0):
        rj = {
            "nodes": [],
            "edges": [],
            "nodelen": 0,
            "edgelen": 0,
            "status": "ERROR"
        }
        return rj

    gnsrch_log("GnSrchOps: searching for metarepo srchfilter :" + srchfilter)
    #if (srchfilter == ""):
    #    (mnJson, meJson) = gnsrch_ops.gngrph_metarepo_get()
    #else:
    nodemode = 2
    (mnJson, meJson) = gngrph_srch_metarepo_qry_execute(gnsrch_ops, gnp_spark,
                                                        sqlst, nodemode)
    rJ = {}
    rJ["nodes"] = mnJson
    rJ["edges"] = meJson
    rJ["nodelen"] = len(rJ["nodes"])
    rJ["edgelen"] = len(rJ["edges"])
    rJ["status"] = "SUCCESS"
    gnsrch_log('GnSrchOps: Meta nodes and edges and derived nodes fetched')
    #print(rJ)
    return (rJ)
def datanode_flatten_jsonfields(self, baseDataNodeDF, spk):
    gnsrch_log('GnStaticFOps: Flattening datanode fields: ')
    try:
        # First flatten gndatanodeobj
        n1_df = spk.read.json(
            baseDataNodeDF.rdd.map(lambda row: row.gndatanodeobj))
        n1_df.show(2)
        n1_schema = n1_df.schema
        gnsrch_log('GnStaticFOps: Flattening n1_schema datanodeobj ')
        gnsrch_log(n1_schema)
        n2_schema = spk.read.json(
            baseDataNodeDF.rdd.map(lambda row: row.gndatanodeprop)).schema
        gnsrch_log(' Flattening n2_schema datanodeprop ')
        gnsrch_log(n2_schema)
        datanodeFlattenDF = baseDataNodeDF.withColumn("gndatanodeobj", from_json("gndatanodeobj", n1_schema)) \
                                          .withColumn("gndatanodeprop", from_json("gndatanodeprop", n2_schema)) \
                                          .select(col("gnnodeid"), col("gnnodetype"),
                                                  col("gnmetanodeid"),
                                                  col("gndatanodeobj.*"),
                                                  col("gndatanodeprop.*"),
                                                  col("uptmstmp"))
    except Exception as err:
        gnsrch_log_err('GnStaticFOps: Flatten fields ran into Exception ' + str(err))
        datanodeFlattenDF = None
    return datanodeFlattenDF
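# --- Example (illustration only, not part of the module) ---
# Minimal sketch of the flatten-by-inferred-schema pattern used above: a JSON
# string column is parsed with a schema inferred from the column's own
# contents, then exploded into top-level columns. The session name, sample
# row, and embedded field names here are hypothetical.
#
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import from_json, col
#
# spark = SparkSession.builder.appName("flatten_demo").getOrCreate()
# df = spark.createDataFrame(
#     [(1, '{"city": "Austin", "zip": "78701"}')],
#     ["gnnodeid", "gndatanodeobj"])
# # infer the schema of the embedded JSON from the string column itself
# obj_schema = spark.read.json(
#     df.rdd.map(lambda row: row.gndatanodeobj)).schema
# flat = df.withColumn("gndatanodeobj",
#                      from_json("gndatanodeobj", obj_schema)) \
#          .select(col("gnnodeid"), col("gndatanodeobj.*"))
# flat.show()   # columns: gnnodeid, city, zip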
def gngrph_srch_metarepo_qry_fetch_nodes_api(gnsrch_ops, gnp_spark, srchfilter):
    sqlst = "select * from gnmetanodes WHERE gnnodetype='GNMetaNode' OR gnnodetype='GNMetaNodeAttr'"
    gnsrch_log("GnSrchOps: searching metarepo fetch srchfilter " + srchfilter)
    if (gnsrch_ops.srch_init_meta_status() == 0):
        rj = {}
        rj["nodes"] = []
        rj["edges"] = []
        rj["nodelen"] = 0
        rj["edgelen"] = 0
        rj["status"] = "ERROR"
        return rj

    if (srchfilter == ""):
        (mnJson, meJson) = gnsrch_ops.gngrph_metarepo_get()
    else:
        nodesonly = 1
        (mnJson, meJson) = gngrph_srch_metarepo_qry_execute(gnsrch_ops, gnp_spark,
                                                            sqlst, nodesonly)
    rJ = {}
    rJ["nodes"] = mnJson
    rJ["edges"] = meJson
    rJ["nodelen"] = len(rJ["nodes"])
    rJ["edgelen"] = len(rJ["edges"])
    rJ["status"] = "SUCCESS"
    gnsrch_log('GnSrchOps: Meta nodes fetched')
    #print(rJ)
    return (rJ)
def gngrph_metarepo_remap(self, nodemode):
    gnsrch_log('GnSrchOps: Remapping meta nodes and edges ' + str(nodemode))
    #if (nodemode == 2):
    # clear all three caches before rebuilding
    self.__gnmetaNodesDF_cached = None
    self.__gnmetaEdgesDF_cached = None
    self.__gnmetaDerivedNodesDF_cached = None
    self.gngraph_meta_nodes_edges_setup()
    gnsrch_log('GnSrchOps: Remapping meta nodes and edges completed ')
def get_bizrule_metainfo(self, bizrid, spk):
    if spk is None:
        gnsrch_log_err('GnStaticFOps: spark is none ')
        return {}   # bail out instead of calling spk.sql on None
    sqlstr = "SELECT * FROM gnbizrules where gnrelid='" + bizrid + "'"
    gnsrch_log('GnStaticFOps: get_bizrule_metainfo: sqlstr ' + sqlstr)
    bizrEnt = spk.sql(sqlstr)
    #bizrEnt.show()
    jobj = json.loads(bizrEnt.toJSON().first())
    return jobj
def gngraph_config_init(self, gngrp_datadir, accessmode, fargs, dbargs):
    # check if the config_file exists
    gngraph_path = gngrp_datadir + "/gngraph"
    self.__gncfg_accessmode = accessmode
    if (self.__gncfg_accessmode['sfmode'] == 1):
        self.__gncfg["staticfiles"] = 1
    if (self.__gncfg_accessmode['dbmode'] == 1):
        self.__gncfg["gdbflag"] = 1
    self.__gncfg["gngraph_root_dir"] = gngraph_path
    self.__gncfg["gngraph_cfg_dir"] = gngraph_path + "/config"
    self.__gncfg_fargs["gngraph_data_dir"] = self.__gncfg["gngraph_root_dir"] + "/data"
    self.__gncfg["gngraph_cfg_filename"] = "gngraph_config.json"
    self.__gncfg_fargs["gngraph_metanode_filename"] = "gnmetanodes.json"
    self.__gncfg_fargs["gngraph_metanodeattr_filename"] = "gnmetanodeattrs.json"
    self.__gncfg_fargs["gngraph_edge_filename"] = "gnmetaedges.json"
    self.__gncfg_fargs["gngraph_node_filename"] = "gnmetanodes.json"

    if (self.__gncfg_accessmode['sfmode'] == 1):
        self.__gngrp_sfops = GNGraphSrchStaticFileOps(
            self.__gncfg["gngraph_root_dir"], self.__spark)
        self.meta_edge_filepath = self.__gncfg_fargs["gngraph_data_dir"] + \
            "/" + self.__gncfg_fargs["gngraph_edge_filename"]
        self.meta_node_filepath = self.__gncfg_fargs["gngraph_data_dir"] + \
            "/" + self.__gncfg_fargs["gngraph_node_filename"]

    if (self.__gncfg_accessmode['dbmode'] == 1):
        self.__gncfg_dbargs = dbargs
        gdb_creds = gn_pgresdb_getconfiguration(
            self.__gncfg_dbargs["gdbcredsfpath"])
        gnsrch_log('GnSrchOps: reading dbmode creds from ' +
                   self.__gncfg_dbargs["gdbcredsfpath"])
        # avoid printing gdb_creds; it carries the db password
        self.__gngrp_dbops = GNGraphSrchPgresDBOps.from_args(
            gdb_creds["serverIP"], gdb_creds["serverPort"],
            gdb_creds["username"], gdb_creds["password"],
            gdb_creds["dbname"], self.__spark)

    gnsrch_log('GnSrchOps: access mode ')
    gnsrch_log(self.__gncfg_accessmode)
    if self.__gncfg_accessmode['dbmode'] == 1:
        self.__gncfg_readmode = 'dbmode'
    else:
        self.__gncfg_readmode = 'sfmode'
    gnsrch_log('GnSrchOps: read mode is set ' + self.__gncfg_readmode)
def map_gnmeta_nodes_edges(self):
    self.__gnmetaNodesList = []
    gnsrch_log('GnSrchOps: map and cache gnmeta nodes and edges ')
    sqlst = "select * from gnmetanodes WHERE gnnodetype='GNMetaNode' OR gnnodetype='GNMetaNodeAttr'"
    (resNodesDF, nJson, status) = self.gngraph_execute_sqlqry(sqlst)
    if (status == "ERROR"):
        gnsrch_log('GnSrchOps: mapping metanodes and edges failed due to query failure ')
        return (self.__gnmetaNodesList)
    self.__gnmetaNodesDF_cached = resNodesDF
    (eDF, dnDF) = self.gngraph_metarepo_qry_getedges(
        self.__gnmetaNodesDF_cached, sqlst, 0)
    self.__gnmetaEdgesDF_cached = eDF
    self.__gnmetaDerivedNodesDF_cached = dnDF
    self.__gnmetaNodesDF_cached.show(5)
    self.__gnmetaEdgesDF_cached.show(5)
    ### get list of GNMetaNodes
    nDF = self.__gnmetaNodesDF_cached
    n1DF = nDF.filter(col("gnnodetype") == "GNMetaNode") \
              .withColumnRenamed("gnnodeid", "id") \
              .withColumnRenamed("gnnodetype", "nodetype") \
              .withColumnRenamed("gnnodename", "nodename")
    self.__gnmetaNodesList = n1DF.toJSON().collect()
    gnsrch_log('GnSrchOps: MetaNodes List ')
    gnsrch_log(self.__gnmetaNodesList)
def gngrph_srch_datarepo_qry_fetch_api(gnsrch_ops, gnp_spark, sqlst, nodemode, lnodes):
    gnsrch_log('GnSrchOps: datanodes qry fetch ')
    (retval, msg, sql_formatted) = gnsrch_ops.gngraph_search_setup(sqlst, lnodes)
    if (retval < 0):
        gnsrch_log('GnSrchOps: search failed with msg ' + msg)
        rJ = {}
        rJ["nodes"] = []
        rJ["edges"] = []
        rJ["nodelen"] = 0
        rJ["edgelen"] = 0
        rJ["status"] = "ERROR"
        rJ["errmsg"] = msg
        return (rJ)

    (nJson, eJson) = gngrph_srch_datarepo_qry_execute(gnsrch_ops, sql_formatted,
                                                      nodemode)
    rJ = {}
    rJ["nodes"] = nJson
    rJ["nodelen"] = len(rJ["nodes"])
    rJ["edges"] = eJson
    rJ["edgelen"] = len(rJ["edges"])
    rJ["status"] = "SUCCESS"
    gnsrch_log('GnSrchOps: datanodes qry enumerated ' + str(rJ["nodelen"]))
    gnsrch_log('GnSrchOps: datanodes edges enumerated ' + str(rJ["edgelen"]))
    return (rJ)
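# --- Example (illustration only) ---
# Hedged sketch of calling the datarepo fetch API above. `gnsrch_ops` is
# assumed to be the object returned by gngrph_search_init(), `gnp_spark` an
# active SparkSession, and "Customer" a node already registered in the
# metarepo; the nodemode and lnodes values are hypothetical.
def example_datarepo_fetch(gnsrch_ops, gnp_spark):
    sqlst = "SELECT * FROM Customer"
    # nodemode: 1 = nodes only, 2 = nodes+edges, 3 = nodes+edges+derived nodes
    rJ = gngrph_srch_datarepo_qry_fetch_api(gnsrch_ops, gnp_spark, sqlst,
                                            nodemode=2, lnodes=100)
    if rJ["status"] == "SUCCESS":
        gnsrch_log("example: nodes " + str(rJ["nodelen"]) +
                   " edges " + str(rJ["edgelen"]))
    else:
        gnsrch_log("example: search failed " + rJ.get("errmsg", ""))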
def gnspk_process_request_thrfn(gngrph_cls, gnp_spark, req):
    if (req["cmd"] == "metasearch"):
        srchfilter = req["args"]
        if (srchfilter == "null"):
            srchfilter = ""
        rJ = gngrph_srch_metarepo_qry_fetch_api(gngrph_cls, gnp_spark, srchfilter)
        resp = {}
        resp["cmd"] = req["cmd"]
        resp["status"] = "SUCCESS"
        resp["data"] = rJ
        return resp

    if (req["cmd"] == "metanodes"):
        rJ = gngrph_cls.gngrph_metarepo_nodes_get()
        resp = {}
        resp["cmd"] = req["cmd"]
        resp["status"] = "SUCCESS"
        resp["data"] = rJ
        gnsrch_log('GnSrchOps: metanodes get resp')
        gnsrch_log(resp)
        return resp

    if (req["cmd"] == "metaremap"):
        nodemode = req["nodemode"]
        rJ = gngrph_cls.gngrph_metarepo_remap(nodemode)
        resp = {}
        resp["cmd"] = req["cmd"]
        resp["status"] = "SUCCESS"
        resp["data"] = rJ
        gnsrch_log('GnSrchOps: meta repo remap get resp')
        gnsrch_log(resp)
        return resp

    ## datasearch
    if (req["cmd"] == "datasearch"):
        sqlst = req["args"]
        if (sqlst == "null"):
            # no search string supplied; fail the request
            resp = {}
            resp["cmd"] = req["cmd"]
            resp["status"] = "ERROR"
            resp["data"] = []
            resp["errmsg"] = "No Search string"
            return resp
        ### NodeMode: 1 Nodes only, 2 Nodes+Edges, 3 Nodes+Edges+Derived nodes
        nodemode = req["nodemode"]
        gnsrch_log(req)
        lnodes = req["lnodes"]
        rJ = gngrph_srch_datarepo_qry_fetch_api(gngrph_cls, gnp_spark, sqlst,
                                                nodemode, lnodes)
        resp = {}
        resp["cmd"] = req["cmd"]
        resp["status"] = "SUCCESS"
        resp["data"] = rJ
        return resp
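# --- Example (illustration only) ---
# The request dicts the dispatcher above handles, sketched with hypothetical
# values. "cmd" selects the operation; "args" carries the search filter or
# SQL string ("null" means empty); nodemode/lnodes apply to datasearch.
example_requests = [
    {"cmd": "metasearch", "args": "null"},
    {"cmd": "metanodes"},
    {"cmd": "metaremap", "nodemode": 2},
    {"cmd": "datasearch", "args": "SELECT * FROM Customer",
     "nodemode": 2, "lnodes": 100},
]
# A successful response mirrors the request command, e.g.:
#   {"cmd": "datasearch", "status": "SUCCESS",
#    "data": {"nodes": [...], "edges": [...],
#             "nodelen": ..., "edgelen": ..., "status": "SUCCESS"}}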
def gngraph_execute_sqlqry(self, sqlst):
    try:
        resDF = self.__spark.sql(sqlst)
        ##resDF.show(10)
        resJson = {}
        status = "SUCCESS"
        gnsrch_log('GnSrchOps: executed sql :' + sqlst)
    except Exception as err:
        # Exception also covers pyspark's AnalysisException
        gnsrch_log('GnSrchOps: ran into Exception:' + str(err))
        resDF = None
        resJson = {}
        status = "ERROR"
    return (resDF, resJson, status)
def get_datanode_mapped_df(self, node_name, bizdomain, spk):
    # map the datanode table to a spark dataframe
    gnsrch_log("GnSrchPgresDBOps: map " + node_name + " domain " + bizdomain + " ")
    dnodeDF = self.gngrph_tblmap_dataframe(self.__gdb_dbname, bizdomain,
                                           node_name, spk)
    retDF = None
    if dnodeDF is not None:
        # flatten gndatanodeprop and gndatanodeobj (actual dataset attributes)
        retDF = self.datanode_flatten_jsonfields(dnodeDF, spk)
        if retDF is not None:
            # also map the node to a tempview under its nodename
            retDF.createOrReplaceTempView(node_name)
    return retDF
def get_metanodes_mapped_df(self):
    if self.__gncfg_readmode == "dbmode":
        self.__gnmetaNodesDF_cached = self.__gngrp_dbops.gnmetanodes_map_df(self.__spark)
    elif self.__gncfg_readmode == "sfmode":
        self.__gnmetaNodesDF_cached = self.__gngrp_sfops.gnmetanodes_map_df(self.__spark)
    else:
        self.__gnmetaNodesDF_cached = None

    if (self.__gnmetaNodesDF_cached is None):
        gnsrch_log("GnGrphSrchOps: metaNodesDF is not mapped")
        self.__init_meta = 0
    else:
        gnsrch_log("GnSrchOps:" + self.__gncfg_readmode + " metanodes are mapped ")
        self.__init_meta = 1
    return
def gnmetaedges_map_df(self, spk):
    gnsrch_log('GnSrchPgresDBOps: map metaedges dataframe')
    metadb_connstr = "jdbc:postgresql://" + self.__gdb_dbserver + ":" + \
        self.__gdb_dbport + "/" + self.__gdb_dbname
    metaedge_tbl = self.__gdb_metadb_schema + "." + self.__gdb_metaedges_tbl
    gnsrch_log('GnSrchPgresDBOps: connecting database table: ' + metaedge_tbl)
    try:
        metaEdgeDF = spk.read \
                        .format("jdbc") \
                        .option("url", metadb_connstr) \
                        .option("dbtable", metaedge_tbl) \
                        .option("user", self.__gdb_dbuser) \
                        .option("password", self.__gdb_dbpasswd) \
                        .option("driver", "org.postgresql.Driver") \
                        .load()
        ###metaEdgeDF.show(2)
        # also flatten the json edge-property object
        gnsrch_log('GnSrchPgresDBOps: metaedgeDF mapped ')
        edge_schema = spk.read.json(
            metaEdgeDF.rdd.map(lambda row: row.gnedgeprop)).schema
        self.__gnmetaEdgeDF = metaEdgeDF.withColumn(
            "gnedgeprop",
            from_json("gnedgeprop", edge_schema)).select(col('gnedgeid'),
                                                         col('gnedgename'),
                                                         col('gnedgetype'),
                                                         col('gnsrcnodeid'),
                                                         col('gntgtnodeid'),
                                                         col('gnedgeprop.*'))
        self.__gnmetaEdgeDF.createOrReplaceTempView("gnmetaedges")
        gnsrch_log('GnSrchPgresDBOps: mapped dataframe COMPLETED ')
    except Exception as error:
        gnsrch_log('GnSrchPgresDBOps: ERROR failed')
        gnsrch_log(error)
        self.__gnmetaEdgeDF = None
    return self.__gnmetaEdgeDF
def gngrph_metarepo_get(self):
    gnsrch_log('GnSrchOps: getting metarepo information ')
    if ((self.__gnmetaNodesDF_cached is None) and
            (self.__gnmetaEdgesDF_cached is None)):
        # map gnnodes and edges
        self.map_gnmeta_nodes_edges()
    nnodes = self.__gnmetaNodesDF_cached.count()
    njson = {}
    ejson = {}
    if (nnodes > 0):
        nDF = self.__gnmetaNodesDF_cached.select(col("gnnodeid").alias("id"),
                                                 col("gnnodetype").alias("nodetype"),
                                                 col("gnnodename").alias("nodename"))
        njson = nDF.toJSON().collect()
    nEdges = self.__gnmetaEdgesDF_cached.count()
    if (nEdges > 0):
        eResDF = self.__gnmetaEdgesDF_cached.select(col("gnedgeid").alias("id"),
                                                    col("gnedgetype").alias("type"),
                                                    col("gnsrcnodeid").alias("source"),
                                                    col("gntgtnodeid").alias("target"))
        ejson = eResDF.toJSON().collect()
    if (self.__gnmetaDerivedNodesDF_cached is not None):
        nDNodes = self.__gnmetaDerivedNodesDF_cached.count()
    else:
        nDNodes = 0
    if (nDNodes > 0):
        dnDF = self.__gnmetaDerivedNodesDF_cached.select(col("gnnodeid").alias("id"),
                                                         col("gnnodetype").alias("nodetype"),
                                                         col("gnnodename").alias("nodename"))
        rDF = nDF.unionByName(dnDF, allowMissingColumns=True)
        njson = rDF.toJSON().collect()
    return (njson, ejson)
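# --- Example (illustration only) ---
# Sketch of consuming the (njson, ejson) pair returned above. Each element is
# a JSON string produced by DataFrame.toJSON().collect(), roughly
#   njson: ['{"id":1001,"nodetype":"GNMetaNode","nodename":"Customer"}', ...]
#   ejson: ['{"id":5001,"type":"GNMetaEdge","source":1001,"target":1002}', ...]
# (field values hypothetical); decoding to dicts is one line per list.
import json

def example_decode_metarepo(njson, ejson):
    nodes = [json.loads(s) for s in njson]
    edges = [json.loads(s) for s in ejson]
    return nodes, edges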
def gngrph_srch_datarepo_qry_execute(gnsrch_ops, sqlst, nodemode):
    ejson = {}
    njson = {}
    (resNodeDF, nodesjson, status) = gnsrch_ops.gngraph_execute_sqlqry(sqlst)
    if (status == "ERROR"):
        gnsrch_log('GnSrchOps: datarepo qry failed ')
        return (njson, ejson)
    #resNodeDF.show(10)
    nodeCount = resNodeDF.count()
    gnsrch_log('GnSrchOps: datanodes fetched #nodes ' + str(nodeCount))
    if (nodeCount > 0):
        resNodeDF.show(5)
        nDF = resNodeDF \
            .withColumnRenamed("gnnodeid", "id") \
            .withColumnRenamed("gnnodetype", "nodetype") \
            .withColumnRenamed("gnlabel", "nodename")
        njson = nDF.toJSON().collect()
        if (nodemode > 1):
            (eDF, dnDF) = gnsrch_ops.gngraph_datarepo_qry_getedges(
                resNodeDF, sqlst, nodemode)
            nEdges = eDF.count()
            if (dnDF is not None):
                nDNodes = dnDF.count()
            else:
                nDNodes = 0
            if (nEdges > 0):
                eResDF = eDF.select(col("gnedgeid").alias("id"),
                                    col("gnedgetype").alias("type"),
                                    col("gnsrcnodeid").alias("source"),
                                    col("gntgtnodeid").alias("target"))
                ejson = eResDF.toJSON().collect()
            if (nDNodes > 0):
                dnDF1 = dnDF.select(col("gnnodeid").alias("id"),
                                    col("gnnodetype").alias("nodetype"),
                                    col("gnnodename").alias("nodename"))
                # Combine the derived nodes with the source nodes
                rDF = nDF.unionByName(dnDF1, allowMissingColumns=True)
                njson = rDF.toJSON().collect()
    gnsrch_log('GnSrchOps: datanodes and edges qry complete SUCCESS')
    return (njson, ejson)
def test_datarepo_qry_fn():
    gnsrch_log("GnSrchOps: Test gnsearch datarepo qry ")
    app_name = "gngraph"
    gndata_folder = gnRootDir + "/gndata"
    gngraph_creds_folder = gnRootDir + "/creds/gngraph"
    sqlst = "SELECT * from Customer LIMIT 10000"

    ### Set up spark session
    gnp_spark = SparkSession.builder.appName(app_name).getOrCreate()
    accessmode = {'sfmode': 1, 'dbmode': 0}
    gngrph_cls = gngrph_search_init(gnp_spark, gndata_folder,
                                    gngraph_creds_folder, accessmode)
    nodesonly = 1
    lnodes = 10000   # max records to fetch; matches the LIMIT in sqlst
    rJ = gngrph_srch_datarepo_qry_fetch_api(gngrph_cls, gnp_spark, sqlst,
                                            nodesonly, lnodes)
    if (rJ["status"] == "ERROR"):
        gnsrch_log('GnSrchOps: Testing search query failed ')
        gnsrch_log('GnSrchOps: Err msg: ' + rJ["errmsg"])
        return
    rfile = "dnodes.json"
    with open(rfile, 'w') as fp:
        json.dump(rJ["nodes"], fp)
    efile = "dedges.json"
    with open(efile, "w") as fp:
        json.dump(rJ["edges"], fp)
def __init__(self, gngrp_datadir, accessmode, fargs, dbargs, sp):
    ### Set up config init routine
    self.__gncfg = {}
    self.__gncfg_fargs = {}
    self.__spark = sp
    self.__entlist = []
    self.__init_data = 0
    self.__init_meta = 0
    self.__gngrp_dnDFList = []
    self.__gnmetaNodesDF_cached = None
    self.__gnmetaEdgesDF_cached = None
    self.__gnmetaDerivedNodesDF_cached = None
    self.__gnmetaNodesList = []
    gnsrch_log('GnSrchOps: ########### SearchOps Init ##############')
    self.gngraph_config_init(gngrp_datadir, accessmode, fargs, dbargs)
    gnsrch_log("GnSrchOps: ### Search configuration complete default mode " +
               self.__gncfg_readmode)
    self.gngraph_meta_nodes_edges_setup()
    gnsrch_log("GnSrchOps: #### Search Initialization complete SUCCESS #### ")
def gnmetanodes_map_df(self, spk):
    # map the metanode file to a spark dataframe
    mnodefile = "gnmetanodes.json"
    mnodes_fpath = self.gndata_graph_data_folder + "/" + mnodefile
    gnsrch_log('GnStaticFOps: Mapping gnmetanodes ')
    if (self.__gnmetaNodes_mapped == 1):
        gnsrch_log('GnStaticFOps: gnmetanodes are already mapped ')
        return self.__gnmetaNodeDF
    if path.exists(mnodes_fpath):
        metaNodeDF = spk.read.json(mnodes_fpath)
        metaNodeDF.show(2)
        mnode_schema = spk.read.json(
            metaNodeDF.rdd.map(lambda row: row.gnnodeprop)).schema
        #gnsrch_log(mnode_schema)
        self.__gnmetaNodeDF = metaNodeDF.withColumn(
            "gnnodeprop",
            from_json("gnnodeprop", mnode_schema)).select(col('gnnodeid'),
                                                          col('gnnodename'),
                                                          col('gnnodetype'),
                                                          col('gnnodeprop.*'))
        self.__gnmetaNodeDF.show(2)
        # also map the node to a tempview
        self.__gnmetaNodeDF.createOrReplaceTempView("gnmetanodes")
        self.__gnmetaNodes_mapped = 1
        gnsrch_log("GnStaticFOps: gnmetanodes are mapped to df SUCCESS")
    else:
        self.__gnmetaNodeDF = None
    return self.__gnmetaNodeDF
def gngrph_tblmap_dataframe(self, dbname, schema, tbl_name, spk):
    gnsrch_log('GnSrchPgresDBOps: map node table to dataframe')
    db_connstr = "jdbc:postgresql://" + self.__gdb_dbserver + ":" + \
        self.__gdb_dbport + "/" + dbname
    node_tbl = schema + '."' + tbl_name + '"'
    gnsrch_log('GnSrchPgresDBOps: dbconnstr:' + db_connstr)
    gnsrch_log('GnSrchPgresDBOps: node_tbl ' + node_tbl)
    try:
        retDF = spk.read \
                   .format("jdbc") \
                   .option("url", db_connstr) \
                   .option("dbtable", node_tbl) \
                   .option("user", self.__gdb_dbuser) \
                   .option("password", self.__gdb_dbpasswd) \
                   .option("driver", "org.postgresql.Driver") \
                   .load()
        ###retDF.show(2)
        dnodeDF = self.datanode_flatten_jsonfields(retDF, spk)
        retDF.createOrReplaceTempView(tbl_name)
        gnsrch_log('GnSrchPgresDBOps: ' + tbl_name + ' mapped dataframe')
    except Exception as error:
        gnsrch_log('GnSrchPgresDBOps: ERROR failed')
        gnsrch_log(error)
        retDF = None
    return retDF
def gnmetaedges_map_df(self, spk):
    # map the metaedge file to a spark dataframe
    edgefile = "gnmetaedges.json"
    edge_fpath = self.gndata_graph_data_folder + "/" + edgefile
    gnsrch_log('GnStaticFOps: Mapping gnmetaedges ')
    # if the df is already mapped just return it
    if (self.__gnmetaEdges_mapped == 1):
        gnsrch_log('GnStaticFOps: gnmetaedges are already mapped ')
        return self.__gnmetaEdgeDF
    if path.exists(edge_fpath):
        metaEdgeDF = spk.read.json(edge_fpath)
        metaEdgeDF.show(2)
        edge_schema = spk.read.json(
            metaEdgeDF.rdd.map(lambda row: row.gnedgeprop)).schema
        gnsrch_log('GnStaticFOps: showing gnmetaedge schema ')
        gnsrch_log(edge_schema)
        self.__gnmetaEdgeDF = metaEdgeDF.withColumn(
            "gnedgeprop",
            from_json("gnedgeprop", edge_schema)).select(col('gnedgeid'),
                                                         col('gnedgename'),
                                                         col('gnedgetype'),
                                                         col('gnsrcnodeid'),
                                                         col('gntgtnodeid'),
                                                         col('gnedgeprop.*'))
        self.__gnmetaEdgeDF.show(2)
        # also map the edges to a tempview
        self.__gnmetaEdgeDF.createOrReplaceTempView("gnmetaedges")
        self.__gnmetaEdges_mapped = 1
        gnsrch_log('GnStaticFOps: gnmetaedges are mapped to df SUCCESS')
    else:
        self.__gnmetaEdgeDF = None
        self.__gnmetaEdges_mapped = 0
    return self.__gnmetaEdgeDF
def get_metanode_info(self, node, spk):
    jobj = {}
    if spk is None:
        gnsrch_log_err('GnStaticFOps: spark session is not found')
        return jobj
    if self.__init_failed == 1:
        gnsrch_log_err('GnStaticFOps: Static Files Init failed ')
        return jobj
    sqlstr = "SELECT * FROM gnmetanodes where gnnodename='" + node + "'"
    gnsrch_log('GnStaticFOps: getting metanode info ' + node + ' sqlstr ' + sqlstr)
    nodeEnt = spk.sql(sqlstr)
    nodeCount = nodeEnt.count()
    if (nodeCount == 0):
        gnsrch_log('GnStaticFOps: node ' + node + ' does not exist')
        return jobj
    ##nodeEnt.show()
    jobj = json.loads(nodeEnt.toJSON().first())
    return jobj
def gnspk_thread_main(gnRootDir, accessmode, req_q, resp_q):
    gnsrch_log('GnSrchOps: starting spark session thread ')
    app_name = "gngraph"
    gndata_folder = gnRootDir + "/gndata"
    gngraph_creds_folder = gnRootDir + "/creds/gngraph"
    gnsrch_log('GnSrchOpsThr: initializing spark session thread ')
    gnsrch_log(accessmode)
    conf = SparkConf()
    conf.set('spark.executor.memory', '4g')
    conf.set('spark.driver.memory', '4g')
    # apply the SparkConf to the builder so the memory settings take effect
    gnp_spark = SparkSession.builder.appName(app_name) \
                                    .config(conf=conf) \
                                    .getOrCreate()
    gnp_spark.sparkContext.setLogLevel("WARN")
    gngrph_cls = gngrph_search_init(gnp_spark, gndata_folder,
                                    gngraph_creds_folder, accessmode)
    ### spark session initialized; now wait for tasks
    while True:
        gnsrch_log('GnSrchOps: thread waiting for request ')
        req = req_q.get()
        if (req is None):
            gnsrch_log('GnSrchOps: empty request returned ')
            req_q.task_done()
            return
        resp = gnspk_process_request_thrfn(gngrph_cls, gnp_spark, req)
        # put the response on the output queue
        resp_q.put(resp)
        time.sleep(4)
        gnsrch_log('GnSrchOps: Processing of message done')
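# --- Example (illustration only) ---
# Hedged sketch of driving gnspk_thread_main through a request/response
# queue pair. The root dir and access mode are hypothetical; a None request
# is the shutdown signal the loop above honors.
import queue
import threading

def example_run_search_thread():
    req_q = queue.Queue()
    resp_q = queue.Queue()
    thr = threading.Thread(target=gnspk_thread_main,
                           args=("/opt/gngraph", {'sfmode': 1, 'dbmode': 0},
                                 req_q, resp_q))
    thr.start()
    req_q.put({"cmd": "metanodes"})
    resp = resp_q.get()        # blocks until the thread responds
    gnsrch_log(resp)
    req_q.put(None)            # signal the thread to exit
    thr.join()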
def gnmetanodes_map_dataframe(self, spk):
    gnsrch_log('GnSrchPgresDBOps: map metanode dataframe')
    metadb_connstr = "jdbc:postgresql://" + self.__gdb_dbserver + ":" + \
        self.__gdb_dbport + "/" + self.__gdb_dbname
    metanode_tbl = self.__gdb_metadb_schema + "." + self.__gdb_metanodes_tbl
    try:
        self.__gnmetaNodeDF = spk.read \
                                 .format("jdbc") \
                                 .option("url", metadb_connstr) \
                                 .option("dbtable", metanode_tbl) \
                                 .option("user", self.__gdb_dbuser) \
                                 .option("password", self.__gdb_dbpasswd) \
                                 .option("driver", "org.postgresql.Driver") \
                                 .load()
        ####self.__gnmetaNodeDF.show(2)
        self.__gnmetaNodeDF.createOrReplaceTempView("gnmetanodes")
        gnsrch_log('GnSrchPgresDBOps: mapped dataframe COMPLETED ')
    except Exception as error:
        gnsrch_log('GnSrchPgresDBOps: ERROR failed')
        gnsrch_log(error)
        self.__gnmetaNodeDF = None
def gngrph_search_init(gnp_spark, gndata_folder, gngraph_creds_folder, accessmode):
    gnsrch_log('GnSrchOps: ####################### search initialization ##############')
    gnsrch_log('GnSrchOps: init searchOps using spark session ')
    gdb_creds_filepath = gngraph_creds_folder  ###+"/gngraph_pgres_dbcreds.json"
    gdbargs = {}
    gdbargs["gdb"] = "pgres"
    gdbargs["gdbflag"] = 1
    gdbargs["gdbcredsfpath"] = gdb_creds_filepath
    gdbargs["gnmetaDB"] = "gngraph_db"
    gdbargs["gndataDB"] = "gngraph_db"
    gdbargs["staticfiles"] = 1
    gdbargs["staticfpath"] = gndata_folder + "/uploads"
    gdbargs["gndatafolder"] = gndata_folder
    fargs = {}
    fargs["gngraphfolder"] = gndata_folder + "/gngraph"
    fargs["gnmetanodesfname"] = "gnmetanodes.json"
    fargs["gnmetaedgesfname"] = "gnmetaedges.json"
    gnsrch_ops = GNGraphSearchOps(gndata_folder, accessmode, fargs, gdbargs,
                                  gnp_spark)
    ## Map metanodes and edges
    gnsrch_ops.get_metanodes_mapped_df()
    gnsrch_ops.get_metaedges_mapped_df()
    # meta status 0 signals a failed init (see srch_init_meta_status checks)
    if (gnsrch_ops.srch_init_meta_status() == 0):
        gnsrch_log_err('GnSrchOps: ERROR srchops init failed ')
        return None
    gnsrch_log('GnSrchOps: gngraph searchOps init COMPLETE ')
    return gnsrch_ops
def gngraph_datarepo_qry_getedges(self, dnodeDF, sqlst, nodemode):
    try:
        # first map gnedges
        gnsrch_log('GnSrchOps: datanodes querying for edges and derived nodes flags ')
        self.get_metaedges_mapped_df()
        self.get_metanodes_mapped_df()
        gnsrch_log('GnSrchOps: Enumerating edges for datanodes on join ')
        cond = [((self.__gnmetaEdgesDF_cached.gntgtnodeid == dnodeDF.gnnodeid) |
                 (self.__gnmetaEdgesDF_cached.gnsrcnodeid == dnodeDF.gnnodeid)) &
                (self.__gnmetaEdgesDF_cached.gnedgetype == 'GNDataNodeEdge')]
        jDF = self.__gnmetaEdgesDF_cached.join(dnodeDF, cond, 'inner')
        jDF.show(4)
        e1DF = jDF.select("gnedgeid", "gnedgename", "gnedgetype",
                          "gnsrcnodeid", "gntgtnodeid")
        eDF = e1DF.dropDuplicates(['gnedgeid']).sort('gnedgeid')
        ecount = eDF.count()
        gnsrch_log('GnSrchOps: showing unique edges #nodes ' + str(ecount))
        eDF.show(5)
        mcols = [F.col("gnsrcnodeid"), F.col("gntgtnodeid")]
        res = eDF.withColumn("edgenodes", F.array(mcols)) \
                 .select("edgenodes")
        gnsrch_log('GnSrchOps: gnedges filter result 1 ')
        res.show(5)
        f1DF = res.select(F.explode(F.col("edgenodes")).alias("gnnodeid"))
        f1count = f1DF.count()
        gnsrch_log('GnSrchOps: Filter datanodes exploded #nodes ' + str(f1count))
        f1DF.show(10)
        gnsrch_log('GnSrchOps: Filtered datanodes and removed duplicates ')
        f2DF = f1DF.select("gnnodeid").distinct().sort("gnnodeid")
        f2count = f2DF.count()
        gnsrch_log('GnSrchOps: Filter nodes and distinct #nodes ' + str(f2count))
        f2DF.show(10)
        derivedNodeDF = dnodeDF.select("gnnodeid").join(
            f2DF, on=['gnnodeid'], how='left_anti').distinct().orderBy('gnnodeid')
        gnsrch_log('GnSrchOps: Enumerating derived datanodes ')
        nderivedNodes = derivedNodeDF.count()
        gnsrch_log('GnSrchOps: derived datanodes #of nodes ' + str(nderivedNodes))
        dnDF = None
        if (nderivedNodes > 0):
            derivedNodeDF.show(10)
            derivedNodeRows = derivedNodeDF.collect()
            derived_NodeList = []
            for row in derivedNodeRows:
                derived_NodeList.append(row['gnnodeid'])
            ### now iterate over the list and get each gnnode
            gnsrch_log('GnSrchOps: Node info for derived datanodes ')
            gnsrch_log(derived_NodeList)
            nodeid_list = "( "
            i = 0
            for x in derived_NodeList:
                if (i > 0):
                    nodeid_list += ","
                nodeid_list += str(x)
                i = i + 1
            nodeid_list += ")"
            gnsrch_log('GnSrchOps: Getting node info for list ' + nodeid_list)
            sqlstr = "SELECT * from gnmetanodes where gnnodeid in " + nodeid_list + " "
            gnsrch_log('GnGraphSearchOps: executing sql ' + sqlstr)
            dnDF = self.__spark.sql(sqlstr)
            dnCount = dnDF.count()
            gnsrch_log('GnSrchOps: Derived datanodes enumerated #nodes ' + str(dnCount))
    except Exception as err:
        gnsrch_log('GnSrchOps: Exception received ' + str(err))
        eDF = None
        dnDF = None
    gnsrch_log('GnSrchOps: Completed datanodes gnedges fetch ')
    return (eDF, dnDF)
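# --- Example (illustration only) ---
# Minimal sketch of the left_anti join used above to keep ids present in one
# frame but absent from the other. The session name and sample ids are
# hypothetical.
#
# from pyspark.sql import SparkSession
#
# spark = SparkSession.builder.appName("left_anti_demo").getOrCreate()
# result_nodes = spark.createDataFrame([(1,), (2,), (3,)], ["gnnodeid"])
# edge_endpoints = spark.createDataFrame([(2,), (3,), (4,)], ["gnnodeid"])
# # keep result nodes that never appear as an edge endpoint
# no_edge_nodes = result_nodes.join(edge_endpoints, on=["gnnodeid"],
#                                   how="left_anti")
# no_edge_nodes.show()   # -> gnnodeid 1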
def gnp_spark_thread_join(gnspk_thr_config):
    gnsrch_log(' Joining for the thread ')
    gnspk_thr_config["spkthr"].join()
def gnp_spark_app_server_socket(gnRootDir, accessmode):
    gnsrch_log("GnSrchOps: Starting the gnspark thread application")
    gnspk_thr_cfg = gnp_spark_thread_setup(gnRootDir, accessmode)
    gnsrch_log("GnSrchOps: starting socket server...")
    SERVER_HOST = "0.0.0.0"
    SERVER_PORT = 4141
    BUFFER_SIZE = 4096
    s = socket.socket()
    s.bind((SERVER_HOST, SERVER_PORT))
    s.listen(10)
    gnsrch_log(f"GnSrchOps: search server listening on {SERVER_HOST}:{SERVER_PORT}")
    while True:
        client_sock, address = s.accept()
        received = client_sock.recv(BUFFER_SIZE).decode()
        gnsrch_log('GnSrchOps: received command: ' + str(received))
        cmdJ = json.loads(received)
        gnsrch_log('GnSrchOps: ')
        gnsrch_log(cmdJ)
        tskmsg = cmdJ
        resp = gnp_spark_thread_send_receive_task(gnspk_thr_cfg, tskmsg)
        resp_str = json.dumps(resp)
        # Send the response back to the client
        client_sock.sendall(resp_str.encode())
        client_sock.close()
    s.close()
    # join the thread
    gnp_spark_thread_join(gnspk_thr_cfg)
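# --- Example (illustration only) ---
# Hedged sketch of a client for the socket server above: open a TCP
# connection to port 4141, send one JSON command, read the JSON reply.
# The host, request payload, and single 4096-byte read (mirroring the
# server's BUFFER_SIZE) are assumptions.
import json
import socket

def example_send_command(host="127.0.0.1", port=4141):
    req = {"cmd": "metanodes"}
    with socket.create_connection((host, port)) as sock:
        sock.sendall(json.dumps(req).encode())
        data = sock.recv(4096)   # server replies with a JSON blob
    return json.loads(data.decode())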
def gngraph_search_setup(self, sqlst, lnodes):
    gn_srch_sql = sqlst
    gnsrch_log("GnSrchOps: Parsing sql st " + gn_srch_sql)
    gn_ssql_parsed = GNGraphSqlParserOps(gn_srch_sql)
    t = gn_ssql_parsed.get_entlist()
    aEntList = []
    for x in t:
        if x not in self.__entlist:
            aEntList.append(x)
    gnsrch_log('GnSrchOps: search setup data entities list parsed ')
    gnsrch_log(aEntList)
    gn_ssql_parsed_where_str = gn_ssql_parsed.get_where_str()
    ### entlist is the list of node names that need to be mapped
    for ent in aEntList:
        ent_metanode_info = self.get_metanode_info(ent)
        gnsrch_log('GnSrchOps: got metanode info for ' + ent + ' ')
        gnsrch_log(ent_metanode_info)
        if (len(ent_metanode_info) == 0):
            gnsrch_log('GnSrchOps: node ' + ent + ' does not exist ')
            errmsg = "node " + ent + " does not exist"
            # return a 3-tuple so callers can always unpack (retval, msg, sql)
            return (-1, errmsg, "")
        node_name = ent_metanode_info["gnnodename"]
        bizdomain = ent_metanode_info["bizdomain"]
        gnsrch_log("GnSrchOps: setup api nodename " + node_name +
                   " bizdomain:" + bizdomain)
        entnodeDF = self.get_datanode_mapped_df(node_name, bizdomain)
        if (entnodeDF is not None):
            ent_metanode_info["df"] = entnodeDF
            self.__gngrp_dnDFList.append(ent_metanode_info)
            self.__entlist.append(ent)
            gnsrch_log('GnSrchOps: node entity ' + node_name + ' is mapped ')
        else:
            gnsrch_log("GnSrchOps: NodeDF setup " + node_name + " nodeDF is empty ")
    self.__sql_formatted = sqlst
    limit_rec = gn_ssql_parsed.get_limit_records()
    if (limit_rec == -1):
        # no LIMIT in the query; cap the result at lnodes records
        limit_str = "LIMIT " + str(lnodes)
        self.__sql_formatted = sqlst + " " + limit_str
    msg = "Search Setup is successful"
    return (0, msg, self.__sql_formatted)
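# --- Example (illustration only) ---
# Sketch of the two outcomes gngraph_search_setup can return, with a
# hypothetical query; `ops` is assumed to be an initialized GNGraphSearchOps.
def example_search_setup(ops):
    retval, msg, sql_formatted = ops.gngraph_search_setup(
        "SELECT * FROM Customer", lnodes=100)
    if retval < 0:
        gnsrch_log("example: setup failed " + msg)   # e.g. unknown node name
    else:
        # queries without LIMIT come back capped, e.g. "... LIMIT 100"
        gnsrch_log("example: formatted sql " + sql_formatted)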
def gngraph_meta_nodes_edges_setup(self):
    gnsrch_log('GnSrchOps: mapping meta nodes and edges setup ')
    self.get_metanodes_mapped_df()
    self.get_metaedges_mapped_df()
    self.map_gnmeta_nodes_edges()
def gngrph_srch_metarepo_qry_execute(gnsrch_ops, gnp_spark, sqlst, nodemode):
    njson = {}
    ejson = {}
    if (gnsrch_ops.srch_init_meta_status() == 0):
        gnsrch_log('GnSrchOps: Meta data initialization is not completed ')
        return (njson, ejson)
    (resNodeDF, nodesjson, status) = gnsrch_ops.gngraph_execute_sqlqry(sqlst)
    if (status == "ERROR"):
        gnsrch_log('GnSrchOps: meta query failed ')
        return (njson, ejson)
    gnsrch_log('GnSrchOps: metanodes fetched ')
    nnodes = resNodeDF.count()
    gnsrch_log('GnSrchOps: metanodes for search returned #nodes ' + str(nnodes))
    if (nnodes > 0):
        resNodeDF.show(5)
        ## Prepare njson output
        nDF = resNodeDF.select(col("gnnodeid").alias("id"),
                               col("gnnodetype").alias("nodetype"),
                               col("gnnodename").alias("nodename"))
        njson = nDF.toJSON().collect()
        if (nodemode == 2):
            ### Need to derive edges and derived nodes
            (eDF, dnDF) = gnsrch_ops.gngraph_metarepo_qry_getedges(resNodeDF,
                                                                   sqlst, 0)
            nEdges = eDF.count()
            if (dnDF is not None):
                nDNodes = dnDF.count()
            else:
                nDNodes = 0
            if (nEdges > 0):
                eResDF = eDF.select(col("gnedgeid").alias("id"),
                                    col("gnedgetype").alias("type"),
                                    col("gnsrcnodeid").alias("source"),
                                    col("gntgtnodeid").alias("target"))
                ejson = eResDF.toJSON().collect()
            if (nDNodes > 0):
                dnDF1 = dnDF.select(col("gnnodeid").alias("id"),
                                    col("gnnodetype").alias("nodetype"),
                                    col("gnnodename").alias("nodename"))
                # Combine the derived nodes with the source nodes
                rDF = nDF.unionByName(dnDF1, allowMissingColumns=True)
                njson = rDF.toJSON().collect()
    gnsrch_log('GnSrchOps: Meta edges and derived nodes enumerated ')
    return (njson, ejson)