# Fragment of the master script; the module-level state (WORKER_INSTANCES,
# nr_of_log_rows, line_counter, postListSize, mylargefile) is set up earlier
# in the script.
import datetime
import json
import subprocess

import commonVariables as comm
import postToWorker


def importRDFfiles():
    global WORKER_INSTANCES
    inst_str = "number of log rows: " + str(nr_of_log_rows) + "\n"
    inst_str += "number of processed rows: " + str(line_counter) + "\n"
    inst_str += "length of POST list: " + str(postListSize) + "\n"
    list_len = len(WORKER_INSTANCES)
    inst_str += "number of worker instances: " + str(list_len) + "\n"
    #print("WORKERS " + str(WORKER_INSTANCES))
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwname = instance['name']
            inst_str += "\nworkername: " + wwwname + "\nmachineType: " + instance['machineType'] + "\n"
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']
            www_data = dict()
            www_data["ip"] = wwwip
            www_data["name"] = wwwname
            www_data["statfile"] = mylargefile
            p = subprocess.Popen(["python3", "download_rdf_files.py", json.dumps(www_data)])
            # wait for the child process to terminate (no pipes are set up,
            # so out and err are both None)
            out, err = p.communicate()
        # add info about the instances to the statistics file
        comm.saveStatistics(mylargefile, inst_str + "\n\n")
    else:
        comm.printException(comm.pathToSaveDownloadErrors, errString='No instances to list.')
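# comm.saveStatistics and comm.printException come from the commonVariables
# module, which is not included in this listing. A minimal sketch of what
# saveStatistics is assumed to do (append a note to the statistics file whose
# path is passed as the first argument); the real implementation may differ:
def saveStatistics(statfile_path, note):
    # Append the note to the statistics file, creating it if it does not exist.
    with open(statfile_path, "a", encoding="utf-8") as f:
        f.write(note)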
#!/usr/local/bin/python3
# -*- coding: utf-8 -*-

# enable debugging
import cgitb
cgitb.enable()

import os, sys
import json

from rdflib import Graph

import commonVariables as comm

if __name__ == '__main__':
    www_data = json.loads(sys.argv[1])
    mylargefile = www_data["statfile"]
    # save info about the number of triples in each aggregated RDF file
    for fname in comm.rdfFnames:
        g_path = comm.pathToRDFdir + fname + ".rdf"
        if os.path.exists(g_path):
            g_old = Graph()
            g_old.parse(g_path)
            # compare by value, not identity ("is not" is unreliable for strings)
            if mylargefile != "":
                note = fname + " nr of triples: " + str(len(g_old)) + "\n"
                comm.saveStatistics(mylargefile, note)
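# download_rdf_files.py is started by importRDFfiles() with one JSON argument,
# e.g. (hypothetical values):
#
#   python3 download_rdf_files.py '{"ip": "203.0.113.7", "name": "worker-1", "statfile": "/home/master/stats.txt"}'
#
# It also assumes commonVariables defines rdfFnames and pathToRDFdir;
# illustrative values only, the real module is not shown here:
#
#   rdfFnames = ["fname1", "fname2", "fname3"]  # base names of the three aggregated RDF files
#   pathToRDFdir = "/home/master/rdf/"          # directory the RDF files are collected into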
    # (inside the loop that posts chunks of urls to the workers)
    del urlsList[:]  # empty the list of urls
    # prepare the next worker
    worker_counter += 1
    if worker_counter > (len(ipList) - 1):
        # start over from the first worker in the list
        worker_counter = 0
    # don't let the set of distinct urls eat too much memory
    if len(distinct_urls) > 1000:
        distinct_urls = set()

# post the remainder: urls left over after the last full chunk
if len(urlsList) > 0:
    # send the list of urls to a worker
    worker_counter = postToWorker.detectConnection(ipList, worker_counter, urlsList)

# start of statistics
comm.saveStatistics(mylargefile,
                    "###########################\n chunksize: " + str(comm.chunksize) + "\n")
# the time spent
end = datetime.datetime.now()
span = end - start
# save statistics
note = "creating RDFs: \n" + "time spent (h:m:s.mm): " + str(span) + " \n\n"
comm.saveStatistics(mylargefile, note)
#print("totalseconds: ", span.total_seconds())

###
###
# List the instances and aggregate all RDF files into 3 files on the master
start = datetime.datetime.now()
importRDFfiles()
## delete the folder where the RDF files were collected (in the master VM)
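# postToWorker.detectConnection is not included in this listing. Judging from
# its call sites, it POSTs the url list to the worker at ipList[worker_counter],
# falls back to the next worker when a connection fails, and returns the index
# of the worker that accepted the request. A rough sketch under those
# assumptions (the endpoint path and timeout are made up; the real module may
# behave differently):
import json
import urllib.request


def detectConnection(ipList, worker_counter, urlsList):
    # Try each worker once, starting from worker_counter, until one accepts the POST.
    for attempt in range(len(ipList)):
        idx = (worker_counter + attempt) % len(ipList)
        try:
            req = urllib.request.Request(
                "http://" + ipList[idx] + "/",  # hypothetical worker endpoint
                data=json.dumps(urlsList).encode("utf-8"),
                headers={"Content-Type": "application/json"})
            urllib.request.urlopen(req, timeout=30)
            return idx  # index of the worker that took the url list
        except OSError:
            continue  # worker unreachable, try the next one
    raise ConnectionError("no reachable workers in ipList")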