def sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding, localFilename, pageread):
    if "excel" in contentType:
        try:
            dirToSaveDownloads = comm.downloadsDir + baseUrl
            if not os.path.isdir(dirToSaveDownloads):
                os.makedirs(dirToSaveDownloads)
            fileparser.spreadURLsByContentType(
                redirectedTo, None, contentType, od, _encoding,
                filePath=(dirToSaveDownloads + "/" + localFilename))
        except:
            comm.printException(comm.pathToSaveProgrammingErrors,
                                "create_dir_for_excel_and_send_file_to_parser")
    else:
        try:
            fileparser.spreadURLsByContentType(redirectedTo, pageread, contentType, od, _encoding)
        except:
            comm.printException(comm.pathToSaveProgrammingErrors, "send_file_to_parser")
def importRDFfiles():
    global WORKER_INSTANCES
    inst_str = "number of log rows: " + str(nr_of_log_rows) + "\n"
    inst_str += "number of processed rows: " + str(line_counter) + "\n"
    inst_str += "length of POST list: " + str(postListSize) + "\n"
    list_len = len(WORKER_INSTANCES)
    inst_str += "number of worker instances: " + str(list_len) + "\n"
    #print("WORKERS " + str(WORKER_INSTANCES))
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwname = instance['name']
            inst_str += "\nworkername: " + wwwname + "\nmachineType: " + instance['machineType'] + "\n"
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']
            www_data = dict()
            www_data["ip"] = wwwip
            www_data["name"] = wwwname
            www_data["statfile"] = mylargefile
            p = subprocess.Popen(["python3", "download_rdf_files.py", json.dumps(www_data)])
            # Wait for the process to terminate.
            out, err = p.communicate()
        # add info about the instances
        comm.saveStatistics(mylargefile, inst_str + "\n\n")
    else:
        comm.printException(comm.pathToSaveDownloadErrors, errString='No instances to list.')
def readExcel(filePath, url, ontologyData):
    try:
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            worksheets = workbook.sheet_names()
            for worksheet_name in worksheets:
                worksheet = workbook.sheet_by_name(worksheet_name)
                num_rows = worksheet.nrows - 1
                num_cells = worksheet.ncols - 1
                curr_row = -1
                while curr_row < num_rows:
                    curr_row += 1
                    curr_cell = -1
                    while curr_cell < num_cells:
                        curr_cell += 1
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        if cell_type == 1:
                            sentences = comm.replaceToPunkts(cell_value)
                            for sentence in sentences:
                                getEntities.getEntities(url, sentence, ontologyData)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
def processObject(client, itemname):
    try:
        # Try to access the json object in Google Cloud Storage and get its payload.
        req = client.objects().get_media(bucket=comm.jsonsDir, object=itemname)
        # Record whether the json object exists in the bucket or not.
        fileExists = True
        try:
            resp = req.execute()
        except:
            fileExists = False
        # Continue only when the object exists.
        if fileExists:
            # The BytesIO object may be replaced with any io.Base instance.
            fh = io.BytesIO()
            # Prepare for reading the json object.
            downloader = MediaIoBaseDownload(fh, req, chunksize=1024 * 1024)
            done = False
            while not done:
                status, done = downloader.next_chunk()
            # Load the downloaded json object into a dictionary.
            jsonToDict = json.loads(fh.getvalue())
            doDownloadJob(jsonToDict, itemname)
    # Store the error message in the respective errors file.
    except oauth2_client.AccessTokenRefreshError:
        comm.printException(comm.pathToSaveDownloadErrors, errString="False credentials")
def deleteRDFsInWorker(workerip):
    # Delete the RDF files and downloaded excel files in a worker.
    worker_address = "http://" + workerip + "/delete_rdf_files.php"
    try:
        requests.post(worker_address)
    except:
        comm.printException(comm.pathToConnectionErrors,
                            errString='ConnectionError_to_worker_' + workerip)
def importRDFfiles():
    global WORKER_INSTANCES
    list_len = len(WORKER_INSTANCES)
    #print("WORKERS " + str(WORKER_INSTANCES))
    try:
        if list_len > 0:
            for instance in WORKER_INSTANCES:
                wwwname = instance['name']
                wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']
                www_data = dict()
                www_data["ip"] = wwwip
                www_data["name"] = wwwname
                www_data["statfile"] = ""
                subprocess.Popen([
                    "python3",
                    comm.parentDir + "upload_logfile/download_rdf_files.py",
                    json.dumps(www_data)
                ])
        else:
            comm.printException(comm.updateErrorsFilePath, errString='No instances to list.')
    except:
        comm.printException(comm.updateErrorsFilePath, errString='Importing_RDFs.')
def readPlainText(htmlurl, plaintext, ontologyData):
    try:
        punc = plaintext.strip()
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + htmlurl)
def insertNewSubFileToDict(localFilename, page_redirected, page_info, page_sha224, page_status):
    try:
        newDict = dict()
        insertValuesToDict(newDict, localFilename, page_redirected, page_info, page_sha224, page_status)
        return newDict
    except:
        comm.printException(comm.pathToSaveJsonErrors)
def downloadDocuments(ajadir):
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    nrOfDownloads = dowloadFromJsons(ajadir)
    end = datetime.datetime.now()
    try:
        saveUpdateStatistics(currTime, start, end,
                             action="Download_datasets:_nr_of_downloads:_" + str(nrOfDownloads),
                             isLast=True)
    except:
        comm.printException(comm.updateErrorsFilePath, errString="download")
def postListToWorker(wip, urlsList):
    pikkus = len(urlsList)
    dik = dict(zip(range(pikkus), urlsList))
    address = "http://" + wip
    try:
        requests.post(address, data={'data': json.dumps(dik),
                                     'chunksize': json.dumps(comm.chunksize)})
        return 1
    except:
        comm.printException(comm.pathToConnectionErrors,
                            errString='ConnectionError_to_worker_' + wip)
        return 0
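# A minimal usage sketch (not part of the original source): splitting a URL list into
# fixed-size batches and posting each batch to the next worker IP round-robin. The
# helper name, the batch size of 50 and the worker IPs are assumptions for illustration.
def postInBatches(workerIPs, urls, batchSize=50):
    sent = 0
    for i in range(0, len(urls), batchSize):
        wip = workerIPs[(i // batchSize) % len(workerIPs)]
        sent += postListToWorker(wip, urls[i:i + batchSize])
    return sent  # number of successfully posted batches

# e.g. postInBatches(["10.0.0.2", "10.0.0.3"], listOfUrls)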
def saveUpdateStatistics(currtime, start, end, action="", isLast=False):
    span = end - start
    try:
        jf = open(comm.monthly_updates_path, 'a')
        if isLast:
            jf.write(action + " " + currtime + " " + str(span) + "\n")
        else:
            jf.write(action + " " + currtime + " " + str(span) + " ")
        jf.close()
    except:
        comm.printException(comm.updateErrorsFilePath, errString="update")
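# A small usage sketch (an assumption, not from the original source): timing one step and
# appending a single line to the monthly statistics file via saveUpdateStatistics.
def exampleTimedStep():  # hypothetical helper, for illustration only
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    # ... the actual work would happen here ...
    end = datetime.datetime.now()
    saveUpdateStatistics(currTime, start, end, action="Example_step", isLast=True)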
def measureDownloadsTime(path_to_stat_file, ajadir):
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    nrOfDownloads = dowloadFromJsons(ajadir)
    end = datetime.datetime.now()
    span = end - start
    try:
        jf = open(path_to_stat_file, 'a', encoding='utf-8')
        jf.write(currTime + " " + str(nrOfDownloads) + " " + str(span) + " " + "\n")
        jf.close()
    except:
        comm.printException(comm.pathToSaveDownloadErrors, errString="download")
def importExcelFiles(WORKER_INSTANCES):
    list_len = len(WORKER_INSTANCES)
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']
            # download the excel files served by this worker
            try:
                #excels_url = "http://146.148.115.150/downloaded_files/"
                excels_url = "http://" + wwwip + "/downloaded_files/"
                dest = comm.downloadsDir_for_excels
                os.system('wget -r --no-parent -nH --cut-dirs=1 --reject "index.html*" '
                          + excels_url + " -P " + dest)
            except:
                comm.printException(comm.pathToSaveDownloadErrors,
                                    errString="collecting_Excels_and_PDFs")
def readJson(jsonurl, readedPage, od):
    '''
    If httpResponse were a filepath:
        jsonfile = (httpResponse.read()).decode('utf-8')
    '''
    # httpResponse has already been read into a string
    try:
        jsonfile = readedPage.strip()
        dictnry = json.loads(jsonfile)
        readDictValues(jsonurl, dictnry, set(), od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_json.py " + jsonurl)
def insertNewContentToDict(localFilename, page_redirected, page_info, page_sha224, page_status):
    try:
        newDict = dict()
        newDict[page_sha224] = page_info
        newDict[page_sha224]["localFilename"] = localFilename
        newDict[page_sha224]["file_url"] = page_redirected
        newDict[page_sha224]["sha224"] = page_sha224
        newDict[page_sha224]["status"] = page_status
        newDict[page_sha224]["timeDir"] = comm.timeDir
        return newDict
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        return False
def insertValuesToDict(dictnry, localFilename, page_redirected, page_info, page_sha224, page_status):
    try:
        dictnry[localFilename] = dict()
        dictnry[localFilename][page_sha224] = page_info
        dictnry[localFilename][page_sha224]["localFilename"] = localFilename
        dictnry[localFilename][page_sha224]["file_url"] = page_redirected
        dictnry[localFilename][page_sha224]["sha224"] = page_sha224
        dictnry[localFilename][page_sha224]["status"] = page_status
        dictnry[localFilename][page_sha224]["timeDir"] = comm.timeDir
        return dictnry
    except:
        comm.printException(comm.pathToSaveJsonErrors)
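# For orientation only (an assumption drawn from the functions above, not from the original
# source): after insertValuesToDict the per-site metadata dictionary is nested as
# sha(file URL) -> sha(file content) -> header fields, roughly like this:
#
# {
#     "base_url": "www.example.ee",
#     "<sha224 of file URL>": {
#         "<sha224 of file content>": {
#             "Content-Type": "text/html",
#             "localFilename": "<sha224 of file URL>",
#             "file_url": "http://www.example.ee/page",
#             "sha224": "<sha224 of file content>",
#             "status": 200,
#             "timeDir": "<date the json file was made>"
#         }
#     }
# }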
def getMyAuthService(service_name='bigquery', service_version='v2'):
    http = httplib2.Http()
    token_uri = '%s/%s/token' % (METADATA_SERVER, SERVICE_ACCOUNT)
    resp, content = http.request(token_uri, method='GET', body=None,
                                 headers={'Metadata-Flavor': 'Google'})
    if resp.status == 200:
        d = json.loads(content)
        access_token = d['access_token']
        # Save the access token
        credentials = oauth2_client.AccessTokenCredentials(access_token, 'my-user-agent/1.0')
        AUTH_HTTP = credentials.authorize(http)
        return build(service_name, service_version, http=AUTH_HTTP)
    else:
        comm.printException(comm.pathToSaveauthErrors,
                            errString="AUTHENTICATION RESPONSE STATUS: " + str(resp.status))
def readPdf(url, readdedpdf, od):
    b = BytesIO(readdedpdf)
    pdfFile = PdfFileReader(b, strict=False)  # non-strict mode tolerates minor PDF errors
    try:
        for i in range(pdfFile.numPages):
            pageObject = pdfFile.getPage(i)
            text = pageObject.extractText()
            sentences = comm.replaceToPunkts(text)
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def listOfJsonObjects(client):
    try:
        # Get the object metadata.
        req = client.objects().list(bucket=comm.jsonsDir)
        # If there are too many items to list in one request, list_next() will
        # automatically handle paging with the pageToken.
        while req is not None:
            resp = req.execute()
            if resp and 'items' in resp:
                for item in resp["items"]:
                    itemname = item["name"]
                    processObject(client, itemname)
            req = client.objects().list_next(req, resp)
    except oauth2_client.AccessTokenRefreshError:
        comm.printException(comm.pathToSaveDownloadErrors, errString="False credentials")
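# A minimal sketch (an assumption, not from the original source) of wiring the helpers above
# together: obtain an authorized service object and walk the bucket listing. The
# 'storage'/'v1' arguments are illustrative, since getMyAuthService defaults to BigQuery.
def exampleProcessBucket():  # hypothetical helper, for illustration only
    client = getMyAuthService(service_name='storage', service_version='v1')
    if client is not None:
        listOfJsonObjects(client)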
def addTriples(self, chunkedList, addLemmas=True):
    try:
        newDataExists = False
        g = self.getPerRdfGraph()
        g_new = Graph()
        # define specific namespace prefixes
        self.bindNamespaces(g)
        for andmed in chunkedList:
            for webpage in andmed:
                gName = andmed[webpage]["gName"]
                fName = andmed[webpage]["fName"]
                name = andmed[webpage]["name"]
                lemmaList = andmed[webpage]["lemmaSet"]
                try:
                    # make the triples
                    newPerson = URIRef(self.perStr + name.replace(">", "").replace("<", "")
                                       .replace("|", "").replace(" ", "_").lower())
                    newGivenName = Literal(gName)
                    newFamilyName = Literal(fName)
                    newPerName = Literal(name)
                    newWebpage = URIRef(webpage)
                    # add the triples only if the graph does not contain this person already
                    if (newPerson, RDF.type, URIRef(self.person)) not in g:
                        newDataExists = True
                        g_new.add((newPerson, RDF.type, URIRef(self.person)))
                        if newGivenName != Literal(""):
                            g_new.add((newPerson, self.givenName, newGivenName))
                        if newFamilyName != Literal(""):
                            g_new.add((newPerson, self.familyName, newFamilyName))
                        g_new.add((newPerson, self.perName, newPerName))
                    # check whether the graph already links this person to the webpage
                    if (newPerson, self.mentionedAtSite, newWebpage) not in g:
                        newDataExists = True
                        g_new.add((newPerson, self.mentionedAtSite, newWebpage))
                    # add the lemmas as well
                    if addLemmas:
                        for newLemma in lemmaList:
                            if (newPerson, self.lemma, Literal(newLemma)) not in g:
                                newDataExists = True
                                g_new.add((newPerson, self.lemma, Literal(newLemma)))
                except:
                    comm.printException(comm.initRdfErrorsFilePath, "build_per_graph")
        # write the rdf into a file
        if newDataExists:
            try:
                gg = g + g_new
                gg.serialize(self.perRdf, format='pretty-xml', encoding='utf-8')
            except:
                comm.printException(comm.initRdfErrorsFilePath,
                                    "RDF People Manager serialization error: ")
    except:
        comm.printException(comm.initRdfErrorsFilePath,
                            "RDF People Manager (addTriples) error: ")
def doDownloadJob(jsonToDict, itemname):
    global nrOfDownloads
    try:
        base_url = jsonToDict['base_url']  # becomes the folder name
        for fname_key in jsonToDict.keys():
            # At the first level there are two kinds of keys in a json file:
            # 1. the base url
            # 2. sha(s) of filename(s) (sha of the full file url, including "/" slashes)
            # As the file content may change over time, every sha(filename) element contains
            # one or more content sha(s), and every content sha holds the metadata of that
            # file/content version.
            if fname_key != 'base_url':  # fname_key (sha of the file url) becomes the local filename
                # loop over every sha(content) of the current sha(filename); csha is the sha(filecontent)
                for csha in jsonToDict[fname_key].keys():
                    # check whether the metadata contains the key 'Content-Type'
                    contentKeyExists = False
                    try:
                        if 'Content-Type' in jsonToDict[fname_key][csha]:
                            contentKeyExists = True
                    except:
                        contentKeyExists = False
                    # Get the time the json file was made.
                    timeDir = jsonToDict[fname_key][csha]['timeDir']
                    # Download only changes that are not older than the start date of the current process.
                    process_start_date = comm.makeDateObj(ajadir)
                    json_model_date = comm.makeDateObj(timeDir)
                    # Continue only if the date in the model is equal to or newer than the process start date.
                    if contentKeyExists and (json_model_date >= process_start_date):
                        # excel files have already been downloaded
                        if "excel" not in jsonToDict[fname_key][csha]['Content-Type']:
                            # full URL of the file
                            file_url = jsonToDict[fname_key][csha]['file_url']
                            dirPath = comm.downloadsDir + base_url + "/"
                            try:
                                # create a folder for this 'date/base_url' if it does not exist
                                if (not os.path.isdir(dirPath)) and (not os.path.exists(dirPath)):
                                    os.makedirs(dirPath)
                                try:
                                    # download the file into that folder; fname_key is the sha(filename),
                                    # so the resulting path becomes 'date/base_url/sha(filename)'
                                    urr(file_url, dirPath + fname_key)
                                    nrOfDownloads += 1
                                except:
                                    comm.printException(comm.pathToSaveDownloadErrors, itemname)
                            except:
                                comm.printException(comm.pathToSaveDownloadErrors, itemname)
    except:
        comm.printException(comm.pathToSaveDownloadErrors, itemname)
def readHtmlPage(htmlurl, readedPage, ontologyData):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        # if the root is None, the html is incorrectly formed
        if root is not None:
            # drop elements that carry no readable text
            for tag in ("head", "script", "style", "noscript", "input", "form", "title", "img"):
                for element in root.iter(tag):
                    element.drop_tree()
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
            if len(sentences) > 0:
                lsent = list(sentences)
                for lau in lsent:
                    if lau != "":
                        laused = comm.replaceToPunkts(lau)
                        for s6ne in laused:
                            getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + htmlurl)
def readXml(xmlurl, pathToFile, ontologyData):
    # https://docs.python.org/3.4/library/xml.etree.elementtree.html?highlight=elementtree#parsing-xml
    """
    If httpResponse were a path to an xml file:
        tree = ET.parse(httpResponse)
        root = tree.getroot()
    Here the xml has already been read, so it is parsed directly from the string.
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for data in root.iter():
                if data.text is not None:
                    stripped = data.text.strip()
                    if (stripped is not None) and (len(stripped) > 2):
                        sentences = comm.replaceToPunkts(stripped)
                        for sentence in sentences:
                            getEntities.getEntities(xmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
def addTriples(self, chunkedList, addLemmas=True):
    try:
        newDataExists = False
        g = self.getLocRdfGraph()
        g_new = Graph()
        # define specific namespace prefixes
        self.bindNamespaces(g)
        for andmed in chunkedList:
            for webpage in andmed:
                for objName in andmed[webpage]:
                    lemmaList = andmed[webpage][objName]
                    try:
                        # make the triples
                        newLocation = URIRef(self.locStr + objName.replace(">", "").replace("<", "")
                                             .replace("|", "").replace(" ", "_").lower())
                        newLocationName = Literal(objName)
                        newWebpage = URIRef(webpage)
                        # add the triples only if the graph does not contain this location already
                        if (newLocation, RDF.type, URIRef(self.location)) not in g:
                            newDataExists = True
                            g_new.add((newLocation, RDF.type, URIRef(self.location)))
                            g_new.add((newLocation, self.locationName, newLocationName))
                        # check whether the graph already links this location to the webpage
                        if (newLocation, self.mentionedAtSite, newWebpage) not in g:
                            newDataExists = True
                            g_new.add((newLocation, self.mentionedAtSite, newWebpage))
                        # add the lemmas as well
                        if addLemmas:
                            for newLemma in lemmaList:
                                if (newLocation, self.lemma, Literal(newLemma)) not in g:
                                    newDataExists = True
                                    g_new.add((newLocation, self.lemma, Literal(newLemma)))
                    except:
                        comm.printException(comm.initRdfErrorsFilePath, "build_loc_graph")
        # write the rdf into a file
        if newDataExists:
            try:
                gg = g + g_new
                gg.serialize(self.locRdf, format='pretty-xml', encoding='utf-8')
            except:
                comm.printException(comm.initRdfErrorsFilePath,
                                    "RDF Location Manager serialization error: ")
    except:
        comm.printException(comm.initRdfErrorsFilePath,
                            "RDF Location Manager (addTriples) error: ")
def dowloadFromJsons(ajadir):
    nrOfDownloads = 0
    jsons = comm.jsonsDir
    # loop over every file in the jsons folder
    for filePath in listdir(jsons):
        if filePath != "errors.txt":
            # 'jsons' is the folder where the json files are saved and 'filePath' is a filename
            # in that folder; both the 'jsons' folder and the 'downloaded_files' folder live in
            # the "datadownload" folder
            try:
                # load the json file into a dictionary
                jsonToDict = json.load(open(jsons + filePath))
            except:
                continue
            # 'base_url' is the hostname (before the "/" slashes) stored in the json file;
            # the json file is named '<base_url>.json' and 'base_url' is also a directory
            # name inside the 'downloaded_files' folder
            base_url = jsonToDict['base_url']  # becomes the folder name
            for fname_key in jsonToDict.keys():
                # At the first level there are two kinds of keys in a json file:
                # 1. the base url
                # 2. sha(s) of filename(s) (sha of the full file url, including "/" slashes)
                # As the file content may change over time, every sha(filename) element contains
                # one or more content sha(s), and every content sha holds the metadata of that
                # file/content version.
                if fname_key != 'base_url':  # fname_key (sha of the file url) becomes the local filename
                    # loop over every sha(content) of the current sha(filename); csha is the sha(filecontent)
                    for csha in jsonToDict[fname_key].keys():
                        # check whether the metadata contains the key 'Content-Type'
                        contentKeyExists = False
                        try:
                            if 'Content-Type' in jsonToDict[fname_key][csha]:
                                contentKeyExists = True
                        except:
                            contentKeyExists = False
                        # Get the time the json file was made.
                        timeDir = jsonToDict[fname_key][csha]['timeDir']
                        # download only today's changes
                        if contentKeyExists and (ajadir == timeDir):
                            # excel files have already been downloaded
                            if "excel" not in jsonToDict[fname_key][csha]['Content-Type']:
                                # full URL of the file
                                file_url = jsonToDict[fname_key][csha]['file_url']
                                # 'dirPath' is the folder the file is about to be downloaded into
                                dirPath = comm.downloadsDir + base_url + "/"
                                try:
                                    # create the 'date/base_url' folder if it does not exist
                                    if (not os.path.isdir(dirPath)) and (not os.path.exists(dirPath)):
                                        os.makedirs(dirPath)
                                    try:
                                        # download the file into that folder; fname_key is the sha(filename),
                                        # so the resulting path becomes 'date/base_url/sha(filename)'
                                        urr(file_url, dirPath + fname_key)
                                        nrOfDownloads += 1
                                    except:
                                        comm.printException(comm.pathToSaveDownloadErrors, filePath)
                                except:
                                    comm.printException(comm.pathToSaveDownloadErrors, filePath)
    return nrOfDownloads
def mergeRDFfiles():
    try:
        for dname in rdfFnames:
            g_old_path = comm.pathToRDFdir + dname + ".rdf"  # file: /var/www/html/master/rdf_files/ORG.rdf
            # temporary dirs ORG, LOC, PER
            rdf_file_dir = comm.pathToRDFdir + dname  # dir: /var/www/html/master/rdf_files/ORG
            g_copy_path = comm.rdf_copypath + dname + ".rdf"  # file: /var/www/html/master/rdf_copy/ORG.rdf
            g_new = Graph()
            g_new_for_copy = Graph()
            for rdf_file in listdir(rdf_file_dir):
                # /var/www/html/master/rdf_files/ORG/<worker-1>_<date_dir>.rdf
                tmp_path = rdf_file_dir + "/" + rdf_file
                try:
                    g_new.parse(tmp_path)           # load the temporary file into the graph
                    g_new_for_copy.parse(tmp_path)  # load the temporary file into the backup graph
                except:
                    comm.printException(comm.pathToSaveDownloadErrors,
                                        errString="cannot load temporary file into graph")
            if os.path.exists(g_old_path):
                # an rdf file already exists at this path
                try:
                    g_new.parse(g_old_path)  # load the old file into the graph, merging it with the new data
                    g_new.serialize(g_old_path, format="pretty-xml", encoding="utf-8")
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot merge RDF")
                # delete the used files
                os.system("sudo rm -r " + rdf_file_dir + "/*")
            else:
                # no rdf file exists at this path yet
                try:
                    g_new.serialize(g_old_path, format="pretty-xml", encoding="utf-8")
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot create RDF")
            ## BACKUP the RDF files
            if os.path.exists(g_copy_path):
                # a backup rdf file already exists at this path
                try:
                    g_new_for_copy.parse(g_copy_path)  # load the old backup into the graph, merging it
                    g_new_for_copy.serialize(g_copy_path, format="pretty-xml", encoding="utf-8")
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot merge backup RDF")
            else:
                # no backup rdf file exists at this path yet
                try:
                    g_new_for_copy.serialize(g_copy_path, format="pretty-xml", encoding="utf-8")
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot create backup RDF")
    except:
        comm.printException(comm.pathToSaveDownloadErrors, errString="merge RDF")
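# A minimal, self-contained sketch (not from the original source) of the parse-then-serialize
# merge pattern used in mergeRDFfiles above; rdflib unions the triples of every file parsed
# into the same Graph. The helper name and file names are made up for illustration.
def exampleMergeTwoRdfFiles():  # hypothetical helper, for illustration only
    from rdflib import Graph
    merged = Graph()
    for path in ("worker-1_2015-01-01.rdf", "ORG.rdf"):  # hypothetical inputs
        merged.parse(path)                               # duplicate triples collapse automatically
    merged.serialize("ORG.rdf", format="pretty-xml", encoding="utf-8")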
filedic["statfile"] = mylargefile subprocess.Popen(["python3", "amountOfTriples.py", json.dumps(filedic)]) #end of statistics! comm.saveStatistics(mylargefile, "\n--------------------------- ") #delete RDF-files and download excel-files in each worker postToWorker.deleteRDFsInWorkers(ipList) ### ### try: # Download json-objects from cloud storage downloadJsons.getJsonObjects(client) except: comm.printException(comm.pathToSaveDownloadErrors, errString="downloadJsons") pass ### ### try: ### ### Download generated error-objects from cloud storage downloadErrors.getErrorObjects(client) except: comm.printException(comm.pathToSaveDownloadErrors, errString="downloadErrors") pass else: print("no worker instances")
start = datetime.datetime.now()
currTime = time.strftime("%d/%m/%Y_%H:%M:%S")
# dir of the json files
jsonFiles = comm.jsonsDir_local
try:
    p = subprocess.Popen(["python2", comm.parentDir + "upload_logfile/getworkers.py"],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    WORKER_INSTANCES = json.loads(out.decode())
    # tuple of the ip list and a list of (ip, amount of cpus) tuples
    ipList_and_tuple = getListOfWorkerIPs(WORKER_INSTANCES)
    ipList = ipList_and_tuple[0]
    ipList_tuple = ipList_and_tuple[1]
except Exception as e:
    comm.printException(comm.updateErrorsFilePath,
                        errString="subprocess_to_getworkers.py" + str(e))
    pass
# if there is at least 1 ip in the list
lenIpList = len(ipList)
if lenIpList > 0:
    # the list of doc URLs that is going to be sent to a worker
    listOfUrls = []
    # loop over all json files
    for (dirpath, dirnames, filenames) in walk(jsonFiles):
        for fname in filenames:
            # start searching for changed content by comparing the content hash saved in
            # the json file to the content of the web document at the current moment
            #print(os.path.join(dirpath, fname))
        dirpath = dirpath0
        jsonsList.extend(filenames)  # collect all json files into a list
        break
    for filename in jsonsList:
        jobs.append(dirpath + filename)
        nrOfJobs += 1
    # pool of processes
    pool = Pool(processes=os.cpu_count())
    # search for changes
    nrOfChanges_ = pool.map(addChangedContent, jobs)
    pool.close()
    pool.join()
except:
    comm.printException(comm.updateErrorsFilePath, errString="update")
    pass

#import getpass
#print("USER: "******" " + str(nrOfJobs) + " " + str(span) + " " + str(comm.chunksize) + " " + str(sum(nrOfChanges_)) + " ")
    jf.close()
except:
    comm.printException(comm.updateErrorsFilePath, errString="update")
def saveMetadata(url, od):
    canOpen = True
    try:
        redirectedTo = requests.get(url).url
    except:
        canOpen = False
        errStr = url + " Cannot_open_web-source \n"
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    if canOpen is True:
        if not os.path.isdir(comm.jsonsDir):
            os.makedirs(comm.jsonsDir)
        try:
            page = requests.get(redirectedTo)
            statusCode = page.status_code
            pageread = page.text
            pageInfo = dict(page.headers)
            localFilename = comm.getUrlSHA(redirectedTo)
            contentType = page.headers['content-type']
            _encoding = page.encoding
            # base_url also becomes the json file name (pathToSaveMetadata)
            baseUrl = (urlparse(redirectedTo)).netloc
            sha224_ = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
            pathToSaveMetadata = comm.jsonsDir + baseUrl + ".json"
            isDesiredType = comm.isDesiredContent(contentType)
            if isDesiredType:
                # if no metadata file exists yet for this base url, create it
                if not os.path.isfile(pathToSaveMetadata):
                    infoDict_tmp = dict()
                    infoDict_tmp["base_url"] = baseUrl
                    infoDict = insertValuesToDict(infoDict_tmp, localFilename, redirectedTo,
                                                  pageInfo, sha224_, statusCode)
                    saveJsonToFile(pathToSaveMetadata, infoDict)
                    # send the file to the parser and then to estner for extracting entities
                    sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding,
                                     localFilename, pageread)
                # if the metadata file already exists, update it
                else:
                    someNewData = False
                    # Open the json file (<baseUrl>.json) for reading and compare the sha224
                    # value saved in the json with the sha224 of the freshly read page.
                    in_file = open(pathToSaveMetadata, "r")
                    # Load the contents of the file, which creates a new dictionary.
                    isValidJsonFile = True
                    try:
                        existingFileDict = json.load(in_file)
                    except:
                        isValidJsonFile = False
                    # Close the file; we don't need it anymore.
                    in_file.close()
                    if isValidJsonFile:
                        # existingFileDict['base_url'] is e.g. www.temtec.ee
                        if existingFileDict['base_url'] in redirectedTo:
                            # the same file resource was requested; the dict has two kinds of
                            # first-level keys: 'base_url' and sha224 of a file name
                            fNameKey = [k for k in existingFileDict.keys() if k != 'base_url']
                            # this list may contain 'localFilename'
                            if localFilename in fNameKey:
                                # if the sha of the earlier saved file does not equal the current
                                # sha, the contents of the file have changed.
                                # The saved content sha is stored as a key; look it up:
                                shaKeys = existingFileDict[localFilename].keys()
                                if sha224_ not in shaKeys:
                                    # the file has changed; search by date: if a content sha was
                                    # already stored today, update that existing sha key
                                    replaceSha = ""
                                    # if some sha key carries the same timeDir, take that sha and replace it
                                    for sk in shaKeys:
                                        savedDate = existingFileDict[localFilename][sk]["timeDir"]
                                        if savedDate == comm.timeDir:
                                            replaceSha = sk
                                            break
                                    if replaceSha != "":
                                        # delete the sha stored earlier on the same day
                                        del existingFileDict[localFilename][replaceSha]
                                    # add a new value with the new content sha key under the filename
                                    # sha key: the filename url is the same, but the content has
                                    # changed, so a new content sha is added
                                    newDataDict = insertNewContentToDict(localFilename, redirectedTo,
                                                                         pageInfo, sha224_, statusCode)
                                    if newDataDict:
                                        existingFileDict[localFilename].update(newDataDict)
                                        someNewData = True
                            else:
                                # a new file (resource) from the same domain (or 'base_url') was requested;
                                # add a new value with a new filename sha key for that base resource
                                newDataDict = insertNewSubFileToDict(localFilename, redirectedTo,
                                                                     pageInfo, sha224_, statusCode)
                                if newDataDict:
                                    existingFileDict.update(newDataDict)
                                    someNewData = True
                            if someNewData:
                                # save the metadata of the file
                                saveJsonToFile(pathToSaveMetadata, existingFileDict)
                                # send the file to the parser, then to the estner entity extractor
                                sendFileToParser(contentType, baseUrl, redirectedTo, od,
                                                 _encoding, localFilename, pageread)
        # save errors
        except urr.HTTPError as e:
            errStr = redirectedTo + " HTTPError " + str(e.code) + " " + str(e.reason) + " \n"
            comm.printException(comm.pathToSaveJsonErrors, errStr)
        except urr.URLError as e:
            errStr = redirectedTo + " URLError " + str(e.reason) + " \n"
            comm.printException(comm.pathToSaveJsonErrors, errStr)
        except IOError as e:
            errStr = redirectedTo + " " + str("I/O_error({0}):_{1}".format(e.errno, e.strerror)) + "\n"
            comm.printException(comm.pathToSaveJsonErrors, errStr)
        except ValueError:
            errStr = redirectedTo + " ValueError_Could_not_convert_data_to_an_integer.\n"
            comm.printException(comm.pathToSaveJsonErrors, errStr)
        except TypeError:
            errStr = redirectedTo + " TypeError\n"
            comm.printException(comm.pathToSaveJsonErrors, errStr)
        except:
            errStr = redirectedTo + " Unexpected_error:_" + str(sys.exc_info()[0]) + "\n"
            comm.printException(comm.pathToSaveJsonErrors, errStr)
def getEntities(url, text, ontologyData,
                orgWords=['kogu', 'selts', 'ansambel', 'keskus', 'as', 'klubi', 'asutus',
                          'keskus', 'fond', 'cup'],
                locWords=['vabarii', 'maakond']):
    ntwl = list()
    try:
        ner_tagged = tagger(analyzer(tokenizer(text)))
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "getEntities.py-def_getEntities_:_ner_tagged " + text[0:100] + " URL_" + url)
    try:
        ntwl = ner_tagged.named_entities
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "getEntities.py-def_getEntities_:_ntwl" + str(len(ntwl)) + " " +
                            text[0:100] + " URL_" + url)
    try:
        if len(ntwl) > 0:
            andmed = dict()
            for i in ntwl:
                label = i.label
                freqLemma = i.lemma.replace(';', '').replace(':', '').replace(',', '') \
                    .replace('.', '').replace('?', '').replace('!', '').replace('"', '') \
                    .replace("'", '').replace(' | ', '').replace('|', '').lower()
                # correct some ner labels
                for ow in orgWords:
                    if ow.lower() in freqLemma.lower():
                        label = "ORG"
                for lw in locWords:
                    if lw.lower() in freqLemma.lower():
                        label = "LOC"
                # process the values by label
                if label == "PER":
                    entitySet = set()
                    if freqLemma != "":
                        name = freqLemma.title()
                        names = name.split(' ')
                        gName = ""
                        fName = ""
                        try:
                            if len(names) > 1:
                                if len(names) > 2:
                                    gName = names[0] + " " + names[1]
                                    fName = names[2]
                                elif len(names) == 2:
                                    gName = names[0]
                                    fName = names[1]
                        except:
                            comm.printException(comm.pathToSaveParsingErrors,
                                                "getEntities.py-def_getEntities_gname-fname")
                        entitySet.add(freqLemma)
                        # added only to avoid double values; removed again below
                        entitySet.add(name)
                        entitySet.add(gName)
                        entitySet.add(fName)
                        wConcat = (' '.join(w.text for w in i.words)).replace(';', '') \
                            .replace(':', '').replace(',', '').replace('.', '') \
                            .replace('?', '').replace('!', '')
                        entitySet.add(wConcat)
                        lemmalist = list()
                        for w in i.words:
                            lemmalist.append(w.lemmas)
                        produkt = itertools.product(*lemmalist)
                        for j in produkt:
                            entitySet.add(" ".join(
                                str(u) for u in list(j)
                                if (u.lower() != name.lower()) and (u != "") and (u.title() in names)))
                        # now remove the double values
                        if name in entitySet:
                            entitySet.remove(name)
                        if gName in entitySet:
                            entitySet.remove(gName)
                        if fName in entitySet:
                            entitySet.remove(fName)
                        if "" in entitySet:
                            entitySet.remove("")
                        andmed = {
                            url: {
                                "gName": gName,
                                "fName": fName,
                                "name": name,
                                "lemmaSet": entitySet
                            }
                        }
                        if not ontologyData.sharedList_per._callmethod('__contains__', (andmed, )):
                            ontologyData.sharedList_per._callmethod('append', (andmed, ))
                        if ontologyData.sharedList_per._callmethod('__len__') > comm.chunksize:
                            try:
                                chunkedList = ontologyData.sharedList_per[:]  # makes a copy, not a reference
                                del ontologyData.sharedList_per[:]
                                perManager = initRdf.PeopleManager(ontologyData)
                                perManager.addTriples(chunkedList)
                            except:
                                comm.printException(comm.initRdfErrorsFilePath, "get_PER_entities")
                else:
                    objName = freqLemma.title()
                    entitySet = set()
                    entitySet.add(freqLemma)
                    wConcat = (' '.join(w.text for w in i.words)).replace(';', '') \
                        .replace(':', '').replace(',', '').replace('.', '') \
                        .replace('?', '').replace('!', '')
                    entitySet.add(wConcat)
                    lemmalist = list()
                    for w in i.words:
                        lemmalist.append(w.lemmas)
                    produkt = itertools.product(*lemmalist)
                    for j in produkt:
                        entitySet.add(" ".join(
                            str(u) for u in list(j)
                            if (u.lower() != objName.lower()) and (u != "")))
                    if "" in entitySet:
                        entitySet.remove("")
                    andmed = {url: {objName: entitySet}}
                    if label == "ORG":
                        if not ontologyData.sharedList_org._callmethod('__contains__', (andmed, )):
                            ontologyData.sharedList_org._callmethod(
                                'append', (andmed, ))
                    elif label == "LOC":
                        if not ontologyData.sharedList_loc._callmethod('__contains__', (andmed, )):
                            ontologyData.sharedList_loc._callmethod('append', (andmed, ))
                    if ontologyData.sharedList_org._callmethod('__len__') > comm.chunksize:
                        try:
                            chunkedList = ontologyData.sharedList_org[:]  # makes a copy, not a reference
                            del ontologyData.sharedList_org[:]
                            orgManager = initRdf.OrganizationManager(ontologyData)
                            orgManager.addTriples(chunkedList)
                        except:
                            comm.printException(comm.initRdfErrorsFilePath, "get_ORG_entities")
                    if ontologyData.sharedList_loc._callmethod('__len__') > comm.chunksize:
                        try:
                            chunkedList = ontologyData.sharedList_loc[:]  # makes a copy, not a reference
                            del ontologyData.sharedList_loc[:]
                            locManager = initRdf.LocationManager(ontologyData)
                            locManager.addTriples(chunkedList)
                        except:
                            comm.printException(comm.initRdfErrorsFilePath, "get_LOC_entities")
    except:
        comm.printException(comm.initRdfErrorsFilePath, "getEntities.py")
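# A small illustration (not from the original source) of the itertools.product step used in
# getEntities above: every word of a named entity has one or more candidate lemmas, and
# product() enumerates every combination so each lemma variant of the full name can be stored.
# The helper name and the word/lemma values are made up.
def exampleLemmaVariants():  # hypothetical helper, for illustration only
    import itertools
    lemmalist = [("tartu",), ("ülikool", "ülikooli")]  # one lemma tuple per word
    return {" ".join(combo) for combo in itertools.product(*lemmalist)}
    # -> {"tartu ülikool", "tartu ülikooli"}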
def detectChanges(jsonFilePath, listOfUrls):
    global nrOfChanges
    global worker_counter
    jsonDict = dict()
    # json files may be malformed; in that case they cannot be loaded into a dictionary
    isValidJsonFile = True
    try:
        # open the json file and load it into a dictionary
        jsonDict = json.load(open(jsonFilePath))
    except:
        isValidJsonFile = False
    # get the URLs from the current dict
    if isValidJsonFile:
        # the hash used below is sha224
        for key in jsonDict.keys():
            # if a key is not 'base_url', it is a filename key; under a filename key there can be,
            # among other metadata, one or more content SHAs plus the URL of that file (document)
            if key != 'base_url':
                # key is the sha of the file's URL (elsewhere stored in the variable localFilename)
                # structure:
                # sha of filename
                #   sha of file content
                #     metadata of the file (filename, sha(file content),
                #     human-readable file url under the key 'file_url', accessed date)
                #   sha of another (updated) file content
                #     metadata ...
                fileSHAs = list(jsonDict[key].keys())  # SHAs of the file content at the times the file was accessed
                arbitrFileSha = fileSHAs[0]  # only used for getting the file URL
                fileUrl = jsonDict[key][arbitrFileSha]["file_url"]
                redirectedTo = 0
                try:
                    redirectedTo = requests.get(fileUrl).url
                except:
                    comm.printException(comm.updateErrorsFilePath, errString="open_url")
                    continue  # continue with the next URL in the loop
                if redirectedTo != 0:
                    # read the document's content at the current moment
                    try:
                        pageread = (requests.get(redirectedTo)).text
                    except:
                        comm.printException(comm.updateErrorsFilePath, errString="pageread1")
                        try:
                            pageread = (requests.get(redirectedTo)).text.encode('utf-8').strip()
                        except Exception as e:
                            comm.printException(comm.updateErrorsFilePath, errString="pageread2")
                            print(e)
                            continue
                    # get the hash of this document
                    fileContentSha224 = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
                    # check whether the content has changed in the meantime
                    if fileContentSha224 not in fileSHAs:
                        # the data has changed: collect the number of changes
                        nrOfChanges += 1
                        # as the content of this document has changed, send its URL to a worker
                        # for entity extraction; fill the list of URLs
                        listOfUrls.append(fileUrl)
                        postListSize = postToWorker.defPostListSize(worker_counter, ipList_tuple)
                        # send a certain amount of URLs to each worker, then empty the list of URLs
                        if len(listOfUrls) == postListSize:
                            # send the list of urls to the worker
                            worker_counter = postToWorker.detectConnection(ipList, worker_counter, listOfUrls)
                            # empty the list of object names to prepare it for the next worker
                            del listOfUrls[:]
                            # prepare the next worker
                            worker_counter += 1
                            if worker_counter > (len(ipList) - 1):
                                # the last worker in the list was used; start over from the first worker
                                worker_counter = 0