import datetime
import hashlib
import json
import os

import requests

#comm, postToWorker, ipList, ipList_tuple, worker_counter, nrOfChanges,
#start, currTime and lenIpList are defined or imported elsewhere and are
#not shown in this excerpt


def detectChanges(jsonFilePath, listOfUrls):
    global nrOfChanges
    global worker_counter
    #print(jsonFilePath)
    #print("--------------------")
    #print(str(len(listOfUrls)))
    #print("--------------------")
    jsonDict = dict()
    #json files may be malformed;
    #in that case they cannot be loaded into a dictionary
    isValidJsonFile = True
    try:
        #open the json file and load it into a dictionary
        with open(jsonFilePath) as jsonFile:
            jsonDict = json.load(jsonFile)
    except (OSError, ValueError):
        isValidJsonFile = False
    #print(str(isValidJsonFile))
    #get URL from the current dict
    if(isValidJsonFile):
        #hash is sha224
        for key in jsonDict.keys():
            #every key other than 'base_url' is the sha of a file's URL
            #(stored elsewhere in the variable localFilename);
            #under that key there are one or more content SHAs, each holding
            #metadata and the URL of that file (document) on the web
            if(key != 'base_url'):
                #structure:
                #sha of filename
                ###sha of file content
                ######metadata of the file (filename, sha of file content,
                ###### human-readable file url under key 'file_url', accessed date)
                ###sha of another (updated) version of the file content
                ######metadata...
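                #purely illustrative sketch of that layout; every key and
                #value below is a made-up placeholder, not real data:
                #{
                #  "base_url": "http://example.org/",
                #  "<sha224 of file URL>": {
                #      "<sha224 of content, version 1>": {
                #          "file_url": "http://example.org/doc.pdf",
                #          "filename": "doc.pdf",
                #          "accessed": "2015-01-01"
                #      },
                #      "<sha224 of content, version 2>": { ... }
                #  }
                #}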
                fileSHAs = list(jsonDict[key].keys()) #SHAs of the file content, one per time the file was accessed
                arbitrFileSha = fileSHAs[0] #any entry will do; it is only used to read the file URL
                fileUrl = jsonDict[key][arbitrFileSha]["file_url"]
                redirectedTo = None
                try:
                    #requests follows redirects by default, so .url is the final address of the document
                    redirectedTo = requests.get(fileUrl).url
                except Exception:
                    comm.printException(comm.updateErrorsFilePath, errString="open_url")
                    continue #continue with the next URL in the loop
                if(redirectedTo is not None):
                    #print(str(redirectedTo))
                    #read the doc's content at current moment
                    try:
                        pageread = requests.get(redirectedTo).text
                    except Exception:
                        comm.printException(comm.updateErrorsFilePath, errString="pageread1")
                        try:
                            #retry and decode the raw bytes explicitly so that
                            #pageread is always a str for the hashing step below
                            pageread = requests.get(redirectedTo).content.decode('utf-8', 'replace').strip()
                        except Exception as e:
                            comm.printException(comm.updateErrorsFilePath, errString="pageread2")
                            print(e)
                            continue
                    #get the sha224 hash of the current content of the doc
                    fileContentSha224 = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
                    #check whether the content has changed in the meantime
                    if(fileContentSha224 not in fileSHAs): #the data has changed
                        #count the change
                        nrOfChanges += 1
                        #as the content of this doc has changed, queue its URL
                        #so a worker can re-extract entities from it
                        listOfUrls.append(fileUrl)
                        postListSize = postToWorker.defPostListSize(worker_counter, ipList_tuple)
                        #send certain amount of URLs to each worker, then empty the list of URLS
                        if(len(listOfUrls) == postListSize):
                            #send list of urls to worker
                            worker_counter = postToWorker.detectConnection(ipList, worker_counter, listOfUrls)
                            #empty list of object names
                            #to prepare it for the next worker
                            del listOfUrls[:] 
                            #prepare next worker
                            worker_counter += 1
                            if (worker_counter > (len(ipList)-1)):#last worker in the workers list
                                #start over from first worker in the workers list
                                worker_counter = 0
#the lines below come from the caller of detectChanges(); dirpath, fname and
#jsonpath are produced by an enclosing walk over the folder of json files
#that is not part of this excerpt
        if(jsonpath != ""):
            try:
                detectChanges(os.path.join(dirpath, fname), listOfUrls)
            except:
                comm.printException(comm.updateErrorsFilePath, errString="start_detect")

#post the rest of the list to the worker
if(len(listOfUrls) > 0):
    #if the last worker was just used, start again from the first in the workers list, else use the next one
    if(worker_counter == (len(ipList)-1)): #last worker was recently used
        worker_counter = 0
    else:
        worker_counter += 1

    #send the remaining list of URLs to the worker
    worker_counter = postToWorker.detectConnection(ipList, worker_counter, listOfUrls)

else:
    comm.printException(comm.updateErrorsFilePath, errString="no_json-files_for_updating")

#save the finishing time of the update process to measure how long it took
end = datetime.datetime.now()
span = end - start
try:
    with open(comm.monthly_updates_path, 'a') as jf:
        jf.write("update-process " + currTime + " " + str(span) + " " + str(nrOfChanges) + " " + str(lenIpList) + " ")
except:
    comm.printException(comm.updateErrorsFilePath, errString="update")
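The recursive call above uses dirpath, fname and jsonpath from a surrounding loop that is not part of this listing. A minimal sketch of what such a driver could look like, assuming the JSON files live under a hypothetical folder and reusing only the names that do appear above:

import os

#hypothetical location of the crawl-state json files; the real path is not
#shown in this excerpt
jsonFilesRoot = "/path/to/json-files"

listOfUrls = []
for dirpath, dirnames, filenames in os.walk(jsonFilesRoot):
    for fname in filenames:
        jsonpath = os.path.join(dirpath, fname)
        if(jsonpath != ""):
            try:
                detectChanges(jsonpath, listOfUrls)
            except:
                comm.printException(comm.updateErrorsFilePath, errString="start_detect")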
 
Example #6
import time

#mylargefile, processline, nr_of_log_rows, line_counter, urlsList,
#worker_counter, ipList, ipList_tuple, postToWorker and comm are defined
#or imported elsewhere and are not shown in this excerpt
distinct_urls = set() #every URL seen so far, so each URL is posted to a worker only once
with open(mylargefile) as f:
    for line in f:
        nr_of_log_rows += 1 #for statistics
        plineUrl = processline(line)
        if(plineUrl is not None) and (plineUrl != ""):
            if(plineUrl not in distinct_urls) and ('icomoon' not in plineUrl.lower()) and ('hobekivi' not in plineUrl.lower()):
                distinct_urls.add(plineUrl)
                #delete_counter += 1
                line_counter += 1
                urlsList.append(plineUrl)
                postListSize = postToWorker.defPostListSize(worker_counter, ipList_tuple)
                #send a certain amount of URLs to each worker, then empty the list of URLs
                if(len(urlsList) == postListSize):
                    #post the list, retrying until a connection to a worker succeeds
                    worker_counter = postToWorker.detectConnection(ipList, worker_counter, urlsList)

                    #log the post request
                    with open(comm.postreq_path, 'a') as jf:
                        jf.write(time.strftime("%d/%m/%Y_%H:%M:%S") + " just posted to: " + str(ipList[worker_counter]) + "\n")

                    del urlsList[:] #empty the list of urls
                    #prepare the next worker
                    worker_counter += 1
                    if (worker_counter > (len(ipList)-1)):
                        #start over from the first worker in the list
                        worker_counter = 0
        #do not let the set of seen URLs use too much memory
        if (len(distinct_urls) > 1000):
            distinct_urls.clear() #the original body is truncated here; clearing the set is an assumed way to bound memory use
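Both snippets above delegate batching to postToWorker.defPostListSize and postToWorker.detectConnection, whose implementations are not included in this listing. Purely to illustrate the round-robin batch-dispatch pattern they rely on, a self-contained sketch with hypothetical stand-ins (worker addresses, batch size and the HTTP post are all made up):

import requests

workers = ["http://10.0.0.1:5000", "http://10.0.0.2:5000"] #hypothetical worker addresses
BATCH_SIZE = 50 #hypothetical batch size

def post_batch(worker_url, urls):
    #hypothetical stand-in for posting one batch of URLs to a worker
    try:
        requests.post(worker_url, json={"urls": urls}, timeout=10)
        return True
    except requests.RequestException:
        return False

def dispatch(all_urls):
    #round-robin: fill a batch, post it to the current worker,
    #then move on to the next worker, wrapping around at the end
    worker_counter = 0
    batch = []
    for url in all_urls:
        batch.append(url)
        if len(batch) == BATCH_SIZE:
            post_batch(workers[worker_counter], batch)
            batch = []
            worker_counter = (worker_counter + 1) % len(workers)
    if batch: #post whatever is left over
        post_batch(workers[worker_counter], batch)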