def processline(line):
    """Pick the URL to download from one whitespace-separated crawl-log line.

    Columns used (0-based, after ``line.split()``):
      3 - URL of the document itself (file path following the host name)
      4 - action flags describing how the URL was discovered
      5 - base URL, i.e. the host name
      6 - content type

    Action flags:
      R - Redirect
      E - Embed
      X - Speculative embed (aggressive/Javascript link extraction)
      L - Link
      P - Prerequisite (as for DNS or robots.txt before another URI)

    Args:
        line: One line of the crawl log.

    Returns:
        The chosen URL when the content type passes ``valide.isValideType``,
        the action is not speculative ("X") and the URL passes
        ``valide.isNeededUrl``; otherwise the empty string. Every rejection
        path returns "" so callers get a single "nothing to do" value.
    """
    fields = line.split()
    # Blank or truncated lines have no content-type column; there is nothing
    # to download, so reject them instead of failing with IndexError.
    if len(fields) < 7:
        return ""

    tyyp = fields[6].lower()  # content type
    if not valide.isValideType(tyyp):
        return ""

    action = fields[4]
    # Speculative embeds are never followed.
    if "X" in action:
        return ""

    # Prerequisites (DNS / robots.txt) are addressed by host name only;
    # everything else by the URL of the document itself.
    url = fields[5] if "P" in action else fields[3]

    if url != "" and valide.isNeededUrl(url):
        return url
    return ""
def processline(line):
    """Pick the URL to download from one whitespace-separated crawl-log line.

    Columns used (0-based, after ``line.split()``):
      3 - URL of the document itself (file path following the host name)
      4 - action flags describing how the URL was discovered
      5 - base URL, i.e. the host name
      6 - content type

    Action flags:
      R - Redirect
      E - Embed
      X - Speculative embed (aggressive/Javascript link extraction)
      L - Link
      P - Prerequisite (as for DNS or robots.txt before another URI)

    Args:
        line: One line of the crawl log.

    Returns:
        The chosen URL when the content type passes ``valide.isValideType``,
        the action is not speculative ("X") and the URL passes
        ``valide.isNeededUrl``; otherwise the empty string. Every rejection
        path returns "" so callers get a single "nothing to do" value.
    """
    fields = line.split()
    # Blank or truncated lines have no content-type column; there is nothing
    # to download, so reject them instead of failing with IndexError.
    if len(fields) < 7:
        return ""

    tyyp = fields[6].lower()  # content type
    if not valide.isValideType(tyyp):
        return ""

    action = fields[4]
    # Speculative embeds are never followed.
    if "X" in action:
        return ""

    # Prerequisites (DNS / robots.txt) are addressed by host name only;
    # everything else by the URL of the document itself.
    url = fields[5] if "P" in action else fields[3]

    if url != "" and valide.isNeededUrl(url):
        return url
    return ""
def saveMetadata(url, od):
    """Download ``url``, register its metadata and parse it when it is new.

    Steps:
      1. Request ``url`` once to learn the address it finally redirects to.
      2. Request that final address, collect status code, headers, text and
         a SHA-224 hash of the text (the hash is what later change detection,
         e.g. a monthly update, compares against).
      3. Send everything as one JSON argument to ``storage/updateObj.py``
         (run under ``python2``), which answers on stdout whether the document
         is new or has changed.
      4. Only when it reports new data, pass the document on to
         ``sendFileToParser``.

    All failures are written through ``comm.printException`` and the function
    then simply returns; nothing is raised to the caller for ordinary errors.

    Args:
        url: Address of the web document to process.
        od: Passed unchanged to ``comm.isDesiredContent`` and
            ``sendFileToParser``. NOTE(review): its meaning is not visible in
            this function - confirm against the callers.

    Returns:
        None.
    """
    # --- 1. resolve redirects; give up (after logging) if the page is unreachable
    try:
        redirectedTo = requests.get(url).url
    except ConnectionError:
        comm.printException(comm.pathToConnectionErrors,
                            url + " Cannot_open_web-source \n")
        return
    except Exception:
        # 'Exception' rather than a bare except, so Ctrl-C / SystemExit still
        # stop the program instead of being logged as an unreachable page.
        comm.printException(comm.pathToSaveJsonErrors,
                            url + " Cannot_open_web-source \n")
        return

    # continue only if the final URL is one we want
    if not valide.isNeededUrl(redirectedTo):
        return

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(comm.jsonsDir, exist_ok=True)

    try:
        # --- 2. from here on use only the URL one was redirected to
        page = requests.get(redirectedTo)
        statusCode = page.status_code
        pageread = page.text              # textual content of the document
        pageInfo = dict(page.headers)     # the document's metadata
        # file name for local storage: the hash of the document URL
        localFilename = comm.getUrlSHA(redirectedTo)

        # important metadata: content type
        contentType = page.headers['content-type']
        if not valide.isValideType(contentType):
            return

        # The host name groups documents: all documents of one host go into
        # the same json object, which is named after the host.
        baseUrl = urlparse(redirectedTo).netloc

        # Hash of the content, used later to detect whether it has changed.
        sha224_ = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
        # important data for parsers: encoding
        _encoding = page.encoding

        # Skip types with no extractable text (e.g. images, videos).
        if not comm.isDesiredContent(contentType, od):
            return

        # jsonsDir is the 'bucket' name in Google Cloud Storage and
        # '<host>.json' the 'object' inside that bucket.
        jsonsDir = comm.jsonsDir
        print(jsonsDir)
        pathToSaveMetadata_ = json.dumps({
            "object": baseUrl + ".json",   # 'hostname.json'
            "bucket": jsonsDir,            # e.g. 'datadownload_json'
        })

        # metadata of this document as a json string
        infoDict = insertValuesToDict({"base_url": baseUrl}, localFilename,
                                      redirectedTo, pageInfo, sha224_,
                                      statusCode)
        jsondata = json.dumps(infoDict, indent=4)

        # everything 'updateObj.py' needs, sent as one json argument
        insertJson = {
            "jsondata": jsondata,
            "localFilename": localFilename,
            "redirectedTo": redirectedTo,
            "pageInfo": pageInfo,
            "sha224_": sha224_,
            "statusCode": statusCode,
            "timeDir": comm.timeDir,
            "address": pathToSaveMetadata_,
        }

        # --- 3. ask the storage layer whether this document is new/changed.
        # 'someNewData' stays False when the URL was processed earlier and its
        # content is unchanged; it becomes True for changed or unseen documents.
        someNewData = False
        errr = ""  # unique error message, if any
        try:
            jd = json.dumps(insertJson)
            # google-python-api-client is driven through the older python2.7
            p = subprocess.Popen(["python2", "storage/updateObj.py", jd],
                                 stdout=subprocess.PIPE)
            out, err = p.communicate()
            answer = out.decode()
            # stderr is not piped, so 'err' is None and 'errr' becomes "none"
            errr = str(err).lower()
            # The reply is text: a printed "False" is a non-empty string and
            # would count as true if tested directly, so map the negative
            # replies to False explicitly.
            someNewData = answer.strip().lower() not in ("", "false", "0", "none")
            print("\nsomeNewData " + str(answer))
            print("\nerrr " + str(errr))
        except Exception:
            errstr = errr if (errr != "" and errr != "none") else "storage-updateObj.py-ERROR"
            comm.printException(comm.pathToSaveJsonErrors, errstr)

        # --- 4. parse the document only when new data was detected
        if someNewData:
            sendFileToParser(contentType, baseUrl, redirectedTo, od,
                             _encoding, localFilename, pageread)
    # record errors
    except urr.HTTPError as e:
        errStr = (redirectedTo + " HTTPError " + str(e.code) + " " +
                  str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except urr.URLError as e:
        errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except IOError as e:
        errStr = (redirectedTo + " " +
                  "I/O_erRror({0}):_{1}".format(e.errno, e.strerror) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except ValueError:
        errStr = (redirectedTo +
                  " ValueError_Could_not_convert_data_to_an_integer.\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except TypeError:
        errStr = (redirectedTo + " TypeError\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except Exception as e:
        errStr = (redirectedTo + " Unexpected_error:_" + str(type(e)) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
def saveMetadata(url, od):
    """Download ``url``, register its metadata and parse it when it is new.

    Steps:
      1. Request ``url`` once to learn the address it finally redirects to.
      2. Request that final address, collect status code, headers, text and
         a SHA-224 hash of the text (the hash is what later change detection,
         e.g. a monthly update, compares against).
      3. Send everything as one JSON argument to ``storage/updateObj.py``
         (run under ``python2``), which answers on stdout whether the document
         is new or has changed.
      4. Only when it reports new data, pass the document on to
         ``sendFileToParser``.

    All failures are written through ``comm.printException`` and the function
    then simply returns; nothing is raised to the caller for ordinary errors.

    Args:
        url: Address of the web document to process.
        od: Passed unchanged to ``comm.isDesiredContent`` and
            ``sendFileToParser``. NOTE(review): its meaning is not visible in
            this function - confirm against the callers.

    Returns:
        None.
    """
    # --- 1. resolve redirects; give up (after logging) if the page is unreachable
    try:
        redirectedTo = requests.get(url).url
    except ConnectionError:
        comm.printException(comm.pathToConnectionErrors,
                            url + " Cannot_open_web-source \n")
        return
    except Exception:
        # 'Exception' rather than a bare except, so Ctrl-C / SystemExit still
        # stop the program instead of being logged as an unreachable page.
        comm.printException(comm.pathToSaveJsonErrors,
                            url + " Cannot_open_web-source \n")
        return

    # continue only if the final URL is one we want
    if not valide.isNeededUrl(redirectedTo):
        return

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(comm.jsonsDir, exist_ok=True)

    try:
        # --- 2. from here on use only the URL one was redirected to
        page = requests.get(redirectedTo)
        statusCode = page.status_code
        pageread = page.text              # textual content of the document
        pageInfo = dict(page.headers)     # the document's metadata
        # file name for local storage: the hash of the document URL
        localFilename = comm.getUrlSHA(redirectedTo)

        # important metadata: content type
        contentType = page.headers['content-type']
        if not valide.isValideType(contentType):
            return

        # The host name groups documents: all documents of one host go into
        # the same json object, which is named after the host.
        baseUrl = urlparse(redirectedTo).netloc

        # Hash of the content, used later to detect whether it has changed.
        sha224_ = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
        # important data for parsers: encoding
        _encoding = page.encoding

        # Skip types with no extractable text (e.g. images, videos).
        if not comm.isDesiredContent(contentType, od):
            return

        # jsonsDir is the 'bucket' name in Google Cloud Storage and
        # '<host>.json' the 'object' inside that bucket.
        jsonsDir = comm.jsonsDir
        print(jsonsDir)
        pathToSaveMetadata_ = json.dumps({
            "object": baseUrl + ".json",   # 'hostname.json'
            "bucket": jsonsDir,            # e.g. 'datadownload_json'
        })

        # metadata of this document as a json string
        infoDict = insertValuesToDict({"base_url": baseUrl}, localFilename,
                                      redirectedTo, pageInfo, sha224_,
                                      statusCode)
        jsondata = json.dumps(infoDict, indent=4)

        # everything 'updateObj.py' needs, sent as one json argument
        insertJson = {
            "jsondata": jsondata,
            "localFilename": localFilename,
            "redirectedTo": redirectedTo,
            "pageInfo": pageInfo,
            "sha224_": sha224_,
            "statusCode": statusCode,
            "timeDir": comm.timeDir,
            "address": pathToSaveMetadata_,
        }

        # --- 3. ask the storage layer whether this document is new/changed.
        # 'someNewData' stays False when the URL was processed earlier and its
        # content is unchanged; it becomes True for changed or unseen documents.
        someNewData = False
        errr = ""  # unique error message, if any
        try:
            jd = json.dumps(insertJson)
            # google-python-api-client is driven through the older python2.7
            p = subprocess.Popen(["python2", "storage/updateObj.py", jd],
                                 stdout=subprocess.PIPE)
            out, err = p.communicate()
            answer = out.decode()
            # stderr is not piped, so 'err' is None and 'errr' becomes "none"
            errr = str(err).lower()
            # The reply is text: a printed "False" is a non-empty string and
            # would count as true if tested directly, so map the negative
            # replies to False explicitly.
            someNewData = answer.strip().lower() not in ("", "false", "0", "none")
            print("\nsomeNewData " + str(answer))
            print("\nerrr " + str(errr))
        except Exception:
            errstr = errr if (errr != "" and errr != "none") else "storage-updateObj.py-ERROR"
            comm.printException(comm.pathToSaveJsonErrors, errstr)

        # --- 4. parse the document only when new data was detected
        if someNewData:
            sendFileToParser(contentType, baseUrl, redirectedTo, od,
                             _encoding, localFilename, pageread)
    # record errors
    except urr.HTTPError as e:
        errStr = (redirectedTo + " HTTPError " + str(e.code) + " " +
                  str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except urr.URLError as e:
        errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except IOError as e:
        errStr = (redirectedTo + " " +
                  "I/O_erRror({0}):_{1}".format(e.errno, e.strerror) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except ValueError:
        errStr = (redirectedTo +
                  " ValueError_Could_not_convert_data_to_an_integer.\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except TypeError:
        errStr = (redirectedTo + " TypeError\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except Exception as e:
        errStr = (redirectedTo + " Unexpected_error:_" + str(type(e)) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)