Example #1
0
def processline(line):
    """Extract from one crawl-log line the URL that should be processed.

    The line is whitespace-split; field 6 is the content type, field 4 the
    action code, field 5 the base (host) URL and field 3 the file URL.
    (Field layout assumed from the indexing below -- TODO confirm against
    the actual log producer.)

    Returns the URL to fetch, or "" when the line is not relevant.
    """
    splitted = line.split()
    # Guard: a malformed/short line cannot be processed (the original code
    # raised IndexError here).
    if len(splitted) < 7:
        return ""
    tyyp = splitted[6].lower()  #content type
    if not valide.isValideType(tyyp):
        # BUG FIX: previously fell off the end and returned None here while
        # other "nothing to do" paths returned "" -- normalized to "".
        return ""
    action = splitted[4]
    '''
    R - Redirect
    E - Embed
    X - Speculative embed (aggressive/Javascript link extraction)
    L - Link
    P - Prerequisite (as for DNS or robots.txt before another URI)
    '''
    # Speculative embeds ("X") are skipped entirely.
    url = ""
    if "X" not in action:
        if "P" in action:  #dns or robots.txt, send basic url
            url = splitted[5]  #base_url aka host name
        else:  #send url of file
            url = splitted[3]  #file-path followed to host name

    if url != "" and valide.isNeededUrl(url):
        return url
    # BUG FIX: the url != "" / not-needed path also returned None before;
    # every non-URL outcome now consistently returns "" (still falsy, so
    # truthiness-checking callers are unaffected).
    return ""
Example #2
0
def processline(line):
    """Extract from one crawl-log line the URL that should be processed.

    The line is whitespace-split; field 6 is the content type, field 4 the
    action code, field 5 the base (host) URL and field 3 the file URL.
    (Field layout assumed from the indexing below -- TODO confirm against
    the actual log producer.)

    Returns the URL to fetch, or "" when the line is not relevant.
    """
    splitted = line.split()
    # Guard: a malformed/short line cannot be processed (the original code
    # raised IndexError here).
    if len(splitted) < 7:
        return ""
    tyyp = splitted[6].lower()  #content type
    if not valide.isValideType(tyyp):
        # BUG FIX: previously fell off the end and returned None here while
        # other "nothing to do" paths returned "" -- normalized to "".
        return ""
    action = splitted[4]
    '''
    R - Redirect
    E - Embed
    X - Speculative embed (aggressive/Javascript link extraction)
    L - Link
    P - Prerequisite (as for DNS or robots.txt before another URI)
    '''
    # Speculative embeds ("X") are skipped entirely.
    url = ""
    if "X" not in action:
        if "P" in action:  #dns or robots.txt, send basic url
            url = splitted[5]  #base_url aka host name
        else:  #send url of file
            url = splitted[3]  #file-path followed to host name

    if url != "" and valide.isNeededUrl(url):
        return url
    # BUG FIX: the url != "" / not-needed path also returned None before;
    # every non-URL outcome now consistently returns "" (still falsy, so
    # truthiness-checking callers are unaffected).
    return ""
def saveMetadata(url, od):
    """Fetch the document at *url*, collect its HTTP metadata and, when the
    content is new or has changed, hand the document over to the parser.

    Parameters:
        url: address of the web document to fetch.
        od:  opaque options object forwarded to comm.isDesiredContent and
             sendFileToParser (semantics defined elsewhere -- not
             inspected here).

    No return value; side effects only: writes error logs via
    comm.printException, creates the local jsons directory, and spawns a
    python2 subprocess (storage/updateObj.py) to persist the metadata.
    """
    #save the result of trying of opening a page into variable 'canOpen'
    canOpen = True
    try:
        #try to open document at URL (resolving redirects)
        redirectedTo = requests.get(url).url
    except ConnectionError:
        # NOTE(review): this catches the *builtin* ConnectionError; requests
        # raises requests.exceptions.ConnectionError, which is not a subclass
        # of it, so connection failures actually fall through to the handler
        # below -- behavior kept as-is, confirm intent.
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToConnectionErrors, errStr)
    except Exception:
        # BUG FIX: was a bare 'except:', which also swallowed SystemExit
        # and KeyboardInterrupt.
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)

    #continue only if the document could be opened
    if not canOpen:
        return
    #continue only if the (post-redirect) url is valid
    if not valide.isNeededUrl(redirectedTo):
        return
    if not os.path.isdir(comm.jsonsDir):
        os.makedirs(comm.jsonsDir)
    try:
        #in following, use only the URL one was redirected to, if at all
        page = requests.get(redirectedTo)
        statusCode = page.status_code
        #textual content of the document
        pageread = page.text
        #response headers double as the doc's metadata
        pageInfo = dict(page.headers)
        #local storage filename is the hash of the doc URL
        localFilename = comm.getUrlSHA(redirectedTo)
        #important metadata: content type
        contentType = page.headers['content-type']
        #base_url denotes the host name; all documents from the same host
        #are saved into the same json file, whose name is the host name
        baseUrl = None
        if valide.isValideType(contentType):
            baseUrl = urlparse(redirectedTo).netloc

        if baseUrl is not None:
            #content hash, used later to detect whether the document
            #changed (monthly update / URL encountered again)
            sha224_ = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
            #important data for parsers: encoding
            _encoding = page.encoding

            #exclude doc types with no extractable text (images, videos)
            if comm.isDesiredContent(contentType, od):
                #jsonsDir is a 'bucket' name in Google Cloud Storage
                jsonsDir = comm.jsonsDir
                print(jsonsDir)
                #jsonsFile is the 'object' inside the bucket:
                #the URL's host name with a '.json' extension
                jsonsFile = baseUrl + ".json"
                #address of this metadata object, as a json string
                jsonsPath = {"object": jsonsFile, "bucket": jsonsDir}
                pathToSaveMetadata_ = json.dumps(jsonsPath)
                #collect the metadata into a dictionary
                infoDict_tmp = {"base_url": baseUrl}
                infoDict = insertValuesToDict(
                    infoDict_tmp, localFilename, redirectedTo,
                    pageInfo, sha224_, statusCode)
                jsondata = json.dumps(infoDict, indent=4)
                #payload for 'updateObj.py'
                insertJson = {
                    "jsondata": jsondata,
                    "localFilename": localFilename,
                    "redirectedTo": redirectedTo,
                    "pageInfo": pageInfo,
                    "sha224_": sha224_,
                    "statusCode": statusCode,
                    "timeDir": comm.timeDir,
                    "address": pathToSaveMetadata_,
                }

                #'someNewData' is True when this doc was never processed
                #before, or was processed and its content has changed
                someNewData = False
                #unique error message, if any
                errr = ""
                try:
                    jd = json.dumps(insertJson)
                    #communication with google-python-api-client is done
                    #via python2
                    p = subprocess.Popen(
                        ["python2", "storage/updateObj.py", jd],
                        stdout=subprocess.PIPE)
                    out, err = p.communicate()
                    # BUG FIX: out.decode() is a string, so any non-empty
                    # output -- including "False" -- was truthy and
                    # triggered parsing; parse it into a real boolean.
                    # (Assumes updateObj.py prints "True"/"False" -- TODO
                    # confirm its output format.)
                    someNewData = out.decode().strip().lower() in ("true", "1")
                    errr = str(err).lower()
                    print("\nsomeNewData " + str(someNewData))
                    print("\nerrr " + str(errr))
                except Exception:
                    # BUG FIX: logical 'and' instead of bitwise '&'
                    errstr = errr if (errr != "" and errr != "none") \
                        else "storage-updateObj.py-ERROR"
                    comm.printException(comm.pathToSaveJsonErrors, errstr)

                #parse the doc only when new data was detected
                if someNewData:
                    sendFileToParser(contentType, baseUrl,
                                     redirectedTo, od, _encoding,
                                     localFilename, pageread)
    #record errors (order matters: HTTPError < URLError < IOError)
    except urr.HTTPError as e:
        errStr = (redirectedTo + " HTTPError " + str(e.code) + " " +
                  str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except urr.URLError as e:
        errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except IOError as e:
        errStr = (redirectedTo + " " + str(
            "I/O_erRror({0}):_{1}".format(e.errno, e.strerror)) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except ValueError:
        errStr = (redirectedTo +
                  " ValueError_Could_not_convert_data_to_an_integer.\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except TypeError:
        errStr = (redirectedTo + " TypeError\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except Exception:
        # BUG FIX: was a bare 'except:'
        errStr = (redirectedTo + " Unexpected_error:_" +
                  str(sys.exc_info()[0]) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
def saveMetadata(url, od):
    """Fetch the document at *url*, collect its HTTP metadata and, when the
    content is new or has changed, hand the document over to the parser.

    Parameters:
        url: address of the web document to fetch.
        od:  opaque options object forwarded to comm.isDesiredContent and
             sendFileToParser (semantics defined elsewhere -- not
             inspected here).

    No return value; side effects only: writes error logs via
    comm.printException, creates the local jsons directory, and spawns a
    python2 subprocess (storage/updateObj.py) to persist the metadata.
    """
    #save the result of trying of opening a page into variable 'canOpen'
    canOpen = True
    try:
        #try to open document at URL (resolving redirects)
        redirectedTo = requests.get(url).url
    except ConnectionError:
        # NOTE(review): this catches the *builtin* ConnectionError; requests
        # raises requests.exceptions.ConnectionError, which is not a subclass
        # of it, so connection failures actually fall through to the handler
        # below -- behavior kept as-is, confirm intent.
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToConnectionErrors, errStr)
    except Exception:
        # BUG FIX: was a bare 'except:', which also swallowed SystemExit
        # and KeyboardInterrupt.
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)

    #continue only if the document could be opened
    if not canOpen:
        return
    #continue only if the (post-redirect) url is valid
    if not valide.isNeededUrl(redirectedTo):
        return
    if not os.path.isdir(comm.jsonsDir):
        os.makedirs(comm.jsonsDir)
    try:
        #in following, use only the URL one was redirected to, if at all
        page = requests.get(redirectedTo)
        statusCode = page.status_code
        #textual content of the document
        pageread = page.text
        #response headers double as the doc's metadata
        pageInfo = dict(page.headers)
        #local storage filename is the hash of the doc URL
        localFilename = comm.getUrlSHA(redirectedTo)
        #important metadata: content type
        contentType = page.headers['content-type']
        #base_url denotes the host name; all documents from the same host
        #are saved into the same json file, whose name is the host name
        baseUrl = None
        if valide.isValideType(contentType):
            baseUrl = urlparse(redirectedTo).netloc

        if baseUrl is not None:
            #content hash, used later to detect whether the document
            #changed (monthly update / URL encountered again)
            sha224_ = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
            #important data for parsers: encoding
            _encoding = page.encoding

            #exclude doc types with no extractable text (images, videos)
            if comm.isDesiredContent(contentType, od):
                #jsonsDir is a 'bucket' name in Google Cloud Storage
                jsonsDir = comm.jsonsDir
                print(jsonsDir)
                #jsonsFile is the 'object' inside the bucket:
                #the URL's host name with a '.json' extension
                jsonsFile = baseUrl + ".json"
                #address of this metadata object, as a json string
                jsonsPath = {"object": jsonsFile, "bucket": jsonsDir}
                pathToSaveMetadata_ = json.dumps(jsonsPath)
                #collect the metadata into a dictionary
                infoDict_tmp = {"base_url": baseUrl}
                infoDict = insertValuesToDict(
                    infoDict_tmp, localFilename, redirectedTo,
                    pageInfo, sha224_, statusCode)
                jsondata = json.dumps(infoDict, indent=4)
                #payload for 'updateObj.py'
                insertJson = {
                    "jsondata": jsondata,
                    "localFilename": localFilename,
                    "redirectedTo": redirectedTo,
                    "pageInfo": pageInfo,
                    "sha224_": sha224_,
                    "statusCode": statusCode,
                    "timeDir": comm.timeDir,
                    "address": pathToSaveMetadata_,
                }

                #'someNewData' is True when this doc was never processed
                #before, or was processed and its content has changed
                someNewData = False
                #unique error message, if any
                errr = ""
                try:
                    jd = json.dumps(insertJson)
                    #communication with google-python-api-client is done
                    #via python2
                    p = subprocess.Popen(
                        ["python2", "storage/updateObj.py", jd],
                        stdout=subprocess.PIPE)
                    out, err = p.communicate()
                    # BUG FIX: out.decode() is a string, so any non-empty
                    # output -- including "False" -- was truthy and
                    # triggered parsing; parse it into a real boolean.
                    # (Assumes updateObj.py prints "True"/"False" -- TODO
                    # confirm its output format.)
                    someNewData = out.decode().strip().lower() in ("true", "1")
                    errr = str(err).lower()
                    print("\nsomeNewData " + str(someNewData))
                    print("\nerrr " + str(errr))
                except Exception:
                    # BUG FIX: logical 'and' instead of bitwise '&'
                    errstr = errr if (errr != "" and errr != "none") \
                        else "storage-updateObj.py-ERROR"
                    comm.printException(comm.pathToSaveJsonErrors, errstr)

                #parse the doc only when new data was detected
                if someNewData:
                    sendFileToParser(contentType, baseUrl,
                                     redirectedTo, od, _encoding,
                                     localFilename, pageread)
    #record errors (order matters: HTTPError < URLError < IOError)
    except urr.HTTPError as e:
        errStr = (redirectedTo + " HTTPError " + str(e.code) + " " +
                  str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except urr.URLError as e:
        errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except IOError as e:
        errStr = (redirectedTo + " " + str(
            "I/O_erRror({0}):_{1}".format(e.errno, e.strerror)) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except ValueError:
        errStr = (redirectedTo +
                  " ValueError_Could_not_convert_data_to_an_integer.\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except TypeError:
        errStr = (redirectedTo + " TypeError\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
    except Exception:
        # BUG FIX: was a bare 'except:'
        errStr = (redirectedTo + " Unexpected_error:_" +
                  str(sys.exc_info()[0]) + "\n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)