Python preprocess Examples

Programming Language: Python

Namespace/Package Name: processdata

Method/Function: preprocess

Examples at hotexamples.com: 4

Python preprocess - 4 examples found. These are the top-rated real-world Python examples of processdata.preprocess extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: scrapper.py Project: Nitss10/Data-Processing

def get_static_text_content(url):
    """Scrape static content from ``url`` and preprocess it.

    The page is fetched with ``requests``, its text is extracted with
    ``text_from_html`` and passed through ``processdata.preprocess``. When
    ``get_about_url`` yields a second URL, that page is fetched and its
    preprocessed text is appended to the same list.

    Args:
        url: Address of the page to fetch.

    Returns:
        list: The items produced by ``processdata.preprocess``. If the first
        item is ``"invalidcontentfound"`` the list is returned right after
        the main page is processed. On any failure the items collected so
        far are returned (possibly an empty list).
    """
    content = []

    try:
        with Timeout(30):
            # NOTE(review): verify=False disables TLS certificate
            # verification -- confirm this is intentional.
            res = requests.get(url, headers=headers, verify=False, timeout=10)
            content.extend(processdata.preprocess(text_from_html(res.text)))
            if content and content[0] == "invalidcontentfound":
                return content

            abt_url = get_about_url(res.text, url)
            if abt_url is not None:
                res = requests.get(abt_url,
                                   headers=headers,
                                   verify=False,
                                   timeout=10)
                content.extend(
                    processdata.preprocess(text_from_html(res.text)))
    except (KeyboardInterrupt, SystemExit):
        # Never swallow a user interrupt or an interpreter shutdown.
        raise
    except BaseException:
        # Best effort: fall through and return whatever was collected.
        # BaseException (not Exception) is kept on purpose: the Timeout used
        # above may derive from BaseException, as eventlet/gevent timeouts
        # do -- TODO confirm which Timeout this module imports.
        pass
    return content

Example #2

Show file

File: scrapper.py Project: StTronn/web_cat_server

def get_dynamic_text_content(url):
    """Scrape dynamic content from ``url`` and preprocess it.

    The page is loaded through the module-level ``browser``, its rendered
    source is converted to text with ``text_from_html`` and passed through
    ``processdata.preprocess``. When ``get_about_url`` yields a second URL,
    that page is loaded and its preprocessed text is appended as well.

    Args:
        url: Address of the page to load.

    Returns:
        list: The items produced by ``processdata.preprocess``. If the first
        item is ``"invalidcontentfound"`` the list is returned right after
        the main page is processed. On any error the items collected so far
        are returned (possibly an empty list).
    """
    content = []
    try:
        browser.get(url)
        content.extend(
            processdata.preprocess(text_from_html(browser.page_source)))
        if content and content[0] == "invalidcontentfound":
            return content

        abt_url = get_about_url(browser.page_source, url)
        if abt_url is not None:
            browser.get(abt_url)
            content.extend(
                processdata.preprocess(text_from_html(browser.page_source)))
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        pass
    return content

Example #3

Show file

File: scrapper.py Project: Pratikmehta1729/Data-Processing

def get_dynamic_text_content(url):
    """Scrape dynamic content from ``url`` and preprocess it.

    The page is loaded through the module-level ``browser``, its rendered
    source is converted to text with ``text_from_html`` and passed through
    ``processdata.preprocess``. When ``get_about_url`` yields a second URL,
    that page is loaded and its preprocessed text is appended as well.

    Args:
        url: Address of the page to load.

    Returns:
        list: The items produced by ``processdata.preprocess`` on success.
        ``['ERROR_MSG', 'Non English']`` when the first preprocessed item is
        ``"invalidcontentfound"``. ``['ERROR_MSG', 'site take too long to
        complete request']`` when any exception is raised, whatever its
        actual cause.
    """
    content = []
    try:
        browser.get(url)
        content.extend(
            processdata.preprocess(text_from_html(browser.page_source)))
        if content and content[0] == "invalidcontentfound":
            return ['ERROR_MSG', 'Non English']

        abt_url = get_about_url(browser.page_source, url)
        if abt_url is not None:
            browser.get(abt_url)
            content.extend(
                processdata.preprocess(text_from_html(browser.page_source)))
        return content
    except Exception:
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate. The message is kept as-is because callers
        # may match on it, even though the failure is not always a timeout.
        return ['ERROR_MSG', 'site take too long to complete request']

Example #4

Show file

File: scrapper.py Project: Pratikmehta1729/Data-Processing

def get_static_text_content(url, timeout=5):
    """Scrape static content from ``url`` and preprocess it.

    The page is fetched with ``requests`` (redirects followed, HTTP error
    statuses raised), its text is extracted with ``text_from_html`` and
    passed through ``processdata.preprocess``. When ``get_about_url`` yields
    a second URL, that page is fetched on a best-effort basis and its
    preprocessed text is appended.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout passed to each ``requests.get`` call (default 5).

    Returns:
        list: The items produced by ``processdata.preprocess`` on success
        (the about page is skipped silently if it fails). Otherwise a
        two-item list ``['ERROR_MSG', <message>]``, where the message is
        ``'Non English'`` when the first preprocessed item is
        ``"invalidcontentfound"``, or a description of the ``requests``
        error raised while fetching the main page.
    """
    content = []

    try:
        # NOTE(review): verify=False disables TLS certificate verification
        # -- confirm this is intentional.
        res = requests.get(url,
                           headers=headers,
                           verify=False,
                           timeout=timeout,
                           allow_redirects=True)
        res.raise_for_status()
        content.extend(processdata.preprocess(text_from_html(res.text)))
        if content and content[0] == "invalidcontentfound":
            return ['ERROR_MSG', 'Non English']

        abt_url = get_about_url(res.text, url)
        if abt_url is not None:
            try:
                # Send the same headers as the main request so both pages
                # are fetched consistently.
                res = requests.get(abt_url,
                                   headers=headers,
                                   verify=False,
                                   timeout=timeout,
                                   allow_redirects=True)
                content.extend(
                    processdata.preprocess(text_from_html(res.text)))
            except Exception:
                # The about page is optional: keep the main-page content.
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                pass

    # Order matters: Timeout must be tested before ConnectionError because
    # a connect timeout is an instance of both.
    except requests.exceptions.Timeout:
        return [
            'ERROR_MSG',
            'Timeout: (connect time={timeout} sec)'.format(timeout=timeout)
        ]
    except requests.exceptions.HTTPError as errh:
        return ['ERROR_MSG', str(errh.args[0]) if errh.args else str(errh)]
    except requests.exceptions.ConnectionError as errc:
        detail = str(errc.args[0]) if errc.args else str(errc)
        # Keep only the text after the last ':' of the error description.
        return ['ERROR_MSG', detail.split(':')[-1]]
    except requests.exceptions.RequestException as err:
        return ['ERROR_MSG', str(err.args[0]) if err.args else str(err)]
    return content