Example #1
0
def get_static_text_content(url):
    """Scrape the static text of ``url`` and its about page, preprocessed.

    The page is fetched with ``requests``; its HTML is passed through
    ``text_from_html`` and ``processdata.preprocess`` and the result is
    collected in a list. When ``get_about_url`` returns a URL, that page is
    fetched the same way and its preprocessed text is appended. The whole
    operation runs inside ``Timeout(30)`` and every individual request uses
    a 10 second ``requests`` timeout.

    Args:
        url: Address of the page to scrape.

    Returns:
        list: The preprocessed content. If its first item is the marker
        ``"invalidcontentfound"`` the list is returned immediately and the
        about page is not fetched. Failures are best-effort: whatever was
        collected before the error (possibly an empty list) is returned.

    Raises:
        KeyboardInterrupt, SystemExit: Propagated unchanged so the process
            can still be interrupted or shut down while scraping.
    """
    content = []

    try:
        with Timeout(30):
            # NOTE: verify=False disables TLS certificate verification.
            res = requests.get(url, headers=headers, verify=False, timeout=10)
            content.extend(processdata.preprocess(text_from_html(res.text)))
            if content and content[0] == "invalidcontentfound":
                return content

            abt_url = get_about_url(res.text, url)
            if abt_url is not None:
                res = requests.get(abt_url,
                                   headers=headers,
                                   verify=False,
                                   timeout=10)
                content.extend(
                    processdata.preprocess(text_from_html(res.text)))
            return content

    except (KeyboardInterrupt, SystemExit):
        # Never swallow a user interrupt or an interpreter shutdown.
        raise
    except BaseException:
        # NOTE(review): Timeout is imported outside this block; some
        # implementations (e.g. eventlet, gevent) derive it from
        # BaseException, so the broad catch keeps the timeout best-effort.
        # Narrow this once the Timeout class is confirmed.
        return content
    # Reached only when the ``with`` block exits without returning or
    # raising -- presumably if Timeout suppresses its own exception.
    return content
Example #2
0
def get_dynamic_text_content(url):
    """Scrape the rendered text of ``url`` and its about page, preprocessed.

    The module-level ``browser`` loads the page; its ``page_source`` is
    passed through ``text_from_html`` and ``processdata.preprocess`` and the
    result is collected in a list. When ``get_about_url`` returns a URL,
    that page is loaded in the same browser and its preprocessed text is
    appended.

    Args:
        url: Address of the page to scrape.

    Returns:
        list: The preprocessed content. If its first item is the marker
        ``"invalidcontentfound"`` the list is returned immediately and the
        about page is not loaded. On any error the content gathered so far
        (possibly an empty list) is returned.
    """
    content = []
    try:
        browser.get(url)
        content.extend(
            processdata.preprocess(text_from_html(browser.page_source)))
        if content and content[0] == "invalidcontentfound":
            return content

        abt_url = get_about_url(browser.page_source, url)
        if abt_url is not None:
            browser.get(abt_url)
            content.extend(
                processdata.preprocess(text_from_html(browser.page_source)))
        return content
    except Exception:
        # Best-effort: keep whatever was collected. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt and SystemExit
        # propagate.
        return content
def get_dynamic_text_content(url):
    """Scrape the rendered text of ``url`` and its about page, preprocessed.

    The module-level ``browser`` loads the page; its ``page_source`` is
    passed through ``text_from_html`` and ``processdata.preprocess`` and the
    result is collected in a list. When ``get_about_url`` returns a URL,
    that page is loaded in the same browser and its preprocessed text is
    appended.

    Args:
        url: Address of the page to scrape.

    Returns:
        list: The preprocessed content on success. Otherwise a two-item
        list ``['ERROR_MSG', <reason>]``:

        * ``'Non English'`` when the first preprocessed item is the marker
          ``"invalidcontentfound"``;
        * ``'site take too long to complete request'`` when any exception
          is raised while loading or processing either page (the message
          is used for every failure, not only for slow sites).
    """
    content = []
    try:
        browser.get(url)
        content.extend(
            processdata.preprocess(text_from_html(browser.page_source)))
        if content and content[0] == "invalidcontentfound":
            return ['ERROR_MSG', 'Non English']

        abt_url = get_about_url(browser.page_source, url)
        if abt_url is not None:
            browser.get(abt_url)
            content.extend(
                processdata.preprocess(text_from_html(browser.page_source)))
        return content
    except Exception:
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate instead of being
        # reported as a slow site.
        return ['ERROR_MSG', 'site take too long to complete request']
def get_static_text_content(url, timeout=5):
    """Scrape the static text of ``url`` and its about page, preprocessed.

    The page is fetched with ``requests`` (redirects followed) and a non-2xx
    status is turned into an error via ``raise_for_status``. The HTML is
    passed through ``text_from_html`` and ``processdata.preprocess`` and the
    result is collected in a list. When ``get_about_url`` returns a URL,
    that page is fetched with the same headers and timeout and its
    preprocessed text is appended; a failure there is ignored and the main
    page's content is returned on its own.

    Args:
        url: Address of the page to scrape.
        timeout: Timeout in seconds handed to each ``requests.get`` call.

    Returns:
        list: The preprocessed content on success. Otherwise a two-item
        list ``['ERROR_MSG', <reason>]`` where the reason is
        ``'Non English'`` (first preprocessed item is the marker
        ``"invalidcontentfound"``), a timeout message, or text taken from
        the ``requests`` exception raised for the main page.
    """
    content = []

    try:
        # NOTE: verify=False disables TLS certificate verification.
        res = requests.get(url,
                           headers=headers,
                           verify=False,
                           timeout=timeout,
                           allow_redirects=True)
        res.raise_for_status()
        content.extend(processdata.preprocess(text_from_html(res.text)))
        if content and content[0] == "invalidcontentfound":
            return ['ERROR_MSG', 'Non English']

        abt_url = get_about_url(res.text, url)
        if abt_url is not None:
            try:
                # Send the same headers as the main request so both pages
                # are fetched consistently.
                res = requests.get(abt_url,
                                   headers=headers,
                                   verify=False,
                                   timeout=timeout,
                                   allow_redirects=True)
                content.extend(
                    processdata.preprocess(text_from_html(res.text)))
            except Exception:
                # The about page is optional: keep the main page's content.
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                return content

    except requests.exceptions.Timeout:
        return [
            'ERROR_MSG',
            'Timeout: (connect time={timeout:} sec)'.format(timeout=timeout)
        ]
    except requests.exceptions.HTTPError as errh:
        return ['ERROR_MSG', str(errh.args[0])]
    except requests.exceptions.ConnectionError as errc:
        # Keep only the text after the last ':' of the error description.
        return ['ERROR_MSG', str(errc.args[0]).split(':')[-1]]
    except requests.exceptions.RequestException as err:
        return ['ERROR_MSG', str(err.args[0])]
    return content