def get_static_text_content(url):
    """Fetch a page with ``requests`` and return its preprocessed text.

    The page at *url* is downloaded, passed through ``text_from_html`` and
    ``processdata.preprocess``, and the result is collected in a list.  If
    ``get_about_url`` returns a follow-up URL for the page (presumably the
    site's "about" page -- confirm against that helper), that page is
    fetched too and its preprocessed text is appended.

    The whole operation runs inside ``Timeout(30)`` (presumably an overall
    30 second limit -- confirm against the ``Timeout`` in scope) and each
    HTTP request carries its own 10 second ``requests`` timeout.

    This function is best-effort: on failure it returns whatever has been
    collected so far, which may be an empty list.

    Args:
        url: Address of the page to scrape.

    Returns:
        list: The preprocessed content.  If its first item is the sentinel
        ``"invalidcontentfound"``, the list is returned as-is and no
        follow-up page is requested.
    """
    # NOTE(review): a later definition of ``get_static_text_content`` in this
    # file appears to rebind this name; confirm which version is intended.
    # NOTE(review): ``verify=False`` disables TLS certificate verification.
    content = []
    try:
        with Timeout(30):
            res = requests.get(url, headers=headers, verify=False, timeout=10)
            content.extend(processdata.preprocess(text_from_html(res.text)))
            if content and content[0] == "invalidcontentfound":
                return content

            abt_url = get_about_url(res.text, url)
            if abt_url is not None:
                res = requests.get(
                    abt_url, headers=headers, verify=False, timeout=10)
                content.extend(
                    processdata.preprocess(text_from_html(res.text)))
    except (KeyboardInterrupt, SystemExit):
        # Never swallow an interpreter shutdown or a user interrupt.
        raise
    except BaseException:
        # Deliberately broad: the Timeout raised by the ``with`` block may
        # derive from BaseException rather than Exception (eventlet and
        # gevent timeouts do), and it must still be handled here.
        pass
    return content
def get_dynamic_text_content(url):
    """Load a page in the shared ``browser`` and return its preprocessed text.

    The page at *url* is opened with ``browser.get``; its ``page_source`` is
    passed through ``text_from_html`` and ``processdata.preprocess`` and the
    result is collected in a list.  If ``get_about_url`` returns a follow-up
    URL for the page (presumably the site's "about" page -- confirm against
    that helper), that page is loaded too and its preprocessed text is
    appended.

    This function is best-effort: on failure it returns whatever has been
    collected so far, which may be an empty list.

    Args:
        url: Address of the page to scrape.

    Returns:
        list: The preprocessed content.  If its first item is the sentinel
        ``"invalidcontentfound"``, the list is returned as-is and no
        follow-up page is loaded.
    """
    # NOTE(review): a later definition of ``get_dynamic_text_content`` in
    # this file appears to rebind this name; confirm which one is intended.
    content = []
    try:
        browser.get(url)
        # Read the source once so text extraction and link detection work on
        # the same snapshot of the page.
        page_html = browser.page_source
        content.extend(processdata.preprocess(text_from_html(page_html)))
        if content and content[0] == "invalidcontentfound":
            return content

        abt_url = get_about_url(page_html, url)
        if abt_url is not None:
            browser.get(abt_url)
            content.extend(
                processdata.preprocess(text_from_html(browser.page_source)))
    except Exception:
        # Best-effort scrape: keep what was gathered before the failure, but
        # let KeyboardInterrupt / SystemExit propagate.
        pass
    return content
def get_dynamic_text_content(url):
    """Load a page in the shared ``browser`` and return its preprocessed text.

    The page at *url* is opened with ``browser.get``; its ``page_source`` is
    passed through ``text_from_html`` and ``processdata.preprocess`` and the
    result is collected in a list.  If ``get_about_url`` returns a follow-up
    URL for the page (presumably the site's "about" page -- confirm against
    that helper), that page is loaded too and its preprocessed text is
    appended.

    Args:
        url: Address of the page to scrape.

    Returns:
        list: The preprocessed content on success, otherwise a two-item
        error list whose first item is ``'ERROR_MSG'``:

        * ``['ERROR_MSG', 'Non English']`` when the first item of the
          preprocessed content is the sentinel ``"invalidcontentfound"``.
        * ``['ERROR_MSG', 'site take too long to complete request']`` when
          loading or processing the main page raises.

        A failure while loading the optional follow-up page is not an error:
        the content already gathered from the main page is returned.
    """
    content = []
    try:
        browser.get(url)
        # Read the source once so text extraction and link detection work on
        # the same snapshot of the page.
        page_html = browser.page_source
        content.extend(processdata.preprocess(text_from_html(page_html)))
        if content and content[0] == "invalidcontentfound":
            return ['ERROR_MSG', 'Non English']
        abt_url = get_about_url(page_html, url)
    except Exception:
        # NOTE(review): every failure is reported as a slow site, whatever
        # the real cause; the message is kept because callers may match it.
        return ['ERROR_MSG', 'site take too long to complete request']

    if abt_url is not None:
        try:
            browser.get(abt_url)
            content.extend(
                processdata.preprocess(text_from_html(browser.page_source)))
        except Exception:
            # The follow-up page is optional; keep the main page's content
            # instead of discarding it (matches get_static_text_content).
            pass
    return content
def _first_arg_text(exc):
    """Return ``str(exc.args[0])``, or ``str(exc)`` when *exc* has no args."""
    return str(exc.args[0]) if exc.args else str(exc)


def get_static_text_content(url, timeout=5):
    """Fetch a page with ``requests`` and return its preprocessed text.

    The page at *url* is downloaded, passed through ``text_from_html`` and
    ``processdata.preprocess``, and the result is collected in a list.  If
    ``get_about_url`` returns a follow-up URL for the page (presumably the
    site's "about" page -- confirm against that helper), that page is
    fetched too and its preprocessed text is appended.

    Args:
        url: Address of the page to scrape.
        timeout: Value passed as ``timeout`` to each ``requests.get`` call.

    Returns:
        list: The preprocessed content on success, otherwise a two-item
        error list whose first item is ``'ERROR_MSG'``:

        * ``['ERROR_MSG', 'Non English']`` when the first item of the
          preprocessed content is the sentinel ``"invalidcontentfound"``.
        * ``['ERROR_MSG', <text>]`` describing the ``requests`` exception
          raised while fetching the main page (timeout, HTTP error status,
          connection error, or any other ``RequestException``).

        A failure while fetching or processing the optional follow-up page
        is not an error: the content from the main page is returned.
    """
    # NOTE(review): ``verify=False`` disables TLS certificate verification.
    content = []
    try:
        res = requests.get(url, headers=headers, verify=False,
                           timeout=timeout, allow_redirects=True)
        res.raise_for_status()
        content.extend(processdata.preprocess(text_from_html(res.text)))
        if content and content[0] == "invalidcontentfound":
            return ['ERROR_MSG', 'Non English']

        abt_url = get_about_url(res.text, url)
        if abt_url is not None:
            try:
                # Send the same headers as the main request; the original
                # follow-up request omitted them.
                res = requests.get(abt_url, headers=headers, verify=False,
                                   timeout=timeout, allow_redirects=True)
                # Do not scrape the body of an HTTP error page.
                res.raise_for_status()
                content.extend(
                    processdata.preprocess(text_from_html(res.text)))
            except Exception:
                # The follow-up page is optional; keep the main content.
                return content
    # Order matters: ConnectTimeout is both a Timeout and a ConnectionError,
    # and every class below derives from RequestException.
    except requests.exceptions.Timeout:
        return [
            'ERROR_MSG',
            'Timeout: (connect time={timeout:} sec)'.format(timeout=timeout)
        ]
    except requests.exceptions.HTTPError as errh:
        return ['ERROR_MSG', _first_arg_text(errh)]
    except requests.exceptions.ConnectionError as errc:
        # Keep only the text after the last ':' of the message.
        return ['ERROR_MSG', _first_arg_text(errc).split(':')[-1]]
    except requests.exceptions.RequestException as err:
        return ['ERROR_MSG', _first_arg_text(err)]
    return content