def check_search_result_integrity(grab):
    """Validate a search response expected to carry a JSON success flag.

    Args:
        grab: object whose ``doc`` exposes ``code`` (HTTP status) and
            ``body`` (raw response bytes).

    Raises:
        DataNotValid: the HTTP code is not 200, or the raw body does not
            contain the byte sequence ``"status":"success"``.
    """
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
    # Plain byte-substring test on the raw body; the payload is not parsed.
    if b'"status":"success"' not in grab.doc.body:
        raise DataNotValid('JSON success status not found')
def check_search_result_integrity(grab):
    """Validate a search result page identified by its DuckDuckGo opensearch link.

    Raises:
        RequestBanned: the HTTP code is 403.
        DataNotValid: the HTTP code is any other non-200 value, or the
            page has no ``<link>`` whose href contains "opensearch" and
            whose title contains "DuckDuckGo".
    """
    status = grab.doc.code
    if status == 403:
        raise RequestBanned('Ban (HTTP code %d)' % status)
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)

    opensearch_link = grab.doc(
        '//link[contains(@href, "opensearch") and '
        'contains(@title, "DuckDuckGo")]'
    )
    if opensearch_link.exists():
        return
    raise DataNotValid('Expected HTML element not found')
def check_search_result_integrity(grab):
    """Validate a search result page identified by its searx generator meta tag.

    Raises:
        DataNotValid: the HTTP code is not 200, or the page has no
            ``<meta name="generator">`` whose content contains "searx/".
    """
    # NOTE: a 403 response is not reported as a ban here; it falls through
    # to the generic non-200 error below.
    status = grab.doc.code
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)

    generator_meta = grab.doc(
        '//meta[@name="generator" and contains(@content, "searx/")]'
    )
    if generator_meta.exists():
        return
    raise DataNotValid('Expected HTML element not found')
def check_search_result_integrity(grab):
    """Validate a search result page expected to reference ixquick.com.

    Checks run in this order: ban message in the decoded body, captcha
    form, HTTP status, then the expected ``<base>`` element.

    Args:
        grab: object whose ``doc`` is callable with an XPath expression
            (returning something with ``exists()``) and exposes ``code``
            and ``unicode_body()``.

    Raises:
        RequestBanned: the body contains the access-prevented message, or
            a ``<form id="captcha_form">`` is present.
        DataNotValid: the HTTP code is not 200, or no ``<base>`` element
            has an href containing "ixquick.com".
    """
    if ('your Internet connection has been prevented from accessing it'
            in grab.doc.unicode_body()):
        raise RequestBanned('Found ban message')
    if grab.doc('//form[@id="captcha_form"]').exists():
        raise RequestBanned('Found captcha form')
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
    # The leftover debugging aids (dumping the page to /tmp/x.html and
    # dropping into pdb) were removed: they blocked the process on every
    # unexpected page instead of just reporting the failure.
    if not grab.doc('//base[contains(@href, "ixquick.com")]').exists():
        raise DataNotValid('Expected HTML element not found')
def check_ajax_search_result_integrity(grab):
    """Validate an AJAX (JSON) search response.

    Args:
        grab: object whose ``doc`` exposes ``code`` (HTTP status) and
            ``json`` (the parsed payload; accessing it may raise
            ``TypeError`` or ``ValueError``).

    Raises:
        DataNotValid: the HTTP code is not 200, the body is not valid
            JSON, or the ``body -> serp`` entry is missing or empty.
        RequestBanned: the ``body -> antirobot -> blocked`` flag is truthy.
    """
    # TODO: ban detection by HTTP code is not implemented.
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
    try:
        info = grab.doc.json
    except (TypeError, ValueError):
        raise DataNotValid('Not valid JSON')

    # ``or {}`` also covers an explicit null "body" in the payload.
    body = info.get('body') or {}
    if not body.get('serp'):
        raise DataNotValid('body->serp key not found')
    # A payload without an "antirobot" section is treated as not blocked
    # rather than failing with an uncaught KeyError/TypeError.
    if (body.get('antirobot') or {}).get('blocked'):
        raise RequestBanned('Ban!')
def check_integrity(grab):
    """Check that the response has HTTP status 200.

    Args:
        grab: object whose ``doc`` exposes ``code`` (HTTP status).

    Raises:
        DataNotValid: the HTTP code is not 200.
    """
    # TODO: ban detection by HTTP code is not implemented.
    # NOTE: captcha-based ban detection (an <img> whose src contains
    # "/captchaimg?") had been switched off behind an ``if False`` guard;
    # that unreachable branch was removed, so no ban is reported here.
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
def check_integrity(grab):
    """Run the basic response checks: captcha image, then HTTP status.

    Raises:
        RequestBanned: the page contains an ``<img>`` whose src contains
            "/captchaimg?".
        DataNotValid: no captcha image was found and the HTTP code is
            not 200.
    """
    # TODO: ban detection by HTTP code is not implemented.
    captcha_image = grab.doc.select('//img[contains(@src, "/captchaimg?")]')
    if captcha_image.exists():
        raise RequestBanned('Ban (captcha)')

    status = grab.doc.code
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)
def check_cache_integrity(grab):
    """Validate a cached-page response.

    Runs ``check_integrity`` first, then requires a
    ``<div class="cacheContent">`` element.

    Raises:
        DataNotValid: the cacheContent div is absent (in addition to
            whatever ``check_integrity`` raises).
    """
    check_integrity(grab)
    cache_block = grab.doc('//div[@class="cacheContent"]')
    if cache_block.exists():
        return
    raise DataNotValid('Div[@class="cacheContent"] not found')
def check_search_result_integrity(grab):
    """Validate a search result page.

    Runs ``check_integrity`` first, then requires an ``<input name="p">``
    element (the search query input, per the error message).

    Raises:
        DataNotValid: the input element is absent (in addition to
            whatever ``check_integrity`` raises).
    """
    check_integrity(grab)
    query_input = grab.doc('//input[@name="p"]')
    if query_input.exists():
        return
    raise DataNotValid('Search query input not found')
def check_cache_integrity(grab):
    """Validate a Google cache response.

    Runs ``check_integrity`` first. A 404 response is accepted as-is;
    any other response must contain ``<div id="google-cache-hdr">``.

    Raises:
        DataNotValid: the cache header div is absent on a non-404
            response (in addition to whatever ``check_integrity`` raises).
    """
    check_integrity(grab)
    # NOTE(review): if check_integrity already rejects every non-200 code,
    # this 404 exemption can never be reached -- confirm against it.
    if grab.doc.code == 404:
        return
    cache_header = grab.doc('//div[@id="google-cache-hdr"]')
    if not cache_header.exists():
        raise DataNotValid('Google Cache Header not found')
def check_search_result_integrity(grab):
    """Validate a search result page.

    Runs ``check_integrity`` first, then requires a ``<div id="res">``
    element.

    Raises:
        DataNotValid: the div is absent (in addition to whatever
            ``check_integrity`` raises).
    """
    check_integrity(grab)
    results_block = grab.doc('//div[@id="res"]')
    if results_block.exists():
        return
    raise DataNotValid('Content of response has unexpected format.')
def check_cache_integrity(grab):
    """Validate a cached-page response marked by a yandex.st script.

    Runs ``check_integrity`` first, then requires a ``<script>`` whose
    src contains "yandex.st/hilitedaemon-js".

    Raises:
        DataNotValid: the script element is absent (in addition to
            whatever ``check_integrity`` raises).
    """
    check_integrity(grab)
    hilite_script = grab.doc(
        '//script[contains(@src,"yandex.st/hilitedaemon-js")]'
    )
    if hilite_script.exists():
        return
    raise DataNotValid('Expected yandex.st script not found')
def check_search_result_integrity(grab):
    """Validate a search result page.

    Runs ``check_integrity`` first, then requires an
    ``<input name="text">`` element.

    Raises:
        DataNotValid: the input element is absent (in addition to
            whatever ``check_integrity`` raises).
    """
    check_integrity(grab)
    text_input = grab.doc('//input[@name="text"]')
    if text_input.exists():
        return
    raise DataNotValid('Expected HTML element not found')