def check_search_result_integrity(grab):
    """Validate the search-result response held by ``grab``.

    The checks run in this order: ban notice in the body text, captcha
    form, HTTP status code, and finally the presence of a ``<base>``
    element whose ``href`` contains "ixquick.com".

    Returns ``None`` when every check passes.

    Raises:
        RequestBanned: the body contains the ban notice, or the page has
            a form with id ``captcha_form``.
        DataNotValid: the HTTP code is not 200, or the expected ``<base>``
            element is missing.
    """
    if ('your Internet connection has been prevented from accessing it'
            in grab.doc.unicode_body()):
        raise RequestBanned('Found ban message')
    if grab.doc('//form[@id="captcha_form"]').exists():
        raise RequestBanned('Found captcha form')
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
    # Fail fast with an exception only: no page dump to a fixed temp path
    # and no interactive debugger, which would stall an unattended run.
    if not grab.doc('//base[contains(@href, "ixquick.com")]').exists():
        raise DataNotValid('Expected HTML element not found')
def check_integrity(grab):
    """Validate the HTTP status of the response held by ``grab``.

    Returns ``None`` when the response code is 200.

    Raises:
        DataNotValid: the HTTP code is not 200.
    """
    # NOTE: captcha-image detection
    # (XPath: //img[contains(@src, "/captchaimg?")]) is disabled in this
    # checker, so only the status code is inspected.
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
def check_integrity(grab):
    """Validate the response held by ``grab``: status code, then captcha.

    Returns ``None`` when the response code is 200 and no captcha input is
    present.

    Raises:
        HttpCodeNotValid: the HTTP code is not 200. HTTP 999 gets no
            special treatment here and is reported like any other
            non-200 code.
        RequestBanned: the page contains an ``<input name="captcha">``.
    """
    if grab.doc.code != 200:
        raise HttpCodeNotValid('Non-200 HTTP code: %d' % grab.doc.code)
    # The captcha probe only runs on a 200 response.
    if grab.doc('//input[@name="captcha"]').exists():
        raise RequestBanned('Found captcha')
def check_search_result_integrity(grab):
    """Validate the search-result response held by ``grab``.

    Returns ``None`` when the response code is 200 and the page contains a
    ``<link>`` whose ``href`` contains "opensearch" and whose ``title``
    contains "DuckDuckGo".

    Raises:
        RequestBanned: the HTTP code is 403.
        DataNotValid: the HTTP code is any other non-200 value, or the
            expected ``<link>`` element is missing.
    """
    doc = grab.doc
    status = doc.code

    if status == 403:
        raise RequestBanned('Ban (HTTP code %d)' % status)
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)

    # The marker element is only looked up on a 200 response.
    opensearch_link = doc(
        '//link[contains(@href, "opensearch") and '
        'contains(@title, "DuckDuckGo")]'
    )
    if opensearch_link.exists():
        return
    raise DataNotValid('Expected HTML element not found')
def check_integrity(grab):
    """Validate the response held by ``grab``: captcha image, then status.

    Returns ``None`` when no captcha image is found and the response code
    is 200.

    Raises:
        RequestBanned: the page contains an ``<img>`` whose ``src``
            contains "/captchaimg?". This is checked before the status
            code, so it wins even on a non-200 response.
        DataNotValid: the HTTP code is not 200.
    """
    doc = grab.doc

    captcha_images = doc.select('//img[contains(@src, "/captchaimg?")]')
    if captcha_images.exists():
        raise RequestBanned('Ban (captcha)')

    status = doc.code
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)
def check_ajax_search_result_integrity(grab):
    """Validate the AJAX (JSON) search-result response held by ``grab``.

    Returns ``None`` when the response code is 200, the body parses as
    JSON, the response is not flagged as blocked, and ``body -> serp`` is
    present and non-empty.

    Raises:
        DataNotValid: the HTTP code is not 200, the body is not valid
            JSON, or ``body -> serp`` is missing/empty (including when the
            JSON document or its ``body`` is not an object).
        RequestBanned: ``body -> antirobot -> blocked`` is truthy.
    """
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)

    try:
        info = grab.doc.json
    except (TypeError, ValueError):
        raise DataNotValid('Not valid JSON')

    # Tolerate unexpected shapes (top-level list, null body, ...) instead
    # of leaking AttributeError/KeyError to the caller.
    body = info.get('body') if isinstance(info, dict) else None
    if not isinstance(body, dict):
        body = {}

    # Check the explicit block flag before the serp payload so that a
    # blocked response is reported as a ban even when it has no results.
    antirobot = body.get('antirobot')
    if isinstance(antirobot, dict) and antirobot.get('blocked'):
        raise RequestBanned('Ban!')

    if not body.get('serp'):
        raise DataNotValid('body->serp key not found')
def check_integrity(grab):
    """Validate the HTTP status of the response held by ``grab``.

    Returns ``None`` when the response code is 200.

    Raises:
        RequestBanned: the HTTP code is 999.
        HttpCodeNotValid: the HTTP code is any other non-200 value.
    """
    status = grab.doc.code
    if status == 200:
        return
    if status == 999:
        raise RequestBanned('Ban (HTTP code %d)' % status)
    raise HttpCodeNotValid('Non-200 HTTP code: %d' % status)
def check_integrity(grab):
    """Detect a ban on the response held by ``grab``.

    Returns ``None`` when the HTTP code is neither 503 nor 403 and the page
    has no ``<input name="captcha">``. Other status codes are not
    validated here.

    Raises:
        RequestBanned: the HTTP code is 503 or 403, or the page contains a
            captcha input. The message names the condition that fired.
    """
    status = grab.doc.code
    if status in (503, 403):
        # Report the status code rather than claiming a captcha was seen;
        # the page is not inspected in this case.
        raise RequestBanned('Ban (HTTP code %d)' % status)
    if grab.doc('//input[@name="captcha"]').exists():
        raise RequestBanned('Captcha found')