def page_item(self, response: HtmlResponse) -> Item:
    media_urls = []
    get_urls = lambda le: (link.url for link in le.extract_links(response))
    if self.settings.get('FILES_STORE'):
        media_urls.extend(get_urls(self.images_le))
        media_urls.extend(
            set(get_urls(self.files_le)) - set(get_urls(self.le)))
    metadata = {
        'id': _url_hash(response.url, as_bytes=False),
        'parent': _url_hash_as_str(response.meta.get('parent')),
        'depth': response.meta.get('depth'),
        'priority': response.request.priority,
    }
    if (self.settings.get('AUTOLOGIN_ENABLED') and
            not self.queue.has_login_form(response.url)):
        for form_el, form_meta in extract_forms(response.text, fields=False):
            if form_meta.get('form') == 'login':
                self.queue.add_login_form(response.url)
                metadata['has_login_form'] = True
    return text_cdr_item(
        response,
        crawler_name=self.settings.get('CDR_CRAWLER'),
        team_name=self.settings.get('CDR_TEAM'),
        objects=media_urls,
        metadata=metadata,
    )
def test_extract_forms_proba(tree):
    forms = formasaurus.extract_forms(tree, proba=True, threshold=0)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert probs['login'] > 0.5
    assert probs['contact/comment'] < 0.4
    assert probs['search'] < 0.4
    assert probs['registration'] < 0.4
    assert probs['join mailing list'] < 0.4
    assert probs['other'] < 0.4
    assert probs['password/login recovery'] < 0.4

    forms = formasaurus.extract_forms(tree, proba=True, threshold=0.3)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert list(probs.keys()) == ['login']
def test_extract_forms(tree):
    forms = formasaurus.extract_forms(tree)
    assert len(forms) == 1
    assert forms[0][1] == {
        'form': 'login',
        'fields': {'password': 'password', 'username': 'username'},
    }
def forms_info(response):
    """ Return a list of form classification results """
    res = formasaurus.extract_forms(
        response.text, proba=True, threshold=0, fields=True)
    return [info for form, info in res]
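For reference, a minimal usage sketch of the helper above. The looks_like_login_page function and the 0.5 threshold are assumptions for illustration, not part of the original project; it only relies on the fact that proba=True makes each info dict carry a 'form' mapping of form type to probability.

# Hypothetical helper (an assumption, not from the original code): scan the
# per-form classification dicts returned by forms_info() and report whether
# any form looks like a login form with at least the given probability.
def looks_like_login_page(response, threshold=0.5):
    return any(info['form'].get('login', 0.0) >= threshold
               for info in forms_info(response))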
def get_login_form(html_source, page_forms=None):
    matches = []
    Match = namedtuple('Match', ['idx', 'form', 'meta'])
    for idx, (form, meta) in enumerate(
            formasaurus.extract_forms(html_source)):
        if meta['form'] == 'login':
            matches.append(Match(idx, form, meta))
    if matches:
        if page_forms:
            return max(matches, key=lambda match: (
                _get_captcha_field(match.meta) is not None,
                _form_area(page_forms[match.idx])))
        else:
            return matches[0]
def extract(self, url, nature):
    task = TaskFactory.buildFromURI(url, Task(auth=AuthNature.no))
    handler = getHandler(task)("*", "*/*;", None)  # no cache
    contentTypes, tmpFile, newTasks = handler.execute(task)
    doc = html.document_fromstring(str(tmpFile.read()), base_url=task.url)
    forms = []
    for form, cl in formasaurus.extract_forms(doc):
        if nature == cl:
            forms.append(build(url, html.tostring(form), nature))
    return forms
def process_forms():
    f = open("final_results.jl")
    f_write = open("final_results_forms.jl", "w")
    for line in f:
        line_dic = json.loads(line.strip())
        html = list(line_dic.values())[0].strip()
        if html.strip():
            try:
                forms_all = strip_el(
                    formasaurus.extract_forms(html, proba=True,
                                              threshold=0.05))
                line_dic["forms"] = forms_all
                f_write.write(json.dumps(line_dic))
                f_write.write("\n")
            except:
                pass
def parse(self, response):
    self.logger.info(response.url)
    if response.text:
        for _, meta in formasaurus.extract_forms(response.text):
            form_type = meta['form']
            if form_type == 'login' and not self.found_login:
                self.found_login = True
                self.handle_login_form(response.url)
            elif form_type == 'registration' \
                    and not self.found_registration:
                self.found_registration = True
                self.handle_registration_form(response.url)
    if self.found_registration and self.found_login:
        raise CloseSpider('done')
    for link in self.link_extractor.extract_links(response):
        priority = 0
        text = ' '.join([relative_url(link.url), link.text]).lower()
        if any(pattern in text for pattern in self.priority_patterns):
            priority = 100
        yield self.request(link.url, self.parse, priority=priority)
def test_extract_forms_proba(tree, fields):
    forms = formasaurus.extract_forms(tree, proba=True, threshold=0,
                                      fields=fields)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert probs['login'] > 0.5
    assert probs['contact/comment'] < 0.4
    assert probs['search'] < 0.4
    assert probs['registration'] < 0.4
    assert probs['join mailing list'] < 0.4
    assert probs['other'] < 0.4
    assert probs['password/login recovery'] < 0.4

    if fields:
        field_probs = forms[0][1]['fields']
        assert sorted(field_probs.keys()) == ['password', 'username']
        assert field_probs['password']['password'] > 0.9
        assert field_probs['username']['username'] > 0.9
        assert 1.0 - 1e-6 < sum(field_probs['password'].values()) < 1.0 + 1e-6
        assert 1.0 - 1e-6 < sum(field_probs['username'].values()) < 1.0 + 1e-6
async def get_login_form(self, cookie_jar, response, username, password):
    '''
    Attempt to extract login form action and form data from a response,
    substituting the provided ``username`` and ``password`` into the
    corresponding fields.
    '''
    encoding, html = w3lib.encoding.html_to_unicode(
        response.content_type,
        response.body,
        auto_detect_fun=chardet
    )
    forms = await self._loop.run_in_executor(
        None, lambda: formasaurus.extract_forms(html, proba=True))
    form, meta = self._select_login_form(forms)
    if form is None:
        raise Exception("Can't find login form")
    login_field, password_field, captcha_field = self._select_login_fields(
        meta['fields'])
    if login_field is None or password_field is None:
        raise Exception("Can't find username/password fields")
    form.fields[login_field] = username
    form.fields[password_field] = password
    if captcha_field is not None:
        if self._policy.captcha_solver is None:
            raise Exception('CAPTCHA required for login url={} but there is'
                            ' no CAPTCHA solver available'.format(response.url))
        img_el = self._get_captcha_image_element(form)
        img_src = urljoin(response.url, img_el.get('src'))
        captcha_text = await self._solve_captcha(cookie_jar, img_src)
        form.fields[captcha_field] = captcha_text
    form_action = urljoin(response.url, form.action)
    return form_action, form.method, dict(form.fields)
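A hedged usage sketch for the coroutine above: the downloader object, the submit_login wrapper, and the aiohttp-based submission are assumptions for illustration, not the original project's login flow. It only relies on the documented return value (form action URL, HTTP method, and a dict of form fields).

# Hypothetical caller (names are assumptions, not from the original code):
# submit the extracted login form using the action, method and data above.
import aiohttp

async def submit_login(downloader, cookie_jar, response, username, password):
    action, method, data = await downloader.get_login_form(
        cookie_jar, response, username, password)
    async with aiohttp.ClientSession() as session:
        # aiohttp accepts the HTTP method as a string, e.g. 'GET' or 'POST'.
        async with session.request(method, action, data=data) as resp:
            return resp.status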
def parse(self, response):
    if not self.link_extractor.matches(response.url):
        return

    request_meta = {
        'from_search': response.meta.get('is_search'),
        'extracted_at': response.url,
    }

    def request(url, meta=None, **kwargs):
        meta = meta or {}
        meta.update(request_meta)
        return self.make_request(url, meta=meta, **kwargs)

    # Not using formasaurus for Inferlink forms processing
    forms = (formasaurus.extract_forms(response.text)
             if response.text else [])
    # for x in forms:
    #     print(etree.tostring(x[0], pretty_print=True))
    #     print(x[1])

    metadata = dict(
        is_page=response.meta.get('is_page', False),
        is_onclick=response.meta.get('is_onclick', False),
        is_iframe=response.meta.get('is_iframe', False),
        is_search=response.meta.get('is_search', False),
        from_search=response.meta.get('from_search', False),
        extracted_at=response.meta.get('extracted_at', None),
        depth=response.meta.get('depth', None),
        priority=response.request.priority,
        forms=[meta for _, meta in forms],
        screenshot=self._take_screenshot(response),
    )

    # print('Response.url -- ', response.url)
    # print('start_urls[0] -- ', self.start_urls[0])
    # if response.url == self.start_urls[0]:
    #     print('its the first url')
    ##    yield self.text_cdr_item(
    ##        response, follow_urls=[], metadata=metadata)
    #     print('Google request again')
    ##    yield request('http://www.google.com')

    # If there is no config file, pages can be an empty list and the loop
    # below does nothing. For each page, start with page_valid=True and a
    # null urlExtractionInfo.
    page_valid = True
    url_extract_info = None
    forms = []

    # page classification
    # pages = self.pages_data['pagesInfo']
    for pg in self.pages_cfg:
        page_name = pg.get('pageName')
        # print(page_name)
        # check if the page gets classified
        url_regex = pg.get('urlRegex')
        content_regex = pg.get('contentRegex')
        page_valid = True
        if url_regex:
            # print(search_re(url_regex, response.url))
            if not search_re(url_regex, response.url):
                page_valid = False
        if content_regex:
            # print('content_regex not null')
            # print(search_re(content_regex, response.body.decode("utf-8")))
            if not search_re(content_regex, response.body.decode("utf-8")):
                page_valid = False
        if page_valid:
            url_extract_info = pg.get('urlExtractionInfo')
            forms = pg.get('formsInfo')
            break

    if not page_valid:
        print('the page did not pass through any of the specified '
              'page classifiers', response.url)
    else:
        # do the rest of the processing
        # print('PAGE is VALID --', response.url)
        follow_urls = {
            link_to_url(link)
            for link in self.link_extractor.extract_links(response)
            if not self._looks_like_logout(link, response)
        }
        print('Size of follow-urls: ', len(follow_urls))
        yield self.text_cdr_item(response, follow_urls=follow_urls,
                                 metadata=metadata)

        if not self.settings.getbool('FOLLOW_LINKS'):
            return

        if self.settings.getbool('PREFER_PAGINATION'):
            # Follow pagination links; pagination is not a subject of
            # a max depth limit. This also prioritizes pagination links
            # because depth is not increased for them.
            with _dont_increase_depth(response):
                for url in self._pagination_urls(response):
                    # self.logger.debug('Pagination link found: %s', url)
                    yield request(url, meta={'is_page': True})

        # url extraction processing
        allowed_follow_urls = list()
        if url_extract_info:
            url_extract_method = url_extract_info['extractionMethod']
            if url_extract_method == 'inferlink':
                extract_urls = url_extract_info.get('urls')
                for extract_url in (extract_urls or []):
                    allowed_follow_urls.append(extract_url)
            else:
                url_regexes_allow = url_extract_info.get('urlRegexesAllow')
                url_regexes_deny = url_extract_info.get('urlRegexesDeny')
                for follow_url in (follow_urls or []):
                    follow = True
                    for url_regex_deny in (url_regexes_deny or []):
                        if search_re(url_regex_deny, follow_url):
                            follow = False
                            break
                    if follow and url_regexes_allow:
                        follow = False
                        for url_regex_allow in (url_regexes_allow or []):
                            if search_re(url_regex_allow, follow_url):
                                follow = True
                                break
                    if follow:
                        allowed_follow_urls.append(follow_url)
        else:
            allowed_follow_urls = list(follow_urls)
        print('Number of urls to be followed - ', len(allowed_follow_urls))

        # Follow all the allowed in-domain links.
        # Pagination requests are sent twice, but we don't care because
        # they'll be filtered out by a dupefilter.
        for url in allowed_follow_urls:
            # print('url to be followed: ', url)
            yield request(url)

        # urls extracted from onclick handlers
        for url in get_js_links(response):
            priority = 0 if _looks_like_url(url) else -15
            url = response.urljoin(url)
            yield request(url, meta={'is_onclick': True}, priority=priority)

        # go to iframes
        for link in self.iframe_link_extractor.extract_links(response):
            yield request(link_to_url(link), meta={'is_iframe': True})

        # forms processing
        for form in (forms or []):
            form_identity = json.loads(form['identity'])
            form_method = form['method']
            form_params_list = form['params']
            kwargs = {}
            if self.use_splash:
                kwargs.update(self.setup_splash_args())
            meta = {}
            meta['avoid_dup_content'] = True
            meta.update(request_meta)
            kwargs.update(form_identity)
            for form_params in form_params_list:
                # SplashRequest for all the params
                print('===== Submitting FORM again ========' +
                      json.dumps(form_params))
                yield SplashFormRequest.from_response(
                    response, formdata=form_params, method=form_method,
                    callback=self.parse, meta=meta.copy(), **kwargs)
def test_classify_proba(tree):
    form = get_forms(tree)[0]
    res1 = formasaurus.classify_proba(form, threshold=0.05)
    res2 = formasaurus.extract_forms(tree, proba=True, threshold=0.05)[0][1]
    assert res1 == res2
def test_extract_forms_proba_threshold(tree):
    forms = formasaurus.extract_forms(tree, proba=True, threshold=0.3)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert list(probs.keys()) == ['login']
def test_extract_forms_no_fields(tree):
    forms = formasaurus.extract_forms(tree, fields=False)
    assert len(forms) == 1
    assert forms[0][1] == {'form': 'login'}
def parse(self, response):
    if not self.link_extractor.matches(response.url):
        return

    request_meta = {
        'from_search': response.meta.get('is_search'),
        'extracted_at': response.url,
    }

    def request(url, meta=None, **kwargs):
        meta = meta or {}
        meta.update(request_meta)
        return self.make_request(url, meta=meta, **kwargs)

    forms = (formasaurus.extract_forms(response.text)
             if response.text else [])
    metadata = dict(
        is_page=response.meta.get('is_page', False),
        is_onclick=response.meta.get('is_onclick', False),
        is_iframe=response.meta.get('is_iframe', False),
        is_search=response.meta.get('is_search', False),
        from_search=response.meta.get('from_search', False),
        extracted_at=response.meta.get('extracted_at', None),
        depth=response.meta.get('depth', None),
        priority=response.request.priority,
        forms=[meta for _, meta in forms],
        screenshot=self._take_screenshot(response),
    )
    follow_urls = {
        link_to_url(link)
        for link in self.link_extractor.extract_links(response)
        if not self._looks_like_logout(link, response)
    }
    yield self.text_cdr_item(
        response, follow_urls=follow_urls, metadata=metadata)

    if not self.settings.getbool('FOLLOW_LINKS'):
        return

    if self.settings.getbool('PREFER_PAGINATION'):
        # Follow pagination links; pagination is not a subject of
        # a max depth limit. This also prioritizes pagination links because
        # depth is not increased for them.
        with _dont_increase_depth(response):
            for url in self._pagination_urls(response):
                # self.logger.debug('Pagination link found: %s', url)
                yield request(url, meta={'is_page': True})

    # Follow all in-domain links.
    # Pagination requests are sent twice, but we don't care because
    # they'll be filtered out by a dupefilter.
    for url in follow_urls:
        yield request(url)

    # urls extracted from onclick handlers
    for url in get_js_links(response):
        priority = 0 if _looks_like_url(url) else -15
        url = response.urljoin(url)
        yield request(url, meta={'is_onclick': True}, priority=priority)

    # go to iframes
    for link in self.iframe_link_extractor.extract_links(response):
        yield request(link_to_url(link), meta={'is_iframe': True})

    # Try submitting forms
    for form, meta in forms:
        for request_kwargs in self.handle_form(response.url, form, meta):
            yield request(**request_kwargs)
def parse(self, response):
    if not self.link_extractor.matches(response.url):
        return

    request_meta = {
        'from_search': response.meta.get('is_search'),
        'extracted_at': response.url,
    }

    def request(url, meta=None, **kwargs):
        meta = meta or {}
        meta.update(request_meta)
        return self.splash_request(url, meta=meta, **kwargs)

    forms = formasaurus.extract_forms(response.text) if response.text \
        else []
    parent_item = self.text_cdr_item(response, dict(
        is_page=response.meta.get('is_page', False),
        is_onclick=response.meta.get('is_onclick', False),
        is_iframe=response.meta.get('is_iframe', False),
        is_search=response.meta.get('is_search', False),
        from_search=response.meta.get('from_search', False),
        extracted_at=response.meta.get('extracted_at', None),
        depth=response.meta.get('depth', None),
        forms=[meta for _, meta in forms],
    ))
    yield parent_item

    if self.settings.getbool('PREFER_PAGINATION'):
        # Follow pagination links; pagination is not a subject of
        # a max depth limit. This also prioritizes pagination links because
        # depth is not increased for them.
        with _dont_increase_depth(response):
            for url in self._pagination_urls(response):
                # self.logger.debug('Pagination link found: %s', url)
                yield request(url, meta={'is_page': True})

    # Follow all in-domain links.
    # Pagination requests are sent twice, but we don't care because
    # they'll be filtered out by a dupefilter.
    normal_urls = {link.url
                   for link in self.link_extractor.extract_links(response)}
    for url in normal_urls:
        yield request(url)

    if self.settings.get('FILES_STORE'):
        yield from self.download_files(response, normal_urls, parent_item)

    # urls extracted from onclick handlers
    for url in get_js_links(response):
        priority = 0 if _looks_like_url(url) else -15
        url = response.urljoin(url)
        yield request(url, meta={'is_onclick': True}, priority=priority)

    # go to iframes
    for link in self.iframe_link_extractor.extract_links(response):
        yield request(link.url, meta={'is_iframe': True})

    # Try submitting forms
    for form, meta in forms:
        for request_kwargs in self.handle_form(response.url, form, meta):
            yield request(**request_kwargs)