def page_item(self, response: HtmlResponse) -> Item:
    media_urls = []
    get_urls = lambda le: (link.url for link in le.extract_links(response))
    if self.settings.get('FILES_STORE'):
        media_urls.extend(get_urls(self.images_le))
        media_urls.extend(
            set(get_urls(self.files_le)) - set(get_urls(self.le)))
    metadata = {
        'id': _url_hash(response.url, as_bytes=False),
        'parent': _url_hash_as_str(response.meta.get('parent')),
        'depth': response.meta.get('depth'),
        'priority': response.request.priority,
    }
    if (self.settings.get('AUTOLOGIN_ENABLED')
            and not self.queue.has_login_form(response.url)):
        for form_el, form_meta in extract_forms(response.text, fields=False):
            if form_meta.get('form') == 'login':
                self.queue.add_login_form(response.url)
                metadata['has_login_form'] = True
    return text_cdr_item(
        response,
        crawler_name=self.settings.get('CDR_CRAWLER'),
        team_name=self.settings.get('CDR_TEAM'),
        objects=media_urls,
        metadata=metadata,
    )
Example #2
def test_extract_forms_proba(tree):
    forms = formasaurus.extract_forms(tree, proba=True, threshold=0)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert probs['login'] > 0.5
    assert probs['contact/comment'] < 0.4
    assert probs['search'] < 0.4
    assert probs['registration'] < 0.4
    assert probs['join mailing list'] < 0.4
    assert probs['other'] < 0.4
    assert probs['password/login recovery'] < 0.4

    forms = formasaurus.extract_forms(tree, proba=True, threshold=0.3)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert list(probs.keys()) == ['login']
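The `tree` argument in the test examples in this section is a pytest fixture supplied elsewhere in the test suite, as is the `fields` flag used in Example #9. A minimal sketch of how such fixtures might be built is shown below; the sample HTML and fixture bodies are assumptions for illustration, not the library's actual test setup.

import lxml.html
import pytest

# Hypothetical sample page containing a single login form.
LOGIN_PAGE = """
<html><body>
  <form action="/login" method="post">
    <input type="text" name="username">
    <input type="password" name="password">
    <input type="submit" value="Log in">
  </form>
</body></html>
"""

@pytest.fixture
def tree():
    # extract_forms accepts either a parsed lxml document or a raw HTML
    # string; the tests here pass a parsed tree.
    return lxml.html.document_fromstring(LOGIN_PAGE)

@pytest.fixture(params=[False, True])
def fields(request):
    # Run field-aware tests both with and without field classification.
    return request.param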
Example #3
def test_extract_forms(tree):
    forms = formasaurus.extract_forms(tree)
    assert len(forms) == 1
    assert forms[0][1] == {
        'form': 'login',
        'fields': {'password': 'password', 'username': 'username'},
    }
Example #4
def forms_info(response):
    """ Return a list of form classification results """
    res = formasaurus.extract_forms(response.text,
                                    proba=True,
                                    threshold=0,
                                    fields=True)
    return [info for form, info in res]
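A hypothetical call site for forms_info; `response` is assumed to be a Scrapy-style response object exposing the page HTML as `response.text`.

# Hypothetical usage of forms_info on a crawled page.
for info in forms_info(response):
    # With proba=True and fields=True, info['form'] and info['fields'] map
    # form types and field types to probabilities.
    print('form type probabilities:', info['form'])
    print('field type probabilities:', info['fields'])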
Example #5
def get_login_form(html_source, page_forms=None):
    matches = []
    Match = namedtuple('Match', ['idx', 'form', 'meta'])
    for idx, (form, meta) in enumerate(formasaurus.extract_forms(html_source)):
        if meta['form'] == 'login':
            matches.append(Match(idx, form, meta))
    if matches:
        if page_forms:
            return max(matches, key=lambda match: (
                _get_captcha_field(match.meta) is not None,
                _form_area(page_forms[match.idx])))
        else:
            return matches[0]
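A minimal, hypothetical use of get_login_form; html_source is the raw page HTML, and page_forms, when given, would come from a headless browser so that a CAPTCHA-bearing or visually larger candidate can be preferred.

# Hypothetical usage: pick the most likely login form on a page.
match = get_login_form(html_source)
if match is not None:
    # match.form is the lxml <form> element, match.meta the formasaurus
    # classification dict, and match.idx the form's index on the page.
    print('login form found at index', match.idx)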
Example #6
    def extract(self, url, nature):
        task = TaskFactory.buildFromURI(url, Task(auth=AuthNature.no))
        handler = getHandler(task)("*", "*/*;", None)  # no cache
        contentTypes, tmpFile, newTasks = handler.execute(task)

        doc = html.document_fromstring(str(tmpFile.read()), base_url=task.url)

        forms = []
        for form, cl in formasaurus.extract_forms(doc):
            if nature == cl:
                forms.append(build(url, html.tostring(form), nature))

        return forms
Example #7
def process_forms():
    with open("final_results.jl") as f, \
            open("final_results_forms.jl", "w") as f_write:
        for line in f:
            line_dic = json.loads(line.strip())
            html = list(line_dic.values())[0].strip()
            if html:
                try:
                    forms_all = strip_el(
                        formasaurus.extract_forms(html, proba=True,
                                                  threshold=0.05))
                    line_dic["forms"] = forms_all
                    f_write.write(json.dumps(line_dic))
                    f_write.write("\n")
                except Exception:
                    pass
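strip_el is not defined in this snippet; it presumably drops the lxml form elements from the extract_forms results so the remaining classification dicts can be JSON-serialized. A hypothetical version:

def strip_el(extracted):
    # extract_forms returns (form_element, info) pairs; keep only the
    # JSON-serializable info dicts. Purely illustrative.
    return [info for _el, info in extracted]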
Example #8
    def parse(self, response):
        self.logger.info(response.url)
        if response.text:
            for _, meta in formasaurus.extract_forms(response.text):
                form_type = meta['form']
                if form_type == 'login' and not self.found_login:
                    self.found_login = True
                    self.handle_login_form(response.url)
                elif (form_type == 'registration'
                        and not self.found_registration):
                    self.found_registration = True
                    self.handle_registration_form(response.url)
        if self.found_registration and self.found_login:
            raise CloseSpider('done')
        for link in self.link_extractor.extract_links(response):
            priority = 0
            text = ' '.join([relative_url(link.url), link.text]).lower()
            if any(pattern in text for pattern in self.priority_patterns):
                priority = 100
            yield self.request(link.url, self.parse, priority=priority)
Example #9
def test_extract_forms_proba(tree, fields):
    forms = formasaurus.extract_forms(tree, proba=True, threshold=0, fields=fields)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert probs['login'] > 0.5
    assert probs['contact/comment'] < 0.4
    assert probs['search'] < 0.4
    assert probs['registration'] < 0.4
    assert probs['join mailing list'] < 0.4
    assert probs['other'] < 0.4
    assert probs['password/login recovery'] < 0.4

    if fields:
        field_probs = forms[0][1]['fields']
        assert sorted(field_probs.keys()) == ['password', 'username']
        assert field_probs['password']['password'] > 0.9
        assert field_probs['username']['username'] > 0.9

        assert 1.0 - 1e-6 < sum(field_probs['password'].values()) < 1.0 + 1e-6
        assert 1.0 - 1e-6 < sum(field_probs['username'].values()) < 1.0 + 1e-6
Example #10
    async def get_login_form(self, cookie_jar, response, username, password):
        '''
        Attempt to extract login form action and form data from a response,
        substituting the provided ``username`` and ``password`` into the
        corresponding fields.
        '''
        encoding, html = w3lib.encoding.html_to_unicode(
            response.content_type,
            response.body,
            auto_detect_fun=chardet
        )

        forms = await self._loop.run_in_executor(None,
            lambda: formasaurus.extract_forms(html, proba=True))
        form, meta = self._select_login_form(forms)

        if form is None:
            raise Exception("Can't find login form")

        login_field, password_field, captcha_field = self._select_login_fields(
            meta['fields'])
        if login_field is None or password_field is None:
            raise Exception("Can't find username/password fields")

        form.fields[login_field] = username
        form.fields[password_field] = password

        if captcha_field is not None:
            if self._policy.captcha_solver is None:
                raise Exception('CAPTCHA required for login url={} but there is'
                    ' no CAPTCHA solver available'.format(response.url))

            img_el = self._get_captcha_image_element(form)
            img_src = urljoin(response.url, img_el.get('src'))
            captcha_text = await self._solve_captcha(cookie_jar, img_src)
            form.fields[captcha_field] = captcha_text

        form_action = urljoin(response.url, form.action)
        return form_action, form.method, dict(form.fields)
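The private helpers _select_login_form and _select_login_fields are not shown. Based only on the field-probability structure seen in Example #9, a field-selection helper might look roughly like the sketch below; this is an assumption for illustration, not the project's actual code, and the 'captcha' label in particular is assumed.

def _select_login_fields(self, fields):
    # `fields` maps field name -> {field type: probability}. Pick the most
    # probable type for each field and remember the first field classified
    # as username, password, or captcha (any of these may stay None).
    username, password, captcha = None, None, None
    for name, probs in fields.items():
        field_type = max(probs, key=probs.get)
        if field_type == 'username' and username is None:
            username = name
        elif field_type == 'password' and password is None:
            password = name
        elif field_type == 'captcha' and captcha is None:
            captcha = name
    return username, password, captcha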
Example #11
    def parse(self, response):
        if not self.link_extractor.matches(response.url):
            return

        request_meta = {
            'from_search': response.meta.get('is_search'),
            'extracted_at': response.url,
        }

        def request(url, meta=None, **kwargs):
            meta = meta or {}
            meta.update(request_meta)
            return self.make_request(url, meta=meta, **kwargs)

        # Formasaurus results are stored as page metadata only; the
        # Inferlink forms processing below does not use them.
        forms = (formasaurus.extract_forms(response.text)
                 if response.text else [])

        metadata = dict(
            is_page=response.meta.get('is_page', False),
            is_onclick=response.meta.get('is_onclick', False),
            is_iframe=response.meta.get('is_iframe', False),
            is_search=response.meta.get('is_search', False),
            from_search=response.meta.get('from_search', False),
            extracted_at=response.meta.get('extracted_at', None),
            depth=response.meta.get('depth', None),
            priority=response.request.priority,
            forms=[meta for _, meta in forms],
            screenshot=self._take_screenshot(response),
        )

        # If there is no page configuration, self.pages_cfg may be empty and
        # the classification loop below will not run, so initialize the
        # defaults here: the page is treated as valid, with no URL
        # extraction info and no Inferlink forms.

        page_valid = True
        url_extract_info = None
        forms = []

        # page classification
        for pg in self.pages_cfg:
            page_name = pg.get('pageName')

            # check if the page gets classified
            url_regex = pg.get('urlRegex')
            content_regex = pg.get('contentRegex')
            page_valid = True
            if url_regex:
                if not search_re(url_regex, response.url):
                    page_valid = False
            if content_regex:
                if not search_re(content_regex, response.body.decode("utf-8")):
                    page_valid = False
            if page_valid:
                url_extract_info = pg.get('urlExtractionInfo')
                forms = pg.get('formsInfo')
                break

        if not page_valid:
            print(
                'the page did not pass through any of the specified page classifiers',
                response.url)
        else:
            # do the rest of the processing
            follow_urls = {
                link_to_url(link)
                for link in self.link_extractor.extract_links(response)
                if not self._looks_like_logout(link, response)
            }
            print('Size of follow-urls: ', len(follow_urls))

            yield self.text_cdr_item(response,
                                     follow_urls=follow_urls,
                                     metadata=metadata)

            if not self.settings.getbool('FOLLOW_LINKS'):
                return

            if self.settings.getbool('PREFER_PAGINATION'):
                # Follow pagination links; pagination is not subject to the
                # max depth limit. This also prioritizes pagination links,
                # because depth is not increased for them.
                with _dont_increase_depth(response):
                    for url in self._pagination_urls(response):
                        # self.logger.debug('Pagination link found: %s', url)
                        yield request(url, meta={'is_page': True})

            # URL extraction processing
            allowed_follow_urls = list()
            if url_extract_info:
                url_extract_method = url_extract_info['extractionMethod']
                if url_extract_method == 'inferlink':
                    extract_urls = url_extract_info.get('urls')
                    for extract_url in (extract_urls or []):
                        allowed_follow_urls.append(extract_url)
                else:
                    url_regexes_allow = url_extract_info.get('urlRegexesAllow')
                    url_regexes_deny = url_extract_info.get('urlRegexesDeny')
                    for follow_url in (follow_urls or []):
                        follow = True
                        for url_regex_deny in (url_regexes_deny or []):
                            if search_re(url_regex_deny, follow_url):
                                follow = False
                                break
                        if follow and url_regexes_allow:
                            follow = False
                            for url_regex_allow in (url_regexes_allow or []):
                                if search_re(url_regex_allow, follow_url):
                                    follow = True
                                    break
                        if follow:
                            allowed_follow_urls.append(follow_url)
            else:
                allowed_follow_urls = list(follow_urls)

            print('Number of urls to be followed - ', len(allowed_follow_urls))
            # Follow all the allowed in-domain links.
            # Pagination requests are sent twice, but we don't care because
            # they'll be filtered out by a dupefilter.
            for url in allowed_follow_urls:
                yield request(url)

            # urls extracted from onclick handlers
            for url in get_js_links(response):
                priority = 0 if _looks_like_url(url) else -15
                url = response.urljoin(url)
                yield request(url,
                              meta={'is_onclick': True},
                              priority=priority)

            # go to iframes
            for link in self.iframe_link_extractor.extract_links(response):
                yield request(link_to_url(link), meta={'is_iframe': True})

            # forms processing
            for form in (forms or []):
                form_identity = json.loads(form['identity'])
                form_method = form['method']
                form_params_list = form['params']

                kwargs = {}
                if self.use_splash:
                    kwargs.update(self.setup_splash_args())
                meta = {}
                meta['avoid_dup_content'] = True
                meta.update(request_meta)
                kwargs.update(form_identity)

                for form_params in form_params_list:
                    # SplashRequest for all the params
                    print('===== Submitting FORM again ========' +
                          json.dumps(form_params))
                    yield SplashFormRequest.from_response(response,
                                                          formdata=form_params,
                                                          method=form_method,
                                                          callback=self.parse,
                                                          meta=meta.copy(),
                                                          **kwargs)
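search_re and the other helpers used above (link_to_url, get_js_links, _looks_like_url) are defined elsewhere in the project. Judging only from how it is called (a regex pattern plus a URL or page body), search_re might be a thin wrapper such as the hypothetical sketch below; the case-insensitivity is an assumption.

import re

def search_re(pattern, text):
    # Hypothetical helper: return a truthy match object if the regex
    # matches anywhere in the text, else None.
    return re.search(pattern, text, re.IGNORECASE)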
Example #12
def test_classify_proba(tree):
    form = get_forms(tree)[0]
    res1 = formasaurus.classify_proba(form, threshold=0.05)
    res2 = formasaurus.extract_forms(tree, proba=True, threshold=0.05)[0][1]
    assert res1 == res2
Example #13
def test_extract_forms_proba_threshold(tree):
    forms = formasaurus.extract_forms(tree, proba=True, threshold=0.3)
    assert len(forms) == 1
    probs = forms[0][1]['form']
    assert list(probs.keys()) == ['login']
Example #14
def test_extract_forms_no_fields(tree):
    forms = formasaurus.extract_forms(tree, fields=False)
    assert len(forms) == 1
    assert forms[0][1] == {'form': 'login'}
Example #15
    def parse(self, response):
        if not self.link_extractor.matches(response.url):
            return

        request_meta = {
            'from_search': response.meta.get('is_search'),
            'extracted_at': response.url,
        }

        def request(url, meta=None, **kwargs):
            meta = meta or {}
            meta.update(request_meta)
            return self.make_request(url, meta=meta, **kwargs)

        forms = (formasaurus.extract_forms(response.text) if response.text
                 else [])
        metadata = dict(
            is_page=response.meta.get('is_page', False),
            is_onclick=response.meta.get('is_onclick', False),
            is_iframe=response.meta.get('is_iframe', False),
            is_search=response.meta.get('is_search', False),
            from_search=response.meta.get('from_search', False),
            extracted_at=response.meta.get('extracted_at', None),
            depth=response.meta.get('depth', None),
            priority=response.request.priority,
            forms=[meta for _, meta in forms],
            screenshot=self._take_screenshot(response),
        )
        follow_urls = {link_to_url(link) for link in
                       self.link_extractor.extract_links(response)
                       if not self._looks_like_logout(link, response)}
        yield self.text_cdr_item(
            response, follow_urls=follow_urls, metadata=metadata)

        if not self.settings.getbool('FOLLOW_LINKS'):
            return

        if self.settings.getbool('PREFER_PAGINATION'):
            # Follow pagination links; pagination is not subject to the
            # max depth limit. This also prioritizes pagination links,
            # because depth is not increased for them.
            with _dont_increase_depth(response):
                for url in self._pagination_urls(response):
                    # self.logger.debug('Pagination link found: %s', url)
                    yield request(url, meta={'is_page': True})

        # Follow all in-domain links.
        # Pagination requests are sent twice, but we don't care because
        # they'll be filtered out by a dupefilter.
        for url in follow_urls:
            yield request(url)

        # urls extracted from onclick handlers
        for url in get_js_links(response):
            priority = 0 if _looks_like_url(url) else -15
            url = response.urljoin(url)
            yield request(url, meta={'is_onclick': True}, priority=priority)

        # go to iframes
        for link in self.iframe_link_extractor.extract_links(response):
            yield request(link_to_url(link), meta={'is_iframe': True})

        # Try submitting forms
        for form, meta in forms:
            for request_kwargs in self.handle_form(response.url, form, meta):
                yield request(**request_kwargs)
Example #16
    def parse(self, response):
        if not self.link_extractor.matches(response.url):
            return

        request_meta = {
            'from_search': response.meta.get('is_search'),
            'extracted_at': response.url,
        }
        def request(url, meta=None, **kwargs):
            meta = meta or {}
            meta.update(request_meta)
            return self.splash_request(url, meta=meta, **kwargs)

        forms = (formasaurus.extract_forms(response.text)
                 if response.text else [])
        parent_item = self.text_cdr_item(response, dict(
            is_page=response.meta.get('is_page', False),
            is_onclick=response.meta.get('is_onclick', False),
            is_iframe=response.meta.get('is_iframe', False),
            is_search=response.meta.get('is_search', False),
            from_search=response.meta.get('from_search', False),
            extracted_at=response.meta.get('extracted_at', None),
            depth=response.meta.get('depth', None),
            forms=[meta for _, meta in forms],
            ))
        yield parent_item

        if self.settings.getbool('PREFER_PAGINATION'):
            # Follow pagination links; pagination is not subject to the
            # max depth limit. This also prioritizes pagination links,
            # because depth is not increased for them.
            with _dont_increase_depth(response):
                for url in self._pagination_urls(response):
                    # self.logger.debug('Pagination link found: %s', url)
                    yield request(url, meta={'is_page': True})

        # Follow all in-domain links.
        # Pagination requests are sent twice, but we don't care because
        # they'll be filtered out by a dupefilter.
        normal_urls = {link.url for link in
                       self.link_extractor.extract_links(response)}
        for url in normal_urls:
            yield request(url)

        if self.settings.get('FILES_STORE'):
            yield from self.download_files(response, normal_urls, parent_item)

        # urls extracted from onclick handlers
        for url in get_js_links(response):
            priority = 0 if _looks_like_url(url) else -15
            url = response.urljoin(url)
            yield request(url, meta={'is_onclick': True}, priority=priority)

        # go to iframes
        for link in self.iframe_link_extractor.extract_links(response):
            yield request(link.url, meta={'is_iframe': True})

        # Try submitting forms
        for form, meta in forms:
            for request_kwargs in self.handle_form(response.url, form, meta):
                yield request(**request_kwargs)
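Examples #15 and #16 delegate form submission to self.handle_form, which is not shown. A rough sketch of what such a helper might do for search forms is given below, using only the lxml FormElement API already seen in Example #10 (form.action, form.method, form.fields, form.form_values); the 'search query' field label and the kwargs accepted by request() are assumptions, not the projects' actual implementation.

from urllib.parse import urlencode, urljoin

def handle_form(self, url, form, meta):
    # Hypothetical sketch (would live on the spider class): submit only GET
    # search forms by filling the query field and encoding the form values
    # into the action URL. `meta['fields']` maps field name -> type label
    # because extract_forms was called without proba=True.
    if meta['form'] != 'search' or (form.method or 'GET').upper() != 'GET':
        return
    for field_name, field_type in meta['fields'].items():
        if field_type == 'search query':      # label assumed
            form.fields[field_name] = 'test'  # hypothetical query term
            action = urljoin(url, form.action or url)
            query = urlencode(form.form_values())
            yield dict(url='{}?{}'.format(action, query),
                       meta={'is_search': True})
            break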