Example 1
def iter_response_link_dicts(response: TextResponse,
                             limit_by_domain: bool = True) -> Iterator[Dict]:
    """Yield link dicts extracted from ``response``, annotated with the
    source and target domains and the URL of the page they were found on."""
    page_url = response.url
    domain_from = get_domain(response.url)
    base_url = get_base_url(response)
    for link in extract_link_dicts(response.selector, base_url):
        link['domain_to'] = get_domain(link['url'])
        if limit_by_domain and link['domain_to'] != domain_from:
            continue  # skip links pointing outside the current domain
        link['domain_from'] = domain_from
        link['page_url'] = page_url
        yield link
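
For context, a minimal usage sketch (not part of the original project): calling the generator from a Scrapy callback. The spider below is hypothetical; iter_response_link_dicts and its helpers are assumed to be importable from the surrounding module.

import scrapy

class LinkSpider(scrapy.Spider):  # hypothetical spider, for illustration
    name = 'links'
    start_urls = ['http://example.com']

    def parse(self, response):
        # Each yielded dict carries 'url', 'domain_to', 'domain_from'
        # and 'page_url'; follow every in-domain link found on the page.
        for link in iter_response_link_dicts(response, limit_by_domain=True):
            yield scrapy.Request(link['url'], callback=self.parse)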
Example 2
    def _links_to_requests(
        self,
        response: TextResponse,
        links: List[Dict],
        links_matrix: sp.csr_matrix,
    ) -> Iterator[scrapy.Request]:
        # Deduplicate links while keeping their original indices, so the
        # matching rows can be selected from the link feature matrix.
        indices_and_links = list(self.le.deduplicate_links_enumerated(links))
        if not indices_and_links:
            return
        indices, links_to_follow = zip(*indices_and_links)
        AS = links_matrix[list(indices)]  # (action, state) rows for these links
        scores = self.Q.predict(AS)       # predicted Q-value per link

        for link, v, score in zip(links_to_follow, AS, scores):
            url = link['url']
            next_domain = get_domain(url)
            meta = {
                'link_vector': v,
                # 'link': link,  # turn it on for debugging
                'scheduler_slot': next_domain,
            }
            priority = score_to_priority(score)
            req = scrapy.Request(url, priority=priority, meta=meta)
            set_request_domain(req, next_domain)
            yield req
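
score_to_priority is not shown in these snippets. Scrapy request priorities are integers, so a plausible sketch (the multiplier is an assumption, not necessarily the project's actual value) scales the float score:

FLOAT_PRIORITY_MULTIPLIER = 10000  # assumed scaling factor

def score_to_priority(score: float) -> int:
    # Map a float score (higher = more promising) to an integer
    # Scrapy priority; higher priorities are scheduled first.
    return int(score * FLOAT_PRIORITY_MULTIPLIER)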
Example 3
    def _parse(self, response):
        if self.is_seed(response) and not hasattr(response, 'text'):
            # Bad seed: a non-text response for a start URL, nothing to learn.
            return [], 0

        # Feature vector of the link that was followed to reach this page
        # (set in request.meta by _links_to_requests).
        as_t = response.meta.get('link_vector')

        if not hasattr(response, 'text'):
            # Non-HTML response: record zero reward so the model
            # learns to avoid similar links in the future.
            self.Q.add_experience(as_t=as_t, AS_t1=None, r_t1=0)
            self.update_node(response, {'reward': 0})
            return [], 0

        page_vector = self._page_vector(response) if self.use_pages else None
        links = self._extract_links(response)
        links_matrix = self.link_vectorizer.transform(links) if links else None
        links_matrix = self.Q.join_As(links_matrix, page_vector)
        if links_matrix is not None:
            links_matrix = links_matrix.astype(np.float32)  # saving memory

        reward = 0
        if not self.is_seed(response):
            reward = self.goal.get_reward(response)
            self.update_node(response, {'reward': reward})
            self.total_reward += reward
            self.rewards.append(reward)
            # Store the transition (action taken, next available actions,
            # reward received) for Q-learning updates.
            self.Q.add_experience(as_t=as_t, AS_t1=links_matrix, r_t1=reward)
        domain = get_domain(response.url)
        self.crawled_domains.add(domain)
        if reward > 0.5:
            self.relevant_domains.add(domain)

        return (list(self._links_to_requests(response, links,
                                             links_matrix)), reward)
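
Scrapy callbacks must return an iterable of requests or items, so a spider using this method would typically unpack its (requests, reward) result. A hedged sketch of such a wrapper; the logging line is illustrative:

    def parse(self, response):  # hypothetical wrapper around _parse()
        requests, reward = self._parse(response)
        self.logger.debug("reward=%s, %d follow-up requests",
                          reward, len(requests))
        for req in requests:
            yield req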
Example 4
    def _examples(self):
        """Return (representations, feature matrix) for a fixed set of
        example links, used to inspect what the model has learned."""
        examples = [
            [
                'forgot password',
                'http://example.com/wp-login.php?action=lostpassword'
            ],
            ['registration', 'http://example.com/register'],
            ['register', 'http://example.com/reg'],
            ['sign up', 'http://example.com/users/new'],
            ['my account', 'http://example.com/account/my?sess=GJHFHJS21123'],
            ['my little pony', 'http://example.com?category=25?sort=1&'],
            ['comment', 'http://example.com/blog?p=2'],
            ['sign in', 'http://example.com/users/login'],
            ['login', 'http://example.com/users/login'],
            ['forum', 'http://example.com/mybb'],
            ['forums', 'http://example.com/mybb'],
            ['forums', 'http://other-domain.com/mybb'],
            [
                'sadhjgrhgsfd',
                'http://example.com/new-to-exhibiting/discover-your-stand-position/'
            ],
            # "forgot password" in Russian
            ['забыли пароль', 'http://example.com/users/send-password/'],
        ]
        examples_repr = ["{:20s} {}".format(txt, url) for txt, url in examples]
        links = [{
            'inside_text': txt,
            'url': url,
            'domain_from': 'example',
            'domain_to': get_domain(url),
        } for txt, url in examples]
        A = self.link_vectorizer.transform(links)
        s = self.page_vectorizer.transform([""]) if self.use_pages else None
        AS = self.Q.join_As(A, s)
        return examples_repr, AS
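
A debugging helper built on top of _examples might print each example link next to its predicted score. A minimal sketch, assuming the same self.Q.predict interface used in the other snippets; the method name is hypothetical:

    def log_example_scores(self):
        examples_repr, AS = self._examples()
        scores = self.Q.predict(AS)
        for line, score in zip(examples_repr, scores):
            print("{:0.4f} {}".format(score, line))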
Example 5
    def process_request(self, request, spider):
        if not request.meta.get('domain'):
            return

        # Drop the request if its URL does not belong to the domain
        # it was assigned to in request.meta.
        domain = request.meta['domain']
        if get_domain(request.url) != domain:
            logger.info("Dropped request {}: it doesn't belong to {}".format(
                request, domain))
            self.signals.send_catch_log(offdomain_request_dropped,
                                        request=request)
            raise IgnoreRequest()
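
As a Scrapy downloader middleware, this class only takes effect once it is registered in the project settings. A minimal sketch; the module path, class name, and order value here are placeholders, not the project's actual ones:

# settings.py (sketch; path and order are placeholders)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DomainCheckMiddleware': 543,
}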
Example 6
    def _extract_urls(self, html: str, url: str, sel: parsel.Selector,
                      base_url: str) -> List[Tuple[float, str]]:
        """Extract links from the page and return (score, url) pairs,
        where the score is the model's prediction for each link."""
        links = list(extract_link_dicts(sel, base_url))
        if not links:
            return []

        domain_from = get_domain(url)
        for link in links:
            link['domain_from'] = domain_from
            link['domain_to'] = get_domain(link['url'])

        if self.page_vectorizer:
            page_vec = self.page_vectorizer.transform([html])
        else:
            page_vec = None
        link_matrix = self.link_vectorizer.transform(links)
        AS = self.Q.join_As(link_matrix, page_vec)
        scores = self.Q.predict(AS)

        urls = [link['url'] for link in links]
        return list(zip(scores, urls))
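
A caller can rank the returned (score, url) pairs to visit the most promising links first. A minimal usage sketch, assuming a hypothetical instance ranker, a fetched page in html, and its URL in url (here also reused as the base URL):

import parsel

sel = parsel.Selector(text=html)
scored = ranker._extract_urls(html, url, sel, base_url=url)
# Tuples sort by their first element, so this orders links by score,
# highest first.
for score, link_url in sorted(scored, reverse=True):
    print("{:0.3f} {}".format(score, link_url))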