Example #1
    def _extract_links(self, selector, response_url, response_encoding, base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector._root):

            if not self._is_valid_link(el, attr, attr_val):
                continue

            attr_val = urljoin(base_url, attr_val)
            url = self.process_attr(attr_val)

            if url is None:
                continue
            if isinstance(url, unicode):
                url = url.encode(response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            if el.tag != 'a':
                link = AssetLink(url, _collect_string_content(el) or u'',
                    nofollow=True if el.get('rel') == 'nofollow' else False)
            else:
                link = PageLink(url, _collect_string_content(el) or u'',
                    nofollow=True if el.get('rel') == 'nofollow' else False)
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
                if self.unique else links        
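Throughout these examples, unique_list is Scrapy's order-preserving deduplication helper (scrapy.utils.python.unique, imported under the alias unique_list in the link extractor modules). As a reference only, a minimal sketch with the same behaviour could look like this:

def unique_list(list_, key=lambda x: x):
    # Sketch of an order-preserving dedup helper: keep the first item seen
    # for each key. Not the canonical Scrapy implementation.
    seen = set()
    result = []
    for item in list_:
        item_key = key(item)
        if item_key in seen:
            continue
        seen.add(item_key)
        result.append(item)
    return result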
Example #2
    def _extract_links(self, selector, response_url, response_encoding,
                       base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector._root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            if isinstance(url, unicode):
                url = url.encode(response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(
                url,
                _collect_string_content(el) or u'',
                nofollow=True if el.get('rel') == 'nofollow' else False)
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
                if self.unique else links
Example #3
 def extract_links(self, response):
     result = json.loads(response.text)
     for pattern in self.patterns:
         extractors = pattern.get('extractors')
         format = pattern.get('format')
         data = result
         for extractor in extractors:
             type = extractor.get('type')
             if isinstance(data, dict):
                 if type == 'value':
                     data = self.get_value(*([data] +
                                             extractor.get('args')))
             elif isinstance(data, list):
                 if type == 'value':
                     data = [
                         self.get_value(*([item] + extractor.get('args')))
                         for item in data
                     ]
                 elif type == 'slice':
                     data = self.get_slice(*([data] +
                                             extractor.get('args')))
         if not isinstance(data, list):
             data = [data]
         all_links = [
             Link(response.urljoin(format.format(*item)))
             if isinstance(item, list)
             else Link(response.urljoin(format.format(item)))
             for item in data
         ]
         return unique_list(all_links)
Example #4
    def _extract_links(self, selector, response_url, response_encoding, base_url):
        '''
        Pretty much the same function, just added 'ignore' to url.encode
        '''
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            if isinstance(url, unicode):
                # add 'ignore' to encoding errors
                url = url.encode(response_encoding, 'ignore')
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
                        nofollow=True if el.get('rel') == 'nofollow' else False)
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
                if self.unique else links
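The only change in this variant is the 'ignore' error handler, which drops characters the response encoding cannot represent instead of raising UnicodeEncodeError. A tiny illustration with a made-up URL:

url = u'https://example.com/caf\xe9'
url.encode('ascii', 'ignore')   # -> 'https://example.com/caf' (the e-acute is silently dropped)
# without 'ignore', url.encode('ascii') would raise UnicodeEncodeError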
Example #5
    def _process_links(self, links):
        """ Normalize and filter extracted links

        The subclass should override it if necessary
        """
        links = unique_list(links, key=lambda link: link.url) if self.unique else links
        return links
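_process_links is the hook that subclasses override to normalize or filter extracted links; the default above only deduplicates. A hypothetical subclass (the class name and the PDF filter are illustrative, not taken from any example here) might look like:

from scrapy.linkextractors import LinkExtractor
from scrapy.utils.python import unique as unique_list

class NoPdfLinkExtractor(LinkExtractor):
    def _process_links(self, links):
        # drop direct PDF links, then deduplicate by URL as the base class does
        links = [link for link in links if not link.url.lower().endswith('.pdf')]
        return unique_list(links, key=lambda link: link.url)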
Example #6
    def _process_links(self, links):
        """ Normalize and filter extracted links

        The subclass should override it if necessary
        """
        links = unique_list(links, key=lambda link: link.url) if self.unique else links
        return links
Example #7
 def extract_links(self, response):
     if not self.base_url:
         self.base_url = get_base_url(response)
     items = re.findall(self.restrict_re, response.text)
     all_links = [
         Link(response.urljoin(self.base_url.format(str(item))))
         for item in items
     ]
     return unique_list(all_links)
Example #8
 def extract_links(self, response):
     base_url = get_base_url(response)
     if self.restrict_xpaths:
         docs = [subdoc for x in self.restrict_xpaths for subdoc in response.xpath(x)]
     else:
         docs = [response.selector]
     all_links = []
     for doc in docs:
         links = self._extract_links(doc, response.url, response.encoding, base_url)
         all_links.extend(self._process_links(links))
     return unique_list(all_links)
Example #9
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.scan_tag(e.tag):
                if self.scan_attr(a):
                    link = Link(self.process_attr(l), text=e.text)
                    self.links.append(link)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        return links
Example #10
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.scan_tag(e.tag):
                if self.scan_attr(a):
                    link = Link(self.process_attr(l), text=e.text)
                    self.links.append(link)

        links = unique_list(self.links, key=lambda link: link.url) \
                if self.unique else self.links

        return links
Example #11
 def extract_links(self, response):
     base_url = get_base_url(response)
     if self.restrict_xpaths:
         docs = [subdoc
                 for x in self.restrict_xpaths
                 for subdoc in response.xpath(x)]
     else:
         docs = [response.selector]
     all_links = []
     for doc in docs:
         links = self._extract_links(doc, response.url, response.encoding, base_url)
         all_links.extend(self._process_links(links))
     return unique_list(all_links)
Example #12
    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser) 

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding)
            ret.append(link)

        return ret
Example #13
 def extract_links(self, response):
     html = Selector(response)
     try:
         base_url = response.xpath("//base/@href").extract()[0]
     except IndexError:
         base_url = get_base_url(response)
     if self.restrict_xpaths:
         docs = [subdoc for x in self.restrict_xpaths for subdoc in html.xpath(x)]
     else:
         docs = [html]
     all_links = []
     for doc in docs:
         links = self._extract_links(doc, response.url, response.encoding, base_url)
         all_links.extend(self._process_links(links))
     return unique_list(all_links)
Example #14
def merge_clusters(merged, all_urls, min_cluster_size):
    res = {'clusters': {}, 'unclustered': []}
    unclustered = all_urls
    for regex in merged:
        matches = apply_reg_ex_to_urls(regex, all_urls)
        matches = unique_list(matches)
        if len(matches) >= min_cluster_size:
            for match in matches:
                try:
                    unclustered.remove(match)
                except ValueError:
                    # URL was already removed by an earlier, overlapping cluster
                    continue
            human = regex.replace('([^/]+)', '[...]').replace('([^&=?]+)', '[...]').replace('(\d+)', '[NUMBER]')
            res['clusters'].update({(regex, human): matches})
    res['unclustered'] = unclustered
    return res
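A hedged usage sketch of merge_clusters follows; the pattern, the URLs and the behaviour of apply_reg_ex_to_urls (assumed to return the URLs matching the regex) are illustrative only:

patterns = [r'https://shop\.example/item/(\d+)']
urls = [
    'https://shop.example/item/1',
    'https://shop.example/item/2',
    'https://shop.example/about',
]
result = merge_clusters(patterns, list(urls), min_cluster_size=2)
# result['clusters'] maps (regex, human-readable pattern) -> the matching URLs;
# result['unclustered'] keeps every URL that no pattern claimed.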
Example #15
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Example #16
def get_urls(_url, _html=None, headers=None):
    if _html is None: 
        response = requests.get(_url, verify=False, headers=headers)
        _html = response.content
    page = str(BeautifulSoup(_html))
    url_list = []
    while True:
        url, n = get_url(page)
        page = page[n:]
        if url:
            url = urljoin(_url, url)
            url_list.append(url)
        else:
            break
    url_list = unique_list(url_list)
    return url_list
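A usage sketch for get_urls (the URL and header values are placeholders; get_url is assumed to return the next link found in the markup plus the offset at which to resume scanning):

for url in get_urls('https://example.com/', headers={'User-Agent': 'link-audit/0.1'}):
    print(url)   # each absolute URL appears once, in discovery order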
Example #17
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = self.base_url if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Example #18
    def extract_links(self, response):
        base_url = get_base_url(response)

        if self.restrict_xpaths:
            links = [link
                    for xpath in self.restrict_xpaths
                    for link in response.xpath(xpath)]
        else:
            links = [response.selector,]

        all_links = [Link(response.url),]
        
        for link in links:
            new_link = self._extract_links(link, response.url, response.encoding, base_url)
            all_links.extend(self._process_links(new_link))
        
        return unique_list(all_links)
Example #19
    def _extract_links(self, response_text, response_url, response_encoding):
        links = []
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.tag_func(e.tag):
                if self.attr_func(a):
                    l = safe_url_string(l, response_encoding)
                    text = u''
                    if e.text:
                        text = str_to_unicode(e.text, response_encoding, errors='replace').strip()
                    link = Link(self.process_func(l), text=text)
                    links.append(link)

        links = unique_list(links, key=lambda link: link.url) \
                if self.unique else links

        return links
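Several of these variants rely on lxml's iterlinks() protocol, which yields (element, attribute, link, pos) tuples; a tiny standalone illustration:

import lxml.html

doc = lxml.html.fromstring('<p><a href="/next">next</a></p>')
doc.make_links_absolute('https://example.com/')
for element, attribute, link, pos in doc.iterlinks():
    print(element.tag, attribute, link)   # a href https://example.com/next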
Example #20
    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(
            self.links,
            key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text,
                                       response_encoding,
                                       errors='replace')
            ret.append(link)

        return ret
Example #21
    def _extract_links(self, selector, response_url, response_encoding, base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector._root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            attr_val = urljoin(base_url, attr_val)
            url = self.process_attr(attr_val)
            if url is None:
                continue
            if isinstance(url, unicode):
                url = url.encode(response_encoding, 'ignore')
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
                        nofollow=True if el.get('rel') == 'nofollow' else False)
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
            if self.unique else links
Example #22
	def extract_links(self, response):
		base_url = get_base_url(response)
		print "base_url", base_url
		domain_name=tldextract.extract(base_url).domain
		if domain_name in self.crawledPagesPerSite and self.crawledPagesPerSite[domain_name]>self.maximumPagesPerSite:
			return []

		if self.restrict_xpaths:
			docs = [subdoc for x in self.restrict_xpaths for subdoc in response.xpath(x)]
		else:
			docs = [response.selector]
		all_links = []
		for doc in docs:
			links = self._extract_links(doc, response.url, response.encoding, base_url)
			all_links.extend(self._process_links(links))

		all_links=unique_list(all_links)

		new_all_links=[]

		for link in all_links:
			url=link.url
			
			domain_name=tldextract.extract(url).domain
			suffix=tldextract.extract(url).suffix
			domain_and_suffix=domain_name+"."+suffix

			if domain_and_suffix not in self.allow_domains:
				continue
				
			if domain_name in self.crawledPagesPerSite:
				self.crawledPagesPerSite[domain_name]+=1
			else:
				self.crawledPagesPerSite[domain_name]=1

			if self.crawledPagesPerSite[domain_name]>self.maximumPagesPerSite:
				break	
			else:
				print "have crawled " , self.crawledPagesPerSite[domain_name], "pages"
				new_all_links.append(link)
		return new_all_links
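The per-domain bookkeeping above depends on tldextract splitting a URL into subdomain, domain and suffix; for reference:

import tldextract

parts = tldextract.extract('https://news.example.co.uk/story/1')
print(parts.domain)                        # example
print(parts.domain + '.' + parts.suffix)   # example.co.uk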
Example #23
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Example #24
 def extract_links(self, response):
     # TODO: remove debug code
     with open('/export/home/asanakoy/tmp/response.txt', 'w') as f:
         f.write(response.body)
     assert False, 'enough ;)'
     base_url = self.base_url if self.base_url else get_base_url(response)
     if self.restrict_xpaths:
         docs = [
             subdoc for x in self.restrict_xpaths
             for subdoc in response.xpath(x)
         ]
     else:
         docs = [response.selector]
     all_links = []
     for doc in docs:
         links = self._extract_links(doc, response.url, response.encoding,
                                     base_url)
         print 'Num links before filter:', len(links)
         all_links.extend(self._process_links(links))
     print 'Num links:', len(all_links)
     return unique_list(all_links)
Example #25
    def extract_links(self, response):
        """Returns a list of :class:`~scrapy.link.Link` objects from the
        specified :class:`response <scrapy.http.Response>`.

        Only links that match the settings passed to the ``__init__`` method of
        the link extractor are returned.

        Duplicate links are omitted.
        """
        base_url = get_base_url(response)
        if self.restrict_xpaths:
            docs = [subdoc
                    for x in self.restrict_xpaths
                    for subdoc in response.xpath(x)]
        else:
            docs = [response.selector]
        all_links = []
        for doc in docs:
            links = self._extract_links(doc, response.url, response.encoding, base_url)
            all_links.extend(self._process_links(links))
        return unique_list(all_links)
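This is the documented entry point; a typical call site inside a spider could look like the following sketch (the spider, its URLs and the XPath restriction are illustrative assumptions):

from scrapy import Request, Spider
from scrapy.linkextractors import LinkExtractor

class FollowLinksSpider(Spider):
    name = 'follow_links'
    start_urls = ['https://example.com/']

    def parse(self, response):
        extractor = LinkExtractor(restrict_xpaths=('//div[@id="content"]',))
        for link in extractor.extract_links(response):   # duplicates already removed
            yield Request(link.url, callback=self.parse)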
Example #26
    def extract_links(self, response):
        """Returns a list of :class:`~scrapy.link.Link` objects from the
        specified :class:`response <scrapy.http.Response>`.

        Only links that match the settings passed to the ``__init__`` method of
        the link extractor are returned.

        Duplicate links are omitted.
        """
        base_url = get_base_url(response)
        if self.restrict_xpaths:
            docs = [
                subdoc
                for x in self.restrict_xpaths  # outer loop: one pass per restrict_xpaths expression
                for subdoc in response.xpath(x)  # inner loop: every sub-selector matched by that xpath
            ]  # e.g. [a for i in range(3) for a in range(10)] -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        else:
            docs = [response.selector]
        all_links = []
        for doc in docs:
            links = self._extract_links(doc, response.url, response.encoding, base_url)  # actually dispatches to LxmlParserLinkExtractor._extract_links
            all_links.extend(self._process_links(links))
        return unique_list(all_links)
Example #27
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        if self.scan_tag(el.tag) and self.scan_attr(attr):
            # pseudo root.make_links_absolute(base_url)
            # START PATCH: Added check to filter links before making absolute
            if not _is_valid_link(attr_val):
                continue
            # END PATCH
            attr_val = urljoin(base_url, attr_val)
            url = self.process_attr(attr_val)
            if url is None:
                continue
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(
                url, _collect_string_content(el) or '',
                nofollow=True if el.get('rel') == 'nofollow' else False
            )
            links.append(link)
    return unique_list(links, key=lambda link: link.url) \
        if self.unique else links
Example #28
File: utils.py Project: ICCV/chaos
    def _extract_from_js(self, doc, response_url, response_encoding, base_url):
        data = doc.re('siblings:\s*(?P<data>.*)\s*\, registryURL')
        if not data:
            return []
        links = []
        articles = json.loads(data[0])
        for article in articles['articleList']:
            try:
                attr_val = urljoin(base_url, article['uri'])
            except ValueError:
                continue # skipping bogus links
            else:
                url = self.link_extractor.process_attr(attr_val)
                if url is None:
                    continue
            if isinstance(url, unicode):
                url = url.encode(response_encoding)

            url = urljoin(response_url, url)
            link = Link(url, u'', nofollow=False)
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
                if self.link_extractor.unique else links
Example #29
 def _deduplicate_if_needed(self, links):
     if self.unique:
         return unique_list(links, key=lambda link: link.url)
     return links
Example #30
 def extract_links(self, response):
     links = super(RmDupliLinkExtractor, self).extract_links(response)
     base_url = get_base_url(response)
     return unique_list([link for link in links if not (link.url.startswith(base_url) and base_url.endswith(link.url[len(base_url):]))])
Example #31
 def _deduplicate_if_needed(self, links):
     if self.unique:
         return unique_list(links, key=self.link_key)
     return links
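In these newer variants the deduplication key is configurable through self.link_key; a self-contained sketch of that pattern (the class name and defaults are assumptions):

from scrapy.utils.python import unique as unique_list

class DedupByKeyMixin(object):
    def __init__(self, unique=True, link_key=lambda link: link.url):
        self.unique = unique
        self.link_key = link_key     # callable that picks the deduplication key

    def _deduplicate_if_needed(self, links):
        if self.unique:
            return unique_list(links, key=self.link_key)
        return links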
Example #32
    def _process_links(self, links):
        """ Normalize and filter extracted links

        The subclass should override it if necessary
        """
        return unique_list(links, key=self.link_key) if self.unique else links
Example #33
    def _process_links(self, links):
        """ Normalize and filter extracted links

        The subclass should override it if necessary
        """
        return unique_list(links, key=self.link_key) if self.unique else links
Example #34
 def _deduplicate_if_needed(self, links):
     if self.unique:
         return unique_list(links, key=self.link_key)
     return links
Example #35
 def extract_links(self, response):
     all_links = []
     for json_path in self.json_paths:
         links = self._extract_links(json_path, response)
         all_links.extend(self._process_links(links))
     return unique_list(all_links)
Example #36
 def _deduplicate_if_needed(self, links):
     if self.unique:
         return unique_list(links, key=lambda link: link.url)
     return links