Example #1
    def add_links(self, link_container, depth=0, base=None):
        """
        Add a list of urls to self at a certain depth.

        :param link_container: list of urls
        :param depth: depth at which the urls have been harvested
        :param base: base at which the urls have been harvested
        """
        number_of_links = 0
        if not base:
            # Fall back to the first known base.
            base = self.base[0]
        for url_dict in link_container:
            url = url_dict['links']
            if "#" in url:
                url = url.split('#')[0]
                if not len(url):
                    continue
            # Resolve relative urls against the current base.
            url = urllib.parse.urljoin(base, url)
            if not validate.url_explicit(url):
                continue
            self.add(url, depth)
            number_of_links += 1
        logger.debug('{} links added @base {}.'.format(
            number_of_links, base))
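A minimal sketch of a call site for add_links, assuming harvested link dicts are keyed by 'links' (as the url_dict['links'] lookup above requires); collector stands in for an instance of the surrounding class and is hypothetical:

    harvested = [
        {'links': '/about#team'},           # relative; fragment stripped, then joined to base
        {'links': 'https://example.com/'},  # already absolute
        {'links': '#top'},                  # fragment-only: skipped
    ]
    collector.add_links(harvested, depth=1, base='https://example.com/')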
Example #2
    def append(self, url, depth):
        """
        Extract a url's base, and append the url and the base to self.

        :param url: a url to be added with its base.
        :param depth: crawl depth the url has to be added to.
        """
        if depth > CRAWL_DEPTH:
            # Ignore urls beyond the configured maximum crawl depth.
            return
        with self.lock:
            base = parse_base(url)
            # Open a fresh queue when the base is new at this depth, or when
            # the url itself passes explicit validation.
            if base not in self[depth] or validate.url_explicit(url):
                logger.debug('BASE_URL: adding new base @depth {} : {}'
                             .format(depth, base))
                # Derive the queue name from the base, dropping the scheme.
                if '//' in base:
                    queue_name = base.split('//')[1]
                else:
                    queue_name = base
                link_queue = FileQueue(
                    directory="../data",
                    name=queue_name,
                    persistent=True,
                    overwrite=True,
                    pickled=False
                )
                self.add_to_history(url)
                link_queue.put(url)
                # The base url itself is queued as crawlable only when
                # enabled in settings:
                if ALWAYS_INCLUDE_BASE_IN_CRAWLABLE_LINK_QUEUE:
                    self.add_to_history(base)
                    link_queue.put(base)
                self[depth][base] = link_queue
                self.base_queue.put((base, depth))
                if not self.load_from_db:
                    self.store(base, depth)
            else:
                logger.debug("BASE_URL: cannot add {}".format(base))
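The parse_base helper is not shown on this page. A plausible stand-in, assuming it reduces a url to its scheme and host (which matches how append() derives queue_name from the result), could be:

    from urllib.parse import urlparse

    def parse_base(url):
        # Assumed behaviour: reduce a full url to 'scheme://netloc', the form
        # that append() keys self[depth] on and splits on '//' for queue_name.
        parts = urlparse(url)
        return '{}://{}'.format(parts.scheme, parts.netloc)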
Example #3
    def parse(self, *args, **kwargs):
        # Split self.html on both '\n' and '\r', keeping only lines
        # that are explicit urls.
        self.links = [y for x in self.html.split('\n')
                      for y in x.split('\r')
                      if validate.url_explicit(y)]
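All three examples gate on validate.url_explicit, whose implementation is not shown here. A minimal stand-in consistent with its usage above (truthy only for well-formed absolute urls) might be:

    from urllib.parse import urlparse

    def url_explicit(url):
        # Assumed contract: accept only urls with an explicit scheme and host,
        # so bare text lines, fragments, and relative paths are filtered out.
        parts = urlparse(url)
        return parts.scheme in ('http', 'https') and bool(parts.netloc)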