def add_links(self, link_container, depth=0, base=None):
    """
    Add a list of urls to self at a certain depth.

    :param link_container: list of urls
    :param depth: depth at which the urls have been harvested
    :param base: base at which the urls have been harvested
    """
    number_of_links = 0
    if not base:
        base = self.base[0]
    for url_dict in link_container:
        url = url_dict['links']
        # strip the fragment; a url that was only a fragment is skipped:
        if "#" in url:
            url = url.split('#')[0]
            if not len(url):
                continue
        url = urllib.parse.urljoin(base, url)
        if not validate.url_explicit(url):
            continue
        self.add(url, depth)
        number_of_links += 1
    logger.debug('{} links added @base {}.'.format(number_of_links, base))
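# A minimal usage sketch for add_links, assuming `store` is an instance of
# this class and that harvested links arrive as dicts keyed by 'links'; the
# values below are illustrative, not taken from a real crawl:
#
#     harvested = [{'links': '/about#team'}, {'links': '#top'}]
#     store.add_links(harvested, depth=1, base='https://example.com')
#     # '/about#team' loses its fragment, is joined against the base to
#     # 'https://example.com/about', validated, and added at depth 1;
#     # '#top' becomes an empty string after the split and is skipped.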
def append(self, url, depth):
    """
    Extract a url's base, and append the url and the base to self.

    :param url: a url to be added with its base.
    :param depth: crawl depth the url has to be added to.
    """
    if depth > CRAWL_DEPTH:
        return
    with self.lock:
        base = parse_base(url)
        if base not in self[depth] or validate.url_explicit(url):
            logger.debug('BASE_URL: adding new base @depth {}: {}'
                         .format(depth, base))
            # the queue is named after the base, without the scheme:
            if '//' in base:
                queue_name = base.split('//')[1]
            else:
                queue_name = base
            link_queue = FileQueue(
                directory="../data",
                name=queue_name,
                persistent=True,
                overwrite=True,
                pickled=False
            )
            self.add_to_history(url)
            link_queue.put(url)
            # base urls are added to the crawl queue only if set in
            # settings:
            if ALWAYS_INCLUDE_BASE_IN_CRAWLABLE_LINK_QUEUE:
                self.add_to_history(base)
                link_queue.put(base)
            self[depth][base] = link_queue
            self.base_queue.put((base, depth))
            if not self.load_from_db:
                self.store(base, depth)
        else:
            logger.debug("BASE_URL: cannot add {}".format(base))
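# An illustrative walk-through of append, assuming parse_base (defined
# elsewhere in this project) maps 'https://example.com/a' to
# 'https://example.com'; `container` and the values are hypothetical:
#
#     container.append('https://example.com/a', depth=0)
#     # base       -> 'https://example.com'
#     # queue_name -> 'example.com' (scheme stripped via split('//'))
#     # a persistent FileQueue under ../data/ receives the url, and
#     # (base, 0) is put on base_queue for the crawler to consume.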
def parse(self, *args, **kwargs):
    """
    Split self.html on line endings and keep only the lines that are
    explicit urls.
    """
    self.links = [y for x in self.html.split('\n')
                  for y in x.split('\r')
                  if validate.url_explicit(y)]
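# A short sketch of parse on a plain-text url list; the parser class name
# is assumed for illustration, as is validate.url_explicit accepting
# well-formed http(s) urls:
#
#     parser = TextParser()
#     parser.html = 'https://a.example/x\r\nnot a url\nhttps://b.example/y'
#     parser.parse()
#     parser.links  # -> ['https://a.example/x', 'https://b.example/y']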