def spider_from(self, url, source_url=None, limit=20000):
    print('%d links crawled, %d links in queue. url %s' % (len(self.hrefs), len(self.queue), url))
    if url not in self.hrefs:
        self.hrefs |= set([url])
        data = LinkExtractor(url, source_url=source_url).extract(get_meta=True)
        data.update({
            'guid': url,
            'stories_id': hashlib.md5(url.encode('utf-8')).hexdigest()
        })
        story = self.db.addStory(data)
        limit -= 1
        # Add the inlinks to a queue so we get the closest links first
        self.queue |= set([
            link['href'] for link in data['story_links']
            if link['inlink'] is True
            and link['href'] not in self.hrefs
            and not any(subl in link['href'] for subl in ('/topics/',))
        ])
    else:
        print('--> Already crawled')
    # Keep spidering while there is work left and the crawl budget is not exhausted.
    if self.queue and limit > 0:
        return self.spider_from(self.queue.pop(), source_url=source_url, limit=limit)
    return len(self.hrefs)

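# Note on the recursion above: spider_from() calls itself once per crawled URL, so
# CPython's default recursion limit (1000 frames) would be hit long before a limit
# of 20000. A minimal iterative sketch, assuming the same self.hrefs, self.queue,
# and self.db attributes used above (illustrative, not a drop-in replacement for
# the author's method):
def spider_from_iterative(self, url, source_url=None, limit=20000):
    self.queue.add(url)
    while self.queue and limit > 0:
        url = self.queue.pop()
        if url in self.hrefs:
            continue
        self.hrefs.add(url)
        data = LinkExtractor(url, source_url=source_url).extract(get_meta=True)
        data.update({'guid': url, 'stories_id': hashlib.md5(url.encode('utf-8')).hexdigest()})
        self.db.addStory(data)
        limit -= 1
        # Same filtering as above: keep uncrawled inlinks, skip '/topics/' URLs.
        self.queue |= set(
            link['href'] for link in data['story_links']
            if link['inlink'] is True
            and link['href'] not in self.hrefs
            and '/topics/' not in link['href']
        )
    return len(self.hrefs)
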
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    extractor = LinkExtractor(base_url=self.url)
    extractor.feed(html_data)
    for link in extractor.links:
        extra_data = self._get_html(url=link)
        if extra_data:
            self.total_bytes += len(extra_data)

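# Each of these run() variants relies on a self._get_html() helper that is not
# shown in this section. A minimal sketch using only the standard library,
# assuming the helper returns the decoded body on success and None on any fetch
# error (the signature is inferred from the calls above):
import urllib.error
import urllib.request

def _get_html(self, url):
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            return response.read().decode('utf-8', errors='replace')
    except (urllib.error.URLError, ValueError):
        return None
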
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    if self.go_ahead:
        extractor = LinkExtractor(base_url=self.url)
        extractor.feed(html_data)
        # One worker per extracted link; go_ahead=False keeps the workers
        # from spawning another level of workers themselves.
        sizers = [PageSizer(url=link, go_ahead=False) for link in extractor.links]
        for sizer in sizers:
            sizer.start()
        for sizer in sizers:
            sizer.join()
        for sizer in sizers:
            self.total_bytes += sizer.total_bytes

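# Reading sizer.total_bytes directly after join() only works when the workers
# share memory with the parent, e.g. if PageSizer subclasses threading.Thread;
# the Queue-based variant below is the shape needed once the workers become
# separate processes. A minimal skeleton these run() methods could live in, with
# the constructor arguments inferred from the calls above (an assumption, not
# the author's actual class):
import threading

class PageSizer(threading.Thread):
    def __init__(self, url, go_ahead=True, collector=None):
        super().__init__()
        self.url = url
        self.go_ahead = go_ahead
        self.collector = collector
        self.total_bytes = 0
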
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    if self.go_ahead:
        extractor = LinkExtractor(base_url=self.url)
        extractor.feed(html_data)
        # Child workers report their byte counts through a shared queue,
        # since their local attributes are not visible to the parent.
        collector = multiprocessing.Queue()
        sizers = [PageSizer(url=link, go_ahead=False, collector=collector)
                  for link in extractor.links]
        for sizer in sizers:
            sizer.start()
        for sizer in sizers:
            sizer.join()
        while not collector.empty():
            data = collector.get()
            self.total_bytes += data['total_bytes']
    self.collector.put(dict(url=self.url, total_bytes=self.total_bytes))

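# A sketch of how the Queue-based variant might be driven from the top level,
# assuming PageSizer is a multiprocessing.Process subclass in this version and
# that the collector passed to the constructor receives the final total (both
# inferred from the code above; the URL is illustrative only):
import multiprocessing

if __name__ == '__main__':
    results = multiprocessing.Queue()
    sizer = PageSizer(url='https://example.com', go_ahead=True, collector=results)
    sizer.start()
    sizer.join()
    summary = results.get()
    print('%s: %d bytes' % (summary['url'], summary['total_bytes']))
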
import functools
import queue


class Provider:
    def __init__(self, queue, output):
        self._queue = queue
        self._output = output
        self._used_links = set()

    def process(self, data):
        url, links = data
        # Only enqueue links that have not been seen before.
        fresh = set(links) - self._used_links
        self._used_links.update(fresh)
        for link in fresh:
            self._queue.put(link)
        data = (url, fresh)
        self._output.write(functools.partial(self._save, data))

    @staticmethod
    def _save(data, fp):
        url, links = data
        fp.write('--URL crawled---:' + url + '\n')
        print('URL crawled---:' + url)
        for link in links:
            fp.write('\t' + link + '\n')
            print(link)


queue = queue.Queue()
queue.put('https://github.com')
provider = Provider(queue, _base.Output('links.txt'))
looper = Looper(queue, [LinkExtractor(provider)])
looper.run(timeout=1)
queue.join()