def get_hostname(request):
    """Return the host name of ``request.url`` encoded as UTF-8 bytes.

    The URL is split with ``parse_domain_from_url_fast`` and the second
    element of its result (the host name) is encoded, ignoring characters
    that cannot be encoded.

    :param request: object exposing ``url`` and a ``meta`` mapping that
        holds a ``'fingerprint'`` entry (used only for the error message).
    :return: the encoded host name, or ``None`` when the URL cannot be
        parsed (the failure is logged).
    """
    try:
        _, name, _, _, _, _ = parse_domain_from_url_fast(request.url)
    except Exception as e:
        # Best effort: a malformed URL must not break the caller.
        logger.error("URL parsing error %s, fingerprint %s, url %s" %
                     (e, request.meta['fingerprint'], request.url))
        return None
    else:
        # Previously the success path fell off the end and returned None.
        return name.encode('utf-8', 'ignore')
def schedule(self, batch):
    """Store every schedulable request of ``batch`` as a queue row.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
    entries whose last element is falsy are ignored.  For the others the
    host name of the request URL selects the partition and the host CRC32;
    when no host name is found the error is logged and the first partition
    with a CRC32 of ``0`` is used.  All rows are saved in bulk and the
    session is committed, even when there is nothing to save.
    """
    rows = []
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
        if hostname:
            partition_id = self.partitioner.partition(hostname, self.partitions)
            host_crc32 = get_crc32(hostname)
        else:
            self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
            partition_id = self.partitions[0]
            host_crc32 = 0

        rows.append(self.queue_model(
            fingerprint=to_native_str(fprint),
            score=score,
            url=request.url,
            meta=request.meta,
            headers=request.headers,
            cookies=request.cookies,
            method=to_native_str(request.method),
            partition_id=partition_id,
            host_crc32=host_crc32,
            created_at=time() * 1E+6,
        ))
        request.meta[b'state'] = States.QUEUED

    self.session.bulk_save_objects(rows)
    self.session.commit()
def schedule(self, batch):
    """Store every schedulable request of ``batch`` as a queue row.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
    entries whose last element is falsy are ignored.  The host name of the
    request URL selects the partition and the host CRC32; without a host
    name the error is logged and the first partition with CRC32 ``0`` is
    used.  Rows are saved in bulk and the session is committed afterwards.
    """
    rows = []
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
        if hostname:
            partition_id = self.partitioner.partition(hostname, self.partitions)
            host_crc32 = get_crc32(hostname)
        else:
            self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
            partition_id = self.partitions[0]
            host_crc32 = 0

        rows.append(self.queue_model(
            fingerprint=fprint,
            score=score,
            url=request.url,
            meta=request.meta,
            headers=request.headers,
            cookies=request.cookies,
            method=request.method,
            partition_id=partition_id,
            host_crc32=host_crc32,
            created_at=time() * 1E+6,
        ))
        request.meta['state'] = States.QUEUED

    self.session.bulk_save_objects(rows)
    self.session.commit()
def get_hostname(request):
    """Return the host name of ``request.url`` as UTF-8 bytes, or ``None``.

    ``None`` is returned (after logging the error together with the
    request's ``b'fingerprint'`` meta entry and URL) when
    ``parse_domain_from_url_fast`` raises or does not yield six values.
    """
    try:
        parsed = parse_domain_from_url_fast(request.url)
        netloc, name, scheme, sld, tld, subdomain = parsed
    except Exception as err:
        message = "URL parsing error %s, fingerprint %s, url %s" % (
            err, request.meta[b'fingerprint'], request.url)
        logger.error(message)
        return None
    # Characters that cannot be encoded are dropped rather than raising.
    return name.encode('utf-8', 'ignore')
def schedule(self, batch):
    """Push every schedulable request of ``batch`` onto its partition heap.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
    entries with a falsy last element are skipped.  The score is stored in
    ``request.meta[b'_scr']`` and the request is pushed to the heap of the
    partition chosen from its host name.  When the URL has no host name the
    error is logged and the first partition is used.
    """
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        request.meta[b'_scr'] = score
        _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
        if hostname:
            partition_id = self.partitioner.partition(hostname, self.partitions)
        else:
            self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
            partition_id = self.partitions[0]
        self.heap[partition_id].push(request)
def schedule(self, batch):
    """Collect the schedulable requests of ``batch`` and hand them to ``_schedule``.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples.
    For each entry with a truthy ``schedule`` flag a
    ``(score, fingerprint, domain, url)`` tuple is built, where ``domain``
    is ``request.meta['domain']``.  If that meta entry is missing it is
    filled with ``{'name': hostname}`` parsed from the URL; an empty host
    name is logged as an error but still stored.
    """
    pending = []
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        meta = request.meta
        if 'domain' not in meta:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
            meta['domain'] = {'name': hostname}

        entry = (score, fprint, meta['domain'], request.url)
        pending.append(entry)

    self._schedule(pending)
def _is_domain_blacklisted(self, request):
    """Tell whether the request's host name is in ``self.domains_blacklist``.

    The host name is read from ``request.meta['domain']['name']`` when the
    ``'domain'`` meta entry exists, otherwise it is parsed from the URL.
    The comparison is done on the lower-cased host name.

    :return: ``True`` for a black-listed host (a debug message is logged),
        ``False`` otherwise, and ``None`` when the blacklist is empty.
    """
    blacklist = self.domains_blacklist
    if not blacklist:
        # Nothing configured: bare return, i.e. None rather than False.
        return

    if 'domain' in request.meta:
        hostname = request.meta['domain'].get('name')
    else:
        _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)

    if not hostname:
        return False

    if hostname.lower() in self.domains_blacklist:
        self.logger.debug("Dropping black-listed hostname, URL %s", request.url)
        return True
    return False
def schedule(self, batch):
    """Group schedulable requests by crawl timestamp and schedule each group.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
    entries with a falsy flag are skipped.  A request is grouped under
    ``request.meta[b'crawl_at']`` when present, otherwise under the current
    time in whole seconds.  Each group, a list of ``(request, score)``
    pairs, is passed to ``self._schedule`` with its timestamp.
    """
    now = int(time())
    by_timestamp = dict()
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        meta = request.meta
        if b'domain' not in meta:
            # TODO: this should always be done by DomainMiddleware; the
            # proposal is to make HBaseBackend require it and drop this code.
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
            meta[b'domain'] = {'name': hostname}

        if b'crawl_at' in meta:
            timestamp = meta[b'crawl_at']
        else:
            timestamp = now
        by_timestamp.setdefault(timestamp, []).append((request, score))

    for timestamp, group in six.iteritems(by_timestamp):
        self._schedule(group, timestamp)
def update_score(self, batch):
    """Write new scores to the batch and schedule the flagged fingerprints.

    :param batch: dict mapping a hex fingerprint to a
        ``(score, url, schedule)`` tuple.
    :raises TypeError: if ``batch`` is not a dict.

    Every score is put into ``self.batch`` under the un-hexlified
    fingerprint.  Entries with a truthy ``schedule`` flag are additionally
    queued as ``(score, fingerprint, {"name": hostname}, url)``; entries
    whose URL has no host name are logged and not queued.
    ``self.queue.schedule`` is always called, possibly with an empty list.
    """
    if not isinstance(batch, dict):
        raise TypeError("batch should be dict with fingerprint as key, and float score as value")

    to_schedule = []
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for fprint, (score, url, schedule) in batch.items():
        obj = prepare_hbase_object(score=score)
        rk = unhexlify(fprint)
        self.batch.put(rk, obj)
        if schedule:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
            if not hostname:
                self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                continue
            to_schedule.append((score, fprint, {"name": hostname}, url))
    self.queue.schedule(to_schedule)
def parse_domain_info(url, test_mode=False):
    """Return the domain components of ``url`` as a dict.

    :param url: URL to analyse.
    :param test_mode: when true the URL is not really parsed: ``netloc``
        and ``name`` become the leading upper-case letter of ``url`` (if it
        is followed by at least one word character, otherwise ``'?'``) and
        every other field is ``'-'``.
    :return: dict with the keys ``netloc``, ``name``, ``scheme``, ``sld``,
        ``tld`` and ``subdomain``.
    """
    if test_mode:
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = match.groups()[0] if match else '?'
        scheme = sld = tld = subdomain = '-'
    else:
        netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url)
    return {
        'netloc': netloc,
        'name': name,
        'scheme': scheme,
        'sld': sld,
        'tld': tld,
        'subdomain': subdomain,
    }
def schedule(self, batch):
    """Create and save a queue entry for every schedulable request.

    ``batch`` yields ``(fingerprint, score, request, schedule_at)`` tuples;
    entries with a falsy ``schedule_at`` are skipped.  The host name of the
    URL selects the partition and host CRC32 (first partition and ``0``
    when it is missing, after logging an error).  The entry built by
    ``self._create_queue`` is saved immediately and the request is marked
    as queued in its meta.
    """
    for fprint, score, request, schedule_at in batch:
        if not schedule_at:
            continue

        _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
        if hostname:
            partition_id = self.partitioner.partition(hostname, self.partitions)
            host_crc32 = get_crc32(hostname)
        else:
            self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
            partition_id = self.partitions[0]
            host_crc32 = 0

        created_at = time() * 1E+6
        self._create_queue(request, fprint, score, partition_id, host_crc32, created_at).save()
        request.meta['state'] = States.QUEUED
def get_key(request):
    """Return the key derived from the request's domain.

    When ``request.meta`` holds a ``b'domain'`` entry, a dict yields its
    ``b'name'`` item and an int is returned unchanged; any other type
    raises ``TypeError``.  Without that entry the host name is parsed from
    ``request.url`` and returned as UTF-8 bytes, or ``None`` if parsing
    raises.
    """
    domain = request.meta.get(b'domain')
    if domain is not None:
        # Exact type match on purpose: subclasses (e.g. bool) are rejected.
        kind = type(domain)
        if kind is dict:
            return domain[b'name']
        if kind is int:
            return domain
        raise TypeError("domain of unknown type.")

    try:
        _, name, _, _, _, _ = parse_domain_from_url_fast(request.url)
    except Exception:
        return None
    return name.encode('utf-8', 'ignore')
def parse_domain_info(url, test_mode=False):
    """Return the domain components of ``url`` as a dict.

    :param url: URL to analyse.
    :param test_mode: when true the URL is not really parsed: ``netloc``
        and ``name`` become the leading upper-case letter of ``url`` (if it
        is followed by at least one word character, otherwise ``'?'``) and
        every other field is ``'-'``.
    :return: dict with the keys ``netloc``, ``name``, ``scheme``, ``sld``,
        ``tld`` and ``subdomain``.
    """
    if test_mode:
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = match.groups()[0] if match else '?'
        scheme = sld = tld = subdomain = '-'
    else:
        netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(
            url)
    return {
        'netloc': netloc,
        'name': name,
        'scheme': scheme,
        'sld': sld,
        'tld': tld,
        'subdomain': subdomain,
    }
def schedule(self, batch):
    """Insert every schedulable request of ``batch`` into the ``queue`` table.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
    entries with a falsy flag are skipped.  For each remaining request the
    partition and host CRC32 are derived from the URL's host name (first
    partition and ``0`` when it is missing), absent meta entries are filled
    with defaults, and one parameter tuple is prepared for the INSERT
    statement.  The inserts are executed concurrently and the number of
    queued URLs is reported to ``self.counter_cls``.
    """
    insert_stmt = self.session.prepare(
        "INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, "
        "created_at, meta, depth, headers, method, cookies) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
    rows = []
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
        if hostname:
            partition_id = self.partitioner.partition(hostname, self.partitions)
            host_crc32 = get_crc32(hostname)
        else:
            self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
            partition_id = self.partitions[0]
            host_crc32 = 0
        created_at = time() * 1E+6

        # Defaults are rebuilt per request so the mutable ones are never shared.
        defaults = (
            ("domain", {}),
            ("origin_is_frontier", ''),
            ("scrapy_callback", None),
            ("scrapy_errback", None),
            ("scrapy_meta", {}),
            ("score", 0),
            ("jid", 0),
        )
        for key, default in defaults:
            if key not in request.meta:
                request.meta[key] = default

        meta = Meta(domain=request.meta['domain'],
                    fingerprint=fprint,
                    origin_is_frontier=request.meta['origin_is_frontier'],
                    scrapy_callback=request.meta['scrapy_callback'],
                    scrapy_errback=request.meta['scrapy_errback'],
                    scrapy_meta=request.meta['scrapy_meta'])
        # The ninth value (depth) is always inserted as 0.
        rows.append((self.crawl_id, fprint, score, partition_id, host_crc32, request.url,
                     created_at, meta, 0, request.headers, request.method, request.cookies))
        request.meta['state'] = States.QUEUED

    execute_concurrent_with_args(self.session, insert_stmt, rows, concurrency=400)
    self.counter_cls.cass_count({"queued_urls": len(rows)})
def schedule(self, batch):
    """Group schedulable requests by crawl timestamp and schedule each group.

    ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
    entries with a falsy flag are skipped.  A request is grouped under
    ``request.meta[FIELD_CRAWL_AT]`` when present, otherwise under the
    current time in whole seconds.  Each group, a list of
    ``(request, score)`` pairs, is passed to ``self._schedule`` together
    with its timestamp.
    """
    now = int(time())
    by_timestamp = dict()
    for fprint, score, request, should_schedule in batch:
        if not should_schedule:
            continue

        meta = request.meta
        # TODO: This is done by DomainMiddleware - RedisBackend should require DomainMiddleware
        if FIELD_DOMAIN not in meta:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
            if not hostname:
                self._logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
            meta[FIELD_DOMAIN] = {'name': hostname}

        if FIELD_CRAWL_AT in meta:
            timestamp = meta[FIELD_CRAWL_AT]
        else:
            timestamp = now
        by_timestamp.setdefault(timestamp, []).append((request, score))

    for timestamp, group in by_timestamp.items():
        self._schedule(group, timestamp)
def update_score(self, batch):
    """Write new scores to the batch and schedule the flagged fingerprints.

    :param batch: dict mapping a hex fingerprint to a
        ``(score, url, schedule)`` tuple.
    :raises TypeError: if ``batch`` is not a dict.

    Every score is put into ``self.batch`` under the un-hexlified
    fingerprint.  Entries with a truthy ``schedule`` flag are additionally
    queued as ``(score, fingerprint, {'name': hostname}, url)``; entries
    whose URL has no host name are logged and not queued.
    ``self.queue.schedule`` is always called, possibly with an empty list.
    """
    if not isinstance(batch, dict):
        raise TypeError(
            'batch should be dict with fingerprint as key, and float score as value'
        )

    to_schedule = []
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for fprint, (score, url, schedule) in batch.items():
        obj = prepare_hbase_object(score=score)
        rk = unhexlify(fprint)
        self.batch.put(rk, obj)
        if schedule:
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
            if not hostname:
                self.manager.logger.backend.error(
                    "Can't get hostname for URL %s, fingerprint %s" %
                    (url, fprint))
                continue
            to_schedule.append((score, fprint, {'name': hostname}, url))
    self.queue.schedule(to_schedule)
def new_batch(self, *args, **kwargs):
    """Fetch the next requests from the backend and push them to the producer.

    Only partitions whose lag (as reported by ``self._offset_fetcher``) is
    below ``self.max_next_requests`` are asked for new requests.  Each
    request is tagged with the job id, encoded and sent to
    ``self.outgoing_topic`` keyed by its UTF-8 encoded host name.

    Requests that fail to encode, or whose URL cannot be parsed, are logged
    and skipped.  ``count`` is incremented for every request on which
    encoding was attempted, whether or not it was eventually sent.

    :return: that count, or ``0`` when no partition qualifies.
    """
    lags = self._offset_fetcher.get()
    logger.info("Got lags %s" % str(lags))

    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    partitions = [partition for partition, lag in lags.items()
                  if lag < self.max_next_requests]

    logger.info("Getting new batches for partitions %s" % ",".join(map(str, partitions)))
    if not partitions:
        return 0

    count = 0
    for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions):
        try:
            request.meta['jid'] = self.job_id
            eo = self._encoder.encode_request(request)
        except Exception as e:
            logger.error("Encoding error, %s, fingerprint: %s, url: %s" %
                         (e, request.meta['fingerprint'], request.url))
            continue
        finally:
            # Runs on the `continue` path too, so failed encodes are counted.
            count += 1

        try:
            _, name, _, _, _, _ = parse_domain_from_url_fast(request.url)
        except Exception as e:
            logger.error("URL parsing error %s, fingerprint %s, url %s" %
                         (e, request.meta['fingerprint'], request.url))
            # Without a host name there is no key: previously this fell
            # through and used an unbound or stale `name`.
            continue

        encoded_name = name.encode('utf-8', 'ignore')
        self._producer.send_messages(self.outgoing_topic, encoded_name, eo)

    logger.info("Pushed new batch of %d items", count)
    self.stats['last_batch_size'] = count
    self.stats.setdefault('batches_after_start', 0)
    self.stats['batches_after_start'] += 1
    self.stats['last_batch_generated'] = asctime()
    return count
count = 0 for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions): try: request.meta['jid'] = self.job_id eo = self._encoder.encode_request(request) except Exception, e: logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, request.meta['fingerprint'], request.url)) continue finally: count += 1 try: netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast( request.url) except Exception, e: logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta['fingerprint'], request.url)) encoded_name = name.encode('utf-8', 'ignore') self._producer.send_messages(self.outgoing_topic, encoded_name, eo) logger.info("Pushed new batch of %d items", count) self.stats['last_batch_size'] = count self.stats.setdefault('batches_after_start', 0) self.stats['batches_after_start'] += 1 self.stats['last_batch_generated'] = asctime() return count def disable_new_batches(self): self.slot.disable_new_batches = True
def test_complete_url(self):
    """``parse_domain_from_url_fast`` splits ``complete_url`` into its six parts."""
    # NOTE(review): the '[email protected]' fragment looks like an e-mail
    # obfuscation artefact rather than a real netloc - confirm against
    # ``complete_url`` before relying on this expectation.
    expected = ('username:[email protected]:80', 'www.example.com', 'http', '', '', '')
    actual = parse_domain_from_url_fast(complete_url)
    self.assertEqual(actual, expected)
def test_simple_url(self):
    """``parse_domain_from_url_fast`` splits ``simple_url`` into its six parts."""
    expected = ('www.example.com', 'www.example.com', 'http', '', '', '')
    actual = parse_domain_from_url_fast(simple_url)
    self.assertEqual(actual, expected)
count = 0 for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions): try: request.meta['jid'] = self.job_id eo = self._encoder.encode_request(request) except Exception, e: logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, request.meta['fingerprint'], request.url)) continue finally: count +=1 try: netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url) except Exception, e: logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta['fingerprint'], request.url)) encoded_name = name.encode('utf-8', 'ignore') self._producer.send_messages(self.outgoing_topic, encoded_name, eo) logger.info("Pushed new batch of %d items", count) self.stats['last_batch_size'] = count self.stats.setdefault('batches_after_start', 0) self.stats['batches_after_start'] += 1 self.stats['last_batch_generated'] = asctime() return count def disable_new_batches(self): self.slot.disable_new_batches = True
def test_simple_url(self):
    """Check the six-part result of ``parse_domain_from_url_fast`` for ``simple_url``."""
    parsed = parse_domain_from_url_fast(simple_url)
    netloc_and_name = 'www.example.com'
    self.assertEqual(parsed, (netloc_and_name, netloc_and_name, 'http', '', '', ''))