コード例 #1
0
ファイル: db.py プロジェクト: thodison/frontera
 def get_hostname(request):
     """Return the UTF-8 encoded host name of ``request.url``.

     The URL is parsed with ``parse_domain_from_url_fast`` and the second
     element of the returned tuple is used as the host name.

     :param request: object exposing ``url`` and a ``meta`` mapping holding a
         ``'fingerprint'`` entry (only read when parsing fails).
     :return: the host name encoded as UTF-8 (characters that cannot be
         encoded are dropped), or ``None`` when the URL could not be parsed.
     """
     try:
         _netloc, name, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
     except Exception as e:  # the ``as`` form is valid on Python 2.6+ and on Python 3
         logger.error("URL parsing error %s, fingerprint %s, url %s", e, request.meta['fingerprint'],
                      request.url)
         return None
     # Only reached on success; encoding errors are deliberately not swallowed here.
     return name.encode('utf-8', 'ignore')
コード例 #2
0
 def schedule(self, batch):
     """Persist every schedulable entry of ``batch`` as a queue row.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
     entries whose ``schedule`` flag is falsy are ignored.  All created rows
     are saved in bulk and the session is committed once at the end.
     """
     rows = []
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(hostname, self.partitions)
             host_crc32 = get_crc32(hostname)
         else:
             # Without a host name the request falls back to the first partition.
             self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fingerprint))
             partition_id = self.partitions[0]
             host_crc32 = 0
         row = self.queue_model(
             fingerprint=to_native_str(fingerprint),
             score=score,
             url=request.url,
             meta=request.meta,
             headers=request.headers,
             cookies=request.cookies,
             method=to_native_str(request.method),
             partition_id=partition_id,
             host_crc32=host_crc32,
             created_at=time() * 1E+6,
         )
         rows.append(row)
         request.meta[b'state'] = States.QUEUED
     self.session.bulk_save_objects(rows)
     self.session.commit()
コード例 #3
0
ファイル: components.py プロジェクト: lljrsr/frontera
 def schedule(self, batch):
     """Persist every schedulable entry of ``batch`` as a queue row.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
     entries whose ``schedule`` flag is falsy are ignored.  All created rows
     are saved in bulk and the session is committed once at the end.
     """
     rows = []
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(hostname, self.partitions)
             host_crc32 = get_crc32(hostname)
         else:
             # Without a host name the request falls back to the first partition.
             self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fingerprint))
             partition_id = self.partitions[0]
             host_crc32 = 0
         row = self.queue_model(
             fingerprint=fingerprint,
             score=score,
             url=request.url,
             meta=request.meta,
             headers=request.headers,
             cookies=request.cookies,
             method=request.method,
             partition_id=partition_id,
             host_crc32=host_crc32,
             created_at=time() * 1E+6,
         )
         rows.append(row)
         request.meta['state'] = States.QUEUED
     self.session.bulk_save_objects(rows)
     self.session.commit()
コード例 #4
0
 def get_hostname(request):
     """Return the UTF-8 encoded host name of ``request.url``, or ``None``.

     ``None`` is returned (after logging the error together with the request
     fingerprint and URL) when ``parse_domain_from_url_fast`` raises.
     """
     try:
         parsed = parse_domain_from_url_fast(request.url)
         # Unpacking stays inside the ``try`` so a malformed result is logged too.
         _netloc, hostname, _scheme, _sld, _tld, _subdomain = parsed
     except Exception as exc:
         message = "URL parsing error %s, fingerprint %s, url %s" % (
             exc, request.meta[b'fingerprint'], request.url)
         logger.error(message)
         return None
     return hostname.encode('utf-8', 'ignore')
コード例 #5
0
ファイル: db.py プロジェクト: Preetwinder/frontera
 def get_hostname(request):
     """Return the UTF-8 encoded host name of ``request.url``, or ``None``.

     ``None`` is returned (after logging the error together with the request
     fingerprint and URL) when ``parse_domain_from_url_fast`` raises.
     """
     try:
         parsed = parse_domain_from_url_fast(request.url)
         # Unpacking stays inside the ``try`` so a malformed result is logged too.
         _netloc, hostname, _scheme, _sld, _tld, _subdomain = parsed
     except Exception as exc:
         message = "URL parsing error %s, fingerprint %s, url %s" % (
             exc, request.meta[b'fingerprint'], request.url)
         logger.error(message)
         return None
     return hostname.encode('utf-8', 'ignore')
コード例 #6
0
ファイル: __init__.py プロジェクト: Preetwinder/frontera
 def schedule(self, batch):
     """Push every schedulable request of ``batch`` onto its partition heap.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
     entries whose ``schedule`` flag is falsy are skipped.  The score is
     stored in the request meta under ``b'_scr'`` before the push.
     """
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         request.meta[b'_scr'] = score
         _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(hostname, self.partitions)
         else:
             # Requests without a host name all go to the first partition.
             self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             partition_id = self.partitions[0]
         self.heap[partition_id].push(request)
コード例 #7
0
ファイル: hbase.py プロジェクト: RaoUmer/frontera
 def schedule(self, batch):
     """Collect the schedulable entries of ``batch`` and hand them to ``_schedule``.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples.  For
     each entry with a truthy ``schedule`` flag a
     ``(score, fingerprint, domain, url)`` tuple is queued; a missing
     ``'domain'`` meta entry is filled in from the request URL first.
     """
     pending = []
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         if 'domain' not in request.meta:
             _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
             if not hostname:
                 self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             # The (possibly empty) host name is stored even when the lookup failed.
             request.meta['domain'] = {'name': hostname}
         entry = (score, fingerprint, request.meta['domain'], request.url)
         pending.append(entry)
     self._schedule(pending)
コード例 #8
0
ファイル: hbase.py プロジェクト: rampage644/frontera
 def schedule(self, batch):
     """Collect the schedulable entries of ``batch`` and hand them to ``_schedule``.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples.  For
     each entry with a truthy ``schedule`` flag a
     ``(score, fingerprint, domain, url)`` tuple is queued; a missing
     ``'domain'`` meta entry is filled in from the request URL first.
     """
     pending = []
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         if 'domain' not in request.meta:
             _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
             if not hostname:
                 self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             # The (possibly empty) host name is stored even when the lookup failed.
             request.meta['domain'] = {'name': hostname}
         entry = (score, fingerprint, request.meta['domain'], request.url)
         pending.append(entry)
     self._schedule(pending)
コード例 #9
0
ファイル: __init__.py プロジェクト: widy28/frontera
 def schedule(self, batch):
     """Push every schedulable request of ``batch`` onto its partition heap.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
     entries whose ``schedule`` flag is falsy are skipped.  The score is
     stored in the request meta under ``b'_scr'`` before the push.
     """
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         request.meta[b'_scr'] = score
         _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
         if hostname:
             partition_id = self.partitioner.partition(hostname, self.partitions)
         else:
             # Requests without a host name all go to the first partition.
             self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             partition_id = self.partitions[0]
         self.heap[partition_id].push(request)
コード例 #10
0
 def _is_domain_blacklisted(self, request):
     if not self.domains_blacklist:
         return
     if 'domain' in request.meta:
         hostname = request.meta['domain'].get('name')
     else:
         _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
     if hostname:
         hostname = hostname.lower()
         if hostname in self.domains_blacklist:
             self.logger.debug("Dropping black-listed hostname, URL %s", request.url)
             return True
     return False
コード例 #11
0
 def _is_domain_blacklisted(self, request):
     if not self.domains_blacklist:
         return
     if 'domain' in request.meta:
         hostname = request.meta['domain'].get('name')
     else:
         _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
     if hostname:
         hostname = hostname.lower()
         if hostname in self.domains_blacklist:
             self.logger.debug("Dropping black-listed hostname, URL %s",
                               request.url)
             return True
     return False
コード例 #12
0
ファイル: hbase.py プロジェクト: Preetwinder/frontera
 def schedule(self, batch):
     """Group the schedulable requests of ``batch`` by timestamp and schedule each group.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples.  A
     request is grouped under its ``b'crawl_at'`` meta value when present,
     otherwise under the current time truncated to whole seconds.  Each group
     of ``(request, score)`` pairs is passed to ``_schedule``.
     """
     now = int(time())
     by_timestamp = {}
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         # TODO: this has to be done always by DomainMiddleware, so the proposal is to
         # require DomainMiddleware by HBaseBackend and remove this fallback.
         if b'domain' not in request.meta:
             _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
             if not hostname:
                 self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             request.meta[b'domain'] = {'name': hostname}
         if b'crawl_at' in request.meta:
             timestamp = request.meta[b'crawl_at']
         else:
             timestamp = now
         by_timestamp.setdefault(timestamp, []).append((request, score))
     for timestamp, group in six.iteritems(by_timestamp):
         self._schedule(group, timestamp)
コード例 #13
0
ファイル: hbase.py プロジェクト: vu3jej/distributed-frontera
 def update_score(self, batch):
     """Store new scores and schedule the entries flagged for crawling.

     For every fingerprint an object built by ``prepare_hbase_object`` is put
     into ``self.batch`` under the unhexlified fingerprint.  Entries with a
     truthy ``schedule`` flag and a resolvable host name are then passed to
     ``self.queue.schedule`` as ``(score, fingerprint, domain, url)`` tuples.

     :param batch: dict mapping a hex fingerprint to a
         ``(score, url, schedule)`` tuple.
     :raises TypeError: if ``batch`` is not a dict.
     """
     if not isinstance(batch, dict):
         raise TypeError("batch should be dict with fingerprint as key, and float score as value")
     to_schedule = []
     # ``items()`` rather than the Python 2-only ``iteritems()`` so the method
     # also runs on Python 3.
     for fprint, (score, url, schedule) in batch.items():
         obj = prepare_hbase_object(score=score)
         rk = unhexlify(fprint)
         self.batch.put(rk, obj)
         if not schedule:
             continue
         _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
         if not hostname:
             # The score is still stored above; only the scheduling is skipped.
             self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s", url, fprint)
             continue
         to_schedule.append((score, fprint, {"name": hostname}, url))
     self.queue.schedule(to_schedule)
コード例 #14
0
ファイル: hbase.py プロジェクト: CN-hanyi/frontera
 def schedule(self, batch):
     """Group the schedulable requests of ``batch`` by timestamp and schedule each group.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples.  A
     request is grouped under its ``b'crawl_at'`` meta value when present,
     otherwise under the current time truncated to whole seconds.  Each group
     of ``(request, score)`` pairs is passed to ``_schedule``.
     """
     now = int(time())
     by_timestamp = {}
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         # TODO: this has to be done always by DomainMiddleware, so the proposal is to
         # require DomainMiddleware by HBaseBackend and remove this fallback.
         if b'domain' not in request.meta:
             _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
             if not hostname:
                 self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             request.meta[b'domain'] = {'name': hostname}
         if b'crawl_at' in request.meta:
             timestamp = request.meta[b'crawl_at']
         else:
             timestamp = now
         by_timestamp.setdefault(timestamp, []).append((request, score))
     for timestamp, group in six.iteritems(by_timestamp):
         self._schedule(group, timestamp)
コード例 #15
0
ファイル: domain.py プロジェクト: rahulsharma1991/frontera
def parse_domain_info(url, test_mode=False):
    """Return the domain components of ``url`` as a dict.

    :param url: URL (or, in test mode, an arbitrary label) to analyse.
    :param test_mode: when true the URL is not parsed; the leading upper-case
        ASCII letter of ``url`` (if followed by at least one word character)
        becomes both ``netloc`` and ``name``, ``'?'`` is used when there is no
        such letter, and every other field is ``'-'``.
    :return: dict with the keys ``netloc``, ``name``, ``scheme``, ``sld``,
        ``tld`` and ``subdomain``.
    """
    if test_mode:
        # Raw string: ``\w`` in a normal literal is an invalid escape sequence.
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = match.groups()[0] if match else '?'
        scheme = sld = tld = subdomain = '-'
    else:
        netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url)
    return {
        'netloc': netloc,
        'name': name,
        'scheme': scheme,
        'sld': sld,
        'tld': tld,
        'subdomain': subdomain,
    }
コード例 #16
0
ファイル: revisiting.py プロジェクト: wpxgit/frontera
    def schedule(self, batch):
        """Create and save a queue entry for every entry of ``batch`` that has a schedule time.

        ``batch`` yields ``(fingerprint, score, request, schedule_at)`` tuples;
        entries with a falsy ``schedule_at`` are skipped.  Each saved request
        is marked with ``States.QUEUED`` in its meta.
        """
        for fingerprint, score, request, schedule_at in batch:
            if not schedule_at:
                continue
            _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
            if hostname:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            else:
                # Without a host name the request falls back to the first partition.
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fingerprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            created_at = time()*1E+6
            entry = self._create_queue(request, fingerprint, score, partition_id, host_crc32, created_at)
            entry.save()
            request.meta['state'] = States.QUEUED
コード例 #17
0
    def get_key(request):
        """Return the key derived from the domain of ``request``.

        When ``request.meta`` has a ``b'domain'`` entry, a dict yields its
        ``b'name'`` item and an int is returned as is; any other type raises
        ``TypeError``.  Without such an entry the host name parsed from
        ``request.url`` is returned UTF-8 encoded, or ``None`` if parsing fails.
        """
        domain = request.meta.get(b'domain')
        if domain is None:
            try:
                _netloc, name, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
            except Exception:
                return None
            return name.encode('utf-8', 'ignore')

        domain_type = type(domain)
        if domain_type == dict:
            return domain[b'name']
        if domain_type == int:
            return domain
        raise TypeError("domain of unknown type.")
コード例 #18
0
ファイル: domain.py プロジェクト: RajatGoyal/frontera
def parse_domain_info(url, test_mode=False):
    """Return the domain components of ``url`` as a dict.

    :param url: URL (or, in test mode, an arbitrary label) to analyse.
    :param test_mode: when true the URL is not parsed; the leading upper-case
        ASCII letter of ``url`` (if followed by at least one word character)
        becomes both ``netloc`` and ``name``, ``'?'`` is used when there is no
        such letter, and every other field is ``'-'``.
    :return: dict with the keys ``netloc``, ``name``, ``scheme``, ``sld``,
        ``tld`` and ``subdomain``.
    """
    if test_mode:
        # Raw string: ``\w`` in a normal literal is an invalid escape sequence.
        match = re.match(r'([A-Z])\w+', url)
        netloc = name = match.groups()[0] if match else '?'
        scheme = sld = tld = subdomain = '-'
    else:
        netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(
            url)
    return {
        'netloc': netloc,
        'name': name,
        'scheme': scheme,
        'sld': sld,
        'tld': tld,
        'subdomain': subdomain,
    }
コード例 #19
0
ファイル: components.py プロジェクト: wpxgit/frontera
    def schedule(self, batch):
        """Insert every schedulable entry of ``batch`` into the ``queue`` table.

        ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples;
        entries whose ``schedule`` flag is falsy are skipped.  Missing meta
        fields are filled with defaults, the rows are written with
        ``execute_concurrent_with_args`` and the number of written rows is
        reported through ``self.counter_cls.cass_count``.
        """
        insert_stmt = self.session.prepare("INSERT INTO queue (crawl, fingerprint, score, partition_id, host_crc32, url, "
                                           "created_at, meta, depth, headers, method, cookies) "
                                           "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
        rows = []
        for fingerprint, score, request, should_schedule in batch:
            if not should_schedule:
                continue
            _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
            if hostname:
                partition_id = self.partitioner.partition(hostname, self.partitions)
                host_crc32 = get_crc32(hostname)
            else:
                # Without a host name the request falls back to the first partition.
                self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fingerprint))
                partition_id = self.partitions[0]
                host_crc32 = 0
            created_at = time()*1E+6

            # The tuple is rebuilt per request so the mutable defaults are never shared.
            meta_defaults = (
                ("domain", {}),
                ("origin_is_frontier", ''),
                ("scrapy_callback", None),
                ("scrapy_errback", None),
                ("scrapy_meta", {}),
                ("score", 0),
                ("jid", 0),
            )
            for key, default in meta_defaults:
                if key not in request.meta:
                    request.meta[key] = default

            meta = Meta(domain=request.meta['domain'],
                        fingerprint=fingerprint,
                        origin_is_frontier=request.meta['origin_is_frontier'],
                        scrapy_callback=request.meta['scrapy_callback'],
                        scrapy_errback=request.meta['scrapy_errback'],
                        scrapy_meta=request.meta['scrapy_meta'])

            # The ninth value (depth column) is always 0 here.
            rows.append((self.crawl_id, fingerprint, score, partition_id, host_crc32, request.url,
                         created_at, meta, 0, request.headers, request.method, request.cookies))

            request.meta['state'] = States.QUEUED

        execute_concurrent_with_args(self.session, insert_stmt, rows, concurrency=400)
        self.counter_cls.cass_count({"queued_urls": len(rows)})
コード例 #20
0
 def schedule(self, batch):
     """Group the schedulable requests of ``batch`` by timestamp and schedule each group.

     ``batch`` yields ``(fingerprint, score, request, schedule)`` tuples.  A
     request is grouped under its ``FIELD_CRAWL_AT`` meta value when present,
     otherwise under the current time truncated to whole seconds.  Each group
     of ``(request, score)`` pairs is passed to ``_schedule``.
     """
     now = int(time())
     by_timestamp = {}
     for fingerprint, score, request, should_schedule in batch:
         if not should_schedule:
             continue
         # TODO: This is done by DomainMiddleware - RedisBackend should require DomainMiddleware
         if FIELD_DOMAIN not in request.meta:
             _netloc, hostname, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(request.url)
             if not hostname:
                 self._logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fingerprint)
             request.meta[FIELD_DOMAIN] = {'name': hostname}
         if FIELD_CRAWL_AT in request.meta:
             timestamp = request.meta[FIELD_CRAWL_AT]
         else:
             timestamp = now
         by_timestamp.setdefault(timestamp, []).append((request, score))
     for timestamp, group in by_timestamp.items():
         self._schedule(group, timestamp)
コード例 #21
0
    def update_score(self, batch):
        """Store new scores and schedule the entries flagged for crawling.

        For every fingerprint an object built by ``prepare_hbase_object`` is
        put into ``self.batch`` under the unhexlified fingerprint.  Entries
        with a truthy ``schedule`` flag and a resolvable host name are then
        passed to ``self.queue.schedule`` as
        ``(score, fingerprint, domain, url)`` tuples.

        :param batch: dict mapping a hex fingerprint to a
            ``(score, url, schedule)`` tuple.
        :raises TypeError: if ``batch`` is not a dict.
        """
        if not isinstance(batch, dict):
            raise TypeError(
                'batch should be dict with fingerprint as key, and float score as value'
            )

        to_schedule = []
        # ``items()`` rather than the Python 2-only ``iteritems()`` so the
        # method also runs on Python 3.
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if not schedule:
                continue
            _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
            if not hostname:
                # The score is still stored above; only the scheduling is skipped.
                self.manager.logger.backend.error(
                    "Can't get hostname for URL %s, fingerprint %s", url, fprint)
                continue
            to_schedule.append((score, fprint, {'name': hostname}, url))
        self.queue.schedule(to_schedule)
コード例 #22
0
    def new_batch(self, *args, **kwargs):
        """Fetch the next requests from the backend and send them to the outgoing topic.

        Only partitions whose lag (as reported by ``self._offset_fetcher``) is
        below ``self.max_next_requests`` are asked for requests.  Each request
        is tagged with ``self.job_id``, encoded and sent keyed by its UTF-8
        encoded host name.  Requests that cannot be encoded, or whose URL
        cannot be parsed, are logged and skipped.  ``args`` and ``kwargs`` are
        accepted but not used.

        :return: number of requests taken from the backend, including the
            skipped ones; ``0`` when no partition is eligible.
        """
        lags = self._offset_fetcher.get()
        logger.info("Got lags %s", lags)

        # ``items()`` rather than the Python 2-only ``iteritems()``.
        partitions = [partition for partition, lag in lags.items()
                      if lag < self.max_next_requests]

        logger.info("Getting new batches for partitions %s",
                    ",".join(map(str, partitions)))
        if not partitions:
            return 0

        count = 0
        for request in self._backend.get_next_requests(self.max_next_requests,
                                                       partitions=partitions):
            # Every request taken from the backend is counted, as before.
            count += 1
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception as e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s",
                             e, request.meta['fingerprint'], request.url)
                continue

            try:
                _netloc, name, _scheme, _sld, _tld, _subdomain = parse_domain_from_url_fast(
                    request.url)
            except Exception as e:
                logger.error("URL parsing error %s, fingerprint %s, url %s",
                             e, request.meta['fingerprint'], request.url)
                # Without this ``continue`` ``name`` would be unbound (first
                # iteration) or left over from the previous request, so the
                # message would crash or be keyed by the wrong host.
                continue
            encoded_name = name.encode('utf-8', 'ignore')
            self._producer.send_messages(self.outgoing_topic, encoded_name, eo)
        logger.info("Pushed new batch of %d items", count)
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count
コード例 #23
0
        count = 0
        for request in self._backend.get_next_requests(self.max_next_requests,
                                                       partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception, e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" %
                             (e, request.meta['fingerprint'], request.url))
                continue
            finally:
                count += 1

            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(
                    request.url)
            except Exception, e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" %
                             (e, request.meta['fingerprint'], request.url))
            encoded_name = name.encode('utf-8', 'ignore')
            self._producer.send_messages(self.outgoing_topic, encoded_name, eo)
        logger.info("Pushed new batch of %d items", count)
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count

    def disable_new_batches(self):
        """Set the ``disable_new_batches`` flag on ``self.slot`` to ``True``."""
        slot = self.slot
        slot.disable_new_batches = True
コード例 #24
0
 def test_complete_url(self):
     # Checks the tuple returned by parse_domain_from_url_fast for a URL that
     # carries credentials and a port.
     # NOTE(review): the expected netloc previously read
     # 'username:[email protected]:80' - the credentials/host part had been
     # replaced by an e-mail obfuscation placeholder.  It is restored here on
     # the assumption that complete_url contains
     # 'username:password@www.example.com:80'; confirm against the definition
     # of complete_url.
     self.assertEqual(parse_domain_from_url_fast(complete_url),
                      ('username:password@www.example.com:80', 'www.example.com', 'http', '', '', ''))
コード例 #25
0
 def test_simple_url(self):
     # For simple_url the netloc and the name are both the bare host, and the
     # sld / tld / subdomain fields are empty.
     expected = ('www.example.com', 'www.example.com', 'http', '', '', '')
     actual = parse_domain_from_url_fast(simple_url)
     self.assertEqual(actual, expected)
コード例 #26
0
 def test_complete_url(self):
     # Checks the tuple returned by parse_domain_from_url_fast for a URL that
     # carries credentials and a port.
     # NOTE(review): the expected netloc previously read
     # 'username:[email protected]:80' - the credentials/host part had been
     # replaced by an e-mail obfuscation placeholder.  It is restored here on
     # the assumption that complete_url contains
     # 'username:password@www.example.com:80'; confirm against the definition
     # of complete_url.
     self.assertEqual(parse_domain_from_url_fast(complete_url),
                      ('username:password@www.example.com:80',
                       'www.example.com', 'http', '', '', ''))
コード例 #27
0
ファイル: main.py プロジェクト: vu3jej/distributed-frontera
        count = 0
        for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions):
            try:
                request.meta['jid'] = self.job_id
                eo = self._encoder.encode_request(request)
            except Exception, e:
                logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e,
                                                                               request.meta['fingerprint'],
                                                                               request.url))
                continue
            finally:
                count +=1

            try:
                netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url)
            except Exception, e:
                logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, 
                                                                                request.meta['fingerprint'], 
                                                                                request.url))
            encoded_name = name.encode('utf-8', 'ignore')
            self._producer.send_messages(self.outgoing_topic, encoded_name, eo)
        logger.info("Pushed new batch of %d items", count)
        self.stats['last_batch_size'] = count
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count

    def disable_new_batches(self):
        """Set the ``disable_new_batches`` flag on ``self.slot`` to ``True``."""
        slot = self.slot
        slot.disable_new_batches = True
コード例 #28
0
 def test_simple_url(self):
     # For simple_url the netloc and the name are both the bare host, and the
     # sld / tld / subdomain fields are empty.
     expected = ('www.example.com', 'www.example.com', 'http', '', '', '')
     actual = parse_domain_from_url_fast(simple_url)
     self.assertEqual(actual, expected)