Ejemplo n.º 1
0
    def __init__(self, path='/', base_url=None, query_string=None,
                 method='GET', input_stream=None, content_type=None,
                 content_length=None, errors_stream=None, multithread=False,
                 multiprocess=False, run_once=False, headers=None, data=None,
                 environ_base=None, environ_overrides=None, charset='utf-8'):
        """Collect the pieces of a synthetic WSGI request environment.

        A ``?query`` suffix embedded in *path* is honored only when no
        explicit ``query_string`` argument was supplied.  ``data`` and
        ``input_stream`` are mutually exclusive ways to provide a body.
        """
        # Split an inline query string off the path unless one was given.
        if query_string is None and '?' in path:
            path, query_string = path.split('?', 1)
        self.charset = charset
        # Unicode paths get converted through iri_to_uri first.
        self.path = iri_to_uri(path, charset) if isinstance(path, unicode) else path
        if base_url is not None:
            converter = iri_to_uri if isinstance(base_url, unicode) else url_fix
            base_url = converter(base_url, charset)
        self.base_url = base_url
        if isinstance(query_string, basestring):
            # Raw strings are stored verbatim under query_string.
            self.query_string = query_string
        else:
            # Anything else is coerced to a MultiDict and kept as args.
            if query_string is None:
                query_string = MultiDict()
            elif not isinstance(query_string, MultiDict):
                query_string = MultiDict(query_string)
            self.args = query_string
        self.method = method
        if not isinstance(headers, Headers):
            headers = Headers() if headers is None else Headers(headers)
        self.headers = headers
        self.content_type = content_type
        # Errors default to stderr, mirroring what real WSGI servers pass.
        self.errors_stream = sys.stderr if errors_stream is None else errors_stream
        self.multithread = multithread
        self.multiprocess = multiprocess
        self.run_once = run_once
        self.environ_base = environ_base
        self.environ_overrides = environ_overrides
        self.input_stream = input_stream
        self.content_length = content_length
        self.closed = False

        if data:
            if input_stream is not None:
                raise TypeError("can't provide input stream and data")
            if isinstance(data, basestring):
                # A plain string becomes the request body as-is.
                self.input_stream = StringIO(data)
                if self.content_length is None:
                    self.content_length = len(data)
            else:
                # Structured data: file-ish values become uploads, the
                # rest are appended to the form multidict.
                for key, value in _iter_data(data):
                    if isinstance(value, (tuple, dict)) or hasattr(value, 'read'):
                        self._add_file_from_data(key, value)
                    else:
                        self.form.setlistdefault(key).append(value)
Ejemplo n.º 2
0
    def add(self, site, url_list, qps=SITE_DEFAULT_QPS, batch_size=JOB_BATCH_SIZE):
        """Enqueue *url_list* for *site*, split into batches of *batch_size*.

        Creates the host record the first time *site* is seen, normalizes
        every URL, stores each non-empty batch under its own redis key and
        bumps the host/job counters to match what was actually stored.
        """
        host_table_key = "host:%s" % site

        if not r.exists(host_table_key):
            ## create a new host entry
            host_record = {
                    'name':site,
                    'total_qps':qps,
                    'qps':qps,
                    'total_urls':0,
                    'total_batches':0,
                    'pending_batches':0,
                    }

            ## 1.insert host into host table.
            ##
            ## 2.add the hostname into host list (sorted set)
            ##   * score=now, optimized to processing existing host first
            ##     r.zadd(host_list_key, now, site)
            ##   * score=SITE_DEADZONE_OFFSET, optimized to processing new host first
            ##     since the timestamp SITE_DEADZONE_OFFSET is the past
            r.pipeline()                                                    \
                .hmset(host_table_key, host_record)                         \
                .zadd(host_list_key, host_table_key, SITE_DEADZONE_OFFSET)  \
                .execute()

        host_pending_key = "host:%s:pending:%s" % (site, self.priority)

        job_batch_key = "jobs:jid:%s:batches" % self.id
        job_url_key = "jobs:jid:%s:urls" % self.id

        count = len(url_list)
        for i in range(0, count, batch_size):
            ## normalize first so the counters only reflect URLs that were
            ## actually stored; previously, URLs dropped as invalid were
            ## still counted into total_urls and the job URL counter.
            batch_list = []
            for url in url_list[i:i+batch_size]:
                try:
                    batch_list.append(url_fix(urlnorm.norm(url.strip())))
                except (urlnorm.InvalidUrl, UnicodeDecodeError):
                    continue

            ## don't queue (or burn a batch id on) an entirely-invalid batch.
            if not batch_list:
                continue

            batch_count = len(batch_list)

            bid = r.incr("host:%s:nextBatchId" % site)
            batch_key = "host:%s:batch:%s" % (site, bid)

            ## add the batch record to DB.
            for url in batch_list:
                r.rpush(batch_key, url)

            ##
            ## 1. update site and job counters.
            ##
            ## 2. update host pending list.
            ##
            ## 3. job batch list.
            ##
            r.pipeline()                                                \
                .hincrby(host_table_key, "total_urls", batch_count)     \
                .hincrby(host_table_key, "total_batches", 1)            \
                .hincrby(host_table_key, "pending_batches", 1)          \
                .incr(job_url_key, batch_count)                         \
                .rpush(host_pending_key, batch_key)                     \
                .rpush(job_batch_key, batch_key)                        \
                .execute()

        ## restore the host for scheduling if it's been in deadzone.
        resurrect_host(r, host_table_key)
Ejemplo n.º 3
0
    def __init__(self,
                 path='/',
                 base_url=None,
                 query_string=None,
                 method='GET',
                 input_stream=None,
                 content_type=None,
                 content_length=None,
                 errors_stream=None,
                 multithread=False,
                 multiprocess=False,
                 run_once=False,
                 headers=None,
                 data=None,
                 environ_base=None,
                 environ_overrides=None,
                 charset='utf-8'):
        """Assemble the ingredients of a fake WSGI request environment.

        When *path* carries an inline ``?...`` part and *query_string* is
        not given, the path is split and the suffix used as query string.
        *data* may not be combined with *input_stream*.
        """
        if query_string is None and '?' in path:
            path, query_string = path.split('?', 1)
        self.charset = charset
        if isinstance(path, unicode):
            path = iri_to_uri(path, charset)
        self.path = path
        # Normalize the base URL depending on whether it is text or bytes.
        if base_url is None:
            self.base_url = None
        elif isinstance(base_url, unicode):
            self.base_url = iri_to_uri(base_url, charset)
        else:
            self.base_url = url_fix(base_url, charset)
        # A string query is kept as-is; everything else ends up in args.
        if isinstance(query_string, basestring):
            self.query_string = query_string
        elif query_string is None:
            self.args = MultiDict()
        elif isinstance(query_string, MultiDict):
            self.args = query_string
        else:
            self.args = MultiDict(query_string)
        self.method = method
        if isinstance(headers, Headers):
            self.headers = headers
        elif headers is None:
            self.headers = Headers()
        else:
            self.headers = Headers(headers)
        self.content_type = content_type
        if errors_stream is None:
            errors_stream = sys.stderr
        self.errors_stream = errors_stream
        self.multithread = multithread
        self.multiprocess = multiprocess
        self.run_once = run_once
        self.environ_base = environ_base
        self.environ_overrides = environ_overrides
        self.input_stream = input_stream
        self.content_length = content_length
        self.closed = False

        if data:
            if input_stream is not None:
                raise TypeError("can't provide input stream and data")
            if isinstance(data, basestring):
                # String bodies go straight into the input stream.
                self.input_stream = StringIO(data)
                if self.content_length is None:
                    self.content_length = len(data)
                return
            # Otherwise route each field into files or the form multidict.
            for field, payload in _iter_data(data):
                if hasattr(payload, 'read') or isinstance(payload, (tuple, dict)):
                    self._add_file_from_data(field, payload)
                else:
                    self.form.setlistdefault(field).append(payload)
Ejemplo n.º 4
0
Archivo: jobs.py Proyecto: mfan/collie
    def add(self, site, url_list, qps=SITE_DEFAULT_QPS, batch_size=JOB_BATCH_SIZE):
        """Enqueue *url_list* for *site*, split into batches of *batch_size*.

        Creates the host record the first time *site* is seen, normalizes
        every URL, stores each non-empty batch under its own redis key and
        bumps the host/job counters to match what was actually stored.
        """
        host_table_key = "host:%s" % site

        if not r.exists(host_table_key):
            ## create a new host entry
            host_record = {
                "name": site,
                "total_qps": qps,
                "qps": qps,
                "total_urls": 0,
                "total_batches": 0,
                "pending_batches": 0,
            }

            ## 1.insert host into host table.
            ##
            ## 2.add the hostname into host list (sorted set)
            ##   * score=now, optimized to processing existing host first
            ##     r.zadd(host_list_key, now, site)
            ##   * score=SITE_DEADZONE_OFFSET, optimized to processing new host first
            ##     since the timestamp SITE_DEADZONE_OFFSET is the past
            r.pipeline().hmset(host_table_key, host_record).zadd(
                host_list_key, host_table_key, SITE_DEADZONE_OFFSET
            ).execute()

        host_pending_key = "host:%s:pending:%s" % (site, self.priority)

        job_batch_key = "jobs:jid:%s:batches" % self.id
        job_url_key = "jobs:jid:%s:urls" % self.id

        count = len(url_list)
        for i in range(0, count, batch_size):
            ## normalize first so the counters only reflect URLs that were
            ## actually stored; previously, URLs dropped as invalid were
            ## still counted into total_urls and the job URL counter.
            batch_list = []
            for url in url_list[i : i + batch_size]:
                try:
                    batch_list.append(url_fix(urlnorm.norm(url.strip())))
                except (urlnorm.InvalidUrl, UnicodeDecodeError):
                    continue

            ## don't queue (or burn a batch id on) an entirely-invalid batch.
            if not batch_list:
                continue

            batch_count = len(batch_list)

            bid = r.incr("host:%s:nextBatchId" % site)
            batch_key = "host:%s:batch:%s" % (site, bid)

            ## add the batch record to DB.
            for url in batch_list:
                r.rpush(batch_key, url)

            ##
            ## 1. update site and job counters.
            ##
            ## 2. update host pending list.
            ##
            ## 3. job batch list.
            ##
            (
                r.pipeline()
                .hincrby(host_table_key, "total_urls", batch_count)
                .hincrby(host_table_key, "total_batches", 1)
                .hincrby(host_table_key, "pending_batches", 1)
                .incr(job_url_key, batch_count)
                .rpush(host_pending_key, batch_key)
                .rpush(job_batch_key, batch_key)
                .execute()
            )

        ## restore the host for scheduling if it's been in deadzone.
        resurrect_host(r, host_table_key)