Ejemplo n.º 1
0
    def _update_project(self, project):
        '''
        Merge one project info dict into self.projects and sync its task queue.

        The stored entry is updated with every field of `project`, gets a
        fresh 'md5sum' of the script, and gets a new 'active_tasks' deque
        when it has none (or an empty one).
        '''
        name = project['name']
        if name not in self.projects:
            self.projects[name] = {}
        entry = self.projects[name]
        entry.update(project)
        entry['md5sum'] = utils.md5string(project['script'])
        if not entry.get('active_tasks', None):
            entry['active_tasks'] = deque(maxlen=self.ACTIVE_TASKS)

        # A project that is not running keeps no task queue: zero its limits
        # and drop it.
        if project['status'] not in ('RUNNING', 'DEBUG'):
            if name in self.task_queue:
                stopped_queue = self.task_queue[name]
                stopped_queue.rate = 0
                stopped_queue.burst = 0
                del self.task_queue[name]
            return

        # Running project: load its task queue on first sight and refresh the
        # rate limits from the project info.
        if name not in self.task_queue:
            self._load_tasks(name)
        running_queue = self.task_queue[name]
        running_queue.rate = project['rate']
        running_queue.burst = project['burst']

        # update project runtime info from processor by sending a _on_get_info
        # request, result is in status_page.track.save
        info_request = {
            'taskid': '_on_get_info',
            'project': name,
            'url': 'data:,_on_get_info',
            'status': self.taskdb.SUCCESS,
            'fetch': {
                'save': ['min_tick', ],
            },
            'process': {
                'callback': '_on_get_info',
            },
        }
        self.on_select_task(info_request)
Ejemplo n.º 2
0
    def _crawl(self, url, **kwargs):
        '''
        Assemble a task dict for `url` from the keyword options, append it to
        self._follows and return it.

        A 'callback' option may be the name of an attribute of self or a
        method bound to self; anything else raises NotImplementedError.
        Options missing from kwargs are filled in from the callback's
        `_config` and then from self.crawl_config, when those exist.
        '''
        task = {}

        callback = kwargs.get('callback')
        if callback:
            names_own_attr = isinstance(callback, basestring) and hasattr(self, callback)
            if names_own_attr:
                func = getattr(self, callback)
            elif hasattr(callback, 'im_self') and callback.im_self is self:
                func = callback
                # store the bound method by name in the task
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for option, default in func._config.iteritems():
                    kwargs.setdefault(option, default)

        if hasattr(self, 'crawl_config'):
            for option, default in self.crawl_config.iteritems():
                kwargs.setdefault(option, default)

        url = quote_chinese(_build_url(url.strip(), kwargs.get('params')))

        if kwargs.get('files'):
            assert isinstance(kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, body = _encode_multipart_formdata(kwargs.get('data', {}),
                                                            kwargs.get('files', {}))
            kwargs.setdefault('headers', {})['Content-Type'] = content_type
            kwargs['data'] = body
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
            # a request that still has a body after encoding defaults to POST
            if kwargs.get('data'):
                kwargs.setdefault('method', 'POST')

        # Each task section collects the options from its key list that were
        # given with a value other than None; empty sections are left out.
        # NOTE(review): 'last_modifed' is reproduced exactly as found here;
        # confirm what the consumer of this key expects before renaming it.
        sections = (
            ('schedule', ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update')),
            ('fetch', ('method', 'headers', 'data', 'timeout', 'allow_redirects',
                       'cookies', 'proxy', 'etag', 'last_modifed', 'save',
                       'js_run_at', 'js_script', 'load_images', 'fetch_type')),
            ('process', ('callback', )),
        )
        for section, keys in sections:
            picked = {}
            for key in keys:
                if kwargs.get(key) is not None:
                    picked[key] = kwargs[key]
            if picked:
                task[section] = picked

        task['project'] = self.project_name
        task['url'] = url
        task['taskid'] = task.get('taskid') or md5string(url)

        self._follows.append(task)
        return task
Ejemplo n.º 3
0
def followeeUser(info):
    """
    Fetch the followee pages of one user and store the users and relations
    found on them.

    Page offsets advance in steps of 20 up to `following_count`, and the
    function sleeps TIME_DELAY after every page (fetched or not), which
    throttles how fast requests are sent.

    :param info: mapping providing the user's 'url_token' and 'following_count'
    :return: None
    """
    url_token, following_count = info['url_token'], info['following_count']
    for page in range(0, following_count, 20):
        url = FOLLOWEES_URL.format(url_token=url_token, offset=page)
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        except Exception:
            # Best effort: a page that cannot be fetched is skipped.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the crawl.
            pass
        else:
            content = json.loads(r.content.decode('utf-8'))
            page_datas = content['data']
            user_objs, relation_objs = [], []
            for data in page_datas:
                user_objs.append(UserInfo(user_id=data['id'], url_token=data['url_token']))
                relation_objs.append(Relation(
                    parent_url_token=data['url_token'],
                    children_url_token=url_token,
                    md5=md5string((data['url_token']+url_token).encode('utf-8'))
                ))
            # Users and relations are committed separately so that a failure
            # in one batch (rolled back) does not discard the other.
            for batch in (user_objs, relation_objs):
                try:
                    session.add_all(batch)
                    session.commit()
                except Exception:
                    session.rollback()
        time.sleep(TIME_DELAY)
Ejemplo n.º 4
0
 def test_put(n):
     """
     Benchmark: put `n` tasks onto the message queue and log the timing.

     The `task` dict, `queue`, `logger` and `md5string` come from outside
     this function; `task` is reused and mutated for every put.
     """
     logger.info("message queue put %d", n)
     start_time = time.time()
     for i in range(n):
         task['url'] = 'http://bench.pyspider.org/?l=%d' % i
         task['taskid'] = md5string(task['url'])
         queue.put(task, block=True, timeout=1)
     cost_time = time.time() - start_time
     # Guard both divisions: n may be 0, and a coarse clock can report an
     # elapsed time of exactly 0 for a very short run.
     per_second = n * 1.0 / cost_time if cost_time else 0.0
     per_item_ms = cost_time / n * 1000 if n else 0.0
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 cost_time, per_second, per_item_ms)
Ejemplo n.º 5
0
 def test_update(n, start=0):
     """
     Benchmark: update `n` tasks in the taskdb and log the timing.

     Task urls are numbered from `start` to `start + n - 1`.  The `task`
     dict, `track`, `taskdb`, `logger` and `md5string` come from outside this
     function; `task` is reused and mutated for every update.
     """
     logger.info("taskdb update %d", n)
     start_time = time.time()
     for i in range(n):
         task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
         task['taskid'] = md5string(task['url'])
         task['track'] = track
         taskdb.update(task['project'], task['taskid'], task)
     cost_time = time.time() - start_time
     # Guard both divisions: n may be 0, and a coarse clock can report an
     # elapsed time of exactly 0 for a very short run.
     per_second = n * 1.0 / cost_time if cost_time else 0.0
     per_item_ms = cost_time / n * 1000 if n else 0.0
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 cost_time, per_second, per_item_ms)
Ejemplo n.º 6
0
 def test_get(n, start=0, random=True, fields=request_task_fields):
     """
     Benchmark: fetch `n` tasks from the taskdb and log the timing.

     Task urls are numbered from `start` to `start + n - 1`; with `random`
     true they are visited in shuffled order.  `fields` is passed through to
     taskdb.get_task.  The `task` dict, `track`, `taskdb`, `logger` and
     `md5string` come from outside this function.
     """
     logger.info("taskdb get %d %s", n, "randomly" if random else "")
     range_n = list(range(n))
     if random:
         # local import: the `random` parameter shadows the module name here
         from random import shuffle
         shuffle(range_n)
     start_time = time.time()
     for i in range_n:
         task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
         task['taskid'] = md5string(task['url'])
         task['track'] = track
         taskdb.get_task(task['project'], task['taskid'], fields=fields)
     cost_time = time.time() - start_time
     # Guard both divisions: n may be 0, and a coarse clock can report an
     # elapsed time of exactly 0 for a very short run.
     per_second = n * 1.0 / cost_time if cost_time else 0.0
     per_item_ms = cost_time / n * 1000 if n else 0.0
     logger.info("cost %.2fs, %.2f/s %.2fms",
                 cost_time, per_second, per_item_ms)
Ejemplo n.º 7
0
 def _load_project(self, project):
     '''
     Build the module for a project info dict and store it in self.projects.

     Returns True on success.  On any exception the error is logged, a
     placeholder record carrying the exception and its traceback is stored
     under the project's name instead, and False is returned.
     '''
     try:
         project['md5sum'] = utils.md5string(project['script'])
         self.projects[project['name']] = self.build_module(project, self.env)
     except Exception as e:
         logger.exception("load project %s error", project.get('name', None))
         # nothing was built: every module-related slot stays None
         failure = dict.fromkeys(('loader', 'module', 'class', 'instance'))
         failure['exception'] = e
         failure['exception_log'] = traceback.format_exc()
         failure['info'] = project
         failure['load_time'] = time.time()
         self.projects[project['name']] = failure
         return False
     else:
         logger.debug('project: %s updated.', project.get('name', None))
         return True
Ejemplo n.º 8
0
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send `message` to `project` as an '_on_message' task via the scheduler RPC.

    `scheduler_rpc` may be an RPC object, an address string to connect to, or
    None.  For None the address is taken from the SCHEDULER_PORT_23333_TCP
    environment variable when SCHEDULER_NAME is set, and falls back to
    http://127.0.0.1:23333/ otherwise.
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None:
        if os.environ.get('SCHEDULER_NAME'):
            # strip the leading 'tcp://' to keep only the address part
            linked_addr = os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]
            scheduler_rpc = connect_rpc(ctx, None, 'http://%s/' % (linked_addr))
        if scheduler_rpc is None:
            scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    message_task = {
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    }
    return scheduler_rpc.send_task(message_task)
Ejemplo n.º 9
0
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:100],
                            'logs': ret.logstr()[:200],
                            'exception': ret.exception,
                            },
                        },
                    })
            self.status_queue.put(status_pack)

        for newtask in ret.follows:
            self.newtask_queue.put(newtask)

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                        }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
Ejemplo n.º 10
0
 def get_taskid(self, task):
     '''Return the taskid for `task`: md5string of task['url'] by default; override to customize.'''
     task_url = task['url']
     return md5string(task_url)
Ejemplo n.º 11
0
                            'time': process_time,
                            'follows': len(ret.follows),
                            'result': unicode(ret.result)[:100],
                            'logs': ret.logstr()[-200:],
                            'exception': ret.exception,
                            },
                        },
                    })
            self.status_queue.put(status_pack)

        for newtask in ret.follows:
            self.newtask_queue.put(newtask)

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                        }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info