コード例 #1
0
    def process_item(self, item, spider):
        """Normalise a scraped item, buffer it and push the buffer upstream.

        The item is passed through ``dataUtil.strip_item`` and
        ``dataUtil.keys_for_short`` and a ``dict`` copy of the result is
        appended to ``self.store``.  When at least 60 seconds have passed
        since ``self.interval``, the value returned by ``pubUtil.heartbeat``
        is printed and ``self.interval`` is reset.  The buffer is then sent
        with ``pubUtil.addData`` to ``settings.PUSH_DATA_URL_TEST``; on a
        truthy result the buffer is emptied and ``spider.task`` is reported
        through ``pubUtil.invalidData``.

        Args:
            item: The scraped item handed over by the framework.
            spider: The spider that produced the item; ``host_name``,
                ``name``, ``num``, ``version``, ``task`` and
                ``crawler.stats`` are read from it.

        Returns:
            The normalised item, so later pipeline components keep
            receiving an item instead of ``None``.
        """
        item = dataUtil.strip_item(item)
        item = dataUtil.keys_for_short(item)

        self.store.append(dict(item))

        # Heartbeat: at most once per 60 seconds, carrying the 'permins' stat.
        run_time = time.time()
        if run_time - self.interval >= 60:
            self.interval = run_time
            permins = spider.crawler.stats.get_value('permins')
            print(
                pubUtil.heartbeat(spider.host_name, spider.name, spider.num,
                                  permins, spider.version))

        # NOTE(review): the leading `1 or` short-circuits the size check, so
        # the buffer is pushed on every item. This looks like a debugging
        # leftover for the test endpoint -- confirm before restoring the
        # settings.PUSH_DATA_NUM threshold.
        if 1 or len(self.store) >= settings.PUSH_DATA_NUM:
            add_success = pubUtil.addData('add',
                                          self.store,
                                          settings.PUSH_DATA_URL_TEST,
                                          spider.host_name,
                                          carrier=spider.name.upper())
            if add_success:
                # Only drop buffered rows once the push was acknowledged.
                self.store = []
                invalid_success = pubUtil.invalidData(
                    'invalid', spider.task,
                    settings.PUSH_DATA_URL_TEST + 'carrier=%s' % spider.name,
                    spider.host_name)
                if invalid_success:
                    spider.task = []

        # Scrapy requires process_item to hand the item on to the next stage.
        return item
コード例 #2
0
    def process_item(self, item, spider):
        """Normalise a scraped item, buffer it and flush full batches.

        The item is passed through ``dataUtil.strip_item`` and
        ``dataUtil.keys_for_short`` and a ``dict`` copy of the result is
        appended to ``self.store``.  When at least 60 seconds have passed
        since ``self.interval``, the value returned by ``pubUtil.heartbeat``
        is printed.  Once the buffer holds at least ``spider.push_data_num``
        items (or ``settings.PUSH_DATA_NUM`` when the spider does not define
        that attribute) it is sent with ``pubUtil.addData``; on a truthy
        result the buffer is emptied and any pending ``spider.task`` entries
        are reported through ``pubUtil.invalidData``.

        Args:
            item: The scraped item handed over by the framework.
            spider: The spider that produced the item; ``host_name``,
                ``name``, ``num``, ``version``, ``carrier``, ``task`` and
                ``crawler.stats`` are read from it.

        Returns:
            The normalised item, so later pipeline components keep
            receiving an item instead of ``None``.
        """
        item = dataUtil.strip_item(item)
        item = dataUtil.keys_for_short(item)

        # Heartbeat: at most once per 60 seconds, carrying the 'permins' stat.
        run_time = time.time()
        if run_time - self.interval >= 60:
            self.interval = run_time
            permins = spider.crawler.stats.get_value('permins')
            print(pubUtil.heartbeat(spider.host_name, spider.name, spider.num,
                                    permins, spider.version))

        self.store.append(dict(item))

        # A per-spider batch size overrides the project-wide default. The
        # conditional keeps the settings lookup lazy, as in the if/else form.
        num = (spider.push_data_num if hasattr(spider, 'push_data_num')
               else settings.PUSH_DATA_NUM)

        if len(self.store) >= num:
            url = dataUtil.get_random_url(settings.PUSH_DATA_URL)
            add_success = pubUtil.addData('add', self.store, url,
                                          spider.host_name,
                                          carrier=spider.carrier)
            if add_success:
                # Only drop buffered rows once the push was acknowledged.
                self.store = []
                if spider.task:
                    # Short pause between the two requests.
                    time.sleep(0.5)
                    invalid_success = pubUtil.invalidData(
                        'invalid', spider.task,
                        url + 'carrier=%s' % spider.name, spider.host_name)
                    if invalid_success:
                        spider.task = []

        # Scrapy requires process_item to hand the item on to the next stage.
        return item
コード例 #3
0
ファイル: wn.py プロジェクト: Biking0/spider_project
    def process_item(self, item):
        """Buffer one item and flush the item/header buffers when they grow.

        Appends ``item`` to ``self.buffer`` and increments ``self.count``.
        When at least 60 seconds have passed since ``self.st_time``, the
        value returned by ``pubUtil.heartbeat`` is logged and ``self.count``
        is reset to zero.  A buffer holding more than five items is sent
        with ``pubUtil.addData``; more than five entries in ``self.headers``
        are sent with ``pubUtil.push_cookies``.  Each container is cleared
        only when its push returns a truthy value.

        Args:
            item: The record to queue for upload.
        """
        self.buffer.append(item)
        self.count += 1

        # Heartbeat: at most once per 60 seconds, reporting the item count
        # accumulated since the previous heartbeat.
        now = time.time()
        heartbeat_due = now - self.st_time >= 60
        if heartbeat_due:
            self.st_time = now
            beat = pubUtil.heartbeat(self.host_name, self.carrier, self.num,
                                     self.count, self.version)
            logging.info(beat)
            self.count = 0

        # Flush queued items once more than five are waiting.
        if len(self.buffer) > 5:
            pushed = pubUtil.addData('add', self.buffer, self.push_url,
                                     self.host_name, carrier=self.carrier)
            if pushed:
                logging.info(pushed)
                self.buffer.clear()

        # Flush collected headers under the same threshold.
        if len(self.headers) > 5:
            pushed = pubUtil.push_cookies(self.headers, self.carrier)
            if pushed:
                logging.info(pushed)
                self.headers.clear()
コード例 #4
0
    def process_item(self, item, spider):
        """Normalise a scraped item, buffer it and flush full batches.

        The item is passed through ``dataUtil.strip_item`` and
        ``dataUtil.keys_for_short`` and a ``dict`` copy of the result is
        appended to ``self.store``.  When at least 60 seconds have passed
        since ``self.interval``, the value returned by ``pubUtil.heartbeat``
        is printed.  Once the buffer holds at least ``spider.push_data_num``
        items (or ``settings.PUSH_DATA_NUM`` when the spider does not define
        that attribute) it is sent with ``pubUtil.addData``.  On a truthy
        result the buffer is emptied and ``spider.task`` is reported through
        ``pubUtil.invalidData``; spiders that define a ``spe`` attribute
        additionally log a one-line summary of every pushed row.

        Args:
            item: The scraped item handed over by the framework.
            spider: The spider that produced the item; ``host_name``,
                ``name``, ``num``, ``version``, ``task`` and
                ``crawler.stats`` are read from it.

        Returns:
            The normalised item, so later pipeline components keep
            receiving an item instead of ``None``.
        """
        item = dataUtil.strip_item(item)
        item = dataUtil.keys_for_short(item)

        # Heartbeat: at most once per 60 seconds, carrying the 'permins' stat.
        run_time = time.time()
        if run_time - self.interval >= 60:
            self.interval = run_time
            permins = spider.crawler.stats.get_value('permins')
            print(
                pubUtil.heartbeat(spider.host_name, spider.name, spider.num,
                                  permins, spider.version))

        self.store.append(dict(item))

        # A per-spider batch size overrides the project-wide default. The
        # conditional keeps the settings lookup lazy, as in the if/else form.
        num = (spider.push_data_num if hasattr(spider, 'push_data_num')
               else settings.PUSH_DATA_NUM)

        if len(self.store) >= num:
            url = dataUtil.get_random_url(settings.PUSH_DATA_URL)
            add_success = pubUtil.addData('add',
                                          self.store,
                                          url,
                                          spider.host_name,
                                          carrier=spider.name.upper())
            if add_success:
                if hasattr(spider, 'spe'):
                    # One "DEP->ARR:YYYY-MM-DD FLIGHT" entry per pushed row.
                    # presumably depTime is an epoch timestamp -- verify
                    # against the item schema.
                    push_data_log = [
                        '%s->%s:%s %s' % (
                            data.get('depAirport'),
                            data.get('arrAirport'),
                            time.strftime(
                                '%Y-%m-%d',
                                time.localtime(data.get('depTime'))),
                            data.get('flightNumber'),
                        )
                        for data in self.store
                    ]
                    # 20 is the numeric value of logging.INFO.
                    spider.log('push task :' + str(push_data_log), 20)

                # Only drop buffered rows once the push was acknowledged.
                self.store = []
                invalid_success = pubUtil.invalidData(
                    'invalid', spider.task, url + 'carrier=%s' % spider.name,
                    spider.host_name)
                if invalid_success:
                    spider.task = []

        # Scrapy requires process_item to hand the item on to the next stage.
        return item
コード例 #5
0
ファイル: jq_spider.py プロジェクト: Biking0/spider_project
    def process_item(self, item):
        """Buffer one item and push the buffer once it holds five or more.

        Below the threshold the item is only appended to ``self.buffer``.
        At the threshold the buffer is sent with ``pubUtil.addData`` to a
        URL chosen by ``dataUtil.get_random_url`` and its length is added to
        ``self.item_num`` whether or not the push succeeded.  On a truthy
        push result the buffer is emptied and ``self.task`` is reported via
        ``pubUtil.invalidData``.  After a push attempt, if at least 60
        seconds have passed since ``self.now``, the value returned by
        ``pubUtil.heartbeat`` is printed and the counter is reset.

        Args:
            item: The record to queue for upload.
        """
        self.buffer.append(item)
        if len(self.buffer) < 5:
            return

        # Pushes go to the production endpoints (settings.PUSH_DATA_URL); an
        # earlier hand-rolled requests.post variant could target
        # settings.PUSH_DATA_URL_TEST (the test database) instead.
        url = dataUtil.get_random_url(settings.PUSH_DATA_URL)
        pushed = pubUtil.addData('add', self.buffer, url, self.name, 'JQ')

        self.item_num += len(self.buffer)
        if pushed:
            self.buffer = []
            invalidated = pubUtil.invalidData(
                'invalid', self.task, url + 'carrier=%s' % 'JQ', self.name)
            if invalidated:
                self.task = []

        # Heartbeat: only evaluated on a push attempt, at most once per 60 s.
        current = time.time()
        if current - self.now >= 60:
            permins = self.item_num
            self.item_num = 0

            beat = pubUtil.heartbeat('%s' % (self.name), 'jq',
                                     '%s' % self.num, permins, self.version)
            print(beat)
            self.now = current