Esempio n. 1
0
    def crawl(self):
        """Consume queue messages and crawl one item per message until the queue is empty.

        Each message comes from ``self.get_q()``.  When ``self._q_type`` is
        ``'main'`` a new ``Item`` is built, ``item.antPage`` is called with
        ``_data[1]`` (plus ``self.a_val`` when it is set), the item's
        ``outSql()`` row is pushed onto ``self.items`` and also appended to a
        local batch that is handed to ``self.insertIteminfo``.

        When ``get_q`` raises ``Empty`` the remaining batch is passed to
        ``self.insertIteminfo(batch, True)`` and the method returns.

        Error handling per message:

        * ``Common.NoItemException`` / ``Common.NoPageException``: logged only.
        * ``Common.InvalidPageException``: the message is handed to
          ``self.crawlRetry`` and logged.
        * any other ``Exception``: logged with traceback, handed to
          ``self.crawlRetry``, and, unless the error text contains
          ``'Read timed out'``, ``self.dialRouter(4, 'item')`` is called,
          followed by a random 10-40 second pause.

        In every handled case ``self.queue.task_done()`` is called exactly
        once for the message, even if the retry or logging call itself
        raises, so the queue's unfinished-task count is not left dangling.
        """
        # Item-info SQL rows waiting to be written as a batch.
        pending_info_sql = []
        while True:
            _data = None
            try:
                try:
                    # Fetch the next queue message.
                    _data = self.get_q()
                except Empty:
                    # Queue is empty: hand over whatever is still batched
                    # (second argument True, as on the original exit path)
                    # and stop.
                    self.insertIteminfo(pending_info_sql, True)
                    break

                if self._q_type == 'main':
                    # New item instance for this message.
                    item = Item()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # Aggregate the row on self.items.
                    self.push_back(self.items, item.outSql())
                    # Batch the row for the database.
                    # NOTE(review): outSql() is deliberately still called a
                    # second time; it could probably be computed once if it
                    # is side-effect free -- confirm before merging the calls.
                    pending_info_sql.append(item.outSql())
                    # Start a new batch once insertIteminfo reports a truthy
                    # result (presumably "batch written" -- TODO confirm).
                    if self.insertIteminfo(pending_info_sql):
                        pending_info_sql = []

                # Delay between messages.
                time.sleep(1)
                # Tell the queue this task is finished.
                self.queue.task_done()

            except Common.NoItemException as e:
                try:
                    Common.log('# Not item exception: %s' % e)
                finally:
                    self.queue.task_done()

            except Common.NoPageException as e:
                try:
                    Common.log('# Not page exception: %s' % e)
                finally:
                    self.queue.task_done()

            except Common.InvalidPageException as e:
                # Retry first, then mark the task done (original order); the
                # finally guarantees task_done() even if the retry fails.
                try:
                    self.crawlRetry(_data)
                    Common.log('# Invalid page exception: %s' % e)
                finally:
                    self.queue.task_done()

            except Exception as e:
                try:
                    Common.log('# Unknown exception crawl item: %s' % e)
                    Common.traceback_log()
                    self.crawlRetry(_data)
                finally:
                    self.queue.task_done()

                message = str(e)
                # Name-resolution failures: also log the offending message.
                if ('Name or service not known' in message
                        or 'Temporary failure in name resolution' in message):
                    Common.log(_data)
                # Anything other than a read timeout triggers a router redial.
                if 'Read timed out' not in message:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as dial_err:
                        Common.log('# DailClient Exception err: %s' % dial_err)
                        time.sleep(10)
                    time.sleep(random.uniform(10, 40))