def start_requests(self):
    # Do some preparation before issuing any requests.
    self.beforeRequest()
    if not self.wait_utils_env_ok():
        self.logWarn(u'环境不可行,退出当前抓取')  # environment not ready, abort this crawl
        return
    # Fetch every category and queue its list page.
    types = getTableByName('baike_type').select()
    for type_row in types:
        url = 'http://fenlei.baike.com/%s/list/' % type_row.name
        self.logInfo(u"开始抓取列表:" + url)  # start crawling list page
        yield scrapy.Request(url=url,
                             meta={
                                 'request_type': self.name,
                                 'typeName': type_row.name
                             },
                             callback=self.parseDetail,
                             dont_filter=True)
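# The readiness check wait_utils_env_ok() is not defined in this section.
# A minimal sketch of what such a check might look like, assuming a
# hypothetical env_is_ok() helper and bounded polling; the real project may
# gate on something else entirely (proxy pool, database connection, ...).
import time

def wait_utils_env_ok(self, retries=10, interval=5):
    # Poll the (assumed) readiness helper a limited number of times,
    # sleeping between attempts, and give up after `retries` failures.
    for _ in range(retries):
        if self.env_is_ok():  # hypothetical helper, not in the original code
            return True
        time.sleep(interval)
    return False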
def updateStatus(self, haoyaoshiId, status):
    """
    Update the status if a record already exists, otherwise insert a new one.
    :param status: be_forbid, start_request, save_success, save_fail, 404,
                   no_complete_data, no_parse_method, dont_need_parse
    :return:
    """
    try:
        results = self.Table.select().where(
            self.Table.haoyaoshi_id == haoyaoshiId)
        if len(results):
            for result in results:
                result.status = status
                result.save()
        else:
            table = getTableByName('haoyaoshi_status')
            table.create(haoyaoshi_id=haoyaoshiId,
                         status=status,
                         update_time=datetime.datetime.now().strftime(
                             '%Y-%m-%d %H:%M:%S'))
    except Exception as e:
        print str(e)
        LogDao.warn(str(e), belong_to='updateStatus')
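# A brief usage sketch for updateStatus(), assuming it lives on the DAO whose
# constructor binds the haoyaoshi_status table (shown further below). The class
# name HaoyaoshiStatusDao and the id 12345 are made up for illustration.
status_dao = HaoyaoshiStatusDao()
status_dao.updateStatus(12345, 'start_request')   # mark the request as issued
# ... fetch, parse and persist the page ...
status_dao.updateStatus(12345, 'save_success')    # or 'save_fail' / '404' on errors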
def __init__(self):
    self.Table = getTableByName('baike_citiao')
def __init__(self, spiderName):
    # Hashes already seen in this run, so the same article fetched at the
    # same time is not crawled twice.
    self.hashList = []
    tableName = getSpiderDetail(spiderName).get('table_name', '')
    self.Table = getTableByName(tableName)
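# How hashList is consulted is not shown in this section. A minimal sketch of
# an in-run dedup helper, assuming articles are keyed by an MD5 of their URL
# (the real keying may differ); the method name checkIsDuplicate is hypothetical.
import hashlib

def checkIsDuplicate(self, url):
    # Hash the URL and remember it for the lifetime of this crawl, so the
    # same article is not queued twice within one run.
    article_hash = hashlib.md5(url).hexdigest()
    if article_hash in self.hashList:
        return True
    self.hashList.append(article_hash)
    return False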
def __init__(self):
    self.belong_to = 'baike_citiao_detail'
    self.logName = u'百科词条详情'  # "baike entry detail"
    self.Table = getTableByName('baike_citiao_detail')
def __init__(self):
    self.belong_to = 'baike_type'
    self.logName = u'百科类型'  # "baike type"
    self.Table = getTableByName('baike_type')
def __init__(self):
    self.Table = getTableByName('haoyaoshi_status')