Ejemplo n.º 1
0
    def start_requests(self):
        """Yield one list-page request for every row of the ``baike_type`` table.

        Calls ``self.beforeRequest()`` first, then stops without yielding
        anything (after logging a warning) when ``self.wait_utils_env_ok()``
        returns a falsy value. Each request carries the spider name and the
        row's ``name`` in ``meta``, uses ``self.parseDetail`` as callback and
        is created with ``dont_filter=True``.
        """
        # Pre-request hook; runs before the environment check.
        self.beforeRequest()

        env_ready = self.wait_utils_env_ok()
        if not env_ready:
            self.logWarn(u'环境不可行,退出当前抓取')
            return

        # One request per stored type row.
        for row in getTableByName('baike_type').select():
            list_url = 'http://fenlei.baike.com/%s/list/' % row.name
            self.logInfo(u"开始抓取列表:" + list_url)
            request_meta = {
                'request_type': self.name,
                'typeName': row.name,
            }
            yield scrapy.Request(
                url=list_url,
                meta=request_meta,
                callback=self.parseDetail,
                dont_filter=True,
            )
Ejemplo n.º 2
0
 def updateStatus(self, haoyaoshiId, status):
     """
     Update the status of the matching rows, or create a row when none exists.

     Rows are looked up in ``self.Table`` by ``haoyaoshi_id``. When at least
     one row matches, each one gets the new ``status`` and is saved. When no
     row matches, a new row is created in the ``haoyaoshi_status`` table with
     the current local time as ``update_time``.

     :param haoyaoshiId: value compared against ``self.Table.haoyaoshi_id``
     :param status: be_forbid, start_request, save_success, save_fail, 404, no_complete_data, no_parse_method, dont_need_parse
     :return: None; any exception is printed and logged through
              ``LogDao.warn`` instead of being raised (best-effort update)
     """
     try:
         results = self.Table.select().where(
             self.Table.haoyaoshi_id == haoyaoshiId)
         if len(results):
             # NOTE(review): update_time is not refreshed on this path,
             # only on creation below -- confirm this is intended.
             for result in results:
                 result.status = status
                 result.save()
         else:
             table = getTableByName('haoyaoshi_status')
             table.create(haoyaoshi_id=haoyaoshiId,
                          status=status,
                          update_time=datetime.datetime.now().strftime(
                              '%Y-%m-%d %H:%M:%S'))
     except Exception as e:
         # Format once; the call form of print gives the same output on
         # Python 2 and is also valid syntax on Python 3.
         message = str(e)
         print(message)
         LogDao.warn(message, belong_to='updateStatus')
Ejemplo n.º 3
0
 def __init__(self):
     """Bind ``self.Table`` to the table returned for ``'baike_citiao'``."""
     citiao_table = getTableByName('baike_citiao')
     self.Table = citiao_table
Ejemplo n.º 4
0
 def __init__(self, spiderName):
     """Set up an empty hash list and bind the table configured for a spider.

     :param spiderName: passed to ``getSpiderDetail``; the ``'table_name'``
                        entry of the result (default ``''``) selects the table
     """
     # Hashes already seen during this run, so the same article obtained
     # at the same time is not crawled twice.
     self.hashList = []
     spider_detail = getSpiderDetail(spiderName)
     table_name = spider_detail.get('table_name', '')
     self.Table = getTableByName(table_name)
Ejemplo n.º 5
0
 def __init__(self):
     """Set the log labels and bind the ``'baike_citiao_detail'`` table."""
     table_key = 'baike_citiao_detail'
     self.belong_to = table_key
     self.logName = u'百科词条详情'
     self.Table = getTableByName(table_key)
Ejemplo n.º 6
0
 def __init__(self):
     """Set the log labels and bind the ``'baike_type'`` table."""
     table_key = 'baike_type'
     self.belong_to = table_key
     self.logName = u'百科类型'
     self.Table = getTableByName(table_key)
Ejemplo n.º 7
0
 def __init__(self):
     """Bind ``self.Table`` to the table returned for ``'haoyaoshi_status'``."""
     status_table = getTableByName('haoyaoshi_status')
     self.Table = status_table