Esempio n. 1
0
 def cleardb(self, dbpath, dbtbl_name):
     """Reset a table by dropping and recreating it.

     "delete from table" was sometimes ineffective, so the table is
     dropped and rebuilt from its original CREATE statement instead.

     Args:
         dbpath: path to the sqlite database file.
         dbtbl_name: name of the table to reset.
     """
     mylogger.info("call %s " % (sys._getframe().f_code.co_name))
     conn = None
     cursor = None
     try:
         conn = sqlite3.connect(dbpath)
         c = conn.cursor()
         # Fetch the original CREATE TABLE statement so the table can be
         # rebuilt after the drop. The value comparison is parameterized
         # instead of %-interpolated into the SQL text.
         cursor = c.execute(
             "select sql from sqlite_master where type='table' and tbl_name=?",
             (dbtbl_name,))
         row = cursor.fetchone()
         sql_create = row[0] if row else ""
         # The original tested "not sql_create.isspace()", which is True
         # for the empty string, so a missing table would fall into the
         # drop/recreate branch and execute an empty statement. Check
         # truthiness as well.
         if sql_create and not sql_create.isspace():
             sql_drop = "drop table %s" % (dbtbl_name)
             cursor = c.execute(sql_drop)
             cursor = c.execute(sql_create)
             conn.commit()
             mylogger.info("cleardb [%s:%s] %s succeful" %
                           (dbpath, dbtbl_name, sql_drop))
         else:
             mylogger.error("[sql_create:isspace]")
     except sqlite3.Error:
         mylogger.error("cleardb [%s:%s] failed" % (dbpath, dbtbl_name))
     finally:
         # The original closed cursor/conn unconditionally, which raised
         # UnboundLocalError when sqlite3.connect() itself failed.
         if cursor is not None:
             cursor.close()
         if conn is not None:
             conn.close()
Esempio n. 2
0
 def on_result(self, result):
     """Log each crawl result, then delegate it to the base handler."""
     if result:
         mylogger.info("call on_result")
         mylogger.debug(result)
         return super(Handler, self).on_result(result)
     # Empty result: log and do not forward to the base class.
     mylogger.info("call on_result if not result")
Esempio n. 3
0
 def load_conf(self):
     """Load seed URLs from ./<project_name>_conf.json into self.baseUrl.

     The config file must contain a "baseUrl_list" array of objects with
     a "url" key. Empty URLs and URLs already present are skipped.
     """
     conf_file = "./%s_conf.json" % (self.project_name)
     mylogger.info("[conf_file:%s]" % (conf_file))
     # "with" guarantees the handle is closed even if parsing raises
     # (the original leaked the file on error); also avoid shadowing
     # the builtin name "dict".
     with open(conf_file, encoding='utf-8') as f:
         conf = json.load(f)
     for each_one in conf["baseUrl_list"]:
         url = each_one["url"]
         if not url or (url in self.baseUrl):
             continue
         self.baseUrl.append(url)
Esempio n. 4
0
 def detail_page(self, response):
     """Scrape the job fields from a detail page and return them as a dict."""
     mylogger.info("call %s " % (sys._getframe().f_code.co_name))
     doc = response.doc
     record = {
         "url": response.url,
         "proj_name": self.project_name,
     }
     # CSS selector for each scraped field on the detail page.
     field_selectors = (
         ("job_title", 'h1'),
         ("job_salary", '.cn > strong'),
         ("job_comp", '.catn'),
         ("job_scr1", '.ltype'),
         ("job_scr2", '.jtag > div'),
         ("job_addr", '.bmsg > .fp'),
         ("job_comp_brief", '.tmsg'),
     )
     for field, selector in field_selectors:
         record[field] = doc(selector).text()
     return record
Esempio n. 5
0
 def on_start(self):
     """Entry point: reset result/task tables, load config, queue seed URLs."""
     mylogger.info("call %s " % (sys._getframe().f_code.co_name))
     # Wipe previous crawl state so the spider restarts from scratch.
     for db_path, table_prefix in (('./data/result.db', 'resultdb_'),
                                   ('./data/task.db', 'taskdb_')):
         self.cleardb(db_path, table_prefix + self.project_name)
     # Load the seed URL list.
     self.load_conf()
     for url in self.baseUrl:
         mylogger.info("[baseUrl:%s]" % (url))
         self.crawl(url,
                    callback=self.index_page,
                    validate_cert=False,
                    fetch_type='js')
 def copy_db_file(self, from_path, to_path):
     """Copy the crawler database file into the destination directory,
     leaving an existing copy untouched. Resolved paths are stored on
     self.from_path / self.to_path / self.mydbfile."""
     dest_dir = os.path.dirname(to_path)
     # Create the destination directory tree on first use.
     if not os.path.exists(dest_dir):
         os.makedirs(dest_dir)
     self.from_path = os.path.abspath(from_path)
     self.to_path = os.path.abspath(to_path)
     self.mydbfile = os.path.join(self.to_path, os.path.split(self.from_path)[1])
     # If the target file already exists, keep it and do nothing.
     if os.path.isfile(self.mydbfile):
         mylogger.info("数据库已存在,如果要使用新库,请删除当前数据库,系统会自动拷贝爬虫数据库[self.mydbfile:%s]" % (self.mydbfile))
         return
     shutil.copy(from_path, to_path)
     # Verify the copy actually landed.
     if os.path.isfile(self.mydbfile):
         mylogger.info("数据库拷贝成功")
     else:
         mylogger.error("数据库拷贝失败")
     mylogger.info("[self.from_path:%s][self.to_path:%s]" % (self.from_path, self.to_path))
     mylogger.info("[self.mydbfile:%s]" % (self.mydbfile))
Esempio n. 7
0
def my_init():
    """Run one-time initialization, guarded by the module-level g_init_flag."""
    mylogger.info("call %s " % (sys._getframe().f_code.co_name))
    global g_init_flag
    mylogger.info("[g_init_flag:%s]" % (g_init_flag))
    # Already initialized: nothing to do.
    if g_init_flag:
        return
    server = JobsParseServer()
    server.parse()
    server.save2db()
    g_init_flag = True
    mylogger.info("call %s succefful" % (sys._getframe().f_code.co_name))
Esempio n. 8
0
 def index_page(self, response):
     """Queue every job detail page from a listing, then follow pagination."""
     mylogger.info("call %s " % (sys._getframe().f_code.co_name))
     # Follow every job link in the result list.
     for link in response.doc('#resultList p > span > a').items():
         mylogger.info("职位列表url: " + link.attr.href)
         self.crawl(link.attr.href,
                    callback=self.detail_page,
                    validate_cert=False,
                    fetch_type='js')
     # Pagination links at the bottom of the listing.
     page_links = []
     for anchor in response.doc('#resultList  .bk>a').items():
         page_links.append(anchor.attr.href)
     mylogger.info("翻页url: " + "分页".join(page_links))
     self.crawl(page_links,
                callback=self.index_page,
                validate_cert=False,
                fetch_type='js')
Esempio n. 9
0
 def on_finished(self, response, task):
     """Log completion details when the crawl task finishes."""
     mylogger.info("call %s " % (sys._getframe().f_code.co_name))
     # Dump the handler, response and task objects for post-run inspection.
     for obj in (self, response, task):
         mylogger.info(obj)