def strptime(string, format):
    """Parse *string* with ``time.strptime`` using *format*.

    :param string: date/time text; ``None`` or ``""`` is treated as "no value".
    :param format: ``time.strptime`` format spec (note: shadows the builtin
        name ``format``; kept for backward compatibility with callers).
    :return: a ``time.struct_time`` on success, otherwise ``None``
        (parse failures are logged, never raised).
    """
    # Guard clause outside the try: empty input is not an error condition.
    if string is None or len(string) == 0:
        return None
    try:
        return time.strptime(string, format)
    except Exception as e:
        # Log and swallow: callers rely on a None result for bad input.
        log("时间转换失败===> params: {},{}".format(string, format) + str(e))
        return None
def start_requests(self):
    """Emit one company-search Request per company row, paging through
    the local company table.

    Prefers the company's full name over its short name; ``meta`` carries
    the chosen name and the (fixed) result page number 1.
    """
    current_page = 1
    result = self.query_company_page(current_page)
    pages = result.get("pages")
    logger.log("公司总页数:" + str(pages))
    while current_page <= pages:
        # Reuse the page-1 result fetched above: the original re-queried
        # page 1 inside the loop, issuing the first query twice.
        if current_page > 1:
            result = self.query_company_page(current_page)
        for row in result.get("rows"):
            full_name, name = row[0], row[1]
            # Fall back to the short name only when the full name is absent.
            company = full_name if full_name is not None else name
            yield Request(self.company_url.format(1, company),
                          dont_filter=True,
                          meta={
                              "company": company,
                              "current_page": 1
                          })
        current_page += 1
def execute(self, sql, params):
    """Run a write statement (INSERT/UPDATE/DELETE) and commit.

    :param sql: SQL text with ``%s`` placeholders.
    :param params: a tuple for a single ``execute``, or a list of tuples
        for ``executemany``. An empty list is a no-op.
    :return: the cursor's ``lastrowid`` (auto-increment PK) on success,
        otherwise ``None``. Errors are logged and the transaction rolled back.
    """
    # Nothing to do for an empty batch.
    if isinstance(params, list) and len(params) == 0:
        return
    connection = self.get_connection()
    try:
        # The context manager closes the cursor automatically.
        with connection.cursor() as cursor:
            # List of parameter tuples => batched executemany.
            if isinstance(params, list):
                cursor.executemany(sql, params)
            else:
                cursor.execute(sql, params)
            # Auto-increment id of the last inserted row (falsy when N/A).
            pk = cursor.lastrowid
            connection.commit()
        if pk:
            cy_logger.log("executes successfully! last_row_id==> " + str(pk))
        else:
            cy_logger.log("executes successfully")
        return pk
    except Exception as e:
        cy_logger.error(str(e))
        # Roll back the failed transaction so the connection state is clean.
        connection.rollback()
    finally:
        # Always return the connection, success or failure.
        connection.close()
def process_item(self, item, spider):
    """Pipeline hook: log the scraped item and persist it.

    Returns the item so downstream pipelines still receive it — the
    original implicitly returned None, which per the Scrapy item-pipeline
    contract starves any later pipeline of items.
    """
    logger.log("*" * 100)
    logger.log("===> " + str(item))
    logger.log("*" * 100)
    # Parameterized query: values are bound by the driver, not interpolated.
    insert("INSERT INTO `xsbbiz`.`spider_test` (`name`, `company`) "
           "VALUES (%s, %s)", (item['name'], item['company']))
    return item
def select_rows_paper(self, sql, param=None, page_no=1, page_size=20):
    """Paged SELECT.

    :param sql: base SELECT statement. WARNING: *param* and the LIMIT
        clause are spliced directly into the SQL text — never pass
        untrusted input here (SQL-injection risk, kept for backward
        compatibility with existing callers).
    :param param: optional raw fragment appended to *sql* before counting.
    :param page_no: 1-based page number; clamped to the last page.
    :param page_size: rows per page.
    :return: dict with ``total``/``page_no``/``pages``/``rows``,
        or ``None`` when a database error occurs (logged, not raised).
    """
    connection = self.get_connection()
    try:
        with connection.cursor() as cursor:
            if param:
                sql = sql + '%s' % param
            # Count the full result set via a derived table.
            cursor.execute("SELECT COUNT(1) FROM (%s) tmp" % sql)
            total = cursor.fetchall()[0][0]
            pages = 0
            rows = []
            if total > 0:
                # Integer ceiling division. The original used
                # int(total / page_size): float division can lose
                # precision for very large totals.
                pages = (total + page_size - 1) // page_size
                if page_no > pages:
                    page_no = pages
                offset = (page_no - 1) * page_size  # rows to skip
                sql = sql + ' LIMIT %s, %s' % (offset, page_size)
                cy_logger.log(sql)
                cursor.execute(sql)
                rows = cursor.fetchall()
            return {
                'total': total,
                'page_no': page_no,
                'pages': pages,
                'rows': rows
            }
    except Exception as e:
        cy_logger.error(e)
    finally:
        connection.close()
def parse_page(self, response):
    """Parse one listing page of the company JSON API.

    On page 1, fans out Requests for every remaining listing page.
    For each company row, fills an ItjuziCompanyItem and follows up
    with a detail-page FormRequest carrying the item in ``meta``.
    """
    # Decode the JSON body into a Python structure.
    data = json.loads(response.body.decode('utf-8'))
    logger.log("列表地址===>" + response.url)

    # The payload of interest lives under the "data" node.
    payload = jsonpath.jsonpath(data, '$..data')[0]
    page_count = payload['page_total']  # total number of pages
    logger.log(page_count)
    now_page = payload['page_num']      # page we are currently on
    logger.log(now_page)

    # Only the first page schedules the rest, so pages are fetched once.
    if now_page == 1:
        for page in range(2, page_count + 1):
            yield Request(self.list_url.format(page),
                          cookies=self.cookies,
                          callback=self.parse_page,
                          headers=self.headers,
                          dont_filter=True)

    # Straight field copies: (item field, source key).
    field_map = (
        ("company_name", "com_name"),
        ("company_logo", "com_logo_archive"),    # logo URL
        ("company_id", "com_id"),
        ("company_des", "com_des"),              # description
        ("company_fa", "com_cat_name"),          # industry
        ("company_son", "com_sub_cat_name"),     # sub-industry
        ("company_count", "invse_total_money"),  # total funding
        ("company_money", "guzhi"),              # valuation
        ("company_addr", "com_addr"),
        ("company_slogan", "com_slogan"),
        ("company_btime", "com_born_date"),      # founding date
        ("company_people", "com_scale"),         # headcount
        ("company_operate", "com_status"),       # operating status
    )
    for company in payload['rows']:
        item = ItjuziCompanyItem()
        for item_key, src_key in field_map:
            item[item_key] = company[src_key]
        # Latest funding summary: date, round, amount, running total.
        item["company_recent"] = ' '.join((company['invse_date'],
                                           company['invse_round_id'],
                                           company['invse_detail_money'],
                                           company['invse_total_money']))
        # Detail page request; the partially-filled item rides in meta.
        yield FormRequest("https://www.itjuzi.com/company/" + item['company_id'],
                          cookies=self.cookies,
                          meta={'meta1': item},
                          callback=self.parse_detail,
                          headers=self.headers,
                          dont_filter=True)
def __init__(self, *a, **kw):
    """Spider constructor that ALSO performs the full scrape.

    NOTE(review): doing all crawling inside __init__ (with a blocking
    Selenium browser) bypasses Scrapy's scheduler entirely — presumably
    intentional here, but worth confirming.
    For every company from the local DB it loads the Baidu job-search
    page, scrolls to trigger lazy loading, and saves each job posting
    via self.save_item().
    """
    super(BaiduZhaopinSpider, self).__init__(*a, **kw)
    self.chrome_options = Options()
    # Run Chrome headless (no visible browser window).
    self.chrome_options.add_argument('--headless')
    self.chrome_options.add_argument('--disable-gpu')
    self.browser = webdriver.Chrome(
        executable_path=r'spider/file/chromedriver.exe',
        chrome_options=self.chrome_options)
    self.browser.maximize_window()  # maximize the window
    # Implicit wait: poll up to 2 seconds for elements to appear.
    self.browser.implicitly_wait(2)
    current_page = 1
    result = self.query_company_page(current_page)
    pages = result.get("pages")
    logger.log("公司总页数:" + str(pages))
    # Iterate every page of companies from the local database.
    while current_page <= pages:
        result = self.query_company_page(current_page)
        for row in result.get("rows"):
            # row[1] is assumed to be the company's short name — TODO confirm.
            name = row[1]
            self.browser.get(self.company_url.format(name))
            # Collect the currently-rendered job cards.
            job_list = self.browser.find_elements_by_xpath(
                "//div[@id='qz-list-box']/a[@id='cktarget']")
            cur_num = 0
            if len(job_list) > 0:
                # Keep scrolling while each scroll loads more cards;
                # stops when the count no longer grows.
                while len(job_list) > cur_num:
                    cur_num = len(job_list)
                    # Submit a batch once every 10 newly loaded cards.
                    if cur_num % 10 == 0:
                        # Only the last 10 cards are new; earlier ones
                        # were submitted in previous iterations.
                        for job in job_list[-10:]:
                            item = BaiduZhaopinItem()
                            item['company_name'] = job.find_element_by_xpath(
                                "./div/div[@class='inlineblock percent33']/div[@class='company']/span[@class='inlineblock companyname']"
                            ).text
                            item['job_name'] = job.find_element_by_xpath(
                                "./div/div[@class='inlineblock percent47']/div[@class='title']"
                            ).text
                            item['location'] = job.find_element_by_xpath(
                                "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[1]"
                            ).text
                            item['education'] = job.find_element_by_xpath(
                                "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[3]"
                            ).text
                            item['years'] = job.find_element_by_xpath(
                                "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[5]"
                            ).text
                            salary_num = job.find_element_by_xpath(
                                "./div/div[@class='inlineblock percent47']/div[@class='salaryarea top16']/span[@class='inlineblock num']"
                            ).text
                            # Salary unit may be missing on some cards.
                            try:
                                salary_unit = job.find_element_by_xpath(
                                    "./div/div[@class='inlineblock percent47']/div[@class='salaryarea top16']/span[@class='inlineblock unit']"
                                ).text
                            except:
                                salary_unit = ""
                            item['salary'] = salary_num + salary_unit
                            # Click the card to open the detail page (new window).
                            ActionChains(self.browser).move_to_element(
                                job).click(job).perform()
                            # Window handles (a list).
                            handles = self.browser.window_handles
                            for handle in handles:
                                # Switch to the newly opened window.
                                if handle != self.browser.current_window_handle:
                                    self.browser.switch_to_window(handle)
                                    break
                            item['release_time'] = self.browser.find_element_by_xpath(
                                "//div[@class='job-desc-box inner home-inner clearfix']/div[@class='job-desc right-box']/div[@class='job-desc-item'][1]/div[@class='job-classfiy']/p[2]"
                            ).text
                            item['platform'] = self.browser.find_element_by_xpath(
                                "//div[@class='job-desc-box inner home-inner clearfix']/div[@class='job-desc right-box']/div[@class='job-desc-item'][3]/div[@class='media-item source']/div[@class='item-bd']/h4[@class='bd-tt']"
                            ).text
                            self.browser.close()  # close the detail window
                            self.browser.switch_to_window(
                                handles[0])  # back to the list window
                            self.save_item(item)
                    # Scroll to the bottom to trigger lazy loading of more cards.
                    self.browser.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight)")
                    job_list = self.browser.find_elements_by_xpath(
                        "//div[@id='qz-list-box']/a[@id='cktarget']")
                # Submit the remaining (last, possibly partial) batch.
                # NOTE(review): same extraction as above, duplicated inline.
                for job in job_list[-10:]:
                    item = BaiduZhaopinItem()
                    item['company_name'] = job.find_element_by_xpath(
                        "./div/div[@class='inlineblock percent33']/div[@class='company']/span[@class='inlineblock companyname']"
                    ).text
                    item['job_name'] = job.find_element_by_xpath(
                        "./div/div[@class='inlineblock percent47']/div[@class='title']"
                    ).text
                    item['location'] = job.find_element_by_xpath(
                        "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[1]"
                    ).text
                    item['education'] = job.find_element_by_xpath(
                        "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[3]"
                    ).text
                    item['years'] = job.find_element_by_xpath(
                        "./div/div[@class='inlineblock percent33']/div[@class='detail']/span[5]"
                    ).text
                    salary_num = job.find_element_by_xpath(
                        "./div/div[@class='inlineblock percent47']/div[@class='salaryarea top16']/span[@class='inlineblock num']"
                    ).text
                    # Salary unit may be missing on some cards.
                    try:
                        salary_unit = job.find_element_by_xpath(
                            "./div/div[@class='inlineblock percent47']/div[@class='salaryarea top16']/span[@class='inlineblock unit']"
                        ).text
                    except:
                        salary_unit = ""
                    item['salary'] = salary_num + salary_unit
                    # Click the card to open the detail page (new window).
                    ActionChains(self.browser).move_to_element(job).click(
                        job).perform()
                    # Window handles (a list).
                    handles = self.browser.window_handles
                    for handle in handles:
                        # Switch to the newly opened window.
                        if handle != self.browser.current_window_handle:
                            self.browser.switch_to_window(handle)
                            break
                    item['release_time'] = self.browser.find_element_by_xpath(
                        "//div[@class='job-desc-box inner home-inner clearfix']/div[@class='job-desc right-box']/div[@class='job-desc-item'][1]/div[@class='job-classfiy']/p[2]"
                    ).text
                    # Source platform may be absent; default to "百度".
                    try:
                        item['platform'] = self.browser.find_element_by_xpath(
                            "//div[@class='job-desc-box inner home-inner clearfix']/div[@class='job-desc right-box']/div[@class='job-desc-item'][3]/div[@class='media-item source']/div[@class='item-bd']/h4[@class='bd-tt']"
                        ).text
                    except:
                        item['platform'] = "百度"
                    self.browser.close()  # close the detail window
                    self.browser.switch_to_window(handles[0])  # back to the list window
                    self.save_item(item)
        current_page += 1
    # All companies processed: shut down the browser.
    self.browser.quit()
def process_item(self, item, spider):
    """Pipeline hook: log the scraped item.

    Returns the item so later pipelines in the chain still receive it
    (the Scrapy pipeline contract; the original implicitly returned None).
    """
    logger.log("===> " + str(item))
    return item