def Get_html(url):
    """Fetch *url* and return the raw response body.

    :param url: address of the page to download
    :return: ``response.content`` when the server answers 200; ``None``
        (after logging an error) for any other status code
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
    return None
def Get_html(url):
    """Download the page at *url*.

    :param url: address of the page to download
    :return: the raw body (``content`` of the response) on HTTP 200,
        otherwise ``None``; the failure is reported through ``log.error``
    """
    page = requests.get(url)
    status = page.status_code
    if status != 200:
        log.error("网址(%s)无法访问,状态码:%d" % (url, status))
        return None
    return page.content
def get_html_t(url, coding="utf-8"):
    """Fetch *url* and return its body decoded to text.

    The encoding is taken from the first ``charset=...`` declaration found
    in the body; when none is found, *coding* is used instead.

    :param url: address of the page to download
    :param coding: fallback encoding used when the page declares no charset
    :return: the decoded page text (``response.text``)

    A non-200 status is logged, but the body is still decoded and returned.
    """
    response = requests.get(url)
    if response.status_code != 200:
        log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
    # Charset declarations are plain ASCII, so scan an ASCII view of the raw
    # bytes instead of the repr produced by str(bytes).
    head = response.content.decode("ascii", "ignore")
    # Raw string avoids the invalid "\w" escape; "[\w-]+" also accepts
    # hyphenated names such as "utf-8" or "iso-8859-1", and the optional
    # quote covers the HTML5 form <meta charset="utf-8">.
    declared = re.search(r"charset=[\"']?([\w-]+)", head)
    if declared:
        coding = declared.group(1).lower()
    response.encoding = coding
    return response.text
def get_html_t(url, coding="utf-8"):
    """Download *url* and decode it using the charset the page declares.

    :param url: address of the page to download
    :param coding: encoding to fall back on when the body contains no
        ``charset=...`` declaration
    :return: the page as text (``response.text``)

    A non-200 status is only logged; decoding and returning the body still
    happens in that case.
    """
    response = requests.get(url)
    if response.status_code != 200:
        log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
    # Look for the declaration in an ASCII projection of the raw bytes;
    # str(bytes) would give the bytes repr on Python 3.
    ascii_body = response.content.decode("ascii", "ignore")
    # Raw string (no invalid "\w" escape). Hyphens are allowed so "utf-8" or
    # "iso-8859-1" are recognised, and an optional opening quote handles
    # <meta charset="...">.
    charsets = re.findall(r"charset=[\"']?([\w-]+)", ascii_body)
    if charsets:
        coding = charsets[0].lower()
    response.encoding = coding
    return response.text
def __send_poster(open_id, poster_url):
    """Render the poster page to an image and send it to the user.

    :param open_id: openid of the user who receives the poster
    :param poster_url: address of the poster page passed to
        ``html_change_to_png``
    :return: always ``None``

    When ``html_change_to_png`` returns ``None`` the failure is only logged;
    no message is sent to the user.
    """
    image_url = html_change_to_png(poster_url)
    if image_url is not None:
        img_tag = "<img style='width: 200px;' src='%s'>" % image_url
        __send_message_use_api(open_id, img_tag)
        # Tell the WeChat auto-reply system that the card poster was updated.
        tell_api_build_card(open_id, image_url)
        return None
    log.error(u'生成个人海报失败,用户openid:%s' % open_id)
    return None
def Get_html(url):
    """Fetch an HTML page and return it re-encoded as UTF-8 bytes.

    :param url: address of the page to download
    :return: the body decoded as gb2312 (undecodable bytes are ignored) and
        encoded again as UTF-8; ``None`` when the status code is not 200
    """
    response = requests.get(url)
    if response.status_code == 200:
        # Decode from gb2312, dropping bad bytes, then encode as UTF-8.
        decoded = response.content.decode('gb2312', 'ignore')
        return decoded.encode('utf8')
    log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
    return None
def tell_api_build_card(open_id, pic_url):
    """Notify the API endpoint that a poster image was generated.

    Posts the key ``'poster_image_' + open_id`` with *pic_url* as its value.

    :param open_id: openid of the user the poster belongs to
    :param pic_url: address of the generated poster image
    :return: ``None``
    :raises Exception: when the endpoint answers with a status other than 200
    """
    endpoint = 'http://wechatsys.lingnanchuangye.com/set_key_data/frefer45g4re514re'
    payload = {
        'key': 'poster_image_' + open_id,
        'value': pic_url,
    }
    response = requests.post(endpoint, data=payload)
    if response.status_code == 200:
        return
    log.error(u'通知微信公众号生成了新海报操作失败,用户openid:%s' % open_id)
    raise Exception('error!')
def tell_api_build_card(open_id, pic_url):
    """Report a newly generated poster to the API endpoint.

    :param open_id: openid of the user; used to build the stored key
        ``'poster_image_' + open_id``
    :param pic_url: address of the poster image, stored as the value
    :return: ``None``
    :raises Exception: if the POST request does not return status 200
        (the failure is logged first)
    """
    form = {'key': 'poster_image_' + open_id, 'value': pic_url}
    reply = requests.post(
        'http://wechatsys.lingnanchuangye.com/set_key_data/frefer45g4re514re',
        data=form,
    )
    succeeded = reply.status_code == 200
    if not succeeded:
        log.error(u'通知微信公众号生成了新海报操作失败,用户openid:%s' % open_id)
        raise Exception('error!')
def get_all_page_of_job():
    """Fetch the job listing page and process every entry found on it.

    The page at the module-level ``first_url`` is decoded as GBK, split into
    entries by ``get_tile_and_link_lists`` and each entry is passed through
    ``handle_job_message``.

    :return: list of processed entries, or ``None`` when the listing page
        does not answer with status 200
    """
    listing = requests.get(first_url)
    if listing.status_code != 200:
        log.error("网址(%s)无法访问,状态码:%d" % (first_url, listing.status_code))
        return None
    listing.encoding = 'gbk'
    entries = get_tile_and_link_lists(listing.text)
    return [handle_job_message(entry) for entry in entries]
def handle_job_message(obj):
    """Download one job posting and fill in its details.

    :param obj: mapping with a ``'web_url'`` entry; it is updated in place
    :return: the same *obj*. On success it gains ``'web_html'``,
        ``'company'``, ``'position'`` and ``'work_city'``; when the page does
        not answer with status 200 it is returned unchanged after logging.
    """
    # Pause before each request (delegated to tools.sleep_some_time).
    tools.sleep_some_time()
    detail_url = obj['web_url']
    response = requests.get(detail_url)
    if response.status_code != 200:
        log.error("网址(%s)无法访问,状态码:%d" % (detail_url, response.status_code))
        return obj
    page = response.content
    obj['web_html'] = page
    obj['company'] = tools.get_company_name(page)
    obj['position'] = tools.get_work_position(page)
    obj['work_city'] = tools.get_work_citys(page)
    return obj
def get_all_page_html():
    """Collect job entries from several listing pages and process them.

    The first page is fetched with GET from the module-level
    ``first_html_url``. Later pages are requested by POSTing the search form
    back with ``__EVENTTARGET`` set to the pager control for that page.
    Paging stops after page 49 or as soon as a page yields fewer than 10
    entries.

    :return: list of entries, each passed through ``handle_job_message``;
        ``None`` when the first page fails or has an empty body
    """
    # Fetch the first page.
    first_page = requests.get(first_html_url)
    if first_page.status_code != 200:
        return None
    html = first_page.content
    if not html:
        return None

    messages = get_message_title_and_url_list(html)
    form_data = get_search_form(html)
    # pager_index is "page - 1" for pages 2..49.
    for pager_index in range(1, 49):
        form_data['__EVENTTARGET'] = 'ctl00$cph_content_temp$DataPager1$ctl01$ctl0%d' % pager_index
        response = requests.post(first_html_url, form_data)
        if response.status_code != 200:
            log.error("网址(%s)无法访问,状态码:%d" % (first_html_url, response.status_code))
            continue
        # Refresh the hidden form fields from the page just received.
        html = response.content
        form_data = get_search_form(html)
        page_messages = get_message_title_and_url_list(html)
        messages.extend(page_messages)
        if not page_messages or len(page_messages) < 10:
            break
    return [handle_job_message(message) for message in messages]
def get_all_page_of_work():
    """Collect job entries from up to five listing pages and process them.

    Page URLs are built as ``web_url + str(page)`` for pages 1..5. Paging
    stops early when a page yields fewer than 15 entries.

    :return: list of entries, each passed through ``handle_job_message``
    """
    messages = []
    for page in range(1, 6):
        url = web_url + str(page)
        response = requests.get(url)
        if response.status_code != 200:
            log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
            # Move on to the next page. The previous while-loop hit
            # `continue` without advancing `page`, so a failing page was
            # requested again forever.
            continue
        objs = get_message_title_and_url_list(response.content)
        messages.extend(objs)
        if len(objs) < 15:
            break
    return list(map(handle_job_message, messages))
def get_all_jobs():
    """Collect the most recent job entries and process them.

    Page URLs are built as ``message_url % page`` for pages 1..5, and each
    page is decoded as GBK. Paging stops early when a page yields fewer than
    18 entries.

    :return: list of entries, each passed through ``handle_job_message``
    """
    messages = []
    for page in range(1, 6):
        url = message_url % page
        response = requests.get(url)
        if response.status_code != 200:
            log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
            # Skip the failing page. The previous while-loop never advanced
            # `page` on this path, so it retried the same URL endlessly.
            continue
        response.encoding = 'gbk'
        objs = get_title_and_link_list(response.text)
        messages.extend(objs)
        if len(objs) < 18:
            break
    return list(map(handle_job_message, messages))
def add_a_job(job_title, job_company, job_url, job_city, job_message_source,
              job_position, job_release_time, web_html):
    """Insert one job posting into the ``jobs`` table.

    :param job_title: title of the job
    :param job_company: company name
    :param job_url: address of the page the posting links to
    :param job_city: work location; a list or tuple is joined with ``'#'``
    :param job_message_source: source of the posting
    :param job_position: job position; a list or tuple is joined with ``'#'``
    :param job_release_time: release time, passed through
        ``tools.get_real_time`` before being stored
    :param web_html: scraped HTML of the page
    :return: the value returned by ``db_lib.insert`` on success, ``0`` when
        the insert raises ``IntegrityError``, ``-1`` on any other exception
        (which is logged)
    """
    # isinstance also covers list subclasses; tuples are joined the same way.
    if isinstance(job_city, (list, tuple)):
        job_city = '#'.join(job_city)
    if isinstance(job_position, (list, tuple)):
        job_position = '#'.join(job_position)
    # The token is the MD5 hex digest of the UTF-8 encoded company name.
    token = hashlib.md5(job_company.encode("UTF-8")).hexdigest()
    job_release_time = tools.get_real_time(job_release_time)
    sql = (
        " insert into jobs(title, company, position, web_url, work_city,"
        " message_source, job_type, authentication, status, web_html,"
        " release_time, token, create_time)"
        " values(%s, %s, %s, %s, %s, %s, 0, 0, 0, %s, %s, %s, now()); "
    )
    params = [
        job_title, job_company, job_position, job_url, job_city,
        job_message_source, web_html, job_release_time, token,
    ]
    try:
        insert_id = db_lib.insert(sql, params)
        add_company(job_company)
        return insert_id
    except IntegrityError:
        return 0
    except Exception as error:
        log.error("写入数据库失败(错误类型:%s), 信息地址:%s" % (str(error), job_url))
        return -1
def get_all_page_of_job():
    """Collect the most recent job entries and process them.

    Each page URL is built by formatting the module-level ``message_url``
    with the record offset ``(page - 1) * 20``. Only the first page is
    fetched, as before. Paging stops early when a page yields fewer than 20
    entries.

    :return: list of entries, each passed through ``handle_job_message``
    """
    max_pages = 1
    page_size = 20
    messages = []
    for page in range(1, max_pages + 1):
        # `%` and `*` have equal precedence and bind left to right, so the
        # old `message_url % (page - 1) * 20` formatted the URL first and
        # then repeated the resulting string 20 times. Compute the offset
        # before formatting.
        offset = (page - 1) * page_size
        url = message_url % offset
        response = requests.get(url)
        if response.status_code != 200:
            log.error("网址(%s)无法访问,状态码:%d" % (url, response.status_code))
            # Skip the failing page; the previous while-loop never advanced
            # `page` here and retried the same URL forever.
            continue
        objs = get_title_and_link_list(response.content)
        messages.extend(objs)
        if len(objs) < page_size:
            break
    return list(map(handle_job_message, messages))
def get_html(url):
    """Fetch *url* and return the raw response body.

    :param url: address of the page to download
    :return: ``content`` of the response. A status other than 200 is logged,
        but the body is returned regardless.
    """
    reply = requests.get(url)
    status = reply.status_code
    if status != 200:
        log.error("网址(%s)无法访问,状态码:%d" % (url, status))
    return reply.content