def main_handler():
    global result, last_result
    try:
        logging.info('Temp file: ' + ZK_TMP_FILE)
        if not os.path.exists(ZK_TMP_FILE):
            logging.info('Temp file does not exist')
        else:
            with open(ZK_TMP_FILE, 'r', encoding=UTF8_ENCODING) as f:
                result = json.load(f)
                logging.info('Currently stored entries: ' + str(len(result.keys())))
        # Hot posts on the zk front page
        d = py(ZK_BASE_URL, headers=REQUEST_HEADERS, encoding=GBK_ENCODING)
        d('#threadlisttableid tbody').each(deal_post)
        # 0818tuan
        d = py(TUAN_BASE_URL, headers=REQUEST_HEADERS, encoding=GB2312_ENCODING)
        d('.list-group > .list-group-item').each(deal_post_tuan)
    except Exception as ex:
        logging.exception('Main task failed: ' + str(ex))
        raise ex
    finally:
        # Create the storage directory
        if not os.path.exists(BASE_DIR):
            os.makedirs(BASE_DIR)
        # Persist the result
        with open(ZK_TMP_FILE, 'w', encoding=UTF8_ENCODING) as f:
            # Reset the store once it grows past 500 entries
            if len(result.keys()) > 500:
                last_result.clear()
                last_result = result
                result = dict()
            logging.info('Saving results')
            json.dump(result, f)
def get_post_info(post_id, title=None, time=None):
    info = dict()
    info['url'] = ZK_POST_URL % post_id
    logging.info('Crawling link: ' + info['url'])
    d = py(info['url'], headers=REQUEST_HEADERS, encoding=GBK_ENCODING,
           timeout=REQUEST_TIMEOUT)
    # Post title
    post_title = d('#thread_subject').attr('title')
    # Post time (strip the "发表于 " prefix used by the page)
    post_time = d('.pti:first>.authi:first').find('em:first').text().replace('发表于 ', '')
    # Post images
    info['images'] = list()
    if post_title is None:
        info['title'] = None if title is None else title
        info['time'] = None if time is None else time
        info['content'] = d('#messagetext>p:first').text()
    else:
        info['title'] = post_title
        info['time'] = post_time
        ele = d('#postlist>div:first').find('tr:first').find('.t_f').clone()
        info['content'] = ele.remove('ignore_js_op').text()
        mapping = get_url_mapping(ele)
        for href in mapping:
            info['content'] = info['content'].replace(href, mapping[href])
    for e in d('.t_fsz:first').find('ignore_js_op').find('img'):
        e = py(e)
        if e.attr('aid') is None:
            continue
        src = e.attr('file')
        if len(re.findall('.jpg|.jpeg|.png', src, re.I)) == 0:
            continue
        info['images'].append(src)
    return info
def get_one_page_book_list(url, **kwargs):
    book_list = []
    session = kwargs.get('session')
    if session:
        pq = py(session.get(url).content)
    else:
        pq = py(_get_response_content(url))
    book_items = pq('.subject-item')
    for book in book_items.items():
        book_title = book('.info h2').text().strip() + book('.info h2 span').text().strip()
        book_url = book('.info h2 a').attr('href')
        book_id = ''
        for txt in book_url.split('/'):
            if re.match(r'^\d+$', txt):
                book_id = txt
                break
        book_coverimage_url = book('.pic img').attr('src')
        book_intro = book('.pub').text()
        book_tags = book('.tags').text().split(':')[1].strip() if book('.tags').text().count(':') else ''
        book_comment = book('.comment').text()
        book_dict = {
            'id': book_id,
            'url': book_url,
            'title': book_title,
            'cover_img_url': book_coverimage_url,
            'intro': book_intro,
            'tags': book_tags,
            'comment': book_comment
        }
        book_list.append(book_dict)
    return book_list
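# Hedged usage sketch (not part of the original source): one way get_one_page_book_list
# might be called with a requests.Session. The URL below is a hypothetical Douban-style
# collection page; only the 'session' keyword handled by the function above is assumed.
import requests

if __name__ == '__main__':
    session = requests.Session()  # reuse one connection / cookie jar across pages
    example_url = 'https://book.douban.com/people/example_user/collect'  # hypothetical
    books = get_one_page_book_list(example_url, session=session)
    for b in books:
        print(b['id'], b['title'], b['url'])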
def parse(self, response):
    result = response.text
    self.log(response.url)
    links = py(result).find('.info a')
    for link in links:
        href = py(link).attr('href')
        if href != 'javascript:void(0)':
            yield response.follow(href, callback=self.parse_item)
def test_get_ip():
    row = py(with_space_html)
    ip1 = crawl_goubanjia.get_ip(row)
    without_space_row = py(without_space_html)
    ip2 = crawl_goubanjia.get_ip(without_space_row)
    assert ip1 == actual_ip
    assert ip2 == actual_ip
def jdzuLogin(username, password, verifycode, file_format):
    # Get the JS-encoded credential string
    encoded = getEncoded(username, password)
    # Build the login form data
    data = {
        "userAccount": username,
        "userPassword": "",
        "RANDOMCODE": verifycode,
        "encoded": encoded
    }
    # Submit the login request
    r = py(sess.post('http://61.131.228.75:8080/jsxsd/xk/LoginToXk',
                     data=data, headers=headers).text)
    # If the captcha is wrong, return the error message
    if r('#showMsg').text():
        return r('#showMsg').text()
    # Request the home page to get the user's name
    rep = py(sess.get('http://61.131.228.75:8080/jsxsd/framework/xsMain.jsp',
                      headers=headers).text)
    name = rep('#btn_gotoGrzx .glyphicon-class').text() + "的成绩单"
    # Request the transcript data
    response = py(sess.get('http://61.131.228.75:8080/jsxsd/kscj/cjcx_list',
                           headers=headers).text)
    # Parse the scores into a list of rows for the Excel export
    scores = list()
    for item in response('tr').items():
        score = list()
        if item('th').items():
            # table header
            for th in item('th').items():
                score.append(th.text())
        if item('td').items():
            # table body
            for td in item('td').items():
                score.append(td.text())
        scores.append(score)
    # Save in the format the user chose
    if file_format == "excel":
        # save as Excel
        save_excel(scores, name, username)
    if file_format == "pdf":
        # save as PDF
        save_excel_for_pdf(scores, name, username)
        convert_to_pdf(username)
    # Send the notification email
    send_email.mail(name)
def extract_attrs(self, ele):
    """ Extract attrs """
    attrs = {}
    # manipulation keywords
    TARGET_KEYWORDS = MTBuild.attrs()
    for a in TARGET_KEYWORDS.keys():
        if py(ele).attr(a):
            attrs[a] = py(ele).attr(a)
    # get tag name
    attrs['_TAG_'] = py(ele)[0].tag
    # future: deal with internal property like <a><a.href></a.href></a>
    return attrs
def get_url_mapping(ele):
    url_mapping = dict()
    for i in ele.find('a'):
        href = py(i).attr('href')
        url = py(i).text()
        if re.match(r'https?.+\.\..+|.*链接.*', url) is None \
                or href is None:
            continue
        if '0818tuan' in href and '?u=' in href:
            url_mapping[url] = unquote(href.split('?u=')[1])
        else:
            url_mapping[url] = unquote(href)
    return url_mapping
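# Hedged usage sketch (not from the original source): how get_url_mapping resolves a
# 0818tuan redirect wrapper. The HTML snippet and URLs below are made up for
# illustration; only pyquery parsing and the function above are assumed.
from pyquery import PyQuery as py

if __name__ == '__main__':
    snippet = py('<div><a href="https://www.0818tuan.com/link.html'
                 '?u=https%3A%2F%2Fitem.jd.com%2F123.html">点击链接</a></div>')
    mapping = get_url_mapping(snippet)
    # Expected: the anchor text mapped to the decoded target URL, e.g.
    # {'点击链接': 'https://item.jd.com/123.html'}
    print(mapping)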
def select(self, context):
    """ select sub-context and yield generator """
    if 'select' in self.attrs:
        # return py(context).find(self.attrs['select'])
        selector = self.attrs['select']
    else:
        # no selector is found
        return py(context)
    # a selector was found
    try:
        return py(context).find(selector)
    except Exception:
        return py([])  # return empty pyquery object
def _get_info(self, tds):
    lst_value = tds.filter(lambda i: i % 2 == 1).map(lambda i, e: py(e).text())
    lst_title = tds.filter(lambda i: i % 2 == 0).map(lambda i, e: py(e).text())
    map_title_value = zip(lst_title, lst_value)
    model = {}
    for k_title, v_value in map_title_value:
        k_title = k_title.replace(u':', u'')
        if k_title == u'':
            continue
        key = self.info_dic.get(k_title, None)
        if key is None:
            continue
        model[key] = v_value
    return model
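# Hedged sketch (not from the original source): the alternating <td> pattern that
# _get_info relies on, shown on a made-up two-column key/value table. Only pyquery's
# filter(index) / map(index, element) callbacks, as already used above, are assumed.
from pyquery import PyQuery as py

if __name__ == '__main__':
    doc = py('<table><tr>'
             '<td>注册资本:</td><td>100万元</td>'
             '<td>成立日期:</td><td>2015-01-01</td>'
             '</tr></table>')
    tds = doc('td')
    titles = tds.filter(lambda i: i % 2 == 0).map(lambda i, e: py(e).text())
    values = tds.filter(lambda i: i % 2 == 1).map(lambda i, e: py(e).text())
    # Pair them up: [('注册资本:', '100万元'), ('成立日期:', '2015-01-01')]
    print(list(zip(titles, values)))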
def parse(self, response):
    movie_list = response.xpath('/html').extract()
    for temp in movie_list:
        soup = BeautifulSoup(temp, 'html5lib')
        # Extract the links
        urls = []
        for link in soup.find_all('a'):
            urls.append(link.get('href'))
        # Extract the <ul> content
        datas1 = {}
        doc = py(temp)
        item = doc('div ul li')
        for li in item.items():
            datas1['img'] = li.find('img')
            datas1['url'] = li.find('a').attr('href')
            datas1['content'] = li.text()
        # Extract the <ol> content
        datas2 = {}
        doc = py(temp)
        item = doc('div ol li')
        for li in item.items():
            datas2['img'] = li.find('img')
            datas2['url'] = li.find('a').attr('href')
            datas2['content'] = li.text()
            print(li.text())
            print('——' * 80)
        # Extract the definition-list content
        datas3 = {}
        doc = py(temp)
        item = doc('dl')
        for t in item.items():
            datas3['dt'] = t.find('dt').text()
            dd = []
            for tt in t.find('dd').items():
                dd.append(tt.text())
            datas3['dd'] = dd
        # Extract the table content
        tables = []
        table_node = soup.find_all('td')
        for table in table_node:
            tables.append(table.get_text())
def __call__(self, doc, **environment):
    tag = self.attrs['_TAG_']
    # convert encoding
    if tag == "root" and self.has_attr('encoding'):
        encoding = self.attrs['encoding']
        doc = doc.decode(encoding)
    root_element = py(doc)
    results = []
    for tri in self.children:
        r = tri['exp_callsite'](root_element)
        results.append(r)
    # action:
    if tag != "root":
        ret = self.call_action(results, action=tag, attrs=self.attrs, **environment)
        return ret
    else:
        # actor as:
        if self.has_attr('as'):
            as_val = self.attrs['as']
            ret = self.call_as(as_val, results)
            return ret
        else:
            return results
def get_acticals(actical_url):
    url = base_actical_url + actical_url
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        html = response.text
        # actical_urls = BeautifulSoup(html, 'lxml')
        # Characters not allowed in file names: / \ : * ? " < > |
        pattern1 = re.compile(r'[|"\/\\:<>?*]+', re.S)
        doc = py(html)
        pattern2 = re.compile(r'[\u4e00-\u9fa5]+', re.S)
        title = doc('div').filter('#title').text()
        title = re.sub(pattern1, '', title)
        content = doc('p').text()
        # Some posts put the content inside <br> blocks instead of <p>,
        # so fall back to #content when the <p> text is empty/whitespace
        pattern3 = re.compile(r'[\s]*', re.S)
        if re.fullmatch(pattern3, content):
            content = doc('div').filter('#content').text()
            content = re.sub(pattern2, '', content)
        # title = actical_url.div.title.string
        # print(actical_urls)
        yield {'title': title, 'content': content}
    return None
def get_annual_out_guarantee_info(self, page_list):
    lst = []
    for page in page_list:
        py_all = py(page, parser='html')
        trs = py_all.find('table').find('tr').not_('.partner_com_top').items()
        for tr in trs:
            tds = tr.find('td')
            if len(tds) < 2:
                continue
            performance = tds.eq(5).text().strip()
            performance_period = self.trans_for(performance)
            share_model = {
                AnnualReports.OutGuaranteeInfo.CREDITOR:
                    tds.eq(1).text().strip(),
                AnnualReports.OutGuaranteeInfo.OBLIGOR:
                    tds.eq(2).text().strip(),
                AnnualReports.OutGuaranteeInfo.DEBT_TYPE:
                    tds.eq(3).text().strip(),
                AnnualReports.OutGuaranteeInfo.DEBT_AMOUNT:
                    util.get_amount_with_unit(tds.eq(4).text().strip()),
                AnnualReports.OutGuaranteeInfo.PERFORMANCE_PERIOD:
                    performance_period,
                AnnualReports.OutGuaranteeInfo.GUARANTEE_PERIOD:
                    tds.eq(6).text().strip(),  # guarantee period
                AnnualReports.OutGuaranteeInfo.GUARANTEE_TYPE:
                    tds.eq(7).text().strip(),  # guarantee type
            }
            lst.append(share_model)
    return lst
def get_book(i):
    book = []
    url = "https://www.d4j.cn/download.php?id=" + str(i)
    head = user_agent.getheaders()  # pick a random user-agent
    response = requests.get(url, headers=head)
    # Build the pyquery object
    html1 = response.text
    html = py(html1)
    title = html("body > div.wrap > div.content > h2").text()
    # Baidu netdisk share link
    link = html("body > div.wrap > div.content > div:nth-child(4) > div.panel-body > span:nth-child(4) > a").attr("href")
    # Extraction code
    code = html("body > div.wrap > div.content > div.plus_box > div.plus_l > ul > li:nth-child(4) > font").text()
    if link:
        print("Page " + str(i))
        print(title)
        print(link)
        print(code)
        # Collect the fields for the database insert
        book.append(title)
        book.append(link)
        book.append(code)
        return book
    else:
        print("Page " + str(i))
def get_annual_share_hold_info(page_list):
    lst = []
    for page in page_list:
        py_all = py(page, parser='html')
        trs = py_all.find('table').find('tr').not_('.partner_com_top').items()
        for tr in trs:
            tds = tr.find('td')
            if len(tds) < 2:
                continue
            share_model = {
                AnnualReports.ShareholderInformation.SHAREHOLDER_NAME:
                    tds.eq(1).text().strip(),
                AnnualReports.ShareholderInformation.SUBSCRIPTION_AMOUNT:
                    util.get_amount_with_unit(tds.eq(2).text().strip()),
                AnnualReports.ShareholderInformation.SUBSCRIPTION_TIME:
                    tds.eq(3).text().strip(),  # subscription time
                AnnualReports.ShareholderInformation.SUBSCRIPTION_TYPE:
                    tds.eq(4).text().strip(),  # subscription type
                AnnualReports.ShareholderInformation.PAIED_AMOUNT:
                    util.get_amount_with_unit(tds.eq(5).text().strip()),  # paid-in amount
                AnnualReports.ShareholderInformation.PAIED_TIME:
                    tds.eq(6).text().strip(),  # paid-in time
                AnnualReports.ShareholderInformation.PAIED_TYPE:
                    tds.eq(7).text().strip(),  # paid-in type
            }
            lst.append(share_model)
    return lst
def main_handler(event, context):
    global result
    tmp_path = ZK_TMP_FILE % datetime.now().strftime('%Y%m%d')
    try:
        logging.info('Temp file: ' + tmp_path)
        if not os.path.exists(tmp_path):
            logging.info('Temp file does not exist')
        else:
            with open(tmp_path, 'r', encoding=UTF8_ENCODING) as f:
                result = json.load(f)
                logging.info('Currently stored entries: ' + str(len(result.keys())))
        # Hot posts on the front page
        d = py(ZK_BASE_URL, headers=REQUEST_HEADERS, encoding=GBK_ENCODING)
        # Handle each post
        d('#threadlisttableid tbody').each(deal_post)
        return "Success"
    except Exception as ex:
        logging.error('Main task failed: ' + str(ex))
        raise ex
    finally:
        # Create the storage directory
        if not os.path.exists(BASE_DIR):
            os.makedirs(BASE_DIR)
        # Persist the result
        with open(tmp_path, 'w', encoding=UTF8_ENCODING) as f:
            logging.info('Saving results')
            json.dump(result, f)
def get_key_person_info(self, key_person_info):
    key_person_info_dict = {}
    page = self.get_crawl_page(key_person_info)
    if page is None or page == u'':
        return key_person_info_dict
    items = py(page, parser='html').find('.info_name').find('li').items()
    lst_key_person = []
    for item in items:
        item_content = item.text()
        part = item_content.split(' ', 1)
        if len(part) >= 2:
            name = part[0].strip()
            position = part[1].strip()
        elif len(part) == 1:
            name = part[0].strip()
            position = u''
        else:
            continue
        key_person = {
            GsModel.KeyPerson.KEY_PERSON_NAME: name,
            GsModel.KeyPerson.KEY_PERSON_POSITION: position
        }
        lst_key_person.append(key_person)
    key_person_info_dict[GsModel.KEY_PERSON] = lst_key_person
    return key_person_info_dict
def get_change_info(self, page):
    change_info_dict = {}
    lst_change_records = []
    if isinstance(page, dict) or page is None:
        return {}
    trs = py(page, parser='html').find('#table_bgxx').find('tr').items()
    for tr in trs:
        tds = tr.find('td')
        if tds is None or len(tds) < 2:
            continue
        change_model = {
            GsModel.ChangeRecords.CHANGE_ITEM: tds.eq(1).text(),
            # strip redundant wording
            GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(tds.eq(2).text()),
            GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(tds.eq(3).text()),
            # date, as shown on the page
            GsModel.ChangeRecords.CHANGE_DATE: tds.eq(4).text()
        }
        lst_change_records.append(change_model)
    change_info_dict[GsModel.CHANGERECORDS] = lst_change_records if len(lst_change_records) != 0 else None
    return change_info_dict
def test():
    url = "https://www.d4j.cn/download.php?id=17144"
    response = requests.get(url, headers=head, proxies=proxies)
    # Build the pyquery object
    html1 = response.text
    html = py(html1)
    print(html)
def _get_login_captcha_info(content):
    pq = py(content)
    captcha_id = pq('input[name="captcha-id"]').attr('value')
    captcha_img_url = pq('#captcha_image').attr('src')
    Image.open(BytesIO(requests.get(captcha_img_url).content)).show()
    captcha_solution = input('Please enter the captcha:\n')
    return captcha_id, captcha_solution
def get_change_info(self, change_info):
    change_info_dict = {}
    lst_change_records = []
    pages = self.get_crawl_page(change_info, True)
    if pages is None:
        return {}
    for page in pages:
        trs = py(page.get(u'text', u''), parser='html').find(
            '.partner_com').find('tr').not_('.partner_com_top').items()
        for tr in trs:
            tds = tr.find('td')
            change_model = {
                GsModel.ChangeRecords.CHANGE_ITEM: tds.eq(1).text(),
                # strip redundant wording
                GsModel.ChangeRecords.BEFORE_CONTENT: util.format_content(tds.eq(2).text()),
                GsModel.ChangeRecords.AFTER_CONTENT: util.format_content(tds.eq(3).text()),
                # date, as shown on the page
                GsModel.ChangeRecords.CHANGE_DATE: tds.eq(4).text()
            }
            lst_change_records.append(change_model)
    change_info_dict[GsModel.CHANGERECORDS] = lst_change_records
    return change_info_dict
def get_con_detail(page):
    shareholder_name = ""
    sub_model = {}
    if page is None or page == u'':
        return shareholder_name, sub_model
    tables = py(page, parser='html').find('.partner_com').items()
    for table in tables:
        heading = table.find('.info_table_h3').text()
        if u'发起人' in heading or u'股东' in heading:
            # Shareholder information
            tds = table.find('td')
            shareholder_name = tds.eq(1).text().strip()
            sub_model[GsModel.ContributorInformation.SHAREHOLDER_NAME] = tds.eq(1).text()
            sub_model[GsModel.ContributorInformation.SUBSCRIPTION_AMOUNT] = \
                util.get_amount_with_unit(tds.eq(3).text())
            sub_model[GsModel.ContributorInformation.PAIED_AMOUNT] = \
                util.get_amount_with_unit(tds.eq(5).text())
        if u'认缴' in heading:
            # Subscription details
            trs = table.find('tr')
            lst_sub_detail = []
            for tr_i in range(1, len(trs)):
                tds = trs.eq(tr_i).find('td')
                sub_model_detail = {
                    GsModel.ContributorInformation.SubscriptionDetail.SUBSCRIPTION_TYPE:
                        tds.eq(0).text(),
                    GsModel.ContributorInformation.SubscriptionDetail.SUBSCRIPTION_AMOUNT:
                        util.get_amount_with_unit(tds.eq(1).text()),
                    GsModel.ContributorInformation.SubscriptionDetail.SUBSCRIPTION_TIME:
                        tds.eq(2).text()
                }
                sub_model_detail = replace_none(sub_model_detail)
                lst_sub_detail.append(sub_model_detail)
            sub_model[GsModel.ContributorInformation.SUBSCRIPTION_DETAIL] = lst_sub_detail
        if u'实缴' in heading:
            # Paid-in details
            trs = table.find('tr')
            lst_paid_detail = []
            for tr_i in range(1, len(trs)):
                tds = trs.eq(tr_i).find('td')
                paid_model_detail = {
                    GsModel.ContributorInformation.PaiedDetail.PAIED_TYPE:
                        tds.eq(0).text(),
                    GsModel.ContributorInformation.PaiedDetail.PAIED_AMOUNT:
                        util.get_amount_with_unit(tds.eq(1).text()),
                    GsModel.ContributorInformation.PaiedDetail.PAIED_TIME:
                        tds.eq(2).text()
                }
                paid_model_detail = replace_none(paid_model_detail)  # patch 2
                lst_paid_detail.append(paid_model_detail)
            sub_model[GsModel.ContributorInformation.PAIED_DETAIL] = lst_paid_detail
    sub_model = replace_none(sub_model)
    return shareholder_name, sub_model
def auto_get_url(msg):
    response = requests.get(msg.url)
    document = py(response.text)
    content = document('#js_content').text()
    text1 = stats_word.stats_text_cn(content, 100)
    text2 = str(text1)
    bot.file_helper.send(text1)
    return text2
def getSouSuoInfo():
    urls = 'http://www.baidu.com/s?wd=联通'
    doc = py(getInfo(urls))
    mlist = doc('#content_left h3.t a').items()
    i = 0
    for li in mlist:
        i = i + 1
        print('Title: ' + li.text() + ' Link: ' + li.attr('href'))
def get_annual_base_info(page):
    py_all = py(page, parser='html')
    tds = py_all.find('.info_table').find('td').items()
    annual_base_info = {}
    for td in tds:
        part = td.text().split(u':', 1)
        k = AnnualReports.format_base_model(part[0])
        annual_base_info[k] = part[1]
    return annual_base_info
def get_parse_page(self):
    """
    Fetch and parse the current page.
    :return: a PyQuery document built from the browser's page source
    """
    html = self.browser.page_source
    doc = py(html)
    return doc
def get_my_movie_page_list(self):
    p = py(self.movie_collect)
    page_dict = p('.paginator > a')
    for page in page_dict.items():
        url = page.attr('href')
        if url:
            self.movie_page_list.append(_douban_movie_host + url)
        else:
            continue
def get_keywords(self):
    # keywords_search = re.compile(r'https://www.lagou.com/zhaopin.*<h3>(.*?)</h3></a>', re.S)
    keywords_url = "https://www.lagou.com/"
    keywords_res = self.handle_request(method="GET", url=keywords_url)
    doc = py(keywords_res)
    res = doc("#sidebar > div > div:nth-child(1) a h3")
    # self.keywords = set(keywords_search.findall(keywords_res))
    self.keywords = set(res.text().split(" "))
    self.lagou_session.cookies.clear()
def get_score(html):
    score = []
    doc = py(html)
    sco = doc('td')
    for i in sco.items():
        score.append(str(i.text()).split())
    # for i in score:
    #     log(i[0])
    return score
def build_script(self, ele, attrs):
    # print("script:", ele, attrs)
    ch = py(ele).html()
    x = dict()
    x['exp_meta'] = 'script'
    x['exp_attrs'] = attrs
    x['exp_node'] = ele
    x['exp_children'] = ch
    x['exp_kind'] = 'code'
    x['exp_callsite'] = ScriptCallSite(x)
    return x
def build_array(self, ele, attrs):
    # print("array:", ele, attrs)
    ch = []
    for child in py(ele).children():
        ch.append(self.build_element(child))
    x = dict()
    x['exp_meta'] = 'array'
    x['exp_attrs'] = attrs
    x['exp_node'] = ele
    x['exp_children'] = ch
    x['exp_kind'] = 'data'
    x['exp_callsite'] = ArrayCallSite(x)
    return x
def eval(self, eval_value, context):
    """ Evaluate an expression against the current pyquery context. """
    key_ctx = Evaluater.CONTEXT
    g = MTContext.globals()
    l = MTContext.locals(**{key_ctx: py(context)})
    # cache the evaluator
    cache = self.evaluator_cache
    if eval_value not in cache:
        evaluator = Evaluater(eval_value, key_ctx, g, l)
        cache[eval_value] = evaluator
    evaluator = cache[eval_value]
    ret = evaluator()
    return ret
def build_map(self, ele, attrs):
    # print("map:", ele, attrs)
    ch = []
    for child in py(ele).children():
        # print(child)
        ch.append(self.build_element(child))
    x = dict()
    x['exp_meta'] = 'map'
    x['exp_attrs'] = attrs
    x['exp_node'] = ele
    x['exp_children'] = ch
    x['exp_kind'] = 'data'
    x['exp_callsite'] = MapCallSite(x)
    return x
def get_list(_url, mark):
    import urllib.request
    from pyquery import PyQuery as py

    req = urllib.request.urlopen(_url)
    html = req.read()
    html = html.decode("gbk")
    d = py(html)
    items = d("tr")
    mylist = []
    for item in items:
        text = d(item).text()
        try:
            href = "http://news.hitsz.edu.cn/site/news/" + d(d(item).find("a")).attr("href")
            ob = {
                "text": text,
                "href": href
            }
            mylist.append(ob)
        except Exception:
            print("error:" + mark)
    return mylist
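# Hedged usage sketch (not part of the original source): calling get_list with a
# hypothetical listing URL. The real listing path on news.hitsz.edu.cn is not shown
# in the source, so the URL below is only a placeholder.
if __name__ == '__main__':
    rows = get_list('http://news.hitsz.edu.cn/site/news/index.html', 'news-list')
    for row in rows[:5]:
        print(row['text'], '->', row['href'])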
def build(self, env):
    doc = py(env.template)
    # print("[Template]")
    # print(doc)
    # print("[/Template]")
    self.fromdoc(doc)
def then_there_should_be_the_same_number_of_bars(step):
    jq = py(world.browser.contents)
    assert len(jq("h1")) == 2
def get(self, get_value, context):
    """ get text() or html() """
    if get_value == "text":
        return py(context).text()
    elif get_value == "html":
        return py(context).html()