import re
import json
from urllib.parse import quote

# Project-local helpers assumed by this module (not defined in this file):
#   rq     - HTTP helper whose get() returns the response body as text
#   jquery - a PyQuery-style HTML parser/selector
#   hp     - misc helpers (sleep, print_partition, remove_emoji, ...)


def douyu_rank(rankName, statType):
    '''
    Scrape Douyu streamer ranking data.
    [Data source](https://www.douyu.com/directory/rank_list/game)
    * `rankName` anchor (superstar streamer ranking), fans (streamer fan ranking),
      haoyou (big-spender power ranking), user (streamer patron ranking)
    * `statType` day, week, month
    '''
    if not isinstance(rankName, ERankName):
        raise TypeError("rankName must be an ERankName enum member")
    if not isinstance(statType, EStatType):
        raise TypeError("statType must be an EStatType enum member")
    rankName = '%sListData' % rankName.name
    statType = '%sListData' % statType.name
    # Fetch the page html
    rs = rq.get("https://www.douyu.com/directory/rank_list/game",
                headers={'User-Agent': 'Mozilla/5.0'})
    # Extract the embedded rankListData JS variable with a regex
    mt = re.search(r'rankListData\s+?=(.*?);', rs, re.S)
    if not mt:
        print("failed to parse rankListData")
        return
    grps = mt.groups()
    # Decode the JSON payload
    rankListDataStr = grps[0]
    rankListData = json.loads(rankListDataStr)
    dayList = rankListData[rankName][statType]
    # Sort ascending by id
    dayList.sort(key=lambda k: k.get('id', 0), reverse=False)
    return dayList
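# douyu_rank expects ERankName / EStatType enums that are not defined in this
# file; a minimal sketch of what they likely look like, with member names taken
# from the docstring (hedged assumption, not the confirmed originals):
from enum import Enum


class ERankName(Enum):
    anchor = 0  # superstar streamer ranking
    fans = 1    # streamer fan ranking
    haoyou = 2  # big-spender power ranking
    user = 3    # streamer patron ranking


class EStatType(Enum):
    day = 0
    week = 1
    month = 2


# Hedged usage sketch for douyu_rank(); hits the live douyu page, so it is not
# run at import time.
def _demo_douyu_rank():
    top = douyu_rank(ERankName.anchor, EStatType.day)
    print(top[0] if top else 'no data')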
def suo(url):
    '''
    Shorten a url via the suo.im API.
    '''
    # quote() the target url (imported from urllib.parse above) so its own
    # query string survives as a single parameter; passing url.encode() to
    # format() would embed a b'...' literal under Python 3
    api_url = 'http://suo.im/api.php?format=json&url={}'.format(
        quote(url, safe=''))
    json_str = rq.get(api_url)
    if not json_str:
        return
    json_data = json.loads(json_str)
    return json_data['url']
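# Hedged usage sketch for suo(); the sample url is illustrative and the call
# hits the live suo.im endpoint, so it is not run at import time.
def _demo_suo():
    short = suo('https://www.douyu.com/directory/rank_list/game')
    print(short)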
def getArticleContent(url):
    '''
    Fetch the body html of a blog article.
    '''
    if not url:
        print('invalid url')
        return
    htmlStr = rq.get(url)
    jq_dom = jquery(htmlStr)
    # The article body lives in the .show-content node
    jq_content = jq_dom.find('.show-content')
    content_html = jq_content.html()
    # print(content_html)
    return content_html
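# Hedged usage sketch for getArticleContent(); the article url is a
# placeholder, not a real post.
def _demo_getArticleContent():
    html = getArticleContent('https://www.jianshu.com/p/xxxxxxxx')
    print(html[:200] if html else 'no content')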
def GetBrandData(self):
    html = rq.get(self._url)
    doc = jquery(html)
    brandJqs = doc.find('.super-mod')
    allNum = brandJqs.length  # total number of brand nodes
    print('total brand entries parsed: %s' % allNum)
    scNum = 0  # number successfully parsed
    for brandItem in brandJqs:
        brandJq = jquery(brandItem)
        if len(brandJq.find('.mod-intro')) == 0:
            print('no mod-intro tag found, skipping')
            continue
        scNum += 1
        print(brandJq.find('.mod-intro').html())
    print('brands fetched: %s, failed: %s' % (scNum, allNum - scNum))
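# GetBrandData above is written as a method (it reads self._url); a minimal
# sketch of the class it likely belongs to (class name and constructor are
# hedged assumptions):
class BrandScraper(object):
    def __init__(self, url):
        self._url = url

    # bind the module-level function above as a method of this class
    GetBrandData = GetBrandData


# Usage sketch (url is illustrative):
#   BrandScraper('https://example.com/brands').GetBrandData()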
def douyu_room(room_id):
    '''
    Parse a streamer's room info.
    [Data source](https://www.douyu.com/xxx)
    * `room_id` the streamer's room number
    '''
    rs = rq.get("https://www.douyu.com/%s" % room_id,
                headers={'User-Agent': 'Mozilla/5.0'})
    # Extract the embedded $ROOM JS object
    mt = re.search(r'\$ROOM\s+?=\s+?({.*?});', rs, re.S)
    if not mt:
        print("failed to parse ROOM data")
        return
    grps = mt.groups()
    roomDataStr = grps[0]
    roomData = json.loads(roomDataStr)
    return roomData
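# Hedged usage sketch for douyu_room(); the room id and the keys printed are
# illustrative assumptions about douyu's $ROOM payload, not verified fields.
def _demo_douyu_room():
    room = douyu_room('9999')
    if room:
        print(room.get('room_id'), room.get('room_name'))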
def getComment(questionNum):
    url = 'http://www.bxd365.com/qa/%s.html' % questionNum
    print("parsing: %s" % url)
    try:
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    replys = doc.find(".reply li")
    if replys is None:
        print('no comment data')
        return
    if len(replys) <= 0:
        print('comment count is 0')
        return
    for item in replys:
        parseComment(questionNum, item)
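# Hedged usage sketch for getComment(); parseComment() is called above but not
# defined in this file, and the question number below is illustrative.
def _demo_getComment():
    getComment('123456')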
def analysis_job_data(job_url):
    '''
    Parse the pagenum data from a job detail page.
    '''
    if job_url is None:
        print('job detail url is empty, skipping')
        return
    print('job_url=%s' % job_url)
    try:
        # html = rq.get_cookie(job_url, cookie_file_name=get_cookie_name())
        html = rq.get(job_url)
    except Exception as e:
        print(e)
        return
    # Parse the job id
    pagenum = job_pagenum(html)
    # Parse the job contact person
    contactPerson = job_contactPerson(html)
    return {'pagenum': pagenum, 'contactPerson': contactPerson}
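# Hedged usage sketch for analysis_job_data(); job_pagenum() and
# job_contactPerson() are used above but not defined in this file, and the
# detail-page url below is a placeholder.
def _demo_analysis_job_data():
    data = analysis_job_data('http://sh.58.com/zpbaoxian/xxxxxxxx.shtml')
    print(data)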
def get_agencys(city, page_index=1):
    '''
    Fetch insurance-agent jobs from 58.com full-time search.
    * 'city' city dict with 'province', 'city' and 'id' keys
    * 'page_index' current page number
    '''
    city_type = city['id']
    hp.print_partition('parsing city: %s-%s-%s, insurance agent jobs' %
                       (city['province'], city['city'], city['id']))
    # Build the api url
    url = __agencys_url.format(city_type, page_index)
    print('job list url: %s' % url)
    try:
        # html = rq.get_cookie(
        #     url,
        #     headers={
        #         "User-agent":
        #         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        #         "Referer":
        #         url
        #     })
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    if doc is None:
        print("failed to parse html")
        return
    # Total page count
    page_nums_str = doc.find(".num_operate .total_page").html()
    if page_nums_str is None:
        page_nums_str = '0'
    page_nums = int(page_nums_str)
    print('total pages: %s, current page: %s' % (page_nums, page_index))
    # Job list
    list_jobs = doc.find("#list_con .job_item")
    if list_jobs is None:
        print("no job list found")
        return
    print("job count: %s" % len(list_jobs))
    # Walk the jobs on this page
    today_nums = 0
    for job_item in list_jobs:
        job_item_jq = jquery(job_item)
        job_sign = job_item_jq.find(".sign").html()
        if not check_job_istoday(job_sign):
            print('status: %s, not posted today, skipping' % job_sign)
            continue
        today_nums = today_nums + 1
        job_name = job_item_jq.find(".name").html()
        if job_name.find(u'保险') < 0:
            print('job: %s, not insurance related, skipping' % job_name)
            continue
        job_address = job_item_jq.find(".address").html()
        job_url = job_item_jq.find("a").attr("href")
        job_company = job_item_jq.find(".job_comp .comp_name .fl").attr(
            "title")
        job_company = analysis_job_company(job_company)
        print('%s|%s|%s' % (job_address, job_name, job_sign))
        # Throttle before fetching the detail page
        hp.sleep(0.3, 0.6, content='fetching job detail =>')
        # Parse the job_data payload from the detail page
        job_data = analysis_job_data(job_url)
        if job_data is None or job_data['pagenum'] is None:
            print('failed to get pagenum, skipping!')
            continue
        __jobs.append({
            "name": job_name,
            "address": job_address,
            "url": job_url,
            "pagenum": job_data['pagenum'],
            "contactPerson": job_data['contactPerson'],
            "sign": job_sign,
            "company": job_company
        })
    print('current page: %s, total pages: %s' % (page_index, page_nums))
    # Decide whether to keep paging
    if today_nums <= 0:
        print('current page: %s, no jobs posted today, stop paging' %
              page_index)
        return
    page_index = page_index + 1
    # Recurse to the next page
    if page_index <= page_nums:
        print(' ')
        # Throttle between pages
        hp.sleep(0, 1, content='next page =>')
        get_agencys(city, page_index)
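# Hedged usage sketch for get_agencys(); the module-level __agencys_url
# template, __jobs list, check_job_istoday() and analysis_job_company() are
# assumed to exist elsewhere in this module, and the city dict values are
# illustrative.
def _demo_get_agencys():
    get_agencys({'province': u'上海', 'city': u'上海', 'id': 'sh'})
    print('collected jobs: %s' % len(__jobs))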
def specialArticles(key, source, page=1):
    '''
    Fetch the article list of a collection.
    * 'key' collection key
    * 'source' source label stored with each article
    * 'page' page number of the article list (currently unused)
    '''
    url = __special_newlike_url.format(key)
    htmlStr = rq.get(url)
    if not htmlStr:
        print('failed to fetch html')
        return
    jq_dom = jquery(htmlStr)
    if not jq_dom:
        print('failed to parse page dom')
        return
    dom_contents = jq_dom.find('.content')
    if not dom_contents:
        print('failed to parse article content nodes')
        return
    articles = []
    for item in dom_contents:
        jq_content_item = jquery(item)
        dom_title = jq_content_item.find('.title')
        dom_time = jq_content_item.find('.time')
        dom_read = jq_content_item.find('.ic-list-read')
        dom_comments = jq_content_item.find('.ic-list-comments')
        dom_like = jq_content_item.find('.ic-list-like')
        if not dom_title:
            print('failed to parse title')
            continue
        if not dom_time:
            print('failed to parse time')
            continue
        # Parse the article stats
        article_read = int(dom_read.parent().text())
        article_comments = int(dom_comments.parent().text())
        article_like = int(dom_like.parent().text())
        article_title = dom_title.html()
        article_href = dom_title.attr('href')
        article_time = dom_time.attr('data-shared-at').replace(
            '-', ' ').replace('+08:00', '').replace('T', ' ')
        # article_time = time.strptime(article_time, '%Y %m %d %H:%M:%S')
        article_url = '{host}{href}'.format(
            host=__jianshu_host, href=article_href)
        print('got article:', hp.remove_emoji(article_title), article_url,
              article_time)
        if article_read < 100:
            print('article reads < 100, skipping')
            continue
        if article_like < 1:
            print('article likes < 1, skipping')
            continue
        if article_comments < 1:
            print('article comments < 1, skipping')
            continue
        # Fetch the article body
        content_html = getArticleContent(article_url)
        if not content_html:
            print('failed to fetch article body')
            continue
        # Clean up the body html
        content_html = content_html.replace('data-original-', '')
        content_markdown = getCotentMarkDown(content_html)
        # markdown string cleanup
        # content_markdown = content_markdown.replace("|", "-")
        articles.append({
            'title': article_title,
            'url': article_url,
            'time': article_time,
            'source': source,
            'content': content_markdown
        })
    return articles
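# Hedged usage sketch for specialArticles(); __special_newlike_url,
# __jianshu_host and getCotentMarkDown() are assumed to be defined elsewhere
# in this module, and the collection key below is a placeholder.
def _demo_specialArticles():
    articles = specialArticles('xxxxxx', source='jianshu')
    for a in articles or []:
        print(a['title'], a['url'])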