def crawler_github():
    """
    Fetch the GitHub trending list.

    Requests https://github.com/trending through ``get_text`` and reads every
    ``<article class="Box-row">`` element of the returned page.

    :return: dict with ``hot_name`` ('GitHub') and ``content``, a list of
        ``{'title': '<heading text>---<description>', 'href': <absolute url>}``
        entries. ``content`` is empty when ``get_text`` returns a falsy value.
    """
    url = 'https://github.com/trending'
    headers = {
        'Host': 'github.com',
        'Referer': 'https://github.com/explore'
    }
    response_html = get_text(url, options=headers)
    content_list = []
    if not response_html:
        return {'hot_name': 'GitHub', 'content': content_list}

    tree = etree.HTML(response_html.text)
    for article in tree.xpath("//article[@class='Box-row']"):
        links = article.xpath('./h1/a/@href')
        if not links:
            # No heading link in this article: skip it instead of letting an
            # IndexError abort the whole crawl.
            continue
        title = article.xpath('string(./h1/a)').strip()
        describe = article.xpath('string(./p)').strip()
        # The href is presumably site-relative ('/owner/repo'); drop its
        # leading slash so the result is not 'https://github.com//owner/repo'.
        href = 'https://github.com/' + links[0].lstrip('/')
        content_list.append({'title': '%s---%s' % (title, describe), 'href': href})
    return {'hot_name': 'GitHub', 'content': content_list}
def crawler_wang_yi():
    """
    Scrape the NetEase Cloud Music chart page (toplist id 19723756).

    :return: dict with ``hot_name`` and ``content``, a list of
        ``{'title': ..., 'href': ...}`` entries built from the ``<li>`` items
        of the hidden ``song-list-pre-cache`` list. ``content`` is empty when
        ``get_text`` returns a falsy value.
    """
    request_headers = {
        'authority': 'music.163.com',
        'referer': 'https://music.163.com/',
    }
    page = get_text(
        'https://music.163.com/discover/toplist?id=19723756',
        options=request_headers,
    )
    if not page:
        return {'hot_name': '云音乐飙升榜', 'content': []}

    dom = etree.HTML(page.text)
    songs = [
        {
            'title': item.xpath('./a/text()')[0],
            # The scraped href is appended after the '#' fragment marker.
            'href': 'https://music.163.com/#%s' % item.xpath('./a/@href')[0],
        }
        for item in dom.xpath('//div[@id="song-list-pre-cache"]/ul[@class="f-hide"]/li')
    ]
    return {'hot_name': '云音乐飙升榜', 'content': songs}
def crawler_tian_ya():
    """
    Scrape the Tianya forum hot-posts page.

    :return: dict with ``hot_name`` and ``content``, a list of
        ``{'title': ..., 'href': ...}`` entries, one per table row.
        ``content`` is empty when ``get_text`` returns a falsy value.
    """
    page = get_text(
        'http://bbs.tianya.cn/hotArticle.jsp',
        options={'Host': 'bbs.tianya.cn'},
    )
    posts = []
    if page:
        dom = etree.HTML(page.text)
        # The first matched <tbody> is skipped; only the later ones are read.
        sections = dom.xpath("//div[@class='mt5']/table/tbody")[1:]
        rows = (row for section in sections for row in section.xpath('./tr'))
        for row in rows:
            posts.append({
                'title': row.xpath("./td[@class='td-title']/a/text()")[0],
                # The scraped href is appended directly to the site root.
                'href': 'http://bbs.tianya.cn' + row.xpath("./td[@class='td-title']/a/@href")[0],
            })
    return {'hot_name': '天涯', 'content': posts}
def crawler_zhi_hu():
    """
    Fetch the Zhihu hot list from its JSON endpoint.

    :return: dict with ``hot_name`` and ``content``, a list of
        ``{'title': ..., 'href': ...}`` entries, one per item of the payload's
        ``data`` list. A missing or null ``target``, ``title_area`` or ``link``
        yields an empty string for the affected field instead of raising.
        ``content`` is empty when ``get_text`` returns a falsy value or the
        payload has no ``data``.
    """
    url = 'https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true'
    headers = {
        'path': '/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true',
        'x-api-version': '3.0.76',
        'x-requested-with': 'fetch',
    }
    content_list = []
    response_html = get_text(url, options=headers)
    if not response_html:
        return {'hot_name': '知乎热榜', 'content': content_list}

    data_list = response_html.json().get('data') or []
    for data in data_list:
        # Each level may be absent or null; fall back to an empty dict so the
        # chained lookups cannot raise AttributeError on None.
        target = data.get('target') or {}
        title = (target.get('title_area') or {}).get('text', '')
        href = (target.get('link') or {}).get('url', '')
        content_list.append({'title': title, 'href': href})
    return {'hot_name': '知乎热榜', 'content': content_list}
def __init__(self, filepath):
    """
    Store the result of ``hlp.get_text(filepath)`` and start with empty state.

    :param filepath: passed unchanged to ``hlp.get_text``.
    """
    # Read first: if this raises, no attribute has been set on the instance.
    self.filetext = hlp.get_text(filepath)
    # Containers start empty and the module name is unset.
    self.inputs, self.outputs = [], []
    self.submodules = {}
    self.modulename = None