Example #1
0
def crawler_github():
    """
    Fetch the GitHub trending list.

    Requests https://github.com/trending through ``get_text`` and walks every
    ``<article class="Box-row">`` element, reading the repository link from
    ``./h1/a`` and the description from ``./p``.

    :return: dict with ``hot_name`` set to ``'GitHub'`` and ``content`` set to
        a list of ``{'title': '<name>---<description>', 'href': <url>}``
        dicts. ``content`` is empty when ``get_text`` returns a falsy value.
    """
    url = 'https://github.com/trending'
    headers = {
        'Host': 'github.com',
        'Referer': 'https://github.com/explore'
    }
    response_html = get_text(url, options=headers)
    content_list = []
    if response_html:
        tree = etree.HTML(response_html.text)
        article_list = tree.xpath("//article[@class='Box-row']")
        for article in article_list:
            hrefs = article.xpath('./h1/a/@href')
            if not hrefs:
                # A row without a repository link cannot produce a usable
                # entry; skip it instead of letting an IndexError abort the
                # whole crawl.
                continue
            title = article.xpath('string(./h1/a)').strip()
            # Drop any leading '/' so a root-relative href ('/owner/repo')
            # does not turn into 'https://github.com//owner/repo'.
            href = 'https://github.com/%s' % hrefs[0].lstrip('/')
            describe = article.xpath('string(./p)').strip()
            content_list.append({'title': '%s---%s' % (title, describe), 'href': href})
    return {'hot_name': 'GitHub', 'content': content_list}
Example #2
0
def crawler_wang_yi():
    """
    Crawl the NetEase Cloud Music chart page (toplist id 19723756).

    The page is fetched through ``get_text``; each ``<li>`` under
    ``div#song-list-pre-cache > ul.f-hide`` contributes one entry built from
    the text and the ``href`` of its ``<a>`` child.

    :return: dict with ``hot_name`` and a ``content`` list of
        ``{'title': ..., 'href': ...}`` dicts. ``content`` is empty when
        ``get_text`` returns a falsy value.
    """
    page = get_text(
        'https://music.163.com/discover/toplist?id=19723756',
        options={
            'authority': 'music.163.com',
            'referer': 'https://music.163.com/',
        },
    )
    songs = []
    if page:
        rows = etree.HTML(page.text).xpath(
            '//div[@id="song-list-pre-cache"]/ul[@class="f-hide"]/li'
        )
        # The song href is appended after '#', matching the site's
        # fragment-style links.
        songs = [
            {
                'title': row.xpath('./a/text()')[0],
                'href': 'https://music.163.com/#%s' % row.xpath('./a/@href')[0],
            }
            for row in rows
        ]
    return {'hot_name': '云音乐飙升榜', 'content': songs}
Example #3
0
def crawler_tian_ya():
    """
    Fetch the Tianya forum hot-post list.

    Loads http://bbs.tianya.cn/hotArticle.jsp through ``get_text`` and reads
    the title link of every row in the tables under ``div.mt5``, ignoring the
    first ``<tbody>`` that the XPath query matches.

    :return: dict with ``hot_name`` and a ``content`` list of
        ``{'title': ..., 'href': ...}`` dicts. ``content`` is empty when
        ``get_text`` returns a falsy value.
    """
    result = {'hot_name': '天涯', 'content': []}
    page = get_text(
        'http://bbs.tianya.cn/hotArticle.jsp',
        options={'Host': 'bbs.tianya.cn'},
    )
    if not page:
        return result

    tbodies = etree.HTML(page.text).xpath("//div[@class='mt5']/table/tbody")
    # The first matched <tbody> is skipped, exactly as before.
    for tbody in tbodies[1:]:
        for row in tbody.xpath('./tr'):
            title = row.xpath("./td[@class='td-title']/a/text()")[0]
            link = row.xpath("./td[@class='td-title']/a/@href")[0]
            result['content'].append(
                {'title': title, 'href': 'http://bbs.tianya.cn' + link}
            )
    return result
Example #4
0
def crawler_zhi_hu():
    """
    Fetch the Zhihu hot list.

    Calls the hot-list API endpoint through ``get_text``, decodes the response
    as JSON and, for each item of its ``data`` list, reads
    ``target.title_area.text`` as the title and ``target.link.url`` as the
    link.

    :return: dict with ``hot_name`` and a ``content`` list of
        ``{'title': ..., 'href': ...}`` dicts. ``content`` is empty when
        ``get_text`` returns a falsy value or the payload has no ``data``.
    """
    url = 'https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true'
    headers = {
        'path': '/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true',
        'x-api-version': '3.0.76',
        'x-requested-with': 'fetch',
    }
    content_list = []
    response_html = get_text(url, options=headers)
    if response_html:
        data_list = response_html.json().get('data') or []
        for data in data_list:
            # Each level may be absent or null in the payload; fall back to an
            # empty dict so one incomplete entry yields '' instead of raising
            # AttributeError and discarding the whole list.
            target = data.get('target') or {}
            title = (target.get('title_area') or {}).get('text', '')
            href = (target.get('link') or {}).get('url', '')
            content_list.append({'title': title, 'href': href})
    return {'hot_name': '知乎热榜', 'content': content_list}
Example #5
0
 def __init__(self, filepath):
     """Load the text of ``filepath`` and start with empty bookkeeping.

     :param filepath: passed unchanged to ``hlp.get_text``; whatever that
         helper returns is stored as ``self.filetext``.
     """
     # Read the source first so a failure in the helper leaves no
     # half-initialised attributes behind.
     self.filetext = hlp.get_text(filepath)
     # Two independent empty lists; nothing is populated in the constructor.
     self.inputs, self.outputs = [], []
     self.submodules = dict()
     # Stays None until assigned elsewhere.
     self.modulename = None