Example #1
def douyu_rank(rankName, statType):
    '''
        Scrape Douyu streamer ranking data
        [Data source](https://www.douyu.com/directory/rank_list/game)

        * `rankName` anchor (star streamer rank), fans (streamer fan rank), haoyou (big-spender rank), user (streamer patron rank)
        * `statType` day, week, month
    '''
    if not isinstance(rankName, ERankName):
        raise TypeError("rankName must be an ERankName enum member")
    if not isinstance(statType, EStatType):
        raise TypeError("statType must be an EStatType enum member")

    rankName = '%sListData' % rankName.name
    statType = '%sListData' % statType.name
    # Fetch the page HTML (rq.get is assumed to return the body as text)
    rs = rq.get("https://www.douyu.com/directory/rank_list/game",
                headers={'User-Agent': 'Mozilla/5.0'})
    # Pull the embedded data out of the page with a regex
    mt = re.search(r'rankListData\s+?=(.*?);', rs, re.S)
    if not mt:
        print(u"Failed to parse rankListData")
        return
    grps = mt.groups()
    # Decode the captured string as JSON
    rankListDataStr = grps[0]
    rankListData = json.loads(rankListDataStr)
    dayList = rankListData[rankName][statType]
    # Sort ascending by rank id
    dayList.sort(key=lambda k: k.get('id', 0), reverse=False)
    return dayList
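A minimal usage sketch, assuming ERankName and EStatType are plain enum.Enum types whose member names match the values listed in the docstring (rq, re and json come from the surrounding module):

from enum import Enum

class ERankName(Enum):
    anchor = 'anchor'
    fans = 'fans'
    haoyou = 'haoyou'
    user = 'user'

class EStatType(Enum):
    day = 'day'
    week = 'week'
    month = 'month'

# Fetch today's star-streamer ranking
rows = douyu_rank(ERankName.anchor, EStatType.day)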
Example #2
def suo(url):
    '''
        Shorten a URL via suo.im
    '''
    # URL-encode the target address; the original interpolated
    # url.encode('utf-8'), which renders as b'...' on Python 3
    from urllib.parse import quote
    url = 'http://suo.im/api.php?format=json&url={}'.format(
        quote(url, safe=''))
    json_str = rq.get(url)
    if not json_str:
        return
    json_data = json.loads(json_str)
    return json_data['url']
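A hypothetical call, assuming rq.get returns the response body as text; the long URL below is a placeholder:

short_url = suo('https://www.example.com/some/very/long/path?a=1&b=2')
print(short_url)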
Example #3
def getArticleContent(url):
    '''
        Fetch the body of a blog article
    '''
    # The article body lives in the .show-content element
    if not url:
        print('Invalid URL')
        return
    htmlStr = rq.get(url)
    jq_dom = jquery(htmlStr)
    jq_content = jq_dom.find('.show-content')
    content_html = jq_content.html()
    # print(content_html)
    return content_html
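A hedged usage sketch; jquery is assumed to be pyquery's PyQuery, and the article URL below is a placeholder:

content = getArticleContent('https://www.jianshu.com/p/xxxxxxxx')  # placeholder URL
if content:
    print(content[:200])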
Example #4
def GetBrandData(self):
    html = rq.get(self._url)
    doc = jquery(html)
    brandJqs = doc.find('.super-mod')
    allNum = brandJqs.length  # total number of brand blocks
    print('Parsed total brand count: %s' % allNum)
    scNum = 0  # number parsed successfully
    for brandItem in brandJqs:
        brandJq = jquery(brandItem)
        if len(brandJq.find('.mod-intro')) == 0:
            print('No mod-intro tag found')
            continue
        scNum += 1
        print(brandJq.find('.mod-intro').html())
    print('Brands parsed successfully: %s, failed: %s' % (scNum, allNum - scNum))
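GetBrandData reads self._url, so it belongs to a spider class that is not shown here. A hypothetical host class for trying it out, assuming GetBrandData is defined at module level as above:

class BrandSpider(object):  # hypothetical class name; the real one is not shown
    def __init__(self, url):
        self._url = url  # page that lists the brand blocks
    GetBrandData = GetBrandData  # attach the standalone method above

spider = BrandSpider('https://example.com/brands')  # placeholder URL
spider.GetBrandData()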
Example #5
def douyu_room(room_id):
    '''
        Parse a streamer's room information
        [Data source](https://www.douyu.com/xxx)
        * `room_id` the streamer's room number
    '''
    rs = rq.get(("https://www.douyu.com/%s" % room_id),
                headers={'User-Agent': 'Mozilla/5.0'})
    # Extract the $ROOM JSON object embedded in the page
    mt = re.search(r'\$ROOM\s+?=\s+?({.*?});', rs, re.S)
    if not mt:
        print(u"Failed to parse ROOM data")
        return
    grps = mt.groups()
    roomDataStr = grps[0]
    roomData = json.loads(roomDataStr)
    return roomData
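A hypothetical call; the dict keys come from Douyu's embedded $ROOM object and are not guaranteed here:

room = douyu_room('9999')  # placeholder room number
if room:
    print(room.get('room_name'))  # key name is an assumption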
Example #6
def getComment(questionNum):
    url = u'http://www.bxd365.com/qa/%s.html' % questionNum
    print(u"Parsing: %s" % url)
    try:
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    replys = doc.find(".reply li")
    if replys is None:
        print(u'No comment data')
        return
    if len(replys) <= 0:
        print(u'Comment count is 0')
        return
    for item in replys:
        parseComment(questionNum, item)
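A sketch of how this might be driven over several question pages; the ids are placeholders:

for num in (1001, 1002, 1003):  # hypothetical question numbers
    getComment(num)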
Example #7
def analysis_job_data(job_url):
    '''
        Parse the pagenum data from a job detail page
    '''
    if job_url is None:
        print(u'Job detail URL is empty, skipping')
        return
    print('job_url=%s' % job_url)
    try:
        # html = rq.get_cookie(job_url, cookie_file_name=get_cookie_name())
        html = rq.get(job_url)
    except Exception as e:
        print(e)
        return
    # Parse the job identifier
    pagenum = job_pagenum(html)
    # Parse the job contact person
    contactPerson = job_contactPerson(html)
    return {'pagenum': pagenum, 'contactPerson': contactPerson}
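A hypothetical call showing the dict shape this returns; the URL is a placeholder:

job = analysis_job_data('http://www.example.com/job/123.html')  # placeholder URL
if job:
    print(job['pagenum'], job['contactPerson'])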
Example #8
def get_agencys(city, page_index=1):
    '''
        Fetch insurance-agent listings from the 58.com full-time job search
        * `city` city record (dict with 'province', 'city' and 'id' keys)
        * `page_index` current page number
    '''
    city_type = city['id']
    hp.print_partition(u'Parsing city: %s-%s-%s, insurance-agent jobs' %
                       (city['province'], city['city'], city['id']))
    # Build the API url
    url = __agencys_url.format(city_type, page_index)
    print(u'Job listing URL: %s' % url)
    try:
        # html = rq.get_cookie(
        #     url,
        #     headers={
        #         "User-agent":
        #         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        #         "Referer":
        #         url
        #     })
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    if doc is None:
        print(u"Failed to parse html")
        return
    # Total page count
    page_nums_str = doc.find(".num_operate .total_page").html()
    if page_nums_str is None:
        page_nums_str = '0'
    page_nums = int(page_nums_str)
    print(u'Total pages: %s, current page: %s' % (page_nums, page_index))
    # Job list
    list_jobs = doc.find("#list_con .job_item")
    if list_jobs is None:
        print(u"No job list found")
        return
    print(u"Job count: %s" % len(list_jobs))
    # Walk the jobs
    today_nums = 0
    for job_item in list_jobs:
        job_item_jq = jquery(job_item)
        job_sign = job_item_jq.find(".sign").html()
        if not check_job_istoday(job_sign):
            print(u'Status: %s, not posted today, skipping' % job_sign)
            continue
        today_nums = today_nums + 1
        job_name = job_item_jq.find(".name").html()
        if job_name.find(u'保险') < 0:
            print(u'Job: %s, not insurance-related, skipping' % job_name)
            continue
        job_address = job_item_jq.find(".address").html()
        job_url = job_item_jq.find("a").attr("href")
        job_company = job_item_jq.find(".job_comp .comp_name .fl").attr(
            "title")
        job_company = analysis_job_company(job_company)
        print(u'%s|%s|%s' % (job_address, job_name, job_sign))
        # Throttle requests
        hp.sleep(0.3, 0.6, content=u'Fetching job detail =>')
        # Parse the job_data record
        job_data = analysis_job_data(job_url)

        if job_data is None or job_data['pagenum'] is None:
            print(u'Cannot get pagenum, skipping!')
            continue
        __jobs.append({
            "name": job_name,
            "address": job_address,
            "url": job_url,
            "pagenum": job_data['pagenum'],
            "contactPerson": job_data['contactPerson'],
            "sign": job_sign,
            "company": job_company
        })
    print(u'Current page: %s, total pages: %s' % (page_index, page_nums))
    # Decide whether to keep paging
    if today_nums <= 0:
        print(u'Page %s has no jobs posted today, stop paging' % page_index)
        return
    page_index = page_index + 1
    # Recurse into the next page
    if page_index <= page_nums:
        print(' ')
        # Throttle
        hp.sleep(0, 1, content=u'Next page =>')
        get_agencys(city, page_index)
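A hedged usage sketch; the city record only needs the three keys the function reads, and the values below are placeholders:

city = {'province': u'Province', 'city': u'City', 'id': 'cityid'}  # placeholder values
get_agencys(city)   # starts at page 1 and pages forward on its own
print(len(__jobs))  # results accumulate in the module-level __jobs list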
Example #9
def specialArticles(key, source, page=1):
    '''
        Fetch the article list for a topic
        * `key` topic key
        * `source` source label stored with each article
        * `page` article page number
    '''
    url = __special_newlike_url.format(key)
    htmlStr = rq.get(url)
    if not htmlStr:
        print(u'Failed to fetch html')
        return
    jq_dom = jquery(htmlStr)
    if not jq_dom:
        print(u'Cannot parse page dom')
        return
    dom_contents = jq_dom.find('.content')
    if not dom_contents:
        print(u'Cannot parse article content blocks')
        return
    articles = []
    for item in dom_contents:
        jq_content_item = jquery(item)
        dom_title = jq_content_item.find('.title')
        dom_time = jq_content_item.find('.time')
        dom_read = jq_content_item.find('.ic-list-read')
        dom_comments = jq_content_item.find('.ic-list-comments')
        dom_like = jq_content_item.find('.ic-list-like')
        if not dom_title:
            print(u'Cannot parse title')
            continue
        if not dom_time:
            print(u'Cannot parse time')
            continue
        # Parse the article metadata
        article_read = int(dom_read.parent().text())
        article_comments = int(dom_comments.parent().text())
        article_like = int(dom_like.parent().text())
        article_title = dom_title.html()
        article_href = dom_title.attr('href')
        article_time = dom_time.attr('data-shared-at').replace(
            '-', ' ').replace('+08:00', '').replace('T', ' ')
        # article_time = time.strptime(article_time, '%Y %m %d %H:%M:%S')
        article_url = '{host}{href}'.format(
            host=__jianshu_host, href=article_href)
        print(u'Got article:', hp.remove_emoji(article_title), article_url,
              article_time)
        if article_read < 100:
            print(u'Article has fewer than 100 reads, skipping')
            continue
        if article_like < 1:
            print(u'Article has no likes, skipping')
            continue
        if article_comments < 1:
            print(u'Article has no comments, skipping')
            continue
        # Fetch the article body
        content_html = getArticleContent(article_url)
        if not content_html:
            print(u'Cannot fetch article body')
            continue
        # Clean up the article html
        content_html = content_html.replace('data-original-', '')
        content_markdown = getCotentMarkDown(content_html)
        # markdown post-processing
        # content_markdown = content_markdown.replace("|", "-")
        articles.append({
            'title': article_title,
            'url': article_url,
            'time': article_time,
            'source': source,
            'content': content_markdown
        })
    return articles
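A hypothetical call; the topic key is a placeholder, and the function may return None on a fetch failure:

articles = specialArticles('xxxxxx', source='jianshu')  # placeholder topic key
for a in articles or []:
    print(a['title'], a['url'], a['time'])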