Code Example #1
def get_detail_page(weburl):
    detail_url = "https://www.shixiseng.com" + weburl

    response = Req.get(detail_url)
    if response.status_code == 200:
        return response.text
    return None
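
All of these snippets call Req.get / Req.post, a request helper whose definition is not included here. A minimal sketch of what such a wrapper might look like, assuming it simply forwards to the requests library with a browser-style User-Agent (the class body below is an assumption, not the original implementation):

import requests

class Req:
    # Hypothetical stand-in for the Req helper used throughout these examples.
    _default_headers = {"User-Agent": "Mozilla/5.0"}

    @staticmethod
    def get(url, params=None, headers=None):
        # Forward to requests.get, defaulting to a browser-like User-Agent.
        return requests.get(url, params=params, headers=headers or Req._default_headers)

    @staticmethod
    def post(url, data=None, headers=None):
        # Forward to requests.post with the same default headers.
        return requests.post(url, data=data, headers=headers or Req._default_headers)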
Code Example #2
def get_one_topic_from2(url, desc, dir_path):
    response = Req.post(url)
    data = response.text
    print(url)
    payload = json.loads(data)
    if payload.get('ok') == 0:
        return
    contents = payload.get('data').get('cards')
    for content in contents:
        mblog = content.get('mblog')

        # id = mblog.get('id')
        title = mblog.get('text')
        created_time = time_handler(mblog.get('created_at'))
        comments_count = mblog.get('comments_count')
        reposts_count = mblog.get('reposts_count')
        attitudes_count = mblog.get('attitudes_count')
        # Strip HTML tags from the post text
        soup = BeautifulSoup(title, "html.parser")
        user_name = mblog.get('user').get('screen_name')
        print(user_name + " " + created_time + " " + soup.get_text())
        csv_c = {
            '用户名': user_name,
            '文本内容': soup.get_text(),
            '发布时间': created_time,
            '点赞数': attitudes_count,
            '评论数': comments_count,
            '转发数': reposts_count
        }
        write_csv_rows(csv_headers, csv_c, desc, dir_path)
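
This snippet also relies on time_handler, csv_headers, and write_csv_rows, none of which appear in the source. A hedged sketch of helpers with compatible signatures, assuming one CSV file per topic named after desc (the file layout and both function bodies are assumptions):

import csv
import os

csv_headers = ['用户名', '文本内容', '发布时间', '点赞数', '评论数', '转发数']

def time_handler(created_at):
    # Assumption: pass the raw created_at string through unchanged.
    return str(created_at)

def write_csv_rows(headers, row, desc, dir_path):
    # Append one row to a per-topic CSV named after the topic description.
    file_path = os.path.join(dir_path, desc + '.csv')
    first_write = not os.path.exists(file_path)
    with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        if first_write:
            writer.writeheader()
        writer.writerow(row)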
Code Example #3
def get_hot_topic_top10():
    url = "https://m.weibo.cn/api/container/getIndex"
    param = {
        'containerid': '106003type=25&t=3&disable_hot=1&filter_type=realtimehot',
        'title': '微博热搜',
        'extparam': 'filter_type=realtimehot&mi_cid=100103&pos=0_0&c_type=30&display_time=1554296319',
        'luicode': 10000011,
        'lfid': 231583
    }
    res = Req.get(url, param)
    data = json.loads(res.text)
    hot_topics = data.get('data').get('cards')[0].get('card_group')

    cur_path = os.getcwd()
    dir_name = time.strftime('%Y%m%d', time.localtime(time.time()))
    dir_path = os.path.join(cur_path, dir_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    for i in range(1, 11):
        hot_topic = hot_topics[i]
        topic_url = hot_topic.get('scheme')
        topic_desc = hot_topic.get('desc')
        print("搜索第" + str(i) + "个话题:" + topic_desc)
        get_one_topic_for_page(topic_url, topic_desc, dir_path)
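
A minimal entry point for the hot-search crawler above could look like the following, assuming get_hot_topic_top10 and the topic-page functions live in the same module (this wiring is an assumption, not part of the source):

if __name__ == '__main__':
    # Crawl today's Weibo hot-search top 10 and write one CSV per topic
    # into a directory named after the current date.
    get_hot_topic_top10()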
Code Example #4
def get_one_topic_first_page(url, desc, dir_path):
    print("搜索话题[" + desc + "]的第1页")
    response = Req.post(url)
    data = response.text
    contents = json.loads(data).get('data').get('cards')
    print(url)
    for content in contents:
        card_group = content.get('card_group')
        if card_group is None:
            continue
        mblog = card_group[0].get('mblog')
        if mblog is None:
            continue
        # id = mblog.get('id')
        title = mblog.get('text')
        created_time = time_handler(mblog.get('created_at'))
        comments_count = mblog.get('comments_count')
        reposts_count = mblog.get('reposts_count')
        attitudes_count = mblog.get('attitudes_count')
        # Strip HTML tags from the post text
        soup = BeautifulSoup(title, "html.parser")
        user_name = mblog.get('user').get('screen_name')
        print(user_name + " " + created_time + " " + soup.get_text())
        csv_c = {
            '用户名': user_name,
            '文本内容': soup.get_text(),
            '发布时间': created_time,
            '点赞数': attitudes_count,
            '评论数': comments_count,
            '转发数': reposts_count
        }
        write_csv_rows(csv_headers, csv_c, desc, dir_path)
Code Example #5
def save_pdf(pdf_url, file_name, file_path='D:\\'):
    response = Req.get(pdf_url)
    # Get the PDF's file extension from the URL
    file_suffix = os.path.splitext(pdf_url)[1]
    with open(os.path.join(file_path, file_name + file_suffix), 'wb') as file:
        file.write(response.content)
    print("下载{}成功".format(file_name))
Code Example #6
    def get_weibo(self):
        i = 0
        while True:
            url = 'https://m.weibo.cn/api/container/getIndex?uid=' + str(self.id) + '&type=uid&value=' + str(self.id) + \
                  '&containerid=' + str(self.containerid) + '&page=' + str(i)
            try:
                data = Req.get(url).content
                content = json.loads(data).get('data')
                cards = content.get('cards')
                if len(cards) > 0:
                    for j in range(len(cards)):
                        # print("-------------正在爬取第"+str(i)+"页,第"+str(j)+"条微博")
                        if cards[j].get('card_type') == 9:
                            mblog = cards[j].get('mblog')
                            # Original post vs. repost
                            is_original = 'retweeted_status' not in mblog
                            text = mblog.get('text')
                            status = "原创" if is_original else "转载"
                            record = status + " 内容:" + text
                            # print(record)
                            self.file_writer.writer(
                                str(self.id) + "_" + str(
                                    time.strftime('%Y%m%d',
                                                  time.localtime(time.time())))
                                + "_weibo_contain.txt", record)

                    i += 1
                else:
                    break
            except Exception as e:
                print(e)
                break  # stop instead of retrying the same page forever
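
get_weibo writes each record through self.file_writer.writer, a helper that is not shown in the source. A hedged sketch, assuming it simply appends one line of text to the named file (the class name and behavior are assumptions):

class FileWriter:
    # Hypothetical stand-in for the file_writer helper used above.
    def writer(self, file_name, text):
        # Append one record per line, creating the file if needed.
        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(text + '\n')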
Code Example #7
def download_pdf(object_id):
    url = "https://mooc1-1.chaoxing.com/ananas/status/" + object_id
    response = Req.get(url)
    data = json.loads(response.text)
    pdf_url = data.get("pdf")
    if pdf_url is not None:
        pdf_name = data.get("filename")
        save_pdf(pdf_url, pdf_name)
Code Example #8
def get_one_page(keyword, page):
    index_url = "https://www.shixiseng.com/interns/st-intern_c-420100_?k=" + keyword + "&p="

    index_full_url = index_url + str(page)

    response = Req.get(index_full_url)

    if response.status_code == 200:
        return response.text
Code Example #9
def get_chapter_page(know_id):
    url = 'https://mooc1-1.chaoxing.com/knowledge/cards?clazzid=10078203&courseid=204962725&knowledgeid=' + str(know_id)
    response = Req.get(url, headers=header)
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.find_all('script')[4].get_text()
    pattern = re.compile(r'"objectid":"(.*?)"')
    index = re.search(pattern, text)
    if index is not None:
        object_id = index.group(1)
        print(object_id)
        download_pdf(object_id)
Code Example #10
def get_home_page():
    home_url = "https://mooc1-1.chaoxing.com/mycourse/studentstudycourselist?courseId=204962725&chapterId=167501723&clazzid=10078203"

    response = Req.get(home_url, headers=header)
    soup = BeautifulSoup(response.text, "html.parser")
    hrefs_box = soup.find_all('div', attrs={'class': 'ncells'})

    for href in hrefs_box:
        text = href.find('a').attrs['href']
        pattern = re.compile(r"javascript:getTeacherAjax[(]'204962725','10078203','(.*?)'[)];",
                             re.MULTILINE | re.DOTALL)
        index = re.search(pattern, text)
        if index is None:
            continue
        know_id = index.group(1)
        get_chapter_page(know_id)
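
The chaoxing snippets (Code Examples #9 and #10) pass headers=header, but header itself is not defined in the source. A minimal sketch of what it presumably contains, assuming a logged-in session cookie copied from the browser (both values below are placeholders):

header = {
    # Placeholder values; a real run needs the Cookie string from your own
    # logged-in chaoxing session and a normal browser User-Agent.
    "User-Agent": "Mozilla/5.0",
    "Cookie": "..."
}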
Code Example #11
    def get_userinfo(self):
        url = 'https://m.weibo.cn/api/container/getIndex?uid=' + str(
            self.id) + '&type=uid&value=' + str(self.id)
        data = Req.get(url).content
        content = json.loads(data).get('data')

        # Find the containerid of the user's weibo tab
        for tab in content.get('tabsInfo').get('tabs'):
            if tab.get('tab_type') == 'weibo':
                self.containerid = tab.get('containerid')

        # Collect the user profile fields
        user_info = content.get('userInfo')
        userInfo = {
            'name': user_info['screen_name'],
            'description': user_info['description'],
            'follow_count': user_info['follow_count'],
            'followers_count': user_info['followers_count']
        }
        return userInfo
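
Code Examples #6 and #11 are methods of the same crawler class, which is not shown in the source. A hedged skeleton of how they might be wired together, assuming the constructor only needs the numeric user id (the class name and the FileWriter wiring are assumptions):

class WeiboUser:
    # Hypothetical container for the get_userinfo / get_weibo methods above.
    def __init__(self, uid):
        self.id = uid
        self.containerid = None          # filled in later by get_userinfo
        self.file_writer = FileWriter()  # hypothetical helper, sketched after Code Example #6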
Code Example #12
def getScenic():
    url = "https://itrip.meituan.com/volga/api/v1/trip/billboard/list?poiId=761025&billboardId=42&source=mt&inner_source=mtshare&utm_source=appshare&utm_fromapp=qq&lch=appshare_k20koe6yxp6o&ci=57&cityId=57&feclient=lvyou_wap&uuid=AF13A8D6D897C9FB1D61E3438AB054B171041D30F54290C675296FDB636A76F9&client=wap"
    params = {
        "poiId": "761025",
        "output": "json",
        "citylimit": "true",
        "types": "110204",
        "key": "610c2b21dcd0b8b86959bf1478eeac55"
    }
    # Note: the params dict above is never sent; the full query string is already in url.
    res = Req.get(url)
    data = json.loads(res.text)
    pois = data.get("data").get("poiList")

    headers = [
        "name", "introduction", "open_time", "price", "suggested_time",
        "longitude", "latitude", "address", "phone", "score", "photo"
    ]
    write_csv_header(headers)
    for poi in pois:
        row = {
            "name": poi.get("poiName"),
            "introduction": poi.get("recommendBooth"),
            "open_time": "早上8:00-晚上5:00",
            "price": poi.get("price"),
            "suggested_time": "2小时",
            "longitude": poi.get("lng"),
            "latitude": poi.get("lat"),
            "address": poi.get("poiName"),
            "phone": "13063254952",
            "score": poi.get("score"),
            "photo": poi.get("frontImg").replace("/w.h", "") + ".webp@60q_1l_175w"
        }
        print(row)
        write_csv_rows(headers, row)
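
getScenic in turn assumes two-argument write_csv_header / write_csv_rows helpers, which differ from the four-argument write_csv_rows used in Code Examples #2 and #4 and are likewise not shown. A hedged sketch with compatible signatures, assuming a single fixed output file (the file name is an assumption):

import csv

_SCENIC_CSV = 'scenic.csv'  # hypothetical output file name

def write_csv_header(headers, file_path=_SCENIC_CSV):
    # Start a fresh CSV file containing only the header row.
    with open(file_path, 'w', newline='', encoding='utf-8-sig') as f:
        csv.DictWriter(f, fieldnames=headers).writeheader()

def write_csv_rows(headers, row, file_path=_SCENIC_CSV):
    # Append a single data row to the CSV file.
    with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
        csv.DictWriter(f, fieldnames=headers).writerow(row)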