def get_detail_page(weburl):
    """Fetch the HTML of a shixiseng detail page; return None on non-200."""
    response = Req.get("https://www.shixiseng.com" + weburl)
    return response.text if response.status_code == 200 else None
def get_one_topic_from2(url, desc, dir_path):
    """Fetch one page of a weibo topic feed (POST) and append each post to a CSV.

    url: m.weibo.cn container API endpoint for the topic page.
    desc: topic description, used downstream to name the CSV file.
    dir_path: directory the CSV is written into.
    """
    response = Req.post(url)
    print(url)
    payload = json.loads(response.text)  # parse once (original parsed the same text twice)
    if payload.get('ok') == 0:
        # the API signals "no data" with ok == 0
        return
    contents = payload.get('data').get('cards')
    for content in contents:
        mblog = content.get('mblog')
        title = mblog.get('text')
        created_at = time_handler(mblog.get('created_at'))  # renamed: don't shadow the time module
        comments_count = mblog.get('comments_count')
        reposts_count = mblog.get('reposts_count')
        attitudes_count = mblog.get('attitudes_count')
        # explicit parser, consistent with the other BeautifulSoup calls in this file
        soup = BeautifulSoup(title, "html.parser")
        plain_text = soup.get_text()  # strip HTML tags from the post body
        user_name = mblog.get('user').get('screen_name')
        print(user_name + " " + created_at + " " + plain_text)
        csv_c = {
            '用户名': user_name,
            '文本内容': plain_text,
            '发布时间': created_at,
            '点赞数': attitudes_count,
            '评论数': comments_count,
            '转发数': reposts_count
        }
        write_csv_rows(csv_headers, csv_c, desc, dir_path)
def get_hot_topic_top10():
    """Fetch weibo's realtime hot-search list and crawl the topics ranked 1-10.

    Results are written into a date-stamped (YYYYMMDD) directory under the CWD.
    """
    url = "https://m.weibo.cn/api/container/getIndex"
    param = {
        'containerid': '106003type=25&t=3&disable_hot=1&filter_type=realtimehot',
        'title': '微博热搜',
        'extparam': 'filter_type=realtimehot&mi_cid=100103&pos=0_0&c_type=30&display_time=1554296319',
        'luicode': 10000011,
        'lfid': 231583
    }
    res = Req.get(url, param)
    data = json.loads(res.text)
    hot_topics = data.get('data').get('cards')[0].get('card_group')
    dir_name = time.strftime('%Y%m%d', time.localtime(time.time()))
    dir_path = os.path.join(os.getcwd(), dir_name)  # portable path building instead of string concat
    os.makedirs(dir_path, exist_ok=True)  # no exists()/makedirs() race
    # starts at 1 — presumably index 0 is a header card; confirm against the API response
    for i in range(1, 11):
        hot_topic = hot_topics[i]
        topic_url = hot_topic.get('scheme')
        topic_desc = hot_topic.get('desc')
        print("搜索第" + str(i) + "个话题:" + topic_desc)
        # NOTE(review): get_one_topic_for_page is not defined in this chunk — confirm it exists elsewhere
        get_one_topic_for_page(topic_url, topic_desc, dir_path)
def get_one_topic_first_page(url, desc, dir_path):
    """Fetch the first page of a weibo topic's feed and append each post to a CSV.

    url: m.weibo.cn container API endpoint for the topic.
    desc: topic description, used downstream to name the CSV file.
    dir_path: directory the CSV is written into.
    """
    print("搜索话题[" + desc + "]的第1页")
    response = Req.post(url)
    contents = json.loads(response.text).get('data').get('cards')
    print(url)
    for content in contents:
        card_group = content.get('card_group')
        if card_group is None:
            continue  # not a post card
        mblog = card_group[0].get('mblog')
        if mblog is None:
            continue
        title = mblog.get('text')
        created_at = time_handler(mblog.get('created_at'))  # renamed: don't shadow the time module
        comments_count = mblog.get('comments_count')
        reposts_count = mblog.get('reposts_count')
        attitudes_count = mblog.get('attitudes_count')
        # explicit parser, consistent with the other BeautifulSoup calls in this file
        soup = BeautifulSoup(title, "html.parser")
        plain_text = soup.get_text()  # strip HTML tags from the post body
        user_name = mblog.get('user').get('screen_name')
        print(user_name + " " + created_at + " " + plain_text)
        csv_c = {
            '用户名': user_name,
            '文本内容': plain_text,
            '发布时间': created_at,
            '点赞数': attitudes_count,
            '评论数': comments_count,
            '转发数': reposts_count
        }
        write_csv_rows(csv_headers, csv_c, desc, dir_path)
def save_pdf(pdf_url, file_name, file_path='D:\\'):
    """Download pdf_url and save it as file_name (plus the URL's extension) under file_path.

    pdf_url: direct URL of the PDF.
    file_name: target file name without extension.
    file_path: target directory; default kept for backward compatibility.
    """
    html = Req.get(pdf_url)
    # take the extension from the URL itself
    file_suffix = os.path.splitext(pdf_url)[1]
    # os.path.join instead of raw string concat (identical result for the 'D:\\' default)
    target = os.path.join(file_path, file_name + file_suffix)
    with open(target, 'wb') as file:
        file.write(html.content)
    print("下载{}成功".format(file_name))
def get_weibo(self):
    """Page through this user's weibo feed and write each post's text to a file.

    Pages via the getIndex API until an empty `cards` list is returned.
    Reads self.id, self.containerid and self.file_writer (set elsewhere).
    """
    i = 0
    while True:
        url = 'https://m.weibo.cn/api/container/getIndex?uid=' + str(self.id) + '&type=uid&value=' + str(self.id) + \
              '&containerid=' + str(self.containerid) + '&page=' + str(i)
        try:
            data = Req.get(url).content
            content = json.loads(data).get('data')
            cards = content.get('cards')
            if len(cards) > 0:
                for j in range(len(cards)):
                    # print("-------------正在爬取第"+str(i)+"页,第"+str(j)+"条微博")
                    # card_type 9 appears to mark an ordinary post card — confirm against the API
                    if cards[j].get('card_type') == 9:
                        mblog = cards[j].get('mblog')
                        # a 'retweeted_status' key distinguishes reposts from originals
                        if mblog.__contains__('retweeted_status'):
                            isSelf = False
                        else:
                            isSelf = True
                        text = mblog.get('text')
                        status = "原创" if isSelf else "转载"
                        # rebinds `data` (previously the raw response) to the output line
                        data = status + " 内容:" + text
                        # print(data)
                        self.file_writer.writer(
                            str(self.id) + "_" + str(
                                time.strftime('%Y%m%d', time.localtime(time.time()))) + "_weibo_contain.txt",
                            data)
                i += 1  # only advance the page counter on a successful, non-empty page
            else:
                break  # empty page: end of the feed
        except Exception as e:
            # NOTE(review): broad swallow — if the request or JSON parse fails persistently,
            # i never advances and this loop retries the same page forever; confirm intended.
            print(e)
            pass
def download_pdf(object_id):
    """Query a chaoxing object's status and, if it exposes a PDF URL, download it."""
    status_url = "https://mooc1-1.chaoxing.com/ananas/status/" + object_id
    info = json.loads(Req.get(status_url).text)
    pdf_url = info.get("pdf")
    if pdf_url is None:
        return  # no PDF attached to this object
    save_pdf(pdf_url, info.get("filename"))
def get_one_page(keyword, page):
    """Fetch one search-result page for keyword; return its HTML, or None if the request fails."""
    base = "https://www.shixiseng.com/interns/st-intern_c-420100_?k=" + keyword + "&p="
    response = Req.get(base + str(page))
    if response.status_code != 200:
        return None
    return response.text
def get_chapter_page(know_id, clazzid='10078203', courseid='204962725'):
    """Fetch a chapter's cards page, extract the embedded objectid, and download its PDF.

    know_id: knowledge/chapter id.
    clazzid, courseid: previously hard-coded; now parameters whose defaults
    reproduce the original URL exactly, so existing callers are unaffected.
    """
    url = ('https://mooc1-1.chaoxing.com/knowledge/cards?clazzid=' + str(clazzid) +
           '&courseid=' + str(courseid) + '&knowledgeid=' + str(know_id))
    response = Req.get(url, headers=header)
    soup = BeautifulSoup(response.text, "html.parser")
    # the objectid is embedded in the 5th <script> block — fragile, depends on page layout
    text = soup.find_all('script')[4].get_text()
    pattern = re.compile(r'"objectid":"(.*?)"')
    index = re.search(pattern, text)
    if index is not None:
        object_id = index.group(1)
        print(object_id)
        download_pdf(object_id)
def get_home_page():
    """Scrape the course chapter list and crawl each chapter via its knowledge id."""
    home_url = "https://mooc1-1.chaoxing.com/mycourse/studentstudycourselist?courseId=204962725&chapterId=167501723&clazzid=10078203"
    response = Req.get(home_url, headers=header)
    soup = BeautifulSoup(response.text, "html.parser")
    # hoisted out of the loop: compile once, not once per link
    pattern = re.compile(r"javascript:getTeacherAjax[(]'204962725','10078203','(.*?)'[)];",
                         re.MULTILINE | re.DOTALL)
    hrefs_box = soup.find_all('div', attrs={'class': 'ncells'})
    for href in hrefs_box:
        text = href.find('a').attrs['href']
        index = re.search(pattern, text)
        if index is None:
            # original crashed with AttributeError on non-matching links; skip instead
            # (consistent with the None guard in get_chapter_page)
            continue
        get_chapter_page(index.group(1))
def get_userinfo(self):
    """Fetch this user's profile, remember the weibo tab's containerid, and return basic info."""
    url = 'https://m.weibo.cn/api/container/getIndex?uid=' + str(
        self.id) + '&type=uid&value=' + str(self.id)
    content = json.loads(Req.get(url).content).get('data')
    # remember the containerid of the 'weibo' tab for later feed paging
    for tab in content.get('tabsInfo').get('tabs'):
        if tab.get('tab_type') == 'weibo':
            self.containerid = tab.get('containerid')
    # pick out the basic profile fields
    user_info = content.get('userInfo')
    return {
        'name': user_info['screen_name'],
        'description': user_info['description'],
        'follow_count': user_info['follow_count'],
        'followers_count': user_info['followers_count'],
    }
def getScenic():
    """Fetch a Meituan scenic-spot billboard and dump each POI as a CSV row."""
    url = "https://itrip.meituan.com/volga/api/v1/trip/billboard/list?poiId=761025&billboardId=42&source=mt&inner_source=mtshare&utm_source=appshare&utm_fromapp=qq&lch=appshare_k20koe6yxp6o&ci=57&cityId=57&feclient=lvyou_wap&uuid=AF13A8D6D897C9FB1D61E3438AB054B171041D30F54290C675296FDB636A76F9&client=wap"
    # (removed an unused `params` dict — the request never sent it)
    res = Req.get(url)
    data = json.loads(res.text)
    pois = data.get("data").get("poiList")
    headers = [
        "name", "introduction", "open_time", "price", "suggested_time",
        "longitude", "latitude", "address", "phone", "score", "photo"
    ]
    write_csv_header(headers)
    for poi in pois:
        # renamed from `map`: don't shadow the builtin
        row = {
            "name": poi.get("poiName"),
            "introduction": poi.get("recommendBooth"),
            "open_time": "早上8:00-晚上5:00",
            "price": poi.get("price"),
            "suggested_time": "2小时",
            "longitude": poi.get("lng"),
            "latitude": poi.get("lat"),
            "address": poi.get("poiName"),  # NOTE(review): address duplicates name — intended?
            "phone": "13063254952",
            "score": poi.get("score"),
            "photo": poi.get("frontImg").replace("/w.h", "") + ".webp@60q_1l_175w"
        }
        print(row)
        write_csv_rows(headers, row)