def download_images(url, timestamp):
    """
    Download the images embedded in an article.
    :param timestamp: date string used as the output sub-directory name
    :param url: article URL
    :return: list of (remote_link, local_path) tuples
    """
    headers = {
        # "Cookie": config['cookie'],
        "User-Agent": spider_config.get('user_agent')
    }
    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read()
    dom = etree.HTML(html)
    images = dom.xpath('//*[@id="js_content"]//img')
    image_links = []
    for image in images:
        image_link = image.attrib.get("data-src")
        fmt = image.attrib.get("data-type")
        if image_link is None:
            image_link = image.attrib.get("src")
            fmt = 'png'
        url_obj = urlparse(image_link)
        segments = url_obj.path.split('/')
        filename = segments[-2]
        image_dir = current_dir + os.sep + "../output/images/" + timestamp
        if not os.path.exists(image_dir):
            os.makedirs(image_dir, exist_ok=True)
        local_path = image_dir + os.sep + filename + "." + fmt
        image_links.append((image_link, local_path))
        try:
            res = requests.get(url=image_link, headers=headers, stream=True, timeout=5)
            if res.status_code == 200:
                with open(local_path, 'wb') as f:
                    for chunk in res.iter_content(chunk_size=512):
                        if chunk:
                            f.write(chunk)
        except Exception as e:
            print(e)
            print(image_link)
    return image_links
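
# Sketch of the return value (values hypothetical): download_images yields
# (remote_url, local_path) pairs that the main loop below hands to save_article_image, e.g.
#   [("https://mmbiz.qpic.cn/mmbiz_png/<hash>/640", ".../output/images/<timestamp>/<hash>.png")]
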
def extract_context(link):
    """
    Extract the article body text.
    :param link: article URL
    :return: plain text of the body, or "" if the body node is missing
    """
    headers = {
        # "Cookie": config['cookie'],
        "User-Agent": spider_config.get('user_agent')
    }
    req = urllib.request.Request(url=link, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # Parse the HTML text into an lxml _Element tree
    dom = etree.HTML(html)
    # texts = dom.xpath('//*[@id="js_content"]//section/text()')
    root = dom.xpath('//*[@id="js_content"]')
    if len(root) > 0:
        return root[0].xpath('string(.)')
    return ""
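
# Ad-hoc usage sketch (article URL hypothetical). extract_context relies on the XPath
# string() function to flatten every text node under the #js_content element:
#   text = extract_context("https://mp.weixin.qq.com/s/<article_id>")
#   print(text[:100])
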
def get_articles(_info, _details, _date):
    """
    Fetch the article list of one official account.
    :param _details: crawler credentials (cookie, token)
    :param _info: account info (fakeid, nickname)
    :param _date: target date string; only articles updated on that day are kept
    :return: True when finished, False when stopped by rate limiting or an invalid session
    """
    headers = {
        "Cookie": _details['cookie'],
        "User-Agent": spider_config.get("user_agent")
    }
    # Request parameters
    url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
    begin = "0"
    params = {
        "action": "list_ex",
        "begin": begin,
        "count": "16",
        "fakeid": _info['fakeid'],
        "type": "9",
        "token": _details['token'],
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1"
    }
    fmt = spider_config.get("common.fmt")
    i = 0
    next_page = True
    while next_page:
        count = 0
        begin = i * 16
        params["begin"] = str(begin)
        res = requests.get(url, headers=headers, params=params, verify=False)
        # WeChat rate limiting: stop
        if res.json()['base_resp']['ret'] == 200013:
            log.info("frequency control, stop at {}".format(str(begin)))
            return False
        if res.json()['base_resp']['ret'] == 200003:
            log.info("invalid session, stop at {}".format(str(begin)))
            _send(_details)
            return False
        # Stop when the response carries no more articles
        if 'app_msg_list' in res.json():
            app_list = res.json()['app_msg_list']
            if len(app_list) == 0:
                log.info("all article parsed")
                return True
            for row in app_list:
                # Check the article's update time
                update_date = time.strftime(fmt, time.localtime(row['update_time']))
                if int(update_date) > int(_date):
                    # Only t+1 articles are collected; ignore articles updated today
                    continue
                elif int(update_date) < int(_date):
                    # Older than t+1: stop paging
                    next_page = False
                    break
                else:
                    insert_article(_info['fakeid'], row)
                    product_article(row)
                    count += 1
            log.info("account: %s, %d articles" % (_info['nickname'], count))
        else:
            log.info("account: %s, response: %s" % (_info['nickname'], res.json()))
            _send(_details)
            break
        i += 1
        time.sleep(spider_config.get('common.time.sleep'))
    return True
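
# Shape of the appmsg response that get_articles depends on (sketch; only the fields
# read above and further down the pipeline are listed, values hypothetical):
#   {
#       "base_resp": {"ret": 0},   # 200013 = frequency control, 200003 = invalid session
#       "app_msg_list": [
#           {"aid": "...", "title": "...", "link": "https://mp.weixin.qq.com/s/...", "update_time": 1700000000}
#       ]
#   }
# Paging is driven by params["begin"] = i * 16 together with "count": "16".
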
def _send(_details):
    push = WxPush()
    _content = {"token": _details['token'],
                "msg": 'Crawler credentials for the official account have expired, please update them soon'}
    push.send_wx(_content)
    time.sleep(spider_config.get('common.time.sleep'))
def download_video(link, timestamp):
    """
    Download the article's video to the local output directory.
    :param timestamp: date string used as the output sub-directory name
    :param link: article URL
    :return: (video_url, local_path) on success, otherwise None
    """
    res = requests.get(link)
    url_obj = urlparse(link)
    qs = parse_qs(url_obj.query)
    json_res = res.text
    # Match vids such as: wxv_1105179750743556096
    regex = r"wxv_.{19}"
    result = re.search(regex, json_res)
    if result:
        vid = result.group(0)
        # parse_qs returns each value as a list; take the first element
        biz = qs['__biz'][0]
        mid = qs['mid'][0]
        idx = qs['idx'][0]
        url_info = get_video_url(biz, mid, idx, vid)
        if len(url_info) == 0:
            print("no video found")
            return None
        try:
            # Pick the first rendition that fits under MAX_VIDEO_SIZE
            if url_info[0]['filesize'] < MAX_VIDEO_SIZE:
                url = url_info[0]['url']
            elif url_info[1]['filesize'] < MAX_VIDEO_SIZE:
                url = url_info[1]['url']
            else:
                url = url_info[2]['url']
            headers = {
                # "Cookie": _cookie,
                "User-Agent": spider_config.get('user_agent')
            }
            res = requests.get(url=url, headers=headers, stream=True, timeout=5)
            if res.status_code == 200:
                log.info("start downloading video %s", vid)
                video_dir = current_dir + os.sep + "../output/videos/" + timestamp
                if not os.path.exists(video_dir):
                    os.makedirs(video_dir, exist_ok=True)
                video_path = video_dir + os.sep + vid + '.mp4'
                with open(video_path, 'wb') as f:
                    for chunk in res.iter_content(chunk_size=512):
                        if chunk:
                            f.write(chunk)
                log.info("video %s download finished", vid)
                return url, video_path
        except requests.exceptions.RequestException as e:
            log.error("video " + vid + " could not be downloaded: %s", e)
            # print(e)
    return None
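
# get_video_url (defined elsewhere in this project) is assumed to return renditions
# ordered from largest to smallest, each carrying at least "url" and "filesize";
# that is why download_video walks url_info[0..2] looking for one under MAX_VIDEO_SIZE.
# Assumed shape (values hypothetical):
#   [{"url": "http://...", "filesize": 10485760}, {"url": "http://...", "filesize": 5242880}]
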
    um.cursor.execute(sql, param)


if __name__ == '__main__':
    _article_rq = RedisQueue('article_rq')  # article queue
    while True:
        _article = _article_rq.get_wait()
        if not _article:
            break
        today = datetime.date.today()
        yesterday = today - datetime.timedelta(days=1)
        fmt = spider_config.get("common.fmt")
        yesterday_str = datetime.date.strftime(yesterday, fmt)
        article_obj = json.loads(_article[1])
        # Body text
        content = extract_context(article_obj['link'])
        save_article_content(article_obj['aid'], content.strip())
        # print("%s body extracted" % article['title'])
        # Images
        images = download_images(article_obj['link'], yesterday_str)
        # print("%s images downloaded" % article['title'])
        if len(images) > 0:
            save_article_image(article_obj['aid'], images)
        # Video
        video = download_video(article_obj['link'], yesterday_str)
        # print("%s video downloaded" % article['title'])
def __init__(self):
    self.app_id = spider_config.get('wx.test.app_id')
    self.app_secret = spider_config.get('wx.test.app_secret')
    self.template_id = spider_config.get('wx.test.template_id')
    self.access_token = ''
    self.expires_in = datetime.datetime.now()
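
# Note (assumption, not confirmed by the code shown here): expires_in is initialised to
# "now" so that the first call that checks it treats the empty access_token as stale and
# requests a fresh token before sending the template message.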