Example #1
def download_images(url, timestamp):
    """
    Download the article's images to a local directory.
    :param url: article URL
    :param timestamp: date string used as the output subdirectory name
    :return: list of (image_link, local_path) tuples
    """
    headers = {
        # "Cookie": config['cookie'],
        "User-Agent": spider_config.get('user_agent')
    }

    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read()
        dom = etree.HTML(html)
        images = dom.xpath('//*[@id="js_content"]//img')

        image_links = []
        for image in images:
            # lazily loaded images keep the real URL in data-src
            image_link = image.attrib.get("data-src")
            fmt = image.attrib.get("data-type")

            if image_link is None:
                # fall back to the plain src attribute and assume png
                image_link = image.attrib.get("src")
                fmt = 'png'

            # the second-to-last path segment serves as the file name
            url_obj = urlparse(image_link)
            segments = url_obj.path.split('/')
            filename = segments[-2]

            image_dir = os.path.join(current_dir, "../output/images", timestamp)
            os.makedirs(image_dir, exist_ok=True)

            local_path = os.path.join(image_dir, filename + "." + fmt)
            image_links.append((image_link, local_path))
            try:
                res = requests.get(url=image_link,
                                   headers=headers,
                                   stream=True,
                                   timeout=5)
                if res.status_code == 200:
                    with open(local_path, 'wb') as f:
                        for chunk in res.iter_content(chunk_size=512):
                            if chunk:
                                f.write(chunk)
            except Exception as e:
                log.error("image %s could not be downloaded: %s", image_link, e)
        return image_links
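
A minimal usage sketch; the article URL below is a hypothetical placeholder, and the function relies on the module-level spider_config and current_dir globals:

links = download_images(
    "https://mp.weixin.qq.com/s?__biz=XXXX&mid=1&idx=1",  # hypothetical article URL
    "20200101")  # timestamp used as the output subfolder
for remote_url, local_path in links:
    print(remote_url, "->", local_path)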
Example #2
def extract_context(link):
    """
    Extract the article body text.
    :param link: article URL
    :return: plain text of the js_content node, or "" if it is missing
    """
    headers = {
        # "Cookie": config['cookie'],
        "User-Agent": spider_config.get('user_agent')
    }

    req = urllib.request.Request(url=link, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read()
        # parse the HTML text into an lxml _Element tree
        dom = etree.HTML(html)
        root = dom.xpath('//*[@id="js_content"]')
        if len(root) > 0:
            return root[0].xpath('string(.)')
        return ""
def get_articles(_info, _details, _date):
    """
    Fetch the article list of an official account page by page.
    :param _info: account info dict with 'fakeid' and 'nickname'
    :param _details: credential dict with 'cookie' and 'token'
    :param _date: target date string; only articles updated on that date are saved
    :return: True once all articles are parsed, False on rate limiting or an invalid session
    """
    headers = {
        "Cookie": _details['cookie'],
        "User-Agent": spider_config.get("user_agent")
    }

    # request parameters
    url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
    begin = "0"
    params = {
        "action": "list_ex",
        "begin": begin,
        "count": "16",
        "fakeid": _info['fakeid'],
        "type": "9",
        "token": _details['token'],
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1"
    }

    fmt = spider_config.get("common.fmt")
    i = 0
    next_page = True

    while next_page:

        count = 0
        # set the page offset before issuing the request
        begin = i * 16
        params["begin"] = str(begin)

        res = requests.get(url, headers=headers, params=params, verify=False)

        # WeChat rate limiting (frequency control): stop
        if res.json()['base_resp']['ret'] == 200013:
            log.info("frequency control, stop at {}".format(str(begin)))
            return False

        if res.json()['base_resp']['ret'] == 200003:
            log.info("invalid session, stop at {}".format(str(begin)))
            _send(_details)
            return False

        # stop once the returned article list is empty
        if 'app_msg_list' in res.json():
            app_list = res.json()['app_msg_list']
            if len(app_list) == 0:
                log.info("all article parsed")
                return True

            for row in app_list:
                # compare the article's update time against the target date
                update_date = time.strftime(fmt,
                                            time.localtime(row['update_time']))
                if int(update_date) > int(_date):
                    # fetch t+1 articles only; ignore articles updated today
                    continue
                elif int(update_date) < int(_date):
                    next_page = False
                    # articles are now older than the target date; stop paging
                    break
                else:
                    insert_article(_info['fakeid'], row)
                    product_article(row)
                    count += 1
            log.info("公众号:%s,%d" % (_info['nickname'], count))
        else:
            log.info("公众号:%s,响应:%s" % (_info['nickname'], res.json()))
            _send(_details)
            break
        i += 1
        time.sleep(spider_config.get('common.time.sleep'))

    return True
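
A hedged usage sketch; the dict keys mirror exactly what get_articles reads, and every value here is a placeholder:

_info = {"fakeid": "MzI4MTXXXXXXXX", "nickname": "some_account"}
_details = {"cookie": "<mp.weixin.qq.com session cookie>", "token": "1234567890"}
finished = get_articles(_info, _details, "20200101")
if not finished:
    print("stopped early: rate limited or session expired")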
def _send(_details):
    push = WxPush()
    _content = {"token": _details['token'],
                "msg": "spider credentials for the official account have expired, please update them"}
    push.send_wx(_content)
    time.sleep(spider_config.get('common.time.sleep'))
Example #5
def download_video(link, timestamp):
    """
    Download the article's embedded video to a local directory.
    :param link: article URL
    :param timestamp: date string used as the output subdirectory name
    :return: (video_url, local_path) on success, otherwise None
    """
    res = requests.get(link)
    url_obj = urlparse(link)
    qs = parse_qs(url_obj.query)

    json_res = res.text  # match a video id like wxv_1105179750743556096
    regex = r"wxv_.{19}"
    result = re.search(regex, json_res)
    if result:
        vid = result.group(0)
        # parse_qs returns a list per key; take the first value
        biz = qs['__biz'][0]
        mid = qs['mid'][0]
        idx = qs['idx'][0]

        url_info = get_video_url(biz, mid, idx, vid)
        if len(url_info) == 0:
            print("无视频")
            return None

        try:
            # pick the first quality variant that fits under MAX_VIDEO_SIZE,
            # falling back to the last one; this also avoids an IndexError
            # when fewer than three variants are returned
            for info in url_info:
                if info['filesize'] < MAX_VIDEO_SIZE:
                    url = info['url']
                    break
            else:
                url = url_info[-1]['url']

            headers = {
                # "Cookie": _cookie,
                "User-Agent": spider_config.get('user_agent')
            }
            res = requests.get(url=url,
                               headers=headers,
                               stream=True,
                               timeout=5)

            if res.status_code == 200:
                log.info("start downloading video %s", vid)
                video_dir = os.path.join(current_dir, "../output/videos", timestamp)
                os.makedirs(video_dir, exist_ok=True)

                video_path = os.path.join(video_dir, vid + '.mp4')
                with open(video_path, 'wb') as f:
                    for chunk in res.iter_content(chunk_size=512):
                        if chunk:
                            f.write(chunk)
                log.info("video %s downloaded", vid)
                return url, video_path
        except requests.exceptions.RequestException as e:
            log.error("video %s could not be downloaded: %s", vid, e)
    return None
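
Usage sketch (placeholder URL); the function returns None when the page embeds no wxv_ video or the download fails:

result = download_video("https://mp.weixin.qq.com/s?__biz=XXXX&mid=1&idx=1", "20200101")
if result:
    video_url, local_path = result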
Example #6
        um.cursor.execute(sql, param)


if __name__ == '__main__':

    _article_rq = RedisQueue('article_rq')

    # consume articles from the queue
    while True:
        _article = _article_rq.get_wait()
        if not _article:
            break

        today = datetime.date.today()
        yesterday = today - datetime.timedelta(days=1)
        fmt = spider_config.get("common.fmt")
        yesterday_str = datetime.date.strftime(yesterday, fmt)

        article_obj = json.loads(_article[1])
        # body text
        content = extract_context(article_obj['link'])
        save_article_content(article_obj['aid'], content.strip())
        # images
        images = download_images(article_obj['link'], yesterday_str)
        if images:
            save_article_image(article_obj['aid'], images)
        # video
        video = download_video(article_obj['link'], yesterday_str)
Example #7
    def __init__(self):
        self.app_id = spider_config.get('wx.test.app_id')
        self.app_secret = spider_config.get('wx.test.app_secret')
        self.template_id = spider_config.get('wx.test.template_id')
        self.access_token = ''
        self.expires_in = datetime.datetime.now()
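
The cached access_token and expires_in fields suggest the class refreshes its token elsewhere; a sketch of how that might look against WeChat's documented credential endpoint (the refresh_token method name is an assumption, the endpoint and response fields are WeChat's standard ones):

    def refresh_token(self):
        # hypothetical helper: reuse the cached token until it expires
        if self.access_token and datetime.datetime.now() < self.expires_in:
            return self.access_token
        res = requests.get(
            "https://api.weixin.qq.com/cgi-bin/token",
            params={"grant_type": "client_credential",
                    "appid": self.app_id,
                    "secret": self.app_secret},
            timeout=5)
        data = res.json()  # documented fields: access_token, expires_in (seconds)
        self.access_token = data['access_token']
        self.expires_in = (datetime.datetime.now()
                           + datetime.timedelta(seconds=data['expires_in'] - 60))
        return self.access_token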