Example #1
def user_scrape(D, token, stime=""):
    FIELDS = [u"统计时间", u"新关注人数", u"取消关注人数", u"净增关注人数", u"累积关注人数", u"公众号搜索", u"二维码扫描", u"图文页右上角菜单", u"名片分享", u"其他"]
    writer = UnicodeWriter("user_info.csv", "gbk")
    writer.writerow(FIELDS)
    bag = {}
    url_total = "https://mp.weixin.qq.com/misc/useranalysis?&token=%s&lang=zh_CN"
    html = D.get(url_total % token)
    # without stime, take the most recent data point embedded in the chart data;
    # otherwise take the point whose date equals stime
    if not stime:
        m = common_re(r'\{\s*(date:\s"[^\}]+)\}\s*\]\s*\}\s*\]', html)
    else:
        m = common_re(r'\{\s*(date:\s"%s"[^\}]+)\}' % stime, html)
    bag[u"统计时间"] = common_re(r'date:\s"([^"]+)"', m) if m else ""
    bag[u"新关注人数"] = common_re(r"new_user:\s([^\s]+)\s", m) if m else ""
    bag[u"取消关注人数"] = common_re(r"cancel_user:\s([^,]+),", m) if m else ""
    bag[u"净增关注人数"] = common_re(r"netgain_user:\s([^,]+),", m) if m else ""
    bag[u"累积关注人数"] = common_re(r"cumulate_user:\s([^,]+),", m) if m else ""
    assert bag[u"统计时间"]
    a = ["1", "30", "43", "17", "0"]
    b = [u"公众号搜索", u"二维码扫描", u"图文页右上角菜单", u"名片分享", u"其他"]
    for num, i in enumerate(a):
        gain_url = (
            "https://mp.weixin.qq.com/misc/useranalysis?&begin_date=%s&end_date=%s&source=%s&token=%s&lang=zh_CN&f=json&ajax=1"
            % (bag[u"统计时间"], bag[u"统计时间"], i, token)
        )
        gain_html = D.get(gain_url)
        bag[b[num]] = common_re(r'"new_user":([^,]+),', gain_html)
    writer.writerow(bag.get(field) for field in FIELDS)
    return bag[u"统计时间"]
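
This example relies on two helpers that are not shown here: common_re (which appears to return the first capture group of a regex match, or an empty string) and UnicodeWriter (a CSV writer that encodes unicode rows). A minimal sketch of common_re under that assumed contract, plus a hypothetical call:

import re

def common_re(pattern, text):
    # assumed contract: first capture group on a match, '' otherwise
    m = re.search(pattern, text or "")
    return m.group(1) if m else ""

# hypothetical driver: D is an authenticated WeChat MP session, token the login token
# stat_date = user_scrape(D, token)                  # latest day on the chart
# stat_date = user_scrape(D, token, "2016-01-04")    # or a specific day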
Example #2
def all_txt_scrape(D, stime, token):
    url = "https://mp.weixin.qq.com/misc/appmsganalysis?action=report&type=daily&begin_date=%s&end_date=%s&token=%s&lang=zh_CN&f=json&ajax=1"
    FIELDS = [
        u"采集时间",
        u"统计时间",
        u"图文页阅读人数-全部",
        u"图文页阅读次数-全部",
        u"原文页阅读人数-全部",
        u"原文页阅读次数-全部",
        u"分享转发人数-全部",
        u"分享转发次数-全部",
        u"微信收藏人数-全部",
        u"图文页阅读人数-会话",
        u"图文页阅读次数-会话",
        u"图文页阅读人数-好友转发",
        u"图文页阅读次数-好友转发",
        u"图文页阅读人数-朋友圈",
        u"图文页阅读次数-朋友圈",
        u"图文页阅读人数-腾讯微博",
        u"图文页阅读次数-腾讯微博",
        u"图文页阅读人数-历史消息页",
        u"图文页阅读次数-历史消息页",
        u"图文页阅读人数-其他",
        u"图文页阅读次数-其他",
    ]
    html = D.get(url % (stime, stime, token))
    jdata = json.loads(html)
    if "item" not in jdata:
        return
    writer = UnicodeWriter("article_channel_info.csv", "gbk")
    writer.writerow(FIELDS)
    us = ["99999999", "0", "1", "2", "3", "4", "5"]
    bag = {}
    bag[u"采集时间"] = time.strftime("%Y-%m-%d")
    bag[u"统计时间"] = stime
    for i in jdata["item"]:
        m = str(i["user_source"])
        assert m in us
        if m == "99999999":
            mindex = 2
            bag[FIELDS[mindex]] = i["int_page_read_user"]
            bag[FIELDS[mindex + 1]] = i["int_page_read_count"]
            bag[FIELDS[mindex + 2]] = i["ori_page_read_user"]
            bag[FIELDS[mindex + 3]] = i["ori_page_read_count"]
            bag[FIELDS[mindex + 4]] = i["share_user"]
            bag[FIELDS[mindex + 5]] = i["share_count"]
            bag[FIELDS[mindex + 6]] = i["add_to_fav_user"]
        else:
            # two fields per channel (read user / read count), starting at FIELDS[9]
            mindex = 2 * us.index(m) + 7
            bag[FIELDS[mindex]] = i["int_page_read_user"]
            bag[FIELDS[mindex + 1]] = i["int_page_read_count"]
    writer.writerow(bag.get(field) for field in FIELDS)
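
all_txt_scrape expects stime in the same YYYY-MM-DD form that user_scrape (Example #1) returns, so the two chain naturally; D and token are assumed to come from the same login flow. A hedged usage sketch:

stat_date = user_scrape(D, token)      # date of the latest data point
all_txt_scrape(D, stat_date, token)    # writes article_channel_info.csv for that day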
Example #3
def scrape():
    sdate, szse_tasks, bond_tasks = read_conf()
    writer = UnicodeWriter('cninfo.csv', 'gbk')
    writer.writerow(FIELDS)
    D = download('http://www.cninfo.com.cn/cninfo-new/announcement/show', is_cookie=True)
    url = 'http://www.cninfo.com.cn/cninfo-new/announcement/query'
    last_date = get_last_trade_date() if not sdate else ''
    post_data = {
        'columnTitle': '历史公告查询',
        'pageNum': '1',
        'pageSize': '30',
        'tabName': 'fulltext',
    }
    if sdate:
        post_data['seDate'] = sdate
        print 'Query interval time is: %s' % sdate
    else:
        post_data['seDate'] = last_date if last_date else '请选择日期'
        print 'Last trade date is: %s' % last_date
    for k in szse_tasks + bond_tasks:
        post_data['column'] = 'szse' if k in szse_tasks else 'bond'
        post_data['searchkey'] = k
        try:
            html = D.post(url, data=post_data)
        except Exception:
            # retry once on a transient failure
            html = D.post(url, data=post_data)
        html = html.decode('utf-8')
        jdata = json.loads(html).get('announcements')
        if jdata:
            for i in jdata:
                bag = {}
                bag[u'关键字'] = '%s' % k.decode('utf-8')
                bag[u'代码'] = str(i.get('secCode'))
                bag[u'简称'] = i.get('secName')
                bag[u'公告标题'] = i.get('announcementTitle')
                # announcementTime is a millisecond timestamp; guard against a missing value
                stime = i.get('announcementTime')
                x = time.localtime(float(str(stime)[:-3])) if stime else None
                bag[u'公告时间'] = time.strftime('%Y-%m-%d', x) if x else ''
                m = i.get('announcementId')
                bag[u'PDF文件URL'] = ('http://www.cninfo.com.cn/cninfo-new/disclosure/szse/bulletin_detail/true/' + m) if m else ''
                writer.writerow(bag.get(field) for field in FIELDS)
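
scrape() depends on a module-level FIELDS list and a read_conf() helper that are not shown. The sketch below is inferred from the bag keys and the unpacking above, not taken from the original source; the date format and keyword values are placeholders.

FIELDS = [u'关键字', u'代码', u'简称', u'公告标题', u'公告时间', u'PDF文件URL']  # column order inferred from the bag keys

def read_conf():
    # assumed return shape: query date string (empty means "use the last trade date"),
    # szse keyword list, bond keyword list; keywords are utf-8 byte strings
    return '2016-01-04', ['平安银行'], ['16国开债']

scrape()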
Example #4
def txt_scrape(D, stime, token):
    FIELDS = [
        u"采集时间",
        u"发布时间",
        u"标题",
        u"送达人数",
        u"图文页阅读总人数",
        u"图文页阅读总次数",
        u"原文页阅读总人数",
        u"原文页阅读总次数",
        u"转发+收藏总人数",
        u"转发+收藏总次数",
        u"当日图文页阅读人数",
        u"当日图文页阅读次数",
        u"当日原文页阅读人数",
        u"当日原文页阅读次数",
        u"当日转发人数",
        u"当日转发次数",
        u"当日收藏人数",
        u"当日收藏次数",
    ]
    writer = UnicodeWriter("article_info.csv", "gbk")
    writer.writerow(FIELDS)
    bag = {}
    # 7 days ago
    atime = calculte_time(stime, 7)
    url = (
        "https://mp.weixin.qq.com/misc/appmsganalysis?action=all&begin_date=%s&end_date=%s&order_by=1&order_direction=2&page_num=1&page_size=10&token=%s&lang=zh_CN&f=json&ajax=1"
        % (atime, stime, token)
    )
    html = D.get(url)
    jdata = json.loads(html)
    if "total_article_data" in jdata:
        infos = {}
        m = jdata["article_summary_data"]
        m = m.replace('\\"', '"')
        m2 = json.loads(m)
        if "list" in m2:
            for i in m2["list"]:
                if i["ref_date"] == stime:
                    infos[i["title"]] = i
        if not infos:
            # articles published more than seven days ago have no per-day article_info data
            print "文章发布时间超过七天,没有article_info数据".decode("utf8").encode("gb2312")
            return
        m = jdata["total_article_data"]
        m = m.replace('\\"', '"')
        m2 = json.loads(m)
        if "list" in m2:
            for i in m2["list"]:
                bag = {}
                bag[u"采集时间"] = time.strftime("%Y-%m-%d")
                bag[u"发布时间"] = i["publish_date"]
                bag[u"标题"] = i["title"]
                bag[u"送达人数"] = i["target_user"]
                bag[u"图文页阅读总人数"] = i["int_page_read_user"]
                bag[u"图文页阅读总次数"] = i["int_page_read_count"]
                bag[u"原文页阅读总人数"] = i["ori_page_read_user"]
                bag[u"原文页阅读总次数"] = i["ori_page_read_count"]
                bag[u"转发+收藏总人数"] = str(int(i["share_user"]) + int(i["add_to_fav_user"]))
                bag[u"转发+收藏总次数"] = str(int(i["share_count"]) + int(i["add_to_fav_count"]))
                try:
                    # skip articles that have no per-day entry for stime
                    bag[u"当日图文页阅读人数"] = infos[i["title"]]["int_page_read_user"]
                except KeyError:
                    continue
                bag[u"当日图文页阅读次数"] = infos[i["title"]]["int_page_read_count"]
                bag[u"当日原文页阅读人数"] = infos[i["title"]]["ori_page_read_user"]
                bag[u"当日原文页阅读次数"] = infos[i["title"]]["ori_page_read_count"]
                bag[u"当日转发人数"] = infos[i["title"]]["share_user"]
                bag[u"当日转发次数"] = infos[i["title"]]["share_count"]
                bag[u"当日收藏人数"] = infos[i["title"]]["add_to_fav_user"]
                bag[u"当日收藏次数"] = infos[i["title"]]["add_to_fav_count"]
                writer.writerow(bag.get(field) for field in FIELDS)
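
calculte_time (spelling kept as in the source) is an external helper assumed to shift a YYYY-MM-DD date string back by N days, so that the request window covers the article's whole lifetime up to stime. A plausible stand-in under that assumption, plus a hypothetical call:

import datetime

def calculte_time(stime, days):
    # assumed contract: the date `days` days before stime, same YYYY-MM-DD format
    d = datetime.datetime.strptime(stime, "%Y-%m-%d") - datetime.timedelta(days=days)
    return d.strftime("%Y-%m-%d")

# hypothetical call, reusing the session and token from the earlier examples
# txt_scrape(D, "2016-01-04", token)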