Ejemplo n.º 1
0
def __get_all_friends_infos():
    """Fetch the info pages of every entry listed in ``friends/numbers.txt``.

    Entries are consumed from the end of the list.  When fetching one entry
    raises, the list as it was *before* that entry was removed is written
    back to ``friends/numbers.txt`` and the loop carries on with the
    remaining entries.

    Returns ``None``.  If ``friends/numbers.txt`` cannot be read or parsed,
    the problem is reported and the function returns without fetching.
    """
    print("[%s] <get infos> start" % (time.ctime(time.time())))
    base.check_path(r'./results/', new=True)
    base.check_path(r'./results/infos/', new=True)
    try:
        with open('friends/numbers.txt', encoding='utf-8') as f:
            # NOTE(review): eval() executes whatever the file contains; only
            # safe as long as numbers.txt is produced by this tool itself.
            numbers_list = eval(f.read())
    except Exception as e:
        print("[%s] <get infos> make sure numbers.txt exists" % (time.ctime(time.time())))
        print(e)
        # Nothing to iterate over; continuing would hit an unbound
        # ``numbers_list`` and raise NameError.
        return

    while numbers_list:
        item = numbers_list.pop()
        qq = item['data']
        print("[%s] <get infos> qq: %s" % (time.ctime(time.time()), qq))
        try:
            __get_each_friend_info(qq)
        except Exception as e:
            # restore the file: remaining entries plus the one that failed,
            # i.e. the list exactly as it was before the pop above
            with open('friends/numbers.txt', 'w', encoding='utf-8') as f:
                f.write(str(numbers_list + [item]))
            print(e)

    print("[%s] <get infos> ok" % (time.ctime(time.time())))
Ejemplo n.º 2
0
def __get_url_photo(url, name, num):
    """Download one photo and save it as ``results/photos/<num>/<name>.jpg``.

    Args:
        url: address the image is fetched from with ``requests.get``.
        name: integer used as the file name (formatted with ``%d``).
        num: identifier used as the sub-directory name and in log output.

    Any failure (network, decoding, saving) is reported and swallowed so
    that one bad photo does not abort the caller's loop.
    """
    from PIL import Image
    from io import BytesIO
    base.check_path('./results/photos/%s' % num, new=True)
    try:
        response = requests.get(url)
        image = Image.open(BytesIO(response.content))
        image.save('results/photos/%s/%d.jpg' % (num, name))
    except Exception as e:
        print("[%s] <get photos> qq: %s some error occur" % (time.ctime(time.time()), num))
        # Show the cause as well, like the other helpers do; the bare
        # "some error occur" line gives nothing to diagnose with.
        print(e)
Ejemplo n.º 3
0
def get_photos():
    """Download every photo whose URL is listed in ``./results/pyq-urls/urls.txt``.

    Calls ``__get_urls()`` first, then reads the URL list back from that
    file and hands each URL, with its 1-based position, to
    ``__get_url_photo``.
    """
    print("[%s] <get photos> start" % (time.ctime(time.time())))
    base.check_path('./results/', new=True)
    base.check_path('./results/pyq-photos/', new=True)
    __get_urls()
    with open('./results/pyq-urls/urls.txt') as url_file:
        photo_urls = eval(url_file.read())
    for ordinal, photo_url in enumerate(photo_urls, start=1):
        print("[%s] <get photos> pulling %dth photo" %
              (time.ctime(time.time()), ordinal))
        # NOTE(review): called with (url, index) only - confirm the
        # __get_url_photo in scope for this module accepts two arguments.
        __get_url_photo(photo_url, ordinal)
    print("[%s] <get photos> ok" % (time.ctime(time.time())))
Ejemplo n.º 4
0
def __get_friends_list():
    """Crawl the friends list page by page and build ``friends/numbers.txt``.

    Does nothing when ``base.check_path`` reports that
    ``./friends/numbers.txt`` is already there.  Otherwise pages are
    requested from ``base.friends_url`` in steps of 50, each page's ``data``
    is stored as ``friends/position<offset>.json``, and finally the
    ``uinlist`` entries of every ``*json`` file found in ``friends/`` are
    merged into ``friends/numbers.txt``.

    Crawling stops when a page has an empty ``uinlist``, when the response
    contains the login prompt, or when a response cannot be parsed.
    """
    if base.check_path(r"./friends/numbers.txt"):
        return
    print("[%s] <get friends list> start" % (time.ctime(time.time())))
    base.check_path(r"./friends/", new=True)
    base.header['Referer'] = 'http://qzs.qq.com/qzone/v8/pages/setting/visit_v8.html'
    position = 0
    while True:
        url = base.friends_url + '&offset=' + str(position)
        print("[%s] <get friends list> position: %d" % (time.ctime(time.time()), position))
        res = requests.get(url, headers=base.header)
        html = res.text

        # cookie invalid
        if "请先登录" in html:
            print("[%s] <get friends list> some error occur" % (time.ctime(time.time())))
            break

        # html[10: -2] may cause a error
        try:
            html = html[10: -2]
            # NOTE(review): eval() on a network response executes remote
            # text as Python; kept because the stored files are read back
            # with eval() too, but a real parser would be safer.
            html_dict = dict(eval(html))
            html_data = html_dict["data"]
            html_list = html_dict["data"]["uinlist"]
        except Exception as e:
            print("[%s] <get friends list> some error occur" % (time.ctime(time.time())))
            print(e)
            # Stop here: falling through would use an unbound html_list on
            # the first page, or re-save the previous page's data forever.
            break

        if not html_list:
            print("[%s] <get friends list> ok" % (time.ctime(time.time())))
            break

        # write initial data crawled from qqzone with format 'json'
        with open('friends/position' + str(position) + '.json', 'w', encoding='utf-8') as f:
            f.write(str(html_data))

        position += 50
        time.sleep(1)

    print("[%s] <process friends> start" % (time.ctime(time.time())))
    friends = [i for i in os.listdir('friends') if i.endswith("json")]
    numbers = []
    for item in friends:
        with open('friends/' + item, encoding='utf-8') as f:
            numbers.extend(eval(f.read())["uinlist"])

    with open('friends/numbers.txt', 'w', encoding='utf-8') as f:
        print("[%s] <process friends> ok, %d in total" % (time.ctime(time.time()), len(numbers)))
        f.write(str(numbers))
Ejemplo n.º 5
0
def get_photos(given):
    """Download the photos of every number in ``given``.

    For each number the URL list is produced by ``__get_urls(num)``, read
    back from ``./results/urls/<num>.txt`` and every URL is passed to
    ``__get_url_photo`` together with its 0-based position.  Any exception
    raised while handling a number is printed and the next number is tried.
    """
    base.check_path(r'./results/', new=True)
    base.check_path(r'./results/photos/', new=True)
    for num in given:
        try:
            print("[%s] <get photos> qq: %s start" % (time.ctime(time.time()), num))
            __get_urls(num)
            with open('./results/urls/%s.txt' % num, encoding='utf-8') as url_file:
                photo_urls = eval(url_file.read())
                for index, photo_url in enumerate(photo_urls):
                    # files are numbered from 0, progress is reported from 1
                    __get_url_photo(photo_url, index, num)
                    print("[%s] <get photos> qq: %s pulling %dth photo" % (time.ctime(time.time()), num, index + 1))
                print("[%s] <get photos> qq: %s ok" % (time.ctime(time.time()), num))
        except Exception as err:
            print("[%s] <get photos> make sure %s.txt exists" % (time.ctime(time.time()), num))
            print(err)
Ejemplo n.º 6
0
def __segment_shuoshuo(num):
    """Clean the collected posts of ``num`` and write them to ``<num>-seg.txt``.

    Reads the list stored in ``./results/shuoshuo/<num>.txt``, removes every
    ``#`` and every ``[..]..[..]`` span from each entry, and writes the
    results back to back into ``./results/shuoshuo/<num>-seg.txt``.

    If the input file cannot be read or parsed, the problem is reported and
    nothing is written.
    """
    print("[%s] <shuoshuo segment> qq: %s start" % (time.ctime(time.time()), num))
    try:
        with open('./results/shuoshuo/%s.txt' % num, encoding='utf-8') as f:
            content = eval(f.read())
    except Exception as e:
        # timestamp first, then the number - the placeholders are "[%s] ... %s.txt"
        print("[%s] <shuoshuo segment> make sure %s.txt exists" % (time.ctime(time.time()), num))
        print(e)
        # Without content there is nothing to segment; going on would raise
        # NameError on the unbound ``content``.
        return
    base.check_path(r"./results/", new=True)
    base.check_path(r"./results/shuoshuo/", new=True)
    with open('./results/shuoshuo/%s-seg.txt' % num, 'w', encoding='utf-8') as wf:
        for con in content:
            # replace #
            con = re.sub('[#]', "", con)
            # replace [emoji]: drops everything from one bracketed token
            # through the next bracketed token, text in between included
            con = re.sub(r'\[(.*?)\](.*?)\[(.*?)\]', "", con)
            wf.write(con)
    print("[%s] <shuoshuo segment> qq: %s ok" % (time.ctime(time.time()), num))
Ejemplo n.º 7
0
def __segment_pyq():
    """Extract the post texts from ``./exported_sns.json`` into ``pyq-seg.txt``.

    The export is turned into Python objects by rewriting ``false``/``true``
    to ``False``/``True`` and evaluating the text.  Each record's
    ``content`` has ``#`` characters and ``[..]..[..]`` spans removed and is
    appended to ``./results/words/pyq-seg.txt``.
    """
    print("[%s] <pyq segment> start" % (time.ctime(time.time())))
    base.check_path('./results/', new=True)
    base.check_path('./results/words/', new=True)
    with open('./exported_sns.json', encoding='utf-8') as source:
        text = source.read()
        text = text.replace("false", "False")
        text = text.replace("true", "True")
        records = eval(text)
    posts = [record["content"] for record in records]
    with open('./results/words/pyq-seg.txt', 'w', encoding='utf-8') as target:
        for post in posts:
            # strip '#'
            without_hash = re.sub('[#]', "", post)
            # strip bracketed [..]..[..] spans
            cleaned = re.sub(r'\[(.*?)\](.*?)\[(.*?)\]', "", without_hash)
            target.write(cleaned)
        print("[%s] <pyq segment> ok" % (time.ctime(time.time())))
Ejemplo n.º 8
0
def get_pyq_word_cloud(mask="Male"):
    """Generate a word cloud from ``./results/pyq/pyq-seg.txt``.

    Args:
        mask: ``"Female"`` selects ``./word/alice_mask.png``; any other
            value selects ``./word/boy_mask.png``.

    Prints a hint and returns without generating anything when
    ``base.check_path`` reports the text file as missing.
    """
    print("[%s] <get pyq word cloud> start" % (time.ctime(time.time())))
    # NOTE(review): confirm this matches the path the segmentation step
    # writes its pyq-seg.txt to.
    text_url = './results/pyq/pyq-seg.txt'
    if mask == "Female":
        mask_url = './word/alice_mask.png'
    else:
        mask_url = './word/boy_mask.png'
    if base.check_path(text_url):
        __gen_word_cloud(text_url, mask_url)
        print("[%s] <get pyq word cloud> ok" % (time.ctime(time.time())))
        return
    print("[%s] <get pyq word cloud> make sure pyq-seg.txt exists" %
          (time.ctime(time.time())))
Ejemplo n.º 9
0
def get_zone_word_cloud(num, mask="Male"):
    """Generate a word cloud from ``./results/shuoshuo/<num>-seg.txt``.

    Args:
        num: number whose segmented text file is used.
        mask: ``"Female"`` selects ``./word/alice_mask.png``; any other
            value selects ``./word/boy_mask.png``.

    Prints a hint and returns without generating anything when
    ``base.check_path`` reports the text file as missing.
    """
    print("[%s] <get zone word cloud> start" % (time.ctime(time.time())))
    text_url = './results/shuoshuo/%s-seg.txt' % num
    if mask == "Female":
        mask_url = './word/alice_mask.png'
    else:
        mask_url = './word/boy_mask.png'
    if base.check_path(text_url):
        __gen_word_cloud(text_url, mask_url)
        print("[%s] <get zone word cloud> ok" % (time.ctime(time.time())))
        return
    print("[%s] <get zone word cloud> make sure %s-seg.txt exists" %
          (time.ctime(time.time()), num))
Ejemplo n.º 10
0
def __get_urls(num):
    """Collect the ``url2`` picture links of ``num`` into ``results/urls/<num>.txt``.

    Returns immediately when ``base.check_path`` reports that the output
    file is already there.  Otherwise every ``.txt`` file under
    ``results/infos/<num>`` is evaluated, and for each ``msglist`` entry
    with a non-``None`` ``pic`` the ``url2`` of each picture is gathered.
    """
    if base.check_path('./results/urls/%s.txt' % num):
        return
    base.check_path(r'./results/', new=True)
    base.check_path(r'./results/urls/', new=True)
    print("[%s] <get urls> qq: %s start" % (time.ctime(time.time()), num))
    page_files = [entry for entry in os.listdir('results/infos/%s' % num) if entry.endswith(".txt")]
    urls = []
    for page_file in page_files:
        with open('results/infos/%s/' % num + page_file, encoding='utf-8') as page:
            messages = eval(page.read())['msglist']
            for message in messages:
                # only messages that carry a picture list are of interest
                if 'pic' in message.keys() and message['pic'] is not None:
                    for picture in message['pic']:
                        if 'url2' in picture.keys():
                            urls.append(picture['url2'])
    with open('results/urls/%s.txt' % num, 'w', encoding='utf-8') as out:
        print("[%s] <get urls> qq: %s ok, %d in total" % (time.ctime(time.time()), num, len(urls)))
        out.write(str(urls))
Ejemplo n.º 11
0
def __get_given_friends_infos(given):
    """Fetch the info pages of the numbers in ``given`` that are known friends.

    A number is fetched only if it appears as the ``data`` value of an entry
    in ``friends/numbers.txt``; other numbers are skipped silently.  A
    failure while fetching one number is printed and does not stop the
    others.

    Args:
        given: iterable of numbers to fetch.

    Returns ``None``.  If ``friends/numbers.txt`` cannot be read or parsed,
    the problem is reported and the function returns without fetching.
    """
    print("[%s] <get infos> start" % (time.ctime(time.time())))
    base.check_path(r'./results/', new=True)
    base.check_path(r'./results/infos/', new=True)
    try:
        with open('friends/numbers.txt', encoding='utf-8') as f:
            # NOTE(review): eval() executes whatever the file contains; only
            # safe as long as numbers.txt is produced by this tool itself.
            numbers_list = eval(f.read())
    except Exception as e:
        print("[%s] <get infos> make sure numbers.txt exists" % (time.ctime(time.time())))
        print(e)
        # Going on would raise NameError on the unbound ``numbers_list``.
        return
    numbers = [i['data'] for i in numbers_list]
    for qq in given:
        if qq not in numbers:
            continue
        print("[%s] <get infos> qq: %s" % (time.ctime(time.time()), qq))
        try:
            __get_each_friend_info(qq)
        except Exception as e:
            # best effort: report the failure and move on to the next number
            print(e)
    print("[%s] <get infos> ok" % (time.ctime(time.time())))
Ejemplo n.º 12
0
def __get_urls():
    """Extract picture URLs from ``./exported_sns.json`` into ``urls.txt``.

    Returns immediately when ``base.check_path`` reports that
    ``./results/pyq-urls/urls.txt`` is already there.  Otherwise the export
    is evaluated (after rewriting ``false``/``true`` to ``False``/``True``),
    the ``rawXML`` of every record is scanned for ``<url type = "1">`` CDATA
    payloads, escaped slashes (``\\/``) in them are turned into ``/``, and
    the resulting list is written to ``./results/pyq-urls/urls.txt``.
    """
    if base.check_path('./results/pyq-urls/urls.txt'):
        return
    base.check_path(r'./results/', new=True)
    base.check_path(r'./results/pyq-urls/', new=True)
    print("[%s] <get urls> start" % (time.ctime(time.time())))
    with open('./exported_sns.json', encoding='utf-8') as f:
        raw = f.read().replace("false", "False").replace("true", "True")
        # NOTE(review): eval() executes the export as Python; only safe for
        # files produced by a trusted export tool.
        raw_content = eval(raw)
    contents = [i["rawXML"] for i in raw_content]
    urls = []
    for con in contents:
        # pull the CDATA payload out of every type-1 <url> element
        li = re.findall(
            r'<url\ type\ =\ \ "1"\ ><!\[CDATA\[(.*?)\]\]><\\/url>', con)
        # "\\/" is the two characters backslash + slash; spelled with an
        # escaped backslash because "\/" is an invalid escape sequence
        urls.extend(i.replace("\\/", "/") for i in li)
    with open('./results/pyq-urls/urls.txt', "w") as f:
        f.write(str(urls))
    print("[%s] <get urls> ok" % (time.ctime(time.time())))
Ejemplo n.º 13
0
def __get_each_friend_info(num):
    """Download all mood pages of ``num`` into ``results/infos/<num>/``.

    Sets the ``Referer`` header for ``num``, then returns at once when
    ``base.check_path`` reports that ``./results/infos/<num>`` is already
    there.  Otherwise pages are requested in steps of 20 and each parsed
    page is stored as ``results/infos/<num>/<offset>.txt``.  Paging ends
    when ``msglist`` is ``None`` or ``usrinfo.msgnum`` is 0; a ``subcode``
    of -4001 terminates the program via ``sys.exit()``.
    """
    base.header['Referer'] = 'http://user.qzone.qq.com/' + num
    if base.check_path('./results/infos/' + num):
        return
    base.check_path('results/infos/' + num, new=True)
    moods_url = base.get_moods_url(num)
    offset = 0
    while True:
        print("[%s] <get infos> qq: %s position: %d" % (time.ctime(time.time()), num, offset))
        response = base.session.get(moods_url + "&pos=%d" % offset, headers=base.header)
        # drop the first 10 and last 2 characters before parsing
        # (presumably a callback wrapper around the JSON - TODO confirm)
        payload = json.loads(response.text[10: -2])
        finished = payload["msglist"] is None or payload['usrinfo']["msgnum"] == 0
        if finished:
            return
        if payload["subcode"] == -4001:
            sys.exit()

        with open('results/infos/%s/%d.txt' % (num, offset), 'w', encoding='utf-8') as page:
            page.write(str(payload))

        offset += 20
        time.sleep(1)
0
def get_shuoshuo(given):
    """Collect and segment the post texts of every number in ``given``.

    Makes sure the friends list and the info pages of the given numbers have
    been fetched, then, for each number, gathers every ``con`` value found
    in the ``conlist`` of the stored ``msglist`` entries, writes the list to
    ``results/shuoshuo/<num>.txt`` and runs ``__segment_shuoshuo`` on it.

    Numbers without a ``results/infos/<num>`` directory (for example numbers
    that were skipped while fetching) are reported and skipped instead of
    aborting the whole run.

    Args:
        given: iterable of numbers to process.
    """
    __get_friends_list()
    __get_given_friends_infos(given)
    base.check_path(r"./results/", new=True)
    base.check_path(r"./results/shuoshuo/", new=True)
    for num in given:
        print("[%s] <get shuoshuo> qq: %s start" % (time.ctime(time.time()), num))
        info_dir = 'results/infos/%s' % num
        if not os.path.isdir(info_dir):
            # os.listdir would raise here and stop the remaining numbers
            print("[%s] <get shuoshuo> qq: %s no infos found, skipped" % (time.ctime(time.time()), num))
            continue
        files = [i for i in os.listdir(info_dir) if i.endswith(".txt")]
        con = []
        for item in files:
            with open('results/infos/%s/' % num + item, encoding='utf-8') as f:
                # NOTE(review): eval() executes the stored text; only safe
                # for files written by this tool itself.
                msglist = eval(f.read())['msglist']
            for i in msglist:
                if i['conlist'] is None:
                    continue
                for j in i['conlist']:
                    if 'con' in j:
                        con.append(j['con'])
        with open('results/shuoshuo/%s.txt' % num, 'w', encoding='utf-8') as f:
            print("[%s] <get shuoshuo> qq: %s ok, %d in total" % (time.ctime(time.time()), num, len(con)))
            f.write(str(con))
        __segment_shuoshuo(num)
Ejemplo n.º 15
0
def get_words():
    """Run ``__segment_pyq`` if ``./exported_sns.json`` is available.

    When ``base.check_path`` reports the export as missing, a hint is
    printed and nothing else happens.
    """
    if base.check_path("./exported_sns.json"):
        __segment_pyq()
        return
    print("[%s] <get pyq> make sure exported_sns.json exists" %
          (time.ctime(time.time())))