Ejemplo n.º 1
0
def livecollect(num):
    """Fetch up to ``num`` not-yet-processed users and write them to 'user.txt'.

    Entries already handled are skipped: the ``id`` of every JSON record read
    from 'user.txt' and every entry read from 'erroruser.txt' is loaded into a
    bloom filter, and only entries of 'sample.txt' that are not in the filter
    become candidates. The first ``num`` candidates are fetched with
    ``tiny_people`` and written out as JSON through ``tempflow``.

    Per-candidate failure handling:
      * ``ErrorInJson``: the candidate is saved to 'erroruser.txt'.
      * anything else: a report is passed to ``send`` (at most 10 attempts),
        then the function blocks until ``CanConnect`` returns True.

    Args:
        num: upper bound on the number of candidates processed in this call.
    """
    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('user.txt').read()
    error = temp_stupid('erroruser.txt').read()
    for line in havedone:
        try:
            done_id = json.loads(line)['id']
        except Exception:
            # Best-effort load: skip records that are not JSON objects with an
            # 'id'. ``Exception`` (not a bare except) so Ctrl-C still works.
            continue
        bloom.add(done_id)
    for failed in error:
        bloom.add(failed)

    see = temp_stupid('sample.txt').read()
    sc = tempflow('user.txt', 'a')
    try:
        seed = [candidate for candidate in see if candidate not in bloom]
        for i in seed[:num]:
            # Drop a leading byte-order mark left over from the input file.
            if i.startswith(u'\ufeff'):
                i = i[1:]
            try:
                # NOTE(review): argument order is (request, id) here, while
                # getuseronline calls tiny_people(id, request) -- confirm
                # against the tiny_people signature.
                userdict = tiny_people(getRequest(), i)
                sc.writein([json.dumps(userdict, ensure_ascii=False)])
                print(i)
            except ErrorInJson:
                temp_stupid('erroruser.txt').save([i])
            except BaseException as result:
                # NOTE(review): BaseException also traps KeyboardInterrupt and
                # SystemExit; kept as in the original.
                # Try to report the failure, giving up after 10 attempts.
                for _ in range(10):
                    try:
                        send(traceback.format_exc() + '\n' + str(result) + '\n in ' + datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S'), __name__ + ' throw '+result.__class__.__name__, '*****@*****.**')
                        break
                    except BaseException as res:
                        print(res)
                # Block until the probe succeeds again.
                # NOTE(review): ``cre`` is not defined in this function; it
                # must be a module-level name -- confirm it exists.
                while not CanConnect(cre):
                    pass
    finally:
        # Always finish the output flow, even if a handler above raised.
        sc.end()
Ejemplo n.º 2
0
def CanConnect(url, header=XHR_HEADER_WZ):
    """Probe ``url`` and report whether the request was accepted.

    Args:
        url: link used for the probe.
        header: request headers; the default targets the www.zhihu.com host
            and must be replaced when probing api.zhihu.com.

    Returns:
        False when the response status is 401 or 403 (treated as banned),
        True for any other status.
    """
    response = getRequest().get(url, headers=header)
    banned = int(response.status_code) in [401, 403]
    return not banned
Ejemplo n.º 3
0
def forlive(num):
    """Fetch up to ``num`` not-yet-processed lives and write them to 'live.txt'.

    Entries already handled are skipped: the ``id`` of every JSON record read
    from 'live.txt' and every entry read from 'errorlive.txt' is loaded into a
    bloom filter, and only entries of 'need.txt' that are not in the filter
    become candidates. The first ``num`` candidates are fetched with
    ``tiny_live`` and the result is written through ``tempflow`` unchanged.

    Per-candidate failure handling:
      * ``ErrorInJson``: the candidate is saved to 'errorlive.txt'.
      * anything else: a report is passed to ``send`` (at most 10 attempts),
        then the function blocks until ``CanConnect`` returns True for the
        candidate's live URL.

    Args:
        num: upper bound on the number of candidates processed in this call.
    """
    bloom = ScalableBloomFilter(1000000, 0.001)
    havedone = temp_stupid('live.txt').read()
    error = temp_stupid('errorlive.txt').read()
    for line in havedone:
        try:
            done_id = json.loads(line)['id']
        except Exception:
            # Best-effort load: skip records that are not JSON objects with an
            # 'id'. ``Exception`` (not a bare except) so Ctrl-C still works.
            continue
        bloom.add(done_id)
    for failed in error:
        bloom.add(failed)

    see = temp_stupid('need.txt').read()
    sc = tempflow('live.txt', 'a')
    try:
        seed = [candidate for candidate in see if candidate not in bloom]
        for i in seed[:num]:
            # Drop a leading byte-order mark left over from the input file.
            if i.startswith(u'\ufeff'):
                i = i[1:]
            try:
                live_record = tiny_live(i, getRequest())
                sc.writein([live_record])
                print(i)
            except ErrorInJson:
                temp_stupid('errorlive.txt').save([i])
            except BaseException as result:
                # NOTE(review): BaseException also traps KeyboardInterrupt and
                # SystemExit; kept as in the original.
                # Try to report the failure, giving up after 10 attempts.
                for _ in range(10):
                    try:
                        send(
                            traceback.format_exc() + '\n' + str(result) +
                            '\n in ' +
                            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            __name__ + ' throw ' + result.__class__.__name__,
                            '*****@*****.**')
                        break
                    except BaseException as res:
                        print(res)
                # Block until the probe of this live's URL succeeds again.
                while not CanConnect(create_live(i), XHR_HEADER_API):
                    pass
    finally:
        # Always finish the output flow, even if a handler above raised.
        sc.end()
Ejemplo n.º 4
0
def antcolony_userV4(token,
                     domain,
                     bloom: ScalableBloomFilter,
                     key: str,
                     dataobject,
                     keylist=None,
                     lamda=20,
                     xhr_headers=XHR_HEADER_WZ):
    """Page through a user-V4 listing and insert unseen records into ``dataobject``.

    A first request (offset 0, 5 items) is used to validate the endpoint and
    read ``paging.totals``. When a non-zero total is available, a fixed number
    of pages (``round(total / lamda) + 1``) is fetched; otherwise pages are
    fetched until one comes back with empty ``data``.

    Args:
        token: passed to ``create_userV4`` to build the URL; also passed to
            ``dataobject.delone`` when the endpoint answers 410.
        domain: passed to ``create_userV4`` to build the URL.
        bloom: filter of already-seen ``key`` values; updated in place.
        key: name of the record field used for de-duplication.
        dataobject: sink; must provide ``insert(record)`` and ``delone(token)``.
        keylist: optional collection of field names; when truthy each record
            is reduced to these fields before insertion.
        lamda: page size used for the paged requests.
        xhr_headers: headers sent with every request.

    Returns:
        0 when the first response has status 410, otherwise None.

    Raises:
        WrongStatuCode: the first response has a status above 300 (other
            than 410).
        ErrorInJson: the first response body contains an ``error`` entry.
    """
    def keyconvert(keys, record):
        # Keep only the fields named in ``keys``.
        return {name: value for name, value in record.items() if name in keys}

    def fetch_page(offset):
        # Download one page and return its (optionally reduced) records.
        response = req.get(urlhead + paging(offset, lamda),
                           headers=xhr_headers)
        # json.loads() no longer accepts ``encoding`` (removed in Python 3.9).
        records = json.loads(response.text)['data']
        if keylist:
            records = [keyconvert(keylist, x) for x in records]
        return records

    def store(records):
        # Insert every record whose ``key`` value has not been seen yet.
        for record in records:
            if record[key] not in bloom:
                dataobject.insert(record)
                bloom.add(record[key])
                print('data import ' + record[key])
            else:
                print('pass')

    urlhead = create_userV4(domain, token)
    req = getRequest()
    people_url = urlhead + paging(0, 5)
    print(people_url)
    print('before r')
    r = req.get(people_url, headers=xhr_headers)
    print('after r')
    if int(r.status_code) == 410:
        dataobject.delone(token)
        return 0
    if int(r.status_code) > 300:
        raise WrongStatuCode(str(r.status_code) + ': ' + people_url)
    j = json.loads(r.text)
    if 'error' in j:
        # str() so a non-string error payload still yields ErrorInJson
        # instead of a TypeError from the concatenation.
        raise ErrorInJson(__name__ + ": from url=" + urlhead + '\n  msg=' +
                          str(j['error']))
    print(j['paging'])
    try:
        total = int(j['paging']['totals'])
    except (KeyError, TypeError, ValueError):
        # No usable total: fall back to paging until an empty page.
        total = None

    if total:
        print(total)
        print(round(total / lamda))
        for page in range(round(total / lamda) + 1):
            store(fetch_page(page * lamda))
    else:
        page = 0
        while True:
            records = fetch_page(int(page * lamda))
            if not records:
                break
            store(records)
            page += 1
Ejemplo n.º 5
0
def worm_userV4(token,
                domain,
                keylist=None,
                bloom=None,
                lamda=20,
                xhr_headers=XHR_HEADER_API):
    """Page through a user-V4 listing and return the collected records.

    A first request (offset 0, 5 items) is used to validate the endpoint and
    read ``paging.totals``. When a non-zero total is available, a fixed number
    of pages (``round(total / lamda) + 1``) is fetched; otherwise pages are
    fetched until one comes back with empty ``data``.

    Args:
        token: passed to ``create_userV4`` to build the URL.
        domain: passed to ``create_userV4`` to build the URL.
        keylist: optional collection of field names; when truthy each record
            is reduced to these fields.
        bloom: optional filter; when truthy, records whose ``url_token`` is
            already in it are dropped. The filter itself is not modified.
        lamda: page size used for the paged requests.
        xhr_headers: headers sent with every request.

    Returns:
        A list with the records of all fetched pages, in fetch order.

    Raises:
        WrongStatuCode: the first response has a status above 300.
        ErrorInJson: the first response body contains an ``error`` entry.
    """
    def keyconvert(keys, record):
        # Keep only the fields named in ``keys``.
        return {name: value for name, value in record.items() if name in keys}

    def fetch_page(offset):
        # Download one page and return its raw records.
        response = req.get(urlhead + paging(offset, lamda),
                           headers=xhr_headers)
        # json.loads() no longer accepts ``encoding`` (removed in Python 3.9).
        return json.loads(response.text)['data']

    def refine(records):
        # Apply the optional field reduction and bloom filtering.
        if keylist:
            records = [keyconvert(keylist, x) for x in records]
        if bloom:
            records = [x for x in records if x['url_token'] not in bloom]
        return records

    urlhead = create_userV4(domain, token)
    req = getRequest()
    people_url = urlhead + paging(0, 5)
    r = req.get(people_url, headers=xhr_headers)
    if int(r.status_code) > 300:
        raise WrongStatuCode(str(r.status_code) + ': ' + people_url)
    j = json.loads(r.text)
    if 'error' in j:
        # str() so a non-string error payload still yields ErrorInJson
        # instead of a TypeError from the concatenation.
        raise ErrorInJson(__name__ + ": from url=" + urlhead + '\n  msg=' +
                          str(j['error']))
    print(j['paging'])
    alldata = []
    try:
        total = int(j['paging']['totals'])
    except (KeyError, TypeError, ValueError):
        # No usable total: fall back to paging until an empty page.
        total = None

    if total:
        print(total)
        print(round(total / lamda))
        for page in range(round(total / lamda) + 1):
            data = refine(fetch_page(page * lamda))
            print(data)
            alldata.extend(data)
    else:
        page = 0
        while True:
            data = fetch_page(int(page * lamda))
            # The stop condition looks at the raw page, before filtering.
            if len(data) == 0:
                break
            data = refine(data)
            print(data)
            alldata.extend(data)
            page += 1
    return alldata
Ejemplo n.º 6
0
                if i[key] not in bloom:
                    dataobject.insert(i)
                    bloom.add(i[key])
                    print('data import ' + i[key])
                else:
                    print('pass')
    else:
        i = 0
        while True:
            urll = urlhead + paging(int(i * lamda), lamda)
            r = req.get(urll, headers=xhr_headers)
            jc = json.loads(r.text, encoding='utf-8')
            data = jc['data']
            if len(data) == 0:
                break
            if keylist:
                data = [keyconvert(keylist, x) for x in data]
            for c in data:
                if c[key] not in bloom:
                    dataobject.insert(c)
                    bloom.add(c[key])
                    print('data import ' + c[key])
                else:
                    print('pass')
            i += 1


if __name__ == '__main__':
    # Manual check when run as a script: fetch one special list and print it.
    req = getRequest()
    special_list = tiny_speciallist('1020409341244997632', req, XHR_HEADER_API)
    print(special_list)
Ejemplo n.º 7
0
def getuseronline(id):
    """Fetch one user with ``tiny_people`` and return it wrapped by ``FORWEB``.

    Args:
        id: identifier passed to ``tiny_people``.

    Returns:
        ``FORWEB`` applied to the decoded JSON returned by ``tiny_people``.
    """
    # json.loads() no longer accepts ``encoding`` (ignored since Python 3.1,
    # removed in 3.9, where passing it raises TypeError).
    return FORWEB(json.loads(tiny_people(id, getRequest())))
Ejemplo n.º 8
0
def getliveonline(id):
    """Fetch one live with ``tiny_live`` and return it wrapped by ``FORWEB``.

    Args:
        id: identifier passed to ``tiny_live``.

    Returns:
        ``FORWEB`` applied to the decoded JSON returned by ``tiny_live``.
    """
    # json.loads() no longer accepts ``encoding`` (ignored since Python 3.1,
    # removed in 3.9, where passing it raises TypeError).
    return FORWEB(json.loads(tiny_live(id, getRequest())))