コード例 #1
0
ファイル: spider.py プロジェクト: KoBenji/weibospider
def get_info_history(session):
    """Crawl posts around the first 63 entries of QUERY_COORDINATE_LIST.

    For every seed coordinate, ``fourtree`` is asked for a list of views
    (each a dict with ``'coordinate'`` and ``'geo_range'``).  Each view is
    then paged through with ``get_weibo_by_coordinate`` and every non-empty
    page is handed to ``save_data_by_db``.  Paging of a view stops at the
    first empty page, at the first falsy result from ``save_data_by_db``,
    or once page 19 has been processed.

    :param session: HTTP session forwarded to the crawling helpers.
    """
    seed_count = 63
    page_limit = 20  # pages 1..19 are requested
    per_page = 50
    lat_step = INTER_LAT
    lon_step = INTER_LON

    # String form of the converted time with its last two characters
    # dropped; the same value is passed as both start and end time.
    since = str(convert_time('2015', '1', '1', '0'))[:-2]
    until = since

    for seed_idx in range(seed_count):
        views = fourtree(session, QUERY_COORDINATE_LIST[seed_idx], since,
                         DISTANCE, lat_step, lon_step)
        for view in views:
            for page_no in range(1, page_limit):
                center = view['coordinate']
                radius = view['geo_range']
                wait_time(random.randint(10, 15))
                posts = get_weibo_by_coordinate(session, center, since, until,
                                                radius, 0, per_page, page_no, 0)
                if not posts:
                    break
                if not save_data_by_db(posts):
                    break
        # Pause between seed coordinates.
        wait_time(random.randint(10, 20))
コード例 #2
0
ファイル: spider.py プロジェクト: wesavetheworld/weibospider
def search_info(session,
                keyword="",
                start_time="",
                end_time="",
                num=1,
                location=0):
    """Walk the keyword search result pages and feed them to get_page_info.

    Pages ``START_PAGE`` .. ``START_PAGE + TOTAL_PAGE - 1`` are requested in
    order.  Before each request ``wait_time`` is called with a random integer
    from 10 to 30.  The response text is UTF-8 encoded, passed through
    ``save_catch_page`` and checked with ``out_page``; a falsy check ends the
    walk immediately.

    :param session: HTTP session used for the GET requests.
    :param keyword: search keyword placed in the URL path.
    :param start_time: lower bound of the custom time scope.
    :param end_time: upper bound of the custom time scope.
    :param num: running counter threaded through ``get_page_info``.
    :param location: when not 0, ``&haslink=1`` is added to the query.
    :return: the last value of ``num``.
    """
    link_filter = "&haslink=1" if location != 0 else ""

    last_page = START_PAGE + TOTAL_PAGE
    for page_no in range(START_PAGE, last_page):
        url = ''.join([
            'http://s.weibo.com/weibo/', keyword,
            '&scope=ori', link_filter,
            '&timescope=custom:', start_time, ':', end_time,
            '&page=', str(page_no),
            '&rd=newTips',
        ])
        wait_time(random.randint(10, 30))

        page_source = u'' + session.get(url).text
        content_text = save_catch_page(page_source.encode('utf-8'))
        if not out_page(content_text):
            return num
        num = get_page_info(content_text, session, location, num)
    return num
コード例 #3
0
ファイル: spider.py プロジェクト: wesavetheworld/weibospider
def fourtree(session, coordinate, starttime, geo_range, inter_lat, inter_lon):
    """Recursively split a search area into four sub-areas while it has data.

    Page 20 (50 items per page) of the nearby timeline is requested for
    ``coordinate``.  If it is empty the area becomes a leaf.  Otherwise the
    page is saved with ``save_data_by_db``, both steps are halved, the radius
    is scaled by 0.65 (never below 100) and the four diagonal neighbours are
    explored recursively.

    :param session: HTTP session forwarded to ``get_weibo_by_coordinate``.
    :param coordinate: dict with ``'latitude'`` and ``'longitude'`` values
        that ``float()`` can parse.
    :param starttime: start time forwarded to ``get_weibo_by_coordinate``.
    :param geo_range: search radius for this area.
    :param inter_lat: latitude step; halved before it is applied to children.
    :param inter_lon: longitude step; halved before it is applied to children.
    :return: list of ``{'coordinate': ..., 'geo_range': ...}`` dicts: the
        results of the four children followed by an entry for this area.
    """
    wait_time(random.randint(2, 5))
    info_list = get_weibo_by_coordinate(session, coordinate, starttime, 0,
                                        geo_range, 0, 50, 20, 0)
    if not info_list:
        print(0)
        return [{'coordinate': coordinate, 'geo_range': geo_range}]

    print(geo_range)
    save_data_by_db(info_list)

    inter_lat = round(inter_lat / 2, 6)
    inter_lon = round(inter_lon / 2, 6)
    if inter_lat == 0 and inter_lon == 0:
        # Both steps have rounded down to zero: every child would be the
        # same point as its parent, so further recursion could never end.
        return [{'coordinate': coordinate, 'geo_range': geo_range}]

    geo_range = max(int(round(float(geo_range) / 2 * 1.3)), 100)
    latitude = float(coordinate['latitude'])
    longitude = float(coordinate['longitude'])

    # Latitude is shifted by the latitude step and longitude by the longitude
    # step; the earlier version mixed the two steps up.
    views = []
    for lat_sign, lon_sign in ((1, 1), (-1, 1), (1, -1), (-1, -1)):
        child = {
            'latitude': str(round(latitude + lat_sign * inter_lat, 6)),
            'longitude': str(round(longitude + lon_sign * inter_lon, 6)),
        }
        views += fourtree(session, child, starttime, geo_range, inter_lat,
                          inter_lon)

    # NOTE(review): the parent entry carries the reduced (child) radius, as
    # it did before this change - confirm that this is intended.
    views.append({'coordinate': coordinate, 'geo_range': geo_range})
    return views
コード例 #4
0
ファイル: login.py プロジェクト: liinnux/weibospider-1
def wblogin(username, password):
    """Log in to Weibo through the Sina SSO endpoints.

    Uses the module-level ``session``.  The flow is: fetch the pre-login
    parameters, post the RSA-encrypted credentials, follow the redirect URL
    found in the response and parse the JSON object embedded in the final
    response.

    :param username: account name.
    :param password: plain-text password, encrypted with ``encrypt_passwd``.
    :return: the decoded JSON object from the final response.
    :raises AttributeError: if one of the expected patterns is missing from a
        response (the regex result is ``None``).
    """
    # b64encode returns bytes; decode it so that "%s" does not render the
    # value as "b'...'" on Python 3.
    prelogin_su = base64.b64encode(username.encode('utf-8')).decode('ascii')
    resp = session.get(
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=sso&callback=sinaSSOController.preloginCallBack&'
        'su=%s&rsakt=mod&client=%s' % (prelogin_su, WBCLIENT)
    )
    wait_time(3)

    # The response is a JSONP call; keep only the JSON object inside it.
    pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1)
    pre_login = json.loads(pre_login_str)
    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': base64.b64encode(requests.utils.quote(username).encode('utf-8')),
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(
            password, pre_login['pubkey'], pre_login['servertime'], pre_login['nonce']
        ),
        'rsakv': pre_login['rsakv'],
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }
    resp = session.post(
        'http://login.sina.com.cn/sso/login.php?client=%s' % WBCLIENT, data=data
    )
    # The login response redirects through a replace("...") call.
    login_url = re.search(r'replace\([\"\']([^\'\"]+)[\"\']', resp.text).group(1)
    resp = session.get(login_url)
    login_str = re.match(r'[^{]+({.+?}})', resp.text).group(1)

    return json.loads(login_str)
コード例 #5
0
ファイル: spider.py プロジェクト: KoBenji/weibospider
def search_info(session, keyword="", start_time="", end_time="",  num=1, location=0):
    """Fetch keyword search result pages and process each with get_page_info.

    One request is made per page number in
    ``range(START_PAGE, START_PAGE + TOTAL_PAGE)``, each preceded by a
    ``wait_time`` call with a random integer from 10 to 30.  The UTF-8
    encoded response text goes through ``save_catch_page``; if ``out_page``
    reports a falsy result for it, the function returns at once.

    :param session: HTTP session used for the GET requests.
    :param keyword: search keyword placed in the URL path.
    :param start_time: start of the custom time scope.
    :param end_time: end of the custom time scope.
    :param num: counter passed to and updated by ``get_page_info``.
    :param location: any value other than 0 adds ``&haslink=1`` to the URL.
    :return: the final value of ``num``.
    """
    if location != 0:
        link_part = "&haslink=1"
    else:
        link_part = ""

    for page_index in range(START_PAGE, START_PAGE + TOTAL_PAGE):
        url_parts = [
            'http://s.weibo.com/weibo/', keyword, '&scope=ori', link_part,
            '&timescope=custom:', start_time, ':', end_time,
            '&page=', str(page_index), '&rd=newTips',
        ]
        url = ''.join(url_parts)

        delay = random.randint(10, 30)
        wait_time(delay)

        response_text = u'' + session.get(url).text
        encoded_page = response_text.encode('utf-8')
        content_text = save_catch_page(encoded_page)

        page_ok = out_page(content_text)
        if not page_ok:
            return num
        num = get_page_info(content_text, session, location, num)
    return num
コード例 #6
0
ファイル: spider.py プロジェクト: wesavetheworld/weibospider
def get_info_history(session):
    """Collect posts for the first 63 seed coordinates and store them.

    ``fourtree`` turns each entry of ``QUERY_COORDINATE_LIST`` into a list of
    views (dicts with ``'coordinate'`` and ``'geo_range'``).  For each view,
    pages 1 to 19 are requested through ``get_weibo_by_coordinate`` and
    stored with ``save_data_by_db``; an empty page or a falsy store result
    moves on to the next view.

    :param session: HTTP session forwarded to the crawling helpers.
    """
    total_seeds = 63
    items_per_page = 50
    first_page, stop_page = 1, 20
    step_lat, step_lon = INTER_LAT, INTER_LON

    # The converted time is stringified and its last two characters are cut
    # off; the result serves as both the start and the end time.
    start_stamp = str(convert_time('2015', '1', '1', '0'))
    start_stamp = start_stamp[:-2]
    end_stamp = start_stamp

    for seed_no in range(0, total_seeds):
        seed = QUERY_COORDINATE_LIST[seed_no]
        for view in fourtree(session, seed, start_stamp, DISTANCE,
                             step_lat, step_lon):
            for current_page in range(first_page, stop_page):
                point = view['coordinate']
                reach = view['geo_range']
                wait_time(random.randint(10, 15))
                batch = get_weibo_by_coordinate(
                    session, point, start_stamp, end_stamp, reach, 0,
                    items_per_page, current_page, 0)
                stored = save_data_by_db(batch) if batch else None
                if not stored:
                    # Either nothing was returned or the store call reported
                    # a falsy result; both end paging for this view.
                    break
        wait_time(random.randint(10, 20))
コード例 #7
0
ファイル: spider.py プロジェクト: KoBenji/weibospider
def fourtree(session, coordinate, starttime, geo_range, inter_lat, inter_lon):
    """Build a list of search views by quartering areas that return data.

    The function asks ``get_weibo_by_coordinate`` for page 20 (50 items per
    page) around ``coordinate``.  An empty result makes the area a leaf.  A
    non-empty result is saved with ``save_data_by_db``; then the steps are
    halved, the radius is scaled by 0.65 (with a floor of 100) and the four
    diagonal neighbours are processed recursively.

    :param session: HTTP session forwarded to ``get_weibo_by_coordinate``.
    :param coordinate: dict with ``'latitude'`` and ``'longitude'`` values
        that ``float()`` can parse.
    :param starttime: start time forwarded to ``get_weibo_by_coordinate``.
    :param geo_range: search radius for this area.
    :param inter_lat: latitude step; halved before use for the children.
    :param inter_lon: longitude step; halved before use for the children.
    :return: list of ``{'coordinate': ..., 'geo_range': ...}`` dicts, the
        children's entries first and this area's entry last.
    """
    temp_time = random.randint(2, 5)
    wait_time(temp_time)
    info_list = get_weibo_by_coordinate(session, coordinate, starttime, 0, geo_range, 0, 50, 20, 0)

    leaf = [{'coordinate': coordinate, 'geo_range': geo_range}]
    if not info_list:
        print(0)
        return leaf

    print(geo_range)
    save_data_by_db(info_list)

    inter_lat = round(inter_lat / 2, 6)
    inter_lon = round(inter_lon / 2, 6)
    if inter_lat == 0 and inter_lon == 0:
        # Once both steps round to zero all children coincide with the
        # parent, so recursing further would never terminate.
        return leaf

    geo_range = int(round(float(geo_range) / 2 * 1.3))
    if geo_range < 100:
        geo_range = 100

    base_lat = float(coordinate['latitude'])
    base_lon = float(coordinate['longitude'])

    def shifted(lat_delta, lon_delta):
        # Latitude moves by the latitude step, longitude by the longitude
        # step (the previous code applied the wrong step on four lines).
        return {
            'latitude': str(round(base_lat + lat_delta, 6)),
            'longitude': str(round(base_lon + lon_delta, 6)),
        }

    children = [
        shifted(inter_lat, inter_lon),
        shifted(-inter_lat, inter_lon),
        shifted(inter_lat, -inter_lon),
        shifted(-inter_lat, -inter_lon),
    ]

    result = []
    for child in children:
        result.extend(fourtree(session, child, starttime, geo_range, inter_lat, inter_lon))
    # NOTE(review): this area's own entry is recorded with the reduced radius,
    # unchanged from the earlier behaviour - confirm that this is intended.
    result.append({'coordinate': coordinate, 'geo_range': geo_range})
    return result
コード例 #8
0
ファイル: weibo_api.py プロジェクト: liinnux/weibospider-1
def get_weibo_by_coordinate(session, coordinate, starttime, endtime, range=2000, sort=0, count=20, page=1, offset=0):
    """Query the nearby-timeline API around a coordinate, retrying on failure.

    A random entry of ``APP_SOURCE_LIST`` is used as the ``source`` for every
    attempt.  A 403 response marks that source as blocked and the request is
    retried after a wait; once every source has returned 403 the wait is
    15600.  Any exception raised during an attempt is logged and retried
    after a shorter wait, so the loop only ends with a non-403 response.

    :param session: HTTP session used for the GET request.
    :param coordinate: dict with string ``'latitude'`` and ``'longitude'``.
    :param starttime: value sent as ``starttime``.
    :param endtime: accepted for interface compatibility; it is not part of
        the request URL.
    :param range: value sent as ``range`` (the name shadows the builtin but
        is part of the public signature).
    :param sort: value sent as ``sort``.
    :param count: value sent as ``count``.
    :param page: value sent as ``page``.
    :param offset: value sent as ``offset``.
    :return: the value stored under ``'statuses'`` in the JSON response, or
        ``None`` when that key is absent or the body cannot be decoded.
    """
    # NOTE(review): a ``.year`` attribute is compared with a full datetime
    # object, which looks always-unequal, so the log appears to be
    # re-initialised on every call.  Left unchanged until the intended
    # rollover granularity is confirmed.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    num = 0  # number of failed attempts, used in the debug messages
    pd_403 = [0] * len(APP_SOURCE_LIST)  # 1 = that source has returned 403
    while True:
        try:
            app_id = random.randint(0, len(APP_SOURCE_LIST) - 1)
            url = "http://api.weibo.com/2/place/nearby_timeline.json?"
            url += "source=" + APP_SOURCE_LIST[app_id]
            url += "&lat=" + coordinate['latitude'] + "&long=" + coordinate['longitude']
            url += "&starttime=" + str(starttime) + "&range=" + str(range) + "&sort=" + str(sort)
            url += "&count=" + str(count) + "&page=" + str(page) + "&offset=" + str(offset)
            text = session.get(url)
            if text.status_code == 403:
                pd_403[app_id] = 1
                if all(pd_403):
                    sleep_time = 15600
                else:
                    sleep_time = random.randint(12, 30)
                wait_time(sleep_time)
                continue
            break
        except ConnectionError as exc:
            num += 1
            # Log the caught instance, not the exception class.
            lg_warning(exc)
            lg_debug('connect fail' + str(num))
            # Pass the number itself, as the 403 branch does.
            wait_time(random.randint(6, 10))
            continue
        except Exception as exc:
            num += 1
            print('Connection reset by peer error')
            lg_warning(exc)
            lg_debug('Connection reset by peer' + str(num))
            wait_time(random.randint(10, 20))
            continue

    text_list_dict = None
    try:
        text_dict = text.json()
        # ``in`` works on Python 2 and 3; dict.has_key() exists only on 2.
        if 'statuses' in text_dict:
            text_list_dict = text_dict['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(exc)
        lg_debug("get_weibo_by_coordinate: No Json")
    return text_list_dict
コード例 #9
0
def wblogin(username, password):
    """Perform the Sina SSO login sequence for a Weibo account.

    Works on the module-level ``session``: requests the pre-login data,
    posts the login form with the password encrypted by ``encrypt_passwd``,
    follows the redirect URL contained in the reply and returns the JSON
    object embedded in the last response.

    :param username: account name.
    :param password: plain-text password.
    :return: the decoded JSON object from the final response.
    :raises AttributeError: if an expected pattern is not found in a
        response (the regex result is ``None``).
    """
    # b64encode yields bytes; without decoding, "%s" would insert the
    # literal text "b'...'" into the URL on Python 3.
    su_for_url = base64.b64encode(username.encode('utf-8')).decode('ascii')
    resp = session.get('http://login.sina.com.cn/sso/prelogin.php?'
                       'entry=sso&callback=sinaSSOController.preloginCallBack&'
                       'su=%s&rsakt=mod&client=%s' % (su_for_url, WBCLIENT))
    wait_time(3)

    # Strip the JSONP wrapper and parse the JSON object once.
    pre_login_str = re.match(r'[^{]+({.+?})', resp.text).group(1)
    pre_login = json.loads(pre_login_str)
    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': base64.b64encode(
            requests.utils.quote(username).encode('utf-8')),
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(password, pre_login['pubkey'],
                             pre_login['servertime'], pre_login['nonce']),
        'rsakv': pre_login['rsakv'],
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }
    resp = session.post('http://login.sina.com.cn/sso/login.php?client=%s' %
                        WBCLIENT,
                        data=data)
    # The reply carries the next URL inside a replace("...") call.
    login_url = re.search(r'replace\([\"\']([^\'\"]+)[\"\']',
                          resp.text).group(1)
    resp = session.get(login_url)
    login_str = re.match(r'[^{]+({.+?}})', resp.text).group(1)

    return json.loads(login_str)
コード例 #10
0
def get_weibo_by_coordinate(session,
                            coordinate,
                            starttime,
                            endtime,
                            range=2000,
                            sort=0,
                            count=20,
                            page=1,
                            offset=0):
    """Fetch one page of the nearby timeline for a coordinate, with retries.

    Each attempt picks a random ``source`` from ``APP_SOURCE_LIST``.  On a
    403 response that source is marked and the request is retried after a
    wait (15600 once every source has been marked).  Exceptions raised during
    an attempt are logged and the attempt is repeated after a shorter wait;
    the loop ends only with a non-403 response.

    :param session: HTTP session used for the GET request.
    :param coordinate: dict with string ``'latitude'`` and ``'longitude'``.
    :param starttime: value sent as ``starttime``.
    :param endtime: kept for interface compatibility; not included in the
        request URL.
    :param range: value sent as ``range`` (shadows the builtin, but the name
        is part of the public signature).
    :param sort: value sent as ``sort``.
    :param count: value sent as ``count``.
    :param page: value sent as ``page``.
    :param offset: value sent as ``offset``.
    :return: the value under ``'statuses'`` in the JSON response, or ``None``
        if the key is missing or the body is not valid JSON.
    """
    # NOTE(review): this compares a ``.year`` attribute with a datetime
    # object, which looks always-unequal, so the log set-up seems to run on
    # every call.  Kept as is until the intended check is confirmed.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    failures = 0
    source_count = len(APP_SOURCE_LIST)
    blocked = [0] * source_count  # 1 = this source has answered 403
    while True:
        try:
            app_id = random.randint(0, source_count - 1)
            url = "http://api.weibo.com/2/place/nearby_timeline.json?"
            url += "source=" + APP_SOURCE_LIST[app_id]
            url += "&lat=" + coordinate['latitude']
            url += "&long=" + coordinate['longitude']
            url += "&starttime=" + str(starttime)
            url += "&range=" + str(range)
            url += "&sort=" + str(sort)
            url += "&count=" + str(count)
            url += "&page=" + str(page)
            url += "&offset=" + str(offset)
            text = session.get(url)
            if text.status_code == 403:
                blocked[app_id] = 1
                if all(blocked):
                    sleep_time = 15600
                else:
                    sleep_time = random.randint(12, 30)
                wait_time(sleep_time)
                continue
            break
        except ConnectionError as exc:
            failures += 1
            # Log the caught exception itself instead of the class object.
            lg_warning(exc)
            lg_debug('connect fail' + str(failures))
            # Pass an integer, consistent with the 403 branch.
            wait_time(random.randint(6, 10))
            continue
        except Exception as exc:
            failures += 1
            print('Connection reset by peer error')
            lg_warning(exc)
            lg_debug('Connection reset by peer' + str(failures))
            wait_time(random.randint(10, 20))
            continue

    text_list_dict = None
    try:
        text_dict = text.json()
        # dict.has_key() is gone on Python 3; ``in`` works on both versions.
        if 'statuses' in text_dict:
            text_list_dict = text_dict['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(exc)
        lg_debug("get_weibo_by_coordinate: No Json")
    return text_list_dict
コード例 #11
0
ファイル: main.py プロジェクト: KoBenji/weibospider
            starttime = convert_time('2015', '7', '1', '0')
            endtime = starttime
            starttime = str(starttime)
            starttime = starttime[:-2]
            info_list = get_weibo_by_coordinate(session, QUERY_COORDINATE_LIST[p_id],
                                                starttime, endtime, geo_range, 0, page_count, p, 0)
            save_data_by_db(info_list)
            if not info_list:
                continue
            else:
                pass
            length = len(info_list)
            cmpstr1 = info_list[length - 1]['mid']
            cmpstr2 = index_num[p_id]
            index_num[p_id] = info_list[0]['mid']
            while arbitrary_precision_compare(cmpstr1, cmpstr2) == 1:
                p += 1
                info_list = get_weibo_by_coordinate(session, QUERY_COORDINATE_LIST[p_id],
                                                    starttime, endtime, geo_range, 0, page_count, p, 0)
                save_data_by_db(info_list)
                if not info_list:
                    break
                else:
                    pass
                length = len(info_list)
                cmpstr1 = info_list[length - 1]['mid']
                if p >= 20:
                    break
        sleep_time = random.randint(10, 20)
        wait_time(sleep_time)