Exemple #1
0
def search_info_by_id(session,
                      keyword="",
                      start_time="",
                      end_time="",
                      num=1,
                      location=0):
    """Walk the Weibo search result pages for ``keyword`` and store the posts.

    For every page index in ``range(START_PAGE, START_PAGE + TOTAL_PAGE)``
    the search URL is fetched through ``session``, the page is passed to
    ``save_catch_page``, the post ids are extracted with ``get_id_list``,
    the posts are loaded with ``get_weibo_by_ids`` and handed to
    ``save_data_by_db``.  The loop stops early as soon as ``out_page``
    reports a falsy value for the fetched page.

    Args:
        session: object exposing ``get(url)`` whose result has a ``text``
            attribute (presumably a ``requests`` session -- confirm).
        keyword: search term, inserted into the URL as-is (no quoting is
            applied here).
        start_time: start of the ``timescope=custom:`` range, inserted as-is.
        end_time: end of the ``timescope=custom:`` range, inserted as-is.
        num: value returned to the caller; this function never modifies it.
        location: when not equal to 0, ``&haslink=1`` is added to the query.

    Returns:
        ``num``, unchanged.
    """
    # Function-scope import so the block does not depend on a file-level
    # ``import time`` being present.
    import time

    haslink = "&haslink=1" if location != 0 else ""
    for page in range(START_PAGE, START_PAGE + TOTAL_PAGE):
        url = ('http://s.weibo.com/weibo/' + keyword + '&scope=ori' + haslink +
               '&timescope=custom:' + start_time + ':' + end_time +
               '&page=' + str(page) + '&rd=newTips')
        # Random 10-30 s pause before each request.  time.sleep replaces the
        # former os.system('sleep N'), which spawned a shell and silently did
        # nothing on systems without a ``sleep`` command.
        time.sleep(random.randint(10, 30))
        # The page is handed on as UTF-8 encoded bytes.
        raw_page = session.get(url).text.encode('utf-8')
        content_text = save_catch_page(raw_page)
        if not out_page(content_text):
            return num
        id_list = get_id_list(content_text, session)
        info_list = get_weibo_by_ids(id_list, session)
        status = save_data_by_db(info_list)
        lg_info(status)
    return num
Exemple #2
0
def get_page_info(text, session, location, num):
    """Parse ``text`` with ``get_p`` and save whatever it yields.

    When ``get_p`` returns a value equal to 0 nothing is saved and ``num``
    is kept; otherwise ``num`` becomes the return value of
    ``save_search_data``.  The resulting number is logged and returned.
    """
    parsed = get_p(text)
    count = num if parsed == 0 else save_search_data(parsed, session, location, num)
    lg_info('record number: ' + str(count))
    return count
Exemple #3
0
def out_page(text):
    """Tell whether ``text`` still contains the "next page" pager markup.

    The page counts as valid when ``feed_list_page_morelist`` is followed,
    after at least one arbitrary character (newlines included), by
    ``page next S_txt1 S_line`` with an optional trailing ``1``.

    Args:
        text: page content to inspect.

    Returns:
        True when the pager markup is found; otherwise False, after logging
        'out_page: out of page limit'.
    """
    # Raw string: the former '[\s\S]+' relied on invalid string escapes,
    # which newer Python versions warn about.  The pattern text is unchanged.
    r_page = re.compile(
        r'feed_list_page_morelist[\s\S]+page next S_txt1 S_line1?')
    if r_page.search(text):
        return True
    lg_info('out_page: out of page limit')
    return False
Exemple #4
0
def get_page_info(text, session, location, num):
    """Extract the entries of ``text`` via ``get_p`` and persist them.

    A ``get_p`` result equal to 0 is skipped and ``num`` stays as passed in;
    any other result is forwarded to ``save_search_data`` whose return value
    replaces ``num``.  The final ``num`` is logged and returned.
    """
    entries = get_p(text)
    if entries != 0:
        num = save_search_data(entries, session, location, num)
    message = 'record number: ' + str(num)
    lg_info(message)
    return num
Exemple #5
0
def out_page(text):
    """Check that ``text`` contains the pager block with a "next" link.

    Matches ``feed_list_page_morelist``, then one or more arbitrary
    characters (including newlines), then ``page next S_txt1 S_line``
    optionally followed by ``1``.

    Args:
        text: page content to search.

    Returns:
        True if the pattern matches somewhere in ``text``.  Otherwise the
        message 'out_page: out of page limit' is logged and False is
        returned.
    """
    # Single raw-string pattern; the earlier non-raw '[\s\S]+' fragment used
    # invalid string escape sequences.  The regex itself is the same.
    pager_re = re.compile(
        r'feed_list_page_morelist[\s\S]+page next S_txt1 S_line1?')
    found = pager_re.search(text) is not None
    if not found:
        lg_info('out_page: out of page limit')
    return found
Exemple #6
0
def get_location(get_text, session):
    """Resolve the place link embedded in ``get_text`` to a coordinate.

    Looks for the first ``<a class=... icon_cd_place ... a>`` fragment,
    takes the first ``http:`` URL inside it that is followed by a space,
    drops the two trailing characters of that match, removes every
    backslash, and passes the URL to ``short_to_long`` together with
    ``session`` and a random integer in [0, 5].

    Args:
        get_text: page fragment to search.
        session: forwarded unchanged to ``short_to_long``.

    Returns:
        Whatever ``short_to_long`` returns (per the original comment the
        order is longitude, latitude), or None when no place anchor or no
        URL inside it is found.
    """
    anchor_match = re.search(r'<a class=[\S\s]+?icon_cd_place[\S\s]+?a>',
                             get_text)
    if not anchor_match:
        return None
    url_match = re.search(r'http:.+? ', anchor_match.group())
    if not url_match:
        # Anchor without a URL: previously crashed with AttributeError on
        # ``None.group()``; treat it like "no location".
        return None
    # [:-2] strips the trailing space and the character before it.
    url = url_match.group()[:-2].replace('\\', '')
    print(url + '\n')
    # Order is longitude, latitude.
    coordinate = short_to_long(url, session, random.randint(0, 5))
    lg_info(str(coordinate))
    return coordinate
Exemple #7
0
def get_location(get_text, session):
    """Find the place anchor in ``get_text`` and convert its link.

    Steps, as implemented below:
      1. find the first ``<a class=... icon_cd_place ... a>`` fragment;
      2. inside it, find the first ``http:`` URL terminated by a space;
      3. cut the last two characters of that match and delete backslashes;
      4. call ``short_to_long(url, session, <random int 0..5>)``.

    Args:
        get_text: text that may contain the place anchor.
        session: passed through to ``short_to_long``.

    Returns:
        The result of ``short_to_long`` (original comment: order is
        longitude, latitude), or None if step 1 or step 2 finds nothing.
    """
    # Raw strings avoid the invalid '\S' / '\s' string escapes of the
    # earlier non-raw patterns; the regexes themselves are unchanged.
    anchor_re = re.compile(r'<a class=[\S\s]+?icon_cd_place[\S\s]+?a>')
    link_re = re.compile(r'http:.+? ')

    anchor = anchor_re.search(get_text)
    link = link_re.search(anchor.group()) if anchor else None
    if link is None:
        # Covers both "no anchor" and "anchor without URL"; the latter used
        # to raise AttributeError.
        return None

    url = link.group()[:-2]
    url = url.replace('\\', '')
    print(url + '\n')
    # Order is longitude, latitude.
    coordinate = short_to_long(url, session, random.randint(0, 5))
    lg_info(str(coordinate))
    return coordinate
Exemple #8
0
def get_id_list(text, session):
    """Collect the ``mid`` values of all ``<div mid=...>`` tags in ``text``.

    Every ``<div mid=`` tag (up to its first ``>``) is searched for
    ``mid=\\"<digits>\\"`` -- the quotes are preceded by a literal
    backslash in the input.  The digit strings are collected in order of
    appearance; a tag without such an attribute triggers a warning log.

    Args:
        text: page content to scan.
        session: not used by this function; kept for the existing call
            signature.

    Returns:
        List of id strings (digits only).
    """
    # Raw strings: the former "<div mid=[\s\S]+?>" used invalid string
    # escapes.  Both regexes match the same text as before.
    div_re = re.compile(r'<div mid=[\s\S]+?>')
    # The capture group replaces the hand-counted slice temp[6:len(temp)-2].
    mid_re = re.compile(r'mid=\\"([0-9]+)\\"')

    id_list = []
    for div_tag in div_re.findall(text):
        found = mid_re.search(div_tag)
        if found:
            id_list.append(found.group(1))
        else:
            lg_warning('not match  weibo id')

    # Named ``timestamp`` so the local no longer shadows the ``time`` module.
    timestamp = str(datetime.datetime.now()).upper()
    lg_debug(timestamp + '\n' + 'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
Exemple #9
0
def get_id_list(text, session):
    """Extract the numeric ``mid`` of each ``<div mid=...>`` tag in ``text``.

    The tags are located first (each running to its first ``>``); within
    each one the attribute ``mid=\\"<digits>\\"`` (quotes escaped with a
    literal backslash) is looked up.  Tags lacking it are reported through
    ``lg_warning`` and skipped.

    Args:
        text: page content to scan.
        session: unused here; retained so existing callers keep working.

    Returns:
        The digit strings found, in document order.
    """
    # Raw-string patterns replace the earlier non-raw literals, one of which
    # contained the invalid string escape '\s'.  The regexes are equivalent.
    tag_pattern = re.compile(r'<div mid=[\s\S]+?>')
    # A capture group yields the digits directly instead of slicing off the
    # six leading and two trailing characters of the match.
    id_pattern = re.compile(r'mid=\\"([0-9]+)\\"')

    id_list = []
    for tag in tag_pattern.findall(text):
        id_match = id_pattern.search(tag)
        if id_match is None:
            lg_warning('not match  weibo id')
            continue
        id_list.append(id_match.group(1))

    # Local renamed from ``time`` to avoid shadowing the stdlib module name.
    now_text = str(datetime.datetime.now()).upper()
    lg_debug(now_text + '\n' + 'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
Exemple #10
0
def search_info_by_id(session, keyword="", start_time="", end_time="",  num=1, location=0):
    """Fetch Weibo search pages for ``keyword`` and save the posts they list.

    Iterates over ``range(START_PAGE, START_PAGE + TOTAL_PAGE)``.  Each page
    is downloaded through ``session``, passed to ``save_catch_page``, and
    checked with ``out_page``; a falsy check ends the crawl.  Otherwise the
    ids from ``get_id_list`` are resolved with ``get_weibo_by_ids`` and the
    result is stored with ``save_data_by_db``, whose status is logged.

    Args:
        session: object with a ``get(url)`` method returning something with
            a ``text`` attribute (presumably a ``requests`` session --
            confirm).
        keyword: search term; concatenated into the URL without quoting.
        start_time: lower bound of the ``timescope=custom:`` filter.
        end_time: upper bound of the ``timescope=custom:`` filter.
        num: returned as-is; never changed by this function.
        location: any value other than 0 adds ``&haslink=1`` to the URL.

    Returns:
        ``num``, unchanged.
    """
    # Imported locally so this block works without a file-level import.
    import time

    haslink = ""
    if location != 0:
        haslink = "&haslink=1"

    last_page = START_PAGE + TOTAL_PAGE
    for page_no in range(START_PAGE, last_page):
        url = ''.join([
            'http://s.weibo.com/weibo/', keyword, '&scope=ori', haslink,
            '&timescope=custom:', start_time, ':', end_time,
            '&page=', str(page_no), '&rd=newTips',
        ])
        # Wait 10-30 s (random) before each request.  time.sleep is used
        # instead of os.system('sleep N'): no shell is spawned and the pause
        # also happens where no ``sleep`` command exists.
        delay = random.randint(10, 30)
        time.sleep(delay)
        # Downstream helpers receive the page as UTF-8 bytes.
        page_bytes = session.get(url).text.encode('utf-8')
        content_text = save_catch_page(page_bytes)
        if not out_page(content_text):
            return num
        id_list = get_id_list(content_text, session)
        info_list = get_weibo_by_ids(id_list, session)
        lg_info(save_data_by_db(info_list))
    return num
Exemple #11
0
import json
import random
from datetime import datetime

from user.login import get_session, wblogin
from settings.settings import USERNAME, PASSWORD, START_NUM, QUERY_COORDINATE_LIST
from lib.base import init_xls, init_env
from lib.log import lg_debug, lg_info, lg_warning, init_log
from lib.lib_func import convert_time, wait_time, arbitrary_precision_compare
from official.weibo_api import get_weibo_by_coordinate
from save import save_data_by_db
from spider import get_info_history

if __name__ == '__main__':
    # Script entry point: prepare the environment, log in, then run the
    # history crawl with the resulting session.
    init_env()
    # Log the wblogin result as JSON; ensure_ascii=False keeps non-ASCII
    # characters unescaped in the log line.
    # NOTE(review): lg_info is called before init_log() -- confirm that
    # logging before initialization is intended.
    lg_info(json.dumps(wblogin(USERNAME, PASSWORD), ensure_ascii=False))
    init_log()
    # Not used by the active code visible here; only the commented-out
    # keyword search below references it.
    num = START_NUM
    init_xls()
    session = get_session()
    get_info_history(session)
# Search by keyword
#    for i in KEY_WORDS:
#        for j in range(0, 3):
#            start_time = '2015-5-'+str(3*j+1)
#            end_time = '2015-5-'+str(3*j+3)
#            num = search_info_by_id(session, KEY_WORDS[0], start_time, end_time, num, 0)

# Iterate by time period
#    for y in range(2015, 2016):
#        for m in range(1, 2):