def search_info_by_id(session, keyword="", start_time="", end_time="", num=1, location=0):
    # Crawl Weibo's search result pages for `keyword` within the given time window
    # (original posts only), extract the post ids on each page and persist them.
    content_text = str()
    id_text = str()
    haslink = str()
    if location != 0:
        # &haslink=1 limits results to posts that contain a link.
        haslink = "&haslink=1"
    for i in range(START_PAGE, START_PAGE + TOTAL_PAGE):
        url = ('http://s.weibo.com/weibo/' + keyword + '&scope=ori' + haslink
               + '&timescope=custom:' + start_time + ':' + end_time
               + '&page=' + str(i) + '&rd=newTips')
        # Random pause between requests (shelled out to `sleep`) to avoid rate limiting.
        sleep_time = random.randint(10, 30)
        os_sleep = 'sleep ' + str(sleep_time)
        os.system(os_sleep)
        get_text = session.get(url).text
        get_text = u'' + get_text
        get_text = get_text.encode('utf-8')
        content_text = save_catch_page(get_text)
        pd = out_page(content_text)
        if not pd:
            # No further result pages; stop early.
            return num
        id_list = get_id_list(content_text, session)
        info_list = get_weibo_by_ids(id_list, session)
        status = save_data_by_db(info_list)
        lg_info(status)
    return num
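
# For reference, a minimal sketch of the URL that the loop above assembles. The
# keyword and dates below are invented; only the query-string layout mirrors
# search_info_by_id (scope=ori keeps original posts, timescope=custom:START:END
# bounds the time window, page selects the result page).
def _demo_search_url(page=1):
    keyword, start_time, end_time = 'example', '2015-5-1', '2015-5-3'
    return ('http://s.weibo.com/weibo/' + keyword + '&scope=ori'
            + '&timescope=custom:' + start_time + ':' + end_time
            + '&page=' + str(page) + '&rd=newTips')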
def get_page_info(text, session, location, num):
    # Store whatever records get_p finds on this page; `num` is the running record count.
    get_list = get_p(text)
    if get_list != 0:
        num = save_search_data(get_list, session, location, num)
        lg_info('record number: ' + str(num))
    return num
def out_page(text):
    # Return True while the pagination block still contains a "next page" link,
    # False once the last result page has been reached.
    r_page = re.compile(u'feed_list_page_morelist' + '[\s\S]+' + u'page next S_txt1 S_line1?')
    temp = r_page.search(text)
    if temp:
        return True
    else:
        lg_info('out_page: out of page limit')
        return False
def get_location(get_text, session):
    # Pull the place link (icon_cd_place anchor) out of a post's HTML, unescape
    # its short URL and resolve it to coordinates via short_to_long.
    location_re = re.compile(u'''<a class=[\S\s]+?icon_cd_place[\S\s]+?a>''')
    temp_l = location_re.search(get_text)
    if temp_l:
        origin_url = temp_l.group()
        location_re = re.compile(u'''http:.+? ''')
        url = location_re.search(origin_url).group()[:-2]
        url = url.replace('\\', '')
        print(url + '\n')
        coordinate = short_to_long(url, session, random.randint(0, 5))  # order: longitude, latitude
        lg_info(str(coordinate))
        return coordinate
    else:
        return None
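
# A minimal sketch of the place-link extraction above, under the assumption that
# the post HTML arrives as an escaped JS string (quotes as \", slashes as \/).
# The anchor markup and the t.cn short URL are invented for illustration; only
# the regexes and the slicing mirror get_location.
def _demo_extract_place_url():
    import re  # local import so the sketch does not depend on module-level imports
    sample = ('<a class=\\"W_btn_b\\" href=\\"http:\\/\\/t.cn\\/abcdefg\\" >'
              '<i class=\\"W_ficon ficon_cd_place\\">place<\\/i><\\/a>')
    anchor = re.compile(u'''<a class=[\S\s]+?icon_cd_place[\S\s]+?a>''').search(sample).group()
    url = re.compile(u'''http:.+? ''').search(anchor).group()[:-2]
    print(url.replace('\\', ''))  # -> http://t.cn/abcdefg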
def get_id_list(text, session):
    # Collect the mid (message id) of every post on a search result page.
    re_id = re.compile("<div mid=[\s\S]+?>")
    origin_id_list = re_id.findall(text)
    # The page is delivered as an escaped JS string, so the quotes around the
    # mid value appear as \" in the raw text.
    re_id = re.compile("mid=\\\\\"[0-9]+\\\\\"")
    id_list = list()
    for i in origin_id_list:
        temp = re_id.search(i)
        if temp:
            temp = temp.group()
            # Strip the leading 'mid=\"' and the trailing '\"'.
            temp = temp[6:len(temp) - 2]
            id_list.append(temp)
        else:
            lg_warning('not match weibo id')
    time = str(datetime.datetime.now()).upper()
    lg_debug(time + '\n' + 'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
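
# A minimal sketch of how the two regexes above interact, kept standalone so it
# can be run in isolation. The sample fragment and the mid value are made up;
# they only imitate the escaped <div mid=\"...\"> markup that the search payload
# is assumed to contain.
def _demo_extract_mid():
    import re  # local import so the sketch does not depend on module-level imports
    sample = '<div mid=\\"3846123456789012\\" class=\\"WB_feed_type\\">'
    block = re.compile("<div mid=[\s\S]+?>").search(sample).group()
    mid = re.compile("mid=\\\\\"[0-9]+\\\\\"").search(block).group()
    # Same slicing as get_id_list: drop 'mid=\"' in front and '\"' at the end.
    print(mid[6:len(mid) - 2])  # -> 3846123456789012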
import json
import random
from datetime import datetime

from user.login import get_session, wblogin
from settings.settings import USERNAME, PASSWORD, START_NUM, QUERY_COORDINATE_LIST
from lib.base import init_xls, init_env
from lib.log import lg_debug, lg_info, lg_warning, init_log
from lib.lib_func import convert_time, wait_time, arbitrary_precision_compare
from official.weibo_api import get_weibo_by_coordinate
from save import save_data_by_db
from spider import get_info_history

if __name__ == '__main__':
    init_env()
    init_log()  # initialise logging before the first lg_info call
    lg_info(json.dumps(wblogin(USERNAME, PASSWORD), ensure_ascii=False))
    num = START_NUM
    init_xls()
    session = get_session()
    get_info_history(session)

    # Search by keyword
    # for i in KEY_WORDS:
    #     for j in range(0, 3):
    #         start_time = '2015-5-' + str(3 * j + 1)
    #         end_time = '2015-5-' + str(3 * j + 3)
    #         num = search_info_by_id(session, KEY_WORDS[0], start_time, end_time, num, 0)

    # Iterate by time window
    # for y in range(2015, 2016):
    #     for m in range(1, 2):