def get_weibo_by_id(m_id, session):
    """Fetch one status from the ``statuses/show.json`` endpoint.

    Args:
        m_id: Status identifier; concatenated into the query string, so it
            must be a string.
        session: Object exposing ``get(url)``; the value it returns must
            provide ``json()``.

    Returns:
        The decoded JSON body, or ``None`` when the body cannot be decoded.
    """
    url = ("http://api.weibo.com/2/statuses/show.json?source=" + APP_SOURCE
           + "&id=" + m_id)
    response = session.get(url)
    try:
        return response.json()
    except Exception as exc:
        # Log the caught instance; ``Exception.message`` is a class attribute
        # (and does not exist at all on Python 3).
        lg_warning(exc)
        lg_debug("get_weibo_by_id: No Json")
        return None
def request_limit(url, session, app_num):
    """Expand a short URL through the ``short_url/expand.json`` endpoint.

    Args:
        url: Short URL to expand; concatenated into the query string as-is.
        session: Object exposing ``get(url)``; the value it returns must
            provide a ``text`` attribute holding the JSON body.
        app_num: Index into ``APP_SOURCE_LIST`` selecting the app source key.

    Returns:
        The decoded JSON payload, or ``0`` when the payload carries an
        ``'error'`` key (logged as a request-limit error).
    """
    endpoint = ('http://api.weibo.com/2/short_url/expand.json?url_short='
                + url + '&source=' + APP_SOURCE_LIST[app_num])
    body = session.get(endpoint).text
    lg_debug(body)
    payload = json.loads(body)
    # ``in`` replaces ``dict.has_key`` (removed in Python 3); the isinstance
    # guard keeps a non-dict payload from being misread as an error.
    if isinstance(payload, dict) and 'error' in payload:
        lg_warning('error, out of request limit ERROR')
        return 0
    return payload
def get_weibo_by_id(m_id, session):
    """Fetch one status from the ``statuses/show.json`` endpoint.

    Args:
        m_id: Status identifier; concatenated into the query string, so it
            must be a string.
        session: Object exposing ``get(url)``; the value it returns must
            provide ``json()``.

    Returns:
        The decoded JSON body, or ``None`` when the body cannot be decoded.
    """
    url = ("http://api.weibo.com/2/statuses/show.json?source=" + APP_SOURCE
           + "&id=" + m_id)
    response = session.get(url)
    try:
        return response.json()
    except Exception as exc:
        # Log the caught instance; ``Exception.message`` is a class attribute
        # (and does not exist at all on Python 3).
        lg_warning(exc)
        lg_debug("get_weibo_by_id: No Json")
        return None
def get_weibo_by_coordinate(session, coordinate, starttime, endtime,
                            range=2000, sort=0, count=20, page=1, offset=0):
    """Query ``place/nearby_timeline.json`` around a coordinate, with retries.

    A source key is picked at random from ``APP_SOURCE_LIST`` for every
    attempt. A 403 response marks that key as rejected and the call is
    retried after a pause; once every key has been rejected the pause is
    15600 seconds. Connection problems and other exceptions are logged and
    retried indefinitely.

    Args:
        session: Object exposing ``get(url)``; the response must provide
            ``status_code`` and ``json()``.
        coordinate: Mapping with string values under ``'latitude'`` and
            ``'longitude'``.
        starttime: Sent as the ``starttime`` query parameter.
        endtime: Accepted for interface compatibility; it is not used when
            building the request.
        range, sort, count, page, offset: Sent as the query parameters of
            the same name.

    Returns:
        The value under ``'statuses'`` in the decoded response, or ``None``
        when the body is not JSON or has no ``'statuses'`` key.
    """
    # NOTE(review): this compares an int (``.year``) with a datetime object,
    # so it is always true and the log is re-initialised on every call. Left
    # as-is because the intended rotation granularity is not visible here.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    source_count = len(APP_SOURCE_LIST)
    failures = 0
    rejected = [False] * source_count
    while True:
        try:
            app_id = random.randint(0, source_count - 1)
            url = "http://api.weibo.com/2/place/nearby_timeline.json?"
            url += "source=" + APP_SOURCE_LIST[app_id]
            url += ("&lat=" + coordinate['latitude']
                    + "&long=" + coordinate['longitude'])
            url += ("&starttime=" + str(starttime) + "&range=" + str(range)
                    + "&sort=" + str(sort))
            url += ("&count=" + str(count) + "&page=" + str(page)
                    + "&offset=" + str(offset))
            response = session.get(url)
            if response.status_code == 403:
                rejected[app_id] = True
                if all(rejected):
                    sleep_time = 15600
                    # Start a fresh round after the long back-off; without
                    # this every later 403 would trigger the long sleep again.
                    rejected = [False] * source_count
                else:
                    sleep_time = random.randint(12, 30)
                wait_time(sleep_time)
                continue
            break
        except ConnectionError as exc:
            failures += 1
            lg_warning(exc)
            lg_debug('connect fail' + str(failures))
            sleep_time = random.randint(6, 10)
            # NOTE(review): a str is passed here but an int in the 403 branch;
            # confirm which type wait_time expects.
            wait_time(str(sleep_time))
            continue
        except Exception as exc:
            failures += 1
            print('Connection reset by peer error')
            lg_warning(exc)
            lg_debug('Connection reset by peer' + str(failures))
            sleep_time = random.randint(10, 20)
            wait_time(str(sleep_time))
            continue

    statuses = None
    try:
        payload = response.json()
        # ``in`` replaces ``dict.has_key``, which Python 3 removed.
        if 'statuses' in payload:
            statuses = payload['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(exc)
        lg_debug("get_weibo_by_coordinate: No Json")
    return statuses
def request_limit(url, session, app_num):
    """Expand a short URL through the ``short_url/expand.json`` endpoint.

    Args:
        url: Short URL to expand; concatenated into the query string as-is.
        session: Object exposing ``get(url)``; the value it returns must
            provide a ``text`` attribute holding the JSON body.
        app_num: Index into ``APP_SOURCE_LIST`` selecting the app source key.

    Returns:
        The decoded JSON payload, or ``0`` when the payload carries an
        ``'error'`` key (logged as a request-limit error).
    """
    endpoint = ('http://api.weibo.com/2/short_url/expand.json?url_short='
                + url + '&source=' + APP_SOURCE_LIST[app_num])
    body = session.get(endpoint).text
    lg_debug(body)
    payload = json.loads(body)
    # ``in`` replaces ``dict.has_key`` (removed in Python 3); the isinstance
    # guard keeps a non-dict payload from being misread as an error.
    if isinstance(payload, dict) and 'error' in payload:
        lg_warning('error, out of request limit ERROR')
        return 0
    return payload
def get_p(text):
    """Extract the inner content of every ``comment_txt`` paragraph.

    The markup is matched in its escaped form: quotes and the slash of the
    closing tag are each preceded by a literal backslash, as they appear
    when HTML is embedded in a JSON/JavaScript string.

    Args:
        text: String to scan.

    Returns:
        A list with one string per matched paragraph (opening tag and
        escaped closing tag removed), or ``0`` when nothing matches.
    """
    fragments = re.findall(r'<p class=\\"comment_txt\\"[\s\S]+?<\\/p>', text)
    if not fragments:
        lg_debug('log more time')
        return 0

    opening_tag = re.compile(r'<p class[\s\S]+?>')
    closing_len = len('<\\/p>')  # escaped closing tag: 5 characters
    contents = []
    for fragment in fragments:
        # Every fragment starts with '<p class' and contains '>', so the
        # opening-tag pattern always matches.
        start = opening_tag.match(fragment).end()
        contents.append(fragment[start:][:-closing_len])
    return contents
def get_id_list(text, session):
    """Collect the numeric ``mid`` values of ``<div mid=...>`` tags in *text*.

    The ``mid`` attribute is matched in escaped form, i.e. with a literal
    backslash before each quote around the digits.

    Args:
        text: Markup to scan.
        session: Unused; kept so existing callers keep working.

    Returns:
        List of id strings in order of appearance (possibly empty). A
        warning is logged for each ``<div mid=`` tag without a numeric id.
    """
    # Raw strings avoid the invalid ``\s`` escape of the plain literals.
    div_pattern = re.compile(r'<div mid=[\s\S]+?>')
    mid_pattern = re.compile(r'mid=\\"([0-9]+)\\"')

    id_list = []
    for div_tag in div_pattern.findall(text):
        found = mid_pattern.search(div_tag)
        if found:
            # The capture group replaces the manual [6:len-2] slicing.
            id_list.append(found.group(1))
        else:
            lg_warning('not match weibo id')

    timestamp = str(datetime.datetime.now())
    lg_debug(timestamp + '\n' + 'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
def get_weibo_by_ids(m_ids, session):
    """Fetch several statuses from the ``statuses/show_batch.json`` endpoint.

    Args:
        m_ids: Iterable of status id strings; sent comma-separated.
        session: Object exposing ``get(url)``; the value it returns must
            provide ``json()``.

    Returns:
        The value under ``'statuses'`` in the decoded response, or ``None``
        when the body cannot be decoded or lacks that key.
    """
    id_str = ",".join(m_ids)
    url = ("http://api.weibo.com/2/statuses/show_batch.json?source="
           + APP_SOURCE + "&ids=" + id_str)
    response = session.get(url)
    statuses = None
    try:
        statuses = response.json()['statuses']
        lg_debug('success catch the info_list')
    except Exception as exc:
        # Log the caught instance rather than the ``Exception`` class attribute.
        lg_warning(exc)
        lg_debug("get_weibo_by_ids: No Json")
    return statuses
def get_id_list(text, session):
    """Collect the numeric ``mid`` values of ``<div mid=...>`` tags in *text*.

    The ``mid`` attribute is matched in escaped form, i.e. with a literal
    backslash before each quote around the digits.

    Args:
        text: Markup to scan.
        session: Unused; kept so existing callers keep working.

    Returns:
        List of id strings in order of appearance (possibly empty). A
        warning is logged for each ``<div mid=`` tag without a numeric id.
    """
    # Raw strings avoid the invalid ``\s`` escape of the plain literals.
    div_pattern = re.compile(r'<div mid=[\s\S]+?>')
    mid_pattern = re.compile(r'mid=\\"([0-9]+)\\"')

    id_list = []
    for div_tag in div_pattern.findall(text):
        found = mid_pattern.search(div_tag)
        if found:
            # The capture group replaces the manual [6:len-2] slicing.
            id_list.append(found.group(1))
        else:
            lg_warning('not match weibo id')

    timestamp = str(datetime.datetime.now())
    lg_debug(timestamp + '\n' + 'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
def get_weibo_by_ids(m_ids, session):
    """Fetch several statuses from the ``statuses/show_batch.json`` endpoint.

    Args:
        m_ids: Iterable of status id strings; sent comma-separated.
        session: Object exposing ``get(url)``; the value it returns must
            provide ``json()``.

    Returns:
        The value under ``'statuses'`` in the decoded response, or ``None``
        when the body cannot be decoded or lacks that key.
    """
    id_str = ",".join(m_ids)
    url = ("http://api.weibo.com/2/statuses/show_batch.json?source="
           + APP_SOURCE + "&ids=" + id_str)
    response = session.get(url)
    statuses = None
    try:
        statuses = response.json()['statuses']
        lg_debug('success catch the info_list')
    except Exception as exc:
        # Log the caught instance rather than the ``Exception`` class attribute.
        lg_warning(exc)
        lg_debug("get_weibo_by_ids: No Json")
    return statuses
def save_data_by_db(get_list):
    """Insert each document of *get_list* into the configured MongoDB collection.

    Connection settings (``address``, ``port``, ``db_name``,
    ``collection_name``) are read from ``MONGO_DB``.

    Args:
        get_list: List of documents to insert; ``None`` or an empty list
            results in no database access.

    Returns:
        ``True`` if at least one document was inserted, ``False`` otherwise.
        Per-document failures are logged and do not stop the loop.
    """
    if not get_list:
        return False

    client = MongoClient(MONGO_DB['address'], MONGO_DB['port'])
    try:
        db = client.get_database(name=MONGO_DB['db_name'])
        # Stores the weibos obtained by Beijing geographic location.
        collection = db.get_collection(name=MONGO_DB['collection_name'])

        total = len(get_list)
        failed = 0
        saved_any = False
        for wd in get_list:
            try:
                collection.insert_one(wd)
            except Exception as exc:
                failed += 1
                lg_debug('False:mongodb save fail. num:' + str(failed))
                lg_debug('mongodb save error: ' + repr(exc))
            else:
                lg_debug('True:save success' + str(total))
                saved_any = True
        return saved_any
    finally:
        # Release the connection pool; the client was previously left open.
        client.close()
def save_data_by_db(get_list):
    """Insert each document of *get_list* into the configured MongoDB collection.

    Connection settings (``address``, ``port``, ``db_name``,
    ``collection_name``) are read from ``MONGO_DB``.

    Args:
        get_list: List of documents to insert; ``None`` or an empty list
            results in no database access.

    Returns:
        ``True`` if at least one document was inserted, ``False`` otherwise.
        Per-document failures are logged and do not stop the loop.
    """
    if not get_list:
        return False

    client = MongoClient(MONGO_DB['address'], MONGO_DB['port'])
    try:
        db = client.get_database(name=MONGO_DB['db_name'])
        # Stores the weibos obtained by Beijing geographic location.
        collection = db.get_collection(name=MONGO_DB['collection_name'])

        total = len(get_list)
        failed = 0
        saved_any = False
        for wd in get_list:
            try:
                collection.insert_one(wd)
            except Exception as exc:
                failed += 1
                lg_debug('False:mongodb save fail. num:' + str(failed))
                lg_debug('mongodb save error: ' + repr(exc))
            else:
                lg_debug('True:save success' + str(total))
                saved_any = True
        return saved_any
    finally:
        # Release the connection pool; the client was previously left open.
        client.close()
def short_to_long(url, session, app_num=0):
    """Expand a ``t.cn`` short URL and split the tail of the long URL on ``_``.

    Source keys are tried in order starting at *app_num*; each key for which
    ``request_limit`` returns ``0`` is skipped.

    Args:
        url: Candidate short URL. ``None``, empty, or any value not
            containing ``t.cn`` yields ``None`` without a request.
        session: Passed through to ``request_limit``.
        app_num: Index of the first entry of ``APP_SOURCE_LIST`` to try.

    Returns:
        A two-element list obtained by splitting the long URL (minus its
        first 25 characters) on ``_``, or ``None`` when the URL is not a
        ``t.cn`` link, every source key is exhausted, the response has an
        unexpected shape, or the split does not give exactly two parts.
    """
    if not url or u't.cn' not in url:
        return None

    # Bounding the loop by the list length also covers a starting index that
    # is already past the end, which previously raised IndexError.
    while app_num < len(APP_SOURCE_LIST):
        expanded = request_limit(url, session, app_num)
        if expanded != 0:
            break
        app_num += 1
    else:
        lg_debug('all of the id out limited')
        return None

    try:
        # NOTE(review): 25 presumably skips a fixed URL prefix; confirm
        # against a real ``url_long`` value.
        tail = expanded['urls'][0][u'url_long'][25:]
    except (KeyError, IndexError, TypeError) as exc:
        lg_debug('short_to_long: unexpected response: ' + repr(exc))
        return None

    coordinate = tail.split('_')
    if len(coordinate) == 2:
        return coordinate
    return None
def get_weibo_by_coordinate(session, coordinate, starttime, endtime,
                            range=2000, sort=0, count=20, page=1, offset=0):
    """Query ``place/nearby_timeline.json`` around a coordinate, with retries.

    A source key is picked at random from ``APP_SOURCE_LIST`` for every
    attempt. A 403 response marks that key as rejected and the call is
    retried after a pause; once every key has been rejected the pause is
    15600 seconds. Connection problems and other exceptions are logged and
    retried indefinitely.

    Args:
        session: Object exposing ``get(url)``; the response must provide
            ``status_code`` and ``json()``.
        coordinate: Mapping with string values under ``'latitude'`` and
            ``'longitude'``.
        starttime: Sent as the ``starttime`` query parameter.
        endtime: Accepted for interface compatibility; it is not used when
            building the request.
        range, sort, count, page, offset: Sent as the query parameters of
            the same name.

    Returns:
        The value under ``'statuses'`` in the decoded response, or ``None``
        when the body is not JSON or has no ``'statuses'`` key.
    """
    # NOTE(review): this compares an int (``.year``) with a datetime object,
    # so it is always true and the log is re-initialised on every call. Left
    # as-is because the intended rotation granularity is not visible here.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    source_count = len(APP_SOURCE_LIST)
    failures = 0
    rejected = [False] * source_count
    while True:
        try:
            app_id = random.randint(0, source_count - 1)
            url = "http://api.weibo.com/2/place/nearby_timeline.json?"
            url += "source=" + APP_SOURCE_LIST[app_id]
            url += ("&lat=" + coordinate['latitude']
                    + "&long=" + coordinate['longitude'])
            url += ("&starttime=" + str(starttime) + "&range=" + str(range)
                    + "&sort=" + str(sort))
            url += ("&count=" + str(count) + "&page=" + str(page)
                    + "&offset=" + str(offset))
            response = session.get(url)
            if response.status_code == 403:
                rejected[app_id] = True
                if all(rejected):
                    sleep_time = 15600
                    # Start a fresh round after the long back-off; without
                    # this every later 403 would trigger the long sleep again.
                    rejected = [False] * source_count
                else:
                    sleep_time = random.randint(12, 30)
                wait_time(sleep_time)
                continue
            break
        except ConnectionError as exc:
            failures += 1
            lg_warning(exc)
            lg_debug('connect fail' + str(failures))
            sleep_time = random.randint(6, 10)
            # NOTE(review): a str is passed here but an int in the 403 branch;
            # confirm which type wait_time expects.
            wait_time(str(sleep_time))
            continue
        except Exception as exc:
            failures += 1
            print('Connection reset by peer error')
            lg_warning(exc)
            lg_debug('Connection reset by peer' + str(failures))
            sleep_time = random.randint(10, 20)
            wait_time(str(sleep_time))
            continue

    statuses = None
    try:
        payload = response.json()
        # ``in`` replaces ``dict.has_key``, which Python 3 removed.
        if 'statuses' in payload:
            statuses = payload['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(exc)
        lg_debug("get_weibo_by_coordinate: No Json")
    return statuses