コード例 #1
0
ファイル: weibo_api.py プロジェクト: liinnux/weibospider-1
def get_weibo_by_id(m_id, session):
    """Fetch one status from the Weibo ``statuses/show`` endpoint.

    Args:
        m_id: Status id as a string; it is concatenated into the query
            string unchanged.
        session: Object exposing ``get(url)``; the returned response must
            offer ``json()``.

    Returns:
        The decoded JSON body, or ``None`` when decoding the body raises.
    """
    url = "http://api.weibo.com/2/statuses/show.json?source="+APP_SOURCE+"&id="+m_id
    response = session.get(url)
    try:
        return response.json()
    except Exception as exc:
        # Log the caught instance; ``Exception.message`` would only reference
        # an attribute of the class and says nothing about the failure.
        lg_warning(repr(exc))
        lg_debug("get_weibo_by_id: No Json")
        return None
コード例 #2
0
ファイル: weibo_api.py プロジェクト: liinnux/weibospider-1
def request_limit(url, session, app_num):
    """Expand a short URL through the Weibo ``short_url/expand`` endpoint.

    Args:
        url: Short URL, concatenated into the query string unchanged.
        session: Object exposing ``get(url)``; the response must offer
            ``text``.
        app_num: Index into ``APP_SOURCE_LIST`` selecting the ``source`` key.

    Returns:
        ``0`` when the decoded JSON contains an ``'error'`` key (logged as
        the request limit being exceeded), otherwise the decoded JSON.

    Raises:
        ValueError: If the response body is not valid JSON (raised by
            ``json.loads``).
    """
    response = session.get('http://api.weibo.com/2/short_url/expand.json?url_short='+url+'&source='+APP_SOURCE_LIST[app_num])
    body = response.text
    lg_debug(body)
    decoded = json.loads(body)
    # ``in`` replaces ``dict.has_key``, which no longer exists on Python 3.
    if 'error' in decoded:
        lg_warning('error, out of request limit ERROR')
        return 0
    return decoded
コード例 #3
0
def get_weibo_by_id(m_id, session):
    """Request a single status via the Weibo ``statuses/show`` endpoint.

    Args:
        m_id: Status id as a string; appended to the query string as-is.
        session: Object exposing ``get(url)`` whose response offers
            ``json()``.

    Returns:
        The decoded JSON body, or ``None`` if ``json()`` raises.
    """
    url = "http://api.weibo.com/2/statuses/show.json?source=" + APP_SOURCE + "&id=" + m_id
    response = session.get(url)
    try:
        text_dict = response.json()
    except Exception as exc:
        # Report the actual exception instance rather than the
        # ``Exception.message`` class attribute.
        lg_warning(repr(exc))
        lg_debug("get_weibo_by_id: No Json")
        text_dict = None
    return text_dict
コード例 #4
0
ファイル: weibo_api.py プロジェクト: liinnux/weibospider-1
def get_weibo_by_coordinate(session, coordinate, starttime, endtime, range=2000, sort=0, count=20, page=1, offset=0):
    """Query the Weibo ``place/nearby_timeline`` endpoint around a coordinate.

    Each attempt uses a randomly chosen entry of ``APP_SOURCE_LIST`` as the
    ``source`` key. The request is retried without an upper bound until a
    response whose status is not 403 is obtained.

    Args:
        session: Object exposing ``get(url)``; the response must offer
            ``status_code`` and ``json()``.
        coordinate: Mapping with string values under ``'latitude'`` and
            ``'longitude'`` (concatenated into the URL unchanged).
        starttime: Sent as the ``starttime`` query parameter.
        endtime: Accepted but not sent in the request.
            NOTE(review): possibly an omission -- confirm against the API.
        range: Sent as the ``range`` query parameter.
        sort: Sent as the ``sort`` query parameter.
        count: Sent as the ``count`` query parameter.
        page: Sent as the ``page`` query parameter.
        offset: Sent as the ``offset`` query parameter.

    Returns:
        The value stored under ``'statuses'`` in the decoded JSON body, or
        ``None`` when the key is absent or the body cannot be processed.

    Raises:
        KeyError, TypeError: If ``coordinate`` lacks a required key or holds
            non-string values. The query string is built before the retry
            loop so that such errors surface instead of being retried forever.
    """
    # NOTE(review): this compares a ``year`` attribute with a full datetime
    # object, which looks like it can never be equal, so the log is
    # re-initialised on every call. Left unchanged because the intended
    # rotation granularity is not visible from here.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    # Everything except the ``source`` key is identical for every attempt.
    endpoint = "http://api.weibo.com/2/place/nearby_timeline.json?"
    query_tail = "&lat="+coordinate['latitude']+"&long="+coordinate['longitude']
    query_tail += "&starttime="+str(starttime)+"&range="+str(range)+"&sort="+str(sort)
    query_tail += "&count="+str(count)+"&page="+str(page)+"&offset="+str(offset)

    failures = 0
    app_total = len(APP_SOURCE_LIST)
    # One flag per app key, set once that key has answered 403.
    pd_403 = [0] * app_total
    end_403 = [1] * app_total
    while True:
        try:
            app_id = random.randint(0, app_total - 1)
            url = endpoint + "source=" + APP_SOURCE_LIST[app_id] + query_tail
            response = session.get(url)
            if response.status_code == 403:
                pd_403[app_id] = 1
                if pd_403 == end_403:
                    # Every key has been rejected: back off for a long time.
                    sleep_time = 15600
                else:
                    sleep_time = random.randint(12, 30)
                wait_time(sleep_time)
                continue
            break
        except ConnectionError as exc:
            failures += 1
            lg_warning(repr(exc))
            lg_debug('connect fail'+str(failures))
            sleep_time = random.randint(6, 10)
            # NOTE(review): a string is passed here while the 403 branch
            # passes an int -- confirm which form ``wait_time`` expects.
            wait_time(str(sleep_time))
            continue
        except Exception as exc:
            failures += 1
            print('Connection reset by peer error')
            lg_warning(repr(exc))
            lg_debug('Connection reset by peer'+str(failures))
            sleep_time = random.randint(10, 20)
            wait_time(str(sleep_time))
            continue

    text_list_dict = None
    try:
        text_dict = response.json()
        # ``in`` replaces ``dict.has_key``, which no longer exists on Python 3.
        if 'statuses' in text_dict:
            text_list_dict = text_dict['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(repr(exc))
        lg_debug("get_weibo_by_coordinate: No Json")
    return text_list_dict
コード例 #5
0
def request_limit(url, session, app_num):
    """Resolve a short URL with the Weibo ``short_url/expand`` endpoint.

    Args:
        url: Short URL, appended to the query string as-is.
        session: Object exposing ``get(url)`` whose response offers ``text``.
        app_num: Index into ``APP_SOURCE_LIST`` for the ``source`` key.

    Returns:
        ``0`` if the decoded JSON has an ``'error'`` key (logged as the
        request limit being exceeded); otherwise the decoded JSON.

    Raises:
        ValueError: If the body is not valid JSON (from ``json.loads``).
    """
    temp_s = session.get(
        'http://api.weibo.com/2/short_url/expand.json?url_short=' + url +
        '&source=' + APP_SOURCE_LIST[app_num])
    raw_body = temp_s.text
    lg_debug(raw_body)
    parsed = json.loads(raw_body)
    # ``dict.has_key`` was removed in Python 3; membership testing works on
    # both major versions.
    if 'error' in parsed:
        lg_warning('error, out of request limit ERROR')
        return 0
    return parsed
コード例 #6
0
ファイル: spider.py プロジェクト: KoBenji/weibospider
def get_p(text):
    """Extract the inner text of every escaped ``comment_txt`` paragraph.

    Args:
        text: String searched for ``<p class=\\"comment_txt\\" ...>...<\\/p>``
            fragments (quotes and the closing slash are backslash-escaped).

    Returns:
        A list with one entry per fragment, each stripped of its opening tag
        and of the trailing five-character ``<\\/p>``; ``0`` when no fragment
        is found.
    """
    paragraphs = re.findall(r'<p class=\\"comment_txt\\"[\s\S]+?<\\/p>', text)
    if not paragraphs:
        lg_debug('log more time')
        return 0

    opening_tag = re.compile(r'<p class[\s\S]+?>')
    stripped = []
    for paragraph in paragraphs:
        # The match is anchored at position 0, so its end offset equals the
        # length of the opening tag.
        tag_end = opening_tag.match(paragraph).end()
        stripped.append(paragraph[tag_end:][:-5])
    return stripped
コード例 #7
0
ファイル: spider.py プロジェクト: wesavetheworld/weibospider
def get_p(text):
    """Return the bodies of all escaped ``comment_txt`` paragraphs in *text*.

    Args:
        text: String containing fragments of the form
            ``<p class=\\"comment_txt\\" ...>...<\\/p>``.

    Returns:
        List of fragment bodies (opening tag and the five-character closing
        ``<\\/p>`` removed), or ``0`` if *text* contains no such fragment.
    """
    fragments = re.findall(r'<p class=\\"comment_txt\\"[\s\S]+?<\\/p>',
                           text)
    if len(fragments) == 0:
        lg_debug('log more time')
        return 0

    bodies = []
    for fragment in fragments:
        head = re.match(r'<p class[\s\S]+?>', fragment).group()
        without_head = fragment[len(head):]
        # Drop the trailing ``<\/p>`` (five characters).
        bodies.append(without_head[:-5])
    return bodies
コード例 #8
0
ファイル: spider.py プロジェクト: wesavetheworld/weibospider
def get_id_list(text, session):
    """Collect the numeric ``mid`` values of ``<div mid=...>`` tags in *text*.

    Args:
        text: String to scan; ids are expected in the escaped form
            ``mid=\\"123\\"``.
        session: Unused; kept so existing callers keep working.

    Returns:
        List of id strings (digits only) in order of appearance. Tags whose
        ``mid`` does not have the escaped numeric form are skipped with a
        warning.
    """
    # Raw strings avoid the invalid ``\s`` escape of the former plain
    # literals; the compiled patterns are unchanged.
    div_pattern = re.compile(r"<div mid=[\s\S]+?>")
    # The capture group yields the digits directly, replacing slicing by
    # fixed offsets.
    id_pattern = re.compile(r'mid=\\"([0-9]+)\\"')

    id_list = []
    for div_tag in div_pattern.findall(text):
        found = id_pattern.search(div_tag)
        if found:
            id_list.append(found.group(1))
        else:
            lg_warning('not match  weibo id')

    timestamp = str(datetime.datetime.now())
    lg_debug(timestamp + '\n' + 'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
コード例 #9
0
ファイル: weibo_api.py プロジェクト: liinnux/weibospider-1
def get_weibo_by_ids(m_ids, session):
    """Fetch several statuses via the Weibo ``statuses/show_batch`` endpoint.

    Args:
        m_ids: Iterable of status-id strings; they are sent comma-separated
            in the ``ids`` query parameter.
        session: Object exposing ``get(url)``; the response must offer
            ``json()``.

    Returns:
        The value under ``'statuses'`` in the decoded JSON body, or ``None``
        when the body cannot be decoded or lacks that key.
    """
    url = "http://api.weibo.com/2/statuses/show_batch.json?source="+APP_SOURCE+"&ids="+",".join(m_ids)
    response = session.get(url)
    try:
        statuses = response.json()['statuses']
    except Exception as exc:
        # Log the caught instance; ``Exception.message`` only references an
        # attribute of the class.
        lg_warning(repr(exc))
        lg_debug("get_weibo_by_ids: No Json")
        return None
    lg_debug('success catch the info_list')
    return statuses
コード例 #10
0
ファイル: spider.py プロジェクト: KoBenji/weibospider
def get_id_list(text, session):
    """Extract numeric ``mid`` values from the ``<div mid=...>`` tags of *text*.

    Args:
        text: String to scan; each id is expected as ``mid=\\"<digits>\\"``
            (backslash-escaped quotes).
        session: Not used by this function; retained for call compatibility.

    Returns:
        The ids as a list of digit strings, in document order. A warning is
        logged for every ``<div mid=...>`` tag without such an id.
    """
    # Written as raw strings so ``\s`` is not treated as an (invalid) string
    # escape; the regex semantics are the same as before.
    tag_re = re.compile(r"<div mid=[\s\S]+?>")
    mid_re = re.compile(r'mid=\\"([0-9]+)\\"')

    id_list = []
    for tag in tag_re.findall(text):
        hit = mid_re.search(tag)
        if hit is None:
            lg_warning('not match  weibo id')
            continue
        # Group 1 holds just the digits between the escaped quotes.
        id_list.append(hit.group(1))

    now_text = str(datetime.datetime.now())
    lg_debug(now_text+'\n'+'ID_LIST Here')
    lg_info('get_id_list: ' + str(id_list))
    return id_list
コード例 #11
0
def get_weibo_by_ids(m_ids, session):
    """Request a batch of statuses from ``statuses/show_batch``.

    Args:
        m_ids: Iterable of status-id strings, joined with commas for the
            ``ids`` query parameter.
        session: Object exposing ``get(url)`` whose response offers
            ``json()``.

    Returns:
        The ``'statuses'`` entry of the decoded JSON body, or ``None`` if the
        body cannot be decoded or has no such entry.
    """
    id_str = ",".join(m_ids)
    url = "http://api.weibo.com/2/statuses/show_batch.json?source=" + APP_SOURCE + "&ids=" + id_str
    response = session.get(url)
    text_list_dict = None
    try:
        text_list_dict = response.json()['statuses']
    except Exception as exc:
        # Use the caught instance; the former ``Exception.message`` referred
        # to the class, not to the error that occurred.
        lg_warning(repr(exc))
        lg_debug("get_weibo_by_ids: No Json")
    else:
        lg_debug('success catch the info_list')
    return text_list_dict
コード例 #12
0
ファイル: save.py プロジェクト: KoBenji/weibospider
def save_data_by_db(get_list):
    """Insert every document of *get_list* into the configured collection.

    Connection settings come from the ``MONGO_DB`` mapping (keys
    ``'address'``, ``'port'``, ``'db_name'`` and ``'collection_name'``).

    Args:
        get_list: Documents handed one by one to ``insert_one``; any falsy
            value is treated as an empty list.

    Returns:
        ``True`` if at least one document was stored, otherwise ``False``.
    """
    client = MongoClient(MONGO_DB['address'], MONGO_DB['port'])
    num = 0
    pd = False
    try:
        db = client.get_database(name=MONGO_DB['db_name'])
        # Store the weibo posts obtained for the Beijing geographic location.
        collection = db.get_collection(name=MONGO_DB['collection_name'])
        if not get_list:
            get_list = list()

        for wd in get_list:
            try:
                collection.insert_one(wd)
                lg_debug('True:save success'+str(len(get_list)))
                pd = True
            except Exception as exc:
                # Keep going with the remaining documents, but record why
                # this one failed instead of dropping the error silently.
                num += 1
                lg_debug('mongodb insert error: '+repr(exc))
    finally:
        # Release the connection even if setup or iteration raised.
        client.close()
    # Logged unconditionally; a count of 0 means no insert failed.
    lg_debug('False:mongodb save fail. num:'+str(num))
    return pd
コード例 #13
0
ファイル: save.py プロジェクト: wesavetheworld/weibospider
def save_data_by_db(get_list):
    """Store the documents of *get_list* in the MongoDB collection from ``MONGO_DB``.

    ``MONGO_DB`` supplies ``'address'``, ``'port'``, ``'db_name'`` and
    ``'collection_name'``.

    Args:
        get_list: Documents passed individually to ``insert_one``; a falsy
            value is replaced by an empty list.

    Returns:
        ``True`` when at least one insert succeeded, ``False`` otherwise.
    """
    client = MongoClient(MONGO_DB['address'], MONGO_DB['port'])
    failed = 0
    saved_any = False
    try:
        db = client.get_database(name=MONGO_DB['db_name'])
        # Store the weibo posts obtained for the Beijing geographic location.
        collection = db.get_collection(name=MONGO_DB['collection_name'])
        documents = get_list if get_list else list()

        for wd in documents:
            try:
                collection.insert_one(wd)
                lg_debug('True:save success' + str(len(documents)))
                saved_any = True
            except Exception as exc:
                # Best effort: count the failure and log its cause, then
                # continue with the next document.
                failed += 1
                lg_debug('mongodb insert error: ' + repr(exc))
    finally:
        # Always close the client so the connection is not leaked.
        client.close()
    # Emitted on every call; ``num:0`` indicates that nothing failed.
    lg_debug('False:mongodb save fail. num:' + str(failed))
    return saved_any
コード例 #14
0
ファイル: weibo_api.py プロジェクト: liinnux/weibospider-1
def short_to_long(url, session, app_num=0):
    """Expand a ``t.cn`` short URL and split the tail of its long form.

    ``request_limit`` is called with successive ``app_num`` values until it
    returns something other than ``0`` or ``APP_SOURCE_LIST`` is exhausted.

    Args:
        url: URL to expand; anything not containing ``t.cn`` is rejected.
        session: Passed through to ``request_limit``.
        app_num: Index of the first app key to try.

    Returns:
        The two-element list obtained by splitting the long URL (minus its
        first 25 characters) on ``'_'``; ``None`` if *url* is not a ``t.cn``
        link, every app key is limited, or the split does not give two parts.
    """
    if u't.cn' not in url:
        return None

    while True:
        expanded = request_limit(url, session, app_num)
        if expanded == 0:
            # This key is rate limited; move on to the next one.
            app_num += 1
            if app_num >= len(APP_SOURCE_LIST):
                lg_debug('all of the id out limited')
                return None
            continue
        break

    # presumably the first 25 characters are a fixed URL prefix -- TODO confirm
    tail = expanded['urls'][0][u'url_long'][25:]
    parts = tail.split('_')
    return parts if len(parts) == 2 else None
コード例 #15
0
def short_to_long(url, session, app_num=0):
    """Resolve a ``t.cn`` link and return the two ``_``-separated tail parts.

    Args:
        url: Candidate short URL; must contain ``t.cn`` to be processed.
        session: Forwarded to ``request_limit``.
        app_num: Starting index into ``APP_SOURCE_LIST``; incremented each
            time ``request_limit`` answers ``0``.

    Returns:
        A list of two strings taken from the long URL after dropping its
        first 25 characters and splitting on ``'_'``. ``None`` when *url* has
        no ``t.cn``, when all app keys are limited, or when the split does
        not produce exactly two pieces.
    """
    if not (u't.cn' in url):
        return None

    result = request_limit(url, session, app_num)
    while result == 0:
        app_num += 1
        if app_num >= len(APP_SOURCE_LIST):
            lg_debug('all of the id out limited')
            return None
        result = request_limit(url, session, app_num)

    long_url = result['urls'][0][u'url_long']
    # presumably skips a fixed-length URL prefix -- TODO confirm the 25
    pieces = long_url[25:].split('_')
    if len(pieces) != 2:
        return None
    return pieces
コード例 #16
0
def get_weibo_by_coordinate(session,
                            coordinate,
                            starttime,
                            endtime,
                            range=2000,
                            sort=0,
                            count=20,
                            page=1,
                            offset=0):
    """Fetch nearby statuses from the Weibo ``place/nearby_timeline`` endpoint.

    A random entry of ``APP_SOURCE_LIST`` supplies the ``source`` key for
    each attempt; attempts repeat indefinitely until the response status is
    something other than 403.

    Args:
        session: Object exposing ``get(url)``; the response must offer
            ``status_code`` and ``json()``.
        coordinate: Mapping whose ``'latitude'`` and ``'longitude'`` values
            are strings (they are concatenated into the URL as-is).
        starttime: Value of the ``starttime`` query parameter.
        endtime: Accepted but never sent.
            NOTE(review): may be an omission -- confirm against the API.
        range: Value of the ``range`` query parameter.
        sort: Value of the ``sort`` query parameter.
        count: Value of the ``count`` query parameter.
        page: Value of the ``page`` query parameter.
        offset: Value of the ``offset`` query parameter.

    Returns:
        The ``'statuses'`` entry of the decoded JSON body, or ``None`` if the
        entry is missing or the body cannot be processed.

    Raises:
        KeyError, TypeError: When ``coordinate`` misses a key or contains
            non-string values. The fixed part of the query is assembled
            before the retry loop so these errors are not retried forever.
    """
    # NOTE(review): a ``year`` attribute is compared with a whole datetime
    # object here, which looks like it is always unequal, so logging is
    # re-initialised on each call. Not changed because the intended rotation
    # period cannot be determined from this function.
    if log_date.log_date.year != datetime.datetime.now():
        log_date.change_log_date()
        init_log()

    # Only the ``source`` key changes between attempts; build the rest once.
    base_url = "http://api.weibo.com/2/place/nearby_timeline.json?"
    fixed_query = "&lat=" + coordinate['latitude'] + "&long=" + coordinate[
        'longitude']
    fixed_query += "&starttime=" + str(starttime) + "&range=" + str(
        range) + "&sort=" + str(sort)
    fixed_query += "&count=" + str(count) + "&page=" + str(
        page) + "&offset=" + str(offset)

    num = 0
    source_count = len(APP_SOURCE_LIST)
    # Per-key marker, set to 1 after that key has been answered with 403.
    pd_403 = [0] * source_count
    end_403 = [1] * source_count
    while True:
        try:
            app_id = random.randint(0, source_count - 1)
            url = base_url + "source=" + APP_SOURCE_LIST[app_id] + fixed_query
            response = session.get(url)
            if response.status_code == 403:
                pd_403[app_id] = 1
                if pd_403 == end_403:
                    # All keys rejected at least once: take the long pause.
                    sleep_time = 15600
                else:
                    sleep_time = random.randint(12, 30)
                wait_time(sleep_time)
                continue
            break
        except ConnectionError as exc:
            num += 1
            lg_warning(repr(exc))
            lg_debug('connect fail' + str(num))
            sleep_time = random.randint(6, 10)
            # NOTE(review): passes a string, whereas the 403 branch passes an
            # int -- confirm what ``wait_time`` expects.
            wait_time(str(sleep_time))
            continue
        except Exception as exc:
            num += 1
            print('Connection reset by peer error')
            lg_warning(repr(exc))
            lg_debug('Connection reset by peer' + str(num))
            sleep_time = random.randint(10, 20)
            wait_time(str(sleep_time))
            continue

    text_list_dict = None
    try:
        text_dict = response.json()
        # Membership test instead of ``dict.has_key`` (gone in Python 3).
        if 'statuses' in text_dict:
            text_list_dict = text_dict['statuses']
            lg_debug('success catch the info_list')
        else:
            lg_debug("get_weibo_by_coordinate: No Json")
    except Exception as exc:
        lg_warning(repr(exc))
        lg_debug("get_weibo_by_coordinate: No Json")
    return text_list_dict