Example 1
def insert_proxies(proxies_list):
    if not proxies_list:
        return
    con = MyPyMysql(**mysql_config)
    sql = """ replace into pt_db.spide_proxies_ip (proxy_host,proxy_port) values %s """
    con.insert_query(sql, proxies_list)
    mylog.info('insert :' + sql + str(proxies_list))
    con.close_connect()
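MyPyMysql looks like a project-local pymysql wrapper; judging from the call sites, insert_query presumably expands the single `values %s` placeholder into one parenthesized group per row. A minimal sketch of such a helper, assuming plain pymysql underneath (the names here are hypothetical, not the original API):

import pymysql

def insert_many(connection, sql, rows):
    # expand "values %s" into "(%s,%s,...)" and let pymysql's
    # executemany handle the per-row escaping
    group = '(' + ','.join(['%s'] * len(rows[0])) + ')'
    with connection.cursor() as cursor:
        cursor.executemany(sql % group, rows)
    connection.commit()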
Example 2
@gen.coroutine
def getfollowlist(current_uk):
    @gen.coroutine
    def followquery(current_uk, start, limit):
        query_uk = current_uk
        url = follow_url.format(start, limit, query_uk)
        mylog.info('follow: ' + url)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # retry and return the retried result, not the stale error body
            result = yield followquery(current_uk, start, limit)
            raise gen.Return(result)
        total_count = list_data['total_count']
        follow_list = list_data['follow_list']
        raise gen.Return([total_count, follow_list])

    try:
        start = 0
        limit = 24
        url = follow_url.format(start, limit, current_uk)
        list_data = yield followquery(current_uk, start, limit)
        total_count = list_data[0]
        follow_list = []
        person_data = []
        con = MyPyMysql(**mysql_config)
        sql = """select follow_nums from pt_db.spide_all_person_log where uk = %s """
        query_data = con.query(sql, current_uk)
        query_follows = query_data[0]['follow_nums'] if query_data else 0
        for_follows = total_count - query_follows  # incremental update: how many new follows since the last crawl
        if for_follows > 0:
            for j in range(((for_follows - 1) // limit) + 1):  # ceil division: pages needed
                start = j * limit
                url = follow_url.format(start, limit, current_uk)
                range_data = yield followquery(current_uk, start, limit)
                follow_list.extend(range_data[1])
        for i in follow_list:
            person_data.append([
                i['follow_uk'], i['follow_uname'].encode("utf-8"),
                i['fans_count'], i['follow_count'], i['pubshare_count']
            ])
        # add every person found to the database
        if person_data:
            sql = """ replace into pt_db.spide_all_person (uk,uk_name,fan_nums,follow_nums,share_nums) values %s """
            con.insert_query(sql, person_data)
        # record this user's current follow count
        sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums) values (%s,%s) ON DUPLICATE KEY UPDATE follow_nums=%s , m_time = %s;"""
        con.query(sql, (current_uk, total_count, total_count, now_time))
        # query the uks of users not yet fetched
        sql = """ SELECT p.uk FROM pt_db.spide_all_person p
                where p.follow_nums !=0
              """
        query_uk = con.query(sql)
        uks = [i['uk'] for i in query_uk]
        con.close_connect()
    except Exception as e:
        mylog.error('followlist failed: ' + str(url))
        mylog.error(e)
        raise gen.Return([])
    raise gen.Return(uks)
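The page count above uses the ceil-division idiom ((n - 1) // limit) + 1. A quick worked check of the arithmetic:

# 50 new follows fetched 24 at a time need 3 pages
limit = 24
for_follows = 50
pages = ((for_follows - 1) // limit) + 1      # -> 3
starts = [j * limit for j in range(pages)]    # -> [0, 24, 48]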
Example 3
def delete_proxies(proxies_list):
    if not proxies_list:
        return
    pmysql = MyPyMysql(**mysql_config)
    change_proxies = '"' + '","'.join(proxies_list) + '"'
    sql = """delete from pt_db.spide_proxies_ip  where proxy_host in (%s) ;"""
    sql = sql % change_proxies
    pmysql.query(sql)
    mylog.info('update :' + sql)
    pmysql.close_connect()
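Interpolating quoted hosts straight into the statement works, but it bypasses driver-side escaping. A safer sketch, assuming MyPyMysql.query passes parameters through to the driver the way the calls in Examples 2 and 7 suggest:

def delete_proxies_safe(proxies_list):
    if not proxies_list:
        return
    pmysql = MyPyMysql(**mysql_config)
    # one %s placeholder per host; the values travel as parameters
    placeholders = ','.join(['%s'] * len(proxies_list))
    sql = 'delete from pt_db.spide_proxies_ip where proxy_host in (%s);' % placeholders
    pmysql.query(sql, proxies_list)
    pmysql.close_connect()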
Example 4
@gen.coroutine
def put_ip():
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip order by rand();"""
    result = pmysql.query(sql)
    for i in result:
        r.rpush("proxy_ip_list", json.dumps(i))
    mylog.info('pushed proxies onto proxy_ip_list')
    pmysql.close_connect()
    if not result:  # "not result" already covers both None and an empty list
        mylog.info('no proxy IPs in the database...')
        yield get_first_proxy_data(page_n=2, if_proxy=True)
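This version only produces; a consumer presumably pops the same redis list elsewhere. A minimal sketch of that side with redis-py (the host/port here are placeholder assumptions):

import json
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)

def next_proxy():
    # blpop blocks until an entry arrives and returns a (key, value) pair
    _, raw = r.blpop('proxy_ip_list')
    proxy = json.loads(raw)
    return proxy['proxy_host'], proxy['proxy_port']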
Example 5
def put_ip():
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip where status = 0 ;"""
    result = pmysql.query(sql)
    for i in result:
        # dedupe by host before handing the proxy to the worker queue
        if i['proxy_host'] not in proxy_ip_set:
            proxy_ip_set.add(i['proxy_host'])
            proxy_ip_queue.put(i)
    pmysql.close_connect()
    if not result:  # no usable proxies left, so stop the process
        os._exit(0)
Example 6
def get_all_person(uk):
    """
    获取所有用户当前粉丝数,分享数,关注数   ,代理连接数
    :return:
    """
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip where status = 0 order by rand() desc ;"""
    result = pmysql.query(sql)  # 代理连接数
    sql = """SELECT ifnull(p.follow_nums,0)-ifnull(l.follow_nums,0) as follow_nums
        FROM pt_db.spide_all_person p
        left join pt_db.spide_all_person_log l on p.uk = l.uk
        where p.share_nums !=0 and p.uk = %s """
    all_person = pmysql.query(sql, uk)  # follow_nums delta versus the last logged value
    uk_data = int(all_person[0]['follow_nums']) if all_person else 0
    pmysql.close_connect()
    return result, uk_data
Example 7
@gen.coroutine
def getsharelist(current_uk, start, limit):
    try:

        # yield gen.sleep(consume_sleeptime)
        auth_type = 1
        query_uk = current_uk
        url = share_url.format(auth_type, start, limit, query_uk)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # retry and return the retried result, not the stale error body
            result = yield getsharelist(current_uk, start, limit)
            raise gen.Return(result)
        records = list_data.get('records', [])
        insert_data = []
        for i in records:
            if i['feed_type'] == 'share':
                for j in i['filelist']:
                    insert_data.append([
                        j['fs_id'], j['category'],
                        'http://pan.baidu.com/s/' if 'shorturl' in i
                        else 'http://pan.baidu.com/share/link?uk={0}&shareid='.
                        format(current_uk),
                        i['shorturl'].encode("utf-8") if 'shorturl' in i
                        else i['shareid'].encode("utf-8"),
                        i['public'].encode("utf-8"),
                        j['server_filename'].encode("utf-8"), i['uk'],
                        i['username'].encode("utf-8"), j['size'],
                        timestamptotime(i['feed_time'] / 1000)
                    ])
        len_insert_data = len(records)  # counts feed records, not individual files
        con = MyPyMysql(**mysql_config)
        if insert_data:
            sql = """ insert ignore into pt_db.spide_shares (fs_id,category,base_url,share_url,`public`,server_filename,uk,username,`size`,share_time) values %s """
            con.insert_query(sql, insert_data)
        # record how many of this user's shares have been crawled so far
        sql = """ insert into pt_db.spide_all_person_log (uk,share_nums) values (%s,%s) ON DUPLICATE KEY UPDATE share_nums=share_nums+%s , m_time = %s;"""
        con.query(sql,
                  (current_uk, len_insert_data, len_insert_data, now_time))
        con.close_connect()
        mylog.info('sharelist succeeded: ' + url)
    except Exception as e:
        mylog.error('sharelist failed: ' + str(url))
        mylog.error(e)
        raise gen.Return([])
    raise gen.Return([])
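Both this example and Example 2 retry on a non-zero errno by recursing with no depth limit. A bounded alternative, sketched under the same Tornado assumptions (get_spide and mylog come from the surrounding module), uses a loop with a retry cap instead:

@gen.coroutine
def fetch_json(url, retries=3):
    # retry a few times on a non-zero errno instead of recursing forever
    for attempt in range(retries):
        response = yield get_spide(url)
        data = json.loads(response.body)
        if data['errno'] == 0:
            raise gen.Return(data)
        mylog.info('errno %s on attempt %d: %s' % (data['errno'], attempt + 1, url))
    raise gen.Return(None)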
Example 8
                        yield getsharelist(current_uk, starts, limit)

            except Exception as e:
                mylog.error(e)

            finally:
                share_data.task_done()

    start = time.time()
    q.put(base_uk)
    share_data.put(base_uk)
    worker()
    # Start workers, then wait for the work queue to be empty.
    for i in range(3):
        consumer()
    yield q.join()
    yield share_data.join()
    # assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' %
          (time.time() - start, len(fetched)))


if __name__ == '__main__':
    mylog = Logger(main_logging_filename)
    mylog.info('spider starting....')
    con = MyPyMysql(**mysql_config)
    sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums,fan_nums,share_nums) values (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE follow_nums=follow_nums , m_time = %s;"""
    con.query(sql, [2164327417, 0, 0, 0, now_time])
    con.close_connect()
    ioloop.IOLoop.current().run_sync(main)  # blocks until main() completes
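Example 8 is only a fragment of the crawl loop, but its shape is Tornado's standard queues producer/consumer pattern: put work on a queue, start a few consumer coroutines, then join the queue. A minimal self-contained sketch of that pattern (the worker body and seed value are placeholders, not the original code):

from tornado import gen, ioloop, queues

q = queues.Queue()

@gen.coroutine
def worker():
    while True:
        uk = yield q.get()
        try:
            # real code would fetch here, e.g. yield getsharelist(uk, 0, 24)
            yield gen.sleep(0.1)
        finally:
            q.task_done()

@gen.coroutine
def main():
    q.put(2164327417)       # seed uk, as in __main__ above
    for _ in range(3):      # three consumers, matching Example 8
        worker()
    yield q.join()          # wait until every queued uk is processed

if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(main)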