Example #1
0
@gen.coroutine
def getfollowlist(current_uk):
    """Crawl the follow list of *current_uk*, upsert every followed user into
    MySQL, record the crawled follow count, and return the uks of all known
    users that still have a non-zero follow count.

    Returns [] on any failure.  Decorated with @gen.coroutine because the
    body uses ``yield`` (matches the method version of this function).
    """

    @gen.coroutine
    def followquery(current_uk, start, limit):
        # Fetch one page of the follow API; returns [total_count, follow_list].
        url = follow_url.format(start, limit, current_uk)
        mylog.info('follow: ' + url)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # API error: retry and propagate the *retried* result.  The
            # original returned the failed response body here instead.
            result = yield followquery(current_uk, start, limit)
            raise gen.Return(result)
        raise gen.Return([list_data['total_count'], list_data['follow_list']])

    con = None
    try:
        start = 0
        limit = 24
        url = follow_url.format(start, limit, current_uk)
        list_data = yield followquery(current_uk, start, limit)
        total_count = list_data[0]
        follow_list = []
        person_data = []
        con = MyPyMysql(**mysql_config)
        sql = """select follow_nums from pt_db.spide_all_person_log where uk = %s """
        query_data = con.query(sql, current_uk)
        query_follows = query_data[0]['follow_nums'] if query_data else 0
        # Incremental update: only fetch pages covering follows added since
        # the last crawl.
        for_follows = total_count - query_follows
        if for_follows > 0:
            # // preserves the original Python 2 integer division and stays
            # correct under Python 3 (range() rejects floats).
            for j in range((for_follows - 1) // limit + 1):
                start = j * limit
                url = follow_url.format(start, limit, current_uk)
                range_data = yield followquery(current_uk, start, limit)
                follow_list.extend(range_data[1])
        for i in follow_list:
            person_data.append([
                i['follow_uk'], i['follow_uname'].encode("utf-8"),
                i['fans_count'], i['follow_count'], i['pubshare_count']
            ])
        # Upsert every discovered user.
        if person_data:
            sql = """ replace into pt_db.spide_all_person (uk,uk_name,fan_nums,follow_nums,share_nums) values %s """
            con.insert_query(sql, person_data)
        # Record the current follow count of the queried user.
        sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums) values (%s,%s) ON DUPLICATE KEY UPDATE follow_nums=%s , m_time = %s;"""
        con.query(sql, (current_uk, total_count, total_count, now_time))
        # Select every user that still has follows left to crawl.
        sql = """ SELECT p.uk FROM pt_db.spide_all_person p
                where p.follow_nums !=0
              """
        query_uk = con.query(sql)
        uks = [i['uk'] for i in query_uk]
    except Exception as e:
        mylog.error('followlist 失败: ' + str(url))
        mylog.error(e)
        raise gen.Return([])
    finally:
        # Always release the MySQL connection; the original leaked it when
        # any statement above raised.
        if con is not None:
            con.close_connect()
    raise gen.Return(uks)
Example #2
0
def delete_proxies(proxies_list):
    """Delete the given proxy hosts from pt_db.spide_proxies_ip.

    :param proxies_list: iterable of proxy_host strings; no-op when empty.
    """
    if not proxies_list:
        return
    pmysql = MyPyMysql(**mysql_config)
    # Build a parameterized IN (...) list instead of splicing quoted values
    # straight into the SQL text — the original was injectable through
    # proxy_host and broke on hosts containing a double quote.
    placeholders = ','.join(['%s'] * len(proxies_list))
    sql = """delete from pt_db.spide_proxies_ip  where proxy_host in (%s) ;""" % placeholders
    pmysql.query(sql, tuple(proxies_list))
    # Fixed misleading 'update :' prefix — this is a delete.
    mylog.info('delete :' + sql)
    pmysql.close_connect()
Example #3
0
def get_all_person(uk):
    """Return (active proxy rows, pending follow delta for *uk*).

    The first element is every enabled proxy row in random order; the
    second is how many follows of *uk* have not been crawled yet
    (0 when the user is unknown).
    """
    db = MyPyMysql(**mysql_config)
    proxy_sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip where status = 0 order by rand() desc ;"""
    proxies = db.query(proxy_sql)
    delta_sql = """SELECT ifnull(p.follow_nums,0)-ifnull(l.follow_nums,0) as follow_nums
        FROM pt_db.spide_all_person p
        left join pt_db.spide_all_person_log l on p.uk = l.uk
        where p.share_nums !=0 and p.uk = %s """
    rows = db.query(delta_sql, uk)
    pending = int(rows[0]['follow_nums']) if rows else 0
    db.close_connect()
    return proxies, pending
Example #4
0
@gen.coroutine
def getsharelist(current_uk, start, limit):
    """Crawl one page of *current_uk*'s share feed, store the shared files
    in MySQL, and bump the user's crawled-share counter.

    Always returns [].  Decorated with @gen.coroutine because the body uses
    ``yield`` (matches the method version of this function).
    """
    con = None
    try:
        auth_type = 1
        url = share_url.format(auth_type, start, limit, current_uk)
        response = yield get_spide(url)
        list_data = json.loads(response.body)
        if list_data['errno'] != 0:
            # API error: retry the same page; the retried call does all the
            # bookkeeping.  The original raised gen.Return inside this try,
            # which `except Exception` swallowed (gen.Return subclasses
            # Exception), logging a spurious failure on every retry.
            yield getsharelist(current_uk, start, limit)
        else:
            records = list_data.get('records', [])
            insert_data = []
            for i in records:
                if i['feed_type'] == 'share':
                    for j in i['filelist']:
                        insert_data.append([
                            j['fs_id'], j['category'],
                            'http://pan.baidu.com/s/' if 'shorturl' in i
                            else 'http://pan.baidu.com/share/link?uk={0}&shareid='.
                            format(current_uk),
                            i['shorturl'].encode("utf-8") if 'shorturl' in i
                            else i['shareid'].encode("utf-8"),
                            i['public'].encode("utf-8"),
                            j['server_filename'].encode("utf-8"), i['uk'],
                            i['username'].encode("utf-8"), j['size'],
                            # // keeps integer ms -> s conversion under py3.
                            timestamptotime(i['feed_time'] // 1000)
                        ])
            len_insert_data = len(records)
            con = MyPyMysql(**mysql_config)
            if insert_data:
                sql = """ insert ignore into pt_db.spide_shares (fs_id,category,base_url,share_url,`public`,server_filename,uk,username,`size`,share_time) values %s """
                con.insert_query(sql, insert_data)
            # Record how many shares of this user have been crawled so far.
            sql = """ insert into pt_db.spide_all_person_log (uk,share_nums) values (%s,%s) ON DUPLICATE KEY UPDATE share_nums=share_nums+%s , m_time = %s;"""
            con.query(sql,
                      (current_uk, len_insert_data, len_insert_data, now_time))
            mylog.info('sharelist 成功: ' + url)
    except Exception as e:
        mylog.error('sharelist 失败: ' + str(url))
        mylog.error(e)
    finally:
        # Release the connection even when an insert fails; the original
        # leaked it on the exception path.
        if con is not None:
            con.close_connect()
    raise gen.Return([])
Example #5
0
@gen.coroutine
def put_ip():
    """Push every known proxy onto the Redis ``proxy_ip_list`` queue.

    When the proxy table is empty, trigger a fresh proxy crawl.  Decorated
    with @gen.coroutine because the body uses ``yield`` — without the
    decorator calling this function would do nothing at all.
    """
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip order by rand();"""
    result = pmysql.query(sql)
    for i in result:
        r.rpush("proxy_ip_list", json.dumps(i))
    mylog.info('向proxy_ip_list加数据')
    pmysql.close_connect()
    # `not result` already covers None — the `or result is None` tail in the
    # original was redundant.
    if not result:
        mylog.info('数据库无代理IP...')
        yield get_first_proxy_data(page_n=2, if_proxy=True)
Example #6
0
def put_ip():
    """Load enabled proxies from MySQL into the in-process proxy queue,
    skipping hosts that are already queued; exit the process when the
    proxy table is empty.
    """
    pmysql = MyPyMysql(**mysql_config)
    sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip where status = 0 ;"""
    result = pmysql.query(sql)
    for i in result:
        # Deduplicate by host so the same proxy is never queued twice.
        if i['proxy_host'] not in proxy_ip_set:
            proxy_ip_set.add(i['proxy_host'])
            proxy_ip_queue.put(i)
    pmysql.close_connect()
    # `not result` already covers None — redundant `or result is None` removed.
    if not result:
        # No proxies at all: abort the whole process immediately.
        os._exit(0)
Example #7
0
class SpProducer(object):
    """Consumer that drains the Redis ``share_list`` queue and crawls each
    user's share feed into MySQL, routing every HTTP request through a
    proxy taken from the ``proxy_ip_list`` Redis queue.
    """

    def __init__(self):
        # Process start time, '%Y-%m-%d %H:%M:%S'; kept for log bookkeeping.
        self.now_time = time.strftime('%Y-%m-%d %H:%M:%S')
        self.r = RedisPool(**redis_conf).redis()
        self.r.rpush("share_list", str(base_uk))  # seed the queue with the base uk
        self.con = MyPyMysql(**mysql_config)

    def add_most_person(self):
        """At most once every 24 hours, re-enqueue the 100 users with the
        most shares so their feeds get re-crawled."""
        ntime = time.time()
        last_time = self.r.get('if_add_time')
        if not last_time or last_time is None:
            self.r.set('if_add_time', str(ntime))
        else:
            if ntime - float(last_time) > 86400:
                self.r.set('if_add_time', str(ntime))
                # Push the top-100 sharers back onto the work queue.
                sql = """SELECT uk FROM pt_db.spide_all_person p where p.share_nums !=0 order by share_nums desc limit 100; """
                result = self.con.query(sql)
                for i in result:
                    self.r.rpush("share_list", str(i['uk']))
                mylog.info('向share_list添加前分享前100数据')

    @gen.coroutine
    def getsharelist(self, current_uk, start, limit):
        """Crawl one page of *current_uk*'s share feed, insert the shared
        files, and add this page's record count to the user's crawled-share
        counter.  Returns [] on success; returns None when an exception was
        logged (the `else` clause is skipped).
        """
        try:
            auth_type = 1
            query_uk = current_uk
            url = share_url.format(auth_type, start, limit, query_uk)
            response = yield self.get_spide(url)
            list_data = json.loads(response.body)
            if list_data['errno'] != 0:
                # API error: retry the same page; the retried call does the
                # bookkeeping for it.
                yield self.getsharelist(current_uk, start, limit)
            else:
                records = list_data[
                    'records'] if 'records' in list_data else []
                insert_data = []
                for i in records:
                    # Only 'share' feed entries carry a file list.
                    if i['feed_type'] == 'share':
                        for j in i['filelist']:
                            # Short links use pan.baidu.com/s/<shorturl>;
                            # otherwise fall back to the uk+shareid link form.
                            insert_data.append([
                                j['fs_id'], j['category'],
                                'http://pan.baidu.com/s/'
                                if 'shorturl' in i.keys() else
                                'http://pan.baidu.com/share/link?uk={0}&shareid='
                                .format(current_uk),
                                i['shorturl'].encode("utf-8") if 'shorturl'
                                in i.keys() else i['shareid'].encode("utf-8"),
                                i['public'].encode("utf-8"),
                                j['server_filename'].encode("utf-8"), i['uk'],
                                i['username'].encode("utf-8"), j['size'],
                                # NOTE(review): '/ 1000' relies on Python 2
                                # integer division (ms -> s); use // on py3.
                                timestamptotime(i['feed_time'] / 1000)
                            ])
                len_insert_data = len(records)
                # print len_insert_data,insert_data
                if insert_data:
                    sql = """ insert ignore into pt_db.spide_shares (fs_id,category,base_url,share_url,`public`,server_filename,uk,username,`size`,share_time) values %s """
                    self.con.insert_query(sql, insert_data)
                # Record how many shares of this user have been crawled.
                sql = """ insert into pt_db.spide_all_person_log (uk,share_nums) values (%s,%s) ON DUPLICATE KEY UPDATE share_nums=share_nums+%s , m_time = %s;"""
                self.con.query(sql,
                               (current_uk, len_insert_data, len_insert_data,
                                time.strftime('%Y-%m-%d %H:%M:%S')))
                mylog.info('sharelist 成功: ' + url)
        except Exception as e:
            mylog.error('sharelist 失败: ' + str(url))
            mylog.error(e)
        else:
            raise gen.Return([])

    @gen.coroutine
    def get_spide(self, url):
        """Fetch *url* through a proxy popped from the Redis
        ``proxy_ip_list`` queue, trying up to 10 different proxies before
        giving up (returns None implicitly when all 10 attempts fail).
        """
        for _ in range(10):
            rlist = self.r.lrange("proxy_ip_list", 0, -1)
            try:
                if self.r.llen('proxy_ip_list') == 0:
                    # Refill the proxy queue from MySQL before blocking.
                    self.put_ip()
                    mylog.info('proxy_ip_list队列无值,等待添加中....')
                i = json.loads(self.r.blpop("proxy_ip_list", timeout=0)[1])
                httpconfigs = get_http_config()
                httpconfigs['proxy_host'] = i['proxy_host']
                httpconfigs['proxy_port'] = i['proxy_port']
                response = yield Spide(url, **httpconfigs).async_proxy()
                # response = yield Spide(url, **httpconfigs).async()
            except Exception as e:
                mylog.error(str(e))
                mylog.error('无法连接... ' + str(len(rlist)) + ' ' +
                            str(i['proxy_host']))
            else:
                mylog.info('连接成功...' + str(len(rlist)) + ' ' +
                           str(i['proxy_host']))
                raise gen.Return(response)

    def put_ip(self):
        """Push every proxy row from MySQL onto the Redis proxy queue, in
        random order."""
        sql = """SELECT proxy_host,proxy_port FROM pt_db.spide_proxies_ip order by rand();"""
        result = self.con.query(sql)
        for i in result:
            self.r.rpush("proxy_ip_list", json.dumps(i))
        mylog.info('向proxy_ip_list加数据')

    def put_share_list(self):
        """Refill the ``share_list`` work queue with every user that has a
        non-zero share count, most shares first."""
        sql = """SELECT uk FROM pt_db.spide_all_person p where p.share_nums !=0 order by share_nums desc; """
        result = self.con.query(sql)
        for i in result:
            self.r.rpush("share_list", str(i['uk']))
        mylog.info('向share_list加数据')
        if not result or result is None:
            mylog.info('share_list数据库无数据...')

    def get_all_person(self, uk):
        """Return how many of *uk*'s shares have not been crawled yet
        (person table count minus crawl-log count; 0 when unknown).
        """
        sql = """SELECT ifnull(p.share_nums,0)-ifnull(l.share_nums,0) as share_nums
            FROM pt_db.spide_all_person p
            left join pt_db.spide_all_person_log l on p.uk = l.uk
            where p.share_nums !=0 and p.uk = %s """
        all_person = self.con.query(sql, uk)  #
        uk_data = (all_person[0]['share_nums']) if all_person else 0
        return uk_data

    @gen.coroutine
    def worker(self):
        """Main loop: pop a uk from ``share_list`` and crawl all of its
        uncrawled share pages, 60 records per page."""
        while True:
            try:
                self.add_most_person()
                if self.r.llen('share_list') == 0:
                    mylog.info('share_list队列无值,等待添加中....')
                    self.put_share_list()
                mylog.info('消费队列:share_list:{0}'.format(
                    self.r.llen('share_list')))
                current_uk = (self.r.blpop("share_list", timeout=200)[1])
                query_share_nums = self.get_all_person(current_uk)
                limit = 60
                if query_share_nums > 0:
                    # NOTE(review): '/' here is Python 2 integer division;
                    # range() would reject the float under Python 3.
                    for j in range(((query_share_nums - 1) / limit) + 1):
                        starts = j * limit
                        yield self.getsharelist(current_uk, starts, limit)
            except Exception as e:
                # NOTE(review): e.message is Python 2 only; str(e) on py3.
                mylog.error(e.message)

    def runner(self):
        """Block the current IOLoop on the endless worker coroutine."""
        ioloop.IOLoop.current().run_sync(self.worker)
Example #8
0
class SpProducer(object):
    """Producer that walks the follow graph: pops a uk from the Redis
    ``follow_list`` queue, crawls who that user follows, stores them in
    MySQL, and pushes newly seen uks back onto the queue.
    """

    def __init__(self):
        # Process start time, '%Y-%m-%d %H:%M:%S'.
        self.now_time = time.strftime('%Y-%m-%d %H:%M:%S')
        self.r = RedisPool(**redis_conf).redis()
        self.con = MyPyMysql(**mysql_config)
        self.first_sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums,fan_nums,share_nums) values (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE follow_nums=follow_nums , m_time = %s;"""
        self.con.query(self.first_sql, [base_uk, 0, 0, 0, self.now_time])
        self.r.rpush("follow_list", str(base_uk))  # seed the queue with the base uk

    @gen.coroutine
    def getfollowlist(self, current_uk):
        """Crawl the follow list of *current_uk*, upsert the followed users
        into MySQL, record the follow count, and return the list of
        followed uks ([] on failure).
        """

        @gen.coroutine
        def followquery(current_uk, start, limit):
            # Fetch one page of the follow API; returns
            # [total_count, follow_list], retrying on API error.
            query_uk = current_uk
            url = follow_url.format(start, limit, query_uk)
            mylog.info('follow: ' + url)
            response = yield self.get_spide(url)
            list_data = json.loads(response.body)
            if list_data['errno'] != 0:
                mylog.info(response.body)
                # NOTE(review): the retried page's result is discarded here;
                # the caller then sees None for this page.
                yield followquery(current_uk, start, limit)
            else:
                total_count = list_data['total_count']
                follow_list = list_data['follow_list']
                raise gen.Return([total_count, follow_list])

        try:
            start = 0
            limit = 24
            url = follow_url.format(start, limit, current_uk)
            list_data = yield followquery(current_uk, start, limit)
            total_count = list_data[0]
            follow_list = []
            person_data = []
            uk_lists = []
            sql = """select follow_nums from pt_db.spide_all_person_log where uk = %s """
            query_data = self.con.query(sql, current_uk)
            query_follows = query_data[0]['follow_nums'] if query_data else 0
            # Incremental update: only fetch pages covering new follows.
            for_follows = total_count - query_follows
            if for_follows > 0:
                # NOTE(review): '/' here is Python 2 integer division;
                # range() would reject the float under Python 3.
                for j in range(((for_follows - 1) / limit) + 1):
                    start = j * limit
                    url = follow_url.format(start, limit, current_uk)
                    range_data = yield followquery(current_uk, start, limit)
                    follow_list.extend(range_data[1])
            for i in follow_list:
                uk_lists.append(i['follow_uk'])
                person_data.append([
                    i['follow_uk'], i['follow_uname'].encode("utf-8"),
                    i['fans_count'], i['follow_count'], i['pubshare_count']
                ])
            # Upsert every discovered user.
            if person_data:
                sql = """ replace into pt_db.spide_all_person (uk,uk_name,fan_nums,follow_nums,share_nums) values %s """
                self.con.insert_query(sql, person_data)
            # Record the current follow count of the queried user.
            sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums) values (%s,%s) ON DUPLICATE KEY UPDATE follow_nums=%s , m_time = %s;"""
            self.con.query(sql, (current_uk, total_count, total_count,
                                 time.strftime('%Y-%m-%d %H:%M:%S')))
        except Exception as e:
            mylog.error('followlist 失败: ' + str(url))
            mylog.error(e)
            raise gen.Return([])
        raise gen.Return(uk_lists)

    @gen.coroutine
    def get_spide(self, url):
        """Fetch *url* through a proxy popped from the Redis
        ``proxy_ip_list`` queue, trying up to 10 different proxies before
        giving up (returns None implicitly when all 10 attempts fail).
        """
        for _ in range(10):
            rlist = self.r.lrange("proxy_ip_list", 0, -1)
            try:
                if self.r.llen('proxy_ip_list') == 0:
                    mylog.info('proxy_ip_list队列无值,等待添加中....')
                i = json.loads(self.r.blpop("proxy_ip_list", timeout=100)[1])
                httpconfigs = get_http_config()
                httpconfigs['proxy_host'] = i['proxy_host']
                httpconfigs['proxy_port'] = i['proxy_port']
                response = yield Spide(url, **httpconfigs).async_proxy()
                # response = yield Spide(url, **httpconfigs).async()
            except Exception as e:
                mylog.error(str(e))
                mylog.error('无法连接... ' + str(len(rlist)) + ' ' +
                            str(i['proxy_host']))
            else:
                mylog.info('连接成功...' + str(len(rlist)) + ' ' +
                           str(i['proxy_host']))
                raise gen.Return(response)

    @gen.coroutine
    def put_ip(self):
        """Refill the ``follow_list`` work queue with every user that has a
        non-zero follow count.  NOTE(review): despite the name, this pushes
        uks onto follow_list, not proxy IPs."""
        sql = """SELECT uk FROM pt_db.spide_all_person p where follow_nums != 0 order by rand() desc ;"""
        result = self.con.query(sql)
        for i in result:
            self.r.rpush("follow_list", str(i['uk']))
        mylog.info('向follow_list加数据')
        if not result or result is None:
            mylog.info('spide_all_person无数据...')

    @gen.coroutine
    def fetch_url(self):
        """Pop one uk from ``follow_list``, crawl its follow list, and push
        any uk not yet seen back onto the queue."""
        if self.r.llen('follow_list') == 0:
            mylog.info('follow_list队列无值,等待添加中....')
            self.put_ip()
        current_uk = (self.r.blpop("follow_list", timeout=100)[1])
        try:
            mylog.info('生产队列:follow_list:{0},followed_set:{1}'.format(
                self.r.llen('follow_list'), self.r.scard('followed_set')))
            follow_uks = yield self.getfollowlist(current_uk)
            # fank_uks = yield getfanlist(current_uk,proxies)
            # uks = list(set(follow_uks + fank_uks))
            for uk in follow_uks:
                # Only enqueue uks not already in follow_set.
                if not self.r.sismember('follow_set', uk):
                    self.r.rpush("follow_list", uk)
            yield gen.sleep(product_sleeptime)
        except Exception as e:
            # NOTE(review): e.message is Python 2 only; str(e) on py3.
            mylog.error(e.message)

    @gen.coroutine
    def worker(self):
        """Endless crawl loop."""
        while True:
            yield self.fetch_url()

    def runner(self):
        """Block the current IOLoop on the endless worker coroutine."""
        ioloop.IOLoop.current().run_sync(self.worker)
Example #9
0
                        yield getsharelist(current_uk, starts, limit)

            except Exception as e:
                mylog.error(e)

            finally:
                share_data.task_done()

    start = time.time()
    q.put(base_uk)
    share_data.put(base_uk)
    worker()
    # Start workers, then wait for the work queue to be empty.
    for i in range(3):
        consumer()
    yield q.join()
    yield share_data.join()
    # assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' %
          (time.time() - start, len(fetched)))


if __name__ == '__main__':
    # Bootstrap: set up logging, seed the crawl log with the starting uk so
    # the incremental-diff queries have a baseline row, then run the main
    # coroutine to completion on the IOLoop.
    mylog = Logger(main_logging_filename)
    mylog.info('爬虫开始....')
    con = MyPyMysql(**mysql_config)
    sql = """ insert into pt_db.spide_all_person_log (uk,follow_nums,fan_nums,share_nums) values (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE follow_nums=follow_nums , m_time = %s;"""
    # Hard-coded seed uk; the ON DUPLICATE KEY clause makes re-runs a no-op
    # apart from refreshing m_time.
    con.query(sql, [2164327417, 0, 0, 0, now_time])
    con.close_connect()
    io_loop = ioloop.IOLoop.current().run_sync(main)