Esempio n. 1
0
def load_data():
    """Bootstrap the Redis demo environment.

    Connects to Redis, RedisBloom and RedisTimeSeries using the
    REDIS_SERVER / REDIS_PORT / REDIS_PASSWORD environment variables
    (falling back to a local default instance), creates the time series
    and their downsampling rules, registers the RedisGears dedup script,
    and optionally preloads the Scrabble word list into a Bloom filter
    when REDIS_SCRABBLE is set.
    """
    # Environment-driven connection settings with local-dev defaults;
    # environ.get(key, default) avoids the double lookup of the
    # "check then fetch" pattern.
    redis_server = environ.get('REDIS_SERVER', 'localhost')
    redis_port = int(environ.get('REDIS_PORT', 6379))
    redis_password = environ.get('REDIS_PASSWORD', '')

    rdb = redis.Redis(host=redis_server,
                      port=redis_port,
                      password=redis_password)
    rb = RedisBloom(host=redis_server,
                    port=redis_port,
                    password=redis_password)
    rts = RedisTimeseries(host=redis_server,
                          port=redis_port,
                          password=redis_password)

    # Marker key so other components can tell that setup has run.
    rdb.set("CONFIG", "YES")

    # Short-retention (60 s) staging series feed the long-retention (24 h)
    # 'Final' series through 'last'-aggregation rules with 1 s buckets.
    rts.create('s-unfiltered', retention_ms=60000)
    rts.create('s-filtered', retention_ms=60000)
    rts.create('unfiltered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.create('filtered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.createrule('s-unfiltered', 'unfiltered', 'last', 1000)
    rts.createrule('s-filtered', 'filtered', 'last', 1000)

    # Register each RedisGears script; the context manager guarantees the
    # file handle is closed even if RG.PYEXECUTE raises.
    for gear in ['./dedup.py']:
        with open(gear, mode='r') as gear_file:
            rdb.execute_command('RG.PYEXECUTE', gear_file.read())

    if environ.get('REDIS_SCRABBLE') is not None:
        for line in fileinput.input("2019_Collins_Scrabble_Words.txt"):
            rb.bfAdd("Scrabble-Bloom", line.rstrip())
Esempio n. 2
0
class Follow(object):
    """Crawl the follow and fan lists of Weibo users.

    De-duplicates user ids across both lists via a shared RedisBloom
    filter and appends the collected (uri, nickname) pairs to a
    timestamped text file.
    """

    def __init__(self, config):
        """Initialize the crawler from a config dict.

        config must contain 'cookie' and 'user_id_list' (either a list of
        ids or the path of a txt file with one id per line).
        """
        self.rb = Client()  # RedisBloom client used for de-duplication
        self.filter_redis_key = 'uidfilter'  # Bloom-filter key shared by follow/fan scans
        self.validate_config(config)
        self.cookie = {'Cookie': config['cookie']}
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            # Resolve a relative path against this script's directory.
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = self.get_user_list(user_id_list)
        self.user_id_list = user_id_list  # Weibo user ids to crawl
        self.user_id = ''
        self.follow_list = []  # collected {'uri', 'nickname'} dicts of followed users
        self.fans_list = []  # collected {'uri', 'nickname'} dicts of fans
        self.file_name = 'user_id_list' + str(time()) + '.txt'

    def validate_config(self, config):
        """Exit with a message when 'user_id_list' is neither a list nor
        the path of an existing .txt file."""
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list,
                           list)) and (not user_id_list.endswith('.txt')):
            sys.exit(u'user_id_list值应为list类型或txt文件路径')
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                sys.exit(u'不存在%s文件' % user_id_list)

    def deal_html(self, url):
        """Fetch *url* with the configured cookie and return the parsed
        lxml selector; returns None after printing the traceback when the
        request or parse fails."""
        try:
            html = requests.get(url, cookies=self.cookie, verify=False).content
            selector = etree.HTML(html)
            return selector
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def _get_page_count(self, url):
        """Return the number of pages behind the list at *url*; pages are
        counted from the pager's hidden <input name='mp'>, defaulting to 1
        when no pager is present."""
        selector = self.deal_html(url)
        pager = selector.xpath("//input[@name='mp']")
        if not pager:
            return 1
        return int(pager[0].attrib['value'])

    def get_page_num(self):
        """Return the number of pages in the follow list."""
        return self._get_page_count("https://weibo.cn/%s/follow" % self.user_id)

    def _scrape_page(self, url, page, result_list):
        """Parse one list page at *url* and append unseen users to
        *result_list*.

        Users already present in the Bloom filter are skipped, which
        de-duplicates across both the follow and fan scans.
        """
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if page == 1 and len(table_list) == 0:
            # An empty first page means the cookie or user id is invalid.
            print(u'cookie无效或提供的user_id无效')
            return
        for t in table_list:
            im = t.xpath('.//a/@href')[-1]
            # The uid is embedded either as a 'uid=' query param or as the
            # last path segment of the profile link.
            uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
            nickname = t.xpath('.//a/text()')[0]
            # bfExists == 0 means "definitely not seen yet".
            if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                self.rb.bfAdd(self.filter_redis_key, uri)
                result_list.append({'uri': uri, 'nickname': nickname})
                print(u'%s %s' % (nickname, uri))

    def get_one_page(self, page):
        """Collect the user ids on follow-list page *page*."""
        url = 'https://weibo.cn/%s/follow?page=%d' % (self.user_id, page)
        self._scrape_page(url, page, self.follow_list)

    def _crawl_all_pages(self, page_num, fetch_one):
        """Fetch pages 1..page_num via *fetch_one*, pausing for 6-10 s at
        random intervals to avoid anti-crawler throttling."""
        last_rest = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            fetch_one(page)
            if page - last_rest == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                last_rest = page
                random_pages = random.randint(1, 5)

    def get_follow_list(self):
        """Crawl every page of the current user's follow list."""
        page_num = self.get_page_num()
        print(u'用户关注页数:' + str(page_num))
        self._crawl_all_pages(page_num, self.get_one_page)
        print(u'用户关注列表爬取完毕')

    def get_fans_page_num(self):
        """Return the number of pages in the fan list."""
        return self._get_page_count("https://weibo.cn/%s/fans" % self.user_id)

    def get_fans_one_page(self, page):
        """Collect the user ids on fan-list page *page*."""
        url = 'https://weibo.cn/%s/fans?page=%d' % (self.user_id, page)
        self._scrape_page(url, page, self.fans_list)

    def get_fans_list(self):
        """Crawl every page of the current user's fan list."""
        page_num = self.get_fans_page_num()
        print(u'用户关注页数:' + str(page_num))
        self._crawl_all_pages(page_num, self.get_fans_one_page)
        print(u'用户粉丝列表爬取完毕')

    def write_to_txt(self):
        """Append all collected follow and fan entries to the output file."""
        # sys.stdout.encoding is None when stdout is piped; fall back to
        # utf-8 instead of crashing in .encode().
        encoding = sys.stdout.encoding or 'utf-8'
        with open(self.file_name, 'ab') as f:
            for user in self.follow_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    encoding))
            for user in self.fans_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    encoding))

    def get_user_list(self, file_name):
        """Read Weibo user ids from *file_name*.

        Takes the first space-separated token of each line when it is all
        digits; preserves first-seen order and drops duplicates. Exits
        with a message when the file is not utf-8 encoded.
        """
        with open(file_name, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name)
        user_id_list = []
        for line in lines:
            info = line.split(' ')
            if len(info) > 0 and info[0].isdigit():
                user_id = info[0]
                if user_id not in user_id_list:
                    user_id_list.append(user_id)
        return user_id_list

    def initialize_info(self, user_id):
        """Reset per-user crawl state before scanning *user_id*."""
        self.follow_list = []
        self.fans_list = []
        self.user_id = user_id

    def check_unique(self, user_id):
        """Check whether *user_id* was already saved (not implemented)."""

    def start(self):
        """Run the crawler over every configured user id."""
        for user_id in self.user_id_list:
            self.initialize_info(user_id)
            print(u'开始抓取:' + user_id)
            print('*' * 100)
            try:
                self.get_follow_list()  # crawl the follow list
                self.get_fans_list()  # crawl the fan list
            except Exception as e:
                # Skip the user on error instead of aborting the whole run.
                print('Error: ', e)
                traceback.print_exc()
                sleep(10)
            self.write_to_txt()
            print(u'信息抓取完毕')
            print('*' * 100)