Esempio n. 1
0
class Pipeline():
    '''
    Result-storage pipeline for one crawled task.

    Stores the parsed results of ``task`` in MongoDB and marks the task as
    done.  (The original author's note says this class is not responsible for
    pushing the next task; ``run`` nevertheless enqueues ``next_tasks``.)
    '''
    def __init__(self, task):
        '''
        :param task: dict describing the task; this class reads its
            'results', 'next_tasks', 'parser', 'table', 'parent' and '_id' keys
        '''
        self.task = task
        self.taskUtil = TaskUtil()
        self.mongoUtil = MongoUtil()

    def run(self):
        '''
        Dispatch the task results to the matching storage handler.

        When the task carries results: enqueue every entry of 'next_tasks',
        store the results through ``pipeline_<parser>`` (falling back to
        ``pipeline_default``), then set the task state to 'done' and replace
        the stored task.  When there are no results nothing is written.

        :return: None
        '''
        Log.i('Pipeline.run()')
        if self.task['results']:
            # Queue the follow-up tasks.
            if self.task['next_tasks'] is not None:
                for next_task in self.task['next_tasks']:
                    self.taskUtil.insert_one(next_task)
            # Store this task's parsed results.  The handler is looked up by
            # reflection: pipeline_<parser name>() if it exists, otherwise
            # pipeline_default().
            handler = getattr(self, 'pipeline_' + self.task['parser'],
                              self.pipeline_default)
            handler(self.task['table'])
            # Save the complete task back to mongo, flagged as done.
            self.task['state'] = 'done'
            self.taskUtil.replace_one(self.task['_id'], self.task)
        # No results means something failed upstream; the task is left
        # untouched so it can be started again later.
        Log.i('this task is finished')

    def pipeline_default(self, collection_name):
        '''
        Default storage handler: insert every result merged with the parent.

        Each entry of ``task['results']`` is combined with ``task['parent']``
        (result keys win on conflict) and inserted into ``collection_name``.
        A ``None`` parent is replaced by an empty dict on the task itself.

        Example document for a 'demo_info' collection:
        {
            '_id':'http://tieba.baidu.com',
            'name':'百度贴吧'
        }

        :param collection_name: name of the target collection
        :return: None
        '''
        if self.task['parent'] is None:
            self.task['parent'] = {}
        if self.task['results'] is not None:
            for result in self.task['results']:
                self.mongoUtil.insert(
                    collection_name=collection_name,
                    insert_data={**self.task['parent'], **result})

    def __del__(self):
        # __del__ also runs when __init__ raised before mongoUtil was
        # assigned, so the attribute may be missing.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
Esempio n. 2
0
 def __init__(self, request):
     '''
     Open a MongoUtil handle and seed the 'tasks' collection when it is empty.

     The seed document(s) are read from ``../conf/task.json`` relative to
     this source file.

     :param request: not used by this constructor
     '''
     self.mongoUtil = MongoUtil()
     if self.mongoUtil.count(collection_name='tasks') == 0:
         task_file = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), '..', 'conf',
             'task.json')
         # The context manager closes the file even if reading/parsing fails.
         with open(task_file, encoding='utf-8') as f:
             insert_data = json.load(f)
         self.mongoUtil.insert(collection_name='tasks',
                               insert_data=insert_data)
Esempio n. 3
0
class InitUtil:
    '''
    Initialisation helper: clears all data so a new round of tasks can start.
    '''
    def __init__(self):
        self.mongoUtil = MongoUtil()

    def init(self):
        '''
        Clear the database named by ``Setting.MONGO_DB``.

        :return: None
        '''
        self.mongoUtil.clear_all(Setting.MONGO_DB)

    def __del__(self):
        # __del__ also runs when __init__ raised before mongoUtil was
        # assigned, so the attribute may be missing.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
Esempio n. 4
0
 def __init__(self):
     '''
     Open a MongoUtil handle and, when the 'tasks' collection is empty,
     insert the first task described by the ``Setting.FIRST_TASK_*`` values.
     '''
     self.mongoUtil = MongoUtil()
     if self.mongoUtil.count(collection_name='tasks') != 0:
         return
     # Seed task: starts in the "ready" state with an empty parent.
     seed_task = {
         "parser": Setting.FIRST_TASK_PARSER,
         "request": Setting.FIRST_TASK_URL,
         "table": Setting.FIRST_TASK_TABLE,
         "parent": {},
         "state": "ready"
     }
     self.mongoUtil.insert(collection_name='tasks', insert_data=seed_task)
Esempio n. 5
0
 def __init__(self):
     '''Create a MongoUtil handle and keep it on the instance as ``Mg``.'''
     self.Mg = MongoUtil()
Esempio n. 6
0
class Spider:
    '''
    Crawls film lists from the sites stored in the 'site' collection and
    inserts/updates them in the 'films' collection.
    '''
    def __init__(self):
        self.Mg = MongoUtil()

    def run(self, site_id=None):
        '''
        Refresh every site in the 'site' collection and crawl its films.

        Sites whose 'data_type' is 'json' use the JSON API methods, all
        others use the XML ones.  The page number returned by the crawl is
        stored back on the site as 'last_page'.

        :param site_id: not used; kept so existing callers keep working
        :return: None
        '''
        coll = self.Mg.getCol('site')
        for site in coll.find():
            if site.get('data_type') == 'json':
                self.update_site_json(site)
                last_page = self.get_films_json(site)
            else:
                self.update_site(site, coll)
                last_page = self.get_films(site)
            coll.update_one({'_id': site['_id']},
                            {'$set': {'last_page': last_page}})

    def filter_xml(self, str):
        '''
        Strip ``str`` and remove every match of the ``re_filters_xml``
        patterns from it (filters illegal characters).

        :param str: raw text (parameter name kept for compatibility)
        :return: the cleaned text
        '''
        text = str.strip()
        for pattern in re_filters_xml:
            text = re.sub(pattern, '', text)
        return text

    def filter_film(self, name):
        '''
        Flag film names that match ``re_filters_name``.

        :param name: film name
        :return: 1 when the name matches the filter pattern, otherwise 0
        '''
        return 1 if re.search(re_filters_name, name) else 0

    def update_site(self, site, coll):
        '''
        Update record count, page size, page count and category list of an
        XML site from its API.

        Best effort: when the request or the XML parsing fails the site is
        left unchanged.

        :param site: site document ('api_url' and 'id' are read)
        :param coll: the 'site' collection to update
        :return: None
        '''
        try:
            news = requests.get(site['api_url'], timeout=5)
            root = ET.fromstring(self.filter_xml(news.text))
        except Exception:
            # Was a bare ``except``; Exception keeps the best-effort
            # behaviour without swallowing KeyboardInterrupt/SystemExit.
            return
        _ns = root.find('list').attrib
        configs = [{'ty': x.attrib['id'], 'key': x.text}
                   for x in root.find('class')]
        data = {
            'last_recordcount': int(_ns['recordcount']),
            'pagesize': int(_ns['pagesize']),
            'pagecount': int(_ns['pagecount']),
            'configs': configs
        }
        coll.update_one({'id': site['id']}, {"$set": data})

    def set_classify(self, site, tid):
        '''
        Map a site category id to the classify id configured on the site.

        :param site: site document with a 'classify' dict
            {classify id: [category ids]}
        :param tid: category id (anything ``int()`` accepts)
        :return: the classify id as int, or 0 when no entry contains ``tid``
        '''
        tid = int(tid)
        for classify_id, tids in site['classify'].items():
            if tid in tids:
                return int(classify_id)
        return 0

    def get_films(self, _site):
        '''
        Crawl the XML video list of a site, starting at its 'last_page'.

        Films are matched on (film_name, note): unknown ones are inserted,
        known ones get their 'video' list merged via ``set_video``.  Entries
        without any playable source, or whose category id is not listed in
        the site's 'filter_tid', are skipped.

        :param _site: site document (only 'id' is read; the site is
            re-fetched from the database)
        :return: the page number following the last page requested
        '''
        coll = self.Mg.getCol('films')
        siteColl = self.Mg.getCol('site')
        site = siteColl.find_one({'id': _site['id']})
        # Start again from the last recorded page (re-fetches that page).
        page = site['last_page']
        last = max(int(site['pagecount']) - site['last_page'], 0)
        addNum = 0
        updateNum = 0
        sameNum = 0
        # NOTE(review): the loop runs up to last * pagesize times although
        # each iteration fetches one page; it normally ends through the
        # empty-list return below - confirm the intended bound.
        tqIt = tqdm(range(last * site['pagesize']),
                    total=last,
                    desc=self.set_desc(site, page, addNum, updateNum, sameNum))
        for _ in tqIt:
            # Record the page that is about to be fetched.
            siteColl.update_one({'_id': site['_id']},
                                {'$set': {'last_page': page}})
            url = site['api_url'] + '?ac=videolist&pg=%d' % page
            page += 1
            try:
                res = requests.get(url, timeout=20)
            except Exception:
                # NOTE(review): ``page`` was already incremented, so the
                # caller stores the page after the failed one - confirm.
                return page

            try:
                root = ET.fromstring(res.text.strip())
            except Exception:
                # Unparseable page: skip it and go on with the next one.
                continue

            film_list = root.find('list')
            if len(film_list) == 0:
                return page

            for child in film_list:
                last -= 1
                name_text = child.find('name').text
                film_name = name_text.upper() if name_text else ''
                note_text = child.find('note').text
                note = note_text.upper() if note_text else ''
                video = []
                for v in child.find('dl'):
                    flag = v.attrib['flag']
                    if flag == '' or not v.text:
                        continue
                    video.append({'key': flag, 'plist': v.text.split('#')})
                if not video:
                    continue
                tid = int(child.find('tid').text)
                # Skip unwanted categories before building the document.
                if tid not in site['filter_tid']:
                    continue
                year = self.set_year(child.find('year').text)
                pic_text = child.find('pic').text
                des_text = child.find('des').text
                info = {
                    'film_name': film_name,
                    'tid': tid,
                    'pic': pic_text.strip() if pic_text else '',
                    'site_id': site['id'],
                    'state': int(self.filter_film(film_name)),
                    'des': des_text.strip() if des_text else '',
                    'type': child.find('type').text,
                    'area': child.find('area').text,
                    'year': int(year),
                    'douban_initial_year': int(year),
                    'classify': self.set_classify(site, tid),
                    'note': note,
                    'video': video
                }

                old = coll.find_one({'film_name': film_name, 'note': note})
                if not old:
                    coll.insert_one(info)
                    addNum += 1
                else:
                    sameNum += 1
                    oldVideo = old['video'] if 'video' in old else []
                    coll.update_one(
                        {'_id': old['_id']},
                        {'$set': {
                            'video': self.set_video(video, oldVideo)
                        }})
                    updateNum += 1

                tqIt.set_description(
                    desc=self.set_desc(site, page, addNum, updateNum, sameNum))
        return page

    def update_site_json(self, site):
        '''
        Update record count, page size, page count and category list of a
        JSON site from its API.

        Two response layouts are handled: counters at the top level
        ('total', 'limit', 'pagecount') or nested under 'page'.

        :param site: site document ('api_url' and 'id' are read)
        :return: None
        '''
        coll = self.Mg.getCol('site')
        news = requests.get(site['api_url'], timeout=5).json()
        configs = [{'ty': x['type_id'], 'key': x['type_name']}
                   for x in news['class']]
        if 'total' in news:
            data = {
                'last_recordcount': int(news['total']),
                'pagesize': int(news['limit']),
                'pagecount': int(news['pagecount']),
                'configs': configs
            }
        else:
            page = news['page']
            data = {
                'last_recordcount': int(page['recordcount']),
                'pagesize': int(page['pagesize']),
                'pagecount': int(page['pagecount']),
                'configs': configs
            }
        coll.update_one({'id': site['id']}, {"$set": data})

    def get_films_json(self, _site):
        '''
        Crawl the JSON detail list of a site, starting at its 'last_page'.

        Same insert/merge rules as ``get_films``.  'tid' is stored as an int
        so both crawlers write the same type.

        :param _site: site document (only 'id' is read; the site is
            re-fetched from the database)
        :return: the page number following the last page requested
        '''
        coll = self.Mg.getCol('films')
        siteColl = self.Mg.getCol('site')
        site = siteColl.find_one({'id': _site['id']})
        # Start again from the last recorded page (re-fetches that page).
        page = site['last_page']
        # Remaining pages.
        last = int(site['pagecount']) - site['last_page']
        addNum = 0
        updateNum = 0
        sameNum = 0
        tqIt = tqdm(range(last * site['pagesize']),
                    desc=self.set_desc(site, page, addNum, updateNum, sameNum))

        for _ in tqIt:
            siteColl.update_one({'_id': site['_id']},
                                {'$set': {'last_page': page}})
            # API parameters per the original note: p = page number,
            # wd = keyword, cid = category id.
            url = site['api_url'] + '?ac=detail&pg=' + str(page)
            page += 1
            try:
                res = requests.get(url, timeout=20)
                root = res.json()
            except Exception:
                return page

            if len(root['list']) == 0:
                return page

            for child in root['list']:
                last -= 1
                film_name = child['vod_name'].upper(
                ) if child['vod_name'] else ''
                note = child['vod_remarks'].upper(
                ) if child['vod_remarks'] else ''
                play_from = child['vod_play_from']
                # NOTE(review): with several '$$$'-separated sources the
                # second source name is used as key while the play URL is
                # only split on '#' - confirm this pairing is intended.
                kkey = play_from.split('$$$')[1] \
                    if '$$$' in play_from else play_from

                video = [{
                    'key': kkey,
                    'plist': child['vod_play_url'].split('#')
                }]
                tid = int(child['type_id'])
                if tid not in site['filter_tid']:
                    continue
                year = self.set_year(child['vod_year'])
                info = {
                    'film_name': film_name,
                    'tid': tid,
                    'pic': child['vod_pic'].strip() if child['vod_pic'] else '',
                    'site_id': site['id'],
                    'state': int(self.filter_film(film_name)),
                    'des': child['vod_content'].strip()
                    if child['vod_content'] else '',
                    'type': child['type_name'],
                    'area': child['vod_area'],
                    'year': int(year),
                    'douban_initial_year': int(year),
                    'classify': self.set_classify(site, tid),
                    'note': note,
                    'video': video
                }

                old = coll.find_one({'film_name': film_name, 'note': note})
                if not old:
                    coll.insert_one(info)
                    addNum += 1
                else:
                    sameNum += 1
                    oldVideo = old['video'] if 'video' in old else []
                    coll.update_one(
                        {'_id': old['_id']},
                        {'$set': {
                            'video': self.set_video(video, oldVideo)
                        }})
                    updateNum += 1

                tqIt.set_description(
                    desc=self.set_desc(site, page, addNum, updateNum, sameNum))
        return page

    def set_year(self, old, max_year=None):
        '''
        Normalise a release-year value.

        The substrings '普通话' and '年' are removed before parsing.  Missing,
        non-numeric or out-of-range values yield 1970.

        :param old: raw year value (may be None or empty)
        :param max_year: highest accepted year; defaults to the current
            calendar year (previously a fixed cut-off of 2020)
        :return: int year with 1970 < year <= max_year, otherwise 1970
        '''
        if max_year is None:
            import datetime
            max_year = datetime.date.today().year
        if not old:
            return 1970
        cleaned = re.sub('普通话|年', '', str(old))
        try:
            year = int(cleaned)
        except ValueError:
            return 1970
        return year if 1970 < year <= max_year else 1970

    def set_video(self, newV, oldV):
        '''
        Merge two source lists: keep ``newV`` and append every entry of
        ``oldV`` whose 'key' is not present in ``newV``.

        :param newV: list of {'key', 'plist'} dicts; extended in place
        :param oldV: list of previously stored {'key', 'plist'} dicts
        :return: ``newV``
        '''
        new_keys = {item['key'] for item in newV}
        newV.extend(item for item in oldV if item['key'] not in new_keys)
        return newV

    def set_desc(self, site, page, addNum, updateNum, sameNum):
        '''
        Build the progress-bar description text.

        :return: formatted string with site name/id, page count, current
            page and the three counters
        '''
        return "[%s: %d, 总页数: %d, page: %d, 新增: %d, 修改: %d, 相同: %d]" % (
            site['site_name'], site['id'], site['pagecount'], page, addNum,
            updateNum, sameNum)

    def fixClassify(self):
        '''
        Back-fill 'new_tid' and 'classify' on stored films.

        First, for every site with a 'type_change' list, films of that site
        with the listed 'tid' and no 'new_tid' get 'new_tid' set.  Then films
        without 'classify' get one from the base table below, matched on
        'new_tid' when present, else on 'tid'.

        NOTE(review): 'classify' is written here as the string key of the
        table, whereas ``set_classify`` returns an int - confirm which type
        is wanted.
        '''
        baseClassifyDict = {
            "1":
            [1, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27, 29],
            "2": [2, 12, 13, 14, 15, 16, 17, 18, 48, 54, 34, 33],
            "3": [3, 45, 38, 37],
            "4": [4, 41, 47, 28, 39, 40],
            "5": [49, 51, 42, 43, 55, 19],
            "6": [60, 44, 43, 42, 30],
            "49": [49, 51, 52, 53, 55, 19]
        }
        coll = self.Mg.getCol('films')
        site_list = self.Mg.getCol('site').find()

        for site in site_list:
            if 'type_change' not in site:
                continue
            for t in site['type_change']:
                coll.update_many(
                    {
                        'site_id': site['id'],
                        'tid': int(t['ty']),
                        'new_tid': {"$exists": False}
                    }, {'$set': {'new_tid': int(t['change'])}})

        for k, v in baseClassifyDict.items():
            query = {
                "$or": [{
                    "$and": [{"new_tid": {"$exists": True}},
                             {"new_tid": {"$in": v}}]
                }, {
                    "$and": [{"new_tid": {"$exists": False}},
                             {"tid": {"$in": v}}]
                }],
                "classify": {"$exists": False}
            }
            coll.update_many(query, {'$set': {'classify': k}})
Esempio n. 7
0
class TaskUtil:
    '''
    Helper for operating on the 'tasks' collection.

    Shape of a task document:
    {
        "parser": "phase"
        "request": "http://so.eduyun.cn/synResource",
        "response": "<html>...</html>"
        "parent": {},
        "state": "done",
        "uptime":
    }
    '''
    def __init__(self, request):
        '''
        Open a MongoUtil handle and seed the 'tasks' collection when it is
        empty, using ``../conf/task.json`` relative to this source file.

        :param request: not used by this constructor
        '''
        self.mongoUtil = MongoUtil()
        if self.mongoUtil.count(collection_name='tasks') == 0:
            task_file = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), '..', 'conf',
                'task.json')
            # The context manager closes the file even if parsing fails.
            with open(task_file, encoding='utf-8') as f:
                insert_data = json.load(f)
            self.mongoUtil.insert(collection_name='tasks',
                                  insert_data=insert_data)

    def get_ready(self):
        '''
        Fetch one pending task and switch it to the 'doing' state.

        :return: dict, a single task (whatever
            ``mongoUtil.find_one_and_update`` returns)
        '''
        # Pending means: no 'state' field at all, or state == 'ready'.
        filter_dict = {
            '$or': [{'state': {'$exists': False}}, {'state': 'ready'}]
        }
        update_dict = {'$set': {'state': 'doing'}}
        return self.mongoUtil.find_one_and_update(collection_name='tasks',
                                                  filter_dict=filter_dict,
                                                  update_dict=update_dict)

    def set_state(self, id, state):
        '''
        Set the state of a task (ready, doing, done).

        :param id: str primary key of the task
        :param state: str new state value
        :return: None
        '''
        self.mongoUtil.update(collection_name='tasks',
                              filter_dict={'_id': id},
                              update_dict={'$set': {'state': state}})

    def replace_one(self, id, task):
        '''
        Replace a whole task document.

        :param id: str primary key of the task
        :param task: dict replacement document
        :return: the result of ``mongoUtil.find_one_and_replace``
        '''
        return self.mongoUtil.find_one_and_replace(collection_name='tasks',
                                                   filter_dict={'_id': id},
                                                   replace_dict=task)

    def insert_one(self, task):
        '''
        Insert a task unless an identical document already exists.

        The task is looked up using the dict itself as the filter; when
        nothing matches, its 'state' is set to 'ready' and it is inserted.

        :param task: dict task document; gains a 'state' key when inserted
        :return: None
        '''
        existing = self.mongoUtil.find_one(collection_name='tasks',
                                           filter_dict=task)
        if existing is None:
            task['state'] = 'ready'
            self.mongoUtil.insert(collection_name='tasks', insert_data=task)

    def __del__(self):
        # __del__ also runs when __init__ raised before mongoUtil was
        # assigned, so the attribute may be missing.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
Esempio n. 8
0
from utils.MongoUtil import MongoUtil

# Create a MongoUtil handle at import time and fetch its "site" collection.
Mg = MongoUtil()
col = Mg.getCol("site")

# site_config = {}
# for x in col.find():
#     site_config[x['id']] = x['configs']
#
# print(site_config)

classify = {
    '1': {
        '1': [1, 5, 6, 7, 8, 9, 10, 11],
        '2': [2, 12, 13, 14, 15, 16, 17, 18, 54],
        '3': [4, 41, 39, 40, 47],
        '49': [19, 49, 51, 52, 53, 55]
    },
    '2': {
        '1': [1, 5, 6, 7, 8, 9, 10, 11, 24],
        '2': [2, 12, 13, 14, 15, 17, 18, 20],
        '3': [4, 41, 39, 40, 47],
        '49': [16, 19.21, 22]
    },
    '3': {
        '1': [1, 5, 6, 7, 8, 9, 10, 11],
        '2': [2, 12, 13, 14, 15, 17, 18, 19, 21],
        '3': [4],
        '49': [16]
    },
    '4': {
Esempio n. 9
0
class TaskUtil:
    '''
    Helper for operating on the 'tasks' collection.

    Shape of a task document:
    {
        "parser": "phase"
        "request": "http://so.eduyun.cn/synResource",
        "response": "<html>...</html>"
        "parent": {},
        "state": "done",
        "uptime":
    }
    '''
    def __init__(self):
        '''
        Open a MongoUtil handle and, when the 'tasks' collection is empty,
        insert the first task described by the ``Setting.FIRST_TASK_*``
        values.
        '''
        self.mongoUtil = MongoUtil()
        if self.mongoUtil.count(collection_name='tasks') == 0:
            insert_data = {
                "parser": Setting.FIRST_TASK_PARSER,
                "request": Setting.FIRST_TASK_URL,
                "table": Setting.FIRST_TASK_TABLE,
                "parent": {},
                "state": "ready"
            }
            self.mongoUtil.insert(collection_name='tasks',
                                  insert_data=insert_data)

    def get_ready(self):
        '''
        Fetch one pending task and switch it to the 'doing' state.

        :return: dict, a single task (whatever
            ``mongoUtil.find_one_and_update`` returns)
        '''
        # Pending means: no 'state' field at all, or state == 'ready'.
        filter_dict = {
            '$or': [{'state': {'$exists': False}}, {'state': 'ready'}]
        }
        update_dict = {'$set': {'state': 'doing'}}
        return self.mongoUtil.find_one_and_update(collection_name='tasks',
                                                  filter_dict=filter_dict,
                                                  update_dict=update_dict)

    def set_state(self, id, state):
        '''
        Set the state of a task (ready, doing, done).

        :param id: str primary key of the task
        :param state: str new state value
        :return: None
        '''
        self.mongoUtil.update(collection_name='tasks',
                              filter_dict={'_id': id},
                              update_dict={'$set': {'state': state}})

    def replace_one(self, id, task):
        '''
        Replace a whole task document.

        :param id: str primary key of the task
        :param task: dict replacement document
        :return: the result of ``mongoUtil.find_one_and_replace``
        '''
        return self.mongoUtil.find_one_and_replace(collection_name='tasks',
                                                   filter_dict={'_id': id},
                                                   replace_dict=task)

    def insert_one(self, task):
        '''
        Insert a task unless an identical document already exists.

        The task is looked up using the dict itself as the filter; when
        nothing matches, its 'state' is set to 'ready' and it is inserted.

        :param task: dict task document; gains a 'state' key when inserted
        :return: None
        '''
        existing = self.mongoUtil.find_one(collection_name='tasks',
                                           filter_dict=task)
        if existing is None:
            task['state'] = 'ready'
            self.mongoUtil.insert(collection_name='tasks', insert_data=task)

    def __del__(self):
        # __del__ also runs when __init__ raised before mongoUtil was
        # assigned, so the attribute may be missing.
        mongo_util = getattr(self, 'mongoUtil', None)
        if mongo_util is not None:
            mongo_util.close_conn()
Esempio n. 10
0
 def __init__(self, task):
     '''
     Keep the task and create the TaskUtil / MongoUtil helpers.

     :param task: the task object stored on the instance as ``task``
     '''
     self.task = task
     # TaskUtil is created before MongoUtil, as in the original order.
     self.taskUtil = TaskUtil()
     self.mongoUtil = MongoUtil()
Esempio n. 11
0
 def __init__(self):
     '''Create a MongoUtil handle and keep it as ``mongoUtil``.'''
     self.mongoUtil = MongoUtil()
Esempio n. 12
0
def get_site(site_id=8):
    '''
    Print one document of the 'site' collection, selected by its 'id'.

    Uses ``find_one`` so only a single document is fetched; when no site
    matches, ``None`` is printed instead of raising IndexError.

    :param site_id: value of the 'id' field to look up (defaults to 8, the
        previously hard-coded id)
    :return: None
    '''
    siteColl = MongoUtil().getCol('site')
    print(siteColl.find_one({'id': site_id}))
Esempio n. 13
0
def insert_site():
    '''
    Insert the hard-coded 'okzy' site definition into the 'site' collection.
    '''
    # (type_id, type_name) pairs that make up the site's 'configs' list.
    type_names = [
        (1, '电影'),
        (2, '连续剧'),
        (3, '综艺'),
        (4, '动漫'),
        (5, '资讯'),
        (6, '动作片'),
        (7, '喜剧片'),
        (8, '爱情片'),
        (9, '科幻片'),
        (10, '恐怖片'),
        (11, '剧情片'),
        (12, '战争片'),
        (13, '国产剧'),
        (14, '香港剧'),
        (15, '韩国剧'),
        (16, '欧美剧'),
        (17, '公告'),
        (18, '头条'),
        (20, '纪录片'),
        (21, '微电影'),
        (22, '台湾剧'),
        (23, '日本剧'),
        (24, '海外剧'),
        (25, '内地综艺'),
        (26, '港台综艺'),
        (27, '日韩综艺'),
        (28, '欧美综艺'),
        (29, '国产动漫'),
        (30, '日韩动漫'),
        (31, '欧美动漫'),
        (32, '港台动漫'),
        (33, '海外动漫'),
        (34, '福利片'),
        (35, '解说'),
        (36, '电影解说'),
        (37, '伦理片'),
    ]
    site_doc = {
        'id': 34,
        'site_key': 'okzy',
        'api_url':
        'https://api.okzy.tv/api.php/provide/vod/at/json/?ac=detail',
        'site_name': 'okzy',
        'configs': [{'type_id': type_id, 'type_name': type_name}
                    for type_id, type_name in type_names],
        'data_type': 'json',
        'filter_tid': [3, 5, 17, 18, 20, 21, 22, 25, 26, 27, 28, 35, 36]
    }
    site_collection = MongoUtil().getCol('site')
    site_collection.insert_one(site_doc)
Esempio n. 14
0
 def __init__(self):
     '''
     Start with empty proxy and search lists and fetch the 'films'
     collection from a new MongoUtil.
     '''
     self.proxyArr, self.searchFilmList = [], []
     self.film_coll = MongoUtil().getCol('films')
Esempio n. 15
0
class DoubanSpider(object):
    """Enrich documents of the 'films' collection with data scraped from Douban.

    Two independent passes are offered:

    * ``run_search_link`` drives headless-browser workers that search Douban
      by film name and store the first hit's url, cover and rating.
    * ``run_search_info`` downloads each stored ``douban_url`` and stores
      details parsed from the page (country, date, cast, summary, ...).

    Both passes consume ``self.searchFilmList`` as a shared work list and use
    the proxies kept in ``self.proxyArr``.
    """

    # Upper bound (seconds) on polling for the ``.item-root`` result node
    # before a film is reported as "no find".
    ITEM_ROOT_WAIT_SECONDS = 10

    # Timeout (seconds) for the blocking proxy-provider HTTP calls.
    PROXY_API_TIMEOUT = 10

    # Page texts that make searchLink() discard the proxy and restart the browser.
    _BLOCKED_MARKERS = ('检测到有异常', '服务异常', '请登录后再试')

    # Page texts that make searchLink() flag the film with douban_state = 1.
    _NO_RESULT_MARKERS = ('换个搜索词试试吧', '根据相关法律法规和政策')

    def __init__(self):
        self.proxyArr = []
        self.searchFilmList = []
        # Remaining-work counter; set by the run_* methods, initialised here
        # so get_film_info() never meets a missing attribute.
        self.count = 0
        self.film_coll = MongoUtil().getCol('films')

    def run_search_link(self):
        """Search Douban for every film that still lacks a rating.

        Selects films with no ``douban_rating``, ``douban_state != 1`` and
        ``classify`` 1, then runs 10 concurrent ``searchLink`` workers until
        the work list is empty.
        """
        self.init_proxy(5)
        query = {
            "douban_rating": {"$exists": False},
            "douban_state": {"$ne": 1},
            "classify": {"$in": [1.0]},
        }
        projection = {"_id": 1.0, 'film_name': 1}

        self.searchFilmList = list(self.film_coll.find(query, projection=projection))
        self.count = len(self.searchFilmList)
        print(self.count)
        # asyncio.wait() no longer accepts bare coroutines (Python 3.11+), so
        # the workers are gathered inside a coroutine run by asyncio.run().
        asyncio.run(self._run_search_workers(10))

    async def _run_search_workers(self, worker_count):
        """Run ``worker_count`` searchLink() workers and report any that failed."""
        results = await asyncio.gather(
            *(self.searchLink() for _ in range(worker_count)),
            return_exceptions=True)
        for result in results:
            if isinstance(result, BaseException):
                print('searchLink worker failed: %r' % (result,))

    def run_search_info(self):
        """Fetch Douban detail pages for films that have a url but no country yet.

        Selects films that have ``douban_url`` (not ``"no"``) and
        ``douban_rating``, lack ``douban_country`` and are not flagged
        ``douban_state == 1``; submits one ``get_film_info`` task per film.
        """
        self.init_proxy(2)
        # "$and" is required because "douban_url" is constrained twice.
        query = {'$and': [
            {"douban_url": {"$exists": True}},
            {"douban_rating": {"$exists": True}},
            {"douban_url": {"$ne": "no"}},
            {"douban_country": {"$exists": False}},
            {"douban_state": {"$ne": 1}},
        ]}
        projection = {"_id": 1, 'douban_url': 1, 'year': 1}
        self.searchFilmList = list(self.film_coll.find(query, projection=projection))
        self.count = len(self.searchFilmList)

        pool = AsyncPool(maxsize=1)
        for _ in range(self.count):
            pool.submit(self.get_film_info())
        pool.release()
        pool.wait()

    def init_proxy(self, num=2):
        """Request ``num`` proxies from the provider and append them to the pool.

        Performs three blocking HTTP calls followed by a one second sleep:
        fetch a value from the first endpoint, pass it as ``white=`` to the
        provider's ``save_white`` endpoint, then request the proxies.

        :param num: number of proxies to request.
        """
        # NOTE(review): the API keys are hard-coded in these URLs; consider
        # moving them to configuration.
        ip = requests.get('https://www.anpaitm.com/index/test?key=JGbeg4gFJD875obads',
                          timeout=self.PROXY_API_TIMEOUT)
        requests.get(
            'http://wapi.http.linkudp.com/index/index/save_white?neek=352139&appkey=3fdebe3814651abea473ec39b5dc53d1&white=' + ip.text,
            timeout=self.PROXY_API_TIMEOUT)
        res = requests.get(
            f'http://webapi.http.zhimacangku.com/getip?num={num}&type=2&pro=&city=0&yys=0&port=11&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=',
            timeout=self.PROXY_API_TIMEOUT)
        js = res.json()
        # A reply without a usable "data" list simply adds no proxies.
        for x in js.get('data') or []:
            self.proxyArr.append(f"{x['ip']}:{x['port']}")
        time.sleep(1)

    def get_proxy(self):
        """Return a random ``ip:port`` from the pool, or ``''`` when it is empty."""
        return random.choice(self.proxyArr) if self.proxyArr else ''

    def del_proxy(self, proxy):
        """Drop a failed proxy and refill the pool once fewer than two remain.

        :param proxy: ``ip:port`` string to discard; falsy values are ignored.
        """
        if not proxy:
            return

        if proxy in self.proxyArr:
            self.proxyArr.remove(proxy)

        if len(self.proxyArr) < 2:
            self.init_proxy()

    async def get_browser(self):
        """Launch a browser with a randomised user agent through a pool proxy.

        :return: ``(browser, proxy)``; ``proxy`` is ``''`` when the pool is
            empty, in which case no ``--proxy-server`` argument is passed.
        """
        proxy = self.get_proxy()
        args = [
            f"--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(56, 70)}.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.43{random.randint(0, 10)}.400",
        ]
        if proxy:
            args.append('--proxy-server=%s' % proxy)
        browser = await launch({'headless': False, 'autoClose': True, 'args': args})
        return browser, proxy

    @staticmethod
    def filter_film_name(name):
        """Strip quality tags, dub tags, bracketed notes and stray punctuation.

        Removes "超清720P", "高清" plus trailing word characters, "%", ".",
        "'", newlines, a run of seven spaces, two ideographic spaces,
        full-width and ASCII parenthesised text, "普通话版" and "粤语版",
        then trims surrounding whitespace.

        :param name: raw title (also used for description text).
        :return: cleaned text.
        """
        # The full-width parentheses are written as \uff08/\uff09 escapes. A
        # bare ``(\w+)`` here would be a capture group matching every word
        # character and would blank out the whole title.
        filter_re = re.compile(
            r"超清720P|高清\w*|%|\.|'|\n|       |\u3000\u3000|\uff08\w+\uff09|普通话版|粤语版|\((.*)\)")
        return filter_re.sub('', name).strip()

    @staticmethod
    async def intercept_request(req):
        """Abort image/media/eventsource/websocket requests, let the rest through.

        Declared static so it can be called as ``self.intercept_request(req)``
        from a request hook. Interception is not enabled by searchLink().
        """
        if req.resourceType in ['image', 'media', 'eventsource', 'websocket']:
            await req.abort()
        else:
            await req.continue_()

    @staticmethod
    async def _close_quietly(*resources):
        """Close each page/browser given, logging a failed close instead of raising."""
        for resource in resources:
            if resource is None:
                continue
            try:
                await resource.close()
            except Exception as exc:
                print('close failed: %r' % (exc,))

    async def _restart_browser(self, browser, page, proxy):
        """Discard ``proxy``, close the current browser and launch a fresh one."""
        self.del_proxy(proxy)
        await self._close_quietly(page, browser)
        await asyncio.sleep(random.uniform(0.5, 1.5))
        new_browser, new_proxy = await self.get_browser()
        new_page = await new_browser.newPage()
        return new_browser, new_page, new_proxy

    async def _load_search_page(self, page, link):
        """Navigate to ``link``; return the body text, or None when blocked."""
        response = await page.goto(link, {'timeout': 50 * 1000})
        body = await page.J('body')
        body_text = await page.evaluate('item=>item.textContent', body)
        if response is None or response.status != 200:
            return None
        if any(marker in body_text for marker in self._BLOCKED_MARKERS):
            return None
        return body_text

    async def _extract_first_result(self, page, film_name):
        """Return the update dict for the first search hit, or None if absent."""
        # Re-query on every poll so a late-rendered result is picked up and a
        # page without any result cannot stall the worker forever.
        item_root = await page.J('.item-root')
        waited = 0
        while not item_root and waited < self.ITEM_ROOT_WAIT_SECONDS:
            await asyncio.sleep(1)
            waited += 1
            item_root = await page.J('.item-root')
        if not item_root:
            return None

        cover_link = await item_root.J('.cover-link')
        cover_img = await item_root.J('.cover-link >.cover')
        if not cover_link or not cover_img:
            return None

        rating_node = await item_root.J('.rating >.rating_nums')
        if rating_node:
            douban_rating = await page.evaluate('item=>item.textContent', rating_node)
        else:
            # Placeholder stored when the hit shows no rating node.
            douban_rating = 1.1
        return {
            'douban_pic': await page.evaluate('item=>item.src', cover_img),
            'douban_url': await page.evaluate('item=>item.href', cover_link),
            'douban_rating': douban_rating,
            'film_name': film_name
        }

    async def searchLink(self):
        """Worker: search Douban by film name until the shared list is empty.

        For each film popped from ``self.searchFilmList``:

        * blocked page or navigation/scraping error -> discard the proxy,
          restart the browser and skip the film (nothing is stored, so a
          failed load can never store the previous film's data);
        * "no result" page -> store ``douban_state = 1``;
        * no usable result node within ``ITEM_ROOT_WAIT_SECONDS`` -> print
          'no find' (nothing is stored);
        * otherwise store url, cover, rating and the cleaned film name.

        The browser is always closed on exit, including on errors.
        """
        from urllib.parse import quote

        browser = None
        page = None
        try:
            browser, proxy = await self.get_browser()
            page = await browser.newPage()
            # Request interception is left disabled; intercept_request()
            # exists for use with page.setRequestInterception(True).

            while True:
                await asyncio.sleep(random.uniform(0.5, 1.5))
                if not self.searchFilmList:
                    break
                film = self.searchFilmList.pop()
                film_name = self.filter_film_name(film['film_name'])
                print(film_name)
                # Percent-encode so characters such as '&' or '#' in a title
                # cannot truncate or alter the query string.
                link = 'https://search.douban.com/movie/subject_search?search_text=' + quote(film_name)

                body_text = None
                no_result = False
                data = None
                try:
                    body_text = await self._load_search_page(page, link)
                    if body_text is not None:
                        no_result = any(marker in body_text for marker in self._NO_RESULT_MARKERS)
                        if not no_result:
                            data = await self._extract_first_result(page, film_name)
                except Exception as exc:
                    # Worker-loop boundary: log and handle like a blocked page.
                    print(repr(exc))
                    body_text = None

                if body_text is None:
                    print('检测到有异常')
                    browser, page, proxy = await self._restart_browser(browser, page, proxy)
                    continue

                if no_result:
                    self.film_coll.update_one({'_id': film['_id']}, {'$set': {'douban_state': 1}})
                    print("\r剩余--: %d" % len(self.searchFilmList), end='')
                    continue

                if data is None:
                    print('no find')
                    continue

                self.film_coll.update_one({'_id': film['_id']}, {'$set': data})
                print("\r剩余: %d" % len(self.searchFilmList), end='')
        finally:
            await self._close_quietly(page, browser)

    async def get_film_info(self):
        """Download one film's Douban page and store the parsed details.

        Pops a film from ``self.searchFilmList`` (returns silently when it is
        empty). A failed request or a non-200 status discards the proxy and
        prints 'err'; a page without the expected nodes prints 'err'. Pages
        whose info box carries person fields are flagged ``douban_state = 1``.
        """
        try:
            film = self.searchFilmList.pop()
        except IndexError:
            return
        proxy = self.get_proxy()
        headers = {
            'Host': 'movie.douban.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3904.108 Safari/537.36'}
        try:
            async with aiohttp.ClientSession(trust_env=True) as sess:
                # NOTE(review): aiohttp timeouts are in seconds, so 5000 is
                # ~83 minutes; possibly meant as milliseconds - confirm.
                # An empty pool must yield proxy=None, not the invalid "http://".
                async with sess.get(url=film['douban_url'], headers=headers, timeout=5000,
                                    proxy=("http://" + proxy) if proxy else None) as response:
                    if response.status != 200:
                        self.del_proxy(proxy)
                        print('err')
                        return
                    page_text = await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # Treat transport failures like a bad status.
            print(repr(exc))
            self.del_proxy(proxy)
            print('err')
            return

        soup = BeautifulSoup(page_text, 'html.parser')
        info = soup.find('div', id="info")
        if info is None:
            # Status 200 but not a subject page; nothing to parse.
            print('err')
            return

        # Person fields (occupation, gender, official site, biography) mark a
        # page that is not a film: flag it so it is not selected again.
        if info.find('span', text=re.compile("^职业.?")) or info.find('span', text=re.compile("^性别.?")) or \
                info.find('dt', text=re.compile("^官方网站.?")) or info.find('h2', text=re.compile("影人简介.?")):
            self.film_coll.update_one({'_id': film['_id']}, {'$set': {'douban_state': 1}})
            self.count -= 1
            print("\r剩余-: %d" % self.count, end='')
            return

        # Country: text following the "制片国家" label, else the bracketed
        # part of the initial release date.
        label = info.find(name='span', class_=re.compile('pl'), text=re.compile("^制片国家.?"))
        sibling = label.next_sibling if label is not None else None
        producer = sibling.replace(' ', '') if isinstance(sibling, str) else ''
        if producer == '':
            release = info.find('span', attrs={'property': 'v:initialReleaseDate'})
            release_text = release.string if release else ''
            bracketed = re.findall(r'[0123456789-]?\((.*)\)', release_text) if release_text else []
            producer = bracketed[0] if bracketed else ''

        ld_json = soup.find('script', attrs={'type': 'application/ld+json'})
        if ld_json is None or not ld_json.string:
            print('err')
            return
        try:
            data_dict = json.loads(ld_json.string, strict=False)
        except ValueError as exc:
            print(repr(exc))
            print('err')
            return

        douban_name = data_dict.get('name') or ''
        directors = data_dict.get('director') or []
        director = directors[0]['name'] if directors else ''
        actor = [person['name'] for person in data_dict.get('actor') or []]

        initial_date = data_dict.get('datePublished')
        if not initial_date:
            # No published date: fall back to the stored year when plausible,
            # else 1990.
            # NOTE(review): the 2022 upper bound is hard-coded.
            year = 1990
            try:
                stored_year = int(film.get('year') or 0)
            except (TypeError, ValueError):
                stored_year = 0
            if 1960 < stored_year < 2022:
                # int() keeps a float year from rendering as "2005.0-01-01".
                year = stored_year
            initial_date = str(year) + '-01-01'
        douban_initial_year = initial_date.split('-')[0]

        # Prefer the expanded ".all" summary over the first summary span.
        summary_spans = soup.select('#link-report span')
        if summary_spans:
            full_summary = soup.select('#link-report .all')
            des = self.filter_film_name((full_summary or summary_spans)[0].get_text())
        else:
            des = ''

        data = {
            'douban_genre': data_dict.get('genre', []),
            'douban_country': producer,
            'douban_initial_date': initial_date,
            'douban_initial_year': douban_initial_year,
            'actor': actor,
            'director': director,
            # NOTE(review): names over 200 chars are cut to 100, not 200 -
            # kept as found; confirm the intended limit.
            'douban_name': douban_name if len(douban_name) <= 200 else douban_name[:100],
            'douban_des': des
        }
        print(data)
        print(film)
        self.film_coll.update_one({'_id': film['_id']}, {'$set': data})
        self.count -= 1
        print("\r剩余: %d" % self.count, end='')
Esempio n. 16
0
import re
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from bson.objectid import ObjectId

import site_config
from utils.MongoUtil import MongoUtil

# Flask application object for this module.
app = Flask(__name__)
# Enable cross-origin access: every resource path ("*") accepts any origin ("*").
cors = CORS(app, resources={r"*": {"origins": "*"}})
# Module-level MongoUtil instance (project helper), created once at import time.
Mg = MongoUtil()

film_filters = [{
    'key':
    'classify',
    'list': [{
        'name': '电影',
        'value': 1
    }, {
        'name': '连续剧',
        'value': 2
    }, {
        'name': '动漫',
        'value': 3
    }, {
        'name': '综艺',
        'value': 5
    }]
}, {
    'key': 'order',