Exemple #1
0
def real_mapper(queryset):
    """Run the scraping task described by one queue message.

    ``queryset`` is read for its ``task``, ``payload`` and ``year``
    attributes.  The ``Message`` documents matching that task and payload
    are flagged ``inprocess`` while the work runs.

    For the ``'Movie'`` task each payload id is looked up with
    ``parse.get_movie_info`` and saved as a ``models.Movie``.

    For every other task the ``<task>Parse`` class from ``parse`` is called
    page after page; each page's result is saved through the model named
    after the task, and the parser's aliases are merged into
    ``models.AliasName``.  A payload id that finishes without error is added
    to the ``IdFinished`` document set for ``queryset.year``.

    On success the message is marked ``state=2`` and ``inprocess=False``.

    Raises:
        Exception: whatever the parser or the database raised.  Before it
            propagates, the message is marked ``state=3`` and ``inprocess``
            is cleared so it is not left flagged as running.
    """
    this = Message.objects(task=queryset.task, payload=queryset.payload)
    Model = getattr(models, queryset.task)
    this.update(set__inprocess=True)
    if queryset.task == 'Movie':
        for process in queryset.payload:
            ret = parse.get_movie_info(process)
            ret['movieid'] = process
            models.Movie(**ret).save()
        # NOTE(review): this path returns with ``inprocess`` still True and
        # ``state`` untouched - confirm that is intentional.
        return
    Parse = getattr(parse, queryset.task + 'Parse')
    for process in queryset.payload:
        try:
            p = Parse(process)
            count = 1
            while True:
                haspage = p()
                if haspage is None:
                    # Most likely a 404.
                    break
                result, hasnext = haspage
                Model(**result).save()
                # Alias system: recording a person once globally is enough
                # to know all of their aliases.
                for k, v in p._alias.items():
                    models.AliasName.objects.get_or_create(name=k)[0].update(
                        add_to_set__alias=v)
                if not hasnext:
                    # No next page: leave the loop.
                    break
                count += 1
                url = p.original_url
                p.set_url(url.replace('.html', '-{}.html'.format(count)))
        except Exception:
            # Record the failure and release the message before the error
            # propagates; previously this bookkeeping was unreachable.
            this.update(set__state=3)
            this.update(set__inprocess=False)
            raise
        else:
            models.IdFinished.objects(year=queryset.year).update(
                add_to_set__ids=[process])
    this.update(set__state=2)
    this.update(set__inprocess=False)
Exemple #2
0
def real_mapper(queryset):
    """Run the scraping task described by one queue message.

    ``queryset`` is read for its ``task``, ``payload`` and ``year``
    attributes.  The ``Message`` documents matching that task and payload
    are flagged ``inprocess`` while the work runs.

    For the ``'Movie'`` task each payload id is looked up with
    ``parse.get_movie_info`` and saved as a ``models.Movie``.

    For every other task the ``<task>Parse`` class from ``parse`` is called
    page after page; each page's result is saved through the model named
    after the task, and the parser's aliases are merged into
    ``models.AliasName``.  A payload id that finishes without error is added
    to the ``IdFinished`` document set for ``queryset.year``.

    On success the message is marked ``state=2`` and ``inprocess=False``.

    Raises:
        Exception: whatever the parser or the database raised.  Before it
            propagates, the message is marked ``state=3`` and ``inprocess``
            is cleared so it is not left flagged as running.
    """
    this = Message.objects(task=queryset.task, payload=queryset.payload)
    Model = getattr(models, queryset.task)
    this.update(set__inprocess=True)
    if queryset.task == 'Movie':
        for process in queryset.payload:
            ret = parse.get_movie_info(process)
            ret['movieid'] = process
            models.Movie(**ret).save()
        # NOTE(review): this path returns with ``inprocess`` still True and
        # ``state`` untouched - confirm that is intentional.
        return
    Parse = getattr(parse, queryset.task + 'Parse')
    for process in queryset.payload:
        try:
            p = Parse(process)
            count = 1
            while True:
                haspage = p()
                if haspage is None:
                    # Most likely a 404.
                    break
                result, hasnext = haspage
                Model(**result).save()
                # Alias system: recording a person once globally is enough
                # to know all of their aliases.
                for k, v in p._alias.items():
                    models.AliasName.objects.get_or_create(
                        name=k)[0].update(add_to_set__alias=v)
                if not hasnext:
                    # No next page: leave the loop.
                    break
                count += 1
                url = p.original_url
                p.set_url(url.replace('.html', '-{}.html'.format(count)))
        except Exception:
            # Record the failure and release the message before the error
            # propagates; previously this bookkeeping was unreachable.
            this.update(set__state=3)
            this.update(set__inprocess=False)
            raise
        else:
            models.IdFinished.objects(
                year=queryset.year).update(add_to_set__ids=[process])
    this.update(set__state=2)
    this.update(set__inprocess=False)
Exemple #3
0
 def __init__(self, map_func, num_workers=None, **kwargs):
     """Store the mapper, select the messages to run and start the pool.

     ``map_func`` is kept as-is on the instance.  ``num_workers`` and any
     extra keyword arguments are forwarded to ``multiprocessing.Pool``.
     """
     self.map_func = map_func
     # Inputs are the messages whose state is not 2 and which are not
     # currently flagged as in process.
     pending = Message.objects(state__ne=2, inprocess__ne=True)
     self.inputs = pending
     worker_pool = multiprocessing.Pool(num_workers, **kwargs)
     self.pool = worker_pool
Exemple #4
0
def mtime_beat():
    """Queue scraping tasks for one year; each run handles a single year.

    The year is ``get_year() + 1``.  Its movie ids are collected from every
    listing page, ids already recorded in ``IdFinished`` are filtered out
    through ``get_unfinished``, and the remainder is pushed as ``Message``
    documents, one per task and payload group.  The year is then recorded
    in ``YearFinished``.

    When the first page cannot be fetched the scheduler interval is widened
    and the function returns without queuing anything.  When the year has
    no movies it is marked finished and the function calls itself again.
    """
    def _back_off():
        # Widen the scheduler interval unless it already reached the
        # TASK_BEAT * 7 ceiling; report whether it was widened.
        if scheduler.get_interval < TASK_BEAT * 7:
            scheduler.change_interval(incr=True)
            return True
        return False

    task_names = ('Fullcredits', 'Movie', 'Comment', 'Character',
                  'MicroComment', 'Scenes', 'Awards', 'Plot',
                  'Details')

    year = get_year() + 1  # the year to crawl
    debug('Fetch Year: {} starting...'.format(year))
    listing = fetch(year, 1)
    page_total = get_movie_pages(listing)
    if page_total is None:
        warn('Movie"page has not fetched')
        # Adapt the execution interval.
        _back_off()
        return
    first_ids = get_movie_ids(listing)
    if first_ids is None:
        # The adaptive interval must not grow too large either.
        warn('Movie has not fetched')
        _back_off()
        return
    # The task can run again, so go back to the default interval.
    if scheduler.get_interval > TASK_BEAT:
        debug('Interval back to default')
        scheduler.change_interval(TASK_BEAT)
    movie_ids = []
    movie_ids.extend(first_ids)
    if not movie_ids:
        # No movies for this year.
        debug('Year: {} has not movie'.format(year))
        YearFinished(year=year).save()
        sleep2()
        return mtime_beat()
    if page_total > 1:
        page_no = 2
        while page_no <= page_total:
            listing = fetch(year, page_no)
            debug('Fetch Year:{} Page:{}'.format(year, page_no))
            page_ids = get_movie_ids(listing)
            if page_ids is None:
                if _back_off():
                    # A captcha was requested: enter it manually or wait a
                    # while and retry the same page until it works again.
                    sleep2(VERIFY_INTERVAL)
                    continue
                # Interval is already at its ceiling: skip this page.
                page_ids = []
            movie_ids.extend(page_ids)
            page_no += 1
            sleep2()
    finished_doc = IdFinished.objects(year=year).first()
    has_finished = finished_doc.ids if finished_doc is not None else []
    to_process = get_unfinished(has_finished, movie_ids)
    # Add the tasks to their queues.
    for payload in group(to_process, TASK_BEAT_NUM):
        for task in task_names:
            debug('Push payload: {} to {} Queue'.format(payload, task))
            try:
                Message(year=year, task=task, payload=payload).save()
            except NotUniqueError:
                debug('Duplicate insert: [{}], payload: {}'.format(task, payload))
    # The data for this year is now in the message queue.
    YearFinished(year=year).save()
    debug('Year: {} done'.format(year))
Exemple #5
0
                for k, v in p._alias.items():
                    models.AliasName.objects.get_or_create(name=k)[0].update(
                        add_to_set__alias=v)
                if hasnext:
                    count += 1
                    url = p.original_url
                    p.set_url(url.replace('.html', '-{}.html'.format(count)))
                else:
                    #没有下一页就退出循环
                    break
        except:
            raise
            STATE = False
        else:
            models.IdFinished.objects(year=queryset.year).update(
                add_to_set__ids=[process])
    if STATE:
        this.update(set__state=2)
    else:
        this.update(set__state=3)
    this.update(set__inprocess=False)


# Re-run the mapper over every message whose state is not 2, i.e. every
# message real_mapper has not marked as successfully finished.
pending_messages = Message.objects(state__ne=2)

for message in pending_messages:
    # Errors from real_mapper propagate unchanged and stop the run.
    real_mapper(message)
Exemple #6
0
                # 别名体系, 这样只需要全局记录一个人物就知道他们的全部别名
                for k, v in p._alias.items():
                    models.AliasName.objects.get_or_create(
                        name=k)[0].update(add_to_set__alias=v)
                if hasnext:
                    count += 1
                    url = p.original_url
                    p.set_url(url.replace('.html', '-{}.html'.format(count)))
                else:
                    #没有下一页就退出循环
                    break
        except:
            raise
            STATE = False
        else:
            models.IdFinished.objects(
                year=queryset.year            ).update(add_to_set__ids=[process])
    if STATE:
        this.update(set__state=2)
    else:
        this.update(set__state=3)
    this.update(set__inprocess=False)

# Re-run the mapper over every message whose state is not 2, i.e. every
# message real_mapper has not marked as successfully finished.
pending_messages = Message.objects(state__ne=2)

for message in pending_messages:
    # Errors from real_mapper propagate unchanged and stop the run.
    real_mapper(message)