Code Example #1
File: crawl.py  Project: devinshields/Mturk-Tracker
    def handle(self, *args, **options):

        self.mturk_email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
        self.mturk_password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)

        _start_time = time.time()
        pid = Pid('mturk_crawler', True)
        log.info('crawler started: %s;;%s', args, options)

        if options.get('mturk_email'):
            self.mturk_email = options['mturk_email']
        if options.get('mturk_password'):
            self.mturk_password = options['mturk_password']

        if options.get('logconf', None):
            self.setup_logging(options['logconf'])

        if options.get('debug', False):
            self.setup_debug()
            print 'Current process pid: %s' % pid.actual_pid
            print ('To debug, type: python -c "import os,signal; '
                'os.kill(%s, signal.SIGUSR1)"\n') % pid.actual_pid

        self.maxworkers = options['workers']
        if self.maxworkers > 9:
            # If you want to remove this limit, don't forget to raise the
            # maximum number of connections on the dbpool object. Each worker
            # fetches 10 hitgroups and spawns a single task for each of them,
            # and every task gets its own connection instance, so 9 workers
            # already require 9x10 = 90 connections.
            #
            # Also, with too many workers Amazon stops returning valid data,
            # and retrying then takes much longer than simply using fewer
            # workers.
            sys.exit('Too many workers (more than 9). Quit.')
        start_time = datetime.datetime.now()

        hits_available = tasks.hits_mainpage_total()
        groups_available = tasks.hits_groups_total()

        # create crawl object that will be filled with data later
        crawl = Crawl.objects.create(
                start_time=start_time,
                end_time=start_time,
                success=True,
                hits_available=hits_available,
                hits_downloaded=0,
                groups_available=groups_available,
                groups_downloaded=groups_available)
        log.debug('fresh crawl object created: %s', crawl.id)

        # fetch requester profiles so we can decide whether their hitgroups
        # are public or not
        reqesters = RequesterProfile.objects.all_as_dict()

        dbpool = ThreadedConnectionPool(10, 90,
            'dbname=%s user=%s password=%s' % (
                settings.DATABASES['default']['NAME'],
                settings.DATABASES['default']['USER'],
                settings.DATABASES['default']['PASSWORD']))
        # collection of group_ids that were already processed - this should
        # protect us from duplicating data
        processed_groups = set()
        total_reward = 0
        hitgroups_iter = self.hits_iter()

        for hg_pack in hitgroups_iter:
            jobs = []
            for hg in hg_pack:
                if hg['group_id'] in processed_groups:
                    log.debug('Group already in processed_groups, skipping.')
                    continue
                processed_groups.add(hg['group_id'])

                j = gevent.spawn(tasks.process_group,
                        hg, crawl.id, reqesters, processed_groups, dbpool)
                jobs.append(j)
                total_reward += hg['reward'] * hg['hits_available']
            log.debug('processing pack of hitgroups objects')
            gevent.joinall(
                jobs, timeout=settings.CRAWLER_GROUP_PROCESSING_TIMEOUT)
            # check if all jobs ended successfully
            for job in jobs:
                if not job.ready():
                    log.error('Killing job: %s', job)
                    job.kill()

            if len(processed_groups) >= groups_available:
                log.info('Skipping empty groups.')
                # there's no need to iterate over empty groups.. break
                break

            # amazon does not like too many requests at once, so give them a
            # quick rest...
            gevent.sleep(1)

        dbpool.closeall()

        # update crawler object
        crawl.groups_downloaded = len(processed_groups)
        crawl.end_time = datetime.datetime.now()
        crawl.save()

        work_time = time.time() - _start_time
        log.info("""Crawl finished:
        created crawl id: {crawl_id}
        total reward value: {total_reward}
        hits groups downloaded: {processed_groups}
        hits groups available: {groups_available}
        work time: {work_time:.2f} seconds
        """.format(crawl_id=crawl.id, total_reward=total_reward,
            processed_groups=len(processed_groups),
            groups_available=groups_available,
            work_time=work_time))

        crawl_downloaded_pc = settings.INCOMPLETE_CRAWL_THRESHOLD
        crawl_warning_pc = settings.INCOMPLETE_CRAWL_WARNING_THRESHOLD
        crawl_time_warning = settings.CRAWLER_TIME_WARNING
        downloaded_pc = float(crawl.groups_downloaded) / groups_available
        if work_time > crawl_time_warning:
            log.warning(("Crawl took {0}s which seems a bit too long (more "
                "than {1}s), you might consider checking if correct mturk "
                "account is used, ignore this if high number of groups is "
                "experienced.").format(work_time, crawl_time_warning))
        if downloaded_pc < crawl_warning_pc:
            log.warning(('Only {0:.0%} of hit groups were downloaded, below '
                'the {1:.0%} warning threshold; please check the mturk account '
                'configuration and/or whether there are any network-related '
                'problems.').format(downloaded_pc, crawl_warning_pc))
        if downloaded_pc < crawl_downloaded_pc:
            log.warning("This crawl contains far too few groups downloaded to "
                "available: {0}% < {1}% downloaded threshold and will be "
                "considered as erroneous ({2}/{3} groups).".format(
                    downloaded_pc, crawl_downloaded_pc,
                    crawl.groups_downloaded, groups_available))

        pid.remove_pid()
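
The 10/90 bounds passed to ThreadedConnectionPool above follow directly from the 9-workers-times-10-hitgroups arithmetic in the comment. Below is a minimal sketch of the pool lifecycle the crawler relies on, assuming the pool class is psycopg2's ThreadedConnectionPool (the snippet does not show the import); the credentials are hypothetical placeholders.

# Sketch only: assumes psycopg2.pool.ThreadedConnectionPool is the pool class
# used above (its import is not shown in the snippet). Credentials are
# hypothetical placeholders.
from psycopg2.pool import ThreadedConnectionPool

dbpool = ThreadedConnectionPool(10, 90,
    'dbname=%s user=%s password=%s' % ('mturk', 'mturk', 'secret'))

def use_one_connection(pool):
    conn = pool.getconn()        # each spawned task borrows its own connection
    try:
        cur = conn.cursor()
        cur.execute('SELECT 1')  # placeholder for the real hitgroup inserts
        conn.commit()
    finally:
        pool.putconn(conn)       # hand the connection back for reuse

use_one_connection(dbpool)
dbpool.closeall()                # mirrors the closeall() after the crawl loop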
Code Example #2
File: crawl.py  Project: faridani/Mturk-Tracker
    def handle(self, *args, **options):
        _start_time = time.time()
        pid = Pid('mturk_crawler', True)
        log.info('crawler started: %s;;%s', args, options)

        if options.get('mturk_email'):
            self.mturk_email = options['mturk_email']
        if options.get('mturk_password'):
            self.mturk_password = options['mturk_password']

        if options.get('logconf', None):
            self.setup_logging(options['logconf'])

        if options.get('debug', False):
            self.setup_debug()
            print 'Current process pid: %s' % pid.actual_pid
            print 'To debug, type: python -c "import os,signal; os.kill(%s, signal.SIGUSR1)"\n' % \
                    pid.actual_pid

        self.maxworkers = options['workers']
        if self.maxworkers > 9:
            # If you want to remove this limit, don't forget to raise the
            # maximum number of connections on the dbpool object. Each worker
            # fetches 10 hitgroups and spawns a single task for each of them,
            # and every task gets its own connection instance, so 9 workers
            # already require 9x10 = 90 connections.
            #
            # Also, with too many workers Amazon stops returning valid data,
            # and retrying then takes much longer than simply using fewer
            # workers.
            sys.exit('Too many workers (more than 9). Quit.')
        start_time = datetime.datetime.now()

        hits_available = tasks.hits_mainpage_total()
        groups_available = tasks.hits_groups_total()

        # create crawl object that will be filled with data later
        crawl = Crawl.objects.create(start_time=start_time,
                                     end_time=datetime.datetime.now(),
                                     success=True,
                                     hits_available=hits_available,
                                     hits_downloaded=0,
                                     groups_available=groups_available,
                                     groups_downloaded=groups_available)
        log.debug('fresh crawl object created: %s', crawl.id)

        # fetch requester profiles so we can decide whether their hitgroups
        # are public or not
        reqesters = RequesterProfile.objects.all_as_dict()

        # collection of group_ids that were already processed - this should
        # protect us from duplicating data
        processed_groups = set()
        total_reward = 0
        hitgroups_iter = self.hits_iter()
        for hg_pack in hitgroups_iter:
            jobs = []
            for hg in hg_pack:
                j = gevent.spawn(tasks.process_group, hg, crawl.id, reqesters,
                                 processed_groups)
                jobs.append(j)
                total_reward += hg['reward'] * hg['hits_available']
            log.debug('processing pack of hitgroups objects')
            gevent.joinall(jobs, timeout=20)
            # check if all jobs ended successfully
            for job in jobs:
                if not job.ready():
                    log.error('Killing job: %s', job)
                    job.kill()

            if len(processed_groups) >= groups_available:
                # there's no need to iterate over empty groups.. break
                break

            # amazon does not like too many requests at once, so give them a
            # quick rest...
            gevent.sleep(1)

        dbpool.closeall()

        # update crawler object
        crawl.groups_downloaded = len(processed_groups)
        crawl.end_time = datetime.datetime.now()
        crawl.save()

        work_time = time.time() - _start_time
        log.info('created crawl id: %s', crawl.id)
        log.info('total reward value: %s', total_reward)
        log.info('processed hits groups downloaded: %s', len(processed_groups))
        log.info('processed hits groups available: %s', groups_available)
        log.info('work time: %.2fsec', work_time)
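
The spawn/joinall/kill pattern above bounds how long each pack of hitgroup tasks may run. Here is a minimal, self-contained sketch of that pattern, with a dummy worker standing in for tasks.process_group (the worker and its sleep times are hypothetical).

# Sketch only: a dummy worker stands in for tasks.process_group.
import gevent

def slow_worker(seconds):
    gevent.sleep(seconds)
    return seconds

jobs = [gevent.spawn(slow_worker, s) for s in (0.1, 0.2, 5.0)]
gevent.joinall(jobs, timeout=1)    # wait at most 1 second for the whole pack
for job in jobs:
    if not job.ready():            # greenlets that missed the deadline
        job.kill()                 # are killed, just as in the crawler loop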
Code Example #3
File: crawl.py  Project: knightelvis/Mturk-Tracker
    def handle(self, *args, **options):

        self.mturk_email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
        self.mturk_password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)

        _start_time = time.time()
        pid = Pid('mturk_crawler', True)
        log.info('crawler started: %s;;%s', args, options)

        if options.get('mturk_email'):
            self.mturk_email = options['mturk_email']
        if options.get('mturk_password'):
            self.mturk_password = options['mturk_password']

        if options.get('logconf', None):
            self.setup_logging(options['logconf'])

        if options.get('debug', False):
            self.setup_debug()
            print 'Current process pid: %s' % pid.actual_pid
            print ('To debug, type: python -c "import os,signal; '
                'os.kill(%s, signal.SIGUSR1)"\n') % pid.actual_pid

        self.maxworkers = options['workers']
        if self.maxworkers > 9:
            # If you want to remove this limit, don't forget to raise the
            # maximum number of connections on the dbpool object. Each worker
            # fetches 10 hitgroups and spawns a single task for each of them,
            # and every task gets its own connection instance, so 9 workers
            # already require 9x10 = 90 connections.
            #
            # Also, with too many workers Amazon stops returning valid data,
            # and retrying then takes much longer than simply using fewer
            # workers.
            sys.exit('Too many workers (more than 9). Quit.')
        start_time = datetime.datetime.now()

        hits_available = tasks.hits_mainpage_total()
        groups_available = tasks.hits_groups_total()

        # create crawl object that will be filled with data later
        crawl = Crawl.objects.create(
                start_time=start_time,
                end_time=datetime.datetime.now(),
                success=True,
                hits_available=hits_available,
                hits_downloaded=0,
                groups_available=groups_available,
                groups_downloaded=groups_available)
        log.debug('fresh crawl object created: %s', crawl.id)

        # fetch requester profiles so we can decide whether their hitgroups
        # are public or not
        reqesters = RequesterProfile.objects.all_as_dict()

        dbpool = ThreadedConnectionPool(10, 90,
            'dbname=%s user=%s password=%s' % (settings.DATABASE_NAME,
                settings.DATABASE_USER, settings.DATABASE_PASSWORD))
        # collection of group_ids that were already processed - this should
        # protect us from duplicating data
        processed_groups = set()
        total_reward = 0
        hitgroups_iter = self.hits_iter()

        for hg_pack in hitgroups_iter:
            jobs = []
            for hg in hg_pack:
                j = gevent.spawn(tasks.process_group,
                        hg, crawl.id, reqesters, processed_groups, dbpool)
                jobs.append(j)
                total_reward += hg['reward'] * hg['hits_available']
            log.debug('processing pack of hitgroups objects')
            gevent.joinall(jobs, timeout=20)
            # check if all jobs ended successfully
            for job in jobs:
                if not job.ready():
                    log.error('Killing job: %s', job)
                    job.kill()

            if len(processed_groups) >= groups_available:
                log.info('Skipping empty groups.')
                # there's no need to iterate over empty groups.. break
                break

            # amazon does not like too many requests at once, so give them a
            # quick rest...
            gevent.sleep(1)

        dbpool.closeall()

        # update crawler object
        crawl.groups_downloaded = len(processed_groups)
        crawl.end_time = datetime.datetime.now()
        crawl.save()

        work_time = time.time() - _start_time
        log.info('created crawl id: %s', crawl.id)
        log.info('total reward value: %s', total_reward)
        log.info('processed hits groups downloaded: %s', len(processed_groups))
        log.info('processed hits groups available: %s', groups_available)
        log.info('work time: %.2fsec', work_time)

        crawl_time_warning = 300
        if work_time > crawl_time_warning:
            log.warning("Crawl took {0} s which seems a bit too long (more than"
                "{0} s), you might consider checking if correct mturk account"
                " is used.".format(crawl_time_warning))
        if crawl.groups_downloaded < groups_available * 0.9:
            log.warning('More than 10% of hit groups were not downloaded, '
                'please check mturk account configuration and/or if there are '
                'any network-related problems.')
        crawl_downloaded_pc = 0.6
        if crawl.groups_downloaded < groups_available * crawl_downloaded_pc:
            log.warning("This crawl contains far too few groups downloaded to "
                "available: ({0} < {1} * {2}) and will be considered as "
                "erroneous".format(crawl.groups_downloaded, groups_available,
                crawl_downloaded_pc))
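
This version hard-codes the 0.9 warning and 0.6 error thresholds that Example #1 reads from settings. Below is a minimal sketch of the same completeness check written as a standalone helper (the function name is hypothetical).

# Sketch only: standalone version of the completeness check used above.
def crawl_is_erroneous(groups_downloaded, groups_available, threshold=0.6):
    if groups_available == 0:
        return True                # nothing was advertised; treat as failed
    return float(groups_downloaded) / groups_available < threshold

assert crawl_is_erroneous(50, 100) is True      # 50% < 60% threshold
assert crawl_is_erroneous(90, 100) is False     # 90% passes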
Code Example #4
File: crawl.py  Project: faridani/Mturk-Tracker
    def handle(self, *args, **options):
        _start_time = time.time()
        pid = Pid('mturk_crawler', True)
        log.info('crawler started: %s;;%s', args, options)

        if options.get('mturk_email'):
            self.mturk_email = options['mturk_email']
        if options.get('mturk_password'):
            self.mturk_password = options['mturk_password']

        if options.get('logconf', None):
            self.setup_logging(options['logconf'])

        if options.get('debug', False):
            self.setup_debug()
            print 'Current process pid: %s' % pid.actual_pid
            print 'To debug, type: python -c "import os,signal; os.kill(%s, signal.SIGUSR1)"\n' % \
                    pid.actual_pid

        self.maxworkers = options['workers']
        if self.maxworkers > 9:
            # If you want to remove this limit, don't forget to raise the
            # maximum number of connections on the dbpool object. Each worker
            # fetches 10 hitgroups and spawns a single task for each of them,
            # and every task gets its own connection instance, so 9 workers
            # already require 9x10 = 90 connections.
            #
            # Also, with too many workers Amazon stops returning valid data,
            # and retrying then takes much longer than simply using fewer
            # workers.
            sys.exit('Too many workers (more than 9). Quit.')
        start_time = datetime.datetime.now()

        hits_available = tasks.hits_mainpage_total()
        groups_available = tasks.hits_groups_total()

        # create crawl object that will be filled with data later
        crawl = Crawl.objects.create(
                start_time=start_time,
                end_time=datetime.datetime.now(),
                success=True,
                hits_available=hits_available,
                hits_downloaded=0,
                groups_available=groups_available,
                groups_downloaded=groups_available)
        log.debug('fresh crawl object created: %s', crawl.id)

        # fetch requester profiles so we can decide whether their hitgroups
        # are public or not
        reqesters = RequesterProfile.objects.all_as_dict()

        # collection of group_ids that were already processed - this should
        # protect us from duplicating data
        processed_groups = set()
        total_reward = 0
        hitgroups_iter = self.hits_iter()
        for hg_pack in hitgroups_iter:
            jobs = []
            for hg in hg_pack:
                j = gevent.spawn(tasks.process_group,
                        hg, crawl.id, reqesters, processed_groups)
                jobs.append(j)
                total_reward += hg['reward'] * hg['hits_available']
            log.debug('processing pack of hitgroups objects')
            gevent.joinall(jobs, timeout=20)
            # check if all jobs ended successfully
            for job in jobs:
                if not job.ready():
                    log.error('Killing job: %s', job)
                    job.kill()

            if len(processed_groups) >= groups_available:
                # there's no need to iterate over empty groups.. break
                break

            # amazon does not like too many requests at once, so give them a
            # quick rest...
            gevent.sleep(1)

        dbpool.closeall()

        # update crawler object
        crawl.groups_downloaded = len(processed_groups)
        crawl.end_time = datetime.datetime.now()
        crawl.save()

        work_time = time.time() - _start_time
        log.info('created crawl id: %s', crawl.id)
        log.info('total reward value: %s', total_reward)
        log.info('processed hits groups downloaded: %s', len(processed_groups))
        log.info('processed hits groups available: %s', groups_available)
        log.info('work time: %.2fsec', work_time)
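
Unlike Examples #1 and #3, this variant (and Example #2) never initializes self.mturk_email / self.mturk_password from settings, so the credentials are set only when the command-line options are given. Below is a minimal sketch of the settings-with-option-override pattern the other variants use; resolve_credentials is a hypothetical helper name.

# Sketch only: settings provide defaults, command-line options override them.
from django.conf import settings

def resolve_credentials(options):
    email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
    password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)
    if options.get('mturk_email'):
        email = options['mturk_email']
    if options.get('mturk_password'):
        password = options['mturk_password']
    return email, password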