    def handle(self, **options):

        pid = Pid('mturk_diffs', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            for c in Crawl.objects.filter(
                    is_spam_computed=False).order_by('-id')[:options['limit']]:

                updated = update_cid(c.id)

                if updated > 0:
                    c.has_diffs = True
                    c.save()

                transaction.commit()

        except (KeyError, KeyboardInterrupt):
            transaction.rollback()
            pid.remove_pid()
            exit()

        logger.info('crawl diff update took: %s s', time.time() - start_time)
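
The handle() bodies on this page are fragments of Django management commands; the surrounding class is not shown. Below is a minimal sketch of the assumed wrapper, using the pre-1.8 option_list style that matches the old transaction API used here. The --limit option name is inferred from options['limit'] and its default value is invented.

from optparse import make_option

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Update diffs for the most recent crawls.'
    # Pre-Django-1.8 option declaration; the --limit name is inferred from
    # options['limit'] above, and the default shown here is arbitrary.
    option_list = BaseCommand.option_list + (
        make_option('--limit', dest='limit', type='int', default=10,
                    help='Maximum number of crawls to process.'),
    )

    def handle(self, **options):
        # the body shown in the example above goes here
        pass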
    def handle(self, **options):

        pid = Pid('mturk_agregates', True)
        start_time = time.time()

        logging.info('Updating crawl agregates')
        update_crawl_agregates(1, only_new=True)

        logging.info('db_update_agregates took: %s' %
                     (time.time() - start_time))

        pid.remove_pid()
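
Each of these commands brackets its work with Pid('...', True) and pid.remove_pid(). The Pid helper itself does not appear on this page; the following is a purely hypothetical sketch of a pidfile guard with the same interface (a name, a flag assumed to mean "exit if another instance is already running", an actual_pid attribute, and remove_pid()). The real project's implementation may differ.

import os
import sys


class Pid(object):
    """Hypothetical pidfile guard matching the interface used above."""

    def __init__(self, name, exit_if_running=False):
        self.path = '/tmp/%s.pid' % name
        self.actual_pid = os.getpid()
        if exit_if_running and os.path.exists(self.path):
            old_pid = int(open(self.path).read().strip())
            try:
                os.kill(old_pid, 0)  # signal 0: existence check only
            except OSError:
                pass  # stale pidfile left by a dead process, take over
            else:
                sys.exit('%s already running as pid %s' % (name, old_pid))
        with open(self.path, 'w') as f:
            f.write(str(self.actual_pid))

    def remove_pid(self):
        if os.path.exists(self.path):
            os.remove(self.path)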
    def handle(self, **options):

        pid = Pid('mturk_crawler', True)

        start_time = time.time()

        logging.info('cleaning up db from duplicates')
        clean_duplicates()

        logging.info('Refreshing hits_mv')
        update_mviews()

        logging.info('done refreshing hits_mv')

        logging.info('db_refresh_mviews took: %s' % (time.time() - start_time))

        pid.remove_pid()
    def handle(self, **options):

        pid = Pid('mturk_agregates', True)

        key = 'TOPREQUESTERS_CACHED'

        result = cache.get(key)
        if result is not None:
            logging.info("toprequesters still in cache...")
            return
        days = options['days']

        logging.info("toprequesters missing, refetching")
        # no cache, perform the query:

        from mturk.main.views import topreq_data
        start_time = time.time()
        data = topreq_data(days)
        logging.info("toprequesters: filled memcache in %s",
                     time.time() - start_time)
        cache.set(key, data, HOURS4)

        pid.remove_pid()
    def run(self):

        pid = Pid('mturk_crawler', True)

        logging.info('Crawler started')

        start_time = datetime.datetime.now()

        #Fetching statistical information about groups and HITs count
        logging.debug("Fetching stats")
        main_response = urllib2.urlopen(get_allhit_url())
        main_html = main_response.read()
        main_soup = BeautifulSoup(
            main_html,
            parseOnlyThese=SoupStrainer(
                text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
        main_stats = [tag for tag in main_soup]
        hits_available = -1
        groups_available = -1
        if len(main_stats) > 1:
            hits_available_tmp = main_stats[0]
            hits_available_tmp = hits_available_tmp[
                :hits_available_tmp.find(' ')].replace(',', '')
            hits_available = int(hits_available_tmp)
            groups_available_tmp = main_stats[1]
            groups_available_tmp = groups_available_tmp[
                groups_available_tmp.find('of') + 3:
                groups_available_tmp.find('Results') - 1]
            groups_available = int(groups_available_tmp)
        main_soup = None

        #Fetching data from every mturk.com HITs list page
        logging.debug("Allhit processing")
        result_allhit = self.process_values(
            range(1,
                  self.get_max_page(main_html) + 1), callback_allhit,
            self.processes_count)
        self.data = result_allhit['data']
        self.append_errors(result_allhit['errors'])

        #Fetching html details for every HIT group
        logging.debug("Details processing")
        result_details = self.process_values(self.data, callback_details,
                                             self.processes_count)
        self.data = result_details['data']
        self.append_errors(result_details['errors'])

        hits_downloaded = sum(
            [hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
        groups_downloaded = len(self.data)

        #Logging crawl information into the database
        success = False
        if groups_downloaded > 0 and hits_downloaded > 0 and groups_available / groups_downloaded <= 1.5 and hits_available / hits_downloaded <= 1.5:
            success = True

        logging.debug(
            "Crawl finished with success=%s. Saving main_crawl entry" %
            success)
        crawl = Crawl(
            **{
                'start_time': start_time,
                'end_time': datetime.datetime.now(),
                'success': success,
                'hits_available': hits_available,
                'hits_downloaded': hits_downloaded,
                'groups_available': groups_available,
                'groups_downloaded': groups_downloaded,
                #'errors':               str(self.errors) # !
                'errors': ''
            })
        crawl.save()

        #Adding crawl FK
        logging.debug("Adding FKs")
        result_add_crawlfk = self.process_values(self.data,
                                                 callback_add_crawlfk,
                                                 crawl=crawl)
        self.data = result_add_crawlfk['data']
        self.append_errors(result_add_crawlfk['errors'])

        #Saving results in the database
        logging.debug("Saving results")
        result_save_database = self.process_values(self.data,
                                                   callback_database)
        self.append_errors(result_save_database['errors'])

        print self.errors

        logging.info(
            "Crawler finished %ssuccessfully in %s with %d results, %d HITs (of %d and %d) and %d errors"
            % ("" if success else "un", (datetime.datetime.now() - start_time),
               groups_downloaded, hits_downloaded, groups_available,
               hits_available, len(self.errors)))

        pid.remove_pid()
    def handle(self, **options):
        """
        Take the last ${limit} crawls without spam classification.
        Classify all hit groups and update hits_mv with the proper classification.
        Rebuild crawl_aggregates for each processed crawl.
        Refresh memcache.
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(
                    Crawl.objects.filter(is_spam_computed=False).order_by(
                        '-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])

                updated = 0

                for row in query_to_dicts(
                        """select content_id, group_id, is_spam from hits_mv 
                    where 
                        crawl_id = %s""", c.id):

                    log.info("classyfing crawl_id: %s, %s", c.id, row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(
                            id=row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(
                                body=body, data=options['file']).execute()

                            number_of_predictions += 1
                            updated += 1

                            content.is_spam = prediction['outputLabel'] != 'No'
                            content.save()

                        execute_sql(
                            "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'"
                            % ('true' if content.is_spam else 'false', c.id,
                               row['group_id']))
                        transaction.commit()

                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)

                if updated > 0:
                    c.is_spam_computed = True
                    c.save()

                log.info("done classyfing crawl")

                execute_sql("""UPDATE main_crawlagregates 
                    set spam_projects = 
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id))

                transaction.commit()

                log.info("dome processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError), e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()
    def handle(self, *args, **options):
        _start_time = time.time()
        pid = Pid('mturk_crawler', True)
        log.info('crawler started: %s;;%s', args, options)

        if options.get('mturk_email'):
            self.mturk_email = options['mturk_email']
        if options.get('mturk_password'):
            self.mturk_password = options['mturk_password']

        if options.get('logconf', None):
            self.setup_logging(options['logconf'])

        if options.get('debug', False):
            self.setup_debug()
            print 'Current process pid: %s' % pid.actual_pid
            print 'To debug, type: python -c "import os,signal; os.kill(%s, signal.SIGUSR1)"\n' % \
                    pid.actual_pid

        self.maxworkers = options['workers']
        if self.maxworkers > 9:
            # If you want to remove this limit, don't forget to raise the
            # dbpool object's maximum number of connections. Each worker
            # fetches 10 hitgroups and spawns a single task for each of them,
            # and every task gets its own connection instance, so 9 workers
            # already require 9x10 = 90 connections.
            #
            # Also, with too many workers Amazon stops returning valid data,
            # and retrying then takes much longer than simply using fewer
            # workers.
            sys.exit('Too many workers (more than 9). Quit.')
        start_time = datetime.datetime.now()

        hits_available = tasks.hits_mainpage_total()
        groups_available = tasks.hits_groups_total()

        # create crawl object that will be filled with data later
        crawl = Crawl.objects.create(start_time=start_time,
                                     end_time=datetime.datetime.now(),
                                     success=True,
                                     hits_available=hits_available,
                                     hits_downloaded=0,
                                     groups_available=groups_available,
                                     groups_downloaded=groups_available)
        log.debug('fresh crawl object created: %s', crawl.id)

        # fetch those requester profiles so we could decide if their hitgroups
        # are public or not
        reqesters = RequesterProfile.objects.all_as_dict()

        # collection of group_ids that were already processed - this should
        # protect us from duplicating data
        processed_groups = set()
        total_reward = 0
        hitgroups_iter = self.hits_iter()
        for hg_pack in hitgroups_iter:
            jobs = []
            for hg in hg_pack:
                j = gevent.spawn(tasks.process_group, hg, crawl.id, reqesters,
                                 processed_groups)
                jobs.append(j)
                total_reward += hg['reward'] * hg['hits_available']
            log.debug('processing pack of hitgroups objects')
            gevent.joinall(jobs, timeout=20)
            # check if all jobs ended successfully
            for job in jobs:
                if not job.ready():
                    log.error('Killing job: %s', job)
                    job.kill()

            if len(processed_groups) >= groups_available:
                # there's no need to iterate over empty groups.. break
                break

            # amazon does not like too many requests at once, so give them a
            # quick rest...
            gevent.sleep(1)

        dbpool.closeall()

        # update crawler object
        crawl.groups_downloaded = len(processed_groups)
        crawl.end_time = datetime.datetime.now()
        crawl.save()

        work_time = time.time() - _start_time
        log.info('created crawl id: %s', crawl.id)
        log.info('total reward value: %s', total_reward)
        log.info('processed hits groups downloaded: %s', len(processed_groups))
        log.info('processed hits groups available: %s', groups_available)
        log.info('work time: %.2fsec', work_time)
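
The heart of this last crawler is the spawn-a-batch / join-with-timeout / kill-stragglers pattern built on gevent. Below is a stripped-down sketch of the same pattern, with a dummy fetch() standing in for tasks.process_group.

import gevent


def fetch(i):
    # stand-in for tasks.process_group: pretend to do some network work
    gevent.sleep(0.1 * i)
    return i


jobs = [gevent.spawn(fetch, i) for i in range(5)]
# wait at most 2 seconds for the whole batch (the crawler above uses 20)
gevent.joinall(jobs, timeout=2)
for job in jobs:
    if not job.ready():
        # anything that missed the deadline gets killed, as in handle() above
        job.kill()
results = [job.value for job in jobs if job.successful()]
print results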