コード例 #1
0
 def grab_files(self):
     """Fetch this detector's tarball from its S3 bucket into the local dir.

     Transient IOErrors during the download are retried with a short
     back-off via retry_operation.
     """
     retry_operation(
         s3client.download_tarball,
         config.s3_detector_bucket(),
         self.tarball_basename,
         self.local_dir(),
         sleep_time=0.1,
         error_class=IOError,
     )
コード例 #2
0
 def _wait_for_server(self):
     """Block until the NER server answers a health check, or fail.

     Polls ``_check_ner_server`` up to SERVER_CHECK_ATTEMPTS times,
     SERVER_CHECK_INTERVAL apart. If it never comes up, the server
     subprocess is stopped and a generic Exception is raised.
     """
     try:
         retry_operation(
             self._check_ner_server,
             error_class=socket.error,
             num_tries=SERVER_CHECK_ATTEMPTS,
             sleep_time=SERVER_CHECK_INTERVAL,
             error_message='NER server not ready yet',
             with_traceback=False,
         )
     except socket.error:
         # Clean up the subprocess before surfacing the timeout.
         self.stop_ner_server()
         raise Exception('Timed out waiting for NER server to come up')
コード例 #3
0
 def _wait_for_server(cls):
     """Block until the Mallet server answers polls, or fail on timeout.

     On persistent socket errors the server subprocess is stopped, the
     timeout is logged with a traceback, and a generic Exception is raised.
     """
     try:
         retry_operation(
             cls._poll_server,
             raise_exception=True,
             error_class=socket.error,
             num_tries=SERVER_CHECK_ATTEMPTS,
             sleep_time=SERVER_CHECK_INTERVAL,
             error_message='Mallet server not ready yet',
             with_traceback=False,
         )
     except socket.error:
         # Stop the subprocess, record the traceback, then fail loudly.
         cls.stop_server()
         logger.exception('Timed out waiting for Mallet server to come up')
         raise Exception('Timed out waiting for Mallet server to come up')
コード例 #4
0
 def stop_server(cls):
     """Ask the Mallet server to exit and wait until it stops answering."""
     # Nothing to do if the server is already gone or unreachable.
     if not cls._poll_server():
         logger.info("Server either stopped or not reachable")
         return
     logger.info("stopping Mallet server process")
     cls._query_server(cls.KILL_CMD)
     # Poll until the server is confirmed down, retrying on any error.
     retry_operation(
         cls._inverse_poll_server,
         num_tries=SERVER_CHECK_ATTEMPTS,
         sleep_time=SERVER_CHECK_INTERVAL,
         error_class=Exception,
         error_message='Server still up',
         with_traceback=False,
     )
     logger.info("done stopping Mallet server process")
コード例 #5
0
def load_chunk_from_file(tablename, path, cols, on_duplicate, post,
                         line_delimiter, **retry_args):
    """Bulk-load one chunk file into a MySQL table via LOAD DATA LOCAL INFILE.

    The statement is executed through ``retry_operation(execute, ...)`` so
    transient failures are retried; extra ``retry_args`` are forwarded to
    retry_operation (a default ``error_message`` is filled in).

    NOTE(review): the statement is assembled with %-interpolation and none
    of the pieces (path, tablename, cols, on_duplicate, post,
    line_delimiter) are escaped or quoted, so this is only safe for
    trusted, internally-generated values — confirm callers never pass
    user-controlled input here.
    """
    logger.info("file being loaded: %s", path)
    # LOAD DATA LOCAL INFILE resolves the path on the client side; make it
    # absolute so it does not depend on the current working directory.
    path = os.path.abspath(path)
    statement = """
        LOAD DATA LOCAL INFILE '%s' %s 
             INTO TABLE `%s` 
             LINES TERMINATED BY '%s'
             (%s)
             %s
    """ % (path, on_duplicate, tablename, line_delimiter, cols, post or '')

    # Only set a default message; callers may override it via retry_args.
    retry_args.setdefault('error_message',
                          'Failed to execute load statement: %s' % statement)
    retry_operation(execute, statement, **retry_args)
コード例 #6
0
 def get_youtube_video_ids(self, query_string, n_results):
     """Return up to ``n_results`` unique YouTube video ids for training.

     Pages through the YouTube Data API search endpoint (50 ids per page),
     retrying transient ``HttpError`` failures on each request.

     :param query_string: search query sent to the API.
     :param n_results: maximum number of ids to return (clamped to
         ``self.YT_MAX_LIMIT``).
     """
     max_results = 50  # per-page cap imposed by the search endpoint
     page_token = None
     video_ids = []
     # Clamp to our own hard limit (might not be a limitation in API v3).
     n_results = min(n_results, self.YT_MAX_LIMIT)
     for _ in range(0, n_results + max_results, max_results):
         search_request = self.yt_service.search().list(
             q=query_string,
             part="id",
             maxResults=max_results,  # was hard-coded 50; keep them in sync
             type="video",
             pageToken=page_token,
         )
         search_response = retry_operation(search_request.execute,
                                           sleep_time=0.5,
                                           error_class=HttpError)
         video_ids.extend([
             res['id']['videoId']
             for res in search_response.get('items', [])
         ])
         # Fix: the final results page has no 'nextPageToken' key, so the
         # old direct indexing raised KeyError; stop paging cleanly instead.
         page_token = search_response.get('nextPageToken')
         if page_token is None:
             break
     # De-duplicate across pages before truncating to the requested count.
     video_ids = list(set(video_ids))
     return video_ids[:n_results]
コード例 #7
0
 def poll(self):
     """Poll the server with up to 5 retries; return False if all fail."""
     try:
         return retry_operation(self.server.poll,
                                error_class=Exception,
                                num_tries=5,
                                sleep_time=1)
     except Exception:
         # A server that never answers is reported as "not up", not raised.
         return False
コード例 #8
0
    def save_results(self):
        """Record results for all of our pages to the DB.

        For every page id: compute label additions/deletions, stage the
        additions in a tab-separated temp file for bulk load, and delete
        removed labels directly (with retries). Finally bulk-load the
        additions and stamp per-page bookkeeping columns in one session.

        Returns the set of page ids whose label set changed.
        """
        pages_with_updates = set()
        # NOTE(review): 'wb' + csv.writer is a Python 2 pairing; under
        # Python 3 csv requires a text-mode file — confirm the interpreter
        # this runs on. The temp file is deleted when garbage-collected.
        wplr_file = NamedTemporaryFile('wb')
        wplr_csv = csv.writer(wplr_file, delimiter="\t")

        for page_id in self.page_ids:
            labels_to_add, labels_to_delete = self.calculate(page_id)

            # if there are any changes to the labels on this page
            # add it to updated_pages_queue
            if labels_to_add or labels_to_delete:
                pages_with_updates.add(page_id)

            # Additions are staged for a single bulk load after the loop.
            for label_id in labels_to_add:
                wplr_csv.writerow([page_id, label_id])
            if labels_to_delete:
                logger.info(
                    'For page_id: %s, deleting label results for label_ids : %s',
                    page_id, list(labels_to_delete))
                query = WebPageLabelResult.query.filter_by(page_id=page_id)
                query = query.filter(
                    WebPageLabelResult.label_id.in_(labels_to_delete))
                # Bulk delete without touching in-memory session state.
                retry_operation(query.delete,
                                synchronize_session=False,
                                error_message='Deleting WPLRs failed')

        # Make sure every buffered row hits disk before the bulk load reads it.
        wplr_file.flush()

        WebPageLabelResult.load_from_file(wplr_file.name)
        with session.begin():
            # update last_label_update for all the pages in the chunk
            query = WebPage.query.filter(WebPage.id.in_(self.page_ids))
            query.update({'last_label_update': self.start_time},
                         synchronize_session=False)
            # Per-page timestamps come from dicts populated earlier in the run.
            for page_id in self.page_ids:
                last_detection = self.last_detections[page_id]
                last_text_detection = self.last_text_detections[page_id]
                query = WebPage.query.filter_by(id=page_id)
                query.update({
                    'last_detection_at_llu': last_detection,
                    'last_text_detection_at_llu': last_text_detection,
                })

        return pages_with_updates
コード例 #9
0
    def new_crawl(self, videos, prerolls=None):
        """We have just visited this page and found the given videos and prerolls.
        The videos passed in should be unique (no duplicates).

        Each entry of ``videos``/``prerolls`` is a 7-tuple:
        (video, stream_url, is_autoplay, width, height, top, left).

        Updates the page row's crawl bookkeeping (and change_count when the
        set of active videos changed), then reconciles VideoOnPage rows:
        existing rows are reused and refreshed, new ones are created, and
        anything not seen this crawl is left inactive.
        """
        update_args = {
            'last_crawled_video': datetime.utcnow(),
            # Server-side increment (column expression), not a Python int.
            'crawl_count': WebPage.crawl_count + 1,
            'text_detection_update': None,
        }
        # Only count a "change" after the first crawl has a baseline.
        if self.crawl_count:
            active_videos = [
                video for (video, stream_url, is_autoplay, width, height, top,
                           left) in videos
            ]
            if set(active_videos) != set(self.active_videos):
                update_args['change_count'] = WebPage.change_count + 1
        query = WebPage.query.filter_by(id=self.id)
        retry_operation(query.update, update_args)

        # Mark everything inactive first; rows seen below are re-flagged.
        for crawled_video in self.crawled_videos:
            crawled_video.active = False
        for (is_preroll, video_list) in [(False, videos), (True, prerolls
                                                           or [])]:
            for video, stream_url, is_autoplay, player_width, player_height, player_top, player_left in video_list:
                # Reuse the existing VideoOnPage row for this video, if any.
                crawled_video = None
                for old_crawled_video in self.crawled_videos:
                    if old_crawled_video.video == video:
                        crawled_video = old_crawled_video
                        break
                if crawled_video is None:
                    crawled_video = VideoOnPage(page=self,
                                                video=video,
                                                seen_count=0)
                # Prerolls are tracked but not considered "active" content.
                crawled_video.active = not is_preroll
                crawled_video.is_preroll = is_preroll
                crawled_video.seen_count += 1
                crawled_video.stream_url = stream_url
                crawled_video.is_autoplay = is_autoplay
                crawled_video.player_width = player_width
                crawled_video.player_height = player_height
                crawled_video.player_left = player_left
                crawled_video.player_top = player_top

        session.flush()
コード例 #10
0
 def get_comments(self, video_id):
     """Return plain-text bodies of top-level comments on a YouTube video.

     Fetches at most 10 comment threads, retrying transient HttpErrors.
     """
     request = self.yt_service.commentThreads().list(
         part="snippet",
         maxResults=10,
         videoId=video_id,
         textFormat="plainText")
     response = retry_operation(request.execute,
                                sleep_time=0.5,
                                error_class=HttpError)
     return [
         item['snippet']['topLevelComment']['snippet']['textDisplay']
         for item in response.get('items', [])
     ]
コード例 #11
0
 def get_related_videos_text(self, video_id):
     """Return "title description" strings for videos related to video_id.

     Fetches at most 10 related videos, retrying transient HttpErrors.
     """
     request = self.yt_service.search().list(
         part="snippet",
         maxResults=10,
         type="video",
         relatedToVideoId=video_id,
     )
     response = retry_operation(request.execute,
                                sleep_time=0.5,
                                error_class=HttpError)
     return [
         '%s %s' % (item['snippet']['title'], item['snippet']['description'])
         for item in response.get('items', [])
     ]
コード例 #12
0
 def build_youtube_video_text(self, v_id, include_related=False):
     """Assemble a YoutubeVideoText from a video's API metadata.

     Populates title, description and top-level comments; when
     ``include_related`` is true, text from related videos is added too.
     """
     yvt = YoutubeVideoText(v_id)
     request = self.yt_service.videos().list(
         part="snippet",
         id=v_id,
     )
     response = retry_operation(request.execute,
                                sleep_time=0.5,
                                error_class=HttpError)
     snippet = response['items'][0]['snippet']
     yvt.video_title = snippet['title']
     yvt.video_description = snippet['description']
     yvt.video_comments = self.get_comments(v_id)
     if include_related:
         yvt.related_videos_text = self.get_related_videos_text(v_id)
     return yvt
コード例 #13
0
 def wrapper(*args, **kwargs):
     """Invoke ``func`` via retry_operation with fixed retry settings.

     Forces 5 attempts, 1 second apart, retrying on any Exception —
     overwriting any of these keys the caller may have supplied.
     """
     kwargs.update(num_tries=5, sleep_time=1, error_class=Exception)
     return retry_operation(func, *args, **kwargs)
コード例 #14
0
 def queue(self):
     """Return this detector's SQS queue, creating it on first access."""
     try:
         return self._queue
     except AttributeError:
         # First access: the queue name is derived from the persisted id.
         assert self.id is not None, 'need to persist the detector'
         self._queue = retry_operation(sqs.create_queue, self._queue_name)
         return self._queue