def grab_files(self):
    """Download this detector's tarball from S3, retrying on IOError."""
    bucket = config.s3_detector_bucket()
    retry_operation(s3client.download_tarball, bucket, self.tarball_basename,
                    self.local_dir(), sleep_time=0.1, error_class=IOError)
def _wait_for_server(self):
    try:
        retry_operation(self._check_ner_server,
                        error_class=socket.error,
                        num_tries=SERVER_CHECK_ATTEMPTS,
                        sleep_time=SERVER_CHECK_INTERVAL,
                        error_message='NER server not ready yet',
                        with_traceback=False)
        return
    except socket.error:
        # stop server subprocess and raise exception
        self.stop_ner_server()
        raise Exception('Timed out waiting for NER server to come up')
def _wait_for_server(cls):
    try:
        retry_operation(cls._poll_server,
                        raise_exception=True,
                        error_class=socket.error,
                        num_tries=SERVER_CHECK_ATTEMPTS,
                        sleep_time=SERVER_CHECK_INTERVAL,
                        error_message='Mallet server not ready yet',
                        with_traceback=False)
    except socket.error:
        # stop server subprocess and raise exception
        cls.stop_server()
        logger.exception('Timed out waiting for Mallet server to come up')
        raise Exception('Timed out waiting for Mallet server to come up')
def stop_server(cls):
    # check if server has gone away already
    if not cls._poll_server():
        logger.info("Server either stopped or not reachable")
        return
    logger.info("stopping Mallet server process")
    cls._query_server(cls.KILL_CMD)
    retry_operation(cls._inverse_poll_server,
                    error_class=Exception,
                    num_tries=SERVER_CHECK_ATTEMPTS,
                    sleep_time=SERVER_CHECK_INTERVAL,
                    error_message='Server still up',
                    with_traceback=False)
    logger.info("done stopping Mallet server process")
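# _inverse_poll_server is not shown in this snippet. From the stop_server
# call site above, it must raise while the server is still reachable, so
# that retry_operation keeps polling until shutdown completes. A minimal
# sketch under that assumption (the body is hypothetical; only the
# raise-while-up contract is implied by the call site):
def _inverse_poll_server(cls):
    # invert the usual health check: success here means the server is gone
    if cls._poll_server():
        raise Exception('Server still up')
    return True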
def load_chunk_from_file(tablename, path, cols, on_duplicate, post,
                         line_delimiter, **retry_args):
    logger.info("file being loaded: %s", path)
    path = os.path.abspath(path)
    statement = """
        LOAD DATA LOCAL INFILE '%s' %s INTO TABLE `%s`
        LINES TERMINATED BY '%s'
        (%s)
        %s
    """ % (path, on_duplicate, tablename, line_delimiter, cols, post or '')
    retry_args.setdefault('error_message',
                          'Failed to execute load statement: %s' % statement)
    retry_operation(execute, statement, **retry_args)
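# A hedged usage sketch for load_chunk_from_file. The table and column
# names below are hypothetical. on_duplicate and post are spliced into the
# MySQL LOAD DATA statement verbatim, so they are expected to be SQL
# fragments ('REPLACE' / 'IGNORE' / '' and an optional trailing SET
# clause); the connection must allow LOCAL INFILE. Any extra keyword
# arguments are forwarded to retry_operation.
load_chunk_from_file(
    tablename='web_page_label_result',   # hypothetical table
    path='/tmp/wplr_chunk.tsv',          # tab-separated chunk on disk
    cols='`page_id`, `label_id`',        # column list for the (...) clause
    on_duplicate='REPLACE',              # or 'IGNORE', or '' for neither
    post='',                             # optional SET ... clause
    line_delimiter='\n',
    num_tries=3,                         # forwarded to retry_operation
    sleep_time=1,
)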
def get_youtube_video_ids(self, query_string, n_results):
    """Returns a list of youtube video ids for training"""
    max_results = 50  # maximum page size accepted by the search endpoint
    page_token = None
    video_ids = []
    # might not be a limitation in youtube API v3
    n_results = min(n_results, self.YT_MAX_LIMIT)
    for _ in range(0, n_results + max_results, max_results):
        search_request = self.yt_service.search().list(
            q=query_string,
            part="id",
            maxResults=max_results,
            type="video",
            pageToken=page_token,
        )
        search_response = retry_operation(search_request.execute,
                                          sleep_time=0.5,
                                          error_class=HttpError)
        video_ids.extend([
            res['id']['videoId']
            for res in search_response.get('items', [])
        ])
        # stop when there are no further pages; indexing 'nextPageToken'
        # directly would raise KeyError on the last page
        page_token = search_response.get('nextPageToken')
        if page_token is None:
            break
    video_ids = list(set(video_ids))  # defensive de-duplication across pages
    return video_ids[:n_results]
def poll(self):
    try:
        return retry_operation(self.server.poll,
                               num_tries=5,
                               sleep_time=1,
                               error_class=Exception)
    except Exception:
        return False
def save_results(self):
    """Record results for all of our pages to the DB"""
    pages_with_updates = set()
    wplr_file = NamedTemporaryFile('wb')
    wplr_csv = csv.writer(wplr_file, delimiter="\t")
    for page_id in self.page_ids:
        labels_to_add, labels_to_delete = self.calculate(page_id)
        # if there are any changes to the labels on this page,
        # add it to updated_pages_queue
        if labels_to_add or labels_to_delete:
            pages_with_updates.add(page_id)
        for label_id in labels_to_add:
            wplr_csv.writerow([page_id, label_id])
        if labels_to_delete:
            logger.info(
                'For page_id: %s, deleting label results for label_ids: %s',
                page_id, list(labels_to_delete))
            query = WebPageLabelResult.query.filter_by(page_id=page_id)
            query = query.filter(
                WebPageLabelResult.label_id.in_(labels_to_delete))
            retry_operation(query.delete,
                            synchronize_session=False,
                            error_message='Deleting WPLRs failed')
    wplr_file.flush()
    WebPageLabelResult.load_from_file(wplr_file.name)
    with session.begin():
        # update last_label_update for all the pages in the chunk
        query = WebPage.query.filter(WebPage.id.in_(self.page_ids))
        query.update({'last_label_update': self.start_time},
                     synchronize_session=False)
        for page_id in self.page_ids:
            last_detection = self.last_detections[page_id]
            last_text_detection = self.last_text_detections[page_id]
            query = WebPage.query.filter_by(id=page_id)
            query.update({
                'last_detection_at_llu': last_detection,
                'last_text_detection_at_llu': last_text_detection,
            })
    return pages_with_updates
def new_crawl(self, videos, prerolls=None):
    """We have just visited this page and found the given videos and
    prerolls. The videos passed in should be unique (no duplicates).
    """
    update_args = {
        'last_crawled_video': datetime.utcnow(),
        'crawl_count': WebPage.crawl_count + 1,
        'text_detection_update': None,
    }
    if self.crawl_count:
        active_videos = [
            video for (video, stream_url, is_autoplay,
                       width, height, top, left) in videos
        ]
        if set(active_videos) != set(self.active_videos):
            update_args['change_count'] = WebPage.change_count + 1
    query = WebPage.query.filter_by(id=self.id)
    retry_operation(query.update, update_args)
    for crawled_video in self.crawled_videos:
        crawled_video.active = False
    for (is_preroll, video_list) in [(False, videos), (True, prerolls or [])]:
        for (video, stream_url, is_autoplay, player_width, player_height,
             player_top, player_left) in video_list:
            # reuse the existing VideoOnPage row for this video, if any
            crawled_video = None
            for old_crawled_video in self.crawled_videos:
                if old_crawled_video.video == video:
                    crawled_video = old_crawled_video
                    break
            if crawled_video is None:
                crawled_video = VideoOnPage(page=self, video=video,
                                            seen_count=0)
            crawled_video.active = not is_preroll
            crawled_video.is_preroll = is_preroll
            crawled_video.seen_count += 1
            crawled_video.stream_url = stream_url
            crawled_video.is_autoplay = is_autoplay
            crawled_video.player_width = player_width
            crawled_video.player_height = player_height
            crawled_video.player_left = player_left
            crawled_video.player_top = player_top
    session.flush()
def get_comments(self, video_id):
    comments = []
    search_request = self.yt_service.commentThreads().list(
        part="snippet",
        maxResults=10,
        videoId=video_id,
        textFormat="plainText")
    search_response = retry_operation(search_request.execute,
                                      sleep_time=0.5,
                                      error_class=HttpError)
    for res in search_response.get('items', []):
        comments.append(
            res['snippet']['topLevelComment']['snippet']['textDisplay'])
    return comments
def get_related_videos_text(self, video_id):
    search_request = self.yt_service.search().list(
        part="snippet",
        maxResults=10,
        type="video",
        relatedToVideoId=video_id,
    )
    search_response = retry_operation(search_request.execute,
                                      sleep_time=0.5,
                                      error_class=HttpError)
    related_videos_text = []
    for res in search_response.get('items', []):
        related_videos_text.append(
            '%s %s' % (res['snippet']['title'],
                       res['snippet']['description']))
    return related_videos_text
def build_youtube_video_text(self, v_id, include_related=False):
    yvt = YoutubeVideoText(v_id)
    search_request = self.yt_service.videos().list(
        part="snippet",
        id=v_id,
    )
    search_response = retry_operation(search_request.execute,
                                      sleep_time=0.5,
                                      error_class=HttpError)
    snippet = search_response['items'][0]['snippet']
    yvt.video_title = snippet['title']
    yvt.video_description = snippet['description']
    yvt.video_comments = self.get_comments(v_id)
    if include_related:
        yvt.related_videos_text = self.get_related_videos_text(v_id)
    return yvt
def wrapper(*args, **kwargs):
    kwargs['num_tries'] = 5
    kwargs['sleep_time'] = 1
    kwargs['error_class'] = Exception
    return retry_operation(func, *args, **kwargs)
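# wrapper above reads as the inner function of a retry decorator; the
# enclosing decorator is not shown in this snippet. A minimal sketch of
# what it likely looks like, where the name retry_defaults is
# hypothetical:
import functools

def retry_defaults(func):
    """Decorate func so every call runs through retry_operation with
    five tries, a one-second sleep, and Exception as the retried class."""
    @functools.wraps(func)  # preserve func's name and docstring
    def wrapper(*args, **kwargs):
        kwargs['num_tries'] = 5
        kwargs['sleep_time'] = 1
        kwargs['error_class'] = Exception
        return retry_operation(func, *args, **kwargs)
    return wrapper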
def queue(self):
    if not hasattr(self, '_queue'):
        assert self.id is not None, 'need to persist the detector'
        self._queue = retry_operation(sqs.create_queue, self._queue_name)
    return self._queue
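# Every snippet above leans on retry_operation, whose implementation is
# not shown. This is a minimal sketch reconstructed from the call sites;
# the defaults and the exact raise_exception semantics are assumptions.
# Retry-specific keywords are popped from kwargs so the remainder can be
# forwarded to the wrapped callable, which the decorator sketch above
# (and the load_chunk_from_file **retry_args pattern) requires.
import logging
import time

logger = logging.getLogger(__name__)

def retry_operation(func, *args, **kwargs):
    num_tries = kwargs.pop('num_tries', 3)            # assumed default
    sleep_time = kwargs.pop('sleep_time', 1)          # assumed default
    error_class = kwargs.pop('error_class', Exception)
    error_message = kwargs.pop('error_message', None)
    with_traceback = kwargs.pop('with_traceback', True)
    raise_exception = kwargs.pop('raise_exception', True)
    for attempt in range(num_tries):
        try:
            return func(*args, **kwargs)
        except error_class:
            message = error_message or 'retry_operation: %r failed' % func
            if with_traceback:
                logger.exception(message)
            else:
                logger.warning(message)
            if attempt == num_tries - 1:
                # out of tries: re-raise the last error unless told not to
                if raise_exception:
                    raise
                return None
            time.sleep(sleep_time)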