def run(self):
    """Backfill repeatedly until asked to stop, then deregister this worker.

    Each pass calls self.backfill(). After a successful pass the worker
    sleeps for WAIT_INTERVAL (with jitter); after a failed pass it sleeps
    for an exponentially growing delay whose exponent is capped at
    MAX_BACKOFF. When self.run_once is true only a single pass is made.
    On exit self.done is set and the worker removes its own entry from
    self.manager.workers.
    """
    log = self.logger
    log.info('Starting')
    consecutive_failures = 0

    while not self.stopping.is_set():
        try:
            log.info('Starting backfill')
            self.backfill()
            log.info('Backfill complete')
            # a clean pass resets the backoff
            consecutive_failures = 0
            if not self.run_once:
                self.stopping.wait(common.jitter(self.WAIT_INTERVAL))
        except Exception:
            if consecutive_failures < MAX_BACKOFF:
                consecutive_failures += 1
            delay = common.jitter(TIMEOUT * 2**consecutive_failures)
            log.exception('Backfill failed. Retrying in {:.0f} s'.format(delay))
            backfill_errors.labels(remote=self.node).inc()
            self.stopping.wait(delay)

        if self.run_once:
            break

    log.info('Worker stopped')
    self.done.set()
    # drop our entry from the manager, if it is still there
    self.manager.workers.pop(self.node, None)
def run(self):
    """Stop and start workers based on results of get_nodes.

    Regularly call get_nodes. Nodes returned by get_nodes that are not
    currently running are started, and currently running nodes not returned
    by get_nodes are stopped. If self.run_once, only call get_nodes once.
    Calling stop will exit the loop.

    If get_nodes raises, the connection is dropped and the call is retried
    after an exponentially growing delay (exponent capped at MAX_BACKOFF).
    Once the loop exits, block until every remaining worker has finished.
    """
    self.logger.info('Starting')
    failures = 0

    while not self.stopping.is_set():
        try:
            new_nodes = set(self.get_nodes())
        except Exception:
            # To ensure a fresh slate and clear any DB-related errors, get a new conn on error.
            # This is heavy-handed but simple and effective.
            self.connection = None
            if failures < MAX_BACKOFF:
                failures += 1
            delay = common.jitter(TIMEOUT * 2**failures)
            self.logger.exception('Getting nodes failed. Retrying in {:.0f} s'.format(delay))
            node_list_errors.inc()
            self.stopping.wait(delay)
            continue

        existing_nodes = set(self.workers)
        for node in new_nodes - existing_nodes:
            self.start_worker(node)
        for node in existing_nodes - new_nodes:
            self.stop_worker(node)
        failures = 0  # reset failures on success

        if self.run_once:
            break

        # note that if get_nodes() raises an error, then deletes will not occur
        if self.delete_old and self.start:
            try:
                self.delete_hours()
            except Exception:
                self.logger.warning('Failed to delete old segments', exc_info=True)

        self.stopping.wait(common.jitter(self.NODE_INTERVAL))

    # Wait for all workers to finish. Iterate over a snapshot: entries can be
    # removed from self.workers while we block in wait() (a finished worker
    # deregisters itself), and mutating a dict while iterating its live view
    # raises "RuntimeError: dictionary changed size during iteration".
    for worker in list(self.workers.values()):
        worker.done.wait()
def main(dbconnect, sheets_creds_file, edit_url, bustime_start, sheet_id, worksheet_names, metrics_port=8005, backdoor_port=0, allocate_ids=False):
    """
    Sheet sync constantly scans a Google Sheets sheet and a database, copying inputs from the sheet
    to the DB and outputs from the DB to the sheet.

    With the exception of id allocation, all operations are idempotent and multiple sheet syncs
    may be run for redundancy.

    sheets_creds_file is the path to a JSON file containing the keys client_id, client_secret
    and refresh_token.

    Blocks until a database connection can be obtained, then runs SheetSync until it returns.
    SIGTERM sets the stop event; if that happens while still waiting for the database,
    main returns without starting SheetSync.
    """
    common.PromLogCountsHandler.install()
    common.install_stacksampler()
    prom.start_http_server(metrics_port)

    register_uuid()

    if backdoor_port:
        gevent.backdoor.BackdoorServer(('127.0.0.1', backdoor_port), locals=locals()).start()

    stop = gevent.event.Event()
    gevent.signal(signal.SIGTERM, stop.set)  # shut down on sigterm

    logging.info("Starting up")

    dbmanager = DBManager(dsn=dbconnect)
    while True:
        try:
            # Get a test connection so we know the database is up,
            # this produces a clearer error in cases where there's a connection problem.
            conn = dbmanager.get_conn()
        except Exception:
            delay = common.jitter(10)
            # include the traceback, otherwise the reason for the failure is lost
            logging.warning(
                'Cannot connect to database. Retrying in {:.0f} s'.format(delay),
                exc_info=True,
            )
            stop.wait(delay)
            if stop.is_set():
                # Once stop is set, stop.wait() returns immediately, so retrying
                # would spin in a tight loop. We were asked to shut down anyway.
                logging.info("Gracefully stopped")
                return
        else:
            # put it back so it gets reused on next get_conn()
            dbmanager.put_conn(conn)
            break

    # use a context manager so the credentials file handle is closed promptly
    with open(sheets_creds_file) as f:
        sheets_creds = json.load(f)

    sheets = Sheets(
        client_id=sheets_creds['client_id'],
        client_secret=sheets_creds['client_secret'],
        refresh_token=sheets_creds['refresh_token'],
    )

    SheetSync(stop, dbmanager, sheets, sheet_id, worksheet_names, edit_url, bustime_start, allocate_ids).run()

    logging.info("Gracefully stopped")
def _run(self):
    """Keep attempting to fetch the segment until it exists, a fetch succeeds, or we give up.

    Each attempt spawns self.get_segment in a greenlet and waits for either
    the greenlet to finish or self.retry to be set. If the attempt did not
    return a true value, a fresh requests session is created and the next
    attempt starts after FETCH_RETRY (with jitter), or sooner if self.retry
    is set. Gives up once GIVE_UP_TIMEOUT seconds have passed since starting.
    """
    began = monotonic()
    self.logger.debug("Getter started at {}".format(began))

    while True:
        if self.exists():
            break

        self.retry = gevent.event.Event()
        attempt = gevent.spawn(self.get_segment)
        # wait until the attempt succeeds/fails or retry is set
        gevent.wait([attempt, self.retry], count=1)

        # If the attempt has returned, and its return value is true, we're done
        if attempt.ready() and attempt.value:
            break

        # If a large amount of time has elapsed since starting, our URL is stale
        # anyway so we might as well give up to avoid cpu and disk usage.
        running_for = monotonic() - began
        if running_for > self.GIVE_UP_TIMEOUT:
            message = "Getter has been running for {}s, giving up as our URL has expired".format(running_for)
            self.logger.warning(message)
            break

        # Create a new session, so we don't reuse a connection from the old session
        # which had an error / some other issue. This is mostly just out of paranoia.
        self.session = requests.Session()

        # if retry not set, wait for FETCH_RETRY first
        retry_delay = common.jitter(self.FETCH_RETRY)
        self.retry.wait(retry_delay)

    self.logger.debug("Getter is done")
def run(self):
    """Periodically refresh the master playlist until stopping, then shut down workers.

    A refresh is spawned whenever one is requested via self.refresh_needed
    or when the last worker in any stream's list reaches MAX_WORKER_AGE,
    whichever comes first, but no more often than FETCH_MIN_INTERVAL (with
    jitter). Once self.stopping is set, each stream's workers are stopped
    and then waited on.
    """
    # on first round, always go immediately
    self.trigger_refresh()

    while not self.stopping.is_set():
        # Time left before the last worker of each non-empty stream hits max age.
        remaining = [
            self.MAX_WORKER_AGE - stream[-1].age()
            for stream in self.stream_workers.values()
            if stream
        ]
        # default to 0 if no workers exist, and clamp to non-negative
        time_to_next_max_age = max(0, min(remaining or [0]))
        self.logger.info("Next master playlist refresh in at most {} sec".format(time_to_next_max_age))

        # wait until refresh triggered, next max age reached, or we're stopping (whichever happens first)
        wake_on = [self.stopping, self.refresh_needed]
        gevent.wait(wake_on, timeout=time_to_next_max_age, count=1)

        if not self.stopping.is_set():
            self.refresh_needed.clear()
            gevent.spawn(self.fetch_latest)

        # wait min retry interval with jitter, unless we're stopping
        min_interval = common.jitter(self.FETCH_MIN_INTERVAL)
        self.stopping.wait(min_interval)

    self.logger.info("Stopping workers")
    for stream in self.stream_workers.values():
        # ask every worker of this stream to stop before waiting on any of them
        for stream_worker in stream:
            stream_worker.stop()
        for stream_worker in stream:
            stream_worker.done.wait()
def run(self):
    """Call self._run() until it completes without raising, then set self.done.

    Any Exception from _run() is logged and the call is retried after
    UNEXPECTED_FAILURE_RETRY (with jitter). self.done is set on every exit
    path, including when a non-Exception error propagates.
    """
    try:
        finished = False
        while not finished:
            try:
                self._run()
                finished = True
            except Exception:
                message = "Unexpected exception while getting segment {}, retrying".format(self.segment)
                self.logger.exception(message)
                gevent.sleep(common.jitter(self.UNEXPECTED_FAILURE_RETRY))
    finally:
        self.done.set()
def wait(self, base, interval):
    """Wait until INTERVAL seconds (with jitter applied) after BASE.

    BASE is compared against monotonic(). Returns immediately if that point
    has already passed; otherwise waits on self.stop with the remaining
    time as the timeout.
    """
    current = monotonic()
    deadline = base + common.jitter(interval)
    remaining = deadline - current
    if remaining > 0:
        self.stop.wait(remaining)
def run(self):
    """Loop over available hours for each quality, checking segment coverage.

    For every quality in self.qualities, list the hour directories under
    base_dir/channel/quality. For each hour, pick the best segment for each
    start time and compute coverage, overlaps and holes. The results are
    published to the Prometheus gauges, logged, and passed to
    create_coverage_map() once per quality. The whole scan repeats every
    CHECK_INTERVAL (with jitter) until self.stopping is set.

    Directories that cannot be listed are logged and skipped.
    """
    self.logger.info('Starting')

    while not self.stopping.is_set():

        for quality in self.qualities:
            if self.stopping.is_set():
                break

            path = os.path.join(self.base_dir, self.channel, quality)
            try:
                hours = [name for name in os.listdir(path) if not name.startswith('.')]
            except OSError as e:
                if e.errno == errno.ENOENT:
                    self.logger.warning('{} does not exist'.format(path))
                else:
                    # Don't swallow other errors silently, and never carry on
                    # with a missing or stale hours list.
                    self.logger.warning('Failed to list {}'.format(path), exc_info=True)
                continue

            hours.sort()
            previous_hour_segments = None
            all_hour_holes = {}
            all_hour_partials = {}
            for hour in hours:
                if self.stopping.is_set():
                    break
                self.logger.info('Checking {}/{}'.format(quality, hour))

                # based on common.segments.best_segments_by_start
                # but more complicated to capture more detailed metrics
                hour_path = os.path.join(self.base_dir, self.channel, quality, hour)
                try:
                    segment_names = [name for name in os.listdir(hour_path) if not name.startswith('.')]
                except OSError as e:
                    if e.errno == errno.ENOENT:
                        self.logger.warning('Hour {} was deleted between finding it and processing it, ignoring'.format(hour))
                    else:
                        # as above: never continue with a missing or stale segment list
                        self.logger.warning('Failed to list {}'.format(hour_path), exc_info=True)
                    continue

                segment_names.sort()
                parsed = []
                bad_segment_count = 0
                for name in segment_names:
                    try:
                        parsed.append(common.parse_segment_path(os.path.join(hour_path, name)))
                    except ValueError:
                        self.logger.warning("Failed to parse segment: {!r}".format(os.path.join(hour_path, name)), exc_info=True)
                        bad_segment_count += 1

                full_segment_count = 0
                partial_segment_count = 0
                full_segment_duration = datetime.timedelta()
                partial_segment_duration = datetime.timedelta()
                full_overlaps = 0
                full_overlap_duration = datetime.timedelta()
                partial_overlaps = 0
                partial_overlap_duration = datetime.timedelta()
                best_segments = []
                holes = []
                editable_holes = []
                previous = None
                previous_editable = None
                coverage = datetime.timedelta()
                editable_coverage = datetime.timedelta()
                only_partials = []

                # loop over all start times
                # first select the best segment for a start time
                # then update coverage
                for start_time, segments in itertools.groupby(parsed, key=lambda segment: segment.start):
                    full_segments = []
                    partial_segments = []
                    for segment in segments:
                        if segment.type == 'full':
                            full_segments.append(segment)
                            full_segment_count += 1
                            full_segment_duration += segment.duration
                        elif segment.type == 'partial':
                            partial_segments.append(segment)
                            partial_segment_count += 1
                            partial_segment_duration += segment.duration

                    if full_segments:
                        # longest full segment wins; every other candidate is an overlap
                        full_segments.sort(key=lambda segment: (segment.duration))
                        best_segment = full_segments[-1]
                        for other in full_segments[:-1]:
                            full_overlaps += 1
                            full_overlap_duration += other.duration
                        for other in partial_segments:
                            partial_overlaps += 1
                            partial_overlap_duration += other.duration
                    elif partial_segments:
                        # with only partials, the largest file wins
                        partial_segments.sort(key=lambda segment: os.stat(segment.path).st_size)
                        best_segment = partial_segments[-1]
                        only_partials.append((best_segment.start, best_segment.start + best_segment.duration))
                        for other in partial_segments[:-1]:
                            partial_overlaps += 1
                            partial_overlap_duration += other.duration
                    else:
                        # ignore any start times with only temporary segments
                        continue
                    self.logger.debug(best_segment.path.split('/')[-1])
                    best_segments.append(best_segment)

                    # Now update coverage, overlaps and holes.
                    # All of this must be computed from best_segment: a leftover
                    # loop variable from the selection above may be a different
                    # segment with another duration or type.
                    if previous is None:
                        coverage += best_segment.duration
                        editable_coverage += best_segment.duration
                        previous_editable = best_segment
                    else:
                        previous_end = previous.start + previous.duration
                        if best_segment.start < previous_end:
                            if best_segment.type == 'full':
                                full_overlaps += 1
                                full_overlap_duration += previous_end - best_segment.start
                            else:
                                partial_overlaps += 1
                                partial_overlap_duration += previous_end - best_segment.start
                            # only the part past the end of the previous segment is new coverage
                            coverage += best_segment.start - previous_end + best_segment.duration
                        else:
                            coverage += best_segment.duration
                            editable_coverage += best_segment.duration

                            if best_segment.start > previous_end:
                                holes.append((previous_end, best_segment.start))

                            previous_editable_end = previous_editable.start + previous_editable.duration
                            if best_segment.start > previous_editable_end:
                                editable_holes.append((previous_editable_end, best_segment.start))

                            previous_editable = best_segment

                    previous = best_segment

                if best_segments:
                    start = best_segments[0].start
                    end = best_segments[-1].start + best_segments[-1].duration
                    hole_duration = end - start - coverage
                    editable_hole_duration = end - start - editable_coverage

                    hour_start = datetime.datetime.strptime(hour, HOUR_FMT)
                    hour_end = hour_start + datetime.timedelta(hours=1)
                    # handle the case when there is a hole between the last segment of the previous hour and the first of this
                    if previous_hour_segments:
                        last_segment = previous_hour_segments[-1]
                        if best_segments[0].start > last_segment.start + last_segment.duration:
                            holes.append((hour_start, start))
                            hole_duration += start - hour_start
                            editable_holes.append((hour_start, start))
                            editable_hole_duration += start - hour_start

                    # handle the case when there is a hole between the last segment and the end of the hour if not the last hour
                    if hour != hours[-1] and end < hour_end:
                        holes.append((end, hour_end))
                        hole_duration += hour_end - end
                        editable_holes.append((end, hour_end))
                        editable_hole_duration += hour_end - end

                # update the large number of Prometheus gauges
                segment_count_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='full').set(full_segment_count)
                segment_count_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='partial').set(partial_segment_count)
                segment_count_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='bad').set(bad_segment_count)
                segment_duration_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='full').set(full_segment_duration.total_seconds())
                segment_duration_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='partial').set(partial_segment_duration.total_seconds())
                raw_coverage_gauge.labels(channel=self.channel, quality=quality, hour=hour).set(coverage.total_seconds())
                editable_coverage_gauge.labels(channel=self.channel, quality=quality, hour=hour).set(editable_coverage.total_seconds())
                raw_holes_gauge.labels(channel=self.channel, quality=quality, hour=hour).set(len(holes))
                editable_holes_gauge.labels(channel=self.channel, quality=quality, hour=hour).set(len(editable_holes))
                overlap_count_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='full').set(full_overlaps)
                overlap_count_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='partial').set(partial_overlaps)
                overlap_duration_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='full').set(full_overlap_duration.total_seconds())
                overlap_duration_gauge.labels(channel=self.channel, quality=quality, hour=hour, type='partial').set(partial_overlap_duration.total_seconds())

                # log the same information
                if best_segments:
                    self.logger.info('{}/{}: Start: {} End: {} ({} s)'.format(
                        quality, hour, start, end, (end - start).total_seconds()))
                    self.logger.info('{}/{}: {} full segments totalling {} s'.format(
                        quality, hour, full_segment_count, full_segment_duration.total_seconds()))
                    self.logger.info('{}/{}: {} bad segments'.format(
                        quality, hour, bad_segment_count))
                    self.logger.info('{}/{}: {} overlapping full segments totalling {} s'.format(
                        quality, hour, full_overlaps, full_overlap_duration.total_seconds()))
                    self.logger.info('{}/{}: {} partial segments totalling {} s'.format(
                        quality, hour, partial_segment_count, partial_segment_duration.total_seconds()))
                    self.logger.info('{}/{}: {} overlapping partial segments totalling {} s'.format(
                        quality, hour, partial_overlaps, partial_overlap_duration.total_seconds()))
                    self.logger.info('{}/{}: raw coverage {} s, editable coverage {} s '.format(
                        quality, hour, coverage.total_seconds(), editable_coverage.total_seconds()))
                    self.logger.info('{}/{}: {} holes totalling {} s '.format(
                        quality, hour, len(holes), hole_duration.total_seconds()))
                    self.logger.info('{}/{}: {} editable holes totalling {} s '.format(
                        quality, hour, len(editable_holes), editable_hole_duration.total_seconds()))
                    self.logger.info('Checking {}/{} complete'.format(quality, hour))

                    # add holes for the start and end hours for the
                    # coverage map. do this after updating gauges and
                    # logging as these aren't likely real holes, just the
                    # start and end of the stream.
                    if previous_hour_segments is None:
                        holes.append((hour_start, start))
                    if hour == hours[-1]:
                        holes.append((end, hour_end))

                    all_hour_holes[hour] = holes
                    all_hour_partials[hour] = only_partials

                    previous_hour_segments = best_segments

                else:
                    self.logger.info('{}/{} is empty'.format(quality, hour))

            self.create_coverage_map(quality, all_hour_holes, all_hour_partials)

        self.stopping.wait(common.jitter(self.CHECK_INTERVAL))
def wait(self, interval):
    """Wait on self.stop, using INTERVAL with jitter applied as the timeout."""
    jittered = common.jitter(interval)
    self.stop.wait(jittered)
def wait(self, interval):
    """Sleep for the given interval (jittered), cutting the sleep short if we're stopping."""
    timeout = common.jitter(interval)
    self.stopping.wait(timeout)
def main(
    dbconnect,
    config,
    creds_file,
    name=None,
    base_dir=".",
    tags='',
    metrics_port=8003,
    backdoor_port=0,
):
    """dbconnect should be a postgres connection string, which is either a space-separated
    list of key=value pairs, or a URI like:
        postgresql://USER:PASSWORD@HOST/DBNAME?KEY=VALUE

    config should be a json blob mapping upload location names to a config object
    for that location. This config object should contain the keys:
        type:
            the name of the upload backend type
        no_transcode_check:
            bool. If true, won't check for when videos are done transcoding.
            This is useful when multiple upload locations actually refer to the
            same place just with different settings, and you only want one of them
            to actually do the check.
        cut_type:
            One of 'fast' or 'full'. Default 'full'.
            This indicates whether to use fast_cut_segments() or full_cut_segments()
            for this location.
    along with any additional config options defined for that backend type.

    creds_file should contain any required credentials for the upload backends, as JSON.

    name defaults to hostname.

    tags should be a comma-separated list of tags to attach to all videos.

    Raises ValueError if a location has an unknown backend type or cut type.
    If SIGTERM arrives while still waiting for the database to come up,
    returns without starting any jobs.
    """
    common.PromLogCountsHandler.install()
    common.install_stacksampler()
    prom.start_http_server(metrics_port)

    if backdoor_port:
        gevent.backdoor.BackdoorServer(('127.0.0.1', backdoor_port), locals=locals()).start()

    if name is None:
        name = socket.gethostname()

    tags = tags.split(',') if tags else []

    stop = gevent.event.Event()
    gevent.signal(signal.SIGTERM, stop.set)  # shut down on sigterm

    logging.info("Starting up")

    dbmanager = DBManager(dsn=dbconnect)
    while True:
        try:
            # Get a test connection so we know the database is up,
            # this produces a clearer error in cases where there's a connection problem.
            conn = dbmanager.get_conn()
        except Exception:
            delay = common.jitter(10)
            logging.warning(
                'Cannot connect to database. Retrying in {:.0f} s'.format(delay),
                exc_info=True,
            )
            stop.wait(delay)
            if stop.is_set():
                # Once stop is set, stop.wait() returns immediately, so retrying
                # would spin in a tight loop. We were asked to shut down anyway.
                logging.info("Gracefully stopped")
                return
        else:
            # put it back so it gets reused on next get_conn()
            dbmanager.put_conn(conn)
            break

    with open(creds_file) as f:
        credentials = json.load(f)

    config = json.loads(config)
    upload_locations = {}
    needs_transcode_check = {}
    for location, backend_config in config.items():
        # pop the keys we handle here; whatever remains is passed to the backend
        backend_name = backend_config.pop('type')
        no_transcode_check = backend_config.pop('no_transcode_check', False)
        cut_type = backend_config.pop('cut_type', 'full')
        if backend_name == 'youtube':
            backend_class = Youtube
        elif backend_name == 'local':
            backend_class = Local
        else:
            raise ValueError("Unknown upload backend type: {!r}".format(backend_name))
        backend = backend_class(credentials, **backend_config)
        if cut_type == 'fast':
            # mark for fast cut by clearing encoding settings
            backend.encoding_settings = None
        elif cut_type != 'full':
            raise ValueError("Unknown cut type: {!r}".format(cut_type))
        upload_locations[location] = backend
        if backend.needs_transcode and not no_transcode_check:
            needs_transcode_check[location] = backend

    # We have two independent kinds of job to do - to perform cut jobs (cutter),
    # and to check the status of transcoding videos to see if they're done (transcode checker).
    # We want to error if either errors, and shut down if either exits.
    cutter = Cutter(upload_locations, dbmanager, stop, name, base_dir, tags)
    transcode_checkers = [
        TranscodeChecker(location, backend, dbmanager, stop)
        for location, backend in needs_transcode_check.items()
    ]
    jobs = [gevent.spawn(cutter.run)] + [
        gevent.spawn(transcode_checker.run)
        for transcode_checker in transcode_checkers
    ]
    # Block until any one exits
    gevent.wait(jobs, count=1)
    # Stop the others if they aren't stopping already
    stop.set()
    # Block until all have exited
    gevent.wait(jobs)
    # Call get() for each one to re-raise if any errored
    for job in jobs:
        job.get()

    logging.info("Gracefully stopped")
def wait(self, interval):
    """Pause for INTERVAL (jitter applied), returning early if we're stopping."""
    pause = common.jitter(interval)
    self.stop.wait(pause)