Esempio n. 1
0
def get_remote_segment(base_dir,
                       node,
                       channel,
                       quality,
                       hour,
                       missing_segment,
                       logger,
                       timeout=TIMEOUT):
    """Get a segment from a node.

	Fetches channel/quality/hour/missing_segment from node and puts it in
	base_dir/channel/quality/hour/missing_segment. If the segment already exists
	locally, this does not attempt to fetch it."""

    path = os.path.join(base_dir, channel, quality, hour, missing_segment)
    # check to see if file was created since we listed the local segments to
    # avoid unnecessarily copying
    if os.path.exists(path):
        logging.debug('Skipping existing segment {}'.format(path))
        return

    dir_name = os.path.dirname(path)
    date, duration, _ = os.path.basename(path).split('-', 2)
    temp_name = "-".join([date, duration, "temp", str(uuid.uuid4())])
    temp_path = os.path.join(dir_name, "{}.ts".format(temp_name))
    common.ensure_directory(temp_path)
    hash = hashlib.sha256()

    try:
        logging.debug('Fetching segment {} from {}'.format(path, node))
        uri = '{}/segments/{}/{}/{}/{}'.format(node, channel, quality, hour,
                                               missing_segment)
        resp = requests.get(uri,
                            stream=True,
                            timeout=timeout,
                            metric_name='get_remote_segment')

        resp.raise_for_status()

        with open(temp_path, 'w') as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
                hash.update(chunk)

        filename_hash = common.parse_segment_path(missing_segment).hash
        if filename_hash != hash.digest():
            logger.warn(
                'Hash of segment {} does not match. Discarding segment'.format(
                    missing_segment))
            hash_mismatches.labels(remote=node,
                                   channel=channel,
                                   quality=quality,
                                   hour=hour).inc()
            os.remove(temp_path)
            return

    #try to get rid of the temp file if an exception is raised.
    except Exception:
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    logging.debug('Saving completed segment {} as {}'.format(temp_path, path))
    common.rename(temp_path, path)
    segments_backfilled.labels(remote=node,
                               channel=channel,
                               quality=quality,
                               hour=hour).inc()
    logger.info('Segment {}/{}/{} backfilled'.format(quality, hour,
                                                     missing_segment))
Esempio n. 2
0
    def backfill(self):
        """Backfill from remote node.
	
		Backfill from node/channel/qualities to base_dir/channel/qualities for
		each hour in hours.
		"""
        for channel, quality in itertools.product(self.channels,
                                                  self.qualities):
            for hour in list_hours(self.node, channel, quality, self.start):
                # since backfilling can take a long time, recheck whether this
                # hour is after the start
                if self.start is not None:
                    if not isinstance(self.start, datetime.datetime):
                        start_hour = datetime.datetime.utcnow(
                        ) - datetime.timedelta(hours=self.start)
                    else:
                        start_hour = self.start
                    if datetime.datetime.strptime(hour, HOUR_FMT) < start_hour:
                        break

                self.logger.info('Backfilling {}/{}'.format(quality, hour))

                local_segments = set(
                    list_local_segments(self.base_dir, channel, quality, hour))
                remote_segments = set(
                    list_remote_segments(self.node, channel, quality, hour))
                missing_segments = list(remote_segments - local_segments)

                # randomise the order of the segments to reduce the chance that
                # multiple workers request the same segment at the same time
                random.shuffle(missing_segments)

                pool = gevent.pool.Pool(self.download_concurrency)
                workers = []

                for missing_segment in missing_segments:

                    if self.stopping.is_set():
                        return

                    path = os.path.join(channel, quality, hour,
                                        missing_segment)

                    # test to see if file is a segment and get the segments start time
                    try:
                        segment = common.parse_segment_path(path)
                    except ValueError as e:
                        self.logger.warning('File {} invaid: {}'.format(
                            path, e))
                        continue

                    # Ignore temp segments as they may go away by the time we fetch them.
                    if segment.type == "temp":
                        self.logger.debug(
                            'Skipping {} as it is a temp segment'.format(path))
                        continue

                    # to avoid getting in the downloader's way ignore segments
                    # less than recent_cutoff old
                    if datetime.datetime.utcnow(
                    ) - segment.start < datetime.timedelta(
                            seconds=self.recent_cutoff):
                        self.logger.debug(
                            'Skipping {} as too recent'.format(path))
                        continue

                    # start segment as soon as a pool slot opens up, then track it in workers
                    workers.append(
                        pool.spawn(get_remote_segment, self.base_dir,
                                   self.node, channel, quality, hour,
                                   missing_segment, self.logger))

                # verify that all the workers succeeded. if any failed, raise the exception from
                # one of them arbitrarily.
                for worker in workers:
                    worker.get()  # re-raise error, if any

                self.logger.info('{} segments in {}/{} backfilled'.format(
                    len(workers), quality, hour))
                hours_backfilled.labels(remote=self.node,
                                        channel=channel,
                                        quality=quality).inc()
Esempio n. 3
0
    def run(self):
        """Loop over available hours for each quality, checking segment coverage."""
        self.logger.info('Starting')

        while not self.stopping.is_set():

            for quality in self.qualities:
                if self.stopping.is_set():
                    break

                path = os.path.join(self.base_dir, self.channel, quality)
                try:
                    hours = [
                        name for name in os.listdir(path)
                        if not name.startswith('.')
                    ]
                except OSError as e:
                    if e.errno == errno.ENOENT:
                        self.logger.warning('{} does not exist'.format(path))
                        continue

                hours.sort()
                previous_hour_segments = None
                all_hour_holes = {}
                all_hour_partials = {}
                for hour in hours:
                    if self.stopping.is_set():
                        break
                    self.logger.info('Checking {}/{}'.format(quality, hour))

                    # based on common.segments.best_segments_by_start
                    # but more complicated to capture more detailed metrics
                    hour_path = os.path.join(self.base_dir, self.channel,
                                             quality, hour)
                    try:
                        segment_names = [
                            name for name in os.listdir(hour_path)
                            if not name.startswith('.')
                        ]
                    except OSError as e:
                        if e.errno == errno.ENOENT:
                            self.logger.warning(
                                'Hour {} was deleted between finding it and processing it, ignoring'
                                .format(hour))
                            continue
                    segment_names.sort()
                    parsed = []
                    bad_segment_count = 0
                    for name in segment_names:
                        try:
                            parsed.append(
                                common.parse_segment_path(
                                    os.path.join(hour_path, name)))
                        except ValueError:
                            self.logger.warning(
                                "Failed to parse segment: {!r}".format(
                                    os.path.join(hour_path, name)),
                                exc_info=True)
                            bad_segment_count += 1

                    full_segment_count = 0
                    partial_segment_count = 0
                    full_segment_duration = datetime.timedelta()
                    partial_segment_duration = datetime.timedelta()
                    full_overlaps = 0
                    full_overlap_duration = datetime.timedelta()
                    partial_overlaps = 0
                    partial_overlap_duration = datetime.timedelta()
                    best_segments = []
                    holes = []
                    editable_holes = []
                    previous = None
                    previous_editable = None
                    coverage = datetime.timedelta()
                    editable_coverage = datetime.timedelta()
                    only_partials = []

                    # loop over all start times
                    # first select the best segment for a start time
                    # then update coverage
                    for start_time, segments in itertools.groupby(
                            parsed, key=lambda segment: segment.start):
                        full_segments = []
                        partial_segments = []
                        for segment in segments:
                            if segment.type == 'full':
                                full_segments.append(segment)
                                full_segment_count += 1
                                full_segment_duration += segment.duration
                            elif segment.type == 'partial':
                                partial_segments.append(segment)
                                partial_segment_count += 1
                                partial_segment_duration += segment.duration
                        if full_segments:
                            full_segments.sort(
                                key=lambda segment: (segment.duration))
                            best_segment = full_segments[-1]
                            for segment in full_segments[:-1]:
                                full_overlaps += 1
                                full_overlap_duration += segment.duration
                            for segment in partial_segments:
                                partial_overlaps += 1
                                partial_overlap_duration += segment.duration
                        elif partial_segments:
                            partial_segments.sort(key=lambda segment: os.stat(
                                segment.path).st_size)
                            best_segment = partial_segments[-1]
                            only_partials.append(
                                (best_segment.start,
                                 best_segment.start + best_segment.duration))
                            for segment in partial_segments[:-1]:
                                partial_overlaps += 1
                                partial_overlap_duration += segment.duration
                        else:
                            # ignore any start times with only temporary segments
                            continue
                        self.logger.debug(best_segment.path.split('/')[-1])
                        best_segments.append(best_segment)

                        # now update coverage, overlaps and holes
                        if previous is None:
                            coverage += best_segment.duration
                            editable_coverage += best_segment.duration
                            previous_editable = best_segment
                        else:
                            previous_end = previous.start + previous.duration
                            if segment.start < previous_end:
                                if segment.type == 'full':
                                    full_overlaps += 1
                                    full_overlap_duration += previous_end - segment.start
                                else:
                                    partial_overlaps += 1
                                    partial_overlap_duration += previous_end - segment.start
                                coverage += segment.start - previous_end + segment.duration
                            else:
                                coverage += segment.duration
                                editable_coverage += segment.duration

                                if segment.start > previous_end:
                                    holes.append((previous_end, segment.start))

                                previous_editable_end = previous_editable.start + previous_editable.duration
                                if segment.start > previous_editable_end:
                                    editable_holes.append(
                                        (previous_editable_end, segment.start))

                                previous_editable = best_segment

                        previous = best_segment

                    if best_segments:
                        start = best_segments[0].start
                        end = best_segments[-1].start + best_segments[
                            -1].duration
                        hole_duration = end - start - coverage
                        editable_hole_duration = end - start - editable_coverage

                        hour_start = datetime.datetime.strptime(hour, HOUR_FMT)
                        hour_end = hour_start + datetime.timedelta(hours=1)
                        # handle the case when there is a hole between the last segment of the previous hour and the first of this
                        if previous_hour_segments:
                            last_segment = previous_hour_segments[-1]
                            if best_segments[
                                    0].start > last_segment.start + last_segment.duration:
                                holes.append((hour_start, start))
                                hole_duration += start - hour_start
                                editable_holes.append((hour_start, start))
                                editable_hole_duration += start - hour_start

                        # handle the case when there is a hole between the last segment and the end of the hour if not the last hour
                        if hour != hours[-1] and end < hour_end:
                            holes.append((end, hour_end))
                            hole_duration += hour_end - end
                            editable_holes.append((end, hour_end))
                            editable_hole_duration += hour_end - end

                    # update the large number of Prometheus guages
                    segment_count_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='full').set(full_segment_count)
                    segment_count_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='partial').set(partial_segment_count)
                    segment_count_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='bad').set(bad_segment_count)
                    segment_duration_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='full').set(full_segment_duration.total_seconds())
                    segment_duration_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='partial').set(
                            partial_segment_duration.total_seconds())
                    raw_coverage_gauge.labels(channel=self.channel,
                                              quality=quality,
                                              hour=hour).set(
                                                  coverage.total_seconds())
                    editable_coverage_gauge.labels(
                        channel=self.channel, quality=quality,
                        hour=hour).set(editable_coverage.total_seconds())
                    raw_holes_gauge.labels(channel=self.channel,
                                           quality=quality,
                                           hour=hour).set(len(holes))
                    editable_holes_gauge.labels(channel=self.channel,
                                                quality=quality,
                                                hour=hour).set(
                                                    len(editable_holes))
                    overlap_count_gauge.labels(channel=self.channel,
                                               quality=quality,
                                               hour=hour,
                                               type='full').set(full_overlaps)
                    overlap_count_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='partial').set(partial_overlaps)
                    overlap_duration_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='full').set(full_overlap_duration.total_seconds())
                    overlap_duration_gauge.labels(
                        channel=self.channel,
                        quality=quality,
                        hour=hour,
                        type='partial').set(
                            partial_overlap_duration.total_seconds())

                    # log the same information
                    if best_segments:
                        self.logger.info(
                            '{}/{}: Start: {} End: {} ({} s)'.format(
                                quality, hour, start, end,
                                (end - start).total_seconds()))
                        self.logger.info(
                            '{}/{}: {} full segments totalling {} s'.format(
                                quality, hour, full_segment_count,
                                full_segment_duration.total_seconds()))
                        self.logger.info('{}/{}: {} bad segments'.format(
                            quality, hour, bad_segment_count))
                        self.logger.info(
                            '{}/{}: {} overlapping full segments totalling {} s'
                            .format(quality, hour, full_overlaps,
                                    full_overlap_duration.total_seconds()))
                        self.logger.info(
                            '{}/{}: {} partial segments totalling {} s'.format(
                                quality, hour, partial_segment_count,
                                partial_segment_duration.total_seconds()))
                        self.logger.info(
                            '{}/{}: {} overlapping partial segments totalling {} s'
                            .format(quality, hour, partial_overlaps,
                                    partial_overlap_duration.total_seconds()))
                        self.logger.info(
                            '{}/{}: raw coverage {} s, editable coverage {} s '
                            .format(quality, hour, coverage.total_seconds(),
                                    editable_coverage.total_seconds()))
                        self.logger.info(
                            '{}/{}: {} holes totalling {} s '.format(
                                quality, hour, len(holes),
                                hole_duration.total_seconds()))
                        self.logger.info(
                            '{}/{}: {} editable holes totalling {} s '.format(
                                quality, hour, len(editable_holes),
                                editable_hole_duration.total_seconds()))
                        self.logger.info('Checking {}/{} complete'.format(
                            quality, hour))

                        # add holes for the start and end hours for the
                        # coverage map. do this after updating gauges and
                        # logging as these aren't likely real holes, just the
                        # start and end of the stream.
                        if previous_hour_segments is None:
                            holes.append((hour_start, start))
                        if hour == hours[-1]:
                            holes.append((end, hour_end))

                        all_hour_holes[hour] = holes
                        all_hour_partials[hour] = only_partials

                        previous_hour_segments = best_segments

                    else:
                        self.logger.info('{}/{} is empty'.format(
                            quality, hour))

                self.create_coverage_map(quality, all_hour_holes,
                                         all_hour_partials)

            self.stopping.wait(common.jitter(self.CHECK_INTERVAL))