def download_subset_file(subset_url, dataset_dir):
    """
    Download a subset segments file from the given URL to the given directory.

    Args:
        subset_url:   URL to subset segments file
                      (Type: str)

        dataset_dir:  Dataset directory where subset segment file will be stored
                      (Type: str)

    Returns:
        subset_path:  Path to subset segments file
                      (Type: str)

    """
    # Get filename of the subset file
    subset_filename = get_filename(subset_url)
    subset_name = get_subset_name(subset_url)
    subset_path = os.path.join(dataset_dir, subset_filename)

    os.makedirs(dataset_dir, exist_ok=True)

    # Download the subset file if it is not already present
    if not os.path.exists(subset_path):
        LOGGER.info('Downloading subset file for "{}"'.format(subset_name))
        with open(subset_path, 'w') as f:
            subset_data = urllib.request.urlopen(subset_url).read().decode()
            f.write(subset_data)

    return subset_path
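

# get_filename and get_subset_name are referenced above but not defined in
# this snippet. Below is a minimal sketch of the behavior the code appears
# to assume (filename = last path component of the URL, subset name =
# filename without its extension); the real implementations may differ.
import os


def get_filename(path_or_url):
    """Return the base filename of a path or URL (assumed behavior)."""
    return path_or_url.split('?')[0].rstrip('/').split('/')[-1]


def get_subset_name(path_or_url):
    """Return the filename without its extension (assumed behavior)."""
    return os.path.splitext(get_filename(path_or_url))[0]
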
def download_subset(subset_path, dataset_dir, ffmpeg_path, ffprobe_path,
                    num_workers, **ffmpeg_cfg):
    """
    Download all files for a subset, including the segments file and the audio and video files.

    Args:
        subset_path:    Path to subset segments file
                        (Type: str)

        dataset_dir:    Path to dataset directory where files are saved
                        (Type: str)

        ffmpeg_path:    Path to ffmpeg executable
                        (Type: str)

        ffprobe_path:   Path to ffprobe executable
                        (Type: str)

        num_workers:    Number of workers to download and process videos
                        (Type: int)

    Keyword Args:
        **ffmpeg_cfg:                   Configuration for audio and video
                                        downloading and decoding done by ffmpeg
                                        (Type: dict[str, *])

    """
    if is_url(subset_path):
        subset_path = download_subset_file(subset_path, dataset_dir)

    subset_name = get_subset_name(subset_path)
    data_dir = init_subset_data_dir(dataset_dir, subset_name)

    download_subset_videos(subset_path, data_dir, ffmpeg_path, ffprobe_path,
                           num_workers, **ffmpeg_cfg)
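
# init_subset_data_dir is referenced above but not included in this snippet.
# A plausible sketch, assuming it creates <dataset_dir>/<subset_name> plus
# the audio/ and video/ output subdirectories used by the download workers:
import os


def init_subset_data_dir(dataset_dir, subset_name):
    """Create and return the data directory for a subset (assumed layout)."""
    data_dir = os.path.join(dataset_dir, subset_name)
    for subdir in ('audio', 'video'):
        os.makedirs(os.path.join(data_dir, subdir), exist_ok=True)
    return data_dir
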
def download_random_subset_files(subset_url,
                                 dataset_dir,
                                 ffmpeg_path,
                                 ffprobe_path,
                                 num_workers,
                                 max_videos=None,
                                 **ffmpeg_cfg):
    """
    Download a random subset (of size `max_videos`) of the videos in a
    subset, along with the subset segments file

    Args:
        subset_url:    URL to subset segments file
                       (Type: str)

        dataset_dir:   Directory where dataset files will be saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of multiprocessing workers used to download videos
                       (Type: int)

    Keyword Args:
        max_videos:    Maximum number of videos to download in this subset. If
                       None, download all files in this subset.
                       (Type: int or None)

        **ffmpeg_cfg:  Configuration for audio and video
                       downloading and decoding done by ffmpeg
                       (Type: dict[str, *])
    """
    # FIXME: This code is outdated and shouldn't be used
    # Validate max_videos
    if max_videos is not None and (not isinstance(max_videos, int)
                                   or max_videos < 1):
        err_msg = 'max_videos must be a positive integer, or None'
        LOGGER.error(err_msg)
        raise ValueError(err_msg)

    # Get filename of the subset file
    subset_filename = get_filename(subset_url)
    subset_name = get_subset_name(subset_url)
    subset_path = os.path.join(dataset_dir, subset_filename)
    data_dir = init_subset_data_dir(dataset_dir, subset_name)

    # Download the subset file if it is not already present
    if not os.path.exists(subset_path):
        LOGGER.info('Downloading subset file for "{}"'.format(subset_name))
        with open(subset_path, 'w') as f:
            subset_data = urllib.request.urlopen(subset_url).read().decode()
            f.write(subset_data)

    subset_data = []
    LOGGER.info(
        'Starting download jobs for random subset (of size {}) of subset "{}"'.
        format(max_videos, subset_name))
    with open(subset_path, 'r') as f:
        subset_data_reader = csv.reader(f)
        try:
            for row_idx, row in enumerate(subset_data_reader):
                # Skip empty and commented lines
                if not row or row[0].startswith('#'):
                    continue

                subset_data.append(row[:3])
        except csv.Error as e:
            err_msg = 'Encountered error in {} at line {}: {}'.format(
                subset_filename, row_idx + 1, e)
            LOGGER.error(err_msg)
            sys.exit(err_msg)

    # Shuffle data
    random.shuffle(subset_data)

    # Set up multiprocessing pool
    pool = mp.Pool(num_workers)
    try:
        for idx, row in enumerate(subset_data):
            worker_args = [
                row[0],
                float(row[1]),
                float(row[2]), data_dir, ffmpeg_path, ffprobe_path
            ]
            pool.apply_async(partial(segment_mp_worker, **ffmpeg_cfg),
                             worker_args)
            # Run serially
            #segment_mp_worker(*worker_args, **ffmpeg_cfg)

            if max_videos is not None:
                if idx + 1 >= max_videos:
                    info_msg = 'Reached maximum ({}) for subset {}'
                    LOGGER.info(info_msg.format(max_videos, subset_name))
                    break
    except KeyboardInterrupt:
        LOGGER.info("Forcing exit.")
        exit()
    finally:
        try:
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            LOGGER.info("Forcing exit.")
            exit()

    LOGGER.info('Finished download jobs for subset "{}"'.format(subset_name))
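
# segment_mp_worker is the per-segment worker submitted to the pool above;
# its implementation is not part of this snippet. The signature below is
# inferred from the call sites, and the body is only a placeholder.
def segment_mp_worker(ytid, ts_start, ts_end, data_dir, ffmpeg_path,
                      ffprobe_path, **ffmpeg_cfg):
    """Download segment [ts_start, ts_end] of video `ytid` and decode it
    with ffmpeg/ffprobe (signature inferred; body not shown here)."""
    raise NotImplementedError('See the original repository for this worker')
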
def download_subset_videos(subset_path, data_dir, ffmpeg_path, ffprobe_path,
                           num_workers, **ffmpeg_cfg):
    """
    Download subset segment file and videos

    Args:
        subset_path:   Path to subset segments file
                       (Type: str)

        data_dir:      Directory where dataset files will be saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of multiprocessing workers used to download videos
                       (Type: int)

    Keyword Args:
        **ffmpeg_cfg:  Configuration for audio and video
                       downloading and decoding done by ffmpeg
                       (Type: dict[str, *])
    """
    subset_name = get_subset_name(subset_path)

    # Load the set of YouTube IDs to keep (Applause/Speech segments) once,
    # instead of re-reading the ID file for every CSV row; assumes one
    # YouTube ID per line in filemove/both_id.txt
    with open('filemove/both_id.txt', 'r') as id_file:
        keep_ids = set(id_file.read().split())

    LOGGER.info('Starting download jobs for subset "{}"'.format(subset_name))
    with open(subset_path, 'r') as f:
        subset_data = csv.reader(f)

        # Set up multiprocessing pool
        pool = mp.Pool(num_workers)
        try:
            for row_idx, row in enumerate(subset_data):
                # Skip empty and commented lines
                if not row or row[0].startswith('#'):
                    continue
                ytid, ts_start, ts_end = row[0], float(row[1]), float(row[2])

                # Skip files that already have been downloaded
                media_filename = get_media_filename(ytid, ts_start, ts_end)
                video_filepath = os.path.join(
                    data_dir, 'video', media_filename + '.' +
                    ffmpeg_cfg.get('video_format', 'mp4'))
                audio_filepath = os.path.join(
                    data_dir, 'audio', media_filename + '.' +
                    ffmpeg_cfg.get('audio_format', 'flac'))
                if os.path.exists(video_filepath) and os.path.exists(
                        audio_filepath):
                    info_msg = 'Already downloaded video {} ({} - {}). Skipping.'
                    LOGGER.info(info_msg.format(ytid, ts_start, ts_end))
                    continue

                # Skip files that are neither Applause nor Speech
                if ytid not in keep_ids:
                    continue
                LOGGER.info('Found Applause/Speech segment: {}'.format(ytid))

                worker_args = [
                    ytid, ts_start, ts_end, data_dir, ffmpeg_path, ffprobe_path
                ]
                pool.apply_async(partial(segment_mp_worker, **ffmpeg_cfg),
                                 worker_args)
                # Run serially
                #segment_mp_worker(*worker_args, **ffmpeg_cfg)

        except csv.Error as e:
            err_msg = 'Encountered error in {} at line {}: {}'.format(
                subset_path, row_idx + 1, e)
            LOGGER.error(err_msg)
            sys.exit(err_msg)
        except KeyboardInterrupt:
            LOGGER.info("Forcing exit.")
            exit()
        finally:
            try:
                pool.close()
                pool.join()
            except KeyboardInterrupt:
                LOGGER.info("Forcing exit.")
                exit()

    LOGGER.info('Finished download jobs for subset "{}"'.format(subset_name))
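
# get_media_filename is referenced above but not defined in this snippet.
# A hedged sketch, assuming names of the form "<ytid>_<start-ms>_<end-ms>";
# the naming scheme in the original code may differ.
def get_media_filename(ytid, ts_start, ts_end):
    """Build the base filename for a downloaded segment (assumed format)."""
    return '{}_{}_{}'.format(ytid, int(ts_start * 1000), int(ts_end * 1000))
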
def download_subset_videos(subset_path, data_dir, ffmpeg_path, ffprobe_path,
                           num_workers, **ffmpeg_cfg):
    """
    Download subset segment file and videos

    Args:
        subset_path:   Path to subset segments file
                       (Type: str)

        data_dir:      Directory where dataset files will be saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of multiprocessing workers used to download videos
                       (Type: int)

    Keyword Args:
        **ffmpeg_cfg:  Configuration for audio and video
                       downloading and decoding done by ffmpeg
                       (Type: dict[str, *])
    """
    subset_name = get_subset_name(subset_path)

    failed_ids = load_failures()
    LOGGER.info('Loaded {} previously failed video IDs'.format(len(failed_ids)))

    LOGGER.info('Preparing jobs for subset "{}"'.format(subset_name))

    import joblib
    import pandas

    def setup_jobs(data):
        jobs = []

        remaining = []

        try:
            for row_idx, row in enumerate(data):
                # Skip empty and commented lines
                if not row or row[0].startswith('#'):
                    continue
                ytid, ts_start, ts_end = row[0], float(row[1]), float(row[2])

                audio_only = not bool(ffmpeg_cfg.get('video_mode'))
                output_exists = check_output_exists(data_dir,
                                                    ytid,
                                                    ts_start,
                                                    ts_end,
                                                    audio_only=audio_only)
                if output_exists:
                    continue

                if ytid in failed_ids:
                    continue

                worker_args = [
                    ytid, ts_start, ts_end, data_dir, ffmpeg_path,
                    ffprobe_path, ffmpeg_cfg, failed_ids
                ]

                remaining.append((ytid, ts_start, ts_end))
                job = joblib.delayed(process_job)(*worker_args)
                jobs.append(job)
        except csv.Error as e:
            LOGGER.error(f'CSV error in {subset_path} at line {row_idx + 1}: {e}')

        df = pandas.DataFrame.from_records(remaining)
        df.to_csv("remaining.csv", index=False, header=False)

        return jobs

    # Prepare jobs
    jobs = []
    with open(subset_path, 'r') as f:
        subset_data = csv.reader(f)
        jobs = setup_jobs(subset_data)

    LOGGER.info('Starting {} download jobs for subset "{}"'.format(
        len(jobs), subset_name))

    # Execute jobs
    results = joblib.Parallel(n_jobs=num_workers)(jobs)

    LOGGER.info('Finished download jobs for subset "{}"'.format(subset_name))
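

# load_failures and check_output_exists are referenced above but not
# defined in this snippet. Minimal sketches under stated assumptions:
# failed IDs are persisted one per line (the filename here is a guess),
# and output paths mirror the audio/video layout and default formats
# (flac/mp4) used by the earlier variant of download_subset_videos.
import os


def load_failures(path='failed_ids.txt'):
    """Return the set of previously failed YouTube IDs (assumed format)."""
    if not os.path.exists(path):
        return set()
    with open(path, 'r') as f:
        return set(line.strip() for line in f if line.strip())


def check_output_exists(data_dir, ytid, ts_start, ts_end, audio_only=False):
    """Check whether the outputs for a segment already exist (assumed
    default formats: flac audio, mp4 video)."""
    media_filename = get_media_filename(ytid, ts_start, ts_end)
    audio_path = os.path.join(data_dir, 'audio', media_filename + '.flac')
    video_path = os.path.join(data_dir, 'video', media_filename + '.mp4')
    if audio_only:
        return os.path.exists(audio_path)
    return os.path.exists(audio_path) and os.path.exists(video_path)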