import webvtt
from tqdm import tqdm

from app.data_preparation.caption_merger.caption_merger import CaptionMerger
from app.data_preparation.caption_converter.video_caption import (
    Caption,
    VideoCaption,
)
from app.data_preparation.constants import (
    RAW_CAPTION_FILES_PATH,
    TED_CAPTIONS_DATASET_PATH,
)
from app.utils.utils import get_logger, save_as_json

logger = get_logger()


def run() -> None:
    """
    Build the captions dataset and persist it as a JSON file.

    The caption file paths are obtained from ``_load_caption_files_path()``
    (caption files are expected to carry the ``.vtt`` extension), handed to
    ``create_captions_dataset()``, and the resulting dataset is written to
    ``TED_CAPTIONS_DATASET_PATH`` through ``save_as_json()``.
    """
    logger.info("Loading text files")
    caption_paths = _load_caption_files_path()
    logger.info("Caption files loaded")

    # Build the dataset first, then write it out in a single step.
    dataset = create_captions_dataset(caption_paths)
    save_as_json(dataset, TED_CAPTIONS_DATASET_PATH)
def __init__(self):
    """Initialise the instance with the logger returned by ``get_logger()``."""
    # Kept on the instance so methods can log through ``self._logger``.
    self._logger = get_logger()