Esempio n. 1
0
import webvtt
from tqdm import tqdm

from app.data_preparation.caption_merger.caption_merger import CaptionMerger
from app.data_preparation.caption_converter.video_caption import (
    Caption,
    VideoCaption,
)
from app.data_preparation.constants import (
    RAW_CAPTION_FILES_PATH,
    TED_CAPTIONS_DATASET_PATH,
)

from app.utils.utils import get_logger, save_as_json

logger = get_logger()


def run() -> None:
    """
    Build the captions dataset from the caption files (extension .vtt)
    and store it as a json file.

    The caption files are obtained from ``_load_caption_files_path()``,
    handed to ``create_captions_dataset`` and the result is written to
    ``TED_CAPTIONS_DATASET_PATH`` through ``save_as_json``.
    """
    logger.info("Loading text files")
    caption_paths = _load_caption_files_path()
    logger.info("Caption files loaded")

    # Build the dataset first, then persist it in a single step.
    dataset = create_captions_dataset(caption_paths)
    save_as_json(dataset, TED_CAPTIONS_DATASET_PATH)
Esempio n. 2
0
 def __init__(self):
     """Store the logger returned by ``get_logger()`` on ``self._logger``."""
     instance_logger = get_logger()
     self._logger = instance_logger