-
Notifications
You must be signed in to change notification settings - Fork 0
/
youtube.py
58 lines (46 loc) · 1.62 KB
/
youtube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import logging
import os
import re
from datetime import datetime
import utils
from subtitles import Subtitles
from work_base import WorkBase, prepare_mapping
from pathlib import Path
from episode import Episode
log = logging.getLogger(__name__)
import itertools
def pairwise(iterable):
    """Return an iterator over overlapping neighbours of *iterable*.

    s -> (s0, s1), (s1, s2), (s2, s3), ...

    An input with fewer than two items produces no pairs.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # Move the second copy one element ahead so that zipping the two copies
    # lines every item up with its successor.
    try:
        next(trailing)
    except StopIteration:
        pass
    return zip(leading, trailing)
class YouTube(WorkBase):
    """Worker that turns listed episode URLs into captioned audio parts.

    URLs are read from the text file at ``self.dataset_path``. For each URL
    the cached episode's audio is converted to ``.flac`` and its SRT captions
    are grouped into parts that are handed to ``self._save_part``.
    """

    def _files_generator(self):
        """Yield every distinct, non-blank URL from ``self.dataset_path``.

        The file is read line by line. Each line is stripped of surrounding
        whitespace *before* the blank and duplicate checks, so that:

        * blank lines are skipped instead of being yielded as ``''``;
        * a URL on the final line (no trailing newline) is still recognised
          as a duplicate of the same URL seen earlier.

        Yields:
            str: URLs in file order, first occurrence only.
        """
        seen_urls = set()
        with Path(self.dataset_path).open() as fp:
            for raw_line in fp:
                url = raw_line.strip()
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)
                yield url

    def _process(self, url):
        """Split one episode into parts and return the collected results.

        Args:
            url: Episode URL, used as the lookup key for ``Episode.cached``.

        Returns:
            list: The ``results`` list that is passed to ``self._save_part``
            for every saved part; empty when ``Episode.cached`` returns a
            falsy value for ``url``.
        """
        log.info(f'Processing {url}')
        episode = Episode.cached(url)
        if not episode:
            return []
        tmp_audio_path = utils.convert(file_path=episode.audio_path, extension='.flac')
        subtitles = Subtitles.from_srt(episode.captions_path)
        results = []
        start = None
        accum_text = []
        # Walk the captions in (current, next) pairs so that the start of the
        # following part is known as soon as the current part is closed.
        for line, next_line in pairwise(subtitles.lines):
            # log.info(f'Processing {line}')
            if start is None:
                start = line.start
            accum_text.append(line.text)
            # Close the part once it spans more than 10 * 1000 time units
            # (presumably milliseconds, i.e. 10 seconds - TODO confirm).
            if line.end - start > 10 * 1000:
                # _save_part is not defined in this class; presumably it is
                # inherited from WorkBase and appends to ``results``.
                self._save_part(start, line.end, tmp_audio_path, ' '.join(accum_text), results)
                accum_text = []
                start = next_line.start
        # NOTE(review): pairwise() never yields the last caption as ``line``,
        # and text accumulated after the final save is not passed to
        # _save_part, so the tail of every episode is dropped. Confirm that
        # this is intended.
        log.info(f'Processed {url}')
        return results
        # shutil.rmtree(ep_folder, ignore_errors=True) # remove episode folder and files