/
subs_utils.py
70 lines (47 loc) · 1.73 KB
/
subs_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# coding:utf-8
import io
import re

from pycaption import WebVTTWriter, WebVTTReader, SRTReader, detect_format
import pyvtt

import utils
def convert_subs_to_vtt(input_subs_path, output_vtt_path):
    """Convert a subtitle file to WebVTT and write it to ``output_vtt_path``.

    The input is decoded with the encoding reported by
    ``utils.get_file_encoding``, parsed with whichever pycaption reader
    ``detect_format`` selects for the text, and re-serialized with
    ``WebVTTWriter``. The result is written as UTF-8, the encoding the WebVTT
    format requires.

    Args:
        input_subs_path: Path of the subtitle file to read.
        output_vtt_path: Path of the WebVTT file to create or overwrite.
    """
    # Read raw bytes: the content is decoded explicitly right below, so text
    # mode (newline translation, implicit decoding) must not touch it first.
    with open(input_subs_path, 'rb') as f:
        text = f.read().decode(utils.get_file_encoding(input_subs_path))

    # NOTE(review): pycaption documents detect_format() as returning None for
    # unrecognized input, in which case the call below raises TypeError --
    # confirm whether callers want a clearer error here.
    reader = detect_format(text)
    subs = reader().read(text)
    output_text = WebVTTWriter().write(subs)

    # Encode explicitly instead of relying on the default codec, which fails
    # or mis-encodes as soon as a caption contains non-ASCII characters.
    # Assumes WebVTTWriter.write() returns text (unicode), not bytes.
    with io.open(output_vtt_path, 'w', encoding='utf-8') as w:
        w.write(output_text)
def get_subs(vtt_subs_path):
    """Load a WebVTT file and return its captions as plain dictionaries.

    Args:
        vtt_subs_path: Path of the WebVTT file to read. Its encoding is taken
            from ``utils.get_file_encoding``.

    Returns:
        A list with one ``{"text", "start", "end"}`` dict per caption of the
        first language in the file, in the order pycaption returns them.
        ``start`` and ``end`` are the caption's values divided by 1,000,000
        (pycaption documents caption times as microseconds, so these should
        be seconds -- TODO confirm). An empty list is returned when the
        parsed file reports no languages.
    """
    # Read raw bytes: the content is decoded explicitly right below, so text
    # mode (newline translation, implicit decoding) must not touch it first.
    with open(vtt_subs_path, 'rb') as f:
        text = f.read().decode(utils.get_file_encoding(vtt_subs_path))

    caption_set = WebVTTReader().read(text)
    languages = caption_set.get_languages()
    if not languages:
        return []
    vttsubs = caption_set.get_captions(languages[0])

    # Debug output: caption count plus a peek at the first caption. The peek
    # is guarded so an empty caption list does not raise IndexError.
    print("vttsubs total: %i " % len(vttsubs))
    if vttsubs:
        first = vttsubs[0]
        print(first.start)
        print(first.end)
        print(first.get_text())

    return [
        {
            "text": s.get_text(),
            "start": float(s.start) / 1000000,
            "end": float(s.end) / 1000000,
        }
        for s in vttsubs
    ]
def is_bad_subs(subs_text):
    """Tell whether a subtitle text should be rejected.

    Args:
        subs_text: Caption text to inspect.

    Returns:
        True when the text is empty or whitespace-only, or when it contains
        at least one ASCII digit or ASCII Latin letter; False otherwise.
    """
    # Nothing but whitespace: reject straight away.
    if subs_text.strip() == "":
        return True
    # A single digit or Latin letter anywhere in the text is enough.
    return re.search(r'[0-9A-Za-z]', subs_text) is not None
def clean_transcript_text_rus(transcript):
    """Normalize a Russian transcript to lowercase letters, hyphens and spaces.

    Processing order:
      1. remove ``<...>`` markup tags;
      2. turn newlines into spaces;
      3. trim and lowercase, then drop every character that is not a
         lowercase Cyrillic letter (а-я, ё), a hyphen or a space;
      4. trim again and replace ``ё`` with ``е``.

    Runs of spaces left behind by removed characters are not collapsed.

    Args:
        transcript: Text to clean (unicode text).

    Returns:
        The cleaned text.
    """
    transcript = re.sub(r'<[^>]*>', '', transcript)
    transcript = transcript.replace("\n", " ")
    # The backslash is doubled so the pattern value stays ``[^а-яё\- ]``
    # without relying on an invalid string escape sequence.
    transcript = re.sub(u'[^а-яё\\- ]', '', transcript.strip().lower()).strip()
    # Unicode literals: with byte-string literals Python 2 would try to
    # decode them as ASCII and raise UnicodeDecodeError on unicode input.
    return transcript.replace(u"ё", u"е")