def download(cls, url, language, options):
    """
    Download CCs from the given ``url`` for the given ``language``,
    and return a raw string with the result.

    The CCs are requested in TTML format through ``youtube_dl``
    (video download is skipped), read back from the file
    ``<tmp>.<language>.ttml`` as UTF-8 text, and the temporary files
    are cleaned up whether the download succeeds or fails.

    :param url: the URL to download the CCs from
    :param language: the language code of the CCs to request
    :param options: a dict-like object; if its ``cls.OPTION_AUTO`` value
                    is true, automatic CCs are requested instead of
                    the regular ones
    :raises NotDownloadedError: if ``youtube_dl`` fails or
                                the expected TTML file is not created
    :returns: the content of the downloaded TTML file
    """
    auto = options.get(cls.OPTION_AUTO, False)
    handler, tmp = gf.tmp_file()
    # NOTE(review): the placeholder file is removed before downloading,
    # presumably so that youtube_dl does not find an existing file
    # at its output template -- confirm
    if os.path.exists(tmp):
        os.remove(tmp)
    tmp = gf.to_unicode_string(tmp)
    out = u"%s.%s.ttml" % (tmp, language)
    ydl_options = {
        "outtmpl": tmp,
        "subtitlesformat": u"ttml",
        "subtitleslangs": [language],
        "writesubtitles": not auto,
        "writeautomaticsub": auto,
        "skip_download": True,
        "logger": YDLogger(),
    }
    try:
        try:
            with youtube_dl.YoutubeDL(ydl_options) as ydl:
                ydl.download([url])
        except Exception:
            # any failure inside youtube_dl is reported uniformly
            raise NotDownloadedError
        if not os.path.exists(out):
            raise NotDownloadedError
        with io.open(out, "r", encoding="utf-8") as out_file:
            return out_file.read()
    finally:
        # release the temporary handler/files on every exit path,
        # not only on success
        gf.delete_file(handler, tmp)
        if os.path.exists(out):
            gf.delete_file(None, out)
def from_code(cls, code):
    """
    Return the element of ``cls.ALL_LANGUAGES`` corresponding
    to the given ``code``, or ``None`` if there is no match.

    If ``code`` is already a ``LanguageObject`` contained in
    ``cls.ALL_LANGUAGES``, it is returned unchanged.
    Otherwise ``code`` is converted with ``gf.to_unicode_string``
    and the first element comparing equal to it is returned.

    :param code: a ``LanguageObject`` or a language code
    :returns: the matching element of ``cls.ALL_LANGUAGES``, or ``None``
    """
    already_known = isinstance(code, LanguageObject) and (code in cls.ALL_LANGUAGES)
    if already_known:
        return code
    wanted = gf.to_unicode_string(code)
    # first match wins, in the iteration order of ALL_LANGUAGES
    return next(
        (candidate for candidate in cls.ALL_LANGUAGES if candidate == wanted),
        None
    )
def predict(self, obj):
    """
    Tag a sequence of features with ``self.tagger``.

    ``obj`` is either a list of features or a Span containing Tokens.
    In the latter case, the features are extracted from the Span
    by ``sentence_to_features``.

    :param obj: a list of features, or a Span object
    :raises TypeError: if ``obj`` is neither a list nor a Span
    :returns: a pair ``(labels, probability)``, where ``labels`` is the list
              of labels returned by the tagger, converted with
              ``gf.to_unicode_string``, and ``probability`` is the value
              the tagger returns for that list of labels
    """
    if not isinstance(obj, (list, Span)):
        raise TypeError(u"The obj should be either a Span (sentence) object or a list of features (dict) objects.")
    # a list is used as is; a Span must be converted to features first
    feature_seq = obj if isinstance(obj, list) else sentence_to_features(obj)
    raw_labels = self.tagger.tag(feature_seq)
    labels = [gf.to_unicode_string(label) for label in raw_labels]
    score = self.tagger.probability(labels)
    return labels, score
def parse(cls, raw_data, language=None):
    """
    Parse the given ``raw_data`` string,
    and return a Document object.

    Before building the XML tree, ``<span>`` tags are removed,
    ``<br/>`` tags are replaced by a textual placeholder,
    and runs of whitespace are collapsed into a single space.
    Each TTML ``p`` element becomes a ``RawCCSpan`` holding
    one ``RawCCLineSpan`` per line and the begin/end time interval.

    :param raw_data: the TTML document, as a string
    :param language: the language to assign to the Document when
                     no ``tt`` element declares an ``xml:lang`` attribute
    :returns: a Document object
    """
    # constants
    PLACEHOLDER_BR = u" ||| "
    # bare marker used for splitting: the blanks surrounding it
    # might have been merged with those of an adjacent placeholder
    # or lie at the boundary of the text
    BR_MARKER = PLACEHOLDER_BR.strip()
    PLACEHOLDER_NO_TEXT = u"()"
    PATTERN_SPAN_OPEN = re.compile(r"<span[^>]*>")
    PATTERN_SPAN_CLOSE = re.compile(r"</span>")
    PATTERN_BR = re.compile(r"<br[ ]*/>")
    PATTERN_SPACES = re.compile(r"\s+")
    TTML_NS = "{http://www.w3.org/ns/ttml}"
    TTML_TT = "%stt" % TTML_NS
    TTML_P = "%sp" % TTML_NS
    TTML_BEGIN = "begin"
    TTML_END = "end"
    XML_NS = "{http://www.w3.org/XML/1998/namespace}"
    XML_LANG = "%slang" % XML_NS

    # remove spans
    s = raw_data
    s = PATTERN_SPAN_OPEN.sub(u"", s)
    s = PATTERN_SPAN_CLOSE.sub(u"", s)
    # replace br with placeholder
    s = PATTERN_BR.sub(PLACEHOLDER_BR, s)
    # remove duplicated spaces
    s = PATTERN_SPACES.sub(u" ", s).strip()
    # encode to utf-8 as required by lxml
    if gf.is_unicode(s):
        s = s.encode("utf-8")

    # create tree
    root = etree.fromstring(s)

    # parse language: the first xml:lang declared on a tt element wins,
    # otherwise keep the language given by the caller
    xml_lang = language
    for elem in root.iter(TTML_TT):
        declared_lang = elem.get(XML_LANG)
        if declared_lang is not None:
            xml_lang = gf.to_unicode_string(declared_lang)
            break

    raw_ccl = RawCCListSpan()

    # parse fragments
    for elem in root.iter(TTML_P):
        begin = gf.time_from_hhmmssmmm(elem.get(TTML_BEGIN).strip())
        end = gf.time_from_hhmmssmmm(elem.get(TTML_END).strip())
        # text missing
        text = elem.text or u""
        # split lines if the <br/> is present, stripping each piece
        # only AFTER splitting, so that a <br/> at the beginning/end
        # of the text or repeated <br/> do not leak the marker
        pieces = [piece.strip() for piece in text.split(BR_MARKER)]
        # make sure we return unicode strings, dropping empty lines
        lines = [gf.to_unicode_string(piece) for piece in pieces if len(piece) > 0]
        # if no text is available, replace it with ()
        if len(lines) == 0:
            lines = [PLACEHOLDER_NO_TEXT]
        lines = [u"%s %s" % (line, EndOfLineToken.RAW) for line in lines]
        # append span objects
        raw_ccl.append(
            RawCCSpan(
                elements=[RawCCLineSpan(raw=line) for line in lines],
                time_interval=TimeInterval(TimeValue(begin), TimeValue(end)),
            ))

    # create new Document object
    doc = Document(raw=raw_ccl, language=xml_lang)
    return doc