def infer_msg(self, tts, rsp=None):
    """Attempt to guess what went wrong by using known
    information (e.g. http response) and observed behaviour

    Args:
        tts: The TTS object the failed request was made for. Its ``tld``,
            ``lang`` and ``lang_check`` attributes are read.
        rsp: The HTTP response that was received (its ``status_code`` and
            ``reason`` attributes are read), or ``None`` when no response
            was obtained at all.

    Returns:
        str: A message of the form ``"<premise>. Probable cause: <cause>"``.
    """
    cause = "Unknown"

    if rsp is None:
        # No response at all: the request never completed.
        premise = "Failed to connect"

        # A non-default TLD is the most likely culprit for an
        # unreachable host.
        if tts.tld != 'com':
            host = _translate_url(tld=tts.tld)
            cause = "Host '{}' is not reachable".format(host)
    else:
        # rsp should be <requests.Response>
        # http://docs.python-requests.org/en/master/api/
        status = rsp.status_code
        reason = rsp.reason

        premise = "{:d} ({}) from TTS API".format(status, reason)

        if status == 403:
            cause = "Bad token or upstream API changes"
        elif status == 404 and not tts.lang_check:
            # Use the `tts` argument (not `self.tts`) so the message always
            # describes the object this call was asked about.
            cause = "Unsupported language '%s'" % tts.lang
        elif status >= 500:
            # NOTE(review): "Uptream" is a typo for "Upstream"; the text is
            # left untouched because callers or tests may match on it.
            cause = "Uptream API error. Try again later."

    return "{}. Probable cause: {}".format(premise, cause)
def _prepare_requests(self):
    """Create the TTS API request(s) without sending them.

    The text is split with ``self._tokenize`` and every resulting part is
    packaged with ``self._package_rpc`` into the body of one ``POST``
    request.

    Returns:
        list: ``requests.PreparedRequest`` objects, one per text part.
        <https://2.python-requests.org/en/master/api/#requests.PreparedRequest>

    Raises:
        AssertionError: If tokenizing the text yields no parts.
    """
    # TTS API URL
    translate_url = _translate_url(
        tld=self.tld,
        path="_/TranslateWebserverUi/data/batchexecute")

    text_parts = self._tokenize(self.text)
    log.debug("text_parts: %s", text_parts)
    log.debug("text_parts: %i", len(text_parts))

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not text_parts:
        raise AssertionError('No text to send to TTS API')

    prepared_requests = []
    for idx, part in enumerate(text_parts):
        data = self._package_rpc(part)

        log.debug("data-%i: %s", idx, data)

        # Request
        r = requests.Request(method='POST',
                             url=translate_url,
                             data=data,
                             headers=self.GOOGLE_TTS_HEADERS)

        # Prepare request
        prepared_requests.append(r.prepare())

    return prepared_requests
def _fetch_langs(tld="com"):
    """Fetch (scrape) languages from Google Translate.

    Google Translate loads a JavaScript Array of 'languages codes' that can
    be spoken. We intersect this list with all the languages Google Translate
    provides to get the ones that support text-to-speech.

    Args:
        tld (string): Top-level domain for the Google Translate host
            to fetch languages from. i.e `https://translate.google.<tld>`.
            The language names obtained will be in a language locale of the TLD
            (e.g. ``tld=fr`` will retrieve the French names of the languages).
            Default is ``com``.

    Returns:
        dict: A dictionary of languages from Google Translate, mapping each
        TTS-enabled language code to its name.
    """
    URL_BASE = _translate_url(tld)

    headers = {
        'User-Agent':
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) "
            "Version/14.0 Safari/605.1.15"
    }

    page = requests.get(URL_BASE, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Only the inline scripts that register page data are of interest.
    scripts = soup.find_all(name='script',
                            string=re.compile(r"^AF_initDataCallback"))
    scripts = [s.text for s in scripts]

    data_by_key = _get_data_by_key(scripts)

    # Get all languages (ds:3)
    # data for 'ds:3' is
    # [
    #   [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]],
    #   [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]]
    # ]
    # (Note: list[0] and list[1] are identical)
    all_langs_raw = data_by_key["ds:3"]

    # Get languages codes that have TTS (ds:6)
    # data for 'ds:6' is
    # [
    #   [['af', 200], ['ar', 200], ...]
    # ]
    tts_langs_raw = data_by_key["ds:6"]
    # A set, since it is only used for membership tests below.
    tts_langs = {lang[0] for lang in tts_langs_raw[0]}

    # Create language dict (and filter only TTS-enabled langs)
    return {k: v for k, v in all_langs_raw[0] if k in tts_langs}
def _fetch_langs(tld="com"):
    """Fetch (scrape) languages from Google Translate.

    Google Translate loads a JavaScript Array of 'languages codes' that can
    be spoken. We intersect this list with all the languages Google Translate
    provides to get the ones that support text-to-speech.

    Args:
        tld (string): Top-level domain for the Google Translate host
            to fetch languages from. i.e `https://translate.google.<tld>`.
            The language names obtained will be in a language locale of the TLD
            (e.g. ``tld=fr`` will retrieve the French names of the languages).
            Default is ``com``.

    Returns:
        dict: A dictionary of languages from Google Translate, mapping each
        TTS-enabled language code to its name.
    """
    URL_BASE = _translate_url(tld)

    # The JavaScript file to look for is either:
    # * translate_m.js or
    # * translate_m_<lang-code>.js
    #   e.g. translate_m_fr.js or translate_m_zh-CN.js
    JS_FILE = r'translate_m(|_\S*)\.js'

    # Load HTML
    page = requests.get(URL_BASE)
    soup = BeautifulSoup(page.content, 'html.parser')

    # JavaScript URL
    # The <script src=''> path can change, but not the file.
    # Ex: /zyx/abc/20180211/translate_m.js
    js_path = soup.find(src=re.compile(JS_FILE))['src']
    js_url = "{}/{}".format(URL_BASE, js_path)

    # Load JavaScript
    js_contents = requests.get(js_url).text

    # Approximately extract TTS-enabled language codes
    # RegEx pattern search because minified variables can change.
    # Extra garbage will be dealt with later as we keep languages only.
    # In: "[...]Fv={af:1,ar:1,[...],zh:1,"zh-cn":1,"zh-tw":1}[...]"
    # Out: ['is', '12', [...], 'af', 'ar', [...], 'zh', 'zh-cn', 'zh-tw']
    pattern = r'[{,\"](\w{2}|\w{2}-\w{2,3})(?=:1|\":1)'
    # Collected into a set: the matches contain duplicates and garbage, and
    # are only used for membership tests below.
    tts_langs = set(re.findall(pattern, js_contents))

    # Build lang. dict. from main page (JavaScript object populating lang. menu)
    # Filtering with the TTS-enabled languages
    # In: "{code:'auto',name:'Detect language'},{code:'af',name:'Afrikaans'},[...]"
    # re.findall: [('auto', 'Detect language'), ('af', 'Afrikaans'), [...]]
    # Out: {'af': 'Afrikaans', [...]}
    trans_pattern = r"{code:'(?P<lang>.+?[^'])',name:'(?P<name>.+?[^'])'}"
    trans_langs = re.findall(trans_pattern, page.text)
    return {lang: name for lang, name in trans_langs if lang in tts_langs}
def _prepare_requests(self):
    """Create the TTS API request(s) without sending them.

    The text is split with ``self._tokenize``; for every part a token is
    calculated with ``self.token.calculate_token`` and one ``GET`` request
    carrying the part and its token as query parameters is prepared.

    Returns:
        list: ``requests.PreparedRequest`` objects, one per text part.
        <https://2.python-requests.org/en/master/api/#requests.PreparedRequest>

    Raises:
        AssertionError: If tokenizing the text yields no parts.
        gTTSError: If a request error occurs while calculating a token.
    """
    # TTS API URL
    translate_url = _translate_url(tld=self.tld, path="translate_tts")

    text_parts = self._tokenize(self.text)
    total = len(text_parts)
    log.debug("text_parts: %i", total)

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not text_parts:
        raise AssertionError('No text to send to TTS API')

    prepared_requests = []
    for idx, part in enumerate(text_parts):
        try:
            # Calculate token
            part_tk = self.token.calculate_token(part)
        except requests.exceptions.RequestException as e:  # pragma: no cover
            log.debug(str(e), exc_info=True)
            raise gTTSError(
                "Connection error during token calculation: %s" %
                str(e))

        payload = {
            'ie': 'UTF-8',
            'q': part,
            'tl': self.lang,
            'ttsspeed': self.speed,
            'total': total,
            'idx': idx,
            'client': 'tw-ob',
            'textlen': _len(part),
            'tk': part_tk,
        }

        log.debug("payload-%i: %s", idx, payload)

        # Request
        r = requests.Request(method='GET',
                             url=translate_url,
                             params=payload,
                             headers=self.GOOGLE_TTS_HEADERS)

        # Prepare request
        prepared_requests.append(r.prepare())

    return prepared_requests
def test_translate_url():
    """Check the URL ``_translate_url`` builds from a given TLD and path."""
    kwargs = dict(tld="qwerty", path="asdf")
    expected = "https://translate.google.qwerty/asdf"

    actual = _translate_url(**kwargs)

    assert actual == expected