Exemple #1
0
    def infer_msg(self, tts, rsp=None):
        """Attempt to guess what went wrong by using known
        information (e.g. http response) and observed behaviour

        Args:
            tts: The TTS object the error relates to. Its ``tld``,
                ``lang`` and ``lang_check`` attributes are read.
            rsp: The HTTP response obtained from the TTS API (expected to
                be a ``requests.Response``; only ``status_code`` and
                ``reason`` are read), or ``None`` when no response was
                received. Default is ``None``.

        Returns:
            string: A message of the form
            ``"<premise>. Probable cause: <cause>"``. The cause is
            ``"Unknown"`` when none of the known situations match.

        """
        cause = "Unknown"

        if rsp is None:
            premise = "Failed to connect"

            # A non-default TLD is the most likely culprit when nothing
            # came back at all.
            if tts.tld != 'com':
                host = _translate_url(tld=tts.tld)
                cause = "Host '{}' is not reachable".format(host)

        else:
            # rsp should be <requests.Response>
            # http://docs.python-requests.org/en/master/api/
            status = rsp.status_code
            reason = rsp.reason

            premise = "{:d} ({}) from TTS API".format(status, reason)

            if status == 403:
                cause = "Bad token or upstream API changes"
            elif status == 404 and not tts.lang_check:
                # Use the ``tts`` argument (like every other access in this
                # method) rather than instance state, so the message always
                # describes the object that was actually passed in.
                cause = "Unsupported language '%s'" % tts.lang
            elif status >= 500:
                cause = "Upstream API error. Try again later."

        return "{}. Probable cause: {}".format(premise, cause)
Exemple #2
0
    def _prepare_requests(self):
        """Build, without sending, one TTS API request per part of the text.

        The text is split with ``self._tokenize``; each part is packaged
        with ``self._package_rpc`` and wrapped in a ``POST`` request to the
        ``batchexecute`` endpoint.

        Returns:
            list: The prepared requests (the result of ``prepare()`` on each
            ``requests.Request``), in the order of the text parts.

        Raises:
            AssertionError: If tokenizing the text yields no parts.
        """
        # TTS API URL
        endpoint = _translate_url(
            tld=self.tld,
            path="_/TranslateWebserverUi/data/batchexecute")

        chunks = self._tokenize(self.text)
        log.debug("text_parts: %s", str(chunks))
        log.debug("text_parts: %i", len(chunks))
        assert chunks, 'No text to send to TTS API'

        prepared = []
        for position, chunk in enumerate(chunks):
            rpc_body = self._package_rpc(chunk)
            log.debug("data-%i: %s", position, rpc_body)

            # Build the request, then keep only its prepared form
            request = requests.Request(
                method='POST',
                url=endpoint,
                data=rpc_body,
                headers=self.GOOGLE_TTS_HEADERS,
            )
            prepared.append(request.prepare())

        return prepared
def _fetch_langs(tld="com"):
    """Fetch (scrape) languages from Google Translate.

    Google Translate loads a JavaScript Array of 'languages codes' that can
    be spoken. We intersect this list with all the languages Google Translate
    provides to get the ones that support text-to-speech.

    Args:
        tld (string): Top-level domain for the Google Translate host
            to fetch languages from. i.e `https://translate.google.<tld>`.
            The language names obtained will be in a language locale of the TLD
            (e.g. ``tld=fr`` will retrieve the French names of the languages).
            Default is ``com``.

    Returns:
        dict: A dictionary of languages from Google Translate, mapping each
        TTS-enabled language code to its name.

    Raises:
        KeyError: If the ``ds:3`` or ``ds:6`` data is missing from what
            ``_get_data_by_key`` returns.

    """

    url_base = _translate_url(tld)

    # Send a browser-like User-Agent with the page request.
    headers = {
        'User-Agent':
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) "
            "Version/14.0 Safari/605.1.15"
    }

    page = requests.get(url_base, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Keep the text of every <script> whose content starts with
    # 'AF_initDataCallback'.
    scripts = soup.find_all(name='script', string=re.compile(r"^AF_initDataCallback"))
    scripts = [s.text for s in scripts]

    # NOTE(review): presumably maps each 'ds:N' key to its parsed payload;
    # confirm against _get_data_by_key.
    data_by_key = _get_data_by_key(scripts)

    # Get all languages (ds:3)
    # data for 'ds:3' is
    #   [
    #       [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]],
    #       [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]]
    #   ]
    # (Note: list[0] and list[1] are identical)
    all_langs_raw = data_by_key["ds:3"]

    # Get languages codes that have TTS (ds:6)
    # data for 'ds:6' is
    #   [
    #       [['af', 200], ['ar', 200], ...]
    #   ]
    tts_langs_raw = data_by_key["ds:6"]
    # A set gives constant-time membership tests in the filter below.
    tts_langs = {lang[0] for lang in tts_langs_raw[0]}

    # Create language dict (and filter only TTS-enabled langs)
    return {k: v for k, v in all_langs_raw[0] if k in tts_langs}
Exemple #4
0
def _fetch_langs(tld="com"):
    """Fetch (scrape) languages from Google Translate.

    Google Translate loads a JavaScript Array of 'languages codes' that can
    be spoken. We intersect this list with all the languages Google Translate
    provides to get the ones that support text-to-speech.

    Args:
        tld (string): Top-level domain for the Google Translate host
            to fetch languages from. i.e `https://translate.google.<tld>`.
            The language names obtained will be in a language locale of the TLD
            (e.g. ``tld=fr`` will retrieve the French names of the languages).
            Default is ``com``.

    Returns:
        dict: A dictionary of languages from Google Translate, mapping each
        TTS-enabled language code to its name.

    Raises:
        TypeError: If no ``<script>`` whose ``src`` matches the expected
            JavaScript file name is found in the page.

    """
    # Imported locally so this function brings its own dependency into scope.
    from urllib.parse import urljoin

    url_base = _translate_url(tld)

    # The JavaScript file to look for is either:
    # * translate_m.js or
    # * translate_m_<lang-code>.js
    #   e.g. translate_m_fr.js or translate_m_zh-CN.js
    js_file = r'translate_m(|_\S*)\.js'

    # Load HTML
    page = requests.get(url_base)
    soup = BeautifulSoup(page.content, 'html.parser')

    # JavaScript URL
    # The <script src=''> path can change, but not the file.
    # Ex: /zyx/abc/20180211/translate_m.js
    js_path = soup.find(src=re.compile(js_file))['src']
    # Resolve the src against the base URL: this avoids doubled slashes for
    # root-relative paths and keeps an absolute src URL intact.
    js_url = urljoin(url_base, js_path)

    # Load JavaScript
    js_contents = requests.get(js_url).text

    # Approximately extract TTS-enabled language codes
    # RegEx pattern search because minified variables can change.
    # Extra garbage will be dealt with later as we keep languages only.
    # In: "[...]Fv={af:1,ar:1,[...],zh:1,"zh-cn":1,"zh-tw":1}[...]"
    # Out: ['is', '12', [...], 'af', 'ar', [...], 'zh', 'zh-cn', 'zh-tw']
    pattern = r'[{,\"](\w{2}|\w{2}-\w{2,3})(?=:1|\":1)'
    # Only membership matters below, so keep the codes in a set.
    tts_langs = set(re.findall(pattern, js_contents))

    # Build lang. dict. from main page (JavaScript object populating lang. menu)
    # Filtering with the TTS-enabled languages
    # In: "{code:'auto',name:'Detect language'},{code:'af',name:'Afrikaans'},[...]"
    # re.findall: [('auto', 'Detect language'), ('af', 'Afrikaans'), [...]]
    # Out: {'af': 'Afrikaans', [...]}
    trans_pattern = r"{code:'(?P<lang>.+?[^'])',name:'(?P<name>.+?[^'])'}"
    trans_langs = re.findall(trans_pattern, page.text)
    return {lang: name for lang, name in trans_langs if lang in tts_langs}
Exemple #5
0
    def _prepare_requests(self):
        """Build, without sending, one TTS API request per part of the text.

        The text is split with ``self._tokenize``; for each part a token is
        calculated and a ``GET`` request to the ``translate_tts`` endpoint
        is prepared.

        Returns:
            list: The prepared requests (the result of ``prepare()`` on each
            ``requests.Request``), in the order of the text parts.

        Raises:
            AssertionError: If tokenizing the text yields no parts.
            gTTSError: If a request error occurs while calculating a token.
        """
        # TTS API URL
        endpoint = _translate_url(tld=self.tld, path="translate_tts")

        chunks = self._tokenize(self.text)
        log.debug("text_parts: %i", len(chunks))
        assert chunks, 'No text to send to TTS API'

        prepared = []
        for position, chunk in enumerate(chunks):
            # Calculate token
            try:
                token = self.token.calculate_token(chunk)
            except requests.exceptions.RequestException as err:  # pragma: no cover
                log.debug(str(err), exc_info=True)
                raise gTTSError(
                    "Connection error during token calculation: %s" % str(err))

            # Query-string parameters for this part
            query = {
                'ie': 'UTF-8',
                'q': chunk,
                'tl': self.lang,
                'ttsspeed': self.speed,
                'total': len(chunks),
                'idx': position,
                'client': 'tw-ob',
                'textlen': _len(chunk),
                'tk': token,
            }
            log.debug("payload-%i: %s", position, query)

            # Build the request, then keep only its prepared form
            request = requests.Request(
                method='GET',
                url=endpoint,
                params=query,
                headers=self.GOOGLE_TTS_HEADERS,
            )
            prepared.append(request.prepare())

        return prepared
Exemple #6
0
def test_translate_url():
    """Check the URL ``_translate_url`` builds for a given ``tld`` and ``path``."""
    expected = "https://translate.google.qwerty/asdf"
    actual = _translate_url(tld="qwerty", path="asdf")
    assert actual == expected