from google.cloud import translate


def detect_language(project_id="YOUR_PROJECT_ID"):
    """Detecting the language of a text string."""

    client = translate.TranslationServiceClient()

    parent = client.location_path(project_id, "global")

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.detect_language(
        content="Hello, world!",
        parent=parent,
        mime_type="text/plain",  # mime types: text/plain, text/html
    )

    # Display list of detected languages sorted by detection confidence.
    # The most probable language is first.
    for language in response.languages:
        # The language detected
        print(u"Language code: {}".format(language.language_code))
        # Confidence of detection result for this language
        print(u"Confidence: {}".format(language.confidence))
Example No. 2
def translateToEnglish(text):
    client = translate.TranslationServiceClient()

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats

    response = client.translate_text(
        parent='projects/{}'.format(os.getenv('GOOGLE_CLOUD_PROJECT')),
        contents=[text],
        mime_type="text/plain",  # mime types: text/plain, text/html
        source_language_code="si",
        target_language_code="en-US",
    )

    result = ''

    # Display the translation for each input text provided
    for translation in response.translations:
        result = result + translation.translated_text

    return result
Example No. 3
def translate_text(cloud: str, text: str):

    if cloud == "azure":
        key = ''

        endpoint = 'https://api.cognitive.microsofttranslator.com/translate?api-version=3.0'
        params = '&to=it'

        full_url = endpoint + params
        headers = {
            'Ocp-Apim-Subscription-Key': key,
            'Content-type': 'application/json',
            'X-ClientTraceId': str(uuid.uuid4())
        }

        body = [{
            'text': text
        }]

        request = requests.post(full_url, headers=headers, json=body)

        output_response(request, cloud)
    elif cloud == "google":
        client = translate.TranslationServiceClient()

        parent = client.location_path('project_id', "global")

        request = client.translate_text(
            parent=parent,
            contents=[text],
            mime_type="text/plain",
            source_language_code="en-US",
            target_language_code="it",
        )

        output_response(request, cloud)
    else:
        print("Cloud not supported.")
Example No. 4
def batch_translate_text(
    input_uri="gs://input_translation/input/input_00.txt",
    output_uri="gs://out_translation/test-dev2015_questions_00_hin/",
    project_id="singular-antler-295914",
    timeout=3600,
):
    """Translates a batch of texts on GCS and stores the result in a GCS location."""

    client = translate.TranslationServiceClient()

    location = "us-central1"
    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    gcs_source = {"input_uri": input_uri}

    input_configs_element = {
        "gcs_source": gcs_source,
        "mime_type": "text/plain",  # Can be "text/plain" or "text/html".
    }
    gcs_destination = {"output_uri_prefix": output_uri}
    output_config = {"gcs_destination": gcs_destination}
    parent = f"projects/{project_id}/locations/{location}"

    # Supported language codes: https://cloud.google.com/translate/docs/language
    operation = client.batch_translate_text(
        request={
            "parent": parent,
            "source_language_code": "en",
            "target_language_codes": ["hi"],  # Up to 10 language codes here.
            "input_configs": [input_configs_element],
            "output_config": output_config,
        }
    )

    print("Waiting for operation to complete...")
    response = operation.result(timeout)

    print("Total Characters: {}".format(response.total_characters))
    print("Translated Characters: {}".format(response.translated_characters))
Example No. 5
def translate_text(text="YOUR_TEXT_TO_TRANSLATE",
                   project_id="YOUR_PROJECT_ID"):
    """Translating Text."""

    client = translate.TranslationServiceClient()

    parent = client.location_path(project_id, "global")

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        parent=parent,
        contents=[text],
        mime_type="text/plain",  # mime types: text/plain, text/html
        source_language_code=sys.argv[2],  # source language code from the command line
        target_language_code=sys.argv[3],  # target language code from the command line
    )
    # return the translation for each input text provided
    result = ''
    for translation in response.translations:
        #print(u"Translated text: {}".format(translation.translated_text))
        result += translation.translated_text
    return result
Example No. 6
def translate_text(texts, target="uk", project_id="engaged-kite-304010"):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """

    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": texts,
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": "en-US",
            "target_language_code": target,
        })

    return [
        translation.translated_text for translation in response.translations
    ]
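A short usage sketch for the function above (the inputs are illustrative); the translations come back in the same order as the input list:

english_lines = ["Hello, world!", "How are you?"]
ukrainian_lines = translate_text(english_lines)  # target defaults to "uk"
for source_line, translated_line in zip(english_lines, ukrainian_lines):
    print(f"{source_line} -> {translated_line}")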
def sample_translate_text(text="YOUR_TEXT_TO_TRANSLATE",
                          project_id="YOUR_PROJECT_ID",
                          language="fr"):
    """Translating Text."""

    client = translate.TranslationServiceClient()

    parent = client.location_path(project_id, "global")

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        parent=parent,
        contents=[text],
        mime_type="text/plain",  # mime types: text/plain, text/html
        source_language_code="en-US",
        target_language_code=language,
    )
    print(f"You passed in this language {language}")
    # Display the translation for each input text provided
    for translation in response.translations:
        print("Translated text: {}".format(translation.translated_text))
    return "Translated text: {}".format(translation.translated_text)
Example No. 8
def translateToRandomLang(project_id, message):
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    try:
        supported_langs = getSupportedLanguages(client, parent)
        if supported_langs is not None:
            # Pick a random target language from the supported set
            target_lang = random.choice(supported_langs.languages).language_code
            source_lang = getMessageLanguage(client, parent, message)
            if source_lang is not None:
                translated_mess = translateFromSourceToTarget(
                    client, parent, source_lang, target_lang, message)
                if translated_mess is not None:
                    return translated_mess

        return None
    except Exception as e:
        print("translateToRandomLang error", e)
        return None
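The three helpers used above (getSupportedLanguages, getMessageLanguage and translateFromSourceToTarget) are not shown in this listing; the following are hypothetical minimal implementations, assuming the request-dict style of the v3 client:

def getSupportedLanguages(client, parent):
    # Hypothetical helper: fetch the languages the API can translate between.
    try:
        return client.get_supported_languages(request={"parent": parent})
    except Exception as e:
        print("getSupportedLanguages error", e)
        return None


def getMessageLanguage(client, parent, message):
    # Hypothetical helper: detect the source language of the message.
    try:
        response = client.detect_language(
            request={"parent": parent, "content": message, "mime_type": "text/plain"})
        return response.languages[0].language_code if response.languages else None
    except Exception as e:
        print("getMessageLanguage error", e)
        return None


def translateFromSourceToTarget(client, parent, source_lang, target_lang, message):
    # Hypothetical helper: translate the message from the detected to the random language.
    try:
        response = client.translate_text(
            request={
                "parent": parent,
                "contents": [message],
                "mime_type": "text/plain",
                "source_language_code": source_lang,
                "target_language_code": target_lang,
            })
        return response.translations[0].translated_text
    except Exception as e:
        print("translateFromSourceToTarget error", e)
        return None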
def batch_translate_text(
    input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
    output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/",
    project_id="YOUR_PROJECT_ID",
    timeout=180,
):
    """Translates a batch of texts on GCS and stores the result in a GCS location."""

    client = translate.TranslationServiceClient()

    location = "us-central1"
    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    gcs_source = {"input_uri": input_uri}

    input_configs_element = {
        "gcs_source": gcs_source,
        "mime_type": "text/plain"  # Can be "text/plain" or "text/html".
    }
    gcs_destination = {"output_uri_prefix": output_uri}
    output_config = {"gcs_destination": gcs_destination}
    parent = f"projects/{project_id}/locations/{location}"

    # Supported language codes: https://cloud.google.com/translate/docs/language
    operation = client.batch_translate_text(
        request={
            "parent": parent,
            "source_language_code": "en",
            "target_language_codes": ["ja"],  # Up to 10 language codes here.
            "input_configs": [input_configs_element],
            "output_config": output_config
        })

    print(u"Waiting for operation to complete...")
    response = operation.result(timeout)

    print(u"Total Characters: {}".format(response.total_characters))
    print(u"Translated Characters: {}".format(response.translated_characters))
Example No. 10
def translate_text(source="en-US",
                   target="fr",
                   text="Example Application",
                   project_id="rosy-griffin-312113"):
    """Translating Text."""

    client = translate.TranslationServiceClient()

    location = "global"

    parent = f"projects/{project_id}/locations/{location}"

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": f"{source}",
            "target_language_code": f"{target}",
        })

    return response.translations
def sample_translate_text(text):
    client = translate.TranslationServiceClient()

    target_language = 'pt'  # desired language for the translation

    # credentials for the API
    project_id = 'projeto-tcc-276919'
    parent = client.location_path(project_id, "global")

    contents = [text]

    # Call the API and get its response
    response = client.translate_text(
        parent=parent,
        contents=contents,
        mime_type='text/plain',
        source_language_code='en-US',  # the word's current language
        target_language_code=target_language)

    #for translation in response.translations:
    #    print(u"Translated text: {}".format(translation.translated_text))

    # return only the first result of the list, since this is a single word
    return response.translations[0].translated_text
Example No. 12
    def translate(self, writer, entity):
        if not entity.schema.is_a("Analyzable"):
            return

        # This isn't part of the example, just a generic call to Google
        # cloud translation. This implementation doesn't do caching, does
        # not check if the document is already in the target language,
        # and does not translate other string values like names.
        #
        # This code isn't the point. Don't run it in production, and if you
        # do anyway, don't complain about the fact it's bad. PRs welcome.
        if not hasattr(self, "client"):
            self.client = translate.TranslationServiceClient()
            self.parent = self.client.location_path(PROJECT_ID, "global")

        # Get all the text parts of the entity:
        contents = entity.get_type_values(registry.text)
        if not len(contents):
            return
        log.info("Translating %r", entity)
        response = self.client.translate_text(
            parent=self.parent,
            contents=contents,
            mime_type="text/plain",
            target_language_code=TARGET_LANGUAGE,
        )
        # Make a copy of the entity with no properties set:
        translated = model.make_entity(entity.schema)
        translated.id = entity.id
        for translation in response.translations:
            # log.debug("Received: %s", translation.translated_text)
            translated.add("indexText", translation.translated_text)
            # Store the generated translation fragment for the entity
            # in the ftm-store database. All the properties of the
            # entity will be combined upon indexing.
            writer.put(translated)
def translate_text(text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID"):
    """Translating Text."""

    client = translate.TranslationServiceClient()

    location = "global"

    parent = f"projects/{project_id}/locations/{location}"

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": "en-US",
            "target_language_code": "fr"
        }
    )

    # Display the translation for each input text provided
    for translation in response.translations:
        print(u"Translated text: {}".format(translation.translated_text))
Example No. 14
def main(analysis=None):
    if analysis == None:
        analysis = processor.sources_analysis.do_analysis()
    project_id = "francophonic-1565560815749"
    credential_path = "K:/private/anchpop/privatekeys/Francophonic-f72c700469aa.json"
    environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    current_dictionary = get_word_dictionary()

    (collected_words, collected_sentences, source_info) = analysis
    understandable_sentences = get_understandable_sentences(analysis)

    translations = get_translations()

    sentences_to_translate = understandable_sentences - translations[
        'french_to_english'].keys()

    print(f"{len(sentences_to_translate)} sentences to translate")

    if len(sentences_to_translate) > 0:
        characters = sum([len(s) for s in sentences_to_translate])
        while True:
            inp = input(
                f"To translate these {len(sentences_to_translate)} sentences would cost around ${characters / 1000000 * 20}, continue? (yes/no/view) "
            )
            if inp == "view":
                for sentence in sentences_to_translate:
                    print(sentence)
            elif inp == "yes":
                client = translate.TranslationServiceClient()
                location = "global"
                parent = f"projects/{project_id}/locations/{location}"
                for sentence in sentences_to_translate:
                    if (sentence != sentence.strip()):
                        raise Exception(
                            f"sentence \"{Style.DIM}{sentence}{Style.RESET_ALL}\" is unstripped!"
                        )

                    print(
                        f"Translating \"{Style.DIM}{sentence}{Style.RESET_ALL}\""
                    )
                    response = client.translate_text(
                        request={
                            "parent": parent,
                            "contents": [sentence],
                            "mime_type":
                            "text/plain",  # mime types: text/plain, text/html
                            "source_language_code": "fr",
                            "target_language_code": "en-US",
                        })

                    translations['french_to_english'][sentence] = {
                        'google': [
                            translation.translated_text
                            for translation in response.translations
                        ]
                    }

                translations['english_to_french'] = translations.get(
                    'english_to_french', [])
                data = yaml.dump(translations,
                                 Dumper=Dumper,
                                 allow_unicode=True)
                with safer.open("translations.yaml", "w",
                                encoding='utf-8') as f:
                    f.write(data)
                break
            else:
                break
Example No. 15
from google.cloud import translate
import os
import csv

os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"] = "ADD PATH TO YOUR GOOGLE CLOUD SERVICE ACCOUNT API KEY HERE"

# https://cloud.google.com/translate/docs/basic/translating-text
# Pull training questions from input_file_name and call the Google Translation API to get a JSON response.
# Write the translated text from the JSON response to a new CSV spreadsheet, in the same order as the input training questions.
# The output is a CSV file called translation_results.csv.

# Remember to set the source and target languages; language codes can be obtained from Google: https://developers.google.com/admin-sdk/directory/v1/languages

client = translate.TranslationServiceClient()
source_language = 'tr'
target_language = 'en-CA'
parent = client.location_path('tqtraining', 'global')
input_file_format = 'text/plain'
input_file_name = 'turkish_training_questions.csv'


def translation_api():
    with open(input_file_name) as training_questions:
        translation_file = csv.reader(training_questions)
        for tq in translation_file:
            print("Text being translated: {}".format(tq))
            with open('translation_results.csv', 'a') as f:
                translation_output_file = csv.writer(f)
                translation_output = client.translate_text(
                    mime_type=input_file_format,
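The listing cuts off mid-call above; what follows is a minimal sketch of the loop the comments describe, assuming the keyword-argument translate_text signature used elsewhere in these examples and the module-level variables defined above:

def translation_api_sketch():
    with open(input_file_name) as training_questions, \
            open('translation_results.csv', 'a', newline='') as results:
        translation_output_file = csv.writer(results)
        for tq in csv.reader(training_questions):
            print("Text being translated: {}".format(tq))
            translation_output = client.translate_text(
                parent=parent,
                contents=tq,  # each CSV row is a list of strings
                mime_type=input_file_format,
                source_language_code=source_language,
                target_language_code=target_language,
            )
            # One translated cell per input cell, written in the original order
            translation_output_file.writerow(
                [t.translated_text for t in translation_output.translations])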
Example No. 16
 def __init__(self, project_id='moonlit-haven-256102'):
     self.client = translate.TranslationServiceClient()
     self.parent = self.client.location_path(project_id, "global")
Example No. 17
def setUpModule():
    Config.CLIENT_V2 = translate_v2.Client()
    Config.CLIENT_V3 = translate.TranslationServiceClient()
Example No. 18
logging.getLogger().setLevel(logging.INFO)

import re
import time
import os

project_id = os.getenv('PROJECT_ID')
bucket_name = os.getenv('BUCKET_NAME')
location = os.getenv('LOCATION')
key_path = os.getenv('SA_KEY_PATH')

credentials = service_account.Credentials.from_service_account_file(key_path)

storage_client = storage.Client(credentials=credentials)

translate_client = translate.TranslationServiceClient(credentials=credentials)

lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                              prefix='raw_txt')

customize_stop_words = [
    'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
    'Borgheresi', 'Agostini', 'Ottaviani', 'Floridi', 'Giovagnoni', 'di', 'specialization',
    'Polytechnic', 'University', 'marche', 'ANCONA', 'Italy', 'Azienda', 'Ospedali',
    'Riuniti', 'Yorrette', 'Matera', 'Michele', 'Nardella', 'Gerardo', 'Costanzo',
    'Claudia', 'Lopez', 'st', 'a.', 'a', 'of', 's', 'cien', 'ze', 'diolog', 'ic', 'he',
    'â', '€', 's', 'b', 'case', 'Cuoladi', 'l', 'c', 'ra', 'bergamo', 'patelli', 'est', 'asst',
    'dr', 'Dianluigi', 'Svizzero', 'i', 'riccardo', 'Alessandro', 'Spinazzola', 'angelo',
    'maggiore', 'p', 'r', 't', 'm', 'en', 't', 'o', 'd', 'e', 'n', 'd', 'o', 'g', 'h', 'u',
    'man', 'female', 'D'
]
Example No. 19
def sample_batch_translate_text_with_glossary_and_model(
    input_uri,
    output_uri,
    project_id,
    location,
    target_language,
    source_language,
    model_id,
    glossary_id,
):
    """
    Batch translate text with Glossary and Translation model
    """

    client = translate.TranslationServiceClient()

    # TODO(developer): Uncomment and set the following variables
    # input_uri = 'gs://cloud-samples-data/text.txt'
    # output_uri = 'gs://YOUR_BUCKET_ID/path_to_store_results/'
    # project = '[Google Cloud Project ID]'
    # location = 'us-central1'
    # target_language = 'en'
    # source_language = 'de'
    # model_id = '{your-model-id}'
    # glossary_id = '[YOUR_GLOSSARY_ID]'
    target_language_codes = [target_language]
    gcs_source = {"input_uri": input_uri}

    # Optional. Can be "text/plain" or "text/html".
    mime_type = "text/plain"
    input_configs_element = {"gcs_source": gcs_source, "mime_type": mime_type}
    input_configs = [input_configs_element]
    gcs_destination = {"output_uri_prefix": output_uri}
    output_config = {"gcs_destination": gcs_destination}
    parent = f"projects/{project_id}/locations/{location}"
    model_path = "projects/{}/locations/{}/models/{}".format(
        project_id, "us-central1", model_id)
    models = {target_language: model_path}

    glossary_path = client.glossary_path(
        project_id,
        location,  # the location of the glossary
        glossary_id
    )

    glossary_config = translate.TranslateTextGlossaryConfig(
        glossary=glossary_path)
    glossaries = {"ja": glossary_config}  # target lang as key

    operation = client.batch_translate_text(
        request={
            "parent": parent,
            "source_language_code": "en",
            "target_language_codes": target_language_codes,
            "input_configs": input_configs,
            "output_config": output_config,
            "models": models,
            "glossaries": glossaries,
        })

    print("Waiting for operation to complete...")
    response = operation.result()

    # Display the translation for each input text provided
    print("Total Characters: {}".format(response.total_characters))
    print("Translated Characters: {}".format(response.translated_characters))
Example No. 20
 def __init__(self, project_id):
     self.project_id = project_id
     self.client = translate.TranslationServiceClient()
     self.parent = self.client.location_path(project_id, "global")
Example No. 21
 def setup(self):
     self._client = translate.TranslationServiceClient()
Example No. 22
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("--input_bucket", type=str, default="almond_dataset")
    parser.add_argument("--input_local_path", type=str, default="./")
    parser.add_argument("--input_names",
                        type=str,
                        nargs='+',
                        default=["input.tsv"])
    parser.add_argument('--output_bucket', type=str, default="almond_output")
    parser.add_argument("--glossary_local_path", type=str, default="./extras/")
    parser.add_argument("--glossary_name", type=str, default="glossary.csv")
    parser.add_argument("--glossary_bucket",
                        type=str,
                        default="almond_glossary")
    parser.add_argument("--project_id", type=str, default="")
    parser.add_argument("--project_number", type=str, default="")
    parser.add_argument("--location", type=str, default="us-central1")
    parser.add_argument("--source_lang", type=str, default="en")
    parser.add_argument("--target_langs", type=str, nargs='+', default=["fa"])
    parser.add_argument("--model_id", type=str, default="")
    parser.add_argument("--glossary_type",
                        type=str,
                        choices=['default', 'manual'],
                        default="default")

    parser.add_argument("--update_glossary", action='store_true')
    parser.add_argument("--no_glossary", action='store_true')
    parser.add_argument('--no_translate_params', action='store_true')
    parser.add_argument("--update_dataset", action='store_true')
    parser.add_argument("--do_translate", action='store_true')
    parser.add_argument("--download_results", action='store_true')
    parser.add_argument("--remove_output_id", action='store_true')
    parser.add_argument("--overwrite_output", action='store_true')

    parser.add_argument("--output_local_path",
                        type=str,
                        default="./translation_results")
    parser.add_argument("--keep_blob_name",
                        action='store_true',
                        help="output files names is same as their blob names")

    parser.add_argument('--credential_file', default='', type=str)
    args = parser.parse_args()

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credential_file

    global TRANSLATION_CLIENT
    global STORAGE_CLIENT

    TRANSLATION_CLIENT = translate.TranslationServiceClient()
    STORAGE_CLIENT = storage.Client()

    if args.update_glossary or args.do_translate:

        if args.glossary_type == 'default':
            glossary_name = 'default.csv'
        else:
            glossary_name = args.glossary_name

        glossary_id = glossary_name.rsplit('.', 1)[0] + '_id'

    if args.update_glossary:

        if args.glossary_type == 'default':
            create_default_glossary(args.input_local_path,
                                    args.input_names,
                                    args.glossary_local_path,
                                    args.source_lang,
                                    args.target_langs,
                                    not args.no_translate_params,
                                    glossary_name,
                                    special_words=set())

        glossary_uri = os.path.join(
            *['gs://', args.glossary_bucket, glossary_name])
        upload_blob(args.glossary_bucket, args.glossary_local_path,
                    glossary_name)
        upload_term_set_glossary(args.project_id, glossary_uri, glossary_id,
                                 args.source_lang, args.target_langs)

    if args.update_dataset:
        upload_blob(args.input_bucket, args.input_local_path, args.input_names)

    use_glossary = True
    if args.no_glossary:
        use_glossary = False

    if args.do_translate:

        if bucket_exists(args.output_bucket):
            if args.overwrite_output:
                delete_blobs(args.output_bucket)
        else:
            create_bucket(args.output_bucket)

        input_uris = []
        for input_name in args.input_names:
            input_uris.append(
                os.path.join(*['gs://', args.input_bucket, input_name]))
        output_uri = os.path.join(*['gs://', args.output_bucket + '/'])

        sample_batch_translate_text_with_glossary_and_model(
            input_uris, output_uri, args.project_id, args.location,
            args.target_langs, args.source_lang, args.model_id, glossary_id,
            use_glossary)

    if args.download_results:

        input2origids = dict()
        for input_name in args.input_names:
            input2origids[input_name] = get_ids(args.input_local_path,
                                                input_name)

        output_path = args.output_local_path
        os.makedirs(output_path, exist_ok=True)

        download_results(args.output_bucket, output_path, input2origids,
                         use_glossary, args.keep_blob_name,
                         args.remove_output_id)
Example No. 23
def main():
    parser = argparse.ArgumentParser(description="EASI Translate")
    parser.add_argument("--menus", required=True, help="Path to Directory containing EASI Menu JSONs")
    parser.add_argument("--generate-index", required=False, action="store_true", help="Just generate the index based on the directory containing EASI Menu JSONs")
    parser.add_argument("--webdriver", required=True, help="The type of Selenium WebDriver")
    parser.add_argument("--webdriver-path", required=True, help="Path to the Selenium WebDriver")
    parser.add_argument("--gapps", required=True, help="Path to Google Application Credentials JSON")
    parser.add_argument("--log", required=False, help="Path to log file")
    args = parser.parse_args()

    logger = logging.getLogger()
    console_logger = logging.StreamHandler()
    console_logger.setFormatter(LOG_FORMATTER)
    logger.addHandler(console_logger)
    logger.setLevel(logging.DEBUG)
    if args.log:
        file_logger = logging.FileHandler(args.log)
        file_logger.setFormatter(LOG_FORMATTER)
        logger.addHandler(file_logger)
    
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.gapps
    with open(args.gapps, "r") as f:
        gapps_json = json.load(f)

    google_client = translate.TranslationServiceClient()
    google_parent = google_client.location_path(gapps_json["project_id"], "global")

    os.environ["PATH"] += os.pathsep + os.path.dirname(os.path.abspath(args.webdriver_path))
    driver = getattr(webdriver, args.webdriver)()

    shop_infos = [ ]

    menus = glob.glob(os.path.join(args.menus, "*.json"))
    for i, path in enumerate(menus):
        logging.info(f"Processing {path} ({i + 1}/{len(menus)})")

        with open(path, "r") as f:
            menu_json = json.load(f)

        shop_info = menu_json["data"]["shop_info"]
        shop_infos.append(shop_info)

        if not args.generate_index:
            all_translation_values = [ ]
            for value, key, context in traverse_json(menu_json, valid_translate_value):
                all_translation_values.append(value)

            translation_response = None
            try:
                translation_response = google_client.translate_text(
                    parent=google_parent,
                    contents=all_translation_values,
                    mime_type="text/html", 
                    source_language_code="zh-cn",
                    target_language_code="en"
                )
            except Exception as e:
                logging.error(f"Failed (Google) for {path} ({e})")

            for j, vkc in enumerate(traverse_json(menu_json, valid_translate_value)):
                logging.info(f"{j + 1} / {len(all_translation_values)}")
                
                value, key, context = vkc

                context[key] = { 
                    "value": value, 
                }

                pinyin_value = strip_ascii(value).strip()
                if len(pinyin_value) > 0:
                    context[key]["pinyin"] = pinyin.get(strip_ascii(value), delimiter=" ")
                
                if "price" in context:
                    google_search = None
                    while google_search is None:
                        google_search = get_page(f"https://www.google.com/search?q={urllib.parse.quote(value)}", [
                                ("//*[contains(@class,'kno-ecr-pt')]/span", "text"), 
                                ("//*[contains(@class,'kno-ecr-pt')]/following-sibling::node()/span", "text")
                            ], driver)
                        if google_search is None:
                            driver = getattr(webdriver, args.webdriver)()
                    
                    google_img_search = None
                    while google_img_search is None:
                        google_img_search = get_page(f"https://www.google.com/search?tbm=isch&q={urllib.parse.quote(value)}+food", [
                                ("//*[contains(@class,'rg_i')]", "@src")
                            ], driver)
                        if google_img_search is None:
                            driver = getattr(webdriver, args.webdriver)()

                    if len(google_search) > 0:
                        context[key]["knowledge_graph"] = google_search[0]

                    if len(google_img_search) > 0:
                        context[key]["google_image"] = google_img_search[0]

                    time.sleep(0.5)

                if translation_response and j < len(translation_response.translations):
                    context[key]["translation"] = translation_response.translations[j].translated_text
            try:
                with open(f"menu.{os.path.splitext(path)[0]}-processed.json", "w") as f:
                    f.write(json.dumps(menu_json, indent=4))
            except Exception as e:
                logging.error(f"Failed (Writing) for {path} ({e})")

    shop_infos.sort(key=lambda x: parse_sold(x), reverse=True)
    try:
        with open(os.path.join(os.path.dirname(path), "index.json"), "w") as f:
            f.write(json.dumps(shop_infos, indent=4))
    except Exception as e:
        logging.error(f"Failed (Writing) for {path} ({e})")

    driver.close()
Example No. 24
def translateAndRefine(event, context):
    """
    This Cloud Function will be triggered when a message is published on the
    PubSub topic of interest. It will call Translate API.
    args:
        event (dict): Metadata of the event, received from Pub/Sub.
        context (google.cloud.functions.Context): Metadata of triggering event.
    returns:
        None; the output is written to stdout and Stackdriver Logging
    """
    # INSTANTIATION
    translate_client = translate.TranslationServiceClient()
    storage_client = storage.Client()
    dlp_client = google.cloud.dlp_v2.DlpServiceClient()

    # SET VARIABLES
    project_id = os.environ['GCP_PROJECT']
    location = 'global'  # or you can set it to os.environ['LOCATION']

    start_time = time.time()
    if event.get('data'):
        message_data = base64.b64decode(event['data']).decode('utf-8')
        message = json.loads(message_data)
    else:
        raise ValueError('Data sector is missing in the Pub/Sub message.')

    it_text = message.get('text')
    doc_title = message.get('doc_title')
    dest_bucket = 'aketari-covid19-data'

    # Step 1: Call Translate API
    raw_eng_text = doTranslation(translate_client, project_id, it_text)
    print("Completed translation step!")
    print('=============================')

    # Step 2: Clean eng text
    curated_eng_text = cleanEngText(raw_eng_text)
    print("Completed english curation step!")
    print('=============================')

    # Step 3: Redact text
    parent = "{}/{}".format(project_id, location)
    # TODO: replace gcs_prefix_secret with the correct location
    gcs_prefix_secret = 'path/to/your/secret_file.txt'
    INFO_TYPES = [
        "FIRST_NAME", "LAST_NAME", "FEMALE_NAME", "MALE_NAME", "PERSON_NAME",
        "STREET_ADDRESS", "ITALY_FISCAL_CODE"
    ]
    bucket_client = storage_client.get_bucket(dest_bucket)
    AES_bytes = bucket_client.blob(
        gcs_prefix_secret).download_as_string()  # already returns bytes
    base64_AES_bytes = base64.b64encode(AES_bytes)
    redacted_text = deterministicDeidentifyWithFpe(
        dlp_client=dlp_client,
        parent=parent,
        text=raw_eng_text,  # assumed input: the raw English translation (see the redacted_raw_eng_txt upload below)
        info_types=INFO_TYPES,
        surrogate_type="REDACTED",
        b64encoded_bytes=base64_AES_bytes)

    print("Completed redaction step!")
    print('=============================')

    # Step 4: Upload translated text
    prefix_raw_eng_txt = 'eng_txt/{}.txt'.format(doc_title)
    uploadBlob(storage_client, dest_bucket, raw_eng_text, prefix_raw_eng_txt)

    prefix_curated_eng_txt = 'curated_eng_txt/{}.txt'.format(doc_title)
    uploadBlob(storage_client, dest_bucket, curated_eng_text,
               prefix_curated_eng_txt)

    prefix_redacted_eng_txt = 'redacted_raw_eng_txt/{}.txt'.format(doc_title)
    uploadBlob(storage_client, dest_bucket, redacted_text,
               prefix_redacted_eng_txt)
    print("Completed upload step!")
    print('=============================')

    end_time = time.time() - start_time
    logging.info("Completion of text_extract took: {} seconds".format(
        round(end_time, 1)))
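doTranslation, cleanEngText, deterministicDeidentifyWithFpe and uploadBlob are defined elsewhere in that Cloud Function's module; the following is a hypothetical sketch of the translation helper only, assuming it wraps a single Italian-to-English translate_text call:

def doTranslation(translate_client, project_id, text, location="global"):
    # Hypothetical helper matching the call above: Italian to English, plain text.
    parent = f"projects/{project_id}/locations/{location}"
    response = translate_client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",
            "source_language_code": "it",
            "target_language_code": "en-US",
        })
    # Join in case the input was split into multiple translation segments
    return " ".join(t.translated_text for t in response.translations)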
def translate_text(text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID"):
    """Translating Text."""
    
    client = translate.TranslationServiceClient()

    location = "global"

    parent = f"projects/{project_id}/locations/{location}"

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": "en-US",
            "target_language_code": "fr",
        }
    )

    # Convert the repeated-translations protobuf into its string representation
    trString = str(response.translations)

    # Strip the surrounding 'translated_text: "..."' wrapper from that representation
    # (response.translations[0].translated_text would give the text directly, without
    # the octal escape handling below)
    cleanStr = trString[19:-3].lower()
    
    #E accent values to be replaced
    graveE = "\\303\\250"
    aiguE = "\\303\\251"
    circE = "\\303\\252"
    tremaE = "\\303\\253"
    
    #Slash after apostrophe removal
    ap = "\\"

    #A accents
    graveA = "\\303\\240"
    circA = "\\303\\242"

    #U accents
    graveU = "\\303\\271"
    circU = "\\303\\273"
    tremaU = "\\303\\274"

    #I accents
    circI = "\\303\\256"
    tremaI = "\\303\\257"
    
    #O accents
    circO = "\\303\\264"

    #C accents
    cedi = "\\303\\247"

    accent_escapes = (aiguE, graveE, circE, tremaE, graveA, circA,
                      graveU, circU, tremaU, circI, tremaI, circO, cedi)
    if any(esc in cleanStr for esc in accent_escapes) or ap in cleanStr:
        # Decode each octal escape sequence back into its accented character
        for esc in accent_escapes:
            cleanStr = cleanStr.replace(
                esc,
                esc.encode('latin1').decode('unicode-escape').encode('latin1').decode('utf8'))
        # Remove the stray backslash left after apostrophes
        # (must run after the accent replacements, since those escapes also contain backslashes)
        cleanStr = cleanStr.replace(ap, "")

    #print(cleanStr)
    nlp = spacy.load("fr_core_news_lg")
    ennlp = spacy.load("en_core_web_sm")
    
    docFr = nlp(cleanStr)
    num_posFr = docFr.count_by(spacy.attrs.IDS['POS'])
    
    docEn = ennlp(text)
    num_depEn = docEn.count_by(spacy.attrs.IDS['DEP'])
    
    #print(num_posFr)
    #print(num_depEn)

    if(440 in num_depEn):
        if(num_depEn[440]==1):
            sentTagSimInt(cleanStr, text)
    
    if(95 in num_posFr):
        if(num_posFr[95]==1):
            sentTagSimInt(cleanStr, text)
    
    if(440 in num_depEn):
        if(num_depEn[440]>=2):
            sentTagComplex(cleanStr, text)
    
    if(95 in num_posFr):
        if(num_posFr[95]>=2):
            sentTagComplex(cleanStr, text)
Example No. 26
 def __init__(self):
     credentials, project_id = google.auth.default()
     self.client = translate.TranslationServiceClient(
         credentials=credentials)
     self.parent = self.client.location_path(project_id, "global")
     log.info("Using Google Translation Service. Charges apply.")