def detect_language(project_id="YOUR_PROJECT_ID"):
    """Detecting the language of a text string."""
    client = translate.TranslationServiceClient()
    scope = client.location_path(project_id, "global")
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.detect_language(
        parent=scope,
        content="Hello, world!",
        mime_type="text/plain",  # mime types: text/plain, text/html
    )
    # Candidates arrive sorted by confidence, most probable first.
    for candidate in response.languages:
        print(u"Language code: {}".format(candidate.language_code))
        print(u"Confidence: {}".format(candidate.confidence))
def translateToEnglish(text):
    """Translate Sinhala *text* to English and return the joined result."""
    client = translate.TranslationServiceClient()
    project = os.getenv('GOOGLE_CLOUD_PROJECT')
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        parent='projects/{}'.format(project),
        contents=[text],
        mime_type="text/plain",  # mime types: text/plain, text/html
        source_language_code="si",
        target_language_code="en-US",
    )
    # Concatenate every returned translation into a single string.
    return ''.join(part.translated_text for part in response.translations)
def translate_text(cloud: str = "", text: str = ""):
    """Translate *text* to Italian using the chosen cloud provider.

    Bug fix: the original signature was ``def translate_text(cloud=str,
    text=str)`` -- the builtin ``str`` type was used as a *default value*
    where annotations were clearly intended. Empty-string defaults keep
    the no-argument call behavior (falls through to "Cloud not supported.").

    Args:
        cloud: Either "azure" or "google"; anything else is rejected.
        text: The source text to translate.
    """
    if cloud == "azure":
        # NOTE(review): the subscription key is blank -- it must be filled
        # in (ideally from configuration) before this branch can work.
        key = ''
        endpoint = 'https://api.cognitive.microsofttranslator.com/translate?api-version=3.0'
        params = '&to=it'
        full_url = endpoint + params
        headers = {
            'Ocp-Apim-Subscription-Key': key,
            'Content-type': 'application/json',
            'X-ClientTraceId': str(uuid.uuid4())
        }
        body = [{'text': text}]
        request = requests.post(full_url, headers=headers, json=body)
        output_response(request, cloud)
    elif cloud == "google":
        client = translate.TranslationServiceClient()
        # NOTE(review): 'project_id' is a literal placeholder, not a real
        # project id -- confirm it is substituted before use.
        parent = client.location_path('project_id', "global")
        request = client.translate_text(
            parent=parent,
            contents=[text],
            mime_type="text/plain",
            source_language_code="en-US",
            target_language_code="it",
        )
        output_response(request, cloud)
    else:
        print("Cloud not supported.")
def batch_translate_text(
    input_uri="gs://input_translation/input/input_00.txt",
    output_uri="gs://out_translation/test-dev2015_questions_00_hin/",
    project_id="singular-antler-295914",
    timeout=3600,
):
    """Translates a batch of texts on GCS and stores the result in a GCS location."""
    client = translate.TranslationServiceClient()
    # Batch jobs run in a regional location.
    parent = f"projects/{project_id}/locations/us-central1"
    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    # Supported language codes: https://cloud.google.com/translate/docs/language
    batch_request = {
        "parent": parent,
        "source_language_code": "en",
        "target_language_codes": ["hi"],  # Up to 10 language codes here.
        "input_configs": [
            {
                "gcs_source": {"input_uri": input_uri},
                "mime_type": "text/plain",  # Can be "text/plain" or "text/html".
            }
        ],
        "output_config": {"gcs_destination": {"output_uri_prefix": output_uri}},
    }
    operation = client.batch_translate_text(request=batch_request)
    print("Waiting for operation to complete...")
    response = operation.result(timeout)
    print("Total Characters: {}".format(response.total_characters))
    print("Translated Characters: {}".format(response.translated_characters))
def translate_text(text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID"):
    """Translating Text."""
    client = translate.TranslationServiceClient()
    parent = client.location_path(project_id, "global")
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    # NOTE: the language pair is read from the command line
    # (sys.argv[2] / sys.argv[3]), not from function parameters.
    response = client.translate_text(
        parent=parent,
        contents=[text],
        mime_type="text/plain",  # mime types: text/plain, text/html
        source_language_code=sys.argv[2],
        target_language_code=sys.argv[3],
    )
    # Join every translation returned for the input text.
    return ''.join(item.translated_text for item in response.translations)
def translate_text(texts, target="uk", project_id="engaged-kite-304010"):
    """Translate *texts* into the target language and return the results.

    Target must be an ISO 639-1 language code. See
    https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    client = translate.TranslationServiceClient()
    parent = "projects/{}/locations/{}".format(project_id, "global")
    request_body = {
        "parent": parent,
        "contents": texts,
        "mime_type": "text/plain",  # mime types: text/plain, text/html
        "source_language_code": "en-US",
        "target_language_code": target,
    }
    response = client.translate_text(request=request_body)
    return [item.translated_text for item in response.translations]
def sample_translate_text(text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID", language="fr"):
    """Translating Text.

    Prints every translation returned and returns the last one formatted.

    Bug fix: the original returned ``translation.translated_text`` *after*
    the loop, which raised NameError whenever the API returned an empty
    translations list. The result is now accumulated inside the loop.

    Args:
        text: Text to translate.
        project_id: Google Cloud project id.
        language: Target language code.
    """
    client = translate.TranslationServiceClient()
    parent = client.location_path(project_id, "global")
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        parent=parent,
        contents=[text],
        mime_type="text/plain",  # mime types: text/plain, text/html
        source_language_code="en-US",
        target_language_code=language,
    )
    print(f"You passed in this language {language}")
    translated = ""
    # Display the translation for each input text provided
    for translation in response.translations:
        translated = translation.translated_text
        print("Translated text: {}".format(translated))
    return "Translated text: {}".format(translated)
def translateToRandomLang(project_id, message):
    """Translate *message* into a randomly chosen supported language.

    Returns the translated message, or None on any failure (errors are
    printed, not raised).
    """
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"
    try:
        supported = getSupportedLanguages(client, parent)
        if supported is None:
            return None
        # Pick one of the supported languages uniformly at random.
        count = len(supported.languages)
        target = supported.languages[random.randint(0, count - 1)].language_code
        source = getMessageLanguage(client, parent, message)
        if source is None:
            return None
        # Returns None transparently when the translation step fails.
        return translateFromSourceToTarget(client, parent, source, target, message)
    except Exception as e:
        print("translateToRandomLang error", e)
        return None
def batch_translate_text(
    input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
    output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/",
    project_id="YOUR_PROJECT_ID",
    timeout=180,
):
    """Translates a batch of texts on GCS and stores the result in a GCS location."""
    client = translate.TranslationServiceClient()
    # Batch translation is a regional (us-central1) operation.
    parent = "projects/{}/locations/{}".format(project_id, "us-central1")
    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    source_cfg = {
        "gcs_source": {"input_uri": input_uri},
        "mime_type": "text/plain"  # Can be "text/plain" or "text/html".
    }
    dest_cfg = {"gcs_destination": {"output_uri_prefix": output_uri}}
    # Supported language codes: https://cloud.google.com/translate/docs/language
    operation = client.batch_translate_text(
        request={
            "parent": parent,
            "source_language_code": "en",
            "target_language_codes": ["ja"],  # Up to 10 language codes here.
            "input_configs": [source_cfg],
            "output_config": dest_cfg
        })
    print(u"Waiting for operation to complete...")
    response = operation.result(timeout)
    print(u"Total Characters: {}".format(response.total_characters))
    print(u"Translated Characters: {}".format(response.translated_characters))
def translate_text(source="en-US", target="fr", text="Example Application",
                   project_id="rosy-griffin-312113"):
    """Translating Text."""
    client = translate.TranslationServiceClient()
    parent = "projects/{}/locations/{}".format(project_id, "global")
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    request_body = {
        "parent": parent,
        "contents": [text],
        "mime_type": "text/plain",  # mime types: text/plain, text/html
        "source_language_code": f"{source}",
        "target_language_code": f"{target}",
    }
    response = client.translate_text(request=request_body)
    return response.translations
def sample_translate_text(text):
    """Translate a single English word/phrase into Portuguese."""
    client = translate.TranslationServiceClient()
    target_language = 'pt'  # desired output language
    project_id = 'projeto-tcc-276919'  # project used as the API credential scope
    parent = client.location_path(project_id, "global")
    # Issue the API request and capture its response.
    response = client.translate_text(
        parent=parent,
        contents=[text],
        mime_type='text/plain',
        source_language_code='en-US',  # current language of the input
        target_language_code=target_language)
    # Only the first result is returned, since a single word is submitted.
    return response.translations[0].translated_text
def translate(self, writer, entity):
    """Translate an entity's text values into TARGET_LANGUAGE and store
    them as an ``indexText`` fragment via *writer*.

    Skips entities whose schema is not "Analyzable" or that carry no text.
    """
    if not entity.schema.is_a("Analyzable"):
        return
    # This isn't part of the example, just a generic call to Google
    # cloud translation. This implementation doesn't do caching, does
    # not check if the document is already in the target language,
    # and does not translate other string values like names.
    #
    # This code isn't the point. Don't run it in production, and if you
    # do anyway, don't complain about the fact it's bad. PRs welcome.
    if not hasattr(self, "client"):
        # Lazily create the client on first use and cache it on the instance.
        self.client = translate.TranslationServiceClient()
        self.parent = self.client.location_path(PROJECT_ID, "global")
    # Get all the text parts of the entity:
    contents = entity.get_type_values(registry.text)
    if not len(contents):
        return
    log.info("Translating %r", entity)
    # Source language is auto-detected by the API (none specified).
    response = self.client.translate_text(
        parent=self.parent,
        contents=contents,
        mime_type="text/plain",
        target_language_code=TARGET_LANGUAGE,
    )
    # Make a copy of the entity with no properties set:
    translated = model.make_entity(entity.schema)
    translated.id = entity.id
    for translation in response.translations:
        # log.debug("Received: %s", translation.translated_text)
        translated.add("indexText", translation.translated_text)
    # Store the generated translation fragment for the entity
    # in the ftm-store database. All the properties of the
    # entity will be combined upon indexing.
    writer.put(translated)
def translate_text(text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID"):
    """Translating Text."""
    client = translate.TranslationServiceClient()
    parent = "projects/{}/locations/{}".format(project_id, "global")
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request=dict(
            parent=parent,
            contents=[text],
            mime_type="text/plain",  # mime types: text/plain, text/html
            source_language_code="en-US",
            target_language_code="fr",
        )
    )
    # Show every translation produced for the input text.
    for translation in response.translations:
        print(u"Translated text: {}".format(translation.translated_text))
def main(analysis=None):
    """Interactively translate newly-understandable French sentences into
    English via Cloud Translation and persist them to translations.yaml.

    Args:
        analysis: Optional pre-computed (words, sentences, source_info)
            triple; recomputed via processor.sources_analysis when None.
    """
    if analysis == None:
        analysis = processor.sources_analysis.do_analysis()
    project_id = "francophonic-1565560815749"
    # NOTE(review): machine-specific absolute credential path -- should
    # come from configuration/environment on any other machine.
    credential_path = "K:/private/anchpop/privatekeys/Francophonic-f72c700469aa.json"
    environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
    current_dictionary = get_word_dictionary()
    (collected_words, collected_sentences, source_info) = analysis
    understandable_sentences = get_understandable_sentences(analysis)
    translations = get_translations()
    # Only sentences with no existing french->english entry need work.
    sentences_to_translate = understandable_sentences - translations[
        'french_to_english'].keys()
    print(f"{len(sentences_to_translate)} sentences to translate")
    if len(sentences_to_translate) > 0:
        characters = sum([len(s) for s in sentences_to_translate])
        while True:
            # Cost estimate assumes $20 per million characters.
            inp = input(
                f"To translate these {len(sentences_to_translate)} sentences would cost around ${characters / 1000000 * 20}, continue? (yes/no/view) "
            )
            if inp == "view":
                for sentence in sentences_to_translate:
                    print(sentence)
            elif inp == "yes":
                client = translate.TranslationServiceClient()
                location = "global"
                parent = f"projects/{project_id}/locations/{location}"
                for sentence in sentences_to_translate:
                    # Guard: unstripped sentences indicate an upstream bug.
                    if (sentence != sentence.strip()):
                        raise Exception(
                            f"sentence \"{Style.DIM}{sentence}{Style.RESET_ALL}\" is unstripped!"
                        )
                    print(
                        f"Translating \"{Style.DIM}{sentence}{Style.RESET_ALL}\""
                    )
                    response = client.translate_text(
                        request={
                            "parent": parent,
                            "contents": [sentence],
                            "mime_type": "text/plain",  # mime types: text/plain, text/html
                            "source_language_code": "fr",
                            "target_language_code": "en-US",
                        })
                    translations['french_to_english'][sentence] = {
                        'google': [
                            translation.translated_text
                            for translation in response.translations
                        ]
                    }
                # Keep the reverse-direction key present even if unused.
                translations['english_to_french'] = translations.get(
                    'english_to_french', [])
                data = yaml.dump(translations, Dumper=Dumper, allow_unicode=True)
                # safer.open writes via a temp file so a crash mid-write
                # cannot corrupt the existing translations.yaml.
                with safer.open("translations.yaml", "w", encoding='utf-8') as f:
                    f.write(data)
                break
            else:
                break
from google.cloud import translate
import os
import csv

# Service-account key location for Google Cloud authentication.
os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"] = "ADD PATH TO YOUR GOOGLE CLOUD SERVICE ACCOUNT API KEY HERE"
# https://cloud.google.com/translate/docs/basic/translating-text
# Pull training questions from input_file_name and call the Google Translation API to return JSON response
# Write translated text from JSON response to new CSV spreadsheet in the same order as the input file training questions
# Will recieve an output of a csv file called translation_results.csv
# Remember to set the source and target language, language code can be obtained from Google: https://developers.google.com/admin-sdk/directory/v1/languages

# Module-level client and translation job configuration.
client = translate.TranslationServiceClient()
source_language = 'tr'
target_language = 'en-CA'
parent = client.location_path('tqtraining', 'global')
input_file_format = 'text/plain'
input_file_name = 'turkish_training_questions.csv'


def translation_api():
    """Translate each row of the input CSV and append results to
    translation_results.csv."""
    with open(input_file_name) as training_questions:
        translation_file = csv.reader(training_questions)
        for tq in translation_file:
            print("Text being translated: {}".format(tq))
            with open('translation_results.csv', 'a') as f:
                translation_output_file = csv.writer(f)
                # NOTE(review): this call appears truncated in the source
                # as received -- the remaining arguments (parent, contents,
                # language codes) and the closing paren are missing; confirm
                # against the original file.
                translation_output = client.translate_text(
                    mime_type=input_file_format,
def __init__(self, project_id='moonlit-haven-256102'):
    """Build the v3 translation client and cache the project parent path."""
    service = translate.TranslationServiceClient()
    self.client = service
    self.parent = service.location_path(project_id, "global")
def setUpModule():
    """Create the shared v2 and v3 translation clients once per test module."""
    Config.CLIENT_V2 = translate_v2.Client()
    Config.CLIENT_V3 = translate.TranslationServiceClient()
# Raise root-logger verbosity for the pipeline below.
logging.getLogger().setLevel(logging.INFO)

import re
import time
import os

# Runtime configuration is taken entirely from the environment.
project_id = os.getenv('PROJECT_ID')
bucket_name = os.getenv('BUCKET_NAME')
location = os.getenv('LOCATION')
key_path = os.getenv('SA_KEY_PATH')

# Authenticate both clients with the same service-account key file.
credentials = service_account.Credentials.from_service_account_file(key_path)
storage_client = storage.Client(credentials=credentials)
translate_client = translate.TranslationServiceClient(credentials=credentials)

# Every blob under the 'raw_txt' prefix is a candidate input document.
lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                              prefix='raw_txt')

# Domain-specific stop words (author names, institutions, OCR fragments)
# to drop in addition to a standard stop-word list.
# NOTE(review): the list contains duplicates (e.g. 's', 'o', 'd') and
# single-letter OCR debris -- presumably harmless for membership tests.
customize_stop_words = [
    'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle',
    'grazie', 'hospital', 'Borgheresi', 'Agostini', 'Ottaviani', 'Floridi',
    'Giovagnoni', 'di', 'specialization', 'Polytechnic', 'University',
    'marche', 'ANCONA', 'Italy', 'Azienda', 'Ospedali', 'Riuniti', 'Yorrette',
    'Matera', 'Michele', 'Nardella', 'Gerardo', 'Costanzo', 'Claudia',
    'Lopez', 'st', 'a.', 'a', 'of', 's', 'cien', 'ze', 'diolog', 'ic', 'he',
    'â', '€', 's', 'b', 'case', 'Cuoladi', 'l', 'c', 'ra', 'bergamo',
    'patelli', 'est', 'asst', 'dr', 'Dianluigi', 'Svizzero', 'i', 'riccardo',
    'Alessandro', 'Spinazzola', 'angelo', 'maggiore', 'p', 'r', 't', 'm',
    'en', 't', 'o', 'd', 'e', 'n', 'd', 'o', 'g', 'h', 'u', 'man', 'female',
    'D'
]
def sample_batch_translate_text_with_glossary_and_model(
    input_uri,
    output_uri,
    project_id,
    location,
    target_language,
    source_language,
    model_id,
    glossary_id,
):
    """
    Batch translate text with Glossary and Translation model.

    Bug fix: the original ignored several of its own parameters -- the
    request hard-coded source language "en", the model and glossary paths
    hard-coded "us-central1" instead of *location*, and the glossaries
    dict was keyed with the literal "ja" instead of *target_language*.

    Args:
        input_uri: GCS URI of the input, e.g. 'gs://cloud-samples-data/text.txt'.
        output_uri: GCS prefix to store results, e.g. 'gs://BUCKET/path/'.
        project_id: Google Cloud project id.
        location: Job/resource location (batch translation requires
            'us-central1').
        target_language: Target language code, e.g. 'en'.
        source_language: Source language code, e.g. 'de'.
        model_id: Custom translation model id.
        glossary_id: Glossary resource id.
    """
    client = translate.TranslationServiceClient()

    parent = f"projects/{project_id}/locations/{location}"

    # Optional. Can be "text/plain" or "text/html".
    input_configs = [{
        "gcs_source": {"input_uri": input_uri},
        "mime_type": "text/plain",
    }]
    output_config = {"gcs_destination": {"output_uri_prefix": output_uri}}

    # Custom model, keyed by the target language it applies to.
    model_path = "projects/{}/locations/{}/models/{}".format(
        project_id, location, model_id)
    models = {target_language: model_path}

    # Glossary config, also keyed by target language.
    glossary_path = client.glossary_path(project_id, location, glossary_id)
    glossary_config = translate.TranslateTextGlossaryConfig(
        glossary=glossary_path)
    glossaries = {target_language: glossary_config}

    operation = client.batch_translate_text(
        request={
            "parent": parent,
            "source_language_code": source_language,
            "target_language_codes": [target_language],
            "input_configs": input_configs,
            "output_config": output_config,
            "models": models,
            "glossaries": glossaries,
        })

    print("Waiting for operation to complete...")
    response = operation.result()

    # Display the overall character counts for the batch job.
    print("Total Characters: {}".format(response.total_characters))
    print("Translated Characters: {}".format(response.translated_characters))
def __init__(self, project_id):
    """Remember the project and prepare a v3 client plus its parent path."""
    self.project_id = project_id
    service = translate.TranslationServiceClient()
    self.client = service
    self.parent = service.location_path(project_id, "global")
def setup(self):
    """Create the Cloud Translation v3 client used by this instance."""
    self._client = translate.TranslationServiceClient()
def main():
    """CLI driver: manage glossaries, upload datasets, run batch
    translation, and download results from GCS buckets."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_bucket", type=str, default="almond_dataset")
    parser.add_argument("--input_local_path", type=str, default="./")
    parser.add_argument("--input_names", type=str, nargs='+', default=["input.tsv"])
    parser.add_argument('--output_bucket', type=str, default="almond_output")
    parser.add_argument("--glossary_local_path", type=str, default="./extras/")
    parser.add_argument("--glossary_name", type=str, default="glossary.csv")
    parser.add_argument("--glossary_bucket", type=str, default="almond_glossary")
    parser.add_argument("--project_id", type=str, default="")
    parser.add_argument("--project_number", type=str, default="")
    parser.add_argument("--location", type=str, default="us-central1")
    parser.add_argument("--source_lang", type=str, default="en")
    parser.add_argument("--target_langs", type=str, nargs='+', default=["fa"])
    parser.add_argument("--model_id", type=str, default="")
    parser.add_argument("--glossary_type", type=str, choices=['default', 'manual'], default="default")
    parser.add_argument("--update_glossary", action='store_true')
    parser.add_argument("--no_glossary", action='store_true')
    parser.add_argument('--no_translate_params', action='store_true')
    parser.add_argument("--update_dataset", action='store_true')
    parser.add_argument("--do_translate", action='store_true')
    parser.add_argument("--download_results", action='store_true')
    parser.add_argument("--remove_output_id", action='store_true')
    parser.add_argument("--overwrite_output", action='store_true')
    parser.add_argument("--output_local_path", type=str, default="./translation_results")
    parser.add_argument("--keep_blob_name", action='store_true', help="output files names is same as their blob names")
    parser.add_argument('--credential_file', default='', type=str)
    args = parser.parse_args()

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credential_file

    # Module-level singletons shared by the helper functions.
    global TRANSLATION_CLIENT
    global STORAGE_CLIENT
    TRANSLATION_CLIENT = translate.TranslationServiceClient()
    STORAGE_CLIENT = storage.Client()

    # Derive the glossary name/id used by both upload and translate steps.
    if args.update_glossary or args.do_translate:
        if args.glossary_type == 'default':
            glossary_name = 'default.csv'
        else:
            glossary_name = args.glossary_name
        glossary_id = glossary_name.rsplit('.', 1)[0] + '_id'

    if args.update_glossary:
        # Regenerate the default glossary locally before uploading.
        if args.glossary_type == 'default':
            create_default_glossary(args.input_local_path, args.input_names,
                                    args.glossary_local_path, args.source_lang,
                                    args.target_langs,
                                    not args.no_translate_params,
                                    glossary_name, special_words=set())
        glossary_uri = os.path.join(
            *['gs://', args.glossary_bucket, glossary_name])
        upload_blob(args.glossary_bucket, args.glossary_local_path,
                    glossary_name)
        upload_term_set_glossary(args.project_id, glossary_uri, glossary_id,
                                 args.source_lang, args.target_langs)

    if args.update_dataset:
        upload_blob(args.input_bucket, args.input_local_path, args.input_names)

    use_glossary = True
    if args.no_glossary:
        use_glossary = False

    if args.do_translate:
        # Batch translation refuses non-empty output buckets, so either
        # wipe the existing bucket or create a fresh one.
        if bucket_exists(args.output_bucket):
            if args.overwrite_output:
                delete_blobs(args.output_bucket)
        else:
            create_bucket(args.output_bucket)
        input_uris = []
        for input_name in args.input_names:
            input_uris.append(
                os.path.join(*['gs://', args.input_bucket, input_name]))
        output_uri = os.path.join(*['gs://', args.output_bucket + '/'])
        sample_batch_translate_text_with_glossary_and_model(
            input_uris, output_uri, args.project_id, args.location,
            args.target_langs, args.source_lang, args.model_id, glossary_id,
            use_glossary)

    if args.download_results:
        # Map each input file to its original row ids so downloaded
        # translations can be re-aligned.
        input2origids = dict()
        for input_name in args.input_names:
            input2origids[input_name] = get_ids(args.input_local_path,
                                                input_name)
        output_path = args.output_local_path
        os.makedirs(output_path, exist_ok=True)
        download_results(args.output_bucket, output_path, input2origids,
                         use_glossary, args.keep_blob_name,
                         args.remove_output_id)
def main():
    """EASI Translate driver: translate menu JSONs (zh-cn -> en), enrich
    entries with pinyin and Google search data, and write an index."""
    parser = argparse.ArgumentParser(description="EASI Translate")
    parser.add_argument("--menus", required=True, help="Path to Directory containing EASI Menu JSONs")
    parser.add_argument("--generate-index", required=False, action="store_true", help="Just generate the index based on the directory containing EASI Menu JSONs")
    parser.add_argument("--webdriver", required=True, help="The type of Selenium WebDriver")
    parser.add_argument("--webdriver-path", required=True, help="Path to the Selenium WebDriver")
    parser.add_argument("--gapps", required=True, help="Path to Google Application Credentials JSON")
    parser.add_argument("--log", required=False, help="Path to log file")
    args = parser.parse_args()

    # Log to console, and optionally to a file.
    logger = logging.getLogger()
    console_logger = logging.StreamHandler()
    console_logger.setFormatter(LOG_FORMATTER)
    logger.addHandler(console_logger)
    logger.setLevel(logging.DEBUG)
    if args.log:
        file_logger = logging.FileHandler(args.log)
        file_logger.setFormatter(LOG_FORMATTER)
        logger.addHandler(file_logger)

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.gapps
    with open(args.gapps, "r") as f:
        gapps_json = json.load(f)
    google_client = translate.TranslationServiceClient()
    google_parent = google_client.location_path(gapps_json["project_id"], "global")

    # Make the webdriver binary reachable, then launch the browser.
    os.environ["PATH"] += os.pathsep + os.path.dirname(os.path.abspath(args.webdriver_path))
    driver = getattr(webdriver, args.webdriver)()

    shop_infos = [ ]
    menus = glob.glob(os.path.join(args.menus, "*.json"))
    for i, path in enumerate(menus):
        logging.info(f"Processing {path} ({i + 1}/{len(menus)})")
        with open(path, "r") as f:
            menu_json = json.load(f)
        shop_info = menu_json["data"]["shop_info"]
        shop_infos.append(shop_info)
        if not args.generate_index:
            # Collect every translatable string, then translate them in
            # one batched API call.
            all_translation_values = [ ]
            for value, key, context in traverse_json(menu_json, valid_translate_value):
                all_translation_values.append(value)
            translation_response = None
            try:
                translation_response = google_client.translate_text(
                    parent=google_parent,
                    contents=all_translation_values,
                    mime_type="text/html",
                    source_language_code="zh-cn",
                    target_language_code="en"
                )
            except Exception as e:
                logging.error(f"Failed (Google) for {path} ({e})")
            # Walk the JSON again in the same order to pair each value
            # with its translation by index.
            for j, vkc in enumerate(traverse_json(menu_json, valid_translate_value)):
                logging.info(f"{j + 1} / {len(all_translation_values)}")
                value, key, context = vkc
                context[key] = {
                    "value": value,
                }
                pinyin_value = strip_ascii(value).strip()
                if len(pinyin_value) > 0:
                    context[key]["pinyin"] = pinyin.get(strip_ascii(value), delimiter=" ")
                if "price" in context:
                    # Retry scrapes until they succeed, recreating the
                    # webdriver on failure.
                    google_search = None
                    while google_search is None:
                        google_search = get_page(f"https://www.google.com/search?q={urllib.parse.quote(value)}", [
                            ("//*[contains(@class,'kno-ecr-pt')]/span", "text"),
                            ("//*[contains(@class,'kno-ecr-pt')]/following-sibling::node()/span", "text")
                        ], driver)
                        if google_search is None:
                            driver = getattr(webdriver, args.webdriver)()
                    google_img_search = None
                    while google_img_search is None:
                        google_img_search = get_page(f"https://www.google.com/search?tbm=isch&q={urllib.parse.quote(value)}+food", [
                            ("//*[contains(@class,'rg_i')]", "@src")
                        ], driver)
                        # NOTE(review): this checks google_search, not
                        # google_img_search -- looks like a copy-paste bug;
                        # confirm before changing.
                        if google_search is None:
                            driver = getattr(webdriver, args.webdriver)()
                    if len(google_search) > 0:
                        context[key]["knowledge_graph"] = google_search[0]
                    if len(google_img_search) > 0:
                        context[key]["google_image"] = google_img_search[0]
                    # Throttle scraping slightly.
                    time.sleep(0.5)
                if translation_response and j < len(translation_response.translations):
                    context[key]["translation"] = translation_response.translations[j].translated_text
            try:
                with open(f"menu.{os.path.splitext(path)[0]}-processed.json", "w") as f:
                    f.write(json.dumps(menu_json, indent=4))
            except Exception as e:
                logging.error(f"Failed (Writing) for {path} ({e})")

    # Rank shops by units sold, best first, and write the index beside
    # the menu files.
    shop_infos.sort(key=lambda x: parse_sold(x), reverse=True)
    try:
        with open(os.path.join(os.path.dirname(path), "index.json"), "w") as f:
            f.write(json.dumps(shop_infos, indent=4))
    except Exception as e:
        logging.error(f"Failed (Writing) for {path} ({e})")
    driver.close()
def translateAndRefine(event, context):
    """
    This Cloud Function will be triggered when a message is published on
    the PubSub topic of interest. It will call Translate API.
    args:
        event (dict): Metadata of the event, received from Pub/Sub.
        context (google.cloud.functions.Context): Metadata of triggering event.
    returns:
        None; the output is written to stdout and Stackdriver Logging
    """
    # INSTANTIATION
    translate_client = translate.TranslationServiceClient()
    storage_client = storage.Client()
    dlp_client = google.cloud.dlp_v2.DlpServiceClient()

    # SET VARIABLES
    project_id = os.environ['GCP_PROJECT']
    location = 'global'  # or you can set it to os.environ['LOCATION']
    start_time = time.time()

    if event.get('data'):
        message_data = base64.b64decode(event['data']).decode('utf-8')
        message = json.loads(message_data)
    else:
        raise ValueError('Data sector is missing in the Pub/Sub message.')

    it_text = message.get('text')
    doc_title = message.get('doc_title')
    dest_bucket = 'aketari-covid19-data'

    # Step 1: Call Translate API
    raw_eng_text = doTranslation(translate_client, project_id, it_text)
    print("Completed translation step!")
    print('=============================')

    # Step 2: Clean eng text
    curated_eng_text = cleanEngText(raw_eng_text)
    print("Completed english curation step!")
    print('=============================')

    # Step 3: Redact text
    parent = "{}/{}".format(project_id, location)
    # TODO: replace gcs_prefix_secret with the correct location
    gcs_prefix_secret = 'path/to/your/secret_file.txt'
    INFO_TYPES = [
        "FIRST_NAME", "LAST_NAME", "FEMALE_NAME", "MALE_NAME", "PERSON_NAME",
        "STREET_ADDRESS", "ITALY_FISCAL_CODE"
    ]
    bucket_client = storage_client.get_bucket(dest_bucket)
    # Bug fix: download_as_string() already returns bytes; the original
    # called .encode('utf-8') on it, which raises AttributeError.
    AES_bytes = bucket_client.blob(gcs_prefix_secret).download_as_string()
    base64_AES_bytes = base64.b64encode(AES_bytes)
    redacted_text = deterministicDeidentifyWithFpe(
        dlp_client=dlp_client,
        parent=parent,
        # Bug fix: the original passed `text=text`, an undefined name
        # (NameError). The redacted output is uploaded below under
        # 'redacted_raw_eng_txt', so the raw English translation is
        # redacted here. TODO(review): confirm raw vs curated is intended.
        text=raw_eng_text,
        info_types=INFO_TYPES,
        surrogate_type="REDACTED",
        b64encoded_bytes=base64_AES_bytes)
    print("Completed redaction step!")
    print('=============================')

    # Step 4: Upload translated text
    prefix_raw_eng_txt = 'eng_txt/{}.txt'.format(doc_title)
    uploadBlob(storage_client, dest_bucket, raw_eng_text, prefix_raw_eng_txt)
    prefix_curated_eng_txt = 'curated_eng_txt/{}.txt'.format(doc_title)
    uploadBlob(storage_client, dest_bucket, curated_eng_text,
               prefix_curated_eng_txt)
    prefix_redacted_eng_txt = 'redacted_raw_eng_txt/{}.txt'.format(doc_title)
    uploadBlob(storage_client, dest_bucket, redacted_text,
               prefix_redacted_eng_txt)
    print("Completed upload step!")
    print('=============================')

    end_time = time.time() - start_time
    logging.info("Completion of text_extract took: {} seconds".format(
        round(end_time, 1)))
def _decode_octal_escape(escape):
    """Convert a literal octal escape pair like '\\303\\251' (the repr of a
    UTF-8 byte sequence) into the actual accented character."""
    return (escape.encode('latin1').decode('unicode-escape')
                  .encode('latin1').decode('utf8'))


# Escape sequences for accented characters as they appear in the repr of
# the API response, in the original replacement order:
# e-acute, e-grave, e-circ, e-trema, a-grave, a-circ, u-grave, u-circ,
# u-trema, i-circ, i-trema, o-circ, c-cedilla.
_ACCENT_ESCAPES = [
    "\\303\\251", "\\303\\250", "\\303\\252", "\\303\\253",
    "\\303\\240", "\\303\\242",
    "\\303\\271", "\\303\\273", "\\303\\274",
    "\\303\\256", "\\303\\257",
    "\\303\\264",
    "\\303\\247",
]


def translate_text(text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID"):
    """Translate English *text* to French, repair the accented characters in
    the response repr, then dispatch to sentence-tagging helpers based on
    verb/dependency counts.

    Refactor: the original duplicated the same four-step decode chain for
    fourteen escape sequences; that is now a single loop over a table.
    """
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": "en-US",
            "target_language_code": "fr",
        }
    )
    # NOTE(review): slicing the repr of the translations object is fragile
    # and depends on the client library's repr format; prefer
    # response.translations[0].translated_text if this ever breaks.
    trString = str(response.translations)
    cleanStr = trString[19:-3].lower()

    # Every escape sequence contains a backslash, so a single backslash
    # test is equivalent to the original chain of `in` checks.
    if "\\" in cleanStr:
        for escape in _ACCENT_ESCAPES:
            cleanStr = cleanStr.replace(escape, _decode_octal_escape(escape))
        # Apostrophe decoding: drop any remaining backslashes (must run
        # after the accent replacements, as in the original).
        cleanStr = cleanStr.replace("\\", "")

    # POS/DEP analysis of the French output and English input.
    nlp = spacy.load("fr_core_news_lg")
    ennlp = spacy.load("en_core_web_sm")
    docFr = nlp(cleanStr)
    num_posFr = docFr.count_by(spacy.attrs.IDS['POS'])
    docEn = ennlp(text)
    num_depEn = docEn.count_by(spacy.attrs.IDS['DEP'])

    # 440 / 95 are spacy attribute ids -- presumably a dependency label and
    # a POS tag distinguishing simple from complex sentences; TODO confirm.
    if 440 in num_depEn and num_depEn[440] == 1:
        sentTagSimInt(cleanStr, text)
    if 95 in num_posFr and num_posFr[95] == 1:
        sentTagSimInt(cleanStr, text)
    if 440 in num_depEn and num_depEn[440] >= 2:
        sentTagComplex(cleanStr, text)
    if 95 in num_posFr and num_posFr[95] >= 2:
        sentTagComplex(cleanStr, text)
def __init__(self):
    """Create a translation client using Application Default Credentials;
    the active project is taken from the resolved credentials."""
    credentials, project_id = google.auth.default()
    self.client = translate.TranslationServiceClient(
        credentials=credentials)
    self.parent = self.client.location_path(project_id, "global")
    log.info("Using Google Translation Service. Charges apply.")