def test_language_detector():
    """Compare the detector's output frame with OUTPUT_DF, one column at a time.

    A detector is built with ``minimum_score=0.2`` and ``fallback_language="es"``,
    run over the ``"input_text"`` column of INPUT_DF, and its result is sorted by
    ``"input_text"`` before the comparison.
    """
    language_detector = LanguageDetector(minimum_score=0.2, fallback_language="es")
    detected = language_detector.detect_languages_df(INPUT_DF, "input_text")
    detected_sorted = detected.sort_values(by=["input_text"])
    for column_name in detected_sorted.columns:
        # Compare raw values only, so index labels do not take part in the check.
        actual_values = detected_sorted[column_name].values
        expected_values = OUTPUT_DF[column_name].values
        np.testing.assert_array_equal(actual_values, expected_values)
# -*- coding: utf-8 -*-
"""Language Detection recipe script

Loads the plugin configuration, builds a LanguageDetector from it, hands the
detector's ``detect_languages_df`` method to ``process_dataset_chunks`` and
finally registers the detector's column descriptions via
``set_column_descriptions``.
"""

from plugin_config_loading import load_plugin_config_langdetect
from language_detector import LanguageDetector
from dku_io_utils import process_dataset_chunks, set_column_descriptions


# Setup
params = load_plugin_config_langdetect()

# Configuration keys forwarded unchanged to the detector constructor.
detector_options = {
    option: params[option]
    for option in ("language_scope", "minimum_score", "fallback_language")
}
detector = LanguageDetector(**detector_options)

# Both steps below address the same input/output dataset pair.
dataset_arguments = {
    name: params[name] for name in ("input_dataset", "output_dataset")
}

# Run
process_dataset_chunks(
    text_column=params["text_column"],
    func=detector.detect_languages_df,
    **dataset_arguments
)
set_column_descriptions(
    column_descriptions=detector.column_descriptions,
    **dataset_arguments
)
presidenteSoloPostId_List[x] + '.json') sys.stdout = io.TextIOWrapper(sys.stdout.detach(), sys.stdout.encoding, 'backslashreplace') totalPositivos = 0 totalComentarios = 0 for data_json in datas_json: comentarios = [] with open( data_json, mode='r', encoding='utf-8', ) as file: lector = json.load(file) for x in range(0, len(lector)): comentarios.append(lector[x]['message']) Id = LanguageDetector() # comentarios = [text for text in comentarios if Id.detect(text) == 'es'] # for text in comentarios: # print('{}: {}'.format(Id.detect(text), text)) if len(comentarios) > 0: lista = cm.predict(comentarios, params) publiNoVacias += 1 else: publiVacias += 1 #todo with open(postIdsVacios ) print(lista) comentariosPositivos = 0 total = len(lista) totalComentarios += total print(total) for index in range(0, len(lista)):
# -*- coding: utf-8 -*-
"""Interactive prompt that prints the top language candidates for typed text.

Usage: pass the maximum n-gram size as the first command-line argument. The
detector is built with that value and ``data_dir="../data"``, ``process()`` is
called once, and then every line read from stdin is passed to
``detect_language``; up to ``TOP_N`` entries of the result are printed as
``rank first_field second_field``.
"""
import sys

from language_detector import LanguageDetector

# Maximum number of candidates printed per query.
TOP_N = 5

if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError on sys.argv[1].
    sys.exit("usage: {} NGRAMS_MAX".format(sys.argv[0]))

ld = LanguageDetector(ngrams_max=int(sys.argv[1]), data_dir="../data")
ld.process()

while True:
    try:
        var = input("\nPlease enter the text: ")
    except (EOFError, KeyboardInterrupt):
        # stdin closed or Ctrl-C: leave the loop cleanly rather than with a traceback.
        print()
        break
    results = ld.detect_language(var)
    # Slice instead of indexing 0..4 so fewer than TOP_N candidates cannot raise
    # IndexError. NOTE(review): assumes `results` is a list-like sequence of
    # (at least) two-field entries -- confirm against LanguageDetector.
    for rank, candidate in enumerate(results[:TOP_N], start=1):
        print(rank, candidate[0], candidate[1])