def train_language_models(input_folder, output_folder=None):
    """Build one language model per training file found in a folder.

    Every regular file directly inside ``input_folder`` is treated as the
    training text of one language; the language name is the file name
    without its extension.  Each file is handed to
    ``ngramprofile.build_profile``, the resulting profile gets its
    ``language`` attribute set, and it is written with
    ``ngramprofile.dump_profile`` to ``<output_folder>/<language>.yaml``.

    Args:
        input_folder: Existing directory holding the training files.
        output_folder: Existing directory that receives the model files.
            Defaults to ``input_folder``.

    Raises:
        AssertionError: If either folder is not an existing directory
            (the checks are ``assert`` statements, so they are skipped
            when Python runs with ``-O``).
    """
    assert os.path.isdir(input_folder), input_folder
    if output_folder is None:
        output_folder = input_folder
    assert os.path.isdir(output_folder), output_folder

    # Collect every (training file, model file, language) job before any
    # model is written, so files created by this run are never re-listed.
    jobs = []
    for name in os.listdir(input_folder):
        training_path = os.path.join(input_folder, name)
        if not os.path.isfile(training_path):
            continue
        language = os.path.splitext(name)[0]
        model_path = os.path.join(output_folder, language + '.yaml')
        # When models are written next to the training data (the default),
        # a '<language>.yaml' left by an earlier run would be read as
        # training text and then overwritten by a model built from itself.
        # A file is never valid training input for its own model path.
        same_file = (os.path.normcase(os.path.realpath(training_path)) ==
                     os.path.normcase(os.path.realpath(model_path)))
        if same_file:
            print('Skip file %s' % training_path)
            continue
        jobs.append((training_path, model_path, language))

    # Build language models
    for training_path, model_path, language in jobs:
        with open(training_path, 'r') as f:
            # Single pre-formatted argument: same output on Python 2 and 3.
            print('Process file %s' % training_path)
            profile = ngramprofile.build_profile(f)
        profile.language = language
        ngramprofile.dump_profile(profile, model_path)
def process_batch(models, folder):
    """Identify the language of every file in a folder and print the accuracy.

    Each path returned by ``ioutils.list_files_only(folder)`` is turned into
    a profile with ``ngramprofile.build_profile``.  The expected language is
    taken from the file name: the base name without extension, up to the
    first ``'-'``.  The prediction comes from
    ``identify_language(models, document)``.  One progress line is printed
    per file, followed by the overall accuracy.

    Args:
        models: Passed through unchanged to ``identify_language``.
        folder: Folder whose files are evaluated.

    Returns:
        None.  Results are only printed.
    """
    import sys  # local import: only needed for the unterminated progress text

    total = 0
    no_correct = 0
    # expected language -> predicted language -> number of misclassifications
    # NOTE(review): the matrix is tallied but not reported or returned yet.
    confusion_matrix = dict()

    for path in ioutils.list_files_only(folder):
        total += 1
        with open(path, 'r') as f:
            # No newline here: the prediction is appended to the same line.
            sys.stdout.write('Process file %s ' % path)
            document = ngramprofile.build_profile(f)
        file_stem = os.path.splitext(os.path.basename(path))[0]
        document.language = file_stem.split('-')[0]
        predicted_language = identify_language(models, document)
        print('Predict: %s expect: %s'
              % (predicted_language, document.language))

        if predicted_language == document.language:
            no_correct += 1
        else:
            # Increment the cell inside the expected language's own row.
            # Reading the count from the outer dict (as before) never
            # accumulated and raised TypeError when the predicted language
            # was itself a row key.
            row = confusion_matrix.setdefault(document.language, dict())
            row[predicted_language] = row.get(predicted_language, 0) + 1

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        print('Accuracy: n/a (no files found in %s)' % folder)
        return
    print('Accuracy: %s' % (no_correct * 1.0 / total))