import os

import ioutils
import ngramprofile


def train_language_models(input_folder, output_folder=None):
    """Build one n-gram profile per training file in input_folder.

    The file name without its extension is used as the language name, and the
    profile is dumped to <output_folder>/<language>.yaml.
    """
    assert os.path.isdir(input_folder)
    if output_folder is None:
        output_folder = input_folder
    assert os.path.isdir(output_folder)

    # Collect the training file for each language and where its model will go
    training_paths = []
    model_paths = []
    languages = []
    for name in os.listdir(input_folder):
        training_path = os.path.join(input_folder, name)
        if os.path.isfile(training_path):
            language = os.path.splitext(name)[0]
            languages.append(language)
            training_paths.append(training_path)
            model_paths.append(os.path.join(output_folder, language + '.yaml'))

    # Build and dump an n-gram language model for each training file
    for data, model, language in zip(training_paths, model_paths, languages):
        with open(data, 'r') as f:
            print('Processing file', data)
            profile = ngramprofile.build_profile(f)
            profile.language = language
            ngramprofile.dump_profile(profile, model)
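

# Example usage (a sketch; the folder names are hypothetical): given
# 'corpora/english.txt' and 'corpora/french.txt',
#
#     train_language_models('corpora/', 'models/')
#
# writes one profile per language: 'models/english.yaml', 'models/french.yaml'.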


def process_batch(models, folder):
    """Classify every document in folder and report the overall accuracy.

    The expected language is taken from the part of the file name before the
    first '-'; misclassifications are tallied in a confusion matrix.
    """
    total = 0
    num_correct = 0
    confusion_matrix = {}

    paths = ioutils.list_files_only(folder)
    for path in paths:
        total += 1
        with open(path, 'r') as f:
            print('Processing file', path, end=' ')
            document = ngramprofile.build_profile(f)
            # The expected language is encoded in the file name, e.g. 'english-001.txt'
            document.language = os.path.splitext(os.path.basename(path))[0].split('-')[0]
        predicted_language = identify_language(models, document)
        print('predicted:', predicted_language, 'expected:', document.language)
        if predicted_language == document.language:
            num_correct += 1
        else:
            # Tally the misclassification under the expected language's row
            row = confusion_matrix.setdefault(document.language, {})
            row[predicted_language] = row.get(predicted_language, 0) + 1
    print('Accuracy:', num_correct / total)
    return confusion_matrix
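

# A minimal driver sketch: the command-line interface below is an assumption,
# not part of the original module.
if __name__ == '__main__':
    import sys

    # Hypothetical usage: python language_id.py <training_folder> <model_folder>
    train_language_models(sys.argv[1], sys.argv[2])

    # Evaluating a batch folder additionally needs the dumped YAML profiles
    # loaded back into whatever structure identify_language() expects; that
    # loader is not shown in this section, so the call stays commented out:
    # process_batch(models, '<batch_folder>')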