Esempio n. 1
0
        'raw_data/models/reduced_search/best_model_{0}.pkl'.format(lang))
    if not my_file.exists():
        continue
    else:
        with open(
                'raw_data/models/reduced_search/best_model_{0}.pkl'.format(
                    lang), 'rb') as file:
            best_model = joblib.load(file)

        # Load best model parameters
        with open(
                'raw_data/experiments/reduced_search/best_model_parameters_{0}.pkl'
                .format(lang), 'rb') as file:
            best_params = joblib.load(file)

        dataset = mlconjug3.DataSet(mlconjug3.Verbiste(language=lang).verbs)
        dataset.split_data(proportion=0.95)

        predicted = best_model.predict(dataset.test_input)
        predicted2 = best_model.predict(dataset.verbs_list)

        score = len([
            (a, b) for a, b in zip(predicted, dataset.test_labels) if a == b
        ]) / len(predicted)
        misses = len([(a, b) for a, b in zip(predicted, dataset.test_labels)
                      if a != b])
        entries = len(predicted)
        print('The score of the {0} best model is {1} on test set.'.format(
            lang, score))

        score2 = len([(a, b)
# Language codes handled by this script, in processing order.
langs = (
    "ro",
    "it",
    "en",
    "es",
    "pt",
    "fr",
)

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block
    for lang in langs:
        my_file = Path(
            'raw_data/models/reduced_search/best_model_{0}.pkl'.format(lang))
        if my_file.is_file():
            continue
        else:
            # #############################################################################
            # Initialize Data Set
            dataset = mlconjug3.DataSet(
                mlconjug3.Verbiste(language=lang).verbs)

            # #############################################################################
            # Define a pipeline.

            # Vectorize the dataset with CountVectorizer, using
            # extract_verb_features (bound to lang and ngram_range) as analyzer.
            ngrange = (2, 7)
            vectorizer = mlconjug3.CountVectorizer(analyzer=partial(
                mlconjug3.extract_verb_features,
                lang=lang,
                ngram_range=ngrange),
                                                   binary=True)

            # Feature reduction
            feature_reductor = mlconjug3.SelectFromModel(
                mlconjug3.LinearSVC(random_state=42,