Example #1
        .similarity_diff_to_target() \
        .max_dependency_tree_depth() \
        .target_word_synset_count() \
        .token_count_norm_diff() \
        .semicol_count() \
        .elmo_similarity()

    rf = {
        'estimator': RandomForestClassifier(),
        'parameters': {
            'bootstrap': [True],
            'class_weight': ['balanced', 'balanced_subsample', None],
            'max_depth': [5, 10, 30, 50, 80],
            'max_features': [2, 10, 15, 'auto', 'sqrt', 'log2'],
            'min_samples_leaf': [2, 5, 10],
            'min_samples_split': [2, 5, 10, 20],
            'n_estimators': [500, 800, 1000, 1500],
            'n_jobs': [8]
        }
    }

    model_trainer = ModelTrainer(english_config.testset_ratio,
                                 english_config.logger)
    model_trainer.add_estimators([rf])
    english_classifier = WordSenseAlignmentClassifier(english_config,
                                                      feature_extractor,
                                                      model_trainer)
    english_classifier.load_data() \
        .extract_features(['len_diff', 'pos_diff']) \
        .train()
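Each of these examples pairs an estimator with a hyperparameter grid in a plain dict (`rf` above) and hands it to `ModelTrainer`, whose implementation is not shown here. As a rough, non-authoritative sketch of what such a search presumably amounts to, the same estimator/grid shape can be driven directly by scikit-learn's GridSearchCV; the data and the deliberately small grid below are placeholders, not the ones used above:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Placeholder data standing in for the extracted alignment features.
X, y = make_classification(n_samples=300, n_features=10, random_state=0)

# Deliberately small illustrative grid; the snippet above searches a much larger one.
grid = {'n_estimators': [100, 300], 'max_depth': [10, 30], 'class_weight': ['balanced']}
search = GridSearchCV(RandomForestClassifier(), grid, cv=3, scoring='f1', n_jobs=-1)
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))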
Example #2
feature_extractor = FeatureExtractor() \
    .first_word() \
    .similarity() \
    .diff_pos_count() \
    .tfidf() \
    .ont_hot_pos() \
    .matching_lemma() \
    .count_each_pos() \
    .cosine() \
    .jaccard() \
    .difference_in_length()

model_trainer = ModelTrainer(german_config, german_config.logger)

german_classifier = WordSenseAlignmentClassifier(german_config,
                                                 feature_extractor,
                                                 model_trainer)
data = german_classifier.load_data().get_preprocessed_data()

feats = feature_extractor.extract(
    data, feats_to_scale=['similarities', 'len_diff', 'pos_diff'])
feats = feature_extractor.keep_feats([
    'similarities', 'cos_tfidf', 'ADP', 'DET', 'pos_diff', 'len_diff', 'PRON',
    'CONJ', 'X', 'PROPN', 'NOUN', 'cos', 'ADJ', 'VERB', 'jaccard', 'PUNCT',
    'noun', 'ADV', 'adjective'
])
x_trainset, x_testset = model_trainer.split_data(feats, 0.0)

with open(
        'models/dutch_all_features_nonebalanceRandomForestClassifier20200329-1354.pickle',
        'rb') as pickle_file:
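The snippet breaks off right after opening the pickled model. Assuming the file holds a scikit-learn classifier fitted on the same feature set (this is not visible in the snippet, and neither is exactly what `split_data` returns for a ratio of 0.0), the continuation would presumably look something like this:

import pickle

# Assumed continuation, not part of the original snippet.
with open(
        'models/dutch_all_features_nonebalanceRandomForestClassifier20200329-1354.pickle',
        'rb') as pickle_file:
    model = pickle.load(pickle_file)  # presumably a fitted sklearn estimator

# With a test ratio of 0.0, x_trainset presumably carries every row; this assumes
# it contains only the feature columns the model was trained on.
predictions = model.predict(x_trainset)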
Example #3
                'balanced_subsample',
            ],
            'max_depth': [10, 20],
            'max_features': ['auto', 'sqrt', 'log2', None],
            'min_samples_leaf': [2],
            'min_samples_split': [5, 10],
            'n_estimators': [300, 800],
            'n_jobs': [8]
        }
    }

    model_trainer = ModelTrainer(english_config.testset_ratio,
                                 english_config.logger)
    model_trainer.add_estimators([rf])
    english_classifier = WordSenseAlignmentClassifier(english_config,
                                                      feature_extractor,
                                                      model_trainer)
    english_classifier.load_data() \
        .extract_features(['len_diff', 'pos_diff']) \
        .select_features(['target_word_synset_count',
                          'elmo_sim',
                          'simdiff_to_target',
                          'synsets_count_diff',
                          'lemma_match_normalized',
                          'token_count_norm_diff',
                          'len_diff',
                          'NOUN',
                          'VERB',
                          'PUNCT',
                          'pos_diff',
                          'CCONJ',
Example #4
            'max_depth': [30, 80],
            'max_features': [2, 15, 'auto', None],
            'min_samples_leaf': [3, 5],
            'min_samples_split': [2, 5, 8],
            'n_estimators': [500, 800],
            'n_jobs': [10]
        }
    }
    # rf = {
    #     'estimator': RandomForestClassifier(),
    #     'parameters': {
    #         'bootstrap': [True],
    #         'max_depth': [30, 50],
    #         'max_features': [None],
    #         'min_samples_leaf': [3, 5],
    #         'min_samples_split': [2, 5, 8],
    #         'n_estimators': [500, 600]
    #     }
    # }
    dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}}

    model_trainer = ModelTrainer(german_config.testset_ratio,
                                 german_config.logger)
    model_trainer.add_estimators([rf])
    german_classifier = WordSenseAlignmentClassifier(german_config,
                                                     feature_extractor,
                                                     model_trainer)
    german_classifier.load_data() \
        .extract_features(['similarities', 'len_diff', 'pos_diff']) \
        .train(with_testset=True)
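Note that `dt` is defined above but never registered with the trainer. Assuming `add_estimators` accepts any number of estimator/grid pairs (it is only ever called with a one-element list in these snippets), both search spaces could presumably be compared in a single run:

# Hypothetical variant: register the decision tree alongside the random forest
# so the trainer evaluates both configurations on the same split.
model_trainer.add_estimators([rf, dt])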
Example #5
        .count_each_pos() \
        .jaccard() \
        .avg_count_synsets() \
        .difference_in_length() \
        .max_dependency_tree_depth() \
        .target_word_synset_count()

    rf = {
        'estimator': RandomForestClassifier(),
        'parameters': {
            'class_weight': ['balanced_subsample', 'balanced'],
            'max_depth': [5, 10, 15],
            'max_features': ['auto', 'sqrt', 'log2'],
            'min_samples_leaf': [2],
            'min_samples_split': [5, 10],
            'n_estimators': [300, 1000],
            'n_jobs': [8]
        }
    }

    model_trainer = ModelTrainer(english_config.testset_ratio,
                                 english_config.logger)
    model_trainer.add_estimators([rf])
    english_classifier = WordSenseAlignmentClassifier(english_config,
                                                      feature_extractor,
                                                      model_trainer)
    english_classifier.load_data() \
        .extract_features(['similarities', 'len_diff']) \
        .select_features(['cos_tfidf', 'jaccard', 'similarities', 'first_word_same',
                          'PART', 'noun', 'adjective', 'verb',
                          'target_word_synset_count', 'adverb', 'len_diff']) \
        .train()
Example #6
            'min_samples_split': [5, 8],
            'n_estimators': [500, 1000],
            'n_jobs': [8]
        }
    }
    # rf = {
    #     'estimator': RandomForestClassifier(),
    #     'parameters': {
    #         'bootstrap': [True],
    #         'max_depth': [30, 50],
    #         'max_features': [None],
    #         'min_samples_leaf': [3, 5],
    #         'min_samples_split': [2, 5, 8],
    #         'n_estimators': [500, 600]
    #     }
    # }
    dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}}

    model_trainer = ModelTrainer(german_config.testset_ratio,
                                 german_config.logger)
    model_trainer.add_estimators([rf])
    german_classifier = WordSenseAlignmentClassifier(german_config,
                                                     feature_extractor,
                                                     model_trainer)
    german_classifier.load_data() \
        .extract_features(['similarities', 'len_diff', 'pos_diff']) \
        .select_features(['similarities', 'cos_tfidf', 'ADP', 'DET', 'pos_diff',
                          'len_diff', 'PRON', 'CONJ', 'X', 'PROPN', 'NOUN', 'cos',
                          'ADJ', 'VERB', 'jaccard', 'PUNCT', 'noun', 'ADV',
                          'adjective']) \
        .train(with_testset=True)

        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .avg_count_synsets() \
        .difference_in_length() \
        .similarity_diff_to_target() \
        .max_dependency_tree_depth() \
        .target_word_synset_count()

    svm_model = {
        'estimator': SVC(),
        'parameters': {
            'C': [3, 5, 10],
            'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
            'degree': [3, 5, 10],
            'gamma': ['scale', 'auto'],
            'shrinking': [True, False],
            'class_weight': ['balanced'],
            'decision_function_shape': ['ovr', 'ovo'],
        }
    }
    model_trainer = ModelTrainer(english_config.testset_ratio,
                                 english_config.logger)
    model_trainer.add_estimators([svm_model])
    english_classifier = WordSenseAlignmentClassifier(english_config,
                                                      feature_extractor,
                                                      model_trainer)
    english_classifier.load_data() \
        .extract_features(['similarities', 'len_diff', 'pos_diff',
                           'max_depth_deptree_1', 'max_depth_deptree_2',
                           'synset_count_1', 'synset_count_2',
                           'target_word_synset_count']) \
        .train(with_testset=True)
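In the SVC grid above, `degree` only matters for the polynomial kernel and `gamma` is ignored by the linear one, so the flat grid multiplies the search space with settings that cannot change the result. If the `parameters` dict is ultimately handed to a GridSearchCV-style search (an assumption; the ModelTrainer internals are not shown), the grid could be split into kernel-specific blocks using scikit-learn's list-of-dicts form:

# Hypothetical restructuring of the SVC search space (list-of-dicts param_grid).
svm_param_grid = [
    {'kernel': ['rbf', 'sigmoid'], 'C': [3, 5, 10],
     'gamma': ['scale', 'auto'], 'class_weight': ['balanced']},
    {'kernel': ['poly'], 'C': [3, 5, 10], 'degree': [3, 5, 10],
     'gamma': ['scale', 'auto'], 'class_weight': ['balanced']},
    {'kernel': ['linear'], 'C': [3, 5, 10], 'class_weight': ['balanced']},
]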