Example #1
0
        .similarity_diff_to_target()\
        .max_dependency_tree_depth() \
        .target_word_synset_count()\
        .token_count_norm_diff()\
        .semicol_count()\
        .elmo_similarity()

    # Random-forest entry registered with ModelTrainer.add_estimators(): the
    # estimator instance plus the candidate values for each hyper-parameter.
    # NOTE(review): presumably ModelTrainer searches these combinations
    # exhaustively -- confirm against its implementation.
    rf = {
        'estimator': RandomForestClassifier(),
        'parameters': {
            'bootstrap': [True],
            # The None object means "no class re-weighting". The string 'None'
            # is not a class_weight value scikit-learn accepts, so every
            # candidate using it would fail to fit.
            'class_weight': ['balanced', 'balanced_subsample', None],
            'max_depth': [5, 10, 30, 50, 80],
            # 'auto' is intentionally absent: for classifiers it was an alias
            # of 'sqrt' and recent scikit-learn releases no longer accept it.
            # NOTE(review): the integer options assume the feature matrix has
            # at least 15 columns -- confirm.
            'max_features': [2, 10, 15, 'sqrt', 'log2'],
            'min_samples_leaf': [2, 5, 10],
            'min_samples_split': [2, 5, 10, 20],
            'n_estimators': [500, 800, 1000, 1500],
            'n_jobs': [8],
        },
    }

    # Build the trainer from the config's `testset_ratio` and `logger`
    # attributes and register the random-forest entry as its only estimator.
    model_trainer = ModelTrainer(
        english_config.testset_ratio,
        english_config.logger,
    )
    model_trainer.add_estimators([rf])

    english_classifier = WordSenseAlignmentClassifier(
        english_config, feature_extractor, model_trainer
    )

    # Fluent pipeline: load the data, extract features, then train.
    # NOTE(review): the meaning of the list handed to extract_features()
    # (columns to use vs. columns to drop) is not visible here -- confirm.
    (
        english_classifier.load_data()
        .extract_features(['len_diff', 'pos_diff'])
        .train()
    )
Example #2
0
        'parameters': {
            'bootstrap': [True],
            'class_weight': ['balanced', 'balanced_subsample','None'],
            'max_depth': [30, 50, 80],
            'max_features': [2, 10, 15, 'auto', 'sqrt', 'log2', None],
            'min_samples_leaf': [3, 5],
            'min_samples_split': [2, 5, 8],
            'n_estimators': [500, 800],
            'n_jobs':[-1]
        }
    }
    # rf = {
    #     'estimator': RandomForestClassifier(),
    #     'parameters': {
    #         'bootstrap': [True],
    #         'max_depth': [30, 50],
    #         'max_features': [None],
    #         'min_samples_leaf': [3, 5],
    #         'min_samples_split': [2, 5, 8],
    #         'n_estimators': [500, 600]
    #     }
    # }
    # Decision-tree entry with an empty parameter set.
    # NOTE(review): `dt` is not included in the add_estimators() call below --
    # confirm whether that omission is intentional.
    dt = {
        'estimator': DecisionTreeClassifier(),
        'parameters': {},
    }

    # Trainer built from the German config's `testset_ratio` and `logger`
    # attributes, with three estimator entries registered.
    model_trainer = ModelTrainer(
        german_config.testset_ratio,
        german_config.logger,
    )
    model_trainer.add_estimators([lr, svm_model, rf])

    german_classifier = WordSenseAlignmentClassifier(
        german_config, feature_extractor, model_trainer
    )

    # Fluent pipeline: load the data, extract features, then train with the
    # `with_testset` flag switched on.
    (
        german_classifier.load_data()
        .extract_features(['similarities', 'len_diff', 'pos_diff'])
        .train(with_testset=True)
    )
Example #3
0
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    # Script entry point: set up logging via configure(), describe the English
    # NUIG run, assemble the feature extractor and train a decision tree.
    configure()

    english_config = ClassifierConfig(
        'en_core_web_lg',
        "english",
        'data/train',
        balancing_strategy="none",
        testset_ratio=0.2,
        with_wordnet=True,
        dataset='english_nuig',
        logger='en_nuig',
    )

    # Each chained call registers one feature on the extractor; the calls are
    # kept in their original order.
    feature_extractor = (
        FeatureExtractor()
        .first_word()
        .similarity()
        .diff_pos_count()
        .tfidf()
        .ont_hot_pos()
        .matching_lemma()
        .count_each_pos()
        .cosine()
        .jaccard()
        .avg_count_synsets()
        .difference_in_length()
    )

    # Decision-tree entry with an empty parameter set.
    dt = {
        'estimator': DecisionTreeClassifier(),
        'parameters': {},
    }

    model_trainer = ModelTrainer(
        english_config.testset_ratio,
        english_config.logger,
    )
    model_trainer.add_estimators([dt])

    english_classifier = WordSenseAlignmentClassifier(
        english_config, feature_extractor, model_trainer
    )

    # Fluent pipeline: load the data, extract features, then train with the
    # `with_testset` flag switched on.
    (
        english_classifier.load_data()
        .extract_features(['similarities', 'len_diff', 'pos_diff'])
        .train(with_testset=True)
    )
        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .avg_count_synsets() \
        .difference_in_length()\
        .similarity_diff_to_target()\
        .max_dependency_tree_depth() \
        .target_word_synset_count()

    # Support-vector entry for the trainer: the SVC instance plus the
    # candidate values for each hyper-parameter.
    # NOTE(review): per scikit-learn's SVC documentation `degree` only affects
    # the 'poly' kernel, so crossing it with the other kernels likely repeats
    # equivalent fits -- confirm before trimming the search space.
    svm_model = {
        'estimator': SVC(),
        'parameters': {
            'C': [3, 5, 10],
            'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
            'degree': [3, 5, 10],
            'gamma': ['scale', 'auto'],
            'shrinking': [True, False],
            'class_weight': ['balanced'],
            'decision_function_shape': ['ovr', 'ovo'],
        },
    }
    # Trainer built from the English config's `testset_ratio` and `logger`
    # attributes, with the SVM entry as its only estimator.
    model_trainer = ModelTrainer(
        english_config.testset_ratio,
        english_config.logger,
    )
    model_trainer.add_estimators([svm_model])

    english_classifier = WordSenseAlignmentClassifier(
        english_config, feature_extractor, model_trainer
    )

    # Fluent pipeline: load the data, extract features, then train with the
    # `with_testset` flag switched on.
    # NOTE(review): the meaning of the list handed to extract_features()
    # (columns to use vs. columns to drop) is not visible here -- confirm.
    (
        english_classifier.load_data()
        .extract_features([
            'similarities',
            'len_diff',
            'pos_diff',
            'max_depth_deptree_1',
            'max_depth_deptree_2',
            'synset_count_1',
            'synset_count_2',
            'target_word_synset_count',
        ])
        .train(with_testset=True)
    )