    .similarity_diff_to_target() \
    .max_dependency_tree_depth() \
    .target_word_synset_count() \
    .token_count_norm_diff() \
    .semicol_count() \
    .elmo_similarity()

# Hyperparameter grid for the random-forest search.
rf = {
    'estimator': RandomForestClassifier(),
    'parameters': {
        'bootstrap': [True],
        'class_weight': ['balanced', 'balanced_subsample', None],
        'max_depth': [5, 10, 30, 50, 80],
        'max_features': [2, 10, 15, 'auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10, 20],
        'n_estimators': [500, 800, 1000, 1500],
        'n_jobs': [8]
    }
}

model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger)
model_trainer.add_estimators([rf])

english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer)
english_classifier.load_data() \
    .extract_features(['len_diff', 'pos_diff']) \
    .train()
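How ModelTrainer consumes an estimator/parameters dictionary like rf is internal to the project and not shown above. The sketch below only illustrates the conventional scikit-learn way to search such a grid; GridSearchCV, cv=5, and the X_train/y_train names are assumptions, not part of the listing.

# Illustrative only: assumes a standard grid search over the values listed above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'class_weight': ['balanced', 'balanced_subsample', None],
    'max_depth': [5, 10, 30, 50, 80],
    'min_samples_leaf': [2, 5, 10],
    'min_samples_split': [2, 5, 10, 20],
    'n_estimators': [500, 800, 1000, 1500],
}

search = GridSearchCV(RandomForestClassifier(n_jobs=8), param_grid, cv=5)
# search.fit(X_train, y_train)   # X_train, y_train: the extracted feature matrix and labels (hypothetical names)
# best_model = search.best_estimator_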
    'parameters': {
        'bootstrap': [True],
        'class_weight': ['balanced', 'balanced_subsample', None],
        'max_depth': [30, 50, 80],
        'max_features': [2, 10, 15, 'auto', 'sqrt', 'log2', None],
        'min_samples_leaf': [3, 5],
        'min_samples_split': [2, 5, 8],
        'n_estimators': [500, 800],
        'n_jobs': [-1]
    }
}

# Alternative, narrower random-forest grid (unused).
# rf = {
#     'estimator': RandomForestClassifier(),
#     'parameters': {
#         'bootstrap': [True],
#         'max_depth': [30, 50],
#         'max_features': [None],
#         'min_samples_leaf': [3, 5],
#         'min_samples_split': [2, 5, 8],
#         'n_estimators': [500, 600]
#     }
# }

dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}}

# lr and svm_model are defined earlier in this script (not shown here).
model_trainer = ModelTrainer(german_config.testset_ratio, german_config.logger)
model_trainer.add_estimators([lr, svm_model, rf])

german_classifier = WordSenseAlignmentClassifier(german_config, feature_extractor, model_trainer)
german_classifier.load_data() \
    .extract_features(['similarities', 'len_diff', 'pos_diff']) \
    .train(with_testset=True)
import logging
import os

from sklearn.tree import DecisionTreeClassifier

# ClassifierConfig, FeatureExtractor, ModelTrainer, WordSenseAlignmentClassifier
# and configure() come from the project's own modules; their import paths are
# not shown in this listing.

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

if __name__ == '__main__':
    configure()

    english_config = ClassifierConfig('en_core_web_lg',
                                      "english",
                                      'data/train',
                                      balancing_strategy="none",
                                      testset_ratio=0.2,
                                      with_wordnet=True,
                                      dataset='english_nuig',
                                      logger='en_nuig')

    # Feature pipeline: lexical, part-of-speech, similarity and WordNet-based features.
    feature_extractor = FeatureExtractor() \
        .first_word() \
        .similarity() \
        .diff_pos_count() \
        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .avg_count_synsets() \
        .difference_in_length()

    # Plain decision tree with default hyperparameters as a baseline.
    dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}}

    model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger)
    model_trainer.add_estimators([dt])

    english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer)
    english_classifier.load_data() \
        .extract_features(['similarities', 'len_diff', 'pos_diff']) \
        .train(with_testset=True)
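The chained calls above only register feature names; their implementations live in the project's FeatureExtractor and are not shown. Purely as an illustration of what TF-IDF/cosine and Jaccard features over a pair of definitions typically compute, a self-contained sketch (not the project's code; example strings are made up) is:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def_1 = "a domesticated carnivorous mammal kept as a pet"
def_2 = "a small domesticated carnivore kept for companionship"

# TF-IDF cosine similarity between the two definitions.
tfidf = TfidfVectorizer().fit([def_1, def_2])
vecs = tfidf.transform([def_1, def_2])
cos = cosine_similarity(vecs[0], vecs[1])[0, 0]

# Jaccard overlap of the two token sets.
tokens_1, tokens_2 = set(def_1.split()), set(def_2.split())
jaccard = len(tokens_1 & tokens_2) / len(tokens_1 | tokens_2)

print(cos, jaccard)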
    .tfidf() \
    .ont_hot_pos() \
    .matching_lemma() \
    .count_each_pos() \
    .cosine() \
    .jaccard() \
    .avg_count_synsets() \
    .difference_in_length() \
    .similarity_diff_to_target() \
    .max_dependency_tree_depth() \
    .target_word_synset_count()

svm_model = {
    'estimator': SVC(),
    'parameters': {
        'C': [3, 5, 10],
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'degree': [3, 5, 10],
        'gamma': ['scale', 'auto'],
        'shrinking': [True, False],
        'class_weight': ['balanced'],
        'decision_function_shape': ['ovr', 'ovo'],
    }
}

model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger)
model_trainer.add_estimators([svm_model])

english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer)
english_classifier.load_data() \
    .extract_features(['similarities', 'len_diff', 'pos_diff',
                       'max_depth_deptree_1', 'max_depth_deptree_2',
                       'synset_count_1', 'synset_count_2',
                       'target_word_synset_count']) \
    .train(with_testset=True)
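testset_ratio=0.2 combined with train(with_testset=True) suggests that a fifth of the data is held out for evaluation. The project's split logic is not visible in these listings; the sketch below only illustrates such a hold-out evaluation with scikit-learn, using synthetic data and one SVC configuration drawn from the grid above (all names and values here are assumptions, not the project's code).

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Synthetic data standing in for the extracted feature matrix and alignment labels.
X, y = make_classification(n_samples=500, n_features=20, random_state=0)

# A 0.2 test split mirroring testset_ratio=0.2 above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

clf = SVC(C=5, kernel='rbf', gamma='scale', class_weight='balanced')
clf.fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))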