.similarity_diff_to_target()\ .max_dependency_tree_depth() \ .target_word_synset_count()\ .token_count_norm_diff()\ .semicol_count()\ .elmo_similarity() rf = { 'estimator': RandomForestClassifier(), 'parameters': { 'bootstrap': [True], 'class_weight': ['balanced', 'balanced_subsample', 'None'], 'max_depth': [5, 10, 30, 50, 80], 'max_features': [2, 10, 15, 'auto', 'sqrt', 'log2'], 'min_samples_leaf': [2, 5, 10], 'min_samples_split': [2, 5, 10, 20], 'n_estimators': [500, 800, 1000, 1500], 'n_jobs': [8] } } model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger) model_trainer.add_estimators([rf]) english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer) english_classifier.load_data() \ .extract_features(['len_diff', 'pos_diff']) \ .train()
.similarity() \ .diff_pos_count() \ .tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .difference_in_length() model_trainer = ModelTrainer(german_config, german_config.logger) german_classifier = WordSenseAlignmentClassifier(german_config, feature_extractor, model_trainer) data = german_classifier.load_data().get_preprocessed_data() feats = feature_extractor.extract( data, feats_to_scale=['similarities', 'len_diff', 'pos_diff']) feats = feature_extractor.keep_feats([ 'similarities', 'cos_tfidf', 'ADP', 'DET', 'pos_diff', 'len_diff', 'PRON', 'CONJ', 'X', 'PROPN', 'NOUN', 'cos', 'ADJ', 'VERB', 'jaccard', 'PUNCT', 'noun', 'ADV', 'adjective' ]) x_trainset, x_testset = model_trainer.split_data(feats, 0.0) with open( 'models/dutch_all_features_nonebalanceRandomForestClassifier20200329-1354.pickle', 'rb') as pickle_file: clf = pickle.load(pickle_file) predicted = clf.predict(x_trainset)
} } model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger) model_trainer.add_estimators([rf]) english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer) english_classifier.load_data() \ .extract_features(['len_diff', 'pos_diff']) \ .select_features(['target_word_synset_count', 'elmo_sim', 'simdiff_to_target', 'synsets_count_diff', 'lemma_match_normalized', 'token_count_norm_diff', 'len_diff', 'NOUN', 'VERB', 'PUNCT', 'pos_diff', 'CCONJ', 'semicol_count2_norm', 'ADP', 'ADJ', 'semicol_diff', 'max_depth_deptree_2', 'max_depth_deptree_1'])\ .train()
'max_depth': [30, 80], 'max_features': [2, 15, 'auto', None], 'min_samples_leaf': [3, 5], 'min_samples_split': [2, 5, 8], 'n_estimators': [500, 800], 'n_jobs': [10] } } # rf = { # 'estimator': RandomForestClassifier(), # 'parameters': { # 'bootstrap': [True], # 'max_depth': [30, 50], # 'max_features': [None], # 'min_samples_leaf': [3, 5], # 'min_samples_split': [2, 5, 8], # 'n_estimators': [500, 600] # } # } dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}} model_trainer = ModelTrainer(german_config.testset_ratio, german_config.logger) model_trainer.add_estimators([rf]) german_classifier = WordSenseAlignmentClassifier(german_config, feature_extractor, model_trainer) german_classifier.load_data() \ .extract_features(['similarities', 'len_diff', 'pos_diff']) \ .train(with_testset=True)
.count_each_pos() \ .jaccard() \ .avg_count_synsets() \ .difference_in_length()\ .max_dependency_tree_depth() \ .target_word_synset_count()\ rf = { 'estimator': RandomForestClassifier(), 'parameters': { 'class_weight': ['balanced_subsample', 'balanced'], 'max_depth': [5, 10, 15], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_leaf': [2], 'min_samples_split': [5, 10], 'n_estimators': [300, 1000], 'n_jobs': [8] } } model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger) model_trainer.add_estimators([rf]) english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer) english_classifier.load_data() \ .extract_features(['similarities', 'len_diff']) \ .select_features(['cos_tfidf','jaccard','similarities','first_word_same','PART','noun','adjective','verb','target_word_synset_count','adverb','len_diff'])\ .train()
'min_samples_split': [5, 8], 'n_estimators': [500, 1000], 'n_jobs': [8] } } # rf = { # 'estimator': RandomForestClassifier(), # 'parameters': { # 'bootstrap': [True], # 'max_depth': [30, 50], # 'max_features': [None], # 'min_samples_leaf': [3, 5], # 'min_samples_split': [2, 5, 8], # 'n_estimators': [500, 600] # } # } dt = {'estimator': DecisionTreeClassifier(), 'parameters': {}} model_trainer = ModelTrainer(german_config.testset_ratio, german_config.logger) model_trainer.add_estimators([rf]) german_classifier = WordSenseAlignmentClassifier(german_config, feature_extractor, model_trainer) german_classifier.load_data() \ .extract_features(['similarities', 'len_diff', 'pos_diff']) \ .select_features(['similarities', 'cos_tfidf', 'ADP', 'DET', 'pos_diff', 'len_diff', 'PRON', 'CONJ','X', 'PROPN', 'NOUN', 'cos', 'ADJ', 'VERB', 'jaccard', 'PUNCT', 'noun', 'ADV', 'adjective'])\ .train(with_testset=True)
.tfidf() \ .ont_hot_pos() \ .matching_lemma() \ .count_each_pos() \ .cosine() \ .jaccard() \ .avg_count_synsets() \ .difference_in_length()\ .similarity_diff_to_target()\ .max_dependency_tree_depth() \ .target_word_synset_count() svm_model = { 'estimator': SVC(), 'parameters': { 'C': [3, 5, 10], 'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'degree':[3, 5, 10], 'gamma':['scale', 'auto'], 'shrinking':[True, False], 'class_weight':['balanced'], 'decision_function_shape':['ovr','ovo'], } } model_trainer = ModelTrainer(english_config.testset_ratio, english_config.logger) model_trainer.add_estimators([svm_model]) english_classifier = WordSenseAlignmentClassifier(english_config, feature_extractor, model_trainer) english_classifier.load_data() \ .extract_features(['similarities', 'len_diff', 'pos_diff', 'max_depth_deptree_1', 'max_depth_deptree_2', 'synset_count_1','synset_count_2', 'target_word_synset_count']) \ .train(with_testset=True)