コード例 #1
0
import pickle

import pandas as pd

from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier

# Configuration reading from 'data/test' with is_testdata=True, no balancing
# and a test-set ratio of 0.0.
# NOTE(review): the variable is named ``german_config`` but it is built with
# the "dutch" language arguments -- confirm the name is intentional.
german_config = ClassifierConfig(
    'nl_core_news_sm',
    "dutch",
    'data/test',
    balancing_strategy="none",
    testset_ratio=0.0,
    logger='dutch_testset',
    is_testdata=True,
)

# Feature pipeline, assembled through the extractor's chained builder calls.
feature_extractor = (
    FeatureExtractor()
    .first_word()
    .similarity()
    .diff_pos_count()
    .tfidf()
    .ont_hot_pos()
    .matching_lemma()
    .count_each_pos()
    .cosine()
    .jaccard()
    .difference_in_length()
)

# The trainer receives the configuration together with that config's logger.
model_trainer = ModelTrainer(german_config, german_config.logger)
コード例 #2
0
from model_trainer import ModelTrainer, BaseClassifier
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    german_config = ClassifierConfig('de_core_news_md',
                                     "german",
                                     'data/train',
                                     balancing_strategy="swap",
                                     testset_ratio=0.2,
                                     logger='de_all_features_swap',
                                     with_testset=True)

    feature_extractor = FeatureExtractor() \
        .first_word() \
        .similarity() \
        .diff_pos_count() \
        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .difference_in_length()\
コード例 #3
0
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    english_config = ClassifierConfig('en_core_web_lg',
                                      "english",
                                      'data/train',
                                      balancing_strategy="none",
                                      testset_ratio=0.0,
                                      with_wordnet=True,
                                      dataset='english_nuig',
                                      logger='en_nuig_split_biggest')

    feature_extractor = FeatureExtractor() \
        .diff_pos_count() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .avg_count_synsets() \
        .difference_in_length()\
        .similarity_diff_to_target()\
        .max_dependency_tree_depth() \
        .target_word_synset_count()\
        .token_count_norm_diff()\
コード例 #4
0
ファイル: German_Testset.py プロジェクト: lenkaB/Codalab
import pickle

import pandas as pd

from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier

# Configuration reading from 'data/test' with is_testdata=True, no balancing
# and a test-set ratio of 0.0.
german_config = ClassifierConfig(
    'de_core_news_md',
    "german",
    'data/test',
    balancing_strategy="none",
    testset_ratio=0.0,
    logger='de_testset',
    is_testdata=True,
)

# Feature pipeline, assembled through the extractor's chained builder calls.
feature_extractor = (
    FeatureExtractor()
    .first_word()
    .similarity()
    .diff_pos_count()
    .tfidf()
    .ont_hot_pos()
    .matching_lemma()
    .count_each_pos()
    .cosine()
    .jaccard()
    .difference_in_length()
)

# The trainer receives the configuration together with that config's logger.
model_trainer = ModelTrainer(german_config, german_config.logger)
コード例 #5
0
import pickle

import pandas as pd

from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier

german_config = ClassifierConfig('en_core_web_lg',
                                 "english",
                                 'data/test',
                                 balancing_strategy="split_biggest",
                                 testset_ratio=0.0,
                                 with_wordnet=True,
                                 dataset='english_nuig',
                                 logger='en_nuig',
                                 is_testdata=True)

feature_extractor = FeatureExtractor() \
    .first_word() \
    .similarity() \
    .diff_pos_count() \
    .tfidf() \
    .ont_hot_pos() \
    .matching_lemma() \
    .count_each_pos() \
    .cosine() \
    .jaccard() \
    .avg_count_synsets() \
    .difference_in_length() \
コード例 #6
0
ファイル: german_classifier.py プロジェクト: lenkaB/Codalab
from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    german_config = ClassifierConfig('de_core_news_md', "german", 'data/train', balancing_strategy="none",testset_ratio=0.2, logger = 'de_all_features_nonebalance')

    feature_extractor = FeatureExtractor() \
        .first_word() \
        .similarity() \
        .diff_pos_count() \
        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .difference_in_length()

    lr = {'estimator': LogisticRegression(),
          'parameters': {
コード例 #7
0
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    # Apply display/logging settings before anything else runs.
    configure()

    # Training configuration on 'data/train' with a 0.2 test-set ratio and
    # no balancing.
    # NOTE(review): the variable is named ``german_config`` but it is built
    # with the "dutch" language arguments -- confirm the name is intentional.
    german_config = ClassifierConfig(
        'nl_core_news_sm',
        "dutch",
        'data/train',
        balancing_strategy="none",
        testset_ratio=0.2,
        logger='dutch_all_features_nonebalance',
    )

    # Feature pipeline, assembled through the extractor's chained builder calls.
    feature_extractor = (
        FeatureExtractor()
        .first_word()
        .similarity()
        .diff_pos_count()
        .tfidf()
        .ont_hot_pos()
        .matching_lemma()
        .count_each_pos()
        .cosine()
        .jaccard()
        .difference_in_length()
    )
コード例 #8
0
from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    # Apply display/logging settings before anything else runs.
    configure()

    # Training configuration on '../data/train' with a 0.2 test-set ratio,
    # no balancing and with_wordnet enabled.
    english_config = ClassifierConfig(
        'en_core_web_lg',
        "english",
        '../data/train',
        balancing_strategy="none",
        testset_ratio=0.2,
        with_wordnet=True,
    )

    # Feature pipeline, assembled through the extractor's chained builder calls.
    feature_extractor = (
        FeatureExtractor()
        .first_word()
        .similarity()
        .diff_pos_count()
        .tfidf()
        .ont_hot_pos()
        .matching_lemma()
        .count_each_pos()
        .cosine()
        .jaccard()
        .similarity_diff_to_target()
        .avg_count_synsets()
        .difference_in_length()
    )
コード例 #9
0
from classifier_config import ClassifierConfig
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    english_config = ClassifierConfig('en_core_web_lg', "english", 'data/train', balancing_strategy="divide",
                                      testset_ratio=0.0, with_wordnet= True, dataset='english_nuig', logger = 'en_nuig_overfithandling')

    feature_extractor = FeatureExtractor() \
        .first_word() \
        .similarity() \
        .diff_pos_count() \
        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .avg_count_synsets() \
        .difference_in_length()\
        .similarity_diff_to_target()\
        .max_dependency_tree_depth() \
コード例 #10
0
ファイル: german_oversample.py プロジェクト: lenkaB/Codalab
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer, BaseClassifier
from wsa_classifier import WordSenseAlignmentClassifier


def configure():
    """Set up pandas display options and root logging for this script.

    Column-width truncation is switched off for pandas output, and the
    logging level is read from the ``LOGLEVEL`` environment variable,
    falling back to ``"INFO"`` when the variable is not set.
    """
    # None means "no limit". The former -1 sentinel was deprecated in
    # pandas 1.0 in favour of None and is rejected by newer releases.
    pd.set_option('display.max_colwidth', None)
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


if __name__ == '__main__':
    configure()

    german_config = ClassifierConfig('de_core_news_md',
                                     "german",
                                     'data/train',
                                     balancing_strategy="oversampling",
                                     testset_ratio=0.0,
                                     logger='de_all_features_oversampling')

    feature_extractor = FeatureExtractor() \
        .first_word() \
        .similarity() \
        .diff_pos_count() \
        .tfidf() \
        .ont_hot_pos() \
        .matching_lemma() \
        .count_each_pos() \
        .cosine() \
        .jaccard() \
        .difference_in_length()\