Example #1
0
 def setUp(self):
     """Prepare ``self.config`` with a decision-tree extractor configuration.

     No dimensionality reduction, feature selection or scaling is
     requested, and ``sparse`` is switched on.
     """
     feature_list = make_feature_list("""
                 bag_of_words
                 bag_of_pos
                 bag_of_word_bigrams
                 bag_of_wordpos
                 bag_of_wordpos_bigrams
                 bag_of_words_in_between
                 bag_of_pos_in_between
                 bag_of_word_bigrams_in_between
                 bag_of_wordpos_in_between
                 bag_of_wordpos_bigrams_in_between
                 entity_order
                 entity_distance
                 other_entities_in_between
                 in_same_sentence
                 verbs_count_in_between
                 verbs_count
                 total_number_of_entities
                 symbols_in_between
                 number_of_tokens
         """)
     # Keyword order mirrors the key order of the configuration dict.
     self.config = dict(
         classifier="dtree",
         classifier_args={},
         dimensionality_reduction=None,
         dimensionality_reduction_dimension=None,
         feature_selection=None,
         feature_selection_dimension=None,
         scaler=False,
         sparse=True,
         features=feature_list,
     )
Example #2
0
 def test_configuration_with_arguments(self, mocked_feature):
     """Append parameterised features to the config and check the mock call.

     After building a ``FactExtractor`` from the extended configuration,
     ``mocked_feature`` must have been called exactly once, with the single
     positional argument ``True``.
     """
     extra_features = make_feature_list("""
         BagOfVerbStems True
         BagOfVerbLemmas True
         BagOfVerbLemmas False
     """)
     # Concatenate into a new list instead of extending the existing one.
     combined_features = self.config["features"] + extra_features
     self.config["features"] = combined_features

     FactExtractor(self.config)

     expected_call = ((True, ), )
     self.assertEqual(mocked_feature.call_count, 1)
     self.assertEqual(mocked_feature.call_args, expected_call)
 def setUp(self):
     """Prepare ``self.config`` with an ``svc`` extractor configuration.

     The configuration keeps dense and sparse features in two separate
     lists, each produced by ``make_feature_list``.
     """
     dense = make_feature_list("""
             entity_order
             entity_distance
             other_entities_in_between
             verbs_count_in_between
             verbs_count
             total_number_of_entities
             symbols_in_between
             number_of_tokens
             """)
     sparse = make_feature_list("""
             bag_of_words
             bag_of_pos
             bag_of_words_in_between
             bag_of_pos_in_between
             """)
     self.config = {
         "classifier": "svc",
         "classifier_args": dict(),
         "dense_features": dense,
         "sparse_features": sparse,
     }
 def setUp(self):
     """Store an ``svc`` configuration with dense and sparse feature lists
     in ``self.config``."""
     self.config = dict(
         classifier="svc",
         classifier_args=dict(),
         dense_features=make_feature_list("""
             entity_order
             entity_distance
             other_entities_in_between
             verbs_count_in_between
             verbs_count
             total_number_of_entities
             symbols_in_between
             number_of_tokens
             """),
         sparse_features=make_feature_list("""
             bag_of_words
             bag_of_pos
             bag_of_words_in_between
             bag_of_pos_in_between
             """),
     )
Example #5
0
def iter_configs(input_file_path, dbname):
    """Yield experiment configurations for a fixed list of classifiers.

    For every classifier listed below, the base configuration is updated
    with that classifier's name and arguments and then expanded through
    ``apply_dict_combinations`` using the ``train_percentage`` and
    ``data_shuffle_seed`` value lists.

    Args:
        input_file_path: Path of the input file. It is stored in absolute
            form together with the MD5 hex digest of the file's bytes.
        dbname: Value stored under the ``database_name`` key.

    Yields:
        Each item produced by ``apply_dict_combinations(base, patch)``
        (presumably one configuration dict per combination -- confirm
        against that helper).
    """
    input_file_path = os.path.abspath(input_file_path)
    # Read the file inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_file_path, "rb") as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
        """)
    }
    # Value lists expanded by apply_dict_combinations for every classifier.
    patch = {u"train_percentage": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
             u"data_shuffle_seed": [u"domino" + str(i) for i in range(10)]}

    # (classifier name, classifier arguments) pairs to sweep over.
    xs = [(u"sgd", {}),
          (u"naivebayes", {}),
          (u"naivebayes_m", {}),
          (u"dtree", {u"max_depth": 4, u"min_samples_leaf": 5}),
          (u"logit", {}),
          (u"svm", {}),
          (u"adaboost", {})]
    for classifier, args in xs:
        base[u"classifier"] = classifier
        base[u"classifier_args"] = args
        # Scaling is enabled for every classifier except "naivebayes_m".
        base[u"scaler"] = classifier != u"naivebayes_m"
        for config in apply_dict_combinations(base, patch):
            yield config
Example #6
0
    "sparse":
    False,
    "features":
    make_feature_list("""
            bag_of_words
            bag_of_pos
            bag_of_word_bigrams
            bag_of_wordpos
            bag_of_wordpos_bigrams
            bag_of_words_in_between
            bag_of_pos_in_between
            bag_of_word_bigrams_in_between
            bag_of_wordpos_in_between
            bag_of_wordpos_bigrams_in_between
            entity_order
            entity_distance
            other_entities_in_between
            in_same_sentence
            verbs_count_in_between
            verbs_count
            total_number_of_entities
            symbols_in_between
            number_of_tokens
            BagOfVerbStems True
            BagOfVerbStems False
            BagOfVerbLemmas True
            BagOfVerbLemmas False
    """),
}

Example #7
0
# This file defines the default values for IEPY settings.
# These defaults are always used unless explicitly stated otherwise.
from iepy.utils import make_feature_list


# Default extractor settings: an "svc" classifier with no extra arguments,
# plus separate dense and sparse feature lists.
extractor_config = dict(
    classifier="svc",
    classifier_args=dict(),
    dense_features=make_feature_list("""
                entity_order
                entity_distance
                other_entities_in_between
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                """),
    sparse_features=make_feature_list("""
                bag_of_words
                bag_of_pos
                bag_of_words_in_between
                bag_of_pos_in_between
                """),
)
Example #8
0
def iter_configs(input_file_path, dbname):
    """Yield experiment configurations for an SVM sweep, then an Adaboost sweep.

    The SVM part expands three kernel-specific argument grids (rbf, poly,
    linear); each resulting argument set is combined with the
    ``train_percentage``, ``data_shuffle_seed`` and ``feature_selection``
    value lists. The Adaboost part then switches the base configuration to
    ``adaboost`` (no feature selection dimension, no scaler) and expands its
    own argument grid the same way.

    Args:
        input_file_path: Path of the input file. It is stored in absolute
            form together with the MD5 hex digest of the file's bytes.
        dbname: Value stored under the ``database_name`` key.

    Yields:
        Each item produced by ``apply_dict_combinations(base, patch)``
        (presumably one configuration dict per combination -- confirm
        against that helper).
    """
    input_file_path = os.path.abspath(input_file_path)
    # Read the file inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_file_path, "rb") as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": 1000,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
        """)
    }

    # SVM
    ######
    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(20)],
        u"feature_selection": [None, "kbest"]
    }
    # One argument grid per kernel; only the keys listed for a kernel are
    # passed as classifier arguments for it.
    svm_args_patches = [
        {
            u"kernel": [u"rbf"],
            u"C": [1, 10, 100],
            u"gamma": [0.0, 0.001, 0.0001]
        },
        {
            u"kernel": [u"poly"],
            u"C": [1, 10, 100],
            u"degree": [2, 3, 4],
            u"gamma": [0.0, 0.001, 0.0001]
        },
        {
            u"kernel": [u"linear"],
            u"C": [1, 10, 100]
        },
    ]

    for argpatch in svm_args_patches:
        for argconfig in apply_dict_combinations({}, argpatch):
            base[u"classifier_args"] = argconfig
            for config in apply_dict_combinations(base, patch):
                yield config

    # Adaboost
    ###########

    # Reuse the same base dict, overriding only the classifier-related keys.
    base.update({
        u"classifier": u"adaboost",
        u"feature_selection_dimension": None,
        u"scaler": False,
    })

    patch = {
        u"train_percentage": [0.05 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"daddycool" + str(i) for i in range(10)]
    }
    argpatch = {
        u"n_estimators": [5, 10, 20, 50],
        u"learning_rate": [0.9, 1.0, 1.1],
        u"max_depth": [1, 2, 3]
    }
    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config
Example #9
0
# This file defines the default values for IEPY settings.
# These defaults are always used unless explicitly stated otherwise.
from iepy.utils import make_feature_list

# Default extractor settings: an "svc" classifier with no extra arguments,
# plus separate dense and sparse feature lists.
extractor_config = dict(
    classifier="svc",
    classifier_args=dict(),
    dense_features=make_feature_list("""
                entity_order
                entity_distance
                other_entities_in_between
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                """),
    sparse_features=make_feature_list("""
                bag_of_words
                bag_of_pos
                bag_of_words_in_between
                bag_of_pos_in_between
                """),
)
Example #10
0
def iter_configs(input_file_path, dbname):
    """Yield experiment configurations for an RBF sweep, then a POLY sweep.

    Both sweeps use the ``svm`` classifier and expand a grid over ``gamma``
    and ``class_weight``. The RBF sweep runs without feature selection; the
    POLY sweep (degree 4) switches ``feature_selection`` to ``"kbest"`` and
    additionally varies ``feature_selection_dimension``.

    Args:
        input_file_path: Path of the input file. It is stored in absolute
            form together with the MD5 hex digest of the file's bytes.
        dbname: Value stored under the ``database_name`` key.

    Yields:
        Each item produced by ``apply_dict_combinations(base, patch)``
        (presumably one configuration dict per combination -- confirm
        against that helper).
    """
    input_file_path = os.path.abspath(input_file_path)
    # Read the file inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_file_path, "rb") as input_file:
        hasher = hashlib.md5(input_file.read())
    base = {
        # Experiment configuration
        u"config_version": u"1",
        u"data_shuffle_seed": None,
        u"train_percentage": None,
        u"input_file_path": input_file_path,
        u"input_file_md5": hasher.hexdigest(),
        u"database_name": dbname,

        # Classifier configuration
        u"classifier": u"svm",
        u"classifier_args": dict(),
        u"dimensionality_reduction": None,
        u"dimensionality_reduction_dimension": None,
        u"feature_selection": None,
        u"feature_selection_dimension": None,
        u"scaler": True,
        u"sparse": False,
        u"features": make_feature_list(u"""
                bag_of_words
                bag_of_pos
                bag_of_word_bigrams
                bag_of_wordpos
                bag_of_wordpos_bigrams
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_word_bigrams_in_between
                bag_of_wordpos_in_between
                bag_of_wordpos_bigrams_in_between
                entity_order
                entity_distance
                other_entities_in_between
                in_same_sentence
                verbs_count_in_between
                verbs_count
                total_number_of_entities
                symbols_in_between
                number_of_tokens
                BagOfVerbStems True
                BagOfVerbStems False
                BagOfVerbLemmas True
                BagOfVerbLemmas False
        """)
    }

    # RBF
    ######
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)]
    }
    # class_weight entries are keyed by the boolean class label.
    argpatch = {
        u"kernel": [u"rbf"],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": [
            {True: 10, False: 1},
            {True: 1, False: 10},
            {True: 1, False: 1},
            {True: 1, False: 0.1},
            {True: 0.1, False: 1},
        ]
    }

    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config

    # POLY
    #######

    # From here on every configuration uses "kbest" feature selection; the
    # dimension is varied through the patch below.
    base[u"feature_selection"] = "kbest"
    patch = {
        u"train_percentage": [0.07 * x for x in range(1, 11)],
        u"data_shuffle_seed": [u"sussieq" + str(i) for i in range(20)],
        u"feature_selection_dimension": [500, 1000, 2000, 4000]
    }

    # Built separately from the RBF grid so the two sweeps share no objects.
    argpatch = {
        u"kernel": [u"poly"],
        u"degree": [4],
        u"gamma": [0.0, 1e-4, 1e-5, 1e-6],
        u"class_weight": [
            {True: 10, False: 1},
            {True: 1, False: 10},
            {True: 1, False: 1},
            {True: 1, False: 0.1},
            {True: 0.1, False: 1},
        ]
    }

    for argconfig in apply_dict_combinations({}, argpatch):
        base[u"classifier_args"] = argconfig
        for config in apply_dict_combinations(base, patch):
            yield config
Example #11
0
file will be added to the configurations.

"""
import os
import hashlib
from itertools import product
from copy import deepcopy

from iepy.utils import make_feature_list

from utils import apply_dict_combinations, check_configs

# Feature specification handed to make_feature_list: the "in between" bag
# features plus entity order/distance and the in-between entity and verb
# counts.
features = make_feature_list(u"""
                bag_of_words_in_between
                bag_of_pos_in_between
                bag_of_wordpos_in_between
                entity_order
                entity_distance
                other_entities_in_between
                verbs_count_in_between""")

# Proposing as classifiers the champions of the
# experimentation/classifier/config_round5.py
# Note: they will suffer small modifications (on feature_selection_dimension)
candidate_classifiers = [
    {
        u'classifier': u'svm',
        u'classifier_args': {
            u'class_weight': {
                u'false': 1,
                u'true': 1
            },
Example #12
0
File: core.py Project: copybin/iepy
    def __init__(self, db_connector, seed_facts, gold_standard=None):
        """
        Initialise the pipeline state from a set of seed facts.

        Not blocking.

        Every seed fact is wrapped in an ``Evidence`` and stored in the
        initial knowledge with value 1. The step list, the relation
        description and the default extractor configuration are also set up.

        Raises ``ValueError`` if two seed facts use the same relation label
        with different pairs of entity kinds.
        """
        self.db_con = db_connector
        self.gold_standard = gold_standard

        # Each seed fact becomes an Evidence entry mapped to the value 1.
        seed_evidence = dict(
            (Evidence(fact, None, None, None), 1) for fact in seed_facts)
        self.knowledge = Knowledge(seed_evidence)
        self.questions = Knowledge()
        self.answers = {}

        self.evidence_threshold = 0.99
        self.fact_threshold = 0.99

        # The None entry marks the pause where question answers are awaited.
        self.steps = [
            self.generalize_knowledge,   # Step 1
            self.generate_questions,     # Step 2, first half
            None,                        # Pause to wait question answers
            self.filter_evidence,        # Step 2, second half
            self.learn_fact_extractors,  # Step 3
            self.extract_facts,          # Step 5
            self.filter_facts,           # Step 6
            self.evaluate,               # Optional evaluation step
        ]
        self.step_iterator = itertools.cycle(self.steps)

        # Build relation description: a map from relation labels to pairs of
        # entity kinds.
        self.relations = {}
        for evidence in self.knowledge:
            fact = evidence.fact
            kinds = (fact.e1.kind, fact.e2.kind)
            relation = fact.relation
            if relation in self.relations:
                if kinds != self.relations[relation]:
                    raise ValueError(
                        "Ambiguous kinds for relation %r" % relation)
            self.relations[relation] = kinds

        # Classifier configuration
        extractor_features = make_feature_list("""
                    bag_of_words
                    bag_of_pos
                    bag_of_word_bigrams
                    bag_of_wordpos
                    bag_of_wordpos_bigrams
                    bag_of_words_in_between
                    bag_of_pos_in_between
                    bag_of_word_bigrams_in_between
                    bag_of_wordpos_in_between
                    bag_of_wordpos_bigrams_in_between
                    entity_order
                    entity_distance
                    other_entities_in_between
                    in_same_sentence
                    verbs_count_in_between
                    verbs_count
                    total_number_of_entities
                    symbols_in_between
                    number_of_tokens
                    BagOfVerbStems True
                    BagOfVerbStems False
                    BagOfVerbLemmas True
                    BagOfVerbLemmas False
            """)
        self.extractor_config = {
            "classifier": "dtree",
            "classifier_args": {},
            "dimensionality_reduction": None,
            "dimensionality_reduction_dimension": None,
            "feature_selection": None,
            "feature_selection_dimension": None,
            "scaler": False,
            "sparse": False,
            "features": extractor_features,
        }