Example #1
    def __init__(self, splitter=None, tokenizer=None, feature_generators=None):
        if not splitter:
            splitter = NLTKSplitter()
        if not tokenizer:
            tokenizer = TmVarTokenizer()
        if feature_generators is None:
            feature_generators = [SimpleFeatureGenerator(), PorterStemFeatureGenerator(),
                                  WindowFeatureGenerator((-3, -2, -1, 1, 2, 3), ['stem[0]'])]

        if isinstance(splitter, Splitter):
            self.splitter = splitter
        else:
            raise TypeError('not an instance that implements Splitter')

        if isinstance(tokenizer, Tokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError('not an instance that implements Tokenizer')

        if hasattr(feature_generators, '__iter__'):
            for index, feature_generator in enumerate(feature_generators):
                if not isinstance(feature_generator, FeatureGenerator):
                    raise TypeError('not an instance that implements FeatureGenerator at index {}'.format(index))
            self.feature_generators = feature_generators
        elif isinstance(feature_generators, FeatureGenerator):
            self.feature_generators = [feature_generators]
        else:
            raise TypeError('not an instance or iterable of instances that implements FeatureGenerator')
Example #2
 def setUp(self):
     self.dataset = StringReader(
         'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
     NLTKSplitter().split(self.dataset)
     TmVarTokenizer().tokenize(self.dataset)
     part = list(self.dataset.parts())[0]
     part.annotations.append(
         Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
     part.annotations.append(
         Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
Example #3
    def test_generate_patterns_245(self):
        dataset = StringReader('token c.A436C token').read()
        NLTKSplitter().split(dataset)
        TmVarTokenizer().tokenize(dataset)
        TmVarDictionaryFeatureGenerator().generate(dataset)

        token_features = [{key: value for key, value in token.features.items() if value != 'O'}
                          for token in dataset.tokens()]
        self.assertEqual(token_features[0], {})
        self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
        self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
        self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
        self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
        self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
        self.assertEqual(token_features[6], {})
Example #4
 def test_tag(self):
     # TODO: is this the proper way, with predicts_classes?
     GNormPlusGeneTagger().tag(self.data, uniprot=True)
     NLTKSplitter().split(self.data)
     TmVarTokenizer().tokenize(self.data)
     StubSameSentenceRelationExtractor(PRO_CLASS_ID, MUT_CLASS_ID, PRO_REL_MUT_CLASS_ID).annotate(self.data)
     self.assertEqual(len([x for x in self.data.annotations() if x.class_id == PRO_CLASS_ID]), 0)
     self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 2)
     self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
     self.data.purge_false_relationships()
     self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
     del self.data.documents['15878741'].parts['abstract'].annotations[0]
     self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 1)
     self.data.purge_false_relationships()
     self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
Example #5
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 15 tokens in 2 sentences
        doc_id1.parts['p1'] = Part(
            'this is some sample text. it contains this c.2708_2711delTTAG mutation.'
        )
        doc_id1.parts['p1'].sentences_ = [
            'this is some sample text.',
            'it contains this c.2708_2711delTTAG mutation.'
        ]

        cls.dataset.documents['doc_id1'] = doc_id1

        cls.tokenizer = TmVarTokenizer()
        cls.tokenizer.tokenize(cls.dataset)
Example #6
def get_prepare_pipeline_for_best_model_general(use_windows=True, we_params=None, dictionaries_paths=None,
                                                hdfs_url=None, hdfs_user=None, dictionaries_stop_words=None):
    """
    Helper method that returns an instance of PrepareDatasetPipeline
    which uses the best configuration for predicating any-domain mentions.

    if we_params is empty dict, no we is applied

    :returns nalaf.structures.dataset_pipelines.PrepareDatasetPipeline
    """

    # MAYBE ml-performance: use more general-domain tokenizer such as NLTK's
    tokenizer = TmVarTokenizer()

    default_we_params = {'additive': None, 'multiplicative': None, 'location': None}
    we_params = default_we_params if we_params is None else we_params

    generators = [
        SpacyLemmatizer(),
        SpacyPosTagger(),
        SentenceMarkerFeatureGenerator(),
        TmVarFeatureGenerator(get_mutation_features=False)
    ]

    windows_include = []

    if dictionaries_paths:
        if isinstance(dictionaries_paths, str):
            dictionaries_paths = [x.strip() for x in dictionaries_paths.split(",")]

        dics_feat_generators = DictionaryFeatureGenerator.construct_all_from_paths(
            dictionaries_paths=dictionaries_paths, string_tokenizer=tokenizer.tokenize_string,
            case_sensitive=False, hdfs_url=hdfs_url, hdfs_user=hdfs_user,
            stop_words=dictionaries_stop_words)
        generators.extend(dics_feat_generators)
        for dic in dics_feat_generators:
            windows_include.append(dic.key + "[0]")

    if use_windows:
        windows_include.extend(['stem[0]', 'pos[0]'])
        f = WindowFeatureGenerator(template=(-2, -1, 1, 2), include_list=windows_include)
        generators.append(f)

    if we_params:
        generators.append(get_word_embeddings_feature_generator(
            we_params['location'], we_params['additive'], we_params['multiplicative']))

    return PrepareDatasetPipeline(tokenizer=tokenizer, feature_generators=generators)
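
A minimal usage sketch of the helper above, assuming the returned PrepareDatasetPipeline exposes an execute(dataset) method; the corpus name follows Example #8, and an empty we_params dict disables the word-embeddings features:

# Usage sketch; execute(dataset) and the corpus name are assumptions based on the other examples.
from nala.utils.corpora import get_corpus

data = get_corpus('nala_training_1')

pipeline = get_prepare_pipeline_for_best_model_general(use_windows=True, we_params={})
pipeline.execute(data)  # splits/tokenizes the dataset and generates token features in place (assumed)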
Example #7
    def setUpClass(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()

        doc_id1.parts['t1'] = Part('This title blows your mind')

        text = str(
            'This magic only exists in your dreams. To become reality, you have to work at it. '
            'Thr is only available with the residue threonine and a mutation, '
            'though things can change positions '
            'when adding some more replacements. Between me being sorry '
            'and you being an insertion.')
        doc_id1.parts['p1'] = Part(text.replace('\n', ''))

        cls.dataset.documents['doc_id1'] = doc_id1

        NLTKSplitter().split(cls.dataset)
        TmVarTokenizer().tokenize(cls.dataset)

        cls.feature = NLMentionFeatureGenerator(thr=4)
        cls.feature.generate(dataset=cls.dataset)
Example #8
from nala.utils.corpora import get_corpus
from nalaf.preprocessing.spliters import NLTKSplitter
from nalaf.preprocessing.tokenizers import TmVarTokenizer
from nalaf.features.embeddings import BinarizedWordEmbeddingsFeatureGenerator

data = get_corpus('nala_training_1')

NLTKSplitter().split(data)
TmVarTokenizer().tokenize(data)

BinarizedWordEmbeddingsFeatureGenerator(
    '/home/abojchevski/projects/nala/nala/data/word_embeddings_2016-03-28/word_embeddings.model'
).generate(data)

for token in data.tokens():
    print(token.features, token.end)
Example #9
 def __init__(self):
     self.data = get_corpus('IDP4+')
     NLTKSplitter().split(self.data)
     TmVarTokenizer().tokenize(self.data)
Example #10
    def __init__(self,
                 entity1_class,
                 entity2_class,
                 rel_type,
                 sentence_distance=0,
                 selected_features_file=None,
                 feature_generators=None,
                 pipeline=None,
                 use_predicted_entities=False,
                 execute_pipeline=True,
                 model=None,
                 **model_params):

        super().__init__(entity1_class, entity2_class, rel_type)

        self.sentence_distance = sentence_distance
        edge_generator = SentenceDistanceEdgeGenerator(
            entity1_class,
            entity2_class,
            rel_type,
            distance=self.sentence_distance,
            use_gold=not use_predicted_entities,
            use_pred=use_predicted_entities,
        )

        if selected_features_file:
            self.feature_set = FeatureDictionary(is_locked=False)
            selected_features = unpickle_beautified_file(
                selected_features_file)
            # sort to make the order of feature insertion deterministic
            for selected in sorted(selected_features):
                self.feature_set[selected] = len(self.feature_set)
            self.feature_set.is_locked = True

        else:
            self.feature_set = None

        if pipeline:
            feature_generators = pipeline.feature_generators
        elif feature_generators is not None:  # Trick: if [], this will use pipeline's default generators
            feature_generators = feature_generators
        else:
            feature_generators = self.feature_generators()

        self.pipeline = pipeline if pipeline \
            else RelationExtractionPipeline(
                entity1_class, entity2_class, rel_type,
                tokenizer=TmVarTokenizer(),
                edge_generator=edge_generator,
                feature_set=self.feature_set,
                feature_generators=feature_generators)

        assert feature_generators == self.pipeline.feature_generators or feature_generators == [], \
            str((feature_generators, self.pipeline.feature_generators))

        self.execute_pipeline = execute_pipeline

        # With the following two settings we try to force the model to always give the same results
        # between runs and to avoid slight variations due to different random-generator initializations.

        if not model_params.get("tol"):
            # As of 2017-Feb-7, default in SVC is 1e-3: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
            model_params["tol"] = 1e-5

        if not model_params.get("random_state"):
            # TODO set with this
            model_params["random_state"] = 2727
            pass

        self.model = model if model else SklSVM(**model_params)
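
For reference, a sketch of how a constructor with this signature might be invoked; the class name used here is hypothetical and the class-ID constants are borrowed from Example #4:

# Hypothetical instantiation; the class name is an assumption, the constants are as in Example #4.
extractor = SentenceDistanceRelationExtractor(
    PRO_CLASS_ID, MUT_CLASS_ID, PRO_REL_MUT_CLASS_ID,
    sentence_distance=0,            # only pair entities within the same sentence
    use_predicted_entities=False,   # build edges from gold annotations
)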