Ejemplo n.º 1
0
 def test_patterns(self):
     """Assert that patterns 0 through 10 of the generator each match a sample.

     The samples are positional: the string at position ``i`` is matched
     against ``patterns[i]`` of a fresh TmVarDictionaryFeatureGenerator.
     """
     generator = TmVarDictionaryFeatureGenerator()

     # Order matters: position in this tuple is the pattern index under test.
     samples = (
         'c.2708_2711delTTAG',
         'IVS2-58_55insT',
         'c.467C>A',
         'IVS3+18C>T ',
         'c.A436C',
         'A436C',
         '912delTA',
         'p.G204VfsX28',
         'p.G204V',
         'p.Ser157Ser',
         'p.Ser119fsX',
     )

     for index, sample in enumerate(samples):
         self.assertTrue(generator.patterns[index].match(sample))
Ejemplo n.º 2
0
def get_prepare_pipeline_for_best_model(use_windows=True,
                                        we_params=None,
                                        nl_features=None):
    """
    Build the PrepareDatasetPipeline configured with the best known setup
    for predicting mutation mentions.

    :param use_windows: when true, a WindowFeatureGenerator over the offsets
        -4..-1 and 1..4 is appended for the pattern and stem features.
    :param we_params: dict with the keys 'location', 'additive' and
        'multiplicative' that is forwarded to
        get_word_embeddings_feature_generator. ``None`` falls back to a dict
        holding ``None`` for all three keys; an empty dict disables word
        embeddings altogether.
    :param nl_features: optional dict that must provide the keys 'threshold'
        and 'window'. When truthy, an NLMentionFeatureGenerator built from
        the threshold is appended.

    :returns nalaf.structures.dataset_pipelines.PrepareDatasetPipeline
    """

    if we_params is None:
        # Non-empty on purpose: the embeddings generator is still requested,
        # just with all of its arguments set to None.
        we_params = {
            'additive': None,
            'multiplicative': None,
            'location': None
        }

    pipeline_generators = [
        SpacyLemmatizer(),
        SentenceMarkerFeatureGenerator(),
        TmVarFeatureGenerator(get_mutation_features=True),
        TmVarDictionaryFeatureGenerator(),
    ]

    # Feature names handed to the window generator (if one is created).
    windowed_features = []

    if nl_features:
        nl_generator = NLMentionFeatureGenerator(nl_features['threshold'])
        if nl_features['window']:
            # Only consumed further down when use_windows is true.
            windowed_features += ['tag_dict[0]', 'nl_tag_dict[0]']
        pipeline_generators.append(nl_generator)

    if use_windows:
        windowed_features += ['pattern{}[0]'.format(i) for i in range(11)]
        windowed_features.append('stem[0]')
        pipeline_generators.append(
            WindowFeatureGenerator(template=(-4, -3, -2, -1, 1, 2, 3, 4),
                                   include_list=windowed_features))

    if we_params:
        embeddings_generator = get_word_embeddings_feature_generator(
            we_params['location'],
            we_params['additive'],
            we_params['multiplicative'])
        pipeline_generators.append(embeddings_generator)

    return PrepareDatasetPipeline(feature_generators=pipeline_generators)
Ejemplo n.º 3
0
    def test_generate_patterns_245(self):
        """Check the pattern2/pattern4/pattern5 tags produced for 'c.A436C'.

        The text 'token c.A436C token' is read, sentence-split and tokenized,
        then run through TmVarDictionaryFeatureGenerator. For every token the
        features whose value is not 'O' are compared against the expected
        B/I/E tags.
        """
        dataset = StringReader('token c.A436C token').read()
        NLTKSplitter().split(dataset)
        TmVarTokenizer().tokenize(dataset)
        TmVarDictionaryFeatureGenerator().generate(dataset)

        # Compare by value (!=), not identity: `is not 'O'` only works when the
        # interpreter happens to intern both strings, and it triggers a
        # SyntaxWarning on Python 3.8+.
        token_features = [
            {key: value for key, value in token.features.items() if value != 'O'}
            for token in dataset.tokens()
        ]

        self.assertEqual(token_features[0], {})
        self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
        self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
        self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
        self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
        self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
        self.assertEqual(token_features[6], {})