def testWalk(self):
    """Walking a tokenized sentence tags the dictionary phrase B/I and all other tokens O."""
    tok = DictionaryTests.tokenizer
    dictionary = Dictionary([('key', 'the term', 42)], tok)
    sentence = "Here is the term we're looking for."
    words = [sentence[begin:end] for begin, end, _tag, _ortho in tok.tokenize(sentence)]
    tags = list(dictionary.walk(words))
    outside = Dictionary.O
    first = Dictionary.B % 'key'
    inside = Dictionary.I % 'key'
    # "the term" spans tokens 2-3; everything else is outside
    self.assertEqual(
        tags,
        [outside, outside, first, inside,
         outside, outside, outside, outside, outside, outside],
    )
def testCapitalizationAlts(self):
    """Case-variant surface forms ("neuroD"/"neuroD2") still match the dictionary entries."""
    tok = DictionaryTests.tokenizer
    dictionary = Dictionary(
        [('NEUROD1', 'NEUROD', 100), ('NEUROD2', 'NEUROD2', 100)], tok
    )
    sentence = "Transfection of vectors expressing neuroD and neuroD2 into P19 cells."
    words = [sentence[begin:end] for begin, end, _tag, _ortho in tok.tokenize(sentence)]
    tags = list(dictionary.walk(words))
    outside = Dictionary.O
    first = Dictionary.B % 'NEUROD'
    inside = Dictionary.I % 'NEUROD'
    # "neuroD" maps to key NEUROD1, "neuroD2" to key NEUROD2
    expected = [
        outside, outside, outside, outside,
        first + "1", inside + "1",
        outside,
        first + "2", inside + "2", inside + "2",
        outside, outside, outside, outside, outside,
    ]
    self.assertEqual(tags, expected)
def testFullWalk(self):
    """With overlapping entries, the longest/highest-priority match ('key') wins and the
    apostrophe entry ('apo') is tagged right after it."""
    tok = DictionaryTests.tokenizer
    dictionary = Dictionary(
        [('alt', 'the term we', 84),
         ('key', 'the term we', 42),
         ('apo', "'", 1),
         ('part', 'term we', 21)],
        tok,
    )
    sentence = "Here is the term we're looking for."
    words = [sentence[begin:end] for begin, end, _tag, _ortho in tok.tokenize(sentence)]
    tags = list(dictionary.walk(words))
    outside = Dictionary.O
    first = Dictionary.B % 'key'
    inside = Dictionary.I % 'key'
    apostrophe = Dictionary.B % 'apo'
    self.assertEqual(
        tags,
        [outside, outside, first, inside, inside, apostrophe,
         outside, outside, outside, outside],
    )
def testExamples(self):
    """Realistic sentences: hyphenated/Greek-letter gene names tag correctly per entry."""
    tok = DictionaryTests.tokenizer
    dictionary = Dictionary(
        [('NR1D1', 'rev erb α', 1),
         ('NR1D1', 'rev erb alpha', 1),
         ('PPARA', 'PPAR', 1)],
        tok,
    )
    outside = Dictionary.O
    first = Dictionary.B
    inside = Dictionary.I
    cases = [
        (
            "A functional Rev-erb alpha responsive element located in the human Rev-erb alpha promoter mediates a repressing activity.",
            [outside, outside, first % 'NR1D1', inside % 'NR1D1', inside % 'NR1D1',
             outside, outside, outside, outside, outside, outside,
             first % 'NR1D1', inside % 'NR1D1', inside % 'NR1D1',
             outside, outside, outside, outside, outside, outside],
        ),
        (
            "A positive PPAR-response element in the human apoA-I promoter nonfunctional in rats.",
            [outside, outside, first % 'PPARA',
             outside, outside, outside, outside, outside, outside, outside,
             outside, outside, outside, outside, outside, outside],
        ),
    ]
    for sentence, expected in cases:
        words = [sentence[begin:end]
                 for begin, end, _tag, _ortho in tok.tokenize(sentence)]
        tags = list(dictionary.walk(words))
        # check the count first so a length mismatch fails with a clear message
        self.assertEqual(len(tags), len(expected))
        self.assertEqual(tags, expected)
def testCreateDictionary(self):
    """Building a dictionary yields a trie of Nodes keyed by token, with the
    (counts, key) payload stored at the leaf."""
    dictionary = Dictionary([('key', 'The Term', 42, 21)],
                            DictionaryTests.tokenizer)
    # expected trie: root -> "The" -> "Term" -> leaf carrying ([42, 21], 'key')
    leaf = Node(([42, 21], 'key'))
    expected_root = Node(The=Node(Term=leaf))
    self.assertEqual(dictionary.root, expected_root)
else: parser.error("unknown output option " + args.output) method = lambda *args: None try: pos_tagger = GeniaTagger() ner_tagger = NerSuite(args.model) qualifier_list = [l.strip() for l in args.qranks] raw_dict_data = [ dictionaryReader(d, qualifier_list, args.separator) for d in args.dictionary ] # a tokenizer that skips Unicode Categories Zs and Pd: tokenizer = WordTokenizer(skipTags={'space'}, skipOrthos={'e'}) dictionaries = [ Dictionary(stream, tokenizer) for stream in raw_dict_data ] logging.info("initialized %s dictionaries", len(dictionaries)) lst = [dictionaries, tokenizer, pos_tagger, ner_tagger] kwds = dict(sep=args.separator, tag_all_nouns=args.nouns, use_greek_letters=args.greek) if args.files: lst.append(args.files) else: lst.append([sys.stdin]) method(*lst, **kwds) del ner_tagger