import logging
import os

# Assumed to be provided elsewhere in this module: the TnT tagger class,
# the raw_tokenize() and paragraphs() tokenizer helpers, and _XLT, a dict
# mapping token texts to the forms the tagger expects.

# Singleton TnT tagger instance, loaded lazily on first use
_TAGGER = None


def ifd_tag(text):
    """ Tokenize the given text and use a global singleton TnT tagger to tag it """
    global _TAGGER
    if _TAGGER is None:
        # Load the tagger from a pickle the first time it's used
        model_path = "config" + os.sep + "TnT-model.pickle"
        logging.info("Loading TnT model from {0}".format(model_path))
        _TAGGER = TnT.load(model_path)
        if _TAGGER is None:
            return []  # No tagger model - unable to tag

    token_stream = raw_tokenize(text)
    result = []

    def xlt(txt):
        """ Translate the token text as required before tagging it """
        if txt[0] == '[' and txt[-1] == ']':
            # Abbreviation enclosed in square brackets: remove 'em
            return txt[1:-1]
        return _XLT.get(txt, txt)

    for pg in paragraphs(token_stream):
        for _, sent in pg:
            # Drop empty tokens and translate the rest before tagging
            toklist = [xlt(t.txt) for t in sent if t.txt]
            tagged = _TAGGER.tag(toklist)
            result.append(tagged)

    # Return a flat list of tagged sentences, each as produced by _TAGGER.tag()
    return result
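
# Usage sketch (illustrative, not part of the original module): the example
# text and the __main__ guard below are assumptions for demonstration only.
# ifd_tag() yields one entry per sentence, each as produced by _TAGGER.tag(),
# typically a list of (token, IFD-tag) pairs.
if __name__ == "__main__":
    for tagged_sentence in ifd_tag("Hér er stutt setning. Og önnur til."):
        print(tagged_sentence)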