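# Imports assumed by the snippets below; the Tokenizer path follows the
# strephit project layout and TaggerProcessPoll ships with treetaggerwrapper
# in the treetaggerpoll module (both are assumptions about the environment).
import logging

from treetaggerwrapper import TreeTagger, make_tags, NotTag
from treetaggerpoll import TaggerProcessPoll

from strephit.commons.tokenize import Tokenizer

logger = logging.getLogger(__name__)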
def __init__(self, corpus, document_key, sentences_key, language,
             lemma_to_token, match_base_form):
    """ Initializes the extractor.

        :param iterable corpus: The corpus, iterable of `dict`s
        :param str document_key: The key from which to retrieve the textual document
        :param str sentences_key: The key to which the extracted sentences should be stored
        :param str language: The language the text is in
        :param dict lemma_to_token: Mapping from lemma to list of tokens
        :param bool match_base_form: Whether the base form (the lemma itself) should also be matched
    """
    self.corpus = corpus
    self.sentences_key = sentences_key
    self.document_key = document_key
    self.language = language
    # Keep the mapping as-is when base forms should be matched too,
    # otherwise strip them out
    self.lemma_to_token = lemma_to_token if match_base_form else self._filter_base_form(lemma_to_token)
    self.tokenizer = Tokenizer(self.language)
    self.tagger = TTPosTagger(self.language)
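# A minimal sketch of the _filter_base_form helper called above, which is not
# shown in this snippet. The assumption is that it drops each bare lemma (the
# base form) from its own token list, so only inflected forms are matched;
# the actual implementation may differ.
def _filter_base_form(self, lemma_to_token):
    return {
        lemma: [token for token in tokens if token.lower() != lemma.lower()]
        for lemma, tokens in lemma_to_token.items()
    }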
class TTPosTagger(object):
    """ Part-of-speech tagger implemented using TreeTagger and treetaggerwrapper """

    def __init__(self, language, tt_home=None, **kwargs):
        self.language = language
        self.tt_home = tt_home
        self.tokenizer = Tokenizer(language)
        self.tagger = TreeTagger(
            TAGLANG=language,
            TAGDIR=tt_home,
            # Explicit TAGOPT: the default has the '-no-unknown' option,
            # which prints the token rather than '<unknown>' for unknown lemmas.
            # We'd rather skip unknown lemmas, as they are likely to be wrong tags
            TAGOPT=u'-token -lemma -sgml -quiet',
            # Use our tokenization logic (CHUNKERPROC here)
            CHUNKERPROC=self._tokenizer_wrapper,
            **kwargs
        )

    def _tokenizer_wrapper(self, tagger, text_list):
        """ Wrap the tokenization logic with the signature required by the TreeTagger CHUNKERPROC kwarg """
        tokens = []
        for text in text_list:
            for token in self.tokenizer.tokenize(text):
                tokens.append(token)
        return tokens

    def _postprocess_tags(self, tags, skip_unknown=True):
        """ Clean tagged data from non-tags and, optionally, unknown lemmas """
        clean_tags = []
        for tag in tags:
            # Always drop non-tags; drop unknown lemmas only when requested
            if isinstance(tag, NotTag) or (skip_unknown and tag.lemma == u'<unknown>'):
                logger.debug("Skipping non-tag or unknown lemma: %r", tag)
                continue
            clean_tags.append(tag)
        return clean_tags

    def tokenize(self, text):
        """ Splits a text into tokens """
        return self.tokenizer.tokenize(text)

    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)

    def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
        """ POS-Tags many text documents of the given items. Use this for massive text tagging

            :param items: Iterable of items to tag. Generator preferred
            :param document_key: Where to find the text to tag inside each item. Text must be unicode
            :param pos_tag_key: Where to put the POS-tagged text
            :param batch_size: How many tagging jobs to enqueue before collecting the results

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(list(TTPosTagger('en').tag_many(
            ...     [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
            ...     'text', 'tagged'
            ... )))
            [{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
                         Tag(word=u'one', pos=u'CD', lemma=u'one'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'in', pos=u'IN', lemma=u'in'),
                         Tag(word=u'first', pos=u'JJ', lemma=u'first'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position')],
              'text': u'Item one is in first position'},
             {'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
                         Tag(word=u'the', pos=u'DT', lemma=u'the'),
                         Tag(word=u'second', pos=u'JJ', lemma=u'second'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'item', pos=u'RB', lemma=u'item'),
                         Tag(word=u'two', pos=u'CD', lemma=u'two')],
              'text': u'In the second position is item two'}]
        """
        tt_pool = TaggerProcessPoll(
            TAGLANG=self.language,
            TAGDIR=self.tt_home,
            TAGOPT=u'-token -lemma -sgml -quiet',
            CHUNKERPROC=self._tokenizer_wrapper
        )
        logging.getLogger('TreeTagger').setLevel(logging.WARNING)
        try:
            jobs = []
            for item in items:
                if not item.get(document_key):
                    continue

                jobs.append((item, tt_pool.tag_text_async(item[document_key], **kwargs)))
                # Flush the batch once it is full
                if len(jobs) == batch_size:
                    for each in self._finalize_batch(jobs, pos_tag_key):
                        yield each
                    jobs = []

            for each in self._finalize_batch(jobs, pos_tag_key):
                yield each
        finally:
            tt_pool.stop_poll()

    def _finalize_batch(self, jobs, pos_tag_key):
        for item, job in jobs:
            job.wait_finished()
            item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
            yield item
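# A minimal usage sketch (not part of the original code) for tagging a large
# corpus with the class above; 'corpus.jsonl' and the 'text'/'tagged' keys
# are hypothetical placeholders.
import json

def read_corpus(path):
    """ Stream one JSON document per line, so the whole corpus never sits in memory """
    with open(path) as f:
        for line in f:
            yield json.loads(line)

if __name__ == '__main__':
    tagger = TTPosTagger('en')
    for item in tagger.tag_many(read_corpus('corpus.jsonl'), 'text', 'tagged', batch_size=1000):
        print(item['tagged'])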
class TTPosTagger(object):
    """ Part-of-speech tagger implemented using TreeTagger and treetaggerwrapper """

    def __init__(self, language, tt_home=None, **kwargs):
        self.language = language
        self.tt_home = tt_home
        self.tokenizer = Tokenizer(language)
        self.tagger = TreeTagger(
            TAGLANG=language,
            TAGDIR=tt_home,
            # Explicit TAGOPT: the default has the '-no-unknown' option,
            # which prints the token rather than '<unknown>' for unknown lemmas.
            # We'd rather skip unknown lemmas, as they are likely to be wrong tags
            TAGOPT=u'-token -lemma -sgml -quiet',
            # Use our tokenization logic (CHUNKERPROC here)
            CHUNKERPROC=self._tokenizer_wrapper,
            **kwargs
        )

    def _tokenizer_wrapper(self, tagger, text_list):
        """ Wrap the tokenization logic with the signature required by the TreeTagger CHUNKERPROC kwarg """
        tokens = []
        for text in text_list:
            for token in self.tokenizer.tokenize(text):
                tokens.append(token)
        return tokens

    def _postprocess_tags(self, tags, skip_unknown=True):
        """ Clean tagged data from non-tags and, optionally, unknown lemmas """
        clean_tags = []
        for tag in tags:
            # Always drop non-tags; drop unknown lemmas only when requested
            if isinstance(tag, NotTag) or (skip_unknown and tag.lemma == u'<unknown>'):
                logger.debug("Skipping non-tag or unknown lemma: %r", tag)
                continue
            clean_tags.append(tag)
        return clean_tags

    def tokenize(self, text):
        """ Splits a text into tokens """
        return self.tokenizer.tokenize(text)

    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)

    def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
        """ POS-Tags many text documents of the given items. Use this for massive text tagging

            :param items: Iterable of items to tag. Generator preferred
            :param document_key: Where to find the text to tag inside each item. Text must be unicode
            :param pos_tag_key: Where to put the POS-tagged text
            :param batch_size: How many tagging jobs to enqueue before collecting the results

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(list(TTPosTagger('en').tag_many(
            ...     [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
            ...     'text', 'tagged'
            ... )))
            [{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
                         Tag(word=u'one', pos=u'CD', lemma=u'one'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'in', pos=u'IN', lemma=u'in'),
                         Tag(word=u'first', pos=u'JJ', lemma=u'first'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position')],
              'text': u'Item one is in first position'},
             {'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
                         Tag(word=u'the', pos=u'DT', lemma=u'the'),
                         Tag(word=u'second', pos=u'JJ', lemma=u'second'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'item', pos=u'RB', lemma=u'item'),
                         Tag(word=u'two', pos=u'CD', lemma=u'two')],
              'text': u'In the second position is item two'}]
        """
        try:
            tt_pool = TaggerProcessPoll(
                TAGLANG=self.language,
                TAGDIR=self.tt_home,
                TAGOPT=u'-token -lemma -sgml -quiet',
                CHUNKERPROC=self._tokenizer_wrapper
            )
        except TypeError:
            # The installed treetaggerwrapper may not accept these kwargs
            logger.warning('failed to initialize the TreeTagger process pool, '
                           'falling back to single-process tagging')
            for each in items:
                text = each.get(document_key)
                if text:
                    each[pos_tag_key] = self.tag_one(text, **kwargs)
                yield each
        else:
            logging.getLogger('TreeTagger').setLevel(logging.WARNING)
            try:
                jobs = []
                for item in items:
                    if not item.get(document_key):
                        continue

                    jobs.append((item, tt_pool.tag_text_async(item[document_key], **kwargs)))
                    # Flush the batch once it is full
                    if len(jobs) == batch_size:
                        for each in self._finalize_batch(jobs, pos_tag_key):
                            yield each
                        jobs = []

                for each in self._finalize_batch(jobs, pos_tag_key):
                    yield each
            finally:
                tt_pool.stop_poll()

    def _finalize_batch(self, jobs, pos_tag_key):
        for item, job in jobs:
            job.wait_finished()
            item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
            yield item
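# A quick illustration (not part of the original code) of what _postprocess_tags
# filters out, assuming treetaggerwrapper's Tag and NotTag namedtuples; the
# values are made up, and instantiating TTPosTagger requires a local TreeTagger
# installation.
if __name__ == '__main__':
    from treetaggerwrapper import Tag
    tagger = TTPosTagger('en')
    tags = [
        Tag(word=u'dog', pos=u'NN', lemma=u'dog'),
        Tag(word=u'fycgvkuhbj', pos=u'NN', lemma=u'<unknown>'),  # dropped when skip_unknown=True
        NotTag(what=u'<!-- not a tag -->'),                      # always dropped
    ]
    print(tagger._postprocess_tags(tags))  # [Tag(word=u'dog', pos=u'NN', lemma=u'dog')]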