def predict(self, text: Iterable[list[str]]) -> list[list[str]]:
    # Annotate each tokenized sentence with the wrapped polyglot model and
    # keep only the predicted entity tag for every token.
    # Note: the annotation needs `from collections.abc import Iterable`;
    # the original `Generator[list[str]]` is not a valid typing form.
    preds = []
    for words in text:
        word_ents = list(self.model.annotate(WordList(words, language='da')))
        preds.append([ent for word, ent in word_ents])
    return preds
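# For illustration, a minimal, hypothetical host class for the predict()
# method above -- the class name, constructor, and `model` attribute are
# assumptions; only predict()'s body comes from the snippet itself.
from collections.abc import Iterable

from polyglot.tag import NEChunker
from polyglot.text import WordList


class PolyglotNerSketch:
    def __init__(self):
        # Requires polyglot's Danish models, e.g. downloader.download('ner2.da').
        self.model = NEChunker(lang='da')

    def predict(self, text: Iterable[list[str]]) -> list[list[str]]:
        preds = []
        for words in text:
            word_ents = list(self.model.annotate(WordList(words, language='da')))
            preds.append([ent for word, ent in word_ents])
        return preds


# Expected shape: one BIO tag per input token, e.g. [['O', 'O', 'I-LOC']].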
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        ne_chunker = NEChunker(lang='da')
        word_ent_tuples = list(ne_chunker.annotate(word_list))
        predictions.append([entity for word, entity in word_ent_tuples])
    print('polyglot:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == len(sentences_entities)

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
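# remove_miscs() is defined elsewhere in the benchmark; a plausible sketch,
# ASSUMING it maps B-MISC/I-MISC tags to 'O' so that the gold annotations and
# polyglot's predictions (which cover only PER/LOC/ORG) share one tagset:
def remove_miscs(sentences: list[list[str]]) -> list[list[str]]:
    return [
        ['O' if 'MISC' in tag else tag for tag in sentence]
        for sentence in sentences
    ]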
def benchmark_polyglot_mdl(corrected_output=False):
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    def udify_tag(tag, word):
        # Map polyglot's older universal tagset onto the UD tagset used
        # by the gold annotations.
        if tag == "CONJ":
            return "CCONJ"
        if tag == "VERB" and word in auxiliary_verbs:
            return "AUX"
        return tag

    start = time.time()

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        tagger = POSTagger(lang='da')
        word_tag_tuples = list(tagger.annotate(word_list))
        tags_pred.append([
            udify_tag(tag, word) if corrected_output else tag
            for word, tag in word_tag_tuples
        ])
    print('**Polyglot model'
          + (' (corrected output)' if corrected_output else '')
          + '**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
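# Polyglot's tagger emits the older "universal" tagset, so the correction
# above maps CONJ to UD's CCONJ and re-labels Danish auxiliaries as AUX.
# A tiny illustration; the contents of `auxiliary_verbs` are an assumption:
auxiliary_verbs = {'er', 'var', 'har', 'havde', 'bliver', 'blev'}

def udify_tag(tag, word):
    if tag == "CONJ":
        return "CCONJ"
    if tag == "VERB" and word in auxiliary_verbs:
        return "AUX"
    return tag

print(udify_tag("CONJ", "og"))    # -> CCONJ
print(udify_tag("VERB", "er"))    # -> AUX
print(udify_tag("NOUN", "hund"))  # -> NOUN (unchanged)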
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        ne_chunker = NEChunker(lang='da')
        word_ent_tuples = list(ne_chunker.annotate(word_list))
        predictions.append([entity for word, entity in word_ent_tuples])
    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens, time.time() - start))

    assert len(predictions) == len(sentences_entities)

    print(classification_report(sentences_entities, remove_miscs(predictions),
                                digits=4))
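# Both NER benchmarks also need polyglot's Danish resources on disk; one way
# to fetch them from Python, using polyglot's bundled downloader:
from polyglot.downloader import downloader

downloader.download('embeddings2.da')  # Danish word embeddings
downloader.download('ner2.da')         # Danish NE-chunker model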
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    start = time.time()

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        # This is a POS tagger, not an NE chunker, so name it accordingly.
        tagger = POSTagger(lang='da')
        word_tag_tuples = list(tagger.annotate(word_list))
        tags_pred.append([tag for word, tag in word_tag_tuples])
    print('**Polyglot model**')
    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens, time.time() - start))

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(classification_report(tags_true, tags_pred, digits=4))
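# Standalone illustration of the tagging call used above (fetch the Danish
# model first, e.g. downloader.download('pos2.da')); the tags shown in the
# trailing comment are an assumption about the model's output:
from polyglot.tag import POSTagger
from polyglot.text import WordList

tagger = POSTagger(lang='da')
words = WordList(['Hun', 'læser', 'en', 'bog'], language='da')
print(list(tagger.annotate(words)))  # e.g. [('Hun', 'PRON'), ('læser', 'VERB'), ...]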
def tokens(self):
    """Return a list of tokens, using this blob's tokenizer object
    (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
    """
    seq = self.word_tokenizer.transform(Sequence(self.raw))
    tokens = WordList(seq.tokens(), parent=self, language=self.language.code)

    # Re-join hyphenated compounds the tokenizer split apart: a '-' token
    # followed by a non-punctuation token is glued onto the previous output
    # token, so 'e', '-', 'mail' becomes 'e-mail'.
    fix_hyphen = []
    i = 0
    while i < len(tokens):
        if fix_hyphen and tokens[i] == '-' and i + 1 < len(tokens) \
                and tokens[i + 1] not in string.punctuation:
            fix_hyphen[-1] += tokens[i] + tokens[i + 1]
            i += 2  # skip both the hyphen and the word just consumed
        else:
            fix_hyphen.append(tokens[i])
            i += 1

    if self.split_apostrophe:
        # Split tokens on apostrophes, keeping the apostrophe itself as a
        # separate token between the pieces.
        fix_apostrophe = []
        for token in fix_hyphen:
            if '\'' in token:
                split = token.split('\'')
                for j, t in enumerate(split):
                    fix_apostrophe.append(t)
                    if j != len(split) - 1:
                        fix_apostrophe.append('\'')
            else:
                fix_apostrophe.append(token)
        return WordList(fix_apostrophe, parent=self,
                        language=self.language.code)
    return WordList(fix_hyphen, parent=self, language=self.language.code)
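# The hyphen repair from tokens() as a standalone, testable function; the
# sample input and expected output below are illustrative assumptions.
# Note the i += 2 in the merge branch: the original advanced i by only 1
# there, which re-emitted the word just consumed into the compound.
import string


def repair_hyphens(tokens):
    fixed = []
    i = 0
    while i < len(tokens):
        if fixed and tokens[i] == '-' and i + 1 < len(tokens) \
                and tokens[i + 1] not in string.punctuation:
            fixed[-1] += tokens[i] + tokens[i + 1]
            i += 2
        else:
            fixed.append(tokens[i])
            i += 1
    return fixed


print(repair_hyphens(['e', '-', 'mail', 'og', 'n', '-', 'te', '-', 'gang']))
# -> ['e-mail', 'og', 'n-te-gang']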