def nltk_component_check():
    """Ensure the NLTK resources needed for tokenizing and tagging exist.

    Probes each operation on a tiny test string and downloads the missing
    resource ('punkt', 'averaged_perceptron_tagger', 'universal_tagset')
    only when NLTK raises ``LookupError``.
    """
    test = "a bat"
    try:
        nltk.word_tokenize(test)
    except LookupError:
        nltk.download('punkt')
    # Tokenize unconditionally so `tokens` is bound on both paths; in the
    # original the assignment sat inside the except-suite, leaving `tokens`
    # undefined (NameError below) whenever 'punkt' was already installed.
    tokens = nltk.word_tokenize(test)
    try:
        nltk.pos_tag(tokens)
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
    try:
        nltk.tagset_mapping('en-ptb', 'universal')
    except LookupError:
        nltk.download('universal_tagset')
def extract_pos(tokens, simple=True):
    """Return the part-of-speech tag of each token.

    The simple (universal) parts of speech are:
        VERB - verbs (all tenses and modes)
        NOUN - nouns (common and proper)
        PRON - pronouns
        ADJ  - adjectives
        ADV  - adverbs
        ADP  - adpositions (prepositions and postpositions)
        CONJ - conjunctions
        DET  - determiners
        NUM  - cardinal numbers
        PRT  - particles or other function words
        X    - other: foreign words, typos, abbreviations
        .    - punctuation

    :param tokens: sequence of word tokens to tag
    :param simple: when True (default), collapse the Penn Treebank tags
        into the smaller universal tagset
    :return: list of tag strings, parallel to ``tokens``
    """
    tagged = pos_tag(tokens)
    if not simple:
        return [tag for _, tag in tagged]
    # Collapse the fine-grained Penn Treebank tags into the universal set.
    to_universal = nltk.tagset_mapping('en-ptb', 'universal')
    return [to_universal[tag] for _, tag in tagged]
# NOTE(review): this chunk starts mid-definition — the `try:` matching the
# first `except` below (and its enclosing `def`) are outside the visible
# source; kept verbatim.
except LookupError:
    nltk.download('punkt')
# Re-tokenize so `tokens` is available for the tagger probe below.
tokens = nltk.word_tokenize(test)
try:
    nltk.pos_tag(tokens)
except LookupError:
    # Tagger model missing — fetch it once.
    nltk.download('averaged_perceptron_tagger')
try:
    nltk.tagset_mapping('en-ptb', 'universal')
except LookupError:
    nltk.download('universal_tagset')


# Create a single tagger object to avoid repeated calls to 'load'
nltk_component_check()
tagger = nltk.tag.perceptron.PerceptronTagger()
# Penn Treebank tag -> universal tag, built once at import time.
tagset_map = nltk.tagset_mapping('en-ptb', 'universal')


def tag_song(song, simplify=True):
    '''
    Returns a nested song structure where each word is tagged with part of
    speech. If 'simplify' is True (default) then all punctuation and special
    words are removed.
    '''
    # NOTE(review): `simplify` is never consulted in the visible body — the
    # '.'/'X' filtering below is unconditional; confirm intended behavior.
    tagged_song = []
    # `generate_nested` is defined elsewhere in the project; presumably it
    # yields paragraphs, each a list of line strings — TODO confirm.
    for paragraph in generate_nested(song):
        tagged_paragraph = []
        for line in paragraph:
            tagged_line = tagger.tag(nltk.word_tokenize(line))
            # Map each PTB tag to its universal tag, dropping punctuation
            # ('.') and "other" ('X') tokens.
            tagged_line = [(w, tagset_map[p]) for (w, p) in tagged_line
                           if tagset_map[p] != '.' and tagset_map[p] != 'X']
            tagged_paragraph.append(tagged_line)
        tagged_song.append(tagged_paragraph)
        # NOTE(review): chunk ends here — the function's return statement
        # (presumably `return tagged_song`) is outside the visible source.
def get_tag_mapper(lang: str) -> dict:
    """Return a Penn Treebank -> universal PoS tag mapping for ``lang``.

    Only English ('en') is supported; any other language yields an
    empty dict.
    See: https://stackoverflow.com/questions/44117664/how-to-reduce-the-number-of-pos-tags-in-penn-treebank-nltk-python
    """
    if lang != 'en':
        return {}
    return tagset_mapping('en-ptb', 'universal')
# returns max, min, avg depth lst_depth = _list_up_depth_to_root(synset, root_synset) if len(lst_depth) == 0: return ret_empty else: return (float(np.max(lst_depth)), float(np.min(lst_depth)), float(np.mean(lst_depth))) _MAP_UNIVERSAL_TO_WORDNET = { "NOUN": wn.NOUN, "VERB": wn.VERB, "ADJ": wn.ADJ, "ADV": wn.ADV } _MAP_PTB_TO_UNIVERSAL = nltk.tagset_mapping("en-ptb", "universal") def universal_tagset_to_wordnet_tagset(univ_tag, na_value="o"): """ Universal PoS tagset を WordNetの PoS tagset に変換する. @param univ_tag: Universal PoS tag @param na_value: 変換先のWordNet PoS tagがない場合の出力 @return: WordNet PoS tagset. {wn.NOUN, wn.VERB, wn.ADJ, wn.ADV, na_value} のいずれか """ wn_tag = _MAP_UNIVERSAL_TO_WORDNET.get(univ_tag, na_value) return wn_tag def ptb_tagset_to_wordnet_tagset(ptb_tag, na_value="o"): """