Example #1
0
def nltk_component_check():
    """Verify that the NLTK data packages this module needs are installed,
    downloading any that are missing.

    Probes the tokenizer ('punkt'), the POS tagger
    ('averaged_perceptron_tagger') and the tagset-mapping table
    ('universal_tagset'); a LookupError from a probe triggers the
    corresponding download.
    """
    sample = "a bat"
    # Tokenizer resource is probed first, since the tagger probe below
    # needs tokenized input.
    try:
        nltk.word_tokenize(sample)
    except LookupError:
        nltk.download('punkt')
    sample_tokens = nltk.word_tokenize(sample)
    # Probe each remaining component and fetch its data on a LookupError.
    for probe, resource in (
        (lambda: nltk.pos_tag(sample_tokens), 'averaged_perceptron_tagger'),
        (lambda: nltk.tagset_mapping('en-ptb', 'universal'), 'universal_tagset'),
    ):
        try:
            probe()
        except LookupError:
            nltk.download(resource)
Example #2
0
def extract_pos(tokens, simple=True):
	"""Return the part-of-speech tag of each token, in order.

	:param tokens: sequence of word tokens to tag
	:param simple: when True (default) the fine-grained Penn-Treebank tags
		are collapsed into the small universal tagset:
		VERB - verbs (all tenses and modes)
		NOUN - nouns (common and proper)
		PRON - pronouns
		ADJ - adjectives
		ADV - adverbs
		ADP - adpositions (prepositions and postpositions)
		CONJ - conjunctions
		DET - determiners
		NUM - cardinal numbers
		PRT - particles or other function words
		X - other: foreign words, typos, abbreviations
		. - punctuation
	:return: list of tag strings, one per input token
	"""
	tags = [tag for _, tag in pos_tag(tokens)]
	if not simple:
		return tags
	# Collapse the Penn-Treebank tags onto the universal tagset.
	mapping = nltk.tagset_mapping('en-ptb', 'universal')
	return [mapping[tag] for tag in tags]
Example #3
0
    except LookupError:
        nltk.download('punkt')
    tokens = nltk.word_tokenize(test)
    try:
        nltk.pos_tag(tokens)
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
    try:
        nltk.tagset_mapping('en-ptb', 'universal')
    except LookupError:
        nltk.download('universal_tagset')
        
# Create a single tagger object to avoid repeated calls to 'load'
# Ensure the required NLTK data packages are present before the tagger and
# mapping table below are constructed (both raise LookupError otherwise).
nltk_component_check()
# Module-level singletons shared by tag_song(): one PerceptronTagger
# instance and one Penn-Treebank -> universal tagset lookup table.
tagger = nltk.tag.perceptron.PerceptronTagger()
tagset_map = nltk.tagset_mapping('en-ptb', 'universal')
    
def tag_song(song, simplify=True):
    ''' Returns a nested song structure where each word is tagged with part of
    speech. If 'simplify' is True (default) then all punctuation and special
    words are removed. '''
    # NOTE(review): the 'simplify' parameter is never consulted below —
    # punctuation ('.') and 'X' tags are filtered unconditionally; confirm
    # whether the flag was meant to guard that filter.
    
    tagged_song = []
    for paragraph in generate_nested(song):
        tagged_paragraph = []
        for line in paragraph:
            # Tokenize the line and tag it with the module-level tagger,
            # producing (word, Penn-Treebank-tag) pairs.
            tagged_line = tagger.tag(nltk.word_tokenize(line))
            # Map each tag onto the universal tagset and drop punctuation
            # ('.') and other/foreign ('X') tokens.
            tagged_line = [(w, tagset_map[p]) for (w, p) in tagged_line if 
                    tagset_map[p] != '.' and tagset_map[p] != 'X']
            tagged_paragraph.append(tagged_line)
        tagged_song.append(tagged_paragraph)
        # NOTE(review): no return statement is visible — 'tagged_song' is
        # built but never returned; this block looks truncated in the source.
Example #4
0
def get_tag_mapper(lang: str) -> dict:
    """Return a Penn-Treebank -> universal tag mapping for a language code.

    Only English ('en') is supported; every other code yields an empty dict.
    # https://stackoverflow.com/questions/44117664/how-to-reduce-the-number-of-pos-tags-in-penn-treebank-nltk-python
    """
    if lang != 'en':
        return {}
    return tagset_mapping('en-ptb', 'universal')
Example #5
0
    # returns max, min, avg depth
    lst_depth = _list_up_depth_to_root(synset, root_synset)
    if len(lst_depth) == 0:
        return ret_empty
    else:
        return (float(np.max(lst_depth)), float(np.min(lst_depth)),
                float(np.mean(lst_depth)))


# Universal PoS tags that have a WordNet counterpart; tags absent here
# (e.g. DET, ADP, '.') have no WordNet PoS and fall back to a caller-supplied
# default in universal_tagset_to_wordnet_tagset().
_MAP_UNIVERSAL_TO_WORDNET = {
    "NOUN": wn.NOUN,
    "VERB": wn.VERB,
    "ADJ": wn.ADJ,
    "ADV": wn.ADV
}
# Penn-Treebank -> universal tag lookup table, built once at import time.
_MAP_PTB_TO_UNIVERSAL = nltk.tagset_mapping("en-ptb", "universal")


def universal_tagset_to_wordnet_tagset(univ_tag, na_value="o"):
    """
    Convert a Universal PoS tag into the corresponding WordNet PoS tag.
    @param univ_tag: Universal PoS tag
    @param na_value: value returned when the tag has no WordNet counterpart
    @return: WordNet PoS tag — one of {wn.NOUN, wn.VERB, wn.ADJ, wn.ADV, na_value}
    """
    return _MAP_UNIVERSAL_TO_WORDNET.get(univ_tag, na_value)


def ptb_tagset_to_wordnet_tagset(ptb_tag, na_value="o"):
    """