Code Example #1
    # Imports assumed at module level in the original source:
    #   import nltk
    #   from nltk.corpus import treebank
    #   from nltk.tag import DefaultTagger, UnigramTagger, tnt
    #   from nltk.tag.sequential import ClassifierBasedPOSTagger
    def wordTagger(self, wordlist, number):
        # train on the first 3000 Treebank sentences
        train_sents = treebank.tagged_sents()[:3000]
        if number == 1:
            taglist = nltk.pos_tag(wordlist)
        elif number == 2:
            tagger = DefaultTagger('NN')
            taglist = tagger.tag(wordlist)
        elif number == 3:
            tagger = UnigramTagger(train_sents)
            taglist = tagger.tag(wordlist)
        elif number == 4:
            tnt_tagger = tnt.TnT()
            tnt_tagger.train(train_sents)
            taglist = tnt_tagger.tag(wordlist)
        elif number == 5:
            tagger = ClassifierBasedPOSTagger(train=train_sents)
            taglist = tagger.tag(wordlist)
        return taglist
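A minimal usage sketch; the enclosing class is not shown in the snippet, so the instance below is hypothetical:
# tagger_obj = SomeTaggerClass()          # hypothetical instance of the enclosing class
# print(tagger_obj.wordTagger(['Hello', 'world'], 2))
# => [('Hello', 'NN'), ('world', 'NN')]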
Code Example #2
from nltk.tag import DefaultTagger


def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.
    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """

    default_tagger = DefaultTagger(tag)
    tags = default_tagger.tag(words)

    return tags
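For reference, a call consistent with the docstring:
# tag_words(['Hello', 'World'], 'NN') -> [('Hello', 'NN'), ('World', 'NN')]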
Code Example #4
File: default_tagger.py  Project: neuroph12/nlpy
# every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# though it's too simple, we can still try to evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# for sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# untagging
from nltk.tag import untag

print(untag([('Hello', 'NN'), ('World', 'NN')]))
Code Example #5
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank, wordnet
from nltk.probability import FreqDist
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.tag import brill, brill_trainer, tnt, SequentialBackoffTagger
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, AffixTagger
from samples import sample

# Test and training variables
test_sents = treebank.tagged_sents()[3000:]
train_sents = treebank.tagged_sents()[:3000]
tk_sample = word_tokenize(sample)

# Default tagger - Nouns
df_tagger = DefaultTagger('NN')
tagged = df_tagger.tag(tk_sample)
accuracy = df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger
ug_tagger = UnigramTagger(train_sents)
tagged = ug_tagger.tag(tk_sample)
accuracy = ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Backoff tagger: fall back on another tagger (the backoff) when the current one cannot tag a token
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
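The snippet breaks off at the pickling step; a minimal sketch of what presumably followed, using the standard pickle module (the file name is an assumption):
import pickle

# save the trained backoff tagger to disk
with open('ugb_tagger.pickle', 'wb') as f:
    pickle.dump(ugb_tagger, f)

# load it back and confirm it still evaluates the same
with open('ugb_tagger.pickle', 'rb') as f:
    loaded_tagger = pickle.load(f)
print(f"Accuracy of loaded tagger: {loaded_tagger.evaluate(test_sents)}\n")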
Code Example #6
# Build a custom tagger by extending the TaggerI class from the nltk.tag package and implementing the tag() function.
# Use the evaluate function to assess the performance of the tagger.

# --- Tagger ---
# INPUT: Sentence tokens
# OUTPUT: List of pairs where each item corresponds to a token of the input with its POS tag

# 1. BACKOFF TAGGER (a tagger that is consulted by another when not able to tag a token):
#    Assigns the same tag to all tokens (tag specified as argument, NN in this case)
dt = DefaultTagger("NN")

# Measure accuracy on test data (i.e. Gold Standard). Test data should be tagged to compare these
# tags against new ones computed by the evaluated tagger (dt in this case)
# (test_data and tokens are prepared earlier in the original script)
print(dt.evaluate(gold=test_data))
print(dt.tag(tokens=tokens))

# 2. REGEX TAGGER:
#    Assigns tags to tokens by comparing their word strings to a series of regular expressions

# Define regex patterns that determine the tags of tokens. Note that when tagging a token,
# expressions are tried in order and the first match wins, so the catch-all expression at
# the end acts as the default tag
patterns = [
    (r".*ing$", "VBG"),  # Gerunds
    (r".*ed$", "VBD"),  # Simple past
    (r".*es$", "VBZ"),  # 3rd singular present
    (r".*ould$", "MD"),  # Modals
    (r".*'s$", "NN$"),  # Possessive nouns
    (r".*s$", "NNS"),  # Plural nouns
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # Cardinal numbers
    (r".*", "NN")  # Nouns (default)
]
Code Example #7
# backoff_tagger, train_brill_tagger, brown_train_sents and sent are defined elsewhere in the project
defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

#cutoff: the minimum number of times a word must occur in the training data before the tagger tags it itself instead of deferring to the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
Code Example #8
import numpy as np  # needed for the accuracy computation below
from nltk.tag import DefaultTagger

default_tagger = DefaultTagger("NOUN")

# test_sents is assumed to be a list of tagged sentences prepared earlier in the notebook
true_pred = 0
num_pred = 0

for sent in test_sents:
    tags = np.array([tag for (word, tag) in sent])
    words = np.array([word for (word, tag) in sent])

    tagged_sent = default_tagger.tag(words)
    outputs = [tag for token, tag in tagged_sent]

    true_pred += np.sum(outputs == tags)
    num_pred += len(words)

print(f"{true_pred / num_pred * 100:.1f}")

# As we can see, this approach gives a rather poor result overall; the ~20% (a fifth) it gets right mostly reflects how frequent the most common tag is

# ## NLTK, Rnnmorph

# Recall the first [seminar](https://colab.research.google.com/drive/1FHZVU6yJT61J8w1hALno0stD4VU36rit?usp=sharing) of our course. In that seminar we worked with several libraries.
#
# Don't forget to convert the tag system from `'en-ptb'` to `'universal'` with the `map_tag` function, or use `tagset='universal'`
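A minimal sketch of the tagset conversion mentioned above, using nltk.tag.map_tag (the sample words are illustrative):
from nltk.tag import map_tag

ptb_tagged = [('dogs', 'NNS'), ('run', 'VBP')]
universal_tagged = [(word, map_tag('en-ptb', 'universal', tag))
                    for word, tag in ptb_tagged]
print(universal_tagged)  # [('dogs', 'NOUN'), ('run', 'VERB')]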
Code Example #9
#Without the part-of-speech tags, a chunker cannot know how to extract
#phrases from a sentence. But with part-of-speech tags, you can tell a chunker how to identify
#phrases based on tag patterns.

#part-of-speech tags for grammar analysis and word sense disambiguation

#All taggers in NLTK are in the nltk.tag package and inherit from the TaggerI base class.
#TaggerI requires all subclasses to implement a tag() method, which takes a list of words
#as input and returns a list of tagged words as output

#TaggerI also provides an evaluate() method for evaluating the accuracy of the tagger

from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))
print(tagger.tag(['we', 'are', 'going']))  # WRONG: these are not nouns, but everything gets 'NN'


#SequentialBackoffTagger implements the tag() method, which calls the
#choose_tag() method of the subclass for each index in the tokens list while accumulating
#a history of the previously tagged tokens


"""DefaultTagger is a subclass of SequentialBackoffTagger. Every subclass of
SequentialBackoffTagger must implement the choose_tag() method, which
takes three arguments:
    * The list of tokens
    * The index of the current token whose tag we want to choose
    * The history, which is a list of the previous tags
    SequentialBackoffTagger implements the tag() method, which calls the
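A minimal sketch of such a subclass, consistent with the description above (the class itself is illustrative, not from the original source): it tags capitalized tokens 'NNP' and defers everything else to its backoff.
from nltk.tag import SequentialBackoffTagger, DefaultTagger

class CapitalizedTagger(SequentialBackoffTagger):
    def choose_tag(self, tokens, index, history):
        # return a tag, or None to hand the token to the backoff tagger
        if tokens[index][0].isupper():
            return 'NNP'
        return None

cap_tagger = CapitalizedTagger(backoff=DefaultTagger('NN'))
print(cap_tagger.tag(['Hello', 'world']))  # [('Hello', 'NNP'), ('world', 'NN')]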
Code Example #10
rc4 = []
def PhraseExtractor(tree):
    rc4 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Phrase':
            rc4.append(str(untag(subtree)))
    return rc4

'''-----------------------------------------------------------------------------------------'''
'''----------------------Customized POS Tagging -----------------------------------------------------'''
'''-----------------------------------------------------------------------------------------'''
ptagger = DefaultTagger('PWD')
ntagger = DefaultTagger('NWD')
etagger = DefaultTagger('EMP')

tag_pos = ptagger.tag(PosWords) # changed to list version of Dictionary
tag_neg = ntagger.tag(NegWords) # changed to list version of Dictionary
tag_emp = etagger.tag(EmpWords) # changed to list version of Dictionary

tag_wrd = tag_pos + tag_neg + tag_emp
tag_wrd_dict = dict(tag_wrd)


# tagger2 is a backoff tagger defined earlier in the original script
tagger5 = UnigramTagger(model=tag_wrd_dict, backoff=tagger2)


'''-----------------------------------------------------------------------------------------'''
'''------------------- Chunking with POS Tagging ---------------------------------------------------'''
'''-----------------------------------------------------------------------------------------'''
chunker1 = RegexpParser(r'''
    PWD:
Code Example #11
# building your own tagger

# preparing the data
from nltk.corpus import treebank
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

# default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')

print(dt.evaluate(test_data))

# tokens is assumed to be a tokenized sentence, e.g. from nltk.word_tokenize
print(dt.tag(tokens))


# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default) ... 
]
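The listing stops before the tagger is built; a likely continuation, mirroring the default-tagger steps above:
rt = RegexpTagger(patterns)
print(rt.evaluate(test_data))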
Code Example #12
File: test.py  Project: cristianprice/pyxamples
from nltk.tag import DefaultTagger

tagger = DefaultTagger('NN')
result = tagger.tag(['Hello', 'World', 'want'])
print(result)
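Every token receives the default tag here, so this prints: [('Hello', 'NN'), ('World', 'NN'), ('want', 'NN')]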
Code Example #13
#a
# Imports assumed by this fragment (not shown in the original):
# from sklearn.model_selection import train_test_split
# from nltk.corpus import brown, nps_chat as chat
# from nltk.tag import DefaultTagger
splits = [[90, 10], [50, 50]]
correct_brown = brown.tagged_sents()
correct_chat = chat.tagged_posts()
default_tagger = DefaultTagger("NN")

for split in splits:  # turn this into a function for use in (b)
    test_brown, train_brown = train_test_split(correct_brown,
                                               test_size=split[1] / 100,
                                               shuffle=False)
    test_chat, train_chat = train_test_split(correct_chat,
                                             test_size=split[1] / 100,
                                             shuffle=False)

    # note: DefaultTagger does not learn, so tagging the training data has no effect on evaluation
    default_tagger.tag(train_brown)
    print(
        f"The DefaultTagger accuracy for the Brown Corpus is {default_tagger.evaluate(test_brown)} using a {split[0]}/{split[1]} split."
    )
    default_tagger.tag(train_chat)
    print(
        f"The DefaultTagger accuracy for the NPS Chat Corpus is {default_tagger.evaluate(test_chat)} using a {split[0]}/{split[1]} split.\n"
    )

    #50/50 is better because the tagger doesn't "learn", so when the test data is increased (from 10%)
    #there's a bigger chance that some words are going to be NN?

#b
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
Code Example #14
File: nltk.py  Project: byouloh/sourcenet
from nltk.tokenize import TreebankWordTokenizer

# sentence is assumed to be defined earlier in the original module
tokenizer = TreebankWordTokenizer()
word_list = tokenizer.tokenize( sentence )

#--------------------------------------------------------------------------------
# Parts of Speech
#--------------------------------------------------------------------------------

# Default tagging
from nltk.tag import DefaultTagger

# if all else fails, make an unknown word a noun ( "NN" )
default_tagger = DefaultTagger( 'NN' )

# try it.
tagged_sentence = default_tagger.tag( word_list )

# Can also batch tag, but need a list of sentences, each already tokenized.
# (batch_tag was renamed to tag_sents in NLTK 3)
#default_tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

#--------------------------------------------------------------------------------
# Training taggers
#--------------------------------------------------------------------------------

# so far so good.  Next have to train taggers.

# Unigram, training on Treebank corpus
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
unigram_tagger = UnigramTagger(train_sents)
Code Example #15
import nltk
from nltk.tag import DefaultTagger
tag = DefaultTagger('NN')
print(tag.tag(['Beautiful', 'morning']))

Code Example #16
import nltk
from nltk.corpus import treebank  #import treebank corpus
from nltk.tag import DefaultTagger  #import DefaultTagger

tagger = DefaultTagger('NN')  #Default Tagger assigning the NN tag
treebank_tagged_sents = treebank.tagged_sents()  #gold-standard tagged sentences
tagger.tag(treebank.sents()[0])  #tag a sample sentence; tag() expects plain tokens, not tagged sentences
print('Accuracy %4.1f%%' %
      (100.0 *
       tagger.evaluate(treebank_tagged_sents)))  #calculate and print Accuracy
Code Example #17
import nltk
from nltk.corpus import treebank

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

#%%
# default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')

# accuracy on test data
dt.evaluate(test_data)

# tagging our sample headline (sentence is assumed defined earlier in the notebook)
dt.tag(nltk.word_tokenize(sentence))

#%%
# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default)
]
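The notebook cell ends with the pattern list; a likely next cell, parallel to the default-tagger cell above:
#%%
rt = RegexpTagger(patterns)

# accuracy on test data
rt.evaluate(test_data)

# tagging our sample headline
rt.tag(nltk.word_tokenize(sentence))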
Code Example #18
File: tutPosTagging01.py  Project: bindaasamit/pycode
######### DEFAULT TAGGER ###############

#Assigning the default tag
from nltk.tag import DefaultTagger, untag

tagger = DefaultTagger('NN')
#tag() expects one tokenized sentence; tag_sents() expects a list of sentences
sents = [['Hello', 'World'], ['How', 'are', 'you', '?']]
print(tagger.tag(sents[0]))

print(tagger.tag_sents(sents))

#Untagging
tagged = tagger.tag(sents[0])
print(untag(tagged))

#Evaluating the tagger accuracy
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
Code Example #19
import nltk
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

brown.tagged_words(tagset='universal')  # result unused below; shows the universal-tagset view of Brown

train_sents = treebank.tagged_sents()[:3500]
test_sents = treebank.tagged_sents()[3500:]
tagger = DefaultTagger('NN')  # baseline; replaced by the backoff chain built below


def back_off_tagger(train_sents, tagger_classes, backoff=None):
    # chain the taggers: each newly constructed tagger falls back to the previous one
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


tagger = back_off_tagger(train_sents,
                         [UnigramTagger, BigramTagger, TrigramTagger],
                         backoff=DefaultTagger('NN'))

tagged = tagger.tag(["The", "lecture", "will", "be", "in", "Wean", "4623"])
print(tagged)
print(tagger.evaluate(test_sents))