def task2b(data, tag):
    """Build and test one CombinedTagger per training split.

    Args:
        data: Mapping that holds the training sentences under the keys
            "train_brown50", "train_brown90", "train_nps50" and
            "train_nps90". It is also passed unchanged to ``test_tagger``.
        tag: Tag handed to the DefaultTagger that every combined tagger
            receives as its ``default`` argument.
    """
    default_tagger = DefaultTagger(tag)
    # The loop variable used to be called `str`, which shadowed the builtin.
    for corpus_name in ("brown50", "brown90", "nps50", "nps90"):
        tagger = CombinedTagger(train=data["train_" + corpus_name],
                                default=default_tagger,
                                name=corpus_name)
        test_tagger(tagger, data)
def default_tag():
    """Demonstrate a DefaultTagger built from the corpus' most frequent tag.

    Counts the tags of the Brown 'news' category, builds a DefaultTagger from
    the most frequent one, prints the tagging of a sample sentence and then
    prints the tagger's score on ``brown_tagged_sents``.
    """
    # Count the tags straight from a generator; no intermediate list needed.
    tag_counts = FreqDist(
        tag for _, tag in brown.tagged_words(categories='news'))
    most_freq_tag = tag_counts.max()

    raw = 'I love AIT because AIT is interesting and professors here give a lot of challenging assignment'
    tokens = word_tokenize(raw)

    # The tagger now really uses the most frequent tag. Before, that value
    # was computed and then ignored in favour of a hard-coded 'NN'.
    # NOTE(review): for Brown 'news' the most frequent tag is expected to be
    # 'NN' (NLTK book, ch. 5), so the output should not change - confirm.
    default_tagger = DefaultTagger(most_freq_tag)
    tagged = default_tagger.tag(tokens)
    print(tagged)
    # brown_tagged_sents is not defined in this function; it has to exist at
    # module level.
    score = default_tagger.evaluate(brown_tagged_sents)
    print(score)
Example #3
0
 def _model_definition(self) -> UnigramTagger:
     """Create the initial unigram model.

     Returns:
       A UnigramTagger trained on a single one-token sentence
       (".", "PUNCT") that backs off to DefaultTagger('NOUN').
     """
     noun_fallback = DefaultTagger('NOUN')
     seed_sentences = [[(".", "PUNCT")]]
     return UnigramTagger(seed_sentences, backoff=noun_fallback)
Example #4
0
 def _model_definition(self) -> RegexpTagger:
     """Create the rule-based model.

     Returns:
       A RegexpTagger built from Model.RULES that backs off to
       DefaultTagger('NOUN') for tokens no rule matches.
     """
     noun_fallback = DefaultTagger('NOUN')
     rule_tagger = RegexpTagger(Model.RULES, backoff=noun_fallback)
     return rule_tagger
    def train(self, model_path):
        """Train a bigram tagger on the lower-cased CORPUS and pickle it.

        The chain is bigram -> unigram -> DefaultTagger('UNK').

        Args:
            model_path: Path of the file the pickled tagger is written to.
        """
        lowered_sents = []
        for sent in CORPUS:
            lowered_sents.append([(token.lower(), tag) for token, tag in sent])

        fallback = DefaultTagger('UNK')
        unigram = UnigramTagger(lowered_sents, backoff=fallback)
        tagger = BigramTagger(lowered_sents, backoff=unigram)

        with open(model_path, "wb") as out_file:
            pickle.dump(tagger, out_file)
Example #6
0
    def __init__(self, train_sents):
        """Train the backoff chain and store it on ``self.tagger``.

        The chain is trigram -> bigram -> unigram -> DefaultTagger('NN').

        train_sents: already-tagged sentences used for training. The
        original note says they come from the Brown, conll2000 and
        TreeBank corpora.
        """
        tagger = DefaultTagger('NN')
        # Each n-gram tagger backs off to the one built just before it.
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            tagger = tagger_cls(train_sents, backoff=tagger)
        self.tagger = tagger
Example #7
0
def create_tagger(sents,patterns=PATTERNS,maxngram=4):
    '''Train a backoff tagger on a corpus of tagged sentences.

    The chain is NgramTagger(maxngram) -> trigram -> bigram -> unigram ->
    RegexpTagger(patterns) -> DefaultTagger('NN').
    '''
    tagger = RegexpTagger(patterns, backoff=DefaultTagger('NN'))
    # Each n-gram tagger backs off to the one built just before it.
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = tagger_cls(sents, backoff=tagger)
    return NgramTagger(maxngram, sents, backoff=tagger)
def task2a(data):
    """Find the most frequent tag in ``data`` and test a DefaultTagger on it.

    Args:
        data: Mapping whose values are iterables of sentences, each sentence
            being an iterable of (word, tag) pairs.

    Returns:
        The most frequent tag across all sentences in ``data``.
    """
    fd = FreqDist(tag
                  for sentences in data.values()
                  for sentence in sentences
                  for _, tag in sentence)
    most_frequent_tag = fd.max()
    print("Most frequent tag: {}".format(most_frequent_tag))
    default_tagger = DefaultTagger(most_frequent_tag)
    test_tagger(default_tagger, data)
    # The original returned `tag`, the leaked loop variable, i.e. whichever
    # tag happened to be read last rather than the most frequent one.
    return most_frequent_tag
Example #9
0
def get_tagger(type="StandfordPOSTagger"):
    """Return a POS tagger chosen by ``type``.

    "Custom" trains a bigram -> unigram -> DefaultTagger('NOUN') chain on the
    Brown 'news' sentences with the universal tagset. Every other value,
    including the default, returns a StanfordPOSTagger built from the bundled
    model and jar paths.
    """
    if type != "Custom":
        return StanfordPOSTagger(
            'data/./models/wsj-0-18-bidirectional-distsim.tagger',
            '3rdparty_libs/stanford-postagger.jar')

    news_sents = brown.tagged_sents(categories='news', tagset='universal')
    noun_fallback = DefaultTagger('NOUN')
    unigram = UnigramTagger(news_sents, backoff=noun_fallback)
    return BigramTagger(news_sents, backoff=unigram)
Example #10
0
def generateTagger():
    """Build a unigram tagger for cess_esp with suffix-rule and default backoff.

    The chain is UnigramTagger(cess_esp) -> RegexpTagger(suffix rules) ->
    DefaultTagger('V').
    """
    verb_fallback = DefaultTagger('V')
    suffix_rules = [
        (r'.*o$', 'NMS'),  # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),  # noun feminine singular
        (r'.*as$', 'NFP')  # noun feminine plural
    ]
    suffix_tagger = RegexpTagger(suffix_rules, backoff=verb_fallback)
    # The unigram tagger learns from the tagged sentences of cess_esp.
    return UnigramTagger(cess_esp.tagged_sents(), backoff=suffix_tagger)
Example #11
0
def trained_tagger():
    """Train a trigram tagger with backoff, pickle it and return it.

    Training data are the tagged sentences of the Brown, CoNLL-2000 and
    Treebank corpora. The chain is trigram -> bigram -> unigram ->
    DefaultTagger('NN'). The tagger is written to
    'DataBase/trained_tagger.pkl', relative to the working directory.

    Returns:
        The trained TrigramTagger.
    """
    # Aggregate the training sentences for the n-gram taggers.
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    # The context manager closes the file even if pickling fails; the
    # original left the handle open.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as model_file:
        pickle.dump(trigram_tagger, model_file)

    return trigram_tagger
Example #12
0
def trained_tagger():
    """Train and pickle a trigram tagger unless the pickle already exists.

    If 'DataBase/trained_tagger.pkl' exists under the current working
    directory, a message is printed and nothing is trained. Otherwise a
    trigram -> bigram -> unigram -> DefaultTagger('NN') chain is trained on
    the Brown, CoNLL-2000 and Treebank tagged sentences and pickled to that
    path.

    Returns:
        None in both cases; the tagger is only written to disk.
    """

    if os.path.exists(os.path.join(os.getcwd(),
                                   r"DataBase/trained_tagger.pkl")):
        print("Trained Tagger File already Exists..")
        return

    # Aggregate the training sentences for the n-gram taggers.
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    # The context manager closes the file even if pickling fails; the
    # original left the handle open.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as model_file:
        pickle.dump(trigram_tagger, model_file)
Example #13
0
    def __init__(self, train_sents, load=False):
        """Load a saved tagger or train a new backoff chain (Python 2 code).

        Args:
            train_sents: Tagged sentences, iterated as sequences of
                (word, tag) pairs. Only used when ``load`` is false.
            load: When true, call ``self.load()`` instead of training.
        """
        if load:
            # The trailing comma keeps 'done.' on the same output line.
            print 'Loading saved tagger...',
            self.load()
            print 'done.'
        else:
            time_start = time.time()

            print 'Training the tagger...'
            # Count how often each tag occurs in the training data.
            tag_counts = Counter([t for s in train_sents for w, t in s])
            # argmax is defined elsewhere; presumably it returns the tag with
            # the highest count - TODO confirm.
            default_tag = argmax(tag_counts)

            # Backoff chain: trigram -> bigram -> unigram -> affix -> default.
            def_tgr = DefaultTagger(default_tag)
            # NOTE(review): a negative affix_length presumably selects word
            # suffixes (here the last three characters) - confirm in NLTK docs.
            af_tgr = AffixTagger(train_sents, affix_length=-3, backoff=def_tgr)
            uni_tgr = UnigramTagger(train_sents, backoff=af_tgr)
            bi_tgr = BigramTagger(train_sents, backoff=uni_tgr)
            tri_tgr = TrigramTagger(train_sents, backoff=bi_tgr)
            self.tgr = tri_tgr
            print 'Done.'

            time_stop = time.time()
            print 'Training time: {0:.2f}s'.format(time_stop - time_start)
Example #14
0
    def train(self,
              corpus: Corpus,
              evaluate: bool = True,
              config: dict = None) -> Union[None, Dict[str, Dict[str, float]]]:
        """Fit the unigram model on the training part of ``corpus``.

        If no model exists yet, ``_model_definition`` is called first (its
        return value is not used here). The model is then replaced by a
        UnigramTagger trained on ``corpus.train.sentences`` that backs off
        to DefaultTagger('NOUN').

        Args:
          corpus: Corpus to train the model on.
          evaluate: Whether to return the evaluation of the trained model.
          config: Training config dict (not used for this model).

        Returns:
          The result of ``self.evaluate(corpus)`` when ``evaluate`` is true,
          otherwise None.
        """
        if self.model is None:
            self._model_definition()

        train_sentences = corpus.train.sentences
        noun_fallback = DefaultTagger('NOUN')
        self.model = UnigramTagger(train_sentences, backoff=noun_fallback)

        return self.evaluate(corpus) if evaluate else None
Example #15
0
def prepare_toolset():
    """Build the tagger, stop-word list, lemmatizer and tag map.

    Returns:
        A dict with the keys
          'tgr'  - RegexpTagger backing off to trigram -> bigram -> unigram
                   -> DefaultTagger('NOUN'), trained on the Brown
                   'learned', 'news' and 'reviews' categories with the
                   universal tagset;
          'sw'   - the English stop-word list;
          'lr'   - a WordNetLemmatizer instance;
          'wntg' - a mapping from universal tags to WordNet POS constants.
    """
    patterns = [
        # Digits and dots, with at least one digit. The original class
        # [\.1-9] skipped every token containing a 0 and tagged a bare "."
        # as NUM, because this rule is listed before the punctuation rule.
        (r'^(?=.*[0-9])[\.0-9]+$', 'NUM'),
        (r'^[^a-zA-Z]+$', '.'),
        (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
        (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.'),
    ]
    train_set = brown.tagged_sents(
        categories='learned', tagset='universal') + brown.tagged_sents(
            categories='news', tagset='universal') + brown.tagged_sents(
                categories='reviews', tagset='universal')
    # The fallback tag must belong to the universal tagset used for training
    # and for the 'wntg' map below. The original 'NN' is a tag from a
    # different tagset and has no entry in that map.
    utgr = UnigramTagger(train=train_set, backoff=DefaultTagger('NOUN'))
    btgr = BigramTagger(train=train_set, backoff=utgr)
    ttgr = TrigramTagger(train=train_set, backoff=btgr)
    toolset = {
        'tgr': RegexpTagger(regexps=patterns, backoff=ttgr),
        'sw': stopwords.words('english'),
        'lr': WordNetLemmatizer(),
        'wntg': {
            'NOUN': wordnet.NOUN,
            'VERB': wordnet.VERB,
            'ADJ': wordnet.ADJ,
            'ADV': wordnet.ADV,
            'X': wordnet.NOUN,
        },
    }
    print('Tools Ready')
    return toolset
Example #16
0
    nltk.data.find('taggers/averaged_perceptron_tagger')
except:
    nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn
from nltk.corpus import treebank, conll2000, brown, conll2002
from nltk import DefaultTagger, UnigramTagger, BigramTagger

wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# Train a bigram part-of-speech tagger on the concatenation of four corpora.
# NOTE(review): these corpora may not share one tagset or language - confirm
# that mixing them is intended.
train_sents = treebank.tagged_sents() + brown.tagged_sents() + conll2000.tagged_sents() + conll2002.tagged_sents()
# Lower-case every word so the taggers are trained case-insensitively.
edited_train = []
for sent in train_sents:
    edited_train.append([(word.lower(),tag) for (word,tag) in sent])
# Backoff chain: bigram -> unigram -> DefaultTagger(None). Tokens that no
# tagger knows therefore get the tag None.
t0 = DefaultTagger(None)
et1 = UnigramTagger(edited_train, backoff = t0)
et2 = BigramTagger(edited_train, backoff = et1)

# The function below converts bigram pos to wordnet pos for lemmatization
def penn_to_wn(tag):
    """Map a POS tag to the WordNet POS constant used for lemmatization.

    Only the first character of ``tag`` is inspected: 'J' -> wn.ADJ,
    'V' -> wn.VERB, 'N' -> wn.NOUN, 'R' -> wn.ADV.

    Args:
        tag: POS tag string; may be None or empty.

    Returns:
        The matching WordNet POS constant, or None when ``tag`` is missing,
        empty, or starts with any other character.
    """
    # A missing or empty tag has no first character to look up.
    if not tag:
        return None
    nltk_wn_pos = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
    try:
        return nltk_wn_pos[tag[0]]
    except (KeyError, IndexError, TypeError):
        # Catch only lookup and indexing failures. The original bare except
        # also swallowed unrelated errors such as KeyboardInterrupt.
        return None

# Unwanted tokens: ASCII punctuation, typographic quotes, the en dash and the
# possessive "'s". The straight double quote is listed twice.
unwanted_tokens = ['"','!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/','”','“','–',"'s",
                ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'] 
Example #17
0
# In[13]:


print("Performance of complete Tagger: ",complete_tagger.evaluate(brown_tagged_sents))


# The rate of correctly tagged words is quite high. However, this method of evaluation is not valid, since the same corpus has been used for evaluation as for training. Therefore we split the corpus into a *training-part* and a *test-part*. The *UnigramTagger* is then trained with the *training-part* and evaluated with the disjoint *test-part*.

# In[14]:


# First 90% of the sentences for training, the remaining 10% for testing.
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
# Words unseen in training fall back to the tag "NN".
unigram_tagger = UnigramTagger(train_sents,backoff=DefaultTagger("NN"))
print("Performance of Tagger with 90% Training and 10% Testdata: ",unigram_tagger.evaluate(test_sents))


# As expected the rate of correctly tagged words is lower, but this value is now a valid evaluation measure.

# ### Unigram Tagger, which applies only frequent words for training
# A trained Unigram Tagger must store a table, which assigns to each word the most frequent POS-tag. Since this table can be quite large, an option is to train the Unigram Tagger only with the most frequent words. 
# The following code generates a list of different Unigram taggers, each with a different number of frequent words from the Brown corpus. The plot visualizes the Unigram Tagger performance as a function of the number of most frequent words stored in the tagger. Note that in the code below the _UnigramTagger_ is initialized with a dictionary of tagged words, whereas in the code above the _UnigramTagger_ is initialized with a corpus of tagged words. Both options are possible.

# In[15]:


def display():
    import pylab
    words_by_freq = FreqDist(brown.words(categories='news')).most_common(2**15)
Example #18
0
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from tag_util import backoff_tagger, train_sents, test_sents, train_brill_tagger

# Initial tagger: the backoff_tagger helper from tag_util receives the three
# n-gram tagger classes and DefaultTagger('NN') as the backoff.
default_tagger = DefaultTagger('NN')
init_tagger = backoff_tagger(train_sents,
                             [UnigramTagger, BigramTagger, TrigramTagger],
                             backoff=default_tagger)
print(init_tagger.evaluate(test_sents))

# Train a Brill tagger starting from the initial tagger and score it on the
# same test sentences.
brill_tagger = train_brill_tagger(init_tagger, train_sents)
print(brill_tagger.evaluate(test_sents))
# Show the tagset help for 'NNS' and for every tag matching 'VB.*'.
nltk.help.upenn_tagset('NNS')
nltk.help.upenn_tagset('VB.*')

text = nltk.word_tokenize("I cannot bear the pain of bear")
# NOTE(review): this pos_tag result is overwritten on the next line without
# being printed.
out = nltk.pos_tag(text)
# Convert between the 'word/TAG' string form and a (word, tag) tuple.
out = nltk.tag.str2tuple('bear/NN')
print(out)
print((out[0], out[1]))
print(nltk.tag.tuple2str(out))

# Tag frequencies of the Treebank words under the universal tagset.
treebank_tagged = treebank.tagged_words(tagset='universal')
tag = nltk.FreqDist(tag for (word, tag) in treebank_tagged)
out = tag.most_common()
print(out)

# `tag` is rebound here from the frequency distribution to a DefaultTagger.
tag = DefaultTagger('NN')
out = tag.tag(['Beautiful', 'morning'])
print(out)
# The bare string below is a note (in Chinese) listing the ten English word
# classes: noun, pronoun, adjective, adverb, verb, numeral, article,
# preposition, conjunction and interjection.
"""英语的十大词类
1.名词noun n.
2.代词pronoun pron.
3.形容词adjective adj.
4.副词 adverb adv.
5.动词verb v.
6.数词numeral num.
7.冠词article art.
8.介词preposition prep.
9.连词conjunction conj.
10.感叹词interjection interj.
"""
print("**********************************")
Example #20
0
# This flattening step is not necessary: NgramTagger expects sentences,
# i.e. lists of (str, str) pairs.
#tagged_sents = [ e for sublist in tagged_sents for e in sublist ]
random.shuffle( tagged_sents )

# 10-fold cross-validation: each fold holds one tenth of the sentences.
test_size = int(len(tagged_sents) / 10)
evaluation = 0

for I in range(10):
  # Fold I is the test set; all other sentences are the training set.
  test_sents = tagged_sents[I * test_size : (I+1) * test_size ]
  train_sents = tagged_sents[: I * test_size] + tagged_sents[ (I+1) * test_size :]
  # Tagger that chooses the tag from the word string and the preceding words' tags
  tagger = NgramTagger(2, train=train_sents)
  evaluation += tagger.evaluate(test_sents)

print('evaluation with 2-gram model')
print(evaluation/10)

# tagger[0] is the default tagger; tagger[1..6] are filled in by the loop.
tagger = [0,0,0,0,0,0,0]
evaluation = [0,0,0,0,0,0]
tagger[0] = DefaultTagger('NN')

# NOTE(review): the n-gram order is fixed at 1 for every N; only the depth of
# the backoff chain grows with N.
# NOTE(review): for N >= 2 the backoff tagger[N-1] is the one left over from
# the last fold of the previous N, so it was trained on sentences that are in
# the current test fold whenever I < 9. Confirm this is intended.
for N in range(1, 7):
  for I in range(10):
    test_sents = tagged_sents[I * test_size : (I+1) * test_size ]
    train_sents = tagged_sents[: I * test_size] + tagged_sents[ (I+1) * test_size :]
    tagger[N] = NgramTagger(1, train=train_sents, backoff=tagger[N-1]) # the backoff tagger is consulted when this tagger meets an unknown context
    evaluation[N-1] += tagger[N].evaluate(test_sents)
  # Average the ten fold scores for this N.
  evaluation[N-1] = evaluation[N-1] / 10

print('evaluation with 1-gram model but activating backoff')
print(sum(evaluation) / len(evaluation))
Example #21
0
import nltk
nltk.download('brown')

from nltk.corpus import brown

# Brown 'news' category with the universal tagset, as sentences and as words.
brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
brown_news_words = brown.tagged_words(categories='news', tagset='universal')

# The first 100 sentences are held out for testing; the rest is for training.
brown_train = brown_news_tagged[100:]
brown_test = brown_news_tagged[:100]

# Show the first test sentence with and without its tags.
from nltk.tag import untag
test_sent = untag(brown_test[0])
print("Tagged: ", brown_test[0])
print()
print("Untagged: ", test_sent)

# A default tagger assigns the same tag to all words
from nltk import DefaultTagger
default_tagger = DefaultTagger('NOUN')
# The result of this call is not stored or printed.
default_tagger.tag('This is a test'.split())
Example #22
0
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank
from nltk.tag import hmm
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import brown
# Training material: tagged sentences from three Brown categories, plus the
# first 500 tagged 'news' sentences used for the backoff chain below.
brown_a = nltk.corpus.brown.tagged_sents(
    categories=['news', 'editorial', 'reviews'])
text = brown.tagged_sents(categories='news')[:500]

# Backoff chain: trigram -> bigram -> unigram -> DefaultTagger('NN').
t0 = DefaultTagger('NN')
t1 = UnigramTagger(text, backoff=t0)
t2 = BigramTagger(text, backoff=t1)
# The trigram tagger now backs off to the bigram tagger. It used to back off
# to t1, which skipped the bigram level of the chain.
t3 = TrigramTagger(text, backoff=t2)
# default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)

# An untagged sentence that lies outside the 500 training sentences.
test_sent = brown.sents()[502]
# test_sent = [u'Noting', u'that', u'Plainfield', u'last', u'year', u'had', u'lost', u'the', u'Mack', u'Truck', u'Co.', u'plant', u',', u'he', u'said', u'industry', u'will', u'not', u'come', u'into', u'this', u'state', u'until', u'there', u'is', u'tax', u'reform', u'.']


def ie_preprocess(document):
    print document
    sentences = nltk.sent_tokenize(document)
    # print sentences
    trigram_tagger = nltk.TrigramTagger(brown_a, cutoff=0)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    print "\nDefault tagger"
    x = [t0.tag(sent) for sent in sentences]
    print x
    print "\nUnigram tagger"
Example #23
0
Podemos utilizar um corpus anotado morfologicamente para treinar o POS-Tagger do NLTK
"""

from nltk.corpus import mac_morpho

# Get the tagged sentences
tagged_sents = mac_morpho.tagged_sents()

# Split them into two parts: a larger one for TRAINING and a smaller one for TESTING
train_tsents = tagged_sents[100:]  # All sentences after the first 100
test_tsents = tagged_sents[:100]  # The first 100 sentences

from nltk import DefaultTagger

# Define a default tagger that always tags a word with "N" (noun), since that is the most frequent tag
tagger0 = DefaultTagger("N")
# Evaluate the accuracy of the POS tagger on the TEST sentences
tagger0.evaluate(test_tsents)

from nltk import UnigramTagger

# Define a unigram tagger (more on this later)
# This tagger learns from the tagged sentences in the TRAINING set
# It also falls back to the DefaultTagger when it does not know which tag to assign
tagger1 = UnigramTagger(train_tsents, backoff=tagger0)
tagger1.evaluate(test_tsents)

from nltk import BigramTagger

# Define a bigram tagger (more on this later)
tagger2 = BigramTagger(train_tsents, backoff=tagger1)
Example #24
0
# for kfold validation, not working though
# cross-fold validation is just brute forced...
#from sklearn.model_selection import KFold
#import numpy as np


# Directory that holds the corpus file (hard-coded, machine-specific path).
mypath = "C:/Users/Lauren Shin/Documents/LING 111/.final project"

# Read the tagged corpus file from that directory, decoding it as latin-1.
EstonianCorpus = TaggedCorpusReader(mypath, "estonianCaps.txt", encoding = "latin-1")

sentences = EstonianCorpus.tagged_sents()

# The most frequent tag in the corpus becomes the default tagger's tag.
tags = [tag for _, tag in EstonianCorpus.tagged_words()]
mostFrequent = FreqDist(tags).max()

default = DefaultTagger(mostFrequent)

# cross validation

#kf = KFold(n_splits = 3)
#
## turns the data into a 2d array
#X = np.array(sentences)
## creates a 1d array with same length/number of rows as X
#y = np.arange(0, len(sentences), 1)
#
#for train, test in kf.split(X):
#    # this works
#    # training for training and training for evaluation
#    X_train, X_test = X[train], X[test]
#    # testing for training and testing for evaluation
Example #25
0
def performance(cfd, wordlist):
    """Score a lookup tagger restricted to the entries of ``wordlist``.

    Each entry's first element is taken as the word, and ``cfd[word].max()``
    as its tag. The resulting table drives a UnigramTagger that backs off to
    DefaultTagger('NN'); the tagger is evaluated on the Brown 'news'
    sentences.

    Returns:
        The value returned by the tagger's ``evaluate`` call.
    """
    likely_tags = {entry[0]: cfd[entry[0]].max() for entry in wordlist}
    noun_fallback = DefaultTagger('NN')
    lookup_tagger = UnigramTagger(model=likely_tags, backoff=noun_fallback)
    gold_sents = brown.tagged_sents(categories='news')
    return lookup_tagger.evaluate(gold_sents)
Example #26
0
from modulo_wikicorpus_en import wikicorpus
# Offset of the first sentence used (inicio) and total number of sentences
# used (long).
inicio = 14500
long = 50000

# 90% of the selected sentences for training, 10% for testing.
train = int(long * 0.9)
test = int(long * 0.1)

print("wiki train")
# universal_tags is defined elsewhere; presumably it converts the tags to the
# universal tagset - TODO confirm.
wiki_train = universal_tags(wikicorpus.tagged_sents()[inicio:inicio + train])
print("wiki test")
# The test slice starts right after the training slice.
wiki_test = universal_tags(wikicorpus.tagged_sents()[inicio + train:inicio +
                                                     train + test])

from nltk import DefaultTagger

default_tagger = DefaultTagger('NOUN')

# train the affix tagger
from nltk import AffixTagger

print("affix tagger")
affix_tagger = AffixTagger(wiki_train, backoff=default_tagger)

# initialize the Brill tagger
from nltk import BrillTaggerTrainer
from nltk.tag.brill import Template
from nltk.tag.brill import Word, Pos

templates = [
    Template(Pos([-1])),
    Template(Pos([1])),
from nltk import DefaultTagger, RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from pattern.text import parsetree

sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# Tag the sample sentence with NLTK's pos_tag, using the universal tagset.
tokens = nltk.word_tokenize(sentence)
tagged_sent = nltk.pos_tag(tokens, tagset='universal')

# Treebank split: the first 3500 sentences for training, the rest for testing.
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]

# NOTE(review): this repeats the tokenization done above.
tokens = nltk.word_tokenize(sentence)

# Baseline tagger that tags every token 'NN'.
dt = DefaultTagger('NN')

# The score is computed but not stored or printed.
dt.evaluate(test_data)

# (regex, tag) rules for a RegexpTagger. Order matters: the catch-all
# r'.*' rule has to stay last, and the possessive rule has to precede the
# plural rule.
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    # The decimal point is escaped so that only a literal '.' separates the
    # digits. Unescaped, it matched any character (e.g. "1a5").
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default) ...
]

rt = RegexpTagger(patterns)
Example #28
0
from nltk.corpus import brown

# Find the most common tag across all tagged words of the Brown corpus.
tags = [tag for (word, tag) in brown.tagged_words()]
most_common_tag = nltk.FreqDist(tags).max()
print(most_common_tag)

from nltk import DefaultTagger

barack = """Barack Hussein Obama (born August 4, 1961) is an American politician 
who served as the 44th President of the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the first African American to assume the presidency 
and previously served as a United States Senator from Illinois (2005–2008)."""

# Default tagger: every token of the text receives the most common tag.
tokenized_barack = word_tokenize(barack)
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenized_barack)
print(def_tagged_barack)

#Lookup Tagger
#Ngram tagger
# The training data is a single sentence, tagged by pos_tag.
message = "the quick brown fox jumped over the lazy dog"
training_tag = pos_tag(word_tokenize(message))
print(training_tag)
# Train a 2-gram tagger on that one sentence; no backoff tagger is given.
ngram_tagger = nltk.NgramTagger(n=2, train=[training_tag])

# Tag a sentence made of the same words in a different order.
message2 = "the lazy dog jumped over the quick brown fox"
message2_tags = ngram_tagger.tag(word_tokenize(message2))
print(message2_tags)
Example #29
0
 def __init__(self, train_sents):
     """Train a trigram -> bigram -> unigram -> DefaultTagger('NN') chain.

     train_sents: tagged sentences every n-gram tagger is trained on.
     The finished chain is stored on ``self.tagger``.
     """
     tagger = DefaultTagger('NN')
     # Each n-gram tagger backs off to the one built just before it.
     for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
         tagger = tagger_cls(train_sents, backoff=tagger)
     self.tagger = tagger
 def generate_tokenizer(corpus):
     """Train and return a bigram tagger on ``corpus``.

     Despite the function name, the returned object is a BigramTagger that
     backs off to a UnigramTagger and then to DefaultTagger("n").
     """
     fallback = DefaultTagger("n")
     unigram = UnigramTagger(corpus, backoff=fallback)
     return BigramTagger(corpus, backoff=unigram)