Example #1
import nltk

# Vocabulary is assumed to be defined/imported elsewhere in this project.

def doc_tagging(textfile, train_test_valid):

    # This is solely for testing. Remove once using real corpus
    # stringthing = "Hello welcome to the world of to learn Categorizing and POS Tagging with NLTK and Python this should be a yeah 1992 and this a cardinal number 0.4"
    # text = nltk.word_tokenize(stringthing)

    with open(textfile) as f:
        file_content = f.read()
    text = nltk.word_tokenize(file_content)

    patterns = [
        (r'^(19|20)\d\d$', 'YY'),          # years
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
        (r'(His|his|Her|her|Its|its)$', 'PRP$'),    # possessive
        (r'(my|Your|your|Yours|yours)$', 'PRP$'),   # possessive
        # WARNING: keep the catch-all default pattern at the end
        (r'.*', 'NN')                      # nouns (default)
        ]


    # giving the tagger our patterns for the tags
    regexp_tagger = nltk.RegexpTagger(patterns)

    # tag our document
    tags = regexp_tagger.tag(text)
    # per-tag counters (kept from the original; not read again below)
    adj_count = 0
    num_count = 0
    year_count = 0
    pron_count = 0

    # replace words carrying selected tags with placeholder tokens
    new_tokens = []
    for word, tag in tags:
        if tag == "JJ":
            adj_count += 1
            new_tokens.append("<ADJECTIVE>")
        elif tag == "CD":
            num_count += 1
            new_tokens.append("<NUMBER>")
        elif tag == "YY":
            year_count += 1
            new_tokens.append("<YEAR>")
        elif tag == "PRP":
            pron_count += 1
            new_tokens.append("<PRONOUN>")
        else:
            new_tokens.append(word)

        # keep tokens space-separated, except around markup fragments
        if word not in ('<', '/s', 's', '@', '-'):
            new_tokens.append(" ")

    # write the tagged content to a text file
    filename = "brown_" + train_test_valid + ".txt"
    with open(filename, "wt") as out:
        for token in new_tokens:
            out.write(token)

    voc = Vocabulary(train_test_valid)

    # each entry in new_tokens (word, placeholder, or space) is added
    # to the vocabulary individually
    for sent in new_tokens:
        voc.add_sentence(sent)

    tagged_vocab = []

    for idx in range(voc.num_words):
        tagged_vocab.append(voc.to_word(idx))

    print("Vocab count for", train_test_valid, ":", len(tagged_vocab))
Example #2
# coding=UTF-8
import nltk
from nltk.corpus import brown
from TextSummarization.Summarizer import SummaryTool
from nltk.tokenize import word_tokenize

# This is our fast Part of Speech tagger
#############################################################################
brown_train = brown.tagged_sents(categories=['news', 'editorial', 'reviews'])
regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
                                   (r'(-|:|;)$', ':'), (r'\'*$', 'MD'),
                                   (r'(The|the|A|a|An|an)$', 'AT'),
                                   (r'.*able$', 'JJ'), (r'^[A-Z].*$', 'NNP'),
                                   (r'.*ness$', 'NN'), (r'.*ly$', 'RB'),
                                   (r'.*s$', 'NNS'), (r'.*ing$', 'VBG'),
                                   (r'.*ed$', 'VBD'), (r'.*', 'NN')])
# from pprint import pprint
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
# print unigram_tagger._taggers
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
# print bigram_tagger._context_to_tag
#############################################################################

# This is our semi-CFG; extend it according to your own needs
#############################################################################
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}
Example #3
import nltk
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

"""add patterns for tagging"""
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]

regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown_sents[3]))
print(regexp_tagger.evaluate(brown_tagged_sents))
 
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
print(baseline_tagger.evaluate(brown_tagged_sents))
sent = brown.sents(categories='news')[3]
print(baseline_tagger.tag(sent))


def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
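
# A minimal sketch of scoring performance() with a 100-word model, in the
# style of the NLTK book (the frequency-ordered word list is an assumption):
words_by_freq = [w for (w, _) in fd.most_common()]
print(performance(cfd, words_by_freq[:100]))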
Example #4

import nltk

# listaOraciones (a list of raw Spanish sentences) is assumed to be defined
# elsewhere in this project.
esp = nltk.corpus.cess_esp.tagged_words()

#size = int(len(listaOraciones) * 0.9)
train_sents = listaOraciones[10]

patterns = [
    (r'.*o$', 'NCMS'),   # masculine singular noun
    (r'.*a$', 'NCFS'),   # feminine singular noun
    (r'.*as$', 'NCFP'),  # feminine plural noun
    (r'.*os$', 'NCMP')   # masculine plural noun
]

regexp_tagger = nltk.RegexpTagger(patterns)
cess_tagged_sents = nltk.corpus.cess_esp.tagged_sents()

oracion = listaOraciones[10]
oracionTokenizada = nltk.Text(nltk.word_tokenize(oracion))

var = regexp_tagger.tag(oracionTokenizada)
""" Training nltk.UnigramTagger usando oraciones desde cess_esp """
unigram_tagger = nltk.UnigramTagger(cess_tagged_sents,
                                    backoff=nltk.RegexpTagger(patterns))

example = unigram_tagger.tag(oracionTokenizada)

print(example)
"""print(unigram_tagger.evaluate(train_sents))"""
Example #5
import nltk

####
#### FUNCTION TO WRITE WORDS AND THEIR TAGS (the "results" pairs) TO A FILE
def write_words_tagged(results):
    with open('output-test.txt', 'w') as file_output:
        for w in results:
            file_output.write(w[0] + ' - ' + w[1] + '\n')
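
# load_dict() and save_words_tagged() are defined elsewhere in this project.
# A hypothetical minimal load_regex(), shown only so this excerpt reads as a
# complete sketch (the project's real pattern list is not shown):
def load_regex():
    return [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')]                     # default: noun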


if __name__ == '__main__':
    with open('texto_test.txt', encoding='utf8') as f:
        words = nltk.word_tokenize(f.read())
    words = [word.lower() for word in words]
    #print(len(words)) #total words: 662

    #fd = nltk.FreqDist(word.lower() for word in words)
    #fdf= fd.most_common(100)

    word_dict = load_dict()
    p = load_regex()
    rt = nltk.RegexpTagger(p)
    taggedText = rt.tag(words)

    results = save_words_tagged(taggedText)
    #print(save_words_tagged(taggedText))
    write_words_tagged(results)
Example #6
import nltk

class Tagger:  # enclosing class name is not shown in this excerpt
    def __init__(self):
        self.tagger = nltk.RegexpTagger(patterns)
        self.chunker = nltk.RegexpParser(grammar)
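
# Hypothetical module-level patterns and grammar (the originals are not shown
# in this excerpt), plus a minimal usage sketch:
patterns = [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')]                     # default: noun
grammar = r"NP: {<CD>?<NN>+}"                  # noun phrases, optionally number-led

t = Tagger()
print(t.chunker.parse(t.tagger.tag(nltk.word_tokenize("3 dogs chase cats"))))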