# Imports assumed by this snippet (the original excerpt does not show them):
from collections import Counter, defaultdict

from nltk.corpus import treebank
from nltk.grammar import CFG, Nonterminal, Production


def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    
    # prepare parse trees extracted from the treebank
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    
    # build the vocabulary list from the treebank, keeping the most frequent (lowercased) words
    vocab_size = 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    
    # generate the grammar rule list extracted from the treebank and compute each rule's probability from its frequency
    tbank_productions = set(production for tree in tbank_trees for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    
    # calculate probabilities for the rules
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # apply Katz smoothing (katz_smooth is a project-local helper defined elsewhere)
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j,i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    
    # classify left, right rules
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
Example #2
def test_sentences(grammar):

    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))

        tokens = list(treebank.words(t))

        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)

        print("fixed grammar")
        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)

        print("Parsing...")
        #Gets list of all possible trees, the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(start - time.time())

        #Getting POS tags from parser tree
        leafs = parses[0].pos()

        #Calculating accuracy of Parser results
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0


        print(str(correct_tags/len(leafs)))
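# The helper get_fixed_grammer() is referenced above but not shown in this
# snippet. As a hedged illustration only (not the author's implementation),
# the "fix" described in the comment (covering out-of-vocabulary tokens so
# ViterbiParser can run) might look roughly like this: add each unknown word
# as a low-probability NN production and renormalise the NN rules.
from nltk import Nonterminal, PCFG
from nltk.grammar import ProbabilisticProduction

def fix_grammar_sketch(grammar, tokens, eps=1e-6):
    """Hypothetical stand-in for get_fixed_grammer(): ensure every token is
    produced by at least one lexical rule of the PCFG."""
    covered = {p.rhs()[0] for p in grammar.productions()
               if len(p.rhs()) == 1 and isinstance(p.rhs()[0], str)}
    missing = [t for t in tokens if t not in covered]
    if not missing:
        return grammar
    nn = Nonterminal('NN')
    new_prods = []
    for p in grammar.productions():
        if p.lhs() == nn:
            # scale existing NN rules down so NN probabilities still sum to one
            new_prods.append(ProbabilisticProduction(
                p.lhs(), p.rhs(), prob=p.prob() * (1 - eps * len(missing))))
        else:
            new_prods.append(p)
    for tok in missing:
        new_prods.append(ProbabilisticProduction(nn, [tok], prob=eps))
    return PCFG(grammar.start(), new_prods)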
Example #3
def demo():
    from nltk.corpus import treebank
    for word in treebank.words('wsj_0034.mrg'):
        wt = word_type(word)
        if len(wt) == 0: wt = None
        if '*' in word: continue
        print "%-20s\t%s" % (word, wt)
Example #4
 def __init__(self,dname='treebank'):
     super().__init__()
     
     
     data = None
     # select the dataset, downloading the corpus on first use if necessary
     if dname == 'treebank':
         try:
             treebank.words()
         except LookupError:
             nltk.download('treebank')
         data = treebank.tagged_sents(tagset='universal')

     elif dname == 'brown':
         try:
             brown.words()
         except LookupError:
             nltk.download('brown')
         data = brown.tagged_sents(tagset='universal')
         
     
     self.data=data
     #print(data[0:1])
     vocab,tags =self._build_vocab()
     max_sent_len = max(map(len, data))
     self.max_sent_len = max_sent_len
     self.word_to_idx = defaultdict(lambda:0, {word:idx for idx,word in enumerate(vocab)})
     self.idx_to_word = {idx:word for word,idx in self.word_to_idx.items()}
     self.tag_to_idx = {tag:idx for idx,tag in enumerate(tags)}
     self.idx_to_tag = {idx:tag for tag,idx in self.tag_to_idx.items()}
     self.sen_list,self.tag_list = self._convert_to_num()
Example #6
 def __init__(self, *args, **kwargs):
     SequentialBackoffTagger.__init__(self, *args, **kwargs)
     self.wordnet_tag_map = {
         'n': 'NN',
         's': 'JJ',
         'a': 'JJ',
         'r': 'RB',
         'v': 'VB'
     }
     self.fd = FreqDist(treebank.words())
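# The snippet above only shows the constructor. Below is a hedged sketch (an
# assumption, not the original project's code) of the choose_tag() method such
# a WordNet-backed SequentialBackoffTagger is usually paired with: take the
# part of speech of the word's most common WordNet synset type and translate
# it to a Penn tag via self.wordnet_tag_map.
from nltk.corpus import wordnet
from nltk.probability import FreqDist

def choose_tag(self, tokens, index, history):
    word = tokens[index]
    fd = FreqDist()
    for synset in wordnet.synsets(word):
        fd[synset.pos()] += 1
    if fd.N() == 0:
        return None
    return self.wordnet_tag_map.get(fd.max())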
Example #7
def question4():
    #Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]

    train_words = treebank.words()
    init_tagger = RegexpTagger(patterns)

    # BrillTaggerTrainer is the class used for training: it takes the initial
    # tagger plus a list of rule templates, and its train() method returns a
    # trained BrillTagger (see the sketch after this function for an example).
    # tagger = BrillTaggerTrainer(init_tagger, templates).train(train_sents)
    return
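# A hedged sketch (not the author's code) of how question4() could be finished
# with BrillTaggerTrainer. It assumes the `patterns` list defined above and
# NLTK's built-in brill24() template set.
from nltk.corpus import treebank
from nltk.tag import RegexpTagger
from nltk.tag.brill import brill24
from nltk.tag.brill_trainer import BrillTaggerTrainer

init_tagger = RegexpTagger(patterns)
trainer = BrillTaggerTrainer(init_tagger, brill24(), trace=1)
# learn transformation rules from the treebank's tagged sentences
brill_tagger = trainer.train(treebank.tagged_sents()[:3000], max_rules=100)
print(brill_tagger.tag("Profits soared at Boeing Co .".split()))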
            result = model(x_data)

            # calculate the loss
            loss = criterion(result, y_data)
            lossValue = loss.item()

            # compute gradients
            loss.backward()

            # update the parameters using the gradients
            optimizer.step()
        print(lossValue)


sentences = treebank.tagged_sents()
words = treebank.words()

word_to_index = create_word_idx(words)
tag_to_index = create_tag_idx(sentences)
hidden_dim = 32
embedding_dim = 64

training_data = sentences[:3000]
test_data = sentences[3000:]
print("len sentences: ", len(sentences), "training: ", len(training_data), "test: ", len(test_data))

vocab_size = len(word_to_index)
target_size = len(tag_to_index)

print("vocab_size: ", vocab_size)
print("target_size (#tags): ", target_size)
Example #9
#for sent in brown.tagged_sents():#tagset="universal"):
for sent in nltk.corpus.treebank.tagged_sents():

    # sent is a list of word/tag pairs
    # add START/START at the beginning
    brown_tags_words.append( ("START", "START") )
    # then all the tag/word pairs for the word/tag pairs in the sentence
    brown_tags_words.extend([ (tag, word) for (word, tag) in sent ])
    # then END/END
    brown_tags_words.append( ("END", "END") )

# conditional frequency distribution
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# conditional probability distribution
#cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords,nltk.LaplaceProbDist, bins=len(treebank.words()))
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist, bins=len(treebank.words()))
print "The probability of an adjective (ADJ) being 'new' is", cpd_tagwords["JJ"].prob("new")
print "The probability of an ADP being 'to' is", cpd_tagwords["TO"].prob("to")
print "The probability of an adjective (ADJ) being 'I' is", cpd_tagwords["VBN"].prob("eat")

"""Part 2 : P( si | s{i-1})
Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
"""

brown_tags = [tag for (tag, word) in brown_tags_words]

# make conditional frequency distribution:
# count(t{i-1} ti)
cfd_tags= nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
# make conditional probability distribution, using the same MLE estimator
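# (hedged completion; the original snippet is truncated at this point)
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)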
Example #10
text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
    print("text4:", text4.name)
    print("text5:", text5.name)
Example #11
print(brown.categories(), '\n')

brown_humor_tagged = brown.tagged_words(categories='humor', tagset='universal')
print(brown_humor_tagged[:50])

# the chat corpus uses Penn POS tags
print(nltk.corpus.nps_chat.tagged_words()[:50])

# Penn treebank
from nltk.corpus import treebank

# use corpus methods to get the text as strings and as tokens as before
treebank_text = treebank.raw()
print(treebank_text[:150], '\n')

treebank_tokens = treebank.words()
print(treebank_tokens[:20])

# but we also have functions to get words with tags and sentences with tagged words
treebank_tagged_words = treebank.tagged_words()
print(treebank_tagged_words[:50])

treebank_tagged = treebank.tagged_sents()
print(len(treebank.tagged_sents()))
print(treebank_tagged[:2])


## Frequency distribution of tags in Penn Treebank
tag_fd = nltk.FreqDist(tag for (word, tag) in treebank_tagged_words)
print(tag_fd.keys(), '\n')
Example #12
    'its just can my one up just can so me my when find u not your I',
    '-Mnager of number eight basically laughed in my face when I asked about the job in there hahaha cheers',
    'I\'m not even pooping omg I https://t.co/8Q9QlDvUoQ',
    'Mexican cheese dip & Doritos = good eating',
    '2 down 1 to go',
    'RT @_RyanHowell: Imagine what a rainbow would taste like....',
    '@VCrippen this should be a broadway musical! #lol #waffles']

tc = make_corpus(tweets)

with open('tc_words.txt', 'w') as f:
    for word in tc['words']:
        f.write('%s\n' % word)

with open('treebank_words.txt', 'w') as f:
    for word in treebank.words():
        f.write('%s\n' % word)

tc_tags = []
for t in tc['tagged_words']:
    tc_tags.append(t[1])

with open('tc_tags.txt', 'w') as f:
    for tag in tc_tags:
        f.write('%s\n' % tag)

treebank_tags = []
for t in treebank.tagged_words():
    treebank_tags.append(t[1])

with open('treebank_tags.txt', 'w') as f:
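    # (completion inferred from the parallel tc_tags block above)
    for tag in treebank_tags:
        f.write('%s\n' % tag)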
Example #13
from nltk.corpus import treebank as tb
import sys
ids = tb.fileids()

for id in ids:
    wsj = tb.words(id)
    wsj = ' '.join(wsj)
    wsj = wsj.split(' . ')
    counter = 1
    for i, sent in enumerate(wsj):
        with open(
                'UCCA_English-WSJ-master/WSJ_DIR/' + str(id[:-4]) + '.' +
                str(counter) + '.mrg', 'w') as outfile:
            if i + 1 != len(wsj):
                to_write = sent + ' .'
                outfile.write(to_write)
            else:
                outfile.write(sent)
        counter += 1
Example #14
  'English: Brown Corpus (Press)':
      lambda: brown.words(categories=['news', 'editorial', 'reviews']),
  'English: Brown Corpus (Religion)':
      lambda: brown.words(categories='religion'),
  'English: Brown Corpus (Learned)':
      lambda: brown.words(categories='learned'),
  'English: Brown Corpus (Science Fiction)':
      lambda: brown.words(categories='science_fiction'),
  'English: Brown Corpus (Romance)':
      lambda: brown.words(categories='romance'),
  'English: Brown Corpus (Humor)':
      lambda: brown.words(categories='humor'),
  'English: NPS Chat Corpus':
      lambda: nps_chat.words(),
  'English: Wall Street Journal Corpus':
      lambda: treebank.words(),
  'Chinese: Sinica Corpus':
      lambda: sinica_treebank.words(),
  'Dutch: Alpino Corpus':
      lambda: alpino.words(),
  'Hindi: Indian Languages Corpus':
      lambda: indian.words(files='hindi.pos'),
  'Portuguese: Floresta Corpus (Portugal)':
      lambda: floresta.words(),
  'Portuguese: MAC-MORPHO Corpus (Brazil)':
      lambda: mac_morpho.words(),
  'Portuguese: Machado Corpus (Brazil)':
      lambda: machado.words(),
  'Spanish: CESS-ESP Corpus':
      lambda: cess_esp.words()
 }
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
Example #16
print('total_trees treebank =', acc)


# count the total number of trees in the single available file of the 'floresta' corpus
acc = 0
for i in floresta.fileids()[:1]:
  lf = len(floresta.parsed_sents(i))
  acc = acc + lf
print('total_trees floresta =', acc)

# re-enable stderr so that error messages are printed again
enable_stderr(r)

"""Vamos agora inspecionar os totais de palavras de cada corpus. Percebam que o corpus floresta é bem mais rico em número de palavras."""

print("floresta.words=",len(floresta.words()), "\ntreebank.words=", len(treebank.words()))

"""Vamos agora aprender a percorrer as árvores de parsing do corpus 'floresta' e normalizar as regras de produção, evitando aquelas que não podem ser normalizadas.  

Para fazer isso, vamos empregar tratamento de exceções. Ao executar o código, é possível perceber que apenas uma pequena quantidade de árvores de  parsing não pode ser normalizada.
"""

# disable stderr; to turn it back on, just call enable_stderr(r)
r = disable_stderr()

from nltk import treetransforms
from nltk import induce_pcfg
from nltk import Nonterminal

# counters for trees that normalize cleanly and for trees that fail
ok = 0
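# Hedged sketch of the normalization loop described above (assumed
# continuation; the `failed` counter name is ours, the original is cut off).
failed = 0
for tree in floresta.parsed_sents():
    try:
        treetransforms.chomsky_normal_form(tree)
        ok += 1
    except Exception:
        failed += 1
print('normalized ok =', ok, 'failed =', failed)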
Example #17
to separate this data so it could be checked definitely during truecasing, and
not with a threshold probability of capitalisation.

A use of this type of program may be in customer-facing natural language query
systems where it cannot be guaranteed the user will use correct casing. If
correct casing can be guessed from the input it makes discovering contextual
meaning of their input easier and may make certain backend queries simpler
(correctly casing the name of a movie in a natural language request for
information on the movie, and the movies' actor allows simpler extraction of the
key information as it can be tagged as a proper noun easier). Could also be used
as part of a spellchecker that automatically corrected capitalisation.
"""

# need some global data variables

wordlist = set(words.words() + treebank.words())
common_words_lower = set([w for w in wordlist if w.islower()])
common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])

def truecase(s, threshold = 0.5):
    '''Attempts to correctly capitalise words in a sentence.
    Returns a string.'''
    # capitalise the first alphabet character, not simply the first
    s = s.capitalize()
    for i, c in enumerate(s):
        if c == ' ':  # potentially broken quote, don't capitalise in this case
            break
        if c.isalpha():
            s = s[:i] + s[i:].capitalize()
            break
Example #18
# line = lines[2]
# print(line.encode('unicode_escape'))
# for c in line:
#     if ord(c) > 127:
#         print('{} U+{:04x} {}'.format(c, ord(c), unicodedata.name(c)))

# m = re.search('\u015b\w*', line)
# print(m.group())

wlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

# textonyms = [w for w in wlist if re.search('^[ghi][mno][jlk][def]$', w)]

chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))

wsj = sorted(set(treebank.words()))

# fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))

# print([int(n) for n in re.findall(r'[0-9]{2,4}', '2009-12-31')])

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)


def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
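        # (completion of the truncated snippet, following the standard NLTK book version)
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word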
Example #19
from nltk import UnigramTagger
from nltk.corpus import treebank

from tag_util import word_tag_model

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
"""句法分析-形式语言与自动机"""
import nltk
from nltk import FreqDist, Nonterminal, nonterminals, Production
from nltk.corpus import treebank, sinica_treebank
from nltk.grammar import toy_pcfg2

print(str(nltk.corpus.treebank).replace('\\\\', '/'))
out = treebank.fileids()
print(out)
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))
print(treebank.parsed_sents('wsj_0007.mrg')[2])
# parse tree
# treebank_chunk.chunked_sents()[1].draw()
# out = treebank_chunk.chunked_sents()[1].leaves()
# out = treebank_chunk.chunked_sents()[1].pos()
# out = treebank_chunk.chunked_sents()[1].productions()
# print(out)
fd = FreqDist()
fd.items()
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
"""上下文无关文法(Context-free Grammar, CFG)
参考wiki 
自动机理论 https://zh.wikipedia.org/zh-cn/%E8%87%AA%E5%8B%95%E6%A9%9F%E7%90%86%E8%AB%96
在计算机科学中,若一个形式文法 G = (V, Σ, P, S) 的产生式规则都取如下的形式:A -> α,则谓之。其中 A∈V ,α∈(V∪Σ)* 。
上下文无关文法取名为“上下文无关”的原因就是因为字符 A 总可以被字符串 α 自由替换,而无需考虑字符 A 出现的上下文。
一个CFG由以下部分组成:
    非终结符的有限集合(N)
    终结符的有限集合(T)
    开始符号(S)
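# To make the definition concrete, a small illustration (not part of the
# original snippet): declare a toy CFG with NLTK and parse a two-word sentence.
from nltk import CFG, ChartParser

toy_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> 'I'
    VP -> V
    V -> 'sleep'
""")
parser = ChartParser(toy_grammar)
for tree in parser.parse(['I', 'sleep']):
    print(tree)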
#print(wordnet.synsets("bank")[0].lemmas()[1].name())
#print(wordnet.synsets("bank")[0].name())
n = 15

# Creating w matrix
w = [[0.0 for x in range(len(word_list))] for y in range(len(word_list))]
for word1, word2 in product(word_list, word_list):
    count = 0

    n_grams = ngrams(brown.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(treebank.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(inaugural.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1

    n_grams = ngrams(names.words(), n)

    for grams in n_grams:
        if word1 in grams and word2 in grams:
Example #22
import nltk
import nltk.corpus
print(str(nltk.corpus.treebank).replace('\\\\','/'))
print(nltk.corpus.treebank.fileids())
from nltk.corpus import treebank
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))

Example #23
# tokenize words
word_tokenizer = TreebankWordTokenizer()
word_list = [word_tokenizer.tokenize(sent) for sent in article_sent]

# train pos tagger
# evaluate accuracy
test_sents = treebank.tagged_sents()[3000:]
test_chunks = treebank_chunk.chunked_sents()[3000:]
conll_test = conll2000.chunked_sents('test.txt')

train_new_tagger = False 
if train_new_tagger:
  train_sents = treebank.tagged_sents()[:3000]
  # create a dictionary of the most frequent words from the treebank
  print("creating dictionary from treebank")
  model = word_tag_model(treebank.words(), treebank.tagged_words())
  
  #keeping tagger default for chaining purposes
  print("Training tagger")
  
  backoff= DefaultTagger('NN')
  nt = NamesTagger(backoff=backoff)
  #taggers = [UnigramTagger, BigramTagger, TrigramTagger]
  #trained_taggers = backoff_tagger(train_sents,taggers,backoff=nt)
  #Regexp - best to treat numbers? 
  regexp_tagger = RegexpTagger(patterns, backoff=nt)
  treebank_tagger = UnigramTagger(model=model,backoff=regexp_tagger)

  #skipping affix
  
  #skipping brill
__author__ = 'rumesh'

import nltk
from nltk.corpus import treebank

treebank_tagged = treebank.tagged_sents()
treebank_text = treebank.words()
# print len(treebank_text)
# print treebank_text[:50]

default_tagger = nltk.DefaultTagger("NN")
# print default_tagger.tag(treebank_text[:50])
# print default_tagger.evaluate(treebank_tagged)

unigram_tagger = nltk.UnigramTagger(treebank_tagged)
# print unigram_tagger.tag(treebank_text[:50])

size = int(len(treebank_tagged) * 0.9)
treebank_train = treebank_tagged[:size]
treebank_test = treebank_tagged[size:]
unigram_tagger = nltk.UnigramTagger(treebank_train)
# print unigram_tagger.evaluate(treebank_test)

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(treebank_train, backoff=t0)
t2 = nltk.BigramTagger(treebank_train, backoff=t1)
print(t2.evaluate(treebank_test))

text = "Three Calgarians have found a rather unusual way of leaving snow and ice behind. They set off this week on foot and by camels on a grueling trek across the burning Arabian desert."
tokens = nltk.wordpunct_tokenize(text)
taggedtext = t2.tag(tokens)
Example #25
#BrownAndTreebankTagsList.py - by Tarek Kanan, 9/15/2014, for VT CS4984, CL
from __future__ import division
import nltk, pickle
from nltk.corpus import brown
from nltk.corpus import treebank

# Building a large tagging corpus (FireEventTrainingSet) by combining
#  the Brown and Penn Treebank POS tagging corpora.
FireEventTrainingSet = nltk.corpus.brown.tagged_words() + nltk.corpus.treebank.tagged_words()
fire = brown.words() + treebank.words()

#To print the number of POS tags in the new big tags corpus

#print 'the number of tags in the corpus: ', len(FireEventTrainingSet)

#To print the new corpus tags list
#print '\n the corpus tags list', FireEventTrainingSet

>>>>>>> Unit 4 files. Subsets for manual classification
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)
Example #26
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
#!/local/bin/python3

from nltk import FreqDist
from nltk.corpus import treebank
from nltk import chunk, tag

# load treebank data
data = [treebank.words(i) for i in treebank.fileids()]
data = [tag.pos_tag(i) for i in data]

# chunk the data
chunkd_data = [chunk.ne_chunk(i) for i in data]
# select subtrees which are NE
chunkd_tree = [
    i.subtrees(filter=lambda x: x.label() in [
        "ORGANIZATION", "PERSON", "LOCATION", "DATE", "TIME", "MONEY",
        "PERCENT", "FACULTY", "GPE"
    ]) for i in chunkd_data
]
chunkd_trees = [[i for i in j] for j in chunkd_tree]
arr = []
for i in chunkd_trees:
    for j in i:
        arr.append(j)
word_fd = FreqDist(
    [' '.join(word for word, pos in tree.leaves()) for tree in arr])
print("Three most common named entities are: ")
for token, freq in word_fd.most_common(3):
    print("%s : %d" % (token, freq))
text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)

def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
    print("text4:", text4.name)
    print("text5:", text5.name)
    print("text6:", text6.name)
import pickle

from tqdm import tqdm
from nltk.corpus import treebank
from nltk.corpus import propbank

# Tiny example: PropBank
pb_instances = propbank.instances()
len(pb_instances)  # 112917
inst = pb_instances[1]
inst.fileid, inst.sentnum, inst.wordnum
print(propbank.instances()[1])
infl = inst.inflection
infl.form, infl.tense, infl.aspect, infl.person, infl.voice

# Tiny example: TreeBank
len(treebank.fileids())  # 199
len(treebank.parsed_sents())  # 3914
print(treebank.words('wsj_0001.mrg')[:])

# Compile all propbank metadata of verbs
pb_instances = propbank.instances()
index = [(inst.fileid, inst.sentnum, inst.wordnum, inst.inflection.tense)
         for inst in tqdm(pb_instances)]

ann = []
for fileid, sentnum, wordnum, tense in tqdm(index):
    allwords = treebank.parsed_sents(fileid)[sentnum].leaves()
    word = allwords[wordnum]
    ann.append((fileid, sentnum, wordnum, tense, word, allwords))

with open('propbank_preprocessed.pkl', 'wb') as f:
    pickle.dump(ann, f)
Example #30
    'English: Brown Corpus (Press)':
    lambda: brown.words(categories=['news', 'editorial', 'reviews']),
    'English: Brown Corpus (Religion)':
    lambda: brown.words(categories='religion'),
    'English: Brown Corpus (Learned)':
    lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)':
    lambda: brown.words(categories='science_fiction'),
    'English: Brown Corpus (Romance)':
    lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)':
    lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}
Example #31
import re
#import nltk
from nltk.corpus import words as wc
wordlist = [w for w in wc.words('en') if w.islower()]
print(len(wordlist))
print(len(set(wordlist)))

#===============
from nltk.corpus import treebank as tb
wsj = sorted(set(tb.words()))

# re.search() returns a match object (or None), not a boolean; use m.group() to get the matched text

print([w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)])
# ['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5',
# '0.50', '0.54', '0.56', '0.60', '0.7', '0.82', '0.84', '0.9', '0.95', '0.99',
# '1.01', '1.1', '1.125', '1.14', '1.1650', '1.17', '1.18', '1.19', '1.2', ...]

print([w for w in wsj if re.search(r'^[A-Z]+\$$', w)])
#['C$', 'US$']

print([w for w in wsj if re.search('^[0-9]{4}$', w)])
#['1614', '1637', '1787', '1901', '1903', '1917', '1925', '1929', '1933', ...]

print([w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)])
#['10-day', '10-lap', '10-year', '100-share', '12-point', '12-year', ...]

print([w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)])
#['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting',
#'savings-and-loan']
Example #32
    def train_supervised(self, labelled_sequences, extra_data=False, estimator=None):
        # This is copied from HiddenMarkovModelTrainer

        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        known_symbols = set(self._symbols)
        known_states = set(self._states)

        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[1]
                symbol = token[0]
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                outputs[state][symbol] += 1
                lasts = state

                # update the state and symbol lists
                if state not in known_states:
                    self._states.append(state)
                    known_states.add(state)

                if symbol not in known_symbols:
                    self._symbols.append(symbol)
                    known_symbols.add(symbol)

        if extra_data:
            print('-'*20)
            print("Using extra data to calculate transition probability")
            sent = ""
            for word in tqdm(treebank.words()):
                if word == '.':
                    sent = sent[:-1] + word
                    lasts = None
                    for c in sent:
                        if c in list(string.ascii_lowercase)+[' ', ',', '.']:
                            if lasts is not None:
                                transitions[lasts][c] += 1
                        lasts = c
                    sent = ""
                elif word == ',':
                    sent = sent[:-1] + word + ' '
                else:
                    sent += word + ' '

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))

        return hmm.HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
Example #33
import os
import sqlite3
import re
from collections import defaultdict
from nltk.corpus import brown, treebank, words as words_list, abc, movie_reviews, genesis

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)
    
print "Building clean words list..."
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))