def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")

    # prepare parsing trees, extracted from treebank
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)

    # build vocabulary list, extracted from treebank
    vocab_size = 10000  # set vocabulary size to 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd, freq in Counter(treebank.words()).most_common(vocab_size)]

    # generate grammar rules list, extracted from treebank, and calculate their probability based on their frequency
    tbank_productions = set(production for tree in tbank_trees for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)

    # calculate probability for rules
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j, i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)

    # classify left, right rules
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob
                                  if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
def test_sentences(grammar):
    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))
        tokens = list(treebank.words(t))

        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)
        print("fixed grammar")

        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)

        print("Parsing...")
        # Gets list of all possible trees; the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(time.time() - start)

        # Getting POS tags from parser tree
        leafs = parses[0].pos()

        # Calculating accuracy of parser results
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0
        print(str(correct_tags / len(leafs)))
def demo():
    from nltk.corpus import treebank

    for word in treebank.words('wsj_0034.mrg'):
        wt = word_type(word)
        if len(wt) == 0:
            wt = None
        if '*' in word:
            continue
        print "%-20s\t%s" % (word, wt)
def __init__(self, dname='treebank'):
    super().__init__()
    data = None
    # selecting the dataset
    if dname == 'treebank':
        if len(treebank.words()) == 0:
            nltk.download('treebank')
        data = treebank.tagged_sents(tagset='universal')
    elif dname == 'brown':
        if len(brown.words()) == 0:
            nltk.download('brown')
        data = brown.tagged_sents(tagset='universal')
    self.data = data
    # print(data[0:1])
    vocab, tags = self._build_vocab()
    max_sent_len = max(map(len, data))
    self.max_sent_len = max_sent_len
    self.word_to_idx = defaultdict(lambda: 0, {word: idx for idx, word in enumerate(vocab)})
    self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
    self.tag_to_idx = {tag: idx for idx, tag in enumerate(tags)}
    self.idx_to_tag = {idx: tag for tag, idx in self.tag_to_idx.items()}
    self.sen_list, self.tag_list = self._convert_to_num()
def __init__(self, *args, **kwargs):
    SequentialBackoffTagger.__init__(self, *args, **kwargs)
    self.wordnet_tag_map = {
        'n': 'NN',
        's': 'JJ',
        'a': 'JJ',
        'r': 'RB',
        'v': 'VB'
    }
    self.fd = FreqDist(treebank.words())
def question4():
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    train_words = treebank.words()
    init_tagger = RegexpTagger(patterns)
    # Not sure if we need to use BrillTagger or BrillTaggerTrainer??
    # tagger = BrillTagger(init_tagger)
    # tagger = BrillTaggerTrainer(init_tagger)
    return
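A hedged sketch of how the open question in the comments above might be resolved: in NLTK 3, BrillTaggerTrainer is the class to instantiate; it takes the initial tagger plus a rule-template set, and its train() method returns the BrillTagger. The use of treebank.tagged_sents() as training data and the fntbl37 template set are my assumptions, not part of the original snippet.

# Sketch only: resolving the BrillTagger vs. BrillTaggerTrainer question above.
from nltk import RegexpTagger
from nltk.corpus import treebank
from nltk.tag.brill import fntbl37
from nltk.tag.brill_trainer import BrillTaggerTrainer

patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')]  # abbreviated pattern list
init_tagger = RegexpTagger(patterns)

# The trainer wraps the initial tagger and learns transformation rules from tagged sentences.
trainer = BrillTaggerTrainer(init_tagger, fntbl37(), trace=0)
brill_tagger = trainer.train(treebank.tagged_sents()[:3000], max_rules=100)
print(brill_tagger.evaluate(treebank.tagged_sents()[3000:]))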
result = model(x_data)
# calculate loss
loss = criterion(result, y_data)
lossValue = loss.item()
# compute gradients
loss.backward()
# update parameters
optimizer.step()
print(lossValue)

sentences = treebank.tagged_sents()
words = treebank.words()
word_to_index = create_word_idx(words)
tag_to_index = create_tag_idx(sentences)

hidden_dim = 32
embedding_dim = 64

training_data = sentences[:3000]
test_data = sentences[3000:]
print("len sentences: ", len(sentences), "training: ", len(training_data), "test: ", len(test_data))

vocab_size = len(word_to_index)
target_size = len(tag_to_index)
print("vocab_size: ", vocab_size)
print("target_size (#tags): ", target_size)
# for sent in brown.tagged_sents():  # tagset="universal"
for sent in nltk.corpus.treebank.tagged_sents():
    # sent is a list of word/tag pairs
    # add START/START at the beginning
    brown_tags_words.append(("START", "START"))
    # then all the tag/word pairs for the word/tag pairs in the sentence
    brown_tags_words.extend([(tag, word) for (word, tag) in sent])
    # then END/END
    brown_tags_words.append(("END", "END"))

# conditional frequency distribution
cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
# conditional probability distribution
# cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.LaplaceProbDist, bins=len(treebank.words()))
cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist, bins=len(treebank.words()))

print "The probability of an adjective (JJ) being 'new' is", cpd_tagwords["JJ"].prob("new")
print "The probability of TO being 'to' is", cpd_tagwords["TO"].prob("to")
print "The probability of a past participle (VBN) being 'eat' is", cpd_tagwords["VBN"].prob("eat")

"""Part 2: P(s_i | s_{i-1})

Estimating P(t_i | t_{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
P(t_i | t_{i-1}) = count(t_{i-1}, t_i) / count(t_{i-1})
"""

brown_tags = [tag for (tag, word) in brown_tags_words]

# make conditional frequency distribution:
# count(t_{i-1}, t_i)
cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
# make conditional probability distribution, using
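The snippet above breaks off just before the transition distribution is built. As a hedged continuation sketch (not the original author's exact code), the MLE formula in the docstring corresponds directly to wrapping the bigram tag counts in a ConditionalProbDist with an MLEProbDist estimator:

# Sketch: MLE transition probabilities P(t_i | t_{i-1}) from the bigram counts above.
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
# e.g. the estimated probability that a determiner tag (DT) is followed by a noun tag (NN):
print(cpd_tags["DT"].prob("NN"))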
text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
    print("text4:", text4.name)
    print("text5:", text5.name)
print(brown.categories(), '\n')
brown_humor_tagged = brown.tagged_words(categories='humor', tagset='universal')
print(brown_humor_tagged[:50])

# the chat corpus uses Penn POS tags
print(nltk.corpus.nps_chat.tagged_words()[:50])

# Penn Treebank
from nltk.corpus import treebank

# use corpus methods to get the text as strings and as tokens as before
treebank_text = treebank.raw()
print(treebank_text[:150], '\n')
treebank_tokens = treebank.words()
print(treebank_tokens[:20])

# but we also have functions to get words with tags and sentences with tagged words
treebank_tagged_words = treebank.tagged_words()
print(treebank_tagged_words[:50])
treebank_tagged = treebank.tagged_sents()
print(len(treebank.tagged_sents()))
print(treebank_tagged[:2])

## Frequency distribution of tags in Penn Treebank
tag_fd = nltk.FreqDist(tag for (word, tag) in treebank_tagged_words)
print(tag_fd.keys(), '\n')
    'its just can my one up just can so me my when find u not your I',
    '-Mnager of number eight basically laughed in my face when I asked about the job in there hahaha cheers',
    'I\'m not even pooping omg I https://t.co/8Q9QlDvUoQ',
    'Mexican cheese dip & Doritos = good eating',
    '2 down 1 to go',
    'RT @_RyanHowell: Imagine what a rainbow would taste like....',
    '@VCrippen this should be a broadway musical! #lol #waffles']

tc = make_corpus(tweets)

with open('tc_words.txt', 'w') as f:
    for word in tc['words']:
        f.write('%s\n' % word)

with open('treebank_words.txt', 'w') as f:
    for word in treebank.words():
        f.write('%s\n' % word)

tc_tags = []
for t in tc['tagged_words']:
    tc_tags.append(t[1])

with open('tc_tags.txt', 'w') as f:
    for tag in tc_tags:
        f.write('%s\n' % tag)

treebank_tags = []
for t in treebank.tagged_words():
    treebank_tags.append(t[1])

with open('treebank_tags.txt', 'w') as f:
from nltk.corpus import treebank as tb
import sys

ids = tb.fileids()
for id in ids:
    wsj = tb.words(id)
    wsj = ' '.join(wsj)
    wsj = wsj.split(' . ')
    counter = 1
    for i, sent in enumerate(wsj):
        with open('UCCA_English-WSJ-master/WSJ_DIR/' + str(id[:-4]) + '.' + str(counter) + '.mrg', 'w') as outfile:
            if i + 1 != len(wsj):
                to_write = sent + ' .'
                outfile.write(to_write)
            else:
                outfile.write(sent)
        counter += 1
    'English: Brown Corpus (Press)': lambda: brown.words(categories=['news', 'editorial', 'reviews']),
    'English: Brown Corpus (Religion)': lambda: brown.words(categories='religion'),
    'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)': lambda: brown.words(categories='science_fiction'),
    'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus': lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus': lambda: treebank.words(),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus': lambda: alpino.words(),
    'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
    'Spanish: CESS-ESP Corpus': lambda: cess_esp.words()
}
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
print('total_arvores treebank=', acc)

# counting the total number of trees in the single available text of the 'floresta' corpus
acc = 0
for i in floresta.fileids()[:1]:
    lf = len(floresta.parsed_sents(i))
    acc = acc + lf
print('total_arvores floresta=', acc)

# re-enabling stderr so that error messages are printed again
enable_stderr(r)

"""Let us now inspect the word totals of each corpus. Notice that the floresta corpus is much richer in number of words."""

print("floresta.words=", len(floresta.words()), "\ntreebank.words=", len(treebank.words()))

"""Let us now learn how to walk the parse trees of the 'floresta' corpus and normalize the production rules, skipping those that cannot be normalized. To do this, we will use exception handling. When running the code, you can see that only a small number of parse trees cannot be normalized. (A sketch of this traversal appears after this snippet.)
"""

# disabling stderr. To re-enable it, just call enable_stderr(r)
r = disable_stderr()

from nltk import treetransforms
from nltk import induce_pcfg
from nltk import Nonterminal

# counters for trees that are OK and for trees that fail
ok = 0;
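The snippet above is cut off right after the counters are declared. What follows is a rough sketch of the traversal the text describes, with hypothetical counter and list names (ok, falha, producoes) and exception handling around chomsky_normal_form(); it is my reconstruction, not the original code.

# Sketch of the traversal described above: convert each floresta parse tree to
# Chomsky Normal Form, collecting productions and counting trees that fail.
from nltk.corpus import floresta

ok = 0
falha = 0
producoes = []
for tree in floresta.parsed_sents():
    try:
        tree.chomsky_normal_form()        # may raise on trees that cannot be normalized
        producoes.extend(tree.productions())
        ok += 1
    except Exception:
        falha += 1
print('normalized =', ok, 'failed =', falha)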
to separate this data so it can be checked definitively during truecasing, rather than relying on a threshold probability of capitalisation.

One use of this type of program is in customer-facing natural language query systems, where it cannot be guaranteed that the user will use correct casing. If correct casing can be inferred from the input, it becomes easier to recover the contextual meaning of the input, and it may make certain backend queries simpler (correctly casing the name of a movie in a natural language request about that movie and its actors allows simpler extraction of the key information, since it can more readily be tagged as a proper noun). It could also be used as part of a spellchecker that automatically corrects capitalisation.
"""

# need some global data variables
wordlist = set(words.words() + treebank.words())
common_words_lower = set([w for w in wordlist if w.islower()])
common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])


def truecase(s, threshold=0.5):
    '''Attempts to correctly capitalise words in a sentence. Returns a string.'''
    # capitalise the first alphabetic character, not simply the first character
    s = s.capitalize()
    for i, c in enumerate(s):
        if c == ' ':
            # potentially broken quote, don't capitalise in this case
            break
        if c.isalpha():
            s = s[:i] + s[i:].capitalize()
            break
# line = lines[2]
# print(line.encode('unicode_escape'))
# for c in line:
#     if ord(c) > 127:
#         print('{} U+{:04x} {}'.format(c, ord(c), unicodedata.name(c)))
# m = re.search('\u015b\w*', line)
# print(m.group())

wlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# textonyms = [w for w in wlist if re.search('^[ghi][mno][jlk][def]$', w)]
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
wsj = sorted(set(treebank.words()))
# fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
# print([int(n) for n in re.findall(r'[0-9]{2,4}', '2009-12-31')])

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)


def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
from nltk import UnigramTagger
from nltk.corpus import treebank
from tag_util import word_tag_model

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
"""句法分析-形式语言与自动机""" import nltk from nltk import FreqDist, Nonterminal, nonterminals, Production from nltk.corpus import treebank, sinica_treebank from nltk.grammar import toy_pcfg2 print(str(nltk.corpus.treebank).replace('\\\\', '/')) out = treebank.fileids() print(out) print(treebank.words('wsj_0007.mrg')) print(treebank.tagged_words('wsj_0007.mrg')) print(treebank.parsed_sents('wsj_0007.mrg')[2]) # 语法树 # treebank_chunk.chunked_sents()[1].draw() # out = treebank_chunk.chunked_sents()[1].leaves() # out = treebank_chunk.chunked_sents()[1].pos() # out = treebank_chunk.chunked_sents()[1].productions() # print(out) fd = FreqDist() fd.items() print(sinica_treebank.sents()) print(sinica_treebank.parsed_sents()[27]) """上下文无关文法(Context-free Grammar, CFG) 参考wiki 自动机理论 https://zh.wikipedia.org/zh-cn/%E8%87%AA%E5%8B%95%E6%A9%9F%E7%90%86%E8%AB%96 在计算机科学中,若一个形式文法 G = (V, Σ, P, S) 的产生式规则都取如下的形式:A -> α,则谓之。其中 A∈V ,α∈(V∪Σ)* 。 上下文无关文法取名为“上下文无关”的原因就是因为字符 A 总可以被字符串 α 自由替换,而无需考虑字符 A 出现的上下文。 一个CFG由以下部分组成: 非终结符的有限集合(N) 终结符的有限集合(T) 开始符号(S)
# print(wordnet.synsets("bank")[0].lemmas()[1].name())
# print(wordnet.synsets("bank")[0].name())

n = 15

# Creating w matrix
w = [[0.0 for x in range(len(word_list))] for y in range(len(word_list))]

for word1, word2 in product(word_list, word_list):
    count = 0
    n_grams = ngrams(brown.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(treebank.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(inaugural.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(names.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
import nltk
import nltk.corpus

print(str(nltk.corpus.treebank).replace('\\\\', '/'))
print(nltk.corpus.treebank.fileids())

from nltk.corpus import treebank
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))
# tokenize words
word_tokenizer = TreebankWordTokenizer()
word_list = [word_tokenizer.tokenize(sent) for sent in article_sent]

# train pos tagger
# evaluate accuracy
test_sents = treebank.tagged_sents()[3000:]
test_chunks = treebank_chunk.chunked_sents()[3000:]
conll_test = conll2000.chunked_sents('test.txt')

train_new_tagger = False
if train_new_tagger:
    train_sents = treebank.tagged_sents()[:3000]

    # create dictionary from treebank of most frequent words
    print("creating dictionary from treebank")
    model = word_tag_model(treebank.words(), treebank.tagged_words())

    # keeping tagger default for chaining purposes
    print("Training tagger")
    backoff = DefaultTagger('NN')
    nt = NamesTagger(backoff=backoff)
    # taggers = [UnigramTagger, BigramTagger, TrigramTagger]
    # trained_taggers = backoff_tagger(train_sents, taggers, backoff=nt)

    # Regexp - best to treat numbers?
    regexp_tagger = RegexpTagger(patterns, backoff=nt)
    treebank_tagger = UnigramTagger(model=model, backoff=regexp_tagger)
    # skipping affix
    # skipping brill
__author__ = 'rumesh'

import nltk
from nltk.corpus import treebank

treebank_tagged = treebank.tagged_sents()
treebank_text = treebank.words()
# print len(treebank_text)
# print treebank_text[:50]

default_tagger = nltk.DefaultTagger("NN")
# print default_tagger.tag(treebank_text[:50])
# print default_tagger.evaluate(treebank_tagged)

unigram_tagger = nltk.UnigramTagger(treebank_tagged)
# print unigram_tagger.tag(treebank_text[:50])

size = int(len(treebank_tagged) * 0.9)
treebank_train = treebank_tagged[:size]
treebank_test = treebank_tagged[size:]
unigram_tagger = nltk.UnigramTagger(treebank_train)
# print unigram_tagger.evaluate(treebank_test)

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(treebank_train, backoff=t0)
t2 = nltk.BigramTagger(treebank_train, backoff=t1)
print t2.evaluate(treebank_test)

text = "Three Calgarians have found a rather unusual way of leaving snow and ice behind. They set off this week on foot and by camels on a grueling trek across the burning Arabian desert."
tokens = nltk.wordpunct_tokenize(text)
taggedtext = t2.tag(tokens)
# BrownAndTreebankTagsList.py - by Tarek Kanan, 9/15/2014, for VT CS4984, CL
from __future__ import division
import nltk, pickle
from nltk.corpus import brown
from nltk.corpus import treebank

# Building a large tagging corpus (FireEventTrainingSet) by combining
# the Brown and Penn Treebank POS tagging corpora.
FireEventTrainingSet = nltk.corpus.brown.tagged_words() + nltk.corpus.treebank.tagged_words()
fire = brown.words() + treebank.words()

# To print the number of POS tags in the new big tags corpus
# print 'the number of tags in the corpus: ', len(FireEventTrainingSet)
# To print the new corpus tags list
# print '\n the corpus tags list', FireEventTrainingSet

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
#!/local/bin/python3
from nltk import FreqDist
from nltk.corpus import treebank
from nltk import chunk, tag

# load treebank data
data = [treebank.words(i) for i in treebank.fileids()]
data = [tag.pos_tag(i) for i in data]

# chunk the data
chunkd_data = [chunk.ne_chunk(i) for i in data]

# select subtrees which are NE
chunkd_tree = [
    i.subtrees(filter=lambda x: x.label() in [
        "ORGANIZATION", "PERSON", "LOCATION", "DATE",
        "TIME", "MONEY", "PERCENT", "FACULTY", "GPE"
    ])
    for i in chunkd_data
]
chunkd_trees = [[i for i in j] for j in chunkd_tree]

arr = []
for i in chunkd_trees:
    for j in i:
        arr.append(j)

word_fd = FreqDist(
    [' '.join(word for word, pos in tree.leaves()) for tree in arr])

print("Three most common named entities are: ")
for token, freq in word_fd.most_common(3):
    print("%s : %d" % (token, freq))
text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
    print("text4:", text4.name)
    print("text5:", text5.name)
    print("text6:", text6.name)
import pickle

from tqdm import tqdm
from nltk.corpus import treebank
from nltk.corpus import propbank

# Tiny example: PropBank
pb_instances = propbank.instances()
len(pb_instances)  # 112917
inst = pb_instances[1]
inst.fileid, inst.sentnum, inst.wordnum
print(propbank.instances()[1])
infl = inst.inflection
infl.form, infl.tense, infl.aspect, infl.person, infl.voice

# Tiny example: TreeBank
len(treebank.fileids())       # 199
len(treebank.parsed_sents())  # 3914
print(treebank.words('wsj_0001.mrg')[:])

# Compile all PropBank metadata of verbs
pb_instances = propbank.instances()
index = [(inst.fileid, inst.sentnum, inst.wordnum, inst.inflection.tense)
         for inst in tqdm(pb_instances)]
ann = []
for fileid, sentnum, wordnum, tense in tqdm(index):
    allwords = treebank.parsed_sents(fileid)[sentnum].leaves()
    word = allwords[wordnum]
    ann.append((fileid, sentnum, wordnum, tense, word, allwords))

with open('propbank_preprocessed.pkl', 'wb') as f:
    pickle.dump(ann, f)
import re
# import nltk
from nltk.corpus import words as wc

wordlist = [w for w in wc.words('en') if w.islower()]
print(len(wordlist))
print(len(set(wordlist)))

# ===============
from nltk.corpus import treebank as tb
wsj = sorted(set(tb.words()))

# [Q] Does re.search() return a boolean? --> No: it returns a match object m (or None); m.group() gives the matched text.
print([w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)])
# ['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5',
#  '0.50', '0.54', '0.56', '0.60', '0.7', '0.82', '0.84', '0.9', '0.95', '0.99',
#  '1.01', '1.1', '1.125', '1.14', '1.1650', '1.17', '1.18', '1.19', '1.2', ...]

print([w for w in wsj if re.search('^[A-Z]+\$$', w)])
# ['C$', 'US$']

print([w for w in wsj if re.search('^[0-9]{4}$', w)])
# ['1614', '1637', '1787', '1901', '1903', '1917', '1925', '1929', '1933', ...]

print([w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)])
# ['10-day', '10-lap', '10-year', '100-share', '12-point', '12-year', ...]

print([w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)])
# ['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting',
#  'savings-and-loan']
def train_supervised(self, labelled_sequences, extra_data=False, estimator=None):
    # This is copied from HiddenMarkovModelTrainer
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    if extra_data:
        print('-' * 20)
        print("Using extra data to calculate transition probability")
        sent = ""
        for word in tqdm(treebank.words()):
            if word == '.':
                sent = sent[:-1] + word
                lasts = None
                for c in sent:
                    if c in list(string.ascii_lowercase) + [' ', ',', '.']:
                        if lasts is not None:
                            transitions[lasts][c] += 1
                        lasts = c
                sent = ""
            elif word == ',':
                sent = sent[:-1] + word + ' '
            else:
                sent += word + ' '

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return hmm.HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
import os
import sqlite3
import re
from collections import defaultdict

from nltk.corpus import brown, treebank, words as words_list, abc, movie_reviews, genesis

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)

print "Building clean words list..."
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))