Example #1
import nltk
from nltk.corpus import brown, sinica_treebank


def exercise1():
    # Perform part-of-speech tagging
    text = nltk.word_tokenize("You are a good man, but i don't like you!")
    print(text)
    print(nltk.pos_tag(text))

    words_tag = brown.tagged_words(categories='news')
    print(words_tag[:30])
    words_tag = brown.tagged_words(categories='news', tagset='universal')
    print(words_tag[:30])
    # Only the mapping to the 'universal' tagset ships with NLTK; requesting
    # tagset='wsj' or tagset='brown' from the Brown corpus fails at lookup
    # time, so those two calls are left commented out.
    # words_tag = brown.tagged_words(categories='news', tagset='wsj')
    # words_tag = brown.tagged_words(categories='news', tagset='brown')

    tagged_sents = sinica_treebank.tagged_sents()
    print(tagged_sents)

    raw = "You are a good man, but i don't love you!"
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    tagged_words = default_tagger.tag(tokens)
    print(tagged_words)

    tagged_sents = brown.tagged_sents(categories='news')
    print(default_tagger.evaluate(tagged_sents))  # accuracy of the all-'NN' baseline
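For the tagset comparison in Example #1, individual tags can also be converted by hand with nltk.tag.map_tag. A minimal sketch, assuming the universal_tagset resource has been installed via nltk.download('universal_tagset'):

# Sketch: mapping single Brown / WSJ tags to the universal tagset by hand.
from nltk.tag import map_tag

print(map_tag('brown', 'universal', 'NN'))   # -> 'NOUN'
print(map_tag('wsj', 'universal', 'VBD'))    # -> 'VERB'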
Example #2
import nltk
from nltk.corpus import sinica_treebank


def Tag_Word_model():
    sinica_treebank_tagged_sents = sinica_treebank.tagged_sents()
    size = int(len(sinica_treebank_tagged_sents) * 0.9)
    train_sents = sinica_treebank_tagged_sents[:size]  # 90% of the data as the training set
    test_sents = sinica_treebank_tagged_sents[size:]   # 10% of the data as the test set
    t0 = nltk.DefaultTagger('Nab')  # default tag is the Sinica common-noun tag 'Nab'
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)  # unigram tagger
    t2 = nltk.BigramTagger(train_sents, backoff=t1)   # bigram tagger with unigram backoff
    # dump_result(t2.tag(test_sents))
    print(t2.evaluate(test_sents))  # evaluate tagger accuracy on the held-out tagged sentences
    return t2
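A brief usage sketch for Tag_Word_model(); the sample sentence and its whitespace tokenization are illustrative assumptions, not part of the original code:

# Hypothetical usage of Tag_Word_model(); the sample sentence is an assumption.
tagger = Tag_Word_model()            # trains the DefaultTagger -> Unigram -> Bigram chain
tokens = '我 爱 你'.split()          # assumes whitespace-segmented input tokens
print(tagger.tag(tokens))            # a list of (word, tag) pairs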
Example #4
import nltk
from nltk.corpus import sinica_treebank as sinica


def load_featuresets():
    # get_features() is a project-specific helper defined elsewhere.
    tagged_sents = sinica.tagged_sents()
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = nltk.tag.untag(tagged_sent)
        # Reset the tag history at the start of each sentence.
        prev_tag = '<START>'
        prev2_tag = '<START>'
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((get_features(untagged_sent, i, prev_tag, prev2_tag), tag))
            prev2_tag = prev_tag
            prev_tag = tag
    return featuresets
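get_features() is defined elsewhere in that project; its real implementation is not shown, so the following is only a hypothetical sketch that matches the (sentence, index, previous-tag) call signature used above:

# Hypothetical feature extractor; the original project's features are unknown.
def get_features(sent, i, prev_tag, prev2_tag):
    word = sent[i]
    return {
        'word': word,
        'suffix1': word[-1:],                              # last character of the word
        'prev_word': sent[i - 1] if i > 0 else '<START>',  # preceding word, if any
        'prev_tag': prev_tag,
        'prev2_tag': prev2_tag,
    }

Feature dictionaries of this shape are what the nltk.classify classifiers (e.g. nltk.NaiveBayesClassifier.train) expect as the first element of each (features, label) pair.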
Example #5
	def __init__(self, mode, language):
		self.mode = mode
		if language.lower() == 'en':
			self.tagged_sents = brown.tagged_sents(categories='news')
			self.default_tagger = nltk.DefaultTagger('NN')
		elif language.lower() == 'zh':
			self.tagged_sents = sinica_treebank.tagged_sents()   # tagged at the sentence level
			self.default_tagger = nltk.DefaultTagger('Nab')
		else:
			raise ValueError('only en or zh is supported as language.')
		self.train_size = int(len(self.tagged_sents) * 0.9)
		self.train_sets = self.tagged_sents[:self.train_size]
		self.test_sets = self.tagged_sents[self.train_size:]
		del self.tagged_sents
		if self.mode == 'unigram':
			self.unigram_tagger = nltk.UnigramTagger(train=self.train_sets, backoff=self.default_tagger)

		if self.mode == 'bigram':
			self.unigram_tagger = nltk.UnigramTagger(train=self.train_sets, backoff=self.default_tagger)
			self.bigram_tagger = nltk.BigramTagger(train=self.train_sets, backoff=self.unigram_tagger)
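A usage sketch for the constructor above, assuming the __init__ belongs to a class named, say, PosTagger (the actual class name is not shown in the excerpt) and that nltk, brown and sinica_treebank are imported at module level:

# Hypothetical usage; 'PosTagger' is an assumed name for the class whose
# __init__ is shown above.
import nltk

en_tagger = PosTagger(mode='bigram', language='en')
tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
print(en_tagger.bigram_tagger.tag(tokens))
print(en_tagger.bigram_tagger.evaluate(en_tagger.test_sets))  # .accuracy() in newer NLTK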
Example #6
 lambda: brown.tagged_sents(categories="science_fiction",
                            tagset="universal"),
 "English: Brown Corpus (Romance, simplified)":
 lambda: brown.tagged_sents(categories="romance", tagset="universal"),
 "English: Brown Corpus (Humor, simplified)":
 lambda: brown.tagged_sents(categories="humor", tagset="universal"),
 "English: NPS Chat Corpus":
 lambda: nps_chat.tagged_posts(),
 "English: NPS Chat Corpus (simplified)":
 lambda: nps_chat.tagged_posts(tagset="universal"),
 "English: Wall Street Journal Corpus":
 lambda: treebank.tagged_sents(),
 "English: Wall Street Journal Corpus (simplified)":
 lambda: treebank.tagged_sents(tagset="universal"),
 "Chinese: Sinica Corpus":
 lambda: sinica_treebank.tagged_sents(),
 "Chinese: Sinica Corpus (simplified)":
 lambda: sinica_treebank.tagged_sents(tagset="universal"),
 "Dutch: Alpino Corpus":
 lambda: alpino.tagged_sents(),
 "Dutch: Alpino Corpus (simplified)":
 lambda: alpino.tagged_sents(tagset="universal"),
 "Hindi: Indian Languages Corpus":
 lambda: indian.tagged_sents(files="hindi.pos"),
 "Hindi: Indian Languages Corpus (simplified)":
 lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"),
 "Portuguese: Floresta Corpus (Portugal)":
 lambda: floresta.tagged_sents(),
 "Portuguese: Floresta Corpus (Portugal, simplified)":
 lambda: floresta.tagged_sents(tagset="universal"),
 "Portuguese: MAC-MORPHO Corpus (Brazil)":
Example #7
 'English: Brown Corpus (Science Fiction, simplified)':
     lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
 'English: Brown Corpus (Romance, simplified)':
     lambda: brown.tagged_sents(categories='romance', tagset='universal'),
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='universal'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='universal'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='universal'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='universal'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='universal'),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(tagset='universal'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
Example #8
 'English: Brown Corpus (Science Fiction, simplified)':
     lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
 'English: Brown Corpus (Romance, simplified)':
     lambda: brown.tagged_sents(categories='romance', tagset='simple'),
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(tagset='simple'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
Example #9
 'English: Brown Corpus (Science Fiction, simplified)':
     lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True),
 'English: Brown Corpus (Romance, simplified)':
     lambda: brown.tagged_sents(categories='romance', simplify_tags=True),
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
     lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
     lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
     lambda: floresta.tagged_sents(simplify_tags=True),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
Example #10
 lambda: brown.tagged_sents(categories='science_fiction',
                            simplify_tags=True),
 'English: Brown Corpus (Romance, simplified)':
 lambda: brown.tagged_sents(categories='romance', simplify_tags=True),
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
 lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
 lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
 'Portuguese: Floresta Corpus (Portugal)':
 lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
 lambda: floresta.tagged_sents(simplify_tags=True),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
Example #11
 'English: Brown Corpus (Science Fiction, simplified)':
 lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
 'English: Brown Corpus (Romance, simplified)':
 lambda: brown.tagged_sents(categories='romance', tagset='simple'),
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
 lambda: indian.tagged_sents(files='hindi.pos'),
 'Hindi: Indian Languages Corpus (simplified)':
 lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
 'Portuguese: Floresta Corpus (Portugal)':
 lambda: floresta.tagged_sents(),
 'Portuguese: Floresta Corpus (Portugal, simplified)':
 lambda: floresta.tagged_sents(tagset='simple'),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
Example #12
 ),
 "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
     categories="romance", tagset="universal"
 ),
 "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
     categories="humor", tagset="universal"
 ),
 "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
 "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
     tagset="universal"
 ),
 "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
 "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
     tagset="universal"
 ),
 "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
 "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
     tagset="universal"
 ),
 "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
 "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
     tagset="universal"
 ),
 "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
 "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
     files="hindi.pos", tagset="universal"
 ),
 "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
 "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
     tagset="universal"
 ),
Example #13
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="simple"),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="simple"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white
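Examples #6 through #13 are all variants of the same corpus-selection table from NLTK's concordance app: each display name maps to a zero-argument lambda, so no corpus is read until the user actually picks it. A minimal sketch of how such a lazy-loading table is consumed (the name CORPORA is an assumption, not the app's actual variable):

# Sketch of a name -> loader table; 'CORPORA' is an assumed name.
from nltk.corpus import brown, treebank

CORPORA = {
    "English: Brown Corpus (News, simplified)":
        lambda: brown.tagged_sents(categories="news", tagset="universal"),
    "English: Wall Street Journal Corpus":
        lambda: treebank.tagged_sents(),
}

choice = "English: Wall Street Journal Corpus"
tagged_sents = CORPORA[choice]()   # the corpus is only read here
print(tagged_sents[0][:10])        # first ten (word, tag) pairs of the first sentence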
Example #14
    VP -> V NP | V NP PP
    PP -> P NP
    V -> "saw" | "ate" | "walked" | '爱'
    NP -> "John" | "Mary" | "Bob" | Det N | Det N PP | '我' | '你'
    Det -> "a" | "an" | "the" | "my"
    N -> "man" | "dog" | "cat" | "telescope" | "park"
    P -> "in" | "on" | "by" | "with"
''')

s = '我 爱 你'

tokens = nltk.word_tokenize(s)

from nltk.corpus import sinica_treebank

sinica_treebank_tagged_sents = sinica_treebank.tagged_sents()  # tagged at the sentence level
size = int(len(sinica_treebank_tagged_sents) * 0.9)
train_sents = sinica_treebank_tagged_sents[:size]  # 90% of the data as the training set
test_sents = sinica_treebank_tagged_sents[size:]  # 10% of the data as the test set

t0 = nltk.DefaultTagger('Nab')  # default tag is the Sinica common-noun tag 'Nab'
t1 = nltk.UnigramTagger(train_sents, backoff=t0)  # unigram tagger
t2 = nltk.BigramTagger(train_sents, backoff=t1)  # bigram tagger with unigram backoff

dump_result(t2.tag(tokens))  # dump_result() is a project-specific helper defined elsewhere
print(t2.evaluate(test_sents))  # evaluate the tagger's accuracy on the held-out tagged sentences

exit()
tag = nltk.pos_tag(tokens)

for i in tag:
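The excerpt above ends mid-loop. For the CFG part of the example, a hedged sketch of how the tokenized sentence could be parsed, assuming the quoted productions come from nltk.CFG.fromstring and are bound to a variable such as grammar:

# Hypothetical continuation: parsing the tokens with the CFG shown above.
# 'grammar' is an assumed variable name for the nltk.CFG object that the
# quoted productions belong to.
parser = nltk.ChartParser(grammar)
for tree in parser.parse(tokens):   # tokens == ['我', '爱', '你']
    print(tree)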
Example #15
File: Main.py Project: abr1989/PFC
import nltk
from nltk.corpus import sinica_treebank as sinica

import Segmenter
from Segmenter import sent_segment
import PCFG
from PCFG import PCFGChino
import time

####################################
## THIS PART IS NOT IMPORTANT FOR TESTING THE PARSER
tagged_sents = sinica.tagged_sents()
sents = sinica.sents()

size = int(len(tagged_sents) * 0.9)
train_set = tagged_sents[:size]
test_set = tagged_sents[size:]
##trigram_tagger = nltk.TrigramTagger(train_set)
##score = trigram_tagger.evaluate(test_set)
print("Entrenando")  # "Training"
ini = time.time()
t0 = nltk.DefaultTagger('Nab')
t1 = nltk.UnigramTagger(train_set, backoff=t0)
t2 = nltk.BigramTagger(train_set, backoff=t1)
t3 = nltk.TrigramTagger(train_set, backoff=t2)
fin = time.time()
score = t3.evaluate(test_set)
print("Entrenamiento terminado ", str(fin - ini))  # "Training finished"
print("Evaluation Tagger= ", score)
####################################
## Create the parser
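Training the unigram/bigram/trigram backoff chain on 90% of the Sinica treebank takes a while, so a natural follow-up is to pickle the trained tagger and reload it instead of retraining; a minimal sketch (the file name is arbitrary):

# Sketch: persisting the trained trigram tagger to avoid retraining.
import pickle

with open('sinica_trigram_tagger.pkl', 'wb') as f:
    pickle.dump(t3, f)

# Later, reload it instead of running the training code again.
with open('sinica_trigram_tagger.pkl', 'rb') as f:
    t3 = pickle.load(f)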