Code example #1
# Assumes train_sent and test_sents (lists of tagged sentences) are defined elsewhere.
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger

def lexical(tokens):
    print("\nStep 2: Lexical Analysis\n")
    print("Essentially consults a dictionary to obtain the properties of each word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")

    default = DefaultTagger('NN')
    unigram = UnigramTagger(train_sent, backoff=default)
    tagger = BigramTagger(train_sent, backoff=unigram)

    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + " -> " + tag)
    print("\nThe accuracy of the trained POS tagger is:")
    print(tagger.evaluate(test_sents))

    return tagtokens
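
The function assumes train_sent and test_sents exist in the enclosing module. A minimal sketch of how they might be built from NLTK's treebank sample (the 90/10 split is an assumption):

from nltk.corpus import treebank

tagged = treebank.tagged_sents()
split = int(len(tagged) * 0.9)
train_sent = tagged[:split]   # training data for the unigram/bigram taggers
test_sents = tagged[split:]   # held out for tagger.evaluate()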
Code example #2
import os
from pickle import load

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger


class Tagger:

    def __init__(self, idiom):
        # Placeholder taggers; loadIdiom() replaces them with pre-trained ones.
        self.tagger0 = DefaultTagger('N')
        self.tagger1 = UnigramTagger(None, backoff=self.tagger0)
        self.tagger2 = BigramTagger(None, backoff=self.tagger1)
        self.lang = os.path.abspath('FriggAnswer') + '/pickle/'
        #self.lang = os.path.abspath('pickle') + '\\'
        self.loadIdiom(idiom)

    def loadIdiom(self, idiom):
        # Load the pickled unigram and bigram taggers for this language.
        with open(self.lang + idiom + '1.pkl', 'rb') as infile:
            self.tagger = load(infile)
        with open(self.lang + idiom + '2.pkl', 'rb') as infile:
            self.tagger2 = load(infile)

    def classify(self, question):
        return self.tagger2.tag(question)
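
The class only loads pre-trained pickles. A sketch of how the '<idiom>1.pkl' and '<idiom>2.pkl' files might be produced; the 'english' idiom name and the treebank training data are assumptions:

from pickle import dump
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger

train = treebank.tagged_sents()
t1 = UnigramTagger(train, backoff=DefaultTagger('N'))
t2 = BigramTagger(train, backoff=t1)

with open('english1.pkl', 'wb') as out:  # hypothetical idiom name
    dump(t1, out)
with open('english2.pkl', 'wb') as out:
    dump(t2, out)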
Code example #3
 # end init
 # Preprocessing: if the sentence contains a ':' token, keep only what follows it.
 # (wordcount and flag1 are initialized earlier, outside this excerpt.)
 for word in my_corp[sent_count]:
     if my_corp[sent_count][wordcount] == ':':
         flag1 = 1
         break
     wordcount = wordcount + 1
 if flag1 == 1:
     #print(my_corp[sent_count][wordcount+1:-1])
     curr_sent = my_corp[sent_count][wordcount + 1:-1]
 else:
     #print(my_corp[sent_count])
     curr_sent = my_corp[sent_count]
 #curr_sent=rep.replace(curr_sent)
 # end preprocessing
 tag_curr_sent = t2.tag(curr_sent)
 #print(tag_curr_sent)
 for words_tup in tag_curr_sent:
     # Count every token whose tag is not punctuation.
     if words_tup[1] not in (',', '?', '!', '.'):
         All_count = All_count + 1
     if words_tup[1] == 'NN':
         NN_count = NN_count + 1
         #NN_Num=NN_Num+1
     if words_tup[1] == 'NNP':
         NNP_count = NNP_count + 1
         NNP_Num = NNP_Num + 1
     if words_tup[1] == 'JJ':
         JJ_count = JJ_count + 1
         JJ_Num = JJ_Num + 1
     #if words_tup[1]=='VB':
     #	VB_count=VB_count+1
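
The per-tag counting above can be written more compactly. A sketch using collections.Counter to get the same per-sentence tallies (counter names follow the excerpt):

from collections import Counter

counts = Counter(tag for _, tag in tag_curr_sent)
NN_count += counts['NN']
NNP_count += counts['NNP']
JJ_count += counts['JJ']
All_count += sum(n for t, n in counts.items() if t not in (',', '?', '!', '.'))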
Code example #4

## N-gram taggers
# Assumes train_data, test_data (tagged sentences) and tokens are defined earlier.
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
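
# Sketch (assumption): rt below is not defined in this snippet; it is
# typically a regular-expression tagger used as the last-resort backoff, e.g.:
from nltk.tag import RegexpTagger

rt = RegexpTagger([
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd-person singular present
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                     # default: noun
])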

ct = combined_tagger(train_data=train_data, 
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

print(ct.evaluate(test_data))
print(ct.tag(tokens))
Code example #5
File: SDSBC.py  Project: MullerLee/ChatBot2.0
        editdis = 1000
        line_num = 0
        rate = 0

    ## 3. Standardization and NER
    if tag == 0:
        stopwords = nltk.corpus.stopwords.words('english')
        cont = nltk.word_tokenize(rep_22)
        # Name standardization: replace pronouns with the most recent names
        cont = [lc_male_name if x == 'he' else x for x in cont]
        cont = [lc_male_name if x == 'his' else x for x in cont]
        cont = [lc_female_name if x == 'she' else x for x in cont]
        cont = [lc_female_name if x == 'her' else x for x in cont]
        # Place standardization

        cont_tag = t2.tag(cont)
        cont_unstop = [word for word in cont if word not in stopwords]
        cont_tagged = t2.tag(cont_unstop)  # Final result of NER

        ## 4.Answer Filter
        # First Filter: Find names in the input
        for tupl_1 in cont_tag:
            if tupl_1[1] == 'NN':
                if tupl_1[0] in male_names:
                    lc_male_name = tupl_1[0]
                elif tupl_1[0] in female_names:
                    lc_female_name = tupl_1[0]

        # Second Filter: Find places in the input
        for tupl_2 in cont_tag:
            if tupl_2[1] == 'NN':
Code example #6
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank
training_1 = treebank.tagged_sents()[:7000]
bigramtagger = BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[2000:]
print(bigramtagger.evaluate(testing_1))
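
NLTK's treebank sample contains only about 3,900 tagged sentences, so the [2000:] test slice above lies entirely inside the [:7000] training slice and the reported accuracy is inflated. A disjoint split (sketch):

tagged = treebank.tagged_sents()
split = int(len(tagged) * 0.9)
bigramtagger = BigramTagger(tagged[:split])
print(bigramtagger.evaluate(tagged[split:]))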

Code example #7
# Train the N-gram taggers on train_data (pre-tagged sentences, i.e. labeled observations).
# Assumes train_data, test_data and tokens are defined earlier.
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))

print("\n1-Gram tags:")
print(ut.tag(tokens))

print("\n2-Gram tags:")
print(bt.tag(tokens))

print("\n3-Gram tags:")
print(tt.tag(tokens))

# Note that the best accuracy comes from the 1-gram tagger: the bigram and
# trigram taggers return None for any word context they never saw during
# training, so their coverage (and hence accuracy) drops sharply on new text.

# 4. TAGGER CHAINING WITH BACKOFF TAGGERS:


# Function to chain a set of taggers, with a backoff tagger as the last resort
def combined_tagger(training_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(training_data, backoff=backoff)
    return backoff
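
A usage sketch for the chained tagger (assuming the same train_data, test_data and tokens as above), with a DefaultTagger as the final fallback:

from nltk.tag import DefaultTagger

ct = combined_tagger(training_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=DefaultTagger('NN'))
print("Chained Tagger Accuracy: {}".format(ct.evaluate(test_data)))
print(ct.tag(tokens))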
Code example #8
        editdis = 1000
        line_num = 0
        rate = 0

    ## 3. Standardization and NER
    if tag == 0:
        stopwords = nltk.corpus.stopwords.words('english')
        cont = nltk.word_tokenize(rep_22)
        # Name standardization: replace pronouns with the most recent names
        cont = [lc_male_name if x == 'he' else x for x in cont]
        cont = [lc_male_name if x == 'his' else x for x in cont]
        cont = [lc_female_name if x == 'she' else x for x in cont]
        cont = [lc_female_name if x == 'her' else x for x in cont]
        # Place standardization

        cont_tag = t2.tag(cont)
        cont_unstop = [word for word in cont if word not in stopwords]
        cont_tagged = t2.tag(cont_unstop)  # Final result of NER
        print(cont_tagged)

        ## 4.Answer Filter
        # First Filter: Find names in the input
        for tupl_1 in cont_tag:
            if tupl_1[1] == 'NN' or tupl_1[1] == 'NNP':
                if tupl_1[0] in male_names:
                    lc_male_name = tupl_1[0]
                elif tupl_1[0] in female_names:
                    lc_female_name = tupl_1[0]

        # Second Filter: Find places in the input
        for tupl_2 in cont_tag:
Code example #9
File: ch4_20.py  Project: lesimor/nlp_python
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank
training_1 = treebank.tagged_sents()[:7000]
bigramtagger = BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[2000:]  # same train/test overlap caveat as in code example #6
print(bigramtagger.evaluate(testing_1))
Code example #10
# Assumes train_data, test_data and sentence are defined earlier.
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print('unigram tagger: ')
print(ut.evaluate(test_data))
print(ut.tag(nltk.word_tokenize(sentence)))

# testing performance of bigram tagger
print('\nbigram tagger:')
print(bt.evaluate(test_data))
print(bt.tag(nltk.word_tokenize(sentence)))

# testing performance of trigram tagger
print('\ntrigram tagger:')
print(tt.evaluate(test_data))
print(tt.tag(nltk.word_tokenize(sentence)))


#%%
# combined tagger with a list of taggers and use a backoff tagger
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

Code example #11
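This snippet relies on several names defined earlier in its source file (sent, brown_train_sents, brown_test_sents, bigramTagger, trigramTagger, brillTagger). A minimal sketch of that context; the Brown/treebank data and the Brill configuration are assumptions:

from nltk.corpus import brown, treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, brill, brill_trainer

brown_tagged = brown.tagged_sents(categories='news')
split = int(len(brown_tagged) * 0.9)
brown_train_sents = brown_tagged[:split]
brown_test_sents = brown_tagged[split:]
sent = treebank.sents()[0]  # begins with 'Pierre', matching the model override below

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

# Hypothetical Brill setup: transformation rules learned on top of a unigram tagger.
trainer = brill_trainer.BrillTaggerTrainer(UnigramTagger(brown_train_sents),
                                           brill.nltkdemo18())
brillTagger = trainer.train(brown_train_sents, max_rules=10)
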
print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

# cutoff: contexts seen fewer than this many times during training are discarded, so those words fall through to the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))

print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))

print("------------Brill Tagger------------")
print(brillTagger.tag(sent))

print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.evaluate(brown_test_sents))
Code example #12
import nltk
from nltk import word_tokenize
from nltk.corpus import brown
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

brown_tagged_sents = brown.tagged_sents(categories='news')
print(brown_tagged_sents)

# Split the data into train and test sets so we can evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] # [start : end]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data)
print(unigram_tagger.evaluate(test_data))

default_tagger = DefaultTagger('NN')

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))

my_string = "Many text corpora contain linguistic annotations, representing POS tags, named entities, syntactic structures, semantic roles, and so forth. NLTK provides convenient ways to access several of these corpora, and has data packages containing corpora and corpus samples, freely downloadable for use in teaching and research."

my_sent = nltk.sent_tokenize(my_string) # tokenize sentences
mytoken = [] # create empty list
for sent in my_sent: # for each sentence, tokenize words
  mytoken.extend(word_tokenize(sent))
print(mytoken)
print('\n')
print(bigram_tagger.tag(mytoken)) # tag each word using the backoff-chained bigram_tagger