Example #1
def demo():
    root = Tk()
    root.bind('<Control-q>', lambda e: root.destroy())

    table = Table(root, 'Word Synset Hypernym Hyponym'.split(),
                  column_weights=[0, 1, 1, 1],
                  reprfunc=(lambda i,j,s: '  %s' % s))
    table.pack(expand=True, fill='both')

    from nltk.corpus import wordnet
    from nltk.corpus import brown
    for word, pos in sorted(set(brown.tagged_words()[:500])):
        if pos[0] != 'N': continue
        word = word.lower()
        for synset in wordnet.synsets(word):
            hyper = (synset.hypernyms()+[''])[0]
            hypo = (synset.hyponyms()+[''])[0]
            table.append([word,
                          getattr(synset, 'definition', '*none*'),
                          getattr(hyper, 'definition', '*none*'),
                          getattr(hypo, 'definition', '*none*')])

    table.columnconfig('Word', background='#afa')
    table.columnconfig('Synset', background='#efe')
    table.columnconfig('Hypernym', background='#fee')
    table.columnconfig('Hyponym', background='#ffe')
    for row in range(len(table)):
        for column in ('Hypernym', 'Hyponym'):
            if table[row, column] == '*none*':
                table.itemconfig(row, column, foreground='#666',
                                 selectforeground='#666')
    root.mainloop()
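Note: the `getattr(synset, 'definition', '*none*')` calls above date from NLTK 2.x, where `Synset.definition` was a plain attribute. In NLTK 3 it became a method, so those calls would append bound methods rather than definition strings. A minimal sketch of a compatible lookup, assuming NLTK 3 (the helper name is invented here):

def definition_or_none(obj):
    # the '' placeholders used for missing hypernyms/hyponyms have no
    # .definition attribute, so fall back to the '*none*' marker
    return obj.definition() if hasattr(obj, 'definition') else '*none*'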
def exercise3():
    print
    print "Exercise 3"
    print "Part 1"
    count = 0
    total_brown_tagged_words = bn.tagged_words()
    cfd1 = nltk.ConditionalFreqDist(total_brown_tagged_words)
    set1 = set([a for (a, b) in total_brown_tagged_words])
    for s in set1:
        if (len(cfd1[s].keys()) == 5):
            count = count + 1

    print "Number of words which have exactly 5 different tags: %d" % count
    print

    print "Part 2"
    print "Words which have the most distinct tags are: "
    tags = [b for (a, b) in bn.tagged_words()]
    fd = nltk.FreqDist(tags)
    ft = fd.keys()
    cfd2 = nltk.ConditionalFreqDist(
        (tag, word) for (word, tag) in bn.tagged_words())

    for a in ft:
        if fd[a] == 1:
            print "For POS: " + a
            print cfd2[a].keys()
            print

    print
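The word-set detour above is unnecessary: a ConditionalFreqDist already exposes its distinct words via conditions(), and len(cfd[w]) is the number of distinct tags for w. A minimal Python 3 / NLTK 3 sketch of Part 1:

import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist(brown.tagged_words())
count = sum(1 for w in cfd.conditions() if len(cfd[w]) == 5)
print("Number of words which have exactly 5 different tags: %d" % count)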
Example #3
def exploreTaggedCorpora():

    brown_learned_text = brown.words(categories="learned")
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often"))

    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()

    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print w1, w2, w3

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)

    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, " ".join(tags)
def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories = category)
    wordlist = bn.words(categories = category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBP_NNPS_NNS[:10]
    print
    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories = category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if(t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print
    print "Part 4"
    print "Ratio of Masculine to Feminine is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
    print
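The "."-joined string built in Part 3 is a fragile way to count phrases (any token containing "." would corrupt the later split, and the empty trailing field gets counted too). Counting tuples directly avoids the round-trip; a minimal sketch over the same `words`, assuming Python 3 / NLTK 3:

fd = nltk.FreqDist(
    (w1.lower(), w2.lower(), w3.lower())
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words)
    if t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN'))
print(fd.most_common(3))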
Example #5
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    # add code here
    if re.match(".*[aeiou]ys$", s):
        snew = s[:-1]
    elif re.match(".*([^sxyzaeiou]|[^cs]h)s$", s):
        snew = s[:-1]
    elif re.match("[^aeiou]ies$", s):
        snew = s[:-1]
    elif re.match(".*[^s]ses$", s):
        snew = s[:-1]
    elif re.match(".*[^z]zes$", s):
        snew = s[:-1]
    elif re.match(".*([^iosxzh]|[^cs]h)es$", s):
        snew = s[:-1]
    elif s == "has":
        snew = "have"
    elif len(s) >= 5 and re.match(".*[^aeiou]ies$", s):
        snew = s[:-3] + 'y'
    elif re.match(".*([ox]|[cs]h|ss|zz)es$", s):
        snew = s[:-2]
    else:
        snew = ""
    if snew != "" and snew != "have":
        if not ((snew, "VB") in (brown.tagged_words()) and
                (s, "VBZ") in (brown.tagged_words())):
            snew = ""

    return snew
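The corpus checks above test membership with `(x, tag) in brown.tagged_words()`, a linear scan over roughly 1.1M pairs that runs up to twice per call. Building a set once and reusing it makes repeated calls cheap; a sketch (the cache name is an assumption, not part of the original):

from nltk.corpus import brown

_TAGGED_PAIRS = None  # hypothetical module-level cache

def tagged_pairs():
    global _TAGGED_PAIRS
    if _TAGGED_PAIRS is None:
        _TAGGED_PAIRS = set(brown.tagged_words())  # built once
    return _TAGGED_PAIRS

# the membership test then becomes O(1):
# (snew, "VB") in tagged_pairs() and (s, "VBZ") in tagged_pairs()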
    def __init__(self):
        """Initialize your data structures in the constructor."""
        tag_corpus = []
        
#        from nltk.corpus import treebank
#        corpus = treebank.tagged_words()
#        for (word,tag) in treebank.tagged_words():
#            tag_corpus.append(tag)
        from nltk.corpus import brown 
        corpus = brown.tagged_words()
        for (word,tag) in brown.tagged_words():
            tag_corpus.append(tag)

        
        self.wordCounts = collections.defaultdict(int)
        self.tagCounts = collections.defaultdict(int)
        self.wordTagCounts = collections.defaultdict(int)
        self.wordTagList = {}
        self.totalTag = 0
        
        self.train(corpus)
        #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
        #estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2) 
        estimator = _estimator
        self.tagLM = NgramModel(2, tag_corpus, estimator)
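`NgramModel` shipped with NLTK 2.x and was removed in NLTK 3. If all that is needed is a smoothed bigram distribution over tags, a `ConditionalProbDist` over tag bigrams is a reasonable stand-in; a sketch mirroring the commented-out Lidstone estimator, not the original class:

from nltk import bigrams
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              LidstoneProbDist)

def bigram_tag_model(tag_corpus):
    cfd = ConditionalFreqDist(bigrams(tag_corpus))
    return ConditionalProbDist(cfd, LidstoneProbDist, 0.2)  # gamma=0.2

# P(t2 | t1) is then bigram_tag_model(tag_corpus)[t1].prob(t2)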
Example #8
def exercise1():
    # perform part-of-speech tagging
    text = nltk.word_tokenize("You are a good man, but i don't like you!")
    print(text)
    print(nltk.pos_tag(text))

    words_tag = brown.tagged_words(categories='news')
    print(words_tag[:30])
    words_tag = brown.tagged_words(categories='news', tagset='universal')
    print(words_tag[:30])
    # note: NLTK ships only the mapping to the 'universal' tagset; asking for
    # 'wsj' or 'brown' as a target needs a mapping file and may raise LookupError
    words_tag = brown.tagged_words(categories='news', tagset='wsj')
    print(words_tag[:30])
    words_tag = brown.tagged_words(categories='news', tagset='brown')
    print(words_tag[:30])

    words_tag = sinica_treebank.tagged_sents()
    print(words_tag)

    raw = "You are a good man, but i don't love you!"
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    tagged_words = default_tagger.tag(tokens)
    print(tagged_words)

    tagged_sents = brown.tagged_sents(categories='news')
    print(default_tagger.evaluate(tagged_sents))
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    
    # goes through rules outlined in handout
    if re.match ("has", s):
        toReturn =  'have'
    elif re.match (".*(ays|eys|iys|oys|uys)", s):
        toReturn = s[:-1]
    elif re.match (".*(ies)", s):
        if (len(s) == 4):
            toReturn = s[:-1]
        else:
            s1 = s[:-3]
            s2 = s1 + "y"
            toReturn = s2 
    elif re.match(".*(oes|xes|ches|shes|sses|zzes)", s):
        toReturn = s[:-2]
    elif re.match (".*(!sses|!zzes|ses|zes)", s):
        toReturn = s[:-1]
    elif re.match(".*(!ies|!oes|!ses|!xes|!ches|!shes|es)", s):
        toReturn = s[:-1]
    elif re.match(".*(!ss|!xs|!ys|!zs|!chs|!shs|s)", s):
        toReturn = s[:-1]
    else:
        toReturn = ''

    # check whether the original 3sg form or the created stem appears in the Brown corpus.
    if ((s, 'VBZ') not in brown.tagged_words()):
        if ((toReturn, 'VB') not in brown.tagged_words()):
            return ''
        else: 
            return toReturn
    else:
        return toReturn
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""
    vowel_s = "aieou"
    verb = ""
    if re.match(".*ies$", s):
        if len(s) == 4 and not s[0] in vowel_s:
            verb = s[:-1]  #working
        else:
            verb = s[:-3] + 'y'  #working
    elif re.match(".*es$", s):
        if re.match(".*(o|x|sh|ch|ss|zz)es$", s):
            verb = s[:-2]
        elif re.match(".*[^(sxyz)]es$",
                      s) and s[-4:-2] != "sh" and s[-4:-2] != "ch":
            verb = s[:-1]
        elif re.match(".*(([^s]s)|([^z]z))es$", s):
            verb = s[:-1]
    elif re.match(".*s$", s):
        if (s[-2] == 'y' and s[-3] in vowel_s):  #working
            verb = s[:-1]
        elif re.match(".*[^sxyz]s$",
                      s) and s[-4:-2] != "sh" and s[-4:-2] != "ch":  #working
            verb = s[:-1]
        elif s == "has":  # working
            verb = "have"
    else:
        return s

    if not ((s, "VBZ") in set(brown.tagged_words()) and
            (verb, "VB") in set(brown.tagged_words())):
        verb = ""
    return verb
Example #12
def tagged_token_representation():
  print nltk.tag.str2tuple("fly/NN")
  from nltk.corpus import brown
  print brown.tagged_words()
  # distribution of tags
  brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
  tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
  print tag_fd
  tag_fd.plot(cumulative=True)
  # distribution of POS+N pairs
  word_tag_pairs = nltk.bigrams(brown_news_tagged)
  print nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == "N")
Example #13
def automaticTagging():
    from nltk.corpus import brown
    print "=============== The Default Tagger   ==============="
    brown_tagged_sents = brown.tagged_sents(categories='news')
    print brown_tagged_sents[0:3]
    brown_sents = brown.sents(categories='news')
    print brown_sents[0:3]

    tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
    print nltk.FreqDist(tags).max()

    raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    print default_tagger.tag(tokens)

    print  default_tagger.evaluate(brown_tagged_sents)

    print "=============== The Regular Expression Tagger  ==============="
    patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'), (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN') ]
    regexp_tagger = nltk.RegexpTagger(patterns)
    print regexp_tagger.tag(brown_sents[3])
    print regexp_tagger.evaluate(brown_tagged_sents)

    print "=============== The Lookup Tagger ==============="
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    print most_freq_words
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    print baseline_tagger
    print baseline_tagger.evaluate(brown_tagged_sents)

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()

    display()
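A caveat for the lookup-tagger snippets here and below: in NLTK 2.x `fd.keys()` returned samples in decreasing frequency order, so `fd.keys()[:100]` meant "the 100 most frequent words". In NLTK 3 a FreqDist is a Counter subclass and `keys()` is unordered; use `most_common` instead. A sketch of the model-building step under NLTK 3:

import nltk
from nltk.corpus import brown

fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = [w for w, _ in fd.most_common(100)]  # NLTK 3: keys() is unordered
likely_tags = {word: cfd[word].max() for word in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))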
Example #14
def primary():

    print("Training on the adventure category ...")
    train_corpus = brown.tagged_words(categories='adventure',
                                      tagset='universal')

    wprobs, tprobs = training([y[0] for y in train_corpus],
                              [y[1] for y in train_corpus])
    print("Corpus Trained")

    wprobs, wprobs1, tprobs, tprobs1, tsquash, tsquash1 = seperate_training(
        wprobs, tprobs)

    test_cats = brown.categories()
    vit_matrix = np.zeros((NUM_TAGS, NUM_TAGS))
    bi_matrix = np.zeros((NUM_TAGS, NUM_TAGS))
    for test_cat in test_cats:

        print("Testing on " + test_cat + "...")
        test_corpus = brown.tagged_words(categories=test_cat,
                                         tagset='universal')
        test_words = list_to_int([y[0] for y in test_corpus], UNIQUE_WORDS)
        test_tags = list_to_int([y[1] for y in test_corpus], UNIQUE_TAGS)

        fwd_vit_acc, vit_set = fwd_viterbi(test_words, test_tags, wprobs1,
                                           tprobs1, tsquash1)
        print("Forward Viterbi resulted in accuracy: " + str(fwd_vit_acc))
        bi_dir_acc, bi_set = bi_dir_method(test_words, test_tags, wprobs,
                                           tprobs, tsquash)
        print("Bidirection Method resulted in accuracy: " + str(bi_dir_acc))

        for i in range(len(test_tags)):
            vit_matrix[test_tags[i], vit_set[i]] += 1
            bi_matrix[test_tags[i], bi_set[i]] += 1

    np.set_printoptions(suppress=True)
    print("The Confusion Matrix for Viterbi:")
    print(vit_matrix)
    print("The Confusion Matrix for Bidirectional:")
    print(bi_matrix)
    for i in range(NUM_TAGS):
        print("Accuracy of '" + UNIQUE_TAGS[i] + "' tagging on Viterbi: " +
              str(vit_matrix[i, i] / sum(vit_matrix[i])))
        print("Accuracy of '" + UNIQUE_TAGS[i] + "' tagging on Bidirection: " +
              str(bi_matrix[i, i] / sum(bi_matrix[i])))

        print("misguess of '" + UNIQUE_TAGS[i] + "' on Viterbi: " +
              str(1 - vit_matrix[i, i] / sum(vit_matrix[:, i])))
        print("misguess of '" + UNIQUE_TAGS[i] + "' on Bidirection: " +
              str(1 - bi_matrix[i, i] / sum(bi_matrix[:, i])))
Example #15
def test_ex5():
    tagged_words = brown.tagged_words(categories='news', tagset='universal')
    (emission_FD, top_NN, emission_PD, p_NN, p_DT) = ex3(tagged_words)
    tagged_sentences = brown.tagged_sents(categories='news',
                                          tagset='universal')
    (transition_FD, transition_PD, p_VBD_NN, p_DT_NN) = ex4(tagged_sentences)
    states = list(
        set(pos for (
            word,
            pos) in brown.tagged_words(categories='news', tagset='universal')))
    sentence = [tp[0] for tp in tagged_sentences[42]]
    tag_sequence = viterbi(sentence, states, emission_PD, transition_PD)
    print "Viterbi tag sequence:" + ' '.join(tag_sequence)
    print "Gold tag sequence:" + ' '.join(
        [tp[1] for tp in tagged_sentences[42]])
Example #16
def exercise3c(category):
	print
	print "For category: " +category
	brown_tag_words = bn.tagged_words(categories = category)
	tag_fd = nltk.FreqDist(t for (w,t) in brown_tag_words)
	print tag_fd.keys()[:10]
	print
Example #17
def ch05_20_brown_corpus_words_phrases_by_tag():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  # produce alpha sorted list of distinct words tagged MD
  print sorted(set([w.lower()
    for (w,t) in filter(lambda (w,t): t == "MD", tagged_words)]))
  # identify words that can be plural (NRS, NPS*, NNS*) or
  # third person singular verbs (BEDZ*, BEZ*, DOZ*, *BEZ)
  # AND the ones ending with "s"
  print set([w for (w, t) in tagged_words
    if w.lower().endswith("s") and
    (t == "NRS" or t.startswith("NPS")
    or t.startswith("NNS")
    or t.startswith("BEDZ") or t.startswith("BEZ")
    or t.startswith("DOZ") or t.endswith("BEZ"))])
  # identify 3 word prepositional phrases IN+DET+NN
  tagged_word_trigrams = nltk.trigrams(tagged_words)
  print tagged_word_trigrams[:10]
  print set([" ".join([w1, w2, w3])
    for (w1,t1), (w2,t2), (w3,t3) in tagged_word_trigrams
    if t1 == "IN" and t2 == "DET" and t3 == "NN"])
  # ratio of masculine to feminine pronouns
  num_masc_pn = len([w for (w,t) in tagged_words if w.lower() == "he"])
  num_fem_pn = len([w for (w,t) in tagged_words if w.lower() == "she"])
  print "masc/fem = ", (num_masc_pn / num_fem_pn)
Example #18
def partb():
	print
	print
	tags = [b for (a, b) in bn.tagged_words()]
	fd = nltk.FreqDist(tags)
	ft = fd.keys()
	cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in bn.tagged_words())

	for a in ft:
		if fd[a] == 1:
			print "For POS: " +a
			print cfd2[a].keys()
			print

	print
	print
Example #19
def ch05_21_qualifiers_before_adore_love_like_prefer():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  tagged_word_bigrams = nltk.bigrams(tagged_words)
  allp = set(["adore", "love", "like", "prefer"])
  print set([w for (w1,t1), (w2,t2) in tagged_word_bigrams
    if t1 == "QL" and w2.lower() in allp])
Example #20
def lookupTagger():

    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    brown_tagged_sents = brown.tagged_sents(categories='news')  # was missing; evaluate() needs it
    baseline_tagger.evaluate(brown_tagged_sents)

    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)

    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
            backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()   
Example #21
def ch05_34_num_words_with_1to10_distinct_tags():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  # number of distinct tags and number of words in corpus for this
  dd = nltk.defaultdict(set)
  for w,t in tagged_words:
    dd[w].add(t)
  for i in range(1,10):
    print i, len(filter(lambda x: len(dd[x]) == i, dd.keys()))
  # for the word with greatest number of tags, print out concordance
  # one for each tag
  maxtags = 6
  word = None
  tags = None
  for w in dd.keys():
    if len(dd[w]) >= maxtags:
      word = w
      tags = dd[w]
      break
  poss = []
  pos = 0
  for w, t in tagged_words:
    if w == word and t in tags:
      poss.append((t, pos))
      tags.remove(t)
    pos += 1
  for t, pos in poss:
    print t, " ".join(w for w,t in tagged_words[pos-10:pos+10])
def nltk_simplify_brown_tag():
    ''' Produces ~36 POS tags '''
    # sbt is simplify_brown_tag from nltk.tag.simplify (NLTK 2.x; removed in NLTK 3)
    other_pos_tagging = set(
        [sbt(tag) for (_, tag) in brown.tagged_words()[:600000]])
    print len(other_pos_tagging)
    print other_pos_tagging
    return other_pos_tagging
Example #23
def MostLikelyTag(word):
    print word
    text=brown.tagged_words()
    saved={}
    """Actually collect all the seen tags"""
    for w in text:
        if w[0].lower() == word.lower(): #check all these
            tag=w[1]
            try:
                saved[tag] = saved[tag] + 1 #aka if this tag has already been seen
            except KeyError:
                saved[tag] = 1 #if this is the first time we've seen the tag
    """Now find the one seen most often"""
    maxnum = 0
    maxtag = None
    print saved.keys()
    for t in saved.keys():
        if maxnum == 0:
            maxnum = saved[t]
            maxtag = t
        else:
            if saved[t] > maxnum:
                maxnum = saved[t]
                maxtag = t
    print 'maxtag', maxtag
    if maxtag is None:
        if word == "n't":
            return '*' #for some reason it wasn't tagging this correctly so i added it manually
        else:
            return 'UNK' #unknown words!
    else:
        return maxtag
Example #24
def pre_processing():
    global tag_count
    global tag_set
    global modified_tagged_sents
    global min_tag_count
    # count the number of occurrences of each tag
    print "__________________counting occurrences of tags_____________________"
    for (word, tag) in brown.tagged_words():
        tag_count[tag] += 1
    for key in tag_count:
        tag_set.append(key)
    # if a tag occurs at most min_tag_count times, replace it with NONE
    print "_______________changing low-frequency tags to NONE___________________"
    for sent in brown.tagged_sents():
        modified_sent = []
        for index, word in enumerate(sent):
            tag = word[1]
            if tag_count[word[1]] <= min_tag_count:
                tag = 'NONE'
            modified_sent.append([word[0], tag])
        modified_tagged_sents.append(modified_sent)
    print "___________________creating tag_set & tag_count_____________________"
    tag_count = defaultdict(int)
    tag_set = []
    for sent in modified_tagged_sents:
        for word in sent:
            tag = word[1]
            tag_count[tag] += 1
    for key in tag_count:
        tag_set.append(key)
Example #25
def ch05_11_train_test_affix_tagger():
  from nltk.corpus import brown
  fd = nltk.FreqDist(brown.words(categories="news"))
  cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
  most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
  affix_tagger = nltk.AffixTagger(model=most_freq_pos)
  print affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
Example #26
def word_count():
    from nltk.corpus import brown
    counts = nltk.defaultdict(int)
    for (word, tag) in brown.tagged_words(categories="news"):
        counts[tag] += 1
    from operator import itemgetter
    print sorted(counts.items(), key=itemgetter(1), reverse=True)
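For reference, the same tally is a one-liner with collections.Counter (assuming Python 3):

from collections import Counter
from nltk.corpus import brown

tag_counts = Counter(tag for _, tag in brown.tagged_words(categories="news"))
print(tag_counts.most_common())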
Example #27
def ch05_15_brown_corpus_trivia():
    from nltk.corpus import brown
    from operator import itemgetter  # needed for the ambiguity sort below
    tagged_words = brown.tagged_words(categories="news")
    # which nouns are more common in plural form than singular?
    # NNS - plural, NN - singular. Calculate plural = singular + s
    s_nouns = [w for (w, t) in tagged_words if t == "NN"]
    plurals = set([w + "s" for w in s_nouns])
    p_nouns = [w for (w, t) in tagged_words if t == "NNS" and w in plurals]
    s_fd = nltk.FreqDist(s_nouns)
    p_fd = nltk.FreqDist(p_nouns)
    print "words where singular > plural=", \
      filter(lambda word: s_fd[word] < p_fd[word], p_fd.keys())[:50]
    # which word has the greatest number of distinct tags
    word_tags = nltk.defaultdict(lambda: set())
    for word, token in tagged_words:
        word_tags[word].add(token)
    ambig_words = sorted([(k, len(v)) for (k, v) in word_tags.items()],
                         key=itemgetter(1),
                         reverse=True)[:50]
    print [(word, numtoks, word_tags[word]) for (word, numtoks) in ambig_words]
    # list top 20 (by frequency) tags
    token_fd = nltk.FreqDist([token for (word, token) in tagged_words])
    print "top_tokens=", token_fd.keys()[:20]
    # which tags are nouns most commonly found after
    tagged_word_bigrams = nltk.bigrams(tagged_words)
    fd_an = nltk.FreqDist([
        t1 for (w1, t1), (w2, t2) in tagged_word_bigrams if t2.startswith("NN")
    ])
    print "nouns commonly found after these tags:", fd_an.keys()
Example #30
def construct_dataset(nvocab):

    dataset = [(a.lower(), b)
               for a, b in brown.tagged_words(tagset='universal')
               if a not in '!"#$%&()*+-,./:;<=>?@[\\]^_`{|}~\t\n']

    word_list, tag_list = zip(*dataset)
    word_set = set(word_list)
    tag_set = set(tag_list)

    tag_dict = {tag: i for i, tag in enumerate(tag_set)}

    c = Counter(word_list)
    c = dict(c.most_common(nvocab))

    inv_c = [(count, word) for word, count in c.items()]
    inv_c = sorted(inv_c, reverse=True)

    _, sorted_words = zip(*inv_c)

    word_dict = {word: i for i, word in enumerate(sorted_words)}

    result = [(word_dict[word], tag_dict[tag]) for word, tag in dataset
              if word in c]
    if __debug__:
        print('Num Words : {}, Total Words : {}'.format(
            len(word_set), len(word_list)))

    return result
Example #31
def ch05_18_brown_corpus_statistics():
    from nltk.corpus import brown
    tagged_words = brown.tagged_words(categories="news")
    vocab_size = len(set([w for (w, t) in tagged_words]))
    cfd = nltk.ConditionalFreqDist(tagged_words)
    # proportion of word types always assigned the same part-of-speech
    # ie words with a single POS
    num_single_pos_words = sum(
        len(cfd[word].hapaxes()) for word in cfd.conditions())
    print "prop of word types with single POS=", \
      num_single_pos_words / vocab_size
    # how many words are ambiguous, ie with >= 2 POS tags
    ambig_words = [
        w for w in cfd.conditions()
        if len(filter(lambda x: cfd[w][x] >= 2, cfd[w].keys())) >= 2
    ]
    num_ambig_words = len(ambig_words)
    print "prop of ambiguous words (>= 2 POS)=", \
      num_ambig_words / vocab_size
    # percentage of word tokens in the brown corpus that involve
    # ambiguous words
    token_size = len(set([t for (w, t) in tagged_words]))
    unique_tokens = set()
    for w in ambig_words:
        unique_tokens.update(set([t for t in cfd[w].keys()]))
    print "prop of ambig tokens=", len(unique_tokens) / token_size
Example #32
def verb_stem(s):
    """extracts the stem from the 3sg form of a verb, or returns empty string"""

    if (re.match("\w*([^aeiousxyzh]|[^cs]h)s$", s)):
        stem = s[:-1]
    elif (re.match("(\w*)[aeiou]ys$", s)):
        stem = s[:-1]
    elif (re.match("\w+[^aeiou]ies$", s)):
        stem = s[:-3] + 'y'
    elif (re.match("[^aeiou]ies$", s)):
        stem = s[:-1]
    elif (re.match("\w*([ox]|ch|sh|ss|zz)es$", s)):
        stem = s[:-2]
    elif (re.match("\w*(([^s]se)|([^z]ze))s$", s)):
        stem = s[:-1]
    elif (re.match("has", s)):
        stem = "have"
    elif (re.match("\w*([^iosxzh]|[^cs]h)es$", s)):
        stem = s[:-1]
    else:
        stem = ""

    if (stem != "" and ok != 1):
        for (word, tag) in brown.tagged_words():
            if word == stem and tag in ('VB', 'VBZ'):
                return stem
                ok = 1
                break

    if (ok == 0):
        return ""
Example #35
    def run(self, model):
        print('Testing...')

        perplexity = 0  # accumulates log2 probabilities; averaged and exponentiated below

        genres = [
            'adventure', 'belles_lettres', 'editorial', 'fiction',
            'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery',
            'news', 'religion', 'reviews', 'romance', 'science_fiction'
        ]
        total = 0
        for i, genre in enumerate(genres):
            print(repr(i + 1) + '/' + repr(len(genres)))
            corpus = brown.tagged_words(categories=genre)
            size = int(len(corpus) * 0.90)
            corpus = corpus[size:]
            trigrams = nltk.trigrams(corpus)

            for ((word2, tag2), (word1, tag1), (word0, tag0)) in trigrams:
                total += 1
                score = model.get_score(word2, tag2, word1, tag1, word0, tag0)
                perplexity += math.log(score, 2)
        perplexity = perplexity / total
        perplexity = math.pow(2, -perplexity)
        print(perplexity)
def partOfSpeechTagging():

    from nltk.corpus import brown

    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
    print common_suffixes 


    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    classifier = nltk.DecisionTreeClassifier.train(train_set)
    nltk.classify.accuracy(classifier, test_set)

    classifier.classify(pos_features('cats'))

    print classifier.pseudocode(depth=4)
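`FreqDist.inc()` and frequency-ordered `keys()` are NLTK 2.x APIs. Under NLTK 3 a FreqDist is a collections.Counter, so the suffix-counting step would read as in this sketch:

import nltk
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1  # NLTK 3: index assignment replaces inc()
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]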
Example #37
def question2(category):
	#print
	#print "For Category: " + category
	#print "Words with the tag 'JJ':"
	#print
	words = bn.tagged_words(categories = category)
	wordlist = bn.words(categories = category)
	words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
	print len(words_JJ)
	print
	print
	print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
	print
	words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
	print words_VBP_NNPS_NNS[:10]
	print
	print
	print "Ratio"
	print
	male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
	female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
	male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
	female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
	print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
	print
	print
	sent = ""
	print "3 word prepositional phrases are:"
Example #38
def prepare_data_sentences(training_size):

    ## Take a subset
    brown_words = list(itertools.islice(brown.words(), training_size))
    brown_tags = [pair[1] for pair in brown.tagged_words(tagset='universal')]

    word_encoder = sklearn.preprocessing.LabelEncoder()
    pos_encoder = sklearn.preprocessing.LabelEncoder()
    brown_words_num = word_encoder.fit_transform(brown_words)
    brown_tags_num = pos_encoder.fit_transform(brown_tags)

    x_data_sents, y_data_sents = [], []
    x_data_sent, y_data_sent = [], []

    dot_label = word_encoder.transform(['.'])[0]
    dot_label_tags = pos_encoder.transform(['.'])[0]

    #split on sentences
    for word, tag in zip(brown_words_num, brown_tags_num):

        if word == dot_label and tag == dot_label_tags:
            if len(x_data_sent) > 0:
                x_data_sents.append(x_data_sent)
                y_data_sents.append(y_data_sent)
                x_data_sent, y_data_sent = [], []

        x_data_sent.append(word)
        y_data_sent.append(tag)

    input_dim = len(word_encoder.classes_)
    output_dim = len(pos_encoder.classes_)

    return input_dim, output_dim, x_data_sents, y_data_sents
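Splitting on "." word/tag pairs reconstructs sentence boundaries indirectly (and merges sentences that end with "!" or "?"). The corpus reader already provides the segmentation; a sketch of reading pre-split sentences directly:

from nltk.corpus import brown

# one list of (word, tag) pairs per sentence, no '.'-based re-splitting needed
for sent in brown.tagged_sents(tagset='universal')[:3]:
    words, tags = zip(*sent)
    print(words[:5], tags[:5])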
Example #39
class data:
    content_words = ["main", "content", "body"]
    tags = [
        "<!DOCTYPE>", "<a>", "<abbr>", "<acronym>", "<address>",
        "<applet>", "<area>", "<article>", "<aside>", "<audio>", "<b>",
        "<base>", "<basefont>", "<bdi>", "<bdo>", "<big>", "<blockquote>",
        "<body>", "<br>", "<button>", "<canvas>", "<caption>", "<center>",
        "<cite>", "<code>", "<col>", "<colgroup>", "<datalist>", "<dd>",
        "<del>", "<details>", "<dfn>", "<dialog>", "<dir>", "<div>",
        "<dl>", "<dt>", "<em>", "<embed>", "<fieldset>", "<figcaption>",
        "<figure>", "<font>", "<footer>", "<form>", "<frame>",
        "<frameset>", "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
        "<head>", "<header>", "<hr>", "<html>", "<i>", "<iframe>", "<img>",
        "<input>", "<ins>", "<kbd>", "<keygen>", "<label>", "<legend>",
        "<li>", "<link>", "<main>", "<map>", "<mark>", "<menu>",
        "<menuitem>", "<meta>", "<meter>", "<nav>", "<noframes>",
        "<noscript>", "<object>", "<ol>", "<optgroup>", "<option>",
        "<output>", "<p>", "<param>", "<picture>", "<pre>", "<progress>",
        "<q>", "<rp>", "<rt>", "<ruby>", "<s>", "<samp>", "<script>",
        "<section>", "<select>", "<small>", "<source>", "<span>",
        "<strike>", "<strong>", "<style>", "<sub>", "<summary>", "<sup>",
        "<table>", "<tbody>", "<td>", "<textarea>", "<tfoot>", "<th>",
        "<thead>", "<time>", "<title>", "<tr>", "<track>", "<tt>", "<u>",
        "<ul>", "<var>", "<video>", "<wbr>"
    ]
    brown_words = dict(brown.tagged_words())
Example #40
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
def corpusBigrams():
    from nltk.corpus import brown
    corpus = brown.tagged_words()
    bigrams = []
    corpLen = len(corpus)
    for i in range(1, len(corpus)):
        tempStr = str(i) + "/" + str(corpLen)
        print tempStr
        tally = 1
        newBigram = [corpus[i - 1], corpus[i], tally]
        bigrams.append(newBigram)
    print("sorting")
    sortedBigrams = sorted(bigrams, key=itemgetter(0, 1))
    toReturn = []
    bigramToMatch = sortedBigrams[0]
    tally = 1
    listLen = len(sortedBigrams)
    print len(sortedBigrams)
    for i in range(1, len(sortedBigrams)):
        tempStr = str(i) + "/" + str(listLen)
        print(tempStr)
        if sortedBigrams[i] == bigramToMatch:
            tally += 1
        else:
            temp = bigramToMatch
            temp[2] = tally
            toReturn += [temp]
            bigramToMatch = sortedBigrams[i]
            tally = 1
    # flush the final group, which the loop above never emits
    bigramToMatch[2] = tally
    toReturn += [bigramToMatch]
    file1 = open('bigramGrammar.txt', 'w')
    for b in toReturn:
        file1.write(str(b) + "\n")
    file1.close()
    return bigrams
Example #42
    def rate(self):
        wor_tag = brown.tagged_words(fileids=['ca01'])
        sentence = []
        anse = []
        pres = 0
        presb = 0
        for word in wor_tag:
            if (word[0] == "."):
                ans = self.hmm(sentence)
                #print(ans)
                for i in range(len(ans)):
                    parts = anse[i].split("+")
                    parts = anse[i].split("-")  # note: overwrites the '+' split above, as in the original
                    #print(anse[i])
                    #print(ans[i])
                    if ans[i] in parts:
                        pres = pres + 1
                        presb += 1
                    else:
                        pres += 1  # the most important place to change
                sentence = []
                anse = []
                print(presb / pres)
                pres = 0
                presb = 0
                break
            else:
                #print("hello")
                sentence.append(word[0])
                anse.append(word[1])
Example #43
def exercise3():
    cfd = nltk.ConditionalFreqDist(
        (word.lower(), tag)
        for genre in brown.categories()
        for (word, tag) in brown.tagged_words(categories=genre)
    )
    result = {'part1': {}, 'part2': {}}
    print("Part 1")
    for word in sorted(cfd.conditions()):
        tags = set(cfd[word])
        if len(tags) == 5:
            if word not in result['part1']:
                result['part1'][word] = tags

    print("Number of words which has exactly 5 possible tags : ", len(result['part1']))
    print("words which has exactly 5 possible tags : ", result['part1'])

    print("Part 2")

    possible_tags = ['CS', 'WPS', 'DT', 'QL', 'NIL']
    distinct_word = 'that'

    print(" the distinct word is : ", distinct_word)
    for sentence in brown.tagged_sents():
        for pair in sentence:  # renamed from 'tuple' to avoid shadowing the builtin
            if pair[0] == distinct_word and len(possible_tags) > 0:
                if pair[1] == possible_tags[0]:
                    print("Sentence : ", " ".join([w for (w, t) in sentence]))
                    if len(possible_tags) > 0:
                        possible_tags.remove(possible_tags[0])
Example #46
def analysis_using_word_and_prev_pos():
    from nltk.corpus import brown

    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1
    print pos[("DET", "right")]
Example #47
def ch05_33_list_pos_of_word_given_word_and_pos():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  tagged_word_bigrams = nltk.bigrams(tagged_words)
  dd = nltk.defaultdict(dict)
  for (w1,t1), (w2,t2) in tagged_word_bigrams:
    dd[w1][t1] = t2
  print dd
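As written, `dd[w1][t1] = t2` keeps only the last tag observed after each (word, tag) pair, which defeats the "list POS" intent of the function name. Accumulating into sets is a small change; a sketch, assuming Python 3:

import nltk
from collections import defaultdict
from nltk.corpus import brown

tagged_words = brown.tagged_words(categories="news")
dd = defaultdict(lambda: defaultdict(set))
for (w1, t1), (w2, t2) in nltk.bigrams(tagged_words):
    dd[w1][t1].add(t2)  # every tag seen after (w1, t1), not just the last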
Example #48
def retag_brown_words(tag_map):
    wordpos_fd = nltk.FreqDist()
    for word, tag in brown.tagged_words():
        if tag_map.has_key(tag):
            normed_pos = tag_map[tag]
            retagged_word = DELIM.join([word.lower(), normed_pos])
            wordpos_fd.inc(retagged_word)  
    return wordpos_fd
Example #49
def main():
    brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True)
    tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
    print tag_fd.keys()
    print

    print tag_fd
    print
Example #50
    def setup(self):
        for word, tag in brown.tagged_words(categories='news'):
            if tag == 'NN':
                self.nouns.append(word)
            elif tag == 'VB':
                self.verbs.append(word)
            elif tag == 'JJ':
                self.adjs.append(word)
def lookup_tagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    #return nltk.UnigramTagger(model=likely_tags)
    return nltk.UnigramTagger(model=likely_tags,
                              backoff=nltk.DefaultTagger('NN'))
def complexDict():

    pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    for ((w1, t1), (w2, t2)) in nltk.ibigrams(brown_news_tagged):
        pos[(t1, w2)][t2] += 1

    pos[("DET", "right")]
Example #53
def ch05_35_must_contexts():
  from nltk.corpus import brown
  tagged_words = brown.tagged_words(categories="news")
  tagged_word_bigrams = nltk.bigrams(tagged_words)
  fd = nltk.FreqDist((w1,t2) for (w1,t1),(w2,t2)
    in tagged_word_bigrams
    if w1 == "must")
  for t in fd.keys():
    print t, fd[t]
Example #54
def importingBrownCorpusFromNLTK(outF):
    "importing tagged brown corpus from NLTK and writing on a file OutF"
    outF = open(outF,'w')
    from nltk.corpus import brown
    brown_news_tagged = brown.tagged_words(categories='news',simplify_tags=True)
    print 'size', len(brown_news_tagged)
    for i in brown_news_tagged:
        outF.write(i[0]+'\t'+i[1]+'\n')
    outF.close()
Example #55
def find_highly_ambiguous_words():
    from nltk.corpus import brown

    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    cfd = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
    for word in cfd.conditions():
        if len(cfd[word]) > 3:
            tags = cfd[word].keys()
            print word, ":", " ".join(tags)
Example #56
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBayes %f' % classify.accuracy(classifier, test_set)