Example #1
def tagged_words(self, fileids=None, categories=None):
    return ConllCorpusReader.tagged_words(
        self, self._resolve(fileids, categories))
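This method normally lives in a reader class that mixes CategorizedCorpusReader into ConllCorpusReader, where self._resolve turns a categories argument into the matching file ids. A minimal sketch of such a class, with a hypothetical class name and category pattern that do not appear in the original example:

from nltk.corpus.reader import CategorizedCorpusReader, ConllCorpusReader


class CategorizedConllReader(CategorizedCorpusReader, ConllCorpusReader):
    """Hypothetical categorized CoNLL reader; categories come from cat_pattern."""

    def __init__(self, *args, **kwargs):
        # CategorizedCorpusReader pops its own keyword arguments
        # (cat_pattern, cat_map, cat_file) before ConllCorpusReader sees them.
        CategorizedCorpusReader.__init__(self, kwargs)
        ConllCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None):
        # _resolve maps a categories argument to the matching fileids.
        return ConllCorpusReader.tagged_words(
            self, self._resolve(fileids, categories))

With a reader like this, reader.tagged_words(categories='news') reads only the files that the category pattern maps to 'news'.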
Example #2
from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # load the training corpus (word, POS) from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # getting number of sentences

tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())   # frequency of each POS tag

firsttagfdist = FreqDist(sent[0][1] for sent in conllreader.tagged_sents())  # frequency of sentence-initial tags
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# build (tag_i, tag_i+1) pairs over the whole corpus for transition counts
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))

TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist[k[0]] for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())  # frequency of (word, tag) pairs for emission counts
Biw = DictionaryProbDist({k: x / tagfdist[k[1]] for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
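Taken together, these counts estimate the three parameter sets of an HMM tagger over the 12 universal POS tags: A0j* are initial-tag probabilities, Aij* are tag-to-tag transition probabilities, and Biw* are word-given-tag emission probabilities. The DictionaryProbDist variants divide each count by the appropriate marginal to give conditional probabilities, while the Laplace, Good-Turing, and MLE variants are smoothed distributions built directly over the raw counts. As a rough illustration of how the resulting distributions can be queried (the tag and word values below are placeholders, not taken from the corpus):

# probability of NOUN as a sentence-initial tag, relative-frequency vs. Laplace-smoothed counts
print(A0j.prob('NOUN'), A0jLap.prob('NOUN'))

# P(next tag = VERB | current tag = NOUN) from the conditional transition estimate
print(Aij.prob(('NOUN', 'VERB')))

# P(word | tag) from the conditional emission estimate; the Good-Turing
# distribution over (word, tag) counts also reserves mass for unseen pairs
print(Biw.prob(('Haus', 'NOUN')), BiwGT.prob(('Haus', 'NOUN')))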
Example #3
def tagged_words(self, fileids=None, categories=None):
    return ConllCorpusReader.tagged_words(self, self._resolve(fileids, categories))
Example #4
from nltk.corpus.reader import ConllCorpusReader

a = {}  # maps each noun to the list of adjectives found next to it


## Function to add an adjective to a noun key
def add_adj(noun_param, adj_param):
    if noun_param in a:
        a[noun_param].append(adj_param)
    else:
        a[noun_param] = [adj_param]


filedir = '/Users/fnascime/Documents/Sicily_Project/texts/'
filename = 'ilgattopardo_prima'

mycorpus = ConllCorpusReader(filedir, filename + '.conll',
                             ('ignore', 'words', 'ignore', 'pos', 'ignore',
                              'ignore', 'ignore', 'ignore'))

words = mycorpus.tagged_words()
list_len = len(words)

## Loop through the file and retrieve adjectives directly associated with nouns (adjunct words)
for i in range(list_len):

    if (words[i][1] == 'S'):
        if ((i > 0) and (words[i - 1][1] == 'A')):
            add_adj(words[i][0], words[i - 1][0])
        elif ((i < list_len - 1) and (words[i + 1][1] == 'A')):
            add_adj(words[i][0], words[i + 1][0])

## Loop through the collected nouns and find the ones with the most adjectives

nouns_counting = len(a)
adj_counting = 0
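The excerpt stops just as the counting begins. One plausible way to finish it, assuming the goal is to report the nouns carrying the most adjectives (every name below beyond a, nouns_counting, and adj_counting is mine, not from the original):

# find the largest number of adjectives attached to any single noun
for noun, adjectives in a.items():
    if len(adjectives) > adj_counting:
        adj_counting = len(adjectives)

# collect the noun(s) that reach that maximum
top_nouns = [noun for noun, adjectives in a.items()
             if len(adjectives) == adj_counting]

print("Nouns found:", nouns_counting)
print("Most adjectives for a single noun:", adj_counting)
print("Noun(s) with the most adjectives:", top_nouns)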