Example #1
0
	def mostprobableparse(self, sent, sample=None):
		"""warning: this problem is NP-complete. using an unsorted
		chart parser avoids unnecessary sorting (since we need all
		derivations anyway).
		
		@param sent: a sequence of terminals
		@param sample: None or int; if int then sample that many parses"""
		p = FreqDist()
		for a in self.parser.nbest_parse(sent, sample):
			p.inc(removeids(a).freeze(), a.prob())
		if p.max():
			return ProbabilisticTree(p.max().node, p.max(), prob=p[p.max()])
		else: raise ValueError("no parse")
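The core trick above is using FreqDist as a probability accumulator: each derivation's probability is added to its (id-stripped) parse tree, and max() then yields the most probable parse. A minimal standalone sketch of that step, assuming NLTK 3 (where FreqDist is a Counter subclass) and a hypothetical list of (tree, probability) pairs in place of the parser output:

from nltk import FreqDist

derivations = [("tree_a", 0.2), ("tree_a", 0.3), ("tree_b", 0.4)]  # hypothetical parser output
scores = FreqDist()
for tree, prob in derivations:
    scores[tree] += prob          # FreqDist happily accumulates float "counts"
best = scores.max()               # parse with the highest summed derivation probability
print(best, scores[best])         # tree_a 0.5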
Example #2
0
def plot_freq(productions):
    prod_fd = FreqDist(productions)
    prod_to_dist = [prod_fd[key] for key in prod_fd]
    dist_fd = FreqDist(prod_to_dist)
    X_vec = list(range(prod_fd[prod_fd.max()]))[1:]
    Y_vec = [dist_fd[x] for x in X_vec]
    py.plot(X_vec, Y_vec)
Example #3
0
def max_dist(emoList):
    x = {}
    for e in emoList:
        fd = FreqDist(emoList[e])
        m = fd.max()
        x[m] = fd.freq(m)
    return max(x, key=lambda k: x[k])
Example #4
0
	def select(self, key):
		'''
			select(key) mimics the db.fetch method from crusher.py.
			The name select comes from the SQL syntax for processing select queries.
		'''
		selection = []
		checksumList = []
		
		# Get data for the supplied key
		for i in range(1, VERSIONS):
			keyToSelect = keyForDb(key[0], i, key[1])
			try:
				selection.append(self.db.fetch(keyToSelect))
			except KeyError:
				selection.append("DOES_NOT_EXIST")

		# Get checksum for the supplied key
		for i in range(1, VERSIONS):
			if(key[1][0] == "o"):
				keyForChecksum = keyForDb(key[0], i, "om")
			if(key[1][0] == "c"):
				keyForChecksum = keyForDb(key[0], i, "cm")
			if(key[1][0] == "t"):
				keyForChecksum = keyForDb(key[0], i, "tm")
			try:
				checksumList.append(self.db.fetch(keyForChecksum))
			except KeyError:
				checksumList.append("CHECKSUM_DOES_NOT_EXIST")
			except UnboundLocalError:
				pass

		# Voting using NLTK's FreqDist module.
		freqSelection = FreqDist(selection)
		mostCommonFromSelection = freqSelection.max()

		freqChecksum = FreqDist(checksumList)
		mostCommonFromChecksum = freqChecksum.max()

		# Compare checksum 
		if(self.__CompareChecksumWithSelection__(mostCommonFromSelection, mostCommonFromChecksum) == True):
			return mostCommonFromSelection
		else:
			# Raise a checksum error if it does not match.
			# TO_DO: Again try more selections and checksum comparison
			return mostCommonFromSelection
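The voting step above reduces to a frequency count over the values read from each replica; a minimal sketch with hypothetical replica reads (not from crusher.py):

from nltk import FreqDist

reads = ["v2", "v2", "DOES_NOT_EXIST", "v2", "v1"]  # hypothetical values fetched from the replicas
votes = FreqDist(reads)
winner = votes.max()            # value returned by the plurality of replicas
print(winner, votes[winner])    # v2 3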
Example #5
0
    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        fd = FreqDist()

        for synset in wordnet.synsets(word):
            fd[synset.pos()] += 1

        if fd:
            return self.wordnet_tag_map.get(fd.max())
        else:
            return None
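The tagger above picks the WordNet part of speech that covers the most synsets of the word. A standalone sketch of the same heuristic, assuming the WordNet corpus is installed; WORDNET_TAG_MAP is a hypothetical stand-in for self.wordnet_tag_map:

from nltk import FreqDist
from nltk.corpus import wordnet

WORDNET_TAG_MAP = {'n': 'NN', 'v': 'VB', 'a': 'JJ', 's': 'JJ', 'r': 'RB'}

def most_common_wordnet_pos(word):
    # count the WordNet POS of every synset that contains the word
    fd = FreqDist(synset.pos() for synset in wordnet.synsets(word))
    return WORDNET_TAG_MAP.get(fd.max()) if fd else None

print(most_common_wordnet_pos('book'))  # typically 'NN', since most synsets of 'book' are nouns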
Example #6
0
    def handle(self, *args, **options):
    	fdist = FreqDist()
    	print "Analyzing raw data"
    	limit = 10
    	if args:
    		raw_datas = RawData.objects.filter(pk__in=args)
    	else:
    		raw_datas = RawData.objects.all()[:limit]
    	tagged_data = []
    	for raw_data in raw_datas:
    		words = nltk.word_tokenize(raw_data.data)
    		tagged_data.extend(nltk.pos_tag(words))
    		for word in words:
    			word = word.strip()
    			if word:
	    			fdist.inc(word)

    	print "Analyzed %s items" % len(raw_datas)
    	print

    	print "Top word: %s" % fdist.max()
    	print 

    	print "Top 10 words"
    	for word in fdist.keys()[:10]:
    		times = fdist[word]
    		print " -- %s occurred %s times" % (word, times)
    	print

    	
    	print "Bottom 10 words"
    	for word in fdist.keys()[-10:]:
    		times = fdist[word]
    		print " -- %s occurred %s times" % (word, times)
    	print

    	print "Words occurring between 50-100 times"
    	words = [ word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100 ]
    	print ", ".join(words)


    	cfdist = ConditionalFreqDist()
    	for (word, tag) in tagged_data:
    		cfdist[tag].inc(word)
    	
    	print "Most popular noun: %s" % cfdist["NN"].max()
    	print 

    	print "Top 50 nouns"
    	for word in cfdist["NN"].keys()[:50]:
    		times = cfdist["NN"][word]
    		print " -- %s occurred %s times" % (word, times)
    	print
Example #7
0
def task2a(data):
    tags = []
    for key in data.keys():
        for sentence in data[key]:
            for _, tag in sentence:
                tags.append(tag)
    fd = FreqDist(tags)
    most_frequent_tag = fd.max()
    print("Most frequent tag: {}".format(most_frequent_tag))
    default_tagger = DefaultTagger(most_frequent_tag)
    test_tagger(default_tagger, data)
    return default_tagger
Example #8
0
def mm(path):
    rules = {}
    current_sents = get_sents(path)
    cpt = 0
    while (len(current_sents) != 0):
        grams = extract_ngrams(current_sents)  # get the n-grams
        fd = FreqDist(grams)
        r = fd.max()  # r is the most frequent gram
        rules["NT" + str(cpt)] = r
        current_sents = rplc(current_sents, r, "NT" + str(cpt))  # replace the gram with its rule name
        cpt += 1
    return rules
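One iteration of the loop above in isolation, using nltk.util.ngrams as a stand-in for the custom extract_ngrams helper:

from nltk import FreqDist
from nltk.util import ngrams

sent = "DT NN VB DT NN".split()   # hypothetical tag sequence
fd = FreqDist(ngrams(sent, 2))
print(fd.max())                   # ('DT', 'NN') occurs twice, so it would become rule NT0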
Example #9
0
def run (path):
    rules = {}
    current_sents = func.get_tagged_sents(path)
    current_sents =func.compress_tags(current_sents)
    print("there's " + str(len(current_sents)) + " sentence(s)")
    text=""
    for s in current_sents : 
        if(" "in s):
            text = text+s+"\n"
    cpt=0
    while True:
        current_sents = text.split("\n")
        grams = extract_ngrams(current_sents)  # get the n-grams
        fd = FreqDist(grams)
        max_freq_gram = fd.max()  # the most frequent gram
        if(fd[max_freq_gram]==1):
            break
        r = ""  # the string form of the gram to replace
        for g in max_freq_gram:
            r = r + g + " "
        # e.g. ("at", "nn") => r = "at nn "
        r = r.strip()
        print(f"{cpt} => {r}")
        rules["NT"+str(cpt)]=r
        text=text.replace(" " + r + " "," NT"+str(cpt)+" ")  # replace the gram with its rule name
        text=text.replace(" "+r+"\n"," NT"+str(cpt)+"\n")
        text=text.replace("\n"+r+" ","\nNT"+str(cpt)+" ")
        text=text.replace("\n"+r+"\n","\nNT"+str(cpt)+"\n")
        cpt+=1
    # at this point every remaining gram occurs only once
    # up to here it is correct; the rest is not working at all
    sentences = text.split("\n")
    sentences = sort_len(sentences)
    for i in range(0,len(sentences)):
        r=sentences[i]
        b = False # the rule is not a sub_rule
        if(text.count(r+" ")+text.count(r+"\n")>1):b=True # it is a sub rule
        if(len(sentences[i].split())>1):
            rules["NT"+str(cpt)]=r
            text=text.replace(" "+r+" "," NT"+str(cpt)+" ")  # replace the gram with its rule name
            text=text.replace(" "+r+"\n"," NT"+str(cpt)+"\n")
            text=text.replace("\n"+r+" ","\nNT"+str(cpt)+" ")
            text=text.replace("\n"+r+"\n","\nNT"+str(cpt)+"\n")
            if(b==False):rule_base.append("NT"+str(cpt))
            cpt=cpt+1
            sentences = text.split("\n")
            sentences = sort_len(sentences)
        else:
            if(b==False):rule_base.append(r)
    return rules
Example #10
0
def main():
	""" a basic REPL for testing """
	corpus = """(S (NP John) (VP (V likes) (NP Mary)))
(S (NP Peter) (VP (V hates) (NP Susan)))
(S (NP Harry) (VP (V eats) (NP pizza)))
(S (NP Hermione) (VP (V eats)))""".splitlines()
	corpus = """(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))""".splitlines()
	#corpus = """(S (NP mary) (VP walks) (AP quickly))""".splitlines()
	#(S (NP Harry) (VP (V likes) (NP Susan) (ADVP (RB very) (RB much))))
	corpus = [Tree(a) for a in corpus]
	#d = GoodmanDOP(corpus, rootsymbol='S')
	from bitpar import BitParChartParser
	d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP',
						parser=BitParChartParser)
	#d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP')
	#print d.grammar
	print "corpus"
	for a in corpus: print a
	w = "foo!"
	while w:
		print "sentence:",
		w = raw_input().split()
		try:
			p = FreqDist()
			for n, a in enumerate(d.parser.nbest_parse(w)):
				if n > 1000: break
				print a
				p.inc(ImmutableTree.convert(removeids(a)), a.prob())
			#for b, a in sorted((b,a) for (a,b) in p.items()):
			#	print a, b
			print
			print 'best', p.max(), p[p.max()]
			#print d.parse(w)
		except Exception: # as e:
			print "error", #e
Example #11
0
def extract_doc_feats(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)

    occurences = defaultdict(lambda: 0)
    for doc in refactorized_documents:
        for x in set(doc): occurences[x] += 1

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()
    glob_features = [{}]*doc_num


    for i in range(0, doc_num):
        doc_features = [0]*len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])
        doc_len = len(refactorized_documents[i])

        for (tok,num) in doc_freqs.items():
            max_doc_freq = doc_freqs.freq(doc_freqs.max())*float(doc_len)

            # augmented
            #tf = 0.5 + (0.5*float(num)) / float(max_doc_freq)
            tf = 1+math.log(num,10)
            idf = math.log( float(doc_num) / (float(occurences[tok])) ,10)
            tfidf = tf*idf

            indx = tokens.index(tok)
            doc_features[indx] = tfidf

        f_tmp = numpy.asarray(doc_features)
        f_tmp = f_tmp/(numpy.linalg.norm(f_tmp)+numpy.finfo(float).eps)
        glob_features[i] = f_tmp.tolist()

    glob_features = numpy.asarray(glob_features)*glob_freqs.N()
    print "Glob Freqs:", glob_freqs.N()

    return (glob_features,tokens)
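The weighting above is log-scaled tf-idf: tf = 1 + log10(term count) and idf = log10(number of documents / documents containing the term). A toy sketch of the same formula (hypothetical docs list, division kept float-safe):

from nltk import FreqDist
import math

docs = [["cat", "sat", "cat"], ["dog", "sat"], ["cat", "dog", "dog"]]   # hypothetical corpus
doc_freq = FreqDist(tok for doc in docs for tok in set(doc))            # documents containing each token

def tfidf(token, doc):
    tf = 1 + math.log10(FreqDist(doc)[token])                 # log-scaled term frequency
    idf = math.log10(float(len(docs)) / doc_freq[token])      # inverse document frequency
    return tf * idf

print(tfidf("cat", docs[0]))   # (1 + log10(2)) * log10(3/2)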
Example #12
0
    def textChanged_inputTextEdit(self):
        inputText = self.inputTextEdit.toPlainText().strip()
        inputText = ''.join(c for c in inputText
                            if not ud.category(c).startswith('P')
                            )  # Delete all punctuation (Arabic included)

        inputTokens = functions.tokenization(inputText)
        freqDist = FreqDist(inputTokens)

        self.numWordEdit.setText(str(freqDist.N()))
        self.mostFreqWordEdit.setText(freqDist.max())

        numSentences = len(
            functions.tok_stem(self.inputTextEdit.toPlainText(), False))
        self.numSentenceEdit.setText(str(numSentences))

        self.inStatsGroup.setEnabled(
            True if self.inputTextEdit.toPlainText().strip() else False)
        self.searchWordGroup.setEnabled(
            True if self.inputTextEdit.toPlainText().strip() else False)
        self.startPosTagButton.setEnabled(
            True if self.inputTextEdit.toPlainText().strip() else False)
Example #13
0
word_len = [len(w) for w in text1]
print word_len





# Example	Description
# fdist = FreqDist(samples)	create a frequency distribution containing the given samples
# fdist[sample] += 1	increment the count for this sample
# fdist['monstrous']	count of the number of times a given sample occurred
# fdist.freq('monstrous')	frequency of a given sample
# fdist.N()	total number of samples
# fdist.most_common(n)	the n most common samples and their frequencies
# for sample in fdist:	iterate over the samples
# fdist.max()	sample with the greatest count
# fdist.tabulate()	tabulate the frequency distribution
# fdist.plot()	graphical plot of the frequency distribution
# fdist.plot(cumulative=True)	cumulative plot of the frequency distribution
# fdist1 |= fdist2	update fdist1 with counts from fdist2
# fdist1 < fdist2	test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()
print fdlist[2]
print fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
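The last two rows of the table above (fdist1 |= fdist2 and fdist1 < fdist2) are not exercised by the code, so here is a small sketch with toy word lists, assuming NLTK 3 where FreqDist is a Counter subclass:

from nltk import FreqDist

fdist1 = FreqDist('the cat sat on the mat'.split())
fdist2 = FreqDist('the dog sat'.split())
fdist1 |= fdist2           # union update: keep the larger count for each sample
print(fdist1['the'])       # 2 -- the max of 2 and 1, not their sum
print(fdist2 < fdist1)     # True: every count in fdist2 is <= its count in fdist1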
Example #14
0
	sentence_dic_nights[file] = len(corpus_nightsII.sents(file))
	sentence_dic_nights = collections.OrderedDict(sentence_dic_nights) # we make sure that the order of the data stays the same 

# Which night has the most sentences?
for file, characters in sentence_list_nights:
	if characters == max(sentence_dic_nights.values()):
		print(file, characters)	# the Eight Hundred and Forty-fifth.txt => 399

# In the following block of code, we calculate what the average word length is in each night

dict_word_length = {}
for file in corpus_nightsII.fileids():
	text = corpus_nightsII.words(file)
	x = [len(words) for words in text]
	fdist = FreqDist(x)
	dict_word_length[file] = fdist.max()
print(dict_word_length)

# We now calculate the readability for each file. We do this by using the Automated Readability Index (ARI).

stat_list = []
x = word_dic_nights.keys()
for name in x:
	n_char = char_dict_night[name]
	n_words = word_dic_nights[name]
	n_sents = sentence_dic_nights[name]
	stat_list.append((name, n_char, n_words, n_sents))
print(stat_list)

def ARI(n_char, n_words, n_sents):
	x = n_char/n_words
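For reference, the standard Automated Readability Index formula that the truncated function above presumably computes (a sketch, not the original author's code):

def ARI(n_char, n_words, n_sents):
	# ARI = 4.71 * (characters per word) + 0.5 * (words per sentence) - 21.43
	return 4.71 * (n_char / n_words) + 0.5 * (n_words / n_sents) - 21.43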
Example #15
0
tweet_tokenizer = TweetTokenizer()
with open('C:/users/onlyone/desktop/prefeito/prefeito.txt',
          mode='r',
          encoding='UTF-8') as dados_tratados:

    mining1 = str(dados_tratados.readlines())
    mining2 = re.sub(caracters, u'', mining1)
    mining3 = tweet_tokenizer.tokenize(str(mining2))
    mining4 = FreqDist(mining3)

dados_tratados.close()

#Print the frequent patterns
print(mining4)
print(mining4)
print(mining4.max())
print(mining4.most_common())

#Plot chart 1
mining4.plot(60,
             cumulative=False,
             title="Gráfico de Padrões Frequêntes - Prefeito de Salvador")

#Plot the word cloud

from PIL import Image
from nltk import FreqDist
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
Example #16
0
def multi_sentence(context_sentences, ambiguous_word):
    fdist = FreqDist()
    for sentence in context_sentences:
        fdist.inc(lesk(sentence, ambiguous_word))
    return fdist.max()
Example #17
0
def word_frequencies(data):
    f = FreqDist(data)
    return f.max()
Example #18
0
title = nostop_title_dsc + nostop_title_kd
nltk.Text(title).collocations()
fdist_title = FreqDist(title)
fdist_title.most_common(50)
fdist_title.plot(50, cumulative=True)
fdist_title.plot(50)
total_words = len(set(title))
print("The total number of words in title of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word appears in title of dsc is: " + str(int(avg_words)))

text = nostop_text_dsc + nostop_text_kd
nltk.Text(text).collocations()
fdist_text = FreqDist(text)
fdist_text.most_common(50)
fdist_text.max()
fdist_text.plot(50, cumulative=True)
fdist_text.plot(50)
total_textwords = len(set(text))
print("The total number of words in text is: " + str(total_textwords))
avg_text = fdist_text.N() / total_textwords
print("Each word appears in text " + str(int(avg_text)) + " times")

# bigrams and trigrams
word_pair_text = list(bigrams(text))
word_triple_text = list(trigrams(text))
bigrams_text = FreqDist(word_pair_text)
trigrams_text = FreqDist(word_triple_text)
bigrams_text.most_common(50)
bigrams_text.plot(50)
bigrams_text.plot(50, cumulative=True)
Example #19
0

### What is the most frequent tag?
### Which word has the most number of distinct tags?
fd = FreqDist()
cfd = ConditionalFreqDist()

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# Find the most frequent tag
fd.max()

# Initialize a list to hold (numtags,word) tuple
wordbins = []

# Append each tuple (number of unique tags for token, token) to list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))

# sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)
print wordbins[0] # token with max. no. of tags is ...


### What is the ratio of masculine to feminine pronouns?
male = ['he','his','him','himself']  # masculine pronouns
# words_set = set(book_of_genesis)

# ignore capitalization and duplicates
# words_set = set(word.lower() for word in book_of_genesis)

# ignore capitalization, duplicates and non-alphabetic items (numbers and punctuation characters)
words_set = set(word.lower() for word in book_of_genesis if word.isalpha())

# number of words
len(words_set)

# get words longer than 10
minimum_characters = 10
long_words = [word for word in words_set if len(word) > minimum_characters]
sorted(long_words)  # sorted alphabetically (capital letters first)

# get words longer than 7 that occur more than 7 times
minimum_characters = 7
minimum_frequency = 7
fdist = FreqDist(book_of_genesis)
frequent_long_words = [word for word in words_set
                       if len(word) > minimum_characters and fdist[word] > minimum_frequency]
sorted(frequent_long_words)  # sorted alphabetically (capital letters first)

# frequency of words based on their length
words_length = [len(word) for word in book_of_genesis]
fdist = FreqDist(words_length)
fdist.most_common()
fdist.max()  # most frequent word length
fdist.freq(3)  # frequency of words whose length is 3
Example #21
0
File: dsc.py Project: dmml/NLTK
    return [w for w in word if w not in stopwords.words('english') and w != '']


# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title)  # Frequency distribution of text
fdist_title.most_common(50)  # most common 50
fdist_title['science']  # return count of a given word
fdist_title.max()  # max counts
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
fdist_title.tabulate(50)  # tabulate
total_words = len(set(nostop_title))
print("The total number of words in title of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word appears in title of dsc is: " + str(int(avg_words)))

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
Example #22
0
class Document(object):
    def __init__(self, doc_id):
        #rename metadata something more general?
        self.metadata = { "doc_title": None, "author_lastname": None, "author_first_middle": None, "year_written": None, "year_published": None,
                "pub_title": None, "pub_type": None, "Type-Token Ratio": None, "Hapax Dislegomena": None, "Honore's R": None, "Yule's K": None, "tokenized_doc": []}
        self.doc_id = doc_id
        self.fdist = None
        self.frequencies = []
        self.metadata_getter()
        self.tokenized_doc_getter() 
        self.thrk_getter()
        self.frequency_dist_getter()
        #method?
        #self.timestamp()
    
    def timestamp(self):
        ts = time.time()
        return datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%H%M%S_')
        
    def metadata_getter(self):
        # move to object?
        cursor = db.cursor()
        c = cursor.execute('SELECT author_lastname, author_first_middle, doc_title, original_publication_title, original_publication_type, year_written, year_published FROM metadata WHERE doc_id = (?)', (self.doc_id,))
        for row in c:
            self.metadata["author_lastname"] = row[0]
            self.metadata["author_first_middle"] = row[1]
            self.metadata["doc_title"] = row[2]
            self.metadata["pub_title"] = row[3]
            self.metadata["pub_type"] = row[4]
            self.metadata["year_written"] = row[5]
            self.metadata["year_published"] = row[6]
        #print "Metadata Found for Doc ", (self.doc_id)
        
    def tokenized_doc_getter(self):
        #assumes we're connected to db
        doc_name = 'document_' + str(self.doc_id) 
        cursor = db.execute('SELECT * FROM {}'.format(doc_name,))
        text = []
        for i in cursor:
            text.append(str(i[0]))
            self.metadata["tokenized_doc"] = text
        #print "Tokenized Document ", (self.doc_id)
    
    def type_token_ratio(self):
        self.metadata["Type-Token Ratio"] = float(self.V / self.N)
        
    def hap_dis_ratio(self):
        self.metadata["Hapax Dislegomena"] = float(self.hapaxes[2] / self.V)
        #assignments can go in methods
    
    def honore_r(self):
        if self.hapaxes[1] != 0:
            self.metadata["Honore's R"] = float((100*math.log(self.N, 10)) / (1 - (self.hapaxes[1] / self.V)))
        else:
            self.metadata["Honore's R"] = 'NA'

    def yule_k(self):
        #we find the value of the greatest number of times any word appears
        summation = []
        for i in self.hapaxes:
            summation.append(float(i**2 * self.hapaxes[i]))
        #with the summation, find K
        self.metadata["Yule's K"] = float((10**4 * (sum(summation) - self.N)) / (self.N**2))

    def frequency_dist(self):
        self.fdist = FreqDist(self.metadata["tokenized_doc"])
    
    def frequency_dist_getter(self):
        if self.fdist is None:
            self.frequency_dist()
        self.frequencies = self.fdist.items()
                
    def hapaxes_summation(self):
        self.frequency_dist()
        max_count = self.fdist[self.fdist.max()]
        # hapaxes method (only gets called if you hit else here)
        hapaxes = {}
        for n in range(1, max_count+1):
            hapaxes[n] = 0
        for i in self.fdist:
            hapaxes[self.fdist[i]] += 1
        self.hapaxes = hapaxes
    
    def thrk_getter(self):
        cursor = db.cursor()
        c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
        count = 0
        for i in c:
            count +=1
        if count > 0:
            c = cursor.execute('SELECT doc_id, t, h, r, k FROM thrk WHERE doc_id = (?)', (self.doc_id,))
            for i in c:
                self.metadata["Type-Token Ratio"] = i[1]
                self.metadata["Hapax Dislegomena"] = i[2]
                self.metadata["Honore's R"] = i[3]
                self.metadata["Yule's K"] = i[4]
        else:
            self.hapaxes_summation()    
            # make these instance variables
            self.N = float(self.fdist.N())
            self.V = float(len(self.fdist))
            
            #Just call these
            self.type_token_ratio()
            self.hap_dis_ratio()
            self.honore_r()
            self.yule_k()
            cursor.execute('INSERT INTO thrk (doc_id, t, h, r, k) VALUES (?, ?, ?, ?, ?)', (self.doc_id, self.metadata["Type-Token Ratio"], self.metadata["Hapax Dislegomena"], self.metadata["Honore's R"], self.metadata["Yule's K"]))
            db.commit()
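Yule's K above is computed through the hapaxes table (frequency of frequencies); it can also be read straight off a FreqDist. A sketch of the equivalent direct formula, assuming Python 3 division:

from nltk import FreqDist

def yule_k(tokens):
    # K = 10^4 * (sum of squared sample counts - N) / N^2, the same quantity as
    # sum(i**2 * hapaxes[i]) over the frequency classes used in the class above
    fdist = FreqDist(tokens)
    n = fdist.N()
    s2 = sum(count ** 2 for count in fdist.values())
    return 10 ** 4 * (s2 - n) / n ** 2

print(yule_k("the cat sat on the mat with the hat".split()))  # hypothetical toy input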
Example #23
0
from nltk import FreqDist
from common.books import text1

fdist = FreqDist(len(w) for w in text1())
print(fdist)
# print(fdist.keys())
# print(fdist.items())

print(fdist.most_common())
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))
Example #24
0
    def clicked_startPosTagButton(self):
        self.inputText = self.inputTextEdit.toPlainText()

        file = open(self.mainWindow.modelResults + 'Input.txt',
                    'w',
                    encoding='utf-8')
        file.write(self.inputText)
        file.close()

        tokStems = functions.tok_stem(self.inputTextEdit.toPlainText())
        normTokStems = functions.normalization(tokStems,
                                               self.mainWindow.modelSources)

        numberStems = len(normTokStems)
        numberUNK = 0

        text = ''
        counter = 0
        while counter < len(tokStems):
            text += tokStems[counter] + ' '
            if normTokStems[counter] == 'مجه':
                numberUNK += 1
            counter += 1

        file = open(self.mainWindow.modelResults + 'Affix.txt',
                    'w',
                    encoding='utf-8')
        file.write(text)
        file.close()

        stemsTags = functions.viterbi(normTokStems,
                                      self.mainWindow.modelSources)

        text = ''
        tagsText = ''
        counter = 0
        while counter < len(stemsTags):
            tag = stemsTags[counter]
            token = tokStems[counter]

            tagsText += tag + ' '
            text += token + '/' + '<span style="background-color: yellow; font: bold 11px;">' + tag + '</span>' + ' '

            counter += 1

        file = open(self.mainWindow.modelResults + 'Tag.txt',
                    'w',
                    encoding='utf-8')
        file.write(tagsText)
        file.close()

        self.parentWindow.posTagTab.taggedTextEdit.setHtml(text)

        file = open(self.mainWindow.modelResults + 'Out.txt',
                    'w',
                    encoding='utf-8')
        file.write(self.parentWindow.posTagTab.taggedTextEdit.toPlainText())
        file.close()

        self.parentWindow.posTagTab.numStemEdit.setText(str(numberStems))
        self.parentWindow.posTagTab.numUnkTagsEdit.setText(str(numberUNK))

        tagsList = tagsText.split()
        freqDist = FreqDist(tagsList)
        self.parentWindow.posTagTab.mostFreqTagEdit.setText(freqDist.max())

        self.mainWindow.statusBarLabel.setText(
            'Input text has been PoS-Tagged. Check text files in  "/model/results/"'
        )

        self.parentWindow.tabWidget.setTabEnabled(1, True)
Example #25
0
[len(w) for w in text1]

# Collocations are frequent bigrams from words that are not so common as unigrams. 
# This function returns nothing, just prints the collocations to screen
text1.collocations()
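# A raw FreqDist over bigrams, by contrast, ranks pairs purely by count, so its max()
# is usually a pair of function words; collocations() scores pairs that co-occur more
# often than their unigram frequencies would predict. (Sketch; assumes text1 as above.)
from nltk import FreqDist, bigrams
bigram_fd = FreqDist(bigrams(text1))
bigram_fd.max()            # most frequent raw bigram, typically something like ('of', 'the')
bigram_fd.most_common(5)   # compare with the collocations printed above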

# Computing the frequency distribution of word lengths. Returns a dictionary.
fdistWordLength = FreqDist([len(w) for w in text1])

fdistWordLength.keys() # The different word lengths
fdistWordLength.values() # The frequency of each word length
fdistWordLength.items() # Shows both keys and values at the same time

fdist1['the']
fdist1.freq('the') # Frequency of the word ‘the’
fdist1.max()



#### MOVIE REVIEWS ####
import nltk
from nltk.corpus import movie_reviews

movie_reviews.categories()
movie_reviews.fileids('pos')
movie_reviews.fileids('neg')
movie_reviews.words('neg/cv729_10475.txt')
len(movie_reviews.words('neg/cv729_10475.txt'))

documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
Example #26
0
File: kd.py Project: dmml/NLTK
# stem of word
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem


def lexical_diversity(text):
    return len(text) / len(set(text))

nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)#plot
fdist_title.plot(50)
total_words = len(set(nostop_title))
print("The total number of words in title of KD is: " + str(total_words))
avg_words = fdist_title.N()/total_words
print("Each word appears in title of KD is: " + str(int(avg_words)))


# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# type
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
Example #27
0
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import FreqDist

fdist = FreqDist(samples) # build a frequency distribution from the data in samples
fdist.inc(sample) # increment the count for sample by one
fdist['データ'] # number of occurrences of the given sample
fdist.freq('データ') # relative frequency of the given sample
fdist.N() # total number of samples
fdist.keys() # samples sorted by frequency
for sample in fdist: # iterate over the samples in frequency order
    pass
fdist.max() # the sample with the highest count
fdist.tabulate() # display the frequency distribution as a table
fdist.plot() # plot the frequency distribution
fdist.plot(cumulative=True) # cumulative frequency plot
fdist1 < fdist2 # test whether samples in fdist1 occur less frequently than in fdist2
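A concrete run of the same calls on a tiny, hypothetical sample list (note that fdist.inc() only exists in older NLTK; in NLTK 3 the equivalent is fdist[sample] += 1):

from nltk import FreqDist

words = 'a b a c a b'.split()   # hypothetical samples
fdist = FreqDist(words)
print(fdist['a'])        # 3
print(fdist.freq('a'))   # 0.5
print(fdist.N())         # 6
print(fdist.max())       # 'a'
fdist.tabulate()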


Example #28
0
#!/usr/bin/env python

from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist
fd = FreqDist()
cfd = ConditionalFreqDist()

# for each tagged sentence in the corpus, get the (token, tag) pair and update
# both count(tag) and count(tag given token)
for sentence in brown.tagged_sents():
    for (token, tag) in sentence:
        fd[tag] += 1
        cfd[token][tag] += 1

# The most frequent tag is ...
print(fd.max())

# Initialize a list to hold (numtags,word) tuple
wordbins = []

# Append each (n(unique tags for token),token) tuple to list
for token in cfd.conditions():
    wordbins.append((cfd[token].B(), token))

# Sort tuples by number of unique tags (highest first)
wordbins.sort(reverse=True)

# The token with max. no. of tags is ...
print(wordbins[0])

# masculine pronouns
Example #29
0
 Author: RodriguesFAS
 Email: <*****@*****.**> | <*****@*****.**>
 Website: <http://rodriguesfas.com.br> | <http://clubedosgeeks.com.br>
 Github: <https://github.com/rodriguesfas>
'''

import nltk
from nltk import FreqDist
import matplotlib.pyplot as plt
#import matplotlib

text_src = open('corpus.txt').read()
'''
	nltk.word_tokenize(text)
'''
print "=====================[word_tokenize]"
tokens = nltk.word_tokenize(text_src)
print tokens
'''
	Returns the frequency of each word.
	FreqDist()
'''
print "=====================[FreqDist]"
frequency_word = FreqDist(tokens)
print frequency_word
'''
	Returns the most frequent word.
'''
print "=====================[max]"
print frequency_word.max()
Example #30
0
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import gutenberg  # import the gutenberg corpus
##################################################################
## FreqDist tracks the sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution over the tokens of the text
print(fd)  # <FreqDist with 6132 samples and 98171 outcomes>; 6132 distinct samples, 98171 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; number of times the word occurs; FreqDist behaves like a dict
print(fd.N())  # 98171; counts word tokens (with repetitions), not letters
print(fd.B())  # 6132; number of bins or unique samples; identical words share one bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] rare words that occur only once
# The most frequent words are mostly function words, and the rarest (hapaxes) can only be understood from context;
# the words at either frequency extreme usually say little about what is distinctive in a text
for idx, word in enumerate(fd):  # iterate with enumerate; samples come in order of first appearance
    if idx == 5: break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## counting English characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in [] to turn it into a list
Example #31
0
 def most_frequent_sense_accuracy(self):
     """ Computes the accuracy of always predicting the overall most frequent sense for all instances in the dataset. """
     label_list = [inst.label for inst in self.instance_list]
     freq_dist = FreqDist(label_list)
     return freq_dist[freq_dist.max()] / len(label_list)
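The same majority-class baseline in isolation, on a hypothetical label list (Python 3 division assumed, as in the method above):

from nltk import FreqDist

labels = ['financial_bank', 'financial_bank', 'river_bank', 'financial_bank']  # hypothetical labels
fd = FreqDist(labels)
print(fd[fd.max()] / len(labels))   # 0.75 -- accuracy of always predicting the most frequent label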
Example #32
0
    text1)  # bigramsText1[0] is the tuple containing the first bigram

# Collocations are frequent bigrams from words that are not so common as unigrams.
# This function returns nothing, just prints the collocations to screen
text1.collocations()

# Computing the frequency distribution of word lengths. Returns a dictionary.
fdistWordLength = FreqDist([len(w) for w in text1])

fdistWordLength.keys()  # The different word lengths
fdistWordLength.values()  # The frequency of each word length
fdistWordLength.items()  # Shows both keys and values at the same time

fdist1['the']
fdist1.freq('the')  # Frequency of the word ‘the’
fdist1.max()

# String methods

s = "MatTias"

s.lower()

s.upper()

s.startswith("ma")

"T" in s

# Find all the words in Moby Dick that ends with -ableness. Sort then alphabetically.
from nltk.book import text2, text3, text5, text7
Example #33
0
##################################################################
## FreqDist tracks the sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution over the tokens of the text
print(fd)  # <FreqDist with 6132 samples and 98171 outcomes>; 6132 distinct samples, 98171 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; number of times the word occurs; FreqDist behaves like a dict
print(fd.N())  # 98171; counts word tokens (with repetitions), not letters
print(fd.B())  # 6132; number of bins or unique samples; identical words share one bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] rare words that occur only once
# The most frequent words are mostly function words, and the rarest (hapaxes) can only be understood from context;
# the words at either frequency extreme usually say little about what is distinctive in a text
for idx, word in enumerate(fd):  # iterate with enumerate; samples come in order of first appearance
    if idx == 5: break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################