Code example #1
def get_type_token_ratio(category=''):
	# returns the type to token ratio for the given topic
	if category=='':
		text=brown.words() # get the text from the entire corpus
	else:
		text=brown.words(categories=category) # get the text from the given category
	return float(len(set(text))) / len(text)  # float() keeps the ratio exact under Python 2 integer division
Code example #2
    def test_clusterer(self):
        """Here we take 10 documents categorized as 'government' and
        'mystery' from the brown corpus, and perform k-means clustering on
        these. Ideally the clusterer would divide them into two
        clusters.
        The clusterer generates clusters depending on random initial
        conditions, so the result can differ between test runs.
        To account for that, we run many iterations (50), which
        should give a good result. The success condition is that at
        most 2 of the 10 documents fall in the wrong cluster.
        """

        clusterer = KMeans()
        government_ids = brown.fileids(categories='government')[:10]
        mystery_ids = brown.fileids(categories='mystery')[:10]
        government_uids = []
        mystery_uids = []

        for articleid in government_ids:
            text = " ".join(brown.words(articleid))
            self.folder.invokeFactory('Document', articleid, text=text)
            government_uids.append(self.folder[articleid].UID())

        for articleid in mystery_ids:
            text = " ".join(brown.words(articleid))
            self.folder.invokeFactory('Document', articleid, text=text)
            mystery_uids.append(self.folder[articleid].UID())

        result = clusterer.clusterize(2, 50, repeats=50)
        cluster1 = set(result[0])
        missed = min(len(cluster1-set(government_uids)),
                     len(cluster1-set(mystery_uids)))
        self.failUnless(missed<=2)
Code example #3
File: parse.py Project: owenphen/Gradify
def load_brown_freq_ratios():
    brown_freqdist = nltk.FreqDist([w.lower() for w in brown.words()])
    num_words = len(brown.words())
    ratios = {}
    for word, number in brown_freqdist.iteritems():
        ratios[word] = float(number) / num_words
    return ratios
Code example #4
File: rhymelib.py Project: StefanKopieczek/pyverse
    def _build_wordset(clazz, obscurity_limit):
        # I'm sorry this method is so disgusting.
        # It's all in the cause of fast loading in the main case.

        from nltk import FreqDist

        # Ensure corpora are loaded.
        try:
            from nltk.corpus import cmudict
            cmudict.entries()
        except LookupError:
            print "CMUDict corpus not found. Downloading..."
            from nltk import download
            download('cmudict')
            print "[Done]"
        if obscurity_limit is not None:
            from nltk.corpus import brown
            try:
                brown.words()
            except LookupError:
                print "Brown corpus not found. Downloading...",
                from nltk import download
                download('brown')
                print "[Done]"

        words = cmudict.entries()
        if obscurity_limit is not None:
            freqs = FreqDist([w.lower() for w in brown.words()])
            words = sorted(words,
                           key=lambda x: freqs[x[0].lower()],
                           reverse=True)
            return words[:obscurity_limit]
        else:
            return list(words)
Code example #5
def get_prob_word_in_category(word, category=''):
	# returns the probability of the given word appearing in the given category 
	# (or the entire corpus, if no category is given).
	if category=='':
		text=brown.words() # get the text from the entire corpus
	else:
		text=brown.words(categories=category) # get the text from the given category
	return float(text.count(word)) / len(text)  # float() keeps the ratio exact under Python 2 integer division
Code example #6
def get_vocabulary_size(category=''):
	# returns the size of the vocabulary for a single category from the corpus. 
	# If no category is given, the function should return the vocabulary size for the entire corpus.
	if category=='':
		text=brown.words() # get the text from the entire corpus
	else:
		text=brown.words(categories=category) # get the text from the given category
	return len(set(text))
Code example #7
def get_top_n_words(n, category=''):
	#return the most frequent n words from a category (or the entire corpus)
	if category=='':
		text=brown.words() # get the text from the entire corpus
	else:
		text=brown.words(categories=category) # get the text from the given category
	fdist=FreqDist(text)
	top_words=[word for word, count in fdist.most_common(n)] # most_common() returns (word, count) pairs sorted by frequency
	return top_words
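Examples #1, #5, #6 and #7 share the same category-or-whole-corpus pattern; a minimal usage sketch (hypothetical driver code, assuming the four helpers live in one module together with the imports shown below):

from nltk import FreqDist
from nltk.corpus import brown

print(get_type_token_ratio('news'))              # lexical diversity of the 'news' category
print(get_vocabulary_size())                     # vocabulary size of the whole corpus
print(get_prob_word_in_category('the', 'news'))  # relative frequency of 'the' in 'news'
print(get_top_n_words(10, 'news'))               # ten most frequent tokens in 'news'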
Code example #8
def fetchCorpus():
    corpus = nltk.pos_tag(brown.words(categories="news")[:CORPUS_SIZE] +
                          brown.words(categories="editorial")[:CORPUS_SIZE] + 
                          brown.words(categories="reviews")[:CORPUS_SIZE] +
                          brown.words(categories="lore")[:CORPUS_SIZE] +
                          brown.words(categories="hobbies")[:CORPUS_SIZE])
    categories = list(set(map(lambda x:x[1], corpus)))

    return corpus, categories
Code example #9
def Automated_Readability_Index(section):
	sents = len(brown.sents(categories = section))
	words = len(brown.words(categories = section))
	text = " ".join(brown.words(categories = section))
	letters = len(text)
	uw = letters / float(words) 
	us = words / float(sents) 
	ari = (4.71 * uw) + (0.5 * us) - 21.43
	return ari
Code example #10
File: toturial.py Project: Paul-Lin/misc
def print_brown():
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news','reviews'])
    news_text=brown.words(categories='news')
    fdist=nltk.FreqDist([w.lower() for w in news_text])
    modals=['can','could','may','might','must','will']
    for m in modals:
        print m+':',fdist[m]
Code example #11
def pre_processor(grams=3):

    vocabulary = set()

    t = 0

    for di in brown.fileids():
        vocabulary = vocabulary.union(set(brown.words(di)))
        t += 1
        if t == 2:
            break

    vocabulary = list(vocabulary)

    for i, word in enumerate(vocabulary):
        wordDic[word] = i
        posiDic[i] = word

    t = 0

    x1 = np.zeros(shape=(0, grams-1), dtype=int)
    x2 = np.zeros(shape=(0, grams-1), dtype=int)
    y1 = np.zeros(shape=(0, 1), dtype=int)
    y2 = np.zeros(shape=(0, 1), dtype=int)

    for _id in brown.fileids():
        if t == 0:
            t += 1

            text = brown.words(_id)

            size_ant = x1.shape[0]
            x1.resize((x1.shape[0] + len(text) - grams - 1, grams-1))
            y1.resize((y1.shape[0] + len(text) - grams - 1, 1))

            for i in range(size_ant, size_ant + len(text) - grams-1):
                x1[i] = [wordDic[text[index]] for index in range(i, i+grams-1)]
                y1[i] = [wordDic[text[i + grams-1]]]

            continue

        text = brown.words(_id)

        size_ant = x2.shape[0]
        x2.resize((x2.shape[0] + len(text) - grams - 1, grams-1))
        y2.resize((y2.shape[0] + len(text) - grams - 1, 1))

        for i in range(size_ant, size_ant + len(text) - grams-1):
            x2[i] = [wordDic[text[index]] for index in range(i, i+grams-1)]
            y2[i] = [wordDic[text[i + grams-1]]]

        break

    return vocabulary, x1, y1, x2, y2
Code example #12
File: brown.py Project: seanbethard/plurals-english
def print_corpus_info(categories, stopwords):
    
    print("Corpus name: Brown Corpus")
    tokens = [w for w in brown.words()]
    no_stopwords = [w for w in tokens if w not in stopwords]
    print_scores(tokens, no_stopwords)

    for category in categories:
        print("Category:", category)
        tokens = [w for w in brown.words(categories=category)]
        no_stopwords = [w for w in tokens if w not in stopwords]
        print_scores(tokens, no_stopwords)
Code example #13
def plot_word_counts():
  #copying all the words in the Brown corpus
  corpus_full_text = brown.words()
  corpus_news = brown.words(categories = 'news')

  fdist = FreqDist(corpus_news)
  xx=fdist.values()
  plt.hist(xx, bins=3000)

  # Annotate the graph 
  plt.xlabel('Frequency of occurrences')
  plt.ylabel('Frequency of words in that bucket')
  plt.axis([0,500,0,500]) 
  plt.show()   
Code example #14
def syn():
	
	while True:
		#syns=wordnet.synsets(brown.words()[random.randint(1, len(brown.words())-1)].lower())
		syns=wordnet.synsets(brown.words()[random.randint(1, 1000000)].lower())
		syns2=wordnet.synsets(brown.words()[random.randint(1, 1000000)].lower())
		try:
			word=syns[0].lemmas[0].name
			word2=syns2[0].lemmas[0].name
			#print "word: ", word
			if not (word==word2) and not(word.find("_")>0 or len(word)<4) and not( word2.find("_")>0 or len(word2)<4):
				return (word,word2)
		except Exception:
			continue
Code example #15
File: text.py Project: prz3m/kind2anki
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])
Code example #16
File: vocabulary.py Project: yt71656/lda_gibbs
def load_corpus(range):  # use the nltk brown corpus for testing; the range string selects the start and end of the fileid slice
    m = re.match(r'(\d+):(\d+)$', range)  # regular expression for range strings of the form 1:5
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
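A minimal usage sketch (hypothetical call; the range string is parsed by the regular expression above):

docs = load_corpus('0:5')   # word lists for the first five Brown fileids
print(len(docs))            # -> 5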
Code example #17
def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories = category)
    wordlist = bn.words(categories = category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBP_NNPS_NNS[:10]
    print
    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories = category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if(t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print
    print "Part 4"
    print "Ratio of Masculine to Feminine is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
    print
Code example #18
def load_corpus(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
Code example #19
	def compare(self,file):
		
		word_list = []
		for a in brown.words(fileids=['cc17','ca16']):
			word_list.append(str(a))

		word_list = set(word_list)

		text = []

		with open(file, "r+b") as f:

			while 1:
				read_data = f.read(1)

				if not read_data:
					break
				text.append(read_data)
		
		text = "".join(text)
		w = set( text.split() )

		occurencies = len(word_list & w)

		return occurencies
Code example #20
File: main.py Project: Sowmith-iiit/nlp-ssp
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
Code example #21
File: Twertbot.py Project: mathieuhendey/Twitter-bot
def generateSentence():
    corpus = random.randint(0,3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    elif corpus == 3:
        text = movie_reviews.words()
    tweetString = ''
    lengthOfTweet = random.randint(0,20)
    len(text)
    firstRun = True
    blank = ' '
    startOfWord = ''
    startOfWordIndex = 0
    startingWord = random.randint(0, (len(text) - 40))
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*", "[", "]", "‘", "“", "#"]

    for x in xrange(startingWord,(startingWord + len(text))):
        startOfWord = text[x]
        if startOfWord ==".":
                startOfWordIndex = x
                break

    for x in xrange(startOfWordIndex + 1, startOfWordIndex+lengthOfTweet):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]

        elif text[x] not in punctuation:
            tweetString = tweetString + blank + text[x]
    return tweetString
Code example #22
File: brown.py Project: liuwaiting/localrepo
def word_freq(word, genre):
    word_list = brown.words(categories=genre)
    num = 0
    for w in word_list:
        if w == word:
            num += 1
    return num
def load_dict():
    category_noun_dictionary = {}
    
    br_cats=['adventure', 'fiction', 'mystery', 'reviews', 'science_fiction']
    #Finding top 500 words from categories
    for category in br_cats:
        top_words_category=[]
        words_of_category = brown.words(categories=category)
        category_word_freq = nltk.FreqDist(w.lower() for w in words_of_category)
        top_in_category = category_word_freq.most_common(500)
        for i in top_in_category:
            top_words_category.append(i[0])
        top_words_category = set(top_words_category)
        category_noun_dictionary[str(category)] = top_words_category

    #Get top 500 words from Programming Language
    reload(sys)  
    sys.setdefaultencoding('Cp1252')
    plfile = open("PL_corpora.txt").read()
    pl_top=[]
    plwords=word_tokenize(plfile);
    pl_freq = nltk.FreqDist(w.lower() for w in plwords)
    pl_topf=pl_freq.most_common(500)
    for i in pl_topf:
        pl_top.append(i[0])
    pl_top = set(pl_top)
    category_noun_dictionary['programming_language'] = pl_top
    
    return category_noun_dictionary
Code example #24
def partOfSpeechTagging():

    from nltk.corpus import brown

    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
    print common_suffixes 


    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    classifier = nltk.DecisionTreeClassifier.train(train_set)
    nltk.classify.accuracy(classifier, test_set)

    classifier.classify(pos_features('cats'))

    print classifier.pseudocode(depth=4)
Code example #25
File: mainDLG.py Project: devsjee/MDL
def load_corpus(fname):
	''' takes the filename in which space separated text is located 
	and returns as the continous text in lowercase and spaces replaced by underscores
        fname : file name as string
	'''
	global DELIM
	data = ''
	file_text = None
	#punctuation ='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r\x0b\x0c'
	
	if fname == 'BROWN':
		words = brown.words()
		file_text = ' '.join(words)
	else:
		with open(fname,'r') as f:
			file_text = f.readlines()

	for line in file_text:
		text = line.lower()
		text = text.strip()
#    		for punc in punctuation:
#			text = text.replace(punc,' ')
		text = text.replace('  ',' ')
		text = text.replace('  ',' ')
		text = text.replace('  ',' ')
		text = text.replace(' ','_')
		data += text

	return data
Code example #26
def question2(category):
	#print
	#print "For Category: " + category
	#print "Words with the tag 'JJ':"
	#print
	words = bn.tagged_words(categories = category)
	wordlist = bn.words(categories = category)
	words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
	print len(words_JJ)
	print
	print
	print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
	print
	words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
	print words_VBP_NNPS_NNS[:10]
	print
	print
	print "Ratio"
	print
	male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
	female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
	male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
	female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
	print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
	print
	print
	sent = ""
	print "3 word prepositional phrases are:"
Code example #27
def getSimpleQuestionKeywords(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)

    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)

    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)

    filteredparts = []
    for pair in tagged:
        if pair[1] in ['FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS', 'NP', 'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD']:
            filteredparts.append(pair[0])

    filtereddist = {}
    for word in filteredparts:
        frequency = browndist[word] + reutersdist[word]
        if frequency < 600:
            filtereddist[word] = frequency
    sortedlist = sorted(filtereddist.items(), key=itemgetter(1))
    keywords = []
    for pair in sortedlist:
        keywords.append(pair[0])
    return keywords
Code example #28
File: c05_auto_tagging.py Project: AkiraKane/Python
def lookupTagger():

    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)

    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)

    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
            backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()   
Code example #29
File: lib_kron.py Project: howonlee/kroneckerText
def get_brown_freqs(net):
    counts = collections.Counter(brown.words())
    freq_counts = counts.most_common()
    for i, node in enumerate(sorted(net.degree_iter(), key=itemgetter(1), reverse=True)):
        node_id, _ = node
        net.node[node_id]["word"] = freq_counts[i][0]
    return net
Code example #30
 def create_engBoW_brown(self): # nltk brown corpus
     BoW = Counter()
     for word in brown.words():
         BoW[word] += 1.
     return BoW
Code example #31
File: taggers.py Project: carlos-ochoa/NLP
tags = default_tagger.tag(tokens)
print(tags)

# Regular Expression Tagger
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]

print('\n\nRegexp tagger\n')
regexp_tagger = nltk.RegexpTagger(patterns)
tags = regexp_tagger.tag(tokens)
print(tags)

# Lookup tagger
print('\n\nLookup tagger\n')
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, cant) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=default_tagger)
tags = baseline_tagger.tag(tokens)
print(tags)

# N-Gram tagger
print('\n\nN-Gram tagger\n')
brown_tagged_sents = brown.tagged_sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
tags = unigram_tagger.tag(tokens)
print(tags)

size = int(len(brown_tagged_sents) * 0.9)
Code example #32
## basic
import urllib2
from bs4 import BeautifulSoup
target_website = "http://www.realclearpolitics.com/2015/08/27/"
html = urllib2.urlopen(target_website).read()

soup = BeautifulSoup(html)
soup.prettify()
soup.get_text()


## NLP for currated data sources:
import nltk
from nltk.corpus import brown
brown.words()
from nltk.collocations import *
## bigram measures
bigram_measures = nltk.collocations.BigramAssocMeasures()
dir(bigram_measures)
## trigram reps
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = BigramCollocationFinder.from_words(nltk.corpus.genesis.words('english-web.txt'))
finder.nbest(bigram_measures.pmi, 10)

nltk.corpus.genesis.words('english-web.txt')
len(nltk.corpus.genesis.words('english-web.txt'))
nltk.corpus.genesis.words('english-web.txt')[1:100]


## Moby Dick
Code example #33
def exercise11():
    print
    print "Exercise 11"
    cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modal_verb = ['shall', 'should', 'may', 'had', 'have']
    print "Tabulation data for closed class of words in english"
    print "For modal verbs:"
    cfd.tabulate(conditions = genres, samples = modal_verb)
    print
    print "For Prepositions:"
    prepositions = ['for', 'from', 'onto', 'to', 'with']
    cfd.tabulate(conditions = genres, samples = prepositions)
    print
    print "For Pronoun:"
    pronoun = ['me', 'she', 'her', 'I', 'we']
    cfd.tabulate(conditions = genres, samples = pronoun)
    print
Code example #34
from collections import Counter
from nltk.corpus import brown
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

#Code to read the sentence from the file
file = open('test.txt', 'r')
sentence = file.read()
sentence_broke = sent_tokenize(sentence)

#The following code snippets work in the following fashion
#Line 1 refers to an array that stores the count of the n-grams
#Line 2 refers to the brown corpus broken into n-grams
#Line 3 calculates the frequency of the ngrams in corpus

#Code snippet that works upon the unigrams list
unigrams = ngrams(brown.words(), 1)
unigrams_freq = Counter(unigrams)

#Code snippet that works upon the bigrams list
bigrams = ngrams(brown.words(), 2)
bigrams_freq = Counter(bigrams)

#Code snippet that works upon the trigrams list
trigrams = ngrams(brown.words(), 3)
trigrams_freq = Counter(trigrams)

len_corpus = brown.words().__len__()

for sentence in sentence_broke:
    tokened = RegexpTokenizer(r'\w+')
    tokened = tokened.tokenize(sentence)
Code example #35
print('STOPWORD LANGUAGES:')
print(stopwords.fileids())
print('English stopwords', len(stopwords.words('english')))
print(stopwords.words('english')[:20])
print('Turkish stopwords', len(stopwords.words('turkish')))
print(stopwords.words('turkish')[:20])
print()

# TAGGED
# remember categories and fileids
# print brown.categories()
# print brown.words(categories='news')
# print brown.sents(categories=['news', 'editorial', 'reviews'])
print('POS TAGS:')
print('WORDS:')
print(brown.words()[:5])
print(brown.tagged_words()[:5])
print('SENTS:')
print([s[:5] for s in brown.sents()[:5]])
print([s[:5] for s in brown.tagged_sents()[:5]])
print()

# CHUNKED
# The CoNLL 2000 Corpus includes phrasal chunks
# The CoNLL 2002 Corpus includes named entity chunks
print('CHUNKING & NER:')
print(conll2000.fileids())
print(conll2000.sents()[0])
print(conll2000.chunked_sents()[0])
print(conll2002.sents()[0])
print(conll2002.chunked_sents()[0])
Code example #36
File: search.py Project: RachelBin/nltk-neo4j
word_tag_fd = nltk.FreqDist(wsj)

idx1 = wsj.index(('kicked', 'VD'))


def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions())


tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
    print tag, tagdict[tag]

brown_learned_text = brown.words(categories='learned')
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

brown_lrnd_tagged = brown.tagged_words(categories='learned',
                                       simplify_tags=True)
tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()


def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print w1, w2, w3

Code example #37
import urllib.request

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

from nltk.corpus import brown
import os
print(brown.words())
print(len(brown.words()))

if "NLTK_DATA" in os.environ:
    print(os.environ.get("NLTK_DATA"))


def get_data(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")

    text = soup.get_text(strip=True)
    tokens = [t for t in text.split()]
    print(tokens)
    freq = nltk.FreqDist(tokens)
    freq.plot(20, cumulative=False, title='Word Count Token')

    stop_words = stopwords.words('english')
    clean_tokens = [t for t in tokens if t not in stop_words]
    freq = nltk.FreqDist(clean_tokens)
    freq.plot(20, cumulative=False, title='Word Count Clean Token')
Code example #38
        cost.append(c)

    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k

    return " ".join(reversed(out))


link = set()
words = set(
    list(wd.words()) + list(brown.words()) + word_man + list(udhr.words()) +
    list(cess.words()))
some_variable = 0


def fcn(domain_data, pt, date):
    list_no = domain_data[1]
    forbids = [
        '[', '`', '\\', '-', '=', '~', '!', '@', '#', '$', '%', '^', '&', '*',
        '(', ')', '_', '+', '\\', '[', '\\', ']', '{', '}', ';', "'", '\\',
        ':', '"', '|', '<', ',', '.', '/', '<', '>', '?', ']'
    ]
    file = open('filtered_domains.txt', 'a')
    global words, link, some_variable, result_list, result_list_b, master_data
    domain = domain_data[0]
    inter = list(set(forbids) & set(domain.split(".")[0]))
Code example #39
    - Frequency Lists
    - Collocations
    - Data Analysis with R
    - Concordance Analysis (Patterns, Constructions?)
        - Patterns on sentence strings
        - Patterns on sentence word-tag strings

## Preparing Corpus Data

import nltk
from nltk.corpus import brown
from nltk.text import Text
import pandas as pd
import numpy as np

brown_text = Text(brown.words())

## Collocations

- Documentation [nltk.collocations](https://www.nltk.org/howto/collocations.html)
- `nltk.collocations`: Get the `BigramCollocationFinder` which we can use to find n-grams
- `nltk.metrics`: Get the `BigramAssocMeasures` to define collocations (It's also available in `nltk.collocations`)
- Use `finder.nbest()` methods to select/filter collocations

## Collocations based on Text
brown_text.collocation_list()[:10]
#brown_text.collocations()

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = nltk.collocations.BigramAssocMeasures() # measures
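The cell is cut off after creating the measures object; a minimal sketch of the finder-based workflow described in the bullet list above, assuming the brown corpus imported earlier in this example:

finder = BigramCollocationFinder.from_words(brown.words())
finder.apply_freq_filter(10)                  # ignore bigrams seen fewer than 10 times
print(finder.nbest(bigram_measures.pmi, 20))  # top 20 collocations by PMI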
Code example #40
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords
import random, pprint

# === Global Variables === #
# brown text corpora
text = brown.words()
stopwords_en = stopwords.words('english')
bigrams = nltk.bigrams(text)

# nltk frequency distribution
freqDist = nltk.FreqDist(text)
conditionalFreqDist = nltk.ConditionalFreqDist(bigrams)
oppositeConditionalFreqDist = nltk.ConditionalFreqDist(
    (back, front) for front, back in bigrams)
# ======================== #


def get_adverbs():
    """
    Process intensity-modifying adverbs, from NLTK wordnet corpus. 
    Returns list of adverbs. 
    """
    # keywords from definition of 'highly' and 'very'
    keywords = [
        'extent', 'intensifier', 'intensity', 'quantifier', 'degree',
        'comparative'
Code example #41
"""

# Import Dependencies ---------------------------------------------------------
print("Importing dependencies...")

import pandas as pd
import json
import gzip
import numpy as np

import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('brown')
word_list = brown.words()
word_set = set(word_list)
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Importing the data ----------------------------------------------------------
print("Importing data...")
books = json.load(
Code example #42
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import brown
corpus = brown.words()
frequencia = nltk.FreqDist(corpus)
frequencia = list(frequencia.items())
frequencia = sorted(frequencia, key=lambda x: x[1], reverse=True)
mais_frequentes = frequencia[0:50]
unigramas = [i[0] for i in mais_frequentes]
valores = [i[1] for i in mais_frequentes]
tags = [i[1] for i in (nltk.pos_tag(unigramas))]
tags = nltk.FreqDist(tags)
print(tags.items())
plt.bar(unigramas, valores)
plt.show()
Code example #43
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import re
import string
import math
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from io import IOBase

#%%
#Read in raw data
raw_data = brown.words()

#%%
#Clean up data
data = [word for word in raw_data if not re.fullmatch('['+string.punctuation+']+',word)]
data = [word for word in data if not re.fullmatch('['+string.digits+']+',word)]
data = [word.lower() for word in data]
stop_words = set(stopwords.words('english'))
data = [word for word in data if not word in stop_words]

#%%
#Get the frequency distribution of all of the (cleaned up) words in corpus
fd = nltk.FreqDist(data)

#%%
#Get list of 1002 (Set C) and 5002 (Set V) most common words in corpus.
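#%%
# (sketch) the snippet breaks off at the comment above; the step it describes,
# using the fd FreqDist already built (1002 and 5002 are the sizes named in the comment)
most_common_words = [word for word, count in fd.most_common(5002)]
set_C = most_common_words[:1002]   # Set C: 1002 most common words
set_V = most_common_words[:5002]   # Set V: 5002 most common words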
Code example #44
@license: Apache Licence 
@contact: [email protected]
@site: 
@software: PyCharm
@file: D8.py
@time: 2017/12/28 0028 11:59 AM
"""
# Searching tokenized text
import nltk
from nltk.book import gutenberg, nps_chat

moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a><.*><man>')

chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*><.*><bro>')
chat.findall(r'<l.*>{3,}')

nltk.app.nemo()
nltk.re_show(r'\d', 'we are good friend, and you? how old are you? 25')
import re
re.search(r'\d', 'we are good friend, and you? how old are you? 25')

from nltk.corpus import brown
hobbies_learend = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learend.findall(r'<\w*><and><other><\w*s>')

# as x as y
x_y = nltk.Text(brown.words(categories=['fiction', 'romance']))
x_y.findall(r'<as><\w*><as><\w*>')
Code example #45
File: 02-15.py Project: shuxinzhang/nltk-learning
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk 
'''
◑ Write a program to find all words that occur at least three times in the Brown Corpus.
'''
from nltk.corpus import brown
from nltk import FreqDist
brown_words = [word.lower() for word in brown.words()]
fd = FreqDist(brown_words)
wordlist = []
for word in brown_words:
	if fd[word] >= 3 and word.islower():  # at least three occurrences
		wordlist.append(word)
print sorted(set(wordlist))
Code example #46
File: 2.Corpora.py Project: tej1996nitrr/NLP
# %%
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth_sentences)

# %%
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

# %%
#Web and Chat Text
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[0]
# %%
#Brown Corpus
brown.categories()
news = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, ":", fdist[m])
# %%
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# %%
#Reuters Corpus
reuters.fileids()
reuters.categories()
Code example #47
    stop = stopwords.words('english')
    b = ['where', 'how', 'what', 'who', 'why']
    stop = list(set(stop) - set(b))

    stem = PorterStemmer()
    nltk.download('wordnet')
    lemma = WordNetLemmatizer()

    data1 = data
    #data1['data'] = data['data'].apply(func)

    x = TfidfVectorizer(stop_words=stop)
    y = x.fit_transform(data1['data'])

    nltk.download('brown')
    a = FreqDist(i.lower() for i in brown.words())
    b = a.most_common()
    words = []
    for j in b:
        if j[0] not in stop and len(j[0]) > 4:
            words.append(j[0])

    ####################################################

    model = load_model('./../models/new_model8.h5')
    # Get stream from webcam and set parameters)
    vs = VideoStream().start()

    # max number of hands we want to detect/track
    num_hands_detect = 1
Code example #48
File: Bag of Words.py Project: jieuhyl/Deep_Learning
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 17:21:05 2020

@author: Jie.Hu
"""
''' Bag of Words '''
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
#from text_chunker import chunker

# Read the data from the Brown corpus
input_data = ' '.join(brown.words()[:5400])

# Number of words in each chunk
chunk_size = 800

text_chunks = chunker(input_data, chunk_size)

# Convert to dict items
chunks = []
for count, chunk in enumerate(text_chunks):
    d = {'index': count, 'text': chunk}
    chunks.append(d)

# Extract the document term matrix
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform(
    [chunk['text'] for chunk in chunks])
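chunker comes from an external text_chunker module that is not shown here (its import is commented out above); a hypothetical sketch of such a helper, splitting the text into chunks of roughly chunk_size words, along the lines of the splitter in example #59:

def chunker(input_data, chunk_size):
    # break the raw text into whitespace-separated words
    words = input_data.split(' ')
    output = []
    cur_chunk = []
    for word in words:
        cur_chunk.append(word)
        if len(cur_chunk) == chunk_size:
            output.append(' '.join(cur_chunk))
            cur_chunk = []
    if cur_chunk:
        output.append(' '.join(cur_chunk))
    return output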
Code example #49
#!/usr/bin/python
from nltk.corpus import brown
import random

corpus_length = len(brown.words())
hardcopy = brown.words()


def create_docs(number_of_words_per_doc=200, num_doc=10, startnr=0):
    # control number of words per doc
    # and number of documents
    # we fix the line length at 20
    line_length = 20
    number_of_lines = int(number_of_words_per_doc / line_length)

    for i in range(0, num_doc):
        # create new file with writing + permission
        new_file = open(
            "textdoc" + str(number_of_words_per_doc) + "words" +
            str(i + startnr) + ".txt", "w+")
        for line in range(0, number_of_lines):
            words = list(
                map(lambda x: hardcopy[x:x + line_length],
                    random.sample(range(corpus_length), line_length)))
            sentences = list(map(lambda x: ' '.join(word for word in x),
                                 words))
            text = ''.join(map(str, sentences))
            new_file.write(text + "\n")
        new_file.close()
        if (i % 10 == 0):
            print("You created " + str(i) + " files! " + str(num_doc - i) +
Code example #50
from nltk.corpus import brown
import nltk

suffix_fdist = nltk.FreqDist()

for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

top_items = sorted(suffix_fdist.viewitems(), key=lambda t: t[1] * -1)[:20]
common_suffixes = [suffix for (suffix, count) in top_items]


def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features


tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.DecisionTreeClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
print classifier.classify(pos_features('cats'))
Code example #51
print('Length of synsets', len(nouns))
yes = []
for w in nouns:
    if len(w.hyponyms()) != 0:
        yes.append(w)
print('Length of synsets with hyponyms', len(yes))
print('Percentage of noun synsets with no hypnoyms',
      (100 - (len(yes) / len(nouns)) * 100))

#Number 3 (2.19) in HW3

print('################ Number 3 ################')
#Creating a Confitional Frequency Distribution to tabulate all of the words in each genre included in the corpus
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = brown.categories()
modals = [
    'mysterious', 'sad', 'kiss', 'death', 'congress', 'beautiful', 'cry',
    'leave', 'love', 'pope', 'horrible'
]
cfd.tabulate(conditions=genres, samples=modals)

#Number 4 in HW3

print('################ Number 4 ################')


def censor(bad, good, text):
    #Split the text up into a list
    text_list = text.split(" ")
Code example #52
#Nouns are most commonly found after AT(determiner), JJ(adjective), IN(preposition), NN(noun), PP$(pronoun), DT(determiner), CC(conjunction), VBG(gerunds) , AP(adjective), and ,(comma)

 
#18 Generate some statistics for tagged data to answer the following questions:
wordfreq=nltk.ConditionalFreqDist(tag_words) 
cond = wordfreq.conditions()   

 #18a What proportion of word types are always assigned the same part-of-speech tag?
solo= [a for a in cond if len(wordfreq[a]) == 1]
pct_solo = len(solo) / len(cond) 

 #18b How many words are ambiguous, in the sense that they appear with at least two tags?
non_solo = len(cond) - len(solo) 
 
 #18c What percentage of word tokens in the Brown Corpus involve these ambiguous words?
brown = set(brown.words())
brown_solo = [a for a in solo if a in brown]
pct_brown_non_solo = 1- (len(brown_solo) / len(brown))
 
 
#21 In 3.1 we saw a table involving frequency counts for the verbs adore, love, like, prefer and 
#preceding qualifiers absolutely and definitely. Investigate the full range of adverbs that appear before these four verbs.
tag_bigs = nltk.bigrams(tag_words)
verb_preceders = [a[0] for (a, b) in tag_bigs if b[0] in ('adore', 'love' , 'like' , 'prefer') and a[1] == ('RB')]
set(verb_preceders)

#22 We defined the regexp_tagger that can be used as a fall-back tagger for unknown words. This tagger only checks for cardinal numbers. 
#By testing for particular prefix or suffix strings, it should be possible to guess other tags. For example, we could tag any word that ends with -s as a plural noun. 
#Define a regular expression tagger (using  RegexpTagger()) that tests for at least five other patterns in the spelling of words. (Use inline documentation to explain the rules.)
patterns = [
    (r'.*ing$', 'VBG'), # gerunds
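    # (sketch) the excerpt is cut off after the first pattern; the list usually
    # continues with patterns like those in example #31 -- tag choices are illustrative
    (r'.*ed$', 'VBD'),                   # simple past
    (r'.*es$', 'VBZ'),                   # 3rd person singular present
    (r'.*ould$', 'MD'),                  # modals: could, would, should
    (r'.*\'s$', 'NN$'),                  # possessive nouns
    (r'.*s$', 'NNS'),                    # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),    # cardinal numbers
    (r'.*', 'NN')                        # default: tag everything else as a noun
]
regexp_tagger = nltk.RegexpTagger(patterns)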
Code example #53
 def nltk_similar(self, word):
     text = Text(word.lower() for word in brown.words())
     print(text.similar(word))
Code example #54
]
print(nltk.FreqDist(wordlist_suffixes).most_common(20))
raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government.  Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""
raw_tokens = word_tokenize(raw)
raw_stems = [stem(t) for t in raw_tokens]
print(raw_stems)

#searching tokenized text
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
print(moby.findall(r'<a><man>'))  #print only a man
print(moby.findall(r'<a>(<.*>)<man>'))  #prints words between a and man
chat_words = nltk.Text(nps_chat.words())
print(chat_words.findall(r'<.*><.*><bro>'))
print(chat_words.findall(r'<l.*>{3,}'))
#discover hypernyms in text i.e a and other ys
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
print(hobbies_learned.findall(r'<\w*><and><other><\w*s>'))
print(hobbies_learned.findall(r'<\w*><as><\w*>'))

#text normalization
#stemmers - to remove affixes from words, 2 off-the-shelf in nltk 1.PorterStemmer 2.LancasterStemmer
print(raw_tokens)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
print([porter.stem(w) for w in raw_tokens])
print([lancaster.stem(w) for w in raw_tokens])


#Indexing a Text Using a Stemmer, support search for alternative forms of words
#revise later
class IndexedText(object):
Code example #55
File: test223a.py Project: tspeaking/zipfs-law
# test223a.py - confirmation of Zipf's Law, using brown corpus in nltk

import nltk
import math
import matplotlib.pyplot as plt
from nltk.corpus import brown


def zipf(text):
    '''text - a list of words
    '''
    tokens = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(tokens)
    f = sorted([math.log(v) for v in fdist.values()], reverse=True)
    r = list(range(1, len(f) + 1))

    if len(f) == len(r):
        plt.plot(r, f)
        plt.xlabel('Rank')
        plt.ylabel('Frequency')

        plt.title(r'''Zipf's Law''')

        plt.show()
    else:
        print("The rank does not match the frequency.")


if __name__ == '__main__':
    zipf(brown.words(categories='news'))
Code example #56
File: posTaging.py Project: sa91/Language-Modelling
from HMM import HiddenMarkovModel
from nltk.corpus import brown
import numpy as np

TopElem = 100
Folds = 10
Tags =10
Obs = list(set(brown.words()))
Sentences = brown.sents()
alphabetReverseMap = {Obs[t]:t for t in xrange(len(Obs))}
number_obs = len(Obs)
number_sents = len(Sentences)
foldsize = number_sents/Folds
for fold in xrange(Folds):
    train = Sentences[0:foldsize*fold] + Sentences[foldsize*(fold+1):-1]
    train = [alphabetReverseMap[word] for sentence in train for word in sentence]
    #print train[:20]
    hmm = HiddenMarkovModel(Tags,len(train),number_obs)
    I = np.random.rand(Tags)
    A = np.random.rand(Tags,Tags)
    B = np.random.rand(Tags,number_obs)
    I/=I.sum()
    for i in xrange(Tags):
        A[i][:] /= A[i][:].sum()
        B[i][:] /= B[i][:].sum()
    #print A[1:3]
    #print B[1:3]
    #print I
    I,A,B = hmm.forward_backward(I,A,B,train)

    print "#Fold: ", fold
Code example #57
File: collocations_app.py Project: aczapata/twitter
from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino,
                         indian, floresta, mac_morpho, machado, cess_esp)
from nltk.util import in_idle
from nltk.probability import FreqDist


CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 100

_DEFAULT = 'English: Brown Corpus (Humor)'
_CORPORA = {
            'Catalan: CESS-CAT Corpus':
                lambda: cess_cat.words(),
            'English: Brown Corpus':
                lambda: brown.words(),
            'English: Brown Corpus (Press)':
                lambda: brown.words(categories=['news', 'editorial', 'reviews']),
            'English: Brown Corpus (Religion)':
                lambda: brown.words(categories='religion'),
            'English: Brown Corpus (Learned)':
                lambda: brown.words(categories='learned'),
            'English: Brown Corpus (Science Fiction)':
                lambda: brown.words(categories='science_fiction'),
            'English: Brown Corpus (Romance)':
                lambda: brown.words(categories='romance'),
            'English: Brown Corpus (Humor)':
                lambda: brown.words(categories='humor'),
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
Code example #58
# The Brown Corpus was the first million-word electronic corpus in English, created in 1961 at Brown University. This corpus contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on (for a complete genre list, see http://icame.uib.no/brown/bcm-los.html).

import nltk
from nltk.corpus import brown

print(brown.categories())
print()

print(brown.words(categories="humor"))
print()

print(brown.words(fileids=["ch15"]))
print()

print(brown.sents(categories=["mystery","science_fiction", "adventure"]))
Code example #59
def splitter(data, num_words):
    # split the input text into chunks of num_words words each
    words = data.split(' ')
    output = []

    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0

    output.append(' '.join(cur_words) )

    return output 

if __name__=='__main__':
    # Read the data from the Brown corpus
    data = ' '.join(brown.words()[:10000])

    # Number of words in each chunk 
    num_words = 1700

    chunks = []
    counter = 0

    text_chunks = splitter(data, num_words)

    print("Number of text chunks =", len(text_chunks))
Code example #60
File: category_nltk.py Project: brenden17/infinity
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)