def get_type_token_ratio(category=''):
    # returns the type to token ratio for the given topic
    if category == '':
        text = brown.words()  # get the text from the entire corpus
    else:
        text = brown.words(categories=category)  # get the text from the given category
    return float(len(set(text))) / len(text)  # float() guards against integer division on Python 2
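A minimal usage sketch for the helper above, assuming `brown` has been imported from `nltk.corpus` and the corpus downloaded via `nltk.download('brown')`:

# usage sketch (assumes: from nltk.corpus import brown)
print(get_type_token_ratio())        # lexical diversity of the whole corpus
print(get_type_token_ratio('news'))  # lexical diversity of the 'news' category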
def test_clusterer(self):
    """Here we take 10 documents categorized as 'government' and 'mystery'
    from the brown corpus, and perform k-means clustering on these.
    Optimally we would like the clusterer to divide them in two clusters.
    The clusterer generates clusters depending on random initial conditions,
    so the result can be different in different test runs. In order to
    account for that, we run a lot of iterations (50), which hopefully will
    generate a good result. The success condition is that a maximum of 2 out
    of 10 documents will fall in the wrong cluster.
    """
    clusterer = KMeans()
    government_ids = brown.fileids(categories='government')[:10]
    mystery_ids = brown.fileids(categories='mystery')[:10]
    government_uids = []
    mystery_uids = []

    for articleid in government_ids:
        text = " ".join(brown.words(articleid))
        self.folder.invokeFactory('Document', articleid, text=text)
        government_uids.append(self.folder[articleid].UID())

    for articleid in mystery_ids:
        text = " ".join(brown.words(articleid))
        self.folder.invokeFactory('Document', articleid, text=text)
        mystery_uids.append(self.folder[articleid].UID())

    result = clusterer.clusterize(2, 50, repeats=50)
    cluster1 = set(result[0])
    missed = min(len(cluster1 - set(government_uids)),
                 len(cluster1 - set(mystery_uids)))
    self.failUnless(missed <= 2)
def load_brown_freq_ratios():
    brown_freqdist = nltk.FreqDist([w.lower() for w in brown.words()])
    num_words = len(brown.words())
    ratios = {}
    for word, number in brown_freqdist.iteritems():
        ratios[word] = float(number) / num_words
    return ratios
def _build_wordset(clazz, obscurity_limit):
    # I'm sorry this method is so disgusting.
    # It's all in the cause of fast loading in the main case.
    from nltk import FreqDist

    # Ensure corpora are loaded.
    try:
        from nltk.corpus import cmudict
        cmudict.entries()
    except LookupError:
        print "CMUDict corpus not found. Downloading..."
        from nltk import download
        download('cmudict')
        print "[Done]"
    if obscurity_limit is not None:
        from nltk.corpus import brown
        try:
            brown.words()
        except LookupError:
            print "Brown corpus not found. Downloading...",
            from nltk import download
            download('brown')
            print "[Done]"

    words = cmudict.entries()
    if obscurity_limit is not None:
        freqs = FreqDist([w.lower() for w in brown.words()])
        words = sorted(words,
                       key=lambda x: freqs[x[0].lower()],
                       reverse=True)
        return words[:obscurity_limit]
    else:
        return list(words)
def get_prob_word_in_category(word, category=''):
    # returns the probability of the given word appearing in the given category
    # (or the entire corpus, if no category is given).
    if category == '':
        text = brown.words()  # get the text from the entire corpus
    else:
        text = brown.words(categories=category)  # get the text from the given category
    return float(text.count(word)) / len(text)  # float() avoids integer division on Python 2
def get_vocabulary_size(category=''):
    # returns the size of the vocabulary for a single category from the corpus.
    # If no category is given, the function should return the vocabulary size
    # for the entire corpus.
    if category == '':
        text = brown.words()  # get the text from the entire corpus
    else:
        text = brown.words(categories=category)  # get the text from the given category
    return len(set(text))
def get_top_n_words(n, category=''):
    # return the most frequent n words from a category (or the entire corpus)
    if category == '':
        text = brown.words()  # get the text from the entire corpus
    else:
        text = brown.words(categories=category)  # get the text from the given category
    fdist = FreqDist(text)
    # fdist.keys() is no longer frequency-sorted in NLTK 3; most_common() is reliable
    return [word for word, count in fdist.most_common(n)]
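The three category helpers above share the same selection pattern; a short hedged driver, assuming `brown` and `FreqDist` are imported as in the definitions:

# usage sketch for the helpers defined above
print(get_vocabulary_size('news'))               # distinct word types in 'news'
print(get_top_n_words(10, 'news'))               # ten most frequent words
print(get_prob_word_in_category('the', 'news'))  # relative frequency of 'the'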
def fetchCorpus():
    # CORPUS_SIZE is assumed to be a module-level constant.
    corpus = nltk.pos_tag(brown.words(categories="news")[:CORPUS_SIZE] +
                          brown.words(categories="editorial")[:CORPUS_SIZE] +
                          brown.words(categories="reviews")[:CORPUS_SIZE] +
                          brown.words(categories="lore")[:CORPUS_SIZE] +
                          brown.words(categories="hobbies")[:CORPUS_SIZE])
    categories = list(set(map(lambda x: x[1], corpus)))
    return corpus, categories
def Automated_Readability_Index(section):
    sents = len(brown.sents(categories=section))
    words = len(brown.words(categories=section))
    # Count characters from the words themselves; joining them with spaces
    # would inflate the letter count by one per word.
    letters = sum(len(w) for w in brown.words(categories=section))
    uw = letters / float(words)  # average word length
    us = words / float(sents)    # average sentence length
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
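A short driver for the function above, assuming `nltk.download('brown')` has run; it tabulates the index per genre:

# hedged usage sketch: print the ARI for every Brown genre
from nltk.corpus import brown
for section in brown.categories():
    print('%-16s %.2f' % (section, Automated_Readability_Index(section)))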
def print_brown():
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news', 'reviews'])
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print m + ':', fdist[m]
def pre_processor(grams=3):
    # wordDic and posiDic are assumed to be module-level dicts mapping
    # words to indices and indices back to words.
    vocabulary = set()
    t = 0
    for di in brown.fileids():
        vocabulary = vocabulary.union(set(brown.words(di)))
        t += 1
        if t == 2:
            break
    vocabulary = list(vocabulary)
    for i, word in enumerate(vocabulary):
        wordDic[word] = i
        posiDic[i] = word

    t = 0
    x1 = np.zeros(shape=(0, grams - 1), dtype=int)
    x2 = np.zeros(shape=(0, grams - 1), dtype=int)
    y1 = np.zeros(shape=(0, 1), dtype=int)
    y2 = np.zeros(shape=(0, 1), dtype=int)
    for _id in brown.fileids():
        if t == 0:
            # first file: build the x1/y1 arrays
            t += 1
            text = brown.words(_id)
            size_ant = x1.shape[0]
            x1.resize((x1.shape[0] + len(text) - grams - 1, grams - 1))
            y1.resize((y1.shape[0] + len(text) - grams - 1, 1))
            for i in range(size_ant, size_ant + len(text) - grams - 1):
                x1[i] = [wordDic[text[index]] for index in range(i, i + grams - 1)]
                y1[i] = [wordDic[text[i + grams - 1]]]
            continue
        # second file: build the x2/y2 arrays
        text = brown.words(_id)
        size_ant = x2.shape[0]
        x2.resize((x2.shape[0] + len(text) - grams - 1, grams - 1))
        y2.resize((y2.shape[0] + len(text) - grams - 1, 1))
        for i in range(size_ant, size_ant + len(text) - grams - 1):
            x2[i] = [wordDic[text[index]] for index in range(i, i + grams - 1)]
            y2[i] = [wordDic[text[i + grams - 1]]]
        break
    return vocabulary, x1, y1, x2, y2
def print_corpus_info(categories, stopwords):
    # print_scores is assumed to be defined elsewhere; a sketch follows below.
    print("Corpus name: Brown Corpus")
    tokens = [w for w in brown.words()]
    no_stopwords = [w for w in tokens if w not in stopwords]
    print_scores(tokens, no_stopwords)
    for category in categories:
        print("Category:", category)
        tokens = [w for w in brown.words(categories=category)]
        no_stopwords = [w for w in tokens if w not in stopwords]
        print_scores(tokens, no_stopwords)
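`print_scores` is not defined in this snippet; a hypothetical sketch of what it might report (token counts and lexical diversity), so the function above runs standalone:

def print_scores(tokens, no_stopwords):
    # Hypothetical helper (not from the original source): report token counts
    # and type/token ratio after stopword removal.
    print('  tokens: %d, without stopwords: %d' % (len(tokens), len(no_stopwords)))
    print('  lexical diversity: %.4f' % (len(set(no_stopwords)) / len(no_stopwords)))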
def plot_word_counts():
    # copying all the words in the Brown corpus
    corpus_full_text = brown.words()
    corpus_news = brown.words(categories='news')
    fdist = FreqDist(corpus_news)
    xx = fdist.values()
    plt.hist(xx, bins=3000)
    # Annotate the graph
    plt.xlabel('Frequency of occurrences')
    plt.ylabel('Frequency of words in that bucket')
    plt.axis([0, 500, 0, 500])
    plt.show()
def syn():
    while True:
        # syns = wordnet.synsets(brown.words()[random.randint(1, len(brown.words()) - 1)].lower())
        syns = wordnet.synsets(brown.words()[random.randint(1, 1000000)].lower())
        syns2 = wordnet.synsets(brown.words()[random.randint(1, 1000000)].lower())
        try:
            word = syns[0].lemmas[0].name
            word2 = syns2[0].lemmas[0].name
            # print "word: ", word
            if not (word == word2) and not (word.find("_") > 0 or len(word) < 4) \
                    and not (word2.find("_") > 0 or len(word2) < 4):
                return (word, word2)
        except Exception:
            continue
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])
def load_corpus(range):
    # Use the Brown corpus from nltk for testing; `range` is a slice value
    # selecting the first and last file ids to load.
    m = re.match(r'(\d+):(\d+)$', range)  # regular expression for a range like 1:5
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words
                          if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBP_NNPS_NNS[:10]
    print
    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories=category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if (t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print
    print "Part 4"
    print "Ratio of Masculine to Feminine is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" % (male_pronouns, female_pronouns)
    print
def load_corpus(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
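A usage sketch, assuming the Brown corpus is downloaded and the range string follows the 'start:end' format the regex expects:

docs = load_corpus('0:5')  # word lists for the first five Brown files
print(len(docs), [len(d) for d in docs])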
def compare(self, file):
    word_list = []
    for a in brown.words(fileids=['cc17', 'ca16']):
        word_list.append(str(a))
    word_list = set(word_list)
    text = []
    with open(file, "r+b") as f:
        while 1:
            read_data = f.read(1)
            if not read_data:
                break
            text.append(read_data)
    text = "".join(text)
    w = set(text.split())
    occurencies = len(word_list & w)
    return occurencies
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None     # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None     # count from tagged data
    p_initial_tag = None        # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
def generateSentence():
    corpus = random.randint(0, 3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    elif corpus == 3:
        text = movie_reviews.words()
    tweetString = ''
    lengthOfTweet = random.randint(0, 20)
    firstRun = True
    blank = ' '
    startOfWord = ''
    startOfWordIndex = 0
    startingWord = random.randint(0, (len(text) - 40))
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*", "[", "]",
                   "‘", "“", "#"]
    # scan forward from the random start until a sentence boundary
    for x in xrange(startingWord, (startingWord + len(text))):
        startOfWord = text[x]
        if startOfWord == ".":
            startOfWordIndex = x
            break
    for x in xrange(startOfWordIndex + 1, startOfWordIndex + lengthOfTweet):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]
        elif text[x] not in punctuation:
            tweetString = tweetString + blank + text[x]
    return tweetString
def word_freq(word, genre):
    word_list = brown.words(categories=genre)
    num = 0
    for w in word_list:
        if w == word:
            num += 1
    return num
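The linear scan above costs one pass over the genre per lookup; a FreqDist built once answers the same query from a hash table (a sketch, assuming nltk is imported and the corpus downloaded):

import nltk
from nltk.corpus import brown

news_fd = nltk.FreqDist(brown.words(categories='news'))
assert news_fd['the'] == word_freq('the', 'news')  # same count, single pass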
def load_dict():
    category_noun_dictionary = {}
    br_cats = ['adventure', 'fiction', 'mystery', 'reviews', 'science_fiction']
    # Finding top 500 words from categories
    for category in br_cats:
        top_words_category = []
        words_of_category = brown.words(categories=category)
        category_word_freq = nltk.FreqDist(w.lower() for w in words_of_category)
        top_in_category = category_word_freq.most_common(500)
        for i in top_in_category:
            top_words_category.append(i[0])
        top_words_category = set(top_words_category)
        category_noun_dictionary[str(category)] = top_words_category

    # Get top 500 words from Programming Language
    reload(sys)  # Python 2-only default-encoding workaround
    sys.setdefaultencoding('Cp1252')
    plfile = open("PL_corpora.txt").read()
    pl_top = []
    plwords = word_tokenize(plfile)
    pl_freq = nltk.FreqDist(w.lower() for w in plwords)
    pl_topf = pl_freq.most_common(500)
    for i in pl_topf:
        pl_top.append(i[0])
    pl_top = set(pl_top)
    category_noun_dictionary['programming_language'] = pl_top
    return category_noun_dictionary
def partOfSpeechTagging():
    from nltk.corpus import brown

    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    nltk.classify.accuracy(classifier, test_set)
    classifier.classify(pos_features('cats'))
    print classifier.pseudocode(depth=4)
def load_corpus(fname):
    '''
    takes the filename in which space separated text is located and returns
    the continuous text in lowercase, with spaces replaced by underscores

    fname : file name as string
    '''
    global DELIM
    data = ''
    file_text = None
    # punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r\x0b\x0c'
    if fname == 'BROWN':
        words = brown.words()
        # wrap in a list so the loop below iterates lines, not characters
        file_text = [' '.join(words)]
    else:
        with open(fname, 'r') as f:
            file_text = f.readlines()
    for line in file_text:
        text = line.lower()
        text = text.strip()
        # for punc in punctuation:
        #     text = text.replace(punc, ' ')
        # collapse runs of spaces (the original replacement widths were lost
        # in formatting), then mark word boundaries with underscores
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        text = text.replace(' ', '_')
        data += text
    return data
def question2(category):
    # print
    # print "For Category: " + category
    # print "Words with the tag 'JJ':"
    # print
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    print
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words
                          if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBP_NNPS_NNS[:10]
    print
    print
    print "Ratio"
    print
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" % (male_pronouns, female_pronouns)
    print
    print
    sent = ""
    print "3 word prepositional phrases are:"
def getSimpleQuestionKeywords(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)
    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)
    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)
    filteredparts = []
    for pair in tagged:
        # note the comma after 'RBT': without it, the adjacent string
        # literals would silently fuse into 'RBTVB'
        if pair[1] in ['FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS',
                       'NP', 'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG',
                       'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD']:
            filteredparts.append(pair[0])
    filtereddist = {}
    for word in filteredparts:
        frequency = browndist[word] + reutersdist[word]
        if frequency < 600:
            filtereddist[word] = frequency
    sortedlist = sorted(filtereddist.items(), key=itemgetter(1))
    keywords = []
    for pair in sortedlist:
        keywords.append(pair[0])
    return keywords
def lookupTagger():
    # brown_tagged_sents is assumed to be defined at module level,
    # e.g. brown.tagged_sents(categories='news').
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]  # NLTK 2: keys() is frequency-sorted
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)

    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)

    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt,
                                             backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()
def get_brown_freqs(net):
    counts = collections.Counter(brown.words())
    freq_counts = counts.most_common()
    for i, node in enumerate(sorted(net.degree_iter(), key=itemgetter(1),
                                    reverse=True)):
        node_id, _ = node
        net.node[node_id]["word"] = freq_counts[i][0]
    return net
def create_engBoW_brown(self):
    # nltk brown corpus
    BoW = Counter()
    for word in brown.words():
        BoW[word] += 1.
    return BoW
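collections.Counter accepts an iterable directly, so the loop above collapses to one call; a sketch with integer counts (the original increments by a float):

from collections import Counter
from nltk.corpus import brown

bow = Counter(brown.words())  # same multiset, built in one pass
print(bow.most_common(5))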
tags = default_tagger.tag(tokens)
print(tags)

# Regular Expression Tagger
patterns = [(r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'),
            (r'.*\'s$', 'NN$'),
            (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*', 'NN')]
print('\n\nRegexp tagger\n')
regexp_tagger = nltk.RegexpTagger(patterns)
tags = regexp_tagger.tag(tokens)
print(tags)

# Lookup tagger
print('\n\nLookup tagger\n')
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, cant) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=default_tagger)
tags = baseline_tagger.tag(tokens)
print(tags)

# N-Gram tagger
print('\n\nN-Gram tagger\n')
brown_tagged_sents = brown.tagged_sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
tags = unigram_tagger.tag(tokens)
print(tags)
size = int(len(brown_tagged_sents) * 0.9)
## basic
import urllib2
from bs4 import BeautifulSoup

target_website = "http://www.realclearpolitics.com/2015/08/27/"
html = urllib2.urlopen(target_website).read()
soup = BeautifulSoup(html)  # parse the page we just fetched
soup.prettify()
soup.get_text()

## NLP for curated data sources:
import nltk
from nltk.corpus import brown
brown.words()

from nltk.collocations import *

## bigram measures
bigram_measures = nltk.collocations.BigramAssocMeasures()
dir(bigram_measures)

## trigram reps
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = BigramCollocationFinder.from_words(nltk.corpus.genesis.words('english-web.txt'))
finder.nbest(bigram_measures.pmi, 10)
nltk.corpus.genesis.words('english-web.txt')
len(nltk.corpus.genesis.words('english-web.txt'))
nltk.corpus.genesis.words('english-web.txt')[1:100]

## Moby Dick
def exercise11():
    print
    print "Exercise 11"
    cfd = nltk.ConditionalFreqDist((genre, word)
                                   for genre in brown.categories()
                                   for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modal_verb = ['shall', 'should', 'may', 'had', 'have']
    print "Tabulation data for closed class of words in english"
    print "For modal verbs:"
    cfd.tabulate(conditions=genres, samples=modal_verb)
    print
    print "For Prepositions:"
    prepositions = ['for', 'from', 'onto', 'to', 'with']
    cfd.tabulate(conditions=genres, samples=prepositions)
    print
    print "For Pronoun:"
    pronoun = ['me', 'she', 'her', 'I', 'we']
    cfd.tabulate(conditions=genres, samples=pronoun)
    print
from collections import Counter

from nltk.corpus import brown
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer

# Code to read the sentence from the file
file = open('test.txt', 'r')
sentence = file.read()
sentence_broke = sent_tokenize(sentence)

# The following code snippets work in the following fashion:
# Line 1 refers to an array that stores the count of the n-grams
# Line 2 refers to the brown corpus broken into n-grams
# Line 3 calculates the frequency of the ngrams in corpus

# Code snippet that works upon the unigrams list
unigrams = ngrams(brown.words(), 1)
unigrams_freq = Counter(unigrams)

# Code snippet that works upon the bigrams list
bigrams = ngrams(brown.words(), 2)
bigrams_freq = Counter(bigrams)

# Code snippet that works upon the trigrams list
trigrams = ngrams(brown.words(), 3)
trigrams_freq = Counter(trigrams)

len_corpus = len(brown.words())

for sentence in sentence_broke:
    tokened = RegexpTokenizer(r'\w+')
    tokened = tokened.tokenize(sentence)
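The snippet stops after tokenizing; a hedged continuation showing how the counters above could score a tokenized sentence under an unsmoothed maximum-likelihood bigram model (the formula and function name are assumptions, not from the original):

def bigram_sentence_prob(tokens):
    # Hedged sketch: product of count(w1, w2) / count(w1) over the sentence.
    # Counter returns 0 for unseen keys, so unseen histories yield 0.
    prob = 1.0
    for w1, w2 in zip(tokens, tokens[1:]):
        if unigrams_freq[(w1,)] == 0:
            return 0.0
        prob *= bigrams_freq[(w1, w2)] / unigrams_freq[(w1,)]
    return prob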
print('STOPWORD LANGUAGES:')
print(stopwords.fileids())
print('English stopwords', len(stopwords.words('english')))
print(stopwords.words('english')[:20])
print('Turkish stopwords', len(stopwords.words('turkish')))
print(stopwords.words('turkish')[:20])
print()

# TAGGED
# remember categories and fileids
# print brown.categories()
# print brown.words(categories='news')
# print brown.sents(categories=['news', 'editorial', 'reviews'])
print('POS TAGS:')
print('WORDS:')
print(brown.words()[:5])
print(brown.tagged_words()[:5])
print('SENTS:')
print([s[:5] for s in brown.sents()[:5]])
print([s[:5] for s in brown.tagged_sents()[:5]])
print()

# CHUNKED
# The CoNLL 2000 Corpus includes phrasal chunks
# The CoNLL 2002 Corpus includes named entity chunks
print('CHUNKING & NER:')
print(conll2000.fileids())
print(conll2000.sents()[0])
print(conll2000.chunked_sents()[0])
print(conll2002.sents()[0])
print(conll2002.chunked_sents()[0])
# wsj is assumed to be defined earlier as a simplified-tag word list,
# e.g. nltk.corpus.treebank.tagged_words(simplify_tags=True).
word_tag_fd = nltk.FreqDist(wsj)
idx1 = wsj.index(('kicked', 'VD'))

def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].keys()[:5]) for tag in cfd.conditions())

tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
    print tag, tagdict[tag]

brown_learned_text = brown.words(categories='learned')
sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == 'often'))

brown_lrnd_tagged = brown.tagged_words(categories='learned', simplify_tags=True)
tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print w1, w2, w3
import urllib.request
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.corpus import brown
import os

print(brown.words())
print(len(brown.words()))
if "NLTK_DATA" in os.environ:
    print(os.environ.get("NLTK_DATA"))

def get_data(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(strip=True)
    tokens = [t for t in text.split()]
    print(tokens)
    freq = nltk.FreqDist(tokens)
    freq.plot(20, cumulative=False, title='Word Count Token')
    stop_words = stopwords.words('english')
    clean_tokens = [t for t in tokens if t not in stop_words]
    freq = nltk.FreqDist(clean_tokens)
    freq.plot(20, cumulative=False, title='Word Count Clean Token')
    # Tail of a word-segmentation routine: best_match, cost and s are
    # defined in the enclosing function.
    cost.append(c)

    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    return " ".join(reversed(out))

link = set()
words = set(list(wd.words()) + list(brown.words()) + word_man +
            list(udhr.words()) + list(cess.words()))
some_variable = 0

def fcn(domain_data, pt, date):
    list_no = domain_data[1]
    forbids = [
        '[', '`', '\\', '-', '=', '~', '!', '@', '#', '$', '%', '^', '&', '*',
        '(', ')', '_', '+', '\\', '[', '\\', ']', '{', '}', ';', "'", '\\',
        ':', '"', '|', '<', ',', '.', '/', '<', '>', '?', ']'
    ]
    file = open('filtered_domains.txt', 'a')
    global words, link, some_variable, result_list, result_list_b, master_data
    domain = domain_data[0]
    inter = list(set(forbids) & set(domain.split(".")[0]))
- Frequency Lists
- Collocations
- Data Analysis with R
- Concordance Analysis (Patterns, Constructions?)
    - Patterns on sentence strings
    - Patterns on sentence word-tag strings

## Preparing Corpus Data

import nltk
from nltk.corpus import brown
from nltk.text import Text
import pandas as pd
import numpy as np

brown_text = Text(brown.words())

## Collocations

- Documentation [nltk.collocations](https://www.nltk.org/howto/collocations.html)
- `nltk.collocations`: Get the `BigramCollocationFinder` which we can use to find n-grams
- `nltk.metrics`: Get the `BigramAssocMeasures` to define collocations (It's also available in `nltk.collocations`)
- Use `finder.nbest()` methods to select/filter collocations

## Collocations based on Text

brown_text.collocation_list()[:10]
#brown_text.collocations()

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = nltk.collocations.BigramAssocMeasures()  # measures
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords
import random, pprint

# === Global Variables === #
# brown text corpora
text = brown.words()
stopwords_en = stopwords.words('english')
# materialize the bigrams: nltk.bigrams() returns a one-shot generator, which
# would otherwise be exhausted by the first ConditionalFreqDist below
bigrams = list(nltk.bigrams(text))

# nltk frequency distribution
freqDist = nltk.FreqDist(text)
conditionalFreqDist = nltk.ConditionalFreqDist(bigrams)
oppositeConditionalFreqDist = nltk.ConditionalFreqDist(
    (back, front) for front, back in bigrams)
# ======================== #

def get_adverbs():
    """
    Process intensity-modifying adverbs, from NLTK wordnet corpus.
    Returns list of adverbs.
    """
    # keywords from definition of 'highly' and 'very'
    keywords = [
        'extent', 'intensifier', 'intensity', 'quantifier', 'degree',
        'comparative'
""" # Import Dependencies --------------------------------------------------------- print("Importing dependencies...") import pandas as pd import json import gzip import numpy as np import nltk from nltk.corpus import brown from nltk.corpus import stopwords nltk.download('stopwords') nltk.download('brown') word_list = brown.words() word_set = set(word_list) from nltk.tokenize import RegexpTokenizer tokenizer = RegexpTokenizer(r'\w+') from sklearn.model_selection import train_test_split from sklearn import preprocessing from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline from sklearn.metrics import accuracy_score # Importing the data ---------------------------------------------------------- print("Importing data...") books = json.load(
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import brown

corpus = brown.words()
frequencia = nltk.FreqDist(corpus)
frequencia = list(frequencia.items())
frequencia = sorted(frequencia, key=lambda x: x[1], reverse=True)
mais_frequentes = frequencia[0:50]

unigramas = [i[0] for i in mais_frequentes]
valores = [i[1] for i in mais_frequentes]
tags = [i[1] for i in (nltk.pos_tag(unigramas))]
tags = nltk.FreqDist(tags)
print(tags.items())

plt.bar(unigramas, valores)
plt.show()
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import re
import string
import math
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from io import IOBase

#%%
# Read in raw data
raw_data = brown.words()

#%%
# Clean up data
data = [word for word in raw_data if not re.fullmatch('[' + string.punctuation + ']+', word)]
data = [word for word in data if not re.fullmatch('[' + string.digits + ']+', word)]
data = [word.lower() for word in data]
stop_words = set(stopwords.words('english'))
data = [word for word in data if not word in stop_words]

#%%
# Get the frequency distribution of all of the (cleaned up) words in corpus
fd = nltk.FreqDist(data)

#%%
# Get list of 1002 (Set C) and 5002 (Set V) most common words in corpus.
@license: Apache Licence
@contact: [email protected]
@site:
@software: PyCharm
@file: D8.py
@time: 2017/12/28 11:59 AM
"""
# Searching tokenized text
import nltk
from nltk.book import gutenberg, nps_chat

moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a><.*><man>')
chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*><.*><bro>')
chat.findall(r'<l.*>{3,}')

nltk.app.nemo()
nltk.re_show(r'\d', 'we are good friend, and you? how old are you? 25')
import re
re.search(r'\d', 'we are good friend, and you? how old are you? 25')

from nltk.corpus import brown
# the categories argument takes a flat list of names
hobbies_learend = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learend.findall(r'<\w*><and><other><\w*s>')

# as x as y
x_y = nltk.Text(brown.words(categories=['fiction', 'romance']))
x_y.findall(r'<as><\w*><as><\w*>')
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk

'''
◑ Write a program to find all words that occur at least three times
in the Brown Corpus.
'''
from nltk.corpus import brown
from nltk import FreqDist

brown_words = [word.lower() for word in brown.words()]
fd = FreqDist(brown_words)
wordlist = []
for word in brown_words:
    # 'at least three times' means a count of 3 or more
    if fd[word] >= 3 and word.islower():
        wordlist.append(word)
print sorted(set(wordlist))
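The loop above re-tests every token, so frequent words are appended many times before the set() call; a sketch that reads the distribution once instead (equivalent output under the same lowercasing):

frequent = sorted(w for w in fd if fd[w] >= 3 and w.islower())
print(frequent)  # same list, one pass over the distribution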
# %%
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth_sentences)

# %%
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

# %%
# Web and Chat Text
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[0]

# %%
# Brown Corpus
brown.categories()
news = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, ":", fdist[m])

# %%
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# %%
# Reuters Corpus
reuters.fileids()
reuters.categories()
stop = stopwords.words('english')
b = ['where', 'how', 'what', 'who', 'why']
stop = list(set(stop) - set(b))
stem = PorterStemmer()
nltk.download('wordnet')
lemma = WordNetLemmatizer()
data1 = data
#data1['data'] = data['data'].apply(func)
x = TfidfVectorizer(stop_words=stop)
y = x.fit_transform(data1['data'])
nltk.download('brown')
a = FreqDist(i.lower() for i in brown.words())
b = a.most_common()
words = []
for j in b:
    if j[0] not in stop and len(j[0]) > 4:
        words.append(j[0])

####################################################
model = load_model('./../models/new_model8.h5')

# Get stream from webcam and set parameters
vs = VideoStream().start()

# max number of hands we want to detect/track
num_hands_detect = 1
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 17:21:05 2020

@author: Jie.Hu
"""

''' Bag of Words '''

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
#from text_chunker import chunker  # chunker must be provided; a stand-in sketch follows below

# Read the data from the Brown corpus
input_data = ' '.join(brown.words()[:5400])

# Number of words in each chunk
chunk_size = 800
text_chunks = chunker(input_data, chunk_size)

# Convert to dict items
chunks = []
for count, chunk in enumerate(text_chunks):
    d = {'index': count, 'text': chunk}
    chunks.append(d)

# Extract the document term matrix
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform(
    [chunk['text'] for chunk in chunks])
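`chunker` comes from the commented-out `text_chunker` import and is never defined here; a minimal stand-in (the name and signature are inferred from the call site) that splits the text into chunks of roughly `chunk_size` words:

def chunker(input_data, chunk_size):
    # Assumed stand-in for text_chunker.chunker: yield successive chunks
    # of `chunk_size` space-separated words.
    words = input_data.split(' ')
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])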
#!/usr/bin/python
from nltk.corpus import brown
import random

corpus_length = len(brown.words())
hardcopy = brown.words()

def create_docs(number_of_words_per_doc=200, num_doc=10, startnr=0):
    # control number of words per doc and number of documents;
    # we fix the line length at 20
    line_length = 20
    number_of_lines = int(number_of_words_per_doc / line_length)
    for i in range(0, num_doc):
        # create new file with writing + permission
        new_file = open(
            "textdoc" + str(number_of_words_per_doc) + "words" +
            str(i + startnr) + ".txt", "w+")
        for line in range(0, number_of_lines):
            words = list(
                map(lambda x: hardcopy[x:x + line_length],
                    random.sample(range(corpus_length), line_length)))
            sentences = list(map(lambda x: ' '.join(word for word in x), words))
            text = ''.join(map(str, sentences))
            new_file.write(text + "\n")
        new_file.close()
        if (i % 10 == 0):
            print("You created " + str(i) + " files! " + str(num_doc - i) +
from nltk.corpus import brown
import nltk

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

top_items = sorted(suffix_fdist.viewitems(), key=lambda t: t[1] * -1)[:20]
common_suffixes = [suffix for (suffix, count) in top_items]

def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
print classifier.classify(pos_features('cats'))
print('Length of synsets', len(nouns))
yes = []
for w in nouns:
    if len(w.hyponyms()) != 0:
        yes.append(w)
print('Length of synsets with hyponyms', len(yes))
print('Percentage of noun synsets with no hyponyms',
      (100 - (len(yes) / len(nouns)) * 100))

# Number 3 (2.19) in HW3
print('################ Number 3 ################')
# Creating a Conditional Frequency Distribution to tabulate all of the words
# in each genre included in the corpus
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = brown.categories()
modals = [
    'mysterious', 'sad', 'kiss', 'death', 'congress', 'beautiful', 'cry',
    'leave', 'love', 'pope', 'horrible'
]
cfd.tabulate(conditions=genres, samples=modals)

# Number 4 in HW3
print('################ Number 4 ################')

def censor(bad, good, text):
    # Split the text up into a list
    text_list = text.split(" ")
# Nouns are most commonly found after AT(determiner), JJ(adjective),
# IN(preposition), NN(noun), PP$(pronoun), DT(determiner), CC(conjunction),
# VBG(gerunds), AP(adjective), and ,(comma)

# 18 Generate some statistics for tagged data to answer the following questions:
wordfreq = nltk.ConditionalFreqDist(tag_words)
cond = wordfreq.conditions()

# 18a What proportion of word types are always assigned the same part-of-speech tag?
solo = [a for a in cond if len(wordfreq[a]) == 1]
pct_solo = len(solo) / len(cond)

# 18b How many words are ambiguous, in the sense that they appear with at least two tags?
non_solo = len(cond) - len(solo)

# 18c What percentage of word tokens in the Brown Corpus involve these ambiguous words?
brown = set(brown.words())  # note: this rebinding shadows the brown corpus module
brown_solo = [a for a in solo if a in brown]
pct_brown_non_solo = 1 - (len(brown_solo) / len(brown))

# 21 In 3.1 we saw a table involving frequency counts for the verbs adore, love,
# like, prefer and preceding qualifiers absolutely and definitely. Investigate
# the full range of adverbs that appear before these four verbs.
tag_bigs = nltk.bigrams(tag_words)
verb_preceders = [a[0] for (a, b) in tag_bigs
                  if b[0] in ('adore', 'love', 'like', 'prefer') and a[1] == ('RB')]
set(verb_preceders)

# 22 We defined the regexp_tagger that can be used as a fall-back tagger for
# unknown words. This tagger only checks for cardinal numbers. By testing for
# particular prefix or suffix strings, it should be possible to guess other tags.
# For example, we could tag any word that ends with -s as a plural noun. Define
# a regular expression tagger (using RegexpTagger()) that tests for at least
# five other patterns in the spelling of words. (Use inline documentation to
# explain the rules.)
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
def nltk_similar(self, word):
    text = Text(w.lower() for w in brown.words())
    # Text.similar() prints its results and returns None,
    # so call it directly rather than printing the return value.
    text.similar(word)
]
print(nltk.FreqDist(wordlist_suffixes).most_common(20))

raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
raw_tokens = word_tokenize(raw)
raw_stems = [stem(t) for t in raw_tokens]
print(raw_stems)

# searching tokenized text
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
print(moby.findall(r'<a><man>'))        # print only 'a man'
print(moby.findall(r'<a>(<.*>)<man>'))  # prints words between a and man
chat_words = nltk.Text(nps_chat.words())
print(chat_words.findall(r'<.*><.*><bro>'))
print(chat_words.findall(r'<l.*>{3,}'))  # letter 'l', not the digit 1

# discover hypernyms in text, i.e. x and other ys
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
print(hobbies_learned.findall(r'<\w*><and><other><\w*s>'))
print(hobbies_learned.findall(r'<\w*><as><\w*>'))

# text normalization
# stemmers - to remove affixes from words, 2 off-the-shelf in nltk:
# 1. PorterStemmer  2. LancasterStemmer
print(raw_tokens)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
print([porter.stem(w) for w in raw_tokens])
print([lancaster.stem(w) for w in raw_tokens])

# Indexing a Text Using a Stemmer, support search for alternative forms of words
# revise later
class IndexedText(object):
# test223a.py - confirmation of Zipf's Law, using brown corpus in nltk
import nltk
import math
import matplotlib.pyplot as plt
from nltk.corpus import brown

def zipf(text):
    '''text - a list of words '''
    tokens = [w.lower() for w in text if w.isalpha()]
    fdist = nltk.FreqDist(tokens)
    f = sorted([math.log(v) for v in fdist.values()], reverse=True)
    r = list(range(1, len(f) + 1))
    if len(f) == len(r):
        plt.plot(r, f)
        plt.xlabel('Rank')
        plt.ylabel('Frequency')
        plt.title(r'''Zipf's Law''')
        plt.show()
    else:
        print("The rank does not match the frequency.")

if __name__ == '__main__':
    zipf(brown.words(categories='news'))
from HMM import HiddenMarkovModel
from nltk.corpus import brown
import numpy as np

TopElem = 100
Folds = 10
Tags = 10

Obs = list(set(brown.words()))
Sentences = brown.sents()
alphabetReverseMap = {Obs[t]: t for t in xrange(len(Obs))}
number_obs = len(Obs)
number_sents = len(Sentences)
foldsize = number_sents / Folds

for fold in xrange(Folds):
    train = Sentences[0:foldsize * fold] + Sentences[foldsize * (fold + 1):-1]
    train = [alphabetReverseMap[word] for sentence in train for word in sentence]
    #print train[:20]
    hmm = HiddenMarkovModel(Tags, len(train), number_obs)
    I = np.random.rand(Tags)
    A = np.random.rand(Tags, Tags)
    B = np.random.rand(Tags, number_obs)
    I /= I.sum()
    for i in xrange(Tags):
        A[i][:] /= A[i][:].sum()
        B[i][:] /= B[i][:].sum()
    #print A[1:3]
    #print B[1:3]
    #print I
    I, A, B = hmm.forward_backward(I, A, B, train)
    print "#Fold: ", fold
from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank,
                         alpino, indian, floresta, mac_morpho, machado, cess_esp)
from nltk.util import in_idle
from nltk.probability import FreqDist

CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
POLL_INTERVAL = 100

_DEFAULT = 'English: Brown Corpus (Humor)'
_CORPORA = {
    'Catalan: CESS-CAT Corpus':
        lambda: cess_cat.words(),
    'English: Brown Corpus':
        lambda: brown.words(),
    'English: Brown Corpus (Press)':
        lambda: brown.words(categories=['news', 'editorial', 'reviews']),
    'English: Brown Corpus (Religion)':
        lambda: brown.words(categories='religion'),
    'English: Brown Corpus (Learned)':
        lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)':
        lambda: brown.words(categories='science_fiction'),
    'English: Brown Corpus (Romance)':
        lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)':
        lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus':
        lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
# The Brown Corpus was the first million-word electronic corpus in English,
# created in 1961 at Brown University. This corpus contains text from 500
# sources, and the sources have been categorized by genre, such as news,
# editorial, and so on (for a complete genre list, see
# http://icame.uib.no/brown/bcm-los.html).
import nltk
from nltk.corpus import brown

print(brown.categories())
print()
print(brown.words(categories="humor"))
print()
print(brown.words(fileids=["ch15"]))
print()
print(brown.sents(categories=["mystery", "science_fiction", "adventure"]))
from nltk.corpus import brown

def splitter(data, num_words):
    # signature reconstructed from the call in __main__ below
    words = data.split(' ')
    output = []
    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0
    output.append(' '.join(cur_words))
    return output

if __name__ == '__main__':
    # Read the data from the Brown corpus
    data = ' '.join(brown.words()[:10000])

    # Number of words in each chunk
    num_words = 1700

    chunks = []
    counter = 0

    text_chunks = splitter(data, num_words)
    print("Number of text chunks =", len(text_chunks))
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)