def load_sentences_brown(nb_sentences=None): """ :param nb_sentences: Use if all brown sentences are too many :return: index2word (list of string) """ from nltk.corpus import brown import gensim print 'building vocab ...' if nb_sentences is None: sents = brown.sents() else: sents = brown.sents()[:nb_sentences] # I use gensim model only for building vocab model = gensim.models.Word2Vec() model.build_vocab(sents) vocab = model.vocab # ids: list of (list of word-id) ids = [[vocab[w].index for w in sent if w in vocab and vocab[w].sample_int > model.random.rand() * 2**32] for sent in sents] return ids, model.index2word
def clean(): ''' 1. Removes any individual special character. 2. Lowers all the words. :return: list of clean sentences ''' sents = list(brown.sents()) sents_copy = list(brown.sents()) n = len(sents) print 'Removing special chars...' for i in range(0, n): for word in sents[i]: if not bool(re.search('[A-Za-z0-9]', word)): sents_copy[i].remove(word) print 'Removed special chars.' sents = None print 'Lowercasing all the words...' for i in range(0, n): m = len(sents_copy[i]) for j in range(0, m): sents_copy[i][j] = sents_copy[i][j].lower() print 'Lowered all the words.' return sents_copy
def print_brown(): from nltk.corpus import brown print brown.categories() print brown.words(categories='news') print brown.words(fileids=['cg22']) print brown.sents(categories=['news','reviews']) news_text=brown.words(categories='news') fdist=nltk.FreqDist([w.lower() for w in news_text]) modals=['can','could','may','might','must','will'] for m in modals: print m+':',fdist[m]
def read_datas(self):
    """Split the tagged Brown sentences into a train and a test part.

    The first 90% of ``brown.tagged_sents()`` form the training set and
    the remainder the test set.

    :return: (train_set, test_set)
    """
    tagged = brown.tagged_sents()
    size = int(len(tagged) * 0.9)
    return (tagged[:size], tagged[size:])
def build_index(out_filename, in_filename = None):
    '''Builds data files for word lookup.

    Can take an optional input file to add to the data pool which is
    processed (not working). Data is then dumped to a pickle file.

    out_filename -- path of the pickle file to write (protocol 2)
    in_filename  -- optional path of an extra text file to sentence-tokenize
    '''
    sents_data = []
    if in_filename is not None:
        try:
            # The original called .close() on the string returned by read(),
            # which always raised and triggered the warning below.
            with open(in_filename) as in_file:
                sents_data += sent_tokenize(in_file.read())
        except Exception:
            # Best effort: the external file is optional.
            print("Warning: Failed to load external file for building.")
    sents_data += brown.sents() + treebank.sents()

    # NOTE(review): sent_tokenize yields strings while the corpus readers
    # yield token lists, so external sentences are sliced and iterated per
    # character below -- presumably why the docstring says "not working".

    # get sentences, chop off their ambiguous heads, and look at their words!
    mysents = [sent[1:] for sent in sents_data]
    # flatten sublists of words to list of words
    mywords = [word for sent in mysents for word in sent]
    # look up most frequent form of a lowercase word with cfd['word'].max(),
    # but check for existence of the word in cfd first
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)

    with open(out_filename, 'wb') as out_file:
        pickle.dump(cfd, out_file, 2)
def update_category_by_pos(): from nltk.corpus import brown from nltk import NaiveBayesClassifier from nltk import classify from nltk.tag import untag from nltk import DecisionTreeClassifier def pos_features(sentence, i): features = {'suffix(1)':sentence[i][-1:], 'suffix(2)':sentence[i][-2:], 'suffix(3)':sentence[i][-3:] } features['prev-word'] = '<start>' if i==0 else sentence[i-1] return features print pos_features(brown.sents()[0], 8) tagged_sents = brown.tagged_sents(categories='news') featuresets = [] for tagged_sent in tagged_sents: untagged_sent = untag(tagged_sent) for i, (word, tag) in enumerate(tagged_sent): featuresets.append((pos_features(untagged_sent, i), tag)) size = int(len(featuresets) * 0.1) train_set, test_set = featuresets[size:], featuresets[:size] # classifier = NaiveBayesClassifier.train(train_set) classifier = DecisionTreeClassifier.train(train_set) print 'NaiveBay %f' % classify.accuracy(classifier, test_set)
def find_ngrams(self, n):
    """
    Count all n-grams of the Brown corpus.

    Input: the 'n' of 'n-grams'.
    Each sentence is read left to right and prefixed with n-1 '-START-'
    tokens before its n-grams are extracted.

    Returns (frequency dict keyed by n-gram, total number of n-grams seen).
    """
    counts = {}
    seen = 0
    padding = ['-START-'] * (n - 1)
    for sentence in brown.sents():
        for gram in ngrams(padding + sentence, n):
            seen += 1
            counts[gram] = counts.get(gram, 0) + 1
    return counts, seen
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the tagged brown corpus into `ds`.

    Every category is inserted as "#<category>", every tagged sentence is
    inserted as a list of ``nltk.tuple2str`` strings and linked to its
    category.

    :param ds: object providing ``insert`` and ``link``
    :param simplify_tags: pass every tagged token through ``simplify_tag``
    :param silent: suppress the progress line written every 100 sentences
    :param log: file-like object receiving progress output

    E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown_pos(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            # Fall back to the raw tagged sentence; previously `norm` was
            # only bound when simplify_tags was set.
            norm = (simplify_tag(t) for t in sent) if simplify_tags else sent
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total), file=log)
def load_movie_corpus_each_sentence(range):
    """Return the sentences of a slice of the movie_reviews files.

    :param range: string of the form 'start:end' (non-negative integers)
        selecting ``corpus.fileids()[start:end]``
    :return: list with one ``corpus.sents(fileid)`` entry per selected file
    :raises ValueError: if ``range`` is not of the form 'start:end'
    """
    m = re.match(r'(\d+):(\d+)$', range)
    if m is None:
        # Previously this fell through and failed on unbound start/end.
        raise ValueError("range must look like 'start:end', got %r" % (range,))
    start = int(m.group(1))
    end = int(m.group(2))
    from nltk.corpus import movie_reviews as corpus
    return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
def lookupTagger():
    """Build lookup taggers from the first 100 keys of the 'news' FreqDist.

    NOTE(review): nothing is returned or printed, and taking the *most
    frequent* words via fd.keys()[:100] presumably relies on an NLTK version
    whose FreqDist keys are frequency-sorted -- confirm.
    """
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    likely_tags = {}
    for word in fd.keys()[:100]:
        likely_tags[word] = cfd[word].max()
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)
    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))


def performance(cfd, wordlist):
    """Evaluate a lookup tagger built from wordlist on Brown 'news'."""
    lookup = {}
    for word in wordlist:
        lookup[word] = cfd[word].max()
    tagger = nltk.UnigramTagger(model=lookup, backoff=nltk.DefaultTagger('NN'))
    return tagger.evaluate(brown.tagged_sents(categories='news'))


def display():
    """Plot lookup-tagger performance for model sizes 2**0 .. 2**14."""
    import pylab
    words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = []
    for size in sizes:
        perfs.append(performance(cfd, words_by_freq[:size]))
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
def data_api(spilt_rate): raw_sent = brown.sents() partial_data = raw_sent[:int(0.1*len(raw_sent))] data_x, data_y = prepare_0(partial_data, word2intdict) print 'len data_x', len(data_x), len(data_y) train_inds = npr.choice(range(len(data_x)), size = int((1 - spilt_rate) * len(data_x)), replace = False) X_train = [] Y_train = [] X_test = [] Y_test = [] print 'len train_inds', len(train_inds), len(data_x) for i in range(len(data_x)): if i in train_inds: #print 'trn', i X_train.append(data_x[i]) Y_train.append(data_y[i]) else : #print 'tst', i X_test.append(data_x[i]) Y_test.append(data_y[i]) print 'len X_train', len(X_train), len(X_test) return (X_train, Y_train), (X_test, Y_test)
def cal_idf(): # brown.sents() total_wordlists = [] doc_sents = [] for f in brown.fileids(): print f doc_wordlist = [] doc_sentlist = brown.sents(fileids=[f]) d_sents = '' for sent in doc_sentlist: s = '' # sent = stem_tokens(sent) for w in sent: w = w.lower() s += w + ' ' d_sents += s + '\n' doc_wordlist.extend(sent) total_wordlists.append(doc_wordlist) doc_sents.append(d_sents) print 'start caling tfidf' from sklearn.feature_extraction.text import TfidfVectorizer corpus = doc_sents vectorizer = TfidfVectorizer(min_df=1) X = vectorizer.fit_transform(corpus) idf = vectorizer.idf_ # print dict(zip(vectorizer.get_feature_names(), idf)) pickle.dump(vectorizer, open('idf_vectorizer', 'w')) dictionary = corpora.Dictionary(total_wordlists) dic, corps = get_corpus_by_lists(total_wordlists) tfidf = models.TfidfModel(corps, id2word=dic) pickle.dump(tfidf, open('brown_tfidf', 'w'))
def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger

    :param company: company whose reviews are tagged
    :return: a list of tagged words; an empty list when no review file
        exists for the company
    """
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'

    # NOTE(review): capitalize() turns 'GM' into 'Gm'; this only resolves on
    # a case-insensitive file system -- confirm the intended file name.
    review_path = ('/Users/vickyzhang/Documents/Python/chart/comp/review/'
                   + company.capitalize() + '_review.txt')

    # open the review of a company, and print error message if it is missing
    try:
        with open(review_path) as review_file:
            text = review_file.read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')
        # Previously execution continued and failed on the unbound `text`.
        return []

    # normalize (tokenize and lowercase-ize) each word in the string
    text_normal = [w.lower() for w in nltk.word_tokenize(text)]

    # build unigram tagger based on brown corpus, and use it to tag the text
    brown_tagged_sents = brown.tagged_sents(categories = 'news', tagset='universal')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    return unigram_tagger.tag(text_normal)
def createModel(): global classifierit global classifierloose global classifieryou global classifierto global classifiertheir trainingitSet = [] traininglooseSet = [] trainingyouSet = [] trainingtoSet = [] trainingtheirSet= [] st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar') for line in brown.sents(): print line tagSent = st.tag(line) print tagSent arrayOfitFeature = pos_itfeatures(tagSent) arrayOfyouFeature = pos_youfeatures(tagSent) arrayOftheirFeature = pos_theirfeatures(tagSent) arrayOflooseFeature = pos_loosefeatures(tagSent) arrayOftoFeature = pos_tofeatures(tagSent) if arrayOfitFeature: trainingitSet.extend(arrayOfitFeature) if arrayOftheirFeature: trainingtheirSet.extend(arrayOftheirFeature) if arrayOflooseFeature: traininglooseSet.extend(arrayOflooseFeature) if arrayOftoFeature: trainingtoSet.extend(arrayOftoFeature) if arrayOfyouFeature: trainingyouSet.extend(arrayOfyouFeature) algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1] #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True) classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm) f = open('classifierit.pickle', 'wb') pickle.dump(classifierit, f) f.close() #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True) classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm) f = open('classifierloose.pickle', 'wb') pickle.dump(classifierloose, f) f.close() #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True) classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm) f = open('classifieryou.pickle', 'wb') pickle.dump(classifieryou, f) f.close() #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, 
count_cutoff=3, alwayson_features=True) classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm) f = open('classifierto.pickle', 'wb') pickle.dump(classifierto, f) f.close() #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True) classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm) f = open('classifiertheir.pickle', 'wb') pickle.dump(classifiertheir, f) f.close()
def brown_tagged_sents():
    """Return a (train, test) split of the tagged Brown 'news' sentences.

    The first 90% of the sentences form the training part, the rest the
    test part.
    """
    from nltk.corpus import brown

    # The original also trained a UnigramTagger on the full set and loaded
    # the untagged sentences; neither result was used.
    tagged = brown.tagged_sents(categories='news')
    size = int(len(tagged) * 0.9)
    return (tagged[:size], tagged[size:])
def get_valid_brown_corpus():
    """Return lowercased Brown sentences that pass remove_bad_sents().

    Also sets the module global DIR to BROWN_DIR.
    """
    global DIR
    DIR = BROWN_DIR

    genre = [
        'adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
        'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion',
        'reviews', 'romance', 'science_fiction',
    ]
    kept = remove_bad_sents(brown.sents(categories=genre))

    lowered = []
    for sentence in kept:
        lowered.append([token.lower() for token in sentence])
    return lowered
def ic(w) : total = 0 for sentence in b.sents(): for word in sentence: total = total + 1 brown_freqs[word.lower()] +=1 print w.lower() ,":",brown_freqs[w.lower()], 1.0 - (math.log(brown_freqs[w.lower()]) / math.log(total+1))
def uG(): global uniCounter #counts repeats of uniGrams global uniGram #dictionary of biGrams global uniGrams #counts biGrams uniCounter = {} uniGram = [] uniGrams = 0 news = brown.sents(categories='editorial') for x in range (1, MAX, 1): sent = news[x] sent.append('</s>') #ending sentences with '</s>' sent.insert(0, '<s>') #beginning sentences with '<s>' for x in range (0,sent.count('.')+1,1): try: sent.remove('.') #removing .'s except: pass for x in range (0,sent.count(',')+1,1): try: sent.remove(',') #removing ,'s except: pass for x in range (0,sent.count("'")+1,1): try: sent.remove("'") #removing ''s except: pass for x in range (0,sent.count('"')+1,1): try: sent.remove('"') #removing ''s except: pass x = 0 for word in sent: word = word.lower() #making all letters lowercase sent[x] = word #so differences dont occur when x = x+1 #they shouldn't value = '1' for x in range (0,len(sent),1): try: word = sent[x] if(word not in uniGram): uniGram.append(word) uniGrams = uniGrams + 1 if (word in uniCounter): value = uniCounter[word] value = value + 1 uniCounter[word] = value else: uniCounter[word] = 1 except: pass
def Automated_Readability_Index(section):
    """Return the Automated Readability Index of a Brown category.

    ARI = 4.71 * (letters per word) + 0.5 * (words per sentence) - 21.43

    :param section: Brown category name (or list of names)
    """
    words = brown.words(categories = section)
    n_words = len(words)
    n_sents = len(brown.sents(categories = section))
    # Count characters of the tokens only; joining the words with spaces
    # (as before) added one extra "letter" per word boundary.
    letters = sum(len(w) for w in words)
    uw = letters / float(n_words)
    us = n_words / float(n_sents)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
def learn(self, listofsentences=[], n=2000):
    """Count lowercased word forms per specialhash() bucket.

    :param listofsentences: sentences (lists of words); the Brown corpus is
        used when this is an empty list
    :param n: only the first n sentences are processed
    """
    self.learned = defaultdict(mydict)
    if listofsentences == []:
        listofsentences = brown.sents()
    for position, sentence in enumerate(listofsentences):
        # Limit to the first n sentences of the corpus
        if position >= n:
            break
        for token in sentence:
            bucket = self.learned[self.specialhash(token)]
            bucket[token.lower()] += 1
def collect_data_from_ptb_brow_duc2004(): start_collect = time.time() samples = [] # Penn Tree Bank treebank_sents = treebank.sents() for i in range(len(treebank_sents)): senttmp = " ".join(treebank_sents[i]) words = nltk.word_tokenize(senttmp) samples.append(words) sys.stdout.write("Finish collecting training data from Penn Tree Bank") sys.stdout.flush() # Brown brown_sents = brown.sents() for i in range(len(brown_sents)): senttmp = " ".join(brown_sents[i]) words = nltk.word_tokenize(senttmp) samples.append(words) sys.stdout.write("Finish collecting training data from Brown") sys.stdout.flush() # DUC data folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs" clusters_name = os.listdir(folder_path) for cluster_name in clusters_name: if cluster_name[0] == ".": # except file .DStore in my macbook continue files_name = os.listdir(folder_path + "/" + cluster_name) for file_name in files_name: if file_name[0] == ".": # except file .DStore in my macbook continue file_path = folder_path + "/" + cluster_name +"/"+ file_name try: tree = ET.parse(file_path) root = tree.getroot() text_tag = root._children[3] if text_tag.tag == "TEXT": text = text_tag.text.replace("\n", "") sentences = nltk.tokenize.sent_tokenize(text) for sentence in sentences: words = nltk.word_tokenize(sentence) samples.append(words) except: print "exception parse XML: ", file_name continue sys.stdout.write("Finish collecting training data from DUC2004") sys.stdout.flush() sys.stdout.write("length of samples" + str(len(samples))) sys.stdout.flush() end_collect = time.time() sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect)) sys.stdout.flush() return samples
def brown_corpus_word_frequency(targetWord): words = FreqDist() for sentence in brown.sents(): for word in sentence: words.inc(word.lower()) print words[targetWord] print words.freq(targetWord)
def __init__(self, file_name):
    """Load the word2vec model stored at file_name, or train and save one.

    The embedding size is parsed from the text after the last underscore
    in file_name. When the file does not exist, a model is trained on the
    Brown corpus and saved to file_name.
    """
    self.embsize = int(file_name.split('_')[-1])
    self.model = None
    if os.path.isfile(file_name):
        self.model = word2vec.Word2Vec.load(file_name)
    if self.model is None:
        # Keep the trained model on the instance; it used to be saved to
        # disk only, leaving self.model as None.
        self.model = word2vec.Word2Vec(brown.sents(), size=self.embsize, window=5, min_count=5, workers=4)
        self.model.save(file_name)
def main():
    """Evaluate HMM-generated text against the first 3000 Brown sentences."""
    corpussub = brown.sents()[:3000]
    #runCalc(corpussub)
    evalf = Evaluate()
    # A real list, so it can be consumed twice regardless of whether map()
    # would return a list or an iterator.
    sents = [' '.join(s) for s in corpussub]
    # str.join instead of pairwise reduce-concatenation (quadratic copying).
    finsents = ' '.join(sents)
    evalf.initBleu(sents)
    #ngramEval(finsents,evalf,10)
    #humanEval(finsents, evalf, "rawcorpus/humansentence.txt")
    evalAllHmm(finsents, evalf,"rawcorpus/andersen.txt",5)
def exercise_brown(): # 打印布朗语料库中的分类 print brown.categories() # 打印分类为新闻的文本词汇 print brown.words(categories="news") # 打印文本'cg22' print brown.words(fileids=["cg22"]) # 打印句子 print brown.sents(categories=["news", "reviews"]) """比较不同文体中的情态动词的用法""" # 获取文本 news_text = brown.words(categories="news") # 单词定义频率 fdist = nltk.FreqDist([w.lower() for w in news_text]) # 定义情态动词表 modals = ["can", "could", "may", "might", "must", "will"] for m in modals: print m + ":", fdist[m]
def calc_readability(corpus):
    """Return the 'read' statistic for every file of corpus.

    :param corpus: corpus reader providing ``fileids()`` and
        ``sents(fileids=...)``
    :return: list with ``simple.get_text_stats(text)['read']`` per file, in
        ``corpus.fileids()`` order
    """
    results = []
    for fileid in corpus.fileids():
        # Read sentences from the corpus that was passed in; the original
        # always read from the global `brown` using corpus' file ids.
        sentlist = corpus.sents(fileids=[fileid])
        text = ' '.join(' '.join(ss) for ss in sentlist)
        results.append(simple.get_text_stats(text)['read'])
    return results
def lookupTagger(i): brown_tagged_sents = bn.tagged_sents(categories='news') brown_sents = bn.sents(categories='news') fd = nltk.FreqDist(bn.words(categories = 'news')) cfd = nltk.ConditionalFreqDist(bn.tagged_words(categories = 'news')) most_freq_words = fd.keys()[:i] likely_tags = dict((word, cfd[word].max()) for word in most_freq_words) baseline_tagger = nltk.UnigramTagger(model=likely_tags) evalResult = baseline_tagger.evaluate(brown_tagged_sents) print "Evaluation of lookupTagger for the size %d is: %f" %(i, evalResult)
def preprocess(wikipedia_text):
    """POS-tag wikipedia_text with a bigram tagger trained on Brown.

    The text is split on newlines, '.' characters are removed from each
    paragraph, the paragraph is tokenized and tagged. A unigram tagger
    serves as backoff for the bigram tagger.

    :return: flat list of tagged tokens over all paragraphs
    """
    brown_tagged = brown.tagged_sents()
    unigram_tagger = nltk.UnigramTagger(brown_tagged)
    bigram_tagger = nltk.BigramTagger(brown_tagged,backoff=unigram_tagger)
    paragraph_tagged = []
    for p in wikipedia_text.split('\n'):
        tokens = nltk.word_tokenize(p.translate(None,'.'))
        paragraph_tagged.extend(bigram_tagger.tag(tokens))
    return paragraph_tagged
def __init__(self): """Initialize your data structures in the constructor.""" self.bigramCounts = collections.defaultdict(lambda : 0) self.unigramCounts = collections.defaultdict(lambda : 1) self.continuationCounts = collections.defaultdict(lambda: 0) self.followingCounts = collections.defaultdict(lambda: 0) self.total = 1 print "Training Language Model..." self.train(brown.sents()) print "--Training Complete--"
def nGramTagging(): print "=============== Unigram Tagging ===============" from nltk.corpus import brown brown_tagged_sents = brown.tagged_sents(categories='news') brown_sents = brown.sents(categories='news') unigram_tagger = nltk.UnigramTagger(brown_tagged_sents) print unigram_tagger.tag(brown_sents[2007]) print unigram_tagger.evaluate(brown_tagged_sents) print "=============== Separating the Training and Testing Data ===============" size = int(len(brown_tagged_sents) * 0.9) print size train_sents = brown_tagged_sents[:size] test_sents = brown_tagged_sents[size:] unigram_tagger = nltk.UnigramTagger(train_sents) print unigram_tagger.evaluate(test_sents) print "=============== General N-Gram Tagging ===============" bigram_tagger = nltk.BigramTagger(train_sents) print bigram_tagger.tag(brown_sents[2007]) unseen_sent = brown_sents[4203] print bigram_tagger.tag(unseen_sent) print bigram_tagger.evaluate(test_sents) print "=============== Combining Taggers ===============" t0 = nltk.DefaultTagger('NN') t1 = nltk.UnigramTagger(train_sents, backoff=t0) t2 = nltk.BigramTagger(train_sents, backoff=t1) print t2.evaluate(test_sents) print "=============== Tagging Across Sentence Boundaries ===============" brown_tagged_sents = brown.tagged_sents(categories='news') brown_sents = brown.sents(categories='news') size = int(len(brown_tagged_sents) * 0.9) train_sents = brown_tagged_sents[:size] test_sents = brown_tagged_sents[size:] t0 = nltk.DefaultTagger('NN') t1 = nltk.UnigramTagger(train_sents, backoff=t0) t2 = nltk.BigramTagger(train_sents, backoff=t1) print t2.evaluate(test_sents)
entropy = -1 * mean perplexity = pow(2.0, entropy) return perplexity def avg_sent_perplexity(corpus, lm): perplexities = [] for sent in corpus: ngrams = [ngram for ngram in sent] perplexities.append(lm.perplexity(ngrams)) return sum(perplexities) / len(perplexities) if __name__ == '__main__': args = parse_args() lm = Laplace(args.n) # smoothing if args.train is not None: train_corpus = load_corpus(args.train) else: train_corpus = brown.sents() train, vocab = padded_everygram_pipeline(args.n, train_corpus) lm.fit(train, vocab) for test_file in args.corpora: test_corpus = load_corpus(test_file) test, vocab = padded_everygram_pipeline(args.n, test_corpus) perplexity = avg_sent_perplexity(test, lm) print('{}: {}'.format(test_file, perplexity))
from nltk.corpus import brown
import nltk

# Sample views of the Brown corpus: categories, words of one category,
# words of one file, and sentences of several categories.
print(brown.categories())
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))

from nltk.corpus import brown

# Case-insensitive counts of modal verbs in the 'news' category, printed
# on one line.
news_text = brown.words(categories='news')
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=' ')

# Word counts conditioned on genre, over all Brown categories.
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
print()  # end the modal line above
# The result of tabulate() is passed to print() as well.
print(cfd.tabulate(conditions=genres, samples=modals))
from nltk.corpus import brown import statistics, viterbi_algo corpus_tagged_sentences = brown.tagged_sents(categories='news') corpus_sentences = brown.sents(categories='news') training_size = round(len(corpus_sentences) * 0.9) training_set = corpus_tagged_sentences[:training_size] test_set = corpus_tagged_sentences[training_size:][:100] untagged_test_set = corpus_sentences[training_size:][:100] corpus_size = len(brown.words(categories='news')) # Constants START_1 = "START_1" START_2 = "START_2" STOP = "STOP" TIMES = "times" TAG_TO_UNKNOWN_WORD = "NN" COMMON_TAGS = 20 # --------------------- Helper Function --------------------- def get_common_tags(corpus): """ :param corpus: a corpus :return: set with the common tags of the corpus """ # Counts number of occurrences of each tag tags = {} for sen in corpus:
print(classifier.pseudocode(depth=4)) ## Exploiting Context from nltk.corpus import brown def pos_features(sentence,i): features={'suffix(1)':sentence[i][-1:], 'suffix(2)':sentence[i][-2:], 'suffix(3)':sentence[i][-3:]} if i ==0: features['prev-word']='<START>' else: features['prev-word']=sentence[i-1] return features brown.sents()[0] pos_features(brown.sents()[0],8) tagged_sents=brown.tagged_sents(categories='news') featuresets=[] for tagged_sent in tagged_sents: untagged_sent=nltk.tag.untag(tagged_sent) for i, (word, tag) in enumerate(tagged_sent): featuresets.append((pos_features(untagged_sent,i),tag)) size=int(len(featuresets)*0.1) train_set,test_set=featuresets[size:],featuresets[:size] classifier=nltk.NaiveBayesClassifier.train(train_set) nltk.classify.accuracy(classifier,test_set) # 0.789
#print("Labels : ", Y) count = Counter(Y) print(count) list_of_sents = [] list_of_sents_raw = pickle.load(open('listOfSentences.pkl', 'rb')) for line in list_of_sents_raw: list_of_sents.append(line.split(" ")) #building word2vec model on brown corpus size = 20 window = 3 print("Word2Vec parameters - Vector size : ", size, "Window size : ", window) sentences_brown = brown.sents() w2v_model_brown = Word2Vec(sentences_brown, size=size, window=window, min_count=1) w2v_model_wv = w2v_model_brown.wv del w2v_model_brown """ #load the labeled raw data data = pickle.load(open("actualData.pkl","rb")) #create pickle file of features and get the labels for the videos Y = np.array(load_data(data)) #print("Labels : ", Y) count = Counter(Y) print(count)
def exercise1():
    """Train a unigram tagger on Brown 'news' and report it on 'lore' and 'news'."""
    news_tagged = brown.tagged_sents(categories='news')
    lore_tagged = brown.tagged_sents(categories='lore')
    tagger = nltk.UnigramTagger(news_tagged)

    lore_score = tagger.evaluate(lore_tagged)
    print("Evaluate on all of the sentences from the Brown corpus with the category lore : ",lore_score)

    news_score = tagger.evaluate(news_tagged)
    print("Evaluate on all of the sentences from the Brown corpus with the category news : ", news_score)

    # Index 199 is the 200th sentence.
    sentence_200 = brown.sents(categories='lore')[199]
    print("Output of tagger on the 200th sentence of the lore category of the Brown Corpus : ", tagger.tag(sentence_200))
# Fixed seed for reproducibility.
np.random.seed(1000)


def scatter_documents(X):
    """Show a scatter plot of the first two columns of X (axes 't0', 't1')."""
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.scatter(X[:, 0], X[:, 1])
    ax.set_xlabel('t0')
    ax.set_ylabel('t1')
    ax.grid()
    plt.show()


if __name__ == '__main__':
    # Compose a corpus: one space-joined string per Brown sentence
    sentences = sentences = brown.sents(categories=['news', 'fiction'])
    corpus = []
    for s in sentences:
        corpus.append(' '.join(s))
    # Vectorize the corpus
    vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True)
    Xc = vectorizer.fit_transform(corpus).todense()
    # Perform SVD (note: `s` is rebound here to the SVD result)
    U, s, V = svd(Xc, full_matrices=False)
    # Extract a sub-space with rank=2
    rank = 2
with open('results.txt', 'a') as resfile: resfile.write( 'pearson correlation in dataset [%s] for Datamuse methods is %f\n' % ('STS-131', corr)) # part 6 from gensim.models import Word2Vec import nltk nltk.download('brown') from nltk.corpus import brown with open('datasets/stss-131.csv', newline='') as csvfile: contents = list(csv.reader(csvfile, delimiter=';')) model_word2vec = Word2Vec(brown.sents(), min_count=8) sim_cal = np.array( sentence_similarity_dataset_model(contents, model_word2vec.wv)).reshape(-1, ) with open('sentence_similarity.txt', 'a') as simfile: simfile.write('Using Word2Vec embedding\n') simfile.write('s1; s2; human_sim; method_sim\n\n') for i, pair in enumerate(contents): simfile.write('%s;%s;%s;%f\n' % (pair[0], pair[1], pair[2], sim_cal[i] * 4)) simfile.write('\n\n') sim_ref = np.array(contents)[:, 2].astype(float) / 4.0 corr = pearson_correlation(sim_cal, sim_ref)
# -*- coding:utf-8 -*- """ 2019/4/2 15:46 by young """ import nltk from nltk.corpus import brown print(brown.categories()) print('共有{}个句子'.format(len(brown.sents()))) print('共有{}个单词'.format(len(brown.words())))
""" text = [None, None, None] sentence_finished = False # generate random sentences while not sentence_finished: r = random.random() accumulator = .0 for word in model[tuple(text[-2:])].keys(): accumulator += model[tuple(text[-2:])][word] if accumulator >= r: text.append(word) break if text[-2:] == [None, None]: sentence_finished = True return ' '.join([t for t in text if t]) if __name__ == '__main__': print('Modelling the corpus') model = model_trigram(brown.sents()) print('Assign probabilities.') model = model_proabilities(model) print('Generating sentences from the model') print(generate_sentence(model))
def main():
    """Train a bigram HMM POS tagger on Brown files and score it.

    Training sentences come from the Brown file ids in ``inputstring`` and
    are tagged with nltk.pos_tag. The resulting emission and transition
    matrices are used by hmm_pos_tag on the sentences of ``outputstring``;
    the output is compared with nltk.pos_tag and the agreement is printed.
    """
    from collections import Counter

    # Tag the training sentences and wrap each in sentence-boundary markers.
    taggedsents = []
    for f in inputstring:
        for sent in brown.sents(f)[:]:
            tagged = nltk.pos_tag(sent)
            tagged.insert(0, ("<s>", "<s>"))
            tagged.append(("</s>", "</s>"))
            taggedsents.append(tagged)
    tagbigrams = createbigrams(taggedsents)

    # Collect tokens, tags and their distinct values in first-seen order.
    # Sets back the uniqueness checks so they stay O(1) per token.
    taggedwords = []
    tags = []
    uniquewords = []
    uniquetags = []
    seen_words = set()
    seen_tags = set()
    for sent in taggedsents:
        for pair in sent:
            taggedwords.append(pair)
            tags.append(pair[1])
            if pair[0] not in seen_words:
                seen_words.add(pair[0])
                uniquewords.append(pair[0])
            if pair[1] not in seen_tags:
                seen_tags.add(pair[1])
                uniquetags.append(pair[1])
    # Boundary markers are not real words (they remain valid tags).
    uniquewords = [w for w in uniquewords if w not in ["<s>", "</s>"]]

    #hmm training
    # Count once instead of calling list.count() for every matrix cell.
    pair_counts = Counter(taggedwords)
    tag_counts = Counter(tags)
    # epmatrix[tag][word] = count(word, tag) / count(tag)
    epmatrix = [[pair_counts[(word, tag)] / tag_counts[tag]
                 for word in uniquewords]
                for tag in uniquetags]
    # tpmatrix[prev][next] = count(prev, next) / count(prev); tagbigrams is
    # produced by createbigrams, so its own count() is kept as is.
    tpmatrix = [[tagbigrams.count((prev_tag, next_tag)) / tag_counts[prev_tag]
                 for next_tag in uniquetags]
                for prev_tag in uniquetags]

    #hmm testing
    s = brown.sents(outputstring)[:]
    defaulttaggedsents = [nltk.pos_tag(sent) for sent in s]
    hmmtaggedsents = [
        hmm_pos_tag(sent, epmatrix, tpmatrix, uniquetags, uniquewords, tags)
        for sent in s
    ]

    #testing
    correct = 0
    wrong = 0
    for i in range(len(defaulttaggedsents)):
        for j in range(len(defaulttaggedsents[i])):
            if (defaulttaggedsents[i][j][1] == hmmtaggedsents[i][j][1]):
                correct += 1
            else:
                wrong += 1
    print("Correct tags: " + str(correct))
    print("Wrong tags: " + str(wrong))
    print("Accuracy of hmm pos tagger: " + str(correct / (correct + wrong)))
'10-24-40s_706posts.xml', '10-26-teens_706posts.xml', '11-06-adults_706posts.xml', '11-08-20s_705posts.xml', '11-08-40s_706posts.xml', '11-08-adults_705posts.xml', '11-08-teens_706posts.xml', '11-09-20s_706posts.xml', '11-09-40s_706posts.xml', '11-09-adults_706posts.xml', '11-09-teens_706posts.xml' ]) # Create a placeholder for model model = defaultdict(lambda: defaultdict(lambda: 0)) # Count frequency of co-occurance i = 0 for sentence in brown.sents(categories=[ 'adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction' ]): for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True): model[(w1, w2)][w3] += 1 for sentence in posts: for w1, w2, w3 in trigrams(sentence.text, pad_right=True, pad_left=True): model[(w1, w2)][w3] += 1 # Let's transform the counts to probabilities for w1_w2 in model: total_count = float(sum(model[w1_w2].values())) for w3 in model[w1_w2]: model[w1_w2][w3] /= total_count # default test case sentence = "find the nearest medical shop to center of arizona"
#webtext in nltk.corpus
from nltk.corpus import webtext
# Print each file id with the first two characters of its raw text.
# (The loop variable was misspelled 'filleid', so 'fileid' was undefined.)
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:2])

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

#brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='editorial')
brown.words(fileids=['cp12'])
# The Brown category is named 'editorial' (as used two lines above);
# 'editorials' is not a category.
brown.sents(categories=['news','editorial'])
edi_text = brown.words(categories='fiction')
fdist=nltk.FreqDist([w.lower() for w in edi_text])
# Despite its name, this list holds wh-question words.
modals=['what','who','where','when','why']
for m in modals:
    print (m + ':', fdist[m])

#Reuters corpus
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
reuters.words('training/9947')[:14]
reuters.words(categories=['sorghum','rye'])
#inaugural address
def get_word2vec():
    """Return a gensim Word2Vec model built from the Brown sentences."""
    return gensim.models.Word2Vec(brown.sents())
def ARI(cat):
    """Return the Automated Readability Index of a Brown category.

    ARI = 4.71 * (mean word length) + 0.5 * (mean sentence length) - 21.43

    :param cat: Brown category name (or list of names)
    """
    words = brown.words(categories=cat)
    sents = brown.sents(categories=cat)
    # float() keeps the means exact even where `/` on ints truncates
    # (Python 2 without `from __future__ import division`).
    mw = sum(len(w) for w in words) / float(len(words))
    ms = sum(len(s) for s in sents) / float(len(sents))
    return 4.71 * mw + 0.5 * ms - 21.43
import nltk

# Import the Brown corpus
from nltk.corpus import brown

# List the categories available in the Brown corpus: first the count, then
# one name per line.
category_names = brown.categories()
print("Total categories in the Brown corpus: {}".format(len(category_names)))
print("- {}".format("\n- ".join(category_names)))

# Tokenized sentences
brown.sents(categories="mystery")

# POS-tagged sentences
learned_tagged = brown.tagged_sents(categories="learned")
print("\nPOS Tagged sentences: \n{}".format(learned_tagged))

# Collect the nouns among the tagged words. Nouns are tagged as NN or NP, and
# a substring test is used because some words carry compound tags
# (e.g. NN-HL). The `or` short-circuits on the first marker that matches.
tagged_words = brown.tagged_words(categories="science_fiction")
nouns = []
for word, tag in tagged_words:
    if 'NN' in tag or 'NP' in tag:
        nouns.append((word, tag))

# Report how many nouns were found and preview the first twenty.
noun_preview = "\n- ".join(word + ": " + tag for word, tag in nouns[:20])
print("\nNouns: {}\n- {}".format(len(nouns), noun_preview))

# Build frequency distribution for nouns. (Note that using a generator instead of a comprehension
# should have a positive effect in performance/memory footprint)
import nltk from nltk.corpus import brown from nltk import word_tokenize import pylab # setting up data brown_tagged_sents = brown.tagged_sents(categories='news') brown_sents = brown.sents(categories='news') # The default tagger tags = [tag for (word, tag) in brown.tagged_words(categories='news')] # most frequent print nltk.FreqDist(tags).max() # tagger that tags everything as NN (Noun) raw = 'I do not like green eggs and ham, I do not like them Sam I am!' tokens = word_tokenize(raw) default_tagger = nltk.DefaultTagger('NN') print default_tagger.tag(tokens) # this will perform poorly on a corpus print default_tagger.evaluate(brown_tagged_sents) # The regular expression Tagger # these are processed in order the first one to match applies patterns = [ (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # simple past (r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals
filebase = "/home/fnielsen/"


def word_feats(words):
    """Return a feature dict mapping every item of ``words`` to True."""
    return dict.fromkeys(words, True)


def sents2words(sents):
    """Return, for each sentence string, the set of its lower-cased words.

    Words are the matches of the module-level ``pattern_word`` regex.
    """
    return [{w.lower() for w in pattern_word.findall(sent)} for sent in sents]


# Runs of characters that are word characters but neither digits nor
# underscores. Raw string so \W and \d reach the regex engine untouched
# instead of being treated as (invalid) string escapes.
pattern_word = re.compile(r'[^\W\d_]+', re.UNICODE)

# Sentences are joined back into plain strings. Real lists are built (rather
# than map objects) so they can be iterated more than once on Python 3.
news_sents = [" ".join(words) for words in brown.sents(categories='news')]

categories = [
    'reviews', 'religion', 'hobbies', 'lore', 'belles_lettres', 'government',
    'learned', 'fiction', 'mystery', 'science_fiction', 'adventure',
    'romance', 'humor',
]
other_feats = []
others_feats = []
other_sents = []
for category in categories:
    sents = [" ".join(words) for words in brown.sents(categories=category)]
    # The same list is stored and then tokenised; with a one-shot iterator
    # the stored copy would be left exhausted.
    other_sents.append(sents)
    words = sents2words(sents)
#-*-coding:utf-8 -*-
# Automatic tagging
import nltk
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Default tagger
# Find the most frequent tag in the news category, then tag every token 'NN'.
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags).max())
raw = 'I do not like gree eggs and ham,I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.tag(tokens))
print(default_tagger.evaluate(brown_tagged_sents))

# Regular-expression tagger
# The final catch-all pattern tags everything else as 'NN'.
patterns = [
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*es$', 'VBZ'),
    (r'.*ould$', 'MD'),
    (r'.*\'s$', 'NN$'),
    (r'.*s$', 'NNS'),
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'.*', 'NN'),
]
regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown_sents[3]))

# Lookup tagger
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_common() returns (word, count) pairs ordered by frequency. Slicing
# fd.keys() fails on Python 3 (dict views are not subscriptable) and the keys
# are not frequency-ordered in NLTK 3.
most_freq_words = [word for (word, _count) in fd.most_common(100)]
# Map each of the 100 most frequent words to its most frequent tag.
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
# File for getting the Brown corpus into just strings of text, one sentence
# per line, via NLTK
# Brown corpus = 1161192 words in 57340 sentences
# Outputs separate files with ~5000 sentences each
# Tested w/ Python 3.7 and NLTK 3.7.3
from nltk.corpus import brown
import os
import time

filebase = '/Users/garrettsmith/Google Drive/UniPotsdam/Research/Features/GenEmbeddings/BrownCorpus/'
start_time = time.time()
fileno = 0
f = None
try:
    for i, sent in enumerate(brown.sents()):
        # Every 5000 sentences, switch to the next output file. Each file is
        # opened once per batch instead of once per sentence.
        if i % 5000 == 0:
            if i > 0:
                fileno += 1
                print('{} sentences processed\r'.format(i), end='')
            if f is not None:
                f.close()
            file = os.path.join(filebase, 'brown' + str(fileno) + '.txt')
            # Append mode creates the file when it is missing and extends it
            # when it already exists, so no separate existence check is
            # needed (the earlier check tested the directory, not the file).
            f = open(file, 'a')
        f.write(' '.join(sent) + '\n')
finally:
    # Close the last batch file even if iteration or writing fails.
    if f is not None:
        f.close()
print('Elapsed time: {} seconds'.format(time.time() - start_time))
print classifier.pseudocode(depth=4) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "E X A M P L E 5: Exploiting Context" print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "Exploiting Context. " \ " contextual features often provide powerful clues about " \ "the correct tag—for example, when tagging the word fly, " \ "knowing that the previous word is a will allow us to " \ "determine that it is functioning as a noun, not a verb." \ "In order to accommodate features that depend on a word’s " \ "context, we must revise the pattern that we used to define " \ "our feature extractor. Instead of just passing in the word " \ "to be tagged, we will pass in a complete (untagged) sentence, " \ "along with the index of the target word." print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print pos_features2(brown.sents()[0], 6) print pos_features2(brown.sents()[0], 7) print pos_features2(brown.sents()[0], 8) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "Now that we’ve defined our feature extractor, we can use it " \ "to generate a features set from UNTAGGED sentences in corpus " \ "Data structure of brown.tagged_sents():" \ "[ [(sentence1-token1,POS-tag), (sentence1-tokens2,POS-tag),...]" \ " [(sentence2-token1,POS-tag), (sentence2-tokens2,POS-tag),...]" \ " ... ]" print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" tagged_sents = brown.tagged_sents(categories='news')[0:5] print "length of tagged sentence set: ", len(tagged_sents) print tagged_sents[:3] featuresets = [] for tagged_sent in tagged_sents:
x=(words_te[i],words_te[i+1],words_te[i+2]) if(Interpolated_Kneser_Ney_dict.get(x,"empty")=="empty"): if((x[0],x[1]) not in bgcounter): Interpolated_Kneser_Ney_dict[x]=findPKn_bigram((x[1],x[2])) else: Interpolated_Kneser_Ney_dict[x]=findPKn_trigram(x,discount_final) perp=perp*((1/Interpolated_Kneser_Ney_dict[x])**(1/N)) return perp # In[30]: text_gutenberg=list(gutenberg.sents()) text_brown=list(brown.sents()) text_gutenberg_size=len(text_gutenberg) text_gutenberg_size=len(text_gutenberg) text_brown_size=len(text_brown) for i in range(text_gutenberg_size): text_gutenberg[i].insert(0,"<s>") text_gutenberg[i].insert(len(text_gutenberg[i]),'<e>') text_gutenberg[i].insert(len(text_gutenberg[i]),'<e>') for i in range(text_brown_size): text_brown[i].insert(0,"<s>") text_brown[i].insert(len(text_brown[i]),'<e>') text_brown[i].insert(len(text_brown[i]),'<e>') text_gutenberg_tr,text_gutenberg_te=train_test_split(text_gutenberg,test_size=.20,random_state=4) text_brown_tr,text_brown_te=train_test_split(text_brown,test_size=.20,random_state=4) plt.close()
tkinter.Tk().withdraw() in_path = filedialog.askopenfilename() tkinter.Tk().withdraw() out_path = filedialog.asksaveasfilename() try: bigram_model = json.load(open("Spelling_Correction/bigrams.txt")) except IOError: bigram_model = {} for sentence in brown.sents(): for w1, w2 in bigrams(sentence): if w1 in bigram_model: if w2 in bigram_model[w1].keys(): bigram_model[w1][w2] = bigram_model[w1][w2] + 1 else: bigram_model[w1][w2] = 1 else:
s = '' for i in range(len(text)): s += text[i] + ' ' if (len(text) != 10): return False, s else: return True, s if __name__ == '__main__': time.clock() print() brown_corpus = list(brown.sents(brown.fileids())) for i in range(len(brown_corpus)): brown_corpus[i] = list(map(lambda x: x.lower(), brown_corpus[i])) gutenberg_corpus = list(gutenberg.sents(gutenberg.fileids())) for i in range(len(gutenberg_corpus)): gutenberg_corpus[i] = list( map(lambda x: x.lower(), gutenberg_corpus[i])) combined_corpus = brown_corpus + gutenberg_corpus unigram_list, bigram_list = training(combined_corpus) i = 0 while (i < 1): bool, s = generate_trigram_token(bigram_list) if (bool): i += 1 print(s)
import gensim
import logging

from nltk.corpus import brown

# Configure root logging at INFO level with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Train Word2Vec on every Brown sentence with min_count=1, then save the
# trained model under the name 'brown_model'.
sentences = brown.sents()
model = gensim.models.Word2Vec(sentences, min_count=1)
model.save('brown_model')
sg=0) print svk.syn0[0] dsk = Doc2VecKeras(dm_concat=1) dsk.train_with_word2vec_instance(test_docs, svk, learn_words=True, iter=3) print dsk.syn0[0] print(dk0.docvecs.most_similar(0)) print(dk.docvecs.most_similar(0)) print(dsk.docvecs.most_similar(0)) print(dklw.docvecs.most_similar(0)) #sys.exit() from nltk.corpus import brown brown_sents_sub = list(brown.sents()[:100]) brown_docs_sub = LabeledListSentence(brown_sents_sub) brown_scorewordsents = list( ScoredListSentence(brown_sents_sub, dummy_score_vec_fn)) vck_br = Word2VecKeras(brown_sents_sub, null_word=1, iter=3, sg=0) vkk_br = Word2VecKeras(brown_sents_sub, null_word=1, iter=3, sg=1) dg_br = gensim.models.doc2vec.Doc2Vec(brown_docs_sub) dk0_br = Doc2VecKeras(brown_docs_sub, iter=3) svk_br = ScoreWord2VecKeras(brown_scorewordsents, null_word=1, iter=3, sg=0) dk_br = Doc2VecKeras(dm_concat=1) dk_br.train_with_word2vec_instance(brown_docs_sub, vck_br,
# Optionally, shorten words to their stems if stem_words: text = text.split() #stemmer = SnowballStemmer('english') #stemmed_words = [stemmer.stem(word) for word in text] stemmed_words = [ nltk.PorterStemmer().stem_word(word.lower()) for word in text ] text = " ".join(stemmed_words) # Return a list of words return (text) #word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) br = Word2Vec(brown.sents()) def get_similar(word): if word in br: lis = br.most_similar(word, topn=3) ret = [] for one in lis: ret.append(one[0]) return ret else: return [word] logger.info('Read data...') train = pd.read_csv('../../data/tmp/train_sample.csv')
unigramFreq = do_train_uni(train) # use the maximum likelihood estimate MLEProbDist to create # a probability distribution from the observed frequencies unigram = MLEProbDist(unigramFreq) bigram = ConditionalProbDist(bigramFreq, MLEProbDist) bigram_add_one = ConditionalProbDist(bigramFreq, LaplaceProbDist, bins=bigramFreq.__len__()) if method == 'no_smoothing': print "%s:%s:%s" % (method, 'train', compute_perplexity(bigram, train)) print "%s:%s:%s" % (method, 'test', compute_perplexity(bigram, test)) elif method == 'interpolation': sents = [] for l in brown.sents(categories=trainsection): sents = sents + l V = len(sents) print "%s:%s:%s" % (method, 'train', compute_perplexity_interp(unigram, bigram, train, lambda_vector, V)) print "%s:%s:%s" % (method, 'test', compute_perplexity_interp(unigram, bigram, test, lambda_vector, V)) elif method == 'add_one': print "%s:%s:%s" % (method, 'train', compute_perplexity(bigram_add_one, train)) print "%s:%s:%s" % (method, 'test', compute_perplexity(bigram_add_one, test)) elif method == 'interpolation_add_one': print "%s:%s:%s" % (method, 'train', compute_perplexity(bigram, train))
# Data Science project
# Master 2 AI "Machine Learning for Data Science"

# Import the DistributionalSemantics and NltkBigrams classes
from distributional_semantics import DistributionalSemantics as ds
from nltk_bigrams import NltkBigrams as nb
import gensim
from nltk.corpus import brown


def _science_fiction_tagged_sents():
    # Universal-tagset tagged sentences of the Brown 'science_fiction' category.
    return brown.tagged_sents(categories='science_fiction', tagset='universal')


# Exercise the methods

# Build the bigram corpus
corpus_bigrams = nb()

# Train the gensim models; both use the same hyper-parameters
_w2v_options = {'min_count': 1, 'size': 100}
print('Création des vecteurs de sens des mots')
unigram_model = gensim.models.Word2Vec(brown.sents(), **_w2v_options)
print('Création des vecteurs de sens des bigrammes')
bigram_model = gensim.models.Word2Vec(corpus_bigrams.bigram_sents,
                                      **_w2v_options)

# Composition matrix for the ADJNOUN relation
W = ds.composition_w(
    _science_fiction_tagged_sents(),
    corpus_bigrams.tagged_sents_bigram,
    unigram_model,
    bigram_model,
    "ADJNOUN",
)

# Decomposition matrix for the ADJNOUN phrase label
W2 = ds.decomposition_w(
    _science_fiction_tagged_sents(),
    corpus_bigrams.tagged_sents_bigram,
    bigram_model,
    unigram_model,
    "ADJNOUN",
)

# Decomposition matrix for the ADJNOUN phrase label, derived from the
# composition computed above
W3 = ds.decomposition_from_composition_w(
    _science_fiction_tagged_sents(),
    unigram_model,
    W,
    "ADJNOUN",
)

# Compose the meaning vectors of "new" and "ones"
vec_new = unigram_model["new"]
vec_ones = unigram_model["ones"]
P = ds.compose(vec_new, vec_ones, W)
def get_sentences():
    """Return the sentences of the Brown corpus (57340 per the original note).

    Each sentence is represented as a list of individual string tokens.
    """
    sentences = brown.sents()
    return sentences