def select_genres(n):
    '''
    Selects genres with more than n files. Returns raw data and the genre of
    each file in the selected genres as two 1d numpy arrays.

    Parameters
    ----------
    n: An integer.

    Returns
    -------
    A tuple of (raw, genres)
    raw: A 1d numpy array.
    genres: A 1d numpy array.
    '''
    genres = []
    raw = []
    # Create lists of the genres and raw data for genres with more than n files
    for file in brown.fileids():
        for k in brown.categories(file):
            if len(brown.fileids(k)) > n:
                genres.append(k)
                raw.append(brown.raw(file))
    # Convert to 1d numpy arrays to match the documented return type
    # (assumes numpy is imported as np at module level)
    return np.array(raw), np.array(genres)
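# Hedged usage sketch (not part of the original snippet): assumes `from nltk.corpus import brown`
# and `import numpy as np` at module level, and that the Brown corpus data is installed.
raw, genres = select_genres(60)
print(len(raw), len(genres))   # equal lengths: one entry per file belonging to a genre with > 60 files
print(sorted(set(genres)))     # the genre labels that passed the threshold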
def test_clusterer(self):
    """Here we take 10 documents categorized as 'government' and 'mystery'
    from the brown corpus, and perform k-means clustering on these.
    Optimally we would like the clusterer to divide them in two clusters.
    The clusterer generates clusters depending on random initial conditions,
    so the result can be different in different test runs. In order to
    account for that, we run a lot of iterations (50) which hopefully will
    generate a good result. The success condition is that a max of 2 out of
    10 documents will fall in the wrong cluster.
    """
    clusterer = KMeans()
    government_ids = brown.fileids(categories='government')[:10]
    mystery_ids = brown.fileids(categories='mystery')[:10]
    government_uids = []
    mystery_uids = []
    for articleid in government_ids:
        text = " ".join(brown.words(articleid))
        self.folder.invokeFactory('Document', articleid, text=text)
        government_uids.append(self.folder[articleid].UID())
    for articleid in mystery_ids:
        text = " ".join(brown.words(articleid))
        self.folder.invokeFactory('Document', articleid, text=text)
        mystery_uids.append(self.folder[articleid].UID())
    result = clusterer.clusterize(2, 50, repeats=50)
    cluster1 = set(result[0])
    missed = min(len(cluster1 - set(government_uids)),
                 len(cluster1 - set(mystery_uids)))
    self.failUnless(missed <= 2)
def load_corpus(range):
    m = re.match(r'(\d+):(\d+)$', range)
    print 'm=', m
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        print corpus.fileids()
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
def _hasNextSearchBrown(self):
    sizeOfBrownCorpus = len(brown.fileids())
    if self._countText < sizeOfBrownCorpus:
        self._isReady = True
        self._nameOfNextFile = brown.fileids()[self._countText]
        return True
    else:
        self._isReady = False
        return False
def third_lexicon(positive_seeds, negative_seeds):
    positive_list = []
    negative_list = []
    all_dic = {}
    seed_total_dic = {}
    for fileid in brown.fileids():
        bow = get_BOW(brown.words(fileid))
        for aword in bow:
            all_dic[aword] = all_dic.get(aword, {})
            all_dic[aword]['word_count'] = all_dic[aword].get('word_count', 0) + 1
            for pseed in positive_seeds:
                if pseed in bow:
                    all_dic[aword][pseed] = all_dic[aword].get(pseed, 0) + 1
            for nseed in negative_seeds:
                if nseed in bow:
                    all_dic[aword][nseed] = all_dic[aword].get(nseed, 0) + 1
        # Document frequency of each seed word, counted once per file
        for pseed in positive_seeds:
            if pseed in bow:
                seed_total_dic[pseed] = seed_total_dic.get(pseed, 0) + 1
        for nseed in negative_seeds:
            if nseed in bow:
                seed_total_dic[nseed] = seed_total_dic.get(nseed, 0) + 1
    total_count = float(len(brown.fileids()))
    for aword in all_dic:
        score = 0
        for pseed in positive_seeds:
            if all_dic[aword].get(pseed) is not None:
                # Pointwise mutual information between aword and the positive seed
                a_score = math.log(
                    (all_dic[aword][pseed] / total_count) /
                    ((all_dic[aword]['word_count'] / total_count) *
                     (seed_total_dic[pseed] / total_count)), 2)
                if a_score > 0:
                    score += a_score
        for nseed in negative_seeds:
            if all_dic[aword].get(nseed) is not None:
                a_score = math.log(
                    (all_dic[aword][nseed] / total_count) /
                    ((all_dic[aword]['word_count'] / total_count) *
                     (seed_total_dic[nseed] / total_count)), 2)
                if a_score > 0:
                    score -= a_score
        score = score / 16.0
        if score > 0.3:
            positive_list.append(aword)
        elif score < -0.3:
            negative_list.append(aword)
    return positive_list, negative_list
def load_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        # from nltk.corpus import brown as corpus
        from nltk.corpus import movie_reviews as corpus
        print([corpus.sents(fileid) for fileid in corpus.fileids()[start:end]])
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
def pre_processor(grams=3):
    vocabulary = set()
    t = 0
    # Build the vocabulary from the first two Brown files
    for di in brown.fileids():
        vocabulary = vocabulary.union(set(brown.words(di)))
        t += 1
        if t == 2:
            break
    vocabulary = list(vocabulary)
    for i, word in enumerate(vocabulary):
        wordDic[word] = i
        posiDic[i] = word
    t = 0
    x1 = np.zeros(shape=(0, grams - 1), dtype=int)
    x2 = np.zeros(shape=(0, grams - 1), dtype=int)
    y1 = np.zeros(shape=(0, 1), dtype=int)
    y2 = np.zeros(shape=(0, 1), dtype=int)
    for _id in brown.fileids():
        if t == 0:
            # First file: training n-grams
            t += 1
            text = brown.words(_id)
            size_ant = x1.shape[0]
            x1.resize((x1.shape[0] + len(text) - grams - 1, grams - 1))
            y1.resize((y1.shape[0] + len(text) - grams - 1, 1))
            for i in range(size_ant, size_ant + len(text) - grams - 1):
                x1[i] = [wordDic[text[index]] for index in range(i, i + grams - 1)]
                y1[i] = [wordDic[text[i + grams - 1]]]
            continue
        # Second file: test n-grams
        text = brown.words(_id)
        size_ant = x2.shape[0]
        x2.resize((x2.shape[0] + len(text) - grams - 1, grams - 1))
        y2.resize((y2.shape[0] + len(text) - grams - 1, 1))
        for i in range(size_ant, size_ant + len(text) - grams - 1):
            x2[i] = [wordDic[text[index]] for index in range(i, i + grams - 1)]
            y2[i] = [wordDic[text[i + grams - 1]]]
        break
    return vocabulary, x1, y1, x2, y2
def generate():
    global CDICT, DDICT
    for category in CATAGORIES:  # skip the two categories with too little information
        CDICT[category] = generate_corpus(brown.fileids(categories=category), "cavnar")
        DDICT[category] = generate_corpus(brown.fileids(categories=category), "damashek")
    dict_file = open('cdict.pk1', 'wb')
    pickle.dump(CDICT, dict_file)
    dict_file.close()
    dict_file = open('ddict.pk1', 'wb')
    pickle.dump(DDICT, dict_file)
    dict_file.close()
def main():
    print(str.format(
        'Number of words in the brown corpus: {}',
        len(brown.words())
    ))
    print(str.format(
        'List of files that make up the brown corpus: {}',
        brown.fileids()
    ))
    print(str.format(
        'Number of files in the brown corpus: {}',
        len(brown.fileids())
    ))
def get_brown_data(useN=100):
    try:
        fileids = brown.fileids()
    except LookupError:
        import nltk
        nltk.download('brown')
        fileids = brown.fileids()
    fileids = fileids[:useN]
    texts = [brown.raw(fid) for fid in fileids]
    fileids = [os.path.splitext(fid)[0] for fid in fileids]
    return texts, fileids
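# Hedged usage sketch for get_brown_data (assumes `import os` and `from nltk.corpus import brown`
# at module level, as the function above does implicitly): `texts` holds raw document strings and
# `fileids` the matching ids; Brown ids such as 'ca01' have no extension, so splitext is a no-op here.
texts, fileids = get_brown_data(useN=5)
print(len(texts), fileids)   # 5 ['ca01', 'ca02', 'ca03', 'ca04', 'ca05']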
def TrainMultinomialNB(train):
    # V: Extract Vocabulary
    for i in train:
        v = brown.words(fileids=i)
        # print v
        vsort = sorted(set(v))
        print vsort
    # note: vsort is recomputed for every file, so it ends up holding the
    # vocabulary of the last training document
    # Count docs
    count = len(train)
    # for j in brown.fileids():
    # Count docs in each class
    # out=0
    countd = []
    dicti = {}
    prior = {}  # class priors; initialised here so the function is self-contained
    condprob = collections.defaultdict(dict)
    for i in brown.categories():
        # if i=="adventure":
        gho = 0
        print "Hey1"
        for j in brown.fileids(categories=i):
            if j in train:
                gho = gho + 1
        # Nc = Number of documents in each class
        countd.append(gho)
        # out=out+1
        prior[i] = float(gho / (1.0 * count))
        textc = []
        for t in brown.fileids(categories=i):
            if t in train:
                tp = brown.words(fileids=t)
                for ko in tp:
                    textc.append(ko)
        # print textc
        for j in vsort:
            # print "Hey2"
            freq = 0
            for k in textc:
                if j == k:
                    freq = freq + 1
            # Tct
            dicti[j] = freq
        # total token count for the class, summed over the vocabulary
        # (the original summed dicti[j] for the last word only)
        tot = sum(dicti.values())
        for t in vsort:
            condprob[t][i] = float((dicti[t] + 1) / (1.0 * tot))
    return v, prior, condprob
def get_corpus(self):
    for fileid in brown.fileids():
        sentences = brown.sents(fileids=[fileid])
        for s in sentences:
            clean_sentence = [w.lower() for w in s if w.isalpha()]
            self.uni_corpus.extend(clean_sentence)
            self.bigram_corpus.extend(list(nltk.bigrams(clean_sentence)))
def load_brown_corpus():
    import nltk
    nltk.download('brown')
    from nltk.corpus import brown
    corpus = []
    for cat in brown.categories():
        for text_id in brown.fileids(cat):
            sentences = []
            for sent in brown.sents(text_id):
                text = ' '.join(sent)
                text = text.lower()
                for punct in string.punctuation:
                    # note: str.replace returns a new string, so this line has no
                    # effect as written; the regex below does the actual filtering
                    text.replace(punct, ' ')
                text = re.sub('[^a-z.,0-9 ]+', '', text)
                tokens = [w for w in text.split() if w != '']
                if len(tokens) == 0:
                    continue
                if tokens[-1] == '.':
                    del tokens[-1]
                tokens.append('<eos>')
                sentences.append(tokens)
            corpus.append(sentences)
    # list of sentences (which are lists of words)
    corpus = list(itertools.chain.from_iterable(corpus))
    return corpus
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # document1 = {
    #     'id': '1',
    #     'text': 'The big sharks of Belgium drink beer.'
    # }
    # document2 = {
    #     'id': '2',
    #     'text': 'Belgium has great beer. They drink beer all the time.'
    # }
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1  # advance the document id; the original never incremented it
    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1
    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # Belgium: { docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
        print("-----------------------------")
def fetch_train_test(corpora, test_corpus):
    train = []
    test = []
    unknown.clear()
    vocab.clear()
    pred_count_dict.clear()
    succ_count_dict.clear()
    for corpus in corpora:
        if corpus == 'brown':
            files = brown.fileids()
        elif corpus == 'gutenberg':
            files = gutenberg.fileids()
        else:
            print("config Error")
        for file in files:
            if corpus == 'brown':
                sentences = brown.sents(file)
            elif corpus == 'gutenberg':
                sentences = gutenberg.sents(file)
            else:
                print("config Error")
            permute = np.ones(len(sentences))
            if corpus == test_corpus:
                permute[:int(len(sentences) * 0.2)] = 0
                np.random.shuffle(permute)
            for index in range(len(sentences)):
                if permute[index] == 0:
                    test.append(sentences[index])
                else:
                    train.append(sentences[index])
    return [train, test]
def get_data():
    data = []
    for fileid in brown.fileids():
        document = ' '.join(brown.words(fileid))
        data.append(document)
    return data
def update_elements(elements, dataset, common_adverbs):
    '''
    Update elements
        freq : See through corpus
        adjective : See through synsets
        used_common/uncommon : See through dataset
    '''
    # freq
    for fileid in corpus.fileids():
        for word in corpus.words(fileid):
            if word in elements:
                elements[word]['freq'] += 1
    # adjective
    for element in elements:
        for synset in wn.synsets(element):
            if synset.pos() in ['a', 's', 'r']:
                elements[element]['adjective'] += 1
    # used_common
    for common in common_adverbs:
        data_dict = dataset[common[0]]['used']
        for element in data_dict:
            num = data_dict[element]
            elements[element]['used_common'][common[0]] = num
    # used_uncommon
    uncommon_adverbs = [
        word for word in dataset
        if word not in [common for common, score in common_adverbs]
    ]
    for uncommon in uncommon_adverbs:
        data_dict = dataset[uncommon]['used']
        for element in data_dict:
            num = data_dict[element]
            elements[element]['used_uncommon'][uncommon] = num
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    for fid in words.fileids():
        for word in words.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in gutenberg.fileids():
        for word in gutenberg.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in brown.fileids():
        for word in brown.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in reuters.fileids():
        for word in reuters.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in inaugural.fileids():
        for word in inaugural.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
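# Hedged usage sketch for build_word_count: `only_words` is assumed to be a module-level
# compiled pattern such as re.compile(r"^[a-z]+$"), read_pickle/dump_pickle are assumed
# helpers around the pickle module, and the pickled/ cache directory is assumed to exist;
# none of these are shown in the snippet above.
wcount = build_word_count()
print(wcount.most_common(10))   # most frequent lower-cased alphabetic tokens across the corpora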
def cal_idf():
    # brown.sents()
    total_wordlists = []
    doc_sents = []
    for f in brown.fileids():
        print f
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[f])
        d_sents = ''
        for sent in doc_sentlist:
            s = ''
            # sent = stem_tokens(sent)
            for w in sent:
                w = w.lower()
                s += w + ' '
            d_sents += s + '\n'
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)
    print 'start calculating tfidf'
    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = doc_sents
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    pickle.dump(vectorizer, open('idf_vectorizer', 'w'))
    dictionary = corpora.Dictionary(total_wordlists)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    pickle.dump(tfidf, open('brown_tfidf', 'w'))
def evaluation():
    print "=============== The Test Set ==============="
    import random
    from nltk.corpus import brown
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])

    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')

    print "=============== Accuracy ==============="
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print 'Accuracy: %4.2f' % nltk.classify.accuracy(classifier, test_set)

    print "=============== Confusion Matrices ==============="
    def tag_list(tagged_sents):
        return [tag for sent in tagged_sents for (word, tag) in sent]

    def apply_tagger(tagger, corpus):
        return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

    gold = tag_list(brown.tagged_sents(categories='editorial'))
    # t2 is assumed to be a previously trained tagger defined elsewhere in the module
    test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
    cm = nltk.ConfusionMatrix(gold, test)
def ex_constitution_corpus():
    themes = {
        "news": ["news", "reviews", "editorial"],
        "literature": ["science_fiction", "romance", "fiction", "mystery"],
        "sciences": ["learned"]
    }
    nb_instances = 0
    corpus = {}
    for category in themes:
        print(category, ":")
        nb_doc = len(brown.fileids(categories=themes[category]))
        print(" ", nb_doc, "documents")
        nb_instances += nb_doc
        corpus[category] = brown.fileids(categories=themes[category])
    print("NB instances :", nb_instances)
    return corpus
def load_movie_corpus_each_sentence(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import movie_reviews as corpus
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]
def load_corpus(range):  # Uses the NLTK brown corpus for testing; the range string selects a slice of the corpus files
    m = re.match(r'(\d+):(\d+)$', range)  # regular expression matching a range such as 1:5
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
def CreateFeatures():
    features = []
    sw = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()  # yielded an extra 0.1% so I kept it in.
    # get the categories in brown
    for c in brown.categories():
        # get the files in a category
        for d in brown.fileids(categories=c):
            # get words from a file
            words = brown.words(fileids=d)
            # extracted words of a document appended together.
            extracted_words = ""
            # filter each word
            for w in words:
                w = lemmatizer.lemmatize(w)
                w = stemmer.stem(w)
                if (w not in sw) and (w.isalnum()):
                    # append until we have the filtered document recreated
                    extracted_words += w.lower() + " "
            # create features and add them to a feature set.
            feature = ({"words": extracted_words}, c)
            features.append(feature)
    return features
def get_data(sub_task):
    """
    returns train data and test data according to sub_task
    :param sub_task:
    :return:
    """
    sentences_brown = list(brown.sents(brown.fileids()))
    sentences_gutenberg = list(gutenberg.sents(gutenberg.fileids()))

    # adding stop symbols
    add_stop_symbol(sentences_brown)
    add_stop_symbol(sentences_gutenberg)

    # get training and test data
    sentences_brown_train, sentences_brown_test = split(sentences_brown, 0.9)
    sentences_gutenberg_train, sentences_gutenberg_test = split(sentences_gutenberg, 0.9)

    if sub_task == S1:
        return sentences_brown_train, sentences_brown_test
    elif sub_task == S2:
        return sentences_gutenberg_train, sentences_gutenberg_test
    elif sub_task == S3:
        sentences_brown_train.extend(sentences_gutenberg_train)
        return sentences_brown_train, sentences_brown_test
    elif sub_task == S4:
        sentences_brown_train.extend(sentences_gutenberg_train)
        return sentences_brown_train, sentences_gutenberg_test
    else:
        print("Provide proper sub_task")
        exit(0)
def load_corpus(range):
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
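# Hedged usage sketch for load_corpus (assumes `import re` at module level): the "start:end"
# string selects a slice of the Brown file ids, so "0:100" returns the word lists of the
# first 100 documents.
docs = load_corpus('0:100')
print(len(docs), len(docs[0]))   # 100 documents; word count of the first one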
def update_wordset(adverbs):
    for fileid in corpus.fileids():
        source = corpus.words(fileid)
        for i in range(len(source)):
            # If we find the adverb, update the neighbor word
            # (the bounds check guards against an adverb at the very end of the file)
            if source[i] in adverbs and i + 1 < len(source):
                cand = source[i + 1]
                update_word(cand, source[i])
def get_texts(self):
    i = 0
    for fileid in brown.fileids():
        self.texts[fileid] = brown.words(fileid)
        if self.max_docs is not None:
            i += 1
            if i >= self.max_docs:
                break
def calcDF():
    doc_dic = defaultdict(set)
    for fileid in brown.fileids():
        for token in brown.words(fileids=[fileid]):
            doc_dic[token].add(fileid)
    return doc_dic
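# Hedged follow-up sketch: turning the document-frequency map from calcDF into IDF scores.
# Assumes `import math`, `from collections import defaultdict` and `from nltk.corpus import brown`,
# as the snippet above already does implicitly.
doc_dic = calcDF()
n_docs = float(len(brown.fileids()))
idf = {token: math.log(n_docs / len(fileids)) for token, fileids in doc_dic.items()}
print(idf['the'], idf['government'])   # tokens appearing in every file get an IDF of 0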
def __init__(self, max_ocd_size=7, do_count=True):
    self.file_ids = brown.fileids()
    self.max_id = len(self.file_ids)
    self.max_ocd_size = max_ocd_size
    self.all_files = []
    self.fgram_files = []
    if do_count is True:
        self.ocds = self.count_occurrences()
def create_news_or_not():
    news_or_not = []
    for category in brown.categories():
        for fileid in brown.fileids(category):
            if category == 'news':
                news_or_not.append((brown.words(fileid), category))
            else:
                news_or_not.append((brown.words(fileid), 'non-news'))
    return news_or_not
def brownWordOccurences():
    wordOccurences = defaultdict(lambda: 0)
    for fileid in brown.fileids():
        wordList = brown.words(fileid)
        for word in wordList:
            wordOccurences[word.lower()] += 1
    return wordOccurences
def fun5():
    from nltk.corpus import brown
    # nltk.download('brown')
    for i in brown.fileids():
        print i
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids='cg22')
    print brown.sents(categories=['news', 'editorial', 'reviews'])
def main():
    data = []
    for fileid in brown.fileids():
        document = ' '.join(brown.words(fileid))
        data.append(document)

    NO_DOCUMENTS = len(data)
    print(NO_DOCUMENTS)
    print(data[:5])

    # For gensim we need to tokenize the data and filter out stopwords
    tokenized_data = []
    for text in data:
        tokenized_data.append(clean_text(text))

    # Build a Dictionary - association word to numeric id
    dictionary = corpora.Dictionary(tokenized_data)

    # Transform the collection of texts to a numerical form
    corpus = [dictionary.doc2bow(text) for text in tokenized_data]

    # Have a look at how the 20th document looks like: [(word_id, count), ...]
    print(corpus[20])
    # [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), ...

    # Build the LDA model
    lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

    # Build the LSI model
    lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

    print("LDA Model:")
    for idx in range(NUM_TOPICS):
        # Print the first 10 most representative topics
        print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
    print("=" * 20)

    print("LSI Model:")
    for idx in range(NUM_TOPICS):
        # Print the first 10 most representative topics
        print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
    print("=" * 20)

    text = open('17170-0.txt', 'r').read()
    bow = dictionary.doc2bow(clean_text(text))
    print(lsi_model[bow])
    # [(0, 0.091615426138426506), (1, -0.0085557463300508351), (2, 0.016744863677828108),
    #  (3, 0.040508186718598529), (4, 0.014201267714185898), (5, -0.012208538275305329),
    #  (6, 0.031254053085582149), (7, 0.017529584659403553), (8, 0.056957633371540077),
    #  (9, 0.025989149894888153)]
    print(lda_model[bow])
def main():
    # create our training set and testing set from the news category in the
    # brown corpus
    fileids = brown.fileids(categories='news')
    training_ids = fileids[:33]
    test_ids = fileids[33:]
    training = brown.tagged_words(fileids=training_ids)
    test = brown.tagged_words(fileids=test_ids)

    trainingTokens = reviseTags(training)  # revise tags in our training set
    hist = createHistogram(trainingTokens)  # use revised tags to make histogram
    fdist = mostCommonTags(trainingTokens)
    mostCommonTag = fdist.most_common(1)[0][0]  # retrieve most common tag

    print("\nQuestion 8\n")
    tagGuesses, unknownTokens = assignTags(hist, test, [mostCommonTag])
    tagTruth = reviseTags(test)
    incorrectTags = analyzeAssignedTags(tagGuesses, tagTruth)

    print("\nQuestion 9\n")
    analyzeIncorrectTags(incorrectTags)

    print("\nQuestion 10\n")
    incorrectTags = analyzeAssignedTagsUnknownTokens(unknownTokens, tagTruth)
    analyzeIncorrectTags(incorrectTags)

    print("\nQuestion 10 part 2\n")
    # repeat test, but randomly assign the 10 most frequently found tags
    top10Tags = fdist.most_common(10)
    defaultTags = [tag for tag, count in top10Tags]
    tagGuesses, unknownTokens = assignTags(hist, test, defaultTags)
    incorrectTags = analyzeAssignedTagsUnknownTokens(unknownTokens, tagTruth)
    analyzeIncorrectTags(incorrectTags)

    print("\nQuestion 11\n")
    testids = brown.fileids(categories='romance')
    test = brown.tagged_words(fileids=testids)
    tagGuesses, unknownTokens = assignTags(hist, test, [mostCommonTag])
    tagTruth = reviseTags(test)
    incorrectTags = analyzeAssignedTags(tagGuesses, tagTruth)
    analyzeIncorrectTags(incorrectTags)
def create_train_test_ids():
    train_id = []
    test_id = []
    brown_categories = brown.categories()
    for category in brown_categories:
        ttid = tts(brown.fileids(category))
        train_id.append(ttid[0])
        test_id.append(ttid[1])
    pickle.dump((train_id, test_id), open("./data/train_test_ids.pkl", 'wb'))
    return train_id, test_id
def load_corpus(ranges):
    """ load data from corpus """
    tmp = re.match(r'(\d+):(\d+)$', ranges)
    if tmp:
        start = int(tmp.group(1))
        end = int(tmp.group(2))
        from nltk.corpus import brown as corpus
        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
def generate_dataset():
    print("Processing tokens of the Brown corpus documents")
    brown_tokens = get_brown_documents()
    print("Loading the etymological tree")
    etymwn = load_etymology()
    print("Extracting the etymological signature of each document")
    fingerprints = generate_sig_dataset(brown_tokens, etymwn)
    # Index by document name
    fingerprints.index = brown.fileids()
    fingerprints.to_csv('brown_fingerprints_2.csv')
def ex_constitution_corpus():
    themes = {
        "news": ["news", "reviews", "editorial"],
        "literature": ["science_fiction", "romance", "fiction", "mystery"],
        "sciences": ["learned"]
    }
    nb_instances = 0
    corpus = {}
    for category in themes:
        print("category : ", category)
        nb_doc = len(brown.fileids(categories=themes[category]))
        print("There are", nb_doc, "documents in the Brown corpus for the '%s' category." % category)
        nb_instances += nb_doc
        corpus[category] = brown.fileids(categories=themes[category])
    print("There are", nb_instances,
          "documents across the 3 categories (news, literature, sciences) for Brown.")
    return corpus
def populate_texts(session):
    if dl.db.not_empty(session, Text):
        # Cowardly refusing to continue
        return
    fids = brown.fileids(categories='news')
    for fid in fids:
        session.add(Text(file=fid))
    session.commit()
def get_data():
    data = []
    for fileid in brown.fileids():
        document = ' '.join(brown.words(fileid))
        data.append(document)
    NO_DOCUMENTS = len(data)
    print(NO_DOCUMENTS)
    print(data[:5])
    return data
def test_clusterer(self):
    """Here we take 10 documents categorized as 'government' and 'mystery'
    from the brown corpus, and perform k-means clustering on these.
    Optimally we would like the clusterer to divide them in two clusters.
    The clusterer generates clusters depending on random initial conditions,
    so the result can be different in different test runs. In order to
    account for that, we run a lot of iterations (50) which hopefully will
    generate a good result. The success condition is that a max of 1 out of
    10 documents will fall in the wrong cluster.
    """
    tagged_sents = brown.tagged_sents(categories=['government', 'mystery'])
    tagger = getUtility(IPOSTagger,
                        name="collective.classification.taggers.NgramTagger")
    tagger.train(tagged_sents)
    extractor = getUtility(ITermExtractor)
    extractor.setTagger(tagger)
    storage = getUtility(INounPhraseStorage)

    clusterer = KMeans()
    government_ids = brown.fileids(categories='government')[:10]
    mystery_ids = brown.fileids(categories='mystery')[:10]
    for articleid in government_ids:
        text = " ".join(brown.words(articleid))
        storage.addDocument(articleid, text)
    for articleid in mystery_ids:
        text = " ".join(brown.words(articleid))
        storage.addDocument(articleid, text)
    result = clusterer.clusterize(2, 20, repeats=50)
    cluster1 = set(result[0])
    missed = min(len(cluster1 - set(government_ids)),
                 len(cluster1 - set(mystery_ids)))
    self.failUnless(missed < 2)
def iterate_brown_corpus():
    print("parsing brown corpus")
    for fileid in brown.fileids():
        print("parse - brown: " + fileid)
        sentence = ["^"]
        for sent in brown.sents(fileid):
            for word in sent:
                legal_form = legal_word.search(word.lower())
                if (legal_form):
                    sentence.append(legal_form.group())
            sentence.append("$")
            parseSentence(sentence)
            sentence = ["^"]
def testSet():
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])

    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
def build_all_brown(subset_size=None):
    documents = []
    categories = []
    all_categories = set()
    try:
        fileids = brown.fileids()
        for fileid in fileids:
            if subset_size:
                if len(all_categories) > subset_size:
                    break
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]
            documents.append(words)
            categories.append(category)
            all_categories.add(category)
        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]
        documents = [" ".join(d) for d in documents]
    except LookupError:
        """ we don't have the Brown corpus via nltk on this machine """
        try:
            with open("brown_docs_cats.pickle") as f:
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")
    # documents = [' '.join(d) for d in documents]
    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model
    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """
    return documents, categories
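# Hedged usage sketch: feeding the output of build_all_brown into a bag-of-words vectorizer.
# CountVectorizer is only an illustration here; the snippet above deliberately leaves
# vectorization to the caller (see its commented-out block).
from sklearn.feature_extraction.text import CountVectorizer

documents, categories = build_all_brown(subset_size=3)
X = CountVectorizer().fit_transform(documents)
print(X.shape, sorted(set(categories)))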
def main():
    # store word lengths
    brown_word_lens = []
    web_word_lens = []
    inaugural_word_lens = []
    gutenberg_word_lens = []
    genesis_word_lens = []

    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            gutenberg_word_lens.append(len(word))

    for file in brown.fileids():
        for word in brown.words(file):
            brown_word_lens.append(len(word))

    for file in webtext.fileids():
        for word in webtext.words(file):
            web_word_lens.append(len(word))

    for file in inaugural.fileids():
        for word in inaugural.words(file):
            inaugural_word_lens.append(len(word))

    for file in genesis.fileids():
        for word in genesis.words(file):
            genesis_word_lens.append(len(word))

    with open("wordlens.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens),
                            len(web_word_lens), len(brown_word_lens),
                            len(gutenberg_word_lens))):
            for corpus in [genesis_word_lens, inaugural_word_lens,
                           web_word_lens, brown_word_lens, gutenberg_word_lens]:
                if (i >= len(corpus)):
                    f.write(",")
                else:
                    f.write(str(corpus[i]) + ",")
            f.write("\n")
def categorize_cat(category):
    print "categorizing: " + category
    total = 0
    count = 0
    cos_correct = 0
    oop_correct = 0
    shared = 0
    cos_false_negative = []
    oop_false_negative = []
    for text in brown.fileids(category)[10:]:
        # only compare the first five files after the training set
        if count >= COMPARISON_SET:
            count = 0
            break
        # indicate that we have compared one more file
        count += 1
        total += 1
        # categorize this text
        cos, oop = categorize(text)
        # check to see if the cos distance or out of place distance categorized
        # it correctly
        if cos == category:
            # cos distance correctly classified
            cos_correct += 1
        else:
            # cos distance incorrectly classified
            cos_false_negative.append(cos)
        if oop == category:
            # out of place distance correctly classified
            oop_correct += 1
            if cos == category:
                # both classified correctly
                shared += 1
        else:
            # out of place distance incorrectly classified
            oop_false_negative.append(oop)
    return (category, cos_correct, oop_correct, total,
            cos_false_negative, oop_false_negative)
from nltk.corpus import brown
from nltk.text import TextCollection

# the default initialization for a corpus fails
# since files() was changed to fileids()
# anyway, brown_collection = TextCollection(brown) doesn't work
# so we use this workaround
words = [brown.words(f) for f in brown.fileids()]
brown_collection = TextCollection(words)
from pprint import pprint

train_dict = nltk.defaultdict(list)
test_dict = nltk.defaultdict(list)


def FDtoDIC(fd):
    out_dict = nltk.defaultdict(float)
    for key in fd.keys():
        out_dict[key] = fd[key]
    out_dict['N'] = fd.N()
    return out_dict


for category in set(brown.categories()).\
        difference(set(['humor', 'science_fiction'])):
    cat_files = brown.fileids(categories=category)
    random.shuffle(cat_files)
    size = int(len(cat_files) * 0.85)
    train, test = cat_files[:size], cat_files[size:]
    key_list = []
    for f in train:
        temp = brown.open(f).read().split()
        temp = [entry.split('/')[0] for entry in temp]
        temp = [entry for entry in temp if entry
                not in stopwords.words('english')]
        train_dict[category].append(FDtoDIC(nltk.FreqDist(temp)))
        key_list.extend(train_dict[category][-1].keys())
    # compute the average sample for the given category
    key_list = set(key_list)
    cat_avg_dict = {}
    for word in key_list:
        if (isMatch):
            table[word][matchCount] += 1
        else:
            table[word][nonmatchCount] += 1
    else:
        table.append(word, category, matchCount, nonMatchCount)


import nltk.classify
import nltk.cluster
import nltk.corpus
import random
from nltk.corpus import brown

documents = [(list(brown.words(fileid)), category)
             for category in brown.categories()
             for fileid in brown.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in brown.words())
word_features = all_words.keys()[:2000]


def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


def classify(documents, words):
    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
import TdMat
reload(TdMat)
from nltk.corpus import brown

for f in brown.fileids():
    docs = TdMat.process_sample(f)
    print f
    for doc in docs:
        TdMat.tdm.add_doc(doc)

import re
class_num = []
for f in brown.fileids():
    docs = TdMat.process_sample(f)
    ch = re.findall(r'c([a-r])\d\d', f)[0]
    for doc in docs:
        class_num.append(ord(ch) - 96)
print(""" ---------------------------------------------------------------------- 3 Evaluation 3.1 The Test Set ---------------------------------------------------------------------- """) import random from nltk.corpus import brown tagged_sents = list(brown.tagged_sents(categories='news')) random.shuffle(tagged_sents) size = int(len(tagged_sents) * 0.1) train_sents, test_sents = tagged_sents[size:], tagged_sents[:size] file_ids = brown.fileids(categories='news') size = int(len(file_ids) * 0.1) train_sents = brown.tagged_sents(file_ids[size:]) test_sents = brown.tagged_sents(file_ids[:size]) train_sents = brown.tagged_sents(categories='news') test_sents = brown.tagged_sents(categories='fiction') print("-" * 40) print(""" ---------------------------------------------------------------------- 3.2 Accuracy ---------------------------------------------------------------------- """) train_set = [({'word': w}, t) for sent in train_sents for (w, t) in sent] test_set = [({'word': w}, t) for sent in test_sents for (w, t) in sent]
#!/usr/bin/python3
# coding: utf-8
# Brown Corpus: the Brown Corpus of Standard American English is considered the first general-purpose
# English corpus usable in computational linguistics. It contains one million words of American English
# text published in 1961, sampled from fiction, news and religious texts as a cross-section of general
# English; a part-of-speech-tagged version was produced later after extensive manual annotation.
from nltk.corpus import brown
print(len(brown.fileids()))  # 500 documents
print(brown.fileids()[:5])  # ['ca01', 'ca02', 'ca03', 'ca04', 'ca05']
print(len(brown.words()))  # 1161192 words in total
print(brown.words()[:5])  # ['The', 'Fulton', 'County', 'Grand', 'Jury']; the first 5 words
print(len(brown.words('ca01')))  # 2242; a single document is fairly short
##################################################################
## Tagged data
print(brown.tagged_words()[:3])  # [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]; tags of the first 3 words
##################################################################
## categories
print(len(brown.categories()))  # 15 categories
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
print(len(brown.words(categories='news')))  # 100554 words in a single category
print(len(brown.sents(categories=['news', 'editorial', 'reviews'])))  # 9371
# brown contains both tagged and untagged data
print(len(brown.words()))  # 1161192
print(len(brown.words(categories=brown.categories())))  # 1161192; every word belongs to some category
##################################################################
## Paths
print(brown.abspath('ca01'))  # /home/coder352/nltk_data/corpora/brown/ca01
print(brown.abspaths())  # paths of all documents
##################################################################
## Types
print(type(brown))  # <class 'nltk.corpus.reader.tagged.CategorizedTaggedCorpusReader'>
print(type(brown.words()))  # <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
print(type(brown.words('ca01')))  # <class 'nltk.corpus.reader.tagged.TaggedCorpusView'>
from nltk.corpus import brown
import numpy

feature_dictionary = []
feature_map = {}
for word in brown.words():
    word = word.lower()
    if word not in feature_map:
        feature_map[word] = len(feature_dictionary)
        feature_dictionary.append(word)

frequency_matrix = numpy.zeros((len(feature_dictionary), len(brown.fileids())), dtype=numpy.uint32)
for document_index, document in enumerate(brown.fileids()):
    for word in brown.words(document):
        word = word.lower()
        frequency_matrix[feature_map[word], document_index] += 1

with open("/tmp/feature-dictionary.scidb", "w") as file:
    file.write("{0}[\n")
    for feature in feature_dictionary[:-1]:
        file.write('("%s"),\n' % feature)
    for feature in feature_dictionary[-1:]:
        file.write('("%s")\n' % feature)
    file.write("]\n")

with open("/tmp/frequency-matrix.csv", "w") as file:
    for i in range(frequency_matrix.shape[0]):
        for j in range(frequency_matrix.shape[1]):
            file.write("%s,%s,%s\n" % (i, j, frequency_matrix[i, j]))
def init_kwargs(cls, root=None, fileids=None):
    return dict(
        root=brown.root if root is None else root,
        paths=brown.fileids() if fileids is None else fileids,
    )
def main():
    # store the per-file relative frequency of common words
    brown_common_freq = []
    web_common_freq = []
    inaugural_common_freq = []
    gutenberg_common_freq = []
    genesis_common_freq = []

    common = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
              "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
              "this", "but", "his", "by", "from", "they", "we", "say", "her",
              "she", "or", "an", "will", "my", "one", "all", "would", "there",
              "their", "what", "so", "up", "out", "if", "about", "who", "get",
              "which", "go", "me", "when", "make", "can", "like", "time", "no",
              "just", "him", "know", "take", "people", "into", "year", "your",
              "good", "some", "could", "them", "see", "other", "than", "then",
              "now", "look", "only", "come", "its", "over", "think", "also",
              "back", "after", "use", "two", "how", "our", "work", "first",
              "well", "way", "even", "new", "want", "because", "any", "these",
              "give", "day", "most", "us"]
    common.sort()

    for file in gutenberg.fileids():
        total_words = len(gutenberg.words(file))
        total_common = 0
        for word in gutenberg.words(file):
            if word.lower() in common:
                total_common += 1
        gutenberg_common_freq.append(float(total_common) / total_words)

    for file in brown.fileids():
        total_words = len(brown.words(file))
        total_common = 0
        for word in brown.words(file):
            if word.lower() in common:
                total_common += 1
        brown_common_freq.append(float(total_common) / total_words)

    for file in webtext.fileids():
        total_words = len(webtext.words(file))
        total_common = 0
        for word in webtext.words(file):
            if word.lower() in common:
                total_common += 1
        web_common_freq.append(float(total_common) / total_words)

    for file in inaugural.fileids():
        total_words = len(inaugural.words(file))
        total_common = 0
        for word in inaugural.words(file):
            if word.lower() in common:
                total_common += 1
        inaugural_common_freq.append(float(total_common) / total_words)

    for file in genesis.fileids():
        total_words = len(genesis.words(file))
        total_common = 0
        for word in genesis.words(file):
            if word.lower() in common:
                total_common += 1
        genesis_common_freq.append(float(total_common) / total_words)

    with open("common-words.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                            len(web_common_freq), len(brown_common_freq),
                            len(gutenberg_common_freq))):
            for corpus in [genesis_common_freq, inaugural_common_freq,
                           web_common_freq, brown_common_freq, gutenberg_common_freq]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(round(corpus[i], 5)) + ",")
            f.write("\n")
def brownPart(part):
    """Return one section of the Brown corpus as a list of sentences."""
    return brown.sents([f for f in brown.fileids() if f.startswith('c' + part)])
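# Hedged usage sketch for brownPart: Brown file ids have the form c<section><number>,
# e.g. 'ca01' for the press/reportage section, so 'a' selects the news files.
news_sents = brownPart('a')
print(len(news_sents), news_sents[0][:8])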
# identify high-frequency words for use as features
# (def line reconstructed; signature inferred from the getFeatureWords(5000, allwords) call below)
def getFeatureWords(maxWordCount, words):
    featurewords = words
    freqThreshold = 5
    while len(featurewords) > maxWordCount:
        featurewords = [word for word in featurewords if words[word] > freqThreshold]
        freqThreshold += 1
    return featurewords


# create dict with boolean values for existence of words in a document
def getDocFeatures(doc, words):
    features = {}
    for word in words:
        features[word] = (word in doc)
    return features


# import data into words, category pairs
docs = [(list(brown.words(fileid)), category)
        for category in brown.categories()
        for fileid in brown.fileids(category)]

# identify list of words to be used as features
allwords = nltk.FreqDist([word.lower() for word in brown.words()])
featurewords = getFeatureWords(5000, allwords)
# filter for stopwords
# featurewords = [word for word in featurewords if word not in set(stopwords.words('english'))]

# create category, featureset pairs
docfeatures = [(getDocFeatures(doc, featurewords), category) for (doc, category) in docs]

# Break into training and test sets
random.shuffle(docfeatures)
train, test = docfeatures[:400], docfeatures[400:]