def tfidf(word, wordCount):
    docCount = len(reuters.fileids())
    wordCountCorpus = 0
    count = 0
    for doc in reuters.fileids():
        count = count + 1
        present = 0
        for word2 in reuters.words(doc):
            if word.lower() == word2.lower():
                present = 1
                break
        if present == 1:
            wordCountCorpus = wordCountCorpus + 1  # was `==`, which compared instead of incrementing
        if count == 200:  # only sample the first 200 documents for the document frequency
            break
    tf = wordCount
    idf = math.log(docCount / (1 + wordCountCorpus))
    return tf * idf
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`. E.g.:

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total), file=log)
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
def __init__(self, categories=None, lower=True):
    if categories is None or len(categories) == 1:
        self.fileids = reuters.fileids()
    else:
        self.fileids = reuters.fileids(categories)
    self.categories = categories
    self.lower = lower
def print_reuters():
    from nltk.corpus import reuters
    # print reuters.fileids()
    # print reuters.categories()
    print reuters.categories('training/9865')
    print reuters.categories(['training/9865', 'training/9880'])
    print reuters.fileids('barley')
    print reuters.fileids(['barley', 'corn'])  # was 'barely', which is not a Reuters category
def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                # only consider pairs of categories whose document sets do not overlap
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    # the original wrapped the whole comparison in float(), which made the test always truthy
                    if ((l1 + l2) > min_len) and ((l1 + l2) < max_len) and (float(min(l1, l2)) / float(l1 + l2) > percentage):
                        print cat, cat2, l1 + l2, float(min(l1, l2)) / float(l1 + l2)
def generateTextList(category, size, normalize=False):
    i = 0
    text = []
    while i < size and i < len(reuters.fileids(category)):
        if not normalize:
            text.insert(i, reuters.words(reuters.fileids(category)[i]))
        else:
            text.insert(i, getNormalizedText(reuters.words(reuters.fileids(category)[i])))
        i += 1
    return text
def get_reuters_ids_cnt(num_doc=100, max_voca=10000, remove_top_n=5):
    """To get test data for training a model

    reuters, stopwords, english words corpora should be installed in nltk_data: nltk.download()

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    max_voca: int
        maximum number of vocabulary size for the returned corpus
    remove_top_n: int
        remove top n frequently used words

    Returns
    -------
    voca_list: ndarray
        list of vocabulary used to construct a corpus
    doc_ids: list
        list of list of word id for each document
    doc_cnt: list
        list of list of word count for each document
    """
    file_list = reuters.fileids()
    corpus = [reuters.words(file_list[i]) for i in xrange(num_doc)]

    return get_ids_cnt(corpus, max_voca, remove_top_n)
def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f[:5] == 'test/']
    trainset = [f for f in clean_files if f[:9] == 'training/']
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
def run():
    """Import the Reuters Corpus which contains 10,788 news articles"""
    from nltk.corpus import reuters
    raw_docs = [reuters.raw(fileid) for fileid in reuters.fileids()]

    # Select 100 documents randomly
    rand_idx = random.sample(range(len(raw_docs)), 100)
    raw_docs = [raw_docs[i] for i in rand_idx]

    # Preprocess documents
    tokenized_docs = [ie_preprocess(doc) for doc in raw_docs]

    # Remove single-occurrence words
    docs = remove_infrequent_words(tokenized_docs)

    # Create dictionary and corpus
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Build LDA model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    for topic in lda.show_topics():
        print topic
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()

    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])

    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
def preProcess(): print 'PreProcess Reuters Corpus' start_time = time.time() docs = 0 bad = 0 tokenizer = Tokenizer() if not os.path.isdir(Paths.base): os.makedirs(Paths.base) with open(Paths.text_index, 'w') as fileid_out: with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out: with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test: for f in reuters.fileids(): contents = reuters.open(f).read() try: tokens = tokenizer.tokenize(contents) docs += 1 if docs % 1000 == 0: print "Normalised %d documents" % (docs) out.write(' '.join(tokens) + "\n") # if f.startswith("train"): # # else: # test.write(' '.join(tokens) + "\n") fileid_out.write(f + "\n") except UnicodeDecodeError: bad += 1 print "Normalised %d documents" % (docs) print "Skipped %d bad documents" % (bad) print 'Finished building train file ' + Paths.texts_clean end_time = time.time() print '(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time)
def create_token_stream():
    """
    A function that creates a token stream based on the nltk reuters corpus.
    A token stream is a list of tuples mapping terms to docIDs.
    """
    token_stream = []
    docID = 1
    termID = 1
    print "Creating token stream..."
    for fileid in reuters.fileids('barley'):
        for term in reuters.words(fileid):
            # Strip punctuation from the word and make lower case
            term = remove_punct_from_word(term).lower()
            # Check to make sure word is not "" and term is not a number
            if len(term) > 0 and not is_number(term):
                stemmed_term = stem().stem_word(term)
                if stemmed_term not in terms:
                    terms[stemmed_term] = termID
                    termID += 1
                new_token = (terms[stemmed_term], docID)
                token_stream.append(new_token)
        # Add to docs dictionary mapping docID to file
        docs[docID] = fileid
        docID += 1
    return token_stream
def get_reuters_cnt_ids(num_doc=100, max_voca=10000):
    ''' to get test data for training a model
    reuters corpus should be installed in nltk_data: nltk.download()
    '''
    file_list = reuters.fileids()
    docs = list()
    freq = Counter()

    for i in range(num_doc):
        doc = reuters.words(file_list[i])
        freq.update(doc)
        docs.append(doc)

    voca = [key for key, val in freq.most_common(max_voca)]

    voca_dic = dict()
    voca_list = list()
    for word in voca:
        voca_dic[word] = len(voca_dic)
        voca_list.append(word)

    doc_ids = list()
    doc_cnt = list()

    for doc in docs:
        words = set(doc)
        ids = np.array([int(voca_dic[word]) for word in words if word in voca_dic])
        cnt = np.array([int(doc.count(word)) for word in words if word in voca_dic])
        doc_ids.append(ids)
        doc_cnt.append(cnt)

    return np.array(voca_list), doc_ids, doc_cnt
def create_dictionary_index_reuters():
    """
    A function that creates a dictionary with terms as keys and postings
    lists as values for the nltk reuters corpus
    """
    idx = {}
    docs = {}
    docID = 1
    for fileid in reuters.fileids():
        for word in reuters.words(fileid):
            if not is_number(word):
                # Strip punctuation from the word and make lower case
                word = remove_punct_from_word(word).lower()
                # Check to make sure word is not ""
                if len(word) > 0:
                    # Check to see if word already is in index
                    if word in idx:
                        # Check to see if docID is not already present for word
                        if docID not in idx[word]:
                            idx[word].append(docID)
                    # Otherwise add word and docID in array to index
                    else:
                        idx[word] = []
                        idx[word].append(docID)
        # Add to docs dictionary mapping docID to file
        docs[docID] = fileid
        docID += 1

    size = 0
    for k in idx.iterkeys():
        size += os.sys.getsizeof(k)
        size += os.sys.getsizeof(idx[k])
    # print "size of original dictionary is:", size
    return idx
def train(self):
    training_set = []
    print self.activeCategories
    for category in self.activeCategories:
        files = reuters.fileids(category)
        for fi in files:
            training_set.append((self.getFileFeatures(fi), category))
    self.classifier = nltk.NaiveBayesClassifier.train(training_set)
def __init__(self):
    # Generate training set from sample of Reuters corpus
    train_docs = [(self.bag_of_words(reuters.words(fileid)), category)
                  for category in reuters.categories()
                  for fileid in reuters.fileids(category)
                  if fileid.startswith("train")]
    # Create a classifier from the training data
    self.classifier = NaiveBayesClassifier.train(train_docs)
def __init__(self, dataset=''):
    """
    Docs in reuters corpus are identified by ids like "training|test/xxxx".

    :param dataset: filter for ids
    """
    self.dataset = dataset  # filter docs
    self.categories = {c: n for n, c in enumerate(reuters.categories())}  # map class with int
    self.docs = {d: n for n, d in enumerate(reuters.fileids())}  # map docs with int
    self.category_mask = []  # mask nth doc with its ith class
def loadReutersCorpus(self):
    '''
    Load the Reuters datasets.
    Note: need to install nltk data beforehand, using nltk.download() for the first run
    '''
    self.fileIds = reuters.fileids()
    # Use only the training set corpus - documents located in the reuters/training folder.
    self.fileIds = [x for x in self.fileIds if x.startswith('training')]
    return
def topic_mod5():  # For fetching words from topics
    parser = OptionParser()
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=50)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="seed", type="int", help="random seed", default=None)
    parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100)
    (options, args) = parser.parse_args()
    random.seed(options.seed)
    numpy.random.seed(options.seed)

    idlist = random.sample(reuters.fileids(), options.samplesize)

    labels = []
    corpus = []
    labelset = []
    result_set = []
    cur.execute("SELECT distinct topic from topic_words where category = 'Shoes'")
    for row in cur:
        topicset = []
        topicset.append(row[0])
        labels.append(topicset)
        labelset.append(row[0])
        wordlist = []
        cur1 = conn.cursor()
        cur1.execute("SELECT word from topic_words where category = 'Shoes' and topic = '" + row[0] + "' order by weight desc")
        for word in cur1:
            wordlist.append(word[0])
        print wordlist
        corpus.append([x.lower() for x in wordlist if x != '' and (x[0] in string.ascii_letters)])

    llda = LLDA(options.K, options.alpha, options.beta)
    llda.set_corpus(labelset, corpus, labels)
    # print "M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K)

    for i in range(options.iteration):
        sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
        llda.inference()
    print "perplexity : %.4f" % llda.perplexity()

    phi = llda.phi()
    cur.execute("delete from topic_words where category = '" + cats + "'")
    for k, label in enumerate(labelset):
        print "\n-- label %d : %s" % (k, label)
        if label != "common":
            for w in numpy.argsort(-phi[k])[:20]:
                print "%s: %.4f" % (llda.vocas[w], phi[k, w])
                # interpolate the weight value; the original left the literal text "phi[k,w]" inside the SQL string
                cur.execute("INSERT INTO topic_words (category, topic, word, weight) VALUES ('Shoes', '" + label + "','" + llda.vocas[w] + "'," + str(phi[k, w]) + ") ON DUPLICATE KEY UPDATE weight=VALUES(weight)")

    cur.execute("delete from topic_detail where category = '" + cats + "'")
    cur.execute("delete from topic_date_detail where category = '" + cats + "'")
    cur.execute("Insert into topic_detail select t1.topic, t1.pos_count, t2.neg_Count, (t1.pos_count+t2.neg_Count) total_count,'Shoes' from (select a.topic, count(b.word) pos_count from topic_words a, comment_words b where a.word = b.word and b.score >3 group by a.topic)t1,(select a.topic, count(b.word) neg_count from topic_words a, comment_words b where a.word = b.word and b.score < 3 group by a.topic) t2 where t1.topic = t2.topic")
    cur.execute("insert into topic_date_detail select t1.topic, t1.c_date, t1.pos_count, t1.pos_words, t2.neg_count, t2.neg_words,'Shoes' from (select a.topic, b.c_date, count(a.word) pos_count, GROUP_CONCAT(distinct(b.word)) pos_words from topic_words a, comment_words b where a.word = b.word and b.score >3 group by a.topic,b.c_date )t1, (select a.topic, b.c_date, count(a.word) neg_count, GROUP_CONCAT(distinct(b.word)) neg_words from topic_words a, comment_words b where a.word = b.word and b.score < 3 group by a.topic,b.c_date )t2 where t1.topic = t2.topic and t1.c_date= t2.c_date")
def get_list_fileids(corpus):
    li = []
    if corpus == 'mr':
        from nltk.corpus import movie_reviews
        li = movie_reviews.fileids()
    else:
        from nltk.corpus import reuters
        li = reuters.fileids()
    return li
def __iter__(self):
    """
    Generator of docs while collecting ordered structured info.
    """
    for n, reutersid in enumerate(reuters.fileids()):  # 'training|test/xxxx'
        dataset, _ = reutersid.split('/')  # extract dataset
        if self.dataset in dataset:  # yield only filtered dataset
            if self.categories is not None:
                top_category = reuters.categories(reutersid)[0]  # grab first category only
                self.category_mask.append(self.categories[top_category])  # n-th doc -> classid
            yield reuters.raw(reutersid)  # return raw document
def get_reuters_token_list_by_sentence(num_doc=100): """ Get a test data from reuters corpus. Stopwords will be included to see how HMM_LDA works with these stopwords. Parameters ---------- num_doc: int number of documents to be returned max_voca maximum number of vocabulary size for the returned corpus Returns ------- voca: ndarray vocabulary corpus: list nested list of """ file_list = reuters.fileids() corpus = [reuters.sents(file_list[i]) for i in xrange(num_doc)] valid_voca = set(w.lower() for w in nltk.corpus.words.words()) stop = stopwords.words('english') valid_voca = valid_voca.union(stop) tmp_corpus = list() voca_dic = dict() voca = list() for doc in corpus: tmp_doc = list() for sent in doc: tmp_sent = list() for word in sent: if word in valid_voca: tmp_sent.append(word) if word not in voca_dic: voca_dic[word] = len(voca_dic) voca.append(word) if len(tmp_sent) > 0: tmp_doc.append(tmp_sent) if len(tmp_doc) > 0: tmp_corpus.append(tmp_doc) # convert token list to word index list corpus = list() for doc in tmp_corpus: new_doc = list() for sent in doc: new_sent = list() for word in sent: new_sent.append(voca_dic[word]) new_doc.append(new_sent) corpus.append(new_doc) return np.array(voca), corpus
def get_test_set():
    single_categories = [(id, re.categories(id)[0]) for id in re.fileids()
                         if len(re.categories(id)) == 1]
    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0] for x in single_cat_list if x[1] < 600 and x[1] > 200]
    return [pair for pair in single_categories if pair[1] in used_categories]
def reuters_train_test_feats(feature_detector=bag_of_words):
    train_feats = []
    test_feats = []
    for fileid in reuters.fileids():
        if fileid.startswith('training'):
            featlist = train_feats
        else:  # fileid.startswith('test')
            featlist = test_feats
        feats = feature_detector(reuters.words(fileid))
        labels = reuters.categories(fileid)
        featlist.append((feats, labels))
    return train_feats, test_feats
def build_TFIDF_model(self):
    """ Build term-document matrix containing TF-IDF score for each word in each
        document in the Reuters corpus (via NLTK).
    """
    token_dict = {}
    for article in reuters.fileids():
        token_dict[article] = reuters.raw(article)

    # Use TF-IDF to determine frequency of each word in our article, relative to the
    # word frequency distributions in corpus of 11k Reuters news articles.
    self._tfidf = TfidfVectorizer(tokenizer=self.tokenize_and_stem,
                                  stop_words='english', decode_error='ignore')
    tdm = self._tfidf.fit_transform(token_dict.values())  # Term-document matrix
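A minimal usage sketch for the vectorizer fitted above: the method name top_terms is illustrative and not part of the original class; only TfidfVectorizer.transform and get_feature_names_out are standard scikit-learn API.

def top_terms(self, article_text, n=10):
    # Transform a new article with the already-fitted vectorizer, so it shares the
    # vocabulary and IDF weights learned from the Reuters corpus above.
    vec = self._tfidf.transform([article_text])      # 1 x |vocabulary| sparse row
    terms = self._tfidf.get_feature_names_out()
    scores = vec.toarray()[0]
    # Return the n highest-weighted terms for this article
    return sorted(zip(terms, scores), key=lambda t: t[1], reverse=True)[:n]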
def main():
    train_docs = []
    test_docs = []

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))

    representer = tf_idf(train_docs)

    for doc in test_docs:
        print(feature_values(doc, representer))
def batchRead():  # STUB
    files = reuters.fileids()
    tags = ['training', 'test']
    training_files = []
    for file_name in files:
        entry = file_name.strip().split('/', 2)
        if len(entry) != 2:
            continue
        element, value = entry[0], entry[1]
        if element == 'training':
            training_files.append(file_name)
    return training_files
def gen_financial_top_words(maxN=40000):  # generate corpus based on Reuters news
    if not os.path.isfile('./input/topWords.json'):
        wordCnt = {}
        for field in reuters.fileids():
            for word in reuters.words(field):
                word = unify_word(word)
                if word in nltk.corpus.stopwords.words('english'):
                    continue
                wordCnt[word] = wordCnt.get(word, 0) + 1

        sorted_wordCnt = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
        wordCnt = {}  # reset wordCnt
        for i in sorted_wordCnt[:maxN]:
            wordCnt[i[0]] = i[1]  # convert list to dict

        with open('./input/topWords.json', 'w') as fout:
            json.dump(wordCnt, fout, indent=4)
    else:
        return
def batchReadReuters(collection, categories):  # STUB
    files = reuters.fileids(categories)
    # print reuters.categories()
    tags = ['training', 'test']
    training_files = []
    for file_name in files:
        entry = file_name.strip().split('/', 2)
        if len(entry) != 2:
            continue
        element, value = entry[0], entry[1]
        if element == collection:
            training_files.append(file_name)
    return training_files
from nltk.corpus import reuters

print(reuters.fileids())
print(reuters.categories())
print(reuters.categories('training/9865'))
print(reuters.categories(['training/9865', 'training/9880']))
print(reuters.fileids('barley'))
print(reuters.fileids(['barley', 'corn']))
print(reuters.words('training/9865')[:14])
print(reuters.words(['training/9865', 'training/9880']))
from nltk.corpus import reuters  # download: nltk.download("reuters")

files = reuters.fileids()
print(files[:5])
# Generate (genre, word) frequency pairs.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
# Now draw the table with those pairs.
cfd.tabulate(conditions=genres, samples=modals)

# Reuters is a corpus with millions of words.
from nltk.corpus import reuters
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])

"""
1.5
"""
from nltk.corpus import inaugural
inaugural.fileids()
[fileid[:4] for fileid in inaugural.fileids()]
import os
import re
from string import punctuation, whitespace

import html
import nltk
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import reuters as rt
from nltk.corpus import stopwords as st
from stop_words import get_stop_words

LOT_OF_STOPWORDS = frozenset(list(STOPWORDS) + get_stop_words('en') + st.words('english'))

TRAINING_SET = list(filter(lambda x: x.startswith('train'), rt.fileids()))
TESTING_SET = list(filter(lambda x: x.startswith('test'), rt.fileids()))

INPUTS_DIR = os.environ.get('INPUTS_DIR', 'inputs')
OUTPUTS_DIR = os.environ.get('OUTPUTS_DIR', 'outputs')

WHITE_PUNC_REGEX = re.compile(r"[%s]+" % re.escape(whitespace + punctuation), re.UNICODE)

lemma = nltk.wordnet.WordNetLemmatizer()


def preprocess_document(document_text):
    """
    1.) Lowercase it all
    2.) Remove HTML Entities
    3.) Split by punctuations to remove them.
def load_data(self, data_holder):
    docs = [list(data_holder.words(doc_fileid)) for doc_fileid in data_holder.fileids()]
    return docs
#script to perform CV on RF for Movie and select max_features 500 for 500 partition choices #output: results (1000 by #parameter options pickle) ############################## numberoffolds=5 #set seeds for reproducibility random.seed(1234) rs=1234 #choose number of folds numberoffolds=5 #data preparation #select only articles about wheat and/or corn wheat_docs = reuters.fileids("wheat") corn_docs = reuters.fileids("corn") wheat_train = list(filter(lambda doc: doc.startswith("train"),wheat_docs)) wheat_test = list(filter(lambda doc: doc.startswith("test"),wheat_docs)) corn_train = list(filter(lambda doc: doc.startswith("train"),corn_docs)) corn_test = list(filter(lambda doc: doc.startswith("test"),corn_docs)) training_index = wheat_train +wheat_test+ corn_train+corn_test #prepare data for wheat vs not wheat case text=[] clas = [] classname = ["pos", "neg"] for i in training_index: text.append(reuters.raw(i)) #check categorisation to make response if "wheat" in reuters.categories(i):
import nltk, textwrap import numpy as np #nltk.download('reuters') from nltk.corpus import reuters ### Get all reuters training articles fIDs = np.array(reuters.fileids()) trainIdx = np.where(np.array([t[:4] for t in fIDs]) == 'trai')[0] nCatgs = np.empty(len(trainIdx), dtype=int) catgs = np.empty(len(trainIdx), dtype=object) # pre-allocate texts = np.empty(len(trainIdx), dtype=object) # pre-allocate outNames = np.empty(len(trainIdx), dtype='<S16') # pre-allocate for n, idx in enumerate(trainIdx): name = fIDs[idx] cat = reuters.categories(name) if cat: # if category is not empty tmp = name.split('/') outNames[n] = tmp[0] + '-' + tmp[1].zfill(5) nCatgs[n] = len(reuters.categories(name)) catgs[n] = ','.join(reuters.categories(name)) texts[n] = ' '.join(reuters.words(name)) # trimming articles without categories toTrim = np.invert(np.equal(catgs, None)) catgs = catgs[toTrim] texts = texts[toTrim] nCatgs = nCatgs[toTrim] outNames = outNames[toTrim] for n in range(len(outNames)):
import requests
from nltk.corpus import reuters

n_insertions = 1000
url = "http://0.0.0.0:8000/api/store_doc/submit_doc"

for index, i in enumerate(reuters.fileids()):
    doc = reuters.raw(fileids=[i])
    doc_name = "doc" + str(index)
    insert_dict = {doc_name: doc}
    r = requests.post(url=url, json=insert_dict)
    print(doc_name)
    print(r)
    if index == n_insertions:
        break

print("Inserted {} documents".format(n_insertions))
if x not in unique_list: unique_list.append(x.lower()) # print list return unique_list doclist = [] category_docs = [] categories = reuters.categories()[:5] # 5 categories print(categories) category_length = len(categories) print("Category length :",category_length) # slice_index = 1200//category_length # print(slice_index) category_lengths = [] for category in categories : category_docs = category_docs + reuters.fileids(category) test_docs = [] train_docs = [] for doc_id in category_docs: if doc_id.startswith("train"): train_docs.append(doc_id) else: test_docs.append(doc_id) train_docs = train_docs[:300] test_docs = test_docs[:700] category_docs = train_docs + test_docs #30 percent training docs and 70 percent test # print(category_docs) print("Length of category documents:",len(category_docs)) allwords = [] docnames = [] for doc in category_docs:
# -*- coding: utf-8 -*- """ Created on Tue Sep 12 14:46:28 2017 @author: Ritesh """ import string from nltk.corpus import reuters from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures from nltk.corpus import stopwords grain_tok = [reuters.words(f) for f in reuters.fileids('grain')] trade_tok = [reuters.words(f) for f in reuters.fileids('trade')] words = [w.lower() for f in grain_tok for w in f] words bcf = BigramCollocationFinder.from_words(words) top100 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20) top = [(t1, t2) for (t1, t2) in top100 if t1 not in string.punctuation and t2 not in string.punctuation] stopset = set(stopwords.words('english')) filter_stops = lambda w: len(w) < 3 or w in stopset bcf.apply_word_filter(filter_stops) bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10) bcf.nbest(BigramAssocMeasures.chi_sq, 10) bcf.nbest(BigramAssocMeasures.jaccard, 10) bcf.nbest(BigramAssocMeasures.mi_like, 10) bcf.nbest(BigramAssocMeasures.pmi, 10) bcf.nbest(BigramAssocMeasures.raw_freq, 10)
NLTK corpus readers. The modules in this package provide functions that can be
used to read corpus files in a variety of formats. These functions can be used
both for corpus files distributed in the NLTK corpus package and for corpus
files that are part of external corpora.

Common Structures for Text Corpora:
The simplest kind of corpus is a collection of isolated texts with no particular
organization; some corpora are structured into categories like genre (Brown Corpus);
some categorizations overlap, such as topic categories (Reuters Corpus); other
corpora represent language use over time (Inaugural Address Corpus).

1- Gutenberg Corpus
2- Web and Chat Text
3- Brown Corpus
4- Reuters Corpus
5- Inaugural Address Corpus
6- Annotated Text Corpora
"""

from nltk.corpus import gutenberg, webtext, brown, reuters, inaugural

print "Gutenberg FileIds :", gutenberg.fileids()
print "Webtext FileIds :", webtext.fileids()
print "Brown FileIds :", brown.fileids()
print "Brown Categories :", brown.categories()
print "Reuters FileIds :", reuters.fileids()
print "Reuters Categories :", reuters.categories()
print "Inaugural FileIds :", inaugural.fileids()
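The overlapping categorization mentioned above is easy to verify: a single Reuters article can carry several topic labels at once. A minimal sketch (Python 3 print syntax; the variable name multi_label is illustrative, not from the original):

from nltk.corpus import reuters

# Articles in the Reuters corpus may belong to several topics at once.
multi_label = [f for f in reuters.fileids() if len(reuters.categories(f)) > 1]
print(len(multi_label), "articles carry more than one topic label")
print(multi_label[0], "->", reuters.categories(multi_label[0]))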
        ticker = get_ticker(NE[0])
        if ticker is not None:
            tickers.append(ticker)
    return tickers


def get_only_text(url):
    page = urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page, "html.parser")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return soup.title.text, text


found_stocks = {}  # {ticker: num_of_occurrences}
article_fileids = [
    f for f in random.sample(reuters.fileids(), len(reuters.fileids()))[:1500]
]

i = 0
for f in article_fileids[:50]:
    i += 1
    if i % 5 == 0:
        print("Currently on:", str(i))
        print("found_stocks is")
        print(found_stocks)
        print()
    text = reuters.raw(f)
    tickers = find_all_stocks_mentioned(text)
    for ticker in tickers:
        if ticker is not None and ticker != "":  # was `is not ""`, which tests identity, not equality
            try:
                found_stocks[ticker] += 1
import text_clean as tc from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.decomposition import NMF, LatentDirichletAllocation from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier from sklearn.metrics import confusion_matrix, classification_report # create counter to summarize categories = [] file_count = [] # count each tag's number of documents for i in reuters.categories(): """print("$ There are {} documents included in topic \"{}\"" .format(len(reuters.fileids(i)), i))""" file_count.append(len(reuters.fileids(i))) categories.append(i) # create a dataframe out of the counts df = pd.DataFrame( {'categories': categories, "file_count": file_count}) \ .sort_values('file_count', ascending=False) # Select documents that only contains top two labels with most documents cat_start = 0 cat_end = cat_start + 1 category_filter = df.iloc[cat_start:cat_end + 1, 0].values.tolist() print(f"The following categories are selected for the analysis: \ {category_filter}") # select fileid with the category filter
                  default=100)
parser.add_option("-s", dest="seed", type="int", help="random seed", default=None)
parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100)
(options, args) = parser.parse_args()

random.seed(options.seed)
numpy.random.seed(options.seed)

# sample file ids with the stdlib random module; numpy.random.sample draws floats, not list samples
idlist = random.sample(reuters.fileids(), options.samplesize)

labels = []
corpus = []
for id in idlist:
    labels.append(reuters.categories(id))
    corpus.append([x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters])
    reuters.words(id).close()
labelset = list(set(reduce(list.__add__, labels)))

llda = LLDA(options.K, options.alpha, options.beta)
llda.set_corpus(labelset, corpus, labels)

print("M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K))
gutenberg.sents(fileid)

from nltk.tokenize import sent_tokenize
tok = sent_tokenize(text)
for x in range(5):
    print(tok[x])

from nltk.corpus import brown
brown.categories()
text = brown.raw(categories='news')

from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
text = reuters.raw(fileid)
reuters.categories(fileid)

from nltk.corpus import movie_reviews
movie_reviews.fileids()
movie_reviews.categories()
text = movie_reviews.raw(fileid)
movie_reviews.categories(fileid)

# Frequency distribution by creating our own corpus
def encode_doc(doc):
    # one-hot encode the first max_len - 1 characters of the document
    d = np.zeros((1, max_len - 1, n_chars), dtype=np.bool)
    for p, j in enumerate(doc.lower()[:max_len - 1]):  # slice must match the second array dimension
        d[0, p, char_to_idx[j]] = 1
    return d


# load or create the character encoding dictionaries
if os.path.exists(char_idx_path):
    with open(char_idx_path, 'rb') as f:
        logger.info('Loading character encodings from "%s"' % char_idx_path)
        idx_to_char = pickle.load(f)
        char_to_idx = pickle.load(f)
        cat_enc = pickle.load(f)
else:
    n_docs = len(reuters.fileids())
    cat_enc = dict((x, i + 1) for i, x in enumerate(set(reuters.categories())))
    chars = set()
    for fid in reuters.fileids():
        chars = chars.union(set(reuters.raw(fid).lower()))
    idx_to_char = dict((i, c) for i, c in enumerate(chars))
    char_to_idx = dict((c, i) for i, c in enumerate(chars))
    with open(char_idx_path, 'wb') as f:
        logger.info('Saving character encodings to "%s"' % char_idx_path)
        pickle.dump(idx_to_char, f)
        pickle.dump(char_to_idx, f)
        pickle.dump(cat_enc, f)
from os.path import isfile from utils import read_list from nltk.corpus import reuters from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS from nltk.stem.snowball import PorterStemmer from gensim.models import Word2Vec from gensim.utils import simple_preprocess import itertools # Fetch the dataset category_dict = {'acq':0, 'coffee':1, 'crude':2, 'earn':3, 'gold':4, 'interest':5, 'money-fx':6, 'ship':7, 'sugar':8, 'trade':9} data = [] target = [] docs = reuters.fileids() for doc in docs: # Check if the document is only related to 1 class and that class is in category_dict if len(reuters.categories(doc)) == 1 and reuters.categories(doc)[0] in category_dict: data.append(" ".join(reuters.words(doc))) # Text of the document target.append(category_dict[reuters.categories(doc)[0]]) # Index for the class print("Dataset REUTERS loaded...") # Pre-process the dataset print("Pre-processing the dataset...") stemmer = PorterStemmer() # Define the type of stemmer to use additional_stop_words = [] stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words) stop_words = set([stemmer.stem(word) for word in stop_words]) # Stem the stop words for larger detection processed_data = [] id_to_delete = []
def initModelPipeline(corpusName="reuters"): '''Create and save all the necessary models. Assumes we use brown.words(..) returns lda model ''' # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) base_path = os.path.join('./pretrained/topics/', corpusName) normalized_docs = [normalization.normalizeWords(reuters.words(fileid)) for fileid in reuters.fileids()] \ if corpusName == 'reuters' \ else [normalization.normalizeWords(brown.words(fileid)) for fileid in brown.fileids()] dictionary = corpora.Dictionary(normalized_docs) saveDictionary(dictionary, os.path.join(base_path, corpusName + 'Dict.dict')) corpus = ReutersCorpus( dictionary) if corpusName == 'reuters' else BrownCorpus(dictionary) saveCorpus(corpus, os.path.join(base_path, corpusName + '_bow_norm.mm')) tfidfModel = models.TfidfModel(corpus) saveModel(tfidfModel, os.path.join(base_path, corpusName + '_tfidf_model.tfidf')) tfidfCorpus = tfidfModel[corpus] saveCorpus(tfidfCorpus, os.path.join(base_path, corpusName + '_tfidf_corpus.mm')) ldaModel = models.LdaModel(tfidfCorpus, id2word=dictionary, num_topics=90, passes=20, iterations=400) saveModel(ldaModel, os.path.join(base_path, corpusName + '_lda_model.lda')) ldaCorpus = ldaModel[tfidfCorpus] saveCorpus(ldaCorpus, os.path.join(base_path, corpusName + '_lda_corpus.mm'))
doc_words.append(w.lower()) #build word_to_index dictionary of words if w not in word_to_index.keys(): word_to_index[w] = idx idx = idx + 1 #clean_doc = [l.lemmatize(l.lemmatize(s.lower(),pos='n'),pos='v') for s in clean_doc.split(' ') if s!='' and s not in remove_words] clean_doc_words.append(doc_words) corpus.close() val.close() test.close() #clean_docs.append(" ".join(doc_words)) return word_to_index doc_files = np.array(reuters.fileids()) test_mask = ['test' in doc for doc in doc_files] train_mask = np.array(test_mask, dtype=int) == 0 test_doc_files = doc_files[test_mask] num_docs = len(doc_files) num_train = np.sum(train_mask) num_test = np.sum(test_mask) all_docs = [" ".join(reuters.words(doc_files[i])) for i in range(num_docs)] train_docs = np.array(all_docs)[train_mask] #get stop words to remove from NLTK remove_words = set(stopwords.words('english')) print('building dict')
def download_data_as_text():
    all_articles = []
    for article in reuters.fileids():
        # append the raw text; `+=` with a string would extend the list character by character
        all_articles.append(reuters.raw(article))
    return all_articles
def print_time(start_time):
    tm = time.time() - start_time
    if tm > 100:
        return "{} minutes".format(tm / 60.0)
    else:
        return "{} seconds".format(tm)


# ---------------- Get Corpus --------------------------
start_time = time.time()
program_start = start_time

# Top 10
documents = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]

train_docs_id = list(
    filter(lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51, documents))
test_docs_id = list(
    filter(lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51, documents))

new_train_docs_id = []
new_test_docs_id = []

for cat in reuters.categories():
    li = [f for f in reuters.fileids(categories=cat) if f in train_docs_id]
    li_te = [f for f in reuters.fileids(categories=cat) if f in test_docs_id]
    if len(li) > 20 and len(li_te) > 20:
        new_train_docs_id.extend(li)
        new_test_docs_id.extend(li_te)
        article.nlp()
        titles.append(article.title)        # the title of the article
        texts.append(article.text)          # the entire text of the article
        summaries.append(article.summary)   # the summary of the article
        # print(article.keywords)           # the keywords of the article
        counter += 1
        if counter >= articles_examined:
            break
    except newspaper.article.ArticleException:
        continue

######################### PREPARE REUTERS CORPUS AND TRAIN CLASSIFIER #########################
preprocessed_corpus = []
for fid in reuters.fileids():
    preprocessed_corpus.append(preprocess_text(reuters.words(fid)))

# creating the bag of words model
# fit on the populated corpus; the original fit on an empty, never-populated
# `cleaned_preprocessed_corpus` list, which raises a ValueError
bag_of_words_creator = CountVectorizer()
bag_of_words = bag_of_words_creator.fit_transform(preprocessed_corpus)

# creating the tf-idf model
tfidf_creator = TfidfVectorizer(min_df=0.2)
tfidf = tfidf_creator.fit_transform(preprocessed_corpus)

documents = [(list(reuters.words(fileid)), category)
             for category in reuters.categories()
             for fileid in reuters.fileids(category)]
def __iter__(self):
    for fileid in reuters.fileids():
        yield self.dictionary.doc2bow(normalization.normalizeWords(reuters.words(fileid)))
sorted_b = sorted(index_b, key=lambda x: x[1], reverse=True) scores = [a[1] * b[1] for a, b in zip(sorted_a, sorted_b) if a[0] == b[0]] return sum(scores) def standardized_sr(index_a, index_b): sum_a_squared = sum([centr**2 for word, centr in index_a]) sum_b_squared = sum([centr**2 for word, centr in index_b]) norm = math.sqrt((sum_a_squared * sum_b_squared)) standardized = simple_resonance(index_a, index_b) / norm return standardized if __name__ == '__main__': stopwords = stopwords.words() + ['.', ',', '"', "'", '-', '.-'] first_doc = reuters.sents(reuters.fileids()[0]) stopd_sents = [[ token.lower() for token in sent if token.lower() not in stopwords ] for sent in first_doc] for token in stopd_sents[0]: print(token) tagged_sents = [nltk.pos_tag(sentence) for sentence in stopd_sents] for token, tag in tagged_sents[0]: if re.match(r'NN*|JJ*', tag): print(token, tag) noun_phrases = [[ token for token, tag in sent if re.match(r'NN*|JJ*', tag)
def __init__(self):
    test_files = [fileid for fileid in reuters.fileids() if fileid.startswith('test')]
    super(ReutersTestCorpus, self).__init__(reuters.sents(test_files))
emma = nltk.Text(emma)  # to use previous functions as with nltk.book txts
print(emma.concordance('surprise'))
print(' '.join(emma[20:50]))  # LIST to STRING - comes out as text

# examples of corpus available in nltk
from nltk.corpus import webtext  # less formal text
print(webtext.fileids())  # filenames
from nltk.corpus import nps_chat  # predators
print(nps_chat.fileids())
from nltk.corpus import brown  # brown uni various texts
print(brown.fileids())
from nltk.corpus import reuters
print(reuters.fileids())
from nltk.corpus import inaugural
print(inaugural.fileids())
# page 72 for a variety of corpus functionality commands

## SPACY SECTION - DataCamp course code collection, starting with 'Feature Engineering for NLP'
import spacy

# Load model and create Doc object
nlp = spacy.load('en')

# Function to preprocess text
def preprocess(text):
    # Create Doc object
# REUTERS CORPUS DEMO
from nltk.corpus import reuters

print 'Total Categories:', len(reuters.categories())
print reuters.categories()

# get sentences in housing and income categories
sentences = reuters.sents(categories=['housing', 'income'])
sentences = [' '.join(sentence_tokens) for sentence_tokens in sentences]
print sentences[0:5]  # prints the first 5 sentences

# fileid based access
print reuters.fileids(categories=['housing', 'income'])
print reuters.sents(fileids=[u'test/16118', u'test/18534'])

# WORDNET CORPUS DEMO
from nltk.corpus import wordnet as wn

word = 'hike'  # taking hike as our word of interest

# get word synsets
word_synsets = wn.synsets(word)
print word_synsets

# get details for each synonym in synset
for synset in word_synsets:
    print 'Synset Name:', synset.name()
from nltk.corpus import reuters
import math

print(reuters.fileids())       # The list of file names inside the corpus
print(len(reuters.fileids()))  # Number of files in the corpus = 10788

# Print the categories associated with a file
print(reuters.categories('training/999'))  # [u'interest', u'money-fx']

# Print the contents of the file
print(reuters.raw('test/14829'))

from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import defaultdict
from six import string_types

stop_words = stopwords.words('english') + list(punctuation)


def tokenize(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words and not w.isdigit()]


# build the vocabulary in one pass
vocabulary = set()
for file_id in reuters.fileids():
    words = tokenize(reuters.raw(file_id))
    vocabulary.update(words)
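The snippet above imports defaultdict and math but stops after collecting the vocabulary. A minimal sketch of a plausible next step, reusing the same tokenize function to count document frequencies and derive IDF values; the names doc_freqs and idf are illustrative, not from the original.

# count, for each term, how many documents it appears in
doc_freqs = defaultdict(int)
for file_id in reuters.fileids():
    for w in set(tokenize(reuters.raw(file_id))):
        doc_freqs[w] += 1

# inverse document frequency for every term in the vocabulary
n_docs = len(reuters.fileids())
idf = {w: math.log(n_docs / (1 + doc_freqs[w])) for w in vocabulary}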
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 4 19:40:46 2017

@author: jiahaoyuan
"""

from nltk.corpus import reuters
from Standard import tokenize
import json

# inverted index: word -> {doc_id: term frequency}
Dict = {}

document = reuters.fileids()

for doc in document:
    x = tokenize(reuters.raw(doc))
    # x = list(set(x))
    for words in x:
        if words in Dict:
            if doc in Dict[words]:
                Dict[words][doc] = Dict[words][doc] + 1
            else:
                Dict[words][doc] = 1
        else:
            Dict[words] = {doc: 1}
Vectorization of the training data - Bag of words:
1) create the tokens
2) assign an integer to each token
3) count the tokens in each document
4) normalization
5) build the matrix

Creation of the SVM and its training
Predictions and tests
"""

logger = set_project_logger()

# -------------------- Read the list of documents and categories ---------------
logger.info("List of Reuters documents")
categories_docs = futils.loadListFromFile("categories")
docs = list(filter(filter_docs_with_categories, reuters.fileids()))  # list() so the filter can be iterated more than once
training_docs = [x for x in docs if 'training' in x]
test_docs = [x for x in docs if 'test' in x]

logger.debug("{:-8} documents for training".format(len(training_docs)))
logger.debug("{:-8} documents for testing".format(len(test_docs)))
logger.debug("{:-8} distinct categories".format(len(categories_docs)))
logger.debug(str(categories_docs))

train_corpus = [reuters.raw(f).lower() for f in training_docs]
test_corpus = [reuters.raw(f).lower() for f in test_docs]
# -----------------------------------------------------------------------------

# --------------------- Document vectorization ---------------------------------
logger.info("Document vectorization")