def tfidf(word, wordCount):

	docCount = len(reuters.fileids())

	wordCountCorpus = 0
	count = 0
	for doc in reuters.fileids():
		count = count + 1
		present = 0
		for word2 in reuters.words(doc):
			if word.lower() == word2.lower():
				present = 1
				break

		if present == 1:
			wordCountCorpus = wordCountCorpus + 1

		if count == 200:
			break

	tf = wordCount
	idf = math.log(docCount/(1 + wordCountCorpus))

	tfidf = tf * idf

	return tfidf
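# A minimal usage sketch for tfidf above (assumes `import math` and
# `from nltk.corpus import reuters`, with the reuters corpus downloaded).
# wordCount is the term's raw count in the document being scored; note that the
# IDF part is only estimated from the first 200 Reuters files.
print(tfidf("oil", 3))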
Example #2
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the reuters corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total), 
                    file=log)
Example #3
def collection_stats():
	# List of documents
	documents = reuters.fileids()
	print(str(len(documents)) + " documents")

	train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
	print(str(len(train_docs)) + " total train documents")

	test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
	print(str(len(test_docs)) + " total test documents")

	# List of categories
	categories = reuters.categories()
	print(str(len(categories)) + " categories")

	# Documents in a category
	category_docs = reuters.fileids("acq")

	# Words for a document
	document_id = category_docs[0]
	document_words = reuters.words(document_id)
	print(document_words)

	# Raw document
	print(reuters.raw(document_id))
Example #4
    def __init__(self, categories=None, lower=True):
        if categories is None or len(categories) == 1:
            self.fileids = reuters.fileids()
        else:
            self.fileids = reuters.fileids(categories)
        self.categories = categories

        self.lower = lower
Example #5
def print_reuters():
    from nltk.corpus import reuters
    # print reuters.fileids()
    # print reuters.categories()
    print reuters.categories('training/9865')
    print reuters.categories(['training/9865','training/9880'])
    print reuters.fileids('barley')
    print reuters.fileids(['barley','corn'])
Example #6
def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if  len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    if ( (l1 + l2) > min_len) and ( (l1 + l2) < max_len) and (float(min(l1, l2)) / float(l1 + l2) > percentage):
                        print cat, cat2, l1 + l2, float(min(l1, l2))/float(l1+l2)
Example #7
def generateTextList( category, size, normalize = False ):
	i = 0
	text = []
	
	while i < size and i < len( reuters.fileids( category ) ):
		if not normalize:
			text.insert( i, reuters.words( reuters.fileids( category )[i] ) )
		else:
			text.insert( i, 
			getNormalizedText( reuters.words( reuters.fileids( category )[i] ) ) )
		i += 1
	
	return text
Example #8
def get_reuters_ids_cnt(num_doc=100, max_voca=10000, remove_top_n=5):
    """To get test data for training a model
    reuters, stopwords, english words corpora should be installed in nltk_data: nltk.download()

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    max_voca: int
        maximum number of vocabulary size for the returned corpus
    remove_top_n: int
        remove top n frequently used words

    Returns
    -------
    voca_list: ndarray
        list of vocabulary used to construct a corpus
    doc_ids: list
        list of list of word id for each document
    doc_cnt: list
        list of list of word count for each document
    """
    file_list = reuters.fileids()
    corpus = [reuters.words(file_list[i]) for i in xrange(num_doc)]

    return get_ids_cnt(corpus, max_voca, remove_top_n)
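# Usage sketch for get_reuters_ids_cnt above (assumes the reuters, stopwords and
# words corpora are downloaded and that get_ids_cnt is defined in the same module):
voca_list, doc_ids, doc_cnt = get_reuters_ids_cnt(num_doc=50, max_voca=5000)
print(voca_list[:10])   # first ten vocabulary entries
print(doc_ids[0])       # word ids of the first document
print(doc_cnt[0])       # matching per-word counts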
def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f))==1]    
    testset = [f for f in clean_files if f[:5]=='test/']
    trainset = [f for f in clean_files if f[:9]=='training/']
    for cat in reuters.categories():
        li=[f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li)>20 and len(li_te)>20:
            cat_num_docs[cat]=len(li)
            li.extend(li_te)
            categories_file_name_dict[cat]=li
    return [[ f for f in trainset if f2c('reuters',f) in categories_file_name_dict],
            [ f for f in testset if f2c('reuters',f) in categories_file_name_dict]]            
Example #10
def run():

    """Import the Reuters Corpus which contains 10,788 news articles"""

    from nltk.corpus import reuters
    raw_docs = [reuters.raw(fileid) for fileid in reuters.fileids()]

    # Select 100 documents randomly
    rand_idx = random.sample(range(len(raw_docs)), 100)
    raw_docs = [raw_docs[i] for i in rand_idx]

    # Preprocess Documents
    tokenized_docs = [ie_preprocess(doc) for doc in raw_docs]

    # Remove single occurance words
    docs = remove_infrequent_words(tokenized_docs)

    # Create dictionary and corpus
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Build LDA model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    for topic in lda.show_topics():
        print topic
Example #11
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
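# Usage sketch for load_data above (assumes a module-level `labels` list exists,
# since load_data reads it through globals(), and that the reuters and stopwords
# corpora are downloaded):
data = load_data()
print(data['x_train'].shape)
print(data['y_train'].shape)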
Example #12
def preProcess():
    print 'PreProcess Reuters Corpus'
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()

    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)

    with open(Paths.text_index, 'w') as fileid_out:
      with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
          with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
              for f in reuters.fileids():
                  contents = reuters.open(f).read()
                  try:
                      tokens = tokenizer.tokenize(contents)
                      docs += 1
                      if docs % 1000 == 0:
                          print "Normalised %d documents" % (docs)

                      out.write(' '.join(tokens) + "\n")
                      # if f.startswith("train"):
                      #
                      # else:
                      #     test.write(' '.join(tokens) + "\n")
                      fileid_out.write(f + "\n")

                  except UnicodeDecodeError:
                      bad += 1
    print "Normalised %d documents" % (docs)
    print "Skipped %d bad documents" % (bad)
    print 'Finished building train file ' + Paths.texts_clean
    end_time = time.time()
    print '(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time)
def create_token_stream():
	"""
	A funtion that creates token stream based on the nltk reuters corpus
	A token stream is a list of tuples containing terms to docID
	"""
	token_stream = []
	docID = 1
	termID = 1
	print "Creating token stream..."
	for fileid in reuters.fileids('barley'):
		for term in reuters.words(fileid):
			# Strip punctuation from the word and make lower case
			term = remove_punct_from_word(term).lower()
			# Check to make sure word is not "" and term is not a number
			if len(term) > 0 and not is_number(term):
				stemmed_term = stem().stem_word(term)
				if stemmed_term not in terms:
					terms[stemmed_term] = termID
					termID += 1
				new_token = (terms[stemmed_term], docID)
				token_stream.append(new_token)
		# Add to docs dictionary mapping docID to file
		docs[docID] = fileid
		docID += 1
	return token_stream
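# Usage sketch for create_token_stream above: every entry is a (termID, docID)
# tuple, so sorting the stream by termID is the first step towards an inverted
# index (assumes the global `terms` and `docs` dicts and the helper functions
# used above are defined; Python 2 style to match the snippet).
token_stream = create_token_stream()
print token_stream[:5]                # e.g. [(1, 1), (2, 1), (3, 1), ...]
sorted_stream = sorted(token_stream)  # order by termID, then docID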
Example #14
def get_reuters_cnt_ids(num_doc=100, max_voca=10000):
    ''' to get test data for training a model
    reuters corpus should be installed in nltk_data: nltk.download()
    '''
    file_list = reuters.fileids()
    
    docs = list()
    freq = Counter()

    for i in range(num_doc):
        doc = reuters.words(file_list[i])
        freq.update(doc)
        docs.append(doc)

    voca = [key for key,val in freq.most_common(max_voca)]

    voca_dic = dict()
    voca_list = list()
    for word in voca:
        voca_dic[word] = len(voca_dic)
        voca_list.append(word)

    doc_ids = list()
    doc_cnt = list()

    for doc in docs:
        words = set(doc)
        ids = np.array([int(voca_dic[word]) for word in words if voca_dic.has_key(word)])
        cnt = np.array([int(doc.count(word)) for word in words if voca_dic.has_key(word)])

        doc_ids.append(ids)
        doc_cnt.append(cnt)

    return np.array(voca_list), doc_ids, doc_cnt
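# Usage sketch for get_reuters_cnt_ids above: doc_ids[i] and doc_cnt[i] together
# form the bag-of-words of document i (assumes the reuters corpus is installed;
# Python 2, matching the has_key calls in the snippet).
voca_list, doc_ids, doc_cnt = get_reuters_cnt_ids(num_doc=20, max_voca=2000)
bow_first_doc = dict(zip([voca_list[j] for j in doc_ids[0]], doc_cnt[0]))
print bow_first_doc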
def create_dictionary_index_reuters():
	""" 
	A function that creates a dictonary with terms as keys
	and positings lists as values for the nltk reuters corpus
	"""

	idx = {}
	docs = {}
	docID = 1
	for fileid in reuters.fileids():
		for word in reuters.words(fileid):
			if not is_number(word):
				# Strip punctuation from the word and make lower case
				word = remove_punct_from_word(word).lower()
				# Check to make sure word is not ""
				if len(word) > 0:
					# Check to see if word already is in index
					if word in idx:
						# Check to see if docID is not already present for word
						if docID not in idx[word]:
							idx[word].append(docID)
					# Otherwise add word and docID in array to index
					else:
						idx[word] = []
						idx[word].append(docID)

		# Add to docs dictionary mapping docID to file
		docs[docID] = fileid
		docID += 1
	size =  0
	for k in idx.iterkeys():
		size += os.sys.getsizeof(k)
		size += os.sys.getsizeof(idx[k])
	#print "size of original dictionary is:", size
	return idx
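# Usage sketch for create_dictionary_index_reuters above: the returned dict maps
# each normalized term to the list of docIDs containing it, so a one-term boolean
# query is a single lookup ('wheat' is just an illustrative term).
index = create_dictionary_index_reuters()
postings = index.get('wheat', [])
print len(postings), postings[:10]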
Example #16
 def train(self):
     training_set = []
     print self.activeCategories
     for category in self.activeCategories:
         files = reuters.fileids(category)
         for fi in files:
             training_set.append((self.getFileFeatures(fi), category))
     self.classifier = nltk.NaiveBayesClassifier.train(training_set)
Example #17
 def __init__(self):
     # Generate training set from sample of Reuters corpus
     train_docs = [(self.bag_of_words(reuters.words(fileid)), category)
                   for category in reuters.categories()
                   for fileid in reuters.fileids(category) if
                   fileid.startswith("train")]
     # Create a classifier from the training data
     self.classifier = NaiveBayesClassifier.train(train_docs)
Example #18
 def __init__(self, dataset=''):
     """
         Docs in reuters corpus are identified by ids like "training|test/xxxx".
     :param dataset: filter for ids
     """
     self.dataset = dataset # filter docs
     self.categories = {c: n for n, c in enumerate(reuters.categories())} # map class with int
     self.docs = {d: n for n, d in enumerate(reuters.fileids())}  # map docs with int
     self.category_mask = [] # mask nth doc with its ith class
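 # Usage sketch (hypothetical): assuming this __init__ belongs to a corpus-streaming
 # class, here called ReutersStream, whose __iter__ yields raw documents (see the
 # companion __iter__ snippet further down), a caller would look roughly like:
 #
 #   stream = ReutersStream(dataset='training')
 #   for raw_doc in stream:
 #       pass  # feed raw_doc to a vectorizer, etc.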
Example #19
 def loadReutersCorpus(self):
     '''  
     Load the Reuters datasets.
     Note: need to install nltk data beforehand, using nltk.download() for the first run
     '''       
     self.fileIds = reuters.fileids()
     # Use only the training set corpus - documents located in the reuters/training folder.
     self.fileIds = [x for x in self.fileIds if x.startswith( 'training' )]
     return
Example #20
def topic_mod5():
	##For fetching words from topics
	parser = OptionParser()
	parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
	parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001)
	parser.add_option("-k", dest="K", type="int", help="number of topics", default=50)
	parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
	parser.add_option("-s", dest="seed", type="int", help="random seed", default=None)
	parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100)
	(options, args) = parser.parse_args()
	random.seed(options.seed)
	numpy.random.seed(options.seed)
	
	idlist = random.sample(reuters.fileids(), options.samplesize)

	labels = []
	corpus = []
	labelset = []

	result_set=[];
	cur.execute("SELECT distinct topic from topic_words where category = 'Shoes'")
	for row in cur:
		topicset = []
		topicset.append(row[0])
		labels.append(topicset)
		labelset.append(row[0])
		wordlist = []
		cur1 = conn.cursor()
		cur1.execute("SELECT word from topic_words where category = 'Shoes' and topic = '"+row[0]+"' order by weight desc")
		for word in cur1:
			wordlist.append(word[0])
		print wordlist
		corpus.append([x.lower() for x in wordlist if x!='' and (x[0] in string.ascii_letters)])	

	llda = LLDA(options.K, options.alpha, options.beta)
	llda.set_corpus(labelset, corpus, labels)

	# print "M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K)

	for i in range(options.iteration):
	    sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
	    llda.inference()
	print "perplexity : %.4f" % llda.perplexity()

	phi = llda.phi()
	cur.execute("delete from topic_words where category = '"+cats+"'")
	
	for k, label in enumerate(labelset):
	    print "\n-- label %d : %s" % (k, label)
	    if (label!= "common"):
	    	for w in numpy.argsort(-phi[k])[:20]:
	    	   	print "%s: %.4f" % (llda.vocas[w], phi[k,w])
	    	   	cur.execute("INSERT INTO topic_words (category, topic, word, weight) VALUES ('Shoes', '"+label+"','"+llda.vocas[w]+"', phi[k,w]) ON DUPLICATE KEY UPDATE  weight=VALUES(weight)")	    
	cur.execute("delete from topic_detail where category = '"+cats+"'")
	cur.execute("delete from topic_date_detail where category = '"+cats+"'")
	cur.execute("Insert into topic_detail select t1.topic, t1.pos_count, t2.neg_Count, (t1.pos_count+t2.neg_Count) total_count,'Shoes' from  (select a.topic, count(b.word) pos_count from topic_words a, comment_words b where a.word = b.word    and  b.score >3 group by a.topic)t1,(select a.topic, count(b.word) neg_count from topic_words a, comment_words b where a.word = b.word    and  b.score < 3 group by a.topic) t2 where t1.topic = t2.topic") 
	cur.execute("insert into topic_date_detail select t1.topic, t1.c_date, t1.pos_count, t1.pos_words, t2.neg_count, t2.neg_words,'Shoes' from (select a.topic, b.c_date, count(a.word) pos_count, GROUP_CONCAT(distinct(b.word)) pos_words from topic_words a, comment_words b  where a.word = b.word    and  b.score >3  group by a.topic,b.c_date )t1, (select a.topic, b.c_date, count(a.word) neg_count, GROUP_CONCAT(distinct(b.word)) neg_words from topic_words a, comment_words b  where a.word = b.word    and  b.score < 3  group by a.topic,b.c_date )t2 where t1.topic = t2.topic and t1.c_date= t2.c_date") 
Example #21
def get_list_fileids(corpus):
    li=[]
    if corpus=='mr':
        from nltk.corpus import movie_reviews
        li = movie_reviews.fileids()
    else:
        from nltk.corpus import reuters
        li = reuters.fileids()
    return li
Example #22
 def __iter__(self):
     """ Generator of docs while collecting ordered structured info. """
     for n, reutersid in enumerate(reuters.fileids()):         # 'training|test/xxxx'
         dataset, _ = reutersid.split('/')       # extract dataset
         if self.dataset in dataset:             # yield only filtered dataset
             if self.categories is not None:
                 top_category = reuters.categories(reutersid)[0]            # grab first category only
                 self.category_mask.append(self.categories[top_category])   # n-th doc -> classid
             yield reuters.raw(reutersid)        # return raw document
Example #23
def get_reuters_token_list_by_sentence(num_doc=100):
    """ Get a test data from reuters corpus.
    Stopwords will be included to see how HMM_LDA works with these stopwords.

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    Returns
    -------
    voca: ndarray
        vocabulary
    corpus: list
        nested list of word indices into voca: corpus[doc][sentence][word]

    """
    file_list = reuters.fileids()
    corpus = [reuters.sents(file_list[i]) for i in xrange(num_doc)]

    valid_voca = set(w.lower() for w in nltk.corpus.words.words())
    stop = stopwords.words('english')
    valid_voca = valid_voca.union(stop)

    tmp_corpus = list()
    voca_dic = dict()
    voca = list()
    for doc in corpus:
        tmp_doc = list()
        for sent in doc:
            tmp_sent = list()
            for word in sent:
                if word in valid_voca:
                    tmp_sent.append(word)
                    if word not in voca_dic:
                        voca_dic[word] = len(voca_dic)
                        voca.append(word)
            if len(tmp_sent) > 0:
                tmp_doc.append(tmp_sent)
        if len(tmp_doc) > 0:
            tmp_corpus.append(tmp_doc)

    # convert token list to word index list
    corpus = list()
    for doc in tmp_corpus:
        new_doc = list()
        for sent in doc:
            new_sent = list()
            for word in sent:
                new_sent.append(voca_dic[word])
            new_doc.append(new_sent)
        corpus.append(new_doc)

    return np.array(voca), corpus
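# Usage sketch for get_reuters_token_list_by_sentence above: corpus[d][s] is the
# s-th sentence of document d as a list of indices into voca (assumes the reuters,
# words and stopwords corpora are downloaded; Python 2 because of xrange above).
voca, corpus = get_reuters_token_list_by_sentence(num_doc=10)
first_sentence = corpus[0][0]
print [voca[word_id] for word_id in first_sentence]   # map indices back to words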
Example #24
def get_test_set():
    single_categories = [(id, re.categories(id)[0])
                         for id in re.fileids()
                         if len(re.categories(id)) == 1]

    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0]
                       for x in single_cat_list
                       if x[1] < 600 and x[1] > 200]

    return [pair for pair in single_categories if pair[1] in used_categories]
Example #25
def reuters_train_test_feats(feature_detector=bag_of_words):
	train_feats = []
	test_feats = []
	for fileid in reuters.fileids():
		if fileid.startswith('training'):
			featlist = train_feats
		else:   # fileid.startswith('test')
			featlist = test_feats
		feats = feature_detector(reuters.words(fileid))
		labels = reuters.categories(fileid)
		featlist.append((feats, labels))
	return train_feats, test_feats
Example #26
    def build_TFIDF_model(self):
        """ Build term-document matrix containing TF-IDF score for each word in each document
            in the Reuters corpus (via NLTK).
        """
        token_dict = {}
        for article in reuters.fileids():
            token_dict[article] = reuters.raw(article)

        # Use TF-IDF to determine frequency of each word in our article, relative to the
        # word frequency distributions in corpus of 11k Reuters news articles.
        self._tfidf = TfidfVectorizer(tokenizer=self.tokenize_and_stem, stop_words='english', decode_error='ignore')
        tdm = self._tfidf.fit_transform(token_dict.values())  # Term-document matrix
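        # Usage sketch (hypothetical): once build_TFIDF_model has run, self._tfidf can
        # score any new text against the vocabulary fitted on the Reuters corpus, e.g.
        #
        #   scores = self._tfidf.transform(["Oil prices rose sharply today."])
        #   print(scores.shape)   # (1, size of the fitted vocabulary)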
Example #27
def main():
	train_docs = []
	test_docs = []

	for doc_id in reuters.fileids():
		if doc_id.startswith("train"):		
			train_docs.append(reuters.raw(doc_id))
		else:
			test_docs.append(reuters.raw(doc_id))
		
	representer = tf_idf(train_docs)

	for doc in test_docs:
		print(feature_values(doc, representer))
Example #28
def batchRead():
    #STUB
    files = reuters.fileids()
    tags = ['training', 'test']
    training_files = []

    for file_name in files:
        entry = file_name.strip().split('/', 2)
        if len(entry) != 2:
            continue
        element, value = entry[0], entry[1]
        if element == 'training':
            training_files.append(file_name)
    return training_files
def gen_financial_top_words(maxN=40000): # generate corpus based on Reuters news
    if not os.path.isfile('./input/topWords.json'):
        wordCnt = {}
        for field in reuters.fileids():
            for word in reuters.words(field):
                word = unify_word(word)
                if word in nltk.corpus.stopwords.words('english'):
                    continue
                wordCnt[word] = wordCnt.get(word, 0) + 1

        sorted_wordCnt = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
        wordCnt = {} # reset wordCnt
        for i in sorted_wordCnt[:maxN]: wordCnt[i[0]] = i[1] # convert list to dict
        with open('./input/topWords.json', 'w') as fout: json.dump(wordCnt, fout, indent=4)
    else: return
Example #30
def batchReadReuters(collection, categories):
    #STUB
    files = reuters.fileids(categories)
    #print reuters.categories()
    tags = ['training', 'test']
    training_files = []

    for file_name in files:
        entry = file_name.strip().split('/', 2)
        if len(entry) != 2:
            continue
        element, value = entry[0], entry[1]
        if element == collection:
            training_files.append(file_name)
    return training_files
Example #31
from nltk.corpus import reuters
print(reuters.fileids())
print(reuters.categories())
print(reuters.categories('training/9865'))
print(reuters.categories(['training/9865', 'training/9880']))
print(reuters.fileids('barley'))
print(reuters.fileids(['barley', 'corn']))
print(reuters.words('training/9865')[:14])
print(reuters.words(['training/9865', 'training/9880']))
Example #32
from nltk.corpus import reuters

# download: nltk.download("reuters")

files = reuters.fileids()
print(files[:5])
Example #33
# generate the (genre, word) frequency pairs.
cfd = nltk.ConditionalFreqDist(
	(genre, word)
	for genre in brown.categories()
	for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
# now draw the table with these pairs.
cfd.tabulate(conditions=genres, samples=modals)

# reuters is a corpus file with millions of words.

from nltk.corpus import reuters
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])



"""
	1.5
"""
from nltk.corpus import inaugural
inaugural.fileids()
[fileid[:4] for fileid in inaugural.fileids()]
Example #34
import os
import re
from string import punctuation, whitespace

import html

import nltk
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import reuters as rt
from nltk.corpus import stopwords as st
from stop_words import get_stop_words

LOT_OF_STOPWORDS = frozenset(
    list(STOPWORDS) + get_stop_words('en') + st.words('english'))

TRAINING_SET = list(filter(lambda x: x.startswith('train'), rt.fileids()))
TESTING_SET = list(filter(lambda x: x.startswith('test'), rt.fileids()))

INPUTS_DIR = os.environ.get('INPUTS_DIR', 'inputs')
OUTPUTS_DIR = os.environ.get('OUTPUTS_DIR', 'outputs')

WHITE_PUNC_REGEX = re.compile(r"[%s]+" % re.escape(whitespace + punctuation),
                              re.UNICODE)
lemma = nltk.wordnet.WordNetLemmatizer()


def preprocess_document(document_text):
    """
        1.) Lowercase it all
        2.) Remove HTML Entities
        3.) Split by punctuations to remove them.
 def load_data(self, data_holder):
     docs = [
         list(data_holder.words(doc_fileid))
         for doc_fileid in data_holder.fileids()
     ]
     return docs
Example #36
#script to perform CV on RF for Movie and select max_features 500 for 500 partition choices
#output: results (1000 by #parameter options pickle)
##############################

numberoffolds=5
#set seeds for reproducibility
random.seed(1234)
rs=1234


#choose number of folds
numberoffolds=5
#data preparation

#select only articles about wheat and/or corn
wheat_docs = reuters.fileids("wheat")
corn_docs = reuters.fileids("corn")
wheat_train = list(filter(lambda doc: doc.startswith("train"),wheat_docs))
wheat_test = list(filter(lambda doc: doc.startswith("test"),wheat_docs))
corn_train = list(filter(lambda doc: doc.startswith("train"),corn_docs))
corn_test = list(filter(lambda doc: doc.startswith("test"),corn_docs))
training_index = wheat_train +wheat_test+ corn_train+corn_test

#prepare data for wheat vs not wheat case
text=[]
clas = []
classname = ["pos", "neg"]
for i in training_index:
        text.append(reuters.raw(i))
        #check categorisation to make response
        if "wheat" in reuters.categories(i):
Example #37
import nltk, textwrap
import numpy as np
#nltk.download('reuters')
from nltk.corpus import reuters

### Get all reuters training articles
fIDs = np.array(reuters.fileids())
trainIdx = np.where(np.array([t[:4] for t in fIDs]) == 'trai')[0]

nCatgs = np.empty(len(trainIdx), dtype=int)
catgs = np.empty(len(trainIdx), dtype=object)  # pre-allocate
texts = np.empty(len(trainIdx), dtype=object)  # pre-allocate
outNames = np.empty(len(trainIdx), dtype='<S16')  # pre-allocate
for n, idx in enumerate(trainIdx):
    name = fIDs[idx]
    cat = reuters.categories(name)
    if cat:  # if category is not empty
        tmp = name.split('/')
        outNames[n] = tmp[0] + '-' + tmp[1].zfill(5)
        nCatgs[n] = len(reuters.categories(name))
        catgs[n] = ','.join(reuters.categories(name))
        texts[n] = ' '.join(reuters.words(name))

# trimming articles without categories
toTrim = np.invert(np.equal(catgs, None))
catgs = catgs[toTrim]
texts = texts[toTrim]
nCatgs = nCatgs[toTrim]
outNames = outNames[toTrim]

for n in range(len(outNames)):
Example #38
import requests

from nltk.corpus import reuters

n_insertions = 1000
url = "http://0.0.0.0:8000/api/store_doc/submit_doc"
for index, i in enumerate(reuters.fileids()):
    doc = reuters.raw(fileids=[i])
    doc_name = "doc" + str(index)
    insert_dict = {doc_name: doc}
    r = requests.post(url=url, json=insert_dict)
    print(doc_name)
    print(r)
    if index == n_insertions:
        break
print("Inserted {} doccuments", n_insertions)
Example #39
        if x not in unique_list:
            unique_list.append(x.lower())
    # print list
    return unique_list

doclist = []
category_docs = []
categories = reuters.categories()[:5] # 5 categories
print(categories)
category_length = len(categories)
print("Category length :",category_length)
# slice_index = 1200//category_length
# print(slice_index)
category_lengths = []
for category in categories :
    category_docs = category_docs + reuters.fileids(category)
test_docs = []
train_docs = []
for doc_id in category_docs:
    if doc_id.startswith("train"):
        train_docs.append(doc_id)
    else:
        test_docs.append(doc_id)
train_docs = train_docs[:300]
test_docs = test_docs[:700]
category_docs = train_docs + test_docs #30 percent training docs and 70 percent test
# print(category_docs)
print("Length of category documents:",len(category_docs))
allwords = []
docnames = []
for doc in category_docs:
Example #40
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 12 14:46:28 2017

@author: Ritesh
"""
import string
from nltk.corpus import reuters
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords

grain_tok = [reuters.words(f) for f in reuters.fileids('grain')]
trade_tok = [reuters.words(f) for f in reuters.fileids('trade')]

words = [w.lower() for f in grain_tok for w in f]
words
bcf = BigramCollocationFinder.from_words(words)
top100 = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)
top = [(t1, t2) for (t1, t2) in top100
       if t1 not in string.punctuation and t2 not in string.punctuation]

stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)
bcf.nbest(BigramAssocMeasures.chi_sq, 10)
bcf.nbest(BigramAssocMeasures.jaccard, 10)
bcf.nbest(BigramAssocMeasures.mi_like, 10)
bcf.nbest(BigramAssocMeasures.pmi, 10)
bcf.nbest(BigramAssocMeasures.raw_freq, 10)
NLTK corpus readers. The modules in this package provide functions that can be
used to read corpus files in a variety of formats. These functions can be used
to read both the corpus files that are distributed in the NLTK corpus package,
and corpus files that are part of external corpora.

Common Structures for Text Corpora: The simplest kind of corpus is a collection of 
isolated texts with no particular organization; some corpora are structured 
into categories like genre (Brown Corpus); some categorizations overlap, such as
topic categories (Reuters Corpus); other corpora represent language use over
time (Inaugural Address Corpus).
 
1- Gutenberg Corpus
2- Web and Chat Text
3- Brown Corpus
4- Reuters Corpus
5- Inaugural Address Corpus
6- Annotated Text Corpora
"""

from nltk.corpus import gutenberg, webtext, brown, reuters, inaugural

print "Gutenberg FileIds   :", gutenberg.fileids()
print "Webtext FileIds     :", webtext.fileids()
print "Brown FileIds       :", brown.fileids()
print "Brown Categories    :", brown.categories()
print "Reuters FileIds     :", reuters.fileids()
print "Reuters Categories  :", reuters.categories()
print "Inaugural FileIds   :", inaugural.fileids()
            ticker = get_ticker(NE[0])
            if ticker is not None:
                tickers.append(ticker)
    return tickers


def get_only_text(url):
    page = urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page, "html.parser")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return soup.title.text, text


found_stocks = {}  # {ticker: num_of_occurrences}
article_fileids = [
    f for f in random.sample(reuters.fileids(), len(reuters.fileids()))[:1500]
]
i = 0
for f in article_fileids[:50]:
    i += 1
    if i % 5 == 0:
        print("Currently on:", str(i))
        print("found_stocks is")
        print(found_stocks)
        print()
    text = reuters.raw(f)
    tickers = find_all_stocks_mentioned(text)
    for ticker in tickers:
        if ticker is not None and ticker != "":
            try:
                found_stocks[ticker] += 1
Example #43
import text_clean as tc
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
# create counter to summarize
categories = []
file_count = []

# count each tag's number of documents
for i in reuters.categories():
    """print("$ There are {} documents included in topic \"{}\""
          .format(len(reuters.fileids(i)), i))"""
    file_count.append(len(reuters.fileids(i)))
    categories.append(i)

# create a dataframe out of the counts
df = pd.DataFrame(
    {'categories': categories, "file_count": file_count}) \
    .sort_values('file_count', ascending=False)

# Select documents that only contains top two labels with most documents
cat_start = 0
cat_end = cat_start + 1
category_filter = df.iloc[cat_start:cat_end + 1, 0].values.tolist()
print(f"The following categories are selected for the analysis: \
      {category_filter}")

# select fileid with the category filter
Example #44
                  default=100)
parser.add_option("-s",
                  dest="seed",
                  type="int",
                  help="random seed",
                  default=None)
parser.add_option("-n",
                  dest="samplesize",
                  type="int",
                  help="dataset sample size",
                  default=100)
(options, args) = parser.parse_args()
random.seed(options.seed)
numpy.random.seed(options.seed)

idlist = random.sample(reuters.fileids(), options.samplesize)

labels = []
corpus = []
for id in idlist:
    labels.append(reuters.categories(id))
    corpus.append(
        [x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters])
    reuters.words(id).close()
labelset = list(set(reduce(list.__add__, labels)))

llda = LLDA(options.K, options.alpha, options.beta)
llda.set_corpus(labelset, corpus, labels)

print("M=%d, V=%d, L=%d, K=%d" %
      (len(corpus), len(llda.vocas), len(labelset), options.K))
gutenberg.sents(fileid)
from nltk.tokenize import sent_tokenize
tok = sent_tokenize(text)

for x in range(5):
    print(tok[x])
    
from nltk.corpus import brown
brown.categories()
from nltk.corpus import brown
brown.categories()
text = brown.raw(categories='news')


from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
text = reuters.raw(fileid)
reuters.categories(fileid)



from nltk.corpus import movie_reviews
movie_reviews.fileids()
movie_reviews.categories()
text = movie_reviews.raw(fileid)
movie_reviews.categories(fileid)


#Frequency distribution by creating our own corpus
Example #46
def encode_doc(doc):
    d = np.zeros((1, max_len - 1, n_chars), dtype=bool)
    for p, j in enumerate(doc.lower()[:max_len - 1]):
        d[0, p, char_to_idx[j]] = 1
    return d


# load or create the character encoding dictionaries
if os.path.exists(char_idx_path):
    with open(char_idx_path, 'rb') as f:
        logger.info('Loading character encodings from "%s"' % char_idx_path)
        idx_to_char = pickle.load(f)
        char_to_idx = pickle.load(f)
        cat_enc = pickle.load(f)
else:
    n_docs = len(reuters.fileids())
    cat_enc = dict((x, i + 1) for i, x in enumerate(set(reuters.categories())))

    chars = set()
    for fid in reuters.fileids():
        chars = chars.union(set(reuters.raw(fid).lower()))

    idx_to_char = dict((i, c) for i, c in enumerate(chars))
    char_to_idx = dict((c, i) for i, c in enumerate(chars))

    with open(char_idx_path, 'wb') as f:
        logger.info('Saving character encodings to "%s"' % char_idx_path)
        pickle.dump(idx_to_char, f)
        pickle.dump(char_to_idx, f)
        pickle.dump(cat_enc, f)
Example #47
from os.path import isfile
from utils import read_list
from nltk.corpus import reuters
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.snowball import PorterStemmer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import itertools


# Fetch the dataset
category_dict = {'acq':0, 'coffee':1, 'crude':2, 'earn':3, 'gold':4, 'interest':5, 'money-fx':6, 'ship':7, 'sugar':8,
                 'trade':9}
data = []
target = []
docs = reuters.fileids()
for doc in docs:
    # Check if the document is only related to 1 class and that class is in category_dict
    if len(reuters.categories(doc)) == 1 and reuters.categories(doc)[0] in category_dict:
        data.append(" ".join(reuters.words(doc))) # Text of the document
        target.append(category_dict[reuters.categories(doc)[0]]) # Index for the class
print("Dataset REUTERS loaded...")

# Pre-process the dataset
print("Pre-processing the dataset...")
stemmer = PorterStemmer() # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words]) # Stem the stop words for larger detection
processed_data = []
id_to_delete = []
Example #48
def initModelPipeline(corpusName="reuters"):
    '''Create and save all the necessary models for the chosen corpus
    (reuters.words(..) or brown.words(..), depending on corpusName).
    Returns the lda model.
    '''

    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    base_path = os.path.join('./pretrained/topics/', corpusName)

    normalized_docs = [normalization.normalizeWords(reuters.words(fileid)) for  fileid in reuters.fileids()] \
                      if corpusName == 'reuters' \
                      else [normalization.normalizeWords(brown.words(fileid)) for  fileid in brown.fileids()]

    dictionary = corpora.Dictionary(normalized_docs)
    saveDictionary(dictionary, os.path.join(base_path,
                                            corpusName + 'Dict.dict'))

    corpus = ReutersCorpus(
        dictionary) if corpusName == 'reuters' else BrownCorpus(dictionary)
    saveCorpus(corpus, os.path.join(base_path, corpusName + '_bow_norm.mm'))

    tfidfModel = models.TfidfModel(corpus)
    saveModel(tfidfModel,
              os.path.join(base_path, corpusName + '_tfidf_model.tfidf'))

    tfidfCorpus = tfidfModel[corpus]
    saveCorpus(tfidfCorpus,
               os.path.join(base_path, corpusName + '_tfidf_corpus.mm'))

    ldaModel = models.LdaModel(tfidfCorpus,
                               id2word=dictionary,
                               num_topics=90,
                               passes=20,
                               iterations=400)
    saveModel(ldaModel, os.path.join(base_path, corpusName + '_lda_model.lda'))

    ldaCorpus = ldaModel[tfidfCorpus]
    saveCorpus(ldaCorpus, os.path.join(base_path,
                                       corpusName + '_lda_corpus.mm'))
                doc_words.append(w.lower())
                #build word_to_index dictionary of words
                if w not in word_to_index.keys():
                    word_to_index[w] = idx
                    idx = idx + 1
        #clean_doc = [l.lemmatize(l.lemmatize(s.lower(),pos='n'),pos='v') for s in clean_doc.split(' ') if s!='' and s not in remove_words]
        clean_doc_words.append(doc_words)
    corpus.close()
    val.close()
    test.close()

    #clean_docs.append(" ".join(doc_words))
    return word_to_index


doc_files = np.array(reuters.fileids())
test_mask = ['test' in doc for doc in doc_files]
train_mask = np.array(test_mask, dtype=int) == 0

test_doc_files = doc_files[test_mask]

num_docs = len(doc_files)
num_train = np.sum(train_mask)
num_test = np.sum(test_mask)

all_docs = [" ".join(reuters.words(doc_files[i])) for i in range(num_docs)]
train_docs = np.array(all_docs)[train_mask]

#get stop words to remove from NLTK
remove_words = set(stopwords.words('english'))
print('building dict')
Example #50
def download_data_as_text():
    all_articles = []
    for article in reuters.fileids():
        all_articles.append(reuters.raw(article))

    return all_articles
Example #51
def print_time(start_time):
    tm = time.time() - start_time
    if tm > 100:
        return "{} minuites".format(tm / 60.0)
    else:
        return "{} seconds".format(tm)


#----------------Get Corpus--------------------------

start_time = time.time()
program_start = start_time

#Top 10
documents = [
    f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1
]
train_docs_id = list(
    filter(lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
           documents))
test_docs_id = list(
    filter(lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
           documents))
new_train_docs_id = []
new_test_docs_id = []
for cat in reuters.categories():
    li = [f for f in reuters.fileids(categories=cat) if f in train_docs_id]
    li_te = [f for f in reuters.fileids(categories=cat) if f in test_docs_id]
    if len(li) > 20 and len(li_te) > 20:
        new_train_docs_id.extend(li)
        new_test_docs_id.extend(li_te)
Example #52
        article.nlp()
        titles.append(article.title)  #prints the title of the article
        texts.append((article.text))  #prints the entire text of the article
        summaries.append(article.summary)  #prints the summary of the article
        #print(article.keywords) #prints the keywords of the article
        counter += 1
        if counter >= articles_examined:
            break

    except newspaper.article.ArticleException:
        continue

#########################PREPARE REUTERS CORPUS AND TRAIN CLASSIFIER######################################################
preprocessed_corpus = []

for fid in reuters.fileids():
    preprocessed_corpus.append(preprocess_text(reuters.words(fid)))

cleaned_preprocessed_corpus = []

# creating the bag of words model
bag_of_words_creator = CountVectorizer()
bag_of_words = bag_of_words_creator.fit_transform(cleaned_preprocessed_corpus)

# creating the tf-idf model
tfidf_creator = TfidfVectorizer(min_df=0.2)
tfidf = tfidf_creator.fit_transform(preprocessed_corpus)

documents = [(list(reuters.words(fileid)), category)
             for category in reuters.categories()
             for fileid in reuters.fileids(category)]
Example #53
 def __iter__(self):
     for fileid in reuters.fileids():
         yield self.dictionary.doc2bow(
             normalization.normalizeWords(reuters.words(fileid)))
Example #54
    sorted_b = sorted(index_b, key=lambda x: x[1], reverse=True)
    scores = [a[1] * b[1] for a, b in zip(sorted_a, sorted_b) if a[0] == b[0]]
    return sum(scores)


def standardized_sr(index_a, index_b):
    sum_a_squared = sum([centr**2 for word, centr in index_a])
    sum_b_squared = sum([centr**2 for word, centr in index_b])
    norm = math.sqrt((sum_a_squared * sum_b_squared))
    standardized = simple_resonance(index_a, index_b) / norm
    return standardized


if __name__ == '__main__':
    stopwords = stopwords.words() + ['.', ',', '"', "'", '-', '.-']
    first_doc = reuters.sents(reuters.fileids()[0])
    stopd_sents = [[
        token.lower() for token in sent if token.lower() not in stopwords
    ] for sent in first_doc]

    for token in stopd_sents[0]:
        print(token)

    tagged_sents = [nltk.pos_tag(sentence) for sentence in stopd_sents]

    for token, tag in tagged_sents[0]:
        if re.match(r'NN*|JJ*', tag):
            print(token, tag)

    noun_phrases = [[
        token for token, tag in sent if re.match(r'NN*|JJ*', tag)
Example #55
 def __init__(self):
     test_files = [
         fileid for fileid in reuters.fileids() if fileid.startswith('test')
     ]
     super(ReutersTestCorpus, self).__init__(reuters.sents(test_files))
emma = nltk.Text(emma)  #to use previous functions as with nltk.book txts
print(emma.concordance('surprise'))
print(' '.join(emma[20:50])) #LIST to STRING - comes out as text

#examples of corpus available in nltk
from nltk.corpus import webtext  #less formal text
print(webtext.fileids())  #filenames

from nltk.corpus import nps_chat #predators
print(nps_chat.fileids()) 

from nltk.corpus import brown #brown uni various texts
print(brown.fileids())

from nltk.corpus import reuters
print(reuters.fileids())

from nltk.corpus import inaugural
print(inaugural.fileids())
#page 72 for a variety of corpus functionality commands



##SPACY SECTION - DataCamp course code collection, starting with 'Feature Engineering for NLP'
import spacy 
# Load model and create Doc object
nlp = spacy.load('en')

# Function to preprocess text
def preprocess(text):
  	# Create Doc object
Example #57
# REUTERS CORPUS DEMO
from nltk.corpus import reuters

print 'Total Categories:', len(reuters.categories())

print reuters.categories()

# get sentences in housing and income categories
sentences = reuters.sents(categories=['housing', 'income'])
sentences = [' '.join(sentence_tokens) for sentence_tokens in sentences]

print sentences[0:5]  # prints the first 5 sentences

# fileid based access
print reuters.fileids(categories=['housing', 'income'])

print reuters.sents(fileids=[u'test/16118', u'test/18534'])

# WORDNET CORPUS DEMO
from nltk.corpus import wordnet as wn

word = 'hike'  # taking hike as our word of interest

# get word synsets
word_synsets = wn.synsets(word)
print word_synsets

# get details for each synonym in synset
for synset in word_synsets:
    print 'Synset Name:', synset.name()
Example #58
from nltk.corpus import reuters
import math
print(reuters.fileids())  # The list of file names inside the corpus
print(len(reuters.fileids()))  # Number of files in the corpus = 10788

# Print the categories associated with a file
print(reuters.categories('training/999'))  # [u'interest', u'money-fx']

# Print the contents of the file
print(reuters.raw('test/14829'))

from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import defaultdict
from six import string_types

stop_words = stopwords.words('english') + list(punctuation)


def tokenize(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words and not w.isdigit()]


# build the vocabulary in one pass
vocabulary = set()
for file_id in reuters.fileids():
    words = tokenize(reuters.raw(file_id))
    vocabulary.update(words)
Example #59
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec  4 19:40:46 2017

@author: jiahaoyuan
"""

from nltk.corpus import reuters
from Standard import tokenize
import json

Dict = {}

document = reuters.fileids()
for doc in document:
    x = tokenize(reuters.raw(doc))

    #x = list(set(x))

    for words in x:
        if words in Dict:
            if doc in Dict[words]:
                Dict[words][doc]= Dict[words][doc]+1
            else: 
                Dict[words][doc] = 1
        else:
            Dict[words] = {doc:1}
        

Example #60
      Vectorization of the training data - bag of words
        1) create the tokens
        2) assign an integer to each token
        3) count the tokens in each document
        4) normalization
        5) build the matrix
      Creation of the SVM and its training
      Predictions and tests
    """

    logger = set_project_logger()

    # -------------------- Read the list of documents and categories ----------------
    logger.info("List of reuters documents")
    categories_docs = futils.loadListFromFile("categories")
    docs = filter(filter_docs_with_categories, reuters.fileids())

    training_docs = [x for x in docs if 'training' in x]
    test_docs = [x for x in docs if 'test' in x]

    logger.debug("{:-8} documenti per il training".format(len(training_docs)))
    logger.debug("{:-8} documenti per il test".format(len(test_docs)))
    logger.debug("{:-8} categorie diverse".format(len(categories_docs)))
    logger.debug(str(categories_docs))

    train_corpus = [reuters.raw(f).lower() for f in training_docs]
    test_corpus = [reuters.raw(f).lower() for f in test_docs]
    # -----------------------------------------------------------------------------

    # --------------------- Vectorization of the documents ----------------------------------
    logger.info("Vectorization of the documents")