from nltk.corpus.reader import TaggedCorpusReader, ChunkedCorpusReader
from nltk.tokenize import SpaceTokenizer

## Reading tagged corpora #######
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos',
                            word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
## Mapping tags to the universal format; if the tagset is not correct, every tag becomes UNK
print(reader.tagged_words(tagset='universal'))

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data', r'treebank.chunk', tagset='en-brown')
print(reader.chunked_words())   ## Word-level structure
print(reader.chunked_sents())   ## Sentence-level structure
print(reader.chunked_paras())   ## Paragraph-level structure

## Reading classified corpora ##################
## The category is extracted with cat_pattern (from the file name), cat_map (a dict) or cat_file ######
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader('/Users/atul/nltk_data', r'movie_.*\.txt',
                                          cat_pattern=r'movie_(\w+)\.txt')
## Easiest is to read the files for each category
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])
reader.fileids()
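## The snippet above uses cat_pattern; below is a minimal sketch of the other two options.
## The mappings and file names here are illustrative assumptions, not part of the corpus above.
## 1) cat_map: an explicit fileid -> list-of-categories dictionary
reader_map = CategorizedPlaintextCorpusReader('/Users/atul/nltk_data', r'movie_.*\.txt',
                                              cat_map={'movie_pos.txt': ['pos'],
                                                       'movie_neg.txt': ['neg']})
## 2) cat_file: a file inside the corpus listing "fileid category" pairs, one per line
reader_file = CategorizedPlaintextCorpusReader('/Users/atul/nltk_data', r'movie_.*\.txt',
                                               cat_file='cats.txt')
print(reader_map.categories(), reader_file.categories())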
import random

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize import LineTokenizer, RegexpTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.stem import SnowballStemmer
from processor import Processor as Proc

data_folder = './data'
encoding = 'UTF8'
language = 'italian'

# Word tokenizer that keeps mentions (@user) and common emoticons as single tokens
wordTok = RegexpTokenizer(
    r'(\w+|@\w+|<3|(\:\-?\))|(\:\-?\()|(\;\-?\))|((\:|(X|x))\-?(D|d)))')
sentTok = LineTokenizer()

reader = CategorizedPlaintextCorpusReader(data_folder, r'SENTIPOLC-.*\.txt',
                                          cat_pattern=r'SENTIPOLC-(\w+)\.txt',
                                          encoding=encoding,
                                          word_tokenizer=wordTok,
                                          sent_tokenizer=sentTok)

pos_tweets = reader.sents(reader.fileids('pos'))
neg_tweets = reader.sents(reader.fileids('neg'))

# Inspection
rndP = random.randrange(len(pos_tweets))
rndN = random.randrange(len(neg_tweets))
print 'Pos:\n', pos_tweets[rndP:rndP + 3], '\nNeg:\n', neg_tweets[rndN:rndN + 3], '\n'

# All lowercase
pos_tweets = Proc.lowerize(pos_tweets)
neg_tweets = Proc.lowerize(neg_tweets)
        output = open('airline_review\\pos\\' + filename + '.txt', 'w')
        output.write(row['reviewcontent'])
        output.close()
        count_pos += 1
    elif row['rating_overall'] in ("1", "2", "3", "4", "5") and count_neg < 15000:
        output = open('airline_review\\neg\\' + filename + '.txt', 'w')
        output.write(row['reviewcontent'])
        output.close()
        count_neg += 1

os.chdir("E:/Documents/GSU/Python Development/Unstructured Data/Team Project/machine_learning_text_analysis")

reader = CategorizedPlaintextCorpusReader('./airline_review', r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')  # category taken from the folder name

# Positive reviews file ids
pos_ids = reader.fileids('pos')
# Negative reviews file ids
neg_ids = reader.fileids('neg')

'''Generating word feature list'''
def word_feats(words):
    return dict([(word, True) for word in words])

'''Building positive and negative feature lists.
Each item is the positive/negative word features for a review file'''
pos_feat = [(word_feats(reader.words(fileids=f)), 'pos')
            for f in pos_ids]
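# A hypothetical continuation (not in the original fragment): the matching negative
# feature list and a Naive Bayes classifier trained on both, using NLTK's standard API.
import nltk

neg_feat = [(word_feats(reader.words(fileids=f)), 'neg')
            for f in neg_ids]
cutoff = int(len(pos_feat) * 0.75)
train_feats = pos_feat[:cutoff] + neg_feat[:cutoff]
test_feats = pos_feat[cutoff:] + neg_feat[cutoff:]
classifier = nltk.NaiveBayesClassifier.train(train_feats)
print(nltk.classify.accuracy(classifier, test_feats))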
data = get_data()
print(len(data))
evrth, maindict = tags_assignment(data)

# Save the new final dictionary as well as the category-number mapping
listingssss = json.dumps(evrth)
with open("FinalCleanJuly1.json", "w") as f:
    f.write(listingssss)
dictionaries = json.dumps(maindict)
with open("CorpusCatMapJuly1.json", "w") as f:
    f.write(dictionaries)

#### This is IMPORTANT - CHOOSE! ##### default is key2
#### Choose the label you want to use for naming. Two options:
### 1) key1 with format docID + _(i), where i enumerates the category, e.g. -doc-_cr14021.txt
### 2) key2 with format country name + year + _(i), e.g. Albania2015_1.txt
### To change it, edit line 90 ("key2: taglist" -> key1)
### and line 121 (filename=evrth[i]['key2'] -> key1)
create_corpus(evrth)

#### Check that it works
reader = CategorizedPlaintextCorpusReader('corpusCategory/', r'\w+\d+_.*\.txt',
                                          cat_map=maindict)
print(reader.categories())                     # print all categories as a list
print(reader.fileids(categories=['Fiscal']))   # check docIDs in the Fiscal category

# Good reference: https://www.packtpub.com/books/content/python-text-processing-nltk-20-creating-custom-corpora
# It covers creating chunked corpora (by words, sentences, paragraphs, even customized paragraphs), tagged corpora, etc.
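# cat_map expects a dict mapping each fileid to a list of its categories.
# A minimal sketch of that structure (the fileids and categories below are made up,
# not taken from maindict):
example_cat_map = {
    'Albania2015_1.txt': ['Fiscal'],
    'Albania2015_2.txt': ['Monetary', 'Fiscal'],
}
example_reader = CategorizedPlaintextCorpusReader('corpusCategory/', r'\w+\d+_.*\.txt',
                                                  cat_map=example_cat_map)
print(example_reader.categories())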
#print(bigrams)
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd[word])


def generate_model(cfdist, word, num=15):
    # Greedy generation: repeatedly emit the most likely follower of the current word
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()


generate_model(cfd, word)

# 1. Build a categorized text corpus
locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT, r'.*\.txt', cat_file="cat.txt")
print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())

vocab = set(w.lower() for w in corpusPT.words())
print('Tamanho Vocabulario:', len(vocab))                  # vocabulary size
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Tamanho Total de palabras:', len(corpusComList))    # total number of words

# 2. Compute simple statistical measures
'''
import nltk
from nltk.corpus.reader import PlaintextCorpusReader

# Plain-text corpus
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
for s in corpus2.sents():
    print(' '.join(s))

# Categorized text corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stopword definition
stopwords = nltk.corpus.stopwords.words('portuguese')
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
fd1 = nltk.FreqDist(w.lower() for w in corpus3.words()
                    if w.isalpha() and w not in stopwords)
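# A quick check (not in the original) of the effect of the stopword filter,
# comparing the two frequency distributions built above:
print(fd.most_common(10))    # most frequent tokens, stopwords included
print(fd1.most_common(10))   # most frequent alphabetic, non-stopword tokens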
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files
    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]
    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator') for word in reader.words(facilitator_files)] +
                [(word, 'participant') for word in reader.words(participant_files)])
    features = get_features(speakers)
    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]
    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Classify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]
    # N-gram language models with Laplace smoothing (NLTK 2.x NgramModel)
    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)
    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()
    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()
    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
]

for topic in topics:
    statuses = Cursor(api.search, q=f"{topic} -filter:retweets",
                      tweet_mode="extended").items(200)
    for status in statuses:
        if status.lang == "en":
            file = open(
                f"C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/tweets_{topic}.txt",
                "a", encoding="utf-8")
            file.write(status.full_text)
            file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt', cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt', encoding='latin-1')
stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])
for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:
        features['contains({})'.format(bigram)] = (bigram in article_bigrams)
    article_words = set(article_words)
    for word in word_features:
        features['contains({})'.format(word)] = (word in article_words)
    return features


if __name__ == '__main__':
    # set up path to data
    data_folder_name = sys.argv[1]
    data_path = os.path.join(os.getcwd(), '', data_folder_name)

    # make article object to read in files
    article = CategorizedPlaintextCorpusReader(data_path, r'.*\.*\.txt',
                                               cat_pattern=r'(\w+).*\.txt')

    # make list of all articles with labels based on what folder the file is in
    all_articles = []
    for category in article.categories():
        for fileid in article.fileids(category):
            # lowercase words and take out stopwords
            process = list(w.lower() for w in list(article.words(fileid))
                           if w.isalpha() and w not in stopwords.words('english'))
            entry = [process, category]
            all_articles.append(entry)
    random.shuffle(all_articles)
                 i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')


if __name__ == '__main__':
    print("\nStarting the Classifier. First, let's set everything up.")
    traincorpus_root = raw_input(
        "Please specify the location of the training data: ")
    # traincorpus_root = '/Users/taniamaldonado/PycharmProjects/corpora/humanin/train4'
    traincorpus = CategorizedPlaintextCorpusReader(
        traincorpus_root, r".*_.*\.txt", cat_pattern=r'(\w+)_.*\.txt')

    testcorpus_root = raw_input(
        "Please specify the location of the test data: ")
    # testcorpus_root = '/Users/taniamaldonado/PycharmProjects/corpora/humanin/test'
    testcorpus = CategorizedPlaintextCorpusReader(testcorpus_root,
                                                  r".*_.*\.txt",
                                                  cat_pattern=r'(\w+)_.*\.txt')

    try:
        traindata, testdata = datainput(traincorpus, testcorpus)
    except NameError:
        print "The training/test corpus is not defined, please check if the location is correct."

    print("\nPlease choose a classification algorithm:")
    print("1. Multinomial Naive Bayes")
def fetch_news(dir):
    base = 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/{}/rss.xml'
    for category in ['world', 'technology']:
        rss = fp.parse(base.format(category))
        for i, entry in enumerate(rss.entries):
            fname = '{0}_bbc_{1}.txt'.format(i, category)
            fname = os.path.join(dir, fname)
            if not dl.conf.file_exists(fname):
                store_txt(entry.link, fname, entry.title)


if __name__ == "__main__":
    dir = os.path.join(dl.data.get_data_dir(), 'bbc_news_corpus')
    if not os.path.exists(dir):
        os.mkdir(dir)
    fetch_news(dir)

    reader = CategorizedPlaintextCorpusReader(dir, r'.*bbc.*\.txt',
                                              cat_pattern=r'.*bbc_(\w+)\.txt')
    printer = dl.log_api.Printer(nelems=3)
    printer.print('Categories', reader.categories())
    printer.print('World fileids', reader.fileids(categories=['world']))
    printer.print('Technology fileids', reader.fileids(categories=['technology']))
categ_dict = {
    'test_1.txt': 'Regulatory Update',
    'test_2.txt': 'Press Release',
    'test_3.txt': 'Regulatory Update',
    'test_4.txt': 'Regulatory Update',
    'test_5.txt': 'Stock Update',
    'test_6.txt': 'Press Release',
    'test_7.txt': 'Market Opinion'
}

art_i = []
class_i = []

# Conversion of Train Data into a Single Input File
corpus_root = 'Train_set'
newcorpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                             cat_pattern=r'(\w+)/*')

myfile = open('Input_Article_Data.csv', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n")
for category in newcorpus.categories():
    for fileid in newcorpus.fileids(category):
        #print fileid, category
        data1 = (newcorpus.raw(fileid).encode('utf-8')).replace(",", " ")
        data_list = [data1, category]
        wr.writerow(data_list)
myfile.close()

# Reading of Train Data as Lists
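# A hypothetical sketch (not from the original) of the "Reading of Train Data as Lists"
# step announced above, filling the art_i / class_i lists from the CSV just written:
with open('Input_Article_Data.csv', 'rb') as infile:
    for row in csv.reader(infile):
        art_i.append(row[0])    # article text
        class_i.append(row[1])  # category label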
__author__ = 'Piotr'

from random import shuffle
from pickle import dump
import os

from nltk import word_tokenize
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from text_processing.replacers import RegexpReplacer

training = CategorizedPlaintextCorpusReader("Articles", r'.*\.txt',
                                            cat_pattern=r'(\w+)', encoding="utf-8")


def print_corpus_info():
    print("Training Corpus INFO")
    for category in training.categories():
        print("Number of documents in {0:8} category: {1}".format(
            category, len(training.fileids(category))))
    print("\n")


def save_documents(documents, name):
    with open(os.path.join("Classifiers", name + ".pickle"), 'wb') as file_handler:
        dump(documents, file_handler)


def get_training_documents(cut_off=0.75, save=False):
    train_set = []
    test_set = []
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import random
from BeautifulSoup import BeautifulSoup

# Reading from a custom categorized corpus.
# Categorized corpora are organized by topic, genre, polarity, etc.
# In addition to the standard corpus interface, these corpora provide access to the list
# of categories and the mapping between documents and their categories (in both directions).
# Access the categories using the categories() method.
d = nltk.data.find('corpora/SecurityThreat')
reader = CategorizedPlaintextCorpusReader(d, r'.*\.txt', cat_pattern=r'(\w+)/*')

from textblob.classifiers import NaiveBayesClassifier

random.seed(1)
train = [
    ('Identity', 'IdentityThreat'),
    ('identity', 'IdentityThreat'),
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]

# The categorized corpus reader collects the respective words based on ThreatType
ThreatTypes = [(list(reader.words(fileid)), category)
               for category in reader.categories()
               for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
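# A minimal sketch of the two-way mapping mentioned in the comments above.
# The fileid below is hypothetical; substitute one returned by reader.fileids().
print(reader.categories())                                  # all categories
print(reader.fileids(categories=['Malware']))               # category -> documents
print(reader.categories(fileids=['Malware/example.txt']))   # document -> categories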
import sys
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

if len(sys.argv) != 5:
    print('Usage: Pass Arguments for Input PDF path, Category-File Mapping Path, '
          'Input Links File and Output Sub-Category Update File')
    sys.exit(1)

print("Input PDFs File Path " + sys.argv[1])
print("Category File Name and Path " + sys.argv[2])
print("Input Links File Name & Path is " + sys.argv[3])
print("SubCategory Update File is " + sys.argv[4])

reader = CategorizedPlaintextCorpusReader(sys.argv[1], r'.*\.txt',
                                          cat_file=sys.argv[2],
                                          cat_delimiter='|')

# Access each file in the corpus.
#for infile in sorted(reader.fileids()):
#    print(infile)                 # The fileids of each file.
#    #file = reader.open(infile)
#    #print(file.read().strip())   # Prints the content of the file

#print(reader.fileids())
#print(reader.fileids(categories=['General']))
#print(reader.categories())
            corpusfile = open(corpusfolder + '/' + fname, 'a')
            corpusfile.write(str(body))
            corpusfile.close()
        except Exception as e:
            print('Error on :' + id_)
            corpusfile.close()
            os.remove(mydir + '\\' + fname)
            pass
        else:
            print('Empty File:' + id)


CreateCorpusFromDataFrame(mydir, data_sample)

my_corpus = CategorizedPlaintextCorpusReader(mydir, r'.*', cat_pattern=r'.*_(.*).txt')


def preprocess(words, to_lowercase=True, remove_punctuation=True, remove_digits=True,
               remove_odd_chars=True, remove_stopwords=True, stem=True):
    if to_lowercase:
        words = [w.lower() for w in words]
    if remove_punctuation:
        words = [w for w in words if re.match(r'^\W+$', w) is None]
    if remove_digits:
        words = [w for w in words if not w.replace('.', '', 1).isdigit()]
    if remove_odd_chars:
        words = [re.sub(r'[^a-zA-Z0-9_]', '_', w) for w in words]
    if remove_stopwords:
import time
import nltk
import pickle
import re
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('/media/storage/dpla-data/words/colls.oct/',
                                          r'.*\.txt', cat_pattern=r'(\w+)\.txt')

# Removing oversized collections: hathi, nypl; also, chunking them out.
# First batch represents what was completed on 4/10-4/11.
#colls = ["searches"]
colls = ["artstor", "biodiv", "rumsey", "commonwealth", "georgia", "harvard",
         "ia", "getty", "kentucky", "minnesota", "missouri", "mwdl", "nara", "nocar",
         "smiths", "socar", "texas", "gpo", "illinois", "usc", "virginia", "nocoll",
         "hathi", "nypl"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}
for coll in colls:
    print(reader.categories(coll + ".txt"))
    stats[coll] = {}
    # 'kay. Can't pickle words. It's a stream reader.
    # But maybe you can if you tokenize with a regex,
    # which also pulls out punctuation.
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll + '.txt'))
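    # A minimal sketch of the point made in the comment above: the lazy view returned
    # by reader.words() cannot be pickled directly, but the plain token list built with
    # re.split can. The output path below is hypothetical.
    with open('/tmp/{}_words.pickle'.format(coll), 'wb') as handle:
        pickle.dump(words, handle)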
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = API(auth, wait_on_rate_limit=True)  # setting a limit to avoid upsetting Twitter

'''accounts = [("NASA", 11348282), ("BarackObama", 813286)]

for account in accounts:
    statuses = Cursor(api.user_timeline, user_id=account[1], include_rts=False,
                      exclude_replies=True, count=10000, tweet_mode="extended").items()
    for status in statuses:
        if status.lang == "en":
            file = open(f"C:/Users/olgur/nltk_data/twitter_corpus/tweets_{account[0]}.txt",
                        "a", encoding="utf-8")
            file.write(status.full_text.replace("\n", " ") + "\n")
            file.close()'''

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/nltk_data/twitter_corpus",
    r'tweets_.*\.txt', cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stop_words = set([
    '“', '”', '’', ",", "#", "—", "__", "_", "___", ".", ":", '"', "?", "!",
    "-", ")", "(", "...", "$"
]).union(set(stopwords.words("english")))


def remove_links(text):
    http_regex = re.compile(r"(https|http)://.*")
    return http_regex.sub(r"", text)


def remove_users(text):
#!pip install wordcloud

# In[2]:

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
stopwordlist = stopwords.words('german')
from wordcloud import WordCloud

rootDir = "../01access/GERMAN"
filepattern = r"(?!\.)[\w_]+(/RSS/FeedText/)[\w-]+/[\w-]+\.txt"
#filepattern = r"(?!\.)[\w_]+(/RSS/FullText/)[\w-]+/[\w-]+\.txt"
catpattern = r"([\w_]+)/.*"
rssreader = CategorizedPlaintextCorpusReader(rootDir, filepattern, cat_pattern=catpattern)

# In[3]:

singleDoc = rssreader.paras(categories="TECH")[0]
print("The first paragraph:\n", singleDoc)
print("Number of paragraphs in the corpus: ", len(rssreader.paras(categories="TECH")))

# In[4]:

techdocs = [[w.lower() for sent in singleDoc for w in sent
             if (len(w) > 1 and w.lower() not in stopwordlist)]
            for singleDoc in rssreader.paras(categories="TECH")]
print("Number of documents in category Tech: ", len(techdocs))
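# A minimal sketch (not from the original notebook) showing how the cleaned TECH
# documents could feed the WordCloud imported above; assumes matplotlib is available.
import matplotlib.pyplot as plt

wc = WordCloud().generate(" ".join(w for doc in techdocs for w in doc))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()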
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk

d = nltk.data.find('corpora/cookbook')
reader = CategorizedPlaintextCorpusReader(d, r'movie_.*\.txt',
                                          cat_pattern=r'movie_(\w+)\.txt')
print(reader.categories())
print(reader.fileids(categories='neg'))
print(reader.fileids(categories='pos'))

# from nltk.corpus import brown
# print(brown.categories())
from nltk.corpus import stopwords
from nltk_trainer.classification.featx import bag_of_words
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import collections
import pickle
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    '/home/arjun/nltk_data/health/diabetes',
    r'health.*?[0-9]+.txt', cat_pattern=r'health(.*?)[0-9]+.txt')
#print reader.categories()


# takes a corpus and creates labelled feature sets
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats


# creates train and test features
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
@author: jagpr
"""

import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.metrics import *
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

# Creating a corpus using CategorizedPlaintextCorpusReader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
imdb_reviews = CategorizedPlaintextCorpusReader(
    'D://USF//Independent Research Project//Dataset//Movie Review Dataset Pos Neg//aclImdb//train//negpos',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
len(imdb_reviews.fileids())


def evaluate_classifier(featx):
    negids = imdb_reviews.fileids('neg')
    posids = imdb_reviews.fileids('pos')

    negfeats = [(featx(imdb_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(imdb_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats) * 3 / 4
import nltk
import os, os.path

path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)

import nltk.data
path in nltk.data.path

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader2 = CategorizedPlaintextCorpusReader('.', r'news_.*\.csv',
                                           cat_pattern=r'news_(\w+)\.csv')
reader2.categories()
reader2.fileids(categories=['UP'])


def bag_of_words(words):
    return dict([(word, True) for word in words])


import collections

def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats


def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.iteritems():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
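# A minimal usage sketch (not part of the original), assuming split_label_feats goes on
# to return (train_feats, test_feats) as its name suggests:
lfeats = label_feats_from_corpus(reader2)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
nb_classifier = nltk.NaiveBayesClassifier.train(train_feats)
print(nltk.classify.accuracy(nb_classifier, test_feats))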