def main():
    """Train, evaluate and pickle a Naive Bayes classifier over a categorized corpus.

    Every file under ``corpusdir`` is turned into a ``(features, category)``
    pair via ``word_feats``. For each category the last ``hold_back(n)`` files
    (``n`` = number of files in that category) are kept for testing and the
    rest are used for training. Accuracy and the most informative features
    are printed, and the trained classifier is written to
    ``../data/classifier.pickle``.
    """
    articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern=r'(.*)[/]')
    feats = {}
    train = []
    test = []
    for cat in articles.categories():
        # Fetch the file list once; it is needed for both the count and the features.
        fileids = articles.fileids(cat)
        wow = len(fileids)
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("for category %s : %s" % (cat, wow))
        feats[cat] = [(word_feats(articles.words(fileids=[f])), cat) for f in fileids]
        cutoff = wow - hold_back(wow)
        train.extend(feats[cat][:cutoff])
        test.extend(feats[cat][cutoff:])
    print('train on %d instances, test on %d instances' % (len(train), len(test)))
    classifier = NaiveBayesClassifier.train(train)
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, test))
    # I don't understand the output for more than 2 categories :(
    classifier.show_most_informative_features()
    # load with:
    #   import pickle
    #   f = open('my_classifier.pickle')
    #   classifier = pickle.load(f)
    #   f.close()
    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
def create_categorized_corpus(self, categories_directory):
    """Build a categorized corpus reader and report whether all category names are non-empty.

    The same regular expression selects the files and captures the category
    (the ``wordtype_<name>`` suffix).

    Returns:
        A ``(reader, flag)`` tuple; ``flag`` is True when no category returned
        by the reader is the empty string (also True when there are none).
    """
    wordtype_pattern = r'\.txt.*wordtype_(\w+)'
    reader = CategorizedPlaintextCorpusReader(
        categories_directory, wordtype_pattern, cat_pattern=wordtype_pattern)
    every_category_named = all(category != '' for category in reader.categories())
    return reader, every_category_named
r'.*\.txt', cat_pattern=r'(\w+)/*') from textblob.classifiers import NaiveBayesClassifier random.seed(1) train = [ ('Identity', 'IdentityThreat'), ('identity', 'IdentityThreat'), ('identities', 'IdentityThreat'), ('identity loss', 'IdentityThreat'), ('insider', 'InsiderThreat'), ('Malware', 'Malware'), ] # Categorized corpora Reader collect the respective words based on ThreatType ThreatTypes = [(list(reader.words(fileid)), category) for category in reader.categories() for fileid in reader.fileids(category)] random.shuffle(ThreatTypes) print(reader.categories()) new_train = ThreatTypes print(new_train) #Naive Bayes classifiers assume that the value of a particular feature is independent of the value of #any other feature, given the class variable. cl = NaiveBayesClassifier(train) #update the classifier with training keywords from Categorized corpora cl.update(new_train) inputpath = nltk.data.find('corpora/abc/threatdescp.txt') f = open(inputpath, encoding='latin2') outputpath = nltk.data.find('corpora/abc/ResultNB.txt') ResultFile = open(outputpath, 'w', encoding='latin2') for line in f:
dirList = [] for f in os.listdir( path ) : if not os.path.isfile( path ) : if not f == ".DS_Store" : dirList.append(f) return dirList ############################################### ############################################### ################# # TRAINING DATA # ################# train_reader = CategorizedPlaintextCorpusReader('./training_data', r'.*\_.*\.txt', cat_pattern=r'.*\_(\w+)\.txt') train_documents = [(list(train_reader.words(fileid)), category) for category in train_reader.categories() for fileid in train_reader.fileids(category)] random.shuffle(train_documents) #print train_documents train_documents_clean = [] for i in train_documents : cat = i[1] #print cat newList = [] for word in i[0] : #print j clean_word = word.encode('ascii', 'ignore').decode('ascii').encode('ascii', 'ignore') newList.append(clean_word) newTup = (newList, cat) train_documents_clean.append(newTup)
import nltk, random, string
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

reader = CategorizedPlaintextCorpusReader('./', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# One (word list, category) pair per file, shuffled.
documents = [
    (list(reader.words(fileid)), category)
    for category in reader.categories()
    for fileid in reader.fileids(category)
]
random.shuffle(documents)

# Remove stopwords & punc from content
table = string.maketrans("", "")
stopwords = nltk.corpus.stopwords.words('english')
filtered_words = [w for w in reader.words() if w not in stopwords]
filtered_words_nopunc = [w for w in filtered_words if w not in string.punctuation]

# Frequency of the remaining words, lower-cased.
all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)
print(all_words)
word_features = all_words.keys()[:2000]


def document_features(document):
    document_words = set(document)
    features = {}
'test_6.txt': 'Press Release', 'test_7.txt': 'Market Opinion' } art_i = [] class_i = [] #Conversion of Train Data into Single Input File corpus_root = 'Train_set' newcorpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*') myfile = open('Input_Article_Data.csv', 'wb') wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, lineterminator="\n") for category in newcorpus.categories(): for fileid in newcorpus.fileids(category): #print fileid,category data1 = (newcorpus.raw(fileid).encode('utf-8')).replace(",", " ") data_list = [data1, category] wr.writerow(data_list) myfile.close() #Reading of Train Data as Lists with open('Input_Article_Data.csv', 'r') as f: for line in f.readlines(): l, name = line.strip().split(',') l = (re.sub('[^A-Za-z0-9.]+', ' ', l)).lower() # l=porter_stemmer.stem(l) #Reduces Accuracy From 50% To 37% if (name != "Category"):
# Plain-text corpus
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
# Raw strings keep `\.` a regex escape instead of an invalid string escape.
corpus1 = PlaintextCorpusReader(loc, r'.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged-text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, r'.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
for s in corpus2.sents():
    print(' '.join(s))

# Categorized corpus; the file-to-category mapping is read from categorias.txt
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, r'.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stop-word definition
stopwords = nltk.corpus.stopwords.words('portuguese')
stopword_set = set(stopwords)

# Frequency of every token, lower-cased.
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
# Frequency of alphabetic, non-stop-word tokens. The stop-word test uses the
# lower-cased token so capitalised stop words are excluded as well.
fd1 = nltk.FreqDist(
    w.lower() for w in corpus3.words()
    if w.isalpha() and w.lower() not in stopword_set
)
def generate_model(cfdist, word, num=15): for i in range(num): print(word, end=' ') word = cfdist[word].max() # 1. Construir Corpus texto categorizado locPT = 'ch02/ES' corpusPT = CategorizedPlaintextCorpusReader(locPT, '.*\.txt', cat_file="cat.txt") print(corpusPT.fileids()) print(corpusPT.categories()) print(corpusPT.words(categories='ciencia')) #print(corpusPT.raw()) vocab = set(w.lower() for w in corpusPT.words()) print('Tamanho Vocabulario:', len(vocab)) corpusCom = corpusPT.raw() corpusComList = corpusCom.split() print('Tamanho Total de palabras:', len(corpusComList)) # 2. Calcular medidas estadisticas simples ''' Medidas: Tamanho médio das palavras, Tamanho médio das sentenças e Número de vezes que cada item do vocabulário aparece no texto em média (escore de diversidade léxica) ''' print(
number_free = remove_numebrs(user_free) hashtag_free = remove_hashtags(number_free) twitter_words = [ term.lower() for term in tweet_tokenizer.tokenize(hashtag_free) if term.lower() not in stop_words ] twitter_words_with_hashtags = [ term.lower() for term in tweet_tokenizer.tokenize(number_free) if term.lower() not in stop_words ] return twitter_words, twitter_words_with_hashtags corpus_tokens = [] for category in reader.categories(): for file in reader.fileids(categories=category): without_hashtags, with_hashtags = tokenize_tweets(file) # c fdist_category = nltk.FreqDist(without_hashtags) print("Most common words in", category, ":", fdist_category.most_common(10)) # d hashtags = [word for word in with_hashtags if word.startswith("#")] fdist_category_hashtag = nltk.FreqDist(hashtags) print("Most common hashtags in", category, ":", fdist_category_hashtag.most_common(10)) corpus_tokens += without_hashtags
#!/usr/bin/env python
# coding: utf-8
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

corpus_root = '/Users/athessen/nltk_data/corpora/eco'
# File-to-category assignments are read from cats.txt inside the corpus root.
reader = CategorizedPlaintextCorpusReader(
    corpus_root, r'lion|shark\d*\.txt', cat_file='cats.txt')

print(reader.fileids())
print(reader.categories())

# Disabled feature-extraction sketch, kept as an inert string literal.
"""
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000] [1]

def document_features(document): [2]
    document_words = set(document) [3]
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
"""
class PolarityDataReader(object):
    """
    PolarityDataReader:
        Reader for POS/NEG Categorized Sentiword data

    uses:
        nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])
        featuresets = dataReader.getTermDocMatrix()
    """

    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        """Use `reader` if given, otherwise build a Reader over `rootLocation`.

        The default reader takes the path portion before the last '/' of each
        file id as that file's category.
        """
        super(PolarityDataReader, self).__init__()
        if reader is None:
            self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None

    def getDocuments(self):
        """Return the cached list of ``(word list, category)`` pairs, building it on first use."""
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category)
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self, fileLocation=config.STOP_WORDS_FILE):
        """Load the whitespace-separated stop-word list stored at `fileLocation`."""
        # Context manager so the handle is closed even if reading fails.
        with open(fileLocation, 'r') as stopfile:
            self.stopwords = stopfile.read().split()

    def removeStopWords(self, wordList):
        """ Remove common words which have no search value """
        # Set lookup keeps this linear in len(wordList).
        stopwords = set(self.stopwords)
        return [word for word in wordList if word not in stopwords]

    def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True):
        """Select the feature terms and store them in ``self.terms``.

        Args:
            size: number of terms to keep.
            featureSelection: 'PD', 'CHI_SQUARE', or anything else for plain
                most-frequent-word selection.
            removeStopWords: only used by the most-frequent-word selection.
        """
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print("Feature Selection : PD :done ")
        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print("Feature Selection : CHI_SQUARE :done ")
        else:
            # Fall back to the most frequent words.
            all_words = [w.lower() for w in self.reader.words()]
            if removeStopWords:
                all_words = self.removeStopWords(all_words)
            frequencies = FreqDist(all_words)
            # Rank explicitly by count: keys() is not sliceable on Python 3 and
            # its ordering is not frequency-based in every NLTK release.
            ranked = sorted(frequencies, key=frequencies.get, reverse=True)
            self.terms = ranked[:size]
            print("Feature Selection: frequent Words :done ")

    def documentFeatures(self, document, sentiwordnet=False):
        """Return ``{term: bool}`` marking which of ``self.terms`` occur in `document`.

        With ``sentiwordnet=True`` an empty dict is returned (not implemented yet).
        """
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass  # TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features

    def getTermDocMatrix(self):
        """Return ``(features, category)`` for every document."""
        # Go through getDocuments() so the documents are loaded if the caller
        # has not requested them yet.
        return [(self.documentFeatures(document), category)
                for (document, category) in self.getDocuments()]

    def __setTermsPD__(self, size):
        """
        score=|(posDF-negDF)|/(posDF+negDF)

        Words whose combined count is 10 or less get a fixed score of 0.1.
        NOTE(review): counts are keyed by the lower-cased word but looked up
        with the token as it appears in the corpus — confirm this is intended.
        The step that ranks the scores and uses `size` is not part of this fragment.
        """
        posWord = {}
        negWord = {}
        # `inc` is defined elsewhere; presumably it increments the count stored under the key.
        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())

        wordScores = {}
        for word in self.reader.words():
            posScore = posWord.get(word, 0)
            negScore = negWord.get(word, 0)
            totalScore = posScore + negScore
            if totalScore <= 10:  # min total count
                wordScores[word] = 0.1
            else:
                # float() forces true division; integer division on Python 2
                # would collapse the ratio to 0 or 1.
                wordScores[word] = abs(posScore - negScore) / float(totalScore)
# Translate the Y/N answers into booleans; any other answer leaves the flag untouched.
if proption in ("Y", "y"):
    prchoice = True
elif proption in ("N", "n"):
    prchoice = False

rocoption = raw_input("ROC curve plot: (Y/N) ")
if rocoption in ("Y", "y"):
    rocchoice = True
elif rocoption in ("N", "n"):
    rocchoice = False

confmatplot = raw_input("Confusion matrix plot: (Y/N) ")

print("\nStarting the classifier...\n")
classify(traindata, testdata, classifier=clf,
         learncurve=learnchoice, prcurve=prchoice, roccurve=rocchoice)

if confmatplot in ("Y", "y"):
    # Plot
    cnf_matrix = confusion_matrix(ytest, predictions)
    class_names = testcorpus.categories()
    plot_confusion_matrix(cnf_matrix, classes=class_names)
    plt.show()

print("\nFinished!")
return features if __name__ == '__main__': #set up path to data data_folder_name = sys.argv[1] data_path = os.path.join(os.getcwd(), '', data_folder_name) #make article object to read in files article = CategorizedPlaintextCorpusReader(data_path, r'.*\.*\.txt', cat_pattern=r'(\w+).*\.txt') #make list of all articles with labels based on what folder the file is in all_articles = [] for category in article.categories(): for fileid in article.fileids(category): #lowercases words and takes out stopwords process = list( w.lower() for w in list(article.words(fileid)) if w.isalpha() and w not in stopwords.words('english')) entry = [process, category] all_articles.append(entry) random.shuffle(all_articles) #make bigrams for every article word_bigrams = [(nltk.bigrams(all_articles[i][0])) for i in range(len(all_articles))] #create frequency distribution for all words and select top 2000 for features
def fetch_news(dir):
    """Store the entries of the BBC 'world' and 'technology' RSS feeds as text files in `dir`.

    Each entry goes to ``<index>_bbc_<category>.txt``; files that already
    exist are left alone.
    """
    base = 'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/{}/rss.xml'
    for category in ['world', 'technology']:
        rss = fp.parse(base.format(category))
        for i, entry in enumerate(rss.entries):
            fname = os.path.join(dir, '{0}_bbc_{1}.txt'.format(i, category))
            if not dl.conf.file_exists(fname):
                store_txt(entry.link, fname, entry.title)


if __name__ == "__main__":
    dir = os.path.join(dl.data.get_data_dir(), 'bbc_news_corpus')
    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # if the directory appears between a check and the creation.
    os.makedirs(dir, exist_ok=True)
    fetch_news(dir)

    # The category is the part of the file name after "bbc_".
    reader = CategorizedPlaintextCorpusReader(dir, r'.*bbc.*\.txt', cat_pattern=r'.*bbc_(\w+)\.txt')
    printer = dl.log_api.Printer(nelems=3)
    printer.print('Categories', reader.categories())
    printer.print('World fileids', reader.fileids(categories=['world']))
    printer.print('Technology fileids', reader.fileids(categories=['technology']))
words = [re.sub(r'[^a-zA-Z0-9_]','_', w) for w in words] if remove_stopwords: sw = set(nltk.corpus.stopwords.words("english")) words = [w for w in words if not w in sw] if stem: porter = nltk.PorterStemmer() words = [porter.stem(w) for w in words] return words documents = [((fileid, category), preprocess(my_corpus.words(fileid), to_lowercase = True, remove_punctuation = True, remove_digits = True, remove_odd_chars = True, remove_stopwords=True, stem = False)) \ for category in my_corpus.categories() \ for fileid in my_corpus.fileids(category)] def dummy_fun(doc): return doc bow_gen = sklearn.feature_extraction.text.CountVectorizer( analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None, ngram_range=(1, 2), min_df = 150, # changed from 100 max_df = 0.85)
# Removing oversized collections: hathi, nypl; Also, chunking them out:
# First batch represents what was completed on 4/10-4/11.
colls = ["searches"]
#colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
#         "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
#         "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}
for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # The reader's word view is a stream reader and can't be pickled, so the
    # raw text is tokenized with a regex instead (which also drops punctuation).
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    # `with` closes (and therefore flushes) the pickle file; the original never closed it.
    with open("/media/storage/dpla-data/pickles/new/"+coll+"_words.p", "wb") as pickle_file:
        pickle.dump(words, pickle_file)
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")

    # Total token count.
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])

    # Number of distinct tokens, ignoring case.
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set(w.lower() for w in words))
    print(stats[coll]["uniq"])
# Removing oversized collections: hathi, nypl; Also, chunking them out:
# First batch represents what was completed on 4/10-4/11.
#colls = ["searches"]
colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
         "ia","getty","kentucky","minnesota","missouri","mwdl","nara","nocar",
         "smiths","socar","texas","gpo","illinois","usc","virginia","nocoll",
         "hathi","nypl"]
#colls = ["ia","getty","kentucky","minnesota","missouri","mwdl"]
#colls = ["nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

#data = {}
stats = {}
common = {}
for coll in colls:
    print(reader.categories(coll+".txt"))
    stats[coll] = {}
    # The reader's word view is a stream reader and can't be pickled, so the
    # raw text is tokenized with a regex instead (which also drops punctuation).
    print("prep & pickle words")
    words = re.split(r'\W+', reader.raw(coll+'.txt'))
    # `with` closes (and therefore flushes) the pickle file; the original never closed it.
    with open("/media/storage/dpla-data/words/colls.oct/pickles/"+coll+"_words.p", "wb") as pickle_file:
        pickle.dump(words, pickle_file)
    #words = reader.words(coll+".txt")
    #data[coll]["words"] = reader.words(coll+".txt")

    # Total token count.
    print("getting count " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["wc"] = len(words)
    print(stats[coll]["wc"])

    # Number of distinct tokens, ignoring case.
    print("getting uniq " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()))
    stats[coll]["uniq"] = len(set(w.lower() for w in words))
    print(stats[coll]["uniq"])
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize.casual import TweetTokenizer
from normalization import normalizeTwitterWordsWithExtraFeatures, normalizeTwitterWordsWithNegationHandle
import pickle, nltk

tweetTokenizer = TweetTokenizer(reduce_len=True, preserve_case=True, strip_handles=False)

# The category is the file-name prefix in front of "-tweet<number>.txt".
corpus = CategorizedPlaintextCorpusReader(
    'corpus/2-step/polar',
    r'(\w+)-tweet[0-9]+\.txt',
    cat_pattern=r'(\w+)-tweet[0-9]+\.txt',
    word_tokenizer=tweetTokenizer,
)

normalizationFunction = normalizeTwitterWordsWithNegationHandle

# POS-tag the normalized words of every file and keep them with the file's category.
wordsTaggedToCategory = []
i = 1
for category in corpus.categories():
    for fileid in corpus.fileids(category):
        words = corpus.words(fileids=[fileid])
        normalizedWords = normalizationFunction(words)
        # Computed but not used anywhere in this script.
        extraNormalizedWords = normalizeTwitterWordsWithExtraFeatures(words)
        wordsTagged = nltk.pos_tag(normalizedWords)
        wordsTaggedToCategory.append((wordsTagged, category))
        # Progress counter: number of files handled so far.
        print(i)
        i += 1

with open("wordsTaggedToCategory-polar", 'wb') as fileout:
    pickle.dump(wordsTaggedToCategory, fileout)
class PolarityDataReader(object):
    """
    PolarityDataReader:
        Reader for POS/NEG Categorized Sentiword data

    uses:
        nltk.corpus.reader.CategorizedPlaintextCorpusReader

    usage:
        dataReader = PolarityDataReader([rootLocation],[readerObject])
        dataReader.getDocuments()
        dataReader.setTerms([No:ofTerms])
        featuresets = dataReader.getTermDocMatrix()
    """

    def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None):
        """Use `reader` if given, otherwise build a Reader over `rootLocation`.

        The default reader takes the path portion before the last '/' of each
        file id as that file's category.
        """
        super(PolarityDataReader, self).__init__()
        if reader is None:
            self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*')
        else:
            self.reader = reader
        self.setStopWords()
        self.documents = None
        self.terms = None

    def getDocuments(self):
        """Return the cached list of ``(word list, category)`` pairs, building it on first use."""
        if not self.documents:
            self.documents = [(list(self.reader.words(fileid)), category)
                              for category in self.reader.categories()
                              for fileid in self.reader.fileids(category)]
        return self.documents

    def setStopWords(self, fileLocation=config.STOP_WORDS_FILE):
        """Load the whitespace-separated stop-word list stored at `fileLocation`."""
        # Context manager so the handle is closed even if reading fails.
        with open(fileLocation, 'r') as stopfile:
            self.stopwords = stopfile.read().split()

    def removeStopWords(self, wordList):
        """ Remove common words which have no search value """
        # Set lookup keeps this linear in len(wordList).
        stopwords = set(self.stopwords)
        return [word for word in wordList if word not in stopwords]

    def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True):
        """Select the feature terms and store them in ``self.terms``.

        Args:
            size: number of terms to keep.
            featureSelection: 'PD', 'CHI_SQUARE', 'SWNSS', or anything else
                for plain most-frequent-word selection.
            removeStopWords: only used by the most-frequent-word selection.
        """
        if featureSelection == 'PD':
            self.__setTermsPD__(size)
            print("Feature Selection : PD :done ")
        elif featureSelection == 'CHI_SQUARE':
            self.__setTermsCHISQUARE__(size)
            print("Feature Selection : CHI_SQUARE :done ")
        elif featureSelection == 'SWNSS':
            self.__setTermsSWNSS__(size)
            print("Feature Selection : SWNPD :done ")
        else:
            # Fall back to the most frequent words.
            all_words = [w.lower() for w in self.reader.words()]
            if removeStopWords:
                all_words = self.removeStopWords(all_words)
            frequencies = FreqDist(all_words)
            # Rank explicitly by count: keys() is not sliceable on Python 3 and
            # its ordering is not frequency-based in every NLTK release.
            ranked = sorted(frequencies, key=frequencies.get, reverse=True)
            self.terms = ranked[:size]
            print("Feature Selection: frequent Words :done ")

    def documentFeatures(self, document, sentiwordnet=False):
        """Return ``{term: bool}`` marking which of ``self.terms`` occur in `document`.

        With ``sentiwordnet=True`` an empty dict is returned (not implemented yet).
        """
        document_words = set(document)
        features = {}
        if sentiwordnet:
            pass  # TODO
        else:
            for word in self.terms:
                features[word] = (word in document_words)
        return features

    def getTermDocMatrix(self):
        """Return ``(features, category)`` for every document."""
        # Go through getDocuments() so the documents are loaded if the caller
        # has not requested them yet.
        return [(self.documentFeatures(document), category)
                for (document, category) in self.getDocuments()]

    def __setTermsPD__(self, size):
        """
        score=|(posDF-negDF)|/(posDF+negDF)

        Stores the `size` highest-scoring words in ``self.terms``. Words whose
        combined count is 10 or less get a fixed score of 0.1.
        NOTE(review): counts are keyed by the lower-cased word but looked up
        with the token as it appears in the corpus — confirm this is intended.
        """
        posWord = {}
        negWord = {}
        # `inc` is defined elsewhere; presumably it increments the count stored under the key.
        for word in self.reader.words(categories=['pos']):
            inc(posWord, word.lower())
        for word in self.reader.words(categories=['neg']):
            inc(negWord, word.lower())

        wordScores = {}
        for word in self.reader.words():
            posScore = posWord.get(word, 0)
            negScore = negWord.get(word, 0)
            totalScore = posScore + negScore
            if totalScore <= 10:  # min total count
                wordScores[word] = 0.1
            else:
                # float() forces true division; integer division on Python 2
                # would collapse the ratio to 0 or 1.
                wordScores[word] = abs(posScore - negScore) / float(totalScore)

        # removeStopWords does not affect accuracy
        termScore = sorted(wordScores.items(), key=lambda item: item[1], reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk

# Resolve the 'cookbook' corpus directory through NLTK's data search path.
d = nltk.data.find('corpora/cookbook')
# The category is the part of the file name between "movie_" and ".txt".
reader = CategorizedPlaintextCorpusReader(
    d, r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt')

print(reader.categories())
for label in ('neg', 'pos'):
    print(reader.fileids(categories=label))

# from nltk.corpus import brown
# print(brown.categories())
# Dump every (word, category) feature with its value, tab-separated, one per line.
for word, cat in Feature_Set.keys():
    file.write("\t\t".join([str(word), str(cat), str(Feature_Set[word, cat])]))
    file.write("\n")
file.close()

# Classify each test file and count how many predictions match the real category.
Classification_Accuracy=0
for file in Testing_Corpus.fileids():
    pos_prob=1
    neg_prob=1
    real_category=Testing_Corpus.categories([file])
    # Read the file's words once; testing membership on the corpus view inside
    # the feature loop re-scanned the file for every feature.
    document_words = set(Testing_Corpus.words([file]))
    for word, cat in Feature_Set:
        if word in document_words:
            # Each factor is multiplied by 10000, which keeps the running product larger.
            if cat=="pos":
                pos_prob=Feature_Set[word, cat]*float(pos_prob)*10000
            else:
                neg_prob=Feature_Set[word, cat]*float(neg_prob)*10000
    if float(pos_prob)>=float(neg_prob):
        derived_category="['pos']"
    else:
        derived_category="['neg']"
    # Compared as strings: the real category is a list, the derived one its printed form.
    if str(real_category)==str(derived_category):
        Classification_Accuracy=Classification_Accuracy + 1
import random import nltk as nltk #nltk.download() from nltk.corpus import stopwords import os, os.path path = os.path.expanduser('~/nltk_data') if not os.path.exists(path): os.mkdir(path) os.path.exists(path) import nltk.data path in nltk.data.path from nltk.corpus.reader import CategorizedPlaintextCorpusReader reader = CategorizedPlaintextCorpusReader('.', r'.*_news_.*\.csv', cat_pattern=r'.*_news_(\w+)\.csv') reader.categories() def bag_of_words(words): return dict([(word, True) for word in words if word[0].isalpha()]) import collections def bag_of_words_not_in_set(words, badwords): return bag_of_words(set(words)-set(badwords)) def bag_of_non_stopwords(words, stopfile='english'): badwords = stopwords.words(stopfile) return bag_of_words_not_in_set(words, badwords) from nltk.metrics import BigramAssocMeasures from nltk.collocations import BigramCollocationFinder def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=2000):
data = get_data()
print(len(data))
evrth, maindict = tags_assignment(data)

# Save new final dictionary as well as the mapping for categories-numbers
with open("FinalCleanJuly1.json", "w") as f:
    json.dump(evrth, f)
with open("CorpusCatMapJuly1.json", "w") as f:
    json.dump(maindict, f)

#### This is IMPORTANT - CHOOSE ! ##### default is key2
#### Choose the label you want to have for naming!
### two options:
### 1) key1 with format: docID + _(i) where i numerated number of category e.g. -doc-_cr14021.txt
### 2) key2 with format country name + year + _(i) e.g. Albania2015_1.txt
### if you want to change--> line 90: "key2: taglist" to key1
### line 121: filename=evrth[i]['key2'] to key1
create_corpus(evrth)

#### Check if working
reader = CategorizedPlaintextCorpusReader(
    'corpusCategory/', r'\w+\d+_.*\.txt', cat_map=maindict)
print(reader.categories())  # print all categories in a list
print(reader.fileids(categories=['Fiscal']))  # check docIDs in fiscal category

#Good reference - https://www.packtpub.com/books/content/python-text-processing-nltk-20-creating-custom-corpora
#They have options for creating chunked (by words, sentences, paragraphs and even customized paragraphs) corpora, tagged corpora etc