def main(): articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern = r'(.*)[/]') feats = {} trainfeats = [] testfeats = [] for cat in articles.categories(): wow = len([f for f in articles.fileids(cat)]) # such variable name print "for category", cat, ":", wow feats[cat] = [(word_feats(articles.words(fileids = [f])), cat) for f in articles.fileids(cat)] cutoff = wow - hold_back(wow) trainfeats.append(feats[cat][:cutoff]) testfeats.append(feats[cat][cutoff:]) train = [item for sublist in trainfeats for item in sublist] test = [item for sublist in testfeats for item in sublist] print 'train on %d instances, test on %d instances' % (len(train), len(test)) classifier = NaiveBayesClassifier.train(train) print 'accuracy:', nltk.classify.util.accuracy(classifier, test) classifier.show_most_informative_features() # I don't understand the output for more than 2 categories :( # load with: # import pickle # f = open('my_classifier.pickle') # classifier = pickle.load(f) # f.close() with open('../data/classifier.pickle', 'wb') as f: pickle.dump(classifier, f)
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.tokenize.casual import TweetTokenizer
from normalization import normalizeTwitterWordsWithExtraFeatures, normalizeTwitterWordsWithNegationHandle
import pickle
import nltk

# Tokenizer handed to the corpus reader: case is preserved, user handles are
# kept, and reduce_len is enabled.
tweetTokenizer = TweetTokenizer(
    reduce_len=True,
    preserve_case=True,
    strip_handles=False,
)

# The same regular expression selects the tweet files and, through its
# capture group, supplies each file's category.
corpus = CategorizedPlaintextCorpusReader(
    'corpus/2-step/polar',
    r'(\w+)-tweet[0-9]+\.txt',
    cat_pattern=r'(\w+)-tweet[0-9]+\.txt',
    word_tokenizer=tweetTokenizer,
)

normalizationFunction = normalizeTwitterWordsWithNegationHandle

# One (pos-tagged words, category) pair per tweet file.
wordsTaggedToCategory = []
i = 1  # running count of processed files, printed as progress
for category in corpus.categories():
    for fileid in corpus.fileids(category):
        words = corpus.words(fileids=[fileid])
        normalizedWords = normalizationFunction(words)
        # NOTE(review): this result is never used below; the call is kept in
        # case the helper has side effects -- confirm and remove if it is pure.
        extraNormalizedWords = normalizeTwitterWordsWithExtraFeatures(words)
        wordsTagged = nltk.pos_tag(normalizedWords)
        wordsTaggedToCategory.append((wordsTagged, category))
        print(i)
        i += 1

# Persist the tagged corpus for later stages.
with open("wordsTaggedToCategory-polar", 'wb') as fileout:
    pickle.dump(wordsTaggedToCategory, fileout)
words = [w for w in words if not w.replace('.','',1).isdigit()] if remove_odd_chars: words = [re.sub(r'[^a-zA-Z0-9_]','_', w) for w in words] if remove_stopwords: sw = set(nltk.corpus.stopwords.words("english")) words = [w for w in words if not w in sw] if stem: porter = nltk.PorterStemmer() words = [porter.stem(w) for w in words] return words documents = [((fileid, category), preprocess(my_corpus.words(fileid), to_lowercase = True, remove_punctuation = True, remove_digits = True, remove_odd_chars = True, remove_stopwords=True, stem = False)) \ for category in my_corpus.categories() \ for fileid in my_corpus.fileids(category)] def dummy_fun(doc): return doc bow_gen = sklearn.feature_extraction.text.CountVectorizer( analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None,
# Categorized corpus reader over directory `d`, restricted to .txt files.
# The capture group of cat_pattern supplies the category of each file id.
reader = CategorizedPlaintextCorpusReader(d, r'.*\.txt', cat_pattern=r'(\w+)/*')
from textblob.classifiers import NaiveBayesClassifier
# Fixed seed so that the shuffle below is repeatable between runs.
random.seed(1)
# Hand-written seed examples as (text, threat type) pairs.
train = [
    ('Identity', 'IdentityThreat'),
    ('identity', 'IdentityThreat'),
    ('identities', 'IdentityThreat'),
    ('identity loss', 'IdentityThreat'),
    ('insider', 'InsiderThreat'),
    ('Malware', 'Malware'),
]
# Collect (word list, category) pairs from the categorized corpus reader,
# one pair per file, grouped by threat type.
ThreatTypes = [(list(reader.words(fileid)), category) for category in reader.categories() for fileid in reader.fileids(category)]
random.shuffle(ThreatTypes)
print(reader.categories())
new_train = ThreatTypes
print(new_train)
# Naive Bayes classifiers assume that the value of a particular feature is
# independent of the value of any other feature, given the class variable.
cl = NaiveBayesClassifier(train)
# Update the classifier with the training data gathered from the corpus.
cl.update(new_train)
# Input descriptions and result file are both located through nltk.data.
# NOTE(review): nltk.data.find raises LookupError for a missing resource, so
# ResultNB.txt presumably has to exist before this runs -- confirm.
inputpath = nltk.data.find('corpora/abc/threatdescp.txt')
f = open(inputpath, encoding='latin2')
outputpath = nltk.data.find('corpora/abc/ResultNB.txt')
# NOTE(review): neither handle is closed in the visible part of the script.
ResultFile = open(outputpath, 'w', encoding='latin2')
def getDirnames( path ) :
    """Return the names of the entries of *path* that are not regular files.

    Entries named ``.DS_Store`` are always skipped.  Names are returned in
    ``os.listdir`` order and are relative to *path*.
    """
    dirList = []
    for f in os.listdir( path ) :
        # Test the entry itself.  The previous code tested `path`, which is
        # the directory being listed, so the check never filtered anything
        # and plain files were returned as well.
        if os.path.isfile( os.path.join( path, f ) ) :
            continue
        if f == ".DS_Store" :
            continue
        dirList.append(f)
    return dirList

###############################################
###############################################

#################
# TRAINING DATA #
#################

# Training files match "<something>_<category>.txt"; the category is the
# capture group of cat_pattern.
train_reader = CategorizedPlaintextCorpusReader('./training_data', r'.*\_.*\.txt', cat_pattern=r'.*\_(\w+)\.txt')

# One (word list, category) pair per training file, in random order.
train_documents = [(list(train_reader.words(fileid)), category) for category in train_reader.categories() for fileid in train_reader.fileids(category)]
random.shuffle(train_documents)
#print train_documents

train_documents_clean = []
for i in train_documents :
    cat = i[1]
    #print cat
    newList = []
    for word in i[0] :
        #print j
        # Drop every non-ASCII character from the token.
        clean_word = word.encode('ascii', 'ignore').decode('ascii').encode('ascii', 'ignore')
        newList.append(clean_word)
    # NOTE(review): newTup is not consumed in the visible part of the file;
    # the loop body presumably continues past this point.
    newTup = (newList, cat)
import nltk, random, string
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords

# Categorized reader over the current directory, restricted to .txt files;
# the capture group of cat_pattern supplies each file's category.
reader = CategorizedPlaintextCorpusReader('./', r'.*\.txt', cat_pattern=r'(\w+)/*')
print reader.categories()
print reader.fileids()

# One (word list, category) pair per file, in random order.
documents = [(list(reader.words(fileid)), category) for category in reader.categories() for fileid in reader.fileids(category)]
random.shuffle(documents)

# Remove stopwords & punctuation from content
# NOTE(review): `table` is not used in the visible part of the script.
table = string.maketrans("","")
# NOTE: this rebinds the name `stopwords` (imported above) to a plain list.
stopwords = nltk.corpus.stopwords.words('english')
# The stop-word test runs on the original casing; lower-casing happens only
# when the frequency distribution is built below.
filtered_words = [w for w in reader.words() if not w in stopwords]
# `w in string.punctuation` is a substring test against the punctuation string.
filtered_words_nopunc = [w for w in filtered_words if not w in string.punctuation]
all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc)
print all_words
# First 2000 keys of the distribution.
# NOTE(review): whether these are the 2000 most frequent words depends on the
# key order of the installed NLTK FreqDist -- confirm.
word_features = all_words.keys()[:2000]

# (definition continues beyond the visible part of the file)
def document_features(document):
    document_words = set(document)
    features = {}
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import PlaintextCorpusReader, CategorizedPlaintextCorpusReader
from nltk.corpus import movie_reviews
import nltk
import random
from BeautifulSoup import BeautifulSoup

# Locate the corpus through nltk.data and read its .txt files; the capture
# group of cat_pattern supplies each file's category.
p = nltk.data.find('corpora/SecurityThreat-MaxEnt')
reader = CategorizedPlaintextCorpusReader(p, r'.*\.txt', cat_pattern=r'(\w+)/*')
from nltk import WordNetLemmatizer
# Using the WordNet lemmatizer (only referenced by the commented-out
# word_feats variant below).
wordnet_lemmatizer = WordNetLemmatizer()

# Frequency distribution over every token of the corpus.
all_words = nltk.FreqDist(word for word in reader.words())
# First 100 keys of the distribution.
# NOTE(review): these are the 100 most frequent words only if the FreqDist
# iterates in frequency order -- confirm for the installed NLTK version.
top_words = list(all_words)[:100]
print(top_words)


def word_feats(words):
    """Return a ``{word: True}`` feature dict for the words found in ``top_words``."""
    return {word: True for word in words if word in top_words}

#def word_feats(words):
    #return dict([(wordnet_lemmatizer.lemmatize(word), True) for word in words])

# File ids of each threat type.
IdentityThreat = reader.fileids('IdentityThreat')
InsiderThreat = reader.fileids('InsiderThreat')
Malware = reader.fileids('Malware')
# Plain-text corpus: list its files, sentences and words.
loc = '/Users/rmoura/nltk_data/corpora/rai/textoSimples/'
corpus1 = PlaintextCorpusReader(loc, '.*\.txt')
print(corpus1.fileids())
print(corpus1.sents())
print(corpus1.words())

# Tagged-text corpus
from nltk.corpus.reader.tagged import TaggedCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoEtiquetas/'
corpus2 = TaggedCorpusReader(loc, '.*\.txt')
print(corpus2.fileids())
print(corpus2.words())
print("Palavras etiquetadas: ", corpus2.tagged_words())
print(corpus2.tagged_words('003.txt'))
print("Sentencas diretas:")
# Print every sentence as space-joined tokens.
for s in corpus2.sents():
    print(' '.join(s))

# Categorized corpus; file-to-category mapping comes from categorias.txt.
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
loc = '/Users/rmoura/nltk_data/corpora/rai/textoCategorias/'
corpus3 = CategorizedPlaintextCorpusReader(loc, '.*\.txt', cat_file="categorias.txt")
print(corpus3.fileids())
print(corpus3.categories())
print(corpus3.words(categories='brasnam'))

# Stop-word definition
stopwords = nltk.corpus.stopwords.words('portuguese')
# fd counts every lower-cased token; fd1 counts only alphabetic tokens that
# are not stop words.
fd = nltk.FreqDist(w.lower() for w in corpus3.words())
# NOTE(review): the stop-word test uses the original casing of `w`, while the
# counted key is `w.lower()` -- capitalised stop words are therefore kept.
fd1 = nltk.FreqDist(w.lower() for w in corpus3.words() if w.isalpha() and w not in stopwords)
class PolarityDataReader(object): """ PolarityDataReader: Reader for POS/NEG Categorized Sentiword data uses: nltk.corpus.reader.CategorizedPlaintextCorpusReader usage: dataReader = PolarityDataReader([rootLocation],[readerObject]) dataReader.getDocuments() dataReader.setTerms([No:ofTerms]) featuresets = dataReader.getTermDocMatrix() """ def __init__(self, rootLocation=config.POLARITY_DATASET, reader=None): super(PolarityDataReader, self).__init__() if reader == None: self.reader = Reader(rootLocation, r'.*/.*', cat_pattern=r'(.*)/.*') else: self.reader = reader self.setStopWords() self.documents = None self.terms = None def getDocuments(self): if not self.documents: self.documents = [(list(self.reader.words(fileid)), category) for category in self.reader.categories() for fileid in self.reader.fileids(category)] return self.documents def setStopWords(self, fileLocation=config.STOP_WORDS_FILE): stopfile = open(fileLocation, 'r') self.stopwords = stopfile.read().split() def removeStopWords(self, wordList): """ Remove common words which have no search value """ return [word for word in wordList if word not in self.stopwords] def setTerms(self, size=2000, featureSelection='PD', removeStopWords=True): if featureSelection == 'PD': self.__setTermsPD__(size) print "Feature Selection : PD :done " elif featureSelection == 'CHI_SQUARE': self.__setTermsCHISQUARE__(size) print "Feature Selection : CHI_SQUARE :done " else: """ geting most frequent Words """ all_words = [w.lower() for w in self.reader.words()] if removeStopWords: all_words = self.removeStopWords(all_words) all_words = FreqDist(w for w in all_words) self.terms = all_words.keys()[:size] print "Feature Selection: frequent Words :done " def documentFeatures(self, document, sentiwordnet=False): document_words = set(document) features = {} if sentiwordnet: pass #TODO else: for word in self.terms: features[word] = (word in document_words) return features def getTermDocMatrix(self): return 
[(self.documentFeatures(document), category) for (document, category) in self.documents] def __setTermsPD__(self, size): """ score=|(posDF-negDF)|/(posDF+negDF) """ posWord = {} negWord = {} for word in self.reader.words(categories=['pos']): inc(posWord, word.lower()) for word in self.reader.words(categories=['neg']): inc(negWord, word.lower()) wordScores = {} for word in self.reader.words(): try: posScore = posWord[word] except KeyError, e: posScore = 0 try: negScore = negWord[word] except KeyError, e: negScore = 0 totalScore = posScore + negScore if totalScore <= 10: # min total count wordScores[word] = 0.1 else: wordScore[word] = abs(posScore - negScore) / totalScore
#def tokenize(text): # min_length = 3 # words = map(lambda word: word, word_tokenize(text)); # words = [word for word in words # if word not in cachedStopWords] # tokens =(list(map(lambda token: PorterStemmer().stem(token), # words))); # p = re.compile('[a-zA-Z]+'); # filtered_tokens = list(filter(lambda token: # p.match(token) and len(token)>=min_length, tokens)); # # return filtered_tokens #Preparing a Tuple List of the Corpus Data based on #Words In the corpus file and correspoindg category data = [(list(tokenize(' '.join(reader.words(fileid)))), category) for category in reader.categories() for fileid in reader.fileids(category)] #First preparing a train data set based on pre-identified features featureListTrain = [ ('Natural Language Processing', 'General'), ('Text Retrieval', 'General'), ('Text Access', 'General'), ('Information Retrieval', 'General'), ('NLP', 'General'), ('Content Analysis', 'General'), ('Vector', 'IR Models & Implementations'), ('Length', 'IR Models & Implementations'), ('Indexing', 'IR Models & Implementations'), ('Statistical', 'IR Models & Implementations'), ('Evaluation', 'IR Models- Evaluation,Ranking & Feedback'), ('Feedback', 'IR Models- Evaluation,Ranking & Feedback'),
def process_plaintext(dir_path): reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt', cat_pattern=r'.+_.+_(.*)\.txt') facilitator_files = reader.fileids(categories='facilitator') participant_files = reader.fileids(categories='participant') print facilitator_files, participant_files #print reader.categories() #print len(reader.words()) #print len(reader.sents()) fac_words = [word for word in reader.words(facilitator_files)] par_words = [word for word in reader.words(participant_files)] fac_words = edit_tokens(fac_words) par_words = edit_tokens(par_words) speakers = ([(word, 'facilitator') for word in reader.words(facilitator_files)] + [(word, 'participant') for word in reader.words(participant_files)]) features = get_features(speakers) size = int(len(features) * 0.3) nb_train = features[size:] nb_test = features[:size] classifier = nltk.NaiveBayesClassifier.train(nb_train) print "Classifier labels:", classifier.labels() print classifier.show_most_informative_features() print "Clasify test:", nltk.classify.accuracy(classifier, nb_test) #print classifier.classify(get_features(["Yolo", "bag", "sp"], False)) #random.shuffle(speakers) three_quarters = int(len(speakers) * 0.75) train = speakers[:three_quarters] test = speakers[three_quarters:] est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist) un_lm = NgramModel(1, train, estimator=est) bi_lm = NgramModel(2, train, estimator=est) tr_lm = NgramModel(3, train, estimator=est) qu_lm = NgramModel(4, train, estimator=est) pe_lm = NgramModel(5, train, estimator=est) print un_lm print bi_lm print tr_lm print qu_lm print pe_lm print "1 gram Perplexity:", un_lm.perplexity(test) print "2 gram Perplexity:", bi_lm.perplexity(test) print "3 gram Perplexity:", tr_lm.perplexity(test) print "4 gram Perplexity:", qu_lm.perplexity(test) print "5 gram Perplexity:", pe_lm.perplexity(test) print bi_lm.generate(10, ["uh", "sp"]) fd_fac = nltk.FreqDist(fac_words) vocab_fac = fd_fac.keys() fd_par = 
nltk.FreqDist(par_words) vocab_par = fd_par.keys() print "Fac Vocab: ", len(vocab_fac) print "Fac Tokens: ", len(fac_words) print vocab_fac[:20] print "Par Vocab: ", len(vocab_par) print "Par Tokens: ", len(par_words) print vocab_par[:20] fd_par.plot(50)
def process_plaintext(dir_path): reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt', cat_pattern=r'.+_.+_(.*)\.txt') facilitator_files = reader.fileids(categories='facilitator') participant_files = reader.fileids(categories='participant') print facilitator_files, participant_files #print reader.categories() #print len(reader.words()) #print len(reader.sents()) fac_words = [word for word in reader.words(facilitator_files)] par_words = [word for word in reader.words(participant_files)] fac_words = edit_tokens(fac_words) par_words = edit_tokens(par_words) speakers = ( [(word, 'facilitator') for word in reader.words(facilitator_files)] + [(word, 'participant') for word in reader.words(participant_files)] ) features = get_features(speakers) size = int(len(features) * 0.3) nb_train = features[size:] nb_test = features[:size] classifier = nltk.NaiveBayesClassifier.train(nb_train) print "Classifier labels:", classifier.labels() print classifier.show_most_informative_features() print "Clasify test:", nltk.classify.accuracy(classifier, nb_test) #print classifier.classify(get_features(["Yolo", "bag", "sp"], False)) #random.shuffle(speakers) three_quarters = int(len(speakers) * 0.75) train = speakers[:three_quarters] test = speakers[three_quarters:] est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist) un_lm = NgramModel(1, train, estimator=est) bi_lm = NgramModel(2, train, estimator=est) tr_lm = NgramModel(3, train, estimator=est) qu_lm = NgramModel(4, train, estimator=est) pe_lm = NgramModel(5, train, estimator=est) print un_lm print bi_lm print tr_lm print qu_lm print pe_lm print "1 gram Perplexity:", un_lm.perplexity(test) print "2 gram Perplexity:", bi_lm.perplexity(test) print "3 gram Perplexity:", tr_lm.perplexity(test) print "4 gram Perplexity:", qu_lm.perplexity(test) print "5 gram Perplexity:", pe_lm.perplexity(test) print bi_lm.generate(10, ["uh", "sp"]) fd_fac = nltk.FreqDist(fac_words) vocab_fac = fd_fac.keys() fd_par = 
nltk.FreqDist(par_words) vocab_par = fd_par.keys() print "Fac Vocab: " , len(vocab_fac) print "Fac Tokens: " , len(fac_words) print vocab_fac[:20] print "Par Vocab: " , len(vocab_par) print "Par Tokens: " , len(par_words) print vocab_par[:20] fd_par.plot(50)
#set up path to data: the folder named by the first command-line argument,
#resolved against the current working directory
data_folder_name = sys.argv[1]
data_path = os.path.join(os.getcwd(), '', data_folder_name)

#make article object to read in files
article = CategorizedPlaintextCorpusReader(data_path, r'.*\.*\.txt', cat_pattern=r'(\w+).*\.txt')

#load the stopword list once; it used to be re-read for every single token
english_stopwords = set(stopwords.words('english'))

#make list of all articles with labels based on what folder the file is in
all_articles = []
for category in article.categories():
    for fileid in article.fileids(category):
        #keeps alphabetic non-stopword tokens and lowercases them
        #(the stopword test is applied to the original casing, as before)
        process = [w.lower() for w in article.words(fileid)
                   if w.isalpha() and w not in english_stopwords]
        entry = [process, category]
        all_articles.append(entry)
random.shuffle(all_articles)

#make bigrams for every article (same order as all_articles)
word_bigrams = [nltk.bigrams(tokens) for tokens, _label in all_articles]

#create frequency distribution for all words and select 2000 for features
all_words = nltk.FreqDist(article.words())
# NOTE(review): list(all_words) yields the *most frequent* 2000 words only if
# the FreqDist iterates in frequency order -- confirm for the installed NLTK.
word_features = list(all_words)[:2000]

#create list holding all bigrams
def generate_model(cfdist, word, num=15):
    """Print a chain of *num* words starting at *word*.

    After printing the current word, it is replaced by ``cfdist[word].max()``.
    Nothing is returned.  Presumably *cfdist* is an NLTK ConditionalFreqDist,
    in which case ``max()`` is the most frequent follower -- confirm.
    """
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

# 1. Build the categorized text corpus
locPT = 'ch02/ES'
corpusPT = CategorizedPlaintextCorpusReader(locPT, '.*\.txt', cat_file="cat.txt")
print(corpusPT.fileids())
print(corpusPT.categories())
print(corpusPT.words(categories='ciencia'))
#print(corpusPT.raw())
# Vocabulary: the set of distinct lower-cased tokens.
vocab = set(w.lower() for w in corpusPT.words())
print('Tamanho Vocabulario:', len(vocab))
# Total size: whitespace-separated items of the raw text.
corpusCom = corpusPT.raw()
corpusComList = corpusCom.split()
print('Tamanho Total de palabras:', len(corpusComList))

# 2. Compute simple statistical measures
# (The bare string below is the original author's note: average word length,
# average sentence length, and the lexical diversity score.)
''' Medidas: Tamanho médio das palavras, Tamanho médio das sentenças e Número de vezes que cada item do vocabulário aparece no texto em média (escore de diversidade léxica) '''
# (the statement below continues beyond the visible part of the file)
print( 'Tamanho médio das palavras/Tamanho médio das sentenças/Escore de diversidade léxica'
class PolarityDataReader(object): """ PolarityDataReader: Reader for POS/NEG Categorized Sentiword data uses: nltk.corpus.reader.CategorizedPlaintextCorpusReader usage: dataReader = PolarityDataReader([rootLocation],[readerObject]) dataReader.getDocuments() dataReader.setTerms([No:ofTerms]) featuresets = dataReader.getTermDocMatrix() """ def __init__(self, rootLocation = config.POLARITY_DATASET,reader=None): super(PolarityDataReader, self).__init__() if reader == None: self.reader = Reader(rootLocation,r'.*/.*', cat_pattern=r'(.*)/.*') else: self.reader = reader self.setStopWords() self.documents = None; self.terms = None; def getDocuments(self): if not self.documents: self.documents = [(list(self.reader.words(fileid)), category) for category in self.reader.categories() for fileid in self.reader.fileids(category)] return self.documents; def setStopWords(self,fileLocation = config.STOP_WORDS_FILE): stopfile = open(fileLocation, 'r') self.stopwords = stopfile.read().split() def removeStopWords(self,wordList): """ Remove common words which have no search value """ return [word for word in wordList if word not in self.stopwords] def setTerms(self,size=2000,featureSelection='PD',removeStopWords=True): if featureSelection == 'PD': self.__setTermsPD__(size) print "Feature Selection : PD :done " elif featureSelection == 'CHI_SQUARE': self.__setTermsCHISQUARE__(size) print "Feature Selection : CHI_SQUARE :done " elif featureSelection == 'SWNSS': self.__setTermsSWNSS__(size) print "Feature Selection : SWNPD :done " else: """ geting most frequent Words """ all_words = [w.lower() for w in self.reader.words()]; if removeStopWords: all_words = self.removeStopWords(all_words); all_words = FreqDist(w for w in all_words) self.terms = all_words.keys()[:size] print "Feature Selection: frequent Words :done " def documentFeatures(self,document,sentiwordnet=False): document_words = set(document) features = {} if sentiwordnet: pass #TODO else : for word in self.terms: features[word] = 
(word in document_words) return features def getTermDocMatrix(self): return [(self.documentFeatures(document), category) for (document,category) in self.documents] def __setTermsPD__(self,size): """ score=|(posDF-negDF)|/(posDF+negDF) """ posWord = {}; negWord = {}; for word in self.reader.words(categories = ['pos']): inc(posWord,word.lower()); for word in self.reader.words(categories = ['neg']): inc(negWord,word.lower()); wordScores = {} for word in self.reader.words(): try: posScore = posWord[word] except KeyError, e: posScore = 0 try: negScore = negWord[word] except KeyError, e: negScore = 0 totalScore = posScore + negScore if totalScore <= 10 : # min total count wordScores[word] = 0.1 else : wordScores[word] = abs(posScore-negScore)/totalScore #removeStopWords does no affect accurcy termScore = sorted(wordScores.items(),key=lambda(w,s):s,reverse=True)[:size] self.terms = [w for (w,s) in termScore];
from nltk.corpus.reader import CategorizedPlaintextCorpusReader from nltk.corpus import stopwords bigram_measures = nltk.collocations.BigramAssocMeasures() #print reader.categories() for name in os.listdir("."): if os.path.isdir(name): reader = CategorizedPlaintextCorpusReader(name, r'.*\.txt', cat_pattern=r'(\w+)/*') # reader = CategorizedPlaintextCorpusReader(name, r'./raw_reviews/\.txt', cat_pattern=r'(\w+)/*') print reader.fileids() table = string.maketrans("","") stopwords = nltk.corpus.stopwords.words('english') filtered_words = [w for w in reader.words() if not w in stopwords] filtered_words_nopunc = [w for w in filtered_words if not w in string.punctuation] #all_words = nltk.FreqDist(w.lower() for w in filtered_words_nopunc) finder = BigramCollocationFinder.from_words(filtered_words_nopunc) #scored = finder.score_ngrams(bigram_measures.raw_freq) #a = sorted(bigram for bigram, score in scored) finder.apply_freq_filter(3) a = finder.nbest(bigram_measures.pmi, 5) #b = finder.score_ngrams(bigram_measures.pmi) print a #documents = [(list(reader.words(fileid)), category) # for category in reader.categories() # for fileid in reader.fileids(category)]
file.write((str(cat))) file.write("\t\t") file.write(str(Feature_Set[word,cat])) file.write("\n") file.close() Classification_Accuracy=0 for file in Testing_Corpus.fileids(): pos_prob=1 neg_prob=1 real_category=Testing_Corpus.categories([file]) for word, cat in Feature_Set: if word in Testing_Corpus.words([file]): if cat=="pos": pos_prob=Feature_Set[word, cat]*float(pos_prob)*10000 else: neg_prob=Feature_Set[word, cat]*float(neg_prob)*10000 if float(pos_prob)>=float(neg_prob): derived_category="['pos']" else: derived_category="['neg']" if str(real_category)==str(derived_category): Classification_Accuracy=Classification_Accuracy + 1 print "Feature set is stored in model_file"
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) classifier.show_most_informative_features() def word_feats(words): return dict([(word, True) for word in words]) print 'evaluating single word features' evaluate_classifier(word_feats) word_fd = FreqDist() label_word_fd = ConditionalFreqDist() for word in imdb_reviews.words(categories=['pos']): word_fd[word.lower()] += 1 label_word_fd['pos'][word.lower()] += 1 for word in imdb_reviews.words(categories=['neg']): word_fd[word.lower()] += 1 label_word_fd['neg'][word.lower()] += 1 # n_ii = label_word_fd[label][word] # n_ix = word_fd[word] # n_xi = label_word_fd[label].N() # n_xx = label_word_fd.N()` pos_word_count = label_word_fd['pos'].N() neg_word_count = label_word_fd['neg'].N() total_word_count = pos_word_count + neg_word_count
cat_pattern = r'(\w+)/*') # file name format # Positive reviews file ids pos_ids = reader.fileids('pos') # Negative reviews file ids neg_ids = reader.fileids('neg') '''Generating word feature list''' def word_feats(words): return dict([(word, True) for word in words]) '''Building positive and negative feature lists. Each item is the positive/negative word features for a review file''' pos_feat = [(word_feats(reader.words(fileids = f)), 'pos') for f in pos_ids] neg_feat = [(word_feats(reader.words(fileids = f)), 'neg') for f in neg_ids] '''refining feature lists, stemming, removing punctuation and stop words from pos_feat''' pos_feat = [] import re for file in pos_ids[:15000]: # reset review variable review = '' # Create a string of the text in the file review = ' '.join(word for word in reader.words(fileids = [file]))