def buildEmailText(requests, rebuild = 1, stemmer='PorterStemmer', vectorizer='TfidfVectorizer', num_features = None): rawText = {} output = {} if rebuild: for i in range(0, 54): data_file = data_directory + 'email_text/email_text_tmp_test_' + str(i) + '.txt' print "Reading data from %s" % data_file emails = read_email_file(data_file) # for req_id in emails: # text = emails[req_id] # rawText{req_id}, text]) rawText.update(emails) f = open('pickle_data/rawEmailText.pkl', 'w') pickle.dump(rawText, f) f.close() else: f = open('pickle_data/rawEmailText.pkl', 'r') rawText = pickle.load(f) f.close() requests_only = select_data(requests, 0) requests_only = [[int(req_id)] for req_id in requests_only] append_data(requests_only, rawText) for row in requests_only: if len(row) < 2: row.append("") rawText = [row[1] for row in requests_only] print "New Raw Text Array %i" % len(rawText) print "Got data from files: " print len(rawText) return preProcessText(rawText, stemmer=stemmer, vectorizer=vectorizer, num_features=num_features)
# NOTE(review): the lines below are the tail of a stemming() helper whose
# `def` line and opening `if stemmer_type == ...` branches fall outside this
# chunk; indentation is reconstructed, tokens are unchanged.
    stemmer = LancasterStemmer()
elif stemmer_type == 'RegexpStemmer':
    # Strips trailing 'ing', 's', or 'e'; min=3 leaves very short words alone.
    stemmer = RegexpStemmer('ing$|s$|e$', min=3)
# Apply the selected stemmer to every token.
for word in word_list:
    stemmed_words.append(stemmer.stem(word))
return stemmed_words


# Script entry point: tokenize, lowercase, de-stopword, and Porter-stem every
# e-mail in the chunk files, then pickle the {req_id: stemmed_text} mapping.
if __name__ == "__main__":
    final_out = {}  # req_id -> space-joined stemmed tokens
    # NOTE(review): buildEmailText() above reads 54 chunk files, not 44 —
    # confirm which count is correct.
    for i in range(0, 44):
        data_file = data_directory + 'email_text/email_text_tmp_test_' + str(i) + '.txt'
        print "Reading data from %s" % data_file
        emails = read_email_file(data_file)
        for req_id in emails:
            text = emails[req_id]
            tokens = word_tokenize(text)
            tokens = [str(t).lower() for t in tokens]
            # NOTE(review): return value of remove_stopwords() is discarded —
            # only correct if it mutates `tokens` in place; verify.
            remove_stopwords(tokens)
            stemmed_words = stemming(tokens, 'PorterStemmer')
            final_out[req_id] = ' '.join(stemmed_words)
        # NOTE(review): redundant — the `for` statement rebinds i each pass.
        i+=1
    # Persist the accumulated mapping once all chunks are processed.
    # NOTE(review): text mode 'w' is only safe for pickle protocol 0 on Unix.
    f = open('testEmailText.pkl', 'w')
    pickle.dump(final_out, f)
    f.close()