def _combine_spam_probs(word_probs):
    """Fold per-word spam probabilities into one spam score in [0, 1].

    Evaluates ``prod(p) / (prod(p) + prod(1 - p))`` in log space, so that
    e-mails with many distinct words cannot underflow both products to 0.0
    (which would make the plain formula divide by zero).

    Args:
        word_probs: iterable of per-word spam probabilities.

    Returns:
        float: the combined spam probability; 0.5 for an empty iterable or
        when the evidence is contradictory (some word scores 0 and another
        scores 1).
    """
    import math

    log_spam = 0.0
    log_ham = 0.0
    certain_ham = False   # a word with probability 0 zeroes the spam product
    certain_spam = False  # a word with probability 1 zeroes the ham product
    for prob in word_probs:
        if prob <= 0.0:
            certain_ham = True
        elif prob >= 1.0:
            certain_spam = True
        else:
            log_spam += math.log(prob)
            log_ham += math.log(1.0 - prob)

    if certain_ham and certain_spam:
        return 0.5
    if certain_ham:
        return 0.0
    if certain_spam:
        return 1.0

    # Logistic of (log_spam - log_ham); only ever exponentiate a
    # non-positive number so math.exp cannot overflow.
    gap = log_ham - log_spam
    if gap >= 0:
        ratio = math.exp(-gap)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(gap))


def filter(ham_word_pro, spam_word_pro, test_file):
    """Classify each e-mail found under ``test_file`` and print the verdict.

    For every path returned by ``fun(test_file)`` the e-mail is tokenised with
    ``email_parser``, each distinct word gets a spam probability via Bayes'
    rule, and the per-word values are combined into one score. A score above
    0.5 is reported as spam, anything else as ham.

    Args:
        ham_word_pro: mapping from word to the probability used as P(word | ham).
        spam_word_pro: mapping from word to the probability used as P(word | spam).
        test_file: location handed to ``fun`` to enumerate the e-mails to test.

    Returns:
        None. For each e-mail, prints the file name, label and rounded score,
        then the per-word probability dict and a separator line.
    """
    spam_prob = 0.5          # prior P(spam)
    ham_prob = 0.5           # prior P(ham)
    unknown_word_prob = 0.4  # score for words without usable statistics

    for test_path in fun(test_file):
        # Paths are split on the Windows separator to get the bare file name.
        file_name = test_path.split('\\')[-1]
        prob_dict = {}
        for word in set(email_parser(test_path)):
            if word not in spam_word_pro:
                prob_dict[word] = unknown_word_prob
                continue
            Pws = spam_word_pro[word]
            # NOTE(review): assumes both dictionaries share one vocabulary;
            # a word missing from ham_word_pro raises KeyError here.
            Pwh = ham_word_pro[word]
            evidence = Pwh * ham_prob + Pws * spam_prob
            if evidence == 0:
                # No information in either class: treat like an unseen word
                # instead of dividing by zero.
                prob_dict[word] = unknown_word_prob
            else:
                prob_dict[word] = spam_prob * (Pws / evidence)

        email_spam_prob = round(_combine_spam_probs(prob_dict.values()), 4)
        label = 'spam' if email_spam_prob > 0.5 else 'ham'
        print(file_name, label, email_spam_prob)
        print(prob_dict)
        print('******************************************************')
def main():
    """Enumerate the movie data files, split them and write the groups out.

    The file list from ``fun`` is passed through ``split_ten`` (presumably a
    ten-way split - confirm against its definition) and the result is handed
    to ``group_data`` together with the output directory.
    """
    # Alternative input location used previously: r'..\data\test'
    source_files = fun(r'..\data\data_of_movie')
    partitions = split_ten(source_files)
    # The output path is passed unchanged; an earlier buildfile(...) wrapper
    # around it is currently disabled.
    group_data(partitions, r'..\data\tenTimesTraining')
def get_word(email_file):
    """Parse every e-mail under ``email_file`` and collect its words.

    Args:
        email_file: location handed to ``fun`` to enumerate e-mail paths.

    Returns:
        tuple: ``(word_list, vocabulary)``. ``word_list`` has one entry per
        e-mail (whatever ``email_parser`` returned for it), in the order the
        paths were enumerated; ``vocabulary`` is the set of all words seen
        across every e-mail.
    """
    word_list = []
    vocabulary = set()
    for email_path in fun(email_file):
        clean_word = email_parser(email_path)
        word_list.append(clean_word)
        # Deduplicate as we go rather than building one huge list first.
        vocabulary.update(clean_word)
    return word_list, vocabulary
def get_data(data_path):
    """Load every sample file under ``data_path`` as a ``[label, vector]`` pair.

    Args:
        data_path: location handed to ``fwalker.fun`` to enumerate the files.

    Returns:
        list: one two-element list per file - the integer label followed by
        the file's contents loaded with ``np.loadtxt`` and flattened to 1-D.
    """
    samples = []
    for sample_path in fwalker.fun(data_path):
        base_name = sample_path.split('\\')[-1]
        # The label is the first character of the file name.
        label = int(base_name[0])
        features = np.loadtxt(sample_path).ravel()
        samples.append([label, features])
    return samples
def main():
    """Move up to ten randomly chosen files from the e-mail folder to the test folder.

    Each file is stored under the name ``<parent-folder>_<file-name>`` so
    that files with the same name from different source folders stay
    distinct. A line is printed for every file moved.
    """
    filepath = r'..\email'
    testpath = r'..\test'

    # If the destination were missing, shutil.move would turn the first
    # e-mail into a plain file named after the directory.
    if not os.path.isdir(testpath):
        os.makedirs(testpath)

    files = fun(filepath)
    random.shuffle(files)
    for ech in files[:10]:
        # Last two path components: the containing folder and the file name.
        ech_name = testpath + '\\' + '_'.join(ech.split('\\')[-2:])
        # Move straight to the final name; a separate rename step could
        # leave a half-processed file behind if it failed.
        shutil.move(ech, ech_name)
        print('%s moved' % ech_name)
def get_word(email_file):
    """Tokenise all e-mails under ``email_file`` and build their vocabulary.

    Tokenising is delegated to ``email_parser``. It replaced an earlier
    inline approach that read the file as UTF-8, split the text on a fixed
    punctuation list, lower-cased the pieces and kept those longer than two
    characters.

    Args:
        email_file: location handed to ``fun`` to enumerate e-mail paths.

    Returns:
        tuple: ``(word_list, vocabulary)``. ``word_list`` holds the
        ``email_parser`` result for each e-mail in enumeration order;
        ``vocabulary`` is the set of every word that occurred in any of them.
    """
    word_list = []
    vocabulary = set()
    for email_path in fun(email_file):
        clean_word = email_parser(email_path)
        word_list.append(clean_word)
        # Add straight into the set; no intermediate list of duplicates.
        vocabulary.update(clean_word)
    return word_list, vocabulary
def main():
    """Convert the digit data files and store the result in the input folder.

    The output location is first passed through ``bfile.buildfile``
    (presumably this creates the directory - confirm), then the source files
    are enumerated with ``fwalker.fun`` and both are handed to
    ``change_data``.
    """
    # The output location is prepared before the source files are listed.
    target_dir = bfile.buildfile(
        r'D:\DevelopmentLanguage\Python\MachineLearning\KNN\lab3_0930\input_digits')
    source_files = fwalker.fun(
        r'D:\DevelopmentLanguage\Python\MachineLearning\KNN\lab3_0930\digits')
    change_data(source_files, target_dir)