def classification(neuro_obj=None, epoch=100000, num_class=10):
    print 'initialize Neural Network.'
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.01)  # mnist classification
    # else : nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.15) # mnist classification
    print "dump obj..."
    dp.obj_dump(nn_obj, './default-classification.pkl')
    print 'read training data and label.'
    training_data = dp.obj_load_gzip('../../mnist/mnist-training_all.pkl.gz')
    # training_data holds (image, one-hot label) pairs, so both counts match.
    print 'data size  : ', len(training_data)
    print 'label size : ', len(training_data)
    data_num = len(training_data)
    print '--start--'
    print '@@ Learn Character Recognition @@'
    for j in range(0, epoch):
        prg.show_progressxxx(j + 1, epoch)
        # online learning: one randomly chosen sample per step.
        i = np.random.randint(data_num)
        nn_obj.learn(training_data[i][0], training_data[i][1])
    prg.end_progress()
    print "dump obj..."
    dp.obj_dump(nn_obj, './learn-classification.pkl')
    return nn_obj

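# A minimal driver sketch for the trainer above (an assumption, not part of
# the original sources); it presumes the mln / dp / prg / numpy-as-np imports
# at the top of this module and the mnist pickle paths used above.
if __name__ == '__main__':
    trained_nn = classification(epoch=100000, num_class=10)
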
def naive_bayes(cache_model):
    # data, list_of_gender, list_of_words, and train_data_gender are
    # module-level globals shared with main().
    print("Running Naive-Bayes Classifier Training")
    idx = 0
    total = len(data)
    start_progress("Pre-processing {} of data".format(total))
    for gender, comment in data:
        idx += 1
        word_exist = {}
        word_not_exist = {}
        if gender not in list_of_gender:
            list_of_gender.append(gender)
        for word in word_tokenize(comment):
            word_exist[word] = True
            word_not_exist[word] = False
            if word not in list_of_words:
                list_of_words.append(word)
        # positive featureset for the true gender, negative for the others.
        for gen in list_of_gender:
            if gen == gender:
                train_data_gender.append((word_exist, gen))
            else:
                train_data_gender.append((word_not_exist, gen))
        progress(idx / total * 100)
    end_progress()
    print("\nFinished pre-processing ({} data)".format(total))

    print("Training {} gender data".format(total))
    main_gender_classifier = NaiveBayesClassifier.train(train_data_gender)
    if cache_model:
        cache.cache_model(main_gender_classifier,
                          "model/gender_classifier_{}.p".format(total))

    print("Cross validation")
    average_accuracy = 0
    size = len(train_data_gender)
    # 8-fold cross validation: hold out one eighth as the test set each round.
    for i in range(1, 9):
        test_set = train_data_gender[round((i - 1) * size / 8):round(i * size / 8)]
        training_set = train_data_gender[0:round((i - 1) * size / 8)]
        training_set.extend(train_data_gender[round(i * size / 8):])
        gender_classifier = NaiveBayesClassifier.train(training_set)
        accuracy = classify.accuracy(gender_classifier, test_set)
        print("Test-{0}: {1:.2%}".format(i, accuracy))
        average_accuracy += accuracy
    average_accuracy /= 8
    print("Average accuracy: " + "{0:.2%}\n".format(average_accuracy))
    return main_gender_classifier

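# A self-contained sketch of the (featureset, label) pairs that
# NaiveBayesClassifier.train() consumes above. The tokens, labels, and the
# expected accuracy are toy illustrations, not drawn from the real data.
from nltk import NaiveBayesClassifier, classify

toy_train = [
    ({'hello': True, 'dress': True}, 'female'),
    ({'hello': False, 'dress': False}, 'male'),
]
toy_test = [({'dress': True}, 'female')]
toy_classifier = NaiveBayesClassifier.train(toy_train)
print(classify.accuracy(toy_classifier, toy_test))  # expected: 1.00 on this toy split
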
def classification(neuro_obj=None, epoch=100000, minibatch_size=1):
    num_class = 10
    print "initialize Neural Network."
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet(
            [28 * 28, 1000, num_class], ["sigmoid", "softmax"], 0.15
        )  # mnist classification
    # else : nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.01) # mnist classification

    # use weight decay.
    # nn_obj.use_weight_decay(0.01)   # unlearnable
    # nn_obj.use_weight_decay(0.001)  # learnable
    nn_obj.use_weight_decay(0.0001)  # learnable
    # nn_obj.unuse_weight_decay()

    # use momentum.
    # nn_obj.use_momentum(0.1)
    nn_obj.use_momentum(0.5)
    # nn_obj.use_momentum(0.9)
    # nn_obj.unuse_momentum()

    print "dump obj..."
    dp.obj_dump(nn_obj, "./default-classification-batch.pkl")
    print "read training data and label."
    training_data = dp.obj_load_gzip("../../mnist/mnist-training_all.pkl.gz")
    print "data size  : ", len(training_data)
    print "label size : ", len(training_data)
    print "minibatch size : ", minibatch_size
    data_num = len(training_data)
    print "--start--"
    print "@@ Learn Character Recognition @@"
    mb_size = minibatch_size
    for i in range(0, epoch):
        prg.show_progressxxx(i + 1, epoch)
        # reshuffle the training set and take the first mb_size samples
        # as this step's mini-batch.
        shuffled = random.sample(training_data, data_num)
        x = []
        d = []
        for j in range(0, mb_size):
            x.append(shuffled[j][0])
            d.append(shuffled[j][1])
        nn_obj.batch_learn(x[0:mb_size], d[0:mb_size], mb_size)
    prg.end_progress()
    print "dump obj..."
    dp.obj_dump(nn_obj, "./learn-classification-batch.pkl")
    return nn_obj

def classification(neuro_obj=None, epoch=100000, num_class=10):
    print "initialize Neural Network."
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet(
            [28 * 28, 1000, num_class], ["sigmoid", "softmax"], 0.01
        )  # mnist classification
    # else : nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.15) # mnist classification

    # use weight decay.
    # nn_obj.use_weight_decay(0.01)
    # nn_obj.use_weight_decay(0.001)
    # nn_obj.use_weight_decay(0.0001)  # 20.02% error, with momentum 0.5.
    nn_obj.use_weight_decay(0.00001)  # 8.45% error, with momentum 0.5.
    # nn_obj.unuse_weight_decay()

    # use momentum.
    # nn_obj.use_momentum(0.1)
    nn_obj.use_momentum(0.5)
    # nn_obj.use_momentum(0.9)
    # nn_obj.unuse_momentum()

    print "dump obj..."
    dp.obj_dump(nn_obj, "./default-classification.pkl")
    print "read training data and label."
    training_data = dp.obj_load_gzip("../../mnist/mnist-training_all.pkl.gz")
    print "data size  : ", len(training_data)
    print "label size : ", len(training_data)
    data_num = len(training_data)
    print "--start--"
    print "@@ Learn Character Recognition @@"
    for j in range(0, epoch):
        prg.show_progressxxx(j + 1, epoch)
        i = np.random.randint(data_num)
        nn_obj.learn(training_data[i][0], training_data[i][1])
    prg.end_progress()
    print "dump obj..."
    dp.obj_dump(nn_obj, "./learn-classification.pkl")
    return nn_obj

def test_classification():
    print 'initialize Neural Network.'
    nn_obj = dp.obj_load('./learn-classification-batch.pkl')
    print 'read test data.'
    test_data = dp.obj_load_gzip('../../mnist/mnist-test_data.pkl.gz')
    print 'read test label.'
    test_label = dp.obj_load_gzip('../../mnist/mnist-test_label.pkl.gz')
    print 'data size  : ', len(test_data)
    print 'label size : ', len(test_label)
    data_num = len(test_data)
    prediction_error = []
    print '--start--'
    print '@@ Test Character Recognition @@'
    for j in range(0, data_num):
        prg.show_progressxxx(j + 1, data_num)
        num_recog, list_recog = nn_obj.test(test_data[j], test_label[j])
        # labels are one-hot: a 0 at the predicted index means a miss.
        if test_label[j][num_recog] == 0:
            prediction_error.append((test_label[j], num_recog))
    prg.end_progress()
    count = len(prediction_error)
    for item in prediction_error:
        print "Truth:", np.argmax(item[0]), ", --> But predict:", item[1]
    print ''
    print 'error : ', count, '/', data_num, '(', count * 1.0 / data_num * 100.0, '%)'
    return

def classification(neuro_obj=None, epoch=100000, minibatch_size=1):
    num_class = 10
    print 'initialize Neural Network.'
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.15)  # mnist classification
    # else : nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.01) # mnist classification
    print "dump obj..."
    dp.obj_dump(nn_obj, './default-classification-batch.pkl')
    print 'read training data and label.'
    training_data = dp.obj_load_gzip('../../mnist/mnist-training_all.pkl.gz')
    print 'data size  : ', len(training_data)
    print 'label size : ', len(training_data)
    print "minibatch size : ", minibatch_size
    data_num = len(training_data)
    print '--start--'
    print '@@ Learn Character Recognition @@'
    mb_size = minibatch_size
    for i in range(0, epoch):
        prg.show_progressxxx(i + 1, epoch)
        # reshuffle the training set and take the first mb_size samples
        # as this step's mini-batch.
        shuffled = random.sample(training_data, data_num)
        x = []
        d = []
        for j in range(0, mb_size):
            x.append(shuffled[j][0])
            d.append(shuffled[j][1])
        nn_obj.batch_learn(x[0:mb_size], d[0:mb_size], mb_size)
    prg.end_progress()
    print "dump obj..."
    dp.obj_dump(nn_obj, './learn-classification-batch.pkl')
    return nn_obj

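# A minimal end-to-end driver sketch (an assumption, not in the original
# sources): train with mini-batches via the function above, which writes
# ./learn-classification-batch.pkl, then score that pickle with
# test_classification().
if __name__ == '__main__':
    classification(epoch=100000, minibatch_size=10)
    test_classification()
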
    col_of_item = 0
    for item in allLines[12:16]:
        col_of_item = (col_of_item << 8 | ord(item))
    for i in range(0, num_of_item):
        prg.show_progress(i, num_of_item - 1)
        offset = 16
        min_coodinate = i * row_of_item * col_of_item + offset
        max_coodinate = (i + 1) * row_of_item * col_of_item + offset
        obj_dump(allLines[min_coodinate:max_coodinate], fname + str(i) + '.dump')
        if show:
            # render the image as ASCII art: '#' for any non-zero pixel.
            for j in range(0, row_of_item * col_of_item):
                if ord(allLines[j + i * row_of_item * col_of_item + 16]) > 0:
                    print '#',
                else:
                    print ' ',
                if j % row_of_item == 0:
                    print ''
    prg.end_progress()


#
# return image of n for file
#
def get_image_n(fname, n):
    # set number labels
    with open(fname, 'rb') as file:
        allLines = file.read()
    # decode the big-endian 32-bit fields of the IDX header byte by byte.
    num_of_item = 0
    for item in allLines[4:8]:
        num_of_item = (num_of_item << 8 | ord(item))
    row_of_item = 0
    for item in allLines[8:12]:
        row_of_item = (row_of_item << 8 | ord(item))
    col_of_item = 0
    for item in allLines[12:16]:
        col_of_item = (col_of_item << 8 | ord(item))

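# The shift-and-or loops above decode the big-endian 32-bit integers of the
# MNIST IDX header one byte at a time. An equivalent sketch using the standard
# struct module (a hypothetical standalone helper, not part of the original
# code):
import struct

def read_idx_image_header(fname):
    with open(fname, 'rb') as f:
        header = f.read(16)
    # '>IIII' = four big-endian unsigned 32-bit ints:
    # magic number, item count, rows per item, columns per item.
    magic, num_of_item, row_of_item, col_of_item = struct.unpack('>IIII', header)
    return num_of_item, row_of_item, col_of_item
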
def main(args):
    start_time = time.time()
    print("Running AdaBoost Classifier")

    print("Reading blacklist words file")
    load_blacklist_words("../data/blacklist.txt")

    print("Reading raw gender-comment data")
    with open("../data/male-comments.json", "r") as f:
        male_comment = json.load(f)
    with open("../data/female-comments.json", "r") as f:
        female_comment = json.load(f)

    # Lower-case all comments
    male_comment = [[x[0], x[1].lower()] for x in male_comment]
    female_comment = [[x[0], x[1].lower()] for x in female_comment]

    # Drop comments that contain blacklisted words
    male_comment = [[x[0], x[1]] for x in male_comment
                    if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))]
    female_comment = [[x[0], x[1]] for x in female_comment
                      if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))]
    random.shuffle(male_comment)
    random.shuffle(female_comment)
    print("Loaded {} male and {} female comments".format(
        len(male_comment), len(female_comment)))

    female_ratio = 1.0 - args.male_female_ratio
    if args.limit != -1:
        print("Limiting male and female comments to {} male and {} female ({} total)"
              .format(int(args.limit * args.male_female_ratio),
                      int(args.limit * female_ratio), args.limit))
        try:
            del male_comment[int(args.limit * args.male_female_ratio):]
            del female_comment[int(args.limit * female_ratio):]
        except:
            print("Not enough male/female comments data")
            sys.exit(1)

    # Comments are already lower-cased above; merge both sets and shuffle.
    gender_comment = male_comment + female_comment
    random.shuffle(gender_comment)

    # Build the vocabulary from every word seen in the comments.
    list_of_words = set()
    for data in gender_comment:
        list_of_words.update(data[1].split(" "))
    list_of_words = list(list_of_words)
    word_count = len(list_of_words)
    if args.cache:
        cache.cache_list_of_words(list_of_words)
    print("Total of {} words found\n".format(word_count))

    data = coo_matrix((1, 1))
    label = []
    total = len(gender_comment)
    start_progress("Processing {} raw gender-comment data".format(total))
    for i, j in enumerate(gender_comment):
        # Label for female = 0, and male = 1
        if j[0] == "female":
            label.append(0)
        else:
            label.append(1)
        # Per-comment word counts.
        wc = {}
        for word in j[1].split():
            wc[word] = wc.get(word, 0) + 1
        # Dense bag-of-words row aligned with list_of_words.
        d = [wc.get(w, 0) for w in list_of_words]
        if i == 0:
            data = coo_matrix(d)
        else:
            data = vstack((data, coo_matrix(d)))
        progress((i + 1) / total * 100)
    end_progress()

    if args.cache:
        cache.cache_data_and_label(data, label, word_count)
    run_tests(data, label, total, args.split, args.algorithm, args.n_estimator)
    print("Elapsed time: {0:.2f}s".format(time.time() - start_time))

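# vstack((data, coo_matrix(d))) above copies the whole accumulated matrix on
# every iteration, so building the matrix row-by-row is quadratic in the
# number of comments. A self-contained sketch of the usual alternative,
# collecting the rows first and stacking once (toy vocabulary and comments,
# not the real data):
from scipy.sparse import coo_matrix, vstack

vocab = ["hello", "world", "bro"]
comments = ["hello bro", "hello world world"]
rows = []
for text in comments:
    counts = {}
    for word in text.split():
        counts[word] = counts.get(word, 0) + 1
    rows.append(coo_matrix([[counts.get(w, 0) for w in vocab]]))
bow = vstack(rows)  # sparse matrix of shape (len(comments), len(vocab))
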
def main(args): print("Running Naive-Bayes Classifier\n") print("Reading blacklist words file\n") load_blacklist_words("../data/blacklist.txt") if args.model != "": print("Loading model file: {}\n".format(args.model)) classifier = cache.load_pickle(args.model) else: filenames_male = glob.glob("../data/raw_comments/male/*.json") filenames_female = glob.glob("../data/raw_comments/female/*.json") shuffle(filenames_male) shuffle(filenames_female) male_data = [] male_user = len(filenames_male) start_progress("Reading {} male user(s) data".format(male_user)) for index, filename in enumerate(filenames_male): progress((index + 1) / male_user * 100) read_file(filename, male_data) end_progress() female_data = [] female_user = len(filenames_female) start_progress("Reading {} female user(s) data".format(female_user)) for index, filename in enumerate(filenames_female): progress((index + 1) / female_user * 100) read_file(filename, female_data) end_progress() female_ratio = 1.0 - args.male_female_ratio female_count = int(len(female_data)) male_count = int(len(male_data)) total_data = male_count + female_count print( "Loaded {} male(s) and {} female(s) comment data, total of {} comment(s)" .format(male_count, female_count, total_data)) if args.limit != -1: female_count = int(args.limit * female_ratio) male_count = int(args.limit * args.male_female_ratio) if male_count < len(male_data): del male_data[male_count:] if female_count < len(female_data): del female_data[female_count:] print( "Limiting number of comments: {}, {} male(s) and {} female(s)". format(args.limit, len(male_data), len(female_data))) global data data = male_data data.extend(female_data) shuffle(data) print("\nFinished reading data") print("Total number of user: "******"Total number of comments: " + str(len(data)) + "\n") classifier = naive_bayes(args.cache_model) nb_classify(classifier)