Example #1
def classification(neuro_obj = None, epoch = 100000, num_class = 10):

    print 'initialize Neural Network.'
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet(
            [28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.01
        )  # mnist classification
    # alternative learning rate:
    #     nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.15)

    print "dump obj..."
    dp.obj_dump(nn_obj, './default-classification.pkl')

    print 'read training data and label.'
    training_data = dp.obj_load_gzip('../../mnist/mnist-training_all.pkl.gz')

    print 'data      size : ', len(training_data)
    print 'label     size : ', len(training_data)  # each entry is a (data, label) pair, so the counts match

    data_num = len(training_data)

    print '--start--'
    print '@@ Learn Character Recognition @@'
    for j in range(0, epoch):
        prg.show_progressxxx(j+1, epoch)

        # online learning: pick one random training example per step
        i = np.random.randint(data_num)
        nn_obj.learn(training_data[i][0], training_data[i][1])

    prg.end_progress()

    print "dump obj..."
    dp.obj_dump(nn_obj, './learn-classification.pkl')
    
    return nn_obj
Example #2
def naive_bayes(cache_model):
    print("Running Naive-Bayes Classifier Training")

    # `data`, `list_of_gender`, `list_of_words` and `train_data_gender`
    # are module-level globals populated elsewhere.
    idx = 0
    total = len(data)
    start_progress("Pre-processing {} of data".format(total))
    for gender, comment in data:
        idx += 1
        word_exist = {}
        word_not_exist = {}

        if gender not in list_of_gender:
            list_of_gender.append(gender)

        for word in word_tokenize(comment):
            word_exist[word] = True
            word_not_exist[word] = False

            if word not in list_of_words:
                list_of_words.append(word)

        for gen in list_of_gender:
            if gen == gender:
                train_data_gender.append((word_exist, gen))
            else:
                train_data_gender.append((word_not_exist, gen))
        progress(idx / total * 100)
    end_progress()
    print("\nFinished pre-processing ({} data)".format(total))

    print("Training {} gender data".format(total))
    main_gender_classifier = NaiveBayesClassifier.train(train_data_gender)

    if cache_model:
        cache.cache_model(main_gender_classifier,
                          "model/gender_classifier_{}.p".format(total))

    print("Cross validation")
    average_accuracy = 0
    size = len(train_data_gender)

    for i in range(1, 9):  # 8-fold cross-validation
        fold_start = round((i - 1) * size / 8)
        fold_end = round(i * size / 8)
        test_set = train_data_gender[fold_start:fold_end]
        training_set = train_data_gender[:fold_start]
        training_set.extend(train_data_gender[fold_end:])

        gender_classifier = NaiveBayesClassifier.train(training_set)

        print("Test-{0}: {1:.2%}".format(
            i, classify.accuracy(gender_classifier, test_set)))
        average_accuracy += classify.accuracy(gender_classifier, test_set)
    average_accuracy /= 8

    print("Average accuracy: " + "{0:.2%}\n".format(average_accuracy))

    return main_gender_classifier
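NaiveBayesClassifier here is NLTK's, which trains on (feature-dict, label) pairs shaped like the word_exist/word_not_exist dicts built above. A minimal self-contained Python 3 sketch of that data shape and the train/accuracy calls, with made-up toy features:

from nltk.classify import NaiveBayesClassifier, accuracy

# toy (feature-dict, label) pairs in the same shape as train_data_gender
train = [
    ({"dress": True, "shoes": True}, "female"),
    ({"hello": True, "dress": True}, "female"),
    ({"football": True, "game": True}, "male"),
    ({"hello": True, "football": True}, "male"),
]
test = [({"dress": True}, "female"), ({"game": True}, "male")]

classifier = NaiveBayesClassifier.train(train)
print("accuracy: {0:.2%}".format(accuracy(classifier, test)))
classifier.show_most_informative_features(3)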
Example #3
def classification(neuro_obj=None, epoch=100000, minibatch_size=1):
    num_class = 10

    print "initialize Neural Network."
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet(
            [28 * 28, 1000, num_class], ["sigmoid", "softmax"], 0.15
        )  # mnist classification
    # alternative learning rate:
    #     nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.01)

    # use weight decay.
    #    nn_obj.use_weight_decay(0.01)   # unlearnable
    #    nn_obj.use_weight_decay(0.001)  # learnable
    nn_obj.use_weight_decay(0.0001)  # learnable
    #    nn_obj.unuse_weight_decay()
    # use momentum.
    #    nn_obj.use_momentum(0.1)
    nn_obj.use_momentum(0.5)
    #    nn_obj.use_momentum(0.9)
    #    nn_obj.unuse_momentum()

    print "dump obj..."
    dp.obj_dump(nn_obj, "./default-classification-batch.pkl")

    print "read training data and label."
    training_data = dp.obj_load_gzip("../../mnist/mnist-training_all.pkl.gz")

    print "data      size : ", len(training_data)
    print "label     size : ", len(training_data)
    print "minibatch size : ", minibatch_size

    data_num = len(training_data)

    print "--start--"
    print "@@ Learn Character Recognition @@"
    mb_size = minibatch_size
    for i in range(0, epoch):
        prg.show_progressxxx(i + 1, epoch)

        # draw a random minibatch directly; sampling mb_size items without
        # replacement is equivalent to shuffling the whole set and taking
        # the first mb_size, and avoids reshuffling all of it each step
        batch = random.sample(training_data, mb_size)
        x = [item[0] for item in batch]
        d = [item[1] for item in batch]

        nn_obj.batch_learn(x, d, mb_size)
    prg.end_progress()

    print "dump obj..."
    dp.obj_dump(nn_obj, "./learn-classification-batch.pkl")

    return nn_obj
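random.sample draws without replacement, so asking it for mb_size items directly yields the same minibatch distribution as shuffling the full dataset and slicing, at a fraction of the cost. A small self-contained Python 3 sketch of the pattern, with a made-up toy dataset:

import random

# toy (input, target) pairs standing in for the MNIST training set
training_data = [([k, k + 1], k % 10) for k in range(1000)]

mb_size = 32
batch = random.sample(training_data, mb_size)  # without replacement
x = [item[0] for item in batch]
d = [item[1] for item in batch]
print(len(x), len(d))  # 32 32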
Example #4
def classification(neuro_obj=None, epoch=100000, num_class=10):

    print "initialize Neural Network."
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet(
            [28 * 28, 1000, num_class], ["sigmoid", "softmax"], 0.01
        )  # mnist classification
    # alternative learning rate:
    #     nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.15)

    # use weight decay.
    #    nn_obj.use_weight_decay(0.01)   #
    #    nn_obj.use_weight_decay(0.001)  #
    #    nn_obj.use_weight_decay(0.0001) # 20.02% error, with momentum 0.5.
    nn_obj.use_weight_decay(0.00001)  #  8.45% error, with momentum 0.5.
    #    nn_obj.unuse_weight_decay()
    # use momentum.
    #    nn_obj.use_momentum(0.1)
    nn_obj.use_momentum(0.5)
    #    nn_obj.use_momentum(0.9)
    #    nn_obj.unuse_momentum()

    print "dump obj..."
    dp.obj_dump(nn_obj, "./default-classification.pkl")

    print "read training data and label."
    training_data = dp.obj_load_gzip("../../mnist/mnist-training_all.pkl.gz")

    print "data      size : ", len(training_data)
    print "label     size : ", len(training_data)

    data_num = len(training_data)

    print "--start--"
    print "@@ Learn Character Recognition @@"
    for j in range(0, epoch):
        prg.show_progressxxx(j + 1, epoch)

        # online learning: pick one random training example per step
        i = np.random.randint(data_num)
        nn_obj.learn(training_data[i][0], training_data[i][1])

    prg.end_progress()

    print "dump obj..."
    dp.obj_dump(nn_obj, "./learn-classification.pkl")

    return nn_obj
Example #5
def test_classification():
    num_class = 10

    print 'initialize Neural Network.'
    nn_obj = dp.obj_load('./learn-classification-batch.pkl')

    print 'read test data.'
    test_data  = dp.obj_load_gzip('../../mnist/mnist-test_data.pkl.gz')

    print 'read test label.'
    test_label = dp.obj_load_gzip('../../mnist/mnist-test_label.pkl.gz')
 
    print 'data  size : ', len(test_data)
    print 'label size : ', len(test_label)

    data_num = len(test_data)
    prediction_error = []

    print '--start--'
    print '@@ Test Character Recognition @@'
    for j in range(0, data_num):
        prg.show_progressxxx(j+1, data_num)

        num_recog, list_recog = nn_obj.test(test_data[j], test_label[j])

        # labels are one-hot, so a 0 at the predicted index marks a misclassification
        if test_label[j][num_recog] == 0:
            prediction_error.append((test_label[j], num_recog))

    prg.end_progress()

    count = len(prediction_error)

    for item in prediction_error:
        print "Truth:", np.argmax(item[0]), ", --> But predict:", item[1]

    print ''
    print 'error : ', count, '/', data_num, '(', count * 1.0 / data_num * 100.0, '%)'

    return
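The error test above leans on the labels being one-hot vectors: indexing the label with the predicted class and finding 0 there means the prediction missed the hot position. A minimal Python 3 numpy sketch of that check, with made-up values:

import numpy as np

label = np.zeros(10)
label[3] = 1.0   # one-hot ground truth: class 3
predicted = 5    # hypothetical classifier output index

if label[predicted] == 0:
    print("error: truth", np.argmax(label), "but predicted", predicted)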
Example #6
def classification(neuro_obj = None, epoch = 100000, minibatch_size = 1):
    num_class = 10

    print 'initialize Neural Network.'
    if neuro_obj:
        nn_obj = neuro_obj
    else:
        nn_obj = mln.Mln().make_neuralnet(
            [28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.15
        )  # mnist classification
    # alternative learning rate:
    #     nn_obj = mln.Mln().make_neuralnet([28*28, 1000, num_class], ['sigmoid', 'softmax'], 0.01)

    print "dump obj..."
    dp.obj_dump(nn_obj, './default-classification-batch.pkl')

    print 'read training data and label.'
    training_data = dp.obj_load_gzip('../../mnist/mnist-training_all.pkl.gz')

    print 'data      size : ', len(training_data)
    print 'label     size : ', len(training_data)  # each entry is a (data, label) pair, so the counts match
    print "minibatch size : ", minibatch_size

    data_num = len(training_data)

    print '--start--'
    print '@@ Learn Character Recognition @@'
    mb_size = minibatch_size
    for i in range(0, epoch):
        prg.show_progressxxx(i+1, epoch)

        # draw a random minibatch directly; sampling mb_size items without
        # replacement is equivalent to shuffling the whole set and taking
        # the first mb_size, and avoids reshuffling all of it each step
        batch = random.sample(training_data, mb_size)
        x = [item[0] for item in batch]
        d = [item[1] for item in batch]

        nn_obj.batch_learn(x, d, mb_size)
    prg.end_progress()

    print "dump obj..."
    dp.obj_dump(nn_obj, './learn-classification-batch.pkl')
    
    return nn_obj
Example #7
    # IDX header bytes 12-16: column count, big-endian
    col_of_item = 0
    for item in allLines[12:16]: col_of_item = (col_of_item << 8 | ord(item))

    for i in range(0, num_of_item):
        prg.show_progress(i, num_of_item-1)
        offset = 16
        min_coordinate = i    *row_of_item*col_of_item+offset
        max_coordinate = (i+1)*row_of_item*col_of_item+offset
        obj_dump(allLines[min_coordinate:max_coordinate], fname + str(i) + '.dump')

        if show:
            # render the image as ASCII art: '#' for any non-zero pixel byte
            for j in range(0, row_of_item*col_of_item):
                if  ord(allLines[j+i*row_of_item*col_of_item+16]) > 0: print '#',
                else                                                 : print ' ',
                if j%row_of_item == 0                                : print ''
    prg.end_progress()

    
#
# return the n-th image from the file
#
def get_image_n(fname, n):
    # read the whole IDX file and parse the big-endian header fields
    with open(fname, 'rb') as file: allLines = file.read()
    num_of_item = 0
    for item in allLines[ 4: 8]: num_of_item = (num_of_item << 8 | ord(item))
    row_of_item = 0
    for item in allLines[ 8:12]: row_of_item = (row_of_item << 8 | ord(item))
    col_of_item = 0
    for item in allLines[12:16]: col_of_item = (col_of_item << 8 | ord(item))
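The byte-shifting loops above hand-roll big-endian integer parsing of the MNIST IDX image header, whose documented layout is four unsigned 32-bit big-endian fields: magic number, item count, rows, columns. A minimal Python 3 sketch of the same header parse using the standard struct module; the filename is a placeholder:

import struct

def read_idx_image_header(fname):
    # IDX image header: magic, item count, rows, cols (big-endian uint32 each)
    with open(fname, 'rb') as f:
        magic, num, rows, cols = struct.unpack('>IIII', f.read(16))
    return num, rows, cols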
Example #8
def main(args):
    start_time = time.time()
    print("Running AdaBoost Classifier")

    print("Reading blacklist words file")
    load_blacklist_words("../data/blacklist.txt")

    print("Reading raw gender-comment data")
    with open("../data/male-comments.json", "r") as f:
        male_comment = json.load(f)
    with open("../data/female-comments.json", "r") as f:
        female_comment = json.load(f)

    # Lower case all comments
    male_comment = [[x[0], x[1].lower()] for x in male_comment]
    female_comment = [[x[0], x[1].lower()] for x in female_comment]

    # Filter blacklisted words in comments
    male_comment = [[x[0], x[1]] for x in male_comment
                    if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))]
    female_comment = [[x[0], x[1]] for x in female_comment
                      if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))]

    random.shuffle(male_comment)
    random.shuffle(female_comment)
    print("Loaded {} male and {} female comments".format(
        len(male_comment), len(female_comment)))

    female_ratio = 1.0 - args.male_female_ratio
    if args.limit != -1:
        print(
            "Limiting male and female comments to {} male and {} female ({} total)"
            .format(int(args.limit * args.male_female_ratio),
                    int(args.limit * female_ratio), args.limit))
        if (len(male_comment) < int(args.limit * args.male_female_ratio)
                or len(female_comment) < int(args.limit * female_ratio)):
            print("Not enough male/female comments data")
            sys.exit(1)
        # deleting a slice past the end would be a no-op, so no try/except is needed
        del male_comment[int(args.limit * args.male_female_ratio):]
        del female_comment[int(args.limit * female_ratio):]

    # comments were already lowercased above, so just merge and shuffle
    gender_comment = male_comment + female_comment
    random.shuffle(gender_comment)

    list_of_words = set()
    for data in gender_comment:
        list_of_words.update(data[1].split(" "))
    list_of_words = list(list_of_words)
    word_count = len(list_of_words)

    if args.cache:
        cache.cache_list_of_words(list_of_words)

    print("Total of {} words found\n".format(word_count))

    data = coo_matrix((1, 1))
    label = []
    total = len(gender_comment)
    start_progress("Processing {} raw gender-comment data".format(total))
    for i, j in enumerate(gender_comment):
        if j[0] == "female":  # Label for female = 0, and male = 1
            label.append(0)
        else:
            label.append(1)

        wc = {}
        for word in j[1].split():
            wc[word] = wc.get(word, 0) + 1

        # dense bag-of-words row aligned with list_of_words
        d = [wc.get(word, 0) for word in list_of_words]

        if i == 0:
            data = coo_matrix(d)
        else:
            data = vstack((data, coo_matrix(d)))

        progress((i + 1) / total * 100)
    end_progress()

    if args.cache:
        cache.cache_data_and_label(data, label, word_count)

    run_tests(data, label, total, args.split, args.algorithm, args.n_estimator)

    print("Elapsed time: {0:.2f}s".format(time.time() - start_time))
Example #9
def main(args):
    print("Running Naive-Bayes Classifier\n")

    print("Reading blacklist words file\n")
    load_blacklist_words("../data/blacklist.txt")

    if args.model != "":
        print("Loading model file: {}\n".format(args.model))
        classifier = cache.load_pickle(args.model)
    else:
        filenames_male = glob.glob("../data/raw_comments/male/*.json")
        filenames_female = glob.glob("../data/raw_comments/female/*.json")
        shuffle(filenames_male)
        shuffle(filenames_female)

        male_data = []
        male_user = len(filenames_male)
        start_progress("Reading {} male user(s) data".format(male_user))
        for index, filename in enumerate(filenames_male):
            progress((index + 1) / male_user * 100)
            read_file(filename, male_data)
        end_progress()

        female_data = []
        female_user = len(filenames_female)
        start_progress("Reading {} female user(s) data".format(female_user))
        for index, filename in enumerate(filenames_female):
            progress((index + 1) / female_user * 100)
            read_file(filename, female_data)
        end_progress()

        female_ratio = 1.0 - args.male_female_ratio
        female_count = len(female_data)
        male_count = len(male_data)
        total_data = male_count + female_count
        print(
            "Loaded {} male(s) and {} female(s) comment data, total of {} comment(s)"
            .format(male_count, female_count, total_data))
        if args.limit != -1:
            female_count = int(args.limit * female_ratio)
            male_count = int(args.limit * args.male_female_ratio)
            if male_count < len(male_data):
                del male_data[male_count:]
            if female_count < len(female_data):
                del female_data[female_count:]
            print(
                "Limiting number of comments: {}, {} male(s) and {} female(s)".
                format(args.limit, len(male_data), len(female_data)))

        global data
        data = male_data
        data.extend(female_data)
        shuffle(data)

        print("\nFinished reading data")
        print("Total number of user: "******"Total number of comments: " + str(len(data)) + "\n")

        classifier = naive_bayes(args.cache_model)

    nb_classify(classifier)