Example #1
def prune_tree(tree_node, valid_set):
    # Assumes module-level imports: copy, plus the project's setup, predict
    # and assess modules and the find_potential_prunes helper.
    # Separate the labels from the validation features.
    valid_answers = valid_set[setup.label]
    valid_answers.reset_index(drop=True, inplace=True)
    valid_set = valid_set.drop([setup.label], axis=1)

    prune_list = find_potential_prunes(tree_node, [])

    # Baseline accuracy of the unpruned tree on the validation set.
    valid_prediction_list = []
    for index, row in valid_set.iterrows():
        valid_prediction_list.append(predict.predict_label(tree_node, row))
    previous_score = assess.set_score(valid_prediction_list, valid_answers)

    for prune in prune_list:
        # Keep a copy so the node can be restored if pruning hurts accuracy.
        temp_node = copy.deepcopy(prune)
        prune['attr'] = setup.label
        prune['left'] = None
        prune['right'] = None
        prune['leaf'] = True

        # Score the tree with this node collapsed to its left child's value.
        prune['value'] = temp_node['left']['value']
        valid_prediction_list = []
        for index, row in valid_set.iterrows():
            valid_prediction_list.append(predict.predict_label(tree_node, row))
        left_score = assess.set_score(valid_prediction_list, valid_answers)

        # Score the tree with this node collapsed to its right child's value.
        prune['value'] = temp_node['right']['value']
        valid_prediction_list = []
        for index, row in valid_set.iterrows():
            valid_prediction_list.append(predict.predict_label(tree_node, row))
        right_score = assess.set_score(valid_prediction_list, valid_answers)

        if left_score >= previous_score or right_score >= previous_score:
            # Keep the prune, using whichever child's value scored better
            # (the right child's value is already in place).
            if left_score > right_score:
                prune['value'] = temp_node['left']['value']
        else:
            # Prune denied: restore the node. Rebinding the loop variable would
            # not touch the tree, so the dict is updated in place instead.
            prune.clear()
            prune.update(temp_node)

    return tree_node
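
The pruning routine above treats each tree node as a nested dict; this is a minimal sketch of the layout it appears to assume (the key names come from the code, the attribute names and values are purely illustrative):

# Hypothetical node structure consistent with how prune_tree reads and writes
# nodes; the attribute names and values here are made up for illustration.
example_node = {
    'attr': 'feature_3',   # attribute the node splits on
    'value': 1.5,          # split threshold, or the predicted label at a leaf
    'leaf': False,         # set to True once the node is collapsed
    'left':  {'attr': 'label', 'value': 0, 'leaf': True, 'left': None, 'right': None},
    'right': {'attr': 'label', 'value': 1, 'leaf': True, 'left': None, 'right': None},
}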
Example #2
def evaluate_unpruned(train_df, n_partitions):
    # n-fold cross-validation of the unpruned decision tree.
    # Assumes module-level imports: setup, predict, assess, build_tree,
    # and a colored() helper (e.g. from termcolor).
    print('\n')
    part_len = len(train_df) // n_partitions  # rows per fold
    depths_total = 0
    scores_total = 0
    for i in range(n_partitions):
        # Hold out one fold for testing and train on the rest.
        test_begin = i * part_len
        test_end = test_begin + part_len
        test_part = train_df.iloc[test_begin:test_end, :]
        train_part = train_df.drop(train_df.index[test_begin:test_end])
        train_part.reset_index(drop=True, inplace=True)

        answers = test_part[setup.label]
        answers.reset_index(drop=True, inplace=True)
        answers = answers.tolist()
        test_part = test_part.drop([setup.label], axis=1)

        tree_info = build_tree.decision_tree_learning(train_part, 0)
        depths_total += tree_info[1]

        prediction_list = []
        for index, row in test_part.iterrows():
            prediction_list.append(predict.predict_label(tree_info[0], row))
        score = assess.set_score(prediction_list, answers)
        scores_total += score
    print(
        colored(('\n' * 5) + 'the average depth is ' +
                str(depths_total / n_partitions), 'red'))
    print(
        colored(
            'the average score for unpruned tree is ' +
            str(scores_total / n_partitions) + '%', 'red'))
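
A hedged driver for the cross-validation above; the CSV file name is an assumption, and only the requirement that the DataFrame contain the column named by setup.label comes from the code:

# Hypothetical usage; the file name is illustrative, and the DataFrame must
# contain the label column referenced by setup.label.
import pandas as pd

train_df = pd.read_csv('training_data.csv')
evaluate_unpruned(train_df, 10)  # 10-fold cross-validation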
Example #3
def getName():
    # Flask view: expects a JSON body of the form {"text": "..."} and returns,
    # for each word, whether the model predicts it is a name.
    text = json.loads(request.data)['text']
    tkn_txt = preprocess.tokenize_text(text)
    tkn_txt = [token for token in tkn_txt if token != ""]  # drop empty tokens
    X = [preprocess.extract_features(preprocess.pos_tagger([tkn_txt])[0])]

    pred = predict.predict_label(X)
    inputArray = text.split(' ')
    respObj = {
        "data": [{
            'word': w,
            'isName': p
        } for w, p in zip(inputArray, pred[0])]
    }

    return jsonify(respObj)
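
A hedged client-side sketch for the view above; the route path and port are assumptions (the route decorator is not part of the snippet), only the {"text": ...} payload shape comes from the code:

# Hypothetical request against the endpoint above; the URL is illustrative.
import requests

resp = requests.post('http://localhost:5000/getName',
                     json={'text': 'Alice met Bob in Paris'})
print(resp.json())  # {"data": [{"word": "Alice", "isName": ...}, ...]}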
Example #4
def getImage():
    # Flask view: reads an uploaded image from the 'img' form field, decodes it
    # with OpenCV and returns the predicted label as JSON.
    try:
        if 'img' in request.files:
            fileObj = request.files['img']
            in_memory_file = io.BytesIO()
            fileObj.save(in_memory_file)
            imgObj = np.frombuffer(in_memory_file.getvalue(), dtype=np.uint8)
            color_image_flag = 1  # decode as a 3-channel colour image
            imgArr = cv2.imdecode(imgObj, color_image_flag)
            imgLabel = str((predict.predict_label(imgArr))[0])

            ret = {'Label': imgLabel, 'status': 1, 'msg': 'success'}
            return jsonify(ret)
        else:
            ret = {'Label': None, 'status': 0, 'msg': 'wrong header name'}
            return jsonify(ret)
    except Exception as e:
        err = str(e)
        ret = {'Label': None, 'status': 0, 'msg': err}
        return jsonify(ret)
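
A hedged client sketch for the upload endpoint above; the route path and file name are assumptions (the decorator is not included in the snippet), only the 'img' field name is taken from the code:

# Hypothetical multipart upload; URL and image file are illustrative.
import requests

with open('sample.jpg', 'rb') as fh:
    resp = requests.post('http://localhost:5000/getImage', files={'img': fh})
print(resp.json())  # e.g. {"Label": "...", "status": 1, "msg": "success"}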
Example #5
def evaluate_pruned(train_df, n_partitions):
    # n-fold cross-validation of the pruned decision tree. Each fold uses the
    # preceding fold as a validation set for pruning, wrapping around to the
    # last fold when testing on the first one.
    # Assumes module-level imports: setup, predict, assess, build_tree,
    # pruning, and a colored() helper (e.g. from termcolor).
    print('\n')
    part_len = len(train_df) // n_partitions  # rows per fold
    depths_total = 0
    scores_total = 0
    for i in range(n_partitions):
        test_begin = i * part_len
        test_end = test_begin + part_len
        test_part = train_df.iloc[test_begin:test_end, :]
        train_part = train_df.drop(train_df.index[test_begin:test_end])

        # Take the fold preceding the test fold for validation (wrapping to
        # the last rows on the first iteration) and remove it from training.
        if (test_begin - part_len) < 0:
            valid_part = train_df.iloc[-part_len:, :]
            train_part = train_part.drop(train_part.index[-part_len:])
        else:
            valid_part = train_df.iloc[test_begin - part_len:test_begin, :]
            train_part = train_part.drop(
                train_part.index[test_begin - part_len:test_begin])
        train_part.reset_index(drop=True, inplace=True)

        tree_info = build_tree.decision_tree_learning(train_part, 0)
        depths_total += tree_info[1]
        tree_info = list(tree_info)
        tree_info[0] = pruning.prune_tree(tree_info[0], valid_part)

        # Score the pruned tree on the held-out test fold.
        test_answers = test_part[setup.label]
        test_answers.reset_index(drop=True, inplace=True)
        test_answers = test_answers.tolist()
        test_part = test_part.drop([setup.label], axis=1)
        test_prediction_list = []
        for index, row in test_part.iterrows():
            test_prediction_list.append(predict.predict_label(tree_info[0], row))
        score = assess.set_score(test_prediction_list, test_answers)
        scores_total += score
    print(
        colored(('\n' * 5) + 'the average depth is ' +
                str(depths_total / n_partitions), 'red'))
    print(
        colored(
            'the average score for pruned tree is ' +
            str(scores_total / n_partitions) + '%', 'red'))
Example #6
    # Tail of a bootstrapping pipeline; the boot object, whole_unlabeled_data,
    # batch_size and train_with_hierarchy are defined earlier in the original
    # script and are not part of this snippet.
    boot.first_filter_threshold = 1
    boot.second_filter_threshold = 1.7

    for i in range(len(whole_unlabeled_data) // batch_size + 1):
        unlabeled_data = whole_unlabeled_data[i * batch_size:(i + 1) *
                                              batch_size]
        train_with_hierarchy(boot, unlabeled_data)

    # Persist the bootstrapped examples and their human-readable labels.
    train_label = []
    with open('./bootstrapped_data/data.txt', 'w') as f:
        for sentence in boot.best_score_X:
            f.write(sentence + '\n')
    with open('./bootstrapped_data/label.txt', 'w') as f:
        for label in boot.best_score_Y:
            f.write(boot.inv_target_dict[int(label)] + '\n')
            train_label.append(boot.inv_target_dict[int(label)])

    del whole_unlabeled_data

    unlabeled_data_for_labeling = pre.load_unlabeled_data(
        '../data/unlabeled_data/donga_pos_unlabeled_data.txt',
        dup=True,
        shuffle=False)
    # Remove any stale results file before re-labelling.
    if os.path.isfile('./results/DailyLife_MachineLabeled_SG_SVM.txt'):
        os.remove('./results/DailyLife_MachineLabeled_SG_SVM.txt')

    # Label the remaining unlabeled data in chunks of 50,000.
    for i in range(len(unlabeled_data_for_labeling) // 50000 + 1):
        predict.predict_label(
            boot.best_score_X, train_label,
            unlabeled_data_for_labeling[i * 50000:(i + 1) * 50000])