Example #1
def get_predicting_data(trainfrom, testat):
    h, content = toolkit.load_csv('db4school', trainfrom, True)
    contentT = map(list, zip(*content))  # transpose to column-major order
    y_train = contentT[-1]  # the class label is the last column
    content = list()
    for attr, col in zip(h, contentT):
        if attr in settings.record_attrs:  # keep only the recorded attributes
            content.append(col)
    x_train = map(list, zip(*content))  # transpose back to row-major order

    h, content = toolkit.load_csv('db4school', testat, True)
    contentT = map(list, zip(*content))
    y_test = contentT[-1]
    content = list()
    for attr, col in zip(h, contentT):
        if attr in settings.record_attrs:
            content.append(col)
    x_test = map(list, zip(*content))

    x_test = map(lambda r: map(toolkit.str2num, r), x_test)   # convert strings to numbers
    x_train = map(lambda r: map(toolkit.str2num, r), x_train)
    y_test = map(toolkit.str2num, y_test)
    y_train = map(toolkit.str2num, y_train)

    return x_train, y_train, x_test, y_test
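
A hedged usage sketch for the splitter above; scikit-learn, the DecisionTreeRegressor choice, and the 'school0'/'school1' file names are assumptions for illustration, not taken from the example:

from sklearn.tree import DecisionTreeRegressor

# 'school0' and 'school1' are placeholder csv names under 'db4school'
x_train, y_train, x_test, y_test = get_predicting_data('school0', 'school1')
learner = DecisionTreeRegressor().fit(x_train, y_train)
predictions = learner.predict(x_test)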
Example #2
def CLIFF(model,
          db_folder,
          write_out_folder=None):
    """
    Core function for CLIFF algorithm
    prune the data set according to the power
    attributes are discretized

    :param model: should be a csv file containing the original database
    :param db_folder: the folder name of db
    :param write_out_folder: where to write out the generated data base into "write_out_folder/model.csv"
    :return: the CLIFFED database
    """
    ori_attrs, alldata = toolkit.load_csv(db_folder, model)  # load the database
    record_attrs = settings.record_attrs

    alldataT = map(list, zip(*alldata))
    valued_dataT = list()
    for attr, col in zip(ori_attrs, alldataT):
        if attr in record_attrs:
            valued_dataT.append(col)
    valued_dataT.append(alldataT[-1])  # don't drop the classification (last) column

    alldata = map(list, zip(*valued_dataT))
    alldata = map(lambda row: map(toolkit.str2num, row), alldata)  # convert strings to numbers across the 2-D table

    after_cliff = cliff_core(alldata)
    after_cliff.insert(0, record_attrs+[ori_attrs[-1]])  # add the header

    if write_out_folder:
        toolkit.write_csv(write_out_folder, model, after_cliff)

    return after_cliff
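
A minimal usage sketch, reusing the 'db4school' folder from Example #1; 'school0' and the 'CliffOut' output folder are placeholder names, not confirmed by the example:

# prune the 'school0' table in 'db4school' and write the result under 'CliffOut'
pruned = CLIFF('school0', 'db4school', write_out_folder='CliffOut')
print(len(pruned) - 1)  # rows that survived the prune (first row is the header)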
Example #3
def module2():
    print('this is the second module in this file.')
    head, content = toolkit.load_csv(settings.project_path+'/Reports', 'PREDICTION_report')
    content = map(lambda r: map(toolkit.str2num, r), content)

    lg_rmse = lambda x: x[4]=='linear regression' and x[5]=='RMSE'
    dt_rmse = lambda x: x[4]=='decision tree' and x[5]=='RMSE'

    lace1 = lambda x: x[3]=='Lace1Out'
    lace2 = lambda x: x[3]=='Lace2Out'
    org   = lambda x: x[3]=='NoHandle'

    v = list()
    for clf in [lg_rmse, dt_rmse]:
        for alg in [org, lace1, lace2]:
            selected = filter(lambda x: clf(x) and alg(x), content)  # rows matching both predicates
            v.append(zip(*selected)[-1])

    plt.clf()
    fig = plt.figure(1)
    fig.set_size_inches(7, 5)
    ax = fig.add_subplot(111)
    box = ax.boxplot(v)

    plt.xticks(range(1, 7), ['org', 'lace1', 'lace2']*2)

    ax.axvspan(0, 3.5, alpha=0.3, color='gray')

    ax.text(1, 6, 'Linear regression', fontsize=10)
    ax.text(4, 6, 'Decision tree', fontsize=10)

    ax.set_ylim([0, 10])
    ax.set_title('RMSE for predicting at the whole schoolcard set')
    fig.savefig('school.png', bbox_inches='tight')
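
The corrected filter in the loop above combines two row predicates; a small helper (a sketch, not part of the original module) makes that pattern reusable:

def both(p, q):
    # build a predicate that holds only when p and q both hold for a row
    return lambda x: p(x) and q(x)

# e.g. selected = filter(both(dt_rmse, lace2), content)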
Example #4
def load_csv_within_region(folder, model, region):
    header, content = toolkit.load_csv(folder, model, has_header=True)
    assert 'STABBR' in header, 'please make sure the region info (STABBR) is in the database'
    abbr_at = header.index('STABBR')

    content = [i for i in content if i[abbr_at] in REGIONS[region]]

    return header, content
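
load_csv_within_region relies on a module-level REGIONS lookup that is not shown here. A hypothetical shape, with the region keys taken from the xtick labels in Example #8 and purely illustrative state lists:

# region name -> set of state abbreviations (STABBR values)
REGIONS = {
    'NE': {'NY', 'MA', 'CT'},  # illustrative subsets only, not the project's actual lists
    'NW': {'WA', 'OR', 'ID'},
    'S':  {'TX', 'FL', 'GA'},
    'W':  {'CA', 'NV', 'AZ'},
}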
Example #5
def get_moprhed_train(source_folder, model):
    # type: (str, str) -> list, list
    _, all_trains = toolkit.load_csv(source_folder, model)
    all_trains = map(lambda r: map(toolkit.str2num, r), all_trains)  # convert strings to numbers where possible

    x = [row[:-1] for row in all_trains]
    y = [row[-1] for row in all_trains]

    return x, y
Example #6
def get_test(model):
    ori_attrs, all_trains = toolkit.load_csv('TestSet', model)
    all_trains = map(lambda r: map(toolkit.str2num, r), all_trains)  # convert strings to numbers where possible

    trainsT = map(list, zip(*all_trains))
    tmp_trainsT = list()
    for oa, col in zip(ori_attrs, trainsT):
        if oa in settings.record_attrs:
            tmp_trainsT.append(col)
    tmp_trainsT.append(trainsT[-1])

    all_trains = map(list, zip(*tmp_trainsT))

    x = [row[:-1] for row in all_trains]
    y = [row[-1] for row in all_trains]

    return x, y
Example #7
def apriori_cmpr(model, org_folder, ptz_folder):
    """
    Note:
    ignore the class attribute. just focus on the independent attributes
    :param model:
    :param org_folder:
    :param ptz_folder:
    :return:
    """
    # load the data sets
    org_attrs, org_data = toolkit.load_csv(org_folder, model)
    org_data = map(lambda r: map(toolkit.str2num, r), org_data)

    ptz_attrs, ptz_data = toolkit.load_csv(ptz_folder, model)
    ptz_data = map(lambda r: map(toolkit.str2num, r), ptz_data)
    ptz_data = toolkit.del_col_in_table(ptz_data, -1)

    # delete the useless columns
    attributes = settings.record_attrs
    org_dataT = map(list, zip(*org_data))
    org_dataT = [col for col, a1 in zip(org_dataT, org_attrs) if a1 in attributes]
    org_data = map(list, zip(*org_dataT))

    # discretize the data
    # translate the continuous attribute into 'attr+level'

    dis_org_data = []
    dis_ptz_data = []
    # ranges_dict = dict()  # for backup

    for attr_name, col1, col2 in zip(attributes, zip(*org_data), zip(*ptz_data)):
        col1 = list(col1)
        col2 = list(col2)

        col = col1 + col2  # NOTE: put the two data sets together
        ranges = toolkit.binrange(col)
        # ranges_dict[attr_name] = ranges

        tags = []
        for element in col1:
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            # lower_bound = ranges[max(cursor-1, 0)]
            # mid = (upper_bound + lower_bound) / 2
            # if type(mid) is float:
            #     mid = round(mid, 2)
            #
            # tags.append(attr_name+':' + str(mid))
            tags.append(attr_name + ':' + str(cursor))
        dis_org_data.append(tags)

        tags = []
        for element in col2:
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            tags.append(attr_name + ':' + str(cursor))
        dis_ptz_data.append(tags)

    dis_org_data = map(list, zip(*dis_org_data))
    dis_ptz_data = map(list, zip(*dis_ptz_data))

    logging.info("Database discretization done.")

    org_iter = dataset_iter(dis_org_data)
    ptz_iter = dataset_iter(dis_ptz_data)

    items_org, rules_org = runApriori(org_iter, settings.apriori_min_support, settings.apriori_min_confidence)
    items_ptz, rules_ptz = runApriori(ptz_iter, settings.apriori_min_support, settings.apriori_min_confidence)

    return items_org, items_ptz, rules_org, rules_ptz, dis_org_data, dis_ptz_data
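
The 'attr:level' tagging above leans on toolkit.binrange, which (judging from the loop) returns a sorted list of bin upper bounds. A self-contained sketch with a stand-in binning function; the equal-width bins and the 'SAT_AVG' attribute name are assumptions for illustration:

def binrange_stub(col, bins=4):
    # stand-in for toolkit.binrange: equal-width bins, returning each bin's upper bound
    lo, hi = min(col), max(col)
    step = (hi - lo) / float(bins)
    return [lo + step * (i + 1) for i in range(bins)]

col = [1, 5, 9, 2, 7, 3]
ranges = binrange_stub(col)          # [3.0, 5.0, 7.0, 9.0]
tags = []
for element in col:
    for cursor, upper_bound in enumerate(ranges):
        if upper_bound >= element:   # first bin whose upper bound covers the value
            break
    tags.append('SAT_AVG:' + str(cursor))
print(tags)  # ['SAT_AVG:0', 'SAT_AVG:1', 'SAT_AVG:3', 'SAT_AVG:0', 'SAT_AVG:2', 'SAT_AVG:0']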
Example #8
def module1():
    header, content = toolkit.load_csv(settings.project_path+"/db4school", "precision_report")

    content = map(lambda r: map(toolkit.str2num, r), content)

    source_db = lambda x: x[2]
    test_for = lambda x: x[4]

    cases = ['school0', 'school1', 'school2', 'school3']

    lg_rmse = lambda x: x[5]=='linear regression' and x[6]=='RMSE'
    # lg_mae = lambda x: x[5]=='linear regression' and x[6]=='MAE'
    dt_rmse = lambda x: x[5]=='decision tree' and x[6]=='RMSE'
    # dt_mae = lambda x: x[5]=='decision tree' and x[6]=='MAE'

    for case in cases:
        plt.clf()
        fig = plt.figure(1)
        fig.set_size_inches(7, 5)
        match_cases = filter(lambda x: source_db(x) == case, content)
        lace1_cases = filter(lambda x: 'LACE1' in x, match_cases)
        lace2_cases = filter(lambda x: 'LACE2' in x, match_cases)
        v = list()

        for testat in cases:
            # lace1
            selected = filter(lambda x: test_for(x) == testat, lace1_cases)

            lg = filter(lg_rmse, selected)
            dt = filter(dt_rmse, selected)

            v.append(zip(*lg)[-1])
            v.append(zip(*dt)[-1])

            # lace2
            selected = filter(lambda x: test_for(x) == testat, lace2_cases)
            lg = filter(lg_rmse, selected)
            dt = filter(dt_rmse, selected)

            v.append(zip(*lg)[-1])
            v.append(zip(*dt)[-1])

        ax = fig.add_subplot(111)
        box = ax.boxplot(v)

        ax.axvspan(0, 4.5, alpha=0.3, color='gray')
        ax.axvspan(8.5, 12.5, alpha=0.3, color='gray')

        ax.text(1, 6, 'LACE1\nregression', fontsize=10)
        ax.text(5, 6, 'LACE1\ndecision tree', fontsize=10)
        ax.text(9, 6, 'LACE2\nregression', fontsize=10)
        ax.text(13, 6, 'LACE2\ndecision tree', fontsize=10)

        plt.setp(box['boxes'][cases.index(case)], color='red')
        plt.setp(box['boxes'][cases.index(case)+4], color='red')
        plt.setp(box['boxes'][cases.index(case)+8], color='red')
        plt.setp(box['boxes'][cases.index(case)+12], color='red')
        # pdb.set_trace()
        plt.xticks(range(1, 17), ['NE', 'NW', 'S', 'W']*4)
        ax.set_ylim([0, 10])
        ax.set_title('RMSE for prediction from region data.')
        fig.savefig(case+'.png', bbox_inches='tight')