def pickle_static_income(index):
    """Return the NumRange of one CSV column, cached in a pickle file.

    If 'data/informs_income_Static_value.pickle' can be loaded, its content
    is returned unchanged. Otherwise column `index` of
    'data/demographics.csv' is scanned (the header line is skipped), the
    occurrences of every distinct value are counted, the distinct values
    are ordered with `cmp_str`, and the resulting NumRange is written to
    the pickle file before being returned.

    The CSV is only opened when the cache has to be rebuilt.

    NOTE: the cache file name does not depend on `index`, so an existing
    cache is returned whatever column is requested.
    NOTE(security): pickle.load can execute arbitrary code; the cache file
    must come from a trusted source.

    Raises IndexError when a CSV row has no column `index`.
    """
    # Local import keeps the block self-contained; cmp_to_key replaces the
    # Python 2 only `list.sort(cmp=...)` and exists on 2.7 and 3.x alike.
    from functools import cmp_to_key

    pickle_path = 'data/informs_income_Static_value.pickle'
    try:
        with open(pickle_path, 'rb') as static_file:
            return pickle.load(static_file)
    except Exception:
        # Missing, unreadable or corrupt cache: rebuild it from the CSV.
        # (Exception, not a bare except, so Ctrl-C still interrupts.)
        print("Pickle Data...")

    support = {}
    # Plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+,
    # and strip() below removes any stray '\r' anyway.
    with open('data/demographics.csv', 'r') as userfile:
        for i, line in enumerate(userfile):
            if i == 0:
                # ignore first line of csv
                continue
            row = line.strip().split(',')
            value = row[index]
            support[value] = support.get(value, 0) + 1

    sort_value = sorted(support, key=cmp_to_key(cmp_str))
    result = NumRange(sort_value, support)
    # Write the cache only after the result was built successfully, so a
    # failure above never leaves an empty pickle file behind.
    with open(pickle_path, 'wb') as static_file:
        pickle.dump(result, static_file)
    return result
def read_pickle_file(att_name, name):
    """
    Load the pickled statistics of a numeric attribute.

    Reads 'tmp/<att_name>_static.pickle', which is expected to hold a
    (numeric_dict, sort_value) pair, and returns the NumRange built from
    it. `name` is accepted but not used.
    """
    pickle_path = 'tmp/' + att_name + '_static.pickle'
    with open(pickle_path, 'rb') as handle:
        numeric_dict, sort_value = pickle.load(handle)
    return NumRange(sort_value, numeric_dict)
def read_pickle_file(att_name):
    """
    Load the pickled statistics of a numeric attribute of the adult data.

    Reads 'data/adult_<att_name>_static.pickle', which is expected to hold
    a (numeric_dict, sort_value) pair, and returns the NumRange built from
    it.

    Raises IOError/OSError when the pickle file cannot be opened; the
    "Pickle file not exists!!" message is still printed first. Errors
    raised while unpickling propagate unchanged.

    NOTE(security): pickle.load can execute arbitrary code; the file must
    come from a trusted source.
    """
    pickle_path = 'data/adult_' + att_name + '_static.pickle'
    try:
        static_file = open(pickle_path, 'rb')
    except (IOError, OSError):
        print("Pickle file not exists!!")
        # Re-raise the real error; continuing would only fail later with a
        # NameError on the variables that were never assigned.
        raise
    with static_file:
        (numeric_dict, sort_value) = pickle.load(static_file)
    return NumRange(sort_value, numeric_dict)
def read_numeric_identifier(att_name):
    """
    Build the NumRange of a numeric attribute directly from the CSV data.

    Loads 'data/adult.data' using the column names in ATT_NAMES, sorts the
    rows by `att_name`, drops the rows where that column is missing, and
    passes the distinct values of the column (converted to strings, in
    sorted-row order) to NumRange together with an empty support dict.
    """
    # NOTE(review): header=0 makes pandas treat the first line of the file
    # as a header row and replace it with ATT_NAMES - confirm the data file
    # really starts with a header line.
    csvdata = pd.read_csv('data/adult.data', sep=',', header=0, names=ATT_NAMES)
    csvdata = csvdata.sort_values(by=att_name)
    csvdata = csvdata.dropna(subset=[att_name])
    # Select the requested column; it used to be hard-coded to `age`, which
    # returned age values for every other attribute.
    sort_value = csvdata[att_name].astype('str').unique()
    result = NumRange(sort_value.tolist(), dict())
    return result
def init_tree():
    """
    Reset the module-level fixtures ATT_TREE and NUM_RANGE.

    ATT_TREE becomes a dict mapping node labels to GenTree nodes of a small
    hierarchy: root '*' with children 'A' (leaves 'a1', 'a2') and 'B'
    (leaves 'b1', 'b2'). NUM_RANGE becomes a NumRange over the strings
    '1'..'10' with an empty support dict.
    """
    global ATT_TREE, NUM_RANGE
    ATT_TREE = {}
    top = GenTree('*')
    ATT_TREE['*'] = top
    # Nodes are created in the same order as before: each inner node first,
    # immediately followed by its leaves.
    layout = (('A', ('a1', 'a2')), ('B', ('b1', 'b2')))
    for branch_label, leaf_labels in layout:
        branch = GenTree(branch_label, top)
        ATT_TREE[branch_label] = branch
        for leaf_label in leaf_labels:
            ATT_TREE[leaf_label] = GenTree(leaf_label, branch, True)
    NUM_RANGE = NumRange([str(n) for n in range(1, 11)], {})
def init():
    """
    Reset the module-level fixture ATT_TREE to a two-element list.

    Element 0 is a dict mapping labels to GenTree nodes: root '*', inner
    nodes '1,5' and '6,10', and the leaves '1'..'10' ('1'..'5' under '1,5',
    the rest under '6,10'). Element 1 is a NumRange over the strings
    '1'..'10' with an empty support dict.
    """
    global ATT_TREE
    ATT_TREE = []
    top = GenTree('*')
    low = GenTree('1,5', top)
    high = GenTree('6,10', top)
    nodes = {'*': top, '1,5': low, '6,10': high}
    for number in range(1, 11):
        label = str(number)
        parent = high if number > 5 else low
        nodes[label] = GenTree(label, parent, True)
    # Build the NumRange before touching ATT_TREE, as the original did.
    value_range = NumRange([str(n) for n in range(1, 11)], {})
    ATT_TREE.extend([nodes, value_range])
TREE_TEMP = {} tree = GenTree('*') TREE_TEMP['*'] = tree lt = GenTree('1,5', tree) TREE_TEMP['1,5'] = lt rt = GenTree('6,10', tree) TREE_TEMP['6,10'] = rt for i in range(1, 11): if i <= 5: t = GenTree(str(i), lt, True) else: t = GenTree(str(i), rt, True) TREE_TEMP[str(i)] = t ATT_TREES = [ TREE_TEMP, TREE_TEMP, TREE_TEMP, TREE_TEMP, NumRange([str(t) for t in range(1, 11)], dict()) ] IS_CAT = [True, True, True, True, False] def NCP(record): """ compute Certainlty Penalty of records """ record_ncp = 0.0 for i in range(QI_LEN): if IS_CAT[i] is False: temp = 0 try: float(record[i]) except ValueError: