def DtModuleData1(data_set_csv, m_file_name):
    """Read a reachable-set CSV and strip the symmetry sign from it.

    Args:
        data_set_csv: CSV path handed to ``csvOperation.readCsv``.
        m_file_name: File name handed to ``search_sign`` to look up the
            symmetry sign.

    Returns:
        A ``(true_title, true_data_set)`` pair: the results of
        ``titleChange`` applied to the CSV header and ``dataSetChange``
        applied to the CSV rows, both using the sign found by
        ``search_sign``.
    """
    header, rows = csvOperation.readCsv(data_set_csv)

    sign = search_sign(m_file_name)
    # Debug trace of the detected sign. The single-argument parenthesised
    # form prints exactly what the bare ``print sign`` statement printed.
    print(sign)

    # Per the original author's notes: titleChange removes the symmetry sign
    # from the header (and lower-cases it to unify names), dataSetChange
    # removes the sign from the data rows.
    true_title = titleChange(header, sign)
    true_data_set = dataSetChange(rows, sign)
    return true_title, true_data_set
def priority_test(csvfile):
    """Build a hard-coded priority table for the columns of a CSV file.

    Args:
        csvfile: CSV path handed to ``csvOperation.readCsv``; only the
            header returned by that call is used.

    Returns:
        A dict mapping every header entry to ``0`` when it is one of four
        fixed column names, and to ``0.1`` otherwise.
    """
    # Columns that receive priority 0; everything else gets 0.1.
    zero_priority = ('Chan2[NODE_1].Cmd', 'ExGntd', 'ShrSet[NODE_1]', 'CurCmd')

    header, _rows = csvOperation.readCsv(csvfile)

    priorities = {}
    for column in header:
        if column in zero_priority:
            priorities[column] = 0
            # Debug marker emitted once per matched column.
            print('1')
        else:
            priorities[column] = 0.1
    return priorities
def createTree(atom_csv, undefined_percentage):
    """Build a decision tree from a processed reachable-set CSV.

    Args:
        atom_csv: CSV path handed to ``csvOperation.readCsv``. The last
            column of each row is treated as the class label.
        undefined_percentage: Passed through unchanged to
            ``id3.createTree`` (per the original notes, a priority
            dictionary used as a selection criterion).

    Returns:
        Whatever ``id3.createTree`` returns, except when every row carries
        the same class label: then the string
        ``'(<last header entry> = <label>)'`` is returned instead. Callers
        must therefore be ready for a plain string as well as a tree.
    """
    header, rows = csvOperation.readCsv(atom_csv)

    labels = [row[-1] for row in rows]
    if len(set(labels)) != 1:
        # Zero rows or at least two distinct classes: delegate to ID3,
        # giving it every header entry except the class column.
        return id3.createTree(rows, header[:-1], undefined_percentage)

    # Exactly one class in the data: no tree is built.
    return ''.join(('(', header[-1], ' = ', labels[0], ')'))
def percentage(csvfile):
    """Compute, per column, the fraction of cells equal to 'undefined'.

    The comparison is case-insensitive (each cell is lower-cased first).

    Args:
        csvfile: CSV path handed to ``csvOperation.readCsv``.

    Returns:
        A dict mapping each header entry to a float in ``[0.0, 1.0]``: the
        number of rows whose cell in that column is 'undefined' divided by
        the total number of rows. When the file has no data rows, every
        column maps to ``0.0``.
    """
    title, dataSet = csvOperation.readCsv(csvfile)
    number_of_data = len(dataSet)

    undefined_percentage_dict = {}
    for column, name in enumerate(title):
        if number_of_data == 0:
            # No rows: avoid dividing by zero and report "nothing undefined".
            undefined_percentage_dict[name] = 0.0
            continue

        undefined_counter = sum(
            1 for row in dataSet if row[column].lower() == 'undefined'
        )
        # Force true division. Under Python 2 a plain int / int truncates,
        # which would turn every partial ratio into 0.
        undefined_percentage_dict[name] = (
            float(undefined_counter) / number_of_data
        )
    return undefined_percentage_dict
def convert(origin_csv, atom_txt, atom_csv):
    """Convert an original data-set CSV into an atom-based CSV.

    The work is done entirely by ``csvOperation`` helpers, called in a
    fixed order; this function only wires their inputs and outputs
    together.

    Args:
        origin_csv: Path of the source CSV, read with
            ``csvOperation.readCsv``.
        atom_txt: Path of the text file read with
            ``csvOperation.txtToList`` to obtain the atom list.
        atom_csv: Destination path handed to ``csvOperation.creatCsv``.

    Returns:
        None. The result is written by ``csvOperation.creatCsv``.
    """
    atoms = csvOperation.txtToList(atom_txt)
    source_header, source_rows = csvOperation.readCsv(origin_csv)

    # Each helper is given its own shallow copy of the atom list
    # (presumably because the helpers may modify their argument -- TODO
    # confirm against csvOperation).
    lefts = csvOperation.getLeft(atoms[:])
    conversions = csvOperation.getConvertList(source_header, lefts)
    rights = csvOperation.getRight(atoms[:])

    atom_rows = csvOperation.dataSetToAtomDataSet(
        source_rows, conversions, rights, source_header
    )
    out_header, out_rows = csvOperation.creatAtomCsv(
        atoms[:], atom_rows[:], source_rows
    )
    csvOperation.creatCsv(out_header, out_rows, atom_csv)
def chooseClassifyAttribute(origin_csv, attribute_list):
    """Pick the attribute to classify on.

    The first attribute in ``attribute_list`` whose column in the data set
    does not hold exactly one distinct value is chosen. Column positions
    are looked up in the titles read from 'title.txt', after removing
    'NODE_' from each title and lower-casing it.

    Args:
        origin_csv: CSV path handed to ``csvOperation.readCsv``; only the
            data rows of the result are used.
        attribute_list: Non-empty sequence of candidate attribute names,
            expected to match the normalised titles.

    Returns:
        The chosen attribute name. Falls back to ``attribute_list[0]`` when
        there is a single candidate or when no candidate qualifies (for
        example, every candidate column holds one single value).
    """
    # The CSV's own header is not used; titles come from 'title.txt'.
    _csv_header, dataSet = csvOperation.readCsv(origin_csv)
    title = [
        name.replace('NODE_', '').lower() for name in read_txt('title.txt')
    ]

    # Default answer, so a result exists even if nothing qualifies below.
    result = attribute_list[0]
    if len(attribute_list) == 1:
        return result

    for member in attribute_list:
        if member not in title:
            # Report the unknown attribute and keep looking. ``title`` is a
            # list, so it must be converted before concatenation; joining
            # it to a str directly raised TypeError and aborted the search.
            print('error treemain 153 member not in title!!!!!!!!')
            print('memeber :' + member)
            print('title :' + str(title))
            continue

        position = title.index(member)
        distinct_values = set(example[position] for example in dataSet)
        if len(distinct_values) != 1:
            # This column does not collapse to a single state: use it.
            result = member
            break
    return result