def discretize_ent(infilename, outfilename):
    """
    Discretize the features of a data set with the entropy-based MDL
    method proposed by [#fayyad1993]_ and save the result.

    Requires the Orange Python module to perform the discretization.
    Only discretizes the continuous features of classification datasets.

    :param infilename: name of the input file (expecting an arff file)
    :type infilename: string
    :param outfilename: name of the output file
    :type outfilename: string
    """
    source_table = OTable(infilename)

    discretizer = Disc()
    discretizer.method = EntropyMDL()
    discretized = discretizer(source_table)

    # Post-process the discretized attributes.
    for attribute in discretized.domain.attributes:
        if attribute.name.startswith("D_"):
            # Strip the "D_" prefix so the attribute gets its original
            # name back.
            attribute.name = attribute.name[2:]
            # Swap commas for semicolons in the value labels.
            # NOTE(review): the original nesting of this statement (inside
            # the "D_" test vs. directly in the loop) is ambiguous; it is
            # kept inside the test here - confirm.
            attribute.values = [
                label.replace(",", ";") for label in attribute.values
            ]

    # Save the discretized data.
    discretized.save(outfilename)
def discretize_ent(infilename, outfilename):
    """
    Run the MDL discretization proposed by [#fayyad1993]_ on a data set
    and write the discretized data to a file.

    The Orange Python module is needed to perform the discretization.
    Only the continuous features of classification datasets are
    discretized.

    :param infilename: name of the input file (expecting an arff file)
    :type infilename: string
    :param outfilename: name of the output file
    :type outfilename: string
    """
    # NOTE(review): a function with this same name is defined earlier in
    # the file; this later definition is the one that takes effect.
    table = OTable(infilename)
    mdl_discretizer = Disc()
    mdl_discretizer.method = EntropyMDL()
    table_ent = mdl_discretizer(table)

    # Manipulation of the discretized data
    for variable in table_ent.domain.attributes:
        if not variable.name.startswith("D_"):
            continue
        # Reset the renamed attribute to its original name (drop "D_").
        variable.name = variable.name[2:]
        # Replace commas by semicolons in the value labels.
        # NOTE(review): assumed to apply only to "D_"-prefixed attributes;
        # the original nesting is ambiguous - confirm.
        variable.values = [v.replace(",", ";") for v in variable.values]

    # Save the discretized data
    table_ent.save(outfilename)
def predict_wine_quality(table, n):
    """Discretize ``table``, cross-validate a neural-network learner on it
    and print the resulting accuracy and AUC.

    :param table: data table; its first domain variable is used as the
        class label and the remaining ones as features
    :param n: number of equal-width bins used for discretization
    """
    # Make the continuous variables discrete.
    discretizer = Discretize()
    discretizer.method = discretize.EqualWidth(n=n)
    table = discretizer(table)

    # Rebuild the domain: first variable is the class, the rest are features.
    features = list(table.domain[1:])
    target = table.domain[0]
    table = Table.from_table(domain=Domain(features, target), source=table)

    # Construct the learner and evaluate it with 10-fold cross-validation.
    nn_learner = NNClassificationLearner(
        hidden_layer_sizes=(10, ), max_iter=4000)
    results = CrossValidation(table, [nn_learner], k=10)

    # Report the scores of the (single) learner.
    accuracy = scoring.CA(results)[0]
    print("Accuracy of cross validation: {:.3f}".format(accuracy))
    auc = scoring.AUC(results)[0]
    print("AUC: {:.3f}".format(auc))
def formatTable(tble, n_bins=4):
    '''
    Bins the data, one hot encodes the data

    :param tble: table to discretize; it is passed to a ``Discretize``
        preprocessor configured with equal-width binning
    :param n_bins: number of equal-width bins (defaults to 4, the value
        that used to be hard-coded)
    :return: data: tble with binned data,
             X: representation of data with one-hot-encoding,
             mapping: representations of what our one-hot-encoding is,
             exactly as returned by ``OneHot.encode``
    '''
    # Discretization (binning)
    # https://docs.orange.biolab.si/3/data-mining-library/reference/preprocess.html
    print("Discretizing data")
    disc = Discretize()
    disc.method = discretize.EqualWidth(n=n_bins)
    data = disc(tble)

    print("One hot encoding data")
    X, mapping = OneHot.encode(data, include_class=True)
    # The previous version called sorted(mapping.items()) here and threw the
    # result away, which had no effect on the returned mapping; callers that
    # need a sorted view should sort mapping.items() themselves.
    return data, X, mapping