def __init__(self, training):
    """Build the paired-attribute domain and the training table.

    Every attribute of the WordNet-parents engine's domain is duplicated
    into a ``<name>_w1`` / ``<name>_w2`` pair, and the class variable takes
    one of "Larger", "Smaller", "Equal" or "None".

    Raises:
        ValueError: if an engine attribute is neither a FloatVariable nor
            an EnumVariable.
    """
    self.training = training
    self.wnparents = trainer.WordnetParentsEngine(training)

    labels = ["Larger", "Smaller", "Equal", "None"]
    self.cls_variable = orange.EnumVariable("class", values=labels)

    alist = []
    for var in self.wnparents.domain.attributes:
        if isinstance(var, orange.FloatVariable):
            pair = [
                orange.FloatVariable(name="%s_w1" % var.name),
                orange.FloatVariable(name="%s_w2" % var.name),
            ]
        elif isinstance(var, orange.EnumVariable):
            pair = [
                orange.EnumVariable(name="%s_w1" % var.name,
                                    values=var.values),
                orange.EnumVariable(name="%s_w2" % var.name,
                                    values=var.values),
            ]
        else:
            # repr() replaces the deprecated backtick syntax.
            raise ValueError("Unhandled attribute: " + repr(var))
        alist.extend(pair)

    self.domain = orange.Domain(alist, self.cls_variable)
    self.training_table = self.makeTable(self.training)
def bench_orange(X, y, T, valid):
    """Train an Orange nu-SVC on (X, y) and score it on (T, valid).

    ``y`` and ``valid`` are modified in place: they are reshaped to column
    vectors and negative labels are replaced by 0. The RBF gamma is
    ``1 / sigma`` where ``sigma`` is read from module scope.

    Returns:
        A ``(score, elapsed)`` tuple: the fraction of test predictions
        equal to ``valid`` and the elapsed ``datetime.timedelta``.
    """
    #
    #       .. Orange ..
    #
    import orange

    start = datetime.now()

    # prepare data in Orange's format
    columns = ["a" + str(i) for i in range(0, X.shape[1])]
    classValues = ['0', '1']
    domain = orange.Domain([orange.FloatVariable(c) for c in columns],
                           orange.EnumVariable("class", values=classValues))

    y.shape = (len(y), 1)  # reshape for Orange
    y[np.where(y < 0)] = 0  # change class labels to 0..K
    orng_train_data = orange.ExampleTable(domain, np.hstack((X, y)))

    valid.shape = (len(valid), 1)  # reshape for Orange
    valid[np.where(valid < 0)] = 0  # change class labels to 0..K
    orng_test_data = orange.ExampleTable(domain, np.hstack((T, valid)))

    learner = orange.SVMLearner(orng_train_data,
                                svm_type=orange.SVMLearner.Nu_SVC,
                                kernel_type=orange.SVMLearner.RBF,
                                C=1.,
                                gamma=1. / sigma)

    pred = np.empty(T.shape[0], dtype=np.int32)
    for i, e in enumerate(orng_test_data):
        pred[i] = learner(e)

    # ``valid`` is now (n, 1) while ``pred`` is (n,); comparing them directly
    # would broadcast to an (n, n) matrix, so flatten ``valid`` first.
    score = np.mean(pred == valid.ravel())
    return score, datetime.now() - start
def get_domain(self):
    """Return the Orange domain, building and caching it on first use.

    The domain has one "0"/"1" EnumVariable per entry of
    ``self.known_objects`` followed by a "classname" EnumVariable with
    values "-1", "0" and "1". The result is stored in ``self.domain``.
    """
    if self.domain is not None:
        return self.domain

    values = ["0", "1"]
    attributes = [
        orange.EnumVariable(name, values=values)
        for name in self.known_objects
    ]
    classattr = orange.EnumVariable("classname", values=["-1", "0", "1"])
    self.domain = orange.Domain(attributes + [classattr])
    return self.domain
def __makeExampleTable(namesDict, data):
    """Build an ExampleTable with one float attribute per gene ID.

    Rows for every name under CONTROL_GROUP_KEY in ``namesDict`` come first,
    followed by rows for every name under DATA_GROUP_KEY; each row's class
    value is the key of the group it came from.
    """
    import orange
    from constants import CLASS_ATRR_NAME, CONTROL_GROUP_KEY, DATA_GROUP_KEY

    geneIDs = sorted(data.keys())
    features = [orange.FloatVariable(name=str(gene)) for gene in geneIDs]
    classVar = orange.EnumVariable(
        name=CLASS_ATRR_NAME, values=[CONTROL_GROUP_KEY, DATA_GROUP_KEY])
    domain = orange.Domain(features, classVar)
    table = orange.ExampleTable(domain)

    # Control group first, then the data group.
    for groupKey in (CONTROL_GROUP_KEY, DATA_GROUP_KEY):
        for sampleName in namesDict[groupKey].keys():
            row = [data[gene][groupKey][sampleName] for gene in geneIDs]
            row.append(groupKey)
            table.append(orange.Example(domain, row))

    return table
def wordnet_meronyms(training, testing): ancestor_to_count = training.meronym_ancestor_map() all_ancestors = list(ancestor_to_count.keys()) all_ancestors.sort(key=lambda a: ancestor_to_count[a], reverse=True) used_ancestors = all_ancestors print "name", used_ancestors[0].name attributes = [ orange.EnumVariable(a.name, values=["True", "False"]) for a in used_ancestors ] print "got", len(used_ancestors), "features" domain = orange.Domain(attributes, training.orange_class_var) results = [] for annotation in [training, testing]: table = orange.ExampleTable(domain) results.append(table) for i, (word, label) in enumerate(annotation.data): ancestors = annotation.ancestors(i) ex = orange.Example(domain) ex["class"] = label for a_i, a in enumerate(attributes): ancestor_i = used_ancestors[a_i] if ancestor_i in ancestors: ex[a.name] = "True" else: ex[a.name] = "False" table.append(ex) training_table, testing_table = results return training_table, testing_table
def addNewClassLabel(self):
    """Append a fresh "Class N" label and rebuild the data with it.

    The first unused "Class N" name (N counting up from 1) is added to the
    class values, the graph data is copied into a new domain using the
    extended class variable, and the new label is selected in the view.
    """
    suffix = 1
    newlabel = "Class %i" % suffix
    while newlabel in self.classValuesModel:
        suffix += 1
        newlabel = "Class %i" % suffix

    values = list(self.classValuesModel)
    values.append(newlabel)
    newclass = orange.EnumVariable("Class label", values=values)
    newdomain = orange.Domain(self.graph.data.domain.attributes, newclass)

    # Copy every example, carrying the class across by its string value.
    newdata = orange.ExampleTable(newdomain)
    for ex in self.graph.data:
        row = [ex[a] for a in ex.domain.attributes]
        row.append(str(ex.getclass()))
        newdata.append(orange.Example(newdomain, row))

    self.classVariable = newclass
    self.classValuesModel.wrap(self.classVariable.values)

    self.graph.data = newdata
    self.graph.updateGraph()

    lastRow = len(self.classValuesModel) - 1
    newindex = self.classValuesModel.index(lastRow)
    selection = self.classValuesView.selectionModel()
    selection.select(newindex, QItemSelectionModel.ClearAndSelect)

    self.removeClassLabel.setEnabled(len(self.classValuesModel) > 1)
def sendData(self, km=None):
    """Send the clustered examples and the centroids on the output channels.

    When ``km`` is omitted, ``self.bestRun[1]`` is used if ``self.optimized``
    is set, otherwise ``self.km``. ``self.addIdAs`` selects where the cluster
    variable goes: 0 as the class (the original class, if any, becomes a
    meta), 1 as an extra attribute, anything else as a meta attribute.
    """
    if km is None:
        if self.optimized:
            km = self.bestRun[1]
        else:
            km = self.km

    if not (self.data and km):
        self.send("Examples", None)
        self.send("Centroids", None)
        return

    clusterNames = ["C%d" % (x + 1) for x in range(km.k)]
    clustVar = orange.EnumVariable(self.classifyName, values=clusterNames)

    origDomain = self.data.domain
    placement = self.addIdAs
    if placement == 0:
        domain = orange.Domain(origDomain.attributes, clustVar)
        if origDomain.classVar:
            domain.addmeta(orange.newmetaid(), origDomain.classVar)
        aid = -1  # index -1 addresses the class value
    elif placement == 1:
        domain = orange.Domain(origDomain.attributes + [clustVar],
                               origDomain.classVar)
        aid = len(origDomain.attributes)
    else:
        domain = orange.Domain(origDomain.attributes, origDomain.classVar)
        aid = orange.newmetaid()
        domain.addmeta(aid, clustVar)

    domain.addmetas(origDomain.getmetas())

    # construct a new data set, with a class as assigned by k-means clustering
    clustered = orange.ExampleTable(domain, self.data)
    for example, clusterIndex in izip(clustered, km.clusters):
        example[aid] = clusterIndex

    self.send("Examples", clustered)
    self.send("Centroids", orange.ExampleTable(km.centroids))
def removeSelectedClassLabel(self):
    """Remove the selected class label and every example carrying it.

    Does nothing when no label is selected or only one label remains.
    Otherwise the data is rebuilt over a class variable without the label
    and the selection moves to the previous label.
    """
    index = self.selectedClassLabelIndex()
    if index is None or len(self.classValuesModel) <= 1:
        return

    label = self.classValuesModel[index]
    survivors = [
        ex for ex in self.graph.data if str(ex.getclass()) != label
    ]
    values = [val for val in self.classValuesModel if val != label]

    newclass = orange.EnumVariable("Class label", values=values)
    newdomain = orange.Domain(self.graph.data.domain.attributes, newclass)
    newdata = orange.ExampleTable(newdomain)
    for ex in survivors:
        current = ex[self.classVariable]
        if current != label and current in values:
            row = [ex[a] for a in ex.domain.attributes]
            row.append(str(ex.getclass()))
            newdata.append(orange.Example(newdomain, row))

    self.classVariable = newclass
    self.classValuesModel.wrap(self.classVariable.values)

    self.graph.data = newdata
    self.graph.updateGraph()

    newindex = self.classValuesModel.index(max(0, index - 1))
    selection = self.classValuesView.selectionModel()
    selection.select(newindex, QItemSelectionModel.ClearAndSelect)

    self.removeClassLabel.setEnabled(len(self.classValuesModel) > 1)
def wordnet_glosses(training, testing): stopwords = set(nltk.corpus.stopwords.words()) gloss_dist = training.gloss_map() used_words = [ k for k in gloss_dist.keys() if not k in stopwords and gloss_dist[k] > 2 ] print "words", used_words attributes = [ orange.EnumVariable(a, values=["True", "False"]) for a in used_words ] print "got", len(used_words), "features" domain = orange.Domain(attributes, training.orange_class_var) results = [] for annotation in [training, testing]: table = orange.ExampleTable(domain) results.append(table) for i, (word, label) in enumerate(annotation.data): ancestors = annotation.ancestors(i) ex = orange.Example(domain) ex["class"] = label ex["word"] = word for a_i, a in enumerate(attributes): word_i = used_words[a_i] if word_i in annotation.synset(i).definition: ex[a.name] = "True" else: ex[a.name] = "False" table.append(ex) training_table, testing_table = results return training_table, testing_table
def cforange_hierarchical_clustering_finished(postdata, input_dict, output_dict):
    """Split the clustered examples by the user's node selection.

    ``postdata['selected_nodes'][0]`` must be a JSON list of
    ``(example_index, selected, cluster)`` triples. When the distance
    matrix items are an ExampleTable, the examples are split into selected
    and unselected tables (tagged with a "Cluster" meta attribute) and one
    centroid example is computed per selected cluster. Otherwise all three
    outputs are None.

    Returns:
        dict with keys 'centroids', 'selected_examples' and
        'unselected_examples'.

    Raises:
        Exception: if the selected nodes are missing or cannot be parsed.
    """
    import json
    import orange

    matrix = input_dict['dm']
    linkage = int(input_dict['linkage'])
    widget_pk = postdata['widget_id'][0]  # unused below; lookup kept as-is
    try:
        selected_nodes = json.loads(postdata.get('selected_nodes')[0])
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt
        # and SystemExit.
        raise Exception('Please select a threshold for determining clusters.')

    if isinstance(matrix.items, orange.ExampleTable):
        # Result unused here; call kept from the original.
        root = Clustering.hierarchical_clustering(linkage, matrix)
        cluster_ids = set([cluster for _, _, cluster in selected_nodes])
        selected_clusters = set(
            [cluster for _, selected, cluster in selected_nodes if selected])

        clustVar = orange.EnumVariable(
            str('Cluster'),
            values=["Cluster %d" % i for i in cluster_ids] + ["Other"])
        origDomain = matrix.items.domain
        domain = orange.Domain(origDomain.attributes, origDomain.classVar)
        domain.addmeta(orange.newmetaid(), clustVar)
        domain.addmetas(origDomain.getmetas())

        # Build table with selected clusters
        selected_table = orange.ExampleTable(domain)
        unselected_table = orange.ExampleTable(domain)
        for node_id, selected, cluster in selected_nodes:
            new_ex = orange.Example(domain, matrix.items[node_id])
            if selected:
                new_ex[clustVar] = clustVar("Cluster %d" % cluster)
                selected_table.append(new_ex)
            else:
                new_ex[clustVar] = clustVar("Other")
                unselected_table.append(new_ex)

        # Build table of centroids
        centroids = orange.ExampleTable(selected_table.domain)
        if len(selected_table) > 0:
            for cluster in sorted(selected_clusters):
                clusterEx = orange.ExampleTable([
                    ex for ex in selected_table
                    if ex[clustVar] == "Cluster %d" % cluster
                ])
                # Attribute statistics: average where the continuous
                # statistic is set, else the mode, else unknown.
                contstat = orange.DomainBasicAttrStat(clusterEx)
                discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                ex = [
                    cs.avg if cs else (ds.modus() if ds else "?")
                    for cs, ds in zip(contstat, discstat)
                ]
                example = orange.Example(centroids.domain, ex)
                example[clustVar] = clustVar("Cluster %d" % cluster)
                centroids.append(example)
    else:
        # Attribute distance
        centroids, selected_table, unselected_table = None, None, None

    return {
        'centroids': centroids,
        'selected_examples': selected_table,
        'unselected_examples': unselected_table
    }
def __init__(self, var1, var2):
    """Create a class variable over the Cartesian product of two variables.

    The new variable is named "<var1>x<var2>" and has one value
    "<v1>-<v2>" for every pair of values, with ``var2`` varying fastest.
    """
    self.var1, self.var2 = var1, var2
    self.noValues2 = len(var2.values)

    joint = orange.EnumVariable("%sx%s" % (var1.name, var2.name))
    self.classVar = joint

    combined = []
    for v1 in var1.values:
        for v2 in var2.values:
            combined.append("%s-%s" % (v1, v2))
    self.classVar.values = combined
def make_orange_dataset(X, y, n_classes):
    """Wrap ``X`` and ``y`` in an Orange ExampleTable.

    Columns of ``X`` become float features "feature_0", "feature_1", ...;
    ``y`` is appended as the last column and used as the class "y" with
    values "0" .. str(n_classes - 1).
    """
    classes = []
    for c in range(n_classes):
        classes.append(str(c))

    n_features = X.shape[1]
    input_vars = [
        orange.FloatVariable("feature_%d" % col) for col in range(n_features)
    ]
    class_var = orange.EnumVariable("y", values=classes)
    domain = orange.Domain(input_vars, class_var)

    target_column = y.reshape(-1, 1)
    examples = np.hstack((X, target_column))
    return orange.ExampleTable(domain, examples)
def to_network(self, terms=None):
    """ Return an Orange.network.Network instance constructed from
    this ontology.

    NOTE: the ``terms`` argument is overwritten with ``self.terms()``
    before it is read, so a caller-supplied value has no effect.
    """
    edge_types = self.edge_types()
    terms = self.terms()

    from Orange.orng import orngNetwork
    import orange

    network = orngNetwork.Network(len(terms), True, len(edge_types))
    network.objects = dict((term.id, pos) for pos, term in enumerate(terms))

    # Collect the set of relationship types for every (source, target) pair.
    edges = defaultdict(set)
    for term in self.terms():
        for relType, relTerm in self.related_terms(term):
            edges[(term.id, relTerm)].add(relType)

    edgeitems = edges.items()
    for (src, dst), eTypes in edgeitems:
        flags = []
        for e in edge_types:
            flags.append(1 if e in eTypes else 0)
        network[src, dst] = flags

    # Node table: id, name and first "def" value (or "") of every term.
    item_vars = [
        orange.StringVariable("id"),
        orange.StringVariable("name"),
        orange.StringVariable("def"),
    ]
    domain = orange.Domain(item_vars, False)
    items = orange.ExampleTable(domain)
    for term in terms:
        definition = term.values.get("def", [""])[0]
        items.append(orange.Example(domain,
                                    [term.id, term.name, definition]))

    # Computed as in the original; the value is not used afterwards.
    relationships = set(
        [", ".join(sorted(eTypes)) for (_, _), eTypes in edgeitems])

    link_vars = [
        orange.FloatVariable("u"),
        orange.FloatVariable("v"),
        orange.EnumVariable("relationship", values=list(edge_types)),
    ]
    domain = orange.Domain(link_vars, False)

    # Link endpoints are 1-based positions of the terms.
    id2index = dict((term.id, pos + 1) for pos, term in enumerate(terms))
    links = orange.ExampleTable(domain)
    for (src, dst), eTypes in edgeitems:
        # pop() removes one relationship type from the set and records it.
        row = [id2index[src], id2index[dst], eTypes.pop()]
        links.append(orange.Example(domain, row))

    network.items = items
    network.links = links
    network.optimization = None
    return network
def createClassVar(attributes, MQCNotation=False):
    """Create the "Q" class variable over all sign patterns of attributes.

    ``orngMisc.LimitedCounter([3] * len(attributes))`` supplies one code per
    attribute; codes 0 and 1 map to "+" and "-", and attributes with any
    other code are left out of the label. With ``MQCNotation`` the label is
    "<signs>(<names>)", otherwise "Q(<sign><name>, ...)".
    """
    import orngMisc

    if MQCNotation:
        def label(combo):
            signs = "".join(["+-"[x] for x in combo if x < 2])
            names = ", ".join(
                [attr for attr, x in zip(attributes, combo) if x < 2])
            return "%s(%s)" % (signs, names)
    else:
        def label(combo):
            signed = [
                "+-"[x] + attr for attr, x in zip(attributes, combo) if x < 2
            ]
            return "Q(%s)" % ", ".join(signed)

    labels = []
    for combo in orngMisc.LimitedCounter([3] * len(attributes)):
        labels.append(label(combo))
    return orange.EnumVariable("Q", values=labels)
def __make_rule_term_example_table(tableDict, allTerms):
    """Build a classless rule-by-term presence table.

    Each key of ``tableDict`` (in sorted order) yields one example with a
    PRESENT/ABSENT value per term of ``allTerms``, plus three meta
    attributes: the rule name, the rule terms string (both stripped of
    their first and last character) and the key as sequence number.
    """
    import orange
    import constants as const

    attrList = [
        orange.EnumVariable(name=str(term),
                            values=[const.PRESENT, const.ABSENT])
        for term in allTerms
    ]

    # three meta attributes
    ruleName = orange.StringVariable(const.NAME_ATTR)
    mid = orange.newmetaid()
    ruleTerms = orange.StringVariable(const.TERMS_ATTR)
    mid1 = orange.newmetaid()
    ruleNumber = orange.FloatVariable(const.SEQ_NUM_ATTR,
                                      startValue=1,
                                      endValue=len(tableDict),
                                      stepValue=1,
                                      numberOfDecimals=0)
    mid2 = orange.newmetaid()

    # this is a classless domain
    domain = orange.Domain(attrList, False)
    for metaId, metaVar in ((mid, ruleName), (mid1, ruleTerms),
                            (mid2, ruleNumber)):
        domain.addmeta(metaId, metaVar, False)

    table = orange.ExampleTable(domain)
    for k in sorted(tableDict.keys()):
        exampleValues = [
            orange.Value(
                attr, const.PRESENT
                if term in tableDict[k][const.RULETERMS_KEY] else const.ABSENT)
            for attr, term in zip(attrList, allTerms)
        ]
        example = orange.Example(domain, exampleValues)

        # [1:-1] skips the square brackets around the stored strings
        name = tableDict[k][const.RULENAME_KEY][1:-1]
        example[const.NAME_ATTR] = orange.Value(ruleName, name)
        termsStr = tableDict[k][const.RULETERMS_STR_KEY][1:-1]
        example[const.TERMS_ATTR] = orange.Value(ruleTerms, termsStr)
        example[const.SEQ_NUM_ATTR] = orange.Value(ruleNumber, k)

        table.append(example)

    return table
def addDummyClass(data):
    """Return ``data`` with a single-valued "dummyClass" class if it has none.

    Data that already has a class variable is returned unchanged; otherwise
    a new DataTable over the same attributes plus the dummy class is
    returned.
    """
    print("********************data.domain.classVar*****************")
    print(data.domain.classVar)
    if not data.domain.classVar:
        # Fixed: ``values`` must be passed as a keyword (it was indexed as an
        # undefined name) and the class is ``orange.Domain``, not ``domain``.
        newAttr = orange.EnumVariable("dummyClass", values=["dummyClass"])
        newDomain = orange.Domain(data.domain.attributes, newAttr)
        newData = dataUtilities.DataTable(newDomain, data)
        data = newData
    return data
def makeDomain(names):
    """Create the domain: float features, a boolean class and meta attributes.

    ``names`` supplies the feature names. The class variable "class" takes
    "True"/"False". The metas are added in the order listed below, each
    under a freshly allocated meta id.
    """
    features = [orange.FloatVariable(n) for n in names]
    class_var = orange.EnumVariable("class", values=["True", "False"])
    domain = orange.Domain(features, class_var)

    meta_specs = [
        (orange.FloatVariable, "weight", {}),
        (orange.EnumVariable, "isInsane", {"values": ["True", "False"]}),
        (orange.StringVariable, "filename", {}),
        (orange.StringVariable, "sourceEngineName", {}),
        (orange.StringVariable, "engineName", {}),
        (orange.StringVariable, "landmarkName", {}),
        (orange.PythonVariable, "geometry", {}),
        (orange.PythonVariable, "track", {}),
        (orange.PythonVariable, "drawMap", {}),
        (orange.PythonVariable, "description", {}),
        (orange.PythonVariable, "farAway", {}),
    ]
    for var_type, var_name, extra in meta_specs:
        meta_id = orange.newmetaid()
        domain.addmeta(meta_id, var_type(var_name, **extra))

    return domain
def __call__(self, examples, weight=0):
    """Train one binary classifier per row of the class-coding matrix.

    Problems with at most two class values are passed straight to
    ``self.learner``. Otherwise, for every row of ``self.matrix(nc)`` a
    binary data set is built: examples whose class is coded 1 get binary
    class '1', those coded -1 get '0', and all others are left out.

    Args:
        examples: data with a discrete class variable.
        weight: id of the weight meta attribute, or 0 for unweighted data.

    Returns:
        The result of ``self.pestimator`` applied to the list of
        ``(classifier, number_of_examples)`` pairs, the coding matrix and
        the original domain.

    Raises:
        TypeError: if the class variable is not discrete.
        ValueError: if a binary sub-problem ends up with no examples.
    """
    # String exceptions (``raise "..."``) are invalid on Python 2.6+; real
    # exception classes are raised instead.
    if examples.domain.classVar.varType != 1:
        raise TypeError("MultiClassLearner only works with discrete class")

    # simple handling for simple 2-class problems
    if len(examples.domain.classVar.values) <= 2:
        if weight != 0:
            return self.learner(examples, weight)
        return self.learner(examples)

    # count the classes and generate the classifier matrix
    nc = len(examples.domain.classVar.values)
    nv = len(examples.domain.attributes)
    template = self.matrix(nc)

    # prepare the domain, and the new binary class
    binary = orange.EnumVariable(name="binary", values=['0', '1'])
    b0 = binary(0)
    b1 = binary(1)
    nd = orange.Domain(examples.domain.attributes + [binary])

    # generate all classifiers
    cm = []
    for code in template:
        exs = orange.ExampleTable(nd)
        if weight != 0:
            exs.addMetaAttribute(1)
        for ex in examples:
            role = code[int(ex.getclass())]
            if role == 1:
                label = b1
            elif role == -1:
                label = b0
            else:
                continue  # this class takes no part in this sub-problem
            row = [ex[k] for k in range(nv)]
            row.append(label)
            new_ex = orange.Example(nd, row)
            if weight != 0:
                new_ex.setmeta(ex.getMetaAttribute(weight), 1)
            exs.append(new_ex)

        # prepare the classifier
        if len(exs) <= 0:
            raise ValueError(
                "MultiClass: More than one of the declared class values do not appear in the data. Filter them out."
            )
        if weight != 0:
            c = self.learner(exs, weight=1)
        else:
            c = self.learner(exs)
        cm.append((c, len(exs)))

    return self.pestimator(cm, template, examples.domain)
def get_domain_obs(self):
    """Build a fresh domain over the observation alphabet.

    One FloatVariable is created per entry of ``self.dataset.obs_alphabet``;
    the class variable "classname" takes the string form of every entry of
    ``self.dataset.label_alphabet``. The result is not cached.
    """
    features = []
    for name in self.dataset.obs_alphabet:
        features.append(orange.FloatVariable(name))

    label_names = [str(label) for label in self.dataset.label_alphabet]
    class_var = orange.EnumVariable("classname", values=label_names)
    return orange.Domain(features + [class_var])
def get_domain_trans(self):
    """Build a fresh domain over the transition alphabet.

    One FloatVariable is created per entry of
    ``self.dataset.trans_alphabet``; the class variable "classname" takes
    the values "0" .. str(L**2 - 1) where L is the size of
    ``self.dataset.label_alphabet``. The result is not cached.
    """
    features = []
    for name in self.dataset.trans_alphabet:
        features.append(orange.FloatVariable(name))

    n_pairs = len(self.dataset.label_alphabet)**2
    pair_labels = [str(code) for code in range(n_pairs)]
    class_var = orange.EnumVariable("classname", values=pair_labels)
    return orange.Domain(features + [class_var])
def sendpredictions(self):
    """Send the data extended with each predictor's output as meta attributes.

    For a discrete outcome, one probability meta per (predictor, selected
    class) is added, plus one predicted-class meta per predictor when
    ``self.showClass`` is set. For a continuous outcome, one predicted-value
    meta per predictor is added. When ``self.doPrediction`` is set, the
    class value is filled in by the first predictor. Sends None when there
    is no data or no outcome variable.
    """
    if not self.data or not self.outvar:
        self.send("Predictions", None)
        return

    # predictions, data set with class predictions
    classification = self.outvar.varType == orange.VarTypes.Discrete

    metas = []
    if classification:
        if len(self.selectedClasses):
            for c in self.predictors.values():
                for i in self.selectedClasses:
                    label = str("%s(%s)" % (c.name, str(self.outvar.values[i])))
                    # c and i are bound as defaults so each lambda keeps
                    # its own predictor and class index.
                    metas.append(
                        orange.FloatVariable(
                            name=label,
                            getValueFrom=lambda ex, rw, cindx=i, c=c: orange.
                            Value(c(ex, c.GetProbabilities)[cindx])))
        if self.showClass:
            for c in self.predictors.values():
                metas.append(
                    orange.EnumVariable(
                        name=str(c.name),
                        values=self.outvar.values,
                        getValueFrom=lambda ex, rw, c=c: orange.Value(c(ex))))
    else:
        # regression
        for c in self.predictors.values():
            metas.append(
                orange.FloatVariable(
                    name="%s" % c.name,
                    getValueFrom=lambda ex, rw, c=c: orange.Value(c(ex))))

    classVar = self.outvar
    domain = orange.Domain(self.data.domain.attributes + [classVar])
    domain.addmetas(self.data.domain.getmetas())
    for metaVar in metas:
        domain.addmeta(orange.newmetaid(), metaVar)

    predictions = orange.ExampleTable(domain, self.data)
    if self.doPrediction:
        first = self.predictors.values()[0]
        for ex in predictions:
            ex[classVar] = first(ex)

    predictions.name = self.data.name
    self.send("Predictions", predictions)

    self.changedFlag = False
def cforange_attribute_distance(input_dict):
    """Fill a symmetric attribute-dissimilarity matrix from interactions.

    Reads 'dataset' and 'classInteractions' from ``input_dict``. For
    ``classInteractions`` 0, 1 and 2 the dissimilarities come from the chi2,
    dependency and interaction exports of ``orngInteract.InteractionMatrix``
    respectively; continuous attributes are discretized into 4 intervals
    first.

    NOTE(review): as visible here the function returns None on every path;
    the filled matrix is never returned and values of ``classInteractions``
    of 3 or more do nothing. The block may be cut short; confirm against
    the full source.
    """
    import orange
    import orngInteract

    inputdata = input_dict['dataset']
    discretizedData = None
    classInteractions = int(input_dict['classInteractions'])

    atts = inputdata.domain.attributes
    if len(atts) < 2:
        return None

    matrix = orange.SymMatrix(len(atts))
    matrix.setattr('items', atts)

    if classInteractions >= 3:
        return None

    if inputdata.domain.hasContinuousAttributes():
        if discretizedData is None:
            try:
                discretizedData = orange.Preprocessor_discretize(
                    inputdata,
                    method=orange.EquiNDiscretization(numberOfIntervals=4))
            except orange.KernelException:
                return None
        data = discretizedData
    else:
        data = inputdata

    # Classless data is only accepted for mode 0, where a placeholder class
    # is attached so the interaction matrix can be computed.
    if not data.domain.classVar:
        if classInteractions != 0:
            return None
        classedDomain = orange.Domain(
            data.domain.attributes,
            orange.EnumVariable("foo", values=["0", "1"]))
        data = orange.ExampleTable(classedDomain, data)

    im = orngInteract.InteractionMatrix(data, dependencies_too=1)
    if classInteractions == 0:
        diss, labels = im.exportChi2Matrix()
        off = 0
    elif classInteractions == 1:
        # 2-interactions
        diss, labels = im.depExportDissimilarityMatrix(jaccard=1)
        off = 1
    else:
        # 3-interactions
        diss, labels = im.exportDissimilarityMatrix(jaccard=1)
        off = 1

    for row in range(len(atts) - off):
        for col in range(row + 1):
            matrix[row + off, col] = diss[row][col]
def makeDomain(self):
    """Create the domain: float features, "isInsane", class and metas.

    Features are named by ``self.features.names`` and followed by a
    "True"/"False" attribute "isInsane". The class variable is "verbclass".
    """
    attributes = [orange.FloatVariable(n) for n in self.features.names]
    insane = orange.EnumVariable("isInsane", values=["True", "False"])
    attributes.append(insane)

    # orange broke when there were two enum variables with the same name
    # but different values. The one in spatial relations land is called
    # "class" with three values ("bad tracking"). It was something to do
    # with pickling and unpickling and importing - anyway it was fixed by
    # renaming the class attribute. -- stefie10, 1/13/2009
    class_var = orange.EnumVariable("verbclass", values=["True", "False"])
    domain = orange.Domain(attributes, class_var)

    meta_names = ("drawMap", "entry", "situation", "engine", "description",
                  "exceptions")
    for meta_name in meta_names:
        meta_id = orange.newmetaid()
        domain.addmeta(meta_id, orange.PythonVariable(meta_name))

    return domain
def __call__(self, table, bound, weight=0):
    """Create a random-valued attribute over the bound attributes.

    The new EnumVariable is named after the bound attributes joined with
    "-", has values "r0" .. "r<n-1>", and its lookup table is filled with
    random value indices.

    Args:
        table: data whose domain resolves the entries of ``bound``.
        bound: attributes (or their names/indices) to bind.
        weight: not used by this method.

    Returns:
        ``(newattr, quality)`` where quality is a random integer in
        [0, 100].

    Raises:
        AttributeError: if ``bound`` is empty.
    """
    bound = [table.domain[a] for a in bound]
    # Checked first: previously the name was built with reduce() before
    # this test, so an empty ``bound`` failed with a TypeError instead.
    if not len(bound):
        raise AttributeError("no bound attributes")

    newattr = orange.EnumVariable(
        "-".join([a.name for a in bound]),
        values=["r%i" % i for i in range(self.n)])
    newattr.getValueFrom = orngLookup.lookupFromBound(newattr, bound)
    # Replace every entry of the lookup table with a random value index.
    newattr.getValueFrom.lookupTable = [
        random.randint(0, self.n - 1)
        for _ in newattr.getValueFrom.lookupTable
    ]
    return newattr, random.randint(0, 100)
def add_class_noise(data, noise_level, rnd_seed): """adds class Noise :param data: Orange dataset :param noise_level: :param rnd_seed: :return: """ meta_noisy = orange.EnumVariable("noise", values=["no", "yes"]) mid = orange.newmetaid() while mid in data.domain.getmetas().keys(): mid = orange.newmetaid() data.domain.addmeta(mid, meta_noisy) data.addMetaAttribute("noise", "no") # Generate random indices for noise insertion percent = float(noise_level) / 100 try: rnds = int(rnd_seed) except: rnds = 0 print "Random Seed:", rnds orange.setrandseed(rnds) noise_indices = random.sample(range(len(data)), int(round(percent * len(data)))) #print "Amount of added noise:", percent*100, "percent (", len(noise_indices), "examples ):" #print "Random indices for added noise:", noise_indices className = data.domain.classVar.name #print "Class name:", className for index in noise_indices: data[index]["noise"] = "yes" temp = data[index][className] ## if len(data.domain.classVar.values) > 2: # random value + check if it is diferent from the current one new_label = data.domain.classVar.randomvalue() while new_label == temp: new_label = data.domain.classVar.randomvalue() data[index][className] = new_label ## else: ## # switch the class value ## data[index][className] = data.domain.classVar.nextvalue(data[index][className]) #print "\t", temp, "changed to:", data[index].getclass(), "(", index, ")" #print "\n" noise_indices.sort() return noise_indices, data
def __loadDataFromES(self, dataType, domain):
    """Load phrases from Elasticsearch into an Orange ExampleTable.

    Args:
        dataType: "train" builds a new domain from ``self.features`` and
            loads all phrases with an ``is_training`` flag; "holdout" loads
            all phrases with an ``is_holdout`` flag into ``domain``; any
            other value loads the single phrase ``self.phraseId`` (also
            stored in ``self.phraseData``) into ``domain``.
        domain: domain to use when ``dataType`` is not "train".

    Returns:
        The filled ExampleTable. Rows that fail to convert are logged and
        skipped.
    """
    if dataType == "train":
        attributes = map(self.__getOrangeVariableForFeature, self.features)
        classAttribute = orange.EnumVariable("is_good", values=["0", "1"])
        domain = orange.Domain(attributes, classAttribute)
        domain.addmeta(orange.newmetaid(), orange.StringVariable("phrase"))
    table = orange.ExampleTable(domain)

    if dataType in ("train", "holdout"):
        flagField = "is_training" if dataType == "train" else "is_holdout"
        # Count first so the search can request every matching phrase.
        # Fixed: the holdout branch stored the count in ``phraseCount`` but
        # read ``phrasesCount``, raising a NameError.
        phrasesCount = self.esClient.count(
            index=self.processorIndex,
            doc_type=self.processorPhraseType,
            body={"query": {"terms": {flagField: ["1", "0"]}}})
        size = phrasesCount["count"]
        phrases = self.esClient.search(
            index=self.processorIndex,
            doc_type=self.processorPhraseType,
            body={"query": {"terms": {flagField: ["1", "0"]}}},
            size=size)
        phrases = phrases["hits"]["hits"]
    else:
        self.phraseData = self.esClient.get(index=self.processorIndex,
                                            doc_type=self.processorPhraseType,
                                            id=self.phraseId)
        phrases = [self.phraseData]

    for row in phrases:
        try:
            row = row["_source"]
            featureValues = [
                row["features"][feature["name"]].encode("ascii")
                for feature in self.features
            ]
            classType = "?"
            if dataType == "train":
                classType = row["is_training"].encode("ascii", "ignore")
            elif dataType == "holdout":
                classType = row["is_holdout"].encode("ascii")

            # Make sure every discrete attribute knows the incoming value.
            for i, featureValue in enumerate(featureValues):
                attr = domain.attributes[i]
                if isinstance(attr, orange.EnumVariable):
                    attr.addValue(featureValue)

            example = orange.Example(domain, (featureValues + [classType]))
            example[domain.getmetas().items()[0][0]] = row["phrase"].encode("ascii")
            table.append(example)
        except Exception:
            # Best effort: log the failing phrase and carry on.
            self.logger.error("Error classifying phrase '" + row["phrase"] + "'")

    return table
def getClabDescSignList(self, smiles, getMolFile=False):
    """Return the model's clab descriptors and the signatures of ``smiles``.

    Args:
        smiles: SMILES string of the compound.
        getMolFile: when true, also write a mol file and return its path
            and contents.

    Returns:
        ``(clabDesc, signatures)`` when ``getMolFile`` is false; otherwise
        ``(clabDesc, signatures, molFilePath, molString)``, with empty
        strings for the last two when no SDF text was produced.
    """
    # Create an Orange ExampleTable with a smiles attribute
    smilesAttr = orange.EnumVariable("SMILEStoPred", values=[smiles])
    myDomain = orange.Domain([smilesAttr], 0)
    smilesData = dataUtilities.DataTable(myDomain, [[smiles]])

    # Calculate descriptors defined in the model files
    try:
        descList = self.model.varNames
    except Exception:
        # Per the original comment, consensus objects differ: fall back to
        # the names of the model's domain variables.
        descList = [attr.name for attr in self.model.domain.variables]

    # Determine Signature and non-Signature descriptor names
    cinfonyDesc, clabDesc, signatureHeight, bbrcDesc, signDesc = descUtilities.getDescTypes(
        descList)

    # Signatures
    if "sign" in DescMethodsAvailable and signatureHeight:
        print("Calculating signatures...")
        preCalcData = dataUtilities.DataTable(self.preDefSignatureFile)
        startHeight = 0  # Not used desc ignored in model prediction
        endHeight = signatureHeight
        dataSign, cmpdSignDict, cmpdSignList, sdfStr = getSignatures.getSignatures(
            smilesData,
            startHeight,
            endHeight,
            preCalcData,
            returnAtomID=True)
    else:
        cmpdSignList = [[]]
        sdfStr = ""

    if not getMolFile:
        return (clabDesc, cmpdSignList[0])
    elif not sdfStr:
        return (clabDesc, cmpdSignList[0], "", "")

    # create a mol file: everything up to the first "$$$$" record separator
    molFile = miscUtilities.generateUniqueFile(desc="NN", ext="mol")
    molLines = []
    for line in sdfStr[0]:
        if "$$$$" in line:
            break
        molLines.append(line)
    molStr = "".join(molLines)
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(molFile, "w") as molHandle:
        molHandle.write(molStr)

    return (clabDesc, cmpdSignList[0], molFile, molStr)
def createLogRegExampleTable(data, weightID):
    """Return ``data`` extended with one re-weighted copy per attribute.

    For every continuous attribute a float indicator attribute
    "<name>Disc" is added; in the extra copy it is 1, the attribute itself
    is 0 and the weight is multiplied by 100. For every discrete attribute
    an extra value "<name>X" is added; in the extra copy the attribute
    takes that value and the weight is multiplied by 10.

    Args:
        data: the source example table.
        weightID: id of the weight meta attribute.
    """
    finalData = orange.ExampleTable(data)
    origData = orange.ExampleTable(data)
    for at in data.domain.attributes:
        # For every attribute create a new example table, newData.
        # Add a new attribute to origData, finalData and newData
        # -- continuous variable
        if at.varType == orange.VarTypes.Continuous:
            atDisc = orange.FloatVariable(at.name + "Disc")
            newDomain = orange.Domain(origData.domain.attributes +
                                      [atDisc, data.domain.classVar])
            # Fixed: metas were read from ``newData.domain``, which is not
            # assigned yet on the first iteration; the discrete branch
            # already takes them from ``origData``.
            newDomain.addmetas(origData.domain.getmetas())
            finalData = orange.ExampleTable(newDomain, finalData)
            newData = orange.ExampleTable(newDomain, origData)
            origData = orange.ExampleTable(newDomain, origData)
            for d in origData:
                d[atDisc] = 0
            for d in finalData:
                d[atDisc] = 0
            for i, d in enumerate(newData):
                d[atDisc] = 1
                d[at] = 0
                d[weightID] = 100 * data[i][weightID]

        elif at.varType == orange.VarTypes.Discrete:
            # In origData, finalData and newData give attribute "at" one
            # more value, named after the attribute plus "X".
            atNew = orange.EnumVariable(at.name,
                                        values=at.values + [at.name + "X"])
            newDomain = orange.Domain(
                [a for a in origData.domain.attributes if a != at] +
                [atNew, origData.domain.classVar])
            newDomain.addmetas(origData.domain.getmetas())
            temp_finalData = orange.ExampleTable(finalData)
            finalData = orange.ExampleTable(newDomain, finalData)
            newData = orange.ExampleTable(newDomain, origData)
            temp_origData = orange.ExampleTable(origData)
            origData = orange.ExampleTable(newDomain, origData)
            # Carry the old attribute's values over to its replacement.
            for i, d in enumerate(origData):
                d[atNew] = temp_origData[i][at]
            for i, d in enumerate(finalData):
                d[atNew] = temp_finalData[i][at]
            for i, d in enumerate(newData):
                d[atNew] = at.name + "X"
                d[weightID] = 10 * data[i][weightID]
        finalData.extend(newData)
    return finalData
def sortAttrValues(self, attr, interattr=None):
    """Return a variant of the attribute whose values are sorted.

    ``interattr`` defaults to ``attr``. If its values are already in sorted
    order it is returned as-is. Otherwise a new EnumVariable with the
    sorted values is created and linked to ``attr`` through a
    ClassifierByLookupTable.
    """
    source = interattr if interattr else attr

    ordered = sorted(source.values)
    if ordered == list(source.values):
        return source

    newattr = orange.EnumVariable(source.name, values=ordered)
    newattr.getValueFrom = orange.ClassifierByLookupTable(newattr, attr)
    lookupTable = newattr.getValueFrom.lookupTable
    distributions = newattr.getValueFrom.distributions
    for val in source.values:
        # Position of the value in the original attribute selects the slot.
        slot = attr.values.index(val)
        lookupTable[slot] = val
        distributions[slot][ordered.index(val)] += 1
    return newattr
def mergeClassValues(data, value):
    """Return a copy of ``data`` whose class is a binary "Selection".

    Examples whose class equals ``value`` come first with Selection "0";
    all remaining examples follow with Selection "1".
    """
    selection = orange.EnumVariable("Selection", values=["0", "1"])

    selectedClassesStr = [value]
    nonSelectedClassesStr = [
        val for val in data.domain.classVar.values
        if val not in selectedClassesStr
    ]

    className = data.domain.classVar.name
    shortData1 = data.select({className: selectedClassesStr})
    className = data.domain.classVar.name
    shortData2 = data.select({className: nonSelectedClassesStr})

    d1 = orange.Domain(shortData1.domain.attributes + [selection])

    # getValueFrom is swapped between the two conversions so each subset
    # receives its own constant Selection value.
    selection.getValueFrom = lambda ex, what: orange.Value(selection, "0")
    data1 = orange.ExampleTable(d1, shortData1)

    selection.getValueFrom = lambda ex, what: orange.Value(selection, "1")
    data2 = orange.ExampleTable(d1, shortData2)

    data1.extend(data2)
    return data1