def getExamples(con, experimentName, callback, callbackArgs, metaDataFileName=None, options=None, experimentMeta=None): con = connect(con) template = parseExperiment(experimentName).copy() template = parseTemplateOptions(options, template) #con = connect(con, template.get("functions", None)) #options = updateTemplateOptions(template, options) print "Template:", experimentName print json.dumps(template, indent=4) compiled, lambdaArgs = compileTemplate(template) print "Compiled experiment" examples = [dict(x) for x in compiled["example"](con=con, **lambdaArgs)] numHidden = hidden.setHiddenValues(examples, compiled) numExamples = len(examples) print "Examples " + str(numExamples) + ", hidden " + str(numHidden) count = 0 clsIds = compiled.get("classes", None) hiddenRule = compiled.get("include", "train") featureIds = {} meta = [] featureGroups = compiled.get("features", []) sampleRandom = MTwister() sampleRandom.set_seed(2) for example in examples: count += 1 if not hidden.getInclude(example, compiled.get("hidden", None), hiddenRule): continue hidden.setSet(example, compiled.get("hidden", None)) #print experiment["class"](con, example) #if count % 10 == 0: print "Processing example", example, cls = getIdOrValue(compiled["label"](con=con, example=example, **lambdaArgs), clsIds) print cls, str(count) + "/" + str(numExamples) strCls = str(cls) if "sample" in compiled and strCls in compiled["sample"] and sampleRandom.random() > compiled["sample"][strCls]: print "NOTE: Downsampled example" continue if "filter" in compiled and compiled["filter"] != None and len([x for x in compiled["filter"](con=con, example=example, **lambdaArgs)]) == 0: print "NOTE: Filtered example" continue features = {} for featureGroup in featureGroups: for row in featureGroup(con=con, example=example, **lambdaArgs): for key, value in itertools.izip(*[iter(row)] * 2): # iterate over each consecutive key,value columns pair if not isinstance(key, basestring): raise Exception("Non-string feature key '" + 
str(key) + "' in feature group " + str(featureGroups.index(featureGroup))) if not isinstance(value, Number): raise Exception("Non-number feature value '" + str(value) + "' in feature group " + str(featureGroups.index(featureGroup))) features[getId(key, featureIds)] = value if len(features) == 0: print "WARNING: example has no features" if callback != None: callback(example=example, cls=cls, features=features, **callbackArgs) if "meta" in compiled: meta.append(compiled["meta"](label=cls, features=features, example=example, **lambdaArgs)) saveMetaData(metaDataFileName, con, template, experimentName, options, clsIds, featureIds, meta, experimentMeta) return featureIds
class HiddenSet:
    """
    A seeded sequence of pseudo-random thresholds that can be read by index.

    Values are drawn from an MTwister generator only when an index beyond the
    already generated ones is requested, and are cached afterwards, so the
    value at a given index is the same for every instance with the same seed.
    """

    def __init__(self, seed=1):
        """Create an empty threshold cache backed by a generator seeded with `seed`."""
        self.__random = MTwister()
        self.__random.set_seed(seed)
        self.__thresholds = []

    def getThreshold(self, index):
        """Return the threshold at `index`, generating all missing values up to it first."""
        cache = self.__thresholds
        while index >= len(cache):
            cache.append(self.__random.random())
        return cache[index]

    def getDonorThreshold(self, donorId):
        """
        Return the threshold of a donor.

        The first two characters of `donorId` are skipped and the rest is
        parsed as the integer index of the threshold.
        """
        numericPart = donorId[2:]
        return self.getThreshold(int(numericPart))
""" For calculating a hidden set of donors. """ import sys, os import result sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from lib.pymersennetwister.mtwister import MTwister __hiddenRandom = MTwister() __hiddenRandom.set_seed(1) __hidden = [] def getHiddenValue(index): while len(__hidden) <= index: __hidden.append(__hiddenRandom.random()) return __hidden[index] def getDonorHiddenValue(icgc_donor_id): return getHiddenValue(int(icgc_donor_id[2:])) def setHiddenValues(examples, template, donorIdKey="icgc_donor_id"): numHidden = 0 if "hidden" in template: for example in examples: example["hidden"] = getDonorHiddenValue(example[donorIdKey]) if example["hidden"] < template["hidden"]: numHidden += 1 return numHidden def getInclude(example, templateHidden, hiddenRule, verbose=True): if hiddenRule not in ("train", "hidden", "both"):