Exemple #1
0
def getExamples(con, experimentName, callback, callbackArgs, metaDataFileName=None, options=None, experimentMeta=None):
    """
    Build the examples of an experiment and hand each kept one to a callback.

    The experiment template is parsed, combined with the options and compiled.
    The rows produced by the compiled "example" function are then processed
    one by one: an example is skipped when the hidden-set rule excludes it,
    when it is downsampled, or when the optional filter yields no rows.
    Every remaining example gets a class label and a feature dictionary.
    Progress is printed to standard output throughout.

    Parameters:
        con: passed to connect(); the returned value is what the compiled
            template functions receive as 'con'.
        experimentName: experiment identifier given to parseExperiment().
        callback: called as callback(example=..., cls=..., features=...,
            **callbackArgs) for every kept example; not called when None.
        callbackArgs: mapping of extra keyword arguments for the callback.
        metaDataFileName: forwarded unchanged to saveMetaData().
        options: applied to the template by parseTemplateOptions().
        experimentMeta: forwarded unchanged to saveMetaData().

    Returns:
        featureIds, the dictionary handed to getId() for every feature key
        (presumably a feature-name to id mapping -- confirm in getId()).

    Raises:
        Exception: when a feature key is not a string or a feature value is
            not a Number.
    """
    con = connect(con)
    # Work on a copy of the parsed experiment (presumably so that applying
    # the options does not alter what parseExperiment() returned -- confirm).
    template = parseExperiment(experimentName).copy()
    template = parseTemplateOptions(options, template)
    # Disabled alternatives, kept for reference:
    #con = connect(con, template.get("functions", None))
    #options = updateTemplateOptions(template, options)
    print "Template:", experimentName
    print json.dumps(template, indent=4)
    compiled, lambdaArgs = compileTemplate(template)
    print "Compiled experiment"
    # Materialise every row as a plain dict so the full list can be counted
    # and passed to hidden.setHiddenValues() before the main loop starts.
    examples = [dict(x) for x in compiled["example"](con=con, **lambdaArgs)]
    numHidden = hidden.setHiddenValues(examples, compiled)
    numExamples = len(examples)
    print "Examples " +  str(numExamples) + ", hidden " + str(numHidden)
    count = 0
    clsIds = compiled.get("classes", None)
    # Rule handed to hidden.getInclude(); defaults to "train" when the
    # template has no "include" entry.
    hiddenRule = compiled.get("include", "train")
    featureIds = {}
    meta = []
    featureGroups = compiled.get("features", [])
    # Dedicated generator for downsampling, seeded with a constant
    # (presumably to make the downsampling reproducible between runs).
    sampleRandom = MTwister()
    sampleRandom.set_seed(2)
    for example in examples:
        # count covers every example, including the ones skipped below; it
        # is only used for the "count/numExamples" progress output.
        count += 1
        if not hidden.getInclude(example, compiled.get("hidden", None), hiddenRule):
            continue
        hidden.setSet(example, compiled.get("hidden", None))
        #print experiment["class"](con, example)
        #if count % 10 == 0:
        # The trailing comma keeps the cursor on the same line so that the
        # label and the progress counter printed below follow the example.
        print "Processing example", example,
        cls = getIdOrValue(compiled["label"](con=con, example=example, **lambdaArgs), clsIds)
        print cls, str(count) + "/" + str(numExamples)
        strCls = str(cls)
        # Downsampling: compiled["sample"] is looked up by the string form of
        # the class; the example is dropped when the random draw exceeds the
        # stored value. Because of short-circuiting, a number is only drawn
        # for classes that have an entry in compiled["sample"].
        if "sample" in compiled and strCls in compiled["sample"] and sampleRandom.random() > compiled["sample"][strCls]:
            print "NOTE: Downsampled example"
            continue
        # Filtering: the example is dropped when the filter yields no rows.
        if "filter" in compiled and compiled["filter"] != None and len([x for x in compiled["filter"](con=con, example=example, **lambdaArgs)]) == 0:
            print "NOTE: Filtered example"
            continue
        features = {}
        for featureGroup in featureGroups:
            for row in featureGroup(con=con, example=example, **lambdaArgs):
                for key, value in itertools.izip(*[iter(row)] * 2): # iterate over each consecutive key,value columns pair
                    if not isinstance(key, basestring):
                        raise Exception("Non-string feature key '" + str(key) + "' in feature group " + str(featureGroups.index(featureGroup)))
                    if not isinstance(value, Number):
                        raise Exception("Non-number feature value '" + str(value) + "' in feature group " + str(featureGroups.index(featureGroup)))
                    # A later value for the same feature id overwrites an
                    # earlier one.
                    features[getId(key, featureIds)] = value
        # An example without features is reported but still processed.
        if len(features) == 0:
            print "WARNING: example has no features"
        if callback != None:
            callback(example=example, cls=cls, features=features, **callbackArgs)
        if "meta" in compiled:
            meta.append(compiled["meta"](label=cls, features=features, example=example, **lambdaArgs))
    saveMetaData(metaDataFileName, con, template, experimentName, options, clsIds, featureIds, meta, experimentMeta)
    return featureIds
Exemple #2
0
class HiddenSet:
    """
    Lazily grown list of pseudo-random thresholds addressed by integer index.

    Thresholds are drawn in index order from a single MTwister generator that
    is seeded at construction time. Drawn values are kept, so asking for the
    same index again returns the same threshold.
    """

    def __init__(self, seed=1):
        """Create the seeded generator and an empty threshold cache."""
        generator = MTwister()
        generator.set_seed(seed)
        self.__random = generator
        self.__thresholds = []

    def getThreshold(self, index):
        """
        Return the threshold stored at position index.

        Positions that do not exist yet, up to and including index, are
        filled with fresh draws from the generator first.
        """
        cached = self.__thresholds
        while index >= len(cached):
            cached.append(self.__random.random())
        return cached[index]

    def getDonorThreshold(self, donorId):
        """
        Return the threshold that belongs to a donor.

        The index is the integer parsed from donorId once its first two
        characters have been removed.
        """
        return self.getThreshold(int(donorId[2:]))
Exemple #3
0
"""
For calculating a hidden set of donors.
"""
import sys, os
import result
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from lib.pymersennetwister.mtwister import MTwister

# Module-wide generator behind the hidden values. It is seeded with a constant,
# presumably so that the sequence of draws is the same on every run (assumes
# MTwister is deterministic for a given seed -- confirm).
__hiddenRandom = MTwister()
__hiddenRandom.set_seed(1)
# Values drawn so far, in draw order; getHiddenValue() appends to this list
# and reads from it by index.
__hidden = []

def getHiddenValue(index):
    """
    Return the hidden value stored at position index.

    The module-level cache is extended with new draws from the module-level
    generator until it contains that position.
    """
    drawn = __hidden
    while index >= len(drawn):
        drawn.append(__hiddenRandom.random())
    return drawn[index]

def getDonorHiddenValue(icgc_donor_id):
    """
    Return the hidden value of a donor.

    The lookup index is the integer parsed from icgc_donor_id once its first
    two characters have been removed.
    """
    donorNumber = int(icgc_donor_id[2:])
    return getHiddenValue(donorNumber)

def setHiddenValues(examples, template, donorIdKey="icgc_donor_id"):
    """
    Store a hidden value on every example and count the hidden ones.

    Nothing is done unless the template has a "hidden" entry. Otherwise each
    example gets a "hidden" key holding the value of its donor, which is
    looked up through the field named by donorIdKey.

    Returns the number of examples whose hidden value is below
    template["hidden"]; 0 when the template has no "hidden" entry.
    """
    if "hidden" not in template:
        return 0
    threshold = template["hidden"]
    belowThreshold = 0
    for item in examples:
        donorValue = getDonorHiddenValue(item[donorIdKey])
        item["hidden"] = donorValue
        if donorValue < threshold:
            belowThreshold += 1
    return belowThreshold

def getInclude(example, templateHidden, hiddenRule, verbose=True):
    if hiddenRule not in ("train", "hidden", "both"):