Example #1
def checkLocalReferences(iterableObject, sourceObject, workflow_stats):
    """Recursively walk a dict/list/tuple and evaluate, in place, every
    string expression it contains."""
    if isinstance(iterableObject, dict):
        for key, value in iterableObject.items():
            if isinstance(value, (dict, list, tuple)):
                checkLocalReferences(value, sourceObject, workflow_stats)
            elif isinstance(value, str):
                iterableObject[key] = evaluateExpression(
                    value, sourceObject, workflow_stats)

    elif isinstance(iterableObject, (list, tuple)):
        for index, value in enumerate(iterableObject):
            if isinstance(value, (dict, list, tuple)):
                checkLocalReferences(value, sourceObject, workflow_stats)
            elif isinstance(value, str) and isinstance(iterableObject, list):
                # Tuples are immutable; only lists can be updated in place.
                iterableObject[index] = evaluateExpression(
                    value, sourceObject, workflow_stats)
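
A minimal usage sketch, assuming a trivial stand-in for evaluateExpression (the real function is defined elsewhere in this module and is only stubbed here for illustration):

# Hypothetical stub: the real evaluateExpression resolves string
# expressions against sourceObject and workflow_stats.
def evaluateExpression(value, sourceObject, workflow_stats):
    return sourceObject.get(value, value)

config = {'title': 'name', 'nested': {'owner': 'user'}}
checkLocalReferences(config, {'name': 'report A', 'user': 'alice'}, {})
# config is now {'title': 'report A', 'nested': {'owner': 'alice'}}
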
Example #2
def checkReferencesInIterableAttributes(iterableObject, current_workflow_stats):
    """Recursively walk a dict/list/tuple and evaluate, in place, every
    string expression against the current workflow statistics."""
    if isinstance(iterableObject, dict):
        for key, value in iterableObject.items():
            if isinstance(value, (dict, list, tuple)):
                checkReferencesInIterableAttributes(value, current_workflow_stats)
            elif isinstance(value, str):
                iterableObject[key] = evaluateExpression(value, {}, current_workflow_stats)

    elif isinstance(iterableObject, (list, tuple)):
        for index, value in enumerate(iterableObject):
            if isinstance(value, (dict, list, tuple)):
                checkReferencesInIterableAttributes(value, current_workflow_stats)
            elif isinstance(value, str) and isinstance(iterableObject, list):
                # Tuples are immutable; only lists can be updated in place.
                iterableObject[index] = evaluateExpression(value, {}, current_workflow_stats)
Example #3
def project_pipelineOLD(dataset, projection, workflow_stats=None):
    # TODO: make this more efficient
    if workflow_stats is None:
        workflow_stats = {}
    projectedDataset = []
    for d in dataset:
        objToInsert = {}

        if isinstance(projection, dict):
            if projection:  # non-empty projection: evaluate each key
                for key, value in projection.items():
                    objToInsert[key] = evaluateExpression(
                        value, d, workflow_stats)
            else:  # empty projection: keep the row as-is
                objToInsert = dict(d)
        elif isinstance(projection, str):  # a single value expression
            objToInsert['value'] = evaluateExpression(projection, d,
                                                      workflow_stats)

        projectedDataset.append(objToInsert)
    return projectedDataset
Example #4
def project_pipeline(dataset, projection, workflow_stats=None, parameters=None):
    # TODO: make this more efficient
    if workflow_stats is None:
        workflow_stats = {}
    projectedDataset = []
    if len(dataset) > 0:

        if isinstance(projection, str):  # a single value expression
            for d in dataset:
                objToInsert = {'value': evaluateExpression(
                    projection, d, workflow_stats, parameters)}
                projectedDataset.append(objToInsert)

        elif isinstance(projection, dict):
            # Split the projection into keys that map directly to an
            # existing column (simple) and keys that need evaluation.
            simpleProjectionKeys = {}
            complexProjectionKeys = {}

            simpleProjectionAid = set(projection.values()) & set(
                dataset[0].keys())

            for key, value in projection.items():
                if value in simpleProjectionAid:
                    simpleProjectionKeys[key] = value
                else:
                    complexProjectionKeys[key] = value

            for d in dataset:
                objToInsert = {}
                for key, value in projection.items():
                    if key in simpleProjectionKeys:
                        objToInsert[key] = d[value]
                    else:
                        objToInsert[key] = evaluateExpression(
                            value, d, workflow_stats, parameters)

                projectedDataset.append(objToInsert)

    return projectedDataset
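
A usage sketch for the fast path: when every projection value is an existing column name, all keys land in simpleProjectionKeys and evaluateExpression is never called, so no stub is needed:

rows = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
print(project_pipeline(rows, {'x': 'a', 'y': 'b'}))
# [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}]
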
Example #5
def accumulator_min(objNew,
                    tupleKey,
                    dataset,
                    mapsGrouperStats,
                    measureKey,
                    measureFormula,
                    workflow_stats,
                    simple=True):
    grouperEntry = mapsGrouperStats.get(tupleKey)
    # +inf default so the first value always wins the min comparison.
    oldValueInDataset = dataset[grouperEntry['index']].get(
        measureKey, float('+inf'))
    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)

    dataset[grouperEntry['index']][measureKey] = min(oldValueInDataset,
                                                     objNewMeasure)
Example #6
def accumulator_avg(objNew,
                    tupleKey,
                    dataset,
                    mapsGrouperStats,
                    measureKey,
                    measureFormula,
                    workflow_stats,
                    simple=True):
    grouperEntry = mapsGrouperStats.get(tupleKey)
    # 0 default: with count == 1 the first update yields x_1 exactly.
    oldValueInDataset = dataset[grouperEntry['index']].get(
        measureKey, float(0))
    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)

    # One-pass (running) mean: m_k = m_{k-1} + (x_k - m_{k-1}) / k
    dataset[grouperEntry['index']][measureKey] = oldValueInDataset + float(
        objNewMeasure - oldValueInDataset) / grouperEntry['count']
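
The running-mean update in the comment can be checked in isolation: for the values 10, 20, 30 it gives m_1 = 10, m_2 = 10 + (20 - 10)/2 = 15, m_3 = 15 + (30 - 15)/3 = 20, which matches the batch mean:

# Standalone check of the one-pass mean used by accumulator_avg.
m, k = 0.0, 0
for x in [10, 20, 30]:
    k += 1
    m += (x - m) / k
print(m)  # 20.0 == sum([10, 20, 30]) / 3
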
Example #7
def accumulator_setappend(objNew,
                          tupleKey,
                          dataset,
                          mapsGrouperStats,
                          measureKey,
                          measureFormula,
                          workflow_stats,
                          simple=True):
    grouperEntry = mapsGrouperStats.get(tupleKey)
    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)

    try:
        dataset[grouperEntry['index']][measureKey] |= {objNewMeasure}
    except KeyError:
        # First value for this group: create the set.
        dataset[grouperEntry['index']][measureKey] = {objNewMeasure}
Example #8
def accumulator_append(objNew,
                       tupleKey,
                       dataset,
                       mapsGrouperStats,
                       measureKey,
                       measureFormula,
                       workflow_stats,
                       simple=True):
    grouperEntry = mapsGrouperStats.get(tupleKey)

    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)

    # Create the list on first use, then append the new value.
    dataset[grouperEntry['index']].setdefault(measureKey, []).append(
        objNewMeasure)
Example #9
def accumulator_countDistinct(objNew,
                              tupleKey,
                              dataset,
                              mapsGrouperStats,
                              measureKey,
                              measureFormula,
                              workflow_stats,
                              simple=True):
    grouperEntry = mapsGrouperStats.get(tupleKey)

    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)

    # Track the distinct values seen for this group.
    if 'set' in grouperEntry:
        grouperEntry['set'].add(objNewMeasure)
    else:
        grouperEntry['set'] = {objNewMeasure}

    dataset[grouperEntry['index']][measureKey] = len(grouperEntry['set'])
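
A minimal sketch of the data structures these accumulators expect: the bookkeeping entry ('index', and here 'set') lives in mapsGrouperStats, while the visible measure is written into the matching dataset row:

dataset = [{'group': 'a'}]                 # one result row so far
mapsGrouperStats = {('a',): {'index': 0}}  # group tuple -> bookkeeping
for obj in [{'v': 1}, {'v': 2}, {'v': 1}]:
    accumulator_countDistinct(obj, ('a',), dataset, mapsGrouperStats,
                              'distinct_v', 'v', {}, simple=True)
print(dataset[0]['distinct_v'])  # 2 -- only two distinct values were seen
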
Example #10
from copy import deepcopy
from time import time


def process_workflow_innerRecursive(workflow, current_workflow_stats=None,
                                    currentIndex=0, fromIter=False,
                                    verbose=False):
    if current_workflow_stats is None:
        current_workflow_stats = {}

    if currentIndex < len(workflow):

        startProcessingStage = time()

        stage = workflow[currentIndex]

        stage['execute'] = stage.get('execute', True)

        if stage['type'] in ITERATORS_STAGES_ARRAY:

            # Snapshot the stage so each iteration starts from a clean copy.
            stageCopy = {}
            stageCopy['inputs'] = deepcopy(stage['inputs'])
            stageCopy['configuration'] = deepcopy(stage['configuration'])
            stageCopy['execute'] = deepcopy(stage['execute'])
            if verbose:
                utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
            iterableStage = process_workflowStage(stage, current_workflow_stats)
            lastReachedIndex = currentIndex + 1

            sumTimeIterations = 0
            # `is not None` so an iteration yielding a falsy value still runs.
            while next(iterableStage, None) is not None:
                beforeIteration = time()
                lastReachedIndex = process_workflow_innerRecursive(
                    workflow, current_workflow_stats, currentIndex + 1,
                    True, verbose)
                sumTimeIterations += time() - beforeIteration

            stage['inputs'] = deepcopy(stageCopy['inputs'])
            stage['configuration'] = deepcopy(stageCopy['configuration'])
            stage['execute'] = deepcopy(stageCopy['execute'])

            endProcessingStage = time()
            stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage) - sumTimeIterations
            return process_workflow_innerRecursive(workflow, current_workflow_stats, lastReachedIndex + 1, fromIter, verbose)

        elif stage['type'] in MATCHERS_STAGES_ARRAY:
            if evaluateExpression(stage['execute'], {}, current_workflow_stats):
                stageCopy = {}
                stageCopy['inputs'] = deepcopy(stage['inputs'])
                stageCopy['configuration'] = deepcopy(stage['configuration'])
                stageCopy['execute'] = deepcopy(stage['execute'])

                if verbose:
                    utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
                process_workflowStage(stage, current_workflow_stats)
                stage['inputs'] = stageCopy['inputs']
                stage['configuration'] = stageCopy['configuration']
                stage['execute'] = stageCopy['execute']

                if stage['outputs']['continue']:
                    currentIndex += 1
                    endProcessingStage = time()
                    stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage)
                    return process_workflow_innerRecursive(workflow, current_workflow_stats, currentIndex, fromIter, verbose)
                else:
                    # The matcher said stop: skip ahead to the stage that
                    # closes the current chain.
                    currentChain = getWorkflowChain(workflow)
                    for indexToReturn in range(currentIndex, len(currentChain)):
                        if currentChain[indexToReturn] == ')':
                            break
                    endProcessingStage = time()
                    stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage)
                    return indexToReturn
            else:
                currentIndex += 1
                endProcessingStage = time()
                stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage)
                return process_workflow_innerRecursive(workflow, current_workflow_stats, currentIndex, fromIter, verbose)

        elif stage['type'] in SYNCERS_STAGES_ARRAY and fromIter:
            # Processed each time this method reaches a syncer stage after
            # starting an iterator.

            stageCopy = {}
            stageCopy['inputs'] = deepcopy(stage['inputs'])
            stageCopy['configuration'] = deepcopy(stage['configuration'])
            stageCopy['execute'] = deepcopy(stage['execute'])

            if verbose:
                utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
            process_workflowStage(stage, current_workflow_stats)
            stage['inputs'] = stageCopy['inputs']
            stage['configuration'] = stageCopy['configuration']
            stage['execute'] = stageCopy['execute']

            # Nested iteration syncers must be reinitialized after each
            # termination.
            syncersStageToReInit = get_sincerStages_to_reinitialize(workflow[:currentIndex + 1])
            for i in syncersStageToReInit:
                workflow[i]['outputs'] = {'syncedData': None}

            endProcessingStage = time()
            stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage)
            return currentIndex

        elif stage['type'] in SYNCERS_STAGES_ARRAY and not fromIter:
            # Reached a syncer stage outside of an iteration: process it and
            # move on to the next stage.

            stageCopy = {}
            stageCopy['inputs'] = deepcopy(stage['inputs'])
            stageCopy['configuration'] = deepcopy(stage['configuration'])
            stageCopy['execute'] = deepcopy(stage['execute'])

            if verbose:
                utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
            process_workflowStage(stage, current_workflow_stats)
            stage['inputs'] = stageCopy['inputs']
            stage['configuration'] = stageCopy['configuration']
            stage['execute'] = stageCopy['execute']

            currentIndex += 1

            endProcessingStage = time()
            stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage)
            return process_workflow_innerRecursive(workflow, current_workflow_stats, currentIndex, fromIter, verbose)

        else:
            currentIndex += 1
            startStage = time()
            if evaluateExpression(stage['execute'], {}, current_workflow_stats):
                stageCopy = {}
                stageCopy['inputs'] = deepcopy(stage['inputs'])
                stageCopy['configuration'] = deepcopy(stage['configuration'])
                stageCopy['execute'] = deepcopy(stage['execute'])

                if verbose:
                    utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
                process_workflowStage(stage, current_workflow_stats)
                stopStage = time()
                if verbose:
                    utilPrint('time elapsed while processing stage ' + stage['id'] + ' : ' + str(stopStage - startStage))

                stage['inputs'] = stageCopy['inputs']
                stage['configuration'] = stageCopy['configuration']
                stage['execute'] = stageCopy['execute']

            endProcessingStage = time()
            stage['timespent'] = stage.get('timespent', 0) + (endProcessingStage - startProcessingStage)
            return process_workflow_innerRecursive(workflow, current_workflow_stats, currentIndex, fromIter, verbose)

    return currentIndex
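
The recursion above only relies on a handful of fields per stage dict. A minimal, hypothetical stage illustrating the shape it reads and writes (the actual schema is defined by the rest of the module):

# Hypothetical stage shape, inferred from the fields the function touches.
stage = {
    'id': 'filter-1',
    'type': 'filter',     # matched against the *_STAGES_ARRAY constants
    'execute': True,      # expression or bool, evaluated before running
    'inputs': {},         # snapshotted with deepcopy and restored after
    'configuration': {},
    'outputs': {},        # matcher stages also expose outputs['continue']
}
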
Example #11
def aggregate_dataset(dataset, dimensions, measures, workflow_stats=None):
    if workflow_stats is None:
        workflow_stats = {}
    header = list(dimensions.keys()) + list(measures.keys())
    rawAggregateMapping = {}  # maps each group tuple to its index in the array
    flatResultsDataset = []
    if len(dataset) > 0:

        # Dimensions/measures whose formula is just an existing column name
        # can be read directly instead of being evaluated.
        simpleGroupAid = set(dimensions.values()) & set(dataset[0].keys())
        key_formula_measures = {
            k: v[next(iter(v))]
            for k, v in measures.items()
        }
        key_acc_measures = {k: next(iter(v)) for k, v in measures.items()}
        simpleMeasuresAid = set(key_formula_measures.values()) & set(
            dataset[0].keys())

        simpleGroupKeys = {
            key: value
            for key, value in dimensions.items() if value in simpleGroupAid
        }
        simpleMeasuresKeys = {
            key: value
            for key, value in key_formula_measures.items()
            if value in simpleMeasuresAid
        }

        accumulators_ready_to_use = []
        for key, value in key_formula_measures.items():
            accumulator_callable = ACCUMULATOR_MAP_FUNCTIONS[
                key_acc_measures[key]]
            accumulators_ready_to_use.append(
                [key, value, accumulator_callable, key in simpleMeasuresKeys])

        resultsAppender = flatResultsDataset.append
        dimensions_items = [[key, value, key in simpleGroupKeys]
                            for key, value in dimensions.items()]
        sizeOfResults = 0

        for d in dataset:
            objToInsert = {
                key: d[value] if is_simple else evaluateExpression(
                    value, d, workflow_stats)
                for key, value, is_simple in dimensions_items
            }
            tuple_key = tuple(objToInsert.items())

            try:
                rawAggregateMapping[tuple_key]['count'] += 1
            except KeyError:
                # First row of a new group: register it in the results.
                resultsAppender(objToInsert)
                rawAggregateMapping[tuple_key] = {
                    'index': sizeOfResults,
                    'count': 1
                }
                sizeOfResults += 1

            for key, value, acc_callable, is_simple_measure in accumulators_ready_to_use:
                acc_callable(dict(d), tuple_key, flatResultsDataset,
                             rawAggregateMapping, key, value, workflow_stats,
                             is_simple_measure)
    elif len(dimensions) == 0:
        # Empty dataset and no dimensions: return one row holding each
        # accumulator's initial value.
        key_acc_measures = {k: next(iter(v)) for k, v in measures.items()}
        rowToReturn = {}
        for key, value in key_acc_measures.items():
            rowToReturn[key] = ACCUMULATOR_MAP_FUNCTIONS_INIT[value]
        flatResultsDataset.append(rowToReturn)

    return flatResultsDataset, header
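
A sketch of a call, assuming ACCUMULATOR_MAP_FUNCTIONS maps accumulator names to the accumulator functions shown above (the real mapping is defined elsewhere in the module):

ACCUMULATOR_MAP_FUNCTIONS = {'min': accumulator_min, 'avg': accumulator_avg}

rows = [{'city': 'NY', 'price': 10}, {'city': 'NY', 'price': 30}]
data, header = aggregate_dataset(rows,
                                 {'city': 'city'},
                                 {'min_price': {'min': 'price'},
                                  'avg_price': {'avg': 'price'}})
print(data)  # [{'city': 'NY', 'min_price': 10, 'avg_price': 20.0}]

Because both the dimension and the measure formulas are plain column names, every key takes the simple path and evaluateExpression is never invoked.
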
Example #12
def aggregate_datasetOLD(dataset, dimensions, measures, workflow_stats=None):
    # mapping
    # NOTE: assumes dataset is non-empty; the newer version above guards
    # against an empty dataset.
    if workflow_stats is None:
        workflow_stats = {}
    header = list(dimensions.keys()) + list(measures.keys())
    rawAggregateMapping = {}  # maps each group tuple to its index in the array
    flatResultsDataset = []

    simpleGroupKeys = {}
    complexGroupKeys = {}
    simpleGroupAid = set(dimensions.values()) & set(dataset[0].keys())

    simpleMeasuresKeys = {}
    complexMeasuresKeys = {}
    key_formula_measures = {k: v[next(iter(v))] for k, v in measures.items()}
    simpleMeasuresAid = set(key_formula_measures.values()) & set(
        dataset[0].keys())

    for key, value in dimensions.items():
        if value in simpleGroupAid:
            simpleGroupKeys[key] = value
        else:
            complexGroupKeys[key] = value

    for key, value in key_formula_measures.items():
        if value in simpleMeasuresAid:
            simpleMeasuresKeys[key] = value
        else:
            complexMeasuresKeys[key] = value

    for d in dataset:
        objToInsert = {}
        for key, value in dimensions.items():
            if key in simpleGroupKeys:
                objToInsert[key] = d[value]
            else:
                objToInsert[key] = evaluateExpression(value, d, workflow_stats)

        tuple_key = tuple(objToInsert.items())

        for key, value in key_formula_measures.items():
            if key in simpleMeasuresKeys:
                objToInsert[key] = d[value]
            else:
                objToInsert[key] = evaluateExpression(value, d, workflow_stats)

        grouperEntry = rawAggregateMapping.get(tuple_key, None)
        if grouperEntry is None:
            # First row of a new group: insert it, then wrap the measures
            # that accumulate into collections.
            flatResultsDataset.append(objToInsert)
            rawAggregateMapping[tuple_key] = {
                'index': len(flatResultsDataset) - 1
            }
            row = flatResultsDataset[-1]
            for key, value in measures.items():
                accumulator = next(iter(value))
                if accumulator == 'append':
                    row[key] = [row[key]]
                elif accumulator == 'set_append':
                    row[key] = {row[key]}
        else:
            row = flatResultsDataset[grouperEntry['index']]
            for key, value in measures.items():
                accumulator = next(iter(value))
                if accumulator == 'sum':
                    row[key] += objToInsert[key]
                elif accumulator == 'max':
                    row[key] = max(row[key], objToInsert[key])
                elif accumulator == 'min':
                    row[key] = min(row[key], objToInsert[key])
                elif accumulator == 'avg':
                    # One-pass mean: m_k = m_{k-1} + (x_k - m_{k-1}) / k
                    grouperEntry['count'] = grouperEntry.get('count', 1) + 1
                    oldAvg = row[key]
                    row[key] = oldAvg + (
                        float(objToInsert[key] - oldAvg) /
                        float(grouperEntry['count']))
                elif accumulator == 'append':
                    if type(row[key]) is list:
                        row[key].append(objToInsert[key])
                    else:
                        row[key] = [row[key], objToInsert[key]]
                elif accumulator == 'set_append':
                    row[key].add(objToInsert[key])
    return flatResultsDataset, header