Code Example #1
    def run(self):
        dp = SequentialDataProvider(self.dataset, 'C:\\data', self.task['chunkSize'], 0)
        db = DataBinner(self.task['dimensions'], self.task['dimensionAggregateFunctions'],
                        self.task['nrOfBins'], self.task['aggregateDimensions'],
                        self.task['aggregateFunctions'], self.task['brushes'])

        while self.isRunningPerUUID[self.uuid]:
            time.sleep(0.2)
            self.step(dp, db)
                
        print 'VisualizationExecutor END'
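The loop above delegates all real work to `step(...)`, whose source is not included in these excerpts. A minimal sketch of what it plausibly does, assuming only the `getDataFrame()` and `bin()` signatures and the result shape that appear in Examples #3 and #4 (the project's actual implementation may differ):

    def step(self, dp, db):
        # sketch only: pull the next chunk, bin it, publish the result
        progress, df = dp.getDataFrame()
        if df is not None:
            db.bin(df, progress)
            self.resultsPerUUID[self.uuid] = {
                'binStructure' : db.binStructure.toJson(),
                'progress' : progress
            }
        # stop once the provider reports completion
        if df is None or progress >= 1.0:
            self.isRunningPerUUID[self.uuid] = False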
Code Example #2
 def __init__(self, uuid, task, dataset, resultsPerUUID, isRunningPerUUID):
     Executor.__init__(self, uuid)
     
     self.task = task
     self.dataset = dataset
     self.resultsPerUUID = resultsPerUUID
     self.isRunningPerUUID = isRunningPerUUID
     
     # compute the first batch synchronously so the client sees results immediately
     dp = SequentialDataProvider(self.dataset, 'C:\\data', self.task['chunkSize'], 0)
     db = DataBinner(self.task['dimensions'], self.task['dimensionAggregateFunctions'],
                     self.task['nrOfBins'], self.task['aggregateDimensions'],
                     self.task['aggregateFunctions'], self.task['brushes'])
     self.step(dp, db)
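`Executor.__init__` here and the `exe.start()` calls in Example #3 imply that `Executor` is a `threading.Thread` subclass tagged with the job's UUID. Its source is not shown; a plausible minimal version, offered purely as an assumption:

    import threading

    class Executor(threading.Thread):
        # assumed base class: a worker thread identified by the job UUID
        def __init__(self, uuid):
            threading.Thread.__init__(self)
            self.uuid = uuid
            self.daemon = True  # assumption: workers should not block interpreter exit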
Code Example #3
    def do_POST(self):
        response = 200
        result = None     
        try:
            content_length = int(self.headers.getheader('content-length'))
            job = json.loads(self.rfile.read(content_length))
            print job
            logging.info('Request : ' + job['type'])
            
            key = json.dumps(job, indent=2, default=default)
            
            if job['type'] == 'execute':            
                if key in Shared().computationToUUIDLookup:
                    print 'existing comp ' + job['task']['type']
                    result = {'uuid' : Shared().computationToUUIDLookup[key]}
                    
                else:   
                    if job['task']['type'] == 'visualization':
                        print 'new comp visualization'
                        newUUID = str(uuid.uuid4())
                        exe = VisualizationExecutor(newUUID, job['task'], job['dataset'],
                                                    Shared().resultsPerUUID, Shared().isRunningPerUUID)
                        Shared().isRunningPerUUID[newUUID] = True
                        exe.start()  
                        Shared().computationToUUIDLookup[key] = newUUID
                        result = {'uuid' : newUUID}
                        
                    elif job['task']['type'] == 'classify':
                        print 'new comp classify'
                        newUUID = str(uuid.uuid4())
                        exe = ClassificationExecutor(newUUID, job['task'], job['dataset'],
                                                     Shared().resultsPerUUID, Shared().isRunningPerUUID,
                                                     Shared().modelsPerUUID)
                        Shared().isRunningPerUUID[newUUID] = True
                        exe.start()  
                        Shared().computationToUUIDLookup[key] = newUUID
                        result = {'uuid' : newUUID}
                        
            elif job['type'] == 'test':
                print 'new comp test'
                reqUuid = job['uuid']
                print reqUuid
                print pd.DataFrame(job['features'])

                if reqUuid in Shared().modelsPerUUID:
                    model = Shared().modelsPerUUID[reqUuid]
                    X = np.array(pd.DataFrame(job['features'])[job['feature_dimensions']])
                    prediction = model.predict(X)[0]
                    print X
                    print 'test prediction : ', prediction
                    result = json.dumps({'result' : prediction}, default=default)
                else:
                    print ">>> not found"   
                    result = json.dumps({'result' : 0}, default=default)
                    
            elif job['type'] == 'catalog':
                subdirs = [name for name in os.listdir(Shared().dataFolder) if os.path.isdir(os.path.join(Shared().dataFolder, name))]
                schemas = {}
                print subdirs
                for subdir in subdirs:
                    dp = SequentialDataProvider(subdir, Shared().dataFolder, 100, 0)
                    p, df = dp.getDataFrame()
                    attrs = df.columns
                    dtypes = df.dtypes
                    schema = { 'uuid' : -1, 'schema': [(attr,str(dtypes[attr])) for attr in attrs] }
                    schemas[subdir] = schema
                result = json.dumps(schemas, indent=2, default=default)
                
            elif job['type'] == 'lookup':
                if job['uuid'] in Shared().resultsPerUUID:
                    result = json.dumps(Shared().resultsPerUUID[job['uuid']], indent=2, default=default)          
            
            elif job['type'] == 'halt':
                if job['uuid'] in Shared().isRunningPerUUID:
                    Shared().isRunningPerUUID[job['uuid']] = False      
                
            elif job['type'] == 'tasks':
                result = json.dumps([
                    ['classify', ['sgd',
                                  'naive_bayes',
                                  'perceptron',
                                  'passive_aggressive']]
                ], default=default)
                                    
        except Exception:
            print traceback.format_exc()
            response = 500
            result = 'malformed request\n'

        self.send_response(response)
        self.send_header('Content-type', 'application/json')
        self.end_headers()
        # result may legitimately still be None (e.g. 'halt', or a 'lookup' miss)
        self.wfile.write(result if result is not None else '')
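For reference, the handler can be exercised with a plain JSON POST. The host and port below are placeholders, since the server bootstrap is not part of these excerpts; `urllib2` is used to stay in the codebase's Python 2 register:

    import json
    import urllib2

    # hypothetical endpoint; point this at the actual server
    job = {'type': 'catalog'}
    req = urllib2.Request('http://localhost:8888',
                          json.dumps(job),
                          {'Content-Type': 'application/json'})
    print urllib2.urlopen(req).read()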
Code Example #4
 def run(self):
     dp = SequentialDataProvider(self.dataset, 'C:\\data', self.task['chunkSize'], 0)
     
     cls_name = self.task['classifier']
     cls = self.getClassifier(cls_name)
             
     progressList = []
     f1List = []
     
     X_test = []
     y_test = []
     df_test = None
     while True:
         if not self.isRunningPerUUID[self.uuid]:
             break
         start = time.time()
         tick = time.time()
         
         progress, df = dp.getDataFrame()

         if df is not None:
             # apply the optional row filter only when a chunk was returned
             if self.task['filter'].strip():
                 df = df.query(self.task['filter'].strip())

             split = int(math.ceil(len(df) * 0.3))

             # retain the first chunk as a fixed held-out test set
             if len(X_test) == 0:
                 X_test = df[self.task['features']]
                 y_test = df.eval(self.task['label'])
                 df_test = df
                 #X_test =  np.array(df[self.task['features']])
                 #y_test =  np.array([1 if x else 0 for x in np.array(df.eval(self.task['label']))])
                 
             else:
                 dfTest = df[:split]
                 dfTrain = df[split:]
             
                 y_train = dfTrain.eval(self.task['label'])
                 X_train = dfTrain[self.task['features']]
                 
                 y_test_current = dfTest.eval(self.task['label'])
                 X_test_current = dfTest[self.task['features']]
                 
                 cls.partial_fit(np.array(X_train),
                                 np.array([1 if x else 0 for x in np.array(y_train)]),
                                 classes=np.array([0, 1]))
                 
                 y_prob = None
                 y_pred = None
                 
                 if cls_name in ['sgd', 'perceptron', 'passive_aggressive']:
                     # no predict_proba on these; use the raw predictions as scores
                     y_pred = cls.predict(np.array(pd.concat([X_test, X_test_current])))
                     y_prob = np.array([[0, y] for y in y_pred])
                 else:
                     y_prob = cls.predict_proba(np.array(pd.concat([X_test, X_test_current])))
                     # column 1 is P(class == 1) given classes=[0, 1]
                     y_pred = [1 if t[1] >= 0.5 else 0 for t in y_prob]
                     
                 y_test_concat = np.array([1 if x else 0 for x in np.array(pd.concat([y_test, y_test_current]))])
                 cm = confusion_matrix(y_test_concat, y_pred)
                 stats = self.classifyStats(cm, y_test_concat, y_prob, len(y_test_concat), progress)
                 progressList.append(progress)
                 f1List.append(stats['f1'])
                 stats['f1'] = f1List
                 stats['progress'] = progressList
                 
                 dfTest = df_test.copy(deep=True)
                 dfTest['actual'] = df_test.eval(self.task['label'])
                 dfTest['predicted'] = [t == 1 for t in y_pred[:len(y_test)]]
                 
                 histograms = {}
                 for feature in self.task['features']:
                     db = DataBinner([feature, feature], ['None', 'Count'], [10,10], [feature], ['Count'], 
                         ['actual and predicted', 'not actual and predicted', 'not actual and not predicted', 'actual and not predicted'])
                     
                     db.bin(dfTest, 1.0)
                     data = {
                         'binStructure' : db.binStructure.toJson(), 
                         'progress' : 1.0
                     }
                     histograms[feature] = data
                 
                 self.resultsPerUUID[self.uuid] = {self.task['label'] : stats, 'progress' : progress, 'histograms' : histograms}
                 
                 self.modelsPerUUID[self.uuid] = cls
                 
         end = time.time()
         print 'ClassificationExecutor : p= ' + '{0:.2f}'.format(progress) + ', t= ' + str(end - start)
         if df is None or progress >= 1.0:
             self.isRunningPerUUID[self.uuid] = False
             break 
             
     print 'ClassificationExecutor END'        
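Both this executor (`self.getClassifier`) and the script in Example #5 (`getClassifier`) resolve a classifier name to an estimator that supports `partial_fit`; the mapping itself is not shown anywhere in these excerpts. Given the four names advertised by the 'tasks' branch in Example #3, a plausible scikit-learn sketch (an assumption, not the project's actual code):

    from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
    from sklearn.naive_bayes import GaussianNB

    def getClassifier(name):
        # assumed mapping; all four estimators support partial_fit
        classifiers = {
            'sgd' : SGDClassifier(),
            'perceptron' : Perceptron(),
            'passive_aggressive' : PassiveAggressiveClassifier(),
            'naive_bayes' : GaussianNB(),  # has predict_proba, matching the else branch above
        }
        return classifiers[name]

GaussianNB is one guess for 'naive_bayes'; MultinomialNB would also fit if every feature is a non-negative count.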
Code Example #5
job = {
    "dataset": "cars.csv_100000",
    "task": {
        "type": "classify",
        "classifier": "passive_aggressive",
        "chunkSize": 10000,
        "features": ["car_name", "model_year"],
        "label": "mpg < 30",
        "filter": ""
    }
}

print json.dumps(job)

task = job['task']

dp = SequentialDataProvider(job['dataset'], 'C:\\data\\', task['chunkSize'], 0)
cls = getClassifier(task['classifier'])


def default(o):
    if isinstance(o, np.integer): return int(o)
    raise TypeError
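# Note: this mirrors the `default` hook assumed throughout Example #3; it lets
# json.dumps serialize numpy integer types, e.g.
#   json.dumps({'n': np.int64(3)}, default=default)  # -> '{"n": 3}'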


cls_stats = {}
cls_name = task['classifier']

stats = {
    'n_train': 0,
    'n_train_pos': 0,
    'accuracy': 0.0,