Example #1
class KafkaClassificationModelConsumerExecutor(object):
    def __init__(self, config=None):
        # Fall back to the dev config file when no path is given;
        # ConfigManager is a project-local helper.
        if config is None:
            config = "acm.config.dev.yml"
        self.configFile = config
        self.configManager = ConfigManager(
            params={"config.file": self.configFile})
        self.config = self.configManager.read()

    def start(self):
        # Run the Kafka consumer in a child process and wait for it to finish.
        k = KafkaClassificationModelConsumerProcess(self.config.toJson())
        k.start()
        k.join()
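
A minimal way to run this executor (a sketch; ConfigManager and
KafkaClassificationModelConsumerProcess come from the surrounding project,
and the __main__ guard matters because the consumer runs in a child process):

if __name__ == "__main__":
    executor = KafkaClassificationModelConsumerExecutor()  # uses acm.config.dev.yml
    executor.start()
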
Example #2
import json
import os
from multiprocessing import Process, Queue

# ConfigManager and MulticlassLogisticRegressionModelTrainer are
# project-local classes from the surrounding codebase.

class MulticlassClassifierExecutor(object):
    def __init__(self, config=None):
        # Fall back to the dev config file when no path is given.
        if config is None:
            config = "acm.config.dev.yml"
        self.configFile = config
        self.configManager = ConfigManager(
            params={"config.file": self.configFile})
        self.config = self.configManager.read()

    # Runs in a child process; lr stands for logistic regression.
    def lrModelTrainer(self, q, parentEnv, configJsonStr):
        classifier = MulticlassLogisticRegressionModelTrainer()
        classifier.start(q, parentEnv, configJsonStr)
        # Report back to the parent through the shared queue.
        q.put(["hello parent!", "I am lrModelTrainer"])

    # Entry point of the parent process.
    def start(self):
        fs = [
            self.lrModelTrainer,
            self.lrModelTrainer,
        ]  # functions to run in child processes
        ps = []  # processes
        qs = []  # queues
        # Pass data to the children via queues; snapshot the environment as JSON.
        env_ = json.dumps(os.environ.copy())
        for f in fs:
            q = Queue()
            configJsonStr = self.config.toJson()
            p = Process(target=f, args=(q, env_, configJsonStr))
            qs.append(q)
            p.start()
            ps.append(p)

        # Drain the queues before joining: a child that has queued data may
        # not exit until the data is consumed, so calling join() first can
        # deadlock, and joining inside the start loop would serialize the
        # children instead of running them in parallel.
        for q in qs:
            print(q.get())
        for p in ps:
            p.join()
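
The fan-out/fan-in shape above reduces to a self-contained stdlib sketch
(no project classes involved) that shows the same start-all, drain, then
join ordering:

import os
from multiprocessing import Process, Queue

def worker(q, label):
    # Each child reports its label and pid back to the parent.
    q.put([label, os.getpid()])

if __name__ == "__main__":
    qs, ps = [], []
    for i in range(2):
        q = Queue()
        p = Process(target=worker, args=(q, "worker-%d" % i))
        qs.append(q)
        p.start()
        ps.append(p)
    for q in qs:  # drain the queues first ...
        print(q.get())
    for p in ps:  # ... then join the children
        p.join()
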
Example #3
import json
import os

from django.conf import settings
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel
from pywebhdfs.webhdfs import PyWebHdfsClient
from rest_framework.views import APIView
from rest_framework.response import Response

# ConfigManager is a project-local helper (not shown here).

class AcmTextLogisticRegressionClassifierView(APIView):
    """
    Classify JSON input records with a logistic regression pipeline
    and model loaded from HDFS.
    """

    def hdfsizePath(self, path):
        # Prefix a relative HDFS path with the hdfs:// server URL.
        return self.hdfsServerUrl + path

    def classify(self, inputJson):
        self.hdfs = PyWebHdfsClient(
            host=self.config.acm.servers.hdfs.host,
            port=self.config.acm.servers.hdfs.restPort,
            user_name=self.config.acm.servers.hdfs.fileOwner)
        self.hdfsServerUrl = ("hdfs://" + self.config.acm.servers.hdfs.host
                              + ":" + str(self.config.acm.servers.hdfs.port))

        # Create the Spark contexts lazily so repeated calls reuse them.
        if not hasattr(self, 'sc'):
            self.sc = SparkContext()
        if not hasattr(self, 'sqlContext'):
            self.sqlContext = SQLContext(self.sc)

        schema = StructType([
            StructField('Category', StringType(), True),
            StructField('Descript', StringType(), True),
            StructField('Dates', StringType(), True),
            StructField('DayOfWeek', StringType(), True),
            StructField('PdDistrict', StringType(), True),
            StructField('Resolution', StringType(), True),
            StructField('Address', StringType(), True),
            StructField('X', DoubleType(), True),
            StructField('Y', DoubleType(), True),
        ])
        test = self.sqlContext.createDataFrame(inputJson, schema)
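
        # A hypothetical input record for inputJson (illustrative values only,
        # shaped to match the schema above):
        #   [{"Category": "WARRANTS", "Descript": "WARRANT ARREST",
        #     "Dates": "2015-05-13 23:53:00", "DayOfWeek": "Wednesday",
        #     "PdDistrict": "NORTHERN", "Resolution": "NONE",
        #     "Address": "OAK ST / LAGUNA ST", "X": -122.42, "Y": 37.77}]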

        # Alternative: load the pipeline from a local savepoint instead of HDFS:
        # pipeline = PipelineModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.pipeline.savepoint")
        pipeline = PipelineModel.load(self.pipelineHdfsPath)

        testData = pipeline.transform(test)
        print("Test Dataset Count: " + str(testData.count()))

        ##########################################################
        ################ Load the pre-trained model ##############
        ##########################################################

        # Alternative: load the model from a local savepoint instead of HDFS:
        # lrModel = LogisticRegressionModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.savepoint")
        lrModel = LogisticRegressionModel.load(self.modelHdfsPath)

        predictions = lrModel.transform(testData)

        # Show the ten most confident rows predicted as class 7.
        predictions.filter(predictions['prediction'] == 7) \
            .select("Descript", "Category", "probability", "label", "prediction") \
            .orderBy("probability", ascending=False) \
            .show(n=10, truncate=30)

        resultJson = predictions.filter(predictions['prediction'] == 7) \
            .select("prediction") \
            .orderBy("probability", ascending=False) \
            .toJSON().collect()
        self.sc.stop()

        return ["there is your ML!", resultJson]

    def get(self, request, format=None):
        print("django-root:", settings.BASE_DIR)
        self.configFile = os.path.join(settings.BASE_DIR, "acm.config.dev.yml")
        self.configManager = ConfigManager(params={"config.file": self.configFile})
        self.config = self.configManager.read()
        self.hdfsServerUrl = ("hdfs://" + self.config.acm.servers.hdfs.host
                              + ":" + str(self.config.acm.servers.hdfs.port))

        # Alternative: a local test request file:
        # jsonFile = '/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/jsontordd.test.json'
        jsonFile = os.path.join(settings.BASE_DIR, self.config.acm.test.test001.jsonRequestPath)
        self.modelHdfsPath = self.hdfsizePath(self.config.acm.test.test001.modelHdfsPath)
        self.pipelineHdfsPath = self.hdfsizePath(self.config.acm.test.test001.pipelineHdfsPath)

        with open(jsonFile, 'r') as f:
            json_ = json.load(f)
        ret_ = self.classify(json_)
        return Response(["hello", "world!", ret_])

    def post(self, request, format=None):
        # POST is handled the same way as GET for this demo endpoint.
        return self.get(request, format)
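
To exercise the view, it can be wired into the project's URL configuration;
a sketch, where the module path and route are hypothetical:

# urls.py (hypothetical wiring)
from django.urls import path
from .views import AcmTextLogisticRegressionClassifierView

urlpatterns = [
    path('acm/classify/', AcmTextLogisticRegressionClassifierView.as_view()),
]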