Example #1
    def process(self, inputs):
        input_meta = self.get_input_meta()
        predict_col = self.conf.get('prediction', 'predict')
        data_df = inputs[self.INPUT_PORT_NAME]

        if self.INPUT_PORT_MODEL_NAME in input_meta:
            # use external information instead of conf
            filename = get_file_path(inputs[self.INPUT_PORT_MODEL_NAME])
            train_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
            train_cols = list(train_cols.keys())
        else:
            # use the conf information
            filename = get_file_path(self.conf['file'])
            if 'columns' in self.conf:
                if self.conf.get('include', True):
                    train_cols = self.conf['columns']
                else:
                    train_cols = [
                        col for col in data_df.columns
                        if col not in self.conf['columns']
                    ]
            else:
                # fallback (assumption): no column list configured, so
                # treat every input column as a feature; this keeps
                # train_cols bound for the sort() below
                train_cols = list(data_df.columns)
        train_cols.sort()
        fm = ForestInference.load(filename,
                                  model_type=self.conf.get(
                                      "model_type", "xgboost"))
        prediction = fm.predict(data_df[train_cols])
        prediction.index = data_df.index
        data_df[predict_col] = prediction
        return {self.OUTPUT_PORT_NAME: data_df}
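
For context, a minimal standalone sketch of the same cuML FIL pattern: load a saved model with ForestInference.load and predict on a cuDF frame. The model path and feature frame below are made up for illustration.

import cudf
from cuml import ForestInference

# hypothetical saved XGBoost model and feature frame
df = cudf.DataFrame({'f0': [0.1, 0.2], 'f1': [1.0, 0.5]})
fm = ForestInference.load('model.xgb', model_type='xgboost')
# sort columns as the node does, so inference sees features
# in a deterministic order
pred = fm.predict(df[sorted(df.columns)])
df['predict'] = pred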
Example #2
    def _compute_hash_key(self):
        """
        If the hash changes, the ports_setup, meta_setup and conf_json
        should be different. In a very rare case there may be a hash
        collision; it affects the column, port and conf calculation,
        but it won't change the computation result.
        It returns the hash code, the loaded task_graph and the
        replacement conf object.
        """
        task_graph = ""
        inputs = ()
        replacementObj = {}
        input_node = ""
        task_graph_obj = None
        if 'taskgraph' in self.conf:
            task_graph = get_file_path(self.conf['taskgraph'])
            if os.path.exists(task_graph):
                with open(task_graph) as f:
                    task_graph = hashlib.md5(f.read().encode()).hexdigest()
                task_graph_obj = TaskGraph.load_taskgraph(
                    get_file_path(self.conf['taskgraph']))
        self.update_replace(replacementObj, task_graph_obj)
        if 'input' in self.conf:
            for inp in self.conf['input']:
                input_node += inp + ","
                if hasattr(self, 'inputs'):
                    for i in self.inputs:
                        inputs += (hash(i['from_node']),
                                   i['to_port'], i['from_port'])
        return (hash((self.uid, task_graph, inputs, json.dumps(self.conf),
                      input_node, json.dumps(replacementObj))),
                task_graph_obj, replacementObj)
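
The same composite-key idea in stdlib-only form, a sketch with illustrative names: hash the referenced file's contents, then fold the digest and the JSON-serialized conf into one hash. Passing sort_keys=True makes the key independent of dict ordering, which the method above does not do.

import hashlib
import json

def compute_key(uid, conf, path):
    # digest of the file contents, so edits to the file change the key
    with open(path) as f:
        digest = hashlib.md5(f.read().encode()).hexdigest()
    return hash((uid, digest, json.dumps(conf, sort_keys=True)))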
Example #3
    def ports_setup(self):
        cache_key = self._compute_hash_key()
        if cache_key in cache_ports:
            # print('cache hit')
            return cache_ports[cache_key]
        inports = {}
        outports = {}
        if 'taskgraph' in self.conf:
            task_graph = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
            replacementObj = {}
            self.update_replace(replacementObj)
            task_graph.build(replace=replacementObj)

            def inputNode_fun(inputNode, in_ports):
                inport = {}
                before_fix = inputNode.ports_setup().inports
                for key in before_fix.keys():
                    if key in in_ports:
                        inport[key] = before_fix[key]
                inports.update(fix_port_name(inport, inputNode.uid))

            def outNode_fun(outNode, out_ports):
                outport = {}
                before_fix = outNode.ports_setup().outports
                for key in before_fix.keys():
                    if key in out_ports:
                        outport[key] = before_fix[key]
                outports.update(fix_port_name(outport, outNode.uid))

            self._make_sub_graph_connection(task_graph, inputNode_fun,
                                            outNode_fun)
        output_port = NodePorts(inports=inports, outports=outports)
        cache_ports[cache_key] = output_port
        return output_port
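
The memoization idiom behind ports_setup (and the conf_schema/columns_setup methods below) reduces repeated subgraph builds to a dict lookup. A generic, self-contained sketch with illustrative names:

import json

cache_ports = {}

def ports_setup(conf):
    # content-derived key: same conf, same cache slot
    key = hash(json.dumps(conf, sort_keys=True))
    if key in cache_ports:
        return cache_ports[key]
    # stand-in for the expensive subgraph inspection
    ports = {'inports': {}, 'outports': dict(conf)}
    cache_ports[key] = ports
    return ports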
Example #4
    def process(self, inputs):
        """
        Load the end of day stock CSV data into cuDF dataframe

        Arguments
        -------
         inputs: list
             empty list
        Returns
        -------
        cudf.DataFrame
        """
        output = {}
        if self.outport_connected(CUDF_PORT_NAME):
            path = get_file_path(self.conf['file'])
            df = cudf.read_csv(path)
            # extract the year, month, day
            ymd = df['DTE'].astype(
                'str').str.extract(r'(\d\d\d\d)(\d\d)(\d\d)')
            # construct the standard datetime str
            df['DTE'] = ymd[0].str.cat(
                ymd[1],
                '-').str.cat(ymd[2], '-').astype('datetime64[ms]')
            df = df[['DTE', 'OPEN', 'CLOSE', 'HIGH', 'LOW', 'SM_ID', 'VOLUME']]
            df['VOLUME'] /= 1000
            # change the names
            df.columns = ['datetime', 'open', 'close',
                          'high', 'low', "asset", 'volume']
            output.update({CUDF_PORT_NAME: df})
        if self.outport_connected(PANDAS_PORT_NAME):
            path = get_file_path(self.conf['file'])
            df = pd.read_csv(path,
                             converters={'DTE':
                                         lambda x: pd.Timestamp(str(x))})
            df = df[['DTE', 'OPEN',
                     'CLOSE', 'HIGH',
                     'LOW', 'SM_ID', 'VOLUME']]
            df['VOLUME'] /= 1000
            df.columns = ['datetime', 'open', 'close', 'high',
                          'low', "asset", 'volume']
            output.update({PANDAS_PORT_NAME: df})
        if self.outport_connected(DASK_CUDF_PORT_NAME):
            path = get_file_path(self.conf['path'])
            df = dask_cudf.read_csv(path+'/*.csv',
                                    parse_dates=['datetime'])
            output.update({DASK_CUDF_PORT_NAME: df})
        return output
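
A plain-pandas sketch of the YYYYMMDD parsing used in the cuDF branch above (the data is made up):

import pandas as pd

df = pd.DataFrame({'DTE': [19980102, 19980105]})
# split the 8-digit date into year, month, day columns
ymd = df['DTE'].astype('str').str.extract(r'(\d{4})(\d{2})(\d{2})')
# rejoin with '-' and parse as a datetime
df['DTE'] = pd.to_datetime(ymd[0].str.cat([ymd[1], ymd[2]], sep='-'))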
Example #5
    def init(self, class_obj):
        if nemo.core.NeuralModuleFactory.get_default_factory() is None:
            nemo.core.NeuralModuleFactory()
        self.instanceClass = class_obj
        self.instance = None
        self.file_fields = []
        conf_para = get_conf_parameters(class_obj)
        self.fix_type = {}
        self.INPUT_NM = 'in_nm'
        self.OUTPUT_NM = 'out_nm'
        # force string type for name/model parameters and collect
        # file parameters so they can be resolved to real paths
        for key in conf_para.keys():
            if key.find('name') >= 0:
                self.fix_type[key] = "string"
            if key.find('model') >= 0:
                self.fix_type[key] = "string"
            if key.find('file') >= 0:
                self.file_fields.append(key)
        for f in self.file_fields:
            self.fix_type[f] = 'string'
            if f in self.conf and self.conf[f]:
                self.conf[f] = get_file_path(self.conf[f])
        if not issubclass(class_obj, DataLayerNM):
            try:
                if issubclass(self.instanceClass, TrainableNM):
                    # reuse the upstream conf when weight sharing is on
                    input_meta = self.get_input_meta()
                    if self.INPUT_NM in input_meta:
                        if (share_weight in self.conf
                                and self.conf[share_weight] == 'Reuse'):
                            self.conf = input_meta[self.INPUT_NM]
                app = nemo.utils.app_state.AppState()
                ins = None
                # reuse an already-registered module instance if one exists
                for mod in app._module_registry:
                    if isinstance(mod, self.instanceClass):
                        ins = mod
                        break
                if ins is None:
                    ins = class_obj(**self.conf)
                if self.instance is None:
                    self.instance = ins
            except Exception as e:
                print(e)
Example #6
    def columns_setup(self):
        self.required = {}
        column_types = {"asset": "int64", "asset_name": "object"}
        out_cols = {
            STOCK_NAME_PORT_NAME: column_types,
        }
        if self.outport_connected(STOCK_MAP_PORT_NAME):
            if 'file' in self.conf:
                hash_key = self._compute_hash_key()
                if hash_key in cache_columns:
                    out_cols.update(
                        {STOCK_MAP_PORT_NAME: cache_columns[hash_key]})
                else:
                    path = get_file_path(self.conf['file'])
                    name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']]
                    name_df.columns = ['asset', 'asset_name']
                    pdf = name_df.to_pandas()
                    column_data = pdf.to_dict('list')
                    cache_columns[hash_key] = column_data
                    out_cols.update({STOCK_MAP_PORT_NAME: column_data})
        return out_cols
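
The rename-and-serialize step, shown standalone in pandas (the cuDF version above only adds the to_pandas() hop); the frame is illustrative:

import pandas as pd

name_df = pd.DataFrame({'SM_ID': [1, 2], 'SYMBOL': ['AAPL', 'MSFT']})
name_df.columns = ['asset', 'asset_name']
column_data = name_df.to_dict('list')
# {'asset': [1, 2], 'asset_name': ['AAPL', 'MSFT']}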
Example #7
    def process(self, inputs):
        """
        Load the csv file mapping stock id to symbol name into cudf DataFrame

        Arguments
        -------
         inputs: list
             empty list
        Returns
        -------
        cudf.DataFrame
        """
        output = {}
        if self.outport_connected(STOCK_NAME_PORT_NAME):
            path = get_file_path(self.conf['file'])
            name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']]
            # change the names
            name_df.columns = ["asset", 'asset_name']
            output.update({STOCK_NAME_PORT_NAME: name_df})
        if self.outport_connected(STOCK_MAP_PORT_NAME):
            output.update({STOCK_MAP_PORT_NAME: StockMap()})
        return output
Example #8
    def columns_setup(self):
        cache_key = self._compute_hash_key()
        if cache_key in cache_columns:
            # print('cache hit')
            return cache_columns[cache_key]
        required = {}
        out_columns = {}
        if 'taskgraph' in self.conf:
            task_graph = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
            replacementObj = {}
            self.update_replace(replacementObj)
            task_graph.build(replace=replacementObj)

            def inputNode_fun(inputNode, in_ports):
                req = {}
                # do columns_setup so required columns are ready
                inputNode.columns_setup()
                for key in inputNode.required.keys():
                    if key in in_ports:
                        req[key] = inputNode.required[key]
                required.update(fix_port_name(req, inputNode.uid))

            def outNode_fun(outNode, out_ports):
                outcols = {}
                before_fix = outNode.columns_setup()
                for key in before_fix.keys():
                    if key in out_ports:
                        outcols[key] = before_fix[key]
                out_columns.update(fix_port_name(outcols, outNode.uid))

            self._make_sub_graph_connection(task_graph, inputNode_fun,
                                            outNode_fun)

        self.required = required
        cache_columns[cache_key] = out_columns
        return out_columns
Example #9
    def process(self, inputs):
        """
        Composite computation

        Arguments
        -------
         inputs: list
            list of input dataframes.
        Returns
        -------
        dataframe
        """
        if 'taskgraph' in self.conf:
            task_graph = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
            task_graph.build()

            outputLists = []
            replaceObj = {}
            input_feeders = []

            def inputNode_fun(inputNode, in_ports):
                inports = inputNode.ports_setup().inports

                class InputFeed(Node):

                    def meta_setup(self):
                        output = {}
                        for inp in inputNode.inputs:
                            output[inp['to_port']] = inp[
                                'from_node'].meta_setup().outports[
                                    inp['from_port']]
                        # it will be something like { input_port: columns }
                        return MetaData(inports={}, outports=output)

                    def ports_setup(self):
                        # it will be something like { input_port: types }
                        return NodePorts(inports={}, outports=inports)

                    def conf_schema(self):
                        return ConfSchema()

                    def process(self, empty):
                        output = {}
                        for key in inports.keys():
                            if inputNode.uid+'@'+key in inputs:
                                output[key] = inputs[inputNode.uid+'@'+key]
                        return output

                uni_id = str(uuid.uuid1())
                obj = {
                    TaskSpecSchema.task_id: uni_id,
                    TaskSpecSchema.conf: {},
                    TaskSpecSchema.node_type: InputFeed,
                    TaskSpecSchema.inputs: []
                }
                input_feeders.append(obj)
                newInputs = {}
                for key in inports.keys():
                    if inputNode.uid+'@'+key in inputs:
                        newInputs[key] = uni_id+'.'+key
                for inp in inputNode.inputs:
                    if inp['to_port'] not in in_ports:
                        # need to keep the old connections
                        newInputs[inp['to_port']] = (inp['from_node'].uid
                                                     + '.' + inp['from_port'])
                replaceObj.update({inputNode.uid: {
                    TaskSpecSchema.inputs: newInputs}
                })

            def outNode_fun(outNode, out_ports):
                out_ports = outNode.ports_setup().outports
                # fixed_outports = fix_port_name(out_ports, outNode.uid)
                for key in out_ports.keys():
                    if self.outport_connected(outNode.uid+'@'+key):
                        outputLists.append(outNode.uid+'.'+key)

            self._make_sub_graph_connection(task_graph,
                                            inputNode_fun, outNode_fun)

            task_graph.extend(input_feeders)
            self.update_replace(replaceObj, task_graph)
            result = task_graph.run(outputLists, replace=replaceObj)
            output = {}
            for key in result.get_keys():
                splits = key.split('.')
                output['@'.join(splits)] = result[key]
            return output
        else:
            return {}
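
The final key translation is worth isolating: task-graph results come back keyed 'node_id.port', and the composite node republishes them as 'node_id@port'. A sketch with made-up keys:

result = {'stock_data.cudf_out': 'df1', 'sort.out': 'df2'}
output = {'@'.join(key.split('.')): val for key, val in result.items()}
# {'stock_data@cudf_out': 'df1', 'sort@out': 'df2'}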
Example #10
            def search_fun(config, checkpoint_dir=None):
                myinputs = {}
                for key in data_store.keys():
                    v = ray.get(data_store[key])
                    if isinstance(v, pandas.DataFrame):
                        myinputs[key] = cudf.from_pandas(v)
                    else:
                        myinputs[key] = v
                task_graph = TaskGraph.load_taskgraph(
                    get_file_path(self.conf['taskgraph']))
                task_graph.build()

                outputLists = [train_id + '.' + 'checkpoint_dir']
                replaceObj = {}
                input_feeders = []

                def inputNode_fun(inputNode, in_ports):
                    inports = inputNode.ports_setup().inports

                    class InputFeed(Node):
                        def meta_setup(self):
                            output = {}
                            for inp in inputNode.inputs:
                                output[inp['to_port']] = inp[
                                    'from_node'].meta_setup()[inp['from_port']]
                            # it will be something like { input_port: columns }
                            return output

                        def ports_setup(self):
                            # it will be something like { input_port: types }
                            return NodePorts(inports={}, outports=inports)

                        def conf_schema(self):
                            return ConfSchema()

                        def process(self, empty):
                            output = {}
                            for key in inports.keys():
                                if inputNode.uid + '@' + key in myinputs:
                                    output[key] = myinputs[inputNode.uid +
                                                           '@' + key]
                            return output

                    uni_id = str(uuid.uuid1())
                    obj = {
                        TaskSpecSchema.task_id: uni_id,
                        TaskSpecSchema.conf: {},
                        TaskSpecSchema.node_type: InputFeed,
                        TaskSpecSchema.inputs: []
                    }
                    input_feeders.append(obj)
                    newInputs = {}
                    for key in inports.keys():
                        if inputNode.uid + '@' + key in myinputs:
                            newInputs[key] = uni_id + '.' + key
                    for inp in inputNode.inputs:
                        if inp['to_port'] not in in_ports:
                            # need to keep the old connections
                            newInputs[inp['to_port']] = (inp['from_node'].uid +
                                                         '.' +
                                                         inp['from_port'])
                    replaceObj.update(
                        {inputNode.uid: {
                            TaskSpecSchema.inputs: newInputs
                        }})

                def outNode_fun(outNode, out_ports):
                    pass

                self._make_sub_graph_connection(task_graph, inputNode_fun,
                                                outNode_fun)

                task_graph.extend(input_feeders)
                self.update_conf_for_search(replaceObj, task_graph, config)
                task_graph.run(outputLists, replace=replaceObj)
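
The rehydration loop at the top of search_fun, sketched standalone: each Ray object-store entry is fetched with ray.get, and pandas frames are promoted to cuDF so the subgraph runs on GPU. The store contents here are illustrative.

import ray
import cudf
import pandas

ray.init(ignore_reinit_error=True)
data_store = {'train': ray.put(pandas.DataFrame({'a': [1, 2]}))}
myinputs = {}
for key, ref in data_store.items():
    v = ray.get(ref)
    # promote pandas to cuDF; pass anything else through unchanged
    myinputs[key] = cudf.from_pandas(v) if isinstance(v, pandas.DataFrame) else v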
Example #11
    def conf_schema(self):
        cache_key = self._compute_hash_key()
        if cache_key in cache_schema:
            # print('cache hit')
            return cache_schema[cache_key]
        json = {
            "title": "Composite Node configure",
            "type": "object",
            "description": """Use a sub taskgraph as a composite node""",
            "properties": {
                "taskgraph": {
                    "type": "string",
                    "description": "the taskgraph filepath"
                },
                "input": {
                    "type": "array",
                    "description": "the input node ids",
                    "items": {
                        "type": "string"
                    }
                },
                "output": {
                    "type": "array",
                    "description": "the output node ids",
                    "items": {
                        "type": "string"
                    }
                },
                "subnode_ids": {
                    "title":
                    self.uid + " subnode ids",
                    "type":
                    "array",
                    "items": {
                        "type": "string"
                    },
                    "description":
                    """sub graph node ids that need
                    to be reconfigured"""
                },
                "subnodes_conf": {
                    "title": self.uid + " subnodes configuration",
                    "type": "object",
                    "properties": {}
                }
            },
            "required": ["taskgraph"],
        }
        ui = {
            "taskgraph": {
                "ui:widget": "TaskgraphSelector"
            },
            "subnodes_conf": {}
        }
        if 'taskgraph' in self.conf:
            task_graph = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
            replacementObj = {}
            self.update_replace(replacementObj)
            task_graph.build(replace=replacementObj)

            def inputNode_fun(inputNode, in_ports):
                pass

            def outNode_fun(outNode, out_ports):
                pass

            self._make_sub_graph_connection(task_graph, inputNode_fun,
                                            outNode_fun)

            ids_in_graph = []
            in_ports = []
            out_ports = []
            for t in task_graph:
                node_id = t.get('id')
                if node_id != '':
                    node = task_graph[node_id]
                    all_ports = node.ports_setup()
                    for port in all_ports.inports.keys():
                        in_ports.append(node_id + '.' + port)
                    for port in all_ports.outports.keys():
                        out_ports.append(node_id + '.' + port)
                    ids_in_graph.append(node_id)
            json['properties']['input']['items']['enum'] = in_ports
            json['properties']['output']['items']['enum'] = out_ports
            json['properties']['subnode_ids']['items']['enum'] = ids_in_graph
            # nested under the taskgraph check so task_graph is bound
            if 'subnode_ids' in self.conf:
                for subnodeId in self.conf['subnode_ids']:
                    if subnodeId in task_graph:
                        nodeObj = task_graph[subnodeId]
                        schema = nodeObj.conf_schema()
                        json['properties']['subnodes_conf']['properties'][
                            subnodeId] = {
                                "type": "object",
                                "properties": {
                                    "conf": schema.json
                                }
                            }
                        ui['subnodes_conf'].update(
                            {subnodeId: {
                                'conf': schema.ui
                            }})
        out_schema = ConfSchema(json=json, ui=ui)
        cache_schema[cache_key] = out_schema
        return out_schema
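
The enum-injection trick is independent of the task graph: start from a static JSON-schema skeleton and fill the items.enum lists with whatever ids were discovered at runtime. A minimal sketch with made-up port ids:

schema = {
    "properties": {
        "input": {
            "type": "array",
            "description": "the input node ids",
            "items": {"type": "string"}
        }
    }
}
discovered_ports = ['node_a.in1', 'node_b.in1']
# the UI then renders a dropdown limited to these values
schema['properties']['input']['items']['enum'] = discovered_ports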