Example #1
    def process(self, inputs):
        input_meta = self.get_input_meta()
        predict_col = self.conf.get('prediction', 'predict')
        data_df = inputs[self.INPUT_PORT_NAME]

        if self.INPUT_PORT_MODEL_NAME in input_meta:
            # use external information instead of conf
            filename = get_file_path(inputs[self.INPUT_PORT_MODEL_NAME])
            train_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
            train_cols = list(train_cols.keys())
        else:
            # use the conf information
            filename = get_file_path(self.conf['file'])
            if 'columns' in self.conf:
                if self.conf.get('include', True):
                    train_cols = self.conf['columns']
                else:
                    train_cols = [
                        col for col in data_df.columns
                        if col not in self.conf['columns']
                    ]
        # train_cols.sort()
        fm = ForestInference.load(filename,
                                  model_type=self.conf.get(
                                      "model_type", "xgboost"))
        prediction = fm.predict(data_df[train_cols])
        prediction.index = data_df.index
        data_df[predict_col] = prediction
        return {self.OUTPUT_PORT_NAME: data_df}
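
The keys this node reads from `self.conf` can all be seen in the code above. A plausible configuration, with made-up paths and column names, might be:

    conf = {
        'file': 'models/xgb_model.bst',        # saved GBDT model, used when no model port is connected
        'model_type': 'xgboost',               # forwarded to ForestInference.load
        'columns': ['feature0', 'feature1'],   # training columns
        'include': True,                       # True: use `columns` directly; False: use every other column
        'prediction': 'predict',               # name of the added prediction column
    }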
Example #2
    def process(self, inputs):
        """
        dump the input datafram to the resulting csv file.
        the output filepath is defined as `path` in the `conf`.
        if only a subset of columns is needed for the csv file, enumerate the
        columns in the `columns` of the `conf`

        Arguments
        -------
         inputs: list
            list of input dataframes.
        Returns
        -------
        dataframe
        """

        raw_input_df = inputs[self.INPUT_PORT_NAME]
        if 'columns' in self.conf:
            raw_input_df = raw_input_df[self.conf['columns']]
        if isinstance(raw_input_df, dask_cudf.DataFrame):
            input_df = raw_input_df.compute()  # get the computed value
        else:
            input_df = raw_input_df
        input_df.to_pandas().to_csv(get_file_path(self.conf['path']),
                                    index=False)
        return {self.OUTPUT_PORT_NAME: raw_input_df}
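
Based on the docstring and the code, a minimal `conf` for this node (placeholder values) could be:

    conf = {
        'path': 'output/result.csv',          # where the CSV file is written
        'columns': ['datetime', 'close'],     # optional subset of columns to dump
    }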
Example #3
    def process(self, inputs):
        """
        Load the end-of-day stock CSV data into a cuDF DataFrame

        Arguments
        -------
         inputs: list
             empty list
        Returns
        -------
        cudf.DataFrame
        """
        output = {}
        if self.outport_connected(CUDF_PORT_NAME):
            path = get_file_path(self.conf['file'])
            df = cudf.read_csv(path)
            # extract the year, month, day
            ymd = df['DTE'].astype(
                'str').str.extract(r'(\d\d\d\d)(\d\d)(\d\d)')
            # construct the standard datetime str
            df['DTE'] = ymd[0].str.cat(
                ymd[1],
                '-').str.cat(ymd[2], '-').astype('datetime64[ms]')
            df = df[['DTE', 'OPEN', 'CLOSE', 'HIGH', 'LOW', 'SM_ID', 'VOLUME']]
            df['VOLUME'] /= 1000
            # change the names
            df.columns = ['datetime', 'open', 'close',
                          'high', 'low', "asset", 'volume']
            output.update({CUDF_PORT_NAME: df})
        if self.outport_connected(PANDAS_PORT_NAME):
            path = get_file_path(self.conf['file'])
            df = pd.read_csv(path,
                             converters={'DTE':
                                         lambda x: pd.Timestamp(str(x))})
            df = df[['DTE', 'OPEN',
                     'CLOSE', 'HIGH',
                     'LOW', 'SM_ID', 'VOLUME']]
            df['VOLUME'] /= 1000
            df.columns = ['datetime', 'open', 'close', 'high',
                          'low', "asset", 'volume']
            output.update({PANDAS_PORT_NAME: df})
        if self.outport_connected(DASK_CUDF_PORT_NAME):
            path = get_file_path(self.conf['path'])
            df = dask_cudf.read_csv(path+'/*.csv',
                                    parse_dates=['datetime'])
            output.update({DASK_CUDF_PORT_NAME: df})
        return output
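
Judging from the three branches, the node expects a `file` key for the single-file cuDF and pandas ports, and a `path` key pointing at a directory of CSV partitions for the dask_cudf port. A hypothetical conf:

    conf = {
        'file': 'data/stock_price_hist.csv',      # single CSV with DTE/OPEN/.../VOLUME columns
        'path': 'data/stock_price_hist_parts',    # directory of *.csv partitions for dask_cudf
    }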
Example #4
 def _compute_hash_key(self):
     """
     If the hash changes, the port_setup, meta_setup
     and conf_json should be different.
     In very rare cases there might be a hash collision;
     it affects the column, port and conf calculation, but it won't
     change the computation result.
     It returns the hash code, the loaded task_graph and
     the replacement conf obj.
     """
     task_graph = ""
     inputs = ()
     replacementObj = {}
     input_node = ""
     task_graph_obj = None
     if 'taskgraph' in self.conf:
         try:
             task_graph = get_file_path(self.conf['taskgraph'])
         except FileNotFoundError:
             task_graph = None
         if task_graph is not None and os.path.exists(task_graph):
             with open(task_graph) as f:
                 task_graph = hashlib.md5(f.read().encode()).hexdigest()
             task_graph_obj = TaskGraph.load_taskgraph(
                 get_file_path(self.conf['taskgraph']))
     self.update_replace(replacementObj, task_graph_obj)
     if 'input' in self.conf:
         for inp in self.conf['input']:
             input_node += inp+","
             if hasattr(self, 'inputs'):
                 for i in self.inputs:
                     inputs += (hash(i['from_node']),
                                i['to_port'], i['from_port'])
     return (hash((self.uid, task_graph, inputs, json.dumps(self.conf),
                   input_node, json.dumps(replacementObj))), task_graph_obj,
             replacementObj)
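
The method folds everything that can change the node's ports or metadata, the task-graph file contents, the conf, and the upstream connections, into a single hash. A stripped-down sketch of the same idea, using hypothetical names, is:

    import hashlib
    import json

    def compute_hash_key(uid, taskgraph_path, conf, connections):
        # hash the referenced taskgraph file by content, not by path
        with open(taskgraph_path) as f:
            graph_digest = hashlib.md5(f.read().encode()).hexdigest()
        # any change in the conf or in the upstream wiring changes the key
        return hash((uid, graph_digest, json.dumps(conf, sort_keys=True),
                     tuple(connections)))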
Example #5
 def process(self, inputs):
     import dask.distributed
     try:
         client = dask.distributed.client.default_client()
     except ValueError:
         from dask_cuda import LocalCUDACluster
         cluster = LocalCUDACluster()
         from dask.distributed import Client
         client = Client(cluster)  # noqa
         print('start new Cluster')
     filename = get_file_path(self.conf['csvfile'])
     df = cudf.read_csv(filename, parse_dates=[0])
     df.columns = ['date'] + [c for c in df.columns][1:]
     output = {}
     if self.outport_connected('df_out'):
         output.update({'df_out': df})
     return output
Example #6
    def process(self, inputs):
        """
        dump the model into the file
        Arguments
        -------
         inputs: list
            list of input dataframes.
        Returns
        -------
        dataframe
        """

        model = inputs[self.INPUT_PORT_NAME]
        if isinstance(model, dict):
            model = model['booster']
        pathname = get_file_path(self.conf['path'])
        model.save_model(pathname)
        return {self.OUTPUT_PORT_NAME: pathname}
Example #7
 def init(self, class_obj):
     if nemo.core.NeuralModuleFactory.get_default_factory() is None:
         nemo.core.NeuralModuleFactory()
     self.instanceClass = class_obj
     self.instance = None
     self.file_fields = []
     conf_para = get_conf_parameters(class_obj)
     self.fix_type = {}
     self.INPUT_NM = 'in_nm'
     self.OUTPUT_NM = 'out_nm'
     for key in conf_para.keys():
         if key.find('name') >= 0:
             self.fix_type[key] = "string"
         if key.find('model') >= 0:
             self.fix_type[key] = "string"
         if key.find('file') >= 0:
             self.file_fields.append(key)
     for f in self.file_fields:
         self.fix_type[f] = 'string'
         if f in self.conf and self.conf[f]:
             self.conf[f] = get_file_path(self.conf[f])
     if not issubclass(class_obj, DataLayerNM):
         try:
             if issubclass(self.instanceClass, TrainableNM):
                 input_meta = self.get_input_meta()
                 if self.INPUT_NM in input_meta:
                     if (share_weight in self.conf and
                             self.conf[share_weight] == 'Reuse'):
                         self.conf = input_meta[self.INPUT_NM]
             app = nemo.utils.app_state.AppState()
             ins = None
             for mod in app._module_registry:
                 if isinstance(mod, self.instanceClass):
                     ins = mod
                     break
             if ins is None:
                 ins = class_obj(**self.conf)
             if self.instance is None:
                 self.instance = ins
         except Exception as e:
             print(e)
             pass
Example #8
    def process(self, inputs):
        """
        Load the CSV file mapping stock id to symbol name into a cuDF DataFrame

        Arguments
        -------
         inputs: list
             empty list
        Returns
        -------
        cudf.DataFrame
        """
        output = {}
        if self.outport_connected(STOCK_NAME_PORT_NAME):
            path = get_file_path(self.conf['file'])
            name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']]
            # change the names
            name_df.columns = ["asset", 'asset_name']
            output.update({STOCK_NAME_PORT_NAME: name_df})
        if self.outport_connected(STOCK_MAP_PORT_NAME):
            output.update({STOCK_MAP_PORT_NAME: StockMap()})
        return output
Example #9
 def meta_setup(self):
     required = {}
     column_types = {"asset": "int64", "asset_name": "object"}
     out_cols = {
         STOCK_NAME_PORT_NAME: column_types,
     }
     if self.outport_connected(STOCK_MAP_PORT_NAME):
         if 'file' in self.conf:
             hash_key = self._compute_hash_key()
             if hash_key in CACHE_NAME:
                 out_cols.update(
                     {STOCK_MAP_PORT_NAME: CACHE_NAME[hash_key]})
             else:
                 path = get_file_path(self.conf['file'])
                 name_df = cudf.read_csv(path)[['SM_ID', 'SYMBOL']]
                 name_df.columns = ["asset", 'asset_name']
                 pdf = name_df.to_pandas()
                 column_data = pdf.to_dict('list')
                 CACHE_NAME[hash_key] = column_data
                 out_cols.update({STOCK_MAP_PORT_NAME: column_data})
     metadata = MetaData(inports=required, outports=out_cols)
     return metadata
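
For reference, `to_pandas().to_dict('list')` turns the name dataframe into a plain column-to-values mapping, so the cached metadata entry looks roughly like this (values invented):

    column_data = {
        'asset': [1001, 1002],             # SM_ID values
        'asset_name': ['AAPL', 'MSFT'],    # SYMBOL values
    }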
Example #10
    def process(self, inputs):
        """
        Composite computation

        Arguments
        -------
         inputs: list
            list of input dataframes.
        Returns
        -------
        dataframe
        """
        if 'taskgraph' in self.conf:
            task_graph = TaskGraph.load_taskgraph(
                get_file_path(self.conf['taskgraph']))
            task_graph.build()

            outputLists = []
            replaceObj = {}
            input_feeders = []

            def inputNode_fun(inputNode, in_ports):
                inports = inputNode.ports_setup().inports

                class InputFeed(Node):

                    def meta_setup(self):
                        output = {}
                        for inp in inputNode.inputs:
                            output[inp['to_port']] = inp[
                                'from_node'].meta_setup().outports[
                                    inp['from_port']]
                        # it will be something like { input_port: columns }
                        return MetaData(inports={}, outports=output)

                    def ports_setup(self):
                        # it will be something like { input_port: types }
                        return NodePorts(inports={}, outports=inports)

                    def conf_schema(self):
                        return ConfSchema()

                    def process(self, empty):
                        output = {}
                        for key in inports.keys():
                            if inputNode.uid+'@'+key in inputs:
                                output[key] = inputs[inputNode.uid+'@'+key]
                        return output

                uni_id = str(uuid.uuid1())
                obj = {
                    TaskSpecSchema.task_id: uni_id,
                    TaskSpecSchema.conf: {},
                    TaskSpecSchema.node_type: InputFeed,
                    TaskSpecSchema.inputs: []
                }
                input_feeders.append(obj)
                newInputs = {}
                for key in inports.keys():
                    if inputNode.uid+'@'+key in inputs:
                        newInputs[key] = uni_id+'.'+key
                for inp in inputNode.inputs:
                    if inp['to_port'] not in in_ports:
                        # need to keep the old connections
                        newInputs[inp['to_port']] = (inp['from_node'].uid
                                                     + '.' + inp['from_port'])
                replaceObj.update({inputNode.uid: {
                    TaskSpecSchema.inputs: newInputs}
                })

            def outNode_fun(outNode, out_ports):
                out_ports = outNode.ports_setup().outports
                # fixed_outports = fix_port_name(out_ports, outNode.uid)
                for key in out_ports.keys():
                    if self.outport_connected(outNode.uid+'@'+key):
                        outputLists.append(outNode.uid+'.'+key)

            self._make_sub_graph_connection(task_graph,
                                            inputNode_fun, outNode_fun)

            task_graph.extend(input_feeders)
            self.update_replace(replaceObj, task_graph)
            result = task_graph.run(outputLists, replace=replaceObj)
            output = {}
            for key in result.get_keys():
                splits = key.split('.')
                output['@'.join(splits)] = result[key]
            return output
        else:
            return {}
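
The composite node names its external ports by joining a sub-graph node id and port name with '@', while the task graph itself addresses results as 'node.port'; the loop at the end merely converts between the two conventions, e.g. (node and port names invented):

    result_key = 'xgboost_node.model_out'          # key returned by task_graph.run
    output_key = '@'.join(result_key.split('.'))   # -> 'xgboost_node@model_out'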
Example #11
    def update(self):
        TemplateNodeMixin.update(self)
        self.conf_update()  # update the conf
        task_graph = ""
        replacementObj = {}
        task_graph_obj = None
        if 'taskgraph' in self.conf:
            try:
                task_graph = get_file_path(self.conf['taskgraph'])
            except FileNotFoundError:
                task_graph = None
            if task_graph is not None and os.path.exists(task_graph):
                # with open(task_graph) as f:
                #     task_graph = hashlib.md5(f.read().encode()).hexdigest()
                task_graph_obj = TaskGraph.load_taskgraph(
                    get_file_path(self.conf['taskgraph']))
        self.all_inputs = []
        self.all_outputs = []
        self.task_graph = task_graph_obj
        self.update_replace(replacementObj, task_graph_obj)
        self.replacementObj = replacementObj
        extra_updated = set()
        extra_roots = []
        if self.task_graph is not None:
            self.task_graph._build(replace=self.replacementObj)
            if 'input' in self.conf:
                # group input ports by node id
                self.inp_groups = group_ports(self.conf['input'])
                for inp in self.inp_groups.keys():
                    if inp in self.task_graph:
                        inputNode = self.task_graph[inp]
                        update_inputs = []
                        replaced_ports = set(self.inp_groups[inp])
                        for oldInput in inputNode.inputs:
                            if oldInput['to_port'] in replaced_ports:
                                # we want to disconnect this old one and
                                # connect to external node
                                if hasattr(self, 'inputs'):
                                    for externalInput in self.inputs:
                                        if (_get_node(externalInput['to_port'])
                                                == inputNode.uid and _get_port(
                                                    externalInput['to_port'])
                                                == oldInput['to_port']):
                                            newInput = {}
                                            newInput['to_port'] = _get_port(
                                                externalInput['to_port'])
                                            newInput[
                                                'from_port'] = externalInput[
                                                    'from_port']
                                            newInput[
                                                'from_node'] = externalInput[
                                                    'from_node']
                                            update_inputs.append(newInput)
                            else:
                                update_inputs.append(oldInput)
                        inputNode.inputs = update_inputs

                        # add all the `updated` parents to the set
                        for i in inputNode.inputs:
                            if hasattr(i['from_node'], 'ports_setup_cache'):
                                extra_updated.add(i['from_node'])
                        # if all the parents are updated, this is
                        # a new root node
                        if all([
                                i['from_node'] in extra_updated
                                for i in inputNode.inputs
                        ]):
                            extra_roots.append(inputNode)

                        self.all_inputs.append((inputNode, inp))

            if 'output' in self.conf:
                self.oup_groups = group_ports(self.conf['output'])
                for oup in self.oup_groups.keys():
                    if oup in self.task_graph:
                        outNode = self.task_graph[oup]
                        # we do not disconnect anything here, as we take extra
                        # outputs for composite node.
                        # Note, we rely on the fact that the taskgraph.run
                        # method will remove the output collector from the
                        # taskgraph if the output list is set
                        self.all_outputs.append((outNode, oup))
                        # outNode_fun(outNode, oup_groups[oup])

            # update all the nodes and cache it
            self.task_graph.breadth_first_update(extra_roots=extra_roots,
                                                 extra_updated=extra_updated)
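
The conf `input` entries appear to name sub-graph ports as 'node_id@port_name' (the same convention the composite `process` method uses), and `group_ports` presumably buckets them by node id. A hypothetical illustration of that assumed behaviour:

    # assumed input format and grouping; `group_ports` itself is not shown here
    conf_input = ['preprocess@stock_in', 'preprocess@meta_in', 'train@features']
    inp_groups = {}
    for entry in conf_input:
        node_id, port = entry.split('@', 1)
        inp_groups.setdefault(node_id, []).append(port)
    # inp_groups == {'preprocess': ['stock_in', 'meta_in'], 'train': ['features']}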
Example #12
            def search_fun(config, checkpoint_dir=None):
                myinputs = {}
                for key in data_store.keys():
                    v = ray.get(data_store[key])
                    if isinstance(v, pandas.DataFrame):
                        myinputs[key] = cudf.from_pandas(v)
                    else:
                        myinputs[key] = v
                task_graph = TaskGraph.load_taskgraph(
                    get_file_path(self.conf['taskgraph']))
                task_graph.build()

                outputLists = [train_id + '.' + 'checkpoint_dir']
                replaceObj = {}
                input_feeders = []

                def inputNode_fun(inputNode, in_ports):
                    inports = inputNode.ports_setup().inports

                    class InputFeed(Node):
                        def meta_setup(self):
                            output = {}
                            for inp in inputNode.inputs:
                                output[inp['to_port']] = inp[
                                    'from_node'].meta_setup()[inp['from_port']]
                            # it will be something like { input_port: columns }
                            return output

                        def ports_setup(self):
                            # it will be something like { input_port: types }
                            return NodePorts(inports={}, outports=inports)

                        def conf_schema(self):
                            return ConfSchema()

                        def process(self, empty):
                            output = {}
                            for key in inports.keys():
                                if (inputNode.uid + '@' + key in myinputs):
                                    output[key] = myinputs[inputNode.uid +
                                                           '@' + key]
                            return output

                    uni_id = str(uuid.uuid1())
                    obj = {
                        TaskSpecSchema.task_id: uni_id,
                        TaskSpecSchema.conf: {},
                        TaskSpecSchema.node_type: InputFeed,
                        TaskSpecSchema.inputs: []
                    }
                    input_feeders.append(obj)
                    newInputs = {}
                    for key in inports.keys():
                        if inputNode.uid + '@' + key in myinputs:
                            newInputs[key] = uni_id + '.' + key
                    for inp in inputNode.inputs:
                        if inp['to_port'] not in in_ports:
                            # need to keep the old connections
                            newInputs[inp['to_port']] = (inp['from_node'].uid +
                                                         '.' +
                                                         inp['from_port'])
                    replaceObj.update(
                        {inputNode.uid: {
                            TaskSpecSchema.inputs: newInputs
                        }})

                def outNode_fun(outNode, out_ports):
                    pass

                self._make_sub_graph_connection(task_graph, inputNode_fun,
                                                outNode_fun)

                task_graph.extend(input_feeders)
                self.update_conf_for_search(replaceObj, task_graph, config)
                task_graph.run(outputLists, replace=replaceObj)
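
`search_fun` has the signature Ray Tune's legacy function API expects (a trainable taking `config` and an optional `checkpoint_dir`), so presumably it is handed to the tuner along these lines (the search space is made up):

    from ray import tune

    analysis = tune.run(
        search_fun,
        config={'depth': tune.grid_search([3, 5, 7])},  # hypothetical hyperparameter space
        resources_per_trial={'gpu': 1},                 # each trial gets one GPU
    )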