Example #1
    def _init_reader_schema(self, field_names=None):
        """Restore a reader schema from the DB file.

        If `field_names` is given, restore the schema according to it.

        Otherwise, load blobs from the DB file into the workspace,
        and restore the schema from these blob names.
        It is also assumed that:
        1). Each field of the schema has corresponding blobs
            stored in the DB file.
        2). Each blob loaded from the DB file corresponds to
            a field of the schema.
        3). field_names in the original schema are in alphabetical order,
            since blob names loaded into the workspace from the DB file
            will be in alphabetical order.

        Load a set of blobs from a DB file. From names of these blobs,
        restore the DB file schema using `from_column_list(...)`.

        Returns:
            schema: schema.Struct. Used in Reader.__init__(...).
        """
        if field_names:
            return from_column_list(field_names)

        if self.db_type == "log_file_db":
            assert os.path.exists(self.db_path), \
                'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
        with core.NameScope(self.name):
            # blob_prefix is for avoiding name conflict in workspace
            blob_prefix = scope.CurrentNameScope()
        workspace.RunOperatorOnce(
            core.CreateOperator(
                'Load',
                [],
                [],
                absolute_path=True,
                db=self.db_path,
                db_type=self.db_type,
                load_all=True,
                add_prefix=blob_prefix,
            ))
        col_names = [
            blob_name[len(blob_prefix):]
            for blob_name in sorted(workspace.Blobs())
            if blob_name.startswith(blob_prefix)
        ]
        schema = from_column_list(col_names)
        return schema
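The prefix handling above can be exercised on its own. A minimal sketch, assuming `caffe2.python.schema` is available; the blob names and prefix are hypothetical stand-ins for what the `Load` op would leave in the workspace:

from caffe2.python import schema

# Hypothetical blob names as the 'Load' op might leave them in the workspace
blob_prefix = 'reader/'
loaded_blobs = ['reader/dense', 'reader/sparse:lengths',
                'reader/sparse:values', 'unrelated_blob']

# Keep only this reader's blobs and strip the name-scope prefix
col_names = [
    b[len(blob_prefix):] for b in sorted(loaded_blobs)
    if b.startswith(blob_prefix)
]

# Rebuild the record; ':'-separated names turn back into nested fields
rec = schema.from_column_list(col_names)
assert sorted(rec.field_names()) == sorted(col_names)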
Example #2
    def __init__(self,
                 fields,
                 name=None,
                 capacity=1,
                 enforce_unique_name=False,
                 num_threads=1):
        assert isinstance(fields, list) or isinstance(fields, Struct), (
            'fields must be either a Struct or a list of raw field names.')
        if isinstance(fields, list):
            fields = from_column_list(fields)
        self.schema = fields
        self.name = name or 'queue'
        self.num_threads = num_threads
        num_blobs = len(self.schema.field_names())
        init_net = core.Net(self.name + '/init_net')
        self.blobs_queue = init_net.CreateBlobsQueue(
            [],
            1,
            capacity=capacity,
            num_blobs=num_blobs,
            enforce_unique_name=enforce_unique_name)
        core.workspace.RunNetOnce(init_net)

        self.writer = _QueueWriter(self.blobs_queue, self.schema)
        reader_name = self.name + '_reader'
        self.reader = _QueueReader(self.blobs_queue, self.schema, reader_name)

        exit_net = core.Net(self.name + '/exit_net')
        exit_net.CloseBlobsQueue(self.blobs_queue, 0)
        self.exit_step = core.execution_step(
            '{}_close_step'.format(str(exit_net)), exit_net)
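The `fields` handling at the top of this constructor (assert, then promote a raw name list to a Struct) is a recurring pattern in these examples. A minimal sketch of just that step, assuming `caffe2.python.schema`; the field names are hypothetical:

from caffe2.python.schema import Struct, Scalar, from_column_list

def normalize_fields(fields):
    # Same normalization as the constructor: a raw name list is promoted to a Struct
    assert isinstance(fields, (list, Struct)), \
        'fields must be either a Struct or a list of raw field names.'
    if isinstance(fields, list):
        fields = from_column_list(fields)
    return fields

# Both forms expose the same column names
assert (normalize_fields(['label', 'weight']).field_names() ==
        normalize_fields(Struct(('label', Scalar()), ('weight', Scalar()))).field_names())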
Example #4
    def testFromColumnList(self):
        st = schema.Struct(('a', schema.Scalar()),
                           ('b', schema.List(schema.Scalar())),
                           ('c', schema.Map(schema.Scalar(), schema.Scalar())))
        columns = st.field_names()
        # test that recovery works for arbitrary order
        for _ in range(10):
            some_blobs = [core.BlobReference('blob:' + x) for x in columns]
            rec = schema.from_column_list(columns, col_blobs=some_blobs)
            self.assertTrue(rec.has_blobs())
            self.assertEqual(sorted(st.field_names()),
                             sorted(rec.field_names()))
            self.assertEqual(
                [str(blob) for blob in rec.field_blobs()],
                [str('blob:' + name) for name in rec.field_names()])
            random.shuffle(columns)
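The round-trip in this test works because `field_names()` flattens nested types into ':'-joined column names before `from_column_list` rebuilds them. A minimal sketch for the List field above; the exact flattening of Map sub-fields is left out since it depends on the schema implementation:

from caffe2.python import schema

st = schema.Struct(('b', schema.List(schema.Scalar())))
print(st.field_names())  # ['b:lengths', 'b:values']

# Rebuilding from the flattened names yields the same column set
rec = schema.from_column_list(st.field_names())
assert sorted(rec.field_names()) == sorted(st.field_names())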
Example #6
def shrink_output_schema(net, out_schema):
    if len(out_schema.field_names()) <= 1:
        return out_schema
    exists = [net.BlobIsDefined(blob) for blob in out_schema.field_blobs()]
    return schema.from_column_list([
        col_name
        for ok, col_name in zip(exists, out_schema.field_names()) if ok
    ], [
        col_type
        for ok, col_type in zip(exists, out_schema.field_types()) if ok
    ], [
        col_blob
        for ok, col_blob in zip(exists, out_schema.field_blobs()) if ok
    ], [
        col_meta
        for ok, col_meta in zip(exists, out_schema.field_metadata()) if ok
    ])
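shrink_output_schema keeps only the columns whose blobs are actually produced by the net, passing the surviving names, types, blobs and metadata to from_column_list as parallel lists. A minimal sketch of the same filtering on hypothetical data, with a hard-coded `exists` standing in for the per-blob `net.BlobIsDefined(...)` checks:

import numpy as np
from caffe2.python import core, schema

col_names = ['prediction', 'debug_info']
col_types = [np.float32, np.float32]
col_blobs = [core.BlobReference('prediction'), core.BlobReference('debug_info')]
exists = [True, False]  # pretend only 'prediction' is defined by the net

kept = schema.from_column_list(
    [n for ok, n in zip(exists, col_names) if ok],
    [t for ok, t in zip(exists, col_types) if ok],
    [b for ok, b in zip(exists, col_blobs) if ok],
)
assert kept.field_names() == ['prediction']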
Example #7
    def __init__(self, fields, name=None):
        """Create an un-initialized dataset with schema provided by `fields`.

        Before this dataset can be used, it must be initialized, either by
        `init_empty` or `init_from_dataframe`.

        Args:
            fields: either a schema.Struct or a list of field names in a format
                    compatible with the one described in schema.py.
            name: optional name to prepend to blobs that will store the data.
        """
        assert isinstance(fields, list) or isinstance(fields, Struct), (
            'fields must be either a Struct or a list of raw field names.')
        if isinstance(fields, list):
            fields = from_column_list(fields)
        self.schema = fields
        self.fields = fields.field_names()
        self.field_types = fields.field_types()
        self.name = name or 'dataset'
        self.field_blobs = fields.field_blobs() if fields.has_blobs() else None
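The docstring's "list of field names in a format compatible with the one described in schema.py" means flattened, ':'-separated column names. A minimal sketch of how such a list becomes a nested record, with hypothetical names, assuming nested fields are reachable as attributes of the resulting Struct:

from caffe2.python import schema

# 'dense' stays a top-level scalar; 'sparse:lengths' and 'sparse:values'
# become sub-fields of a nested 'sparse' struct
fields = ['dense', 'sparse:lengths', 'sparse:values']
record = schema.from_column_list(fields)

assert sorted(record.field_names()) == sorted(fields)
assert record.sparse.field_names() == ['lengths', 'values']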
Example #10
    def _init_reader_schema(self):
        """Restore a reader schema from the DB file.

        Here it is assumed that:
        1). Each field of the schema has corresponding blobs
            stored in the DB file.
        2). Each blob loaded from the DB file corresponds to
            a field of the schema.

        Load a set of blobs from a DB file. From names of these blobs,
        restore the DB file schema using `from_column_list(...)`.

        Returns:
            schema: schema.Struct. Used in Reader.__init__(...).
        """
        assert os.path.exists(self.db_path), \
            'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
        with core.NameScope(self.name):
            # blob_prefix is for avoiding name conflict in workspace
            blob_prefix = scope.CurrentNameScope()
        workspace.RunOperatorOnce(
            core.CreateOperator(
                'Load',
                [],
                [],
                absolute_path=True,
                db=self.db_path,
                db_type=self.db_type,
                load_all=True,
                add_prefix=blob_prefix,
            )
        )
        col_names = [
            blob_name[len(blob_prefix):] for blob_name in workspace.Blobs()
            if blob_name.startswith(blob_prefix)
        ]
        schema = from_column_list(col_names)
        return schema
Example #11
    def testFromEmptyColumnList(self):
        st = schema.Struct()
        columns = st.field_names()
        rec = schema.from_column_list(col_names=columns)
        self.assertEqual(rec, schema.Struct())
Example #12
    def get_predictor_export_meta_and_workspace(self,
                                                feature_extractor=None,
                                                output_transformer=None):
        """

        ONNX loads blobs into a private workspace. We return that workspace
        here instead of copying the blobs to the global workspace, in order to
        save memory during export. By returning the private workspace, we only
        need memory for the PyTorch model, the ONNX buffer, and the Caffe2
        model. Including optimizer parameters, this means we can train and
        save a model up to a quarter of the size of machine memory.

        We should revisit this once PyTorch 1.0 is ready.

        Args:
            feature_extractor: An instance of FeatureExtractorBase
            output_transformer: optional transformer whose net is appended to
                post-process the model outputs
        """
        # 1. Get Caffe2 model
        c2_model, input_blobs, output_blobs = self.get_caffe2_model()
        ws = c2_model.workspace

        # Initializing constants in the model
        init_net = core.Net(c2_model.init_net)
        ws.CreateNet(init_net)
        ws.RunNet(init_net)

        # Per ONNX code comment, input blobs are not initialized
        model_inputs = c2_model.uninitialized
        assert len(model_inputs) > 0, "Model is expected to have some input"
        parameters = [b for b in ws.Blobs() if b not in model_inputs]
        # Input blobs in order
        model_input_blobs = [b for b in input_blobs if b in model_inputs]

        predict_net = core.Net("predict_net")

        output_blob_names = self.output_blob_names()
        assert len(output_blobs) == len(output_blob_names), (
            "output_blobs and output_blob_names must have the same lengths. "
            "Check that your model don't reuse output tensors. "
            "output_blobs: {}; output_blob_names: {}".format(
                output_blobs, output_blob_names))
        blob_remap = {
            onnx_name: explicit_name
            for onnx_name, explicit_name in zip(output_blobs,
                                                output_blob_names)
        }
        shapes = {}

        # 2. Create feature extractor net
        if feature_extractor:
            feature_extractor_nets = feature_extractor.create_net()
            # Initializing feature extractor parameters
            ws.CreateNet(feature_extractor_nets.init_net)
            ws.RunNet(feature_extractor_nets.init_net)
            feature_extractor_params = set(
                feature_extractor_nets.init_net.Proto().external_output)
            assert (len(set(parameters) & feature_extractor_params) == 0
                    ), "Blob names collide! Please open a bug report"
            parameters += feature_extractor_params
            extracted_blobs = [
                str(b) for b in
                feature_extractor_nets.net.output_record().field_blobs()
            ]
            assert len(model_input_blobs) == len(extracted_blobs), (
                "The lengths of model_input_blobs and extracted_blobs must match. "
                "model_input_blobs: {}; extracted_blobs: {}".format(
                    model_input_blobs, extracted_blobs))
            blob_remap.update({
                onnx_name: extracted_name
                for onnx_name, extracted_name in zip(model_input_blobs,
                                                     extracted_blobs)
            })

            predict_net.AppendNet(feature_extractor_nets.net)
            del predict_net.Proto().external_output[:]

            input_blobs = [
                b for b in predict_net.Proto().external_input
                if b not in feature_extractor_params
            ]
            shapes.update({b: [] for b in input_blobs})
        else:
            input_blobs = model_input_blobs

        # 3. Rename the input blobs of the model to match the outputs of the
        #    feature extractor net

        model_net = core.Net(c2_model.predict_net).Clone("remapped_model_net",
                                                         blob_remap=blob_remap)

        # 5. Join feature extractor net & model net
        predict_net.AppendNet(model_net)

        if output_transformer is not None:
            output_field_names = self.output_field_names()
            original_output = schema.from_column_list(
                col_names=output_field_names,
                col_blobs=[core.BlobReference(b) for b in output_blob_names],
            )
            output_transformer_nets = output_transformer.create_net(
                original_output)
            # Initializing output transformer parameters
            ws.CreateNet(output_transformer_nets.init_net)
            ws.RunNet(output_transformer_nets.init_net)
            output_transformer_params = set(
                output_transformer_nets.init_net.Proto().external_output)
            assert (len(set(parameters) & output_transformer_params) == 0
                    ), "Blob names collide! Please open a bug report"
            parameters += output_transformer_params
            del predict_net.Proto().external_output[:]
            predict_net.AppendNet(output_transformer_nets.net)

        # These shapes are not really used but required, so just pass fake ones
        shapes.update({b: [] for b in predict_net.Proto().external_output})

        return (
            PredictorExportMeta(
                predict_net,
                parameters,
                input_blobs,
                predict_net.Proto().external_output,
                shapes=shapes,
                net_type="async_scheduling",
                num_workers=8,
            ),
            ws,
        )
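The blob_remap / Clone step in the middle of this function is what stitches the ONNX-exported net onto the feature extractor: the model's ONNX-generated blob names are rewritten to explicit names before the nets are joined. A minimal sketch of that renaming on a toy net, assuming core.Net.Clone accepts a blob_remap mapping as used above; the blob names are hypothetical:

from caffe2.python import core

# Toy net standing in for the ONNX-exported Caffe2 predict net
net = core.Net("toy_model")
net.Relu(["onnx_input_0"], ["onnx_output_0"])

# Rewrite ONNX-generated names to explicit, stable names
blob_remap = {"onnx_input_0": "features", "onnx_output_0": "scores"}
remapped = net.Clone("toy_model_remapped", blob_remap=blob_remap)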