Code Example #1
File: mobilenet.py  Project: sasxianhu/python-dlpy
def MobileNetV2_ONNX(conn,
                     model_file,
                     n_classes=1000,
                     width=224,
                     height=224,
                     offsets=(255 * 0.406, 255 * 0.456, 255 * 0.485),
                     norm_stds=(255 * 0.225, 255 * 0.224, 255 * 0.229),
                     random_flip=None,
                     random_crop=None,
                     random_mutation=None,
                     include_top=False):
    """
    Generates a deep learning model with the MobileNetV2_ONNX architecture.
    The model architecture and pre-trained weights are generated from MobileNetV2 ONNX trained on the ImageNet dataset.
    The model file and the weights file can be downloaded from https://support.sas.com/documentation/prod-p/vdmml/zip/.
    To learn more about the model and its pre-processing, please visit
    https://github.com/onnx/models/tree/master/vision/classification/mobilenet

    Parameters
    ----------
    conn : CAS
        Specifies the CAS connection object.
    model_file : string
        Specifies the absolute server-side path of the model table file.
        The model table file can be downloaded from https://support.sas.com/documentation/prod-p/vdmml/zip/.
    n_classes : int, optional
        Specifies the number of classes.
        Default: 1000
    width : int, optional
        Specifies the width of the input layer.
        Default: 224
    height : int, optional
        Specifies the height of the input layer.
        Default: 224
    offsets : double or iter-of-doubles, optional
        Specifies an offset for each channel in the input data. The final input
        data is set after applying scaling and subtracting the specified offsets.
        The channel order is BGR.
        Default: (255*0.406, 255*0.456, 255*0.485)
    norm_stds : double or iter-of-doubles, optional
        Specifies a standard deviation for each channel in the input data.
        The final input data is normalized with specified means and standard deviations.
        The channel order is BGR.
        Default: (255*0.225, 255*0.224, 255*0.229)
    random_flip : string, optional
        Specifies how to flip the data in the input layer when image data is
        used. Approximately half of the input data is subject to flipping.
        Valid Values: 'h', 'hv', 'v', 'none'
    random_crop : string, optional
        Specifies how to crop the data in the input layer when image data is
        used. Images are cropped to the values that are specified in the width
        and height parameters. Only the images with one or both dimensions
        that are larger than those sizes are cropped.
        Valid Values: 'none', 'unique', 'randomresized', 'resizethencrop'
    random_mutation : string, optional
        Specifies how to apply data augmentations/mutations to the data in the input layer.
        Valid Values: 'none', 'random'
    include_top : bool, optional
        Specifies whether to include the pre-trained weights of the top layers (i.e., the FC layers).
        Default: False

    """
    parameters = locals()
    input_parameters = get_layer_options(input_layer_options, parameters)

    # load model and model weights
    model = Model.from_sashdat(conn, path=model_file)
    # check that the file contains a valid MobileNetV2_ONNX model (expected 120 layers)
    if model.summary.shape[0] != 120:
        raise DLPyError(
            "The model file doesn't point to a valid MobileNetV2_ONNX model. "
            "Please check the SASHDAT file.")
    # extract input layer config
    model_table_df = conn.CASTable(**model.model_table).to_frame()
    input_layer_df = model_table_df[model_table_df['_DLLayerID_'] == 0]
    input_layer = extract_input_layer(input_layer_df)
    input_layer_config = input_layer.config
    # update input layer config
    input_layer_config.update(input_parameters)
    # update the layer list
    model.layers[0] = InputLayer(**input_layer_config,
                                 name=model.layers[0].name)

    # warn if the model weights table does not exist
    if not conn.tableexists(model.model_weights.name).exists:
        weights_file_path = os.path.join(os.path.dirname(model_file),
                                         model.model_name + '_weights.sashdat')
        print('WARNING: Model weights are not attached '
              'since the system cannot find a weights file located at {}'.format(
                  weights_file_path))

    if include_top:
        if n_classes != 1000:
            raise DLPyError(
                "If include_top is enabled, n_classes has to be 1000.")
    else:
        # since the output layer is not a fully connected layer,
        # we need to modify the convolution layer right before the output; its number of filters is set to n_classes.
        conv_layer_df = model_table_df[model_table_df['_DLLayerID_'] == 118]
        conv_layer = extract_conv_layer(conv_layer_df)
        conv_layer_config = conv_layer.config
        # update conv layer config
        conv_layer_config.update({'n_filters': n_classes})
        # update the layer list
        model.layers[-2] = Conv2d(**conv_layer_config,
                                  name=model.layers[-2].name,
                                  src_layers=model.layers[-3])

        # overwrite n_classes in output layer
        out_layer_df = model_table_df[model_table_df['_DLLayerID_'] == 119]
        out_layer = extract_output_layer(out_layer_df)
        out_layer_config = out_layer.config
        # update output layer config
        out_layer_config.update({'n': n_classes})
        # update the layer list
        model.layers[-1] = OutputLayer(**out_layer_config,
                                       name=model.layers[-1].name,
                                       src_layers=model.layers[-2])

        # remove top weights
        model.model_weights.append_where('_LayerID_<118')
        model._retrieve_('table.partition',
                         table=model.model_weights,
                         casout=dict(replace=True,
                                     name=model.model_weights.name))
        model.set_weights(model.model_weights.name)
    # recompile the whole network according to the new layer list
    model.compile()
    return model
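
Below is a minimal usage sketch for the function above. It assumes a live CAS session created with the swat package, that MobileNetV2_ONNX is importable from dlpy.applications, and that the SASHDAT model and weights files have already been downloaded to the server; the host, port, and file path are hypothetical placeholders.

import swat
from dlpy.applications import MobileNetV2_ONNX

# hypothetical CAS host and port; replace with your own deployment
conn = swat.CAS("cas-host.example.com", 5570)

# build the network; include_top=False rewires the last conv and output
# layers for a 10-class problem and drops the pre-trained top weights
model = MobileNetV2_ONNX(conn,
                         model_file="/models/MobileNetV2_ONNX.sashdat",  # hypothetical server path
                         n_classes=10,
                         include_top=False)
model.print_summary()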
Code Example #2
File: speech.py  Project: sasxianhu/python-dlpy
class Speech:
    """
    Class to do speech recognition using SAS Viya.

    Parameters
    ----------
    conn : CAS Connection
        Specifies the CAS connection object

    data_path : string
        Specifies the absolute path of the folder where segmented audio files are stored (server side).

        The "audio_path" parameter of the "transcribe" method refers to a client-side location. To
        transcribe the audio, we first need to save the .wav file somewhere the CAS server can access.
        Also, if the audio is very long, we may need to segment it into multiple files before copying.

        Notice that this is the location where the temporary audio files are stored. The Python client
        should have both read and write permission for this folder, and the CAS server should have at
        least read permission for it.

    local_path : string, optional
        Specifies the path of the folder where segmented audio files are stored (client side).
        Default = None

        Notice that "data_path" and "local_path" point to the same physical location; the two path
        strings are only identical when the CAS server and the Python client run on the same machine.

    acoustic_model_path : string, optional
        Specifies the absolute server-side path of the acoustic model file.
        Please make sure the weights file and the weights attribute file are placed under the same directory.
        Default = None

    language_model_path : string, optional
        Specifies the absolute server-side path of the language model file.
        Default = None
    """

    acoustic_model = None
    language_model_name = "languageModel"
    language_model_caslib = None

    data_path = None
    local_path = None

    data_caslib = None
    data_caslib_path = None
    data_path_after_caslib = None

    audio_table = None

    def __init__(self,
                 conn,
                 data_path,
                 local_path=None,
                 acoustic_model_path=None,
                 language_model_path=None):
        try:
            import wave
        except ImportError:
            raise DLPyError(
                "wave package was not found. "
                "Please install this package before using any APIs from dlpy.speech. "
                "We're using this Python library to help read and write audio files."
            )
        try:
            import audioop
        except ImportError:
            raise DLPyError(
                "audioop package was not found. "
                "Please install this package before using any APIs from dlpy.speech. "
                "We're using this Python library to help extract audio features and convert audio formats."
            )

        self.conn = conn
        self.server_sep = get_server_path_sep(self.conn)

        self.data_path = data_path
        if self.data_path.endswith(self.server_sep):
            self.data_path = self.data_path[:-1]
        self.data_path += self.server_sep

        server_type = get_cas_host_type(self.conn).lower()
        is_server_unix = server_type.startswith(
            "lin") or server_type.startswith("osx")

        client_type = platform.system()
        if (is_server_unix and client_type.startswith("Win")
            ) or not (is_server_unix or client_type.startswith("Win")):
            if local_path is None:
                raise DLPyError(
                    "the \"local_path\" parameter is not specified. "
                    "The CAS server and the Python client have different OS type (Windows/Linux), "
                    "so please specify the \"local_path\" parameter.")
            else:
                self.local_path = local_path
        else:
            if local_path is None:
                self.local_path = self.data_path
                print(
                    "Note: the \"local_path\" parameter is not specified. "
                    "The CAS server and the Python client have the same OS type (Windows/Linux), "
                    "so simply use \"data_path\" as \"local_path\":",
                    self.local_path)
            else:
                self.local_path = local_path

        if not os.path.exists(self.local_path):
            raise DLPyError("Invalid \"local_path\" value: does not exist.")
        if not os.access(self.local_path, os.R_OK):
            raise DLPyError(
                "Invalid \"local_path\" value: does not have read permission."
            )
        if not os.access(self.local_path, os.W_OK):
            raise DLPyError(
                "Invalid \"local_path\" value: does not have write permission."
            )

        self.conn.loadactionset("audio", _messagelevel="error")
        self.conn.loadactionset("deepLearn", _messagelevel="error")
        self.conn.loadactionset("langModel", _messagelevel="error")

        if acoustic_model_path is not None:
            self.load_acoustic_model(acoustic_model_path)

        if language_model_path is not None:
            self.load_language_model(language_model_path)

        self.data_caslib, self.data_path_after_caslib, _ = caslibify(
            self.conn, self.data_path, task="save")
        self.data_caslib_path = self.conn.caslibinfo(
            caslib=self.data_caslib).CASLibInfo["Path"][0]
        if not self.data_caslib_path.endswith(self.server_sep):
            self.data_caslib_path += self.server_sep

    def load_acoustic_model(self, acoustic_model_path):
        """
        Load the RNN acoustic model.

        Parameters
        ----------
        acoustic_model_path : string
            Specifies the absolute server-side path of the acoustic model file.
            Please make sure the weights file and the weights attribute file are placed under the same directory.

        """
        self.acoustic_model = Model(self.conn)
        self.acoustic_model.from_sashdat(self.conn, path=acoustic_model_path)
        if self.acoustic_model.model_table is None:
            raise DLPyError("Failed to load the acoustic model.")
        if self.acoustic_model.model_weights is None:
            raise DLPyError("Failed to load the acoustic model weights.")

    def load_language_model(self, language_model_path):
        """
        Load the N-gram language model.

        Parameters
        ----------
        language_model_path : string
            Specifies the absolute server-side path of the language model file.

        """
        self.language_model_caslib, path_after_caslib, _ = caslibify(
            self.conn, language_model_path, task="load")
        rt = self.conn.retrieve("langModel.lmImport",
                                _messagelevel='error',
                                table=dict(name=path_after_caslib,
                                           caslib=self.language_model_caslib),
                                casout=dict(replace=True,
                                            name=self.language_model_name,
                                            caslib=self.language_model_caslib))
        if rt.severity > 1:
            self.language_model_caslib = None
            for msg in rt.messages:
                print(msg)
            raise DLPyError("Failed to import the language model.")

    def transcribe(self,
                   audio_path,
                   max_path_size=100,
                   alpha=1.0,
                   beta=0.0,
                   gpu=None):
        """
        Transcribe the audio file into text.

        Notice that for this API, we assume the speech-to-text models published with SAS Viya 3.4 will be used.
        Please download the acoustic and language model files from here:
        https://support.sas.com/documentation/prod-p/vdmml/zip/speech_19w21.zip

        Parameters
        ----------
        audio_path : string
            Specifies the location of the audio file (client-side, absolute/relative).
        max_path_size : int, optional
            Specifies the maximum number of paths kept as candidates of the final results during the decoding process.
            Default = 100
        alpha : double, optional
            Specifies the weight of the language model, relative to the acoustic model.
            Default = 1.0
        beta : double, optional
            Specifies the weight of the sentence length, relative to the acoustic model.
            Default = 0.0
        gpu : :class:`dlpy.model.Gpu`, optional
            When specified, the action uses Graphics Processing Unit (GPU) hardware.
            The simplest way to enable GPU processing is to specify "gpu=1"; in this case, the
            default values of the other GPU parameters are used. Setting gpu=1 enables all
            available GPU devices, while gpu=0 disables GPU processing.

        Returns
        -------
        string
            Transcribed text from audio file located at 'audio_path'.

        """

        # check if acoustic model is loaded
        if self.acoustic_model is None:
            raise DLPyError(
                "acoustic model not found. "
                "Please load the acoustic model with \"load_acoustic_model\" before calling \"transcribe\"."
            )

        # check if language model is loaded
        if self.language_model_caslib is None:
            raise DLPyError(
                "language model not found. "
                "Please load the language model with \"load_language_model\" before calling \"transcribe\"."
            )

        # step 1: preparation and segmentation
        listing_path_after_caslib, listing_path_local, segment_path_after_caslib_list, segment_path_local_list = \
            segment_audio(audio_path, self.local_path, self.data_path_after_caslib, 10, 16000, 2)
        segment_path_list = [
            self.data_caslib_path + segment_path_after_caslib
            for segment_path_after_caslib in segment_path_after_caslib_list
        ]

        # step 2: load audio
        try:
            audio_table = AudioTable.load_audio_files(
                self.conn,
                path=listing_path_after_caslib,
                caslib=self.data_caslib)
        except DLPyError as err:
            if "cannot load audio files, something is wrong!" in str(err):
                clean_audio(listing_path_local, segment_path_local_list)
                raise DLPyError(
                    "Error: Cannot load the audio files. "
                    "Please verify that \"data_path\" and \"local_path\" are pointing to the same position."
                )
            raise err

        # step 3: extract features
        feature_table = AudioTable.extract_audio_features(self.conn,
                                                          table=audio_table,
                                                          n_output_frames=3500,
                                                          copyvars=["_path_"])

        # step 4: score features
        self.acoustic_model.score(table=feature_table,
                                  model="asr",
                                  init_weights="asr_weights",
                                  copy_vars=["_path_"],
                                  gpu=gpu,
                                  casout=dict(name="score_table",
                                              replace=True))
        score_table = self.conn.CASTable(name="score_table")

        # step 5: decode scores
        rt = self.conn.retrieve("langModel.lmDecode",
                                _messagelevel='error',
                                table=score_table,
                                casout=dict(name="result_table", replace=True),
                                langModelTable=dict(
                                    name=self.language_model_name,
                                    caslib=self.language_model_caslib),
                                blankLabel=" ",
                                spaceLabel="&",
                                maxPathSize=max_path_size,
                                alpha=alpha,
                                beta=beta,
                                copyvars=["_path_"])
        if rt.severity > 1:
            for msg in rt.messages:
                print(msg)
            raise DLPyError("Failed to decode the scores.")
        result_table = self.conn.CASTable(name="result_table")

        # step 6: concatenate results
        result_dict = dict(
            zip(list(result_table["_path_"]),
                list(result_table["_audio_content_"])))
        result_list = [
            result_dict[segment_path] for segment_path in segment_path_list
        ]
        result_list = [result.strip() for result in result_list]
        result_list = [result for result in result_list if len(result) > 0]
        result = " ".join(result_list)

        # step 7: cleaning
        clean_audio(listing_path_local, segment_path_local_list)

        return result
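
Below is a minimal end-to-end usage sketch for the class above. It assumes a live CAS session created with the swat package and that the acoustic and language model files from the speech_19w21.zip bundle have been extracted on the server; every host, port, and path is a hypothetical placeholder.

import swat
from dlpy.speech import Speech

# hypothetical CAS host and port; replace with your own deployment
conn = swat.CAS("cas-host.example.com", 5570)

# data_path (server view) and local_path (client view) refer to the same scratch folder
speech = Speech(conn,
                data_path="/shared/audio_temp",
                local_path="/shared/audio_temp",
                acoustic_model_path="/models/asr_acoustic.sashdat",    # hypothetical file names
                language_model_path="/models/asr_language.sashdat")

# transcribe a client-side .wav file; segmentation, feature extraction,
# scoring, and decoding all happen inside this call
text = speech.transcribe("recording.wav")
print(text)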