Example #1
def train_svm(X_train,
              y_train,
              vectorizer,
              type="normal",
              c: float = 1.0,
              max_iter: int = 1000,
              max_features: int = 1000):
    """
        Used to train an SVM classifier
        :param X_train: training inputs
        :param y_train: training labels
        :param vectorizer: type of vectorizer
        :param type: type of SVM ("normal" for the kernel SVC, anything else for LinearSVC)
        :param c: regularization parameter
        :param max_iter: maximum number of iterations
        :param max_features: maximum number of features
        :return: SVM model, features vectorizer
    """
    X_train, vectorizer = utils.feature_extraction(X_train,
                                                   max_features=max_features,
                                                   vectorizer_type=vectorizer)
    if type == "normal":
        svm_model = SVC(C=c, max_iter=max_iter)
    else:
        svm_model = LinearSVC(C=c, max_iter=max_iter, dual=False)
    print("Cross Validation ...")
    train_linear(svm_model, X_train, y_train)
    print("\nFitting ...")
    svm_model.fit(X_train, y_train)
    return svm_model, vectorizer
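A minimal usage sketch (not part of the original snippet); the sample texts, labels, and the "tfidf" vectorizer name are assumptions about what utils.feature_extraction accepts:

# Hypothetical data; the "tfidf" vectorizer name is an assumption about utils.feature_extraction.
X_texts = ["good product", "terrible service", "works as expected"]
y_labels = [1, 0, 1]

svm_model, fitted_vectorizer = train_svm(X_texts, y_labels,
                                         vectorizer="tfidf",
                                         type="linear",   # any value other than "normal" selects LinearSVC
                                         c=0.5,
                                         max_iter=5000,
                                         max_features=500)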
Example #2
def main(settings):
    features, minima, maxima, scaling_parameter = feature_extraction(
        settings.dataset_dir)
    window = 5
    X_train, y_train, X_test, y_test = split_features(features[::-1], window)
    print("X_train", X_train.shape)
    print("y_train", y_train.shape)
    print("X_test", X_test.shape)
    print("y_test", y_test.shape)

    # load json and create model
    layout_path = glob.glob(os.path.join(settings.model_dir,
                                         "*layout.json"))[0]
    with open(layout_path, 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)

    # load weights into new model
    weights_path = glob.glob(os.path.join(settings.model_dir,
                                          "*weights.h5"))[0]
    model.load_weights(weights_path)
    print("Loaded model from disk")

    predicted2 = model.predict(X_test)
    actual = y_test
    predicted2 = (predicted2 * scaling_parameter) + minima
    actual = (actual * scaling_parameter) + minima

    mape2 = mean_absolute_percentage_error(actual, predicted2)
    mae2 = mean_absolute_error(actual, predicted2)

    print(json.dumps({"mape": mape2, "mae": mae2}))
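The two rescaling lines above undo a min-max normalization; a self-contained sketch of the assumed relationship (that scaling_parameter equals maxima - minima is an assumption, since feature_extraction is not shown):

import numpy as np

# Assumed forward transform inside feature_extraction (not shown in the snippet):
#   normalized = (raw - minima) / scaling_parameter, with scaling_parameter = maxima - minima
raw = np.array([12.0, 15.0, 20.0])
minima, maxima = raw.min(), raw.max()
scaling_parameter = maxima - minima

normalized = (raw - minima) / scaling_parameter
restored = (normalized * scaling_parameter) + minima  # same inverse transform as above
assert np.allclose(restored, raw)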
Example #3
    def wakati(self, text, lower=False):
        """Word segmentation function. Return the segmented words.

        args:
            - text (str): An input sentence.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.

        return:
            - words (list): A list of the words.
        """
        text = utils.preprocess(text)
        lower_text = text.lower()
        feats = utils.feature_extraction(text=lower_text,
                                         uni2id=self._uni2id,
                                         bi2id=self._bi2id,
                                         dictionary=self._word2id,
                                         window_size=self._hp['WINDOW_SIZE'])
        obs  = self._model.encode_ws(feats)
        obs  = [ob.npvalue() for ob in obs]
        tags = utils.np_viterbi(self._model.trans_array, obs)

        # A word can be recognized as a single word forcibly.
        if self.pattern:
            for match in self.pattern.finditer(text):
                span = match.span()
                span_s = span[0]
                span_e = span[1]

                if (span_e - span_s) == 1:
                    tags[span_s:span_e] = [3]
                else:
                    tags[span_s:span_e] = [0]+[1]*((span_e-span_s)-2)+[2]

                if span_s != 0:
                    previous_tag = tags[span_s-1]
                    if previous_tag == 0:   # 0 is BEGIN tag
                        tags[span_s-1] = 3  # 3 is SINGLE tag
                    elif previous_tag == 1: # 1 is MIDDLE tag
                        tags[span_s-1] = 2  # 2 is END tag

                if span_e != len(text):
                    next_tag = tags[span_e]
                    if next_tag == 1:    # 1 is MIDDLE tag
                        tags[span_e] = 0 # 0 is BEGIN tag
                    elif next_tag == 2:  # 2 is END tag
                        tags[span_e] = 3 # 3 is SINGLE tag

        if lower is True:
            words = utils.segmenter_for_bmes(lower_text, tags)
        else:
            words = utils.segmenter_for_bmes(text, tags)
        return words
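A hedged usage sketch; Tagger is a stand-in name for the class that defines wakati, and its constructor is assumed to load the model and vocabularies:

# Hypothetical usage; "Tagger" is not the class name from the snippet.
tagger = Tagger()
words = tagger.wakati("これはペンです")
print(words)  # a list of segmented words

# Tag values used above follow a BMES scheme: 0 = BEGIN, 1 = MIDDLE, 2 = END, 3 = SINGLE.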
Example #4
def train_knn(X_train, y_train, vectorizer_type, max_features: int = 1000):
    """
        Used to train a k-NN classifier
        :param X_train: training inputs
        :param y_train: training labels
        :param vectorizer_type: type of vectorizer
        :param max_features: maximum number of features
        :return: k-NN model, features vectorizer
    """

    X_train, vectorizer = utils.feature_extraction(
        X_train, max_features=max_features, vectorizer_type=vectorizer_type)
    model = KNeighborsClassifier(n_neighbors=10)
    train_linear(model, X_train, y_train)
    model.fit(X_train, y_train)
    return model, vectorizer
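Predicting on unseen documents has to reuse the returned vectorizer; a sketch with hypothetical data, assuming the object returned by utils.feature_extraction exposes a scikit-learn style transform():

# Hypothetical data; the "count" vectorizer name is an assumption.
X_texts = ["spam spam spam", "meeting at noon", "win a free prize"]
y_labels = [1, 0, 1]

knn_model, fitted_vectorizer = train_knn(X_texts, y_labels,
                                         vectorizer_type="count",
                                         max_features=200)

X_new = fitted_vectorizer.transform(["free prize inside"])  # assumed scikit-learn vectorizer API
print(knn_model.predict(X_new))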
Example #5
def train_NB(X_train, y_train, vectorizer, max_features: int = 1000):
    """
        Used to train a Naive Bayes classifier
        :param X_train: training inputs
        :param y_train: training labels
        :param vectorizer: type of vectorizer
        :param max_features: maximum number of features
        :return: Naive Bayes model, features vectorizer
    """
    X_train, vectorizer = utils.feature_extraction(X_train,
                                                   max_features=max_features,
                                                   vectorizer_type=vectorizer)
    naive_model = naive_bayes.MultinomialNB()
    print("\nCross validation ...")
    train_linear(naive_model, X_train, y_train)
    print("\nFitting")
    naive_model.fit(X_train, y_train)
    return naive_model, vectorizer
Example #6
    def wakati(self, text, lower=False):
        """
        Return the words of the given sentence.
        Input: str (a sentence)
        Output: a list of the words
        """
        text = utils.preprocess(text)
        lower_text = text.lower()
        feats = utils.feature_extraction(text=lower_text,
                                         uni2id=self._uni2id,
                                         bi2id=self._bi2id,
                                         dictionary=self._word2id,
                                         window_size=self._hp['WINDOW_SIZE'])
        obs = self._model.encode_ws(feats)
        obs = [ob.npvalue() for ob in obs]
        tags = utils.np_viterbi(self._model.trans_array, obs)

        # A word can be recognized as a single word forcibly.
        if self.pattern:
            for match in self.pattern.finditer(text):
                span = match.span()
                span_s = span[0]
                span_e = span[1]
                tags[span_s:span_e] = [0] + [1] * ((span_e - span_s) - 2) + [2]

                if span_s != 0:
                    previous_tag = tags[span_s - 1]
                    if previous_tag == 0:  # 0 is BEGIN tag
                        tags[span_s - 1] = 3  # 3 is SINGLE tag
                    elif previous_tag == 1:  # 1 is MIDDLE tag
                        tags[span_s - 1] = 2  # 2 is END tag

                if span_e != len(text):
                    next_tag = tags[span_e]
                    if next_tag == 1:  # 1 is MIDDLE tag
                        tags[span_e] = 0  # 0 is BEGIN tag
                    elif next_tag == 2:  # 2 is END tag
                        tags[span_e] = 3  # 3 is SINGLE tag

        if lower is True:
            words = utils.segmenter_for_bmes(lower_text, tags)
        else:
            words = utils.segmenter_for_bmes(text, tags)
        return words
Example #7
    def inference(self, image):

        self.sess.run(tf.global_variables_initializer())

        checkpoint = tf.train.latest_checkpoint(self.params.model_dir) \
            if not self.params.checkpoint else self.params.checkpoint

        if not checkpoint:
            raise FileNotFoundError('Checkpoint not found in {}'.format(
                self.params.model_dir))
        else:
            print("Loading model checkpoint {}...".format(checkpoint))
            self.saver.restore(self.sess, checkpoint)
        feature_extractor = utils.feature_extractor('VGG19')
        features = utils.feature_extraction(feature_extractor, image)

        predicted_sequence = self.rnn.predict(features)

        print(predicted_sequence)
Example #8
def main(settings):
    features, minima, maxima, scaling_parameter = feature_extraction(settings.dataset_dir)
    window = 5
    X_train, y_train, X_test, y_test = split_features(features[::-1], window)
    print("X_train", X_train.shape)
    print("y_train", y_train.shape)
    print("X_test", X_test.shape)
    print("y_test", y_test.shape)

    json_logging_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: print(json.dumps({
            "epoch": epoch,
            "loss": logs["loss"],
            "acc": logs["acc"],
            "val_loss": logs["val_loss"],
            "val_acc": logs["val_acc"],
        })),
    )

    # figure out which model architecture to use
    arch = settings.model_architecture
    assert arch in model_architectures, "Unknown model architecture '%s'." % arch
    builder = model_architectures[arch]

    # build and train the model
    model = builder([len(features.columns), window, 1])
    model.fit(
        X_train,
        y_train,
        batch_size=settings.batch_size,
        epochs=settings.epochs,
        validation_split=settings.validation_split,
        callbacks=[json_logging_callback],
        verbose=0)

    # serialize model to JSON
    model_json = model.to_json()
    with open(os.path.join(settings.output_dir, "model-layout.json"), "w") as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    model.save_weights(os.path.join(settings.output_dir, "model-weights.h5"))
    print("Saved model to disk")
Example #9
from utils import feature_extraction

files_adresse = [
    '../samples/man/arctic_a0001.wav', '../samples/woman/arctic_a0001.wav'
]
frame_width = 35 / 1000
shift_width = 35 / 1000
threshold = 40
print('Data set :\n',
      feature_extraction(files_adresse, frame_width, shift_width, threshold))
Example #10
    def __init__(self, filename, window_size, vocabs):
        self.words = []
        self.ws_data = []
        self.pos_data = []
        self.filename = filename
        uni2id, bi2id, word2id, pos2id, word2postags = vocabs
        with open(filename, 'r') as texts:
            wids = []  # Word index
            cids = []  # Character index
            pids = []  # POStag index
            words = []  # Original Words
            ptags = []  # Original POStags
            for text in texts:
                text = utils.utf8rstrip(text)
                if text == 'EOS':
                    sent = ''.join(words)
                    segmented_sent = ' '.join(words)
                    tags = utils.make_tags_as_bmes(segmented_sent)

                    feats = utils.feature_extraction(text=sent,
                                                     uni2id=uni2id,
                                                     bi2id=bi2id,
                                                     dictionary=word2id,
                                                     window_size=window_size)

                    self.words.append(words)
                    self.ws_data.append([feats, tags])
                    self.pos_data.append([[cids, wids, ptags], pids])
                    # reset index lists
                    wids = []
                    cids = []
                    pids = []
                    words = []
                    ptags = []

                else:
                    word, pos = text.split('\t')
                    word = utils.normalize(word)
                    word = word.replace(' ', '　')  # replace half-width spaces with a full-width space (assumed)
                    # lower setting: 3
                    word = word.lower()

                    if word in word2postags:
                        w2p = word2postags[word]
                    else:
                        w2p = [0]  # OOV
                    if word.isalnum() is True:
                        if w2p == [0]:
                            w2p = [pos2id['名詞']]
                        else:
                            w2p.append(pos2id['名詞'])

                    w2p = list(set(w2p))
                    ptags.append(w2p)

                    words.append(word)
                    if word in word2id:
                        wids.append(word2id[word])
                    else:
                        wids.append(word2id[OOV])

                    if pos in pos2id:
                        pids.append(pos2id[pos])
                    else:
                        pids.append(pos2id[OOV])

                    char_seq = []
                    for char in word:
                        if char in uni2id:
                            char_seq.append(uni2id[char])
                        else:
                            char_seq.append(uni2id[OOV])
                    cids.append(char_seq)
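The reader above expects one word<TAB>POS pair per line, with each sentence terminated by a line containing only EOS; a tiny hypothetical file in that format (the POS tag names are assumptions):

# Hypothetical training file content in the format parsed above.
sample = "\n".join([
    "これ\t代名詞",
    "は\t助詞",
    "ペン\t名詞",
    "です\t助動詞",
    "EOS",
])
with open("sample.train", "w") as f:
    f.write(sample + "\n")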
Example #11
for utterance in utterances:
    plot_signal_and_energy_per_frame(utterance, 90 / 1000, 90 / 1000)

print('\n--- Building a rule-based system')

frame_width = 90 / 1000
shift_width = 90 / 1000
threshold = 10

print('\n-- TRAIN :')

folder_addresses = ['samples/man', 'samples/woman']

train_files_adresse = random_select_utterances(folder_addresses, 50)

train_data_frame = feature_extraction(train_files_adresse, frame_width,
                                      shift_width, threshold)

print('\n- Train data frame :\n', train_data_frame)

sns.pairplot(train_data_frame, hue='Sex')
plt.show()

sns.heatmap(train_data_frame.corr())
plt.show()

best_threshold_on_energy = 0
best_accurancy_on_energy = 0
for threshold_on_energy in range(1000000, 100000000, 10000):
    accurancy = rule_based_system_on_energy_accurancy(train_data_frame,
                                                      threshold_on_energy)
    if accurancy > best_accurancy_on_energy: