def train_svm(X_train, y_train, vectorizer, type="normal", c: float = 1.0,
              max_iter: int = 1000, max_features: int = 1000):
    """
    Train an SVM classifier.

    :param X_train: training inputs
    :param y_train: training labels
    :param vectorizer: type of vectorizer
    :param type: type of SVM ("normal" selects a kernel SVC; any other value selects LinearSVC)
    :param c: regularization parameter
    :param max_iter: maximum number of iterations
    :param max_features: maximum number of features
    :return: SVM model, fitted feature vectorizer
    """
    X_train, vectorizer = utils.feature_extraction(X_train,
                                                   max_features=max_features,
                                                   vectorizer_type=vectorizer)
    if type == "normal":
        svm_model = SVC(C=c, max_iter=max_iter)
    else:
        svm_model = LinearSVC(C=c, max_iter=max_iter, dual=False)

    print("Cross Validation ...")
    train_linear(svm_model, X_train, y_train)
    print("\nFitting ...")
    svm_model.fit(X_train, y_train)
    return svm_model, vectorizer
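# Minimal usage sketch for train_svm (illustrative only): it assumes
# utils.feature_extraction accepts raw documents plus a vectorizer name such
# as "tfidf" and returns (feature_matrix, fitted_vectorizer). Unseen text is
# transformed with the returned vectorizer before prediction.
texts = ["good movie", "bad movie", "great plot", "terrible acting"]
labels = [1, 0, 1, 0]
svm_model, fitted_vec = train_svm(texts, labels, vectorizer="tfidf",
                                  type="linear", c=0.5)
print(svm_model.predict(fitted_vec.transform(["a good plot"])))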
def main(settings):
    features, minima, maxima, scaling_parameter = feature_extraction(
        settings.dataset_dir)
    window = 5
    X_train, y_train, X_test, y_test = split_features(features[::-1], window)
    print("X_train", X_train.shape)
    print("y_train", y_train.shape)
    print("X_test", X_test.shape)
    print("y_test", y_test.shape)

    # load the model layout from JSON
    layout_path = glob.glob(os.path.join(settings.model_dir, "*layout.json"))[0]
    with open(layout_path, 'r') as json_file:
        loaded_model_json = json_file.read()
    model = model_from_json(loaded_model_json)

    # load the weights into the new model
    weights_path = glob.glob(os.path.join(settings.model_dir, "*weights.h5"))[0]
    model.load_weights(weights_path)
    print("Loaded model from disk")

    # undo the min-max scaling before computing error metrics
    predicted = model.predict(X_test)
    actual = y_test
    predicted = (predicted * scaling_parameter) + minima
    actual = (actual * scaling_parameter) + minima
    mape = mean_absolute_percentage_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    print(json.dumps({"mape": mape, "mae": mae}))
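# A tiny worked check of the inverse transform used above, assuming
# feature_extraction applied min-max scaling, i.e.
# scaled = (x - minima) / scaling_parameter (the values are illustrative).
minima, scaling_parameter = 10.0, 40.0
scaled = 0.25
print(scaled * scaling_parameter + minima)  # -> 20.0, the original value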
def wakati(self, text, lower=False):
    """Word segmentation function. Return the segmented words.

    args:
        - text (str): An input sentence.
        - lower (bool): If lower is True, all uppercase characters in the
          list of words are converted into lowercase characters.

    return:
        - words (list): A list of the words.
    """
    text = utils.preprocess(text)
    lower_text = text.lower()
    feats = utils.feature_extraction(text=lower_text,
                                     uni2id=self._uni2id,
                                     bi2id=self._bi2id,
                                     dictionary=self._word2id,
                                     window_size=self._hp['WINDOW_SIZE'])
    obs = self._model.encode_ws(feats)
    obs = [ob.npvalue() for ob in obs]
    tags = utils.np_viterbi(self._model.trans_array, obs)

    # A span matched by the user pattern is forcibly recognized as a single word.
    if self.pattern:
        for match in self.pattern.finditer(text):
            span_s, span_e = match.span()
            if (span_e - span_s) == 1:
                tags[span_s:span_e] = [3]
            else:
                tags[span_s:span_e] = [0] + [1] * ((span_e - span_s) - 2) + [2]

            # repair the tag just before the span
            if span_s != 0:
                previous_tag = tags[span_s - 1]
                if previous_tag == 0:     # 0 is the BEGIN tag
                    tags[span_s - 1] = 3  # 3 is the SINGLE tag
                elif previous_tag == 1:   # 1 is the MIDDLE tag
                    tags[span_s - 1] = 2  # 2 is the END tag

            # repair the tag just after the span
            if span_e != len(text):
                next_tag = tags[span_e]
                if next_tag == 1:     # 1 is the MIDDLE tag
                    tags[span_e] = 0  # 0 is the BEGIN tag
                elif next_tag == 2:   # 2 is the END tag
                    tags[span_e] = 3  # 3 is the SINGLE tag

    if lower is True:
        words = utils.segmenter_for_bmes(lower_text, tags)
    else:
        words = utils.segmenter_for_bmes(text, tags)
    return words
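# Hedged usage sketch: assumes a tagger-like object (here called `tagger`)
# that was constructed elsewhere and exposes wakati(), as in BMES-based
# Japanese segmenters; the printed output is illustrative.
words = tagger.wakati('Pythonで形態素解析')
print(words)  # e.g. ['Python', 'で', '形態素', '解析']
words_lower = tagger.wakati('Hello WORLD', lower=True)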
def train_knn(X_train, y_train, vectorizer_type, max_features: int = 1000):
    """
    Train a k-NN classifier.

    :param X_train: training inputs
    :param y_train: training labels
    :param vectorizer_type: type of vectorizer
    :param max_features: maximum number of features
    :return: k-NN model, fitted feature vectorizer
    """
    X_train, vectorizer = utils.feature_extraction(
        X_train, max_features=max_features, vectorizer_type=vectorizer_type)
    model = KNeighborsClassifier(n_neighbors=10)
    train_linear(model, X_train, y_train)
    model.fit(X_train, y_train)
    return model, vectorizer
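# Usage sketch (the vectorizer name "count" is an assumption): note that
# n_neighbors=10 requires at least 10 training samples, hence the repetition.
knn_model, knn_vec = train_knn(["spam offer", "hello friend"] * 5,
                               [1, 0] * 5, vectorizer_type="count")
print(knn_model.predict(knn_vec.transform(["spam hello"])))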
def train_NB(X_train, y_train, vectorizer, max_features: int = 1000):
    """
    Train a naive Bayes classifier.

    :param X_train: training inputs
    :param y_train: training labels
    :param vectorizer: type of vectorizer
    :param max_features: maximum number of features
    :return: naive Bayes model, fitted feature vectorizer
    """
    X_train, vectorizer = utils.feature_extraction(X_train,
                                                   max_features=max_features,
                                                   vectorizer_type=vectorizer)
    naive_model = naive_bayes.MultinomialNB()
    print("\nCross validation ...")
    train_linear(naive_model, X_train, y_train)
    print("\nFitting ...")
    naive_model.fit(X_train, y_train)
    return naive_model, vectorizer
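# Usage sketch: MultinomialNB expects non-negative feature values, so a
# count-based vectorizer is a natural fit ("count" is again an assumed name
# for utils.feature_extraction's vectorizer_type).
nb_model, nb_vec = train_NB(["cheap pills now", "meeting at noon"] * 3,
                            [1, 0] * 3, vectorizer="count")
print(nb_model.predict_proba(nb_vec.transform(["cheap meeting"])))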
def wakati(self, text, lower=False):
    """
    Return the words of the given sentence.

    Input: str (a sentence)
    Output: the list of the words
    """
    text = utils.preprocess(text)
    lower_text = text.lower()
    feats = utils.feature_extraction(text=lower_text,
                                     uni2id=self._uni2id,
                                     bi2id=self._bi2id,
                                     dictionary=self._word2id,
                                     window_size=self._hp['WINDOW_SIZE'])
    obs = self._model.encode_ws(feats)
    obs = [ob.npvalue() for ob in obs]
    tags = utils.np_viterbi(self._model.trans_array, obs)

    # A span matched by the user pattern is forcibly recognized as a single word.
    if self.pattern:
        for match in self.pattern.finditer(text):
            span_s, span_e = match.span()
            if (span_e - span_s) == 1:
                # a one-character span gets the SINGLE tag directly; the BMES
                # expansion below would corrupt the tag sequence for length 1
                tags[span_s:span_e] = [3]
            else:
                tags[span_s:span_e] = [0] + [1] * ((span_e - span_s) - 2) + [2]
            if span_s != 0:
                previous_tag = tags[span_s - 1]
                if previous_tag == 0:     # 0 is the BEGIN tag
                    tags[span_s - 1] = 3  # 3 is the SINGLE tag
                elif previous_tag == 1:   # 1 is the MIDDLE tag
                    tags[span_s - 1] = 2  # 2 is the END tag
            if span_e != len(text):
                next_tag = tags[span_e]
                if next_tag == 1:     # 1 is the MIDDLE tag
                    tags[span_e] = 0  # 0 is the BEGIN tag
                elif next_tag == 2:   # 2 is the END tag
                    tags[span_e] = 3  # 3 is the SINGLE tag

    if lower is True:
        words = utils.segmenter_for_bmes(lower_text, tags)
    else:
        words = utils.segmenter_for_bmes(text, tags)
    return words
def inference(self, image):
    self.sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.latest_checkpoint(self.params.model_dir) \
        if not self.params.checkpoint else self.params.checkpoint
    if not checkpoint:
        raise FileNotFoundError('Checkpoint not found in {}'.format(
            self.params.model_dir))
    else:
        print("Loading model checkpoint {}...".format(checkpoint))
        self.saver.restore(self.sess, checkpoint)

    feature_extractor = utils.feature_extractor('VGG19')
    features = utils.feature_extraction(feature_extractor, image)
    predicted_sequence = self.rnn.predict(features)
    print(predicted_sequence)
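# Hedged usage sketch: the hosting class and image loading are assumptions;
# `CaptioningModel` is a hypothetical wrapper that builds sess/saver/rnn and
# holds the `params` namespace read above.
import numpy as np
from PIL import Image
model = CaptioningModel(params)  # hypothetical
image = np.asarray(Image.open("example.jpg").convert("RGB"))
model.inference(image)           # prints the predicted sequence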
def main(settings):
    features, minima, maxima, scaling_parameter = feature_extraction(
        settings.dataset_dir)
    window = 5
    X_train, y_train, X_test, y_test = split_features(features[::-1], window)
    print("X_train", X_train.shape)
    print("y_train", y_train.shape)
    print("X_test", X_test.shape)
    print("y_test", y_test.shape)

    json_logging_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: print(json.dumps({
            "epoch": epoch,
            "loss": logs["loss"],
            "acc": logs["acc"],
            "val_loss": logs["val_loss"],
            "val_acc": logs["val_acc"],
        })),
    )

    # figure out which model architecture to use
    arch = settings.model_architecture
    assert arch in model_architectures, "Unknown model architecture '%s'." % arch
    builder = model_architectures[arch]

    # build and train the model
    model = builder([len(features.columns), window, 1])
    model.fit(
        X_train,
        y_train,
        batch_size=settings.batch_size,
        epochs=settings.epochs,
        validation_split=settings.validation_split,
        callbacks=[json_logging_callback],
        verbose=0)

    # serialize model to JSON
    model_json = model.to_json()
    with open(os.path.join(settings.output_dir, "model-layout.json"), "w") as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    model.save_weights(os.path.join(settings.output_dir, "model-weights.h5"))
    print("Saved model to disk")
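# Hedged sketch of the `settings` namespace this main() reads; the field
# names come from their uses above, the default values are illustrative.
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_dir", default="data/")
parser.add_argument("--output_dir", default="out/")
parser.add_argument("--model_architecture", default="lstm")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--validation_split", type=float, default=0.1)
main(parser.parse_args())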
from utils import feature_extraction

file_addresses = [
    '../samples/man/arctic_a0001.wav',
    '../samples/woman/arctic_a0001.wav'
]
frame_width = 35 / 1000  # frame length in seconds (35 ms)
shift_width = 35 / 1000  # frame shift in seconds (35 ms)
threshold = 40

print('Data set :\n',
      feature_extraction(file_addresses, frame_width, shift_width, threshold))
def __init__(self, filename, window_size, vocabs):
    self.words = []
    self.ws_data = []
    self.pos_data = []
    self.filename = filename
    uni2id, bi2id, word2id, pos2id, word2postags = vocabs

    with open(filename, 'r') as texts:
        wids = []   # word indices
        cids = []   # character indices
        pids = []   # POS-tag indices
        words = []  # original words
        ptags = []  # original POS tags
        for text in texts:
            text = utils.utf8rstrip(text)
            if text == 'EOS':
                # end of a sentence: build features and tags, then reset
                sent = ''.join(words)
                segmented_sent = ' '.join(words)
                tags = utils.make_tags_as_bmes(segmented_sent)
                feats = utils.feature_extraction(text=sent,
                                                 uni2id=uni2id,
                                                 bi2id=bi2id,
                                                 dictionary=word2id,
                                                 window_size=window_size)
                self.words.append(words)
                self.ws_data.append([feats, tags])
                self.pos_data.append([[cids, wids, ptags], pids])

                # reset index lists
                wids = []
                cids = []
                pids = []
                words = []
                ptags = []
            else:
                word, pos = text.split('\t')
                word = utils.normalize(word)
                word = word.replace(' ', '　')  # half-width space to full-width space
                # lower setting: 3
                word = word.lower()

                if word in word2postags:
                    w2p = word2postags[word]
                else:
                    w2p = [0]  # OOV
                if word.isalnum() is True:
                    if w2p == [0]:
                        w2p = [pos2id['名詞']]
                    else:
                        w2p.append(pos2id['名詞'])
                w2p = list(set(w2p))
                ptags.append(w2p)

                words.append(word)
                if word in word2id:
                    wids.append(word2id[word])
                else:
                    wids.append(word2id[OOV])

                if pos in pos2id:
                    pids.append(pos2id[pos])
                else:
                    pids.append(pos2id[OOV])

                char_seq = []
                for char in word:
                    if char in uni2id:
                        char_seq.append(uni2id[char])
                    else:
                        char_seq.append(uni2id[OOV])
                cids.append(char_seq)
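# Hedged usage sketch: `CorpusReader` is a hypothetical name for the class
# this __init__ belongs to; `vocabs` is the 5-tuple unpacked above, built
# elsewhere from the training vocabulary. The corpus file is expected in
# one-token-per-line "word<TAB>POS" format with sentences ended by "EOS".
vocabs = (uni2id, bi2id, word2id, pos2id, word2postags)
dataset = CorpusReader("train.txt", window_size=3, vocabs=vocabs)
print(len(dataset.ws_data), "sentences loaded")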
for utterance in utterances:
    plot_signal_and_energy_per_frame(utterance, 90 / 1000, 90 / 1000)

print('\n--- Building a rule-based system')
frame_width = 90 / 1000  # frame length in seconds (90 ms)
shift_width = 90 / 1000  # frame shift in seconds (90 ms)
threshold = 10

print('\n-- TRAIN :')
folder_addresses = ['samples/man', 'samples/woman']
train_file_addresses = random_select_utterances(folder_addresses, 50)
train_data_frame = feature_extraction(train_file_addresses, frame_width,
                                      shift_width, threshold)
print('\n- Train data frame :\n', train_data_frame)

sns.pairplot(train_data_frame, hue='Sex')
plt.show()
sns.heatmap(train_data_frame.corr())
plt.show()

# grid-search the energy threshold that best separates the two classes
best_threshold_on_energy = 0
best_accuracy_on_energy = 0
for threshold_on_energy in range(1000000, 100000000, 10000):
    accuracy = rule_based_system_on_energy_accurancy(train_data_frame,
                                                     threshold_on_energy)
    if accuracy > best_accuracy_on_energy:
        best_accuracy_on_energy = accuracy
        best_threshold_on_energy = threshold_on_energy