def create_currennt_dataset_from_atb_kaldi(train_word_filename, train_word_diac_filename, train_nc_filename, \ test_filename, test_nc_filename, \ dev_word_filename=None, dev_word_diac_filename=None, dev_nc_filename=None, \ stop_on_punc=False, window_size=5, init_method=FeatureInitializer.STRAT_RAND, \ letter_features_size=10, shadda=Word.SHADDA_WITH_NEXT, word_vectors=None): print 'loading training set' start_time = time.time() train_sequences = load_extracted_data(train_word_filename, train_word_diac_filename, stop_on_punc, shadda) feature_initializer = FeatureInitializer(train_sequences, strategy=init_method, \ letter_features_size=letter_features_size) train_dataset = CurrenntDataset(train_nc_filename, train_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, word_vectors=word_vectors) print 'elapsed time:', time.time() - start_time, 'seconds' print 'loading test set' start_time = time.time() test_sequences = load_kaldi_data(test_filename, shadda) test_dataset = CurrenntDataset(test_nc_filename, test_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, map_label2class=train_dataset.map_label2class, \ word_vectors=word_vectors) print 'elapsed time:', time.time() - start_time, 'seconds' if dev_word_filename and dev_word_diac_filename and dev_nc_filename: print 'loading dev set' start_time = time.time() dev_sequences = load_extracted_data(dev_word_filename, dev_word_diac_filename, stop_on_punc, shadda) dev_dataset = CurrenntDataset(dev_nc_filename, dev_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, map_label2class=train_dataset.map_label2class, \ word_vectors=word_vectors) print 'elapsed time:', time.time() - start_time, 'seconds'
def convert_file(word_filename, word_diac_filename, pred_csv_filename, pred_output_filename, train_nc_filename):
    """
    Convert Currennt output to predictions

    Each prediction line has the form ``seq_id;p;p;...``: the probabilities are consumed in groups
    of ``numLabels`` values, one group per letter of the matching input sequence, and the label
    with the highest probability is appended after the letter. Every completed word is written as
    `` word:predicted_diacritized_word`` after the sequence id, one sequence per output line.

    :param word_filename (str): file with words (non-diac)
    :param word_diac_filename (str): file with words (diac)
    :param pred_csv_filename (str): file in csv format with predictions
    :param pred_output_filename (str): file to write predictions in Kaldi format (bw-currennt)
    :param train_nc_filename (str): file in Currennt format that was used to train the model
    :return: None; if the number of prediction lines differs from the number of sequences, or a
             sequence id does not match, an error is written to stderr and the function returns
             early (output written so far is kept)
    """

    sequences = load_extracted_data(word_filename, word_diac_filename)

    # Read everything needed from the training file, then release it right away.
    # NOTE(review): assumes Dataset offers close() (as netCDF4.Dataset does) -- confirm.
    train_nc_file = Dataset(train_nc_filename)
    try:
        num_labels = len(train_nc_file.dimensions['numLabels'])
        nc_labels = [''.join(l.data) for l in train_nc_file.variables['labels']]
    finally:
        train_nc_file.close()
    class2label = dict(enumerate(nc_labels))
    print(class2label)

    # The with-block closes both files on every exit path, including the early error returns.
    with open(pred_output_filename, 'w') as out_file, open(pred_csv_filename) as pred_file:
        pred_lines = pred_file.readlines()
        if len(pred_lines) != len(sequences):
            sys.stderr.write('Error: incompatible predicted lines and input sequences. Quitting.\n')
            return
        for line, sequence in zip(pred_lines, sequences):
            splt = line.strip().split(';')
            seq_id_pred = splt[0]
            probs = [float(p) for p in splt[1:]]
            if seq_id_pred != sequence.seq_id:
                sys.stderr.write('Error: seq id in text file ' + sequence.seq_id +
                                 ' != seq id in predicted currennt file ' + seq_id_pred + '. Quitting.\n')
                return
            out_file.write(sequence.seq_id)
            letters = sequence.get_sequence_letters(include_word_boundary=True)
            cur_word, cur_word_diac_pred = '', ''
            # one group of num_labels probabilities per letter (an incomplete last group is padded with 0)
            for letter_idx, letter_probs in enumerate(grouper(probs, num_labels, 0)):
                letter = letters[letter_idx]
                if letter == Word.WORD_BOUNDARY:
                    # a boundary closes the word collected so far, if there is one
                    if cur_word:
                        out_file.write(' ' + cur_word + ':' + cur_word_diac_pred)
                        cur_word, cur_word_diac_pred = '', ''
                    continue
                cur_word += letter
                arg_best = np.argmax(letter_probs)
                pred_label = class2label[arg_best]
                cur_word_diac_pred += letter + pred_label
            # NOTE(review): a word still being collected when the probabilities run out is not
            # written; presumably the letter sequence always ends with a word boundary -- confirm.
            out_file.write('\n')


def create_currennt_dataset(train_word_filename, train_word_diac_filename, train_nc_filename, \ test_word_filename=None, test_word_diac_filename=None, test_nc_filename=None, \ dev_word_filename=None, dev_word_diac_filename=None, dev_nc_filename=None, \ stop_on_punc=False, window_size=5, init_method=FeatureInitializer.STRAT_RAND, \ letter_features_size=10, shadda=Word.SHADDA_WITH_NEXT, word_vectors=None, \ letter_vectors_filename=None, label2class_filename=None): print 'loading training set' start_time = time.time() train_sequences = load_extracted_data(train_word_filename, train_word_diac_filename, stop_on_punc, shadda) feature_initializer = FeatureInitializer(train_sequences, strategy=init_method, \ letter_features_size=letter_features_size, \ letter_features_filename=letter_vectors_filename) if label2class_filename: _, map_label2class = load_label_indices(label2class_filename) train_dataset = CurrenntDataset(train_nc_filename, train_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, map_label2class=map_label2class, word_vectors=word_vectors) else: train_dataset = CurrenntDataset(train_nc_filename, train_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, word_vectors=word_vectors) print 'elapsed time:', time.time() - start_time, 'seconds' if test_word_filename and test_word_diac_filename and test_nc_filename: print 'loading test set' start_time = time.time() test_sequences = load_extracted_data(test_word_filename, test_word_diac_filename, stop_on_punc, shadda) test_dataset = CurrenntDataset(test_nc_filename, test_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, map_label2class=train_dataset.map_label2class, \ word_vectors=word_vectors) print 'elapsed time:', time.time() - start_time, 'seconds' if dev_word_filename and dev_word_diac_filename and 
dev_nc_filename: print 'loading dev set' start_time = time.time() dev_sequences = load_extracted_data(dev_word_filename, dev_word_diac_filename, stop_on_punc, shadda) dev_dataset = CurrenntDataset(dev_nc_filename, dev_sequences, \ feature_initializer.letter_features_size, feature_initializer.map_letter2features, \ window_size=window_size, map_label2class=train_dataset.map_label2class, \ word_vectors=word_vectors) print 'elapsed time:', time.time() - start_time, 'seconds'
def convert_file(word_filename, word_diac_filename, pred_csv_filename, pred_output_filename, label_indices_filename):
    """
    Convert Currennt output to predictions

    Each prediction line has the form ``seq_id;p;p;...``: the probabilities are consumed in groups
    of one value per label, one group per letter of the matching input sequence, and the label
    with the highest probability is appended after the letter. Every completed word is written as
    `` word:predicted_diacritized_word`` after the sequence id, one sequence per output line.

    word_filename (str): file with words (non-diac)
    word_diac_filename (str): file with words (diac)
    pred_csv_filename (str): file in csv format with predictions
    pred_output_filename (str): file to write predictions in Kaldi format (bw-currennt)
    label_indices_filename (str): file with labels, one label per line, in the order corresponding to indices
                                    used in Currennt
    :return: None; if the number of prediction lines differs from the number of sequences, or a
             sequence id does not match, an error is written to stderr and the function returns
             early (output written so far is kept)
    """

    sequences = load_extracted_data(word_filename, word_diac_filename)
    class2label, _ = load_label_indices(label_indices_filename)
    print(class2label)
    num_labels = len(class2label)

    # The with-block closes both files on every exit path, including the early error returns.
    with open(pred_output_filename, 'w') as out_file, open(pred_csv_filename) as pred_file:
        pred_lines = pred_file.readlines()
        if len(pred_lines) != len(sequences):
            sys.stderr.write('Error: incompatible predicted lines and input sequences. Quitting.\n')
            return
        for line, sequence in zip(pred_lines, sequences):
            splt = line.strip().split(';')
            seq_id_pred = splt[0]
            probs = [float(p) for p in splt[1:]]
            if seq_id_pred != sequence.seq_id:
                sys.stderr.write('Error: seq id in text file ' + sequence.seq_id +
                                 ' != seq id in predicted currennt file ' + seq_id_pred + '. Quitting.\n')
                return
            out_file.write(sequence.seq_id)
            letters = sequence.get_sequence_letters(include_word_boundary=True)
            cur_word, cur_word_diac_pred = '', ''
            # one group of num_labels probabilities per letter (an incomplete last group is padded with 0)
            for letter_idx, letter_probs in enumerate(grouper(probs, num_labels, 0)):
                letter = letters[letter_idx]
                if letter == Word.WORD_BOUNDARY:
                    # a boundary closes the word collected so far, if there is one
                    if cur_word:
                        out_file.write(' ' + cur_word + ':' + cur_word_diac_pred)
                        cur_word, cur_word_diac_pred = '', ''
                    continue
                cur_word += letter
                arg_best = np.argmax(letter_probs)
                pred_label = class2label[arg_best]
                cur_word_diac_pred += letter + pred_label
            # NOTE(review): a word still being collected when the probabilities run out is not
            # written; presumably the letter sequence always ends with a word boundary -- confirm.
            out_file.write('\n')


def convert_file(word_filename, word_diac_filename, pred_csv_filename, pred_output_filename, train_nc_filename):
    """
    Convert Currennt output to predictions

    Each prediction line has the form ``seq_id;p;p;...``: the probabilities are consumed in groups
    of ``numLabels`` values, one group per letter of the matching input sequence, and the label
    with the highest probability is appended after the letter. Every completed word is written as
    `` word:predicted_diacritized_word`` after the sequence id, one sequence per output line.

    :param word_filename (str): file with words (non-diac)
    :param word_diac_filename (str): file with words (diac)
    :param pred_csv_filename (str): file in csv format with predictions
    :param pred_output_filename (str): file to write predictions in Kaldi format (bw-currennt)
    :param train_nc_filename (str): file in Currennt format that was used to train the model
    :return: None; if the number of prediction lines differs from the number of sequences, or a
             sequence id does not match, an error is written to stderr and the function returns
             early (output written so far is kept)
    """

    sequences = load_extracted_data(word_filename, word_diac_filename)

    # Pull the label inventory out of the training file and close it before doing anything else.
    # NOTE(review): assumes Dataset offers close() (as netCDF4.Dataset does) -- confirm.
    train_nc_file = Dataset(train_nc_filename)
    try:
        num_labels = len(train_nc_file.dimensions['numLabels'])
        nc_labels = [''.join(l.data) for l in train_nc_file.variables['labels']]
    finally:
        train_nc_file.close()
    class2label = dict(enumerate(nc_labels))
    print(class2label)

    # Context managers guarantee both files are closed even when an error path returns early.
    with open(pred_output_filename, 'w') as out_file, open(pred_csv_filename) as pred_file:
        pred_lines = pred_file.readlines()
        if len(pred_lines) != len(sequences):
            sys.stderr.write(
                'Error: incompatible predicted lines and input sequences. Quitting.\n'
            )
            return
        for line, sequence in zip(pred_lines, sequences):
            splt = line.strip().split(';')
            seq_id_pred = splt[0]
            probs = [float(p) for p in splt[1:]]
            if seq_id_pred != sequence.seq_id:
                sys.stderr.write('Error: seq id in text file ' + sequence.seq_id +
                                 ' != seq id in predicted currennt file ' + seq_id_pred + '. Quitting.\n')
                return
            out_file.write(sequence.seq_id)
            letters = sequence.get_sequence_letters(include_word_boundary=True)
            cur_word, cur_word_diac_pred = '', ''
            # one group of num_labels probabilities per letter (an incomplete last group is padded with 0)
            for letter_idx, letter_probs in enumerate(grouper(probs, num_labels, 0)):
                letter = letters[letter_idx]
                if letter == Word.WORD_BOUNDARY:
                    # a boundary closes the word collected so far, if there is one
                    if cur_word:
                        out_file.write(' ' + cur_word + ':' + cur_word_diac_pred)
                        cur_word, cur_word_diac_pred = '', ''
                    continue
                cur_word += letter
                arg_best = np.argmax(letter_probs)
                pred_label = class2label[arg_best]
                cur_word_diac_pred += letter + pred_label
            # NOTE(review): a word still being collected when the probabilities run out is not
            # written; presumably the letter sequence always ends with a word boundary -- confirm.
            out_file.write('\n')