# Disabled alternative: dev transcripts read from the swedish/ directory.
# (Any other lines of that configuration are above this chunk.)
#target_dev = prepare_data.load_transcripts('../augmented_labels/data/normalized/transcripts/swedish/dev.txt')

# LibriSpeech ASR data (disabled): same feature files as the active block
# below, but transcripts come from transcripts/libri instead of augmented/libri.
#features_train = prepare_data.load_features('../augmented_labels/data/normalized/features/libri/train')
#target_train = prepare_data.load_transcripts('../augmented_labels/data/normalized/transcripts/libri/train.txt')
#features_dev = prepare_data.load_features_combined('../augmented_labels/data/normalized/features/libri/dev.npy')
#target_dev = prepare_data.load_transcripts('../augmented_labels/data/normalized/transcripts/libri/dev.txt')

# LibriSpeech data (active configuration).
# Train features are loaded from a directory via load_features, dev features
# from a single .npy file via load_features_combined; both transcript files
# come from the augmented/libri directory.
features_train = prepare_data.load_features(
    '../augmented_labels/data/normalized/features/libri/train')
target_train = prepare_data.load_transcripts(
    '../augmented_labels/data/normalized/augmented/libri/train.txt')
features_dev = prepare_data.load_features_combined(
    '../augmented_labels/data/normalized/features/libri/dev.npy')
target_dev = prepare_data.load_transcripts(
    '../augmented_labels/data/normalized/augmented/libri/dev.txt')
print('Done...')

# Load the fastText model used for embeddings. The commented-out call is an
# alternative model file; only one of the two should be active at a time
# because both bind the same `embeddings` name.
print('Loading embeddings...')
#embeddings = fasttext.load_model('weights/embeddings/cc.sv.300.bin')
embeddings = fasttext.load_model(
    'weights/embeddings/crawl-300d-2M-subword.bin')
print('Done...')

# generate index dictionaries
#char2idx, idx2char = prepare_data.encode_data(target_train)

# generate index dictionaries
# NOTE(review): several statements share this one physical line, so Python
# treats everything from the first '#' to the end of the line as a comment and
# the part before it does not parse as a single statement.
# Reading the statements in order, the line contains:
#   1. the tail of a definition that starts above this chunk: it rebinds
#      temp_transcripts and temp_tags to empty lists and returns
#      (transcripts, tags);
#   2. the `if __name__ == '__main__':` entry point, which seeds torch with 0
#      and selects "cuda:0" when CUDA is available, otherwise "cpu";
#   3. loading of the Parliament test features and augmented transcripts,
#      followed by a disabled variant for comparing against conventional NER;
#   4. truncation of features_test and target_test to their first 50 items;
#   5. loading of the fastText model 'weights/embeddings/cc.fi.300.bin'.
# TODO: restore the original line breaks and indentation; the nesting of
# item 1 cannot be recovered from this chunk alone.
temp_transcripts = [] temp_tags = [] return transcripts, tags if __name__ == '__main__': torch.manual_seed(0) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # test Parliament features_test = prepare_data.load_features_combined('../../TSD/augmented_labels/data/normalized/features/test.npy') target_test = prepare_data.load_transcripts('../../TSD/augmented_labels/data/normalized/augmented/parliament/test.txt') # compare against conventional NER #features_test = prepare_data.load_features_combined('../augmented_labels/data/normalized/features/test.npy') #target_test = prepare_data.load_transcripts('output/parliament/e2e_asr_combined.txt') #tags_test = prepare_data.load_tags('output/parliament/conventional_ner.txt') features_test = features_test[:50] target_test = target_test[:50] print('Loading embeddings...') embeddings = fasttext.load_model('weights/embeddings/cc.fi.300.bin') print('Done...')
# load features and labels
print('Loading data..')

# Parliament data ASR (disabled): same feature files as the active block
# below, but transcripts come from transcripts/ instead of augmented/parliament.
#features_train = prepare_data.load_features('data/normalized/features/train')
#target_train = prepare_data.load_transcripts('data/normalized/transcripts/train.txt')
#features_dev = prepare_data.load_features_combined('data/normalized/features/dev.npy')
#target_dev = prepare_data.load_transcripts('data/normalized/transcripts/dev.txt')

# Parliament data augmented (active configuration).
# Train features are loaded from a directory via load_features, dev features
# from a single .npy file via load_features_combined.
features_train = prepare_data.load_features('data/normalized/features/train')
target_train = prepare_data.load_transcripts(
    'data/normalized/augmented/parliament/train.txt')
features_dev = prepare_data.load_features_combined(
    'data/normalized/features/dev.npy')
target_dev = prepare_data.load_transcripts(
    'data/normalized/augmented/parliament/dev.txt')
print('Done...')

# Load the fastText model used for embeddings.
print('Loading embeddings...')
embeddings = fasttext.load_model('weights/embeddings/cc.fi.300.bin')
print('Done...')

# Restore the two pickled lookup objects saved next to the weights
# (presumably character->index and index->character tables, going by the
# file names -- confirm against the code that wrote them).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted run of this project.
with open('weights/char2idx_augmented.pkl', 'rb') as f:
    char2idx = pickle.load(f)
with open('weights/idx2char_augmented.pkl', 'rb') as f:
    idx2char = pickle.load(f)

# convert labels to indices