import os

from joblib import Parallel, delayed

import config  # project-local configuration module (paths, constants)
import utils   # project-local helpers (load_json_data, create_data_split, compute_features_mtrack, ...)


def main(args):
    # load dataset information from setup json file
    metad = utils.load_json_data(args.metadata_file)

    # generate data splits and keep them fixed for the whole project
    # MAKE SURE THIS IS ONLY CALLED ONCE
    splits_path = os.path.join(config.data_save_folder, 'data_splits.json')
    utils.create_data_split(metad, splits_path)

    # collect the per-track metadata, tagging each entry with its key as the filename
    mtracks = []
    for ky in metad.keys():
        mtrack = metad[ky]
        mtrack['filename'] = ky
        mtracks.append(mtrack)

    nmixes = len(metad.keys())
    print("{} mixes to be processed".format(nmixes))

    # compute features for each multitrack in parallel
    idx = 0
    Parallel(n_jobs=4, verbose=5)(
        delayed(utils.compute_features_mtrack)(
            mtrack, args.save_dir, args.wavmixes_path, idx
        ) for mtrack in mtracks)
w2v_path = config['w2v_file']

print('[INFO] Tokenizing input and output sequences')
x, input_word_index = utils.tokenize_sequence(input_sentences,
                                              filters,
                                              config['encoder_num_tokens'],
                                              config['encoder_vocab'])

y, output_word_index = utils.tokenize_sequence(output_sentences,
                                               filters,
                                               config['decoder_num_tokens'],
                                               config['decoder_vocab'])

print('[INFO] Split data into train-validation-test sets')
dataset_sizes = [train_data.shape[0], val_data.shape[0], test_data.shape[0]]
x_train, y_train, x_val, y_val, x_test, y_test = utils.create_data_split(x, y, dataset_sizes)

encoder_embeddings_matrix = utils.create_embedding_matrix(input_word_index,
                                                          config['embedding_size'],
                                                          w2v_path)

decoder_embeddings_matrix = utils.create_embedding_matrix(output_word_index,
                                                          config['embedding_size'],
                                                          w2v_path)

# Re-calculate the vocab size based on the word_idx dictionary
config['encoder_vocab'] = len(input_word_index)
config['decoder_vocab'] = len(output_word_index)

#----------------------------------------------------------------#
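# NOTE: the call above assumes a project-local helper with the signature
# utils.create_data_split(x, y, dataset_sizes); its implementation is not shown
# in these snippets. A minimal hypothetical sketch, assuming the helper simply
# slices the tokenized arrays into contiguous train/val/test blocks of the
# requested sizes:

def create_data_split(x, y, dataset_sizes):
    """Split parallel sequences x, y into train/val/test sets.

    dataset_sizes is assumed to be [n_train, n_val, n_test].
    """
    n_train, n_val, n_test = dataset_sizes
    end_val = n_train + n_val
    end_test = end_val + n_test
    x_train, y_train = x[:n_train], y[:n_train]
    x_val, y_val = x[n_train:end_val], y[n_train:end_val]
    x_test, y_test = x[end_val:end_test], y[end_val:end_test]
    return x_train, y_train, x_val, y_val, x_test, y_test

#----------------------------------------------------------------#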
def create_data_splits(path_to_metadata_file, exper_dir):
    metadata = load_data(path_to_metadata_file)
    utils.create_data_split(metadata, os.path.join(exper_dir, 'data_splits.json'))
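# NOTE: in the first and third snippets, utils.create_data_split(metadata, path)
# is a project-local helper whose body is not shown here. A hypothetical sketch,
# assuming it shuffles the metadata keys into fixed train/validate/test groups
# and freezes them in a JSON file; the fractions, seed, and group names below
# are illustrative assumptions, not the project's actual values:

import json
import random


def create_data_split(metadata, output_path, train_frac=0.8, validate_frac=0.1, seed=7):
    """Randomly assign metadata keys to train/validate/test and save the split to JSON."""
    keys = sorted(metadata.keys())
    random.Random(seed).shuffle(keys)

    n_train = int(len(keys) * train_frac)
    n_validate = int(len(keys) * validate_frac)
    splits = {
        'train': keys[:n_train],
        'validate': keys[n_train:n_train + n_validate],
        'test': keys[n_train + n_validate:],
    }
    with open(output_path, 'w') as fhandle:
        json.dump(splits, fhandle, indent=2)
    return splits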