def split_all_data(): # READ DATA dataset_dir = os.path.join('..', 'data', DATASET) feats_file = os.path.join(dataset_dir, 'feats.17') labels_file = os.path.join(dataset_dir, 'time') data = util.read_data(feats_file, labels_file) # SHUFFLE DATA np.random.seed(1000) np.random.shuffle(data) # BUILD FOLDER STRUCTURE dataset_dir = os.path.join(SPLIT_DIR, DATASET) try: os.makedirs(dataset_dir) except OSError: print "skipping folder creation" # SPLIT TRAIN/TEST AND SAVE fold_indices = cross_validation.KFold(data.shape[0], n_folds=10) for fold, index in enumerate(fold_indices): print index[0].shape train_data = data[index[0]] test_data = data[index[1]] train_data, scaler = util.normalize_train_data(train_data) test_data = util.normalize_test_data(test_data, scaler) fold_dir = os.path.join(dataset_dir, str(fold)) try: os.makedirs(fold_dir) except OSError: print "skipping fold dir" np.savetxt(os.path.join(fold_dir, 'train'), train_data, fmt="%.5f") np.savetxt(os.path.join(fold_dir, 'test'), test_data, fmt="%.5f")
def split_all_data(): # READ DATA dataset_dir = os.path.join('..', 'data', DATASET) feats_file = os.path.join(dataset_dir, 'feats.17') labels_file = os.path.join(dataset_dir, 'time') source_file = os.path.join(dataset_dir, 'source') target_file = os.path.join(dataset_dir, 'target') pe_file = os.path.join(dataset_dir, 'target_postedited') #data = util.read_data(feats_file, labels_file) feats = np.loadtxt(feats_file, dtype=object) labels = np.loadtxt(labels_file, dtype=object, ndmin=2) src = np.loadtxt(source_file, dtype=object, delimiter='\t', ndmin=2) tgt = np.loadtxt(target_file, dtype=object, delimiter='\t', ndmin=2) pe = np.loadtxt(pe_file, dtype=object, delimiter='\t', ndmin=2) data = np.concatenate((feats, labels, src, tgt, pe), axis=1) # SHUFFLE DATA np.random.seed(1000) np.random.shuffle(data) # BUILD FOLDER STRUCTURE dataset_dir = os.path.join(SPLIT_DIR, DATASET) try: os.makedirs(dataset_dir) except OSError: print "skipping folder creation" # SPLIT TRAIN/TEST AND SAVE fold_indices = cross_validation.KFold(data.shape[0], n_folds=10) for fold, index in enumerate(fold_indices): print index[0].shape train = data[index[0]] test = data[index[1]] train_data = np.array(train[:, :18], dtype=float) test_data = np.array(test[:, :18], dtype=float) train_data, scaler = util.normalize_train_data(train_data) test_data = util.normalize_test_data(test_data, scaler) fold_dir = os.path.join(dataset_dir, str(fold)) try: os.makedirs(fold_dir) except OSError: print "skipping fold dir" np.savetxt(os.path.join(fold_dir, 'train'), train_data, fmt="%.5f") np.savetxt(os.path.join(fold_dir, 'test'), test_data, fmt="%.5f") np.savetxt(os.path.join(fold_dir, 'train_src'), train[:, 18], fmt="%s") np.savetxt(os.path.join(fold_dir, 'test_src'), test[:, 18], fmt="%s") np.savetxt(os.path.join(fold_dir, 'train_tgt'), train[:, 19], fmt="%s") np.savetxt(os.path.join(fold_dir, 'test_tgt'), test[:, 19], fmt="%s") np.savetxt(os.path.join(fold_dir, 'train_pe'), train[:, 20], fmt="%s") np.savetxt(os.path.join(fold_dir, 'test_pe'), test[:, 20], fmt="%s")