def concat_english_spanish_vocab():
    english_vocab = read_data("../input/english_word_vocabs.txt")
    spanish_vocab = read_data("../input/words.txt")
    vocabs = spanish_vocab + english_vocab
    save_data(path='../input/processing/multi_task_learn/all_vocab.txt',
              data=vocabs)
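# The read_data / save_data helpers used throughout this section are not shown
# here. A minimal sketch of what they are assumed to look like, inferred only
# from how they are called (read a file into a list of lines, write a list of
# lines back out); names with the _sketch suffix are hypothetical stand-ins:
def read_data_sketch(path):
    """Hypothetical stand-in for read_data: return the file as a list of lines."""
    with open(path, encoding='utf-8') as f:
        return f.readlines()


def save_data_sketch(path, data):
    """Hypothetical stand-in for save_data: write an iterable of lines to disk."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(data)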
def split_train_valid(split_rate, is_use_real=False):
    train_file = config.spanish_train_path
    train_data_file = '../input/processing/train_data.txt'
    valid_data_file = '../input/processing/valid_data.txt'
    train = read_data(train_file)
    # random.shuffle(train)
    if is_use_real:
        valid_size = 1400
        train_data = train[:-valid_size]
        valid_data = train[-valid_size:]
        random.shuffle(train_data)
        save_data(train_data_file, train_data)
        save_data(valid_data_file, valid_data)
    else:
        valid_size = int(split_rate * len(train))
        train_data = train[:-valid_size]
        valid_data = train[-valid_size:]
        # random.shuffle(train_data)
        save_data(train_data_file, train_data)
        save_data(valid_data_file, valid_data)
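# Example usage (a sketch; assumes config.spanish_train_path points at the
# processed Spanish training file):
# split_train_valid(split_rate=0.1)                    # last 10% as validation
# split_train_valid(split_rate=0.1, is_use_real=True)  # fixed 1400-line validation set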
def split_with_overlap(ratio, ecfp_tr, ic50_tr, root_dir="", overlap=1000):
    print("Split with %d overlap." % overlap)
    split_path = root_dir + "data_2_split/"

    training_sample_num = ecfp_tr.shape[0]
    one_part = int(training_sample_num / (ratio + 1))
    samples_user_1 = ratio * one_part
    samples_user_2 = one_part
    print("Number of training samples user-1: %d" % samples_user_1)
    print("Number of training samples user-2: %d" % samples_user_2)

    shuffled_idx = np.array(range(training_sample_num))
    np.random.shuffle(shuffled_idx)
    user_1 = shuffled_idx[:samples_user_1]
    user_2 = shuffled_idx[-samples_user_2:]

    user_1_train_size = int(0.8 * samples_user_1)
    user_2_train_size = int(0.8 * samples_user_2)
    user_1_train = user_1[:user_1_train_size]
    user_1_test = user_1[user_1_train_size:]
    user_2_train = user_2[:user_2_train_size]
    user_2_test = user_2[user_2_train_size:]

    # T_total - number of targets
    T_total = ic50_tr.shape[1]
    # number of disjunct targets
    num_disjunct = (T_total - overlap) // 2
    print("%d disjunct and %d overlapping labels (total: %d)" %
          (num_disjunct, overlap, T_total))

    shuffled_labels = np.array(range(T_total))
    np.random.shuffle(shuffled_labels)
    common_labels = shuffled_labels[:overlap]
    if num_disjunct == 0:
        user_1_labels, user_2_labels = common_labels, common_labels
    else:
        user_1_labels = np.concatenate(
            (common_labels, shuffled_labels[overlap:overlap + num_disjunct]),
            axis=None)
        user_2_labels = np.concatenate(
            (common_labels, shuffled_labels[-num_disjunct:]), axis=None)

    u1_ic50_tr = ic50_tr.tocsc()[:, user_1_labels].tocsr()
    u2_ic50_tr = ic50_tr.tocsc()[:, user_2_labels].tocsr()

    ## save user-1 data
    du.save_data(split_path + "0_train/", ecfp_tr[user_1_train],
                 u1_ic50_tr[user_1_train])
    du.save_data(split_path + "0_test/", ecfp_tr[user_1_test],
                 u1_ic50_tr[user_1_test])
    ## save user-2 data
    du.save_data(split_path + "1_train/", ecfp_tr[user_2_train],
                 u2_ic50_tr[user_2_train])
    du.save_data(split_path + "1_test/", ecfp_tr[user_2_test],
                 u2_ic50_tr[user_2_test])
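# A minimal usage sketch, assuming ecfp_tr and ic50_tr are scipy sparse
# matrices (the function relies on .tocsc()/.tocsr()) and that du.save_data
# accepts (directory, feature_matrix, label_matrix) as in the calls above.
# The demo matrices below are random placeholders, not real data:
import numpy as np
import scipy.sparse as sp

ecfp_demo = sp.random(5000, 1024, density=0.01, format='csr', dtype=np.float32)
ic50_demo = sp.random(5000, 3000, density=0.05, format='csr', dtype=np.float32)
# Give user-1 three times as many compounds as user-2, sharing 1000 targets:
# split_with_overlap(ratio=3, ecfp_tr=ecfp_demo, ic50_tr=ic50_demo,
#                    root_dir="../data/", overlap=1000)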
def de_duplicate():
    filename = '../input/processing/spanish_train.txt'
    data = read_data(filename)
    duplicates = []
    new_data = []
    help_set = set()
    for line in data:
        # Compare on everything after the label so identical sentence pairs
        # with the same label count as duplicates.
        new_line = line[1:]
        if new_line in help_set:
            duplicates.append(line)
        else:
            new_data.append(line)
            help_set.add(new_line)
    save_data('../input/processing/duplicate_spanish_data.txt', duplicates)
    save_data('../input/processing/spanish_train_dedup.txt', new_data)
def tenfold():
    """Split train data into 10 folds."""
    base_path = '../input/processing/'
    train_file = base_path + 'train.txt'
    save_path = base_path + '10fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # split data into ten folds
    train = read_data(train_file)
    size = len(train)
    one_part_size = int(0.1 * size)
    random.shuffle(train)
    for i in range(10):
        save_file_path = "{}/train_{}.txt".format(save_path, i)
        if i < 9:
            save_data(save_file_path,
                      train[i * one_part_size:(i + 1) * one_part_size])
        else:
            # last fold takes the remainder
            save_data(save_file_path, train[i * one_part_size:])
def five_fold():
    """Split train data into 5 folds."""
    base_path = '../input/processing/'
    train_file = config.spanish_train_path
    save_path = base_path + '5fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # split data into five folds
    train = read_data(train_file)
    size = len(train)
    one_part_size = int(0.2 * size)
    # random.shuffle(train)
    for i in range(5):
        save_file_path = "{}/train_{}.txt".format(save_path, i)
        if i < 4:
            save_data(save_file_path,
                      train[i * one_part_size:(i + 1) * one_part_size])
        else:
            # last fold takes the remainder
            save_data(save_file_path, train[i * one_part_size:])
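# Sketch of how the fold files written above could be recombined for
# cross-validation (assumes the read_data helper used above; fold `k`
# becomes the validation split and the remaining folds the training split):
def build_fold_split(k, base_path='../input/processing/5fold', n_folds=5):
    train, valid = [], []
    for i in range(n_folds):
        part = read_data('{}/train_{}.txt'.format(base_path, i))
        if i == k:
            valid.extend(part)
        else:
            train.extend(part)
    return train, valid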
def processing_data_1_step():
    """Split the dataset into an English part and a Spanish part, without shuffling."""
    process_base_path = '../input/processing/'
    base_path = '../input/'
    if not os.path.exists(process_base_path):
        os.makedirs(process_base_path)
    train_en = base_path + 'cikm_english_train_20180516.txt'
    train_sp = base_path + 'cikm_spanish_train_20180516.txt'
    unlabel_data = base_path + 'cikm_unlabel_spanish_train_20180516.txt'
    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    unlabel_file = process_base_path + 'unlabel_spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'

    # Separate the English and Spanish sentence pairs in the English train file.
    en_train = read_data(train_en)
    ens = []
    sps = []
    for line in en_train:
        line = line.strip()
        line_arr = line.split('\t')
        ens.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[0], line_arr[2]))
        sps.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[1], line_arr[3]))

    sp_train = read_data(train_sp)
    for line in sp_train:
        # line = punctiation(line)
        line = line.strip()
        line_arr = re.split('\t', line)
        sps.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[0], line_arr[2]))
        ens.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[1], line_arr[3]))

    # Save the separated English and Spanish files.
    save_data(english_file, data=ens)
    save_data(spanish_file, data=sps)

    print(u'Preprocessing the test data; every label is set to 0')
    test_path = base_path + 'cikm_test_b_20180730.txt'
    test = read_data(test_path)
    sps = []
    for line in test:
        line = line.strip()
        line_arr = re.split('\t', line)
        sps.append('{}\t{}\t{}\n'.format(0, line_arr[0], line_arr[1]))
    save_data(test_file, sps)
    print('Done')
# encoding:utf-8
from utils.log_utils import init_logger
from utils.data_utils import save_data, json2text, get_data, write_text
from config.basic_config import configs
import codecs
import numpy as np

if __name__ == "__main__":
    logger = init_logger(log_name='preprocess', log_dir=configs['log_dir'])
    # Save the data splits.
    save_data(configs['all_data_path'], configs['train_data_path'],
              configs['test_data_path'], configs['val_data_path'])
    with codecs.open(configs['all_data_path']) as f:
        content = f.readlines()
    length_list = []
    for line in content:
        line = line.strip()
        if len(line) == 0:
            continue
        length_list.append(len(line))
    # print(length_list)
    max_length = int(np.percentile(length_list, 95))
    logger.info(f"max length: {max_length}")

    json2text(configs['test_data_path'], configs['ptest_x_path'], key='text')
    train_data = get_data(configs['train_data_path'])
    val_data = get_data(configs['val_data_path'])
    train_collections = []
    val_collections = []
    for item in train_data:
        tags = ['O' for _ in range(len(item['text']))]
        label = item['label']
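        # Plausible continuation of the loop, given only as a sketch: the
        # original snippet is truncated above. Assumption: `label` maps
        # entity type -> {entity text: [[start, end], ...]} spans, as in
        # CLUENER-style NER data; the real tagging logic may differ.
        if label is not None:
            for tag_name, spans in label.items():
                for positions in spans.values():
                    for start, end in positions:
                        tags[start] = 'B-' + tag_name
                        tags[start + 1:end + 1] = ['I-' + tag_name] * (end - start)
        train_collections.append({'text': item['text'], 'tags': tags})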
def read_csv(file_path):
    with open(file_path) as f:
        df = pd.read_csv(f)
    return df


if __name__ == '__main__':
    feature_path = "../data/features"
    label_path = "../data/labels"
    # files_dir = r"E:\机器学习\数据集\音频\whale_data\Audio\train"
    # files = os.listdir(files_dir)
    # files.sort(key=lambda x: int(re.match(r"train(\d+).wav", x).group(1)))
    # features = []
    # for file in tqdm(files):
    #     path = os.path.join(files_dir, file)
    #     feature = wavfile_to_examples(path)
    #     feature = np.reshape(
    #         feature, (vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS))
    #     features.append(feature)
    #
    # data_utils.save_data(np.array(features), feature_path)

    df = read_csv(r'E:\机器学习\数据集\音频\whale_data\labels.csv')
    labels = []
    for label in df.label:
        labels.append(label)
    data_utils.save_data(np.array(labels), label_path)
def save_stats(path, stats_df):
    utils.ensure_path_exist(path)
    data_utils.save_data(stats_df, file_path=path)
def processing_data_2_step():
    process_base_path = '../input/processing/'
    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'

    english_core_nlp = StanfordCoreNLP(core_nlp_path, lang='en')
    with StanfordCoreNLP(core_nlp_path, lang='es') as client:
        english_processing_file = process_base_path + 'english_train.txt'
        spanish_processing_file = process_base_path + 'spanish_train.txt'
        test_processing_file = process_base_path + 'test_b.txt'

        englishs = read_data(english_file)
        spanishs = read_data(spanish_file)
        english_processing = []
        spanish_processing = []

        # for english
        for line in tqdm(englishs):
            lines = line.strip().split('\t')
            assert len(lines) == 3
            lines[1] = text_processing_english(lines[1], english_core_nlp)
            lines[2] = text_processing_english(lines[2], english_core_nlp)
            english_processing.append("{}\t{}\t{}\n".format(
                lines[0], lines[1], lines[2]))
        save_data(english_processing_file, english_processing)

        # for spanish
        for line in tqdm(spanishs):
            line = line.strip().split('\t')
            assert len(line) == 3, print(line)
            line[1] = text_processing_spanish(line[1], client)
            line[2] = text_processing_spanish(line[2], client)
            spanish_processing.append("{}\t{}\t{}\n".format(
                line[0], line[1], line[2]))
        save_data(spanish_processing_file, spanish_processing)

        # for test data
        test = read_data(test_file)
        test_processing = []
        for line in tqdm(test):
            # ipdb.set_trace()
            line = line.strip().split('\t')
            assert len(line) == 3
            line[2] = text_processing_spanish(line[2], client)
            line[1] = text_processing_spanish(line[1], client)
            test_processing.append("{}\t{}\t{}\n".format(
                line[0], line[1], line[2]))
        save_data(test_processing_file, test_processing)

    english_core_nlp.close()
    print('Done')
random_states = np.zeros([n_random] + list(states.shape[1:]), dtype=np.float32)
random_actions = np.zeros(n_random, dtype=np.int8) + 9
random.shuffle(images)
for idx in range(len(images[:n_random])):
    try:
        image = Image.open(images[idx])
    except:
        print('Failed on %s' % images[idx])
        os.remove(images[idx])
        continue
    image = image.convert('RGB')
    # The same frame is pushed several times, presumably so the processor's
    # frame history is filled before the final result is kept.
    processor.process_observation(np.array(image))
    processor.process_observation(np.array(image))
    random_states[idx] = processor.process_observation(np.array(image))

states = np.concatenate((states, random_states))
robot_actions = np.concatenate((robot_actions, random_actions))
human_actions = np.concatenate((human_actions, random_actions))

permutation = np.random.permutation(states.shape[0])
states = states[permutation]
robot_actions = robot_actions[permutation]
human_actions = human_actions[permutation]

save_data(mixed_data_file, states, robot_actions, human_actions)
print('Saved mixed states to %s\n' % mixed_data_file)
def to_file(self, path):
    utils.ensure_path_exist(path)
    data_utils.save_data(self, file_path=path)
        new_labeled_idices = np.squeeze(new_labeled_idices)
        self._data_l = np.concatenate(
            [self._data_l, self._data_u[new_labeled_idices]])
        np.random.shuffle(self._data_l)  # shuffle new labeled data.
        self._data_u = np.delete(self._data_u, new_labeled_idices, axis=0)
        if self._embeddings is not None:
            # Update embedded data cache.
            self._embedded_data_l = np.concatenate([
                self._embedded_data_l, self._embedded_data_u[new_labeled_idices]
            ])
            np.random.shuffle(self._embedded_data_l)  # shuffle new labeled embeddings.
            self._embedded_data_u = np.delete(self._embedded_data_u,
                                              new_labeled_idices,
                                              axis=0)

    def to_file(self, path):
        utils.ensure_path_exist(path)
        data_utils.save_data(self, file_path=path)


if __name__ == '__main__':
    _features = data_utils.load_data('../data/features')
    _labels = data_utils.load_data('../data/labels')
    pairs = [(f, l) for f, l in zip(_features, _labels)]
    data_utils.save_data(
        np.array(pairs, dtype=[('feature', np.ndarray), ('label', np.int)]),
        '../data/pairs')
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    return np.array(features), np.array(labels, dtype=np.int)


if __name__ == '__main__':
    _features, _labels = extract_features(
        parent_dir=r'E:\机器学习\数据集\音频\speech_commands_v0.01',
        dir_label_dic={
            'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
            'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9
        },
        file_ext='*.wav',
        bands=60,
        frames=41)
    _pairs = [(f, l) for f, l in zip(_features, _labels)]
    data_utils.save_data(
        obj=np.array(_pairs, dtype=[('feature', np.ndarray), ('label', np.int)]),
        file_path='../data/speech_command/pairs')
        label = file_name.split('-')[1]
        for (start, end) in windows(sound_clip, window_size):
            if len(sound_clip[start:end]) == window_size:
                signal = sound_clip[start:end]
                melspec = librosa.feature.melspectrogram(signal, n_mels=bands)
                logspec = librosa.amplitude_to_db(melspec)
                logspec = logspec.T.flatten()[:, np.newaxis].T
                log_specgrams.append(logspec)
                labels.append(label)

    log_specgrams = np.asarray(log_specgrams).reshape(
        len(log_specgrams), bands, frames, 1)
    features = np.concatenate(
        (log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    return np.array(features), np.array(labels, dtype=np.int)


if __name__ == '__main__':
    _features, _labels = extract_features(
        parent_dir=r'E:\机器学习\数据集\音频\UrbanSound8K\audio',
        sub_dirs=['fold%d' % i for i in range(1, 11)],
        file_ext='*.wav',
        bands=60,
        frames=41)
    _pairs = [(f, l) for f, l in zip(_features, _labels)]
    data_utils.save_data(
        obj=np.array(_pairs, dtype=[('feature', np.ndarray), ('label', np.int)]),
        file_path='../data/urbansound8k/pairs')
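# The `windows` generator used above is not defined in this snippet. A common
# implementation is half-overlapping windows over the clip; this is only an
# assumed sketch (name windows_sketch is hypothetical):
def windows_sketch(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += window_size // 2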