def save_and_plot(sequences, spectrograms, alignments, log_dir, step, loss, prefix):
    """Render and save plots for each (sequence, spectrogram, alignment) triple
    produced at the given training step, then log completion."""
    plot_fn = partial(save_and_plot_fn,
                      log_dir=log_dir, step=step, loss=loss, prefix=prefix)
    indexed_triples = list(enumerate(zip(sequences, spectrograms, alignments)))
    parallel_run(plot_fn, indexed_triples, parallel=False)
    log('Test finished for step {}.'.format(step))
def download_all(config):
    """Fetch the podcast RSS feed, print the accepted episode list, and
    download up to ``config.max_num`` episodes via iTunes."""
    audio_dir = os.path.join(base_path, "audio")
    makedirs(audio_dir)

    soup = BeautifulSoup(requests.get(RSS_URL).text, "html5lib")

    items = soup.find_all('item')
    titles = [entry.find('title').text[9:-3] for entry in items]
    guids = [entry.find('guid').text for entry in items]

    accept_list = ['친절한 인나씨', '반납예정일', '귀욤열매 드세요']

    def _accepted(title):
        # Keep titles matching the accept list; drop multi-part ('-') episodes.
        return any(accept in title for accept in accept_list) and '-' not in title

    new_guids = [guid for title, guid in zip(titles, guids) if _accepted(title)]
    new_titles = [title for title, _ in zip(titles, guids) if _accepted(title)]

    for idx, title in enumerate(new_titles):
        print(" [{:3d}] {}, {}".format(
            idx + 1, title, os.path.basename(new_guids[idx]).split('_')[2]))
        if idx == config.max_num:
            print("=" * 30)

    urls = {
        os.path.basename(guid).split('_')[2]: guid
        for guid in new_guids[:config.max_num]
    }

    parallel_run(itunes_download, urls.items(),
                 desc=" [*] Itunes download", parallel=True)
def combine_wavs_batch(audio_paths, **kargv):
    """Trim silence from every file in ``audio_paths`` (sorted in place).

    Extra keyword arguments are forwarded to ``trim_on_silence``.
    Always returns 0.
    """
    audio_paths.sort()
    trim_fn = partial(trim_on_silence, **kargv)
    parallel_run(trim_fn, audio_paths, desc="Trimming on silence", parallel=False)
    return 0
def split_on_silence_batch(audio_paths, method, **kargv):
    """Split each audio file on silence using the selected backend.

    Args:
        audio_paths: list of audio file paths; sorted in place.
        method: backend name (case-insensitive); only "pydub" is supported here.
        **kargv: forwarded to the backend splitter.

    Raises:
        ValueError: if ``method`` is not a supported backend.
    """
    audio_paths.sort()
    method = method.lower()

    if method == "pydub":
        fn = partial(split_on_silence_with_pydub, **kargv)
    else:
        # BUG FIX: previously `fn` stayed unbound for any other method and the
        # parallel_run call below raised a confusing NameError; fail fast instead.
        raise ValueError(" [!] Unsupported split method: {}".format(method))

    parallel_run(fn, audio_paths, desc="Split on silence", parallel=False)
def combine_wavs_batch(audio_paths, method, **kargv):
    """Split audio on silence, then greedily concatenate consecutive short
    segments (combined length capped at 15000) into new wav files, updating
    and persisting the alignment json.

    NOTE(review): `wavs`, `paths` and `silence` are not defined in this
    function or visible in this file — presumably module globals populated by
    the split step; verify before relying on this. `config` is likewise a
    module-level object exposing `alignment_path`.
    """
    audio_paths.sort()
    method = method.lower()
    if method == "librosa":
        fn = partial(split_on_silence_with_librosa, **kargv)
    elif method == "pydub":
        fn = partial(split_on_silence_with_pydub, **kargv)
    # NOTE(review): `fn` is unbound for any other method — NameError below.
    parallel_run(fn, audio_paths, desc="Split on silence", parallel=False)
    audio_path = audio_paths[0]
    spl = os.path.basename(audio_path).split('.', 1)
    # Output prefix: "<dir>/<basename-without-ext>."
    prefix = os.path.dirname(audio_path) + "/" + spl[0] + "."
    in_ext = audio_path.rsplit(".")[1]  # NOTE(review): computed but unused
    # path -> transcript mapping; combined entries are added below.
    data = load_json(config.alignment_path, encoding="utf8")
    #print(data)
    for i in range(len(wavs) - 1):
        # Skip segments already longer than the cap (presumably milliseconds
        # if these are pydub AudioSegments — TODO confirm).
        if len(wavs[i]) > 15000:
            continue
        if not paths[i] in data:
            continue
        sum = len(wavs[i])  # running combined length (shadows builtin `sum`)
        filename = prefix + str(i).zfill(4) + "."
        asr = data[paths[i]] + " "
        concated = wavs[i]
        for j in range(i + 1, len(wavs)):
            sum += len(wavs[j])
            sum += 400  # account for the inserted silence gap
            if sum > 15000:
                break
            if not paths[j] in data:
                break
            filename = filename + str(j).zfill(4) + "."
            asr = asr + data[paths[j]] + " "
            concated = concated + silence + wavs[j]
        final_fn = filename + "wav"
        # Register the combined transcript under the new combined file name.
        data[final_fn] = asr
        concated.export(final_fn, format="wav")
        print(filename + "wav | " + str(len(concated)))
    # Back up the existing alignment file before overwriting it.
    if os.path.exists(config.alignment_path):
        backup_file(config.alignment_path)
    write_json(config.alignment_path, data)
    get_durations(data.keys(), print_detail=False)
    return 0
def split_on_silence_batch(audio_paths, method, **kargv):
    """Split each audio file on silence via the librosa or pydub backend.

    Requires 'deepspeech' and 'min_segment_length' keys in ``kargv``; prints a
    notice when DeepSpeech-compatible (16000Hz/16bit) output is enabled.
    """
    audio_paths.sort()
    method = method.lower()

    use_deepspeech = kargv['deepspeech']
    min_segment_length = kargv['min_segment_length']  # required key; unused here
    if use_deepspeech:
        print('DeepSpeech compatibility enabled, using 16000Hz/16bit setting')

    if method == "librosa":
        splitter = partial(split_on_silence_with_librosa, **kargv)
    elif method == "pydub":
        splitter = partial(split_on_silence_with_pydub, **kargv)

    parallel_run(splitter, audio_paths, desc="Split on silence", parallel=False)
def align_text_batch(config):
    """Align recognized text for every item in the recognition file.

    Args:
        config: must expose ``recognition_path`` and ``score_threshold``.

    Returns:
        dict merging all per-item alignment results.

    Raises:
        Exception: if no aligner is defined for ``config.recognition_path``.
    """
    if "jtbc" in config.recognition_path.lower():
        align_text = partial(align_text_for_jtbc,
                             score_threshold=config.score_threshold)
    else:
        raise Exception(" [!] find_related_texts for `{}` is not defined".
                        format(config.recognition_path))

    results = {}
    data = load_json(config.recognition_path)

    items = parallel_run(align_text, data.items(),
                         desc="align_text_batch", parallel=True)
    for item in items:
        results.update(item)

    found_count = sum(isinstance(value, str) for value in results.values())

    # BUG FIX: previously printed raw fractions (e.g. "0.50000%") next to a
    # percent sign; scale by 100 so the figure is an actual percentage.
    print(" [*] # found: {:.5f}% ({}/{})".format(
        100 * len(results) / len(data), len(results), len(data)))
    print(" [*] # exact match: {:.5f}% ({}/{})".format(
        100 * found_count / len(items), found_count, len(items)))

    return results
def plot_and_save_parallel(wavs, alignments, use_manual_attention, mels):
    """Plot attention graphs and save audio for each synthesized wav.

    NOTE(review): `paths`, `texts`, `sequences` and most keyword values below
    are resolved from an enclosing scope not visible here — verify against
    the surrounding module.
    """
    indexed = list(enumerate(zip(wavs, alignments, paths, texts, sequences, mels)))
    plot_fn = partial(
        plot_graph_and_save_audio,
        base_path=base_path,
        start_of_sentence=start_of_sentence,
        end_of_sentence=end_of_sentence,
        pre_word_num=pre_word_num,
        post_word_num=post_word_num,
        pre_surplus_idx=pre_surplus_idx,
        post_surplus_idx=post_surplus_idx,
        use_short_concat=use_short_concat,
        use_manual_attention=use_manual_attention,
        librosa_trim=librosa_trim,
        attention_trim=attention_trim,
        time_str=time_str,
        isKorean=isKorean)
    return parallel_run(plot_fn, indexed,
                        desc="plot_graph_and_save_audio", parallel=False)
def get_text_from_audio_batch(paths, multi_process=False):
    """Run speech recognition on every path and merge the per-item dicts
    into a single {path: text} mapping."""
    merged = {}
    per_item = parallel_run(get_text_from_audio, paths,
                            desc="get_text_from_audio_batch")
    for result in per_item:
        merged.update(result)
    return merged
def get_path_dict(data_dirs,
                  hparams,
                  config,
                  data_type,
                  n_test=None,
                  rng=np.random.RandomState(123)):
    """Collect per-directory lists of .npz example paths, optionally filtered
    by frame and token counts.

    Args:
        data_dirs: directories to scan for '*.npz' files.
        hparams: dict with 'reduction_factor', 'min_iters', 'max_iters' and
            'min_tokens' (used only when filtering is enabled).
        config: must expose ``skip_path_filter``.
        data_type: 'train' (shuffled, drops the last n_test paths) or
            'test' (keeps only the last n_test paths).
        n_test: number of held-out examples (typically the batch size).
        rng: RandomState used for a deterministic training shuffle.

    Returns:
        dict mapping each data_dir to its selected list of paths.

    Raises:
        Exception: on an unknown ``data_type``.
    """
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(
                get_frame, paths,
                desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams['reduction_factor'] * hparams['min_iters']  # e.g. 5*30
            max_n_frame = hparams['reduction_factor'] * hparams['max_iters'] - 1

            # Many examples drop out here: clips outside the frame window or
            # with too few tokens are discarded.
            new_items = [
                (path, n) for path, n, n_tokens in items
                if min_n_frame <= n <= max_n_frame
                and n_tokens >= hparams['min_tokens']
            ]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]
            hours = frames_to_hours(new_n_frames, hparams)

            # BUG FIX: these logs referenced new_n_frames/hours even when
            # filtering was skipped (NameError) and max/min crashed on empty
            # lists; they now run only in the filtered branch with defaults.
            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(
                data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames, default=0)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames, default=0)))
        else:
            new_paths = paths

        # Split into train/test partitions.
        if data_type == 'train':
            new_paths = new_paths[:-n_test]  # everything except the last n_test
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]  # only the last n_test
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
def save_and_plot(sequences, spectrograms, alignments, log_dir, step, loss, prefix):
    """Save plots/audio for every test example produced at this training step."""
    save_fn = partial(save_and_plot_fn,
                      log_dir=log_dir, step=step, loss=loss, prefix=prefix)
    batch = list(enumerate(zip(sequences, spectrograms, alignments)))
    parallel_run(save_fn, batch, parallel=False)
    log('Test finished for step {}.'.format(step))
def combine_wavs_batch(audio_paths, method, **kargv):
    """Split audio on silence, then greedily concatenate consecutive short
    segments (combined length capped at 15000) into new wav files.

    NOTE(review): `wavs`, `paths` and `data` are never defined in this
    function or visible in this file — the sibling variant loads `data` via
    load_json, which this version does not. Presumably module globals;
    verify before use.
    """
    audio_paths.sort()
    method = method.lower()
    if method == "librosa":
        fn = partial(split_on_silence_with_librosa, **kargv)
    elif method == "pydub":
        fn = partial(split_on_silence_with_pydub, **kargv)
    # NOTE(review): `fn` is unbound for any other method — NameError below.
    parallel_run(fn, audio_paths, desc="Split on silence", parallel=False)
    audio_path = audio_paths[0]
    spl = os.path.basename(audio_path).split('.', 1)
    # Output prefix: "<dir>/<basename-without-ext>."
    prefix = os.path.dirname(audio_path) + "/" + spl[0] + "."
    in_ext = audio_path.rsplit(".")[1]  # NOTE(review): computed but unused
    for i in range(len(wavs) - 1):
        # Skip segments already longer than the cap (presumably milliseconds
        # if these are pydub AudioSegments — TODO confirm).
        if len(wavs[i]) > 15000:
            continue
        if not paths[i] in data:
            continue
        sum = len(wavs[i])  # running combined length (shadows builtin `sum`)
        filename = prefix + str(i).zfill(4) + "."
        asr = data[paths[i]] + " "
        concated = wavs[i]
        for j in range(i + 1, len(wavs)):
            sum += len(wavs[j])
            #sum += 200
            if sum > 15000:
                break
            if not paths[j] in data:
                break
            filename = filename + str(j).zfill(4) + "."
            asr = asr + data[paths[j]] + " "
            concated = concated + wavs[j]
        #if sum < 2000:
        #    continue
        final_fn = filename + "wav"
        # Register the combined transcript under the new combined file name.
        data[final_fn] = asr
        concated.export(final_fn, format="wav")
        print(filename + "wav | " + str(len(concated)))
    return 0
def get_path_dict(data_dirs, hparams, config, data_type, n_test=None,
                  rng=np.random.RandomState(123)):
    """Collect per-directory lists of .npz example paths, optionally filtered
    by frame count, token count and a per-speaker blacklist.

    Args:
        data_dirs: directories to scan for '*.npz' files.
        hparams: exposes reduction_factor, min_iters, max_iters, min_tokens.
        config: must expose ``skip_path_filter``.
        data_type: 'train' (shuffled, drops the last n_test paths) or
            'test' (keeps only the last n_test paths).
        n_test: number of held-out examples.
        rng: RandomState for a deterministic training shuffle.

    Returns:
        dict mapping each data_dir to its selected list of paths.

    Raises:
        Exception: on an unknown ``data_type``.
    """
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(get_frame, paths,
                                 desc="filter_by_min_max_frame_batch",
                                 parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters \
                - hparams.reduction_factor

            new_items = [(path, n) for path, n, n_tokens in items
                         if min_n_frame <= n <= max_n_frame
                         and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                # BUG FIX: was `any(check not in item[0] ...)`, which kept an
                # item unless it matched EVERY blacklist entry (blacklist was
                # effectively inert); an item must be dropped when it matches
                # ANY entry.
                new_items = [item for item in new_items
                             if not any(check in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]
            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(
                data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames, default=0)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames, default=0)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
def text_recognition_batch(paths, args, ds):
    """Transcribe every audio path with DeepSpeech and merge the per-file
    results into a single {path: text} dict."""
    paths.sort()
    recognize = partial(text_recognition, args=args, ds=ds)
    merged = {}
    for result in parallel_run(recognize, paths,
                               desc="text_recognition_batch_deepspeech",
                               parallel=False):
        merged.update(result)
    return merged
def text_recognition_batch(paths, config):
    """Transcribe every audio path in parallel and merge the per-file
    results into a single {path: text} dict."""
    paths.sort()
    recognize = partial(text_recognition, config=config)
    merged = {}
    for result in parallel_run(recognize, paths,
                               desc="text_recognition_batch", parallel=True):
        merged.update(result)
    return merged
def save_ttest_result_for_blender(events_id, cm_big='YlOrRd', cm_small='PuBu',
                                  threshold=2, norm_by_percentile=True,
                                  norm_percs=(1, 99), inverse_method='dSPM',
                                  do_print=False, n_jobs=1):
    """Render per-timepoint activity colour maps for Blender from saved
    t-test STC results, for every (condition, patient, hemisphere).

    Args:
        events_id: mapping of condition name -> event id.
        cm_big / cm_small: colormaps for positive / negative activity.
        threshold: activity threshold separating the two colour ranges.
        norm_by_percentile, norm_percs: normalization of activity extrema.
        inverse_method: inverse solution name used in result file names.
        do_print: forwarded to the colour worker.
        n_jobs: parallel worker count.
    """
    for cond_id, cond_name in enumerate(events_id.keys()):
        for patient in get_patients():
            results_file_name = op.join(
                LOCAL_ROOT_DIR, 'results_for_blender',
                '{}_{}_{}'.format(patient, cond_name, inverse_method))
            if op.isfile('{}-stc.h5'.format(results_file_name)):
                print('{}, {}'.format(patient, cond_name))
                stc = mne.read_source_estimate(results_file_name)
                data_max, data_min = utils.get_activity_max_min(
                    stc, norm_by_percentile, norm_percs, threshold)
                print(data_max, data_min)
                scalar_map_big = utils.get_scalar_map(threshold, data_max, cm_big)
                scalar_map_small = utils.get_scalar_map(data_min, -threshold, cm_small)
                for hemi in ['rh', 'lh']:
                    utils.check_stc_vertices(
                        stc, hemi,
                        op.join(BLENDER_DIR, 'fsaverage', '{}.pial.ply'.format(hemi)))
                    data = utils.stc_hemi_data(stc, hemi)
                    fol = '{}'.format(os.path.join(
                        BLENDER_DIR, 'fsaverage',
                        '{}_{}'.format(patient, cond_name),
                        'activity_map_{}').format(hemi))
                    utils.delete_folder_files(fol)
                    # BUG FIX: `xrange` does not exist in Python 3 — use range.
                    params = [(data[:, t], t, fol, scalar_map_big,
                               scalar_map_small, threshold, do_print)
                              for t in range(data.shape[1])]
                    utils.parallel_run(pool, _calc_activity_colors, params, n_jobs)
            else:
                print('no results for {} {}'.format(patient, cond_name))
def morph_stcs_to_fsaverage(events_id, stc_per_epoch=False, inverse_method='dSPM',
                            subjects_dir='', n_jobs=1):
    """Morph each subject's source estimates to the fsaverage brain.

    Args:
        events_id: mapping of condition name -> event id.
        stc_per_epoch: if True, morph every per-epoch STC file in parallel;
            otherwise morph the single per-condition STC (skipping ones that
            were already morphed).
        inverse_method: inverse solution name used in STC file names.
        subjects_dir: FreeSurfer subjects dir; falls back to $SUBJECTS_DIR.
        n_jobs: parallel worker count for the per-epoch path.
    """
    # BUG FIX: was `subjects_dir is ''` — identity comparison with a string
    # literal is unreliable (and a SyntaxWarning on modern Python).
    if subjects_dir == '':
        subjects_dir = os.environ['SUBJECTS_DIR']
    for subject in get_subjects():
        for cond_name in events_id.keys():
            print('morphing {}, {}'.format(subject, cond_name))
            if not stc_per_epoch:
                morphed_stc_file_name = op.join(
                    LOCAL_ROOT_DIR, 'stc_morphed',
                    '{}_{}_morphed_{}'.format(subject, cond_name, inverse_method))
                if op.isfile('{}-stc.h5'.format(morphed_stc_file_name)):
                    # Skip work that was already done on a previous run.
                    print('{} {} already morphed'.format(subject, cond_name))
                else:
                    local_stc_file_name = op.join(
                        LOCAL_ROOT_DIR, 'stc',
                        '{}_{}_{}'.format(subject, cond_name, inverse_method))
                    if op.isfile('{}-stc.h5'.format(local_stc_file_name)):
                        stc = mne.read_source_estimate(local_stc_file_name)
                        stc_morphed = mne.morph_data(
                            subject, 'fsaverage', stc, grade=5, smooth=20,
                            subjects_dir=subjects_dir)
                        stc_morphed.save(morphed_stc_file_name, ftype='h5')
                    else:
                        print("can't find stc file for {}, {}".format(
                            subject, cond_name))
            else:
                stcs = glob.glob(op.join(
                    LOCAL_ROOT_DIR, 'stc_epochs',
                    '{}_{}_*_{}-stc.h5'.format(subject, cond_name, inverse_method)))
                params = [(subject, cond_name, stc_file_name, inverse_method,
                           subjects_dir) for stc_file_name in stcs]
                utils.parallel_run(pool, _morphed_epochs_files, params, n_jobs)
def _run_backtests(self, debug):
    """Run a backtest for every (strategy, resample_period, ticker)
    combination and collect valid results plus their benchmark baselines.

    When ``debug`` is truthy, stops after the first collected backtest.
    """
    self.backtests = []
    self.baselines = []

    param_list = []
    combos = itertools.product(self.strategy_set, self.resample_periods,
                               self.tickers)
    for strategy, resample_period, ticker in combos:
        param_list.append({
            'strategy': strategy,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'transaction_currency': ticker.transaction_currency,
            'counter_currency': ticker.counter_currency,
            'resample_period': resample_period,
            'start_cash': self.start_cash,
            'start_crypto': self.start_crypto,
            'evaluate_profit_on_last_order': self.evaluate_profit_on_last_order,
            'verbose': False,
            'source': ticker.source,
            'order_generator': self.order_generator,
        })

    if self._parallelize:
        backtests = parallel_run(self._evaluate, param_list)
        logging.info("Parallel processing finished.")
    else:
        backtests = map(self._evaluate, param_list)

    for backtest in backtests:
        # Drop failed runs and runs without comparable profit figures.
        if backtest is None:
            continue
        if backtest.profit_percent is None \
                or backtest.benchmark_backtest.profit_percent is None:
            continue
        self.backtests.append(backtest)
        self.baselines.append(backtest.benchmark_backtest)
        if debug:
            break

    logging.info("Finished backtesting, building report...")
def get_path_dict(data_dirs,
                  hparams,
                  config,
                  data_type,
                  n_test=None,
                  rng=np.random.RandomState(123)):
    """Collect per-directory lists of .npz example paths, optionally filtered
    by frame count, token count and a per-speaker blacklist.

    Args:
        data_dirs: directories to scan for '*.npz' files.
        hparams: exposes reduction_factor, min_iters, max_iters, min_tokens.
        config: must expose ``skip_path_filter``.
        data_type: 'train' (shuffled, drops the last n_test paths) or
            'test' (keeps only the last n_test paths).
        n_test: number of held-out examples (typically the batch size).
        rng: RandomState for a deterministic training shuffle.

    Returns:
        dict mapping each data_dir to its selected list of paths.

    Raises:
        Exception: on an unknown ``data_type``.
    """
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(
                get_frame, paths,
                desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters  # e.g. 5*30
            max_n_frame = hparams.reduction_factor * hparams.max_iters \
                - hparams.reduction_factor  # e.g. 5*200 - 5

            # Many examples drop out here: clips outside the frame window or
            # with too few tokens are discarded.
            new_items = [(path, n) for path, n, n_tokens in items
                         if min_n_frame <= n <= max_n_frame
                         and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                # BUG FIX: was `any(check not in item[0] ...)`, which kept an
                # item unless it matched EVERY blacklist entry (blacklist was
                # effectively inert); an item must be dropped when it matches
                # ANY entry.
                new_items = [item for item in new_items
                             if not any(check in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]
            hours = frames_to_hours(new_n_frames, hparams)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(
                data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames, default=0)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames, default=0)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
def align_text_batch(config):
    """Align recognized text for every item in the recognition file and
    print found / exact-match statistics.

    Args:
        config: must expose ``recognition_path``, ``recognition_encoding``
            and ``score_threshold``.

    Returns:
        dict merging all per-item alignment results.
    """
    align_text = partial(align_text_fn, score_threshold=config.score_threshold)

    results = {}
    data = load_json(config.recognition_path,
                     encoding=config.recognition_encoding)

    items = parallel_run(align_text, data.items(),
                         desc="align_text_batch", parallel=True)
    for item in items:
        results.update(item)

    found_count = sum(isinstance(value, str) for value in results.values())

    # BUG FIX: previously printed raw fractions (e.g. "0.50000%") next to a
    # percent sign; scale by 100 so the figure is an actual percentage.
    print(" [*] # found: {:.5f}% ({}/{})".format(
        100 * len(results) / len(data), len(results), len(data)))
    print(" [*] # exact match: {:.5f}% ({}/{})".format(
        100 * found_count / len(items), found_count, len(items)))

    return results
def run_parallel_experiments(self, num_processes=8, rerun_existing=False,
                             display_results=True):
    """Run every experiment variant in parallel and save records of the
    results (skipping records for experiments that already existed)."""
    run_variant = partial(ExperimentManager.run_variant,
                          keep_record=True,
                          display_results=display_results,
                          rerun_existing=rerun_existing,
                          saved_figure_ext='.fig.png')
    records = parallel_run(run_variant, self.variants)
    for record in records:
        # A record is None when the experiment already existed and was skipped.
        if record is not None:
            self._save_rockstars(record)
def calc_all(events_id, tmin=None, tmax=None, overwrite=False,
             inverse_method='dSPM', baseline=(None, 0), apply_for_epochs=False,
             apply_SSP_projection_vectors=True, add_eeg_ref=True, n_jobs=1):
    """Fan out the full per-subject calculation pipeline across n_jobs workers."""
    job_args = [
        (subject, events_id, tmin, tmax, overwrite, inverse_method, baseline,
         apply_for_epochs, apply_SSP_projection_vectors, add_eeg_ref)
        for subject in get_subjects()
    ]
    utils.parallel_run(pool, _calc_all, job_args, n_jobs)
def _calc_evoked(events_id, epochs):
    """Run a per-subject computation in parallel over all subjects.

    NOTE(review): despite the name, this dispatches the `_calc_inverse`
    worker, not a `_calc_evoked` worker — possibly a copy-paste slip from the
    sibling `calc_inverse`; confirm the intended worker function.
    NOTE(review): `pool` and `n_jobs` are not defined in this function —
    presumably module-level globals; verify.
    """
    params = [(subject, events_id, epochs) for subject in get_subjects()]
    utils.parallel_run(pool, _calc_inverse, params, n_jobs)
def calc_inverse(epochs=None, overwrite=False):
    """Compute the inverse operator for every subject in parallel."""
    # NOTE(review): `pool` and `n_jobs` come from enclosing module scope.
    job_args = [(subject, epochs, overwrite) for subject in get_subjects()]
    utils.parallel_run(pool, _calc_inverse, job_args, n_jobs)
def calc_epoches(events_id, tmin, tmax, n_jobs=1):
    """Extract epochs for every subject in parallel."""
    job_args = [(subject, events_id, tmin, tmax) for subject in get_subjects()]
    utils.parallel_run(pool, _calc_epoches, job_args, n_jobs)
# Script-level code: collect news ids (cached to news_ids.json), then download
# each article's video and text in parallel.
# NOTE(review): `page_idx` and `news_ids` are read before any assignment
# visible here — presumably initialized earlier in the file; verify.
base_dir = os.path.dirname(os.path.realpath(__file__))
news_id_path = os.path.join(base_dir, "news_ids.json")

if not os.path.exists(news_id_path):
    # Crawl id pages until an empty page signals the end, then cache to disk.
    while True:
        tmp_ids = get_news_ids(page_idx)
        if len(tmp_ids) == 0:
            break
        news_ids.extend(tmp_ids)
        print(" [*] Download page {}: {}/{}".format(
            page_idx, len(tmp_ids), len(news_ids)))
        page_idx += 1
    with open(news_id_path, "w") as f:
        json.dump(news_ids, f, indent=2, ensure_ascii=False)
else:
    # Reuse the cached id list.
    with open(news_id_path) as f:
        news_ids = json.loads(f.read())

# Known-bad article id(s) excluded from the crawl.
exceptions = ["NB10830162"]
news_ids = list(set(news_ids) - set(exceptions))

fn = partial(download_news_video_and_content, base_dir=base_dir)
results = parallel_run(fn, news_ids,
                       desc="Download news video+text", parallel=True)
def create_stcs(events_id, epochs=None, evoked=None, inv=None,
                inverse_method='dSPM', baseline=(None, 0),
                apply_for_epochs=False, apply_SSP_projection_vectors=True,
                add_eeg_ref=True):
    """Create source estimates for every subject in parallel."""
    # NOTE(review): `pool` and `n_jobs` come from enclosing module scope.
    job_args = [
        (subject, events_id, epochs, evoked, inv, inverse_method, baseline,
         apply_for_epochs, apply_SSP_projection_vectors, add_eeg_ref)
        for subject in get_subjects()
    ]
    utils.parallel_run(pool, _create_stcs, job_args, n_jobs)