def main():
    plist = ProjectList.from_path('projects.yaml')
    # surface projects that are missing a description
    print([p for p in plist.project_list if not p.desc])
    topic_map = plist.get_projects_by_type('topic')
    topic_toc_text = format_tag_toc(topic_map)
    projects_by_topic = format_all_categories(topic_map)
    plat_map = plist.get_projects_by_type('platform')
    plat_toc_text = format_tag_toc(plat_map)
    projects_by_plat = format_all_categories(plat_map)
    context = {'TOPIC_TOC': topic_toc_text,
               'TOPIC_TEXT': projects_by_topic,
               'PLATFORM_TOC': plat_toc_text,
               'PLATFORM_TEXT': projects_by_plat,
               'TOTAL_COUNT': len(plist.project_list)}
    for filename in iter_find_files(TEMPLATES_PATH, '*.tmpl.md'):
        with open(filename) as f:
            tmpl_text = f.read()
        target_filename = os.path.split(filename)[1].replace('.tmpl', '')
        output_text = tmpl_text.format(**context)
        with atomic_save(target_filename) as f:
            f.write(output_text.encode('utf8'))
    return

def render(plist, pdir):
    "generate the list markdown from the yaml listing"
    topic_map = plist.get_projects_by_type('topic')
    topic_toc_text = format_tag_toc(topic_map)
    projects_by_topic = format_all_categories(topic_map)
    plat_map = plist.get_projects_by_type('platform')
    plat_toc_text = format_tag_toc(plat_map)
    projects_by_plat = format_all_categories(plat_map)
    context = {'TOPIC_TOC': topic_toc_text,
               'TOPIC_TEXT': projects_by_topic,
               'PLATFORM_TOC': plat_toc_text,
               'PLATFORM_TEXT': projects_by_plat,
               'TOTAL_COUNT': len(plist.project_list)}
    templates_path = pdir + '/templates/'
    if not os.path.isdir(templates_path):
        raise APACLIError('expected "templates" directory at %r' % templates_path)
    for filename in iter_find_files(templates_path, '*.tmpl.md'):
        with open(filename) as f:
            tmpl_text = f.read()
        target_filename = os.path.split(filename)[1].replace('.tmpl', '')
        output_text = tmpl_text.format(**context)
        with atomic_save(pdir + '/' + target_filename) as f:
            f.write(output_text.encode('utf8'))
    return

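# The two render functions above fill str.format-style placeholders in each
# *.tmpl.md template. A guess at what such a template might look like (this
# sample file is not part of the original code, and its heading text is
# made up):
#
#   # Awesome Projects ({TOTAL_COUNT} total)
#
#   {TOPIC_TOC}
#
#   {TOPIC_TEXT}
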
def make_pairs_dataset(self, path, n_hidden_messages, n_pairs):
    pairs = []
    files_by_speaker = defaultdict(list)
    unfiltered_wav_files = list(fileutils.iter_find_files(path, "*.wav"))
    wav_files = []
    for wav in unfiltered_wav_files:
        # filter out short files (under 3 seconds at 8kHz)
        try:
            if soundfile.read(wav)[0].shape[0] > 3 * 8000:
                wav_files.append(wav)
        except Exception:
            pass  # skip unreadable files
    for wav in wav_files:
        # the speaker id is the third-to-last path component
        speaker = int(wav.split('/')[-3])
        files_by_speaker[speaker].append(wav)
    for i in range(n_pairs):
        # draw the carrier and all hidden messages from a single speaker
        speaker = random.choice(list(files_by_speaker))
        sampled_files = random.sample(files_by_speaker[speaker],
                                      1 + n_hidden_messages)
        carrier_file, hidden_message_files = sampled_files[0], sampled_files[1:]
        pairs.append((carrier_file, hidden_message_files))
    return pairs

def _get_all_metric_mods(check_reqs=True):
    ret = []
    for metric_path in iter_find_files(METRICS_PATH, '*.py',
                                       ignored='__init__.py'):
        mod_name = os.path.splitext(os.path.split(metric_path)[-1])[0]
        metric_mod = imp.load_source(mod_name, metric_path)
        if not callable(getattr(metric_mod, 'collect', None)):
            print_err('skipping non-metric module at %r' % metric_path)
            continue
        if not check_reqs:
            ret.append(metric_mod)
            continue
        missing_env_vars = _check_required_env_vars(metric_mod)
        missing_cmds = _check_required_cmds(metric_mod)
        if missing_cmds:
            print_err('omitting metric "%s" due to missing commands: %s'
                      ' (see installation instructions above)'
                      % (metric_mod.__name__, ', '.join(missing_cmds)))
        elif missing_env_vars:
            print_err('omitting metric "%s" due to missing ENV variables: %s'
                      % (metric_mod.__name__, ', '.join(missing_env_vars)))
        else:
            ret.append(metric_mod)
    return ret

def _iter_changed_files(entries_path, theme_path, config_path, interval=0.5):
    mtimes = {}
    while True:
        changed = []
        to_check = itertools.chain([config_path],
                                   iter_find_files(entries_path, ENTRY_PATS),
                                   iter_find_files(theme_path, '*'))
        for path in to_check:
            try:
                new_mtime = os.stat(path).st_mtime
            except OSError:
                # file may have been removed between listing and stat
                continue
            old_mtime = mtimes.get(path)
            if not old_mtime or new_mtime > old_mtime:
                mtimes[path] = new_mtime
                changed.append(path)
        if changed:
            yield changed
        time.sleep(interval)

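# A minimal sketch of how the polling generator above might be consumed;
# rebuild_site is a hypothetical callback, not part of the original code.
def watch_and_rebuild(entries_path, theme_path, config_path, rebuild_site):
    for changed_paths in _iter_changed_files(entries_path, theme_path,
                                             config_path, interval=0.5):
        # each yielded batch is the list of paths that changed since the
        # previous poll
        rebuild_site(changed_paths)
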
def make_pairs_dataset(self, path, n_hidden_messages, n_pairs):
    pairs = []
    wav_files = list(fileutils.iter_find_files(path, "*.wav"))
    for i in range(n_pairs):
        # one carrier plus n_hidden_messages distinct hidden files
        sampled_files = random.sample(wav_files, 1 + n_hidden_messages)
        carrier_file, hidden_message_files = sampled_files[0], sampled_files[1:]
        pairs.append((carrier_file, hidden_message_files))
    return pairs

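# A hedged usage sketch for the two pair samplers above: each element of the
# returned list is (carrier_file, [hidden_message_file, ...]). The dataset
# object and directory are illustrative assumptions, not from the original:
#
#   pairs = dataset.make_pairs_dataset('/data/wavs', n_hidden_messages=3,
#                                      n_pairs=1000)
#   carrier, hidden = pairs[0]
#   assert len(hidden) == 3
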
def load(self):
    self.last_load = time.time()
    self._load_custom_mod()
    self._call_custom_hook('pre_load')
    self.html_renderer = AshesEnv(paths=[self.theme_path])
    self.html_renderer.load_all()
    self.md_renderer = AshesEnv(paths=[self.theme_path],
                                exts=['md'],
                                keep_whitespace=False)
    self.md_renderer.autoescape_filter = ''
    self.md_renderer.load_all()
    entries_path = self.paths['entries_path']
    entry_paths = []
    for entry_path in iter_find_files(entries_path, ENTRY_PATS):
        entry_paths.append(entry_path)
    entry_paths.sort()
    for ep in entry_paths:
        with chlog.info('entry load') as rec:
            try:
                entry = self._entry_type.from_path(ep)
                rec['entry_title'] = entry.title
                rec['entry_length'] = round(entry.get_reading_time(), 1)
            except IOError:
                rec.exception('unopenable entry path: {}', ep)
                continue
            except Exception:
                rec['entry_path'] = ep
                rec.exception('entry {entry_path} load error: {exc_message}')
                continue
            else:
                rec.success('entry loaded:'
                            ' {entry_title} ({entry_length}m)')
            if entry.is_draft:
                self.draft_entries.append(entry)
            elif entry.is_special:
                self.special_entries.append(entry)
            else:
                self.entries.append(entry)
    # Sorting the EntryLists
    self.entries.sort()
    # sorting drafts/special pages doesn't do much
    self.draft_entries.sort(key=lambda e: os.path.getmtime(e.source_path))
    self.special_entries.sort()
    self._rebuild_tag_map()
    for i, entry in enumerate(self.entries, start=1):
        start_next = max(0, i - NEXT_ENTRY_COUNT)
        entry.next_entries = self.entries[start_next:i - 1][::-1]
        entry.prev_entries = self.entries[i:i + PREV_ENTRY_COUNT]
    self._call_custom_hook('post_load')

def load(self):
    self.last_load = time.time()
    self._load_custom_mod()
    self._call_custom_hook('pre_load')
    self.html_renderer = AshesEnv(paths=[self.theme_path])
    self.html_renderer.load_all()
    self.md_renderer = AshesEnv(paths=[self.theme_path],
                                exts=['md'],
                                keep_whitespace=False)
    self.md_renderer.autoescape_filter = ''
    self.md_renderer.load_all()
    entries_path = self.paths['entries_path']
    entry_paths = []
    for entry_path in iter_find_files(entries_path, ENTRY_PATS):
        entry_paths.append(entry_path)
    entry_paths.sort()
    for ep in entry_paths:
        with chlog.info('entry load') as rec:
            try:
                entry = self._entry_type.from_path(ep)
            except IOError:
                rec.exception('unopenable entry path: {}', ep)
                continue
            except Exception:
                rec.exception('entry load error: {exc_message}')
                continue
            else:
                rec['entry_title'] = entry.title
                rec['entry_length'] = round(entry.get_reading_time(), 1)
                rec.success('entry loaded:'
                            ' {entry_title} ({entry_length}m)')
            if entry.is_draft:
                self.draft_entries.append(entry)
            elif entry.is_special:
                self.special_entries.append(entry)
            else:
                self.entries.append(entry)
    # Sorting the EntryLists
    self.entries.sort()
    # sorting drafts/special pages doesn't do much
    self.draft_entries.sort(key=lambda e: os.path.getmtime(e.source_path))
    self.special_entries.sort()
    self._rebuild_tag_map()
    for i, entry in enumerate(self.entries, start=1):
        start_next = max(0, i - NEXT_ENTRY_COUNT)
        entry.next_entries = self.entries[start_next:i - 1][::-1]
        entry.prev_entries = self.entries[i:i + PREV_ENTRY_COUNT]
    self._call_custom_hook('post_load')

def inject_noise_folder(wav_folder, noise_levels, n_items):
    if isinstance(noise_levels, float):
        noise_levels = [noise_levels]
    trg_dir = join(wav_folder, 'out')
    os.makedirs(trg_dir, exist_ok=True)
    wavs = list(fileutils.iter_find_files(wav_folder, "*.wav"))
    for noise_level in noise_levels:
        for i in range(n_items):
            # pick a distinct (carrier, noise) pair for each output file
            w1, w2 = random.sample(wavs, 2)
            inject_noise_sample(w1, w2,
                                join(trg_dir, f"{i}_{noise_level}_noise.wav"),
                                noise_level)

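# inject_noise_sample is not defined in these snippets; a minimal sketch of
# what it is assumed to do (mix a noise clip into a carrier clip at a given
# level), reusing soundfile as in the filtering code above. Illustrative
# only, not the original implementation; assumes mono wavs.
import numpy as np
import soundfile

def inject_noise_sample_sketch(carrier_path, noise_path, out_path, noise_level):
    carrier, sr = soundfile.read(carrier_path)
    noise, _ = soundfile.read(noise_path)
    # tile the noise if it is shorter than the carrier, then trim to length
    if len(noise) < len(carrier):
        noise = np.tile(noise, int(np.ceil(len(carrier) / len(noise))))
    soundfile.write(out_path, carrier + noise_level * noise[:len(carrier)], sr)
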
def _make_dataset(self):
    files = []
    wavs = list(iter_find_files(self.wav_path, "*.wav"))
    if self.hparams.devrun:
        # dev runs only load a small subset for quick iteration
        wavs = wavs[:self.hparams.devrun_size]
    for wav in tqdm(wavs, desc="loading data into memory"):
        res = self.process_file(wav)
        if res is not None:
            files.append(res)
    return files

def getAllImagesFromFolder(self):
    """
    This method returns all PNG/JPG/JPEG images in the folder
    ./resources/exampleImages/
    :return: List of filepath strings
    """
    filePath = propertyHolder.imageFolderDir
    fileGenerator = fileutils.iter_find_files(
        filePath, patterns=['*.png', '*.jpg', '*.jpeg'])
    return list(fileGenerator)

def generate_stubs(self, path: Path) -> List[Tuple[Path, Path]]:
    """Generate Stub Files from a package.

    Args:
        path (Path): Path to package.

    Returns:
        List[Tuple[Path, Path]]: List of tuples containing a path to the
            original file and stub, respectively.
    """
    py_files = fileutils.iter_find_files(str(path), patterns="*.py",
                                         ignored=self._ignore_stubs)
    stubs = [utils.generate_stub(f) for f in py_files]
    return stubs

def main(args):
    try:
        parser = argparse.ArgumentParser(
            description='copy all wav files from all sub dirs to out_dir')
        parser.add_argument('--input_dir', type=str,
                            help='Path to TextGrid dir', required=True)
        parser.add_argument('--output_dir', type=str,
                            help='Path to output dir', required=True)
        args = parser.parse_args(args)
        assert os.path.exists(args.input_dir), \
            f"Invalid path, couldn't find [{args.input_dir}]"
        assert os.path.exists(args.output_dir), \
            f"Invalid path, couldn't find [{args.output_dir}]"
        # match both lower- and upper-case extensions
        wav_files = (list(fileutils.iter_find_files(args.input_dir, "*.wav"))
                     + list(fileutils.iter_find_files(args.input_dir, "*.WAV")))
        files_dict = {}
        for counter, file in enumerate(wav_files):
            files_dict[counter] = file
            target = os.path.join(args.output_dir, f"{counter}.wav")
            if os.path.exists(target):
                os.remove(target)
            copyfile(file, target)
        print(f"Finished copying '*.wav' files to {args.output_dir}")
        with open(os.path.join(args.output_dir, files_dict_fname), 'w') as f:
            f.write(f"input_dir : {args.input_dir}\n")
            f.write(f"output_dir : {args.output_dir}\n")
            for k, v in files_dict.items():
                f.write(f"{k}:{v}\n")
        print(f"Finished writing the files dictionary to "
              f"{os.path.join(args.output_dir, files_dict_fname)}")
    except Exception as e:
        print(f"Failed to process the data, error: {e}")
        exit(1)  # FAIL

def show_recent_metrics(metrics_dir):
    "shows the most recent metrics collection"
    metrics_files = sorted(iter_find_files(metrics_dir, '*.jsonl'),
                           reverse=True)
    if not metrics_files:
        print_err('no recent metrics found at %s' % metrics_dir)
        return
    metrics_file = metrics_files[0]
    print('# ' + os.path.basename(metrics_file) + '\n')
    for line in open(metrics_file):
        try:
            print(line, end='')
        except IOError:
            # stop printing if the output pipe closes (e.g., piped to head)
            break
    return

def from_timestamp(cls, campaign, timestamp, full=True):
    strf_tmpl = STATE_FULL_PATH_TMPL if full else STATE_PATH_TMPL
    # this handles when a date object is passed in for timestamp
    # (instead of a datetime)
    strf_tmpl = strf_tmpl.replace('000000', '*')
    start_pattern = timestamp.strftime(strf_tmpl)
    dir_path = campaign.base_path + os.path.split(start_pattern)[0]
    file_paths = sorted(
        iter_find_files(dir_path, os.path.split(start_pattern)[1]))
    try:
        first_path = file_paths[0]
    except IndexError:
        raise StateNotFound('no state found for campaign %r at timestamp %s'
                            % (campaign, timestamp))
    return cls.from_json_path(campaign, first_path, full=full)

def get_state_filepaths(data_dir, full=True):
    pattern = STATE_FULL_FN_GLOB if full else STATE_FN_GLOB
    return sorted(iter_find_files(data_dir, pattern))

def render(plist, pdir, pfile):
    "generate the list markdown from the yaml listing"
    normalize(pfile=pfile, plist=plist)
    topic_map = plist.get_projects_by_type('topic')
    topic_toc_text = format_tag_toc(topic_map)
    projects_by_topic = format_all_categories(topic_map)
    plat_map = plist.get_projects_by_type('platform')
    plat_toc_text = format_tag_toc(plat_map)
    projects_by_plat = format_all_categories(plat_map)
    context = {'TOPIC_TOC': topic_toc_text,
               'TOPIC_TEXT': projects_by_topic,
               'PLATFORM_TOC': plat_toc_text,
               'PLATFORM_TEXT': projects_by_plat,
               'TOTAL_COUNT': len(plist.project_list)}
    templates_path = pdir + '/templates/'
    if not os.path.isdir(templates_path):
        raise APACLIError('expected "templates" directory at %r' % templates_path)
    for filename in iter_find_files(templates_path, '*.tmpl.md'):
        with open(filename) as f:
            tmpl_text = f.read()
        target_filename = os.path.split(filename)[1].replace('.tmpl', '')
        output_text = tmpl_text.format(**context)
        with atomic_save(pdir + '/' + target_filename) as f:
            f.write(output_text.encode('utf8'))

    feed_tmpl_path = templates_path + '/atom.xml'
    if os.path.exists(feed_tmpl_path):
        def _stderr_log_func(level, name, message):
            import sys
            sys.stderr.write('%s - %s - %s\n' % (level.upper(), name, message))
            sys.stderr.flush()

        ashes_env = AshesEnv([templates_path], log_func=_stderr_log_func)
        proj_dict_list = []
        for proj in plist.project_list:
            cur = proj.to_dict()
            cur['name_slug'] = proj.name_slug
            cur['date_added_utc'] = proj.date_added.isoformat() + 'Z'
            cur['urls'] = get_url_list(proj)
            proj_dict_list.append(cur)
        cur_dt = datetime.datetime.utcnow().replace(
            microsecond=0).isoformat() + 'Z'
        res = ashes_env.render('atom.xml',
                               {'projects': sorted(proj_dict_list,
                                                   key=lambda x: x['date_added'],
                                                   reverse=True),
                                'last_generated_utc': cur_dt})
        with atomic_save(pdir + '/atom.xml') as f:
            f.write(res.encode('utf8'))
    return

def export_metrics(plist, earliest, metrics_dir, metrics=None,
                   output_path=None, output_format=None,
                   _show_exportable=False):
    "export a csv with metrics collated from previous collect-metrics runs"
    metric_mods = all_metric_mods = _get_all_metric_mods(check_reqs=False)
    if metrics:
        metric_mods = [m for m in metric_mods if m.__name__ in metrics]
    if not metric_mods:
        print_err('failed to collect data. no known metrics selected'
                  ' (available: %s)'
                  % ', '.join([m.__name__ for m in all_metric_mods]))
        return
    metrics_map = {(m.__name__, p.name_slug): None
                   for m in all_metric_mods
                   for p in plist.project_list}
    metrics_files = iter_find_files(metrics_dir, '*.jsonl')
    earliest_text = earliest.isoformat()
    files_to_search = []
    for metric_file in metrics_files:
        metric_base_fn = os.path.basename(os.path.splitext(metric_file)[0])
        _, run_dt_text, newest_dt_text, oldest_dt_text = metric_base_fn.split('__')
        if newest_dt_text < earliest_text:
            print('skipping', metric_file)
            continue
        files_to_search.append(metric_file)
        with open(metric_file) as f:
            # TODO: possible optimization when searching for a
            # specific project/metric. search for the project name
            # slug and metric name in the part of the line before the
            # result begins (the jsonl keys are carefully chosen to
            # sort nicely)
            for line_data in JSONLIterator(f):
                metric_name = line_data['metric_name']
                proj_slug = line_data['project']
                try:
                    cur_data = metrics_map[metric_name, proj_slug]
                except KeyError:
                    # not a tracked project/metric
                    continue
                if (cur_data is None
                        or cur_data['pull_date'] < line_data['pull_date']):
                    metrics_map[metric_name, proj_slug] = line_data

    possible_paths = IndexedSet()
    for (metric_name, proj_slug), data in metrics_map.items():
        if data is None:
            continue

        def _visit(path, key, value):
            if not isinstance(value, (list, dict)):
                possible_paths.add((metric_name,) + path + (key,))
            return True

        remap(data['result'], visit=_visit)

    # TODO: deal with missing metrics
    # TODO: output csv or something
    '''
    --cols 'license.total,evcs.*, sloc.TOTAL_*
    --cols-file

    if col.endswith('*'): pop the segment with the star, fetch up
    until that point, then fetch/flatten everything underneath
    '''
    possible_paths = sorted(possible_paths)
    path_texts = ['.'.join('%s' % s for s in path) for path in possible_paths]
    if _show_exportable:
        print('\n'.join(path_texts))
        print('Showing %s exportable columns.' % len(possible_paths))
        return

    # for each project, output project_name, ...cols..., pull_date
    cols = path_texts
    all_proj_dicts = []
    for project in plist.project_list:
        cur_proj_dict = {'name': project.name_slug}
        for col in cols:
            metric_mod_name, glom_path = col.split('.', 1)
            cur_result_dict = (metrics_map[metric_mod_name, project.name_slug]
                               or {'result': {}})['result']
            cur_proj_dict[col] = glom.glom(cur_result_dict, glom_path,
                                           default='')
            if cur_proj_dict[col] is None:
                cur_proj_dict[col] = ''
        all_proj_dicts.append(cur_proj_dict)

    # TODO: + ['pull_date'] (oldest of all the collated metrics or?)
    all_cols = ['name'] + cols
    with open('apatite_export.csv', 'w') as f:
        w = csv.DictWriter(f, all_cols)
        w.writeheader()
        for proj_dict in all_proj_dicts:
            w.writerow(proj_dict)
    print('exported %s columns for %s projects across %s metrics (%s)'
          % (len(all_cols), len(all_proj_dicts), len(metric_mods),
             ', '.join(sorted(m.__name__ for m in metric_mods))))
    return

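# The column export above leans on glom's dotted-path access with a default
# for missing data; a tiny standalone illustration of the same call pattern
# (the sample row below is made up):
import glom

row = {'license': {'total': 12}}
assert glom.glom(row, 'license.total', default='') == 12
assert glom.glom(row, 'evcs.count', default='') == ''
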
def __init__(self, path):
    self.path = path
    self.data = list(iter_find_files(self.path, "*.wav"))
    super(WavPhnDataset, self).__init__()

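# Every snippet above is built on the same helper: iter_find_files from
# boltons.fileutils, which lazily walks a directory tree and yields paths
# matching one or more glob patterns, optionally skipping ignored patterns.
# A standalone example, assuming the boltons package is installed:
from boltons.fileutils import iter_find_files

for path in iter_find_files('.', patterns=['*.py'], ignored='__init__.py'):
    print(path)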