def test_print(self):
    self.assertIs(pyaml.print, pyaml.pprint)
    self.assertIs(pyaml.print, pyaml.p)
    buff = io.BytesIO()
    b = pyaml.dump(data_str_multiline, dst=bytes)
    pyaml.print(data_str_multiline, file=buff)
    self.assertEqual(b, buff.getvalue())
def test_print_args(self):
    buff = io.BytesIO()
    args = 1, 2, 3
    b = pyaml.dump(args, dst=bytes)
    pyaml.print(*args, file=buff)
    self.assertEqual(b, buff.getvalue())
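# Minimal usage sketch (not part of the test suite) of the API exercised by the
# two tests above: pyaml.print dumps its arguments as YAML to sys.stdout by
# default, or to a binary file object passed via file=, producing the same
# bytes as pyaml.dump(..., dst=bytes). The example data is illustrative only.
import io
import pyaml

data = {'greeting': 'hello', 'items': [1, 2, 3]}
buff = io.BytesIO()
pyaml.print(data, file=buff)
assert buff.getvalue() == pyaml.dump(data, dst=bytes)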
        'display_name': f'{score_type}/{n}',
        'dataset_type': 'validation',
    }
    for score_type, names in metric_infos.items()
    for n in names
]

experiment = {
    'name': 'proteins',
    'time_created_secs': int(args.datetime_created.timestamp()),
}

# Represent type objects by their class name so they can be dumped as YAML
pyaml.add_representer(type, lambda dumper, data: dumper.represent_str(data.__name__))
pyaml.print(
    {'experiment': experiment, 'hparams': hparam_infos, 'metrics': metric_infos},
    safe=True, sort_dicts=False, force_embed=True)

folder = (Path(args.folder) / 'hparams').expanduser().resolve()
with SummaryWriter(folder) as writer:
    experiment_summary = make_experiment_summary(hparam_infos, metric_infos, experiment)
    writer.file_writer.add_summary(
        experiment_summary, walltime=int(args.datetime_created.timestamp()))
    print('Experiment summary saved to', folder)
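# A minimal sketch of what make_experiment_summary might do (an assumption, not
# necessarily the author's helper): the TensorBoard HParams plugin provides
# tensorboard.plugins.hparams.summary.experiment_pb, which assembles the
# experiment summary from HParamInfo/MetricInfo protos, so the plain dicts
# built above would first be mapped onto those protos. The 'tag' key is
# hypothetical, and hparam_infos is assumed to already hold HParamInfo protos.
from tensorboard.plugins.hparams import api_pb2
from tensorboard.plugins.hparams import summary as hp_summary

def make_experiment_summary(hparam_infos, metric_infos, experiment):
    metrics = [
        api_pb2.MetricInfo(
            name=api_pb2.MetricName(tag=m['tag']),  # hypothetical dict key
            display_name=m['display_name'],
            dataset_type=api_pb2.DATASET_VALIDATION)
        for m in metric_infos]
    return hp_summary.experiment_pb(
        hparam_infos=hparam_infos, metric_infos=metrics,
        time_created_secs=experiment['time_created_secs'])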
def process_file(filepath: Union[str, Path], destpath: Union[str, Path], compress: bool):
    import tables

    filepath = Path(filepath).expanduser().resolve()
    h5_file = tables.open_file(filepath)
    destpath = Path(destpath).expanduser().resolve()
    destpath.mkdir(parents=True, exist_ok=False)

    scores = {
        'local': {
            'lddt': [],
            # 'sscore': [],
        },
        'global': {
            'gdtts': [],
            # 'gdtts_ha': [],
            # 'rmsd': [],
            # 'maxsub': [],
        },
    }
    sequence_lengths = []
    dataset_index = {
        'target': [],
        'decoy': [],
        'path': [],
        'index': [],
    }

    # First pass: enumerate proteins, count models, prepare frequency stats
    for protein in tqdm.tqdm(h5_file.list_nodes('/'), desc='Targets ', unit='t', leave=False):
        target_name = protein._v_name
        # I'm not sure why the file contains these nodes, but they are not proteins for sure
        if target_name.startswith(('casp', 'cameo')):
            continue
        if len(protein.names) <= 5:
            tqdm.tqdm.write(
                f'[{target_name}] Skipping target with {len(protein.names)} model(s).',
                file=sys.stderr)
            continue
        if len(protein.seq[0]) <= 50:
            tqdm.tqdm.write(
                f'[{target_name}] Skipping target with length {len(protein.seq[0])}.',
                file=sys.stderr)
            continue

        target = make_target(protein)
        if compress:
            np.savez_compressed(destpath / f'{target_name}.npz', **target)
        else:
            np.savez(destpath / f'{target_name}.npz', **target)
        with open(destpath / f'{target_name}_info.yaml', 'w') as f:
            pyaml.dump({
                'target': target_name,
                'num_residues': len(target['residues']),
                'num_decoys': len(target['decoy_names']),
            }, dst=f, sort_dicts=False)

        scores['local']['lddt'].append(target['lddt'].ravel())
        # scores['local']['sscore'].append(target['sscore'].ravel())
        scores['global']['gdtts'].append(target['gdtts'])
        # scores['global']['gdtts_ha'].append(target['gdtts_ha'])
        # scores['global']['rmsd'].append(target['rmsd'])
        # scores['global']['maxsub'].append(target['maxsub'])
        sequence_lengths.append(len(target['residues']))

        dataset_index['target'].extend([target['target_name']] * len(target['decoy_names']))
        dataset_index['decoy'].extend(target['decoy_names'])
        dataset_index['path'].extend([f'{target["target_name"]}.npz'] * len(target['decoy_names']))
        dataset_index['index'].extend(range(len(target['decoy_names'])))

    dataset_index = pd.DataFrame(dataset_index)
    dataset_index.to_csv(destpath / 'dataset_index.csv', header=True, index=False)

    # Bucketize the scores into 20 equally-spaced bins in the range [0, 1]
    # and compute frequency counts for each bin
    bins = np.linspace(0, 1, num=20 + 1)
    for score_type in scores.keys():
        for score_name in scores[score_type].keys():
            scores[score_type][score_name] = pd.Series(
                np.concatenate(scores[score_type][score_name])).dropna()
            frequencies = pd.cut(
                scores[score_type][score_name], bins=bins,
                include_lowest=True).value_counts().sort_index()
            frequencies.to_pickle(destpath / f'{score_type}_{score_name}_frequencies.pkl')

    dataset_stats = {
        'dataset': Path(filepath).name,
        'num_targets': dataset_index['target'].nunique(),
        'num_decoys': len(dataset_index['decoy']),
        'avg_length': np.mean(sequence_lengths),
        'max_length': np.max(sequence_lengths),
        **{
            score_type: {
                score_name: scores[score_type][score_name].describe().to_dict()
                for score_name in scores[score_type]
            }
            for score_type in scores.keys()
        },
    }
    pyaml.print(dataset_stats, sort_dicts=False)
    with open(destpath / 'dataset_stats.yaml', 'w') as f:
        pyaml.dump(dataset_stats, dst=f, sort_dicts=False)

    h5_file.close()
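# Hypothetical invocation (the file and destination paths are placeholders, not
# taken from the source): converts one H5 dataset of scored decoys into
# per-target .npz files plus the index, frequency, and stats files above.
if __name__ == '__main__':
    process_file('~/data/casp_decoys.h5', '~/data/processed/casp_decoys', compress=True)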