Example #1
def test_print(self):
    # pyaml.print, pyaml.pprint and pyaml.p are the same function
    self.assertIs(pyaml.print, pyaml.pprint)
    self.assertIs(pyaml.print, pyaml.p)
    # printing into a binary buffer must produce the same bytes as dump(dst=bytes)
    buff = io.BytesIO()
    b = pyaml.dump(data_str_multiline, dst=bytes)
    pyaml.print(data_str_multiline, file=buff)
    self.assertEqual(b, buff.getvalue())
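This test pins down two behaviors: pyaml.print, pyaml.pprint and pyaml.p are aliases for one function, and printing into a binary buffer yields exactly the bytes that pyaml.dump(..., dst=bytes) returns. A minimal standalone sketch of the same round-trip (the sample dict is a hypothetical stand-in for the suite's data_str_multiline fixture):

import io

import pyaml

data = {'text': 'first line\nsecond line'}  # hypothetical stand-in for data_str_multiline

buff = io.BytesIO()
pyaml.print(data, file=buff)  # writes encoded YAML into the binary buffer
assert buff.getvalue() == pyaml.dump(data, dst=bytes)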
Example #3
def test_print_args(self):
    # several positional arguments are dumped together as one YAML sequence
    buff = io.BytesIO()
    args = 1, 2, 3
    b = pyaml.dump(args, dst=bytes)
    pyaml.print(*args, file=buff)
    self.assertEqual(b, buff.getvalue())
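Like the builtin print, pyaml.print accepts multiple positional values; the test shows they are serialized as if dumped as a single tuple. A quick sketch of the same equivalence:

import io

import pyaml

buff = io.BytesIO()
pyaml.print(1, 2, 3, file=buff)  # equivalent to dumping the tuple (1, 2, 3)
assert buff.getvalue() == pyaml.dump((1, 2, 3), dst=bytes)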
Example #5
    'display_name': f'{score_type}/{n}',
    'dataset_type': 'validation'
} for score_type, names in metric_infos.items() for n in names]

experiment = {
    'name': 'proteins',
    'time_created_secs': int(args.datetime_created.timestamp())
}

# Dump bare classes as their name instead of failing on an unrepresentable type
pyaml.add_representer(
    type, lambda dumper, data: dumper.represent_str(data.__name__))
pyaml.print(
    {
        'experiment': experiment,
        'hparams': hparam_infos,
        'metrics': metric_infos
    },
    safe=True,
    sort_dicts=False,
    force_embed=True)

folder = (Path(args.folder) / 'hparams').expanduser().resolve()
with SummaryWriter(folder) as writer:
    experiment_summary = make_experiment_summary(hparam_infos, metric_infos,
                                                 experiment)
    writer.file_writer.add_summary(
        experiment_summary,
        walltime=int(args.datetime_created.timestamp()))

print('Experiment summary saved to', folder)
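The add_representer call is what keeps this dump from failing: the hparam values evidently include bare classes, and registering a representer for type serializes each class as its __name__ string. A self-contained sketch (the Dense class is a hypothetical placeholder):

import pyaml

# Serialize bare classes (instances of `type`) as their name
pyaml.add_representer(
    type, lambda dumper, data: dumper.represent_str(data.__name__))

class Dense:  # hypothetical placeholder for a real hparam value
    pass

pyaml.print({'layer_type': Dense})  # expected to emit "layer_type: Dense"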
Example #6
def process_file(filepath: Union[str, Path], destpath: Union[str, Path],
                 compress: bool):
    import tables

    filepath = Path(filepath).expanduser().resolve()
    h5_file = tables.open_file(filepath)

    destpath = Path(destpath).expanduser().resolve()
    destpath.mkdir(parents=True, exist_ok=False)

    scores = {
        'local': {
            'lddt': [],
            # 'sscore': [],
        },
        'global': {
            'gdtts': [],
            # 'gdtts_ha': [],
            # 'rmsd': [],
            # 'maxsub': [],
        }
    }

    sequence_lengths = []
    dataset_index = {
        'target': [],
        'decoy': [],
        'path': [],
        'index': [],
    }

    # First pass: enumerate proteins, count models, prepare frequency stats
    for protein in tqdm.tqdm(h5_file.list_nodes('/'),
                             desc='Targets ',
                             unit='t',
                             leave=False):
        target_name = protein._v_name

        # Unclear why the file contains these nodes, but they are definitely not proteins
        if target_name.startswith(('casp', 'cameo')):
            continue

        if len(protein.names) <= 5:
            tqdm.tqdm.write(
                f'[{target_name}] Skipping target with {len(protein.names)} model(s).',
                file=sys.stderr)
            continue

        if len(protein.seq[0]) <= 50:
            tqdm.tqdm.write(
                f'[{target_name}] Skipping target with length {len(protein.seq[0])}.',
                file=sys.stderr)
            continue

        target = make_target(protein)
        if compress:
            np.savez_compressed(destpath / f'{target_name}.npz', **target)
        else:
            np.savez(destpath / f'{target_name}.npz', **target)
        with open(destpath / f'{target_name}_info.yaml', 'w') as f:
            pyaml.dump(
                {
                    'target': target_name,
                    'num_residues': len(target['residues']),
                    'num_decoys': len(target['decoy_names']),
                },
                dst=f,
                sort_dicts=False)

        scores['local']['lddt'].append(target['lddt'].ravel())
        # scores['local']['sscore'].append(target['sscore'].ravel())
        scores['global']['gdtts'].append(target['gdtts'])
        # scores['global']['gdtts_ha'].append(target['gdtts_ha'])
        # scores['global']['rmsd'].append(target['rmsd'])
        # scores['global']['maxsub'].append(target['maxsub'])

        sequence_lengths.append(len(target['residues']))
        dataset_index['target'].extend([target['target_name']] *
                                       len(target['decoy_names']))
        dataset_index['decoy'].extend(target['decoy_names'])
        dataset_index['path'].extend([f'{target["target_name"]}.npz'] *
                                     len(target['decoy_names']))
        dataset_index['index'].extend(range(len(target['decoy_names'])))

    dataset_index = pd.DataFrame(dataset_index)
    dataset_index.to_csv(destpath / 'dataset_index.csv',
                         header=True,
                         index=False)

    # Bucketize the scores into 20 equally-spaced bins in the range [0, 1] and compute frequency counts for each bin
    bins = np.linspace(0, 1, num=20 + 1)
    for score_type in scores.keys():
        for score_name in scores[score_type].keys():
            scores[score_type][score_name] = pd.Series(
                np.concatenate(scores[score_type][score_name])).dropna()
            frequencies = pd.cut(
                scores[score_type][score_name], bins=bins,
                include_lowest=True).value_counts().sort_index()
            frequencies.to_pickle(destpath /
                                  f'{score_type}_{score_name}_frequencies.pkl')

    dataset_stats = {
        'dataset': Path(filepath).name,
        'num_targets': dataset_index['target'].nunique(),
        'num_decoys': len(dataset_index['decoy']),
        'avg_length': np.mean(sequence_lengths),
        'max_length': np.max(sequence_lengths),
        **{
            score_type: {
                score_name: scores[score_type][score_name].describe().to_dict()
                for score_name in scores[score_type]
            }
            for score_type in scores
        }
    }
    pyaml.print(dataset_stats, sort_dicts=False)
    with open(destpath / 'dataset_stats.yaml', 'w') as f:
        pyaml.dump(dataset_stats, dst=f, sort_dicts=False)

    h5_file.close()
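Two details of this example are worth calling out: pyaml.dump writes directly into an open file handle via dst=, and sort_dicts=False preserves dict insertion order in the emitted YAML. A minimal sketch (the stats dict is made up for illustration):

import pyaml

stats = {'dataset': 'example.h5', 'num_targets': 3, 'avg_length': 127.5}  # hypothetical values
with open('dataset_stats.yaml', 'w') as f:
    pyaml.dump(stats, dst=f, sort_dicts=False)  # keys keep their insertion order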