def _list_dir(dir_path: str) -> _DirEntries:
  """Lists a directory, returning a stat entry for each child keyed by name."""
  ents: _DirEntries = {}
  for name in gfile.listdir(dir_path):
    path = dir_path + '/' + name
    stat = gfile.stat(path)
    ents[name] = _FileStat(
        length=stat.length,
        mtime_nsec=stat.mtime_nsec,
        is_directory=stat.is_directory)
  return ents
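# Illustrative usage sketch (not from the original module): walks a tree by
# recursing on _list_dir. It assumes `gfile` and the _FileStat/_DirEntries
# types referenced above are in scope; the helper name _print_tree is
# hypothetical.
def _print_tree(root: str) -> None:
  """Prints every path under `root`, marking directories."""
  for name, stat in sorted(_list_dir(root).items()):
    child = root + '/' + name
    print(child, '(dir)' if stat.is_directory else '%d bytes' % stat.length)
    if stat.is_directory:
      _print_tree(child)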
def copy(src, dst, progress=True, block_size=1024 * 1024 * 10):
  """Copies a file with progress bar.

  Args:
    src: Source file. Path must be readable by `tf.io.gfile`.
    dst: Destination file. Path must be writable by `tf.io.gfile`.
    progress: Whether to show a progress bar.
    block_size: Size of individual blocks to be read/written.
  """
  stats = gfile.stat(src)
  n = int(np.ceil(stats.length / block_size))
  range_or_trange = tqdm.trange if progress else range
  with gfile.GFile(src, 'rb') as fin:
    with gfile.GFile(dst, 'wb') as fout:
      for _ in range_or_trange(n):
        fout.write(fin.read(block_size))
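# Example call (hypothetical paths; not part of the original file). Any
# filesystem tf.io.gfile understands works, e.g. local paths or GCS URLs:
#
#   copy('/tmp/checkpoint.bin', 'gs://some-bucket/checkpoint.bin',
#        progress=True, block_size=1024 * 1024)
#
# Note the final read() may return fewer than block_size bytes at EOF;
# writing that short block is harmless, so no special-casing is needed.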
def test_whole_pipeline(self):
  test_subdirectory = self.create_tempdir()
  output_stem = os.path.join(test_subdirectory, 'testout')
  input_stage1_dat_glob = os.path.join(TESTDATA_PATH,
                                       'pipeline_input_stage1.dat')
  input_stage2_dat_glob = os.path.join(TESTDATA_PATH,
                                       'pipeline_input_stage2.dat')
  input_equivalent_glob = os.path.join(TESTDATA_PATH,
                                       'pipeline_equivalent.dat')
  input_bond_topology_csv = os.path.join(TESTDATA_PATH,
                                         'pipeline_bond_topology.csv')
  with flagsaver.flagsaver(
      input_stage1_dat_glob=input_stage1_dat_glob,
      input_stage2_dat_glob=input_stage2_dat_glob,
      input_equivalent_glob=input_equivalent_glob,
      input_bond_topology_csv=input_bond_topology_csv,
      output_stem=output_stem,
      output_shards=1):
    # If you have custom beam options, add them here.
    beam_options = None
    with beam.Pipeline(beam_options) as root:
      pipeline.pipeline(root)

  metrics = root.result.metrics().query()
  counters_dict = {
      m.key.metric.name: m.committed for m in metrics['counters']
  }

  self.assertEqual(counters_dict['attempted_topology_matches'], 3)
  # Conformer 620517 will not match because bond lengths are not extracted
  # from conformers with serious errors like this.
  self.assertEqual(counters_dict['no_topology_matches'], 1)
  self.assertNotIn('topology_match_smiles_failure', counters_dict)

  logging.info('Files in output: %s',
               '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))
  for stage in ['stage1', 'stage2']:
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_original_known_error-00000-of-00001.dat'))
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_original_unknown_error-00000-of-00001.dat'))
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_mismatched_original-00000-of-00001.dat'))
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_mismatched_regen-00000-of-00001.dat'))

  # Check the merge conflicts file
  with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
    conflicts_lines = f.readlines()
    self.assertIn('conformer_id,', conflicts_lines[0])
    self.assertEqual(
        conflicts_lines[1], '618451001,1,1,1,1,'
        '-406.51179,9.999999,-406.522079,9.999999,True,True,'
        '-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

  # Check a couple of the stats.
  with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
    stats_lines = f.readlines()
    self.assertIn('errors.status,0,2\n', stats_lines)
    self.assertIn('errors.warn_t1,0,4\n', stats_lines)
    self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
    self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n', stats_lines)
    self.assertIn('num_initial_geometries,1,4\n', stats_lines)
    self.assertIn('num_duplicates,1,1\n', stats_lines)
    self.assertIn('zero_field,single_point_energy_pbe0d3_6_311gd,1\n',
                  stats_lines)

  # Check the smiles comparison output
  with gfile.GFile(output_stem + '_smiles_compare-00000-of-00001.csv') as f:
    smiles_lines = f.readlines()
    self.assertIn(
        '620517002,MISMATCH,NotAValidSmilesString,'
        '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
    # Make sure that a bond topology with a matching smiles doesn't show
    for line in smiles_lines:
      self.assertNotIn('618451001', line)

  # Check the bond topology summary
  with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
    bt_summary_lines = f.readlines()
    # Check part of the header line
    self.assertIn('bt_id', bt_summary_lines[0])
    self.assertIn('count_attempted_conformers', bt_summary_lines[0])
    # This is the bond topology that has no conformer
    self.assertIn('10,0,0,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
    # This is a bond topology with 1 conformer
    self.assertIn('620517,1,0,0,0,1,0,1,0,0,0,0,0\n', bt_summary_lines)
    # This is a bond topology with 2 conformers
    self.assertIn('618451,2,0,0,0,2,0,0,0,2,0,0,0\n', bt_summary_lines)

  # Check the bond lengths file
  with gfile.GFile(output_stem + '_bond_lengths.csv') as f:
    bond_length_lines = f.readlines()
    self.assertEqual('atom_char_0,atom_char_1,bond_type,length_str,count\n',
                     bond_length_lines[0])
    self.assertIn('c,c,2,1.336,1\n', bond_length_lines)
    self.assertIn('c,o,1,1.422,2\n', bond_length_lines)

  # For the gzip files below, we check >100 because even an empty gzip file
  # has non-zero length. 100 is kind of arbitrary to be bigger than the
  # expected header of 20.
  self.assertGreater(
      gfile.stat(output_stem +
                 '_complete_json-00000-of-00003.json.gz').length +
      gfile.stat(output_stem +
                 '_complete_json-00001-of-00003.json.gz').length +
      gfile.stat(output_stem +
                 '_complete_json-00002-of-00003.json.gz').length, 100)
  self.assertGreater(
      gfile.stat(output_stem +
                 '_standard_json-00000-of-00001.json.gz').length, 100)

  # Check that the generated TFRecord files contain some expected outputs
  standard_dataset = tf.data.TFRecordDataset(
      output_stem + '_standard_tfrecord-00000-of-00001')
  standard_output = [
      dataset_pb2.Conformer.FromString(raw)
      for raw in standard_dataset.as_numpy_iterator()
  ]
  self.assertCountEqual([c.conformer_id for c in standard_output],
                        [618451001, 618451123])
  # Check that fields are filtered the way we expect
  self.assertFalse(
      standard_output[0].properties.HasField('compute_cluster_info'))
  self.assertFalse(
      standard_output[0].properties.HasField('homo_pbe0_aug_pc_1'))
  self.assertTrue(
      standard_output[0].properties.HasField('rotational_constants'))

  complete_dataset = tf.data.TFRecordDataset(
      output_stem + '_complete_tfrecord-00000-of-00001')
  complete_output = [
      dataset_pb2.Conformer.FromString(raw)
      for raw in complete_dataset.as_numpy_iterator()
  ]
  self.assertCountEqual([c.conformer_id for c in complete_output],
                        [618451001, 618451123, 620517002, 79593005])
  # Check that fields are filtered the way we expect
  # The DirectRunner randomizes the order of output so we need to make sure
  # that we get a full record.
  complete_entry = [
      c for c in complete_output if c.conformer_id == 618451001
  ][0]
  self.assertFalse(complete_entry.properties.HasField('compute_cluster_info'))
  self.assertTrue(complete_entry.properties.HasField('homo_pbe0_aug_pc_1'))
  self.assertTrue(complete_entry.properties.HasField('rotational_constants'))

  complete_entry_for_smiles = [
      c for c in complete_output if c.conformer_id == 620517002
  ][0]
  self.assertEqual(complete_entry_for_smiles.properties.smiles_openbabel,
                   'NotAValidSmilesString')
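# The '-00000-of-00001' suffixes asserted above follow Beam's default shard
# naming (shard index / shard count, both zero-padded to five digits). A
# hypothetical helper, shown only to document the convention:
def _shard_name(stem, suffix, shard, num_shards, ext):
  """Builds e.g. 'testout_conflicts-00000-of-00001.csv'."""
  return '%s_%s-%05d-of-%05d%s' % (stem, suffix, shard, num_shards, ext)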
def test_whole_pipeline(self):
  test_subdirectory = self.create_tempdir()
  output_stem = os.path.join(test_subdirectory, 'testout')
  input_stage1_dat_glob = os.path.join(TESTDATA_PATH,
                                       'pipeline_input_stage1.dat')
  input_stage2_dat_glob = os.path.join(TESTDATA_PATH,
                                       'pipeline_input_stage2.dat')
  input_equivalent_glob = os.path.join(TESTDATA_PATH,
                                       'pipeline_equivalent.dat')
  input_bond_topology_csv = os.path.join(TESTDATA_PATH,
                                         'pipeline_bond_topology.csv')
  with flagsaver.flagsaver(
      input_stage1_dat_glob=input_stage1_dat_glob,
      input_stage2_dat_glob=input_stage2_dat_glob,
      input_equivalent_glob=input_equivalent_glob,
      input_bond_topology_csv=input_bond_topology_csv,
      output_stem=output_stem,
      output_shards=1):
    # If you have custom beam options, add them here.
    beam_options = None
    with beam.Pipeline(beam_options) as root:
      pipeline.pipeline(root)

  logging.info(
      'Files in output: %s',
      # '*' (not '/*'): os.path.join discards earlier components before an
      # absolute path, so '/*' would glob the filesystem root instead.
      '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))
  for stage in ['stage1', 'stage2']:
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_original_known_error-00000-of-00001.dat'))
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_original_unknown_error-00000-of-00001.dat'))
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_mismatched_original-00000-of-00001.dat'))
    self.assertTrue(
        gfile.exists(output_stem + '_' + stage +
                     '_mismatched_regen-00000-of-00001.dat'))

  # Check the merge conflicts file
  with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
    conflicts_lines = f.readlines()
    self.assertIn('conformer_id,', conflicts_lines[0])
    self.assertEqual(
        conflicts_lines[1], '618451001,'
        '1,1,1,1,-406.51179,9.999999,-406.522079,9.999999,True,True,'
        '1,1,1,1,-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

  # Check a couple of the stats.
  with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
    stats_lines = f.readlines()
    self.assertIn('error_nsvg09,0,4\n', stats_lines)
    self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
    self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n', stats_lines)
    self.assertIn('num_initial_geometries,1,4\n', stats_lines)
    self.assertIn('num_duplicates,1,1\n', stats_lines)

  # Check the smiles comparison output
  with gfile.GFile(output_stem + '_smiles_compare-00000-of-00001.csv') as f:
    smiles_lines = f.readlines()
    self.assertIn(
        '620517002,MISMATCH,NotAValidSmilesString,'
        '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
    # Make sure that a bond topology with a matching smiles doesn't show
    for line in smiles_lines:
      self.assertNotIn('618451001', line)

  # Check the bond topology summary
  with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
    bt_summary_lines = f.readlines()
    # Check part of the header line
    self.assertIn('bt_id', bt_summary_lines[0])
    self.assertIn('count_attempted_conformers', bt_summary_lines[0])
    # This is the bond topology that has no conformer
    self.assertIn('10,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
    # This is a bond topology with 1 conformer
    self.assertIn('620517,1,0,0,0,1,0,1,0,0,0\n', bt_summary_lines)
    # This is a bond topology with 2 conformers
    self.assertIn('618451,2,0,0,0,2,0,0,2,0,0\n', bt_summary_lines)

  # For the gzip files below, we check >100 because even an empty gzip file
  # has non-zero length. 100 is kind of arbitrary to be bigger than the
  # expected header of 20.
  self.assertGreater(
      gfile.stat(output_stem +
                 '_complete_json-00000-of-00003.json.gz').length +
      gfile.stat(output_stem +
                 '_complete_json-00001-of-00003.json.gz').length +
      gfile.stat(output_stem +
                 '_complete_json-00002-of-00003.json.gz').length, 100)
  self.assertGreater(
      gfile.stat(output_stem +
                 '_standard_json-00000-of-00001.json.gz').length, 100)
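# Background for the >100 thresholds above (a general gzip fact, not from the
# original test): even a gzip stream with no payload carries a fixed header
# and trailer, so its length is never zero. In CPython's standard library:
#
#   import gzip
#   assert len(gzip.compress(b'')) == 20  # 10-byte header, 2-byte empty
#                                         # deflate block, 8-byte trailer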
def _get_file_stats(path: str):
  """Stats `path` via gfile, converting mtime from nanoseconds to seconds."""
  stat = gfile.stat(path)
  return File(path=path, size=stat.length, mtime=int(stat.mtime_nsec / 1e9))
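# Illustrative stand-in (an assumption: the real `File` container is defined
# elsewhere in this module); a minimal equivalent for experimenting with
# _get_file_stats in isolation:
import dataclasses


@dataclasses.dataclass
class File:
  path: str
  size: int   # bytes, from the gfile stat length
  mtime: int  # whole seconds, truncated from mtime_nsec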