Example #1
0
def _list_dir(dir_path: str) -> _DirEntries:
    """Stat every entry listed directly under `dir_path`.

    Args:
      dir_path: Directory to list with `gfile.listdir`.

    Returns:
      A mapping from each listed name to a `_FileStat` carrying the
      `length`, `mtime_nsec` and `is_directory` values reported by
      `gfile.stat` for `dir_path + '/' + name`.
    """

    def _describe(entry_name):
        # Entries are always addressed with a '/' separator, whatever
        # `dir_path` looks like.
        info = gfile.stat('/'.join((dir_path, entry_name)))
        return _FileStat(length=info.length,
                         mtime_nsec=info.mtime_nsec,
                         is_directory=info.is_directory)

    return {entry: _describe(entry) for entry in gfile.listdir(dir_path)}
def copy(src, dst, progress=True, block_size=1024 * 1024 * 10):
    """Copy `src` to `dst` block by block, optionally with a progress bar.

    Args:
      src: Path of the file to read; opened with `gfile.GFile` in 'rb' mode.
      dst: Path of the file to write; opened with `gfile.GFile` in 'wb' mode.
      progress: If true, the block loop runs over `tqdm.trange` so a
        progress bar is shown; otherwise a plain `range` is used.
      block_size: Number of bytes requested per read/write step.
    """
    total_bytes = gfile.stat(src).length
    # Enough blocks to cover the whole file; a trailing partial block
    # counts as one full iteration.
    num_blocks = int(np.ceil(total_bytes / block_size))

    if progress:
        block_counter = tqdm.trange
    else:
        block_counter = range

    with gfile.GFile(src, 'rb') as reader, gfile.GFile(dst, 'wb') as writer:
        for _ in block_counter(num_blocks):
            chunk = reader.read(block_size)
            writer.write(chunk)
  def test_whole_pipeline(self):
    """Runs the pipeline on the test data files and checks what it wrote.

    Covers the beam counters, the per-stage .dat outputs, the conflicts,
    stats, smiles comparison, bond topology summary and bond lengths CSVs,
    the gzipped JSON outputs and the standard/complete TFRecord outputs.
    """
    tmp_dir = self.create_tempdir()
    output_stem = os.path.join(tmp_dir, 'testout')
    flag_overrides = {
        'input_stage1_dat_glob':
            os.path.join(TESTDATA_PATH, 'pipeline_input_stage1.dat'),
        'input_stage2_dat_glob':
            os.path.join(TESTDATA_PATH, 'pipeline_input_stage2.dat'),
        'input_equivalent_glob':
            os.path.join(TESTDATA_PATH, 'pipeline_equivalent.dat'),
        'input_bond_topology_csv':
            os.path.join(TESTDATA_PATH, 'pipeline_bond_topology.csv'),
        'output_stem': output_stem,
        'output_shards': 1,
    }
    with flagsaver.flagsaver(**flag_overrides):
      # If you have custom beam options, add them here.
      beam_options = None
      with beam.Pipeline(beam_options) as root:
        pipeline.pipeline(root)

    # Collect the committed value of every counter, keyed by counter name.
    counters = {}
    for metric in root.result.metrics().query()['counters']:
      counters[metric.key.metric.name] = metric.committed

    self.assertEqual(counters['attempted_topology_matches'], 3)
    # Conformer 620517 will not match because bond lengths are not extracted
    # from conformers with serious errors like this.
    self.assertEqual(counters['no_topology_matches'], 1)
    self.assertNotIn('topology_match_smiles_failure', counters)

    output_files = gfile.glob(os.path.join(tmp_dir, '*'))
    logging.info('Files in output: %s', '\n'.join(output_files))

    # Every stage must have produced each of these single-shard files.
    per_stage_suffixes = (
        '_original_known_error-00000-of-00001.dat',
        '_original_unknown_error-00000-of-00001.dat',
        '_mismatched_original-00000-of-00001.dat',
        '_mismatched_regen-00000-of-00001.dat',
    )
    for stage in ('stage1', 'stage2'):
      for suffix in per_stage_suffixes:
        self.assertTrue(gfile.exists(output_stem + '_' + stage + suffix))

    # Merge conflicts file: header line plus one exact data row.
    with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
      conflicts_lines = f.readlines()
    self.assertIn('conformer_id,', conflicts_lines[0])
    expected_conflict = (
        '618451001,1,1,1,1,'
        '-406.51179,9.999999,-406.522079,9.999999,True,True,'
        '-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')
    self.assertEqual(conflicts_lines[1], expected_conflict)

    # Spot-check a handful of lines in the stats file.
    with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
      stats_lines = f.readlines()
    expected_stats = (
        'errors.status,0,2\n',
        'errors.warn_t1,0,4\n',
        'fate,FATE_SUCCESS,2\n',
        'fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
        'num_initial_geometries,1,4\n',
        'num_duplicates,1,1\n',
        'zero_field,single_point_energy_pbe0d3_6_311gd,1\n',
    )
    for expected_line in expected_stats:
      self.assertIn(expected_line, stats_lines)

    # Smiles comparison output.
    with gfile.GFile(output_stem + '_smiles_compare-00000-of-00001.csv') as f:
      smiles_lines = f.readlines()
    self.assertIn(
        '620517002,MISMATCH,NotAValidSmilesString,'
        '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
    # A bond topology whose smiles matched must not be reported at all.
    for smiles_line in smiles_lines:
      self.assertNotIn('618451001', smiles_line)

    # Bond topology summary.
    with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
      bt_summary_lines = f.readlines()
    bt_header = bt_summary_lines[0]
    self.assertIn('bt_id', bt_header)
    self.assertIn('count_attempted_conformers', bt_header)
    expected_bt_rows = (
        # The bond topology that has no conformer.
        '10,0,0,0,0,0,0,0,0,0,0,0,0\n',
        # A bond topology with 1 conformer.
        '620517,1,0,0,0,1,0,1,0,0,0,0,0\n',
        # A bond topology with 2 conformers.
        '618451,2,0,0,0,2,0,0,0,2,0,0,0\n',
    )
    for expected_row in expected_bt_rows:
      self.assertIn(expected_row, bt_summary_lines)

    # Bond lengths file.
    with gfile.GFile(output_stem + '_bond_lengths.csv') as f:
      bond_length_lines = f.readlines()
    self.assertEqual('atom_char_0,atom_char_1,bond_type,length_str,count\n',
                     bond_length_lines[0])
    for expected_row in ('c,c,2,1.336,1\n', 'c,o,1,1.422,2\n'):
      self.assertIn(expected_row, bond_length_lines)

    # For the gzip files below, we check >100 because even an empty gzip file
    # has non-zero length. 100 is kind of arbitrary to be bigger than the
    # expected header of 20.
    complete_json_shards = (
        '_complete_json-00000-of-00003.json.gz',
        '_complete_json-00001-of-00003.json.gz',
        '_complete_json-00002-of-00003.json.gz',
    )
    complete_json_bytes = sum(
        gfile.stat(output_stem + shard).length
        for shard in complete_json_shards)
    self.assertGreater(complete_json_bytes, 100)
    standard_json_bytes = gfile.stat(
        output_stem + '_standard_json-00000-of-00001.json.gz').length
    self.assertGreater(standard_json_bytes, 100)

    # The standard TFRecord output should hold exactly these conformers.
    standard_output = []
    standard_records = tf.data.TFRecordDataset(
        output_stem + '_standard_tfrecord-00000-of-00001')
    for raw in standard_records.as_numpy_iterator():
      standard_output.append(dataset_pb2.Conformer.FromString(raw))
    self.assertCountEqual([conf.conformer_id for conf in standard_output],
                          [618451001, 618451123])
    # Field filtering expected on the standard output.
    first_standard_props = standard_output[0].properties
    self.assertFalse(first_standard_props.HasField('compute_cluster_info'))
    self.assertFalse(first_standard_props.HasField('homo_pbe0_aug_pc_1'))
    self.assertTrue(first_standard_props.HasField('rotational_constants'))

    complete_output = []
    complete_records = tf.data.TFRecordDataset(
        output_stem + '_complete_tfrecord-00000-of-00001')
    for raw in complete_records.as_numpy_iterator():
      complete_output.append(dataset_pb2.Conformer.FromString(raw))
    self.assertCountEqual([conf.conformer_id for conf in complete_output],
                          [618451001, 618451123, 620517002, 79593005])

    def _first_with_id(conformer_id):
      # The DirectRunner randomizes the order of output, so records are
      # looked up by id instead of by position.
      matches = [
          conf for conf in complete_output
          if conf.conformer_id == conformer_id
      ]
      return matches[0]

    # Field filtering expected on the complete output.
    complete_props = _first_with_id(618451001).properties
    self.assertFalse(complete_props.HasField('compute_cluster_info'))
    self.assertTrue(complete_props.HasField('homo_pbe0_aug_pc_1'))
    self.assertTrue(complete_props.HasField('rotational_constants'))

    smiles_entry = _first_with_id(620517002)
    self.assertEqual(smiles_entry.properties.smiles_openbabel,
                     'NotAValidSmilesString')
Example #4
0
    def test_whole_pipeline(self):
        """Runs the pipeline on the test data files and checks what it wrote.

        Covers the per-stage .dat outputs, the conflicts, stats, smiles
        comparison and bond topology summary CSVs, and the gzipped JSON
        outputs.
        """
        test_subdirectory = self.create_tempdir()
        output_stem = os.path.join(test_subdirectory, 'testout')
        input_stage1_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage1.dat')
        input_stage2_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage2.dat')
        input_equivalent_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_equivalent.dat')
        input_bond_topology_csv = os.path.join(TESTDATA_PATH,
                                               'pipeline_bond_topology.csv')
        with flagsaver.flagsaver(
                input_stage1_dat_glob=input_stage1_dat_glob,
                input_stage2_dat_glob=input_stage2_dat_glob,
                input_equivalent_glob=input_equivalent_glob,
                input_bond_topology_csv=input_bond_topology_csv,
                output_stem=output_stem,
                output_shards=1):
            # If you have custom beam options, add them here.
            beam_options = None
            with beam.Pipeline(beam_options) as root:
                pipeline.pipeline(root)

        # The pattern component must be relative: os.path.join discards all
        # earlier components when a later one is absolute, so '/*' would list
        # the filesystem root instead of the test output directory.
        logging.info(
            'Files in output: %s',
            '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))
        for stage in ['stage1', 'stage2']:
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_original_known_error-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_original_unknown_error-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_mismatched_original-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_mismatched_regen-00000-of-00001.dat'))

        # Check the merge conflicts file
        with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
            conflicts_lines = f.readlines()
            self.assertIn('conformer_id,', conflicts_lines[0])
            self.assertEqual(
                conflicts_lines[1], '618451001,'
                '1,1,1,1,-406.51179,9.999999,-406.522079,9.999999,True,True,'
                '1,1,1,1,-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

        # Check a couple of the stats.
        with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
            stats_lines = f.readlines()
            self.assertIn('error_nsvg09,0,4\n', stats_lines)
            self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
            self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
                          stats_lines)
            self.assertIn('num_initial_geometries,1,4\n', stats_lines)
            self.assertIn('num_duplicates,1,1\n', stats_lines)

        # Check the smiles comparison output
        with gfile.GFile(output_stem +
                         '_smiles_compare-00000-of-00001.csv') as f:
            smiles_lines = f.readlines()
            self.assertIn(
                '620517002,MISMATCH,NotAValidSmilesString,'
                '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
            # Make sure that a bond topology with a matching smiles doesn't show
            for line in smiles_lines:
                self.assertNotIn('618451001', line)

        # Check the bond topology summary
        with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
            bt_summary_lines = f.readlines()
            # Check part of the header line
            self.assertIn('bt_id', bt_summary_lines[0])
            self.assertIn('count_attempted_conformers', bt_summary_lines[0])
            # This is the bond topology that has no conformer
            self.assertIn('10,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 1 conformer
            self.assertIn('620517,1,0,0,0,1,0,1,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 2 conformers
            self.assertIn('618451,2,0,0,0,2,0,0,2,0,0\n', bt_summary_lines)

        # For the gzip files below, we check >100 because even an empty gzip file
        # has non-zero length. 100 is kind of arbitrary to be bigger than the
        # expected header of 20.
        self.assertGreater(
            gfile.stat(output_stem +
                       '_complete_json-00000-of-00003.json.gz').length +
            gfile.stat(output_stem +
                       '_complete_json-00001-of-00003.json.gz').length +
            gfile.stat(output_stem +
                       '_complete_json-00002-of-00003.json.gz').length, 100)
        self.assertGreater(
            gfile.stat(output_stem +
                       '_standard_json-00000-of-00001.json.gz').length, 100)
Example #5
0
 def _get_file_stats(path: str):
     """Build a `File` record for `path` from its `gfile.stat` result.

     Args:
       path: Path to stat with `gfile.stat`.

     Returns:
       A `File` with the given `path`, `size` set to the stat length, and
       `mtime` set to the modification time in whole seconds.
     """
     stat = gfile.stat(path)
     # Floor-divide instead of going through float division by 1e9: current
     # epoch nanosecond counts exceed 2**53, so a float conversion rounds the
     # value and can push the truncated result into the wrong second.
     mtime_seconds = int(stat.mtime_nsec // 10**9)
     return File(path=path, size=stat.length, mtime=mtime_seconds)