Example #1
0
    def test_whole_pipeline(self):
        """Runs the pipeline end to end and checks counters and output files.

        The pipeline flags are pointed at the data files under TESTDATA_PATH
        and at a fresh temporary directory. Afterwards the Beam counters, the
        per-stage .dat files, the CSV reports and the TFRecord outputs are
        compared against the expected values.
        """
        test_subdirectory = self.create_tempdir()
        output_stem = os.path.join(test_subdirectory, 'testout')
        flag_overrides = {
            'input_stage1_dat_glob':
                os.path.join(TESTDATA_PATH, 'pipeline_input_stage1.dat'),
            'input_stage2_dat_glob':
                os.path.join(TESTDATA_PATH, 'pipeline_input_stage2.dat'),
            'input_equivalent_glob':
                os.path.join(TESTDATA_PATH, 'pipeline_equivalent.dat'),
            'input_bond_topology_csv':
                os.path.join(TESTDATA_PATH, 'pipeline_bond_topology.csv'),
            'output_stem': output_stem,
            'output_shards': 1,
        }

        def read_conformers(tfrecord_path):
            """Parses every record of the TFRecord file as a Conformer."""
            records = tf.data.TFRecordDataset(tfrecord_path)
            return [
                dataset_pb2.Conformer.FromString(raw)
                for raw in records.as_numpy_iterator()
            ]

        with flagsaver.flagsaver(**flag_overrides):
            # If you have custom beam options, add them here.
            beam_options = None
            with beam.Pipeline(beam_options) as root:
                pipeline.pipeline(root)

        # Collect the committed value of every counter, keyed by metric name.
        counters_dict = {}
        for counter in root.result.metrics().query()['counters']:
            counters_dict[counter.key.metric.name] = counter.committed

        self.assertEqual(counters_dict['attempted_topology_matches'], 3)
        # Conformer 620517 will not match because bond lengths are not extracted
        # from conformers with serious errors like this.
        self.assertEqual(counters_dict['no_topology_matches'], 1)
        self.assertNotIn('topology_match_smiles_failure', counters_dict)

        output_listing = gfile.glob(os.path.join(test_subdirectory, '*'))
        logging.info('Files in output: %s', '\n'.join(output_listing))

        # Every stage must have produced each of these single-shard files.
        per_stage_suffixes = (
            '_original_known_error-00000-of-00001.dat',
            '_original_unknown_error-00000-of-00001.dat',
            '_mismatched_original-00000-of-00001.dat',
            '_mismatched_regen-00000-of-00001.dat',
        )
        for stage in ('stage1', 'stage2'):
            for suffix in per_stage_suffixes:
                self.assertTrue(
                    gfile.exists(output_stem + '_' + stage + suffix))

        # Merge conflicts file: header plus one known conflict row.
        with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
            conflicts_lines = f.readlines()
        expected_conflict = (
            '618451001,1,1,1,1,'
            '-406.51179,9.999999,-406.522079,9.999999,True,True,'
            '-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')
        self.assertIn('conformer_id,', conflicts_lines[0])
        self.assertEqual(conflicts_lines[1], expected_conflict)

        # Spot-check a handful of the stats rows.
        with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
            stats_lines = f.readlines()
        expected_stats = (
            'errors.status,0,2\n',
            'errors.warn_t1,0,4\n',
            'fate,FATE_SUCCESS,2\n',
            'fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
            'num_initial_geometries,1,4\n',
            'num_duplicates,1,1\n',
            'zero_field,single_point_energy_pbe0d3_6_311gd,1\n',
        )
        for expected_row in expected_stats:
            self.assertIn(expected_row, stats_lines)

        # Smiles comparison output.
        smiles_path = output_stem + '_smiles_compare-00000-of-00001.csv'
        with gfile.GFile(smiles_path) as f:
            smiles_lines = f.readlines()
        self.assertIn(
            '620517002,MISMATCH,NotAValidSmilesString,'
            '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
        # A bond topology whose smiles matched must not be reported at all.
        for smiles_row in smiles_lines:
            self.assertNotIn('618451001', smiles_row)

        # Bond topology summary.
        with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
            bt_summary_lines = f.readlines()
        bt_header = bt_summary_lines[0]
        self.assertIn('bt_id', bt_header)
        self.assertIn('count_attempted_conformers', bt_header)
        expected_bt_rows = (
            # The bond topology that has no conformer.
            '10,0,0,0,0,0,0,0,0,0,0,0,0,0\n',
            # A bond topology with 1 conformer.
            '620517,1,0,0,0,1,0,1,0,0,0,0,0,0\n',
            # A bond topology with 2 conformers.
            '618451,2,0,0,0,2,0,0,0,2,0,0,0,0\n',
        )
        for expected_row in expected_bt_rows:
            self.assertIn(expected_row, bt_summary_lines)

        # Bond lengths file (written without a shard suffix).
        with gfile.GFile(output_stem + '_bond_lengths.csv') as f:
            bond_length_lines = f.readlines()
        self.assertEqual(
            'atom_char_0,atom_char_1,bond_type,length_str,count\n',
            bond_length_lines[0])
        self.assertIn('c,c,2,1.336,1\n', bond_length_lines)
        self.assertIn('c,o,1,1.422,2\n', bond_length_lines)

        # The "standard" TFRecord output holds only the two clean conformers.
        standard_output = read_conformers(
            output_stem + '_standard_tfrecord-00000-of-00001')
        self.assertCountEqual([c.conformer_id for c in standard_output],
                              [618451001, 618451123])
        # Check that fields are filtered the way we expect.
        standard_properties = standard_output[0].properties
        self.assertFalse(standard_properties.HasField('compute_cluster_info'))
        self.assertFalse(standard_properties.HasField('homo_pbe0_aug_pc_1'))
        self.assertTrue(standard_properties.HasField('rotational_constants'))

        # The "complete" TFRecord output holds all four conformers.
        complete_output = read_conformers(
            output_stem + '_complete_tfrecord-00000-of-00001')
        self.assertCountEqual([c.conformer_id for c in complete_output],
                              [618451001, 618451123, 620517002, 79593005])
        # The DirectRunner randomizes the order of output, so look records up
        # by id. The assertion above guarantees each id occurs exactly once.
        complete_by_id = {c.conformer_id: c for c in complete_output}

        complete_properties = complete_by_id[618451001].properties
        self.assertFalse(complete_properties.HasField('compute_cluster_info'))
        self.assertTrue(complete_properties.HasField('homo_pbe0_aug_pc_1'))
        self.assertTrue(complete_properties.HasField('rotational_constants'))

        complete_entry_for_smiles = complete_by_id[620517002]
        self.assertEqual(complete_entry_for_smiles.properties.smiles_openbabel,
                         'NotAValidSmilesString')
Example #2
0
    def test_whole_pipeline(self):
        """Runs the pipeline end to end and checks the files it writes.

        The pipeline flags are pointed at the data files under TESTDATA_PATH
        and at a fresh temporary directory. Afterwards the per-stage .dat
        files, the CSV reports and the gzipped JSON outputs are checked.
        """
        test_subdirectory = self.create_tempdir()
        output_stem = os.path.join(test_subdirectory, 'testout')
        input_stage1_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage1.dat')
        input_stage2_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage2.dat')
        input_equivalent_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_equivalent.dat')
        input_bond_topology_csv = os.path.join(TESTDATA_PATH,
                                               'pipeline_bond_topology.csv')
        with flagsaver.flagsaver(
                input_stage1_dat_glob=input_stage1_dat_glob,
                input_stage2_dat_glob=input_stage2_dat_glob,
                input_equivalent_glob=input_equivalent_glob,
                input_bond_topology_csv=input_bond_topology_csv,
                output_stem=output_stem,
                output_shards=1):
            # If you have custom beam options, add them here.
            beam_options = None
            with beam.Pipeline(beam_options) as root:
                pipeline.pipeline(root)

        # The pattern must be relative ('*', not '/*'): os.path.join discards
        # every earlier component when a later one is absolute, which would
        # list the filesystem root instead of the test output directory.
        logging.info(
            'Files in output: %s',
            '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))

        # Every stage must have produced each of these single-shard files.
        per_stage_suffixes = (
            '_original_known_error-00000-of-00001.dat',
            '_original_unknown_error-00000-of-00001.dat',
            '_mismatched_original-00000-of-00001.dat',
            '_mismatched_regen-00000-of-00001.dat',
        )
        for stage in ['stage1', 'stage2']:
            for suffix in per_stage_suffixes:
                self.assertTrue(
                    gfile.exists(output_stem + '_' + stage + suffix))

        # Check the merge conflicts file
        with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
            conflicts_lines = f.readlines()
            self.assertIn('conformer_id,', conflicts_lines[0])
            self.assertEqual(
                conflicts_lines[1], '618451001,'
                '1,1,1,1,-406.51179,9.999999,-406.522079,9.999999,True,True,'
                '1,1,1,1,-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

        # Check a couple of the stats.
        with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
            stats_lines = f.readlines()
            self.assertIn('error_nsvg09,0,4\n', stats_lines)
            self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
            self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
                          stats_lines)
            self.assertIn('num_initial_geometries,1,4\n', stats_lines)
            self.assertIn('num_duplicates,1,1\n', stats_lines)

        # Check the smiles comparison output
        with gfile.GFile(output_stem +
                         '_smiles_compare-00000-of-00001.csv') as f:
            smiles_lines = f.readlines()
            self.assertIn(
                '620517002,MISMATCH,NotAValidSmilesString,'
                '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
            # Make sure that a bond topology with a matching smiles doesn't show
            for line in smiles_lines:
                self.assertNotIn('618451001', line)

        # Check the bond topology summary
        with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
            bt_summary_lines = f.readlines()
            # Check part of the header line
            self.assertIn('bt_id', bt_summary_lines[0])
            self.assertIn('count_attempted_conformers', bt_summary_lines[0])
            # This is the bond topology that has no conformer
            self.assertIn('10,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 1 conformer
            self.assertIn('620517,1,0,0,0,1,0,1,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 2 conformers
            self.assertIn('618451,2,0,0,0,2,0,0,2,0,0\n', bt_summary_lines)

        # For the gzip files below, we check >100 because even an empty gzip file
        # has non-zero length. 100 is kind of arbitrary to be bigger than the
        # expected header of 20.
        # The complete JSON output is checked as the combined size of its
        # three shards rather than shard by shard.
        self.assertGreater(
            gfile.stat(output_stem +
                       '_complete_json-00000-of-00003.json.gz').length +
            gfile.stat(output_stem +
                       '_complete_json-00001-of-00003.json.gz').length +
            gfile.stat(output_stem +
                       '_complete_json-00002-of-00003.json.gz').length, 100)
        self.assertGreater(
            gfile.stat(output_stem +
                       '_standard_json-00000-of-00001.json.gz').length, 100)