Python pipeline Examples

Programming Language: Python

Namespace/Package Name: smu.pipeline

Method/Function: pipeline

Examples at hotexamples.com: 2

Python pipeline - 2 examples found. These are the top-rated real-world Python examples of smu.pipeline.pipeline extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

    def test_whole_pipeline(self):
        """End-to-end check of the pipeline against the files in TESTDATA_PATH.

        The input/output flags are overridden for the duration of the run so
        that the pipeline reads the test inputs and writes (with
        output_shards=1) under a fresh temporary directory. Afterwards the Beam
        counters and the written outputs (per-stage .dat files, the CSV
        reports and the TFRecord files) are checked.
        """
        test_subdirectory = self.create_tempdir()
        output_stem = os.path.join(test_subdirectory, 'testout')

        def testdata(name):
            # Path of one input file inside the test data directory.
            return os.path.join(TESTDATA_PATH, name)

        def read_lines(suffix):
            # All lines of the output file named output_stem + suffix.
            with gfile.GFile(output_stem + suffix) as f:
                return f.readlines()

        def read_conformers(suffix):
            # Every record of a TFRecord output, parsed as a Conformer proto.
            records = tf.data.TFRecordDataset(output_stem + suffix)
            return [
                dataset_pb2.Conformer.FromString(raw)
                for raw in records.as_numpy_iterator()
            ]

        def find_conformer(conformers, conformer_id):
            # First conformer with the given id; IndexError if there is none.
            return [c for c in conformers
                    if c.conformer_id == conformer_id][0]

        flag_overrides = {
            'input_stage1_dat_glob': testdata('pipeline_input_stage1.dat'),
            'input_stage2_dat_glob': testdata('pipeline_input_stage2.dat'),
            'input_equivalent_glob': testdata('pipeline_equivalent.dat'),
            'input_bond_topology_csv': testdata('pipeline_bond_topology.csv'),
            'output_stem': output_stem,
            'output_shards': 1,
        }
        with flagsaver.flagsaver(**flag_overrides):
            # If you have custom beam options, add them here.
            beam_options = None
            with beam.Pipeline(beam_options) as root:
                pipeline.pipeline(root)

        # Collect the committed value of every counter, keyed by counter name.
        counters_dict = {}
        for m in root.result.metrics().query()['counters']:
            counters_dict[m.key.metric.name] = m.committed

        self.assertEqual(counters_dict['attempted_topology_matches'], 3)
        # Conformer 620517 will not match because bond lengths are not extracted
        # from conformers with serious errors like this.
        self.assertEqual(counters_dict['no_topology_matches'], 1)
        self.assertNotIn('topology_match_smiles_failure', counters_dict)

        output_listing = gfile.glob(os.path.join(test_subdirectory, '*'))
        logging.info('Files in output: %s', '\n'.join(output_listing))

        # Each stage must have produced all four of its .dat outputs.
        per_stage_suffixes = (
            '_original_known_error-00000-of-00001.dat',
            '_original_unknown_error-00000-of-00001.dat',
            '_mismatched_original-00000-of-00001.dat',
            '_mismatched_regen-00000-of-00001.dat',
        )
        for stage in ['stage1', 'stage2']:
            for suffix in per_stage_suffixes:
                self.assertTrue(
                    gfile.exists(output_stem + '_' + stage + suffix))

        # Merge conflicts file: header plus the one expected conflict row.
        conflicts_lines = read_lines('_conflicts-00000-of-00001.csv')
        self.assertIn('conformer_id,', conflicts_lines[0])
        self.assertEqual(
            conflicts_lines[1], '618451001,1,1,1,1,'
            '-406.51179,9.999999,-406.522079,9.999999,True,True,'
            '-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

        # Spot-check a few rows of the stats file.
        stats_lines = read_lines('_stats-00000-of-00001.csv')
        expected_stats = (
            'errors.status,0,2\n',
            'errors.warn_t1,0,4\n',
            'fate,FATE_SUCCESS,2\n',
            'fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
            'num_initial_geometries,1,4\n',
            'num_duplicates,1,1\n',
            'zero_field,single_point_energy_pbe0d3_6_311gd,1\n',
        )
        for expected in expected_stats:
            self.assertIn(expected, stats_lines)

        # SMILES comparison output: the mismatch must be reported ...
        smiles_lines = read_lines('_smiles_compare-00000-of-00001.csv')
        self.assertIn(
            '620517002,MISMATCH,NotAValidSmilesString,'
            '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
        # ... and a bond topology with a matching smiles must not show up.
        for smiles_line in smiles_lines:
            self.assertNotIn('618451001', smiles_line)

        # Bond topology summary: part of the header, then three known rows.
        bt_summary_lines = read_lines('_bt_summary-00000-of-00001.csv')
        for column in ('bt_id', 'count_attempted_conformers'):
            self.assertIn(column, bt_summary_lines[0])
        expected_bt_rows = (
            # The bond topology that has no conformer
            '10,0,0,0,0,0,0,0,0,0,0,0,0,0\n',
            # A bond topology with 1 conformer
            '620517,1,0,0,0,1,0,1,0,0,0,0,0,0\n',
            # A bond topology with 2 conformers
            '618451,2,0,0,0,2,0,0,0,2,0,0,0,0\n',
        )
        for expected in expected_bt_rows:
            self.assertIn(expected, bt_summary_lines)

        # Bond lengths file: exact header plus two known rows.
        bond_length_lines = read_lines('_bond_lengths.csv')
        self.assertEqual(
            'atom_char_0,atom_char_1,bond_type,length_str,count\n',
            bond_length_lines[0])
        for expected in ('c,c,2,1.336,1\n', 'c,o,1,1.422,2\n'):
            self.assertIn(expected, bond_length_lines)

        # Standard TFRecord output: expected conformers, with fields filtered.
        standard_output = read_conformers(
            '_standard_tfrecord-00000-of-00001')
        self.assertCountEqual([c.conformer_id for c in standard_output],
                              [618451001, 618451123])
        standard_properties = standard_output[0].properties
        self.assertFalse(
            standard_properties.HasField('compute_cluster_info'))
        self.assertFalse(
            standard_properties.HasField('homo_pbe0_aug_pc_1'))
        self.assertTrue(
            standard_properties.HasField('rotational_constants'))

        # Complete TFRecord output: all four conformers are present.
        complete_output = read_conformers(
            '_complete_tfrecord-00000-of-00001')
        self.assertCountEqual([c.conformer_id for c in complete_output],
                              [618451001, 618451123, 620517002, 79593005])
        # The DirectRunner randomizes the order of output, so pick the record
        # to inspect by id rather than by position.
        complete_properties = find_conformer(complete_output,
                                             618451001).properties
        self.assertFalse(
            complete_properties.HasField('compute_cluster_info'))
        self.assertTrue(
            complete_properties.HasField('homo_pbe0_aug_pc_1'))
        self.assertTrue(
            complete_properties.HasField('rotational_constants'))

        smiles_properties = find_conformer(complete_output,
                                           620517002).properties
        self.assertEqual(smiles_properties.smiles_openbabel,
                         'NotAValidSmilesString')

Example #2

Show file

File: pipeline_test.py Project: yan0626/google-research

    def test_whole_pipeline(self):
        """Runs the whole pipeline on the test inputs and checks its outputs.

        The input/output flags are temporarily overridden to point at the
        files under TESTDATA_PATH and at a fresh temporary directory, and the
        pipeline is run inside a beam.Pipeline. The files written under the
        output stem are then checked: the per-stage .dat files, the conflicts
        CSV, the stats CSV, the SMILES comparison CSV, the bond topology
        summary CSV and the gzipped JSON outputs.
        """
        test_subdirectory = self.create_tempdir()
        output_stem = os.path.join(test_subdirectory, 'testout')
        input_stage1_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage1.dat')
        input_stage2_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage2.dat')
        input_equivalent_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_equivalent.dat')
        input_bond_topology_csv = os.path.join(TESTDATA_PATH,
                                               'pipeline_bond_topology.csv')
        with flagsaver.flagsaver(
                input_stage1_dat_glob=input_stage1_dat_glob,
                input_stage2_dat_glob=input_stage2_dat_glob,
                input_equivalent_glob=input_equivalent_glob,
                input_bond_topology_csv=input_bond_topology_csv,
                output_stem=output_stem,
                output_shards=1):
            # If you have custom beam options, add them here.
            beam_options = None
            with beam.Pipeline(beam_options) as root:
                pipeline.pipeline(root)

        # The glob component must be relative: os.path.join throws away
        # test_subdirectory when the following component starts with '/',
        # which would list the filesystem root instead of the test outputs.
        logging.info(
            'Files in output: %s',
            '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))
        # Each stage must have produced all four of its .dat outputs.
        for stage in ['stage1', 'stage2']:
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_original_known_error-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_original_unknown_error-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_mismatched_original-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_mismatched_regen-00000-of-00001.dat'))

        # Check the merge conflicts file
        with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
            conflicts_lines = f.readlines()
            self.assertIn('conformer_id,', conflicts_lines[0])
            self.assertEqual(
                conflicts_lines[1], '618451001,'
                '1,1,1,1,-406.51179,9.999999,-406.522079,9.999999,True,True,'
                '1,1,1,1,-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

        # Check a couple of the stats.
        with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
            stats_lines = f.readlines()
            self.assertIn('error_nsvg09,0,4\n', stats_lines)
            self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
            self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
                          stats_lines)
            self.assertIn('num_initial_geometries,1,4\n', stats_lines)
            self.assertIn('num_duplicates,1,1\n', stats_lines)

        # Check the smiles comparison output
        with gfile.GFile(output_stem +
                         '_smiles_compare-00000-of-00001.csv') as f:
            smiles_lines = f.readlines()
            self.assertIn(
                '620517002,MISMATCH,NotAValidSmilesString,'
                '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
            # Make sure that a bond topology with a matching smiles doesn't show
            for line in smiles_lines:
                self.assertNotIn('618451001', line)

        # Check the bond topology summary
        with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
            bt_summary_lines = f.readlines()
            # Check part of the header line
            self.assertIn('bt_id', bt_summary_lines[0])
            self.assertIn('count_attempted_conformers', bt_summary_lines[0])
            # This is the bond topology that has no conformer
            self.assertIn('10,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 1 conformer
            self.assertIn('620517,1,0,0,0,1,0,1,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 2 conformers
            self.assertIn('618451,2,0,0,0,2,0,0,2,0,0\n', bt_summary_lines)

        # For the gzip files below, we check >100 because even an empty gzip file
        # has non-zero length. 100 is kind of arbitrary to be bigger than the
        # expected header of 20.
        # NOTE(review): the complete JSON is expected in three shards even
        # though output_shards=1 above; presumably the pipeline fixes that
        # shard count itself -- confirm against the pipeline code.
        self.assertGreater(
            gfile.stat(output_stem +
                       '_complete_json-00000-of-00003.json.gz').length +
            gfile.stat(output_stem +
                       '_complete_json-00001-of-00003.json.gz').length +
            gfile.stat(output_stem +
                       '_complete_json-00002-of-00003.json.gz').length, 100)
        self.assertGreater(
            gfile.stat(output_stem +
                       '_standard_json-00000-of-00001.json.gz').length, 100)