def regenerate_dat(input_tuple, stage):
  """Regenerates the original dat from conformer and compares it to original.

  Args:
    input_tuple: tuple of string (original contents), dataset_pb2.Conformer
    stage: string 'stage1' or 'stage2'

  Returns:
    original_dat, conformer, regenerated dat, int (0=mismatch, 1=match)
  """
  original_dat, conformer = input_tuple
  writer = smu_writer_lib.SmuWriter(annotate=False)
  if stage == 'stage1':
    regen_dat = writer.process_stage1_proto(conformer)
  else:
    regen_dat = writer.process_stage2_proto(conformer)
  # Classify once, then do the shared counter bump and return in one place.
  try:
    smu_writer_lib.check_dat_formats_match(original_dat.splitlines(),
                                           regen_dat.splitlines())
    matched = 1
  except smu_writer_lib.DatFormatMismatchError:
    matched = 0
  counter_name = (
      stage + '_dat_format_matched' if matched
      else stage + '_dat_format_mismatched')
  beam.metrics.Metrics.counter(_METRICS_NAMESPACE, counter_name).inc()
  return original_dat, conformer, regen_dat, matched
def test_roundtrip(self):
  """Tests a conversion from a SMU .dat file to protocol buffer and back."""
  writer = smu_writer_lib.SmuWriter(annotate=False)
  for conformer, orig_contents in self.parser.process_stage2():
    regenerated = writer.process_stage2_proto(conformer).splitlines()
    smu_writer_lib.check_dat_formats_match(orig_contents, regenerated)
def __init__(self, output_path):
  """Creates DatOutputter.

  Args:
    output_path: file to write to; when empty/None, output goes to stdout.
  """
  self.writer = smu_writer_lib.SmuWriter(annotate=False)
  # Fall back to stdout so the outputter always has a valid sink.
  self.outfile = open(output_path, 'w') if output_path else sys.stdout
def test_roundtrip_tweaked_bt(self):
  """Tests a conversion from a SMU .dat file to protocol buffer and back."""
  writer = smu_writer_lib.SmuWriter(annotate=False)
  for molecule, orig_contents in self.parser.process_stage2():
    # Mess with the molecule by perturbing the bond_topologies. The .dat
    # format should only ever use the starting topology, so add some wrong
    # bond topologies to make sure they are ignored.
    molecule.bond_topologies.append(molecule.bond_topologies[0])
    molecule.bond_topologies.append(molecule.bond_topologies[0])
    molecule.bond_topologies[0].source = dataset_pb2.BondTopology.SOURCE_ITC
    molecule.bond_topologies[1].source = dataset_pb2.BondTopology.SOURCE_CSD
    for bt in molecule.bond_topologies[:2]:
      bt.bonds[0].bond_type = dataset_pb2.BondTopology.BOND_TRIPLE
      bt.bond_topology_id += 9999
    smu_writer_lib.check_dat_formats_match(
        orig_contents,
        writer.process_stage2_proto(molecule).splitlines())
def try_roundtrip(self, filename, stage):
  """Parses filename at the given stage and verifies writer output matches."""
  parser = smu_parser_lib.SmuParser(os.path.join(TESTDATA_PATH, filename))
  writer = smu_writer_lib.SmuWriter(annotate=False)
  # Dispatch table instead of an if/elif chain.
  stage_fns = {
      'stage1': (parser.process_stage1, writer.process_stage1_proto),
      'stage2': (parser.process_stage2, writer.process_stage2_proto),
  }
  if stage not in stage_fns:
    raise ValueError(stage)
  process_fn, writer_fn = stage_fns[stage]
  for maybe_conformer, orig_contents in process_fn():
    if isinstance(maybe_conformer, Exception):
      raise maybe_conformer
    self.assertGreater(maybe_conformer.bond_topologies[0].bond_topology_id, 0)
    smu_writer_lib.check_dat_formats_match(
        orig_contents,
        writer_fn(maybe_conformer).splitlines())
def test_pbtxt_to_annotated_dat(self, input_fn, expected_fn):
  """Checks that a pbtxt golden regenerates the annotated .dat golden."""
  # Note that this is partially a copy and paste from smu_writer (which is
  # what is used to regenerate the golden)
  full_input_fn = os.path.join(TESTDATA_PATH, input_fn)
  full_expected_fn = os.path.join(TESTDATA_PATH, expected_fn)
  smu_proto = dataset_pb2.MultipleConformers()
  text_format.Parse('\n'.join(get_file_contents(full_input_fn)), smu_proto)
  writer = smu_writer_lib.SmuWriter(True)
  got = ''.join(
      writer.process_stage2_proto(conf) for conf in smu_proto.conformers)
  expected = get_file_contents(full_expected_fn)
  print('Command line to regenerate:\npython3 parser/smu_writer.py '
        '--input_file {} --output_file {} --annotate True'.format(
            full_input_fn, full_expected_fn))
  self.assertEqual([line.rstrip('\n') for line in expected], got.splitlines())
def _open_output_files():
  """Opens one (original, regenerated) .dat file-handle pair per Outcome."""
  suffixes = {
      Outcome.SUCCESS: 'success',
      Outcome.MISMATCH: 'mismatch',
      Outcome.PARSE_ERROR_KNOWN: 'parse_error_known',
      Outcome.PARSE_ERROR_UNKNOWN: 'parse_error_unknown',
  }
  return {
      outcome: (
          gfile.GFile(FLAGS.output_stem + '_' + suffix + '_original.dat', 'w'),
          gfile.GFile(FLAGS.output_stem + '_' + suffix + '_regen.dat', 'w'))
      for outcome, suffix in suffixes.items()
  }


def _regenerate(smu_writer, conformer, orig_contents_list):
  """Regenerates .dat contents for one parse result and classifies it.

  Args:
    smu_writer: smu_writer_lib.SmuWriter
    conformer: dataset_pb2.Conformer, or an Exception from the parser
    orig_contents_list: list of strings, the original .dat lines

  Returns:
    (Outcome, string regenerated contents)
  """
  if isinstance(conformer, Exception):
    if isinstance(conformer, smu_parser_lib.SmuKnownError):
      outcome = Outcome.PARSE_ERROR_KNOWN
    else:
      outcome = Outcome.PARSE_ERROR_UNKNOWN
    # NOTE(review): assumes parser exceptions carry a conformer_id attribute
    # (the original code relied on this too) — confirm against smu_parser_lib.
    regen_contents = '{}\n{}: {} {}\n'.format(
        smu_parser_lib.SEPARATOR_LINE, conformer.conformer_id,
        type(conformer).__name__, str(conformer))
    return outcome, regen_contents
  if FLAGS.stage == 'stage1':
    regen_contents = smu_writer.process_stage1_proto(conformer)
  else:
    regen_contents = smu_writer.process_stage2_proto(conformer)
  try:
    smu_writer_lib.check_dat_formats_match(orig_contents_list,
                                           regen_contents.splitlines())
    return Outcome.SUCCESS, regen_contents
  except smu_writer_lib.DatFormatMismatchError as e:
    print(e)
    return Outcome.MISMATCH, regen_contents


def main(argv):
  """Parses .dat files matching --input_glob, regenerates them, and writes
  original/regenerated contents to per-Outcome output files plus a summary.

  Args:
    argv: command-line arguments; only the program name is expected.

  Raises:
    app.UsageError: if extra command-line arguments are given.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  smu_writer = smu_writer_lib.SmuWriter(annotate=False)
  # Maps Outcome -> (original file handle, regenerated file handle).
  output_files = _open_output_files()

  file_count = 0
  conformer_count = 0
  outcome_counts = collections.Counter()

  for filepath in gfile.glob(FLAGS.input_glob):
    logging.info('Processing file %s', filepath)
    file_count += 1
    smu_parser = smu_parser_lib.SmuParser(filepath)
    if FLAGS.stage == 'stage1':
      process_fn = smu_parser.process_stage1
    else:
      process_fn = smu_parser.process_stage2
    for conformer, orig_contents_list in process_fn():
      conformer_count += 1
      outcome, regen_contents = _regenerate(smu_writer, conformer,
                                            orig_contents_list)
      outcome_counts[outcome] += 1
      output_files[outcome][0].write('\n'.join(orig_contents_list) + '\n')
      output_files[outcome][1].write(regen_contents)

  for file_orig, file_regen in output_files.values():
    file_orig.close()
    file_regen.close()

  def outcome_status(outcome):
    # Guard against division by zero when no conformers were read.
    if conformer_count:
      percent = outcome_counts[outcome] / conformer_count * 100
    else:
      percent = float('nan')
    return '%5.1f%% %7d %s \n' % (percent, outcome_counts[outcome],
                                  str(outcome))

  status_str = ('COMPLETE: Read %d files, %d conformers\n' %
                (file_count, conformer_count) +
                outcome_status(Outcome.SUCCESS) +
                outcome_status(Outcome.PARSE_ERROR_KNOWN) +
                outcome_status(Outcome.MISMATCH) +
                outcome_status(Outcome.PARSE_ERROR_UNKNOWN))

  logging.info(status_str)
  print(status_str)