def setUpClass(cls):
    """Parse the sample transcript once and cache the result on the class.

    Reads the file named by ``cls.sample`` and stores the parsed header and
    entry list as ``cls.head`` / ``cls.entries`` for all tests to share.

    NOTE(review): presumably decorated with @classmethod above this line —
    confirm. Unlike the sibling setUpClass variant, this one does not chain
    to super().setUpClass(); confirm that is intentional.
    """
    # Use a context manager so the file handle is closed even if parse()
    # raises (the original leaked the handle).
    with open(cls.sample, "r") as sample_file:
        cls.head, cls.entries = parse(sample_file.read())
def setUpClass(cls):
    """Parse the sample transcript once and cache the result on the class.

    Chains to the parent class's setUpClass, then reads the file named by
    ``cls.sample`` and stores the parsed header and entry list as
    ``cls.head`` / ``cls.entries`` for all tests to share.

    NOTE(review): presumably decorated with @classmethod above this line —
    confirm.
    """
    super(GhanaParserTest, cls).setUpClass()
    # Use a context manager so the file handle is closed even if parse()
    # raises (the original leaked the handle).
    with open(cls.sample, 'r') as sample_file:
        cls.head, cls.entries = parse(sample_file.read())
def test_entire_output(self):
    """
    For the sample files that we have, parse them and then compare the
    results to those stored in JSON, and to the normalised line breaks.
    This will allow us to quickly spot changes that are not individually
    tested.

    Note that there is a flag that can be used to write the new output to
    disk. This can be used to update the test data, and also to make it
    possible to use a diff tool to see the changes more clearly than is
    possible in the failing test output.
    """
    # change to True to update the test json files.
    overwrite_known_good_files = False

    # list of all the files that we should parse and compare (path should
    # be relative to this test file).
    transcript_files = [
        "data/hansard-sample.txt",
        "data/hansards/hansard_0014.txt",
        "data/hansards/hansard_0025.txt",
        "data/hansards/hansard_0026.txt",
        "data/hansards/hansard_0028.txt",
        "data/hansards/hansard_0029.txt",
        "data/hansards/hansard_0030.txt",
        "data/hansards/hansard_0031.txt",
        "data/hansards/hansard_0032.txt",
        "data/hansards/hansard_0034.txt",
        "data/hansards/hansard_0038.txt",
        "data/hansards/hansard_0051.txt",
        "data/hansards/hansard_0075.txt",
        "data/hansards/hansard_0077.txt",
        "data/hansards/hansard_0078.txt",
    ]

    for transcript_file in transcript_files:
        transcript_abs_path = os.path.join(
            os.path.dirname(__file__), transcript_file)
        normalised_abs_path = (
            os.path.splitext(transcript_abs_path)[0] + "-normalised.txt")
        data_abs_path = os.path.splitext(transcript_abs_path)[0] + ".json"

        # Read the sample content (with-block closes the handle; the
        # original leaked one handle per file).
        with open(transcript_abs_path, "r") as f:
            sample_content = f.read()

        # normalise the line breaks
        normalised_sample_content = normalise_line_breaks(sample_content)

        # parse sample content, store in data structure
        head, entries = parse(sample_content)
        parsed_data = {"head": head, "entries": entries}
        parsed_data_as_json = self.convert_parsed_data_to_json(parsed_data)

        # Write this parsed data out to disk if desired - this should
        # normally not happen, but is convenient to do during development
        if overwrite_known_good_files:
            # Parenthesized form works as a statement on Python 2 and a
            # call on Python 3.
            print("** WARNING - overwriting known good files for '%s' ***"
                  % transcript_abs_path)
            with open(data_abs_path, "w") as f:
                f.write(parsed_data_as_json)
            with open(normalised_abs_path, "w") as f:
                f.write(normalised_sample_content)

        # check that the line splitting works as expected
        with open(normalised_abs_path, "r") as f:
            self.assertEqual(f.read().strip(), normalised_sample_content)

        # Read in the expected data and compare to what we got from parsing
        with open(data_abs_path, "r") as f:
            expected_data = json.loads(f.read())
        self.assertEqual(
            json.loads(parsed_data_as_json),  # so datetimes are iso formatted
            expected_data,
            # "Correctly parsed %s" % transcript_file
        )
def test_entire_output(self):
    """
    For the sample files that we have, parse them and then compare the
    results to those stored in JSON, and to the normalised line breaks.
    This will allow us to quickly spot changes that are not individually
    tested.

    Note that there is a flag that can be used to write the new output to
    disk. This can be used to update the test data, and also to make it
    possible to use a diff tool to see the changes more clearly than is
    possible in the failing test output.
    """
    # change to True to update the test json files.
    overwrite_known_good_files = False

    # list of all the files that we should parse and compare (path should
    # be relative to this test file).
    transcript_files = [
        'data/hansard-sample.txt',
        'data/hansards/hansard_0014.txt',
        'data/hansards/hansard_0025.txt',
        'data/hansards/hansard_0026.txt',
        'data/hansards/hansard_0028.txt',
        'data/hansards/hansard_0029.txt',
        'data/hansards/hansard_0030.txt',
        'data/hansards/hansard_0031.txt',
        'data/hansards/hansard_0032.txt',
        'data/hansards/hansard_0034.txt',
        'data/hansards/hansard_0038.txt',
        'data/hansards/hansard_0051.txt',
        'data/hansards/hansard_0075.txt',
        'data/hansards/hansard_0077.txt',
        'data/hansards/hansard_0078.txt',
    ]

    for transcript_file in transcript_files:
        transcript_abs_path = os.path.join(
            os.path.dirname(__file__), transcript_file)
        normalised_abs_path = (
            os.path.splitext(transcript_abs_path)[0] + '-normalised.txt')
        data_abs_path = os.path.splitext(transcript_abs_path)[0] + '.json'

        # Read the sample content (with-block closes the handle; the
        # original leaked one handle per file).
        with open(transcript_abs_path, 'r') as f:
            sample_content = f.read()

        # normalise the line breaks
        normalised_sample_content = normalise_line_breaks(sample_content)

        # parse sample content, store in data structure
        head, entries = parse(sample_content)
        parsed_data = {'head': head, 'entries': entries}
        parsed_data_as_json = self.convert_parsed_data_to_json(parsed_data)

        # Write this parsed data out to disk if desired - this should
        # normally not happen, but is convenient to do during development
        if overwrite_known_good_files:
            # Parenthesized form works as a statement on Python 2 and a
            # call on Python 3.
            print("** WARNING - overwriting known good files for '%s' ***"
                  % transcript_abs_path)
            with open(data_abs_path, 'w') as f:
                f.write(parsed_data_as_json)
            with open(normalised_abs_path, 'w') as f:
                f.write(normalised_sample_content)

        # check that the line splitting works as expected
        with open(normalised_abs_path, 'r') as f:
            self.assertEqual(f.read().strip(), normalised_sample_content)

        # Read in the expected data and compare to what we got from parsing
        with open(data_abs_path, 'r') as f:
            expected_data = json.loads(f.read())
        self.assertEqual(
            json.loads(parsed_data_as_json),  # so datetimes are iso formatted
            expected_data,
            # "Correctly parsed %s" % transcript_file
        )