def _create_source_and_load_test_json_to_entries(self):
    """Create a test Source and load the expected JSON data into entries.

    Decodes the JSON file at ``self.expected_data_json`` and hands the
    result, together with a newly created ``Source``, to
    ``KenyaParser.create_entries_from_data_and_source``.

    Returns:
        The ``Source`` object that was created.
    """
    source = Source.objects.create(
        name='Test source',
        url='http://example.com/foo/bar/testing',
        date=datetime.date(2011, 9, 1),
    )

    # Use a context manager so the fixture's file handle is closed as soon
    # as it has been read, instead of being left open.
    with open(self.expected_data_json, 'r') as json_file:
        data = json.load(json_file)

    KenyaParser.create_entries_from_data_and_source(data, source)
    return source
def _create_source_and_load_test_json_to_entries(self):
    """Build a ``Source`` for testing and create entries from the JSON fixture.

    The fixture path is read from ``self.expected_data_json``; its decoded
    contents are passed with the new source to
    ``KenyaParser.create_entries_from_data_and_source``.

    Returns:
        The freshly created ``Source``.
    """
    source = Source.objects.create(
        name='Test source',
        url='http://example.com/foo/bar/testing',
        date=datetime.date(2011, 9, 1),
    )

    # Close the fixture file deterministically rather than leaking the
    # handle returned by open().
    with open(self.expected_data_json, 'r') as fixture:
        data = json.loads(fixture.read())

    KenyaParser.create_entries_from_data_and_source(data, source)
    return source
def handle_noargs(self, **options):
    """Parse every Source that requires processing and create its entries.

    For each source the attempt time is recorded and saved first, then the
    object returned by ``source.file()`` is converted to HTML, the HTML to
    data, and the data is handed to
    ``KenyaParser.create_entries_from_data_and_source``.

    Args:
        **options: Command options. A ``verbosity`` of 2 or more prints a
            line for each source; a missing or ``None`` verbosity is
            treated as not verbose.
    """
    # Work out verbosity once, outside the loop, and tolerate it being
    # absent instead of failing on int(None).
    verbosity = options.get('verbosity')
    verbose = verbosity is not None and int(verbosity) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            # Single-argument call form prints identically on Python 2.
            print("Looking at %s" % source)

        # Record the attempt before parsing so it is stored even if one of
        # the conversion steps below raises.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()
        html = KenyaParser.convert_pdf_to_html(pdf)
        data = KenyaParser.convert_html_to_data(html)
        KenyaParser.create_entries_from_data_and_source(data, source)
def handle_noargs(self, **options):
    """Process each Source returned by ``requires_processing()``.

    The source's ``last_processing_attempt`` is set to the current time and
    saved, after which ``source.file()`` is run through the KenyaParser
    pdf -> html -> data pipeline and entries are created from the result.

    Args:
        **options: Command options. Progress is printed when ``verbosity``
            is 2 or higher; if it is missing or ``None`` nothing is printed.
    """
    # Evaluate the loop-invariant verbosity check once; guard against a
    # missing option, which would otherwise raise TypeError in int().
    verbosity = options.get('verbosity')
    verbose = verbosity is not None and int(verbosity) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            # Parenthesised single argument: same output on Python 2.
            print("Looking at %s" % source)

        # Saved up front, so the attempt is recorded even when parsing fails.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()
        html = KenyaParser.convert_pdf_to_html(pdf)
        data = KenyaParser.convert_html_to_data(html)
        KenyaParser.create_entries_from_data_and_source(data, source)
def test_converting_pdf_to_html(self):
    """Test that the pdf becomes the html that we expect"""
    # Both files are opened with context managers so their handles are
    # closed even if the conversion or the read raises.
    with open(self.sample_pdf, 'r') as pdf_file:
        # The conversion must run while the pdf handle is still open.
        html = KenyaParser.convert_pdf_to_html(pdf_file)

    with open(self.sample_html, 'r') as html_file:
        expected_html = html_file.read()

    self.assertEqual(html, expected_html)
def test_converting_pdf_to_html(self):
    """Test that the pdf becomes the html that we expect"""
    # Context managers close the sample files instead of leaking the
    # handles returned by open().
    with open(self.sample_pdf, 'r') as pdf_file:
        # Convert inside the block: the parser is handed the open file.
        html = KenyaParser.convert_pdf_to_html(pdf_file)

    with open(self.sample_html, 'r') as expected_file:
        expected_html = expected_file.read()

    self.assertEqual(html, expected_html)
def handle_noargs(self, **options):
    """Parse every Source that requires processing and create its entries.

    For each source the attempt time is recorded and saved, then the object
    returned by ``source.file()`` is converted to HTML, the HTML to data,
    and entries are created from that data. If any of those three steps
    raises, a message naming the pdf is printed and the exception is
    re-raised unchanged.

    Args:
        **options: Command options. A ``verbosity`` of 2 or more prints a
            line for each source; a missing or ``None`` verbosity is
            treated as not verbose.
    """
    # Tolerate an absent verbosity option instead of failing on int(None).
    verbosity = options.get('verbosity')
    verbose = verbosity is not None and int(verbosity) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            message = "{0}: Looking at {1}"
            # Single-argument call form prints identically on Python 2.
            print(message.format(source.list_page, source))

        # Record the attempt before parsing so it is stored even if the
        # parse below fails.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()

        try:
            html = KenyaParser.convert_pdf_to_html(pdf)
            data = KenyaParser.convert_html_to_data(html)
            KenyaParser.create_entries_from_data_and_source(data, source)
        except Exception:
            # Report which input failed, then let the original exception
            # (and its traceback) propagate.
            print("There was an exception when parsing {0}".format(pdf))
            raise
def test_parse_time_string(self):
    """Check parse_time_string on known inputs and on an unparseable one."""
    # am and pm make no sense at noon or midnight - but define what we
    # want to happen
    expected_by_input = {
        '1.00 p.m.': '13:00:00',
        '1.00 a.m.': '01:00:00',
        '12.00 p.m.': '12:00:00',
        '12.30 p.m.': '12:30:00',
    }

    for raw_time, expected in expected_by_input.items():
        parsed = KenyaParser.parse_time_string(raw_time)
        self.assertEqual(parsed, expected)

    # Anything that is not a recognisable time must raise.
    self.assertRaises(
        KenyaParserCouldNotParseTimeString,
        KenyaParser.parse_time_string,
        'foo.bar',
    )
def test_parse_time_string(self):
    """Verify the 24-hour output of parse_time_string and its error case."""
    cases = {
        '1.00 p.m.': '13:00:00',
        '1.00 a.m.': '01:00:00',
        # am and pm make no sense at noon or midnight - but define what
        # we want to happen
        '12.00 p.m.': '12:00:00',
        '12.30 p.m.': '12:30:00',
    }

    for time_text in cases:
        self.assertEqual(
            KenyaParser.parse_time_string(time_text),
            cases[time_text],
        )

    # Unparseable input is expected to raise the parser's own exception.
    unparseable = 'foo.bar'
    self.assertRaises(
        KenyaParserCouldNotParseTimeString,
        KenyaParser.parse_time_string,
        unparseable,
    )
def test_parse_time_string(self):
    """Check parse_time_string on numeric and spelled-out time strings."""
    # am and pm make no sense at noon or midnight - but define what we
    # want to happen
    expected_by_input = {
        '1.00 p.m.': '13:00:00',
        '1.00 a.m.': '01:00:00',
        '12.00 p.m.': '12:00:00',
        '12.30 p.m.': '12:30:00',
        "twenty-four minutes past Six o'clock": '18:24:00',
        "Fifteen minutes to two o'clock": '13:45:00',
        "Fifty five minutes past Nine o'clock": '09:55:00',
        "One minute past eight o'clock": '08:01:00',
    }

    for raw_time, expected in expected_by_input.items():
        parsed = KenyaParser.parse_time_string(raw_time)
        self.assertEqual(parsed, expected)

    # Input that is not a time at all must raise.
    self.assertRaises(
        KenyaParserCouldNotParseTimeString,
        KenyaParser.parse_time_string,
        'foo.bar',
    )
def test_parse_time_string(self):
    """Verify parse_time_string output for each sample and its error case."""
    cases = {
        '1.00 p.m.': '13:00:00',
        '1.00 a.m.': '01:00:00',
        # am and pm make no sense at noon or midnight - but define what
        # we want to happen
        '12.00 p.m.': '12:00:00',
        '12.30 p.m.': '12:30:00',
        "twenty-four minutes past Six o'clock": '18:24:00',
        "Fifteen minutes to two o'clock": '13:45:00',
        "Fifty five minutes past Nine o'clock": '09:55:00',
        "One minute past eight o'clock": '08:01:00',
    }

    for time_text in cases:
        self.assertEqual(
            KenyaParser.parse_time_string(time_text),
            cases[time_text],
        )

    # Unparseable input is expected to raise the parser's own exception.
    unparseable = 'foo.bar'
    self.assertRaises(
        KenyaParserCouldNotParseTimeString,
        KenyaParser.parse_time_string,
        unparseable,
    )
def test_converting_html_to_data(self):
    """Test the convert_html_to_data function.

    Converts the sample HTML and compares the ``transcript`` and ``meta``
    parts of the result with the expected JSON fixture.
    """
    # Context managers ensure the fixture files are closed after reading.
    with open(self.sample_html, 'r') as html_file:
        html = html_file.read()

    data = KenyaParser.convert_html_to_data(html=html)

    # Whilst developing the code this proved useful
    # out = open( self.expected_data_json, 'w')
    # json_string = json.dumps( data, sort_keys=True, indent=4 )
    # json_string = re.sub(r" +\n", "\n", json_string) # trim trailing whitespace
    # json_string += "\n"
    # out.write( json_string )
    # out.close()

    with open(self.expected_data_json, 'r') as expected_file:
        expected = json.load(expected_file)

    self.assertEqual(data['transcript'], expected['transcript'])

    # FIXME
    self.assertEqual(data['meta'], expected['meta'])
def test_converting_html_to_data(self):
    """Test the convert_html_to_data function.

    The sample HTML is converted and the ``transcript`` and ``meta`` keys
    of the result are checked against the expected JSON fixture.
    """
    # Read the sample inside a ``with`` block so the handle is not leaked.
    with open(self.sample_html, 'r') as sample:
        html = sample.read()

    data = KenyaParser.convert_html_to_data(html=html)

    # Whilst developing the code this proved useful
    # out = open( self.expected_data_json, 'w')
    # json_string = json.dumps( data, sort_keys=True, indent=4 )
    # json_string = re.sub(r" +\n", "\n", json_string) # trim trailing whitespace
    # json_string += "\n"
    # out.write( json_string )
    # out.close()

    # Same for the expected-data fixture.
    with open(self.expected_data_json, 'r') as fixture:
        expected = json.loads(fixture.read())

    self.assertEqual(data['transcript'], expected['transcript'])

    # FIXME
    self.assertEqual(data['meta'], expected['meta'])