def handle_noargs(self, **options):
    """Parse every source that still requires processing.

    For each source returned by
    ``Source.objects.all().requires_processing()`` the attempt time is
    stored, the source's file is fetched, and the file is passed through
    the ``KenyaParser`` pipeline (PDF -> HTML -> data -> entries).

    Args:
        **options: Command options. Only ``verbosity`` is read; a value
            of 2 or more prints one progress line per source.
    """
    # The verbosity cannot change between sources, so parse it once rather
    # than on every iteration. Falls back to 1 when the option is absent
    # (assumed to match the management command default -- TODO confirm).
    verbose = int(options.get('verbosity', 1)) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            # One parenthesised argument prints the same text whether
            # print is a statement or a function.
            print("Looking at %s" % source)

        # The attempt time is saved before any parsing starts, so a parsing
        # failure below happens after this save() call.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()
        html = KenyaParser.convert_pdf_to_html(pdf)
        data = KenyaParser.convert_html_to_data(html)
        KenyaParser.create_entries_from_data_and_source(data, source)
def handle_noargs(self, **options):
    """Run the Kenya parser over each source awaiting processing.

    Iterates ``Source.objects.all().requires_processing()``. Every source
    has ``last_processing_attempt`` set to the current time and is saved,
    then its file is converted to HTML, the HTML to data, and entries are
    created from that data.

    Args:
        **options: Command options. Only ``verbosity`` is consulted; 2 or
            higher enables a progress message for each source.
    """
    # Loop-invariant, so evaluated a single time up front. A missing
    # 'verbosity' key is treated as 1 instead of failing in int(None)
    # (1 is assumed to be the framework default -- TODO confirm).
    show_progress = int(options.get('verbosity', 1)) >= 2

    pending_sources = Source.objects.all().requires_processing()
    for source in pending_sources:
        if show_progress:
            # Written so it is valid as both a print statement and a call.
            print("Looking at %s" % source)

        # Stamp and save first: the parsing calls below run only after the
        # attempt has been recorded.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()
        html = KenyaParser.convert_pdf_to_html(pdf)
        data = KenyaParser.convert_html_to_data(html)
        KenyaParser.create_entries_from_data_and_source(data, source)
def handle_noargs(self, **options):
    """Parse every source that still requires processing.

    For each source returned by
    ``Source.objects.all().requires_processing()`` the attempt time is
    stored, the source's file is fetched, and the file is passed through
    the ``KenyaParser`` pipeline (PDF -> HTML -> data -> entries).

    Args:
        **options: Command options. Only ``verbosity`` is read; a value
            of 2 or more prints one progress line per source.

    Raises:
        Exception: Anything raised by the ``KenyaParser`` calls is
            re-raised unchanged after a message naming the file has been
            printed, so the first failure stops the run.
    """
    # A missing 'verbosity' key is treated as 1 instead of failing in
    # int(None) (1 is assumed to be the framework default -- TODO confirm).
    verbose = int(options.get('verbosity', 1)) >= 2

    for source in Source.objects.all().requires_processing():
        if verbose:
            message = "{0}: Looking at {1}"
            # One parenthesised argument prints the same text whether
            # print is a statement or a function.
            print(message.format(source.list_page, source))

        # The attempt time is saved before any parsing starts, so a parsing
        # failure below happens after this save() call.
        source.last_processing_attempt = datetime.datetime.now()
        source.save()

        pdf = source.file()

        try:
            html = KenyaParser.convert_pdf_to_html(pdf)
            data = KenyaParser.convert_html_to_data(html)
            KenyaParser.create_entries_from_data_and_source(data, source)
        except Exception:
            # Say which file failed, then let the original exception (and
            # its traceback) propagate via the bare raise.
            print("There was an exception when parsing {0}".format(pdf))
            raise
def test_converting_html_to_data(self):
    """Check convert_html_to_data output against the stored JSON fixture.

    Reads the sample HTML from ``self.sample_html``, converts it with
    ``KenyaParser.convert_html_to_data`` and compares the ``transcript``
    and ``meta`` parts with the JSON stored at ``self.expected_data_json``.
    """
    # Context managers close the files even when an assertion fails.
    with open(self.sample_html, 'r') as html_file:
        html = html_file.read()

    data = KenyaParser.convert_html_to_data(html=html)

    # Whilst developing the code this proved useful
    # out = open( self.expected_data_json, 'w')
    # json_string = json.dumps( data, sort_keys=True, indent=4 )
    # json_string = re.sub(r" +\n", "\n", json_string) # trim trailing whitespace
    # json_string += "\n"
    # out.write( json_string )
    # out.close()

    with open(self.expected_data_json, 'r') as expected_file:
        expected = json.load(expected_file)

    self.assertEqual(data['transcript'], expected['transcript'])

    # FIXME
    self.assertEqual(data['meta'], expected['meta'])
def test_converting_html_to_data(self):
    """Test the convert_html_to_data function against the expected JSON.

    The HTML at ``self.sample_html`` is parsed with
    ``KenyaParser.convert_html_to_data``; its ``transcript`` and ``meta``
    values must equal those loaded from ``self.expected_data_json``.
    """
    # Read the sample input; the with-block guarantees the handle is closed.
    with open(self.sample_html, 'r') as sample:
        html = sample.read()

    data = KenyaParser.convert_html_to_data(html=html)

    # Whilst developing the code this proved useful
    # out = open( self.expected_data_json, 'w')
    # json_string = json.dumps( data, sort_keys=True, indent=4 )
    # json_string = re.sub(r" +\n", "\n", json_string) # trim trailing whitespace
    # json_string += "\n"
    # out.write( json_string )
    # out.close()

    # Load the expected result, again without leaking the file handle.
    with open(self.expected_data_json, 'r') as fixture:
        expected = json.load(fixture)

    self.assertEqual(data['transcript'], expected['transcript'])

    # FIXME
    self.assertEqual(data['meta'], expected['meta'])