def download_parse_pdf(pdf_url):
    """Download a BORME PDF from boe.es, parse section A and return the
    parsed content as a dict.

    :param pdf_url: URL path of the PDF on boe.es (e.g. '/borme/dias/....pdf')
    :type pdf_url: str
    :rtype: dict
    """
    raw_pdf = requests.get(f'https://www.boe.es{pdf_url}', allow_redirects=True)
    pdf_path = f"/tmp/{pdf_url.split('/')[-1]}"
    json_path = f"{pdf_path}.json"
    # BUG FIX: the original used open(...).write(...) and leaked the file
    # handle; a context manager guarantees it is closed.
    with open(pdf_path, 'wb') as f:
        f.write(raw_pdf.content)
    try:
        bormeparser.parse(pdf_path, bormeparser.SECCION.A).to_json(json_path)
        with open(json_path) as f:
            data = json.load(f)
    finally:
        # BUG FIX: clean up the temporary files even when parsing raises;
        # the original only removed them on the success path.
        for path in (json_path, pdf_path):
            if os.path.exists(path):
                os.remove(path)
    return data
def import_borme_pdf(filename, create_json=True):
    """Import a BORME PDF file into the database.

    :param filename: path of the BORME PDF to import
    :param create_json: also dump the parsed BORME as a JSON file
    :rtype: (bool, dict)
    """
    counter_names = ('created_anuncios', 'created_bormes',
                     'created_companies', 'created_persons', 'errors')
    results = dict.fromkeys(counter_names, 0)
    try:
        borme = bormeparser.parse(filename)
        results = _import1(borme)
        if create_json:
            json_path = get_borme_json_path(borme.date)
            os.makedirs(json_path, exist_ok=True)
            borme.to_json(os.path.join(json_path, '%s.json' % borme.cve))
    except Exception as e:
        logger.error('[X] Error grave en bormeparser.parse(): %s' % filename)
        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
    # Print a summary only when at least one counter is non-zero (if parsing
    # failed every counter is still 0, so ``borme`` is never referenced).
    if any(value != 0 for value in results.values()):
        print_results(results, borme)
    return True, results
def from_pdf_file(filename, create_json=True):
    """Import a BORME-PDF file into the DB.

    :param filename: file to import
    :param create_json: create a BORME-JSON as an intermediate step
    :type filename: str
    :type create_json: bool
    :rtype: (bool, dict)
    """
    results = dict.fromkeys(('created_anuncios', 'created_bormes',
                             'created_companies', 'created_persons',
                             'errors'), 0)
    try:
        borme = bormeparser.parse(filename, bormeparser.SECCION.A)
        results = _from_instance(borme)
        if create_json:
            json_path = get_borme_json_path(borme.date)
            os.makedirs(json_path, exist_ok=True)
            borme.to_json(os.path.join(json_path, '%s.json' % borme.cve))
    except Exception as e:
        logger.error('[X] Error grave (III) en bormeparser.parse(): %s'
                     % filename)
        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
    # Only print the summary when something was counted; on a parse failure
    # all counters are still 0 and ``borme`` is never referenced.
    if any(count != 0 for count in results.values()):
        _print_results(results, borme)
    return True, results
def setUpClass(cls):
    """Parse the fixture BORME PDF once for the whole TestCase and import
    it, caching the import counters in the module-level ``results``.
    """
    super(TestBormeModel, cls).setUpClass()
    global results
    pdf_file = '~/.bormes/pdf/2015/02/10/BORME-A-2015-27-10.pdf'
    borme = bormeparser.parse(os.path.expanduser(pdf_file),
                              bormeparser.SECCION.A)
    results = _import1(borme)
def setUpClass(cls):
    # Runs once for the whole TestCase: parse the fixture BORME PDF and
    # import it, storing the import counters in the module-level
    # ``results`` so every test method can inspect them.
    super(TestBormeModel, cls).setUpClass()
    global results
    # NOTE(review): depends on a pre-downloaded PDF in the user's home
    # directory -- the test cannot run if the fixture is missing.
    path = os.path.expanduser(
        '~/.bormes/pdf/2015/02/10/BORME-A-2015-27-10.pdf')
    borme = bormeparser.parse(path, bormeparser.SECCION.A)
    results = _import1(borme)
def run(self):
    """Worker loop: pull (pdf_path, json_path) jobs from the queue,
    convert each PDF to JSON and mark the job done.

    Loops forever; the owner is expected to use daemon threads or join
    the queue.
    """
    while True:
        pdf_path, json_path = self.queue.get()
        print('Creating %s ...' % json_path)
        try:
            parsed = bormeparser.parse(pdf_path, bormeparser.SECCION.A)
            parsed.to_json(json_path)
            print('{cve}: OK'.format(cve=parsed.cve))
        except Exception as e:
            # Report the failure but keep the worker alive for the
            # remaining jobs.
            print('ERROR: {} ({})'.format(os.path.basename(pdf_path), e))
        self.queue.task_done()
def import_borme_pdf(filename, create_json=True):
    """Import a BORME PDF file (section A) into the database.

    :param filename: path of the BORME PDF to import
    :param create_json: also dump the parsed BORME as a JSON file
    :rtype: (bool, dict)
    """
    results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'errors': 0,
    }
    try:
        borme = bormeparser.parse(filename, bormeparser.SECCION.A)
        results = _import1(borme)
        if create_json:
            json_path = get_borme_json_path(borme.date)
            os.makedirs(json_path, exist_ok=True)
            borme.to_json(os.path.join(json_path, '%s.json' % borme.cve))
    except Exception as e:
        logger.error('[X] Error grave en bormeparser.parse(): %s' % filename)
        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
    # Summary only when something was counted; on failure all counters are
    # still zero so ``borme`` is never referenced here.
    if any(count != 0 for count in results.values()):
        print_results(results, borme)
    return True, results
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import bormeparser
import bormeparser.borme

import logging
import os
import sys


if __name__ == '__main__':
    # CLI: convert one BORME PDF into a JSON file in the current directory.
    if len(sys.argv) == 1:
        # BUG FIX: the original passed no argument to the '%s' format
        # string, so the literal '%s' was printed instead of the program
        # name.
        print('Usage: %s <filename.pdf> [--debug]' % sys.argv[0])
        sys.exit(1)

    # set logger DEBUG
    if len(sys.argv) == 3 and sys.argv[2] == '--debug':
        bormeparser.borme.logger.setLevel(logging.DEBUG)

    # filename: same basename as the input, with a .json extension
    filename = os.path.basename(sys.argv[1]).replace('.pdf', '.json')

    borme = bormeparser.parse(sys.argv[1])
    borme.to_json(filename)

    print()
    print('Created %s' % os.path.abspath(filename))
def _import_borme_download_range2(begin, end, seccion, local_only, strict=False, create_json=True):
    """ strict: stop in case of a severe error """
    # Import every BORME between ``begin`` and ``end``: for each day it
    # resolves the day's BORME-XML index, downloads/loads the PDFs (or
    # cached JSON), imports each BORME into the DB and aggregates the
    # per-day counters into ``total_results``.  Returns (ok, counters).
    next_date = begin
    total_results = {'created_anuncios': 0,
                     'created_bormes': 0,
                     'created_companies': 0,
                     'created_persons': 0,
                     'total_anuncios': 0,
                     'total_bormes': 0,
                     'total_companies': 0,
                     'total_persons': 0,
                     'errors': 0}
    total_start_time = time.time()
    try:
        # Days are chained: each XML index points at the next BORME day.
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # Cached XML without a next_borme pointer was saved before
                # the following issue existed; refresh it from the web.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except FileNotFoundError:
                # No cached XML for this date yet: fetch and cache it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers: per-day info/error log files under
            # BORME_LOG_ROOT/imports/<YYYY-MM>/.
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', '%02d-%02d' % (bxml.date.year, bxml.date.month))
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)
            logger.info('============================================================')
            logger.info('Ran import_borme_download at %s' % timezone.now())
            logger.info(' Import date: %s. Section: %s' % (bxml.date.isoformat(), seccion))
            logger.info('============================================================')
            print('\nPATH: %s\nDATE: %s\nSECCION: %s\n' % (pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download the day's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # Files ending in -99.pdf are skipped (presumably the
                    # daily index document -- TODO confirm).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger.error('[X] Una vez arreglado, reanuda la importación:')
                            logger.error('[X] python manage.py importbormetoday local')
                            return False, total_results
            else:
                # Local-only mode: build the expected JSON/PDF file names
                # from the CVEs listed in the XML index.
                cves = bxml.get_cves(bormeparser.SECCION.A)
                files_json = list(map(lambda x: os.path.join(json_path, '%s.json' % x), cves))
                files_pdf = list(map(lambda x: os.path.join(pdf_path, '%s.pdf' % x), cves))
                if files_exist(files_json):
                    # Prefer cached JSON: cheaper than re-parsing PDFs.
                    for filepath in files_json:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave (I) en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                elif files_exist(files_pdf):
                    for filepath in files_pdf:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error('[X] Error grave (II) en bormeparser.parse(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                else:
                    # Incomplete local set: process whichever JSON files do
                    # exist, skipping the missing ones.
                    logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    for filepath in files_json:
                        if not os.path.exists(filepath):
                            # NOTE(review): Logger.warn is a deprecated
                            # alias of Logger.warning.
                            logger.warn('[X] Missing JSON: %s' % filepath)
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave (II) en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))

            # Import each parsed BORME and accumulate its counters.
            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _import1(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _import1:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                    logger.error('[%s] python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                    if strict:
                        logger.error('[%s] Una vez arreglado, reanuda la importación:' % borme.cve)
                        logger.error('[%s] python manage.py importbormetoday local' % borme.cve)
                        return False, total_results
                # NOTE(review): when _import1 raises and strict is False,
                # execution falls through and ``results`` below is either
                # stale (previous iteration) or undefined (first
                # iteration, UnboundLocalError) -- looks like a latent bug;
                # confirm and consider adding ``continue`` above.
                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                total_results['created_anuncios'] += results['created_anuncios']
                total_results['created_bormes'] += results['created_bormes']
                total_results['created_companies'] += results['created_companies']
                total_results['created_persons'] += results['created_persons']
                total_results['total_companies'] += results['total_companies']
                total_results['total_persons'] += results['total_persons']
                total_results['errors'] += results['errors']
                if not all(map(lambda x: x == 0, total_results.values())):
                    print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Remove handlers so the next day logs to its own files.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme
    except KeyboardInterrupt:
        # Ctrl-C aborts cleanly; partial totals are still reported below.
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info('\nBORMEs creados: %d/%d' % (total_results['created_bormes'], total_results['total_bormes']))
    logger.info('Anuncios creados: %d/%d' % (total_results['created_anuncios'], total_results['total_anuncios']))
    logger.info('Empresas creadas: %d/%d' % (total_results['created_companies'], total_results['total_companies']))
    logger.info('Personas creadas: %d/%d' % (total_results['created_persons'], total_results['total_persons']))
    logger.info('Total elapsed time: %.2f seconds' % elapsed_time)
    return True, total_results
def _import_borme_download_range(begin, end, seccion, local_only,
                                 strict=False, create_json=True):
    """Import the BORMEs of a date range.

    Iterates over the date range. For each day:

    * Generates the BORME filenames from the BORME-XML file
    * Loads the BORME-JSON files, or the BORME-PDF ones if the JSON
      do not exist
    * Imports the BORME data into the DB

    :param begin: first date to import
    :param end: last date to import
    :param seccion: BORME section
    :param local_only: do not download files, only process existing ones
    :param strict: abort the process as soon as an error is found
    :param create_json: create BORME-JSON files
    :type begin: datetime.date
    :type end: datetime.date
    :type seccion: bormeparser.SECCION
    :type local_only: bool
    :type strict: bool
    :type create_json: bool
    :rtype: (bool, dict)
    """
    next_date = begin
    # Aggregated counters over the whole range.
    total_results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'total_anuncios': 0,
        'total_bormes': 0,
        'total_companies': 0,
        'total_persons': 0,
        'errors': 0
    }
    total_start_time = time.time()
    try:
        # Days are chained: each XML index points at the next BORME day.
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # A cached XML without a next_borme pointer predates the
                # following issue; refresh it from the web.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except OSError:
                # No cached XML for this date yet: fetch and cache it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers: per-day info/error log files under
            # BORME_LOG_ROOT/imports/<YYYY-MM>/.
            directory = '%02d-%02d' % (bxml.date.year, bxml.date.month)
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports',
                                   directory)
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)

            logger.info(
                "===================================================\n"
                "Ran import_borme_download at {now}\n"
                " Import date: {borme_date}. Section: {section}\n"
                "==================================================="
                .format(now=timezone.now(), section=seccion,
                        borme_date=bxml.date.isoformat()))
            print("\nPATH: {}"
                  "\nDATE: {}"
                  "\nSECCION: {}\n"
                  .format(pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download the day's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # -99.pdf files are skipped (presumably the daily
                    # index document -- TODO confirm).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger_resume_import()
                            return False, total_results
            else:
                # Local-only mode: prefer cached JSON, fall back to PDFs.
                files_json, files_pdf = _generate_borme_files_list(
                    bxml, json_path, pdf_path)
                if files_exist(files_json):
                    bormes, err = _load_and_append(files_json, strict)
                    total_results["total_bormes"] += len(files_json)
                    if err:
                        return False, total_results
                elif files_exist(files_pdf):
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)
                    if err:
                        return False, total_results
                else:
                    # Incomplete local set: report it, then best-effort load
                    # whatever PDFs exist (``err`` deliberately ignored here
                    # since strict already returned above).
                    logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)

            # Import each parsed BORME and accumulate its counters.
            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _from_instance(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _from_instance:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                    logger.error('[%s] python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                    if strict:
                        logger_resume_import(cve=borme.cve)
                        return False, total_results
                    # BUG FIX: without this ``continue`` the loop fell
                    # through and reused ``results`` from the previous
                    # iteration (or raised UnboundLocalError on the first),
                    # silently counting a failed BORME as imported.
                    continue
                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path,
                                                 '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                # NOTE(review): this sums every key, including
                # 'total_bormes' which was already counted above --
                # verify _from_instance() returns 0 for the total_* keys.
                for key in total_results.keys():
                    total_results[key] += results[key]
                if not all(map(lambda x: x == 0, total_results.values())):
                    _print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Remove handlers so the next day logs to its own files.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme
    except KeyboardInterrupt:
        # Ctrl-C aborts cleanly; partial totals are still reported below.
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info("\nBORMEs creados: {created_bormes}/{total_bormes}\n"
                "Anuncios creados: {created_anuncios}/{total_anuncios}\n"
                "Empresas creadas: {created_companies}/{total_companies}\n"
                "Personas creadas: {created_persons}/{total_persons}"
                .format(**total_results))
    logger.info("Total elapsed time: %.2f seconds" % elapsed_time)
    return True, total_results
import os

if __name__ == '__main__':
    # CLI: convert one BORME A PDF into a JSON file.
    arg_parser = argparse.ArgumentParser(
        description='Convert BORME A PDF files to JSON.')
    arg_parser.add_argument('filename', help='BORME A PDF filename')
    arg_parser.add_argument('--debug', action='store_true', default=False,
                            help='Debug mode')
    arg_parser.add_argument(
        '-o', '--output',
        help='Output directory or filename (default is current directory)')
    args = arg_parser.parse_args()

    # set logger DEBUG (Not working)
    if args.debug:
        bormeparser.borme.logger.setLevel(logging.DEBUG)
        # FIXME: DEFAULT_PARSER
        bormeparser.backends.pypdf2.parser.logger.setLevel(logging.DEBUG)

    print('\nParsing {}'.format(args.filename))
    parsed_borme = bormeparser.parse(args.filename, bormeparser.SECCION.A)
    json_path = parsed_borme.to_json(args.output)
    if json_path:
        print('Created {}'.format(os.path.abspath(json_path)))
    else:
        print('Error creating JSON for {}'.format(args.filename))
def _import_borme_download_range(begin, end, seccion, local_only,
                                 strict=False, create_json=True):
    """Import the BORMEs of a date range.

    Iterates over the date range. For each day:

    * Generates the BORME filenames from the BORME-XML file
    * Loads the BORME-JSON files, or the BORME-PDF ones if the JSON
      do not exist
    * Imports the BORME data into the DB

    :param begin: first date to import
    :param end: last date to import
    :param seccion: BORME section
    :param local_only: do not download files, only process existing ones
    :param strict: abort the process as soon as an error is found
    :param create_json: create BORME-JSON files
    :type begin: datetime.date
    :type end: datetime.date
    :type seccion: bormeparser.SECCION
    :type local_only: bool
    :type strict: bool
    :type create_json: bool
    :rtype: (bool, dict)
    """
    next_date = begin
    # Aggregated counters over the whole range.
    total_results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'total_anuncios': 0,
        'total_bormes': 0,
        'total_companies': 0,
        'total_persons': 0,
        'errors': 0
    }
    total_start_time = time.time()
    try:
        # Days are chained: each XML index points at the next BORME day.
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # A cached XML without a next_borme pointer predates the
                # following issue; refresh it from the web.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except OSError:
                # No cached XML for this date yet: fetch and cache it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers: per-day info/error log files under
            # BORME_LOG_ROOT/imports/<YYYY-MM>/.
            directory = '%02d-%02d' % (bxml.date.year, bxml.date.month)
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports',
                                   directory)
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)

            logger.info(
                "===================================================\n"
                "Ran import_borme_download at {now}\n"
                " Import date: {borme_date}. Section: {section}\n"
                "===================================================".format(
                    now=timezone.now(), section=seccion,
                    borme_date=bxml.date.isoformat()))
            print("\nPATH: {}"
                  "\nDATE: {}"
                  "\nSECCION: {}\n".format(pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download the day's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # -99.pdf files are skipped (presumably the daily
                    # index document -- TODO confirm).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error(
                            '[X] Error grave (I) en bormeparser.parse(): %s'
                            % filepath)
                        logger.error('[X] %s: %s'
                                     % (e.__class__.__name__, e))
                        if strict:
                            logger_resume_import()
                            return False, total_results
            else:
                # Local-only mode: prefer cached JSON, fall back to PDFs.
                files_json, files_pdf = _generate_borme_files_list(
                    bxml, json_path, pdf_path)
                if files_exist(files_json):
                    bormes, err = _load_and_append(files_json, strict)
                    total_results["total_bormes"] += len(files_json)
                    if err:
                        return False, total_results
                elif files_exist(files_pdf):
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)
                    if err:
                        return False, total_results
                else:
                    # Incomplete local set: report it, then best-effort load
                    # whatever PDFs exist (``err`` deliberately ignored here
                    # since strict already returned above).
                    logger.error(
                        '[X] Faltan archivos PDF y JSON que no se desea descargar.'
                    )
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    bormes, err = _load_and_append(files_pdf, strict, seccion)
                    total_results["total_bormes"] += len(files_pdf)

            # Import each parsed BORME and accumulate its counters.
            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _from_instance(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _from_instance:'
                                 % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error(
                        '[%s] Prueba importar manualmente en modo detallado para ver el error:'
                        % borme.cve)
                    logger.error(
                        '[%s] python manage.py importbormepdf %s -v 3'
                        % (borme.cve, borme.filename))
                    if strict:
                        logger_resume_import(cve=borme.cve)
                        return False, total_results
                    # BUG FIX: without this ``continue`` the loop fell
                    # through and reused ``results`` from the previous
                    # iteration (or raised UnboundLocalError on the first),
                    # silently counting a failed BORME as imported.
                    continue
                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path,
                                                 '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                # NOTE(review): this sums every key, including
                # 'total_bormes' which was already counted above --
                # verify _from_instance() returns 0 for the total_* keys.
                for key in total_results.keys():
                    total_results[key] += results[key]
                if not all(map(lambda x: x == 0, total_results.values())):
                    _print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds'
                            % (borme.cve, elapsed_time))

            # Remove handlers so the next day logs to its own files.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme
    except KeyboardInterrupt:
        # Ctrl-C aborts cleanly; partial totals are still reported below.
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info("\nBORMEs creados: {created_bormes}/{total_bormes}\n"
                "Anuncios creados: {created_anuncios}/{total_anuncios}\n"
                "Empresas creadas: {created_companies}/{total_companies}\n"
                "Personas creadas: {created_persons}/{total_persons}".format(
                    **total_results))
    logger.info("Total elapsed time: %.2f seconds" % elapsed_time)
    return True, total_results
def setUpClass(cls):
    """Parse the fixture BORME PDF once per TestCase and store the
    import counters on the class for the tests to inspect.
    """
    super(TestBormeModel, cls).setUpClass()
    pdf_file = os.path.join(FILES_PATH, 'BORME-A-2015-27-10.pdf')
    parsed = bormeparser.parse(pdf_file, bormeparser.SECCION.A)
    TestBormeModel.results = _from_instance(parsed)
def setUpClass(cls):
    """Parse the section-C HTML example once and share it via cls.borme."""
    example = os.path.join(EXAMPLES_PATH, 'BORME-C-2011-20488.html')
    cls.borme = bormeparser.parse(example, SECCION.C)
def setUpClass(cls):
    # Parse the section-A PDF example once for the whole TestCase and
    # cache the resulting object on the class for the tests to share.
    cls.borme = bormeparser.parse(
        os.path.join(EXAMPLES_PATH, 'BORME-A-2015-27-10.pdf'), SECCION.A)
def _import_borme_download_range2(begin, end, seccion, local_only, strict=False, create_json=True):
    """ strict: stop in case of a severe error """
    # Import every BORME between ``begin`` and ``end``: for each day it
    # resolves the day's BORME-XML index, downloads/loads the PDFs (or
    # cached JSON), imports each BORME into the DB and aggregates the
    # per-day counters into ``total_results``.  Returns (ok, counters).
    next_date = begin
    total_results = {'created_anuncios': 0,
                     'created_bormes': 0,
                     'created_companies': 0,
                     'created_persons': 0,
                     'total_anuncios': 0,
                     'total_bormes': 0,
                     'total_companies': 0,
                     'total_persons': 0,
                     'errors': 0}
    total_start_time = time.time()
    try:
        # Days are chained: each XML index points at the next BORME day.
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                # Cached XML without a next_borme pointer was saved before
                # the following issue existed; refresh it from the web.
                if bxml.next_borme is None:
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)
            except FileNotFoundError:
                # No cached XML for this date yet: fetch and cache it.
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers: per-day info/error log files under
            # BORME_LOG_ROOT/imports/<YYYY-MM>/.
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', '%02d-%02d' % (bxml.date.year, bxml.date.month))
            os.makedirs(logpath, exist_ok=True)
            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)
            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            json_path = get_borme_json_path(bxml.date)
            pdf_path = get_borme_pdf_path(bxml.date)
            os.makedirs(pdf_path, exist_ok=True)
            logger.info('============================================================')
            logger.info('Ran import_borme_download at %s' % timezone.now())
            logger.info(' Import date: %s. Section: %s' % (bxml.date.isoformat(), seccion))
            logger.info('============================================================')
            print('\nPATH: %s\nDATE: %s\nSECCION: %s\n' % (pdf_path, bxml.date, seccion))

            bormes = []
            if not local_only:
                # Download the day's PDFs and parse each one.
                _, files = bxml.download_borme(pdf_path, seccion=seccion)
                for filepath in files:
                    # -99.pdf files are skipped (presumably the daily
                    # index document -- TODO confirm).
                    if filepath.endswith('-99.pdf'):
                        continue
                    logger.info('%s' % filepath)
                    total_results['total_bormes'] += 1
                    try:
                        bormes.append(bormeparser.parse(filepath, seccion))
                    except Exception as e:
                        logger.error('[X] Error grave en bormeparser.parse(): %s' % filepath)
                        logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                        if strict:
                            logger.error('[X] Una vez arreglado, reanuda la importación:')
                            logger.error('[X] python manage.py importbormetoday local')
                            return False, total_results
            else:
                # Local-only mode: build the expected JSON/PDF file names
                # from the CVEs listed in the XML index.
                cves = bxml.get_cves(bormeparser.SECCION.A)
                files_json = list(map(lambda x: os.path.join(json_path, '%s.json' % x), cves))
                files_pdf = list(map(lambda x: os.path.join(pdf_path, '%s.pdf' % x), cves))
                if files_exist(files_json):
                    # Prefer cached JSON: cheaper than re-parsing PDFs.
                    for filepath in files_json:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                elif files_exist(files_pdf):
                    for filepath in files_pdf:
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error('[X] Error grave en bormeparser.parse(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X] python manage.py importbormetoday local')
                                # TODO: --from date
                                return False, total_results
                else:
                    # Incomplete local set: process whichever JSON files do
                    # exist, skipping the missing ones.
                    logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                    logger.error('[X] JSON: %s' % ' '.join(files_json))
                    logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                    if strict:
                        return False, total_results
                    for filepath in files_json:
                        if not os.path.exists(filepath):
                            # NOTE(review): Logger.warn is a deprecated
                            # alias of Logger.warning.
                            logger.warn('[X] Missing JSON: %s' % filepath)
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.Borme.from_json(filepath))
                        except Exception as e:
                            logger.error('[X] Error grave en bormeparser.Borme.from_json(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))

            # Import each parsed BORME and accumulate its counters.
            for borme in sorted(bormes):
                total_results['total_anuncios'] += len(borme.get_anuncios())
                start_time = time.time()
                try:
                    results = _import1(borme)
                except Exception as e:
                    logger.error('[%s] Error grave en _import1:' % borme.cve)
                    logger.error('[%s] %s' % (borme.cve, e))
                    logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                    logger.error('[%s] python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                    if strict:
                        logger.error('[%s] Una vez arreglado, reanuda la importación:' % borme.cve)
                        logger.error('[%s] python manage.py importbormetoday local' % borme.cve)
                        return False, total_results
                # NOTE(review): when _import1 raises and strict is False,
                # execution falls through and ``results`` below is either
                # stale (previous iteration) or undefined (first
                # iteration, UnboundLocalError) -- looks like a latent bug;
                # confirm and consider adding ``continue`` above.
                if create_json:
                    os.makedirs(json_path, exist_ok=True)
                    json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                    borme.to_json(json_filepath)

                total_results['created_anuncios'] += results['created_anuncios']
                total_results['created_bormes'] += results['created_bormes']
                total_results['created_companies'] += results['created_companies']
                total_results['created_persons'] += results['created_persons']
                total_results['total_companies'] += results['total_companies']
                total_results['total_persons'] += results['total_persons']
                total_results['errors'] += results['errors']
                if not all(map(lambda x: x == 0, total_results.values())):
                    print_results(results, borme)
                elapsed_time = time.time() - start_time
                logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

            # Remove handlers so the next day logs to its own files.
            logger.removeHandler(fh1)
            logger.removeHandler(fh2)
            next_date = bxml.next_borme
    except KeyboardInterrupt:
        # Ctrl-C aborts cleanly; partial totals are still reported below.
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info('\nBORMEs creados: %d/%d' % (total_results['created_bormes'], total_results['total_bormes']))
    logger.info('Anuncios creados: %d/%d' % (total_results['created_anuncios'], total_results['total_anuncios']))
    logger.info('Empresas creadas: %d/%d' % (total_results['created_companies'], total_results['total_companies']))
    logger.info('Personas creadas: %d/%d' % (total_results['created_persons'], total_results['total_persons']))
    logger.info('Total elapsed time: %.2f seconds' % elapsed_time)
    return True, total_results
import bormeparser.backends.pypdf2.parser
from bormeparser.backends.defaults import OPTIONS

# Must be set before parsing so company names are normalised by the backend.
OPTIONS['SANITIZE_COMPANY_NAME'] = True

import argparse
import logging
import os

if __name__ == '__main__':
    # CLI: convert one BORME A PDF into a JSON file.
    parser = argparse.ArgumentParser(description='Convert BORME A PDF files to JSON.')
    parser.add_argument('filename', help='BORME A PDF filename')
    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode')
    parser.add_argument('-o', '--output', help='Output directory or filename (default is current directory)')
    args = parser.parse_args()

    # set logger DEBUG (Not working)
    if args.debug:
        # NOTE(review): relies on ``bormeparser.borme`` having been imported
        # as a side effect of the backend import above -- verify.
        bormeparser.borme.logger.setLevel(logging.DEBUG)
        bormeparser.backends.pypdf2.parser.logger.setLevel(logging.DEBUG)  # FIXME: DEFAULT_PARSER

    print('\nParsing {}'.format(args.filename))
    borme = bormeparser.parse(args.filename, bormeparser.SECCION.A)
    # to_json() returns the written path, or a falsy value on failure.
    path = borme.to_json(args.output)
    if path:
        print('Created {}'.format(os.path.abspath(path)))
    else:
        print('Error creating JSON for {}'.format(args.filename))