Beispiel #1
0
def download_parse_pdf(pdf_url):
    """Download a BORME PDF from boe.es, parse it and return its JSON data.

    The PDF is stored under ``/tmp`` using the last path component of
    ``pdf_url`` as file name, parsed with ``bormeparser`` (section A) and
    dumped to ``<pdf>.json``; that JSON file is then loaded and returned.
    Both temporary files are removed afterwards, also when parsing or
    loading fails.

    :param pdf_url: URL path appended verbatim to ``https://www.boe.es``
    :return: the object produced by ``json.load`` on the generated JSON file
    """
    pdf_path = f"/tmp/{pdf_url.split('/')[-1]}"
    json_path = f"{pdf_path}.json"

    raw_pdf = requests.get(f'https://www.boe.es{pdf_url}',
                           allow_redirects=True)
    try:
        # Context manager guarantees the PDF is flushed and closed before
        # the parser opens it.
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(raw_pdf.content)

        bormeparser.parse(pdf_path,
                          bormeparser.SECCION.A).to_json(json_path)

        with open(json_path) as f:
            return json.load(f)
    finally:
        # Never leave temporary files behind, whatever step failed.
        for path in (json_path, pdf_path):
            if os.path.exists(path):
                os.remove(path)
Beispiel #2
0
def import_borme_pdf(filename, create_json=True):
    """
    Parse a BORME PDF file and import its contents into the database.

    Any exception raised while parsing, importing or writing the JSON file
    is logged and swallowed; in that case the counters keep whatever value
    they had when the failure happened.

    :param filename: path of the BORME PDF to import
    :param create_json: when true, also dump the parsed BORME as
        ``<cve>.json`` inside the directory given by ``get_borme_json_path``
    :return: tuple ``(True, results)``; the first element is always True
    """
    counter_names = (
        'created_anuncios',
        'created_bormes',
        'created_companies',
        'created_persons',
        'errors',
    )
    results = dict.fromkeys(counter_names, 0)

    try:
        borme = bormeparser.parse(filename)
        results = _import1(borme)
        if create_json:
            target_dir = get_borme_json_path(borme.date)
            os.makedirs(target_dir, exist_ok=True)
            borme.to_json(os.path.join(target_dir, '%s.json' % borme.cve))
    except Exception as exc:
        logger.error('[X] Error grave en bormeparser.parse(): %s' % filename)
        logger.error('[X] %s: %s' % (exc.__class__.__name__, exc))

    # Only report when at least one counter moved away from zero.
    if any(count != 0 for count in results.values()):
        print_results(results, borme)
    return True, results
Beispiel #3
0
def from_pdf_file(filename, create_json=True):
    """Import a BORME-PDF file into the database.

    Exceptions raised while parsing, importing or writing the JSON file are
    logged and swallowed.

    :param filename: File to import
    :param create_json: Create the BORME-JSON file as an intermediate step
    :type filename: str
    :type create_json: bool
    :rtype: (bool, dict)
    """
    counter_names = (
        'created_anuncios',
        'created_bormes',
        'created_companies',
        'created_persons',
        'errors',
    )
    results = dict.fromkeys(counter_names, 0)

    try:
        borme = bormeparser.parse(filename, bormeparser.SECCION.A)
        results = _from_instance(borme)
        if create_json:
            target_dir = get_borme_json_path(borme.date)
            os.makedirs(target_dir, exist_ok=True)
            borme.to_json(os.path.join(target_dir, '%s.json' % borme.cve))
    except Exception as exc:
        logger.error('[X] Error grave (III) en bormeparser.parse(): %s' % filename)
        logger.error('[X] %s: %s' % (exc.__class__.__name__, exc))

    # Only report when at least one counter moved away from zero.
    if any(count != 0 for count in results.values()):
        _print_results(results, borme)
    return True, results
Beispiel #4
0
def from_pdf_file(filename, create_json=True):
    """Import a BORME-PDF file into the database.

    Exceptions raised while parsing, importing or writing the JSON file are
    logged and swallowed; the first element of the returned tuple is always
    True.

    :param filename: File to import
    :param create_json: Create the BORME-JSON file as an intermediate step
    :type filename: str
    :type create_json: bool
    :rtype: (bool, dict)
    """
    results = dict.fromkeys(
        ('created_anuncios', 'created_bormes', 'created_companies',
         'created_persons', 'errors'),
        0)

    try:
        borme = bormeparser.parse(filename, bormeparser.SECCION.A)
        results = _from_instance(borme)
        if create_json:
            json_dir = get_borme_json_path(borme.date)
            os.makedirs(json_dir, exist_ok=True)
            json_name = '%s.json' % borme.cve
            borme.to_json(os.path.join(json_dir, json_name))
    except Exception as exc:
        logger.error('[X] Error grave (III) en bormeparser.parse(): %s' %
                     filename)
        logger.error('[X] %s: %s' % (exc.__class__.__name__, exc))

    # Skip the report when every counter is still zero.
    nothing_happened = all(count == 0 for count in results.values())
    if not nothing_happened:
        _print_results(results, borme)
    return True, results
    def setUpClass(cls):
        """Parse the sample BORME PDF and import it once for the test class.

        The value returned by ``_import1`` is published through the
        module-level ``results`` global.
        """
        global results

        super(TestBormeModel, cls).setUpClass()

        pdf_path = os.path.expanduser('~/.bormes/pdf/2015/02/10/BORME-A-2015-27-10.pdf')
        parsed = bormeparser.parse(pdf_path, bormeparser.SECCION.A)
        results = _import1(parsed)
    def setUpClass(cls):
        """Import the sample BORME PDF once and expose the outcome globally.

        The parsed BORME (section A) is passed to ``_import1`` and the value
        it returns is stored in the module-level ``results`` global.
        """
        global results

        super(TestBormeModel, cls).setUpClass()

        sample_pdf = os.path.expanduser(
            '~/.bormes/pdf/2015/02/10/BORME-A-2015-27-10.pdf')
        results = _import1(
            bormeparser.parse(sample_pdf, bormeparser.SECCION.A))
 def run(self):
     """Consume ``(pdf_path, json_path)`` jobs from ``self.queue`` forever.

     Each PDF is parsed as section A and dumped to its JSON path. A failure
     is printed and does not stop the loop; ``task_done`` is called for
     every job either way.
     """
     while True:
         job = self.queue.get()
         pdf_path, json_path = job
         print('Creating %s ...' % json_path)
         try:
             parsed = bormeparser.parse(pdf_path, bormeparser.SECCION.A)
             parsed.to_json(json_path)
             print('{cve}: OK'.format(cve=parsed.cve))
         except Exception as exc:
             pdf_name = os.path.basename(pdf_path)
             print('ERROR: {} ({})'.format(pdf_name, exc))
         self.queue.task_done()
Beispiel #8
0
 def run(self):
     """Worker loop: convert queued BORME PDFs to JSON, one job at a time.

     Jobs are ``(pdf_path, json_path)`` pairs taken from ``self.queue``.
     Errors are reported on stdout and the job is still marked as done.
     """
     while True:
         source_pdf, target_json = self.queue.get()
         print('Creating %s ...' % target_json)
         try:
             document = bormeparser.parse(source_pdf, bormeparser.SECCION.A)
             document.to_json(target_json)
             print('{cve}: OK'.format(cve=document.cve))
         except Exception as error:
             print('ERROR: {} ({})'.format(os.path.basename(source_pdf), error))
         self.queue.task_done()
Beispiel #9
0
def import_borme_pdf(filename, create_json=True):
    """
    Parse a BORME PDF (section A) and import it into the database.

    Exceptions raised while parsing, importing or writing the JSON file are
    logged and swallowed.

    :param filename: path of the BORME PDF to import
    :param create_json: when true, also write ``<cve>.json`` into the
        directory given by ``get_borme_json_path``
    :return: tuple ``(True, results)``; the first element is always True
    """
    results = dict.fromkeys(
        ('created_anuncios', 'created_bormes', 'created_companies', 'created_persons', 'errors'),
        0)

    try:
        borme = bormeparser.parse(filename, bormeparser.SECCION.A)
        results = _import1(borme)
        if create_json:
            json_dir = get_borme_json_path(borme.date)
            os.makedirs(json_dir, exist_ok=True)
            borme.to_json(os.path.join(json_dir, '%s.json' % borme.cve))
    except Exception as exc:
        logger.error('[X] Error grave en bormeparser.parse(): %s' % filename)
        logger.error('[X] %s: %s' % (exc.__class__.__name__, exc))

    # Report only if some counter is non-zero.
    if any(count != 0 for count in results.values()):
        print_results(results, borme)
    return True, results
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import bormeparser
import bormeparser.borme
import logging
import os
import sys


if __name__ == '__main__':
    # Command-line entry point: convert one BORME PDF into a JSON file that
    # is written to the current working directory.

    if len(sys.argv) == 1:
        # Interpolate the program name; the message used to print a raw '%s'.
        print('Usage: %s <filename.pdf> [--debug]' % sys.argv[0])
        sys.exit(1)

    # set logger DEBUG
    if len(sys.argv) == 3 and sys.argv[2] == '--debug':
        bormeparser.borme.logger.setLevel(logging.DEBUG)

    # Output file name: input base name with its extension swapped for
    # '.json'. splitext only touches the final extension, so names such as
    # 'x.PDF' or 'a.pdf.pdf' can no longer map back onto the input file.
    filename = os.path.splitext(os.path.basename(sys.argv[1]))[0] + '.json'
    borme = bormeparser.parse(sys.argv[1])
    borme.to_json(filename)

    print()
    print('Created %s' % os.path.abspath(filename))
Beispiel #11
0
def _import_borme_download_range2(begin, end, seccion, local_only, strict=False, create_json=True):
    """Import into the database the BORMEs of every day from begin to end.

    For each day the BORME-XML index is loaded from disk (or fetched with
    ``BormeXML.from_date`` when the file is missing or has no
    ``next_borme``). The BORME files of that day are then parsed, either
    freshly downloaded PDFs or, with ``local_only``, already present JSON/PDF
    files, and each one is imported with ``_import1``. Log output of each day
    is additionally written to per-day info/error files under
    ``settings.BORME_LOG_ROOT``.

    :param begin: first date to import
    :param end: last date to import (inclusive)
    :param seccion: BORME section passed to the downloader and the parser
    :param local_only: do not download anything, only use local files
    :param strict: stop at the first serious error
    :param create_json: write a BORME-JSON file for every imported BORME
    :return: ``(False, totals)`` when aborted because of ``strict``,
        ``(True, totals)`` otherwise
    """
    next_date = begin
    total_results = {'created_anuncios': 0, 'created_bormes': 0, 'created_companies': 0, 'created_persons': 0,
                     'total_anuncios': 0, 'total_bormes': 0, 'total_companies': 0, 'total_persons': 0, 'errors': 0}
    # Counters reported by _import1 that are accumulated into the totals.
    import_keys = ('created_anuncios', 'created_bormes', 'created_companies', 'created_persons',
                   'total_companies', 'total_persons', 'errors')
    total_start_time = time.time()

    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                if bxml.next_borme is None:
                    # Cached index has no link to the next day: refresh it.
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)

            except FileNotFoundError:
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', '%02d-%02d' % (bxml.date.year, bxml.date.month))
            os.makedirs(logpath, exist_ok=True)

            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)

            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            # try/finally so the per-day handlers are detached and closed on
            # every exit path (early return, exception, Ctrl-C).
            try:
                json_path = get_borme_json_path(bxml.date)
                pdf_path = get_borme_pdf_path(bxml.date)
                os.makedirs(pdf_path, exist_ok=True)
                logger.info('============================================================')
                logger.info('Ran import_borme_download at %s' % timezone.now())
                logger.info('  Import date: %s. Section: %s' % (bxml.date.isoformat(), seccion))
                logger.info('============================================================')

                print('\nPATH: %s\nDATE: %s\nSECCION: %s\n' % (pdf_path, bxml.date, seccion))

                bormes = []
                if not local_only:
                    _, files = bxml.download_borme(pdf_path, seccion=seccion)

                    for filepath in files:
                        # Files ending in -99.pdf are deliberately not parsed.
                        if filepath.endswith('-99.pdf'):
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger.error('[X] Una vez arreglado, reanuda la importación:')
                                logger.error('[X]   python manage.py importbormetoday local')
                                return False, total_results

                else:
                    cves = bxml.get_cves(bormeparser.SECCION.A)
                    files_json = [os.path.join(json_path, '%s.json' % cve) for cve in cves]
                    files_pdf = [os.path.join(pdf_path, '%s.pdf' % cve) for cve in cves]

                    if files_exist(files_json):
                        for filepath in files_json:
                            logger.info('%s' % filepath)
                            total_results['total_bormes'] += 1
                            try:
                                bormes.append(bormeparser.Borme.from_json(filepath))
                            except Exception as e:
                                logger.error('[X] Error grave (I) en bormeparser.Borme.from_json(): %s' % filepath)
                                logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                                if strict:
                                    logger.error('[X] Una vez arreglado, reanuda la importación:')
                                    logger.error('[X]   python manage.py importbormetoday local')  # TODO: --from date
                                    return False, total_results
                    elif files_exist(files_pdf):
                        for filepath in files_pdf:
                            logger.info('%s' % filepath)
                            total_results['total_bormes'] += 1
                            try:
                                bormes.append(bormeparser.parse(filepath, seccion))
                            except Exception as e:
                                logger.error('[X] Error grave (II) en bormeparser.parse(): %s' % filepath)
                                logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                                if strict:
                                    logger.error('[X] Una vez arreglado, reanuda la importación:')
                                    logger.error('[X]   python manage.py importbormetoday local')  # TODO: --from date
                                    return False, total_results
                    else:
                        logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                        logger.error('[X] JSON: %s' % ' '.join(files_json))
                        logger.error('[X] PDF: %s' % ' '.join(files_pdf))
                        if strict:
                            return False, total_results

                        # Best effort: import whatever JSON files do exist.
                        for filepath in files_json:
                            if not os.path.exists(filepath):
                                # logger.warn is a deprecated alias of warning.
                                logger.warning('[X] Missing JSON: %s' % filepath)
                                continue
                            logger.info('%s' % filepath)
                            total_results['total_bormes'] += 1
                            try:
                                bormes.append(bormeparser.Borme.from_json(filepath))
                            except Exception as e:
                                logger.error('[X] Error grave (II) en bormeparser.Borme.from_json(): %s' % filepath)
                                logger.error('[X] %s: %s' % (e.__class__.__name__, e))

                for borme in sorted(bormes):
                    total_results['total_anuncios'] += len(borme.get_anuncios())
                    start_time = time.time()
                    try:
                        results = _import1(borme)
                    except Exception as e:
                        logger.error('[%s] Error grave en _import1:' % borme.cve)
                        logger.error('[%s] %s' % (borme.cve, e))
                        logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                        logger.error('[%s]   python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                        if strict:
                            logger.error('[%s] Una vez arreglado, reanuda la importación:' % borme.cve)
                            logger.error('[%s]   python manage.py importbormetoday local' % borme.cve)
                            return False, total_results
                        # Non-strict mode: there is no result for this BORME.
                        # Count the failure and move on instead of reusing an
                        # unbound or stale `results` below.
                        total_results['errors'] += 1
                        continue

                    if create_json:
                        os.makedirs(json_path, exist_ok=True)
                        json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                        borme.to_json(json_filepath)

                    for key in import_keys:
                        total_results[key] += results[key]

                    if any(value != 0 for value in total_results.values()):
                        print_results(results, borme)
                        elapsed_time = time.time() - start_time
                        logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))
            finally:
                # Remove handlers and release their file descriptors
                for handler in (fh1, fh2):
                    logger.removeHandler(handler)
                    handler.close()

            next_date = bxml.next_borme
    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info('\nBORMEs creados: %d/%d' % (total_results['created_bormes'], total_results['total_bormes']))
    logger.info('Anuncios creados: %d/%d' % (total_results['created_anuncios'], total_results['total_anuncios']))
    logger.info('Empresas creadas: %d/%d' % (total_results['created_companies'], total_results['total_companies']))
    logger.info('Personas creadas: %d/%d' % (total_results['created_persons'], total_results['total_persons']))
    logger.info('Total elapsed time: %.2f seconds' % elapsed_time)

    return True, total_results
Beispiel #12
0
def _import_borme_download_range(begin, end, seccion, local_only,
                                 strict=False, create_json=True):
    """Import the BORMEs of a date range.

    Iterates over the date range. For each day:
    * Builds the BORME file names from the BORME-XML file
    * Loads the BORME-JSON files, or the BORME-PDF ones if no JSON exists
    * Imports the BORME data into the database

    Log output of each day is additionally written to per-day info/error
    files under ``settings.BORME_LOG_ROOT``; those handlers are detached and
    closed when the day is finished, however it finishes.

    :param begin: Date to import from
    :param end: Date to import up to (inclusive)
    :param seccion: BORME section
    :param local_only: Do not download files, only process local ones
    :param strict: Abort the process as soon as an error is found
    :param create_json: Create the BORME-JSON file
    :type begin: datetime.date
    :type end: datetime.date
    :type seccion: bormeparser.SECCION
    :type local_only: bool
    :type strict: bool
    :type create_json: bool

    :rtype: (bool, dict)
    """
    next_date = begin
    total_results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'total_anuncios': 0,
        'total_bormes': 0,
        'total_companies': 0,
        'total_persons': 0,
        'errors': 0
    }
    total_start_time = time.time()

    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                if bxml.next_borme is None:
                    # Cached index has no link to the next day: refresh it.
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)

            except OSError:
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers
            directory = '%02d-%02d' % (bxml.date.year, bxml.date.month)
            logpath = os.path.join(settings.BORME_LOG_ROOT,
                                   'imports', directory)
            os.makedirs(logpath, exist_ok=True)

            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)

            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            # try/finally so the per-day handlers are detached and closed on
            # every exit path (early return, exception, Ctrl-C).
            try:
                json_path = get_borme_json_path(bxml.date)
                pdf_path = get_borme_pdf_path(bxml.date)
                os.makedirs(pdf_path, exist_ok=True)
                logger.info(
                        "===================================================\n"
                        "Ran import_borme_download at {now}\n"
                        "  Import date: {borme_date}. Section: {section}\n"
                        "==================================================="
                        .format(now=timezone.now(), section=seccion,
                                borme_date=bxml.date.isoformat()))
                print("\nPATH: {}"
                      "\nDATE: {}"
                      "\nSECCION: {}\n"
                      .format(pdf_path, bxml.date, seccion))

                bormes = []
                if not local_only:
                    _, files = bxml.download_borme(pdf_path, seccion=seccion)

                    for filepath in files:
                        # Files ending in -99.pdf are deliberately not parsed.
                        if filepath.endswith('-99.pdf'):
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error('[X] Error grave (I) en bormeparser.parse(): %s' % filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger_resume_import()
                                return False, total_results

                else:
                    files_json, files_pdf = _generate_borme_files_list(bxml,
                                                                       json_path,
                                                                       pdf_path)

                    if files_exist(files_json):
                        bormes, err = _load_and_append(files_json, strict)
                        total_results["total_bormes"] += len(files_json)

                        if err:
                            return False, total_results

                    elif files_exist(files_pdf):
                        bormes, err = _load_and_append(files_pdf, strict, seccion)
                        total_results["total_bormes"] += len(files_pdf)

                        if err:
                            return False, total_results
                    else:
                        logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
                        logger.error('[X] JSON: %s' % ' '.join(files_json))
                        logger.error('[X] PDF: %s' % ' '.join(files_pdf))

                        if strict:
                            return False, total_results

                        # Non-strict: load the PDFs anyway; the error flag is
                        # intentionally ignored here.
                        bormes, err = _load_and_append(files_pdf, strict, seccion)
                        total_results["total_bormes"] += len(files_pdf)

                for borme in sorted(bormes):
                    total_results['total_anuncios'] += len(borme.get_anuncios())
                    start_time = time.time()
                    try:
                        results = _from_instance(borme)
                    except Exception as e:
                        logger.error('[%s] Error grave en _from_instance:' % borme.cve)
                        logger.error('[%s] %s' % (borme.cve, e))
                        logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
                        logger.error('[%s]   python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
                        if strict:
                            logger_resume_import(cve=borme.cve)
                            return False, total_results
                        # Non-strict mode: there is no result for this BORME.
                        # Count the failure and move on instead of reusing an
                        # unbound or stale `results` below.
                        total_results['errors'] += 1
                        continue

                    if create_json:
                        os.makedirs(json_path, exist_ok=True)
                        json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
                        borme.to_json(json_filepath)

                    # NOTE(review): _from_instance presumably does not report
                    # every total_* counter (confirm); missing keys count as 0
                    # rather than raising KeyError.
                    for key in total_results:
                        total_results[key] += results.get(key, 0)

                    if any(value != 0 for value in total_results.values()):
                        _print_results(results, borme)
                        elapsed_time = time.time() - start_time
                        logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))
            finally:
                # Remove handlers and release their file descriptors
                for handler in (fh1, fh2):
                    logger.removeHandler(handler)
                    handler.close()

            next_date = bxml.next_borme
    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info("\nBORMEs creados: {created_bormes}/{total_bormes}\n"
                "Anuncios creados: {created_anuncios}/{total_anuncios}\n"
                "Empresas creadas: {created_companies}/{total_companies}\n"
                "Personas creadas: {created_persons}/{total_persons}"
                .format(**total_results))
    logger.info("Total elapsed time: %.2f seconds" % elapsed_time)

    return True, total_results
Beispiel #13
0
import os

if __name__ == '__main__':
    # Command-line entry point: parse one BORME A PDF and dump it as JSON.
    cli = argparse.ArgumentParser(
        description='Convert BORME A PDF files to JSON.')
    cli.add_argument('filename', help='BORME A PDF filename')
    cli.add_argument('--debug', action='store_true', default=False,
                     help='Debug mode')
    cli.add_argument(
        '-o', '--output',
        help='Output directory or filename (default is current directory)')
    args = cli.parse_args()

    # set logger DEBUG (Not working)
    if args.debug:
        bormeparser.borme.logger.setLevel(logging.DEBUG)
        # FIXME: DEFAULT_PARSER
        bormeparser.backends.pypdf2.parser.logger.setLevel(logging.DEBUG)

    print('\nParsing {}'.format(args.filename))
    borme = bormeparser.parse(args.filename, bormeparser.SECCION.A)
    json_path = borme.to_json(args.output)

    # to_json returns a falsy value when no file was written.
    if not json_path:
        print('Error creating JSON for {}'.format(args.filename))
    else:
        print('Created {}'.format(os.path.abspath(json_path)))
Beispiel #14
0
def _import_borme_download_range(begin,
                                 end,
                                 seccion,
                                 local_only,
                                 strict=False,
                                 create_json=True):
    """Import the BORMEs of a date range.

    Iterates over the date range. For each day:
    * Builds the BORME file names from the BORME-XML file
    * Loads the BORME-JSON files, or the BORME-PDF ones if no JSON exists
    * Imports the BORME data into the database

    Log output of each day is additionally written to per-day info/error
    files under ``settings.BORME_LOG_ROOT``; those handlers are detached and
    closed when the day is finished, however it finishes.

    :param begin: Date to import from
    :param end: Date to import up to (inclusive)
    :param seccion: BORME section
    :param local_only: Do not download files, only process local ones
    :param strict: Abort the process as soon as an error is found
    :param create_json: Create the BORME-JSON file
    :type begin: datetime.date
    :type end: datetime.date
    :type seccion: bormeparser.SECCION
    :type local_only: bool
    :type strict: bool
    :type create_json: bool

    :rtype: (bool, dict)
    """
    next_date = begin
    total_results = {
        'created_anuncios': 0,
        'created_bormes': 0,
        'created_companies': 0,
        'created_persons': 0,
        'total_anuncios': 0,
        'total_bormes': 0,
        'total_companies': 0,
        'total_persons': 0,
        'errors': 0
    }
    total_start_time = time.time()

    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                if bxml.next_borme is None:
                    # Cached index has no link to the next day: refresh it.
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)

            except OSError:
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers
            directory = '%02d-%02d' % (bxml.date.year, bxml.date.month)
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports',
                                   directory)
            os.makedirs(logpath, exist_ok=True)

            fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
            fh1 = logging.FileHandler(fh1_path)
            fh1.setLevel(logging.INFO)
            logger.addHandler(fh1)

            fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
            fh2 = logging.FileHandler(fh2_path)
            fh2.setLevel(logging.WARNING)
            logger.addHandler(fh2)

            # try/finally so the per-day handlers are detached and closed on
            # every exit path (early return, exception, Ctrl-C).
            try:
                json_path = get_borme_json_path(bxml.date)
                pdf_path = get_borme_pdf_path(bxml.date)
                os.makedirs(pdf_path, exist_ok=True)
                logger.info(
                    "===================================================\n"
                    "Ran import_borme_download at {now}\n"
                    "  Import date: {borme_date}. Section: {section}\n"
                    "===================================================".format(
                        now=timezone.now(),
                        section=seccion,
                        borme_date=bxml.date.isoformat()))
                print("\nPATH: {}"
                      "\nDATE: {}"
                      "\nSECCION: {}\n".format(pdf_path, bxml.date, seccion))

                bormes = []
                if not local_only:
                    _, files = bxml.download_borme(pdf_path, seccion=seccion)

                    for filepath in files:
                        # Files ending in -99.pdf are deliberately not parsed.
                        if filepath.endswith('-99.pdf'):
                            continue
                        logger.info('%s' % filepath)
                        total_results['total_bormes'] += 1
                        try:
                            bormes.append(bormeparser.parse(filepath, seccion))
                        except Exception as e:
                            logger.error(
                                '[X] Error grave (I) en bormeparser.parse(): %s' %
                                filepath)
                            logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                            if strict:
                                logger_resume_import()
                                return False, total_results

                else:
                    files_json, files_pdf = _generate_borme_files_list(
                        bxml, json_path, pdf_path)

                    if files_exist(files_json):
                        bormes, err = _load_and_append(files_json, strict)
                        total_results["total_bormes"] += len(files_json)

                        if err:
                            return False, total_results

                    elif files_exist(files_pdf):
                        bormes, err = _load_and_append(files_pdf, strict, seccion)
                        total_results["total_bormes"] += len(files_pdf)

                        if err:
                            return False, total_results
                    else:
                        logger.error(
                            '[X] Faltan archivos PDF y JSON que no se desea descargar.'
                        )
                        logger.error('[X] JSON: %s' % ' '.join(files_json))
                        logger.error('[X] PDF: %s' % ' '.join(files_pdf))

                        if strict:
                            return False, total_results

                        # Non-strict: load the PDFs anyway; the error flag is
                        # intentionally ignored here.
                        bormes, err = _load_and_append(files_pdf, strict, seccion)
                        total_results["total_bormes"] += len(files_pdf)

                for borme in sorted(bormes):
                    total_results['total_anuncios'] += len(borme.get_anuncios())
                    start_time = time.time()
                    try:
                        results = _from_instance(borme)
                    except Exception as e:
                        logger.error('[%s] Error grave en _from_instance:' %
                                     borme.cve)
                        logger.error('[%s] %s' % (borme.cve, e))
                        logger.error(
                            '[%s] Prueba importar manualmente en modo detallado para ver el error:'
                            % borme.cve)
                        logger.error(
                            '[%s]   python manage.py importbormepdf %s -v 3' %
                            (borme.cve, borme.filename))
                        if strict:
                            logger_resume_import(cve=borme.cve)
                            return False, total_results
                        # Non-strict mode: there is no result for this BORME.
                        # Count the failure and move on instead of reusing an
                        # unbound or stale `results` below.
                        total_results['errors'] += 1
                        continue

                    if create_json:
                        os.makedirs(json_path, exist_ok=True)
                        json_filepath = os.path.join(json_path,
                                                     '%s.json' % borme.cve)
                        borme.to_json(json_filepath)

                    # NOTE(review): _from_instance presumably does not report
                    # every total_* counter (confirm); missing keys count as 0
                    # rather than raising KeyError.
                    for key in total_results:
                        total_results[key] += results.get(key, 0)

                    if any(value != 0 for value in total_results.values()):
                        _print_results(results, borme)
                        elapsed_time = time.time() - start_time
                        logger.info('[%s] Elapsed time: %.2f seconds' %
                                    (borme.cve, elapsed_time))
            finally:
                # Remove handlers and release their file descriptors
                for handler in (fh1, fh2):
                    logger.removeHandler(handler)
                    handler.close()

            next_date = bxml.next_borme
    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info("\nBORMEs creados: {created_bormes}/{total_bormes}\n"
                "Anuncios creados: {created_anuncios}/{total_anuncios}\n"
                "Empresas creadas: {created_companies}/{total_companies}\n"
                "Personas creadas: {created_persons}/{total_persons}".format(
                    **total_results))
    logger.info("Total elapsed time: %.2f seconds" % elapsed_time)

    return True, total_results
    def setUpClass(cls):
        """Import the sample BORME PDF once and keep the outcome on the class.

        The value returned by ``_from_instance`` is stored as
        ``TestBormeModel.results``.
        """
        super(TestBormeModel, cls).setUpClass()

        sample_pdf = os.path.join(FILES_PATH, 'BORME-A-2015-27-10.pdf')
        parsed = bormeparser.parse(sample_pdf, bormeparser.SECCION.A)
        TestBormeModel.results = _from_instance(parsed)
Beispiel #16
0
 def setUpClass(cls):
     """Parse the sample section C HTML file and store it as ``cls.borme``."""
     sample_path = os.path.join(EXAMPLES_PATH, 'BORME-C-2011-20488.html')
     cls.borme = bormeparser.parse(sample_path, SECCION.C)
Beispiel #17
0
 def setUpClass(cls):
     """Parse the sample section A PDF file and store it as ``cls.borme``."""
     sample_path = os.path.join(EXAMPLES_PATH, 'BORME-A-2015-27-10.pdf')
     cls.borme = bormeparser.parse(sample_path, SECCION.A)
 def setUpClass(cls):
     """Load the section C example document once for all tests of the class.

     The parsed result is kept in ``cls.borme``.
     """
     html_example = os.path.join(EXAMPLES_PATH, 'BORME-C-2011-20488.html')
     cls.borme = bormeparser.parse(html_example, SECCION.C)
Beispiel #19
0
def _import_borme_download_range2(begin, end, seccion, local_only, strict=False, create_json=True):
    """Import into the database every BORME published from ``begin`` to ``end``.

    Starting at ``begin``, the day's BORME-XML index is read with
    ``BormeXML.from_file``; when the file is missing, or the cached index has
    no ``next_borme``, it is rebuilt with ``BormeXML.from_date`` and saved.
    The day's BORMEs are then loaded and imported, and the loop moves on to
    ``bxml.next_borme`` until that is empty or greater than ``end``.

    While a day is being processed, two log files
    (``<day>_info.txt`` and ``<day>_error.txt``) under
    ``settings.BORME_LOG_ROOT/imports/<year>-<month>`` are attached to the
    module logger. They are always detached and closed when the day ends,
    including on a strict abort or a ``KeyboardInterrupt``.

    :param begin: First publication date to import.
    :param end: Last publication date to import (inclusive, compared with ``<=``).
    :param seccion: BORME section, forwarded to the downloader and the parser.
    :param local_only: If true, do not download; use the JSON or PDF files on disk.
    :param strict: Stop at the first serious error ("Para en caso de error grave").
    :param create_json: Write a BORME-JSON file for every BORME processed.
    :rtype: (bool, dict)
    :returns: ``(False, totals)`` when ``strict`` aborted the import, otherwise
        ``(True, totals)``. A ``KeyboardInterrupt`` is logged and still
        reported as ``True``.
    """
    next_date = begin
    total_results = {'created_anuncios': 0, 'created_bormes': 0, 'created_companies': 0, 'created_persons': 0,
                     'total_anuncios': 0, 'total_bormes': 0, 'total_companies': 0, 'total_persons': 0, 'errors': 0}
    total_start_time = time.time()

    try:
        while next_date and next_date <= end:
            xml_path = get_borme_xml_filepath(next_date)
            try:
                bxml = BormeXML.from_file(xml_path)
                if bxml.next_borme is None:
                    # The cached index does not know the following publication
                    # date, so rebuild it and overwrite the cached copy.
                    bxml = BormeXML.from_date(next_date)
                    os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                    bxml.save_to_file(xml_path)

            except FileNotFoundError:
                bxml = BormeXML.from_date(next_date)
                os.makedirs(os.path.dirname(xml_path), exist_ok=True)
                bxml.save_to_file(xml_path)

            # Add FileHandlers
            logpath = os.path.join(settings.BORME_LOG_ROOT, 'imports', '%02d-%02d' % (bxml.date.year, bxml.date.month))
            os.makedirs(logpath, exist_ok=True)

            day_handlers = []
            try:
                fh1_path = os.path.join(logpath, '%02d_info.txt' % bxml.date.day)
                fh1 = logging.FileHandler(fh1_path)
                fh1.setLevel(logging.INFO)
                logger.addHandler(fh1)
                day_handlers.append(fh1)

                fh2_path = os.path.join(logpath, '%02d_error.txt' % bxml.date.day)
                fh2 = logging.FileHandler(fh2_path)
                fh2.setLevel(logging.WARNING)
                logger.addHandler(fh2)
                day_handlers.append(fh2)

                if not _import_borme_download_range2_day(bxml, seccion, local_only, strict,
                                                         create_json, total_results):
                    return False, total_results
            finally:
                # Remove handlers. Doing it here (and closing them) keeps the
                # logger clean and the files released on every exit path.
                for handler in day_handlers:
                    logger.removeHandler(handler)
                    handler.close()

            next_date = bxml.next_borme
    except KeyboardInterrupt:
        logger.info('\nImport aborted.')

    elapsed_time = time.time() - total_start_time
    logger.info('\nBORMEs creados: %d/%d' % (total_results['created_bormes'], total_results['total_bormes']))
    logger.info('Anuncios creados: %d/%d' % (total_results['created_anuncios'], total_results['total_anuncios']))
    logger.info('Empresas creadas: %d/%d' % (total_results['created_companies'], total_results['total_companies']))
    logger.info('Personas creadas: %d/%d' % (total_results['created_persons'], total_results['total_persons']))
    logger.info('Total elapsed time: %.2f seconds' % elapsed_time)

    return True, total_results


def _import_borme_download_range2_day(bxml, seccion, local_only, strict, create_json, total_results):
    """Load and import all the BORMEs listed in one day's BORME-XML index.

    :param bxml: The day's ``BormeXML`` index.
    :param seccion: BORME section, forwarded to the downloader and the parser.
    :param local_only: If true, read JSON/PDF files from disk instead of downloading.
    :param strict: Abort on the first serious error.
    :param create_json: Write a BORME-JSON file for every BORME processed.
    :param total_results: Running counters; updated in place.
    :rtype: bool
    :returns: ``False`` when ``strict`` is set and an error stopped the import,
        ``True`` otherwise.
    """
    json_path = get_borme_json_path(bxml.date)
    pdf_path = get_borme_pdf_path(bxml.date)
    os.makedirs(pdf_path, exist_ok=True)
    logger.info('============================================================')
    logger.info('Ran import_borme_download at %s' % timezone.now())
    logger.info('  Import date: %s. Section: %s' % (bxml.date.isoformat(), seccion))
    logger.info('============================================================')

    print('\nPATH: %s\nDATE: %s\nSECCION: %s\n' % (pdf_path, bxml.date, seccion))

    bormes = []
    if not local_only:
        _, files = bxml.download_borme(pdf_path, seccion=seccion)

        for filepath in files:
            # NOTE(review): '-99.pdf' is presumably the day's index/summary
            # document rather than a provincial BORME -- confirm.
            if filepath.endswith('-99.pdf'):
                continue
            logger.info('%s' % filepath)
            total_results['total_bormes'] += 1
            try:
                bormes.append(bormeparser.parse(filepath, seccion))
            except Exception as e:
                logger.error('[X] Error grave en bormeparser.parse(): %s' % filepath)
                logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                if strict:
                    logger.error('[X] Una vez arreglado, reanuda la importación:')
                    logger.error('[X]   python manage.py importbormetoday local')
                    return False

    else:
        # NOTE(review): the section is hard-coded here while every other call
        # uses ``seccion`` -- confirm whether this is intentional.
        cves = bxml.get_cves(bormeparser.SECCION.A)
        files_json = [os.path.join(json_path, '%s.json' % cve) for cve in cves]
        files_pdf = [os.path.join(pdf_path, '%s.pdf' % cve) for cve in cves]

        if files_exist(files_json):
            for filepath in files_json:
                logger.info('%s' % filepath)
                total_results['total_bormes'] += 1
                try:
                    bormes.append(bormeparser.Borme.from_json(filepath))
                except Exception as e:
                    logger.error('[X] Error grave en bormeparser.Borme.from_json(): %s' % filepath)
                    logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                    if strict:
                        logger.error('[X] Una vez arreglado, reanuda la importación:')
                        logger.error('[X]   python manage.py importbormetoday local')  # TODO: --from date
                        return False
        elif files_exist(files_pdf):
            for filepath in files_pdf:
                logger.info('%s' % filepath)
                total_results['total_bormes'] += 1
                try:
                    bormes.append(bormeparser.parse(filepath, seccion))
                except Exception as e:
                    logger.error('[X] Error grave en bormeparser.parse(): %s' % filepath)
                    logger.error('[X] %s: %s' % (e.__class__.__name__, e))
                    if strict:
                        logger.error('[X] Una vez arreglado, reanuda la importación:')
                        logger.error('[X]   python manage.py importbormetoday local')  # TODO: --from date
                        return False
        else:
            logger.error('[X] Faltan archivos PDF y JSON que no se desea descargar.')
            logger.error('[X] JSON: %s' % ' '.join(files_json))
            logger.error('[X] PDF: %s' % ' '.join(files_pdf))
            if strict:
                return False

            # Best effort: import whichever JSON files are present.
            for filepath in files_json:
                if not os.path.exists(filepath):
                    logger.warning('[X] Missing JSON: %s' % filepath)
                    continue
                logger.info('%s' % filepath)
                total_results['total_bormes'] += 1
                try:
                    bormes.append(bormeparser.Borme.from_json(filepath))
                except Exception as e:
                    logger.error('[X] Error grave en bormeparser.Borme.from_json(): %s' % filepath)
                    logger.error('[X] %s: %s' % (e.__class__.__name__, e))

    tallied_keys = ('created_anuncios', 'created_bormes', 'created_companies', 'created_persons',
                    'total_companies', 'total_persons', 'errors')

    for borme in sorted(bormes):
        total_results['total_anuncios'] += len(borme.get_anuncios())
        start_time = time.time()
        results = None
        try:
            results = _import1(borme)
        except Exception as e:
            logger.error('[%s] Error grave en _import1:' % borme.cve)
            logger.error('[%s] %s' % (borme.cve, e))
            logger.error('[%s] Prueba importar manualmente en modo detallado para ver el error:' % borme.cve)
            logger.error('[%s]   python manage.py importbormepdf %s -v 3' % (borme.cve, borme.filename))
            if strict:
                logger.error('[%s] Una vez arreglado, reanuda la importación:' % borme.cve)
                logger.error('[%s]   python manage.py importbormetoday local' % borme.cve)
                return False

        if create_json:
            os.makedirs(json_path, exist_ok=True)
            json_filepath = os.path.join(json_path, '%s.json' % borme.cve)
            borme.to_json(json_filepath)

        if results is None:
            # _import1 failed in non-strict mode: there is nothing to tally.
            # Without this guard the code would hit an unbound ``results`` or
            # re-add the previous BORME's counters.
            continue

        for key in tallied_keys:
            total_results[key] += results[key]

        # NOTE(review): this tests the running totals, which are already
        # non-zero once any file has been counted; ``results`` may have been
        # intended -- confirm before changing.
        if any(value != 0 for value in total_results.values()):
            print_results(results, borme)
            elapsed_time = time.time() - start_time
            logger.info('[%s] Elapsed time: %.2f seconds' % (borme.cve, elapsed_time))

    return True
 def setUpClass(cls):
     """Load the section A sample (PDF) into ``cls.borme`` via ``bormeparser.parse``."""
     sample = os.path.join(EXAMPLES_PATH, 'BORME-A-2015-27-10.pdf')
     parsed = bormeparser.parse(sample, SECCION.A)
     cls.borme = parsed
import bormeparser.backends.pypdf2.parser

from bormeparser.backends.defaults import OPTIONS
OPTIONS['SANITIZE_COMPANY_NAME'] = True

import argparse
import logging
import os


if __name__ == '__main__':
    # Command-line entry point: parse one BORME A PDF and write it out as JSON.
    cli = argparse.ArgumentParser(description='Convert BORME A PDF files to JSON.')
    cli.add_argument('filename', help='BORME A PDF filename')
    cli.add_argument('--debug', action='store_true', default=False, help='Debug mode')
    cli.add_argument('-o', '--output', help='Output directory or filename (default is current directory)')
    options = cli.parse_args()

    # set logger DEBUG (Not working)
    if options.debug:
        bormeparser.borme.logger.setLevel(logging.DEBUG)
        bormeparser.backends.pypdf2.parser.logger.setLevel(logging.DEBUG)  # FIXME: DEFAULT_PARSER

    print('\nParsing {}'.format(options.filename))
    parsed = bormeparser.parse(options.filename, bormeparser.SECCION.A)
    json_file = parsed.to_json(options.output)

    # to_json() hands back the written path; a falsy value is reported as failure.
    if not json_file:
        print('Error creating JSON for {}'.format(options.filename))
    else:
        print('Created {}'.format(os.path.abspath(json_file)))
Beispiel #22
0
    def setUpClass(cls):
        """Import the sample BORME-A PDF and expose the outcome to the tests.

        Whatever ``_from_instance`` returns for the parsed file is saved as
        ``TestBormeModel.results``.
        """
        super(TestBormeModel, cls).setUpClass()

        sample_pdf = os.path.join(FILES_PATH, 'BORME-A-2015-27-10.pdf')
        TestBormeModel.results = _from_instance(
            bormeparser.parse(sample_pdf, bormeparser.SECCION.A))