Example #1
    def run(self):
        self.base = DBMeta.get(DBMeta.key == 'base').value or 'LEGI'
        self.unknown_folders = defaultdict(lambda: 0)
        self.liste_suppression = []
        self.counts = defaultdict(lambda: 0)
        self.skipped = 0

        entries_counts = self.get_entries_count()
        chunks = self.get_chunks(entries_counts)
        if len(chunks) > 1:
            print(f"big archive will be processed in {len(chunks)} chunks...")
        for chunk_idx, chunk in enumerate(chunks):
            chunk_counts, chunk_skipped = self.process_chunk(chunk_idx, chunk)
            merge_counts(chunk_counts, chunk_skipped, self.counts,
                         self.skipped)
        if self.liste_suppression:
            db = connect_db(self.db_url)
            db_proxy.initialize(db)
            suppress(self.base, db, self.liste_suppression)
        print("made %s changes in the database:" % sum(self.counts.values()),
              json.dumps(self.counts, indent=4, sort_keys=True))
        if self.skipped:
            print("skipped", self.skipped, "files that haven't changed")
        if self.unknown_folders:
            for d, x in self.unknown_folders.items():
                print("skipped", x, "files in unknown folder `%s`" % d)
Example #2
def process_xml_jobs_batch(jobs_args_batch, db_url):
    # this will be run in a separate process, so we need to initialize our own DB connection
    db = connect_db(db_url)
    db_proxy.initialize(db)
    batch_counts, batch_skipped = process_xml_jobs_sync(jobs_args_batch,
                                                        db=db,
                                                        commit=False)
    db.commit()  # a single commit per batch keeps the number of DB commits down
    return batch_counts, batch_skipped
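
For context, connect_db and db_proxy are not shown in these examples; the code above matches peewee's deferred-initialization idiom, so a minimal sketch could look like the following (the module layout and the connect_db body are assumptions, not the project's actual code):

from peewee import DatabaseProxy, Model, SqliteDatabase

db_proxy = DatabaseProxy()  # models are bound to the proxy at import time


class BaseModel(Model):
    class Meta:
        database = db_proxy  # resolved later by db_proxy.initialize(db)


def connect_db(db_url):
    # hypothetical: a real implementation would also handle non-SQLite URLs
    return SqliteDatabase(db_url.replace('sqlite:///', ''))

Each worker process calls db_proxy.initialize(connect_db(db_url)) itself because the parent's connection cannot be shared across process boundaries.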
Example #3
def process_xml_batch(process_xml_jobs_args, db_url):
    if MAX_PROCESSES != 1 and len(process_xml_jobs_args) > 10 * PARALLEL_BATCH_SIZE:
        return process_xml_jobs_in_parallel(process_xml_jobs_args, db_url)
    else:
        db = connect_db(db_url)
        return process_xml_jobs_sync(process_xml_jobs_args,
                                     db=db,
                                     commit=True,
                                     progress=True)
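
process_xml_jobs_in_parallel itself is not shown; below is a minimal sketch of how it could fan batches out to worker processes running process_xml_jobs_batch. The batching and merging logic is an assumption; only MAX_PROCESSES, PARALLEL_BATCH_SIZE and process_xml_jobs_batch appear in the examples above.

from collections import defaultdict
from functools import partial
from multiprocessing import Pool


def process_xml_jobs_in_parallel_sketch(jobs_args, db_url):
    # split the job list into batches that each worker commits as a whole
    batches = [jobs_args[i:i + PARALLEL_BATCH_SIZE]
               for i in range(0, len(jobs_args), PARALLEL_BATCH_SIZE)]
    counts, skipped = defaultdict(int), 0
    with Pool(processes=MAX_PROCESSES) as pool:
        worker = partial(process_xml_jobs_batch, db_url=db_url)
        for batch_counts, batch_skipped in pool.imap_unordered(worker, batches):
            for key, value in batch_counts.items():
                counts[key] += value
            skipped += batch_skipped
    return counts, skipped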
Example #4
def test_normalize():
    db = connect_db(':memory:')
    db_proxy.initialize(db)
    for row in DATA:
        TexteVersion.insert(**row).execute()
    normalize_text_titles(db)

    data_brutes = list(TexteVersionBrute.select().order_by(
        TexteVersionBrute.rowid))
    data_norm = list(TexteVersion.select().order_by(TexteVersion.rowid))

    assert len(data_brutes) == 6

    assert data_brutes[0].bits == 23
    assert data_norm[0].nature == "DECRET"
    assert data_norm[0].titre == "Décret n° 75-96 du 18 février 1975"
    assert data_norm[0].titrefull == "Décret n° 75-96 du 18 février 1975 fixant les modalités de …"

    assert data_brutes[1].bits == 63
    assert data_norm[1].nature == "DECISION"
    assert data_norm[1].titre == "Décision du Conseil d'État n° 344021, 344022 du 28 juin 2013"
    assert data_norm[1].titrefull == "Décision du Conseil d'État n° 344021, 344022 du 28 juin 2013 statuant au contentieux"
    assert data_norm[1].autorite == "CONSEIL D'ETAT"
    assert data_norm[1].num == "344021, 344022"
    assert data_norm[1].date_texte == "2013-06-28"

    assert data_brutes[2].bits == 7
    assert data_norm[2].nature == "LOI_ORGANIQUE"
    assert data_norm[2].titre == "Loi organique n° 2016-1086 du 8 août 2016"
    assert data_norm[2].titrefull == "Loi organique n° 2016-1086 du 8 août 2016 relative à la nomination …"

    assert data_brutes[3].bits == 4
    assert data_norm[3].titrefull == "Code minier (nouveau)"

    assert data_brutes[4].bits == 6
    assert data_norm[4].titre == "Arrêté du 18 décembre 2014"
    assert data_norm[4].titrefull == "Arrêté du 18 décembre 2014 modifiant …"

    assert data_brutes[5].bits == 4
    assert data_norm[5].titrefull == "Arrêté du 5 septembre 2002"
Example #5
    file_path = f"active_idccs_{human_date}.csv"
    with open(file_path, 'w', newline='') as f:
        writer = csv.DictWriter(
            f, fieldnames=["idcc", "name", "group", "in_kali"])
        writer.writeheader()
        writer.writerows(new_rows)
    print(f"wrote {len(new_rows)} IDCCs to {file_path}")


def idcc_to_group(idcc):
    if 0 <= idcc <= 3999:
        return 'DGT'
    elif 5000 <= idcc <= 5999:
        return 'DARES'
    elif 7000 <= idcc <= 9999:
        return 'AGRICULTURE'


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('--db-url')
    p.add_argument('--identify-missing', action='store_true')
    args = p.parse_args()

    db = connect_db(args.db_url)
    db_proxy.initialize(db)

    xls_sheet = download_file_and_open_xls_sheet()
    active_idccs = get_active_idccs(xls_sheet)
    if args.identify_missing:
        identify_missing(xls_sheet, active_idccs)
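
A quick illustration of idcc_to_group above (the IDCC values are picked arbitrarily to land in each range):

assert idcc_to_group(1234) == 'DGT'
assert idcc_to_group(5042) == 'DARES'
assert idcc_to_group(8112) == 'AGRICULTURE'
assert idcc_to_group(4500) is None  # codes between the ranges fall through to None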
Example #6
def run_importer(db_url=DEFAULT_OPTIONS["db-url"],
                 base=DEFAULT_OPTIONS["base"],
                 raw=DEFAULT_OPTIONS["raw"],
                 anomalies=DEFAULT_OPTIONS["anomalies"],
                 anomalies_dir=DEFAULT_OPTIONS["anomalies-dir"],
                 skip_links=DEFAULT_OPTIONS["skip-links"],
                 dumps_directory=DEFAULT_OPTIONS["dumps-dir"]):
    db = connect_db(db_url)
    db_proxy.initialize(db)

    db_meta_base = DBMeta.get_or_none(key='base')
    base_from_db = db_meta_base.value if db_meta_base else None
    db_meta_last_update = DBMeta.get_or_none(key='last_update')
    last_update = db_meta_last_update.value if db_meta_last_update else None

    if not base_from_db:
        DBMeta.create(key='base', value=base)
    elif base and base != base_from_db:
        print(
            f"!> Wrong database: requested '{base}' but existing database is '{base_from_db}'"
        )
        raise SystemExit(1)

    if base != 'LEGI' and not raw:
        print(
            "!> You need to use the --raw option when working with bases other than LEGI."
        )
        raise SystemExit(1)

    if base != 'LEGI' and anomalies:
        print("!> The --anomalies option can only be used with the LEGI base")
        raise SystemExit(1)

    # Check and record the data mode
    db_meta_raw = DBMeta.get_or_none(key='raw')
    db_meta_raw = db_meta_raw.value if db_meta_raw else None
    if raw:
        versions_brutes = bool(TexteVersionBrute.get_or_none())
        data_is_not_raw = versions_brutes or db_meta_raw is False
        if data_is_not_raw:
            print(
                "!> Can't honor --raw option, the data has already been modified previously."
            )
            raise SystemExit(1)
    if db_meta_raw != raw:
        DBMeta.insert(key='raw', value=raw) \
            .on_conflict(conflict_target=[DBMeta.key], preserve=[DBMeta.value]) \
            .execute()

    # Handle the --skip-links option
    has_links = bool(Lien.get_or_none())
    if not skip_links and not has_links and last_update is not None:
        skip_links = True
        print(
            "> Warning: links will not be processed because this DB was built with --skip-links."
        )
    elif skip_links and has_links:
        print("> Deleting links...")
        Lien.delete().execute()

    # Look for new archives in the given dumps_directory
    print("> last_update is", last_update)
    archive_re = re.compile(
        r'(.+_)?' + base.lower() +
        r'(?P<global>_global|_)?_(?P<date>[0-9]{8}-[0-9]{6})\..+',
        flags=re.IGNORECASE)
    skipped = 0
    archives = sorted([
        (m.group('date'), bool(m.group('global')), m.group(0)) for m in [
            archive_re.match(fn) for fn in os.listdir(dumps_directory)
            if fnmatch(fn.lower(), '*' + base.lower() + '_*.tar.*')
        ]
    ])
    most_recent_global = [t[0] for t in archives if t[1]][-1]
    if last_update and most_recent_global > last_update:
        print(
            "> There is a new global archive, recreating the DB from scratch!")
        raise Exception("not implemented yet")
        # db.close()
        # os.rename(db.address, db.address + '.back')
        # db = connect_db(db_name, pragmas=args.pragma)
    archives, skipped = partition(
        archives,
        lambda t: t[0] >= most_recent_global and t[0] > (last_update or ''))
    if skipped:
        print("> Skipped %i old archives" % len(skipped))

    # Process the new archives
    for archive_date, is_global, archive_name in archives:
        print("> Processing %s..." % archive_name)
        archive_processor = ArchiveProcessor(db,
                                             db_url,
                                             dumps_directory + '/' + archive_name,
                                             process_links=not skip_links)
        archive_processor.run()
        DBMeta.insert(key='last_update', value=archive_date) \
            .on_conflict(conflict_target=[DBMeta.key], preserve=[DBMeta.value]) \
            .execute()
        last_update = archive_date
        print('last_update is now set to', last_update)

        # Detect anomalies if requested
        if anomalies:
            if not os.path.isdir(anomalies_dir):
                os.mkdir(anomalies_dir)
            fpath = anomalies_dir + '/anomalies-' + last_update + '.txt'
            with open(fpath, 'w') as f:
                n_anomalies = detect_anomalies(db, f)
            print("logged", n_anomalies, "anomalies in", fpath)

    postprocess(db, base)

    if not raw:
        from .normalize import normalize_text_titles
        normalize_text_titles(db)
        from .factorize import main as factorize
        factorize(db)
        from .normalize import normalize_article_numbers
        normalize_article_numbers(db)
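
The partition helper used when selecting archives is not shown either; a plausible sketch (an assumption, not necessarily the project's implementation) splits a sequence into the items that satisfy a predicate and the rest:

def partition_sketch(items, predicate):
    kept, dropped = [], []
    for item in items:
        (kept if predicate(item) else dropped).append(item)
    return kept, dropped

With the lambda above, the first list holds archives at least as recent as the latest global dump and newer than last_update; the second is only used to report how many old archives were skipped.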
Example #7
                LEFT JOIN articles ON articles.id = hierarchie.element
                WHERE SUBSTR(hierarchie.element, 5, 4) = 'ARTI'
            """)
        for article in articles:
            bloc_textuel, titre, num = article
            article_counts[conteneur_id] += 1
            if bloc_textuel is None:
                continue
            soup = BeautifulSoup(bloc_textuel, 'html.parser')
            text = soup.text
            if not text:
                continue
            txt_file.write(text + '\n')
            count = len(text.split())
            word_counts[conteneur_id] += count
        txt_file.close()
        csv_writer.writerow([
            conteneur_id, conteneur_num, conteneur_titre, textes_count,
            article_counts[conteneur_id], word_counts[conteneur_id]
        ])
    csv_file.close()


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('db')
    p.add_argument('--output-dir', default=DEFAULT_OUTPUT_DIR)
    args = p.parse_args()
    db = connect_db(args.db)
    write_csv(db, args.output_dir)
Example #8
def test_factorize():
    db = connect_db('sqlite:///tests_factorize.sqlite')
    main(db)