def run(self):
    self.base = DBMeta.get(DBMeta.key == 'base').value or 'LEGI'
    self.unknown_folders = defaultdict(lambda: 0)
    self.liste_suppression = []
    self.counts = defaultdict(lambda: 0)
    self.skipped = 0
    entries_counts = self.get_entries_count()
    chunks = self.get_chunks(entries_counts)
    if len(chunks) > 1:
        print(f"big archive will be processed in {len(chunks)} chunks...")
    for chunk_idx, chunk in enumerate(chunks):
        chunk_counts, chunk_skipped = self.process_chunk(chunk_idx, chunk)
        merge_counts(chunk_counts, chunk_skipped, self.counts, self.skipped)
    if self.liste_suppression:
        db = connect_db(self.db_url)
        db_proxy.initialize(db)
        suppress(self.base, self.db, self.liste_suppression)
    print("made %s changes in the database:" % sum(self.counts.values()),
          json.dumps(self.counts, indent=4, sort_keys=True))
    if self.skipped:
        print("skipped", self.skipped, "files that haven't changed")
    if self.unknown_folders:
        for d, x in self.unknown_folders.items():
            print("skipped", x, "files in unknown folder `%s`" % d)
def process_xml_jobs_batch(jobs_args_batch, db_url):
    # This runs in a separate process, so we need to init our own DB connection.
    db = connect_db(db_url)
    db_proxy.initialize(db)
    batch_counts, batch_skipped = process_xml_jobs_sync(jobs_args_batch, db=db, commit=False)
    db.commit()  # a single commit per batch limits commits to the DB
    return batch_counts, batch_skipped
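
# Illustrative sketch only: one way process_xml_jobs_batch could be fanned out across
# worker processes. The wrapper function, pool size and batch size below are assumptions,
# not part of this codebase; the real dispatch happens in ArchiveProcessor.process_chunk.
from collections import defaultdict
from multiprocessing import Pool

def process_xml_jobs_parallel(jobs_args, db_url, processes=4, batch_size=500):
    # Split the job list into batches so each worker commits once per batch.
    batches = [jobs_args[i:i + batch_size] for i in range(0, len(jobs_args), batch_size)]
    counts, skipped = defaultdict(int), 0
    with Pool(processes=processes) as pool:
        results = pool.starmap(process_xml_jobs_batch, [(batch, db_url) for batch in batches])
    # Merge per-batch counters into the overall totals.
    for batch_counts, batch_skipped in results:
        for key, value in batch_counts.items():
            counts[key] += value
        skipped += batch_skipped
    return counts, skipped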
def test_normalize():
    db = connect_db(':memory:')
    db_proxy.initialize(db)
    for row in DATA:
        TexteVersion.insert(**row).execute()
    normalize_text_titles(db)
    data_brutes = list(TexteVersionBrute.select().order_by(TexteVersionBrute.rowid))
    data_norm = list(TexteVersion.select().order_by(TexteVersion.rowid))

    assert len(data_brutes) == 6

    assert data_brutes[0].bits == 23
    assert data_norm[0].nature == "DECRET"
    assert data_norm[0].titre == "Décret n° 75-96 du 18 février 1975"
    assert data_norm[0].titrefull == "Décret n° 75-96 du 18 février 1975 fixant les modalités de …"

    assert data_brutes[1].bits == 63
    assert data_norm[1].nature == "DECISION"
    assert data_norm[1].titre == "Décision du Conseil d'État n° 344021, 344022 du 28 juin 2013"
    assert data_norm[1].titrefull == "Décision du Conseil d'État n° 344021, 344022 du 28 juin 2013 statuant au contentieux"
    assert data_norm[1].autorite == "CONSEIL D'ETAT"
    assert data_norm[1].num == "344021, 344022"
    assert data_norm[1].date_texte == "2013-06-28"

    assert data_brutes[2].bits == 7
    assert data_norm[2].nature == "LOI_ORGANIQUE"
    assert data_norm[2].titre == "Loi organique n° 2016-1086 du 8 août 2016"
    assert data_norm[2].titrefull == "Loi organique n° 2016-1086 du 8 août 2016 relative à la nomination …"

    assert data_brutes[3].bits == 4
    assert data_norm[3].titrefull == "Code minier (nouveau)"

    assert data_brutes[4].bits == 6
    assert data_norm[4].titre == "Arrêté du 18 décembre 2014"
    assert data_norm[4].titrefull == "Arrêté du 18 décembre 2014 modifiant …"

    assert data_brutes[5].bits == 4
    assert data_norm[5].titrefull == "Arrêté du 5 septembre 2002"
file_path = f"active_idccs_{human_date}.csv"
with open(file_path, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=["idcc", "name", "group", "in_kali"])
    writer.writeheader()
    writer.writerows(new_rows)
print(f"wrote {len(new_rows)} IDCCs to {file_path}")


def idcc_to_group(idcc):
    if 0 <= idcc <= 3999:
        return 'DGT'
    elif 5000 <= idcc <= 5999:
        return 'DARES'
    elif 7000 <= idcc <= 9999:
        return 'AGRICULTURE'


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('--db-url')
    p.add_argument('--identify-missing', action='store_true')
    args = p.parse_args()
    db = connect_db(args.db_url)
    db_proxy.initialize(db)
    xls_sheet = download_file_and_open_xls_sheet()
    active_idccs = get_active_idccs(xls_sheet)
    identify_missing(xls_sheet, active_idccs)
def run_importer(db_url=DEFAULT_OPTIONS["db-url"],
                 base=DEFAULT_OPTIONS["base"],
                 raw=DEFAULT_OPTIONS["raw"],
                 anomalies=DEFAULT_OPTIONS["anomalies"],
                 anomalies_dir=DEFAULT_OPTIONS["anomalies-dir"],
                 skip_links=DEFAULT_OPTIONS["skip-links"],
                 dumps_directory=DEFAULT_OPTIONS["dumps-dir"]):
    db = connect_db(db_url)
    db_proxy.initialize(db)

    db_meta_base = DBMeta.get_or_none(key='base')
    base_from_db = db_meta_base.value if db_meta_base else None
    db_meta_last_update = DBMeta.get_or_none(key='last_update')
    last_update = db_meta_last_update.value if db_meta_last_update else None

    if not base_from_db:
        DBMeta.create(key='base', value=base)
    elif base and base != base_from_db:
        print(f"!> Wrong database: requested '{base}' but existing database is '{base_from_db}'")
        raise SystemExit(1)
    if base != 'LEGI' and not raw:
        print("!> You need to use the --raw option when working with bases other than LEGI.")
        raise SystemExit(1)
    if base != 'LEGI' and anomalies:
        print("!> The --anomalies option can only be used with the LEGI base.")
        raise SystemExit(1)

    # Check and record the data mode
    db_meta_raw = DBMeta.get_or_none(key='raw')
    db_meta_raw = db_meta_raw.value if db_meta_raw else None
    if raw:
        versions_brutes = bool(TexteVersionBrute.get_or_none())
        data_is_not_raw = versions_brutes or db_meta_raw is False
        if data_is_not_raw:
            print("!> Can't honor --raw option, the data has already been modified previously.")
            raise SystemExit(1)
    if db_meta_raw != raw:
        DBMeta.insert(key='raw', value=raw) \
              .on_conflict(conflict_target=[DBMeta.key], preserve=[DBMeta.value]) \
              .execute()

    # Handle the --skip-links option
    has_links = bool(Lien.get_or_none())
    if not skip_links and not has_links and last_update is not None:
        skip_links = True
        print("> Warning: links will not be processed because this DB was built with --skip-links.")
    elif skip_links and has_links:
        print("> Deleting links...")
        Lien.delete().execute()

    # Look for new archives in the given dumps_directory
    print("> last_update is", last_update)
    archive_re = re.compile(
        r'(.+_)?' + base.lower() +
        r'(?P<global>_global|_)?_(?P<date>[0-9]{8}-[0-9]{6})\..+',
        flags=re.IGNORECASE)
    skipped = 0
    archives = sorted([
        (m.group('date'), bool(m.group('global')), m.group(0))
        for m in [
            archive_re.match(fn) for fn in os.listdir(dumps_directory)
            if fnmatch(fn.lower(), '*' + base.lower() + '_*.tar.*')
        ]
    ])
    most_recent_global = [t[0] for t in archives if t[1]][-1]
    if last_update and most_recent_global > last_update:
        print("> There is a new global archive, recreating the DB from scratch!")
        raise Exception("not implemented yet")
        # db.close()
        # os.rename(db.address, db.address + '.back')
        # db = connect_db(db_name, pragmas=args.pragma)
    archives, skipped = partition(
        archives,
        lambda t: t[0] >= most_recent_global and t[0] > (last_update or ''))
    if skipped:
        print("> Skipped %i old archives" % len(skipped))

    # Process the new archives
    for archive_date, is_global, archive_name in archives:
        print("> Processing %s..." % archive_name)
        archive_processor = ArchiveProcessor(db, db_url,
                                             dumps_directory + '/' + archive_name,
                                             process_links=not skip_links)
        archive_processor.run()
        DBMeta.insert(key='last_update', value=archive_date) \
              .on_conflict(conflict_target=[DBMeta.key], preserve=[DBMeta.value]) \
              .execute()
        last_update = archive_date
        print('last_update is now set to', last_update)

    # Detect anomalies if requested
    if anomalies:
        if not os.path.isdir(anomalies_dir):
            os.mkdir(anomalies_dir)
        fpath = anomalies_dir + '/anomalies-' + last_update + '.txt'
        with open(fpath, 'w') as f:
            n_anomalies = detect_anomalies(db, f)
        print("logged", n_anomalies, "anomalies in", fpath)

    postprocess(db, base)

    if not raw:
        from .normalize import normalize_text_titles
        normalize_text_titles(db)
        from .factorize import main as factorize
        factorize(db)
        from .normalize import normalize_article_numbers
        normalize_article_numbers(db)
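
# Illustrative sketch only: one way run_importer might be driven from the command line.
# The flag names mirror the options mentioned in the messages above and the keys of
# DEFAULT_OPTIONS; this __main__ wrapper itself is an assumption, not part of the codebase.
if __name__ == '__main__':
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--db-url', default=DEFAULT_OPTIONS["db-url"])
    p.add_argument('--base', default=DEFAULT_OPTIONS["base"])
    p.add_argument('--raw', action='store_true')
    p.add_argument('--anomalies', action='store_true')
    p.add_argument('--anomalies-dir', default=DEFAULT_OPTIONS["anomalies-dir"])
    p.add_argument('--skip-links', action='store_true')
    p.add_argument('--dumps-dir', default=DEFAULT_OPTIONS["dumps-dir"])
    args = p.parse_args()
    run_importer(db_url=args.db_url, base=args.base, raw=args.raw,
                 anomalies=args.anomalies, anomalies_dir=args.anomalies_dir,
                 skip_links=args.skip_links, dumps_directory=args.dumps_dir)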