def cmd_reindex(drop_before=False, **kwargs):
    """Reindex collections"""
    ctx = client.Context(**kwargs)
    ctx.log_warn("All Writes operations is blocked pending run !")
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        db = ctx.mongo_database()
        if drop_before:
            # FIX: length was len(constants.COL_ALL) + 1, which left the
            # progress bar permanently one step short of completion.
            with click.progressbar(constants.COL_ALL,
                                   length=len(constants.COL_ALL),
                                   label='Drop indexes') as collections:
                for collection in collections:
                    ctx.log_ok(" [%s]" % collection)
                    try:
                        db[collection].drop_indexes()
                    except Exception as err:
                        # best-effort: a missing collection must not stop the run
                        ctx.log_warn(str(err))
        # Always (re)create the index definitions, whether or not we dropped.
        ctx.log("Create or update all indexes !")
        create_or_update_indexes(db)
        if not drop_before:
            # Indexes were kept: rebuild them in place instead.
            with click.progressbar(constants.COL_ALL,
                                   length=len(constants.COL_ALL),
                                   label='Reindex collections') as collections:
                for collection in collections:
                    ctx.log_ok(" [%s]" % collection)
                    try:
                        db[collection].reindex()
                    except Exception as err:
                        ctx.log_warn(str(err))
def cmd_dataset_list(fetcher, **kwargs):
    """Show datasets list"""
    ctx = client.Context(**kwargs)
    impl = FETCHERS[fetcher](db=ctx.mongo_database())
    datasets = impl.datasets_list()
    if not datasets:
        ctx.log_error("Not datasets for this fetcher")
        return
    row = "{0:20} | {1:70} | {2:10}"
    sep = "---------------------------------------------------------------------------------------------------------------------------"
    print(sep)
    print(row.format("Dataset Code", "Dataset Name", "Last Update"))
    print(sep)
    for ds in datasets:
        # last_update is optional in the dataset document
        last_update = ""
        if ds.get('last_update'):
            last_update = ds['last_update'].strftime("%Y-%m-%d")
        print(row.format(ds["dataset_code"], ds["name"], last_update))
    print(sep)
def cmd_consolidate(fetcher=None, dataset=None, max_bulk=20, **kwargs):
    """Consolidate codelists and concepts one or more dataset"""
    ctx = client.Context(**kwargs)
    confirmed = ctx.silent or click.confirm('Do you want to continue?', abort=True)
    if confirmed:
        _consolidate(ctx, ctx.mongo_database(), fetcher,
                     dataset=dataset, max_bulk=max_bulk)
def cmd_purge(fetcher=None, dataset=None, purge_all=False, **kwargs):
    """Purge one or more dataset"""
    # Examples:
    #   dlstats fetchers purge -f INSEE --purge-all
    #   dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP
    #   dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP -d IPC-2015-COICOP
    ctx = client.Context(**kwargs)
    ctx.log("START purge for [%s]" % fetcher)
    if not (ctx.silent or click.confirm('Do you want to continue?', abort=True)):
        return
    from pymongo import DeleteMany
    start = time.time()
    db = ctx.mongo_database()
    if purge_all:
        # Remove the provider document and all its categories.
        db[constants.COL_PROVIDERS].bulk_write(
            [DeleteMany({"name": fetcher})], ordered=False)
        ctx.log("Provider [%s] deleted" % fetcher)
        result = db[constants.COL_CATEGORIES].bulk_write(
            [DeleteMany({"provider_name": fetcher})], ordered=False)
        ctx.log("Categories deleted: %s" % result.deleted_count)
    query = {"provider_name": fetcher}
    if not purge_all and dataset:
        # Restrict to the explicitly selected dataset codes.
        query["dataset_code"] = {"$in": dataset}
    requests = [DeleteMany(query)]
    targets = ((constants.COL_DATASETS, "Datasets deleted: %s"),
               (constants.COL_SERIES, "Series deleted: %s"),
               (constants.COL_SERIES_ARCHIVES, "Series archives deleted: %s"))
    for collection, message in targets:
        result = db[collection].bulk_write(requests, ordered=False)
        ctx.log(message % result.deleted_count)
    ctx.log("END purge for [%s] - time[%.3f]" % (fetcher, time.time() - start))
def cmd_clean(**kwargs):
    """Delete All MongoDB collections"""
    ctx = client.Context(**kwargs)
    #TODO: translation
    ctx.log_warn("La destruction des données est définitive !")
    confirmed = ctx.silent or click.confirm('Do you want to continue?', abort=True)
    if confirmed:
        utils.clean_mongodb(ctx.mongo_database())
def cmd_export_csvfile(provider=None, dataset=None, filepath=None, create=False, **kwargs):
    """Download csvfile from one dataset.

    Examples:
    dlstats export csvfile -p Eurostat -d "nama_10_a10" -S
    dlstats export csvfile -p BEA -d "10101 Ann"

    widukind-dataset-Eurostat-nama_10_a10-csv
    widukind-dataset-eurostat-nama_10_a10.csv
    """
    ctx = client.Context(**kwargs)
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        # FIX: removed unused `db = ctx.mongo_database()` local.
        fs = ctx.mongo_fs()
        filename = export_files.generate_filename_csv(provider_name=provider,
                                                      dataset_code=dataset,
                                                      prefix="dataset")
        # Default the output path to the GridFS filename.
        filepath = filepath or filename
        csvfile = fs.find_one({"filename": filename})
        if not csvfile and create:
            ctx.log_warn("%s not exist. creating..." % filename)
            try:
                # FIX: renamed local from `id` (shadowed the builtin).
                file_id = export_files.export_file_csv_dataset_unit(
                    provider=provider, dataset_code=dataset)
                csvfile = fs.get(file_id)
            except Exception as err:
                ctx.log_error(str(err))
        if csvfile:
            created = csvfile.upload_date.strftime("%Y-%m-%d-%H:%M:%S")
            ctx.log_ok("export to %s - created[%s]" % (filepath, created))
            with open(filepath, 'wb') as fp:
                # GridOut files are directly iterable by chunk.
                for chunk in csvfile:
                    fp.write(chunk)
        else:
            ctx.log_error("file not found: %s" % filename)
def cmd_search(search_type=None, fetcher=None, dataset=None, frequency=None,
               search=None, limit=None, **kwargs):
    """Search in Series"""
    #TODO: pretty
    #TODO: csv ?
    #TODO: time limit
    # Examples:
    #   dlstats fetchers search -F Q -s "euro Market financial"
    #   dlstats fetchers search -t series -F Q -s "euro Market financial" -D
    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()
    # Resolve the fetcher name to its canonical provider name, if given.
    provider_name = FETCHERS[fetcher](db=db).provider_name if fetcher else None
    result, query = tags.search_tags(db,
                                     search_type=search_type,
                                     provider_name=provider_name,
                                     dataset_code=dataset,
                                     frequency=frequency,
                                     search_tags=search,
                                     limit=limit)
    ctx.log("Count result : %s" % result.count())
    for doc in result:
        if search_type == constants.COL_SERIES:
            fields = [doc['provider_name'], doc['dataset_code'],
                      doc['key'], doc['name']]
        else:
            fields = [doc['provider_name'], doc['dataset_code'], doc['name']]
        if ctx.debug:
            fields.append(doc['tags'])
        print(fields)
def cmd_datatree(fetcher=None, force=False, use_files=False, not_remove=False, **kwargs):
    """Create or Update fetcher Data-Tree"""
    ctx = client.Context(**kwargs)
    ctx.log_ok("Run Update Data-Tree for %s fetcher:" % fetcher)
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        impl = FETCHERS[fetcher](db=ctx.mongo_database(),
                                 use_existing_file=use_files,
                                 not_remove_files=not_remove)
        impl.upsert_data_tree(force_update=force)
def cmd_stats_run(fetcher=None, limit=20, **kwargs):
    """Display run stats sorted by descending created field"""
    #TODO: csv export ?
    #TODO: envoi csv par mail
    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()
    row = "{:16} | {:10} | {:20.20} | {:>6} | {:>6} | {:>6} | {:>6} | {:>5} | {:>5} | {:>6} | {:>6} | {:>6} | {:5} | {:>5}"
    sep = "---------------------------------------------------------------------------------------------------------------------------------------------------"
    print(sep)
    print(row.format("Date", "Provider", "Dataset", "Acc.", "Rej.", "Ins.",
                     "Upd.", "Err.", "FErr.", "Dur.", "Avg", "Avg.W", "Async",
                     "Bulk"))
    print(sep)
    query = {"provider_name": fetcher} if fetcher else {}
    cursor = db[constants.COL_STATS_RUN].find(query)
    if limit:
        cursor = cursor.limit(limit)
    # Newest runs first.
    for stat in cursor.sort("created", -1):
        print(row.format(stat['created'].strftime("%Y-%m-%d-%H:%M"),
                         stat["provider_name"],
                         stat.get("dataset_code"),
                         stat.get("count_accepts", 0),
                         stat.get("count_rejects", 0),
                         stat.get("count_inserts", 0),
                         stat.get("count_updates", 0),
                         stat.get("count_errors", 0),
                         stat.get("fetcher_errors", 0),
                         round(stat.get("duration", 0.0), 2),
                         round(stat.get("avg_all", 0.0), 2),
                         round(stat.get("avg_write", 0.0), 2),
                         "Y" if stat.get("async_mode", None) else "N",
                         stat.get("bulk_size", 0)))
    print(sep)
def cmd_aggregate_tags(max_bulk=100, update_only=False, async_mode=None, **kwargs):
    """Aggregate tags"""
    #TODO: tag sur dataset only
    ctx = client.Context(**kwargs)
    ctx.log("START aggregate tags")
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        start = time.time()
        db = ctx.mongo_database()
        # bulk_write result example:
        # {'nMatched': 410, 'nModified': 385, 'writeConcernErrors': [],
        #  'nInserted': 0, 'nUpserted': 2452, 'writeErrors': [], 'nRemoved': 0}
        suffix = " - matched[%(nMatched)s] - modified[%(nModified)s] - inserted[%(nInserted)s] - upserted[%(nUpserted)s]"
        passes = (
            ("Aggregate Datasets tags...", "datasets", tags.aggregate_tags_datasets),
            ("Aggregate Series tags...", "series", tags.aggregate_tags_series),
        )
        for message, label, aggregate in passes:
            ctx.log(message)
            result = aggregate(db, max_bulk=max_bulk)
            ctx.log_ok((label + suffix) % result)
            if result["writeErrors"]:
                print(result["writeErrors"][0])
        ctx.log("END aggregate tags time[%.3f]" % (time.time() - start))
def cmd_check(**kwargs):
    """Verify connection"""
    ctx = client.Context(**kwargs)
    sep = "------------------------------------------------------"
    try:
        import pymongo
        mongo_client = ctx.mongo_client()
        db = ctx.mongo_database()
        server_info = mongo_client.server_info()
        host_info = db.command("hostInfo")
        print(sep)
        ctx.log_ok("Connection OK")
        print(sep)
        print("pymongo version : %s" % pymongo.version)
        print("-------------------- Server Infos --------------------")
        pprint(server_info)
        print("-------------------- Host Infos ----------------------")
        pprint(host_info)
        print(sep)
    except Exception as err:
        ctx.log_error("Connection Error:")
        print(sep)
        ctx.log_error(str(err))
        print(sep)
def cmd_providers(fetcher=None, **kwargs):
    """Create or Update fetcher Providers"""
    ctx = client.Context(**kwargs)
    ctx.log_ok("Create/Update Provider for %s fetcher:" % fetcher)
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        # One explicit fetcher, or every registered one.
        targets = [fetcher] if fetcher else list(FETCHERS.keys())
        for name in targets:
            ctx.log("Run provider for [%s]" % name)
            impl = FETCHERS[name](db=ctx.mongo_database())
            if impl.provider.update_database():
                ctx.log_ok("Provider [%s] updated." % name)
            else:
                ctx.log_error("Provider [%s] update ERROR." % name)
def cmd_update_tags(fetcher=None, dataset=None, max_bulk=100, update_only=False,
                    async_mode=None, dry_mode=False, **kwargs):
    """Create or Update field tags"""
    # Examples:
    #   dlstats fetchers tag -f BIS -d CNFS -S
    #   dlstats fetchers tag -f BEA -d "10101 Ann" -S
    #   dlstats fetchers tag -f Eurostat -d nama_10_a10 -S
    #   dlstats fetchers tag -f OECD -d MEI -S
    ctx = client.Context(**kwargs)
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        db = ctx.mongo_database()
        targets = [fetcher] if fetcher else FETCHERS.keys()
        for provider_name in targets:
            _update_tags(ctx, db, provider_name, dataset, max_bulk,
                         update_only, dry_mode, async_mode)
def cmd_calendar(fetcher=None, update=False, **kwargs):
    """Display calendar for this provider"""
    # Prints one table row per scheduled "update-dataset"/"update-fetcher"
    # calendar entry for the provider, sorted by run date:
    #   Provider | Dataset | Action | Type | Date (yyyy-mm-dd hh:mn)
    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()
    impl = FETCHERS[fetcher](db=db)
    if not hasattr(impl, 'get_calendar'):
        ctx.log_error("Not implemented get_calendar() method")
        ctx.log_error("Operation cancelled !")
        return False
    if update:
        if ctx.silent or click.confirm('Do you want to continue?', abort=True):
            try:
                with ctx.lock("calendar-%s" % fetcher, "calendar"):
                    impl.upsert_calendar()
            except errors.Locked:
                ctx.log_error("calendar command is locked for provider[%s]" %
                              fetcher)
                return False
    entries = db[constants.COL_CALENDARS].find({
        "action": {"$in": ["update-dataset", "update-fetcher"]},
        "kwargs.provider_name": fetcher
    })
    calendars = []
    dates = []
    for i, entry in enumerate(entries):
        calendars.append((i, entry))
        if entry["period_type"] == "date":
            dates.append((i, entry['period_kwargs']['run_date']))
    row = "{0:10} | {1:12} | {2:15} | {3:6} | {4:10}"
    sep = "---------------------------------------------------------------------------------------------------------------------------"
    print(sep)
    print(row.format("Provider", "Dataset", "Action", "Type",
                     "Date (yyyy-mm-dd hh:mn)"))
    print(sep)
    # Sort by run date (second tuple element).
    for idx, _ in sorted(dates, key=itemgetter(1)):
        cal = calendars[idx][1]
        period_type = cal['period_type']
        dataset_code = cal['kwargs'].get('dataset_code', 'ALL')
        run_date = cal['period_kwargs']['run_date']
        if period_type == "date":
            _date = run_date.strftime("%Y-%m-%d - %H:%M")
        else:
            _date = run_date
        print(row.format(fetcher, dataset_code, cal['action'],
                         period_type, _date))
    print(sep)
def cmd_run(fetcher=None, dataset=None, max_errors=0, bulk_size=200,
            datatree=False, async_mode=None, use_files=False, not_remove=False,
            run_full=False, dataset_only=False, refresh_meta=False,
            force_update=False, **kwargs):
    """Run Fetcher - All datasets or selected dataset"""
    ctx = client.Context(**kwargs)
    ctx.log_ok("Run %s fetcher:" % fetcher)
    if not (ctx.silent or click.confirm('Do you want to continue?', abort=True)):
        return
    # One lock per fetcher, narrowed per selected dataset codes.
    lock_key = "run-%s" % fetcher
    if dataset:
        lock_key = "%s-%s" % (lock_key, ".".join(dataset))
    try:
        with ctx.lock(lock_key, "run"):
            db = ctx.mongo_database()
            impl = FETCHERS[fetcher](db=db,
                                     max_errors=max_errors,
                                     bulk_size=bulk_size,
                                     use_existing_file=use_files,
                                     not_remove_files=not_remove,
                                     dataset_only=dataset_only,
                                     refresh_meta=refresh_meta,
                                     async_mode=async_mode,
                                     force_update=force_update)
            if not dataset and not hasattr(impl, "upsert_all_datasets"):
                ctx.log_error(
                    "upsert_all_datasets method is not implemented for this fetcher."
                )
                ctx.log_error("Please choice a dataset.")
                ctx.log_error("Operation cancelled !")
                return
            if datatree:
                impl.upsert_data_tree(force_update=True)
            if dataset:
                for ds_code in dataset:
                    impl.wrap_upsert_dataset(ds_code)
                    if run_full:
                        _consolidate(ctx, db, fetcher, dataset=ds_code)
                        _update_tags(ctx, db, fetcher, dataset=ds_code,
                                     update_only=True)
            else:
                impl.upsert_all_datasets()
                if run_full:
                    _consolidate(ctx, db, fetcher)
                    _update_tags(ctx, db, fetcher, update_only=True)
    except errors.Locked:
        ctx.log_error("run command is locked for key[%s]" % lock_key)
        return False
def cmd_check_schemas(max_errors=None, **kwargs):
    """Check datas in DB with schemas """
    # Validates every document of each schema-covered collection against
    # CURRENT_SCHEMAS and prints an error/verified/time report per collection.
    # Example: dlstats mongo check-schemas --mongo-url mongodb://localhost/widukind -M 20 -S
    ctx = client.Context(**kwargs)
    ctx.log_warn("Attention, opération très longue")
    report = {}
    if ctx.silent or click.confirm('Do you want to continue?', abort=True):
        start = time.time()
        db = ctx.mongo_database()
        from pymongo import ReadPreference
        # FIX: removed a stray unterminated triple-quote that trailed the
        # function, and iterate items() instead of keys() + lookup.
        for col, _schema in CURRENT_SCHEMAS.items():
            print("check %s..." % col)
            report[col] = {'error': 0, 'verified': 0, 'time': 0}
            report[col]['count'] = db[col].count()
            s = time.time()
            # Read from a secondary when available to spare the primary.
            cursor = db[col].with_options(
                read_preference=ReadPreference.SECONDARY_PREFERRED).find()
            for doc in cursor:
                _id = None
                if max_errors and report[col]['error'] >= max_errors:
                    ctx.log_warn("Max error attempt. Skip test !")
                    break
                try:
                    report[col]['verified'] += 1
                    # _id is not part of the schema: pop it but keep it for logs.
                    _id = str(doc.pop('_id'))
                    _schema(doc)
                except Exception as err:
                    report[col]['error'] += 1
                    if ctx.verbose:
                        ctx.log_error("%s - %s - %s" % (col, _id, str(err)))
            report[col]['time'] = "%.3f" % (time.time() - s)
        end = time.time() - start
        fmt = "{0:20} | {1:10} | {2:10} | {3:10} | {4:10}"
        sep = "--------------------------------------------------------------------"
        print(sep)
        print(fmt.format("Collection", "Count", "Verified", "Errors", "Time"))
        for col, item in report.items():
            print(fmt.format(col, item['count'], item['verified'],
                             item['error'], item['time']))
        print(sep)
        print("time elapsed : %.3f seconds " % end)
def cmd_report(**kwargs):
    """Technical statistic report"""
    # Placeholder command: reports non-implementation to the user.
    client.Context(**kwargs).log_error("Not Implemented")
def cmd_restore(**kwargs):
    """Restore database or collection(s)"""
    # Placeholder command: reports non-implementation to the user.
    client.Context(**kwargs).log_error("Not Implemented")
def cmd_backup(**kwargs):
    """Backup database or collection(s)"""
    # Placeholder command: reports non-implementation to the user.
    client.Context(**kwargs).log_error("Not Implemented")
def cmd_copy_db(**kwargs):
    """Copy database to other database"""
    # Placeholder command: reports non-implementation to the user.
    client.Context(**kwargs).log_error("Not Implemented")
def cmd_report(fetcher=None, **kwargs):
    """Fetchers report"""
    # Report example:
    # ------------------------------------------------------------------------
    # MongoDB: mongodb://localhost/widukind :
    # ------------------------------------------------------------------------
    # Provider | Version | Dataset         | Series | First Download     | last Download
    # ------------------------------------------------------------------------
    # BIS      | 1       | PP-LS           |     23 | 2016-01-06 - 09:38 | 2016-01-06 - 09:38
    # INSEE    | 1       | CNT-2010-PIB-RF |     11 | 2016-01-06 - 09:37 | 2016-01-06 - 09:37
    # ------------------------------------------------------------------------
    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()
    fmt = "{0:10} | {1:4} | {2:30} | {3:10} | {4:15} | {5:20} | {6:20} | {7:7}"
    sep = "----------------------------------------------------------------------------------------------------------------------------------------"
    print(sep)
    print("MongoDB: %s :" % ctx.mongo_url)
    print(sep)
    print(fmt.format("Provider", "Ver.", "Dataset", "Series", "Last Update",
                     "First Download", "last Download", "Enable"))
    print(sep)
    query = {}
    if fetcher:
        query["name"] = fetcher

    def _fmt_date(value, pattern):
        # FIX: guard missing/None datetime fields (consistent with
        # cmd_dataset_list) instead of crashing on incomplete documents.
        return value.strftime(pattern) if value else ""

    for provider in db[constants.COL_PROVIDERS].find(query):
        datasets = db[constants.COL_DATASETS].find(
            {'provider_name': provider['name']}).sort("dataset_code")
        for dataset in datasets:
            series_count = db[constants.COL_SERIES].count({
                'provider_name': provider['name'],
                "dataset_code": dataset['dataset_code']
            })
            # Disabled providers are flagged with a trailing asterisk.
            if not provider['enable']:
                _provider = "%s *" % provider['name']
            else:
                _provider = provider['name']
            print(fmt.format(_provider,
                             provider['version'],
                             dataset['dataset_code'],
                             series_count,
                             _fmt_date(dataset.get('last_update'), "%Y-%m-%d"),
                             _fmt_date(dataset.get('download_first'),
                                       "%Y-%m-%d - %H:%M"),
                             _fmt_date(dataset.get('download_last'),
                                       "%Y-%m-%d - %H:%M"),
                             str(dataset.get("enable"))))
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )