Example #1
def cmd_reindex(drop_before=False, **kwargs):
    """Reindex collections"""

    ctx = client.Context(**kwargs)
    ctx.log_warn("All Writes operations is blocked pending run !")

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        db = ctx.mongo_database()

        if drop_before:
            with click.progressbar(constants.COL_ALL,
                                   length=len(constants.COL_ALL) + 1,
                                   label='Drop indexes') as collections:
                for collection in collections:
                    ctx.log_ok(" [%s]" % collection)
                    try:
                        db[collection].drop_indexes()
                    except Exception as err:
                        ctx.log_warn(str(err))

        ctx.log("Create or update all indexes !")
        create_or_update_indexes(db)

        if not drop_before:
            with click.progressbar(constants.COL_ALL,
                                   length=len(constants.COL_ALL) + 1,
                                   label='Reindex collections') as collections:
                for collection in collections:
                    ctx.log_ok(" [%s]" % collection)
                    try:
                        db[collection].reindex()
                    except Exception as err:
                        ctx.log_warn(str(err))
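
# Note: create_or_update_indexes() is a dlstats helper that is not shown in
# this example. A minimal sketch of what such a helper could look like with
# pymongo follows; the collection names and index keys are illustrative
# assumptions, not the project's real index definitions.
def create_or_update_indexes_sketch(db):
    from pymongo import ASCENDING, IndexModel
    # create_indexes() is a no-op for an index that already exists with the
    # same definition, so it is safe to call on every reindex run.
    db["datasets"].create_indexes([
        IndexModel([("provider_name", ASCENDING),
                    ("dataset_code", ASCENDING)], unique=True),
    ])
    db["series"].create_indexes([
        IndexModel([("provider_name", ASCENDING),
                    ("dataset_code", ASCENDING),
                    ("key", ASCENDING)], unique=True),
    ])
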
def cmd_dataset_list(fetcher, **kwargs):
    """Show datasets list"""

    ctx = client.Context(**kwargs)

    f = FETCHERS[fetcher](db=ctx.mongo_database())

    datasets = f.datasets_list()
    if not datasets:
        ctx.log_error("Not datasets for this fetcher")
        return

    fmt = "{0:20} | {1:70} | {2:10}"
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )
    print(fmt.format("Dataset Code", "Dataset Name", "Last Update"))
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )

    for dataset in datasets:
        last_update = ""
        if dataset.get('last_update'):
            last_update = str(dataset['last_update'].strftime("%Y-%m-%d"))
        print(fmt.format(dataset["dataset_code"], dataset["name"],
                         last_update))

    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )
def cmd_consolidate(fetcher=None, dataset=None, max_bulk=20, **kwargs):
    """Consolidate codelists and concepts one or more dataset"""

    ctx = client.Context(**kwargs)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        db = ctx.mongo_database()

        _consolidate(ctx, db, fetcher, dataset=dataset, max_bulk=max_bulk)
def cmd_purge(fetcher=None, dataset=None, purge_all=False, **kwargs):
    """Purge one or more dataset"""
    """
    dlstats fetchers purge -f INSEE --purge-all
    dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP
    dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP -d IPC-2015-COICOP
    """

    ctx = client.Context(**kwargs)

    ctx.log("START purge for [%s]" % fetcher)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        start = time.time()

        db = ctx.mongo_database()

        from pymongo import DeleteMany

        if purge_all:
            query = {"name": fetcher}
            result = db[constants.COL_PROVIDERS].bulk_write(
                [DeleteMany(query)], ordered=False)
            ctx.log("Provider [%s] deleted" % fetcher)

            query = {"provider_name": fetcher}
            result = db[constants.COL_CATEGORIES].bulk_write(
                [DeleteMany(query)], ordered=False)

            ctx.log("Categories deleted: %s" % result.deleted_count)

        query = {"provider_name": fetcher}
        if not purge_all and dataset:
            query["dataset_code"] = {"$in": dataset}

        bulk_requests = [DeleteMany(query)]

        result = db[constants.COL_DATASETS].bulk_write(bulk_requests,
                                                       ordered=False)
        ctx.log("Datasets deleted: %s" % result.deleted_count)

        result = db[constants.COL_SERIES].bulk_write(bulk_requests,
                                                     ordered=False)
        ctx.log("Series deleted: %s" % result.deleted_count)

        result = db[constants.COL_SERIES_ARCHIVES].bulk_write(bulk_requests,
                                                              ordered=False)
        ctx.log("Series archives deleted: %s" % result.deleted_count)

        end = time.time() - start

        ctx.log("END purge for [%s] - time[%.3f]" % (fetcher, end))
Example #5
def cmd_clean(**kwargs):
    """Delete All MongoDB collections"""

    ctx = client.Context(**kwargs)
    ctx.log_warn("Data deletion is permanent!")

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        db = ctx.mongo_database()

        utils.clean_mongodb(db)
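
# utils.clean_mongodb() is a dlstats helper whose implementation is not shown
# here. A rough, self-contained sketch of "drop all application collections"
# with pymongo (the system-collection filter is an assumption):
def clean_mongodb_sketch(db):
    for name in db.list_collection_names():
        if not name.startswith("system."):
            db[name].drop()
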
Example #6
def cmd_export_csvfile(provider=None,
                       dataset=None,
                       filepath=None,
                       create=False,
                       **kwargs):
    """Download csvfile from one dataset. 

    Examples:
    
    dlstats export csvfile -p Eurostat -d "nama_10_a10" -S
    dlstats export csvfile -p BEA -d "10101 Ann"
    
    widukind-dataset-Eurostat-nama_10_a10-csv
    widukind-dataset-eurostat-nama_10_a10.csv
    """

    ctx = client.Context(**kwargs)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        db = ctx.mongo_database()
        fs = ctx.mongo_fs()

        filename = export_files.generate_filename_csv(provider_name=provider,
                                                      dataset_code=dataset,
                                                      prefix="dataset")
        filepath = filepath or filename

        csvfile = fs.find_one({"filename": filename})

        if not csvfile and create is True:
            ctx.log_warn("%s not exist. creating..." % filename)
            try:
                file_id = export_files.export_file_csv_dataset_unit(
                    provider=provider, dataset_code=dataset)
                csvfile = fs.get(file_id)
            except Exception as err:
                ctx.log_error(str(err))

        if csvfile:

            created = csvfile.upload_date.strftime("%Y-%m-%d-%H:%M:%S")
            ctx.log_ok("export to %s - created[%s]" % (filepath, created))

            with open(filepath, 'wb') as fp:
                rows = iter(csvfile)
                for row in rows:
                    fp.write(row)
        else:
            ctx.log_error("file not found: %s" % filename)
def cmd_search(search_type=None,
               fetcher=None,
               dataset=None,
               frequency=None,
               search=None,
               limit=None,
               **kwargs):
    """Search in Series"""

    #TODO: pretty
    #TODO: csv ?
    #TODO: time limit
    """
    dlstats fetchers search -F Q -s "euro Market financial"
    
    dlstats fetchers search -t series -F Q -s "euro Market financial" -D
    """

    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()

    provider_name = None
    if fetcher:
        f = FETCHERS[fetcher](db=db)
        provider_name = f.provider_name

    result, query = tags.search_tags(db,
                                     search_type=search_type,
                                     provider_name=provider_name,
                                     dataset_code=dataset,
                                     frequency=frequency,
                                     search_tags=search,
                                     limit=limit)

    ctx.log("Count result : %s" % result.count())
    for doc in result:
        if search_type == constants.COL_SERIES:
            fields = [
                doc['provider_name'], doc['dataset_code'], doc['key'],
                doc['name']
            ]
        else:
            fields = [doc['provider_name'], doc['dataset_code'], doc['name']]
        if ctx.debug:
            fields.append(doc['tags'])

        print(fields)
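
# tags.search_tags() is a dlstats helper. A minimal sketch of the underlying
# idea - matching documents whose precomputed "tags" array contains every
# searched word - with plain pymongo; the collection and field names are
# assumptions for illustration:
def search_tags_sketch(db, words, provider_name=None, limit=100):
    query = {"tags": {"$all": [w.lower() for w in words]}}
    if provider_name:
        query["provider_name"] = provider_name
    return db["series"].find(query).limit(limit)
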
def cmd_datatree(fetcher=None,
                 force=False,
                 use_files=False,
                 not_remove=False,
                 **kwargs):
    """Create or Update fetcher Data-Tree"""

    ctx = client.Context(**kwargs)

    ctx.log_ok("Run Update Data-Tree for %s fetcher:" % fetcher)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        f = FETCHERS[fetcher](db=ctx.mongo_database(),
                              use_existing_file=use_files,
                              not_remove_files=not_remove)

        f.upsert_data_tree(force_update=force)
def cmd_stats_run(fetcher=None, limit=20, **kwargs):
    """Display run stats sorted by descending created field"""

    #TODO: csv export ?
    #TODO: envoi csv par mail

    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()
    #fmt = "{:15} | {:10} | {:20} | {:>5} | {:>5} | {:>5} | {:>5} | {:>5}"
    fmt = "{:16} | {:10} | {:20.20} | {:>6} | {:>6} | {:>6} | {:>6} | {:>5} | {:>5} | {:>6} | {:>6} | {:>6} | {:5} | {:>5}"
    sep = "---------------------------------------------------------------------------------------------------------------------------------------------------"
    print(sep)
    print(
        fmt.format("Date", "Provider", "Dataset", "Acc.", "Rej.", "Ins.",
                   "Upd.", "Err.", "FErr.", "Dur.", "Avg", "Avg.W", "Async",
                   "Bulk"))
    print(sep)
    query = {}
    if fetcher:
        query["provider_name"] = fetcher

    cursor = db[constants.COL_STATS_RUN].find(query)
    if limit:
        cursor = cursor.limit(limit)

    for stat in cursor.sort("created", -1):

        print(
            fmt.format(stat['created'].strftime("%Y-%m-%d-%H:%M"),
                       stat["provider_name"],
                       stat.get("dataset_code"),
                       stat.get("count_accepts", 0),
                       stat.get("count_rejects", 0),
                       stat.get("count_inserts", 0),
                       stat.get("count_updates", 0),
                       stat.get("count_errors", 0),
                       stat.get("fetcher_errors", 0),
                       round(stat.get("duration", 0.0), 2),
                       round(stat.get("avg_all", 0.0), 2),
                       round(stat.get("avg_write", 0.0), 2),
                       "Y" if stat.get("async_mode", None) else "N",
                       stat.get("bulk_size", 0)))

    print(sep)
Example #10
def cmd_aggregate_tags(max_bulk=100,
                       update_only=False,
                       async_mode=None,
                       **kwargs):
    """Aggregate tags"""

    #TODO: tag sur dataset only

    ctx = client.Context(**kwargs)

    ctx.log("START aggregate tags")

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        start = time.time()

        db = ctx.mongo_database()

        ctx.log("Aggregate Datasets tags...")
        result = tags.aggregate_tags_datasets(db, max_bulk=max_bulk)
        #{'nMatched': 410, 'nModified': 385, 'writeConcernErrors': [], 'nInserted': 0, 'nUpserted': 2452, 'writeErrors': [], 'nRemoved': 0}
        ctx.log_ok(
            "datasets - matched[%(nMatched)s] - modified[%(nModified)s] - inserted[%(nInserted)s] - upserted[%(nUpserted)s]"
            % result)
        if result["writeErrors"]:
            print(result["writeErrors"][0])

        ctx.log("Aggregate Series tags...")
        result = tags.aggregate_tags_series(db, max_bulk=max_bulk)
        ctx.log_ok(
            "series - matched[%(nMatched)s] - modified[%(nModified)s] - inserted[%(nInserted)s] - upserted[%(nUpserted)s]"
            % result)
        if result["writeErrors"]:
            print(result["writeErrors"][0])

        end = time.time() - start

        ctx.log("END aggregate tags time[%.3f]" % end)
Example #11
def cmd_check(**kwargs):
    """Verify connection"""
    ctx = client.Context(**kwargs)
    try:
        import pymongo
        mongo_client = ctx.mongo_client()
        db = ctx.mongo_database()
        server_info = mongo_client.server_info()
        host_info = db.command("hostInfo")
        print("------------------------------------------------------")
        ctx.log_ok("Connection OK")
        print("------------------------------------------------------")
        print("pymongo version : %s" % pymongo.version)
        print("-------------------- Server Infos --------------------")
        pprint(server_info)
        print("-------------------- Host Infos ----------------------")
        pprint(host_info)
        print("------------------------------------------------------")
    except Exception as err:
        ctx.log_error("Connection Error:")
        print("------------------------------------------------------")
        ctx.log_error(str(err))
        print("------------------------------------------------------")
Example #12
def cmd_providers(fetcher=None, **kwargs):
    """Create or Update fetcher Providers"""

    ctx = client.Context(**kwargs)

    ctx.log_ok("Create/Update Provider for %s fetcher:" % fetcher)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        _fetchers = []
        if fetcher:
            _fetchers = [fetcher]
        else:
            _fetchers = list(FETCHERS.keys())

        for _fetcher in _fetchers:
            ctx.log("Run provider for [%s]" % _fetcher)
            f = FETCHERS[_fetcher](db=ctx.mongo_database())
            result = f.provider.update_database()
            if result:
                ctx.log_ok("Provider [%s] updated." % _fetcher)
            else:
                ctx.log_error("Provider [%s] update ERROR." % _fetcher)
Example #13
def cmd_update_tags(fetcher=None,
                    dataset=None,
                    max_bulk=100,
                    update_only=False,
                    async_mode=None,
                    dry_mode=False,
                    **kwargs):
    """Create or Update field tags"""
    """
    Examples:
    
    dlstats fetchers tag -f BIS -d CNFS -S 
    dlstats fetchers tag -f BEA -d "10101 Ann" -S
    dlstats fetchers tag -f Eurostat -d nama_10_a10 -S
    dlstats fetchers tag -f OECD -d MEI -S
    
    """

    ctx = client.Context(**kwargs)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        db = ctx.mongo_database()

        if fetcher:
            fetchers = [fetcher]
        else:
            fetchers = FETCHERS.keys()

        for provider in fetchers:

            provider_name = provider

            _update_tags(ctx, db, provider_name, dataset, max_bulk,
                         update_only, dry_mode, async_mode)
Example #14
def cmd_calendar(fetcher=None, update=False, **kwargs):
    """Display calendar for this provider"""
    """
    Output examples:
    
    $ dlstats fetchers calendar -F BIS
    ---------------------------------------------------------------------------------------------------------------------------
    Provider   | Dataset      | Action          | Type   | Date (yyyy-mm-dd hh:mn)
    ---------------------------------------------------------------------------------------------------------------------------
    BIS        | EERI         | update-dataset     | date   | 2016-01-18 - 08:00
    BIS        | LBS-DISS     | update-dataset     | date   | 2016-01-22 - 08:00
    BIS        | CBS          | update-dataset     | date   | 2016-01-22 - 08:00    
    ---------------------------------------------------------------------------------------------------------------------------

    $ dlstats fetchers calendar -F ECB
    ---------------------------------------------------------------------------------------------------------------------------
    Provider   | Dataset      | Action          | Type   | Date (yyyy-mm-dd hh:mn)
    ---------------------------------------------------------------------------------------------------------------------------
    ECB        | BLS          | update-dataset     | date   | 2016-01-19 - 10:00
    ECB        | ICP          | update-dataset     | date   | 2016-01-19 - 11:00
    ECB        | IVF          | update-dataset     | date   | 2016-01-21 - 10:00
    ECB        | BSI          | update-dataset     | date   | 2016-01-29 - 10:00    
    ---------------------------------------------------------------------------------------------------------------------------
    """

    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()

    f = FETCHERS[fetcher](db=db)
    if not hasattr(f, 'get_calendar'):
        ctx.log_error("Not implemented get_calendar() method")
        ctx.log_error("Operation cancelled !")
        return False

    if update:
        if ctx.silent or click.confirm('Do you want to continue?', abort=True):
            try:
                with ctx.lock("calendar-%s" % fetcher, "calendar"):
                    f.upsert_calendar()
            except errors.Locked as err:
                ctx.log_error("calendar command is locked for provider[%s]" %
                              fetcher)
                return False

    calendar_list = db[constants.COL_CALENDARS].find({
        "action": {"$in": ["update-dataset", "update-fetcher"]},
        "kwargs.provider_name": fetcher
    })

    calendars = []
    dates = []

    for i, c in enumerate(calendar_list):
        calendars.append((i, c))
        if c["period_type"] == "date":
            dates.append((i, c['period_kwargs']['run_date']))

    fmt = "{0:10} | {1:12} | {2:15} | {3:6} | {4:10}"
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )
    print(
        fmt.format("Provider", "Dataset", "Action", "Type",
                   "Date (yyyy-mm-dd hh:mn)"))
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )
    for entry in sorted(dates, key=itemgetter(1)):
        c = calendars[entry[0]][1]
        action = c['action']
        period_type = c['period_type']
        k = c['kwargs']
        provider_name = fetcher
        dataset_code = k.get('dataset_code', 'ALL')
        if period_type == "date":
            _date = c['period_kwargs']['run_date'].strftime("%Y-%m-%d - %H:%M")
        else:
            _date = c['period_kwargs']['run_date']
        print(
            fmt.format(provider_name, dataset_code, action, period_type,
                       _date))
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )
Example #15
def cmd_run(fetcher=None,
            dataset=None,
            max_errors=0,
            bulk_size=200,
            datatree=False,
            async_mode=None,
            use_files=False,
            not_remove=False,
            run_full=False,
            dataset_only=False,
            refresh_meta=False,
            force_update=False,
            **kwargs):
    """Run Fetcher - All datasets or selected dataset"""

    ctx = client.Context(**kwargs)

    ctx.log_ok("Run %s fetcher:" % fetcher)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        lock_key = "run-%s" % fetcher
        if dataset:
            lock_key = lock_key + "-" + ".".join(dataset)

        try:
            with ctx.lock(lock_key, "run"):

                db = ctx.mongo_database()

                f = FETCHERS[fetcher](db=db,
                                      max_errors=max_errors,
                                      bulk_size=bulk_size,
                                      use_existing_file=use_files,
                                      not_remove_files=not_remove,
                                      dataset_only=dataset_only,
                                      refresh_meta=refresh_meta,
                                      async_mode=async_mode,
                                      force_update=force_update)

                if not dataset and not hasattr(f, "upsert_all_datasets"):
                    ctx.log_error(
                        "upsert_all_datasets method is not implemented for this fetcher."
                    )
                    ctx.log_error("Please choice a dataset.")
                    ctx.log_error("Operation cancelled !")
                    return

                if datatree:
                    f.upsert_data_tree(force_update=True)

                if dataset:
                    for ds in dataset:
                        f.wrap_upsert_dataset(ds)
                        if run_full:
                            _consolidate(ctx, db, fetcher, dataset=ds)
                            _update_tags(ctx,
                                         db,
                                         fetcher,
                                         dataset=ds,
                                         update_only=True)
                else:
                    f.upsert_all_datasets()
                    if run_full:
                        _consolidate(ctx, db, fetcher)
                        _update_tags(ctx, db, fetcher, update_only=True)

        except errors.Locked as err:
            ctx.log_error("run command is locked for key[%s]" % lock_key)
            return False
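
# ctx.lock() above is dlstats' own locking helper. A rough sketch of a
# MongoDB-backed lock exposed as a context manager; the "locks" collection and
# the exception used here are assumptions, not the real implementation:
import contextlib

@contextlib.contextmanager
def mongo_lock_sketch(db, key):
    from pymongo.errors import DuplicateKeyError
    try:
        # The unique _id acts as the lock: a second insert for the same key fails.
        db["locks"].insert_one({"_id": key})
    except DuplicateKeyError:
        raise RuntimeError("locked: %s" % key)
    try:
        yield
    finally:
        db["locks"].delete_one({"_id": key})
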
Example #16
def cmd_check_schemas(max_errors=None, **kwargs):
    """Check datas in DB with schemas
    """
    ctx = client.Context(**kwargs)
    ctx.log_warn("Attention, opération très longue")

    # dlstats mongo check-schemas --mongo-url mongodb://localhost/widukind -M 20 -S

    report = {}

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        start = time.time()

        db = ctx.mongo_database()
        from pymongo import ReadPreference

        for col in CURRENT_SCHEMAS.keys():
            print("check %s..." % col)

            report[col] = {'error': 0, 'verified': 0, 'time': 0}
            report[col]['count'] = db[col].count()

            s = time.time()

            _schema = CURRENT_SCHEMAS[col]

            #coll2 = coll1.with_options(read_preference=ReadPreference.SECONDARY_PREFERRED)
            #find(limit=0)
            #projection={'_id': False}
            for doc in db[col].with_options(
                    read_preference=ReadPreference.SECONDARY_PREFERRED).find():
                _id = None
                if max_errors and report[col]['error'] >= max_errors:
                    ctx.log_warn("Max error attempt. Skip test !")
                    break
                try:
                    report[col]['verified'] += 1
                    _id = str(doc.pop('_id'))
                    _schema(doc)
                except Exception as err:
                    report[col]['error'] += 1
                    if ctx.verbose:
                        ctx.log_error("%s - %s - %s" % (col, _id, str(err)))

            report[col]['time'] = "%.3f" % (time.time() - s)

        end = time.time() - start

        fmt = "{0:20} | {1:10} | {2:10} | {3:10} | {4:10}"
        print(
            "--------------------------------------------------------------------"
        )
        print(fmt.format("Collection", "Count", "Verified", "Errors", "Time"))
        for col, item in report.items():
            print(
                fmt.format(col, item['count'], item['verified'], item['error'],
                           item['time']))
        print(
            "--------------------------------------------------------------------"
        )
        print("time elapsed : %.3f seconds " % end)
        """
Example #17
def cmd_report(**kwargs):
    """Technical statistic report"""
    ctx = client.Context(**kwargs)
    ctx.log_error("Not Implemented")
Example #18
def cmd_restore(**kwargs):
    """Restore database or collection(s)"""
    ctx = client.Context(**kwargs)
    ctx.log_error("Not Implemented")
Example #19
def cmd_backup(**kwargs):
    """Backup database or collection(s)"""
    ctx = client.Context(**kwargs)
    ctx.log_error("Not Implemented")
Example #20
def cmd_copy_db(**kwargs):
    """Copy database to other database"""
    ctx = client.Context(**kwargs)
    ctx.log_error("Not Implemented")
Example #21
def cmd_report(fetcher=None, **kwargs):
    """Fetchers report"""
    """
    Report example:
    ---------------------------------------------------------------------------------------------------------------------------
    MongoDB: mongodb://localhost/widukind :
    ---------------------------------------------------------------------------------------------------------------------------
    Provider             | Version   | Dataset                        | Series     | First Download       | last Download
    ---------------------------------------------------------------------------------------------------------------------------
    BIS                  |         1 | PP-LS                          |         23 | 2016-01-06 - 09:38   | 2016-01-06 - 09:38
    INSEE                |         1 | CNT-2010-PIB-RF                |         11 | 2016-01-06 - 09:37   | 2016-01-06 - 09:37    
    ---------------------------------------------------------------------------------------------------------------------------
    """
    ctx = client.Context(**kwargs)
    db = ctx.mongo_database()
    fmt = "{0:10} | {1:4} | {2:30} | {3:10} | {4:15} | {5:20} | {6:20} | {7:7}"
    print(
        "----------------------------------------------------------------------------------------------------------------------------------------"
    )
    print("MongoDB: %s :" % ctx.mongo_url)
    print(
        "----------------------------------------------------------------------------------------------------------------------------------------"
    )
    print(
        fmt.format("Provider", "Ver.", "Dataset", "Series", "Last Update",
                   "First Download", "last Download", "Enable"))
    print(
        "----------------------------------------------------------------------------------------------------------------------------------------"
    )
    query = {}
    if fetcher:
        query["name"] = fetcher

    for provider in db[constants.COL_PROVIDERS].find(query):

        for dataset in db[constants.COL_DATASETS].find(
                {'provider_name': provider['name']}).sort("dataset_code"):

            series_count = db[constants.COL_SERIES].count({
                'provider_name': provider['name'],
                "dataset_code": dataset['dataset_code']
            })

            if not provider['enable']:
                _provider = "%s *" % provider['name']
            else:
                _provider = provider['name']

            print(
                fmt.format(
                    _provider, provider['version'], dataset['dataset_code'],
                    series_count,
                    str(dataset['last_update'].strftime("%Y-%m-%d")),
                    str(dataset['download_first'].strftime(
                        "%Y-%m-%d - %H:%M")),
                    str(dataset['download_last'].strftime("%Y-%m-%d - %H:%M")),
                    str(dataset["enable"])))
    print(
        "---------------------------------------------------------------------------------------------------------------------------"
    )