Example #1
    def delete_dataset(database, collection):
        retrieve_util = MongoRetrieveUtil(database, collection)
        db_info = retrieve_util.get_mongo_db(database)
        if not db_info.success:
            return err_resp(db_info.err_msg)

        db = db_info.result_obj

        db[collection].drop()
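A minimal usage sketch for this helper, which the later examples invoke as a method on EventJobUtil; the database and collection names below are placeholders, and note that the method only returns a response object on failure.

# hypothetical call, mirroring how later examples invoke delete_dataset
resp = EventJobUtil.delete_dataset('eventdata', 'my_collection')
if resp is not None and not resp.success:  # a value is only returned on error
    print(resp.err_msg)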
Example #2
def run_test():
    print('step 1')
    mr = MongoRetrieveUtil('test-it')
    if mr.has_error():
        print(mr.get_error_message())
        return

    db_info = mr.get_mongo_db('ok-db')
    print('db_info.success', db_info.success)

    print('step 2')

    client_info = mr.get_mongo_client()
    if not client_info.success:
        print(client_info.err_msg)
        return

    client = client_info.result_obj
    for x in client.list_databases():
        print(x)
    print('step 3')

    return

    # NOTE: the calls below are unreachable because of the early return above;
    # they are kept for manual debugging
    print('get_mongo_url:', mr.get_mongo_url())
    db_info = mr.get_mongo_db('hello')
    print('success?', db_info.success)

    print(mr.list_databases())
Example #3
    def get_data(database,
                 collection,
                 method,
                 query,
                 distinct=None,
                 host=None):
        """Return data from a Mongo query"""

        if method == 'distinct' and not distinct:
            return err_resp("the distinct method requires a 'keys' argument")

        retrieve_util = MongoRetrieveUtil(database, collection, host)
        success, data = retrieve_util.run_query(query, method, distinct)

        return ok_resp(data) if success else err_resp(data)
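A hedged usage sketch for get_data; the owning class (EventJobUtil), collection, query, and 'distinct' key are assumptions or placeholders, not taken from the source. The ok_resp/err_resp wrappers expose .success, .result_obj, and .err_msg, as the other examples show.

# hypothetical call with placeholder arguments
resp = EventJobUtil.get_data('eventdata', 'acled_africa', 'distinct',
                             query={}, distinct='country')
if resp.success:
    print(resp.result_obj)
else:
    print(resp.err_msg)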
Example #4
def api_mongo_healthcheck(request):
    """Mongo healthcheck"""
    mongo_check = MongoRetrieveUtil.run_tworavens_healthcheck()

    if mongo_check.success:
        return JsonResponse(get_json_success(
                            'Mongo is running',
                            data=mongo_check.result_obj))

    return JsonResponse(get_json_error(mongo_check.err_msg))
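If this view is wired into Django routing in the usual way, it would look roughly like the sketch below; the URL path and module layout are assumptions, not taken from the source.

# hypothetical urls.py entry
from django.urls import path
from . import views

urlpatterns = [
    path('monitoring/mongo-healthcheck/', views.api_mongo_healthcheck,
         name='api_mongo_healthcheck'),
]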
Example #5
    def upload_query_result(event_obj):
        """upload query result to dataverse"""
        collection_name = event_obj.as_dict()['collection_name']
        query_obj = event_obj.as_dict()['query']
        query_id = event_obj.as_dict()['id']
        filename = '%s_%s.txt' % (str(query_id), str(collection_name))
        obj = MongoRetrieveUtil(settings.EVENTDATA_DB_NAME, collection_name)
        success, mongo_obj = obj.run_query(query_obj, 'aggregate')

        if not success:
            return err_resp(mongo_obj)

        json_dump = json.dumps(mongo_obj)
        temp_file_obj = TemporaryFileMaker(filename, json_dump)

        succ, res_obj = temp_file_obj.return_status()
        print("query result upload : ", res_obj)

        if succ:
            return ok_resp(res_obj)
        else:
            return err_resp(res_obj)
def util_results_importance_efd(data_pointer, metadata):
    LIMIT_UNIQUE_LEVELS = 20

    # make sure the base dataset is loaded
    EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                metadata['collectionName'],
                                data_path=metadata['collectionPath'])

    results_collection_name = metadata[
        'collectionName'] + '_produce_' + mongofy_collection_name(
            metadata['produceId'])

    util = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if util.has_error():
        return {KEY_SUCCESS: False, KEY_DATA: util.get_error_message()}

    levels = {}
    # populate levels (for example, numeric column tagged as categorical)
    for variable in metadata['categoricals']:
        # levels are passed, but levels have lost type information (json object keys are coerced to string)
        # if not levels[key]:
        response = util.run_query([
            *metadata['query'], {
                "$group": {
                    "_id": f"${variable}",
                    "count": {
                        '$sum': 1
                    }
                }
            }, {
                '$sort': {
                    'count': -1,
                    '_id': 1
                }
            }, {
                "$limit": LIMIT_UNIQUE_LEVELS
            }
        ], 'aggregate')

        if not response[0]:
            return {KEY_SUCCESS: False, KEY_DATA: response[1]}
        levels[variable] = [doc['_id'] for doc in response[1]]

        # limit the number of unique levels
        if len(levels[variable]) > LIMIT_UNIQUE_LEVELS:
            levels[variable] = levels[variable][:LIMIT_UNIQUE_LEVELS]

    # fitted versions of variables have same levels as their originals
    levels.update({'fitted ' + key: levels[key] for key in levels})
    # renamed variables have the same levels as their originals
    levels.update({'actual ' + key: levels[key] for key in levels})

    # print('metadata levels', levels)

    def is_categorical(variable, levels):
        return variable in levels

    def branch_target(variable, levels):
        if is_categorical(variable, levels):
            return {
                f'{variable}-{level}': {
                    "$avg": {
                        "$cond": [{
                            "$eq": [f"${variable}", level]
                        }, 1, 0]
                    }
                }
                for level in levels[variable]
            }
        # compute mean of fitted and actual
        return {
            f'fitted {variable}': {
                "$avg": f'$fitted {variable}'
            },
            f'actual {variable}': {
                "$avg": f'$actual {variable}'
            },
            'error': {
                '$sum': {
                    "$pow": [{
                        '$subtract':
                        [f'$fitted {variable}', f'$actual {variable}']
                    }, 2]
                }
            }
        }

    def aggregate_targets(variables, levels):
        return {
            k: v
            for d in [branch_target(target, levels) for target in variables]
            for k, v in d.items()
        }

    def branch_target_levels(variable, levels):
        if is_categorical(variable, levels):
            return {
                f'{variable}-{level}': {
                    "$avg": {
                        "$cond": [{
                            "$eq": [f"${variable}", level]
                        }, 1, 0]
                    }
                }
                for level in levels[variable]
            }
        return {variable: {"$avg": f'${variable}'}}

    def aggregate_target_levels(variables, levels):
        return {
            k: v
            for d in [
                *[
                    branch_target_levels('fitted ' + target, levels)
                    for target in variables
                ], *[
                    branch_target_levels('actual ' + target, levels)
                    for target in variables
                ]
            ] for k, v in d.items()
        }

    target_aggregator = aggregate_target_levels(metadata['targets'], levels)

    query = [
        *metadata['query'],
        {
            "$lookup": {
                "from":
                settings.MONGO_COLLECTION_PREFIX + results_collection_name,
                "localField": "d3mIndex",
                "foreignField": "d3mIndex",
                "as": "results_collection"
            }
        },
        {
            "$unwind": "$results_collection"
        },
        {
            "$project": {
                **{
                    'fitted ' + name: f"$results_collection\\.{name}"
                    for name in metadata['targets']
                },
                **{
                    'actual ' + name: f"${name}"
                    for name in metadata['targets']
                },
                **{
                    f"predictor {predictor}": f"${predictor}"
                    for predictor in metadata['predictors']
                },
                **{
                    "_id": 0
                }
            }
        },
        {
            "$facet": {
                predictor: [{
                    "$group": {
                        **{
                            "_id": f'$predictor {predictor}',
                            'count': {
                                "$sum": 1
                            }
                        },
                        **target_aggregator
                    }
                }, {
                    "$sort": {
                        "count": -1,
                        '_id': 1
                    }
                }, {
                    "$limit": 20
                }, {
                    "$project": {
                        **{
                            "predictor": "$_id"
                        },
                        **{k: 1
                           for k in target_aggregator.keys()},
                        **{
                            "_id": 0
                        }
                    }
                }] if is_categorical(predictor, levels) else [{
                    "$bucketAuto": {
                        "groupBy": f'$predictor {predictor}',
                        "buckets": 100,
                        "output": target_aggregator
                    }
                }, {
                    "$project": {
                        **{
                            "predictor": {
                                "$avg": ["$_id\\.min", "$_id\\.max"]
                            }
                        },
                        **{k: 1
                           for k in target_aggregator.keys()},
                        **{
                            "_id": 0
                        }
                    }
                }]
                for predictor in metadata['predictors']
            }
        },
    ]

    try:
        status = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                             results_collection_name,
                                             data_pointer)

        if not status.success:
            return {KEY_SUCCESS: False, KEY_DATA: status.err_msg}

        response = list(util.run_query(query, method='aggregate'))

        if not response[0]:
            return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

        # exhaust cursor before dropping dataset
        data = next(response[1])

    finally:
        pass
        # EventJobUtil.delete_dataset(
        #     settings.TWORAVENS_MONGO_DB_NAME,
        #     results_collection_name)

    def kernel_linear(size):
        return list(range(1, size // 2 + 2)) + list(range(size // 2, 0, -1))

    def kernel_uniform(size):
        return [1] * size

    print(data)

    def smooth(kernel, data, predictor):
        if len(kernel) % 2 != 1:
            raise ValueError('Kernel must be odd-length')
        # normalize kernel
        kernel = [i / sum(kernel) for i in kernel]

        # clip indices for data access on kernel offsets at edges
        def clip(x):
            return max(0, min(len(data) - 1, x))

        offset = len(kernel) // 2
        smoothed = []
        for i in range(len(data)):
            smoothed.append({
                **{
                    level: sum(weight * (data[clip(i + j_level - offset)][level] or 0) for j_level, weight in enumerate(kernel))
                    for level in data[i].keys() if level != "predictor"
                },
                **{
                    "predictor": data[i]["predictor"]
                }
            })
        return smoothed

    # pyperclip.copy(json.dumps({"query": query, "data": data}, indent=4))
    for predictor in metadata['predictors']:
        if not is_categorical(predictor, levels):
            data[predictor] = smooth(kernel_linear(size=7), data[predictor],
                                     predictor)

    return {KEY_SUCCESS: True, KEY_DATA: data}
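The kernel_linear helper above builds a triangular kernel whose weights taper away from the center; a small standalone sketch (the helper is copied verbatim from the function above) makes the weighting concrete:

def kernel_linear(size):
    return list(range(1, size // 2 + 2)) + list(range(size // 2, 0, -1))

kernel = kernel_linear(7)                     # [1, 2, 3, 4, 3, 2, 1]
weights = [i / sum(kernel) for i in kernel]   # normalized to sum to 1
print(weights)   # [0.0625, 0.125, 0.1875, 0.25, 0.1875, 0.125, 0.0625]

# smooth() applies these weights across each bucket's neighbors, clipping indices
# at the edges so the first and last buckets reuse their own values for
# out-of-range offsets.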
def util_results_confusion_matrix(data_pointer, metadata):
    """Get the content from the file and format a JSON snippet
    that includes statistical summaries.
    """
    response = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                           metadata['collectionName'],
                                           metadata['collectionPath'])

    if not response.success:
        return {KEY_SUCCESS: False, KEY_DATA: response.err_msg}

    results_collection_name = metadata[
        'collectionName'] + '_produce_' + mongofy_collection_name(
            metadata['produceId'])

    util = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if util.has_error():
        return {KEY_SUCCESS: False, KEY_DATA: util.get_error_message()}

    query = [
        *metadata['query'],
        # minor optimization, drop unneeded columns before performing lookup
        {
            "$project": {
                **{name: 1
                   for name in metadata['targets']},
                **{
                    'd3mIndex': 1
                }
            }
        },
        {
            "$lookup": {
                "from":
                settings.MONGO_COLLECTION_PREFIX + results_collection_name,
                "localField": "d3mIndex",
                "foreignField": "d3mIndex",
                "as": "results_collection"
            }
        },
        {
            "$unwind": "$results_collection"
        },
        {
            "$project": {
                **{
                    'Predicted_' + name: f"$results_collection\\.{name}"
                    for name in metadata['targets']
                },
                **{
                    'Actual_' + name: f"${name}"
                    for name in metadata['targets']
                },
                **{
                    "_id": 0
                }
            }
        },
        {
            '$facet': {
                target: [{
                    "$group": {
                        '_id': {
                            'Actual': f'$Actual_{target}',
                            'Predicted': f'$Predicted_{target}'
                        },
                        'count': {
                            '$sum': 1
                        }
                    }
                }, {
                    "$project": {
                        'Actual': '$_id\\.Actual',
                        'Predicted': '$_id\\.Predicted',
                        'count': 1,
                        '_id': 0
                    }
                }, {
                    "$sort": {
                        'Actual': 1
                    }
                }]
                for target in metadata['targets']
            }
        }
    ]

    try:
        status = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                             results_collection_name,
                                             data_pointer,
                                             indexes=['d3mIndex'])

        if not status.success:
            return {KEY_SUCCESS: False, KEY_DATA: status.err_msg}

        response = list(util.run_query(query, method='aggregate'))

    finally:
        EventJobUtil.delete_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                    results_collection_name)

    if not response[0]:
        return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

    data = next(response[1])

    return {
        KEY_SUCCESS: response[0],
        KEY_DATA: {
            target: {
                'data': data[target],
                'classes': list(set(map(lambda x: x['Actual'], data[target])))
            }
            for target in data.keys()
        }
    }
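For each target, the 'classes' list is simply the distinct Actual labels pulled out of the facet rows; a toy sketch of that last step (the rows are invented placeholders):

rows = [{'Actual': 'a', 'Predicted': 'a', 'count': 3},
        {'Actual': 'b', 'Predicted': 'a', 'count': 1}]
classes = list(set(map(lambda x: x['Actual'], rows)))
print(classes)   # ['a', 'b'] (set order is not guaranteed)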
def util_results_real_clustered(data_pointer, metadata):
    GRID_SIZE = 100
    response = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                           metadata['collectionName'],
                                           metadata['collectionPath'])

    if not response.success:
        return {KEY_SUCCESS: False, KEY_DATA: response.err_msg}

    results_collection_name = metadata[
        'collectionName'] + '_produce_' + mongofy_collection_name(
            metadata['produceId'])

    mongo_util_base = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if mongo_util_base.has_error():
        return {
            KEY_SUCCESS: False,
            KEY_DATA: mongo_util_base.get_error_message()
        }

    # fitted values live in the produce/results collection, not the base collection
    mongo_util_fitted = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + results_collection_name)
    if mongo_util_fitted.has_error():
        return {
            KEY_SUCCESS: False,
            KEY_DATA: mongo_util_fitted.get_error_message()
        }

    def normalize(variable, minimum, maximum, scale=1):
        return {
            "$divide": [{
                "$subtract": [variable, minimum]
            }, (maximum - minimum) / scale]
        }

    try:
        status = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                             results_collection_name,
                                             data_pointer,
                                             indexes=['d3mIndex'])

        if not status.success:
            return {KEY_SUCCESS: False, KEY_DATA: status.err_msg}

        # COMPUTE ACTUAL BOUNDS
        bounds = {}
        response = list(
            mongo_util_base.run_query([
                *metadata['query'], {
                    "$match": {
                        target: {
                            "$not": {
                                "$type": 2
                            }
                        }
                        for target in metadata['targets']
                    }
                }, {
                    "$group": {
                        "_id": 0,
                        **{
                            f'min_{target}': {
                                "$min": f"${target}"
                            }
                            for target in metadata['targets']
                        },
                        **{
                            f'max_{target}': {
                                "$max": f"${target}"
                            }
                            for target in metadata['targets']
                        }
                    }
                }
            ],
                                      method='aggregate'))

        if not response[0]:
            return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

        record = next(response[1])
        bounds['actual'] = {
            target: [record[f'min_{target}'], record[f'max_{target}']]
            for target in metadata['targets']
        }

        # COMPUTE FITTED BOUNDS
        response = list(
            mongo_util_fitted.run_query([{
                "$match": {
                    target: {
                        "$not": {
                            "$type": 2
                        }
                    }
                    for target in metadata['targets']
                }
            }, {
                "$group": {
                    "_id": 0,
                    **{
                        f'min_{target}': {
                            "$min": f"${target}"
                        }
                        for target in metadata['targets']
                    },
                    **{
                        f'max_{target}': {
                            "$max": f"${target}"
                        }
                        for target in metadata['targets']
                    }
                }
            }],
                                        method='aggregate'))

        if not response[0]:
            return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

        record = next(response[1])
        bounds['fitted'] = {
            target: [record[f'min_{target}'], record[f'max_{target}']]
            for target in metadata['targets']
        }

        # GRID CLUSTERING
        query = [
            *metadata['query'],
            {
                "$project": {
                    **{name: 1
                       for name in metadata['targets']},
                    **{
                        'd3mIndex': 1
                    }
                }
            },
            # ignore records with strings in the target variable
            {
                "$match": {
                    target: {
                        "$not": {
                            "$type": 2
                        }
                    }
                    for target in metadata['targets']
                }
            },
            {
                "$lookup": {
                    "from":
                    settings.MONGO_COLLECTION_PREFIX + results_collection_name,
                    "localField": "d3mIndex",
                    "foreignField": "d3mIndex",
                    "as": "results_collection"
                }
            },
            {
                "$unwind": "$results_collection"
            },
            {
                "$project": {
                    **{
                        'fitted_' + name: f"$results_collection\\.{name}"
                        for name in metadata['targets']
                    },
                    **{
                        'actual_' + name: f"${name}"
                        for name in metadata['targets']
                    },
                    **{
                        "_id": 0
                    }
                }
            },
            {
                "$facet": {
                    target: [{
                        "$group": {
                            "_id": {
                                'x': {
                                    '$toInt':
                                    normalize(f'$fitted_{target}',
                                              *bounds['fitted'][target],
                                              GRID_SIZE)
                                },
                                'y': {
                                    '$toInt':
                                    normalize(f'$actual_{target}',
                                              *bounds['actual'][target],
                                              GRID_SIZE)
                                }
                            },
                            'Fitted Values': {
                                "$avg": f'$fitted_{target}'
                            },
                            'Actual Values': {
                                "$avg": f'$actual_{target}'
                            },
                            'count': {
                                '$sum': 1
                            }
                        }
                    }, {
                        '$project': {
                            '_id': 0
                        }
                    }]
                    for target in metadata['targets']
                }
            }
        ]

        response = list(mongo_util_base.run_query(query, method='aggregate'))

    finally:
        pass
        # EventJobUtil.delete_dataset(
        #     settings.TWORAVENS_MONGO_DB_NAME,
        #     results_collection_name)

    return {
        KEY_SUCCESS: response[0],
        KEY_DATA: next(response[1]) if response[0] else response[1]
    }
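The normalize helper expands into a plain Mongo $divide/$subtract expression that rescales a value onto the 0..GRID_SIZE axis; a minimal sketch with placeholder bounds (the helper is copied from the function above):

def normalize(variable, minimum, maximum, scale=1):
    return {
        "$divide": [{
            "$subtract": [variable, minimum]
        }, (maximum - minimum) / scale]
    }

# e.g. map actual values in [0, 50] onto a 100-cell grid axis
print(normalize('$actual_y', 0, 50, 100))
# {'$divide': [{'$subtract': ['$actual_y', 0]}, 0.5]}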
Example #9
    def import_dataset(database,
                       collection,
                       data_path,
                       reload=False,
                       header=True,
                       columns=None,
                       indexes=None,
                       delimiter=None):
        """Key method to load a Datafile into Mongo as a new collection"""
        print('--> import_dataset --')

        retrieve_util = MongoRetrieveUtil(database, collection)
        db_info = retrieve_util.get_mongo_db(database)
        if not db_info.success:
            return err_resp(db_info.err_msg)

        db = db_info.result_obj

        collection_name = settings.MONGO_COLLECTION_PREFIX + collection

        # dataset already loaded in mongo
        #
        if collection_name in db.list_collection_names():
            if reload:
                db[collection_name].drop()
                MongoDataset.objects.select_for_update().filter(
                    name=collection_name).delete()
            else:
                print(
                    '--> import_dataset: data in database, no data in django, not reloading'
                )
                # make sure database entry exists
                dataset_records = MongoDataset.objects.select_for_update(
                ).filter(name=collection_name)
                if dataset_records:
                    dataset_record = dataset_records[0]
                    dataset_record.loading = False
                    dataset_record.save()
                else:
                    MongoDataset.objects.create(name=collection_name,
                                                loading=False)

                return ok_resp({'collection': collection_name})
        else:
            # if data is not loaded, make sure record is not in database
            try:
                MongoDataset.objects.select_for_update().filter(
                    name=collection_name).delete()
            except MongoDataset.DoesNotExist:
                pass
                # print('data not loaded, and no data in django')

        # create lockable record
        if not MongoDataset.objects.select_for_update().filter(
                name=collection_name):
            MongoDataset.objects.create(name=collection_name, loading=True)

        # lock on record
        dataset_record = MongoDataset.objects.select_for_update().get(
            name=collection_name)
        if not dataset_record.loading:
            return ok_resp({'collection': collection_name})

        # print(collection_name + ' does not yet exist. Importing.\n\n\n\n')

        if not data_path:
            return err_resp('The file_uri cannot be None or an empty string.')

        if not os.path.exists(data_path):
            return err_resp(collection + ' not found')
        # Convert the file uri to a path
        #
        fpath, err_msg = format_file_uri_to_path(data_path)
        if err_msg:
            return err_resp(err_msg)

        # for mongoimport commands
        #
        import_commands = []

        # -------------------------------------
        # ignore first line of input files
        # -------------------------------------
        if header:
            import_commands.append(f'tail -n +2')

        # -------------------------------------
        # standardize column metadata to dict
        # -------------------------------------
        if not columns:
            columns = DuplicateColumnRemover(data_path).updated_columns

        if isinstance(columns, list):
            columns = {col: None for col in columns}

        # -------------------------------------
        # standardize dict's tworavens types to mongo,
        # try to be flexible with alternative words
        # -------------------------------------
        def mongofy_type(value):
            return {
                bool: 'boolean',
                'boolean': 'boolean',
                str: 'string',
                'string': 'string',
                int: 'int32',
                'int32': 'int32',
                'int': 'int32',
                float: 'double',
                'double': 'double',
                'float': 'double',
                datetime.datetime: 'date',
                'date': 'date'
            }.get(value, 'auto')

        columns = {col: mongofy_type(columns[col]) for col in columns}

        # -------------------------------------
        # Prepare field names and set delimiter
        #    for Mongo import/insert
        # -------------------------------------
        def sanitize(column):
            return encode_variable(column).replace('"', '\\"')

        field_names = ','.join(f"{sanitize(col)}.{columns.get(col, 'auto')}()"
                               for col in columns)
        print('field_names', field_names)
        delimiter_type = 'csv'
        if os.path.splitext(data_path)[1] == '.tsv':
            delimiter_type = 'tsv'
        if delimiter in [None, ',']:
            pass
        elif delimiter == '\t':
            delimiter_type = 'tsv'
        else:
            import_commands.append(f'tr "{delimiter}" "\t" <')
            delimiter_type = 'tsv'

        delimiter = {'csv': ',', 'tsv': '\t'}[delimiter_type]

        # ------------------------------------------
        # TEMP skip this for k8s...
        # ---
        # Prepare and run the mongoimport command
        # ------------------------------------------
        # try:
        if False:  # try:

            import_commands.append(
                f'mongoimport'
                f' --db {database}'
                f' --collection {settings.MONGO_COLLECTION_PREFIX + collection}'
                f' --type {delimiter_type}'
                f' --ignoreBlanks'
                f' --columnsHaveTypes'
                f' --parseGrace autoCast'
                f' --drop'
                f' --numInsertionWorkers=4'
                f' --fields "{field_names}"')

            # the first command takes the data path, which is piped through the other commands
            import_commands[0] = import_commands[0] + ' ' + data_path

            print('--> import_dataset: mongoimport command:')
            print('-->' + ' | '.join(import_commands))

            # pipe each command to the next
            print('--> start subprocess')
            process = subprocess.Popen(shlex.split(import_commands[0]),
                                       stdout=subprocess.PIPE)
            for command in import_commands[1:]:
                print('--> command (bracketed): [%s]' % command)
                process = subprocess.Popen(shlex.split(command),
                                           stdin=process.stdout,
                                           stdout=subprocess.PIPE)
            process.communicate()

            for column in columns.keys():
                db[collection_name].update({column: {
                    '$exists': False
                }}, {'$set': {
                    column: None
                }},
                                           multi=True)

        else:  #except Exception as err:
            # slower, secondary import if first fails
            #print('--> mongo err: [%s]' % err)
            #print(traceback.format_exc())
            print(
                '--> import_dataset: mongoimport failed. Running row-by-row insertion instead.'
            )
            db[collection_name].drop()
            with open(data_path, 'r') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=delimiter)

                # discard header
                next(csv_reader)

                # use duplicate column name removal headers instead
                columns = [encode_variable(value) for value in columns]

                for observation in csv_reader:
                    db[collection_name].insert_one({
                        col: infer_type(val)
                        for col, val in zip(columns, observation)
                    })

        if indexes:
            for index in indexes:
                # print('creating index ', index, ' on ', collection_name)
                db[collection_name].create_index(index)

        dataset_record.loading = False
        dataset_record.save()

        return ok_resp({'collection': collection_name})
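A hedged usage sketch, mirroring how the earlier examples call this method; the collection name and file path are placeholders, and the Django settings import is assumed.

from django.conf import settings  # assumed; the examples reference settings.* throughout

# hypothetical call with placeholder collection name and path
resp = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                   'my_dataset',
                                   data_path='/tmp/my_dataset.csv',
                                   indexes=['d3mIndex'])
if resp.success:
    print(resp.result_obj)   # {'collection': <prefixed collection name>}
else:
    print(resp.err_msg)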
Example #10
def check_mongo():
    """test"""
    # ['cline_phoenix_nyt', 'icews', 'cline_phoenix_swb', 'acled_asia', 'cline_speed', 'acled_africa', 'acled_middle_east', 'cline_phoenix_fbis']
    mr = MongoRetrieveUtil('cline_phoenix_fbis', '*')
    if mr.has_error():
        print(mr.get_error_message())