Esempio n. 1
0
    def handle(self, *args, **options):
        target = options['target']
        target_dict = {
            'type': 'postgres',
            'cursor': connection.cursor(),
            'with_indices': False
        }

        if target == 'mongo':
            target_dict = {
                'type': 'mongo',
                'db': get_mongo_db()
            }

        motu_args = (
            COPERNICUS_SERVER['USERNAME'],
            COPERNICUS_SERVER['PASSWORD'],
            BaseConverter.full_input_path(),
            '%s.nc' % str(uuid4()),
        )

        motu_script = options['script']
        # '-m http://data.ncof.co.uk/motu-web/Motu -s NORTHWESTSHELF_ANALYSIS_FORECAST_BIO_004_002_b -d MetO-NWS-BIO-dm-ATTN -x -19.888889312744 -X 12.999670028687 -y 40.066669464111 -Y 65.001251220703 -t "2017-05-14 12:00:00" -T "2017-05-18 12:00:00" -z 0 -Z 3.0001 -v attn '

        print('Downloading...')
        motu_download(('-u %s -p %s ' +
                       motu_script +
                       ' -o "%s" -f %s') % motu_args)
        print('Done.\n')

        cnv = NetCDF4Converter(motu_args[3])
        cnv.store(target=target_dict, stdout=self.stdout)
Esempio n. 2
0
def generate_dataset(target, variable, sizes, index=False, stdout=None):
    target_dict = {
        'type': 'postgres',
        'cursor': connection.cursor(),
        'with_indices': index
    }

    if target == 'mongo':
        target_dict = {
            'type': 'mongo',
            'db': get_mongo_db(),
            'with_indices': index,
        }

    dimensions = {
        'time': {
            'unit': 'hours since 2016-01-01',
            'min': 0,
            'step': 1
        },
        'lat': {
            'unit': 'degree_east',
            'min': -50,
            'step': 0.2
        },
        'lng': {
            'unit': 'degree_north',
            'min': 10,
            'step': 0.2
        }
    }

    total_items = 1
    for idx, d_name in enumerate(['time', 'lat', 'lng']):
        n_of_steps = int(sizes.split(',')[idx].strip())
        dimensions[d_name]['max'] = dimensions[d_name]['min'] + (
            n_of_steps - 1) * dimensions[d_name]['step']
        total_items *= n_of_steps

    cnv = RandomDataConverter(v_name=variable,
                              dimensions=dimensions,
                              title=variable)
    t1 = time.time()
    dataset = cnv.store(target=target_dict, stdout=stdout)
    t2 = time.time()

    return dataset, total_items, (t2 - t1)
Esempio n. 3
0
    def handle(self, *args, **options):
        target = options['target']
        target_dict = {
            'type': 'postgres',
            'cursor': connection.cursor(),
            'with_indices': True
        }

        if target == 'mongo':
            target_dict = {
                'type': 'mongo',
                'db': get_mongo_db(),
                'with_indices': True
            }

        cnv = NetCDF4Converter(options['file'])
        cnv.store(target=target_dict, stdout=self.stdout)
    def prepare_queries(self, pg_datasets, mongo_datasets):
        queries = [
            # simple select/filter
            {
                'title': 'Simple select/filter',
                'type': 'filter',
                'postgres': """
                SELECT * FROM (
                    SELECT <v1a>, <v2a>, <v3a>, value
                    FROM <t1>
                    WHERE <v1a> >= -9.8 AND <v1a> <= -9.6
                ) AS Q1
                ORDER BY value
                """,
                'mongo': {
                    'collection': "<c1>",
                    'find': {
                        'lat': {
                            '$gte': -9.8,
                            '$lte': -9.6
                        },
                    },
                }
            },

            # paginated select/filter
            {
                'title': 'Paginated select/filter',
                'type': 'filter',
                'postgres': """
                    SELECT * FROM (
                        SELECT <v1a>, <v2a>, <v3a>, value
                        FROM <t1>
                        WHERE <v1a> >= -9.8 AND <v1a> <= -9.6
                    ) AS Q1
                    ORDER BY value
                    LIMIT 10000000
                    OFFSET 20000000
                    """,
                'mongo': {
                    'collection': "<c1>",
                    'find': {
                        'lat': {
                            '$gte': -9.8,
                            '$lte': -9.6
                        },
                    },
                    'limit': 10000000,
                    'skip': 20000000
                }
            },

            # strict join
            {
                'title': 'Value difference in exact location & time',
                'type': 'join',
                'postgres': """
                SELECT * FROM (
                    SELECT <v1a>, <v2a>, <v3a>, (<t2>.value - <t1>.value) AS difv
                    FROM <t1>
                    JOIN <t2> ON <v1a>=<v1b> AND <v2a>=<v2b> AND <v3a>=<v3b>
                ) AS Q1
                ORDER BY difv
                """,
                'mongo': {
                    'collection':
                    "<c1>",
                    'aggregates': [
                        {
                            "$lookup": {
                                "from": "<c2>",
                                "localField": "<v1>",
                                "foreignField": "<v1>",
                                "as": "c2"
                            }
                        },
                        {
                            "$unwind": "$c2"
                        },
                        {
                            "$project": {
                                'lat': 1,
                                'lng': 1,
                                'time': 1,
                                'isLatEqual': {
                                    "$eq": ["$lat", "$c2.lat"]
                                },
                                'isLngEqual': {
                                    "$eq": ["$lng", "$c2.lng"]
                                },
                                'isTimeEqual': {
                                    "$eq": ["$time", "$c2.time"]
                                },
                                'diff': {
                                    '$subtract': ["$value", "$c2.value"]
                                },
                            },
                        },
                        {
                            "$match": {
                                'isLngEqual': True,
                                'isTimeEqual': True
                            }
                        },
                        {
                            "$sort": {
                                'diff': 1
                            }
                        },
                    ]
                }
            },
            {
                'title': 'Value difference at the same time',
                'type': 'join',
                'postgres': """
                SELECT * FROM (
                    SELECT <v1a>, <v2a>, <v3a>, (<t2>.value - <t1>.value) AS difv
                    FROM <t1>
                    JOIN <t2> ON <v3a>=<v3b>
                ) AS Q1
                ORDER BY difv
                """,
                'mongo': {
                    'collection':
                    "<c1>",
                    'aggregates': [
                        {
                            "$lookup": {
                                "from": "<c2>",
                                "localField": "<v3>",
                                "foreignField": "<v3>",
                                "as": "c2"
                            }
                        },
                        {
                            "$unwind": "$c2"
                        },
                        {
                            "$project": {
                                'lat': 1,
                                'lng': 1,
                                'time': 1,
                                'diff': {
                                    '$subtract': ["$value", "$c2.value"]
                                },
                            },
                        },
                        {
                            "$sort": {
                                'diff': 1
                            }
                        },
                    ]
                }
            },
        ]

        for query in queries:
            # PG replacements
            q = query['postgres']

            for d_id in (['a', 'b'] if pg_datasets[1] else ['a']):
                # replace table names
                q = q.replace(
                    '<t%d>' % (['a', 'b'].index(d_id) + 1), pg_datasets[[
                        'a', 'b'
                    ].index(d_id)].variables.get().data_table_name)

                # replace column names
                for dim_id in range(
                        1, pg_datasets[['a', 'b'].index(
                            d_id)].variables.get().dimensions.all().count() +
                        1):
                    q = q.replace(
                        '<v%d%s>' % (dim_id, d_id), pg_datasets[[
                            'a', 'b'
                        ].index(d_id)].variables.get().dimensions.all()[
                            dim_id - 1].data_column_name)

            query['postgres'] = q

            # MONGO replacements
            q = json.dumps(query['mongo'])

            c1 = get_mongo_db().variables.find_one(
                {'dataset_id': mongo_datasets[0]})
            if mongo_datasets[1]:
                c2 = get_mongo_db().variables.find_one(
                    {'dataset_id': mongo_datasets[1]})
            else:
                c2 = {'name': ''}

            q = q.replace('<c1>', c1['name']).replace('<c2>', c2['name'])

            for idx, dim in enumerate(c1['dimensions']):
                q = q.replace('<v%d>' % (idx + 1), dim)

            query['mongo'] = json.loads(q)

        return queries
    def handle(self, *args, **options):
        skip_mongo_joins = options['no_mongo_joins'] or False
        index = options['index'] or False
        skip_joins = options['no_joins'] or False
        if skip_joins:
            skip_mongo_joins = True
        v_name = 'rnd_%s' % ''.join(
            [str(random.choice(range(1, 10))) for _ in range(1, 5)])

        # call for postgres
        pd1, p_size, p_time = generate_dataset(target='postgres',
                                               variable=v_name + '_1',
                                               sizes=options['sizes'],
                                               index=index,
                                               stdout=self.stdout)
        if not skip_joins:
            pd2, _, _ = generate_dataset(target='postgres',
                                         variable=v_name + '_2',
                                         sizes=options['sizes'],
                                         index=index,
                                         stdout=self.stdout)
        else:
            pd2 = None

        # call for mongo
        md1, m_size, m_time = generate_dataset(target='mongo',
                                               variable=v_name + '_1',
                                               sizes=options['sizes'],
                                               index=index,
                                               stdout=self.stdout)
        if not skip_mongo_joins:
            md2, _, _ = generate_dataset(target='mongo',
                                         variable=v_name + '_2',
                                         sizes=options['sizes'],
                                         index=index,
                                         stdout=self.stdout)
        else:
            md2 = None

        # print insert times
        print('')
        print('Data import')
        print('===========')
        print('\tDatabase\tDuration\t\t#Rows')
        print('\tPostgres\t%.3f sec\t\t%d' % (p_time, p_size))
        print('\tMongoDB\t\t%.3f sec\t\t%d' % (m_time, m_size))

        # print disk usage info
        stats = get_mongo_db().command("collstats", v_name + '_1')
        m_disk_size = (stats['size'] +
                       stats['totalIndexSize']) / 1024.0 / 1024.0
        cur = connection.cursor()
        cur.execute("""
            SELECT *, pg_size_pretty(total_bytes) AS total
                    , pg_size_pretty(index_bytes) AS INDEX
                    , pg_size_pretty(toast_bytes) AS toast
                    , pg_size_pretty(table_bytes) AS TABLE
                  FROM (
                  SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes FROM (
                      SELECT c.oid,nspname AS table_schema, relname AS TABLE_NAME
                              , c.reltuples AS row_estimate
                              , pg_total_relation_size(c.oid) AS total_bytes
                              , pg_indexes_size(c.oid) AS index_bytes
                              , pg_total_relation_size(reltoastrelid) AS toast_bytes
                          FROM pg_class c
                          LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
                          WHERE relkind = 'r'
                  ) a
                ) a
                WHERE table_name = '%s';
            """ % pd1.variables.get().data_table_name)
        p_disk_size = cur.fetchone()[4] / 1024.0 / 1024.0

        print('')
        print('Disk usage')
        print('===========')
        print('\tDatabase\tSize (MB)\t\t#Rows')
        print('\tPostgres\t%.2f MB\t\t%d' % (p_disk_size, p_size))
        print('\tMongoDB\t\t%.2f MB\t\t%d' % (m_disk_size, m_size))

        # prepare queries
        queries = self.prepare_queries(pg_datasets=[pd1, pd2],
                                       mongo_datasets=[md1, md2])

        # execute queries
        for q in queries:
            if skip_joins and q['type'] == 'join':
                continue

            print('')
            title = 'Query: %s' % q['title']
            print(title)
            print(''.join(['=' for _ in title]))
            print('\tDatabase\tDuration\t\t#Results')

            try:
                t1 = time.time()
                cur = connection.cursor()
                cur.execute(q['postgres'])
                cnt = len([x for x in cur.fetchall()])
                t2 = time.time()

                print('\tPostgres\t%.3f sec\t\t%d' % ((t2 - t1), cnt))
            except:
                traceback.print_exc()
                print('\tPostgres failed')

            if skip_mongo_joins and q['type'] == 'join':
                continue
            try:
                t1 = time.time()
                f = get_mongo_db().get_collection(q['mongo']['collection'])
                if 'find' in q['mongo']:
                    f = f.find(q['mongo']['find'])
                if 'aggregates' in q['mongo']:
                    f = f.aggregate(q['mongo']['aggregates'])
                if 'skip' in q['mongo']:
                    f = f.limit(q['mongo']['skip'])
                if 'limit' in q['mongo']:
                    f = f.limit(q['mongo']['limit'])
                cnt = [x for x in f].__len__()
                t2 = time.time()

                print('\tMongoDB\t\t%.3f sec\t\t%d' % ((t2 - t1), cnt))
            except:
                traceback.print_exc()
                print('\tMongoDB failed')

        # drop everything
        pd1.delete()
        if pd2:
            pd2.delete()

        for md in [md1, md2]:
            if md is None:
                continue

            db = get_mongo_db()
            variable = db.variables.find_one({'dataset_id': md})
            db.get_collection(variable['name']).drop()
            db.dimensions.delete_many({'variable_id': variable['_id']})
            db.variables.delete_many({'dataset_id': md})
            db.datasets.delete_many({'_id': md})