Example #1
def create_truncate_psql_objects(dbreq, schemas_path, psql_schema):
    """ drop and create tables for all collections """
    schema_engines = get_schema_engines_as_dict(schemas_path)
    for _, schema in schema_engines.iteritems():
        tables_obj = create_tables_load_bson_data(schema, None)
        drop = True
        create_psql_tables(tables_obj, dbreq, psql_schema, '', drop)
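
A hedged call sketch (dbreq is assumed to be the PsqlRequests connection wrapper used in the examples below; the schemas directory and the 'public' schema name are placeholders):

# sketch only: dbreq, the schemas directory and 'public' are placeholder assumptions
create_truncate_psql_objects(dbreq, 'test_data/schemas/rails4_mongoid_development', 'public')
dbreq.conn.commit()  # commit the DDL, as the CLI examples below do
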
Example #2
def test_external_data_loader():

    def table_rows_list(table):
        """ get list of rows, every row is values list
        @param table object schema_engine.SqlTable"""
        res = []
        firstcolname = table.sql_column_names[0]
        reccount = len(table.sql_columns[firstcolname].values)
        for val_i in xrange(reccount):
            values = []
            for column_name in table.sql_column_names:
                col = table.sql_columns[column_name]
                values.append(col.values[val_i])
            res.append(values)
        return res

    tables1 = test_all_tables()
    ext_tables = create_tables_load_bson_data(tables1.schema_engine, None)
    # empty before data loaded
    assert(ext_tables.is_empty()==True)
    ext_tables_data = {}
    for name in tables1.tables:
        ext_tables_data[name] = table_rows_list(tables1.tables[name])
    ext_tables.load_external_tables_data(ext_tables_data)
    # non empty after data loaded
    assert(ext_tables.is_empty()==False)
    assert(tables1.compare(ext_tables))
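
Across these examples the common entry point is create_tables_load_bson_data(schema_engine, bson_data): passing None yields an empty table skeleton, while passing a list with one decoded record yields populated tables. A minimal sketch of that pattern (schema_engine and record are assumed to come from the surrounding code, e.g. from get_schema_engines_as_dict() and, presumably, bson.json_util.loads()):

# sketch only: schema_engine and record are assumptions taken from the other examples
empty_tables = create_tables_load_bson_data(schema_engine, None)
assert(empty_tables.is_empty() == True)

loaded_tables = create_tables_load_bson_data(schema_engine, [record])
for table_name, sqltable in loaded_tables.tables.iteritems():
    print table_name, sqltable.sql_column_names
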
Example #3
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file",
                        action="store",
                        help="Settings file",
                        type=file,
                        required=True)
    parser.add_argument("--psql-section",
                        help="Psql section name from config",
                        type=str,
                        required=True)
    parser.add_argument("-cn",
                        "--collection-name",
                        help="Mongo collection name",
                        type=str,
                        required=True)
    parser.add_argument("--psql-table-name", type=str, required=True)
    parser.add_argument("-psql-table-prefix", type=str, required=False)
    parser.add_argument("--input-csv-dir", type=str, required=True)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schema_name = config['psql']['psql-schema-name']
    schemas_dir = config['misc']['schemas-dir']
    schema_path = join(schemas_dir, args.collection_name + '.json')
    schema_file = open(schema_path, 'r')

    psql_settings = psql_settings_from_config(config, args.psql_section)

    table_prefix = ""
    if args.psql_table_prefix:
        table_prefix = args.psql_table_prefix

    schema = SchemaEngine(args.collection_name, [load(schema_file)])
    table = create_tables_load_bson_data(schema, None)\
        .tables[args.psql_table_name]
    dbreq = PsqlRequests(psql_conn_from_settings(psql_settings))

    create_psql_table(table, dbreq, schema_name, table_prefix, drop=True)
    create_psql_index(table, dbreq, schema_name, table_prefix)
    dbreq.conn.commit()

    csv_files = [f \
                 for f in listdir(args.input_csv_dir) \
                 if isfile(join(args.input_csv_dir, f))]
    csv_files.sort()
    for name in csv_files:
        csvpath = join(args.input_csv_dir, name)
        with open(csvpath, 'rb') as csv_f:
            schema_name_subst = schema_name
            if len(schema_name):
                schema_name_subst += '.'
            tname = '%s"%s%s"' % (schema_name_subst, table_prefix,
                                  args.psql_table_name)
            copy_from_csv(dbreq, csv_f, tname)
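
For clarity, a worked example of how the COPY target name at the end of main() is assembled (all values below are placeholders):

# worked example of the tname construction above; values are placeholders
schema_name = 'public'
table_prefix = 'tmp_'
psql_table_name = 'posts'
qualified_schema = schema_name + '.' if schema_name else ''
tname = '%s"%s%s"' % (qualified_schema, table_prefix, psql_table_name)
# tname is now: public."tmp_posts"
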
Example #4
def async_worker_handle_mongo_rec(schema_engines, rec_data_and_collection):
    """ function intended to call by FastQueueProcessor.
    process mongo record / bson data in separate process.
    schema_engines -- dict {'collection name': SchemaEngine}. Here is
    many schema engines to use every queue to handle items from any collection;
    rec_data_and_collection - tuple('collection name', bson record)"""
    rec = rec_data_and_collection[0]
    collection = rec_data_and_collection[1]
    return create_tables_load_bson_data(schema_engines[collection], [rec])
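
A direct-call sketch showing the expected argument shapes (the 'posts' collection name and the record value are placeholders; in the real pipeline FastQueueProcessor supplies them):

# sketch only: schema_engine and record are assumed to exist in the caller
schema_engines = {'posts': schema_engine}
tables_obj = async_worker_handle_mongo_rec(schema_engines, (record, 'posts'))
print tables_obj.tables.keys()
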
Example #5
def get_schema_tables(schema_engine_obj):
    collection_name = schema_engine_obj.root_node.name
    dirpath = os.path.dirname(os.path.abspath(__file__))
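    # 'files' is a module-level mapping defined elsewhere in this test module;
    # only the data-file path at index 1 is used here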
    data_fname = files[collection_name][1]
    data_path = os.path.join(dirpath, data_fname)
    tables_with_data = create_tables_load_file(schema_engine_obj, data_path)
    tables_no_data = create_tables_load_bson_data(schema_engine_obj, None)
    assert(tables_with_data.tables.keys() == tables_no_data.tables.keys())
    return tables_with_data
Example #6
def async_worker_handle_mongo_rec(schema_engines,
                                  rec_data_and_collection):
    """ function intended to call by FastQueueProcessor.
    process mongo record / bson data in separate process.
    schema_engines -- dict {'collection name': SchemaEngine}. Here is
    many schema engines to use every queue to handle items from any collection;
    rec_data_and_collection - tuple('collection name', bson record)"""
    rec = rec_data_and_collection[0]
    collection = rec_data_and_collection[1]
    return create_tables_load_bson_data(schema_engines[collection],
                                        [rec])
Example #7
def cb_insert(psql_schema, ts, ns, schema_engine, bson_data):
    tables = create_tables_load_bson_data(schema_engine, bson_data)
    collection_name = tables.schema_engine.root_node.name
    log_table_errors(
        "collection: %s data for opinsert load from MONGO OPLOG \
with errors:" % collection_name, tables.errors)
    res = []
    for name, table in tables.tables.iteritems():
        res.append(
            OplogQuery("i", generate_insert_queries(table, psql_schema, "")))
    return res
Example #8
def load_mongo_data_to_psql(schema_engine, mongo_data_path, psql, psql_schema):
    getLogger(__name__).info("Load initial data from %s" \
                                         % (mongo_data_path))
    with open(mongo_data_path, "r") as input_f:
        raw_bson_data = input_f.read()
        for one_record in loads(raw_bson_data):
            tables = create_tables_load_bson_data(schema_engine, [one_record])
            log_table_errors('test info msg:', tables.errors)
            getLogger(__name__).info("Loaded tables=%s" % tables.tables)
            insert_tables_data_into_dst_psql(psql, tables, psql_schema, '')
            psql.cursor.execute('COMMIT')
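
A hedged usage sketch (the dump path and the empty schema name are placeholders; psql is assumed to be a PsqlRequests instance, as in the test examples further down):

# sketch only: the connection string env var and the dump path are placeholder assumptions
psql = PsqlRequests(psycopg2.connect(os.environ['TEST_PSQLCONN']))
load_mongo_data_to_psql(schema_engine, 'test_data/posts_dump.js', psql, '')
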
Example #10
def cb_insert(psql_schema, ts, ns, schema_engine, bson_data):
    tables = create_tables_load_bson_data(schema_engine, bson_data)
    collection_name = tables.schema_engine.root_node.name
    log_table_errors("collection: %s data for opinsert load from MONGO OPLOG \
with errors:" % collection_name, tables.errors)
    res = []
    for name, table in tables.tables.iteritems():
        res.append(OplogQuery("i", generate_insert_queries(table,
                                                           psql_schema,
                                                           "")))
    return res
Example #11
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", action="store",
                        help="Settings file", type=file, required=True)
    parser.add_argument("--psql-section", help="Psql section name from config",
                        type=str, required=True)
    parser.add_argument("-cn", "--collection-name", help="Mongo collection name",
                        type=str, required=True)
    parser.add_argument("--psql-table-name", type=str, required=True)
    parser.add_argument("-psql-table-prefix", type=str, required=False)
    parser.add_argument("--input-csv-dir", type=str, required=True)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schema_name = config['psql']['psql-schema-name']
    schemas_dir = config['misc']['schemas-dir']
    schema_path = join(schemas_dir, args.collection_name + '.json')
    schema_file = open(schema_path, 'r')

    psql_settings = psql_settings_from_config(config, args.psql_section)

    table_prefix = ""
    if args.psql_table_prefix:
        table_prefix = args.psql_table_prefix

    schema = SchemaEngine(args.collection_name, [load(schema_file)])
    table = create_tables_load_bson_data(schema, None)\
        .tables[args.psql_table_name]
    dbreq = PsqlRequests(psql_conn_from_settings(psql_settings))

    create_psql_table(table, dbreq, schema_name, table_prefix, drop=True)
    create_psql_index(table, dbreq, schema_name, table_prefix)
    dbreq.conn.commit()

    csv_files = [f \
                 for f in listdir(args.input_csv_dir) \
                 if isfile(join(args.input_csv_dir, f))]
    csv_files.sort()
    for name in csv_files:
        csvpath = join(args.input_csv_dir, name)
        with open(csvpath, 'rb') as csv_f:
            schema_name_subst = schema_name
            if len(schema_name):
                schema_name_subst += '.'
            tname = '%s"%s%s"' % (schema_name_subst,
                                  table_prefix,
                                  args.psql_table_name)
            copy_from_csv(dbreq, csv_f, tname)
Example #12
def async_worker_handle_mongo_rec(schema_engines, rec_collection):
    """ function intended to call by FastQueueProcessor.
    process mongo record / bson data in separate process.
    schema_engine -- SchemaEngine
    rec_collection - tuble(bson record, collection name)"""
    rows_as_dict = {}
    collection = rec_collection[1]
    rec = rec_collection[0]
    schema_engine = schema_engines[collection]
    tables_obj = create_tables_load_bson_data(schema_engine, [rec])
    for table_name, table in tables_obj.tables.iteritems():
        rows = table_rows_list(table, ENCODE_ONLY, null_value=NULLVAL)
        rows_as_dict[table_name] = rows
    return TablesToSave(rows=rows_as_dict, errors=tables_obj.errors)
Example #13
def load_single_rec_into_tables_obj(src_dbreq,
                                    schema_engine,
                                    psql_schema,
                                    rec_id):
    """ Return Tables obj loaded from postgres. """
    if len(psql_schema):
        psql_schema += '.'
    tables = create_tables_load_bson_data(schema_engine, None)

    # fetch mongo rec by id from source psql
    ext_tables_data = {}
    for table_name, table in tables.tables.iteritems():
        id_name, quotes = parent_id_name_and_quotes_for_table(table)
        if quotes:
            id_val = "'" + str(rec_id) + "'"
        else:
            id_val = rec_id
        indexes = [name \
                       for name in table.sql_column_names \
                       if table.sql_columns[name].index_key()]
        idx_order_by = ''
        if len(indexes):
            idx_order_by = "ORDER BY " + ','.join(indexes)
        select_fmt = 'SELECT * FROM {schema}"{table}" \
WHERE {id_name}={id_val} {idx_order_by};'
        select_req = select_fmt.format(schema=psql_schema,
                                       table=table_name,
                                       id_name=id_name,
                                       id_val=id_val,
                                       idx_order_by=idx_order_by)
        getLogger(__name__).debug("Get psql data: "+select_req)
        src_dbreq.cursor.execute(select_req)
        ext_tables_data[table_name] = []
        idx = 0
        for record in src_dbreq.cursor:
            record_decoded = []
            if type(record) is tuple:
                for titem in record:
                    if type(titem) is str:
                        record_decoded.append(titem.decode('utf-8'))
                    else:
                        record_decoded.append(titem)
                record = tuple(record_decoded)
            getLogger(__name__).debug("result[%d]=%s", idx, record)
            ext_tables_data[table_name].append(record)
            idx += 1

    # set external tables data to Tables
    tables.load_external_tables_data(ext_tables_data)
    return tables
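
The tests below use this helper to read a record back from postgres and compare it with an expected sample; a condensed sketch of that round trip (rec_id and sample_data are placeholders):

# sketch only: psql, schema_engine, rec_id and sample_data come from the caller
tables_after = load_single_rec_into_tables_obj(psql, schema_engine, '', rec_id)
assert(tables_after.compare_with_sample(sample_data) == True)
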
Example #14
def save_ddl_create_statements(create_statements_file,
                               schema_engine,
                               psql_schema_name,
                               table_prefix):
    """ save create table statements to file """
    ddls = {}
    if not psql_schema_name:
        psql_schema_name = ''
    if not table_prefix:
        table_prefix = ''
    sqltables = create_tables_load_bson_data(schema_engine, None).tables
    for tablename, sqltable in sqltables.iteritems():
        ddls[tablename] = create_table(sqltable, psql_schema_name,
                                       table_prefix)
    for table_name in ddls:
        create_query = ddls[table_name]
        create_statements_file.write(create_query)
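
A hedged usage sketch that writes the CREATE TABLE statements to a file (the file name and the 'public' schema name are placeholders):

# sketch only: 'create_tables.sql' and 'public' are placeholder assumptions
with open('create_tables.sql', 'w') as ddl_f:
    save_ddl_create_statements(ddl_f, schema_engine, 'public', '')
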
Example #16
def test_complete_partial_record2():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    PSQL_SCHEMA_NAME = ''
    # expected (reference) data
    sample_data_before = {
        'posts2': {
            'id': [133],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'posts2_comments': {
            'idx': [1, 2],
            'id_oid': [
                str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))
            ],
            'updated_at': [
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')
            ]
        },
        'posts2_comment_struct_tests': {
            'v': [1, 2, 3],
            'idx': [1, 2, 1]
        },
        'posts2_comment_struct_test_nested': {
            'nested': [20, 23, 24, 25, 26],
            'idx': [1, 1, 2, 1, 2]
        }
    }

    wrong_raw_bson_data = '[{\
     "_id": 133,\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": "error in data"\
    }]'

    existing_raw_bson_data = '[{\
     "_id": 133,\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": [ {\
          "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
          "struct" : {\
              "tests": [{\
                  "v": 1,\
                  "nested": [20]\
              }, {\
                  "v": 2,\
                  "nested": [23, 24]\
              }]}\
        }, {\
          "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
          "struct" : {\
              "tests": [{\
                  "v": 3,\
                  "nested": [25, 26]\
              }]}\
        } ]\
    }]'

    oplog_object_id_bson_raw_data = '{"_id": 133}'
    # an insert request should be created to add a 'tests' item
    oplog_path_array_bson_raw_data = '{\
"comments.0.struct.tests.0.nested.1": 21,\
"comments.2": { \
    "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
    "updated_at": { "$date" : "2016-02-08T20:02:14.985Z"},\
    "struct": {\
        "tests": [{\
            "v": 12,\
            "nested": [30]\
         }, {\
            "v": 13,\
            "nested": [32, 31]\
         }\
    ]}}\
}'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts2']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # test wrong bson data
    existing_bson_data = loads(wrong_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine,
                                     existing_bson_data)
    assert (False == tables_obj_before.compare_with_sample(sample_data_before))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine,
                                     existing_bson_data)

    assert (False == tables_obj_before.compare_with_sample({}))
    assert (True == tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before, PSQL_SCHEMA_NAME,
                                     '')

    # oplog path with indexes. insert array item
    bson_data = loads(oplog_path_array_bson_raw_data)
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)

    for partial_insert in partial_inserts_list:
        tables_for_insert = partial_insert.tables
        initial_indexes = partial_insert.initial_indexes

        for name, table in tables_for_insert.iteritems():
            query_tuple = generate_insert_queries(table, PSQL_SCHEMA_NAME, "",
                                                  initial_indexes)
            for query in query_tuple[1]:
                getLogger(__name__).debug("EXECUTE: " + \
                                              str(query_tuple[0]) + str(query))
                psql.cursor.execute(query_tuple[0], query)

    # load the record back from psql into a tables obj
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql, schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['posts2_comments']['idx'].append(3)
    sample_data_after['posts2_comments']['id_oid'].append(\
        "56b8f35ef9fcee1b0000001a")
    sample_data_after['posts2_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))
    sample_data_after['posts2_comment_struct_tests'] = {
        'v': [1, 2, 3, 12, 13],
        'idx': [1, 2, 1, 1, 2]
    }
    sample_data_after['posts2_comment_struct_test_nested'] = {
        'nested': [20, 21, 23, 24, 25, 26, 30, 32, 31],
        'idx': [1, 2, 1, 2, 1, 2, 1, 1, 2]
    }

    assert (False == tables_obj_after.compare_with_sample({}))
    assert (True == tables_obj_after.compare_with_sample(sample_data_after))
Example #17
def main():
    """ main """
    #for debugging purposes
    #profiler = Profile()  # profiling
    #profiler.enable()

    args = getargs()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schema_name = config['psql']['psql-schema-name']
    schemas_dir = config['misc']['schemas-dir']
    schema_path = os.path.join(schemas_dir, args.collection_name + '.json')
    schema_file = open(schema_path, 'r')

    mongo_settings = mongo_settings_from_config(config, 'mongo')

    mongo_reader = mongo_reader_from_settings(mongo_settings,
                                              args.collection_name,
                                              json.loads(args.js_request))
    schema_engine = SchemaEngine(args.collection_name, [json.load(schema_file)])
    table_names = create_tables_load_bson_data(schema_engine, None).tables.keys()
    csm = CsvWriteManager(table_names, args.csv_path, CSV_CHUNK_SIZE)

    etl_mongo_reader = EtlMongoReader(ETL_PROCESS_NUMBER,
                                      ETL_QUEUE_SIZE,
                                      async_worker_handle_mongo_rec,
                                      #1st worker param
                                      {args.collection_name: schema_engine}, 
                                      {args.collection_name: mongo_reader})
    etl_mongo_reader.execute_query(args.collection_name,
                                   json.loads(args.js_request))

    getLogger(__name__).info("Connecting to mongo server " + mongo_settings.host)
    received_rec_ids = Set([])
    errors = {}
    all_written = {}
    while True:
        tables_to_save = etl_mongo_reader.next()
        if not tables_to_save:
            break
        # don't process duplicates that can be accidentally returned by the transport
        if tables_to_save.rec_id not in received_rec_ids:
            received_rec_ids.add(tables_to_save.rec_id)
            all_written = merge_dicts(all_written,
                                      save_csvs(csm, tables_to_save.rows))
            errors = merge_dicts(errors, tables_to_save.errors)
        else:
            getLogger(__name__).warning("Skip duplicated rec_id=%s",
                                        tables_to_save.rec_id)
    
    if args.ddl_statements_file:
        save_ddl_create_statements(args.ddl_statements_file,
                                   schema_engine,
                                   schema_name,
                                   args.psql_table_prefix)
    # save csv files
    csm.finalize()

    #for debugging purposes
    #print_profiler_stats(profiler)
    print_etl_stats(errors, all_written, etl_mongo_reader.etl_recs_count)
    save_etl_stats(args.stats_file, all_written)

    exit_code = 0
    if etl_mongo_reader.current_mongo_reader.failed or \
            etl_mongo_reader.fast_queue.error:
        exit_code = 1
    del etl_mongo_reader
    exit(exit_code)
Example #18
def main():
    """ main """
    #for debugging purposes
    #profiler = Profile()  # profiling
    #profiler.enable()

    args = getargs()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schema_name = config['psql']['psql-schema-name']
    schemas_dir = config['misc']['schemas-dir']
    schema_path = os.path.join(schemas_dir, args.collection_name + '.json')
    schema_file = open(schema_path, 'r')

    mongo_settings = mongo_settings_from_config(config, 'mongo')

    mongo_reader = mongo_reader_from_settings(mongo_settings,
                                              args.collection_name,
                                              json.loads(args.js_request))
    schema_engine = SchemaEngine(args.collection_name, [json.load(schema_file)])
    table_names = create_tables_load_bson_data(schema_engine, None).tables.keys()
    csm = CsvWriteManager(table_names, args.csv_path, CSV_CHUNK_SIZE)

    etl_mongo_reader = EtlMongoReader(ETL_PROCESS_NUMBER,
                                      ETL_QUEUE_SIZE,
                                      async_worker_handle_mongo_rec,
                                      #1st worker param
                                      {args.collection_name: schema_engine}, 
                                      {args.collection_name: mongo_reader})
    etl_mongo_reader.execute_query(args.collection_name, json.loads(args.js_request))

    getLogger(__name__).info("Connecting to mongo server " + mongo_settings.host)
    errors = {}
    all_written = {}
    tables_list = etl_mongo_reader.next_processed()
    while tables_list is not None:
        for tables in tables_list:
            all_written = merge_dicts(all_written,
                                      save_csvs(csm, tables.rows))
            errors = merge_dicts(errors, tables.errors)
        tables_list = etl_mongo_reader.next_processed()
    
    if args.ddl_statements_file:
        save_ddl_create_statements(args.ddl_statements_file,
                                   schema_engine,
                                   schema_name,
                                   args.psql_table_prefix)
    # save csv files
    csm.finalize()

    #for debugging purposes
    #print_profiler_stats(profiler)
    print_etl_stats(errors, all_written, etl_mongo_reader.etl_recs_count)
    save_etl_stats(args.stats_file, all_written)

    exit_code = 0
    if etl_mongo_reader.current_mongo_reader.failed or \
            etl_mongo_reader.fast_queue.error:
        exit_code = 1
    del etl_mongo_reader
    exit(exit_code)
Example #19
def test_complete_partial_record3():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    PSQL_SCHEMA_NAME = ''
    # expected (reference) data
    sample_data_before = {
        'posts': {
            'id_oid': ['56b8da59f9fcee1b00000007'],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'post_comments': {
            'id_oid': [
                str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))
            ],
            'updated_at': [
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                loads('{ "$date" : "2016-02-08T20:02:13.985Z"}')
            ],
            'idx': [1, 2]
        },
        'post_comment_tests': {
            'tests': [0, 2],
            'idx': [1, 2]
        }
    }

    existing_raw_bson_data = '[{\
     "_id": { "$oid": "56b8da59f9fcee1b00000007" },\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": [ {\
          "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"}\
        }, {\
          "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
          "updated_at": { "$date" : "2016-02-08T20:02:13.985Z"},\
          "tests": [0,2]\
        } ]\
 }]'

    oplog_object_id_bson_raw_data = '{\
"_id": { "$oid": "56b8da59f9fcee1b00000007" }\
}'

    # an insert request should be created to add a record with only a single field: updated_at
    oplog_path_array_bson_raw_data = '{"comments.2.updated_at": \
{ "$date" : "2016-02-08T20:02:14.985Z"}}'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine,
                                     existing_bson_data)
    assert (True == tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before, PSQL_SCHEMA_NAME,
                                     '')

    # oplog path inserting just a field
    bson_data = loads(oplog_path_array_bson_raw_data)
    print bson_data
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)
    tables_for_insert = partial_inserts_list[0].tables
    initial_indexes = partial_inserts_list[0].initial_indexes
    print "tables_for_insert", tables_for_insert.keys()
    print "initial_indexes", initial_indexes
    insert_tests_t = tables_for_insert['post_comments']
    insert_query = generate_insert_queries(insert_tests_t, "", "",
                                           initial_indexes)
    print "columns", insert_tests_t.sql_column_names
    print "insert_query=", insert_query
    for query in insert_query[1]:
        print insert_query[0], query
        psql.cursor.execute(insert_query[0], query)

    # load the record back from psql into a tables obj
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql, schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['post_comments']['idx'].append(3)
    sample_data_after['post_comments']['id_oid'].append(None)
    sample_data_after['post_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))

    assert (True == tables_obj_after.compare_with_sample(sample_data_after))
Example #20
def test_complete_partial_record2():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    PSQL_SCHEMA_NAME = ''
    # expected (reference) data
    sample_data_before = {
        'posts2': {
            'id': [133],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'posts2_comments': {
            'idx': [1,2],
            'id_oid': [str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                       str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))],
            'updated_at': [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                           loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'posts2_comment_struct_tests': {
            'v': [1,2,3],
            'idx': [1,2,1]
        },
        'posts2_comment_struct_test_nested': {
            'nested': [20,23,24,25,26],
            'idx': [1,1,2,1,2]
        }
    }

    
    wrong_raw_bson_data = '[{\
     "_id": 133,\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": "error in data"\
    }]'

    existing_raw_bson_data = '[{\
     "_id": 133,\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": [ {\
          "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
          "struct" : {\
              "tests": [{\
                  "v": 1,\
                  "nested": [20]\
              }, {\
                  "v": 2,\
                  "nested": [23, 24]\
              }]}\
        }, {\
          "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
          "struct" : {\
              "tests": [{\
                  "v": 3,\
                  "nested": [25, 26]\
              }]}\
        } ]\
    }]'

    oplog_object_id_bson_raw_data = '{"_id": 133}'
    # an insert request should be created to add a 'tests' item
    oplog_path_array_bson_raw_data = '{\
"comments.0.struct.tests.0.nested.1": 21,\
"comments.2": { \
    "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
    "updated_at": { "$date" : "2016-02-08T20:02:14.985Z"},\
    "struct": {\
        "tests": [{\
            "v": 12,\
            "nested": [30]\
         }, {\
            "v": 13,\
            "nested": [32, 31]\
         }\
    ]}}\
}'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts2']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))


    # test wrong bson data
    existing_bson_data = loads(wrong_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine, 
                                     existing_bson_data)
    assert(False==tables_obj_before.compare_with_sample(sample_data_before))



    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine, 
                                     existing_bson_data)

    assert(False==tables_obj_before.compare_with_sample({}))
    assert(True==tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before, PSQL_SCHEMA_NAME, '')

    # oplog path with indexes. insert array item
    bson_data = loads(oplog_path_array_bson_raw_data)
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)

    for partial_insert in partial_inserts_list:
        tables_for_insert = partial_insert.tables
        initial_indexes = partial_insert.initial_indexes

        for name, table in tables_for_insert.iteritems():
            query_tuple = generate_insert_queries(table, 
                                                  PSQL_SCHEMA_NAME, "", 
                                                  initial_indexes)
            for query in query_tuple[1]:
                getLogger(__name__).debug("EXECUTE: " + \
                                              str(query_tuple[0]) + str(query))
                psql.cursor.execute(query_tuple[0], query)

    # load the record back from psql into a tables obj
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql,
                                                       schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['posts2_comments']['idx'].append(3)
    sample_data_after['posts2_comments']['id_oid'].append(\
        "56b8f35ef9fcee1b0000001a")
    sample_data_after['posts2_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))
    sample_data_after['posts2_comment_struct_tests'] = {
            'v': [1,2,3,12,13],
            'idx': [1,2,1,1,2]
    }
    sample_data_after['posts2_comment_struct_test_nested'] = {
            'nested': [20,21,23,24,25,26,30,32,31],
            'idx': [1,2,1,2,1,2,1,1,2]
    }

    assert(False==tables_obj_after.compare_with_sample({}))
    assert(True==tables_obj_after.compare_with_sample(sample_data_after))
Example #21
def test_complete_partial_record3():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    PSQL_SCHEMA_NAME = ''
    # expected (reference) data
    sample_data_before = {
        'posts': {
            'id_oid': ['56b8da59f9fcee1b00000007'],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'post_comments': {
            'id_oid': [str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                       str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))],
            'updated_at': [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                           loads('{ "$date" : "2016-02-08T20:02:13.985Z"}')],
            'idx': [1,2]
        },
        'post_comment_tests': {
            'tests': [0,2],
            'idx': [1,2]
        }
    }

    existing_raw_bson_data = '[{\
     "_id": { "$oid": "56b8da59f9fcee1b00000007" },\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": [ {\
          "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"}\
        }, {\
          "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
          "updated_at": { "$date" : "2016-02-08T20:02:13.985Z"},\
          "tests": [0,2]\
        } ]\
 }]'

    oplog_object_id_bson_raw_data = '{\
"_id": { "$oid": "56b8da59f9fcee1b00000007" }\
}'
    # an insert request should be created to add a record with only a single field: updated_at
    oplog_path_array_bson_raw_data = '{"comments.2.updated_at": \
{ "$date" : "2016-02-08T20:02:14.985Z"}}'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine, 
                                     existing_bson_data)
    assert(True==tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before, 
                                     PSQL_SCHEMA_NAME, '')

    # oplog path inserting just a field
    bson_data = loads(oplog_path_array_bson_raw_data)
    print bson_data
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)
    tables_for_insert = partial_inserts_list[0].tables
    initial_indexes = partial_inserts_list[0].initial_indexes
    print "tables_for_insert", tables_for_insert.keys()
    print "initial_indexes", initial_indexes
    insert_tests_t = tables_for_insert['post_comments']
    insert_query = generate_insert_queries(insert_tests_t, "", "", 
                                           initial_indexes)
    print "columns", insert_tests_t.sql_column_names
    print "insert_query=", insert_query
    for query in insert_query[1]:
        print insert_query[0], query
        psql.cursor.execute(insert_query[0], query)

    # load the record back from psql into a tables obj
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql,
                                                       schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['post_comments']['idx'].append(3)
    sample_data_after['post_comments']['id_oid'].append(None)
    sample_data_after['post_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))

    assert(True==tables_obj_after.compare_with_sample(sample_data_after))