Example #1
def test_all():
    """ Create psql tables for the 'a_inserts' collection and insert its data. """
    connstr = os.environ['TEST_PSQLCONN']
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    tables = collection_tables("a_inserts").tables
    for table in tables:
        create_table = generate_create_table_statement(tables[table], "", "")
        print create_table
        dbreq.cursor.execute(create_table)
        indexes = dbreq.get_table_max_indexes(tables[table], "")
        inserts = generate_insert_queries(tables[table], "", "", initial_indexes=indexes)
        for query in inserts[1]:
            dbreq.cursor.execute(inserts[0], query)
    dbreq.cursor.execute('COMMIT')
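
All of these snippets read the connection string from the TEST_PSQLCONN environment variable and pass it to psycopg2. A minimal sketch of setting it up for a local run, assuming a libpq-style DSN (the dbname/user/host values here are placeholders):

import os

# Hypothetical DSN for a local test database; adjust dbname/user/host as needed.
os.environ.setdefault('TEST_PSQLCONN',
                      'dbname=test_db user=postgres host=localhost')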
Example #2
def test_tables2():
    """ Test collection with id of INT type """
    collection_name = 'a_somethings'
    tables = collection_tables(collection_name)
    root_t = tables.tables[collection_name]
    assert ('id' in root_t.sql_columns)
    assert (root_t.sql_columns['id'].values[0] == 777)
    comments_t = tables.tables['a_something_comments']
    assert ('a_somethings_id' in comments_t.sql_columns)
    assert (comments_t.sql_columns['a_somethings_id'].values[0] == 777)
    items_t = tables.tables['a_something_comment_items']
    assert ('a_somethings_id' in items_t.sql_columns)
    assert ('customer_id_bsontype' in root_t.sql_columns)
    print root_t.sql_columns['customer_id_bsontype'].values
    assert (root_t.sql_columns['customer_id_bsontype'].values[0] == 7)
    assert(root_t.sql_columns['customer_id_oid'].values[0]\
               == '56b8f05cf9fcee1b00000000')

    # test load / unload data
    connstr = os.environ['TEST_PSQLCONN']
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    insert_tables_data_into_dst_psql(dbreq, tables, '', '')
    create_psql_index(root_t, dbreq, '', '')

    loaded_tables = load_single_rec_into_tables_obj(dbreq,
                                                    tables.schema_engine, '',
                                                    777)
    assert (tables.compare(loaded_tables) == True)
Example #3
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file",
                        action="store",
                        help="Settings file",
                        type=file,
                        required=True)
    parser.add_argument("--psql-section",
                        help="Psql section name from config",
                        type=str,
                        required=True)
    parser.add_argument("-cn",
                        "--collection-name",
                        help="Mongo collection name",
                        type=str,
                        required=True)
    parser.add_argument("--psql-table-name", type=str, required=True)
    parser.add_argument("-psql-table-prefix", type=str, required=False)
    parser.add_argument("--input-csv-dir", type=str, required=True)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schema_name = config['psql']['psql-schema-name']
    schemas_dir = config['misc']['schemas-dir']
    schema_path = join(schemas_dir, args.collection_name + '.json')
    schema_file = open(schema_path, 'r')

    psql_settings = psql_settings_from_config(config, args.psql_section)

    table_prefix = ""
    if args.psql_table_prefix:
        table_prefix = args.psql_table_prefix

    schema = SchemaEngine(args.collection_name, [load(schema_file)])
    table = create_tables_load_bson_data(schema, None)\
        .tables[args.psql_table_name]
    dbreq = PsqlRequests(psql_conn_from_settings(psql_settings))

    create_psql_table(table, dbreq, schema_name, table_prefix, drop=True)
    create_psql_index(table, dbreq, schema_name, table_prefix)
    dbreq.conn.commit()

    csv_files = [f \
                 for f in listdir(args.input_csv_dir) \
                 if isfile(join(args.input_csv_dir, f))]
    csv_files.sort()
    for name in csv_files:
        csvpath = join(args.input_csv_dir, name)
        with open(csvpath, 'rb') as csv_f:
            schema_name_subst = schema_name
            if len(schema_name):
                schema_name_subst += '.'
            tname = '%s"%s%s"' % (schema_name_subst, table_prefix,
                                  args.psql_table_name)
            copy_from_csv(dbreq, csv_f, tname)
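
main() reads only two config keys directly (psql-schema-name from [psql] and schemas-dir from [misc]); the connection details consumed by psql_settings_from_config from the section named by --psql-section are not shown above. A minimal sketch, with placeholder values, of producing such a settings file with configparser:

import configparser

# Hypothetical minimal settings file; the section passed via --psql-section
# must additionally contain whatever keys psql_settings_from_config expects.
config = configparser.ConfigParser()
config['psql'] = {'psql-schema-name': 'public'}
config['misc'] = {'schemas-dir': 'schemas'}
with open('settings.ini', 'w') as ini_f:
    config.write(ini_f)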
Example #4
def test_complete_partial_record2():
    """ Apply an oplog set command with array indexes on top of an existing
    record and check the completed tables against the expected data. """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    PSQL_SCHEMA_NAME = ''
    # expected (reference) data
    sample_data_before = {
        'posts2': {
            'id': [133],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'posts2_comments': {
            'idx': [1, 2],
            'id_oid': [
                str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))
            ],
            'updated_at': [
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')
            ]
        },
        'posts2_comment_struct_tests': {
            'v': [1, 2, 3],
            'idx': [1, 2, 1]
        },
        'posts2_comment_struct_test_nested': {
            'nested': [20, 23, 24, 25, 26],
            'idx': [1, 1, 2, 1, 2]
        }
    }

    wrong_raw_bson_data = '[{\
     "_id": 133,\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": "error in data"\
    }]'

    existing_raw_bson_data = '[{\
     "_id": 133,\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": [ {\
          "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
          "struct" : {\
              "tests": [{\
                  "v": 1,\
                  "nested": [20]\
              }, {\
                  "v": 2,\
                  "nested": [23, 24]\
              }]}\
        }, {\
          "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
          "struct" : {\
              "tests": [{\
                  "v": 3,\
                  "nested": [25, 26]\
              }]}\
        } ]\
    }]'

    oplog_object_id_bson_raw_data = '{"_id": 133}'
    # an insert request should be created to add the 'tests' item
    oplog_path_array_bson_raw_data = '{\
"comments.0.struct.tests.0.nested.1": 21,\
"comments.2": { \
    "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
    "updated_at": { "$date" : "2016-02-08T20:02:14.985Z"},\
    "struct": {\
        "tests": [{\
            "v": 12,\
            "nested": [30]\
         }, {\
            "v": 13,\
            "nested": [32, 31]\
         }\
    ]}}\
}'
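    # Note on oplog paths: "comments.0.struct.tests.0.nested.1" addresses the
    # second element of 'nested' inside the first 'tests' item of the first
    # comment (oplog array indexes are 0-based), while "comments.2" appends a
    # whole new comment; the generated 'idx' columns below are 1-based.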

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts2']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # test wrong bson data
    existing_bson_data = loads(wrong_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine,
                                     existing_bson_data)
    assert (False == tables_obj_before.compare_with_sample(sample_data_before))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine,
                                     existing_bson_data)

    assert (False == tables_obj_before.compare_with_sample({}))
    assert (True == tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before, PSQL_SCHEMA_NAME,
                                     '')

    # oplog path with indexes: insert an array item
    bson_data = loads(oplog_path_array_bson_raw_data)
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)

    for partial_insert in partial_inserts_list:
        tables_for_insert = partial_insert.tables
        initial_indexes = partial_insert.initial_indexes

        for name, table in tables_for_insert.iteritems():
            query_tuple = generate_insert_queries(table, PSQL_SCHEMA_NAME, "",
                                                  initial_indexes)
            for query in query_tuple[1]:
                getLogger(__name__).debug("EXECUTE: " + \
                                              str(query_tuple[0]) + str(query))
                psql.cursor.execute(query_tuple[0], query)

    # load the record back from psql into a tables object
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql, schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['posts2_comments']['idx'].append(3)
    sample_data_after['posts2_comments']['id_oid'].append(\
        "56b8f35ef9fcee1b0000001a")
    sample_data_after['posts2_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))
    sample_data_after['posts2_comment_struct_tests'] = {
        'v': [1, 2, 3, 12, 13],
        'idx': [1, 2, 1, 1, 2]
    }
    sample_data_after['posts2_comment_struct_test_nested'] = {
        'nested': [20, 21, 23, 24, 25, 26, 30, 32, 31],
        'idx': [1, 2, 1, 2, 1, 2, 1, 1, 2]
    }

    assert (False == tables_obj_after.compare_with_sample({}))
    assert (True == tables_obj_after.compare_with_sample(sample_data_after))
Example #5
def test_complete_partial_record3():
    """ Apply an oplog set command that adds a single field to a nested array
    item and check the completed tables against the expected data. """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')

    PSQL_SCHEMA_NAME = ''
    # expected (reference) data
    sample_data_before = {
        'posts': {
            'id_oid': ['56b8da59f9fcee1b00000007'],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'post_comments': {
            'id_oid': [
                str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))
            ],
            'updated_at': [
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                loads('{ "$date" : "2016-02-08T20:02:13.985Z"}')
            ],
            'idx': [1, 2]
        },
        'post_comment_tests': {
            'tests': [0, 2],
            'idx': [1, 2]
        }
    }

    existing_raw_bson_data = '[{\
     "_id": { "$oid": "56b8da59f9fcee1b00000007" },\
     "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
     "comments": [ {\
          "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
          "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"}\
        }, {\
          "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
          "updated_at": { "$date" : "2016-02-08T20:02:13.985Z"},\
          "tests": [0,2]\
        } ]\
 }]'

    oplog_object_id_bson_raw_data = '{\
"_id": { "$oid": "56b8da59f9fcee1b00000007" }\
}'

    # an insert request should be created to add a record with only a single field: updated_at
    oplog_path_array_bson_raw_data = '{"comments.2.updated_at": \
{ "$date" : "2016-02-08T20:02:14.985Z"}}'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine,
                                     existing_bson_data)
    assert (True == tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before, PSQL_SCHEMA_NAME,
                                     '')

    # oplog path inserting just a field
    bson_data = loads(oplog_path_array_bson_raw_data)
    print bson_data
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)
    tables_for_insert = partial_inserts_list[0].tables
    initial_indexes = partial_inserts_list[0].initial_indexes
    print "tables_for_insert", tables_for_insert.keys()
    print "initial_indexes", initial_indexes
    insert_tests_t = tables_for_insert['post_comments']
    insert_query = generate_insert_queries(insert_tests_t, "", "",
                                           initial_indexes)
    print "columns", insert_tests_t.sql_column_names
    print "insert_query=", insert_query
    for query in insert_query[1]:
        print insert_query[0], query
        psql.cursor.execute(insert_query[0], query)

    # load the record back from psql into a tables object
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql, schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['post_comments']['idx'].append(3)
    sample_data_after['post_comments']['id_oid'].append(None)
    sample_data_after['post_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))

    assert (True == tables_obj_after.compare_with_sample(sample_data_after))
Example #6
def run_oplog_engine_check(oplog_test, schemas_path):
    """ Run an initial load, then unaligned and aligned oplog sync for the
    given test case; return True on success. """
    connstr = os.environ['TEST_PSQLCONN']
    dbreq = PsqlRequests(psycopg2.connect(connstr))

    psql_schema = MAIN_SCHEMA_NAME

    schema_engines = get_schema_engines_as_dict(schemas_path)
    getLogger(__name__).info("Loading oplog data...")

    oplog_readers, mongo_readers_after = get_readers(oplog_test,
                                                     enable_exceptions=False)

    create_truncate_psql_objects(dbreq, schemas_path, psql_schema)
    dbreq.cursor.execute('COMMIT')
    # do pseudo "init load", ignore inject_exception on this step
    for name, mongo_data_path in oplog_test.before.iteritems():
        load_mongo_data_to_psql(schema_engines[name], mongo_data_path[0],
                                dbreq, psql_schema)
    # recreate connection / cursor as rollback won't work after commit
    del dbreq
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    dbreq_etl = PsqlRequests(psycopg2.connect(connstr))
    getLogger(__name__).info("Loading mongo data after initload")

    try:
        gizer.oplog_sync_unalligned_data.DO_OPLOG_READ_ATTEMPTS_COUNT \
            = oplog_test.max_attempts
        gizer.oplog_sync_alligned_data.DO_OPLOG_READ_ATTEMPTS_COUNT \
            = oplog_test.max_attempts
        # for better coverage
        gizer.oplog_sync_unalligned_data.SYNC_REC_COUNT_IN_ONE_BATCH = 2

        unalligned_sync \
            = OplogSyncUnallignedData(dbreq_etl, dbreq,
                                      mongo_readers_after, oplog_readers,
                                      schemas_path, schema_engines, psql_schema)

        # start syncing from the very beginning of the oplog
        ts_synced = unalligned_sync.sync(None)
        getLogger(__name__).info("Sync done ts_synced: %s" % str(ts_synced))
        getLogger(__name__).info("statistic %s" %
                                 str(unalligned_sync.statistic()))
        del unalligned_sync
        # sync failed
        if ts_synced is None and not oplog_test.skip_sync:
            return False

        del dbreq
        dbreq = PsqlRequests(psycopg2.connect(connstr))

        oplog_readers, mongo_readers_after = get_readers(
            oplog_test, enable_exceptions=True)
        # ignore sync results
        if oplog_test.skip_sync:
            ts_synced = {}
            for name in oplog_readers:
                ts_synced[name] = None

        alligned_sync \
            = OplogSyncAllignedData(dbreq, mongo_readers_after, oplog_readers,
                                    schemas_path, schema_engines, psql_schema)
        res = alligned_sync.sync(ts_synced)
        if res:
            getLogger(__name__).info("Test passed")

    except:
        # close the psql connection so that subsequent tests can run
        dbreq.conn.close()
        raise
    return bool(res)
Example #7
def main():
    """ main """

    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file",
                        action="store",
                        help="Config file with settings",
                        type=file,
                        required=True)
    parser.add_argument("-init-load-status",
                        action="store_true",
                        help="will get exit status=0 if init load not needed,\
or status=-1 if otherwise; Also print 1 - if in progress, 0 - if not.")
    parser.add_argument(
        "-init-load-start-save-ts",
        action="store_true",
        help='Save latest oplog timestamp to psql etlstatus table')
    parser.add_argument("-init-load-finish",
                        help='values are: "ok" or "error"',
                        type=str)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    psql_settings = psql_settings_from_config(config, 'psql')
    psql_main = PsqlRequests(psql_conn_from_settings(psql_settings))
    oplog_settings = load_mongo_replicas_from_setting(config, 'mongo-oplog')

    status_table = PsqlEtlStatusTable(psql_main.cursor,
                                      config['psql']['psql-schema-name'],
                                      sorted(oplog_settings.keys()))
    res = 0
    if args.init_load_status:
        status = status_table.get_recent()
        if status:
            if status.status in (STATUS_OPLOG_SYNC,
                                 STATUS_OPLOG_APPLY,
                                 STATUS_INITIAL_LOAD,
                                 STATUS_OPLOG_RESYNC) and not status.error:
                delta = datetime.now() - status.time_start
                # if the operation has been running too long
                if status.time_end:
                    res = 0
                elif delta.total_seconds() < 32400:  # < 9 hours
                    res = 0
                    print 1  # means etl is in progress
                else:
                    # takes too much time -> do init load
                    res = -1
            else:
                # error status -> do init load
                res = -1
        else:
            # empty status table -> do init load
            res = -1
    elif args.init_load_start_save_ts:
        # create oplog read transport/s to acquire ts
        max_ts_dict = {}
        for oplog_name, settings_list in oplog_settings.iteritems():
            print 'Fetch timestamp from oplog: %s' % oplog_name
            # settings list is a replica set (must be at least one in list)
            reader = mongo_reader_from_settings(settings_list, 'oplog.rs', {})
            reader.make_new_request({})
            reader.cursor.sort('ts', DESCENDING)
            reader.cursor.limit(1)
            timestamp = reader.next()
            if timestamp:
                max_ts_dict[oplog_name] = timestamp['ts']
            else:
                max_ts_dict[oplog_name] = None
            print 'Initload ts: %s, oplog: %s' % (max_ts_dict[oplog_name],
                                                  oplog_name)

        status_manager = PsqlEtlStatusTableManager(status_table)
        status_manager.init_load_start(max_ts_dict)
    elif args.init_load_finish:
        status_manager = PsqlEtlStatusTableManager(status_table)
        if args.init_load_finish == "ok":
            status_manager.init_load_finish(False)  # ok
        else:
            status_manager.init_load_finish(True)  # error

    return res
Example #8
def main():
    """ main """

    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file",
                        action="store",
                        help="Config file with settings",
                        type=file,
                        required=True)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schemas_path = config['misc']['schemas-dir']
    logspath = config['misc']['logs-dir']

    oplog_settings = load_mongo_replicas_from_setting(config, 'mongo-oplog')

    mongo_settings = mongo_settings_from_config(config, 'mongo')
    psql_settings = psql_settings_from_config(config, 'psql')

    mongo_readers = {}
    schema_engines = get_schema_engines_as_dict(schemas_path)
    for collection_name in schema_engines:
        reader = mongo_reader_from_settings(mongo_settings, collection_name,
                                            {})
        mongo_readers[collection_name] = reader
        mongo_readers[collection_name].set_name(collection_name)

    # create oplog read transport/s
    oplog_readers = {}
    for oplog_name, settings_list in oplog_settings.iteritems():
        # settings list is a replica set (must be at least one in list)
        oplog_readers[oplog_name] = \
            mongo_reader_from_settings(settings_list, 'oplog.rs', {})
        oplog_readers[oplog_name].set_name(oplog_name)

    psql_qmetl = PsqlRequests(psql_conn_from_settings(psql_settings))
    psql_main = PsqlRequests(psql_conn_from_settings(psql_settings))

    status_table = PsqlEtlStatusTable(psql_qmetl.cursor,
                                      config['psql']['psql-schema-name'],
                                      sorted(oplog_settings.keys()))
    status_manager = PsqlEtlStatusTableManager(status_table)

    psql_schema = config['psql']['psql-schema-name']

    res = 0
    status = status_table.get_recent()
    if status:
        if status.status in (STATUS_INITIAL_LOAD, STATUS_OPLOG_RESYNC) \
                and status.time_end and not status.error:
            create_logger(logspath, 'oplogsync')
            psql_sync = psql_main
            # initial load done; save oplog sync status and do oplog sync.
            status_manager.oplog_sync_start(status.ts)
            unalligned_sync = OplogSyncUnallignedData(
                psql_qmetl, psql_sync, mongo_readers, oplog_readers,
                schemas_path, schema_engines, psql_schema)
            try:
                ts = unalligned_sync.sync(status.ts)
                stat = unalligned_sync.statistic()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                if ts:  # sync ok
                    status_manager.oplog_sync_finish(stat[0], stat[1], ts,
                                                     False)
                    res = 0
                else:  # error
                    status_manager.oplog_sync_finish(stat[0], stat[1], None,
                                                     True)
                    res = -1
            except Exception, e:
                getLogger(__name__).error(e, exc_info=True)
                getLogger(__name__).error('ROLLBACK CLOSE')
                psql_sync.conn.rollback()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                status_manager.oplog_sync_finish(None, None, None, True)
                res = -1

        elif status.status in (STATUS_OPLOG_SYNC, STATUS_OPLOG_APPLY) \
                and status.time_end and not status.error:
            create_logger(logspath, 'oploguse')
            # sync done, now apply oplog patches to the main psql
            # save oplog sync status
            getLogger(__name__).\
                info('Sync point is ts:{ts}'.format(ts=status.ts))
            status_manager.oplog_use_start(status.ts)
            alligned_sync = \
                OplogSyncAllignedData(psql_main, mongo_readers, oplog_readers,
                                      schemas_path, schema_engines, psql_schema)
            try:
                ts_res = alligned_sync.sync(status.ts)
                stat = alligned_sync.statistic()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                if ts_res == 'resync':
                    # some records were recovered; must resync at the next step
                    status_manager.oplog_resync_finish(stat[0], stat[1],
                                                       status.ts, False)
                    res = 0
                elif ts_res:  # oplog apply ok
                    status_manager.oplog_use_finish(stat[0], stat[1], ts_res,
                                                    False)
                    res = 0
                else:  # error
                    status_manager.oplog_use_finish(stat[0], stat[1], None,
                                                    True)
                    res = -1
            except Exception, e:
                getLogger(__name__).error(e, exc_info=True)
                getLogger(__name__).error('ROLLBACK CLOSE')
                psql_main.conn.rollback()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                status_manager.oplog_use_finish(None, None, None, True)
                res = -1