Example #1
def _insert_metadata(metadata_hash_dict, dataset_id_hash_dict):
    if metadata_hash_dict is None or len(metadata_hash_dict) == 0:
        return

    metadata_list = []
    for _hash, _metadata_dict in metadata_hash_dict.items():
        for k, v in _metadata_dict.items():
            metadata = {}
            metadata['metadata_name'] = k
            metadata['metadata_val'] = v
            metadata['dataset_id'] = dataset_id_hash_dict[_hash].dataset_id
            metadata_list.append(metadata)

    DBSession.execute(Metadata.__table__.insert(), metadata_list)
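
For context, here is a minimal, hedged sketch of how _insert_metadata might be called. The DatasetRecord namedtuple and the hash keys are illustrative placeholders; the only assumption taken from the code above is that each value in dataset_id_hash_dict exposes a dataset_id attribute.

import collections

# Hypothetical stand-in for the real dataset records; only dataset_id is read here.
DatasetRecord = collections.namedtuple('DatasetRecord', ['dataset_id'])

metadata_hash_dict = {
    'abc123': {'user_id': '7', 'source': 'import-script'},
}
dataset_id_hash_dict = {
    'abc123': DatasetRecord(dataset_id=42),
}

# Each metadata name/value pair becomes one row keyed on the matching dataset_id.
_insert_metadata(metadata_hash_dict, dataset_id_hash_dict)

Each call issues a single executemany-style insert against the metadata table, which is what keeps the number of round trips low when many datasets arrive together.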
Example #2
def _bulk_insert_data(bulk_data, user_id=None, source=None):
    """
        Insert lots of datasets at once to reduce the number of DB interactions.
        user_id indicates the user adding the data
        source indicates the name of the app adding the data
        both user_id and source are added as metadata
    """
    get_timing = lambda x: datetime.datetime.now() - x
    start_time = datetime.datetime.now()

    new_data = _process_incoming_data(bulk_data, user_id, source)
    log.info("Incoming data processed in %s", get_timing(start_time))

    existing_data = _get_existing_data(new_data.keys())

    log.info("Existing data retrieved.")

    #Maps each data hash to its dataset; the values in this map are what
    #ultimately get returned.
    hash_id_map = {}
    new_datasets = []
    metadata = {}
    for d in bulk_data:
        log.info(d.data_hash)
        dataset_dict = new_data[d.data_hash]
        current_hash = d.data_hash

        #if this piece of data is already in the DB, then
        #there is no need to insert it!
        if existing_data.get(current_hash) is not None:
            dataset = existing_data.get(current_hash)
            #Is this user allowed to use this dataset?
            if not dataset.check_user(user_id):
                new_dataset = _make_new_dataset(dataset_dict)
                new_datasets.append(new_dataset)
                metadata[new_dataset['data_hash']] = dataset_dict['metadata']
            else:
                hash_id_map[current_hash] = dataset  #i.e. existing_data[current_hash]
        elif current_hash in hash_id_map:
            new_datasets.append(dataset_dict)
        else:
            #set a placeholder for a dataset_id we don't know yet.
            #The placeholder is the hash, which is unique to this object and
            #therefore easily identifiable.
            new_datasets.append(dataset_dict)
            hash_id_map[current_hash] = dataset_dict
            metadata[current_hash] = dataset_dict['metadata']

    log.debug("Isolating new data %s", get_timing(start_time))
    #Isolate only the new datasets and insert them
    new_data_for_insert = []
    #keep track of the datasets that are to be inserted to avoid duplicate
    #inserts
    new_data_hashes = []
    for d in new_datasets:
        if d['data_hash'] not in new_data_hashes:
            new_data_for_insert.append(d)
            new_data_hashes.append(d['data_hash'])

    if len(new_data_for_insert) > 0:
        #If we're working with MySQL, we have to lock the tables.
        #For SQLite this is not possible, hence the try/except.
        try:
            DBSession.execute("LOCK TABLES tDataset WRITE, tMetadata WRITE")
        except OperationalError:
            pass

        log.debug("Inserting new data %s", get_timing(start_time))
        DBSession.execute(Dataset.__table__.insert(), new_data_for_insert)
        log.debug("New data inserted %s", get_timing(start_time))

        try:
            DBSession.execute("UNLOCK TABLES")
        except OperationalError:
            pass


        new_data = _get_existing_data(new_data_hashes)
        log.debug("New data retrieved %s", get_timing(start_time))

        for k, v in new_data.items():
            hash_id_map[k] = v

        _insert_metadata(metadata, hash_id_map)
        log.debug("Metadata inserted %s", get_timing(start_time))

    returned_ids = []
    for d in bulk_data:
        returned_ids.append(hash_id_map[d.data_hash])

    log.info("Done bulk inserting data. %s datasets", len(returned_ids))

    return returned_ids
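
A rough usage sketch, assuming only what _bulk_insert_data itself relies on: each element of bulk_data must expose a data_hash attribute, and _process_incoming_data is expected to map each hash to a dict carrying 'data_hash' and 'metadata' keys. The IncomingDataset class and values below are hypothetical placeholders, not part of the original module.

# Hypothetical incoming object: the real objects passed by the calling app carry
# whatever fields _process_incoming_data needs, but only data_hash is read
# directly by _bulk_insert_data.
class IncomingDataset(object):
    def __init__(self, data_hash, value):
        self.data_hash = data_hash
        self.value = value

bulk_data = [
    IncomingDataset('abc123', 1.0),
    IncomingDataset('def456', 2.0),
]

# One result per incoming item, in the original order.
datasets = _bulk_insert_data(bulk_data, user_id=7, source='my_import_app')

Note that, despite the returned_ids name, the returned list holds the dataset records stored in hash_id_map rather than bare integer IDs.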