import datetime
import logging

from sqlalchemy.exc import OperationalError

#DBSession, the ORM models (Dataset, Metadata) and the helper functions
#_process_incoming_data, _get_existing_data and _make_new_dataset are assumed
#to be defined elsewhere in this module.

log = logging.getLogger(__name__)


def _insert_metadata(metadata_hash_dict, dataset_id_hash_dict):
    if metadata_hash_dict is None or len(metadata_hash_dict) == 0:
        return

    metadata_list = []
    for _hash, _metadata_dict in metadata_hash_dict.items():
        for k, v in _metadata_dict.items():
            metadata = {}
            metadata['metadata_name'] = k
            metadata['metadata_val'] = v
            metadata['dataset_id'] = dataset_id_hash_dict[_hash].dataset_id
            metadata_list.append(metadata)

    DBSession.execute(Metadata.__table__.insert(), metadata_list)
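#Illustrative shapes of the two arguments (hypothetical values; the real
#hashes and Dataset objects come from _process_incoming_data and
#_get_existing_data respectively):
#
#    metadata_hash_dict   = {12345: {'user_id': '1', 'source': 'my_import_app'}}
#    dataset_id_hash_dict = {12345: <Dataset instance with .dataset_id set>}
#
#Each (name, value) pair is flattened into one row dict so the whole batch can
#be sent to tMetadata with a single executemany-style insert.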
def _bulk_insert_data(bulk_data, user_id=None, source=None):
    """
        Insert lots of datasets at once to reduce the number of DB interactions.

        user_id indicates the user adding the data.
        source indicates the name of the app adding the data.
        Both user_id and source are added as metadata.
    """
    get_timing = lambda x: datetime.datetime.now() - x
    start_time = datetime.datetime.now()

    new_data = _process_incoming_data(bulk_data, user_id, source)
    log.info("Incoming data processed in %s", get_timing(start_time))

    existing_data = _get_existing_data(new_data.keys())
    log.info("Existing data retrieved.")

    #Map of data hash to dataset. This is what gets returned.
    hash_id_map = {}
    new_datasets = []
    metadata = {}

    for d in bulk_data:
        log.info(d.data_hash)
        dataset_dict = new_data[d.data_hash]
        current_hash = d.data_hash

        #If this piece of data is already in the DB, then
        #there is no need to insert it!
        if existing_data.get(current_hash) is not None:
            dataset = existing_data.get(current_hash)

            #Is this user allowed to use this dataset?
            if not dataset.check_user(user_id):
                new_dataset = _make_new_dataset(dataset_dict)
                new_datasets.append(new_dataset)
                metadata[new_dataset['data_hash']] = dataset_dict['metadata']
            else:
                hash_id_map[current_hash] = dataset
        elif current_hash in hash_id_map:
            new_datasets.append(dataset_dict)
        else:
            #Set a placeholder for a dataset_id we don't know yet.
            #The placeholder is the hash, which is unique to this object and
            #therefore easily identifiable.
            new_datasets.append(dataset_dict)
            hash_id_map[current_hash] = dataset_dict
            metadata[current_hash] = dataset_dict['metadata']

    log.debug("Isolating new data %s", get_timing(start_time))

    #Isolate only the new datasets and insert them.
    new_data_for_insert = []
    #Keep track of the datasets that are to be inserted to avoid duplicate
    #inserts.
    new_data_hashes = []
    for d in new_datasets:
        if d['data_hash'] not in new_data_hashes:
            new_data_for_insert.append(d)
            new_data_hashes.append(d['data_hash'])

    if len(new_data_for_insert) > 0:
        #If we're working with mysql, we have to lock the tables.
        #For sqlite, this is not possible. Hence the try/except.
        try:
            DBSession.execute("LOCK TABLES tDataset WRITE, tMetadata WRITE")
        except OperationalError:
            pass

        log.debug("Inserting new data %s", get_timing(start_time))
        DBSession.execute(Dataset.__table__.insert(), new_data_for_insert)
        log.debug("New data inserted %s", get_timing(start_time))

        try:
            DBSession.execute("UNLOCK TABLES")
        except OperationalError:
            pass

        new_data = _get_existing_data(new_data_hashes)
        log.debug("New data retrieved %s", get_timing(start_time))

        for k, v in new_data.items():
            hash_id_map[k] = v

        _insert_metadata(metadata, hash_id_map)
        log.debug("Metadata inserted %s", get_timing(start_time))

    returned_ids = []
    for d in bulk_data:
        returned_ids.append(hash_id_map[d.data_hash])

    log.info("Done bulk inserting data. %s datasets", len(returned_ids))

    return returned_ids
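#Minimal usage sketch (illustrative only, not part of the module). Each item
#in bulk_data is expected to expose a .data_hash attribute matching the keys
#produced by _process_incoming_data, and the caller is assumed to manage the
#transaction on DBSession:
#
#    datasets = [dataset_one, dataset_two]            # objects with .data_hash
#    inserted = _bulk_insert_data(datasets,
#                                 user_id=user_id,    # recorded as metadata
#                                 source='my_import_app')
#    DBSession.flush()
#    dataset_ids = [d.dataset_id for d in inserted]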