def ingest_all():
    # `keys` (raw Alyx brain-region records) and `atlas` (Allen structure-tree
    # DataFrame) are expected to be defined at module scope.
    ib_brainregion = InsertBuffer(reference_ingest.BrainRegion)

    for key in tqdm(keys, position=0):
        fields = key['fields']
        graph_order = atlas[atlas['id'] == key['pk']]['graph_order'].to_list()[0]
        if np.isnan(graph_order):
            graph_order = None
        else:
            graph_order = int(graph_order)

        ib_brainregion.insert1(
            dict(brain_region_pk=key['pk'],
                 acronym=fields['acronym'],
                 brain_region_name=fields['name'],
                 parent=fields['parent'],
                 brain_region_level=fields['level'],
                 graph_order=graph_order))
        if ib_brainregion.flush(skip_duplicates=True, chunksz=1000):
            print('Inserted 1000 brain region tuples.')

    if ib_brainregion.flush(skip_duplicates=True):
        print('Inserted all remaining brain region tuples')
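# --- Illustrative sketch, not part of the pipeline ---
# ingest_all() relies on the InsertBuffer contract used throughout these scripts:
# flush(chunksz=N) only fires once N rows have accumulated, and a final flush()
# without chunksz pushes the remainder. A minimal, hypothetical helper wrapping
# that pattern (the name `buffered_insert` is ours, not the pipeline's):

from ibl_pipeline.ingest import InsertBuffer


def buffered_insert(table, rows, chunksz=1000):
    """Insert `rows` (dicts) into a DataJoint `table` in chunks of `chunksz`."""
    buffer = InsertBuffer(table)
    for row in rows:
        buffer.insert1(row)
        # flush() returns a truthy value once `chunksz` rows have accumulated
        if buffer.flush(skip_duplicates=True, chunksz=chunksz):
            print(f'Inserted {chunksz} tuples.')
    # push whatever is left in the buffer
    buffer.flush(skip_duplicates=True)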
def process_public():
    from ibl_pipeline import public
    from ibl_pipeline.common import subject, acquisition, behavior

    ingest_alyx_raw.insert_to_alyxraw(ingest_alyx_raw.get_alyx_entries())

    excluded_tables = [
        'Weighing',
        'WaterType',
        'WaterAdministration',
        'WaterRestriction',
        'ProbeModel',
        'ProbeInsertion',
        'ProbeTrajectory'
    ]
    ingest_shadow.main(excluded_tables=excluded_tables)

    excluded_membership_tables = [
        'WaterRestrictionUser',
        'WaterRestrictionProcedure',
        'SurgeryUser',
        'WaterAdministrationSession',
    ]
    ingest_membership.main(excluded_tables=excluded_membership_tables)

    ingest_real.main(
        excluded_tables=excluded_tables + excluded_membership_tables,
        public=True)

    # delete sessions that are not part of the public release
    from ibl_pipeline.ingest import InsertBuffer

    table = InsertBuffer(acquisition.Session)
    for key in tqdm(
            (acquisition.Session - public.PublicSession - behavior.TrialSet).fetch('KEY')):
        table.delete1(key)
        if table.flush_delete(chunksz=100):
            print('Deleted 100 sessions')

    table.flush_delete()
    print('Deleted the rest of the sessions')

    # delete subjects that are neither public nor attached to a remaining session
    subjs = subject.Subject & acquisition.Session
    for key in tqdm(
            (subject.Subject - public.PublicSubjectUuid - subjs.proj()).fetch('KEY')):
        (subject.Subject & key).delete()

    excluded_behavior_tables = [
        'AmbientSensorData',
        'Settings',
        'SessionDelay'
    ]
    populate_behavior.main(excluded_tables=excluded_behavior_tables)
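# --- Illustrative sketch, not part of the pipeline ---
# Before running the destructive half of process_public(), it can help to count
# what the same DataJoint restrictions would remove. This hypothetical dry-run
# helper reuses the expressions from process_public() above:

def preview_public_deletions():
    from ibl_pipeline import public
    from ibl_pipeline.common import subject, acquisition, behavior

    # sessions that are neither public nor backed by an ingested TrialSet
    n_sessions = len(acquisition.Session - public.PublicSession - behavior.TrialSet)
    # subjects that are neither public nor attached to any remaining session
    subjs = subject.Subject & acquisition.Session
    n_subjects = len(subject.Subject - public.PublicSubjectUuid - subjs.proj())
    print(f'{n_sessions} sessions and {n_subjects} subjects would be deleted')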
def insert_to_alyxraw(keys, alyxraw_module=alyxraw, alyx_type='all'):
    # use insert buffers to speed up the insertion process
    if alyx_type in ('all', 'main'):
        ib_main = InsertBuffer(alyxraw_module.AlyxRaw)
        # insert into the AlyxRaw table
        for key in tqdm(keys, position=0):
            try:
                pk = uuid.UUID(key['pk'])
            except Exception:
                print('Error for key: {}'.format(key))
                continue
            ib_main.insert1(dict(uuid=pk, model=key['model']))
            if ib_main.flush(skip_duplicates=True, chunksz=10000):
                logger.debug('Inserted 10000 raw tuples.')

        if ib_main.flush(skip_duplicates=True):
            logger.debug('Inserted remaining raw tuples')
        ib_main = InsertBuffer(alyxraw_module.AlyxRaw)

    if alyx_type in ('all', 'part'):
        ib_part = InsertBuffer(alyxraw_module.AlyxRaw.Field)
        # insert into the part table AlyxRaw.Field
        for ikey, key in tqdm(enumerate(keys), position=0):
            try:
                try:
                    pk = uuid.UUID(key['pk'])
                except ValueError:
                    print('Error for key: {}'.format(key))
                    continue
                key_field = dict(uuid=pk)
                for field_name, field_value in key['fields'].items():
                    key_field = dict(key_field, fname=field_name)

                    if field_name == 'json' and field_value is not None:
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = json.dumps(field_value)
                        # skip json blobs that are too long to store
                        if len(key_field['fvalue']) < 10000:
                            ib_part.insert1(key_field)
                        else:
                            continue
                    if field_name == 'narrative' and field_value is not None:
                        # filter out emoji
                        emoji_pattern = re.compile(
                            "["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = emoji_pattern.sub(r'', field_value)
                    elif field_value is None or field_value == '' or field_value == [] or \
                            (isinstance(field_value, float) and math.isnan(field_value)):
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = 'None'
                        ib_part.insert1(key_field)
                    elif type(field_value) is list and \
                            (type(field_value[0]) is dict or type(field_value[0]) is str):
                        # one entry per element of a list-valued field
                        for value_idx, value in enumerate(field_value):
                            key_field['value_idx'] = value_idx
                            key_field['fvalue'] = str(value)
                            ib_part.insert1(key_field)
                    else:
                        key_field['value_idx'] = 0
                        key_field['fvalue'] = str(field_value)
                        ib_part.insert1(key_field)

                    if ib_part.flush(skip_duplicates=True, chunksz=10000):
                        logger.debug('Inserted 10000 raw field tuples')
            except Exception:
                print('Problematic entry: {}'.format(ikey))
                raise

        if ib_part.flush(skip_duplicates=True):
            logger.debug('Inserted all remaining raw field tuples')
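# --- Illustrative sketch, not part of the pipeline ---
# A hypothetical invocation of insert_to_alyxraw(): load an Alyx JSON dump
# (a list of records with 'pk', 'model' and 'fields' keys, as consumed above)
# and ingest only the main AlyxRaw table. The file path is an example, not a
# required location.

import json

with open('/data/alyxfull.json', 'r') as fid:
    alyx_keys = json.load(fid)

insert_to_alyxraw(alyx_keys, alyx_type='main')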
def main(excluded_tables=[], modified_pks=None):
    kwargs = dict(display_progress=True, suppress_errors=True)

    for t in SHADOW_TABLES:
        if t.__name__ in excluded_tables:
            continue
        print(f'Ingesting shadow table {t.__name__}...')

        if t.__name__ == 'Session' and modified_pks:
            modified_session_keys = [
                {'session_uuid': pk} for pk in modified_pks]
            sessions = acquisition.Session & modified_session_keys
            if sessions:
                modified_session_entries = []
                for key in sessions.fetch('KEY'):
                    try:
                        entry = acquisition.Session.create_entry(key)
                        modified_session_entries.append(entry)
                    except Exception:
                        print('Error creating entry for key: {}'.format(key))
                if modified_session_entries:
                    t.insert(modified_session_entries,
                             allow_direct_insert=True, replace=True)

        t.populate(**kwargs)

    if 'DataSet' not in excluded_tables:
        print('Ingesting dataset entries...')
        key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj(
            dataset_uuid='uuid') - data.DataSet

        data_set = InsertBuffer(data.DataSet)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_ds = key.copy()
            key['uuid'] = key['dataset_uuid']

            session = grf(key, 'session')
            if not len(acquisition.Session &
                       dict(session_uuid=uuid.UUID(session))):
                print('Session {} is not in the table acquisition.Session'.format(
                    session))
                print('dataset_uuid: {}'.format(str(key['uuid'])))
                continue

            key_ds['subject_uuid'], key_ds['session_start_time'] = \
                (acquisition.Session &
                 dict(session_uuid=uuid.UUID(session))).fetch1(
                    'subject_uuid', 'session_start_time')

            key_ds['dataset_name'] = grf(key, 'name')

            dt = grf(key, 'dataset_type')
            key_ds['dataset_type_name'] = \
                (data.DataSetType & dict(dataset_type_uuid=uuid.UUID(dt))).fetch1(
                    'dataset_type_name')

            user = grf(key, 'created_by')
            if user != 'None':
                try:
                    key_ds['dataset_created_by'] = \
                        (reference.LabMember & dict(user_uuid=uuid.UUID(user))).fetch1(
                            'user_name')
                except Exception:
                    print(user)
            else:
                key_ds['dataset_created_by'] = None

            data_format = grf(key, 'data_format')
            key_ds['format_name'] = \
                (data.DataFormat & dict(format_uuid=uuid.UUID(data_format))).fetch1(
                    'format_name')

            key_ds['created_datetime'] = grf(key, 'created_datetime')

            # grf() returns the string 'None' for missing fields; map it back to None
            software = grf(key, 'generating_software')
            key_ds['generating_software'] = software if software != 'None' else None

            directory = grf(key, 'provenance_directory')
            key_ds['provenance_directory'] = directory if directory != 'None' else None

            md5 = grf(key, 'md5')
            key_ds['md5'] = md5 if md5 != 'None' else None

            file_size = grf(key, 'file_size')
            key_ds['file_size'] = file_size if file_size != 'None' else None

            data_set.insert1(key_ds)

            if data_set.flush(skip_duplicates=True,
                              allow_direct_insert=True, chunksz=100):
                print('Inserted 100 dataset tuples')

        if data_set.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining dataset tuples')

    if 'FileRecord' not in excluded_tables:
        print('Ingesting file record entries...')
        records = alyxraw.AlyxRaw & 'model="data.filerecord"'
        repos = (data.DataRepository &
                 'repo_name LIKE "flatiron%"').fetch('repo_uuid')
        records_flatiron = alyxraw.AlyxRaw.Field & records & \
            'fname = "data_repository"' & \
            [{'fvalue': str(repo)} for repo in repos]
        record_exists = alyxraw.AlyxRaw.Field & records & \
            'fname = "exists"' & 'fvalue="True"'
        key_source = (alyxraw.AlyxRaw & record_exists & records_flatiron).proj(
            record_uuid='uuid') - data.FileRecord

        file_record = InsertBuffer(data.FileRecord)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_fr = key.copy()
            key['uuid'] = key['record_uuid']
            key_fr['exists'] = True

            dataset = grf(key, 'dataset')
            if not len(data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))):
                print('Dataset {} is not in the table data.DataSet'.format(dataset))
                print('Record_uuid: {}'.format(str(key['uuid'])))
                continue

            key_fr['subject_uuid'], key_fr['session_start_time'], \
                key_fr['dataset_name'] = \
                (data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))).fetch1(
                    'subject_uuid', 'session_start_time', 'dataset_name')

            repo = grf(key, 'data_repository')
            key_fr['repo_name'] = \
                (data.DataRepository & dict(repo_uuid=uuid.UUID(repo))).fetch1(
                    'repo_name')

            key_fr['relative_path'] = grf(key, 'relative_path')

            file_record.insert1(key_fr)

            if file_record.flush(skip_duplicates=True,
                                 allow_direct_insert=True, chunksz=1000):
                print('Inserted 1000 file record tuples')

        if file_record.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining file record tuples')
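# --- Illustrative sketch, not part of the pipeline ---
# The dataset and file-record loops above repeat the same convention: grf()
# returns the string 'None' when an Alyx field is empty, and the code maps that
# back to a real None. A hypothetical helper capturing that convention (the name
# `grf_or_none` is ours):

from ibl_pipeline.ingest import get_raw_field as grf


def grf_or_none(key, field_name):
    """Return the raw Alyx field value, or None if Alyx stored it as 'None'."""
    value = grf(key, field_name)
    return None if value == 'None' else value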
        print('Parent session {} is not in '
              'the table acquisition.Session'.format(parent_session))
        continue

    key_cs['parent_session_start_time'] = \
        (acquisition.Session &
         dict(session_uuid=uuid.UUID(parent_session))).fetch1(
            'session_start_time')
    acquisition.ChildSession.insert1(key_cs, skip_duplicates=True)

# acquisition.SessionUser
print('Ingesting acquisition.SessionUser...')
sessions = alyxraw.AlyxRaw & 'model="actions.session"'
sessions_with_users = alyxraw.AlyxRaw.Field & sessions & \
    'fname="users"' & 'fvalue!="None"'
keys = (alyxraw.AlyxRaw & sessions_with_users).proj(session_uuid='uuid')
session_user = InsertBuffer(acquisition.SessionUser)

for key in tqdm(keys, position=0):
    key['uuid'] = key['session_uuid']
    if not len(acquisition.Session & key):
        print('Session {} is not in the table acquisition.Session'.format(
            key['session_uuid']))
        continue

    key_s = dict()
    key_s['subject_uuid'], key_s['session_start_time'] = \
        (acquisition.Session & key).fetch1(
            'subject_uuid', 'session_start_time')
if len(sys.argv) < 2:
    # if no argument given, assume a canonical file location and name
    filename = path.join('/data', 'alyxfull.json')
else:
    filename = path.join(dir_name, sys.argv[1])

with open(filename, 'r') as fid:
    keys = json.load(fid)

# remove invalid uuid from unused tables
keys = [
    key for key in keys
    if key['model'] not in ['auth.group', 'sessions.session', 'authtoken.token']
]

# use insert buffers to speed up the insertion process
ib_main = InsertBuffer(alyxraw.AlyxRaw)
ib_part = InsertBuffer(alyxraw.AlyxRaw.Field)

# insert into the AlyxRaw table
for key in keys:
    ib_main.insert1(dict(uuid=uuid.UUID(key['pk']), model=key['model']))
    if ib_main.flush(skip_duplicates=True, chunksz=10000):
        logger.debug('Inserted 10000 raw tuples.')

if ib_main.flush(skip_duplicates=True):
    logger.debug('Inserted remaining raw tuples')

# insert into the part table AlyxRaw.Field
for ikey, key in enumerate(keys):
    try:
        key_field = dict(uuid=uuid.UUID(key['pk']))
import uuid

from tqdm import tqdm

from ibl_pipeline.ingest import alyxraw, data, reference, acquisition, InsertBuffer
from ibl_pipeline.ingest import get_raw_field as grf

# ingest dataset entries
key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj(
    dataset_uuid='uuid') - data.DataSet

data_set = InsertBuffer(data.DataSet)

for key in tqdm(key_source.fetch('KEY'), position=0):
    key_ds = key.copy()
    key['uuid'] = key['dataset_uuid']

    session = grf(key, 'session')
    if not len(acquisition.Session & dict(session_uuid=uuid.UUID(session))):
        print('Session {} is not in the table acquisition.Session'.format(
            session))
        print('dataset_uuid: {}'.format(str(key['uuid'])))
        continue

    key_ds['subject_uuid'], key_ds['session_start_time'] = \
        (acquisition.Session &
         dict(session_uuid=uuid.UUID(session))).fetch1(
            'subject_uuid', 'session_start_time')

    key_ds['dataset_name'] = grf(key, 'name')
if len(sys.argv) < 2:  # no arguments given
    # if no argument given, assume a canonical file location and name
    filename = path.join(dir_name, '..', 'data', 'alyxfull.json')
else:
    filename = path.join(dir_name, sys.argv[1])

with open(filename, 'r') as fid:
    keys = json.load(fid)

# remove invalid uuid from unused tables
keys = [key for key in keys
        if key['model'] not in ['auth.group', 'sessions.session', 'authtoken.token']]

# use insert buffers to speed up the insertion process
ib_main = InsertBuffer(alyxraw.AlyxRaw)
ib_part = InsertBuffer(alyxraw.AlyxRaw.Field)

# insert into the AlyxRaw table
for key in keys:
    ib_main.insert1(dict(uuid=uuid.UUID(key['pk']), model=key['model']))
    if ib_main.flush(skip_duplicates=True, chunksz=10000):
        logger.debug('Inserted 10000 raw tuples.')

if ib_main.flush(skip_duplicates=True):
    logger.debug('Inserted remaining raw tuples')

# insert into the part table AlyxRaw.Field
for key in keys:
    key_field = dict(uuid=uuid.UUID(key['pk']))
    for field_name, field_value in key['fields'].items():
def delete_histology_alyx_shadow(verbose=False):
    CHANNEL_TABLES = [
        histology_ingest.ChannelBrainLocationTemp,
        histology_ingest.ChannelBrainLocation,
        alyxraw.AlyxRaw.Field,
        alyxraw.AlyxRaw
    ]

    channel_loc_keys = update_utils.get_deleted_keys('experiments.channel')
    for t in CHANNEL_TABLES:
        print(f'Deleting from table {t.__name__}')
        uuid_name = t.heading.primary_key[0]
        keys = [{uuid_name: k['uuid']} for k in tqdm(channel_loc_keys)]

        table = InsertBuffer(t)
        for k in tqdm(keys, position=0):
            table.delete1(k)
            if table.flush_delete(chunksz=1000, quick=True) and verbose:
                print(f'Deleted 1000 entries from {t.__name__}')
        table.flush_delete(quick=True)

    traj_keys = update_utils.get_deleted_keys('experiments.trajectoryestimate') + \
        update_utils.get_updated_keys('experiments.trajectoryestimate')

    TRAJ_TABLES = [
        histology_ingest.ProbeTrajectoryTemp,
        histology_ingest.ProbeTrajectory,
        alyxraw.AlyxRaw.Field,
        alyxraw.AlyxRaw
    ]

    for t in TRAJ_TABLES:
        uuid_name = t.heading.primary_key[0]
        keys = [{uuid_name: k['uuid']} for k in traj_keys]

        table = InsertBuffer(t)
        for k in tqdm(keys, position=0):
            table.delete1(k)
            if table.flush_delete(chunksz=1000, quick=True) and verbose:
                print(f'Deleted 1000 entries from {t.__name__}')
        table.flush_delete(quick=True)
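# --- Illustrative sketch, not part of the pipeline ---
# The same InsertBuffer object also drives chunked deletes via delete1() and
# flush_delete(), as used above. A minimal, hypothetical helper mirroring the
# earlier buffered_insert sketch (names are ours, not the pipeline's):

from ibl_pipeline.ingest import InsertBuffer


def buffered_delete(table, keys, chunksz=1000, verbose=False):
    """Delete entries matching `keys` from a DataJoint `table` in chunks."""
    buffer = InsertBuffer(table)
    for k in keys:
        buffer.delete1(k)
        # flush_delete() fires once `chunksz` keys have accumulated
        if buffer.flush_delete(chunksz=chunksz, quick=True) and verbose:
            print(f'Deleted {chunksz} entries')
    buffer.flush_delete(quick=True)  # delete whatever is left in the buffer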