import uuid

from ibl_pipeline.ingest import alyxraw, reference, subject, action, acquisition, data
from ibl_pipeline.ingest import get_raw_field as grf


if __name__ == '__main__':

    # reference.ProjectLabMember
    print('Ingesting reference.ProjectLabMember...')
    projects = alyxraw.AlyxRaw & 'model="subjects.project"'
    users = alyxraw.AlyxRaw.Field & projects & 'fname="users"' & 'fvalue!="None"'
    keys = (alyxraw.AlyxRaw & users).proj(project_uuid='uuid')

    for key in keys:
        key_p = dict()
        key_p['project_name'] = (reference.Project & key).fetch1('project_name')
        user_uuids = grf(key, 'users', multiple_entries=True,
                         model='subjects.project')

        if len(user_uuids):
            for user_uuid in user_uuids:
                if user_uuid == 'None':
                    continue
                key_pl = key_p.copy()
                key_pl['user_name'] = \
                    (reference.LabMember &
                     dict(user_uuid=uuid.UUID(user_uuid))).fetch1('user_name')
                reference.ProjectLabMember.insert1(key_pl, skip_duplicates=True)

    # subject.AlleleSequence
    print('Ingesting subject.AlleleSequence...')
def ingest_membership_table(dj_current_table,
                            alyx_parent_model,
                            alyx_field,
                            dj_parent_table, dj_other_table,
                            dj_parent_fields, dj_other_field,
                            dj_parent_uuid_name, dj_other_uuid_name,
                            renamed_other_field_name=None,
                            new_pks=None):
    '''
    Ingest a shadow membership table.
    This function handles the pattern where an alyx parent model contains one
    or multiple entries of a field that carry the information needed for the
    membership table.

    Arguments:
        dj_current_table:    DataJoint table object, current membership table to ingest
        alyx_parent_model:   string, alyx model name that contains the information of the current table
        alyx_field:          string, alyx field that contains the information of the current table
        dj_parent_table:     DataJoint parent table, corresponding to the alyx parent model
        dj_other_table:      DataJoint table to fetch the other field from
        dj_parent_fields:    string or list of strings, field names to be fetched from the parent table
        dj_other_field:      string, field name to be fetched from the other table
        dj_parent_uuid_name: string, uuid field name of the parent table
        dj_other_uuid_name:  string, uuid field name of the other table
        renamed_other_field_name: string, name of the other field if it is renamed
                             in the real table; default None if the field is not renamed
        new_pks:             list of strings of valid uuids, the new entries to process;
                             default None to insert all entries
    '''
    if new_pks:
        restr = [{'uuid': pk} for pk in new_pks if is_valid_uuid(pk)]
    else:
        restr = {}

    alyxraw_to_insert = (alyxraw.AlyxRaw & restr &
                         {'model': alyx_parent_model}).fetch('KEY')

    if not alyxraw_to_insert:
        return

    alyx_field_entries = alyxraw.AlyxRaw.Field & alyxraw_to_insert & \
        {'fname': alyx_field} & 'fvalue!="None"'

    keys = (alyxraw.AlyxRaw & alyx_field_entries).proj(
        **{dj_parent_uuid_name: 'uuid'})

    if isinstance(dj_parent_fields, str):
        dj_parent_fields = [dj_parent_fields]

    for key in keys:
        if not dj_parent_table & key:
            print(f'The entry {key} is not in the parent table '
                  f'{dj_parent_table.__name__}')
            continue

        entry_base = (dj_parent_table & key).fetch(
            *dj_parent_fields, as_dict=True)[0]
        key['uuid'] = key[dj_parent_uuid_name]
        uuids = grf(key, alyx_field, multiple_entries=True,
                    model=alyx_parent_model)

        if len(uuids):
            for uuid in uuids:
                if uuid == 'None':
                    continue
                if not dj_other_table & {dj_other_uuid_name: uuid}:
                    print(f'The uuid {uuid} is not in the DataJoint table '
                          f'{dj_other_table.__name__}')
                    continue
                entry = entry_base.copy()
                field_value = (dj_other_table &
                               {dj_other_uuid_name: uuid}).fetch1(dj_other_field)
                if renamed_other_field_name:
                    entry[renamed_other_field_name] = field_value
                else:
                    entry[dj_other_field] = field_value
                dj_current_table.insert1(entry, skip_duplicates=True)
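# Hedged usage sketch (added for illustration, not part of the original
# module): how ingest_membership_table might be called to build
# reference.ProjectLabMember from the alyx 'subjects.project' model,
# mirroring the manual loop in the standalone scripts in this section.
# The argument mapping below is inferred from that loop and is an
# assumption, not verified against the deployed schema.
if __name__ == '__main__':
    from ibl_pipeline.ingest import reference

    ingest_membership_table(
        dj_current_table=reference.ProjectLabMember,
        alyx_parent_model='subjects.project',
        alyx_field='users',
        dj_parent_table=reference.Project,
        dj_other_table=reference.LabMember,
        dj_parent_fields='project_name',
        dj_other_field='user_name',
        dj_parent_uuid_name='project_uuid',
        dj_other_uuid_name='user_uuid')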
def main(excluded_tables=[], modified_pks=None):

    kwargs = dict(display_progress=True, suppress_errors=True)

    for t in SHADOW_TABLES:
        if t.__name__ in excluded_tables:
            continue
        print(f'Ingesting shadow table {t.__name__}...')

        if t.__name__ == 'Session' and modified_pks:
            modified_session_keys = [
                {'session_uuid': pk} for pk in modified_pks]
            sessions = acquisition.Session & modified_session_keys
            if sessions:
                modified_session_entries = []
                for key in sessions.fetch('KEY'):
                    try:
                        entry = acquisition.Session.create_entry(key)
                        modified_session_entries.append(entry)
                    except Exception as e:
                        print(f'Error creating entry for key {key}: {e}')
                if modified_session_entries:
                    t.insert(modified_session_entries,
                             allow_direct_insert=True, replace=True)

        t.populate(**kwargs)

    if 'DataSet' not in excluded_tables:
        print('Ingesting dataset entries...')
        key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj(
            dataset_uuid='uuid') - data.DataSet

        data_set = InsertBuffer(data.DataSet)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_ds = key.copy()
            key['uuid'] = key['dataset_uuid']

            session = grf(key, 'session')
            if not len(acquisition.Session &
                       dict(session_uuid=uuid.UUID(session))):
                print('Session {} is not in the table acquisition.Session'.format(
                    session))
                print('dataset_uuid: {}'.format(str(key['uuid'])))
                continue

            key_ds['subject_uuid'], key_ds['session_start_time'] = \
                (acquisition.Session &
                 dict(session_uuid=uuid.UUID(session))).fetch1(
                     'subject_uuid', 'session_start_time')

            key_ds['dataset_name'] = grf(key, 'name')

            dt = grf(key, 'dataset_type')
            key_ds['dataset_type_name'] = \
                (data.DataSetType &
                 dict(dataset_type_uuid=uuid.UUID(dt))).fetch1(
                     'dataset_type_name')

            user = grf(key, 'created_by')
            if user != 'None':
                try:
                    key_ds['dataset_created_by'] = \
                        (reference.LabMember &
                         dict(user_uuid=uuid.UUID(user))).fetch1('user_name')
                except Exception as e:
                    print(f'Could not resolve creator {user}: {e}')
            else:
                key_ds['dataset_created_by'] = None

            data_format = grf(key, 'data_format')
            key_ds['format_name'] = \
                (data.DataFormat &
                 dict(format_uuid=uuid.UUID(data_format))).fetch1('format_name')

            key_ds['created_datetime'] = grf(key, 'created_datetime')

            software = grf(key, 'generating_software')
            key_ds['generating_software'] = software if software != 'None' else None

            directory = grf(key, 'provenance_directory')
            key_ds['provenance_directory'] = directory if directory != 'None' else None

            md5 = grf(key, 'md5')
            key_ds['md5'] = md5 if md5 != 'None' else None

            file_size = grf(key, 'file_size')
            key_ds['file_size'] = file_size if file_size != 'None' else None

            data_set.insert1(key_ds)

            if data_set.flush(skip_duplicates=True,
                              allow_direct_insert=True, chunksz=100):
                print('Inserted 100 dataset tuples')

        if data_set.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining dataset tuples')

    if 'FileRecord' not in excluded_tables:
        print('Ingesting file record entries...')
        records = alyxraw.AlyxRaw & 'model="data.filerecord"'
        repos = (data.DataRepository &
                 'repo_name LIKE "flatiron%"').fetch('repo_uuid')
        records_flatiron = alyxraw.AlyxRaw.Field & records & \
            'fname = "data_repository"' & \
            [{'fvalue': str(repo)} for repo in repos]
        record_exists = alyxraw.AlyxRaw.Field & records & \
            'fname = "exists"' & 'fvalue="True"'
        key_source = (alyxraw.AlyxRaw & record_exists & records_flatiron).proj(
            record_uuid='uuid') - data.FileRecord

        file_record = InsertBuffer(data.FileRecord)

        for key in tqdm(key_source.fetch('KEY'), position=0):
            key_fr = key.copy()
            key['uuid'] = key['record_uuid']
            key_fr['exists'] = True

            dataset = grf(key, 'dataset')
            if not len(data.DataSet & dict(dataset_uuid=uuid.UUID(dataset))):
                print('Dataset {} is not in the table data.DataSet'.format(
                    dataset))
                print('Record_uuid: {}'.format(str(key['uuid'])))
                continue

            key_fr['subject_uuid'], key_fr['session_start_time'], \
                key_fr['dataset_name'] = \
                (data.DataSet &
                 dict(dataset_uuid=uuid.UUID(dataset))).fetch1(
                     'subject_uuid', 'session_start_time', 'dataset_name')

            repo = grf(key, 'data_repository')
            key_fr['repo_name'] = \
                (data.DataRepository &
                 dict(repo_uuid=uuid.UUID(repo))).fetch1('repo_name')

            key_fr['relative_path'] = grf(key, 'relative_path')

            file_record.insert1(key_fr)

            if file_record.flush(skip_duplicates=True,
                                 allow_direct_insert=True, chunksz=1000):
                print('Inserted 1000 file record tuples')

        if file_record.flush(skip_duplicates=True, allow_direct_insert=True):
            print('Inserted all remaining file record tuples')
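# Hedged usage sketch (added for illustration, not part of the original
# module): two ways this entry point might be invoked. The table names
# passed to excluded_tables come from the branches above; the uuid string
# is a made-up placeholder.
if __name__ == '__main__':
    # Refresh every shadow table but skip the heavyweight dataset and
    # file record ingestion loops.
    main(excluded_tables=['DataSet', 'FileRecord'])

    # Alternatively, re-ingest only sessions whose alyx records changed,
    # identified by their uuid strings (placeholder value shown).
    # main(modified_pks=['8dc5f1aa-5bcb-4d77-b8fa-6f0b62a8a779'])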
from ibl_pipeline.ingest import alyxraw, data, reference, acquisition, QueryBuffer
from ibl_pipeline.ingest import get_raw_field as grf
import uuid
from tqdm import tqdm


# ingest dataset entries
key_source = (alyxraw.AlyxRaw & 'model="data.dataset"').proj(
    dataset_uuid='uuid') - data.DataSet

data_set = QueryBuffer(data.DataSet)

for key in tqdm(key_source.fetch('KEY'), position=0):
    key_ds = key.copy()
    key['uuid'] = key['dataset_uuid']

    session = grf(key, 'session')
    if not len(acquisition.Session & dict(session_uuid=uuid.UUID(session))):
        print('Session {} is not in the table acquisition.Session'.format(
            session))
        print('dataset_uuid: {}'.format(str(key['uuid'])))
        continue

    key_ds['subject_uuid'], key_ds['session_start_time'] = \
        (acquisition.Session &
         dict(session_uuid=uuid.UUID(session))).fetch1(
             'subject_uuid', 'session_start_time')

    key_ds['dataset_name'] = grf(key, 'name')

    dt = grf(key, 'dataset_type')
    key_ds['dataset_type_name'] = \
        (data.DataSetType &
         dict(dataset_type_uuid=uuid.UUID(dt))).fetch1('dataset_type_name')
import datajoint as dj
import json

from ibl_pipeline.ingest import alyxraw, reference, subject, action, acquisition, data
from ibl_pipeline.ingest import get_raw_field as grf


subjects = alyxraw.AlyxRaw.Field & \
    (alyxraw.AlyxRaw & 'model="subjects.subject"') & \
    'fname="lab"' & 'fvalue!="None"'

# reference.ProjectLabMember
print('Ingesting reference.ProjectLabMember...')
keys = (alyxraw.AlyxRaw & 'model="subjects.project"').proj(project_uuid='uuid')
for key in keys:
    key_p = dict()
    key_p['project_name'] = (reference.Project & key).fetch1('project_name')

    user_uuids = grf(key, 'users', multiple_entries=True)
    for user_uuid in user_uuids:
        key_pl = key_p.copy()
        key_pl['user_name'] = \
            (reference.LabMember &
             'user_uuid="{}"'.format(user_uuid)).fetch1('user_name')
        reference.ProjectLabMember.insert1(key_pl, skip_duplicates=True)

# subject.AlleleSequence
print('Ingesting subject.AlleleSequence...')
keys = (alyxraw.AlyxRaw & 'model="subjects.allele"').proj(allele_uuid='uuid')
for key in keys:
    key_a = dict()
    key_a['allele_name'] = (subject.Allele & key).fetch1('allele_name')
    key['uuid'] = key['allele_uuid']
    sequences = grf(key, 'sequences', multiple_entries=True)
subjects = alyxraw.AlyxRaw.Field & \
    (alyxraw.AlyxRaw & 'model="subjects.subject"') & \
    'fname="lab"' & 'fvalue!="None"'

# reference.ProjectLabMember
print('Ingesting reference.ProjectLabMember...')
projects = alyxraw.AlyxRaw & 'model="subjects.project"'
users = alyxraw.AlyxRaw.Field & projects & 'fname="users"' & 'fvalue!="None"'
keys = (alyxraw.AlyxRaw & users).proj(project_uuid='uuid')

for key in keys:
    key_p = dict()
    key_p['project_name'] = (reference.Project & key).fetch1('project_name')
    user_uuids = grf(key, 'user_uuids', multiple_entries=True,
                     model='subjects.project')

    for user_uuid in user_uuids:
        key_pl = key_p.copy()
        key_pl['user_name'] = \
            (reference.LabMember &
             'user_uuid="{}"'.format(user_uuid)).fetch1('user_name')
        reference.ProjectLabMember.insert1(key_pl, skip_duplicates=True)

# subject.AlleleSequence
print('Ingesting subject.AlleleSequence...')
keys = (alyxraw.AlyxRaw & 'model="subjects.allele"').proj(allele_uuid='uuid')
for key in keys:
    key_a = dict()
    key_a['allele_name'] = (subject.Allele & key).fetch1('allele_name')