def get_datasetdata_column_fieldinformation(): #TODO: is this cached by python?
    """ returns a list of [('table.field', fieldinformation)] """
    # TODO: only include cell, protein if they are present in the dataset
    # Note: case is erased when cursor.description is filled, so use
    # underscores here and leave the camel-casing for later
    table_fields = [
        'datarecord.id',
        'dataset.facility_id',
        'smallmolecule.facility_id',
        'smallmolecule.salt_id',
        'smallmoleculebatch.facility_batch_id',
        'smallmolecule.lincs_id',
        'smallmolecule.name',
        'cell.name',
        'cell.cl_id',
        'cell.facility_id',
        'protein.name',
        'protein.lincs_id',
        'datarecord.plate',
        'datarecord.well',
        'datarecord.control_type',
        'datacolumn.name',
        'datacolumn.unit',
        ]
    tablefields_fi = []
    for tablefield in table_fields:
        table, field = tablefield.split('.')
        tablefields_fi.append(
            (tablefield, get_fieldinformation(field, [table])))
    return tablefields_fi
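# Usage sketch (illustrative, not part of the resource classes): build a
# lookup from 'table.field' to the camel-cased DWG name used in exports.
# The helper name and the example value are hypothetical; actual names come
# from the fieldinformation table.
def _tablefield_to_dwg_name_map():
    return dict(
        (tablefield, fi.get_camel_case_dwg_name())
        for tablefield, fi in
        DataSetDataResource.get_datasetdata_column_fieldinformation())
# e.g. _tablefield_to_dwg_name_map()['smallmolecule.lincs_id'] might return
# something like 'smLincsId' (the actual value depends on FieldInformation)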
def get_datasetdata_camel_case_columns(dataset_id):
    camel_columns = [
        x[1].get_camel_case_dwg_name()
        for x in DataSetDataResource.get_datasetdata_column_fieldinformation()]
    timepoint_columns = DataSetDataResource.get_dataset_columns(
        dataset_id, ['day','hour','minute','second'])
    for i in xrange(len(timepoint_columns)):
        # Label these manually: "timepoint" is an alias, not a real DataColumn
        suffix = '_' + str(i) if i > 0 else ''
        camel_columns.append('timepoint' + suffix)
        camel_columns.append('timepoint_unit' + suffix)
        camel_columns.append('timepoint_description' + suffix)
    datapoint_value_fi = get_fieldinformation('datapoint_value', [''])
    camel_columns.append(datapoint_value_fi.get_camel_case_dwg_name())
    return camel_columns
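# Usage sketch: the camel-cased columns can serve directly as the header row
# of a dataset data (SAF) CSV export. csv/sys are stdlib; the helper name is
# hypothetical and no particular dataset id is assumed.
import csv
import sys

def _write_saf_header(dataset_id, out=sys.stdout):
    writer = csv.writer(out)
    writer.writerow(
        DataSetDataResource.get_datasetdata_camel_case_columns(dataset_id))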
def build_schema(self):
    schema = super(DataSetResource, self).build_schema()
    # TODO: reincorporate this default information (it describes the DB schema definition)
    original_dict = schema['fields']

    fields = get_detail_schema(DataSet(), 'dataset', lambda x: x.show_in_detail)
    # Custom fields for SAF. TODO: generate the names here from the fieldinformation
    fields['datapointFile'] = get_schema_fieldinformation('datapoint_file', '')
    fields['safVersion'] = get_schema_fieldinformation('saf_version', '')
    fields['screeningFacility'] = get_schema_fieldinformation('screening_facility', '')
    schema['fields'] = OrderedDict(
        sorted(fields.items(), key=lambda x: x[0])) # sort alpha; TODO: sort on fi.order

    ds_fieldinformation = DataSetDataResource.get_datasetdata_column_fieldinformation()
    ds_fieldinformation.append(
        ('datapoint_value', get_fieldinformation('datapoint_value', [''])))
    ds_fieldinformation.append(
        ('timepoint', get_fieldinformation('timepoint', [''])))
    ds_fieldinformation.append(
        ('timepoint_unit', get_fieldinformation('timepoint_unit', [''])))
    ds_fieldinformation.append(
        ('timepoint_description', get_fieldinformation('timepoint_description', [''])))

    meta_field_info = get_listing(FieldInformation(), ['fieldinformation'])

    fields = {}
    for field, fi in ds_fieldinformation:
        field_schema_info = {}
        for meta_fi_attr, meta_item in meta_field_info.items():
            meta_fi = meta_item['fieldinformation']
            field_schema_info[meta_fi.get_camel_case_dwg_name()] = \
                getattr(fi, meta_fi_attr)
        fields[fi.get_camel_case_dwg_name()] = field_schema_info
    schema['datasetDataFile'] = OrderedDict(
        sorted(fields.items(), key=lambda x: x[0])) # sort alpha; TODO: sort on fi.order

    dc_fieldinformation = FieldInformation.objects.filter(
        table='datacolumn', show_in_detail=True)
    datapoint_fields = {}
    for fi in dc_fieldinformation:
        field_schema_info = {}
        for meta_fi_attr, meta_item in meta_field_info.items():
            meta_fi = meta_item['fieldinformation']
            field_schema_info[meta_fi.get_camel_case_dwg_name()] = \
                getattr(fi, meta_fi_attr)
        datapoint_fields[fi.get_camel_case_dwg_name()] = field_schema_info
    schema['datapointInformation'] = OrderedDict(
        sorted(datapoint_fields.items(), key=lambda x: x[0])) # sort alpha; TODO: sort on fi.order

    return schema
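# Shape sketch (hypothetical keys and values): build_schema() above extends
# the default Tastypie schema with two alpha-sorted sections keyed by the
# camel-cased DWG names; the inner keys come from the fieldinformation
# listing (meta_field_info), e.g.:
#
#   schema['datasetDataFile'] = OrderedDict([
#       ('datapointValue', {'description': ..., 'unit': ..., ...}),
#       ('timepoint', {...}),
#       ...])
#   schema['datapointInformation'] = OrderedDict([
#       (<datacolumn field dwg name>, {...}),
#       ...])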
def build_schema(self):
    schema = super(DataSetDataResource, self).build_schema()
    # TODO: reincorporate this default information (it describes the DB schema definition)
    original_dict = schema['fields']

    ds_fieldinformation = DataSetDataResource.get_datasetdata_column_fieldinformation()
    ds_fieldinformation.append(
        ('datapoint_value', get_fieldinformation('datapoint_value', [''])))
    ds_fieldinformation.append(
        ('timepoint', get_fieldinformation('timepoint', [''])))
    ds_fieldinformation.append(
        ('timepoint_unit', get_fieldinformation('timepoint_unit', [''])))
    ds_fieldinformation.append(
        ('timepoint_description', get_fieldinformation('timepoint_description', [''])))

    meta_field_info = get_listing(FieldInformation(), ['fieldinformation'])

    fields = {}
    for __, fi in ds_fieldinformation:
        field_schema_info = {}
        for meta_fi_attr, meta_item in meta_field_info.items():
            meta_fi = meta_item['fieldinformation']
            field_schema_info[meta_fi.get_camel_case_dwg_name()] = \
                getattr(fi, meta_fi_attr)
        fields[fi.get_camel_case_dwg_name()] = field_schema_info
    schema['fields'] = OrderedDict(
        sorted(fields.items(), key=lambda x: x[0])) # TODO: use the fieldinformation order

    return schema
def get_datasetdata_cursor(dataset_id):
    timepoint_columns = DataSetDataResource.get_dataset_columns(
        dataset_id, ['day','hour','minute','second'])
    logger.info(str(('timepoint_columns', timepoint_columns)))
    datapoint_columns = DataSetDataResource.get_dataset_columns(dataset_id)
    datapoint_columns = [col for col in datapoint_columns
                         if col not in timepoint_columns]

    # pivot out the timepoint columns only
    timepoint_column_string = ''
    for i, dc in enumerate(timepoint_columns):
        alias = "dp_" + str(i)
        tp_name = "timepoint"
        tp_unit_name = "timepoint_unit"
        tp_desc_name = "timepoint_description"
        if i > 0:
            tp_name += "_" + str(i)
            tp_unit_name += "_" + str(i)
            tp_desc_name += "_" + str(i)
        # note: timepoint values are probably text, but query the type to be safe
        if dc.data_type in ('Numeric', 'omero_image'):
            if dc.precision == 0 or dc.data_type == 'omero_image':
                column_to_select = "int_value"
            else:
                # TODO: specify the precision in the fieldinformation for this column
                column_to_select = "round( float_value::numeric, 2 )"
        else:
            column_to_select = "text_value"
        timepoint_column_string += (
            ",(SELECT " + column_to_select + " FROM db_datapoint " + alias
            + " where " + alias + ".datacolumn_id=" + str(dc.id)
            + " and " + alias + ".datarecord_id=datarecord.id) as " + tp_name)
        # note: unit and description are embedded as SQL literals; this
        # assumes they contain no single quotes
        timepoint_column_string += ", '" + dc.unit + "' as " + tp_unit_name
        if dc.description:
            timepoint_column_string += ", '" + dc.description + "' as " + tp_desc_name

    sql = "select "
    # Patterns to match the protein and cell fields, which are handled
    # differently below because they can be linked through either the
    # DataRecord or the DataColumn
    protein_pattern = re.compile(r'protein\.(.*)')
    cell_pattern = re.compile(r'cell\.(.*)')
    meta_columns_to_fieldinformation = \
        DataSetDataResource.get_datasetdata_column_fieldinformation()
    for i, (tablefield, fi) in enumerate(meta_columns_to_fieldinformation):
        if i != 0:
            sql += ', \n'
        m = protein_pattern.match(tablefield)
        m2 = cell_pattern.match(tablefield)
        if m:
            sql += ('(select p.' + m.group(1)
                    + ' from db_protein p where p.id in (datacolumn.protein_id,datarecord.protein_id))'
                    + ' as "' + fi.get_camel_case_dwg_name() + '"')
        elif m2:
            sql += ('(select c.' + m2.group(1)
                    + ' from db_cell c where c.id in (datacolumn.cell_id,datarecord.cell_id))'
                    + ' as "' + fi.get_camel_case_dwg_name() + '"')
        else:
            # TODO: this information is parsed when deserializing to create the "camel cased name"
            sql += tablefield + ' as "' + fi.get_camel_case_dwg_name() + '"'
    datapoint_value_fi = get_fieldinformation('datapoint_value', [''])
    sql += (', coalesce(dp.int_value::TEXT, dp.float_value::TEXT, dp.text_value)'
            + ' as "' + datapoint_value_fi.get_camel_case_dwg_name() + '"\n')
    sql += timepoint_column_string
    # Note: the older simple joins
    #   left join db_protein protein on (datarecord.protein_id=protein.id)
    #   left join db_cell cell on (datarecord.cell_id=cell.id)
    # have been replaced because proteins and cells may be linked through
    # either the datarecord or the datacolumn (see the subselects above)
    sql += """
        from db_dataset dataset
        join db_datarecord datarecord on(datarecord.dataset_id=dataset.id)
        join db_datacolumn datacolumn on(datacolumn.dataset_id=dataset.id)
        left join db_smallmolecule smallmolecule
            on (datarecord.smallmolecule_id=smallmolecule.id)
        left join db_smallmoleculebatch smallmoleculebatch
            on(smallmoleculebatch.smallmolecule_id=smallmolecule.id
               and smallmoleculebatch.facility_batch_id=datarecord.batch_id),
        db_datapoint dp
        where dp.datarecord_id=datarecord.id
        and dp.datacolumn_id=datacolumn.id
        and dataset.id = %s """
    if len(timepoint_columns) > 0:
        sql += (" and datacolumn.id not in ("
                + ','.join([str(col.id) for col in timepoint_columns]) + ") ")
    sql += " order by datarecord.id, datacolumn.id "
    logger.info(str(('sql', sql)))
    cursor = connection.cursor()
    cursor.execute(sql, [dataset_id])
    return cursor
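# Usage sketch: stream the pivoted rows as dicts. Column labels come from
# cursor.description; the quoted "as" aliases in the SQL above preserve the
# camel-cased names. The helper name is hypothetical.
def _iter_datasetdata_rows(dataset_id):
    cursor = DataSetDataResource.get_datasetdata_cursor(dataset_id)
    labels = [col[0] for col in cursor.description]
    for row in cursor.fetchall():
        yield dict(zip(labels, row))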