def get_datasetdata_cursor(dataset_id):
    '''
    Build and execute the raw SQL that exports a dataset's datapoints.

    A "drs" CTE pivots each timepoint column and each reagent column into
    per-datarecord scalar subselect columns; the remaining (assay readout)
    datacolumns are then joined back as one row per datapoint with
    "datapointName"/"datapointUnit"/"datapointValue".

    @param dataset_id: primary key of the DataSet to export
    @return: an open database cursor positioned over the result set

    NOTE(review): this module defines get_datasetdata_cursor twice; the
    later definition is the one Python binds at import time.
    '''
    # Existence check: raises DataSet.DoesNotExist for an unknown id.
    # The instance itself is not otherwise used below.
    dataset = DataSet.objects.get(id=dataset_id)

    # Columns whose unit marks them as timepoints.
    timepoint_columns = (
        DataColumn.objects.all()
        .filter(dataset_id=dataset_id)
        .filter(unit__in=['day', 'hour', 'minute', 'second']))
    # Columns that reference reagents (pivoted separately below).
    reagent_columns = (
        DataColumn.objects.filter(dataset_id=dataset_id)
        .filter(data_type__in=[
            'small_molecule', 'cell', 'primary_cell', 'protein', 'antibody',
            'otherreagent']).order_by('display_order'))

    # Both sets are pivoted into the CTE, so exclude them from the
    # row-per-datapoint part of the query.
    dc_ids_to_exclude = [dc.id for dc in timepoint_columns]
    dc_ids_to_exclude.extend([dc.id for dc in reagent_columns])

    # Scalar subselect template for one pivoted datapoint column.
    col_query_string = (
        ', (SELECT '
        ' {column_to_select} '
        ' FROM db_datapoint as {alias} '
        ' WHERE {alias}.datacolumn_id={dc_id} '
        ' AND {alias}.datarecord_id=datarecord.id ) as "{column_name}" ')
    # NOTE(review): dc.unit / dc.description are spliced into the SQL via
    # dollar-quoting ($$...$$); a value containing "$$" would break the
    # statement. Values come from curated DataColumn rows, but confirm
    # they are trusted input.
    timepoint_unit_string = ',$${dc_unit}$$ as "{dc_name}_timepointUnit" '
    timepoint_description_string = (
        ',$${dc_description}$$ as "{dc_name}_timepointDescription" ')

    # Fixed leading columns of the CTE.
    query_string = (
        'WITH drs as ('
        ' SELECT '
        ' datarecord.id as "datarecordID"'
        ', dataset.facility_id as "hmsDatasetID"'
        ', datarecord.plate as "recordedPlate"'
        ', datarecord.well as "recordedWell"'
        ', datarecord.control_type as "controlType"')

    # Each subselect needs a unique table alias: dp_1, dp_2, ...
    alias_count = 0
    for dc in timepoint_columns:
        column_to_select = None
        alias_count += 1
        alias = 'dp_%d' % alias_count
        column_name = '%s_timepoint' % camel_case_dwg(dc.name)
        if (dc.data_type == 'Numeric' or dc.data_type == 'omero_image'):
            # omero_image ids and zero-precision numerics are integers;
            # otherwise round floats to 2 decimal places for display.
            if dc.precision == 0 or dc.data_type == 'omero_image':
                column_to_select = "int_value"
            else:
                column_to_select = "round( float_value::numeric, 2 )"
        else:
            column_to_select = "text_value"
        query_string += col_query_string.format(
            column_to_select=column_to_select,
            alias=alias, dc_id=dc.id, column_name=column_name)
        query_string += timepoint_unit_string.format(
            dc_unit=dc.unit, dc_name=camel_case_dwg(dc.name))
        if dc.description:
            query_string += timepoint_description_string.format(
                dc_description=dc.description,
                dc_name=camel_case_dwg(dc.name))

    # Reagent lookups: datapoint -> reagent batch -> reagent attribute.
    reagent_id_query = (
        ', (SELECT r.facility_id '
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    sm_salt_query = (
        ', (SELECT r.salt_id'
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    reagent_lincs_id_query = (
        ', (SELECT r.lincs_id'
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    # Batch id '0' is the "no batch" placeholder; export it as empty string.
    reagent_batch_id_query = (
        ', (SELECT '
        " CASE WHEN rb.batch_id = '0' THEN '' ELSE rb.batch_id END"
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    reagent_name_query = (
        ', ( SELECT r.name '
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')

    # Short per-type prefixes used in the generated column names.
    prefixes = {
        'small_molecule': 'sm', 'protein': 'pp', 'antibody': 'ab',
        'otherreagent': 'or', 'cell': 'cl', 'primary_cell': 'pc'}

    for dc in reagent_columns:
        prefix = prefixes[dc.data_type]
        if dc.data_type == 'small_molecule':
            # Small molecules get salt id, compound id and LINCS id columns.
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += sm_salt_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name), prefix, 'Salt'))
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_id_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name), prefix, 'CenterCompoundID'))
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_lincs_id_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name), prefix, 'LincsID'))
        else:
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_id_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name), prefix, 'CenterCanonicalID'))
            alias_count += 1
            alias = 'dp_%d' % alias_count
            # Note: remove condition when new protein specification is released
            if dc.data_type != 'protein':
                query_string += reagent_lincs_id_query.format(
                    alias=alias, dc_id=dc.id,
                    column_name='%s_%s%s'
                        % (camel_case_dwg(dc.name), prefix, 'LincsID'))
            else:
                # Proteins have no LINCS id yet; emit an empty literal so the
                # output schema stays uniform.
                # FIX: quote the alias - unquoted identifiers are folded to
                # lower case by PostgreSQL, which broke the camelCase naming
                # used by every other generated column.
                query_string += (', \'\' as "%s_%s%s"' % (
                    camel_case_dwg(dc.name), prefix, 'LincsID'))
        # All reagent types additionally get a batch id and a name column.
        alias_count += 1
        alias = 'dp_%d' % alias_count
        query_string += reagent_batch_id_query.format(
            alias=alias, dc_id=dc.id,
            column_name='%s_%s%s'
                % (camel_case_dwg(dc.name), prefix, 'CenterBatchID'))
        alias_count += 1
        alias = 'dp_%d' % alias_count
        query_string += reagent_name_query.format(
            alias=alias, dc_id=dc.id,
            column_name='%s_%s%s'
                % (camel_case_dwg(dc.name), prefix, 'Name'))

    # Close the CTE; first %s placeholder = dataset_id.
    query_string += """
        from db_dataset dataset
        join db_datarecord datarecord on(datarecord.dataset_id=dataset.id)
        and dataset.id = %s )
        """
    # Outer query: one row per remaining datapoint of each datarecord.
    query_string += 'SELECT drs.* '
    query_string += ', dc.name as "datapointName" '
    query_string += ', dc.unit as "datapointUnit" '
    query_string += (
        ', coalesce(dp.int_value::TEXT'
        ', dp.float_value::TEXT'
        ', dp.text_value) as "%s" ' % 'datapointValue')
    # Second %s placeholder = dataset_id again.
    query_string += """
        FROM drs
        JOIN db_datapoint dp on dp.datarecord_id=drs."datarecordID"
        join db_datacolumn dc on dp.datacolumn_id=dc.id
        where dp.dataset_id = %s
        """
    if dc_ids_to_exclude:
        # Pivoted columns were already emitted in the CTE; skip them here.
        query_string += (
            " and dp.datacolumn_id not in (%s)"
            % ','.join([str(x) for x in dc_ids_to_exclude]))
    query_string += ' order by "datarecordID",dc.id '

    logger.info('query_string: %s' % query_string)
    cursor = connection.cursor()
    cursor.execute(query_string, [dataset_id, dataset_id])
    return cursor
def get_datasetdata_cursor(dataset_id):
    '''
    Build and execute the raw SQL that exports a dataset's datapoints:
    a "drs" CTE pivots timepoint and reagent datacolumns into scalar
    subselect columns per datarecord; remaining datacolumns come back as
    one row per datapoint ("datapointName"/"datapointUnit"/"datapointValue").

    @param dataset_id: primary key of the DataSet to export
    @return: an open database cursor positioned over the result set

    NOTE(review): duplicate definition - an earlier
    get_datasetdata_cursor exists in this module. Being the later `def`,
    THIS version is the one Python binds. It differs from the earlier one:
    no 'primary_cell' data type, no order_by('display_order'), and
    CenterSpecificID/CenterSampleID column naming. Confirm which version
    is intended and delete the other.
    '''
    # Existence check: raises DataSet.DoesNotExist for an unknown id;
    # the instance itself is not used further.
    dataset = DataSet.objects.get(id=dataset_id)
    # Columns whose unit marks them as timepoints.
    timepoint_columns = (
        DataColumn.objects.all()
        .filter(dataset_id=dataset_id)
        .filter(unit__in=['day','hour','minute','second']))
    # Columns that reference reagents (pivoted separately below).
    reagent_columns = (
        DataColumn.objects.filter(dataset_id=dataset_id)
        .filter(data_type__in=[
            'small_molecule','cell','protein','antibody','otherreagent']))
    # Pivoted columns are excluded from the row-per-datapoint outer query.
    dc_ids_to_exclude = [dc.id for dc in timepoint_columns]
    dc_ids_to_exclude.extend([dc.id for dc in reagent_columns])
    # Scalar subselect template for one pivoted datapoint column.
    col_query_string = (
        ', (SELECT '
        ' {column_to_select} '
        ' FROM db_datapoint as {alias} '
        ' WHERE {alias}.datacolumn_id={dc_id} '
        ' AND {alias}.datarecord_id=datarecord.id ) as "{column_name}" ')
    # NOTE(review): dc.unit / dc.description are interpolated via
    # dollar-quoting ($$...$$); a value containing "$$" breaks the SQL.
    timepoint_unit_string = ',$${dc_unit}$$ as "{dc_name}_timepointUnit" '
    timepoint_description_string = (
        ',$${dc_description}$$ as "{dc_name}_timepointDescription" ')
    # Fixed leading columns of the CTE.
    query_string = (
        'WITH drs as ('
        ' SELECT '
        ' datarecord.id as "datarecordID"'
        ', dataset.facility_id as "hmsDatasetID"'
        ', datarecord.plate as "recordedPlate"'
        ', datarecord.well as "recordedWell"'
        ', datarecord.control_type as "controlType"')
    # Each subselect needs a unique table alias: dp_1, dp_2, ...
    alias_count = 0
    for dc in timepoint_columns:
        column_to_select = None
        alias_count += 1
        alias = 'dp_%d'%alias_count
        column_name = '%s_timepoint' % camel_case_dwg(dc.name)
        if(dc.data_type == 'Numeric' or dc.data_type == 'omero_image'):
            # omero_image ids and zero-precision numerics are integers;
            # other numerics are rounded to 2 decimal places.
            if dc.precision == 0 or dc.data_type == 'omero_image':
                column_to_select = "int_value"
            else:
                column_to_select = "round( float_value::numeric, 2 )"
        else:
            column_to_select = "text_value"
        query_string += col_query_string.format(
            column_to_select=column_to_select,
            alias=alias,dc_id=dc.id,column_name=column_name)
        query_string += timepoint_unit_string.format(
            dc_unit=dc.unit, dc_name=camel_case_dwg(dc.name))
        if dc.description:
            query_string += timepoint_description_string.format(
                dc_description=dc.description,
                dc_name=camel_case_dwg(dc.name))
    # Reagent lookups: datapoint -> reagent batch -> reagent attribute.
    reagent_id_query = (
        ', (SELECT r.facility_id '
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    sm_salt_query = (
        ', (SELECT r.salt_id'
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    sm_lincs_id_query = (
        ', (SELECT r.lincs_id'
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    # Batch id '0' is the "no batch" placeholder; export it as empty string.
    reagent_batch_id_query = (
        ', (SELECT '
        " CASE WHEN rb.batch_id = '0' THEN '' ELSE rb.batch_id END"
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    reagent_name_query = (
        ', ( SELECT r.name '
        ' FROM db_reagent r '
        ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
        ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
        ' WHERE {alias}.datarecord_id=datarecord.id '
        ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" ')
    # Short per-type prefixes used in the generated column names.
    prefixes = {
        'small_molecule': 'sm', 'protein': 'pp', 'antibody': 'ab',
        'otherreagent': 'or', 'cell': 'cl'}
    for dc in reagent_columns:
        prefix = prefixes[dc.data_type]
        if dc.data_type == 'small_molecule':
            # Small molecules get salt, compound id and LINCS id columns.
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += sm_salt_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name),prefix,'Salt'))
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_id_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name),prefix,'CenterCompoundID'))
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += sm_lincs_id_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name),prefix,'LincsID'))
        else:
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_id_query.format(
                alias=alias, dc_id=dc.id,
                column_name='%s_%s%s'
                    % (camel_case_dwg(dc.name),prefix,'CenterSpecificID'))
        # All reagent types additionally get a batch id and a name column.
        alias_count += 1
        alias = 'dp_%d' % alias_count
        query_string += reagent_batch_id_query.format(
            alias=alias, dc_id=dc.id,
            column_name='%s_%s%s'
                % (camel_case_dwg(dc.name),prefix,'CenterSampleID'))
        alias_count += 1
        alias = 'dp_%d' % alias_count
        query_string += reagent_name_query.format(
            alias=alias, dc_id=dc.id,
            column_name='%s_%s%s'
                % (camel_case_dwg(dc.name),prefix,'Name'))
    # Close the CTE; first %s placeholder = dataset_id.
    query_string += """
        from db_dataset dataset
        join db_datarecord datarecord on(datarecord.dataset_id=dataset.id)
        and dataset.id = %s )
        """
    # Outer query: one row per remaining datapoint of each datarecord.
    query_string += 'SELECT drs.* '
    query_string += ', dc.name as "datapointName" '
    query_string += ', dc.unit as "datapointUnit" '
    query_string += (
        ', coalesce(dp.int_value::TEXT, dp.float_value::TEXT, dp.text_value) as "%s" '
        % 'datapointValue')
    # Second %s placeholder = dataset_id again.
    query_string += """
        FROM drs
        JOIN db_datapoint dp on dp.datarecord_id=drs."datarecordID"
        join db_datacolumn dc on dp.datacolumn_id=dc.id
        where dp.dataset_id = %s
        """
    if dc_ids_to_exclude:
        # Pivoted columns were already emitted in the CTE; skip them here.
        query_string += (
            " and dp.datacolumn_id not in (%s)"
            % ','.join([str(x) for x in dc_ids_to_exclude]))
    query_string +=' order by "datarecordID",dc.id '
    logger.info('query_string: %s' % query_string)
    cursor = connection.cursor()
    cursor.execute(query_string, [dataset_id,dataset_id])
    return cursor
final_val = util.convertdata( val,type_lookup.get(recognized_label, None)) if final_val != None: dc_dict[recognized_label] = final_val if recognized_label == 'display_order': # add 10 to the order, so default reagent cols can go first dc_dict['display_order'] = ( dc_dict['display_order'] + 10) if recognized_label == 'name': # split on non-alphanumeric chars temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name']) # convert, if needed if len(temp) > 1: dc_dict['name'] = camel_case_dwg(dc_dict['name']) else: if recognized_label in required_labels: raise Exception( 'Error, data column field is required: %s, col: %r' % ( recognized_label, colname(j+1) ) ) else: logger.debug( 'unrecognized label in "Data Columns" sheet %r' % label_read) for dc_dict in dc_definitions: for label in required_labels: if label not in dc_dict: raise Exception( 'required "Data Column" label not defined %r' % label)
def read_datacolumns(book):
    '''
    Read the "Data Columns" sheet of an xlrd workbook and match the
    definitions against the header row of the "Data" sheet.

    @param book: an open xlrd workbook containing "Data Columns" and
        "Data" sheets
    @return an array of data column definition dicts (keys are DataColumn
        field names such as 'name', 'data_type', 'worksheet_column', ...)
    @raise Exception: if a required field is blank for a column, or a
        required label is missing entirely

    NOTE(review): this function is redefined later in this module; the
    later definition is the one Python binds at import time.
    '''
    data_column_sheet = book.sheet_by_name('Data Columns')
    # Sheet row labels -> DataColumn model field names.
    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }
    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    # Per-field converters so cell values land with the model field's type.
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    # Sheet layout is transposed: each ROW is one attribute, each data
    # COLUMN (from column B on) is one data column definition.
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        if i == 0:
            # One definition dict per data column found in the first row.
            for val in row_values[1:]:
                dc_definitions.append({})
        label_read = row_values[0]
        # Case-insensitive match of the row label against known labels.
        recognized_label = next(
            (field_name for label, field_name in labels.items()
                if label_read and label.lower() == label_read.lower()),
            None)
        if recognized_label:
            logger.debug('label: %r, recognized_label: %r'
                % (label_read, recognized_label))
            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]
                logger.debug('data column %s:%d:%d:%r'
                    % (recognized_label, i, j, val))
                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))
                # FIX: compare to None by identity (was `!= None`), per PEP 8.
                if final_val is not None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r' %
                            (recognized_label, colname(j + 1)))
        else:
            logger.debug('unrecognized label in "Data Columns" sheet %r'
                % label_read)
    # Every definition must have all required fields by now.
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')
    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):
        # A blank header cell terminates the data column area.
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        data_label = data_label.upper()
        col_letter = colname(i)
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                # Explicit column letter wins over name matching.
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            elif 'name' in dc_dict or 'display_name' in dc_dict:
                # Otherwise match the header text against name/display_name.
                if (dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            if _dict and 'display_order' not in _dict:
                # Derive a display order from the sheet position.
                _dict['display_order'] = i + 10
                # FIX: logger.warn is a deprecated alias of logger.warning.
                logger.warning(
                    'auto assigning "display_order" for col %r as %d'
                    % (_dict['name'], i + 10))
        if i not in data_labels_found:
            # Fall back to the built-in default reagent column definitions.
            logger.debug(('Data sheet label not found %r,'
                ' looking in default reagent definitions %s')
                % (data_label, default_reagent_columns.keys()))
            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
    # Report headers that matched neither definitions nor meta columns.
    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip()
        and i not in data_labels_found
        and data_label not in meta_columns]
    if data_labels_not_found:
        logger.warning(
            'data sheet labels not recognized %s' % data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    logger.info('data column definitions found: %s'
        % [x['display_name'] for x in dc_definitions_found])
    return dc_definitions_found
def read_datacolumns(book):
    '''
    Read the "Data Columns" sheet of an xlrd workbook and match the
    definitions against the header row of the "Data" sheet.

    @param book: an open xlrd workbook containing "Data Columns" and
        "Data" sheets
    @return an array of data column definition dicts

    NOTE(review): duplicate definition - an earlier read_datacolumns with
    the same logic exists in this module. Being the later `def`, THIS
    version is the one Python binds at import time; one of the two copies
    should be removed.
    '''
    data_column_sheet = book.sheet_by_name('Data Columns')
    # Sheet row labels -> DataColumn model field names.
    labels = {
        'Worksheet Column':'worksheet_column',
        '"Data" Worksheet Column':'worksheet_column',
        'Display Order':'display_order',
        'Display Name':'display_name',
        'Name':'name',
        'Data Type':'data_type',
        'Decimal Places':'precision',
        'Description':'description',
        'Replicate Number':'replicate',
        'Unit':'unit',
        'Assay readout type':'readout_type',
        'Comments':'comments',
    }
    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    # Per-field converters so cell values land with the model field's type.
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']
    logger.info('read the data column definitions...')
    # Sheet layout is transposed: each ROW is one attribute, each data
    # COLUMN (from column B on) is one data column definition.
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        if i == 0:
            # One definition dict per data column found in the first row.
            for val in row_values[1:]:
                dc_definitions.append({})
        label_read = row_values[0]
        # Case-insensitive match of the row label against known labels.
        recognized_label = next(
            (field_name for label, field_name in labels.items()
                if label_read and label.lower() == label_read.lower() ),
            None)
        if recognized_label:
            logger.debug(
                'label: %r, recognized_label: %r'
                % (label_read, recognized_label))
            for j,val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]
                logger.debug('data column %s:%d:%d:%r' % (
                    recognized_label, i, j, val))
                final_val = util.convertdata(
                    val,type_lookup.get(recognized_label, None))
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                            % ( recognized_label, colname(j+1) ) )
        else:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r' % label_read)
    # Every definition must have all required fields by now.
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)
    logger.info('find the data columns on the "Data" sheet...')
    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i,data_label in enumerate(data_sheet_labels):
        # A blank header cell terminates the data column area.
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        data_label = data_label.upper()
        col_letter = colname(i)
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                # Explicit column letter wins over name matching.
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            elif 'name' in dc_dict or 'display_name' in dc_dict:
                # Otherwise match the header text against name/display_name.
                if ( dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            if _dict and 'display_order' not in _dict:
                # Derive a display order from the sheet position.
                _dict['display_order'] = i+10
                logger.warn(
                    'auto assigning "display_order" for col %r as %d'
                    % (_dict['name'], i+10))
        if i not in data_labels_found:
            # Fall back to the built-in default reagent column definitions.
            logger.debug(
                ( 'Data sheet label not found %r,'
                  ' looking in default reagent definitions %s' )
                % ( data_label, default_reagent_columns.keys() ) )
            for key,dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
    # Report headers that matched neither definitions nor meta columns.
    data_labels_not_found = [
        data_label for i,data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip()
        and i not in data_labels_found
        and data_label not in meta_columns ]
    if data_labels_not_found:
        logger.warn(
            'data sheet labels not recognized %s' % data_labels_not_found )
    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])
    logger.info('data column definitions found: %s'
        % [x['display_name'] for x in dc_definitions_found])
    return dc_definitions_found