def _read_small_molecule_batch(map_column, r, current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        value = util.convertdata(r[map_column].strip())
        if(value != None and value != ''):
            value = value.split("-")
            if len(value) < 2:
                raise Exception('Small Molecule (Batch) format is '
                                '#####-###(-#) **Note that (batch) is optional')
            x = value[0]
            facility = util.convertdata(x, int)
            salt = value[1]
            try:
                dr.smallmolecule = SmallMolecule.objects.get(
                    facility_id=facility, salt_id=salt)
            except Exception, e:
                logger.error(str(('could not locate small molecule:', facility, e)))
                raise
            if(len(value) > 2):
                dr.batch_id = util.convertdata(value[2], int)
                # TODO: validate that the batch exists? (would need to
                # do for all types, not just Small Molecule)
    except Exception, e:
        logger.error(str((
            "Invalid Small Molecule (or batch) identifiers: ", value,
            'row', current_row, e)))
        raise
def _create_datapoint(dataColumn, dataset, dataRecord, value):
    dataPoint = None
    # TODO: define allowed "types" for the input sheet
    # (this is listed in current SS code, but we may want to rework)
    if (dataColumn.data_type == 'Numeric'):
        if (dataColumn.precision != 0):  # float, TODO: set precision
            dataPoint = DataPoint(datacolumn=dataColumn,
                                  dataset=dataset,
                                  datarecord=dataRecord,
                                  float_value=util.convertdata(value, float))
        else:
            dataPoint = DataPoint(datacolumn=dataColumn,
                                  dataset=dataset,
                                  datarecord=dataRecord,
                                  int_value=util.convertdata(value, int))
    elif (dataColumn.data_type == 'omero_image'):
        dataPoint = DataPoint(datacolumn=dataColumn,
                              dataset=dataset,
                              datarecord=dataRecord,
                              int_value=util.convertdata(value, int))
    else:
        # ONLY text, for now; we'll need to define the allowed types, next!
        dataPoint = DataPoint(datacolumn=dataColumn,
                              dataset=dataset,
                              datarecord=dataRecord,
                              text_value=util.convertdata(value))
    return dataPoint
def main(path): """ Read in the Library and LibraryMapping sheets """ libraries = readLibraries(path, 'Library') labels = { 'Facility': 'facility_id', 'Salt': 'sm_salt', 'Batch': 'facility_batch_id', 'Plate': 'plate', 'Well': 'well', 'Library Name': 'short_name', 'Concentration': 'concentration', 'Concentration Unit': 'concentration_unit' } small_molecule_lookup = ('facility_id', 'sm_salt', 'facility_batch_id') sheet = iu.readtable([path, 'LibraryMapping']) #dict to map spreadsheet fields to terms cols = {} # first put the label row in (it contains the worksheet column, and its unique) for i, label in enumerate(sheet.labels): if label in labels: cols[labels[label]] = i else: print 'Note: column label not found:', label rows = 0 for row in sheet: r = util.make_row(row) # small molecule dict = {} for field in small_molecule_lookup: dict[field] = util.convertdata(r[cols[field]], int) try: dict['facility_id'] = 'HMSL' + str( dict['facility_id'] ) # TODO: convert all hmsl id's to integers!! sm = SmallMolecule.objects.get(**dict) except Exception, e: print "Invalid small molecule identifiers: ", dict raise short_name = r[cols['short_name']] if short_name not in libraries: print "Library not found: ", short_name raise lm = {} lm['concentration'] = util.convertdata(r[cols['concentration']], float) lm['concentration_unit'] = util.convertdata( r[cols['concentration_unit']], None) lm['plate'] = util.convertdata(r[cols['plate']], int) lm['well'] = r[cols['well']] lm['small_molecule'] = sm lm['library'] = libraries[short_name] lm = LibraryMapping(**lm) lm.save() rows += 1
def _read_plate_well(map_column, r, current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        plate_id = None
        well_id = None
        value = util.convertdata(r[map_column].strip())
        if(value != None and value != ''):
            plate_id = util.convertdata(value, int)
            value = util.convertdata(r[map_column + 1].strip())
            if(value != None and value != ''):
                well_id = value
            else:
                raise Exception(str((
                    'Must define both plate and well (not just plate), row',
                    current_row)))
            dr.plate = plate_id
            dr.well = well_id
            try:
                # TODO:
                # What if the plate/well does not correlate to a librarymapping?
                # i.e. if this is the plate/well for a cell/protein study?
                # For now, the effect of the following logic is that plate/well
                # either maps to a librarymapping, or is an arbitrary plate/well.
                dr.library_mapping = \
                    LibraryMapping.objects.get(plate=plate_id, well=well_id)
                if(dr.smallmolecule != None):
                    if(dr.smallmolecule != None
                       and dr.library_mapping.smallmolecule_batch != None
                       and (dr.smallmolecule !=
                            dr.library_mapping.smallmolecule_batch.smallmolecule)):
                        raise Exception(str((
                            'SmallMolecule does not match the '
                            'libraryMapping.smallmolecule_batch.smallmolecule '
                            'pointed to by the plate/well:', plate_id, well_id,
                            dr.smallmolecule,
                            dr.library_mapping.smallmolecule_batch.smallmolecule,
                            r, 'row', current_row)))
                elif(dr.library_mapping.smallmolecule_batch != None):
                    dr.smallmolecule = \
                        dr.library_mapping.smallmolecule_batch.smallmolecule
            except ObjectDoesNotExist, e:
                logger.warn(str((
                    'No librarymapping defined (plate/well do not point to a '
                    'librarymapping), row', current_row)))
    except Exception, e:
        logger.error(str(("Invalid plate/well identifiers", plate_id, well_id,
                          r, 'row', current_row, e)))
        raise e
def _read_plate_well(map_column, r, current_row, dr, small_mol_col,
                     small_molecule_datapoint, datapoint_batch):
    plate_id = None
    well_id = None
    try:
        value = util.convertdata(r[map_column].strip())
        if (value != None and value != ''):
            plate_id = util.convertdata(value, int)
            value = util.convertdata(r[map_column + 1].strip())
            if (value != None and value != ''):
                well_id = value
            else:
                raise Exception(
                    'Must define both plate and well (not just plate), row: %d'
                    % current_row)
            dr.plate = plate_id
            dr.well = well_id
            dr.library_mapping = LibraryMapping.objects.get(
                plate=plate_id, well=well_id)
            # Legacy loading use-case:
            # - if small molecule already specified, check that it is the same
            # - if no small molecule specified yet, associate the plate:well
            #   small molecule with a datapoint and the dataset
            if (dr.library_mapping.smallmolecule_batch != None):
                if small_molecule_datapoint and small_molecule_datapoint.reagent_batch:
                    if (small_molecule_datapoint.reagent_batch
                            != dr.library_mapping.smallmolecule_batch):
                        raise Exception(
                            ('plate:well entry %s '
                             'does not match small molecule %r, row: %s')
                            % (well_id, dr.library_mapping.smallmolecule_batch,
                               current_row))
                else:
                    dr.dataset.small_molecules.add(
                        dr.library_mapping.smallmolecule_batch)
                    text_value = \
                        dr.library_mapping.smallmolecule_batch.reagent.facility_id
                    text_value += '-%s' % \
                        dr.library_mapping.smallmolecule_batch.reagent.salt_id
                    if dr.library_mapping.smallmolecule_batch.batch_id != 0:
                        text_value += '-%s' % \
                            dr.library_mapping.smallmolecule_batch.batch_id
                    datapoint = DataPoint(
                        datacolumn=small_mol_col,
                        dataset=dr.dataset,
                        datarecord=dr,
                        reagent_batch=dr.library_mapping.smallmolecule_batch,
                        text_value=text_value)
                    datapoint_batch.append(datapoint)
    except Exception, e:
        logger.exception(('Invalid plate/row information, '
                          'plate: %r, well: %r, data: %s, row_number: %d')
                         % (plate_id, well_id, r, current_row))
        raise e
def main(path): """ Read in the Library and LibraryMapping sheets """ libraries = readLibraries(path,'Library') labels = { 'Facility':'facility_id', 'Salt':'sm_salt', 'Batch':'facility_batch_id', 'Plate':'plate', 'Well':'well', 'Library Name':'short_name', 'Concentration': 'concentration', 'Concentration Unit':'concentration_unit' } small_molecule_lookup = ('facility_id', 'sm_salt', 'facility_batch_id') sheet = iu.readtable([path, 'LibraryMapping']) #dict to map spreadsheet fields to terms cols = {} # first put the label row in (it contains the worksheet column, and its unique) for i,label in enumerate(sheet.labels): if label in labels: cols[labels[label]] = i else: print 'Note: column label not found:', label rows = 0 for row in sheet: r = util.make_row(row) # small molecule dict = {} for field in small_molecule_lookup: dict[field] = util.convertdata(r[cols[field]],int) try: dict['facility_id'] = 'HMSL' + str(dict['facility_id']) # TODO: convert all hmsl id's to integers!! sm = SmallMolecule.objects.get(**dict) except Exception, e: print "Invalid small molecule identifiers: ", dict raise short_name = r[cols['short_name']] if short_name not in libraries: print "Library not found: ", short_name raise lm = {} lm['concentration'] = util.convertdata(r[cols['concentration']],float) lm['concentration_unit'] = util.convertdata(r[cols['concentration_unit']],None) lm['plate'] = util.convertdata(r[cols['plate']], int) lm['well'] = r[cols['well']] lm['small_molecule'] = sm lm['library'] = libraries[short_name] lm = LibraryMapping(**lm) lm.save() rows += 1
def _read_protein(map_column, r, current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        value = util.convertdata(r[map_column].strip())
        if(value != None and value != ''):
            facility_id = r[map_column]
            facility_id = util.convertdata(facility_id, int)
            dr.protein = Protein.objects.get(lincs_id=facility_id)
    except Exception, e:
        logger.error(str((
            "Invalid Protein facility id: ", value, 'row', current_row, e)))
        raise
def _read_cell(map_column, r, current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        value = util.convertdata(r[map_column].strip())
        facility_id = None
        if(value != None and value != ''):
            facility_id = util.convertdata(value, int)
            dr.cell = Cell.objects.get(facility_id=facility_id)
    except Exception, e:
        logger.error(str(("Invalid Cell facility id: ", facility_id,
                          'row', current_row, e)))
        raise
def read_explicit_reagents(book, dataset):
    try:
        reagents_sheet = book.sheet_by_name('Reagents')
        for row in range(1, reagents_sheet.nrows):
            facility_batch_id = read_string(reagents_sheet.cell(row, 0))
            vals = [util.convertdata(x, int)
                    for x in facility_batch_id.split('-')]
            logger.info('facility_batch_id: %r', vals)
            if len(vals) > 3:
                raise Exception(
                    'Reagent id has too many values: %r', facility_batch_id)
            if (len(vals) == 3):
                smb = SmallMoleculeBatch.objects.get(
                    reagent__facility_id=vals[0],
                    reagent__salt_id=vals[1],
                    batch_id=vals[2])
                logger.info('small molecule batch found: %r', smb)
                dataset.small_molecules.add(smb)
            else:
                if len(vals) == 2:
                    if len(str(vals[1])) == 3:
                        smb = SmallMoleculeBatch.objects.get(
                            reagent__facility_id=vals[0],
                            reagent__salt_id=vals[1],
                            batch_id=0)
                        logger.info('small molecule batch found: %r', smb)
                        dataset.small_molecules.add(smb)
                        continue
                    rb = ReagentBatch.objects.get(
                        reagent__facility_id=vals[0],
                        batch_id=vals[1])
                else:
                    rb = ReagentBatch.objects.get(
                        reagent__facility_id=vals[0],
                        batch_id=0)
                if hasattr(rb, 'antibodybatch'):
                    logger.info('antibody reagent found: %r', rb)
                    dataset.antibodies.add(rb.antibodybatch)
                elif hasattr(rb, 'cellbatch'):
                    logger.info('cell reagent found: %r', rb)
                    dataset.cells.add(rb.cellbatch)
                elif hasattr(rb, 'otherreagentbatch'):
                    logger.info('other_reagent reagent found: %r', rb)
                    dataset.other_reagents.add(rb.otherreagentbatch)
                elif hasattr(rb, 'primarycellbatch'):
                    logger.info('primary cell reagent found: %r', rb)
                    dataset.primary_cells.add(rb.primarycellbatch)
                elif hasattr(rb, 'proteinbatch'):
                    logger.info('protein reagent found: %r', rb)
                    dataset.proteins.add(rb.proteinbatch)
                else:
                    raise Exception('unknown reagent type: %r', rb)
        dataset.save()
    except XLRDError, e:
        logger.info('no "Reagents" sheet found')
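# For reference, a minimal standalone sketch (not part of the loader) of the
# identifier-disambiguation rule used by read_explicit_reagents above; the
# helper name and the example IDs are illustrative only, and plain int() is
# assumed in place of util.convertdata.
def classify_reagent_id_example(facility_batch_id):
    # 'F-S-B' (three parts)             -> small molecule batch
    # 'F-SSS' (two parts, 3-digit salt) -> small molecule, batch 0
    # 'F-B'   (two parts)               -> generic reagent batch
    # 'F'     (one part)                -> generic reagent batch, batch 0
    vals = [int(x) for x in facility_batch_id.split('-')]
    if len(vals) > 3:
        raise Exception('Reagent id has too many values: %r' % facility_batch_id)
    if len(vals) == 3:
        return ('small_molecule_batch', vals[0], vals[1], vals[2])
    if len(vals) == 2 and len(str(vals[1])) == 3:
        return ('small_molecule_batch', vals[0], vals[1], 0)
    if len(vals) == 2:
        return ('reagent_batch', vals[0], vals[1])
    return ('reagent_batch', vals[0], 0)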
def read_metadata(path): """ Read in the DataSets, Datacolumns, and Data sheets. In the Data sheet, rows are DataRecords, and columns are DataPoints """ # Read in the DataSet sheetname = 'Meta' # Define the Column Names -> model fields mapping labels = {'Lead Screener First': 'lead_screener_firstname', 'Lead Screener Last': 'lead_screener_lastname', 'Lead Screener Email': 'lead_screener_email', 'Lab Head First': 'lab_head_firstname', 'Lab Head Last': 'lab_head_lastname', 'Lab Head Email': 'lab_head_email', 'Title': 'title', 'Facility ID': 'facility_id', 'Summary': 'summary', 'Protocol': 'protocol', 'References': 'protocol_references'} metaSheet = iu.readtable([path, sheetname]) # Note, skipping the header row by default metaData = {} for row in metaSheet: rowAsUnicode = util.make_row(row) for key,value in labels.items(): if re.match(key, rowAsUnicode[0], re.M|re.I): if key == 'Facility ID': metaData[value] = util.convertdata(rowAsUnicode[1],int) else: metaData[value] = rowAsUnicode[1] assert len(metaData) == len(labels), 'Meta data sheet does not contain the necessary keys, expected: %s, read: %s' % [labels, metaData] return metaData
def readLibraries(path, sheetName):
    sheet = iu.readtable([path, sheetName])  # Note, skipping the header row by default

    # dict to map spreadsheet fields to the Library fields
    properties = ('model_field', 'required', 'default', 'converter')
    date_parser = lambda x: util.convertdata(x, date)
    column_definitions = {
        'Name': ('name', True),  # TODO use the model to determine if req'd
        'ShortName': ('short_name', True),
        'Library Type': 'type',
        'Date First Plated': ('date_first_plated', False, None, date_parser),
        'Date Data Received': ('date_data_received', False, None, date_parser),
        'Date Loaded': ('date_loaded', False, None, date_parser),
        'Date Publicly Available':
            ('date_publicly_available', False, None, date_parser),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False)
    }
    # convert the labels to fleshed out dict's, with strategies for
    # optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    libraries = {}
    for row in sheet:
        logger.debug(str(('row raw: ', row)))
        r = util.make_row(row)
        logger.debug(str(('row: ', r)))
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            properties = cols[i]
            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if(converter != None):
                value = converter(value)
            if(value == None):
                if(default != None):
                    value = default
            if(value == None and required == True):
                raise Exception('Field is required: %s, record: %d'
                                % (properties['column_label'], rows))
            logger.debug(str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            library = Library(**initializer)
            library.save()
            logger.info(str(('library created', library)))
            libraries[library.short_name] = library
            rows += 1
        except Exception, e:
            logger.error(str(('library initializer problem: ', initializer)))
            raise e
    return libraries
def _create_datapoint(datacolumn, dataset, datarecord, value):
    datapoint = None
    if datacolumn.data_type == 'Numeric':
        if datacolumn.precision != 0:
            datapoint = DataPoint(datacolumn=datacolumn,
                                  dataset=dataset,
                                  datarecord=datarecord,
                                  float_value=util.convertdata(value, float))
        else:
            datapoint = DataPoint(datacolumn=datacolumn,
                                  dataset=dataset,
                                  datarecord=datarecord,
                                  int_value=util.convertdata(value, int))
    elif datacolumn.data_type == 'omero_image':
        datapoint = DataPoint(datacolumn=datacolumn,
                              dataset=dataset,
                              datarecord=datarecord,
                              int_value=util.convertdata(value, int))
    else:
        logger.debug(
            'create datapoint for %r, datarecord: %s' % (value, datarecord))
        datapoint = DataPoint(datacolumn=datacolumn,
                              dataset=dataset,
                              datarecord=datarecord,
                              text_value=util.convertdata(value))
        if datacolumn.data_type == 'small_molecule':
            _read_small_molecule(dataset, datapoint)
        elif datacolumn.data_type == 'protein':
            _read_protein(dataset, datapoint)
        elif datacolumn.data_type == 'antibody':
            _read_antibody(dataset, datapoint)
        elif datacolumn.data_type == 'other_reagent':
            _read_other_reagent(dataset, datapoint)
        elif datacolumn.data_type == 'cell':
            _read_cell_batch(dataset, datapoint)
        elif datacolumn.data_type == 'primary_cell':
            _read_primary_cell_batch(dataset, datapoint)
    return datapoint
def _create_datapoint(datacolumn, dataset, datarecord, value):
    datapoint = None
    if datacolumn.data_type == 'Numeric':
        if datacolumn.precision != 0:
            datapoint = DataPoint(datacolumn=datacolumn,
                                  dataset=dataset,
                                  datarecord=datarecord,
                                  float_value=util.convertdata(value, float))
        else:
            datapoint = DataPoint(datacolumn=datacolumn,
                                  dataset=dataset,
                                  datarecord=datarecord,
                                  int_value=util.convertdata(value, int))
    elif datacolumn.data_type == 'omero_image':
        datapoint = DataPoint(datacolumn=datacolumn,
                              dataset=dataset,
                              datarecord=datarecord,
                              int_value=util.convertdata(value, int))
    else:
        logger.debug('create datapoint for %r, datarecord: %s'
                     % (value, datarecord))
        datapoint = DataPoint(datacolumn=datacolumn,
                              dataset=dataset,
                              datarecord=datarecord,
                              text_value=util.convertdata(value))
        if datacolumn.data_type == 'small_molecule':
            _read_small_molecule(dataset, datapoint)
        elif datacolumn.data_type == 'protein':
            _read_protein(dataset, datapoint)
        elif datacolumn.data_type == 'antibody':
            _read_antibody(dataset, datapoint)
        elif datacolumn.data_type == 'other_reagent':
            _read_other_reagent(dataset, datapoint)
        elif datacolumn.data_type == 'cell':
            _read_cell_batch(dataset, datapoint)
    return datapoint
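# A minimal illustration (not used by the loaders) of which DataPoint field
# each data_type is routed to by _create_datapoint above; plain values stand
# in for the Django models, and the function name is hypothetical.
def datapoint_field_for_example(data_type, precision):
    # Numeric columns with a nonzero precision store a float_value,
    # zero-precision numerics and omero_image ids store an int_value,
    # and every other type (including the reagent types) stores a text_value.
    if data_type == 'Numeric':
        return 'float_value' if precision != 0 else 'int_value'
    if data_type == 'omero_image':
        return 'int_value'
    return 'text_value'

assert datapoint_field_for_example('Numeric', 2) == 'float_value'
assert datapoint_field_for_example('Numeric', 0) == 'int_value'
assert datapoint_field_for_example('small_molecule', 0) == 'text_value'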
def readLibraries(path, sheetName):
    sheet = iu.readtable([path, sheetName])  # Note, skipping the header row by default

    # dict to map spreadsheet fields to the Library fields
    labels = {
        'Name': 'name',
        'ShortName': 'short_name',
        'Date First Plated': 'date_first_plated',
        'Date Data Received': 'date_data_received',
        'Date Loaded': 'date_loaded',
        'Date Publicly Available': 'date_publicly_available'
    }
    date_parser = lambda x: util.convertdata(x, date)
    converters = {
        'date_first_plated': date_parser,
        'date_loaded': date_parser,
        'date_data_received': date_parser,
        'date_publicly_available': date_parser
    }

    cols = {}
    # first put the label row in (it contains the worksheet column, and it's unique)
    for i, label in enumerate(sheet.labels):
        if label in labels:
            cols[i] = labels[label]
        else:
            print 'Note: column label not found:', label
            raise

    rows = 0
    i = 0
    libraries = {}
    for row in sheet:
        r = util.make_row(row)
        dict = {}
        for i, value in enumerate(r):
            if cols[i] in converters:
                value = converters[cols[i]](value)
            dict[cols[i]] = value
        try:
            print 'create library:', dict
            library = Library(**dict)
            library.save()
            libraries[library.short_name] = library
            rows += 1
        except Exception, e:
            print "Invalid Library, name: ", r[0]
            raise
    return libraries
def _parse_reagent_batch(text_value):
    '''
    Split text_value on the dash character, convert each element to an integer
    '''
    vals = [util.convertdata(x, int) for x in text_value.split('-')]
    if len(vals) > 2:
        raise Exception(
            'invalid reagent-batch ID value, too many identifiers: %r'
            % text_value)
    facility_id = vals[0]
    batch_id = 0
    if len(vals) == 2:
        batch_id = vals[1]
    parsed_text = '-'.join([str(x) for x in vals])
    return (facility_id, batch_id, parsed_text)
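# Expected behavior of _parse_reagent_batch above, shown as a usage sketch
# (assuming util.convertdata simply casts non-empty strings to int); the
# example identifiers are illustrative only.
#   _parse_reagent_batch('300001-1') == (300001, 1, '300001-1')
#   _parse_reagent_batch('300001')   == (300001, 0, '300001')
#   _parse_reagent_batch('1-2-3')    -> raises Exception (too many identifiers)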
def readDataColumns(path):
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    labels = {'Worksheet Column': 'worksheet_column',
              'Display Order': 'display_order',
              'Name': 'name',
              'Data Type': 'data_type',
              'Decimal Places': 'precision',
              'Description': 'description',
              'Replicate Number': 'replicate',
              'Time point': 'time_point',
              'Assay readout type': 'readout_type',
              'Comments': 'comments'}

    # create an array of dict's, each dict defines a DataColumn
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and it's unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']: v})

    # now, for each row, create the appropriate dictionary entry in the
    # dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i, cellText in enumerate(rowAsUnicode[1:]):
            for key, fieldName in labels.items():
                if re.match(key, keyRead, re.M | re.I):
                    # if the row is one of the DataColumn fields, then add it to
                    # the dict; convert the data to the model field type
                    dataColumnDefinitions[i][fieldName] = util.convertdata(
                        cellText, _typelookup.get(fieldName, None))
                else:
                    logger.debug(str((
                        '"Data Column definition not used: ', cellText)))
                    pass
    logger.debug(str(("definitions: ", dataColumnDefinitions)))
    return dataColumnDefinitions
def readDataColumns(path):
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    labels = {'Worksheet Column': 'worksheet_column',
              'Name': 'name',
              'Data Type': 'data_type',
              'Decimal Places': 'precision',
              'Description': 'description',
              'Replicate Number': 'replicate',
              'Time point': 'time_point',
              'Assay readout type': 'readout_type',
              'Comments': 'comments'}

    # create an array of dict's, each dict defines a DataColumn
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and it's unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']: v})

    # now, for each row, create the appropriate dictionary entry in the
    # dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i, cellText in enumerate(rowAsUnicode[1:]):
            for key, fieldName in labels.items():
                if re.match(key, keyRead, re.M | re.I):
                    # if the row is one of the DataColumn fields, then add it to
                    # the dict; convert the data to the model field type
                    dataColumnDefinitions[i][fieldName] = util.convertdata(
                        cellText, _typelookup.get(fieldName, None))
                else:
                    pass  # print '"Data Column definition not used: ', cellText
    print "definitions: ", dataColumnDefinitions
    return dataColumnDefinitions
def main(path): """ Read in the sdf file """ # map field labels to model fields properties = ('model_field','required','default','converter') get_primary_name = lambda x: x.split(';')[0].strip() get_alternate_names = lambda x: ';'.join([x.strip() for x in x.split(';')[1:]]) labels = { s2p.MOLDATAKEY:('molfile',True), # NOTE: even though these db field are not integers, # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values 'facility_reagent_id': ('facility_id',True,None, lambda x: util.convertdata(x[x.index('HMSL')+4:],int)), 'salt_id': ('salt_id',True,None, lambda x: util.convertdata(x,int)), 'lincs_id':('lincs_id',False), #None,lambda x:util.convertdata(x,int)), 'chemical_name':('name',True), 'alternative_names':'alternative_names', 'pubchem_cid':'pubchem_cid', 'chembl_id':'chembl_id', 'chebi_id':'chebi_id', 'inchi':'_inchi', 'inchi_key':'_inchi_key', 'smiles': ('_smiles',True), 'molecular_mass':('_molecular_mass',False,None, lambda x: round(util.convertdata(x, float),2)), 'molecular_formula':'_molecular_formula', 'software':'software', # 'concentration':'concentration', #'well_type':('well_type',False,'experimental'), 'is_restricted':('is_restricted',False,False,util.bool_converter)} # convert the labels to fleshed out dict's, with strategies for optional, default and converter labels = util.fill_in_column_definitions(properties,labels) assert typecheck.isstring(path) with open(path) as fh: data = fh.read().decode(DEFAULT_ENCODING) records = s2p.parse_sdf(data) logger.info(str(('read rows: ', len(records)))) count = 0 for record in records: logger.debug(str(('record', record))) initializer = {} for key,properties in labels.items(): logger.debug(str(('look for key: ', key, ', properties: ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] value = record.get(key) # Todo, refactor to a method try: logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == 'n/a'): value = None if(value == None and required == True): raise Exception(str(('Field is required: ', key, initializer, 'record:', count))) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) initializer[model_field] = value except Exception, e: exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] logger.error(str((exc_type, fname, exc_tb.tb_lineno))) logger.error(str(('invalid input', e, 'count', count))) raise e # follows is a kludge, to split up the entered "chemical_name" field, on ';' - TODO: just have two fields that get entered if(initializer['name']): initializer['alternative_names']=get_alternate_names(initializer['name']) initializer['name']=get_primary_name(initializer['name']) if(logger.isEnabledFor(logging.DEBUG)): logger.debug(str(('initializer: ', initializer))) try: sm = SmallMolecule(**initializer) sm.save() logger.info(str(('sm created:', sm))) count += 1 except Exception, e: logger.error(str(('save failed for: ', initializer, 'error',e, 'count: ', count))) raise e
raise Exception('at least one of: ' + str(mappingColumnDict.keys())
                + ' must be defined and used in the Data sheet.')

# Read in the Data sheet, create DataPoint values for mapped column in each row
logger.info(str(('data sheet columns identified, read rows, save_interval:',
                 save_interval)))
loopStart = time.time()
pointsSaved = 0
rowsRead = 0
for row in dataSheet:
    current_row = rowsRead + 2
    r = util.make_row(row)
    dataRecord = DataRecord(dataset=dataset)
    map_column = mappingColumnDict['Small Molecule Batch']
    mapped = False
    if(map_column > -1):
        try:
            value = util.convertdata(r[map_column].strip())
            if(value != None and value != ''):
                value = value.split("-")
                if len(value) < 2:
                    raise Exception('Small Molecule (Batch) format is '
                                    '#####-###(-#) **Note that (batch) is optional')
                x = value[0]
                facility = util.convertdata(x, int)
                salt = value[1]
                try:
                    dataRecord.smallmolecule = SmallMolecule.objects.get(
                        facility_id=facility, salt_id=salt)
                except Exception, e:
                    logger.error(str(('could not locate small molecule:', facility)))
                    raise
                if(len(value) > 2):
                    dataRecord.batch_id = util.convertdata(value[2], int)
                    # TODO: validate that the batch exists? (would need to do
                    # for all types, not just Small Molecule)
                mapped = True
def main(path): """ Read in the Cell """ sheet_name = 'HMS-LINCS cell line metadata' sheet = iu.readtable([path, sheet_name, 1]) # Note, skipping the header row by default properties = ('model_field','required','default','converter') column_definitions = { 'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]), 'CL_Name':('name',True), 'CL_ID':'cl_id', 'CL_Alternate_Name':'alternate_name', 'CL_Alternate_ID':'alternate_id', 'CL_Center_Name':'center_name', 'CL_Center_Specific_ID':'center_specific_id', 'MGH_ID':('mgh_id',False,None,lambda x:util.convertdata(x,int)), 'Assay':'assay', 'CL_Provider_Name':'provider_name', 'CL_Provider_Catalog_ID':'provider_catalog_id', 'CL_Batch_ID':'batch_id', 'CL_Organism':'organism', 'CL_Organ':'organ', 'CL_Tissue':'tissue', 'CL_Cell_Type':'cell_type', 'CL_Cell_Type_Detail':'cell_type_detail', 'CL_Disease':'disease', 'CL_Disease_Detail':'disease_detail', 'CL_Growth_Properties':'growth_properties', 'CL_Genetic_Modification':'genetic_modification', 'CL_Related_Projects':'related_projects', 'CL_Recommended_Culture_Conditions':'recommended_culture_conditions', 'CL_Verification_Profile':'verification_profile', 'CL_Verification_Reference_Profile':'verification_reference_profile', 'CL_Mutations_Reference':'mutations_reference', 'CL_Mutations_Explicit':'mutations_explicit', 'CL_Organism_Gender':'organism_gender', 'Date Data Received':('date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ('date_updated',False,None,util.date_converter), 'Is Restricted':('is_restricted',False,False,util.bool_converter)} # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) rows = 0 logger.debug(str(('cols: ' , cols))) for row in sheet: r = util.make_row(row) initializer = {} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) cell = Cell(**initializer) cell.save() logger.info(str(('cell created:', cell))) rows += 1 except Exception, e: print "Invalid Cell, name: ", r[0] raise e
def main(path): """ Read in the Library and LibraryMapping sheets """ libraries = readLibraries(path,'Library') sheet = iu.readtable([path, 'LibraryMapping']) properties = ('model_field','required','default','converter') column_definitions = {'Facility':('facility_id',False,None, lambda x: util.convertdata(x,int)), 'Salt':('salt_id',False,None, lambda x: util.convertdata(x,int)), 'Batch':('facility_batch_id',False,None, lambda x: util.convertdata(x,int)), 'Is Control':('is_control',False,False,util.bool_converter), 'Plate':('plate',False,None, lambda x: util.convertdata(x,int)), 'Well':'well', 'Library Name':'short_name', 'Concentration': 'concentration', 'Concentration Unit':'concentration_unit' } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) small_molecule_batch_lookup = ('smallmolecule', 'facility_batch_id') library_mapping_lookup = ('smallmolecule_batch','library','is_control','plate','well','concentration','concentration_unit') rows = 0 logger.debug(str(('cols: ' , cols))) for row in sheet: current_row = rows + 2 r = util.make_row(row) initializer = {} small_molecule_lookup = {'facility_id':None, 'salt_id':None} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],'row',current_row)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) initializer[model_field] = value if(model_field in small_molecule_lookup): small_molecule_lookup[model_field]=value if( None not in small_molecule_lookup.values()): try: sm = SmallMolecule.objects.get(**small_molecule_lookup) initializer['smallmolecule'] = sm except Exception, e: raise Exception(str(('sm facility id not found', small_molecule_lookup,e,'row',current_row))) elif(model_field == 'short_name'): try: library = libraries[value] initializer['library'] = library except Exception, e: raise Exception(str(('library short_name not found', value,e,'row',current_row)))
def main(path): """ Read in the smallmolecule batch info """ sheet_name = 'sheet 1' start_row = 1 sheet = iu.readtable([path, sheet_name, start_row]) # Note, skipping the header row by default properties = ('model_field','required','default','converter') column_definitions = { # NOTE: even though these db field are not integers, # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values 'facility_id': ('facility_id',True,None, lambda x: util.convertdata(x,int)), 'salt_id': ('salt_id',True,None, lambda x: util.convertdata(x,int)), 'facility_batch_id':('batch_id',True,None, lambda x: util.convertdata(x,int)), 'provider': ('provider_name',True), 'provider_catalog_id':'provider_catalog_id', 'provider_sample_id':'provider_batch_id', 'chemical_synthesis_reference':'chemical_synthesis_reference', 'purity':'purity', 'purity_method':'purity_method', 'aqueous_solubility':'aqueous_solubility', # FIXME: should warn the user if no unit is provided when # aqueous_solubility is provided 'aqueous_solubility_unit':'aqueous_solubility_unit', 'Date Data Received':('date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ('date_updated',False,None,util.date_converter), } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False) rows = 0 logger.debug(str(('cols: ' , cols))) for row in sheet: r = util.make_row(row) initializer = {} small_molecule_lookup = {'facility_id':None, 'salt_id':None} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) if(model_field in small_molecule_lookup): small_molecule_lookup[model_field]=value if( None not in small_molecule_lookup.values()): try: sm = SmallMolecule.objects.get(**small_molecule_lookup) initializer['reagent'] = sm except Exception, e: logger.error(str(('sm identifiers not found', small_molecule_lookup,'row',rows+start_row+2))) raise else: initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) smb = SmallMoleculeBatch(**initializer) smb.save() logger.debug(str(('smb created:', smb))) rows += 1 except Exception, e: logger.error(str(( "Invalid smallmolecule batch initializer: ", initializer, 'row', rows+start_row+2, e))) raise
def read_metadata(meta_sheet):

    properties = ('model_field', 'required', 'default', 'converter')
    field_definitions = {
        'Lead Screener First': 'lead_screener_firstname',
        'Lead Screener Last': 'lead_screener_lastname',
        'Lead Screener Email': 'lead_screener_email',
        'Lab Head First': 'lab_head_firstname',
        'Lab Head Last': 'lab_head_lastname',
        'Lab Head Email': 'lab_head_email',
        'Title': 'title',
        'Facility ID': (
            'facility_id', True, None, lambda x: util.convertdata(x, int)),
        'Summary': 'summary',
        'Protocol': 'protocol',
        'References': 'protocol_references',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False, util.bool_converter),
        'Dataset Type': ('dataset_type', False),
        'Bioassay': ('bioassay', False),
        'Dataset Keywords': ('dataset_keywords', False),
        'Usage Message': ('usage_message', False),
        'Dataset Data URL': ('dataset_data_url', False),
        'Associated Publication': ('associated_publication', False),
        'Associated Project Summary': ('associated_project_summary', False),
    }

    sheet_labels = []
    for i in xrange(meta_sheet.nrows - 1):
        row = meta_sheet.row_values(i + 1)
        sheet_labels.append(row[0])

    field_definitions = util.fill_in_column_definitions(
        properties, field_definitions)

    cols = util.find_columns(field_definitions, sheet_labels,
                             all_column_definitions_required=False)

    initializer = {}
    for i in xrange(meta_sheet.nrows - 1):
        row = meta_sheet.row_values(i + 1)

        properties = cols[i]
        value = row[1]
        logger.debug('Metadata raw value %r' % value)

        required = properties['required']
        default = properties['default']
        converter = properties['converter']
        model_field = properties['model_field']

        if converter:
            value = converter(value)
        if not value and default != None:
            value = default
        if not value and required:
            raise Exception(
                'Field is required: %s, record: %d'
                % (properties['column_label'], row))
        logger.debug('model_field: %s, value: %r' % (model_field, value))
        initializer[model_field] = value

    return initializer
def read_metadata(path): """ Read in the DataSets, Datacolumns, and Data sheets. In the Data sheet, rows are DataRecords, and columns are DataPoints """ # Read in the DataSet sheetname = 'Meta' # Note, skipping the header row by default metaSheet = iu.readtable([path, sheetname]) # Define the Column Names -> model fields mapping properties = ('model_field','required','default','converter') field_definitions = {'Lead Screener First': 'lead_screener_firstname', 'Lead Screener Last': 'lead_screener_lastname', 'Lead Screener Email': 'lead_screener_email', 'Lab Head First': 'lab_head_firstname', 'Lab Head Last': 'lab_head_lastname', 'Lab Head Email': 'lab_head_email', 'Title': 'title', 'Facility ID': ('facility_id',True,None, lambda x: util.convertdata(x,int)), 'Summary': 'summary', 'Protocol': 'protocol', 'References': 'protocol_references', 'Date Data Received':('date_data_received',False,None, util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None, util.date_converter), 'Most Recent Update': ('date_updated',False,None, util.date_converter), 'Is Restricted':('is_restricted',False,False,util.bool_converter), 'Dataset Type':('dataset_type',False), 'Bioassay':('bioassay',False), 'Dataset Keywords':('dataset_keywords',False), 'Usage Message':('usage_message',False), } sheet_labels = [] for row in metaSheet: rowAsUnicode = util.make_row(row) sheet_labels.append(rowAsUnicode[0]) # convert the definitions to fleshed out dict's, with strategies for # optional, default and converter field_definitions = \ util.fill_in_column_definitions(properties,field_definitions) # create a dict mapping the column/row ordinal to the proper definition dict cols = util.find_columns(field_definitions, sheet_labels, all_column_definitions_required=False) initializer = {} for i,row in enumerate(metaSheet): rowAsUnicode = util.make_row(row) properties = cols[i] value = rowAsUnicode[1] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],row)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) initializer[model_field] = value return initializer
def main(path): """ Read in the Library and LibraryMapping sheets """ libraries = readLibraries(path, 'Library') sheet = iu.readtable([path, 'LibraryMapping']) properties = ('model_field', 'required', 'default', 'converter') date_parser = lambda x: util.convertdata(x, date) column_definitions = { 'Facility': ('facility_id', False, None, lambda x: util.convertdata(x, int)), 'Salt': ('salt_id', False, None, lambda x: util.convertdata(x, int)), 'Batch': ('batch_id', False, None, lambda x: util.convertdata(x, int)), 'Is Control': ('is_control', False, False, util.bool_converter), 'Plate': ('plate', False, None, lambda x: util.convertdata(x, int)), 'Well': 'well', 'Library Name': 'short_name', 'Concentration': 'concentration', 'Concentration Unit': 'concentration_unit' } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions( properties, column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) small_molecule_batch_lookup = ('reagent', 'batch_id') library_mapping_lookup = ('smallmolecule_batch', 'library', 'is_control', 'plate', 'well', 'concentration', 'concentration_unit') rows = 0 logger.debug(str(('cols: ', cols))) for row in sheet: current_row = rows + 2 r = util.make_row(row) initializer = {} small_molecule_lookup = {'facility_id': None, 'salt_id': None} for i, value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if (converter != None): value = converter(value) if (value == None): if (default != None): value = default if (value == None and required == True): raise Exception( 'Field is required: %s, record: %d' % (properties['column_label'], 'row', current_row)) logger.debug( str(('model_field: ', model_field, ', value: ', value))) initializer[model_field] = value if (model_field in small_molecule_lookup): small_molecule_lookup[model_field] = value if (None not in small_molecule_lookup.values()): try: sm = SmallMolecule.objects.get(**small_molecule_lookup) initializer['reagent'] = sm except Exception, e: raise Exception( str(('sm facility id not found', small_molecule_lookup, e, 'row', current_row))) elif (model_field == 'short_name'): try: library = libraries[value] initializer['library'] = library except Exception, e: raise Exception( str(('library short_name not found', value, e, 'row', current_row)))
            raise

    map_column = mappingColumnDict['Cell']
    if(map_column > -1):
        try:
            value = util.convertdata(r[map_column].strip())
            if(value != None and value != ''):
                facility_id = value
                # TODO: purge "HMSL" from the db
                dataRecord.cell = Cell.objects.get(facility_id=facility_id)
                mapped = True
        except Exception, e:
            print "Invalid Cell facility id: ", facility_id
            raise

    map_column = mappingColumnDict['Protein']
    if(map_column > -1):
        try:
            value = util.convertdata(r[map_column].strip())
            if(value != None and value != ''):
                facility_id = r[map_column]
                # TODO: purge "HMSL"
                dataRecord.protein = Protein.objects.get(
                    lincs_id=facility_id[facility_id.index('HMSL') + 4:])
                mapped = True
        except Exception, e:
            print "Invalid Protein facility id: ", value
            raise

    if(not mapped):
        raise Exception('at least one of: ' + str(mappingColumnDict.keys())
                        + ' must be defined, missing for row: '
                        + str(rowsRead + 2))

    if metaColumnDict['Plate'] > -1:
        dataRecord.plate = util.convertdata(r[metaColumnDict['Plate']], int)
    if metaColumnDict['Well'] > -1:
        dataRecord.well = util.convertdata(r[metaColumnDict['Well']])
    if metaColumnDict['Control Type'] > -1:
        dataRecord.control_type = util.convertdata(
            r[metaColumnDict['Control Type']])

    dataRecord.save()
    _read_plate_well(map_column, r, current_row, dataRecord)

map_column = mappingColumnDict['Cell']
if(map_column > -1):
    _read_cell(map_column, r, current_row, dataRecord)

map_column = mappingColumnDict['Antibody']
if(map_column > -1):
    _read_antibody(map_column, r, current_row, dataRecord)

map_column = mappingColumnDict['OtherReagent']
if(map_column > -1):
    _read_other_reagent(map_column, r, current_row, dataRecord)

map_column = mappingColumnDict['Protein']
if(map_column > -1):
    _read_protein(map_column, r, current_row, dataRecord)

if metaColumnDict['Control Type'] > -1:
    dataRecord.control_type = util.convertdata(
        r[metaColumnDict['Control Type']])
    if(dataRecord.control_type is not None
            and dataRecord.smallmolecule is not None):
        raise Exception(str((
            'Cannot define a control type for a non-control well '
            '(well mapped to a small molecule batch)',
            dataRecord.smallmolecule, dataRecord.control_type,
            'row', current_row)))

if metaColumnDict['batch_id'] > -1:
    temp = util.convertdata(r[metaColumnDict['batch_id']], int)
    if(temp != None):
        if(dataRecord.batch_id is not None and temp is not None
                and dataRecord.batch_id != temp):
            raise Exception(str((
                'batch id field(1) does not match batch id set with '
                'entity(2):', temp, dataRecord.batch_id)))
def main(path): """ Read in the cell batch info """ sheet_name = 'Sheet1' start_row = 1 sheet = iu.readtable([path, sheet_name, start_row ]) # Note, skipping the header row by default properties = ('model_field', 'required', 'default', 'converter') column_definitions = { 'Facility ID': ('facility_id', True, None, lambda x: x[x.index('HMSL') + 4:]), 'CL_Batch_ID': ('batch_id', True, None, lambda x: util.convertdata(x, int)), 'CL_Provider_Name': 'provider_name', 'CL_Provider_Batch_ID': 'provider_batch_id', 'CL_Provider_Catalog_ID': 'provider_catalog_id', 'CL_Quality_Verification': 'quality_verification', 'CL_Transient_Modification': 'transient_modification', 'Date Data Received': ('date_data_received', False, None, util.date_converter), 'Date Loaded': ('date_loaded', False, None, util.date_converter), 'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter), 'Most Recent Update': ('date_updated', False, None, util.date_converter), } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions( properties, column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) rows = 0 logger.debug(str(('cols: ', cols))) for row in sheet: r = util.make_row(row) initializer = {} for i, value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] logger.debug(str(('raw value', value))) if (converter != None): value = converter(value) if (value == None): if (default != None): value = default if (value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'], rows)) logger.debug( str(('model_field: ', model_field, ', value: ', value))) if model_field == 'facility_id': try: cell = Cell.objects.get(facility_id=value) initializer['reagent'] = cell except: logger.error( str(("Cell not found", value, 'row', rows + start_row + 2))) raise else: initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) cell = CellBatch(**initializer) cell.save() logger.debug(str(('cell created:', cell))) rows += 1 except Exception, e: logger.error( str(("Invalid CellBatch initializer: ", initializer, 'row', rows + start_row + 2, e))) raise
def read_datacolumns(book):
    '''
    @return an array of data column definition dicts
    '''
    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        if i == 0:
            for val in row_values[1:]:
                dc_definitions.append({})
        label_read = row_values[0]
        recognized_label = next(
            (field_name for label, field_name in labels.items()
                if label_read and label.lower() == label_read.lower()), None)
        if recognized_label:
            logger.debug('label: %r, recognized_label: %r'
                         % (label_read, recognized_label))
            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]
                logger.debug('data column %s:%d:%d:%r'
                             % (recognized_label, i, j, val))
                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = dc_dict['display_order'] + 10
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                            % (recognized_label, colname(j + 1)))
        else:
            logger.debug('unrecognized label in "Data Columns" sheet %r'
                         % label_read)

    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')
    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        data_label = data_label.upper()
        col_letter = colname(i)
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            elif 'name' in dc_dict or 'display_name' in dc_dict:
                if (dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
            if _dict and 'display_order' not in _dict:
                _dict['display_order'] = i + 10
                logger.warn('auto assigning "display_order" for col %r as %d'
                            % (_dict['name'], i + 10))
        if i not in data_labels_found:
            logger.debug(('Data sheet label not found %r,'
                          ' looking in default reagent definitions %s')
                         % (data_label, default_reagent_columns.keys()))
            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label
                        or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip()
        and i not in data_labels_found
        and data_label not in meta_columns]
    if data_labels_not_found:
        logger.warn('data sheet labels not recognized %s'
                    % data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    logger.info('data column definitions found: %s'
                % [x['display_name'] for x in dc_definitions_found])

    return dc_definitions_found
logger.debug('read the data sheet, save_interval: %d' % save_interval)
loopStart = time.time()
pointsSaved = 0
rows_read = 0
col_to_dc_items = col_to_dc_map.items()
for i in xrange(data_sheet.nrows - 1):
    current_row = i + 2
    row = data_sheet.row_values(i + 1)
    r = util.make_row(row)
    datarecord = DataRecord(dataset=dataset)

    if meta_columns['Control Type'] > -1:
        datarecord.control_type = util.convertdata(
            r[meta_columns['Control Type']])

    datapoint_batch = []
    small_molecule_datapoint = None
    for i, dc in col_to_dc_items:
        value = r[i]
        logger.debug('reading column %r, %s, val: %r' % (colname(i), dc, value))
        value = value.strip()
        value = util.convertdata(value)
        if not value:
            continue
        datapoint = _create_datapoint(dc, dataset, datarecord, value)
        datapoint_batch.append(datapoint)
        pointsSaved += 1
        if not small_molecule_datapoint and dc.data_type == 'small_molecule':
            small_molecule_datapoint = datapoint
def main(path): """ Read in the Antibody Batches """ sheet_name = 'Sheet1' sheet = iu.readtable([path, sheet_name, 0]) properties = ('model_field', 'required', 'default', 'converter') column_definitions = { 'AR_Center_Specific_ID': ('antibody_facility_id', True, None, lambda x: x[x.index('HMSL') + 4:]), 'AR_Batch_ID': ('batch_id', True, None, lambda x: util.convertdata(x, int)), 'AR_Provider_Name': 'provider_name', 'AR_Provider_Catalog_ ID': 'provider_catalog_id', 'AR_Provider_Batch_ID': 'provider_batch_id', 'AR_Antibody_Purity': 'antibody_purity', 'Date Data Received': ('date_data_received', False, None, util.date_converter), 'Date Loaded': ('date_loaded', False, None, util.date_converter), 'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter), 'Most Recent Update': ('date_updated', False, None, util.date_converter), } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions( properties, column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) rows = 0 logger.debug('cols: %s' % cols) for row in sheet: r = util.make_row(row) dict = {} initializer = {} for i, value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug('read col: %d: %s' % (i, properties)) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] logger.debug('raw value %r' % value) if (converter != None): value = converter(value) if (value == None): if (default != None): value = default if (value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'], rows)) logger.debug('model_field: %s, converted value %r' % (model_field, value)) initializer[model_field] = value try: logger.debug('initializer: %s' % initializer) antibody_facility_id = initializer.pop('antibody_facility_id', None) if antibody_facility_id: try: antibody = Antibody.objects.get( facility_id=antibody_facility_id) initializer['reagent'] = antibody except ObjectDoesNotExist, e: logger.error( 'AR_Center_Specific_ID: "%s" does not exist, row: %d' % (antibody_facility_id, i)) antibody_batch = AntibodyBatch(**initializer) antibody_batch.save() logger.info('antibody batch created: %s' % antibody_batch) rows += 1 except Exception, e: logger.error("Invalid antibody_batch initializer: %s" % initializer) raise
def readDataColumns(path): # Read in the DataColumn Sheet sheetname = 'Data Columns' dataColumnSheet = iu.readtable([path, sheetname]) # Lookup all of the field types of the Datacolumn table. # These will be used to validate input type by converting on read _fields = util.get_fields(DataColumn) _typelookup = dict((f.name, iu.totype(f)) for f in _fields) # TODO: Use the import_utils methods here # TODO: compare and combine this with the fieldinformation entity labels = {'Worksheet Column':'worksheet_column', 'Display Order':'display_order', 'Name':'name', 'Display Name':'display_name', 'Data Type':'data_type', 'Decimal Places':'precision', 'Description':'description', 'Replicate Number':'replicate', 'Unit':'unit', 'Assay readout type':'readout_type', 'Comments':'comments', 'Protein HMS LINCS ID': 'protein', 'Cell HMS LINCS ID': 'cell'} # create an array of dict's, each dict defines a DataColumn dataColumnDefinitions = [] #Note we also allow a list of pro # first the label row (it contains the worksheet column, it is unique) for v in dataColumnSheet.labels[1:]: dataColumnDefinitions.append({labels['Worksheet Column']:v}) logger.debug(str(('========== datacolumns:',dataColumnDefinitions))) # for each row, create the dictionary entry in the dataColumnDefinitions for row in dataColumnSheet: rowAsUnicode = util.make_row(row) keyRead = rowAsUnicode[0] for i,cellText in enumerate(rowAsUnicode[1:]): try: for key,fieldName in labels.items(): # if one of the DataColumn fields, add it to the dict if re.match(key,keyRead,re.M|re.I): if re.match('Protein HMS LINCS ID', keyRead, re.M|re.I): facility_id = util.convertdata(cellText, int); if facility_id: dataColumnDefinitions[i][fieldName] = \ Protein.objects.get(lincs_id=facility_id) elif re.match('Cell HMS LINCS ID', keyRead, re.M|re.I): facility_id = util.convertdata(cellText, int); if facility_id: dataColumnDefinitions[i][fieldName] = \ Cell.objects.get(facility_id=facility_id) else: # Use the type from the fieldinformation table # to read in the data for each DC field dataColumnDefinitions[i][fieldName] = \ util.convertdata(cellText, _typelookup.get(fieldName, None)) else: logger.debug(str(( '"Data Column definition not used: ', cellText)) ) pass except Exception, e: logger.error(str(('Exception reading data for cell', i, cellText, e))) raise e logger.debug(str(("definitions: ", dataColumnDefinitions)) )
    # NOTE: main() below builds DataColumn instances from these definitions,
    # so they must be returned to the caller
    return dataColumnDefinitions
def main(path): """ Read in the Data Working Group sheets """ logger.info("start") book = xlrd.open_workbook(path) #open our xls file, there's lots of extra default options in this call, for logging etc. take a look at the docs #sheet = book.sheets()[0] #book.sheets() returns a list of sheet objects... alternatively... #sheet = book.sheet_by_name("qqqq") #we can pull by name worksheet = book.sheet_by_index(0) #or by the index it has in excel's sheet collection properties = ('model_field','required','default','converter') column_definitions = {'table':'table', 'field':'field', 'alias':'alias', 'queryset':'queryset', 'show in detail':('show_in_detail',True,False,util.bool_converter), 'show in list':('show_in_list',True,False,util.bool_converter), 'show_as_extra_field':('show_as_extra_field',False,False,util.bool_converter), 'is_lincs_field':('is_lincs_field',True,False,util.bool_converter), 'is_unrestricted':('is_unrestricted',False,False,util.bool_converter), 'order':('order',True,None,lambda x:util.convertdata(x,int)), 'use_for_search_index':('use_for_search_index',True,False,util.bool_converter), 'Data Working Group version':'dwg_version', 'Unique ID':('unique_id',True), 'DWG Field Name':'dwg_field_name', 'HMS Field Name':'hms_field_name', 'Related to':'related_to', 'Description':'description', 'Importance (1: essential; 2: desirable / recommended; 3: optional)':'importance', 'Comments':'comments', 'Ontologies / references considered':'ontology_reference', 'Link to ontology / reference':'ontology_reference', 'Additional Notes (for development)':'additional_notes', } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) num_rows = worksheet.nrows - 1 num_cells = worksheet.ncols - 1 curr_row = 0 # note zero indexed row = worksheet.row(curr_row) labels = [] i = -1 while i < num_cells: i += 1 # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank # cell_type = worksheet.cell_type(curr_row, curr_cell) labels.append(str(worksheet.cell_value(curr_row, i))) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, labels, all_sheet_columns_required=False) logger.info('delete current table'); FieldInformation.objects.all().delete() rows = 0 while curr_row < num_rows: curr_row += 1 actual_row = curr_row + 1 row = worksheet.row(curr_row) if(logger.isEnabledFor(logging.DEBUG)): logger.debug(str(('row', row))) i = -1 initializer = {} while i < num_cells: i += 1 # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank #cell_type = worksheet.cell_type(curr_row, curr_cell) value = unicode(worksheet.cell_value(curr_row, i)) if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if(converter != None): logger.debug(str(('using converter',converter,value))) value = converter(value) logger.debug(str(('converted',value))) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],actual_row)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) initializer[model_field] = value try: 
logger.debug(str(('initializer: ', initializer))) #if((initializer['table'] == None and initializer['queryset'] == None ) or if(initializer['field'] == None): logger.warn(str(('Note: table entry has no field definition (will be skipped)', initializer, 'current row:', actual_row))) continue; lfi = FieldInformation(**initializer) # check if the table/field exists if(lfi.table != None): table = models.get_model(APPNAME, lfi.table) if( table != None): if(lfi.field not in map(lambda x: x.name,table._meta.fields) ): raise Exception(str(('unknown field: ', lfi.field))) else: raise Exception(str(('unknown table', lfi.table ))) lfi.save() logger.info(str(('fieldInformation created:', lfi))) rows += 1 except Exception, e: logger.error(str(( "Invalid fieldInformation, initializer so far: ", initializer, 'current row:', actual_row,e))) raise e
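# NOTE (editor sketch): util.bool_converter is not shown in this excerpt.
# Spreadsheet cells can arrive as text ('TRUE', 'yes', '1'), as numbers (1.0),
# or empty; a converter consistent with how it is used above might be:
def bool_converter_sketch(value):
    if value is None:
        return None
    if isinstance(value, basestring):
        value = value.strip().lower()
        if value == '':
            return None
        return value in ('true', 't', 'yes', 'y', '1')
    return bool(value)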
def main(path): # read in the two columns of the meta sheet to a dict that defines a DataSet metadata = read_metadata(path) dataset = DataSet(**metadata) dataset.save() # read in the data columns sheet to an array of dict's, each dict defines a DataColumn dataColumnDefinitions = readDataColumns(path) # now that the array of DataColumn dicts is created, use them to create the DataColumn instances dataColumns = {} for dc in dataColumnDefinitions: dc['dataset'] = dataset dataColumn = DataColumn(**dc) dataColumn.save() dataColumns[dataColumn.name] = dataColumn # read the Data sheet sheetname = 'Data' dataSheet = iu.readtable([path, sheetname]) # First, map the sheet column indices to the DataColumns that were created dataColumnList = {} metaColumnDict = {'Well':-1, 'Plate':-1, 'Control Type':-1} # meta columns contain forensic information mappingColumnDict = {'Small Molecule':-1, 'Cell':-1, 'Protein':-1} # what is being studied - at least one is required # NOTE: this scheme is matching based on the labels between the "Data Column" sheet and the "Data" sheet for i,label in enumerate(dataSheet.labels): if(label == 'None' or label == 'well_id' or label.strip()=='' or label == 'Exclude' ): continue if label in metaColumnDict: metaColumnDict[label] = i continue if label in mappingColumnDict: mappingColumnDict[label] = i continue if label in dataColumns: dataColumnList[i] = dataColumns[label] # note here "i" is the index to the dict else: #raise Exception("no datacolumn for the label: " + label) columnName = chr(ord('A') + i) findError = True for column in dataColumns.values(): if(column.worksheet_column == columnName): dataColumnList[i] = column findError = False break if findError: print "Error: no datacolumn for ", label sys.exit(-1) found=False for key,value in mappingColumnDict.items(): if(value != -1): found=True if(not found): raise Exception('at least one of: ' + str(mappingColumnDict.keys()) + ' must be defined and used in the Data sheet.') # Read in the Data sheet, create DataPoint values for mapped column in each row pointsSaved = 0 rowsRead = 0 for row in dataSheet: r = util.make_row(row) dataRecord = DataRecord(dataset=dataset ) map_column = mappingColumnDict['Small Molecule'] mapped = False if(map_column > -1): try: value = util.convertdata(r[map_column].strip()) if(value != None and value != '' ): facility = value.split("-")[0] # TODO: purge "HMSL" from the db salt = value.split("-")[1] dataRecord.small_molecule = SmallMolecule.objects.get(facility_id=facility, sm_salt=salt) mapped = True except Exception, e: print "Invalid Small Molecule facility id: ", value raise map_column = mappingColumnDict['Cell'] if(map_column > -1): try: value = util.convertdata(r[map_column].strip()) if(value != None and value != '' ): facility_id = value dataRecord.cell = Cell.objects.get(facility_id=facility_id) # TODO: purge "HMSL" from the db mapped = True except Exception, e: print "Invalid Cell facility id: ", facility_id raise
def main(path): """ Read in the Data Working Group sheets """ logger.info(str(('read field information file', path))) properties = ('model_field', 'required', 'default', 'converter') column_definitions = { 'table': 'table', 'field': 'field', 'alias': 'alias', 'queryset': 'queryset', 'show in detail': ('show_in_detail', True, False, util.bool_converter), 'show in list': ('show_in_list', True, False, util.bool_converter), 'show_as_extra_field': ('show_as_extra_field', False, False, util.bool_converter), 'is_lincs_field': ('is_lincs_field', True, False, util.bool_converter), 'is_unrestricted': ('is_unrestricted', False, False, util.bool_converter), 'list_order': ('list_order', True, None, lambda x: util.convertdata(x, int)), 'detail_order': ('detail_order', True, None, lambda x: util.convertdata(x, int)), 'use_for_search_index': ('use_for_search_index', True, False, util.bool_converter), 'Data Working Group version': 'dwg_version', 'Unique ID': ('unique_id', True), 'DWG Field Name': 'dwg_field_name', 'HMS Field Name': 'hms_field_name', 'Related to': 'related_to', 'Description': 'description', 'Importance (1: essential; 2: desirable / recommended; 3: optional)': 'importance', 'Comments': 'comments', 'Ontologies / references considered': 'ontology_reference', 'Link to ontology / reference': 'ontology_reference', 'Additional Notes (for development)': 'additional_notes', } column_definitions = util.fill_in_column_definitions( properties, column_definitions) with open(path) as f: reader = csv.reader(f) labels = reader.next() cols = util.find_columns(column_definitions, labels, all_sheet_columns_required=False) logger.info('delete current table') FieldInformation.objects.all().delete() for j, row in enumerate(reader): logger.debug('row %d: %s', j, row) initializer = {} for i, value in enumerate(row): if i not in cols: logger.info(str(('column out of range', j + 1, i))) continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if converter: logger.debug(str(('using converter', converter, value))) value = converter(value) logger.debug(str(('converted', value))) # Note: must check the value against None, as False is a valid value if value is None: if default != None: value = default # Note: must check the value against None, as False is a valid value if value is None and required is True: raise Exception('Field is required: %s, record: %d' % (properties['column_label'], j + 1)) logger.debug( str(('model_field: ', model_field, ', value: ', value))) initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) if not initializer['field']: logger.warn( str(( 'Note: table entry has no field definition (will be skipped)', initializer, 'current row:', j + 1))) continue lfi = FieldInformation(**initializer) # check if the table/field exists if lfi.table: table = models.get_model(APPNAME, lfi.table) if table: if lfi.field not in map(lambda x: x.name, table._meta.fields): raise Exception(str( ('unknown field: ', lfi.field))) else: raise Exception(str(('unknown table', lfi.table))) lfi.save() logger.info(str(('fieldInformation created:', lfi))) except Exception, e: logger.error( str(("Invalid fieldInformation, initializer so far: ", initializer, 'current row:', j + 1, e))) raise e
def readLibraries(path, sheetName): sheet = iu.readtable([path, sheetName ]) # Note, skipping the header row by default # dict to map spreadsheet fields to the Library fields properties = ('model_field', 'required', 'default', 'converter') date_parser = lambda x: util.convertdata(x, date) column_definitions = { 'Name': ('name', True), # TODO use the model to determine if req'd 'ShortName': ('short_name', True), 'Library Type': 'type', 'Date First Plated': ('date_first_plated', False, None, date_parser), 'Date Data Received': ('date_data_received', False, None, date_parser), 'Date Loaded': ('date_loaded', False, None, date_parser), 'Date Publicly Available': ('date_publicly_available', False, None, date_parser), 'Most Recent Update': ('date_updated', False, None, util.date_converter), 'Is Restricted': ('is_restricted', False, False) } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions( properties, column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) rows = 0 libraries = {} for row in sheet: logger.debug(str(('row raw: ', row))) r = util.make_row(row) logger.debug(str(('row: ', r))) initializer = {} for i, value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if (converter != None): value = converter(value) if (value == None): if (default != None): value = default if (value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'], rows)) logger.debug( str(('model_field: ', model_field, ', value: ', value))) initializer[model_field] = value try: library = Library(**initializer) library.save() logger.info(str(('library created', library))) libraries[library.short_name] = library rows += 1 except Exception, e: logger.error(str(('library initializer problem: ', initializer))) raise e
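# NOTE (editor sketch): util.convertdata() is the workhorse converter used by
# all of these loaders. Judging from the call sites it treats empty or
# whitespace-only cells as None and otherwise applies an optional type;
# spreadsheet numbers arrive from xlrd as floats, hence int(float(...)).
# Date handling (see util.date_converter above) is omitted from this sketch:
def convertdata_sketch(value, t=None):
    if value is None:
        return None
    if isinstance(value, basestring):
        value = value.strip()
        if value == '':
            return None
    if t is None:
        return value
    if t == int:
        return int(float(value))  # '10.0' or 10.0 -> 10
    return t(value)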
def main(path): """ Read in the Cell """ sheet_name = 'HMS-LINCS cell line metadata' sheet = iu.readtable([path, sheet_name, 1]) # Note, skipping the header row by default properties = ('model_field','required','default','converter') column_definitions = { 'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]), 'CL_Name':('name',True), 'CL_LINCS_ID':'lincs_id', 'CL_Alternate_Name':'alternative_names', 'CL_Alternate_ID':'alternate_id', 'CL_Center_Specific_ID':'center_specific_id', 'MGH_ID':('mgh_id',False,None,lambda x:util.convertdata(x,int)), 'Assay':'assay', 'CL_Organism':'organism', 'CL_Organ':'organ', 'CL_Tissue':'tissue', 'CL_Cell_Type':'cell_type', 'CL_Cell_Type_Detail':'cell_type_detail', 'CL_Donor_Sex': 'donor_sex', 'CL_Donor_Age': ('donor_age_years',False,None,lambda x:util.convertdata(x,int)), 'CL_Donor_Ethnicity': 'donor_ethnicity', 'CL_Donor_Health_Status': 'donor_health_status', 'CL_Disease':'disease', 'CL_Disease_Detail':'disease_detail', 'CL_Growth_Properties':'growth_properties', 'CL_Genetic_Modification':'genetic_modification', 'CL_Related_Projects':'related_projects', 'CL_Recommended_Culture_Conditions':'recommended_culture_conditions', 'CL_Verification_Reference_Profile':'verification_reference_profile', 'CL_Known_Mutations':'mutations_known', 'CL_Mutations_Citations':'mutations_citations', 'CL_Molecular_Features': 'molecular_features', 'CL_Relevant_Citations': 'relevant_citations', 'CL_Reference_Source': 'reference_source', 'CL_Reference_Source_ID': 'reference_source_id', 'Reference Source URL': 'reference_source_url', 'Usage Note': 'usage_note', 'Date Data Received':('date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ('date_updated',False,None,util.date_converter), 'Is Restricted':('is_restricted',False,False,util.bool_converter)} # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False) rows = 0 logger.debug(str(('cols: ' , cols))) for row in sheet: r = util.make_row(row) initializer = {} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) cell = Cell(**initializer) cell.save() logger.info(str(('cell created:', cell))) rows += 1 # create a default batch - 0 CellBatch.objects.create(reagent=cell,batch_id=0) except Exception, e: print "Invalid Cell, name: ", r[0] raise e
def read_data(book, col_to_dc_map, first_small_molecule_column, dataset): datarecord_batch = [] save_interval = 1000 logger.debug('read the Data sheet') data_sheet = book.sheet_by_name('Data') for i,label in enumerate(data_sheet.row_values(0)): logger.debug('find datasheet label %r:%r' % (colname(i), label)) if label in meta_columns: meta_columns[label] = i continue logger.debug('meta_columns: %s, datacolumnList: %s' % (meta_columns, col_to_dc_map) ) logger.debug('read the data sheet, save_interval: %d' % save_interval) loopStart = time.time() pointsSaved = 0 rows_read = 0 for i in xrange(data_sheet.nrows-1): current_row = i + 2 row = data_sheet.row_values(i+1) r = util.make_row(row) datarecord = DataRecord(dataset=dataset) if meta_columns['Control Type'] > -1: datarecord.control_type = util.convertdata( r[meta_columns['Control Type']]) datapoint_batch = [] small_molecule_datapoint = None for i,dc in col_to_dc_map.items(): value = r[i] logger.debug( 'reading column %r, %s, val: %r' % (colname(i), dc, value)) value = value.strip() value = util.convertdata(value) if not value: continue datapoint = _create_datapoint(dc, dataset, datarecord, value) datapoint_batch.append(datapoint) pointsSaved += 1 if not small_molecule_datapoint and dc.data_type == 'small_molecule': small_molecule_datapoint = datapoint if meta_columns['Plate'] > -1: _read_plate_well( meta_columns['Plate'], r, current_row, datarecord, first_small_molecule_column,small_molecule_datapoint, datapoint_batch) datarecord_batch.append((datarecord, datapoint_batch)) rows_read += 1 if (rows_read % save_interval == 0): bulk_create_datarecords(datarecord_batch) logger.debug( 'datarecord batch created, rows_read: %d , time (ms): %d' % (rows_read, time.time()-loopStart ) ) count = bulk_create_datapoints(datarecord_batch) logger.debug('datapoints created in batch: %d ' % count) datarecord_batch=[] bulk_create_datarecords(datarecord_batch) et = time.time()-loopStart logger.debug( 'final datarecord batch created, rows_read: %d, time (ms): %d' % (rows_read, et)) count = bulk_create_datapoints(datarecord_batch) logger.debug('created dps %d' % count ) print 'Finished reading, rows_read: ', rows_read, ', points Saved: ', pointsSaved print 'elapsed: ', et , 'avg: ', et/rows_read cleanup_unused_datacolumns(dataset)
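# NOTE (editor sketch): bulk_create_datarecords() and bulk_create_datapoints()
# are not shown in this excerpt. read_data() passes a list of
# (DataRecord, [DataPoint, ...]) tuples, so implementations along these lines
# would be consistent with that usage; the real code may well use true bulk
# inserts for the datarecords as well:
def bulk_create_datarecords_sketch(datarecord_batch):
    for datarecord, _ in datarecord_batch:
        datarecord.save()

def bulk_create_datapoints_sketch(datarecord_batch):
    datapoints = []
    for datarecord, datapoint_batch in datarecord_batch:
        for datapoint in datapoint_batch:
            # re-assign so the datapoint picks up the now-saved datarecord's pk
            datapoint.datarecord = datarecord
            datapoints.append(datapoint)
    DataPoint.objects.bulk_create(datapoints)
    return len(datapoints)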
def read_metadata(meta_sheet): properties = ('model_field', 'required', 'default', 'converter') field_definitions = { 'Lead Screener First': 'lead_screener_firstname', 'Lead Screener Last': 'lead_screener_lastname', 'Lead Screener Email': 'lead_screener_email', 'Lab Head First': 'lab_head_firstname', 'Lab Head Last': 'lab_head_lastname', 'Lab Head Email': 'lab_head_email', 'Title': 'title', 'Facility ID': ('facility_id', True, None, lambda x: util.convertdata(x, int)), 'Summary': 'summary', 'Protocol': 'protocol', 'References': 'protocol_references', 'Date Data Received': ('date_data_received', False, None, util.date_converter), 'Date Loaded': ('date_loaded', False, None, util.date_converter), 'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter), 'Most Recent Update': ('date_updated', False, None, util.date_converter), 'Is Restricted': ('is_restricted', False, False, util.bool_converter), 'Dataset Type': ('dataset_type', False), 'Bioassay': ('bioassay', False), 'Dataset Keywords': ('dataset_keywords', False), 'Usage Message': ('usage_message', False), 'Associated Publication': ('associated_publication', False), 'Associated Project Summary': ('associated_project_summary', False), } sheet_labels = [] for i in xrange(meta_sheet.nrows - 1): row = meta_sheet.row_values(i + 1) sheet_labels.append(row[0]) field_definitions = util.fill_in_column_definitions( properties, field_definitions) cols = util.find_columns(field_definitions, sheet_labels, all_column_definitions_required=False) initializer = {} for i in xrange(meta_sheet.nrows - 1): row = meta_sheet.row_values(i + 1) properties = cols[i] value = row[1] logger.debug('Metadata raw value %r' % value) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] if converter: value = converter(value) if not value and default != None: value = default if not value and required: raise Exception('Field is required: %s, record: %d' % (properties['column_label'], row)) logger.debug('model_field: %s, value: %r' % (model_field, value)) initializer[model_field] = value return initializer
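# NOTE (editor sketch): read_metadata() expects the already-opened meta
# worksheet (labels in column 0, values in column 1, first row skipped). A
# minimal, illustrative caller - the 'Meta' sheet name is an assumption:
def load_dataset_metadata_sketch(path):
    book = xlrd.open_workbook(path)
    metadata = read_metadata(book.sheet_by_name('Meta'))
    dataset = DataSet(**metadata)
    dataset.save()
    return book, dataset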
def main(path): sheet_name = 'sheet 1' start_row = 1 sheet = iu.readtable([path, sheet_name, start_row]) properties = ('model_field','required','default','converter') column_definitions = { 'facility_id': ( 'facility_id',True,None, lambda x: util.convertdata(x,int)), 'facility_batch_id':( 'batch_id',True,None, lambda x: util.convertdata(x,int)), 'provider': ('provider_name',False), 'provider_catalog_id':'provider_catalog_id', 'provider_sample_id':'provider_batch_id', 'Date Data Received':( 'date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ( 'date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ( 'date_updated',False,None,util.date_converter), } column_definitions = util.fill_in_column_definitions( properties,column_definitions) cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False) rows = 0 logger.debug('cols: %s' % cols) for row in sheet: r = util.make_row(row) dict = {} initializer = {} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug('read col: %d: %s' % (i,properties)) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] logger.debug('raw value %r' % value) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows)) logger.debug('model_field: %s, converted value %r' % (model_field, value) ) initializer[model_field] = value try: logger.debug('initializer: %s' % initializer) facility_id = initializer.pop('facility_id',None) try: other_reagent = OtherReagent.objects.get(facility_id=facility_id) initializer['reagent'] = other_reagent except ObjectDoesNotExist, e: logger.error('facility_id: "%s" does not exist, row: %d' % (facility_id,i)) batch = OtherReagentBatch(**initializer) batch.save() logger.debug('batch created: %s', batch) rows += 1 except Exception, e: logger.error("Invalid other_reagent_batch initializer: %s" % initializer) raise
def main(path): """ Read in the cell batch info """ sheet_name = 'Sheet1' start_row = 1 sheet = iu.readtable([path, sheet_name, start_row]) # Note, skipping the header row by default properties = ('model_field','required','default','converter') column_definitions = { 'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]), 'CL_Batch_ID':('batch_id',True,None,lambda x:util.convertdata(x,int)), 'CL_Provider_Name':'provider_name', 'CL_Provider_Batch_ID':'provider_batch_id', 'CL_Provider_Catalog_ID':'provider_catalog_id', 'CL_Quality_Verification':'quality_verification', 'CL_Transient_Modification': 'transient_modification', 'Date Data Received':('date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ('date_updated',False,None,util.date_converter), } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) rows = 0 logger.debug(str(('cols: ' , cols))) for row in sheet: r = util.make_row(row) initializer = {} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] logger.debug(str(('raw value', value))) if(converter != None): value = converter(value) if(value == None ): if( default != None ): value = default if(value == None and required == True): raise Exception('Field is required: %s, record: %d' % ( properties['column_label'],rows)) logger.debug(str(('model_field: ' , model_field, ', value: ', value))) if model_field == 'facility_id': try: cell = Cell.objects.get(facility_id=value) initializer['reagent'] = cell except: logger.error(str(("Cell not found", value, 'row',rows+start_row+2))) raise else: initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) cell = CellBatch(**initializer) cell.save() logger.debug(str(('cell created:', cell))) rows += 1 except Exception, e: logger.error(str(( "Invalid CellBatch initializer: ", initializer, 'row', rows+start_row+2, e))) raise
def main(path, do_precursors_only): """ Read in the Cell """ sheet_name = 'HMS-LINCS cell line metadata' sheet = iu.readtable([path, sheet_name, 1]) # allow for informational header row properties = ('model_field','required','default','converter') column_definitions = { 'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]), 'CL_Name':('name',True), 'CL_LINCS_ID':'lincs_id', 'CL_Alternate_Name':'alternative_names', 'CL_Alternate_ID':'alternative_id', 'Precursor_Cell':'precursor_facility_batch_id', 'CL_Organism':'organism', 'CL_Organ':'organ', 'CL_Tissue':'tissue', 'CL_Cell_Type':'cell_type', 'CL_Cell_Type_Detail':'cell_type_detail', 'CL_Donor_Sex': 'donor_sex', 'CL_Donor_Age': ('donor_age_years',False,None,lambda x:util.convertdata(x,int)), 'CL_Donor_Ethnicity': 'donor_ethnicity', 'CL_Donor_Health_Status': 'donor_health_status', 'CL_Disease':'disease', 'CL_Disease_Detail':'disease_detail', 'CL_Production_Details': 'production_details', 'CL_Genetic_Modification':'genetic_modification', 'CL_Known_Mutations':'mutations_known', 'CL_Mutation_Citations':'mutation_citations', 'CL_Verification_Reference_Profile':'verification_reference_profile', 'CL_Growth_Properties':'growth_properties', 'CL_Recommended_Culture_Conditions':'recommended_culture_conditions', 'CL_Relevant_Citations': 'relevant_citations', 'Usage Note': 'usage_note', 'CL_Reference_Source': 'reference_source', 'Reference Source URL': 'reference_source_url', 'Date Data Received':('date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ('date_updated',False,None,util.date_converter), 'Is Restricted':('is_restricted',False,False,util.bool_converter)} # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False) rows = 0 precursor_map = {} precursor_pattern = re.compile(r'HMSL(5\d{4})-(\d+)') for row in sheet: r = util.make_row(row) initializer = {} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] value = convertdata(value) if value is not None: if converter: try: value = converter(value) except Exception: logger.error('field parse error: %r, value: %r, row: %d', properties['column_label'],value,rows+2) raise if value is None: if default is not None: value = default if value is None and required: raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows)) logger.debug('model_field: %r, value: %r' , model_field, value) initializer[model_field] = value precursor_facility_batch_id = initializer.pop('precursor_facility_batch_id') if precursor_facility_batch_id: match = precursor_pattern.match(precursor_facility_batch_id) if not match: raise Exception('Invalid precursor pattern: needs: %s: %r, row: %d' % (precursor_pattern, initializer, rows)) precursor_map[initializer['facility_id']] = (match.group(1),match.group(2)) if not do_precursors_only: try: logger.info('initializer: %r', initializer) cell = Cell(**initializer) cell.save() logger.info(str(('cell created:', cell))) # 
create a default batch - 0 CellBatch.objects.create(reagent=cell,batch_id=0) except Exception, e: print "Invalid Cell, name: ", r[0] raise e rows += 1
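# NOTE (editor illustration): the precursor pattern above extracts the cell
# facility id and batch number from ids of the form HMSL5####-#; the value
# below is illustrative only.
_m = re.compile(r'HMSL(5\d{4})-(\d+)').match('HMSL50001-1')
assert _m and _m.groups() == ('50001', '1')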
def main(import_file, file_directory, deploy_dir): """ Read in the qc events for batches - version 1 - for small molecule batches """ sheet_name = 'Sheet1' start_row = 0 sheet = iu.readtable([import_file, sheet_name, start_row ]) # Note, skipping the header row by default properties = ('model_field', 'required', 'default', 'converter') column_definitions = { 'facility_id': ('facility_id_for', True, None, lambda x: util.convertdata(x, int)), 'salt_id': ('salt_id_for', False, None, lambda x: util.convertdata(x, int)), 'batch_id': ('batch_id_for', True, None, lambda x: util.convertdata(x, int)), 'QC event date': ('date', True, None, util.date_converter), 'outcome': ('outcome', True), 'comment': 'comment', 'is_restricted': ('is_restricted', False, False, util.bool_converter), 'file1': 'file1', 'file2': 'file2', 'file3': 'file3', 'file4': 'file4', 'file5': 'file5', } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions( properties, column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels) rows = 0 logger.debug(str(('cols: ', cols))) for row in sheet: r = util.make_row(row) # store each row in a dict _dict = {} for i, value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] logger.debug(str(('raw value', value))) if (converter != None): value = converter(value) if (value == None): if (default != None): value = default if (value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'], rows)) logger.debug( str(('model_field: ', model_field, ', value: ', value))) _dict[model_field] = value logger.debug(str(('dict: ', _dict))) files_to_attach = [] for i in range(10): filenameProp = 'file%s' % i if _dict.get(filenameProp, None): fileprop = _dict[filenameProp] filepath = os.path.join(file_directory, fileprop) if not os.path.exists(filepath): raise Exception( str(('file does not exist:', filepath, 'row', rows + start_row))) filename = os.path.basename(filepath) relative_path = fileprop[:fileprop.index(filename)] # Move the file dest_dir = deploy_dir if not dest_dir: dest_dir = settings.STATIC_AUTHENTICATED_FILE_DIR if not os.path.isdir(dest_dir): raise Exception( str(('no such deploy directory, please create it', dest_dir))) if relative_path: dest_dir = os.path.join(dest_dir, relative_path) if not os.path.exists(dest_dir): os.makedirs(dest_dir) deployed_path = os.path.join(dest_dir, filename) logger.debug(str(('deploy', filepath, deployed_path))) if os.path.exists(deployed_path): os.remove(deployed_path) copy(filepath, deployed_path) if not os.path.isfile(deployed_path): raise Exception(str( ('could not deploy to', deployed_path))) else: logger.debug( str(('successfully deployed to', deployed_path))) files_to_attach.append((filename, relative_path)) initializer = None try: # create the qc record initializer = { key: _dict[key] for key in [ 'facility_id_for', 'salt_id_for', 'batch_id_for', 'outcome', 'comment', 'date' ] } qc_event = QCEvent(**initializer) qc_event.save() logger.debug(str(('saved', qc_event))) # create attached file records for (filename, relative_path) in files_to_attach: initializer = { 'qc_event': qc_event, 'filename': filename, 
'relative_path': relative_path, 'is_restricted': _dict['is_restricted'] } qc_attached_file = QCAttachedFile(**initializer) qc_attached_file.save() logger.debug( str(('created qc attached file', qc_attached_file))) rows += 1 except Exception, e: logger.error( str(("Invalid initializer: ", initializer, 'row', rows + start_row + 2, e))) raise
def main(path): """ Read in the smallmolecule batch info """ sheet_name = 'sheet 1' start_row = 1 sheet = iu.readtable([path, sheet_name, start_row ]) # Note, skipping the header row by default properties = ('model_field', 'required', 'default', 'converter') column_definitions = { # NOTE: even though these db field are not integers, # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values 'facility_id': ('facility_id', True, None, lambda x: util.convertdata(x, int)), 'salt_id': ('salt_id', True, None, lambda x: util.convertdata(x, int)), 'facility_batch_id': ('batch_id', True, None, lambda x: util.convertdata(x, int)), 'provider': ('provider_name', True), 'provider_catalog_id': 'provider_catalog_id', 'provider_sample_id': 'provider_batch_id', 'chemical_synthesis_reference': 'chemical_synthesis_reference', 'purity': 'purity', 'purity_method': 'purity_method', 'aqueous_solubility': 'aqueous_solubility', # FIXME: should warn the user if no unit is provided when # aqueous_solubility is provided 'aqueous_solubility_unit': 'aqueous_solubility_unit', 'Date Data Received': ('date_data_received', False, None, util.date_converter), 'Date Loaded': ('date_loaded', False, None, util.date_converter), 'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter), 'Most Recent Update': ('date_updated', False, None, util.date_converter), } # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions( properties, column_definitions) # create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False) rows = 0 logger.debug(str(('cols: ', cols))) for row in sheet: r = util.make_row(row) initializer = {} small_molecule_lookup = {'facility_id': None, 'salt_id': None} for i, value in enumerate(r): if i not in cols: continue properties = cols[i] logger.debug(str(('read col: ', i, ', ', properties))) required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] # Todo, refactor to a method logger.debug(str(('raw value', value))) if (converter != None): value = converter(value) if (value == None): if (default != None): value = default if (value == None and required == True): raise Exception('Field is required: %s, record: %d' % (properties['column_label'], rows)) logger.debug( str(('model_field: ', model_field, ', value: ', value))) if (model_field in small_molecule_lookup): small_molecule_lookup[model_field] = value if (None not in small_molecule_lookup.values()): try: sm = SmallMolecule.objects.get(**small_molecule_lookup) initializer['reagent'] = sm except Exception, e: logger.error( str(('sm identifiers not found', small_molecule_lookup, 'row', rows + start_row + 2))) raise else: initializer[model_field] = value try: logger.debug(str(('initializer: ', initializer))) smb = SmallMoleculeBatch(**initializer) smb.save() logger.debug(str(('smb created:', smb))) rows += 1 except Exception, e: logger.error( str(("Invalid smallmolecule batch initializer: ", initializer, 'row', rows + start_row + 2, e))) raise