def read_metadata(path):
    """
    Read the 'Meta' worksheet of the workbook at ``path`` into a dict.

    Every row of the sheet is treated as a (label, value) pair: the first
    cell is matched against the known labels and the second cell is stored
    under the corresponding model field name.  The 'Facility ID' value is
    converted with ``util.convertdata(..., int)``; all other values are
    stored as read.

    Returns a dict mapping model field name -> value.

    Raises AssertionError when the number of fields read differs from the
    number of expected labels.
    """
    sheetname = 'Meta'
    # Define the Column Names -> model fields mapping
    labels = {'Lead Screener First': 'lead_screener_firstname',
              'Lead Screener Last': 'lead_screener_lastname',
              'Lead Screener Email': 'lead_screener_email',
              'Lab Head First': 'lab_head_firstname',
              'Lab Head Last': 'lab_head_lastname',
              'Lab Head Email': 'lab_head_email',
              'Title': 'title',
              'Facility ID': 'facility_id',
              'Summary': 'summary',
              'Protocol': 'protocol',
              'References': 'protocol_references'}

    # Note, skipping the header row by default
    metaSheet = iu.readtable([path, sheetname])

    metaData = {}
    for row in metaSheet:
        rowAsUnicode = util.make_row(row)
        for key, value in labels.items():
            # each label is used as a regex anchored at the start of the
            # first cell, matched case-insensitively
            if re.match(key, rowAsUnicode[0], re.M | re.I):
                if key == 'Facility ID':
                    metaData[value] = util.convertdata(rowAsUnicode[1], int)
                else:
                    metaData[value] = rowAsUnicode[1]

    # The message arguments must be a tuple: formatting with a list supplies
    # a single argument for two placeholders and raises TypeError instead of
    # reporting the missing keys.
    assert len(metaData) == len(labels), (
        'Meta data sheet does not contain the necessary keys, '
        'expected: %s, read: %s' % (labels, metaData))
    return metaData
def main(path): """ Read in the Protein """ sheet_name = 'HMS-LINCS Kinases' labels = { 'PP_Name': 'name', 'PP_LINCS_ID': 'lincs_id', 'PP_UniProt_ID': 'uniprot_id', 'PP_Alternate_Name': 'alternate_name', 'PP_Provider': 'provider', 'PP_Provider_Catalog_ID': 'provider_catalog_id', 'PP_Batch_ID': 'batch_id', 'PP_Amino_Acid_Sequence': 'amino_acid_sequence', 'PP_Gene_Symbol': 'gene_symbol', 'PP_Gene_ID': 'gene_id', 'PP_Protein_Source': 'protein_source', 'PP_Protein_Form': 'protein_form', 'PP_Protein_Purity': 'protein_purity', 'PP_Protein_Complex': 'protein_complex', 'PP_Isoform': 'isoform', 'PP_Protein_Type': 'protein_type', 'PP_Source_Organism': 'source_organism', 'PP_Reference': 'reference' } converters = {'lincs_id': lambda x: x[x.index('HMSL') + 4:]} sheet = iu.readtable([path, sheet_name, 1]) # Note, skipping the header row by default cols = {} # first put the label row in (it contains the worksheet column, and its unique) print 'labels: ', sheet.labels for i, label in enumerate(sheet.labels): if label in labels: cols[i] = labels[label] else: print 'Note: column label not found:', label #raise rows = 0 i = 0 print 'cols: ', cols proteins = {} for row in sheet: r = util.make_row(row) dict = {} for i, value in enumerate(r): if i not in cols: continue if cols[i] in converters: value = converters[cols[i]](value) dict[cols[i]] = value try: protein = Protein(**dict) protein.save() rows += 1 except Exception, e: print "Invalid Protein, name: ", r[0] raise
def main(path): """ Read in the Library and LibraryMapping sheets """ libraries = readLibraries(path, 'Library') labels = { 'Facility': 'facility_id', 'Salt': 'sm_salt', 'Batch': 'facility_batch_id', 'Plate': 'plate', 'Well': 'well', 'Library Name': 'short_name', 'Concentration': 'concentration', 'Concentration Unit': 'concentration_unit' } small_molecule_lookup = ('facility_id', 'sm_salt', 'facility_batch_id') sheet = iu.readtable([path, 'LibraryMapping']) #dict to map spreadsheet fields to terms cols = {} # first put the label row in (it contains the worksheet column, and its unique) for i, label in enumerate(sheet.labels): if label in labels: cols[labels[label]] = i else: print 'Note: column label not found:', label rows = 0 for row in sheet: r = util.make_row(row) # small molecule dict = {} for field in small_molecule_lookup: dict[field] = util.convertdata(r[cols[field]], int) try: dict['facility_id'] = 'HMSL' + str( dict['facility_id'] ) # TODO: convert all hmsl id's to integers!! sm = SmallMolecule.objects.get(**dict) except Exception, e: print "Invalid small molecule identifiers: ", dict raise short_name = r[cols['short_name']] if short_name not in libraries: print "Library not found: ", short_name raise lm = {} lm['concentration'] = util.convertdata(r[cols['concentration']], float) lm['concentration_unit'] = util.convertdata( r[cols['concentration_unit']], None) lm['plate'] = util.convertdata(r[cols['plate']], int) lm['well'] = r[cols['well']] lm['small_molecule'] = sm lm['library'] = libraries[short_name] lm = LibraryMapping(**lm) lm.save() rows += 1
def readLibraries(path, sheetName):
    """
    Create a Library record for every row of the ``sheetName`` worksheet.

    Column labels are mapped to Library fields through
    ``column_definitions``; a definition may mark the field as required and
    supply a default value and a converter.  For every cell the converter
    (if any) is applied, an empty value falls back to the default, and a
    required field that is still empty aborts the import.

    Returns a dict mapping each saved library's ``short_name`` to the
    Library instance.

    Raises Exception when a required field has no value; any error raised
    while constructing or saving a Library is logged and re-raised.
    """
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheetName])

    # dict to map spreadsheet fields to the Library fields
    properties = ('model_field', 'required', 'default', 'converter')
    date_parser = lambda x: util.convertdata(x, date)
    column_definitions = {
        'Name': ('name', True),  # TODO use the model to determine if req'd
        'ShortName': ('short_name', True),
        'Library Type': 'type',
        'Date First Plated': ('date_first_plated', False, None, date_parser),
        'Date Data Received': ('date_data_received', False, None,
                               date_parser),
        'Date Loaded': ('date_loaded', False, None, date_parser),
        'Date Publicly Available': ('date_publicly_available', False, None,
                                    date_parser),
        'Most Recent Update': ('date_updated', False, None,
                               util.date_converter),
        'Is Restricted': ('is_restricted', False, False)}

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    libraries = {}
    for row in sheet:
        logger.debug(str(('row raw: ', row)))
        r = util.make_row(row)
        logger.debug(str(('row: ', r)))
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            definition = cols[i]

            logger.debug(str(('read col: ', i, ', ', definition)))
            required = definition['required']
            default = definition['default']
            converter = definition['converter']
            model_field = definition['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (definition['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            library = Library(**initializer)
            library.save()
            logger.info(str(('library created', library)))
            libraries[library.short_name] = library
            rows += 1
        except Exception:
            logger.error(
                str(('library initializer problem: ', initializer)))
            # bare raise keeps the original traceback (``raise e`` would
            # reset it to this line in Python 2)
            raise

    # Callers index the result by short name, so hand the mapping back.
    return libraries
def main(path): """ Read in the Protein """ sheet_name = 'HMS-LINCS Kinases' labels = { 'PP_Name':'name', 'PP_LINCS_ID':'lincs_id', 'PP_UniProt_ID':'uniprot_id', 'PP_Alternate_Name':'alternate_name', 'PP_Provider':'provider', 'PP_Provider_Catalog_ID':'provider_catalog_id', 'PP_Batch_ID':'batch_id', 'PP_Amino_Acid_Sequence':'amino_acid_sequence', 'PP_Gene_Symbol':'gene_symbol', 'PP_Gene_ID':'gene_id', 'PP_Protein_Source':'protein_source', 'PP_Protein_Form':'protein_form', 'PP_Protein_Purity':'protein_purity', 'PP_Protein_Complex':'protein_complex', 'PP_Isoform':'isoform', 'PP_Protein_Type':'protein_type', 'PP_Source_Organism':'source_organism', 'PP_Reference':'reference'} converters = { 'lincs_id': lambda x: x[x.index('HMSL')+4:] } sheet = iu.readtable([path, sheet_name, 1]) # Note, skipping the header row by default cols = {} # first put the label row in (it contains the worksheet column, and its unique) print 'labels: ', sheet.labels for i,label in enumerate(sheet.labels): if label in labels: cols[i] = labels[label] else: print 'Note: column label not found:', label #raise rows = 0 i = 0 print 'cols: ' , cols proteins = {} for row in sheet: r = util.make_row(row) dict = {} for i,value in enumerate(r): if i not in cols: continue if cols[i] in converters: value = converters[cols[i]](value) dict[cols[i]]= value try: protein = Protein(**dict) protein.save() rows += 1 except Exception, e: print "Invalid Protein, name: ", r[0] raise
def main(path): """ Read in the Library and LibraryMapping sheets """ libraries = readLibraries(path,'Library') labels = { 'Facility':'facility_id', 'Salt':'sm_salt', 'Batch':'facility_batch_id', 'Plate':'plate', 'Well':'well', 'Library Name':'short_name', 'Concentration': 'concentration', 'Concentration Unit':'concentration_unit' } small_molecule_lookup = ('facility_id', 'sm_salt', 'facility_batch_id') sheet = iu.readtable([path, 'LibraryMapping']) #dict to map spreadsheet fields to terms cols = {} # first put the label row in (it contains the worksheet column, and its unique) for i,label in enumerate(sheet.labels): if label in labels: cols[labels[label]] = i else: print 'Note: column label not found:', label rows = 0 for row in sheet: r = util.make_row(row) # small molecule dict = {} for field in small_molecule_lookup: dict[field] = util.convertdata(r[cols[field]],int) try: dict['facility_id'] = 'HMSL' + str(dict['facility_id']) # TODO: convert all hmsl id's to integers!! sm = SmallMolecule.objects.get(**dict) except Exception, e: print "Invalid small molecule identifiers: ", dict raise short_name = r[cols['short_name']] if short_name not in libraries: print "Library not found: ", short_name raise lm = {} lm['concentration'] = util.convertdata(r[cols['concentration']],float) lm['concentration_unit'] = util.convertdata(r[cols['concentration_unit']],None) lm['plate'] = util.convertdata(r[cols['plate']], int) lm['well'] = r[cols['well']] lm['small_molecule'] = sm lm['library'] = libraries[short_name] lm = LibraryMapping(**lm) lm.save() rows += 1
def readLibraries(path, sheetName): sheet = iu.readtable([path, sheetName ]) # Note, skipping the header row by default # dict to map spreadsheet fields to the Library fields labels = { 'Name': 'name', 'ShortName': 'short_name', 'Date First Plated': 'date_first_plated', 'Date Data Received': 'date_data_received', 'Date Loaded': 'date_loaded', 'Date Publicly Available': 'date_publicly_available' } date_parser = lambda x: util.convertdata(x, date) converters = { 'date_first_plated': date_parser, 'date_loaded': date_parser, 'date_data_recieved': date_parser, 'date_publicly_available': date_parser } cols = {} # first put the label row in (it contains the worksheet column, and its unique) for i, label in enumerate(sheet.labels): if label in labels: cols[i] = labels[label] else: print 'Note: column label not found:', label raise rows = 0 i = 0 libraries = {} for row in sheet: r = util.make_row(row) dict = {} for i, value in enumerate(r): if cols[i] in converters: value = converters[cols[i]](value) dict[cols[i]] = value try: print 'create library:', dict library = Library(**dict) library.save() libraries[library.short_name] = library rows += 1 except Exception, e: print "Invalid Library, name: ", r[0] raise
def readLibraries(path, sheetName): sheet = iu.readtable([path, sheetName]) # Note, skipping the header row by default # dict to map spreadsheet fields to the Library fields labels = { 'Name': 'name', 'ShortName': 'short_name', 'Date First Plated': 'date_first_plated', 'Date Data Received':'date_data_received', 'Date Loaded': 'date_loaded', 'Date Publicly Available': 'date_publicly_available' } date_parser = lambda x : util.convertdata(x,date) converters = {'date_first_plated': date_parser, 'date_loaded': date_parser, 'date_data_recieved': date_parser, 'date_publicly_available': date_parser } cols = {} # first put the label row in (it contains the worksheet column, and its unique) for i,label in enumerate(sheet.labels): if label in labels: cols[i] = labels[label] else: print 'Note: column label not found:', label raise rows = 0 i = 0 libraries = {} for row in sheet: r = util.make_row(row) dict = {} for i,value in enumerate(r): if cols[i] in converters: value = converters[cols[i]](value) dict[cols[i]]= value try: print 'create library:', dict library = Library(**dict) library.save() libraries[library.short_name] = library rows += 1 except Exception, e: print "Invalid Library, name: ", r[0] raise
def readDataColumns(path):
    """
    Read the 'Data Columns' worksheet and return one definition dict per
    data column.

    The sheet is laid out with one DataColumn per worksheet column: the
    label row (minus its first entry) supplies each definition's
    'worksheet_column', and every following row supplies one property,
    named by the row's first cell, for all of the columns.  Values are
    converted with ``util.convertdata`` to the type reported by
    ``iu.totype`` for the matching DataColumn field, when there is one.

    Returns a list of dicts, in worksheet column order.
    """
    sheet = iu.readtable([path, 'Data Columns'])

    # model field name -> conversion type
    type_for_field = {}
    for model_field in util.get_fields(DataColumn):
        type_for_field[model_field.name] = iu.totype(model_field)

    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    labels = {'Worksheet Column': 'worksheet_column',
              'Display Order': 'display_order',
              'Name': 'name',
              'Data Type': 'data_type',
              'Decimal Places': 'precision',
              'Description': 'description',
              'Replicate Number': 'replicate',
              'Time point': 'time_point',
              'Assay readout type': 'readout_type',
              'Comments': 'comments'}

    # seed one dict per data column with its (unique) worksheet column label
    definitions = [{labels['Worksheet Column']: heading}
                   for heading in sheet.labels[1:]]

    match_flags = re.M | re.I
    for row in sheet:
        cells = util.make_row(row)
        row_label = cells[0]
        for position, cell_text in enumerate(cells[1:]):
            for label_pattern, field_name in labels.items():
                if not re.match(label_pattern, row_label, match_flags):
                    logger.debug(str(
                        ('"Data Column definition not used: ', cell_text)))
                    continue
                # the row describes this DataColumn field: store the cell,
                # converted to the model field type
                definitions[position][field_name] = util.convertdata(
                    cell_text, type_for_field.get(field_name))

    logger.debug(str(("definitions: ", definitions)))
    return definitions
def readDataColumns(path): # Read in the DataColumn Sheet sheetname = 'Data Columns' dataColumnSheet = iu.readtable([path, sheetname]) _fields = util.get_fields(DataColumn) _typelookup = dict((f.name, iu.totype(f)) for f in _fields) labels = {'Worksheet Column':'worksheet_column', 'Name':'name', 'Data Type':'data_type', 'Decimal Places':'precision', 'Description':'description', 'Replicate Number':'replicate', 'Time point':'time_point', 'Assay readout type':'readout_type', 'Comments':'comments'} # create an array of dict's, each dict defines a DataColumn dataColumnDefinitions = [] # first put the label row in (it contains the worksheet column, and its unique) for v in dataColumnSheet.labels[1:]: dataColumnDefinitions.append({labels['Worksheet Column']:v}) # now, for each row, create the appropriate dictionary entry in the dataColumnDefinitions for row in dataColumnSheet: rowAsUnicode = util.make_row(row) keyRead = rowAsUnicode[0] for i,cellText in enumerate(rowAsUnicode[1:]): for key,fieldName in labels.items(): if re.match(key,keyRead,re.M|re.I): # if the row is one of the DataColumn fields, then add it to the dict dataColumnDefinitions[i][fieldName] = util.convertdata(cellText,_typelookup.get(fieldName, None)) # Note: convert the data to the model field type else: pass # print '"Data Column definition not used: ', cellText print "definitions: ", dataColumnDefinitions return dataColumnDefinitions
def main(path, do_precursors_only): """ Read in the Cell """ sheet_name = 'HMS-LINCS cell line metadata' sheet = iu.readtable([path, sheet_name, 1]) # allow for informational header row properties = ('model_field','required','default','converter') column_definitions = { 'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]), 'CL_Name':('name',True), 'CL_LINCS_ID':'lincs_id', 'CL_Alternate_Name':'alternative_names', 'CL_Alternate_ID':'alternative_id', 'Precursor_Cell':'precursor_facility_batch_id', 'CL_Organism':'organism', 'CL_Organ':'organ', 'CL_Tissue':'tissue', 'CL_Cell_Type':'cell_type', 'CL_Cell_Type_Detail':'cell_type_detail', 'CL_Donor_Sex': 'donor_sex', 'CL_Donor_Age': ('donor_age_years',False,None,lambda x:util.convertdata(x,int)), 'CL_Donor_Ethnicity': 'donor_ethnicity', 'CL_Donor_Health_Status': 'donor_health_status', 'CL_Disease':'disease', 'CL_Disease_Detail':'disease_detail', 'CL_Production_Details': 'production_details', 'CL_Genetic_Modification':'genetic_modification', 'CL_Known_Mutations':'mutations_known', 'CL_Mutation_Citations':'mutation_citations', 'CL_Verification_Reference_Profile':'verification_reference_profile', 'CL_Growth_Properties':'growth_properties', 'CL_Recommended_Culture_Conditions':'recommended_culture_conditions', 'CL_Relevant_Citations': 'relevant_citations', 'Usage Note': 'usage_note', 'CL_Reference_Source': 'reference_source', 'Reference Source URL': 'reference_source_url', 'Date Data Received':('date_data_received',False,None,util.date_converter), 'Date Loaded': ('date_loaded',False,None,util.date_converter), 'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter), 'Most Recent Update': ('date_updated',False,None,util.date_converter), 'Is Restricted':('is_restricted',False,False,util.bool_converter)} # convert the labels to fleshed out dict's, with strategies for optional, default and converter column_definitions = util.fill_in_column_definitions(properties,column_definitions) # 
create a dict mapping the column ordinal to the proper column definition dict cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False) rows = 0 precursor_map = {} precursor_pattern = re.compile(r'HMSL(5\d{4})-(\d+)') for row in sheet: r = util.make_row(row) initializer = {} for i,value in enumerate(r): if i not in cols: continue properties = cols[i] required = properties['required'] default = properties['default'] converter = properties['converter'] model_field = properties['model_field'] value = convertdata(value) if value is not None: if converter: try: value = converter(value) except Exception: logger.error('field parse error: %r, value: %r, row: %d', properties['column_label'],value,rows+2) raise if value is None: if default is not None: value = default if value is None and required: raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows)) logger.debug('model_field: %r, value: %r' , model_field, value) initializer[model_field] = value precursor_facility_batch_id = initializer.pop('precursor_facility_batch_id') if precursor_facility_batch_id: match = precursor_pattern.match(precursor_facility_batch_id) if not match: raise Exception('Invalid precursor pattern: needs: %s: %r, row: %d' % (precursor_pattern, initializer, rows)) precursor_map[initializer['facility_id']] = (match.group(1),match.group(2)) if not do_precursors_only: try: logger.info('initializer: %r', initializer) cell = Cell(**initializer) cell.save() logger.info(str(('cell created:', cell))) # create a default batch - 0 CellBatch.objects.create(reagent=cell,batch_id=0) except Exception, e: print "Invalid Cell, name: ", r[0] raise e rows += 1
def read_metadata(path):
    """
    Read the 'Meta' worksheet of ``path`` into a model initializer dict.

    The sheet is row-oriented: the first cell of every row is a field label
    and the second cell is its value.  The labels are resolved against
    ``field_definitions`` (model field, required flag, default, converter)
    and each value is converted, defaulted and checked accordingly.

    Returns a dict mapping model field name -> value.

    Raises Exception when a required field has no value.
    """
    sheetname = 'Meta'
    # Note, skipping the header row by default
    metaSheet = iu.readtable([path, sheetname])

    # Define the Column Names -> model fields mapping
    properties = ('model_field', 'required', 'default', 'converter')
    field_definitions = {
        'Lead Screener First': 'lead_screener_firstname',
        'Lead Screener Last': 'lead_screener_lastname',
        'Lead Screener Email': 'lead_screener_email',
        'Lab Head First': 'lab_head_firstname',
        'Lab Head Last': 'lab_head_lastname',
        'Lab Head Email': 'lab_head_email',
        'Title': 'title',
        'Facility ID': ('facility_id', True, None,
                        lambda x: util.convertdata(x, int)),
        'Summary': 'summary',
        'Protocol': 'protocol',
        'References': 'protocol_references',
        'Date Data Received': ('date_data_received', False, None,
                               util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None,
                                    util.date_converter),
        'Most Recent Update': ('date_updated', False, None,
                               util.date_converter),
        'Is Restricted': ('is_restricted', False, False,
                          util.bool_converter),
        'Dataset Type': ('dataset_type', False),
        'Bioassay': ('bioassay', False),
        'Dataset Keywords': ('dataset_keywords', False),
        'Usage Message': ('usage_message', False),
    }

    # the labels live in the first cell of each row, not in a header row
    sheet_labels = []
    for row in metaSheet:
        rowAsUnicode = util.make_row(row)
        sheet_labels.append(rowAsUnicode[0])

    # convert the definitions to fleshed out dict's, with strategies for
    # optional, default and converter
    field_definitions = util.fill_in_column_definitions(
        properties, field_definitions)

    # create a dict mapping the column/row ordinal to the proper definition
    # dict
    cols = util.find_columns(field_definitions, sheet_labels,
                             all_column_definitions_required=False)

    initializer = {}
    for i, row in enumerate(metaSheet):
        rowAsUnicode = util.make_row(row)
        definition = cols[i]
        value = rowAsUnicode[1]

        logger.debug(str(('read col: ', i, ', ', definition)))
        required = definition['required']
        default = definition['default']
        converter = definition['converter']
        model_field = definition['model_field']

        # Todo, refactor to a method
        logger.debug(str(('raw value', value)))
        if converter is not None:
            value = converter(value)
        if value is None and default is not None:
            value = default
        if value is None and required:
            # report the row ordinal: formatting the row object itself with
            # %d raises TypeError and hides the real problem
            raise Exception('Field is required: %s, record: %d'
                            % (definition['column_label'], i))
        logger.debug(str(('model_field: ', model_field, ', value: ', value)))
        initializer[model_field] = value

    return initializer
def main(path):
    """
    Read in the smallmolecule batch info from worksheet 'sheet 1'.

    For every row the cells are converted according to the column
    definitions.  The facility_id and salt_id values are not stored on the
    batch directly: once both are known they are used to look up the
    SmallMolecule, which is stored as the batch's ``reagent``.  All other
    fields go straight into the SmallMoleculeBatch initializer, and the
    batch is created and saved.

    Raises Exception when a required field is empty; lookup and save errors
    are logged with the worksheet row number and re-raised.
    """
    sheet_name = 'sheet 1'
    start_row = 1
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, start_row])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # NOTE: even though these db field are not integers,
        # it is convenient to convert the read in values to INT to make sure
        # they are not interpreted as float values
        'facility_id': ('facility_id', True, None,
                        lambda x: util.convertdata(x, int)),
        'salt_id': ('salt_id', True, None,
                    lambda x: util.convertdata(x, int)),
        'facility_batch_id': ('batch_id', True, None,
                              lambda x: util.convertdata(x, int)),
        'provider': ('provider_name', True),
        'provider_catalog_id': 'provider_catalog_id',
        'provider_sample_id': 'provider_batch_id',
        'chemical_synthesis_reference': 'chemical_synthesis_reference',
        'purity': 'purity',
        'purity_method': 'purity_method',
        'aqueous_solubility': 'aqueous_solubility',
        # FIXME: should warn the user if no unit is provided when
        # aqueous_solubility is provided
        'aqueous_solubility_unit': 'aqueous_solubility_unit',
        'Date Data Received': ('date_data_received', False, None,
                               util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None,
                                    util.date_converter),
        'Most Recent Update': ('date_updated', False, None,
                               util.date_converter),
    }

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels,
                             all_sheet_columns_required=False)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            definition = cols[i]

            logger.debug(str(('read col: ', i, ', ', definition)))
            required = definition['required']
            default = definition['default']
            converter = definition['converter']
            model_field = definition['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (definition['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))

            if model_field in small_molecule_lookup:
                # identifier columns only feed the SmallMolecule lookup;
                # resolve it as soon as every identifier has a value
                small_molecule_lookup[model_field] = value
                if None not in small_molecule_lookup.values():
                    try:
                        sm = SmallMolecule.objects.get(
                            **small_molecule_lookup)
                        initializer['reagent'] = sm
                    except Exception:
                        logger.error(str(
                            ('sm identifiers not found',
                             small_molecule_lookup,
                             'row', rows + start_row + 2)))
                        raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            smb = SmallMoleculeBatch(**initializer)
            smb.save()
            logger.debug(str(('smb created:', smb)))
            rows += 1
        except Exception as e:
            logger.error(str(
                ("Invalid smallmolecule batch initializer: ", initializer,
                 'row', rows + start_row + 2, e)))
            raise
def main(path):
    """
    Read in the Protein records from the 'HMS-LINCS Kinases' worksheet.

    Each row is converted into a Protein initializer using the column
    definitions (required flag, default and converter per column).  The
    Protein is saved with ``lincs_id`` set to its ``facility_id``, and a
    default ProteinBatch (batch_id 0) is created for it.

    Raises Exception when a required field is empty; errors raised while
    creating the Protein or its batch are logged and re-raised.
    """
    sheet_name = 'HMS-LINCS Kinases'
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'PP_Name': ('name', True),
        'PP_LINCS_ID': ('facility_id', True, None,
                        lambda x: x[x.index('HMSL') + 4:]),
        'PP_UniProt_ID': 'uniprot_id',
        'PP_Alternate_Name': 'alternative_names',
        'PP_Alternate_Name[2]': 'alternate_name_2',
        'PP_Provider': 'provider',
        'PP_Provider_Catalog_ID': 'provider_catalog_id',
        'PP_Batch_ID': 'batch_id',
        'PP_Amino_Acid_Sequence': 'amino_acid_sequence',
        'PP_Gene_Symbol': 'gene_symbol',
        'PP_Gene_ID': 'gene_id',
        'PP_Protein_Source': 'protein_source',
        'PP_Protein_Form': 'protein_form',
        'PP_Mutation': 'mutation',
        'PP_Phosphorylation_State': 'phosphlorylation',
        'PP_Domain': 'protein_domain',
        'PP_Protein_Purity': 'protein_purity',
        'PP_Protein_Complex': 'protein_complex',
        'PP_Isoform': 'isoform',
        'PP_Protein_Type': 'protein_type',
        'PP_Source_Organism': 'source_organism',
        'PP_Reference': 'reference',
        'Date Data Received': ('date_data_received', False, None,
                               util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None,
                                    util.date_converter),
        'Most Recent Update': ('date_updated', False, None,
                               util.date_converter),
        'Is Restricted': ('is_restricted', False, False)}

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            definition = cols[i]

            logger.debug(str(('read col: ', i, ', ', definition)))
            required = definition['required']
            default = definition['default']
            converter = definition['converter']
            model_field = definition['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (definition['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            protein = Protein(**initializer)
            # FIXME: LINCS IDS for Protein
            protein.lincs_id = protein.facility_id
            protein.save()
            logger.info(str(('protein created: ', protein)))
            rows += 1

            # create a default batch - 0
            ProteinBatch.objects.create(reagent=protein, batch_id=0)
        except Exception as e:
            logger.error(
                str(("Invalid protein initializer: ", initializer, e)))
            raise
def main(path):
    """
    Read in the Library and LibraryMapping sheets.

    Libraries are loaded through ``readLibraries(path, 'Library')``.  Each
    row of the 'LibraryMapping' sheet is then converted according to the
    column definitions; the facility/salt identifiers are resolved to a
    SmallMolecule (stored as 'smallmolecule') and the library short name is
    resolved to the loaded Library (stored as 'library').

    Raises Exception when a required field is empty, when the
    SmallMolecule lookup fails, or when the library short name is unknown.
    Each message carries the worksheet row number.
    """
    libraries = readLibraries(path, 'Library')

    sheet = iu.readtable([path, 'LibraryMapping'])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'Facility': ('facility_id', False, None,
                     lambda x: util.convertdata(x, int)),
        'Salt': ('salt_id', False, None,
                 lambda x: util.convertdata(x, int)),
        'Batch': ('facility_batch_id', False, None,
                  lambda x: util.convertdata(x, int)),
        'Is Control': ('is_control', False, False, util.bool_converter),
        'Plate': ('plate', False, None,
                  lambda x: util.convertdata(x, int)),
        'Well': 'well',
        'Library Name': 'short_name',
        'Concentration': 'concentration',
        'Concentration Unit': 'concentration_unit'
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    # NOTE(review): these two tuples are not referenced in the visible part
    # of this function, and the per-row initializer built below is not
    # persisted here -- confirm whether the saving step is missing.
    small_molecule_batch_lookup = ('smallmolecule', 'facility_batch_id')
    library_mapping_lookup = ('smallmolecule_batch', 'library', 'is_control',
                              'plate', 'well', 'concentration',
                              'concentration_unit')

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        current_row = rows + 2
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            definition = cols[i]

            logger.debug(str(('read col: ', i, ', ', definition)))
            required = definition['required']
            default = definition['default']
            converter = definition['converter']
            model_field = definition['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                # two placeholders, two arguments: the stray 'row' argument
                # used to make this formatting itself raise TypeError
                raise Exception('Field is required: %s, record: %d'
                                % (definition['column_label'], current_row))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value

            if model_field in small_molecule_lookup:
                # resolve the SmallMolecule once every identifier is known
                small_molecule_lookup[model_field] = value
                if None not in small_molecule_lookup.values():
                    try:
                        sm = SmallMolecule.objects.get(
                            **small_molecule_lookup)
                        initializer['smallmolecule'] = sm
                    except Exception as e:
                        raise Exception(str(
                            ('sm facility id not found',
                             small_molecule_lookup, e, 'row', current_row)))
            elif model_field == 'short_name':
                try:
                    library = libraries[value]
                    initializer['library'] = library
                except Exception as e:
                    raise Exception(str(
                        ('library short_name not found', value, e,
                         'row', current_row)))

        # advance the counter so current_row tracks the row being processed
        # (it was never incremented, pinning every message to row 2)
        rows += 1
def _deploy_qc_file(fileprop, file_directory, deploy_dir, row_number):
    """
    Copy one QC attachment into the deploy directory.

    `fileprop` is the path given in the sheet, relative to `file_directory`.
    When `deploy_dir` is empty, settings.STATIC_AUTHENTICATED_FILE_DIR is
    used.  Any directory part of `fileprop` is recreated below the deploy
    directory, and an already deployed file of the same name is replaced.

    Returns:
        (filename, relative_path) -- the basename and the leading directory
        part of `fileprop` (separator included).

    Raises:
        Exception: if the source file or the deploy directory does not exist,
            or if the file is not present after copying.
    """
    filepath = os.path.join(file_directory, fileprop)
    if not os.path.exists(filepath):
        raise Exception(str(('file does not exist:', filepath, 'row', row_number)))
    filename = os.path.basename(filepath)
    # rindex, not index: the basename is the trailing component, and the same
    # text may also occur earlier in a directory name
    relative_path = fileprop[:fileprop.rindex(filename)]

    dest_dir = deploy_dir
    if not dest_dir:
        dest_dir = settings.STATIC_AUTHENTICATED_FILE_DIR
    if not os.path.isdir(dest_dir):
        raise Exception(str(('no such deploy directory, please create it', dest_dir)))
    if relative_path:
        dest_dir = os.path.join(dest_dir, relative_path)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

    deployed_path = os.path.join(dest_dir, filename)
    logger.debug(str(('deploy', filepath, deployed_path)))
    if os.path.exists(deployed_path):
        os.remove(deployed_path)
    copy(filepath, deployed_path)
    if not os.path.isfile(deployed_path):
        raise Exception(str(('could not deploy to', deployed_path)))
    logger.debug(str(('successfully deployed to', deployed_path)))
    return filename, relative_path


def main(import_file, file_directory, deploy_dir):
    """
    Read in the qc events for batches
    - version 1 - for small molecule batches

    For every row of 'Sheet1' a QCEvent is saved; each non-empty 'fileN'
    column is copied from `file_directory` into `deploy_dir` and recorded as
    a QCAttachedFile of that event.

    Raises:
        Exception: if a required cell is empty or a file cannot be deployed;
            errors raised while saving are logged and re-raised.
    """
    sheet_name = 'Sheet1'
    start_row = 0
    sheet = iu.readtable([import_file, sheet_name, start_row])  # Note, skipping the header row by default

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'facility_id': ('facility_id_for', True, None, lambda x: util.convertdata(x, int)),
        'salt_id': ('salt_id_for', False, None, lambda x: util.convertdata(x, int)),
        'batch_id': ('batch_id_for', True, None, lambda x: util.convertdata(x, int)),
        'QC event date': ('date', True, None, util.date_converter),
        'outcome': ('outcome', True),
        'comment': 'comment',
        'is_restricted': ('is_restricted', False, False, util.bool_converter),
        'file1': 'file1',
        'file2': 'file2',
        'file3': 'file3',
        'file4': 'file4',
        'file5': 'file5',
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        # one row number for every message about this row
        row_number = rows + start_row + 2
        r = util.make_row(row)
        # store each row in a dict
        _dict = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug(str(('read col: ', i, ', ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], rows))
            logger.debug(str(('model_field: ', model_field, ', value: ', value)))
            _dict[model_field] = value

        logger.debug(str(('dict: ', _dict)))

        files_to_attach = []
        for i in range(10):
            fileprop = _dict.get('file%s' % i, None)
            if fileprop:
                files_to_attach.append(
                    _deploy_qc_file(fileprop, file_directory, deploy_dir, row_number))

        initializer = None
        try:
            # create the qc record
            initializer = {key: _dict[key] for key in
                           ['facility_id_for', 'salt_id_for', 'batch_id_for',
                            'outcome', 'comment', 'date']}
            qc_event = QCEvent(**initializer)
            qc_event.save()
            logger.debug(str(('saved', qc_event)))

            # create attached file records
            for (filename, relative_path) in files_to_attach:
                initializer = {
                    'qc_event': qc_event,
                    'filename': filename,
                    'relative_path': relative_path,
                    'is_restricted': _dict['is_restricted'],
                }
                qc_attached_file = QCAttachedFile(**initializer)
                qc_attached_file.save()
                logger.debug(str(('created qc attached file', qc_attached_file)))
            rows += 1
        except Exception as e:
            logger.error(str(("Invalid initializer: ", initializer, 'row', row_number, e)))
            raise
def main(path):
    """
    Read in the primary cell batch info.

    Each row of 'Sheet1' becomes one saved PrimaryCellBatch.  The
    'Facility ID' column is not stored directly: the text after 'HMSL' is used
    to fetch the PrimaryCell, which is stored as the batch's `reagent`.

    Raises:
        Exception: if a required cell is empty; lookup and save failures are
            logged and re-raised.
    """
    sheet_name = "Sheet1"
    start_row = 1
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, start_row])

    def as_int(x):
        return util.convertdata(x, int)

    def after_hmsl(x):
        # keep only what follows the first 'HMSL' in the id
        return x[x.index("HMSL") + 4:]

    properties = ("model_field", "required", "default", "converter")
    column_definitions = {
        "Facility ID": ("facility_id", True, None, after_hmsl),
        "PC_Center_Batch_ID": ("batch_id", True, None, as_int),
        "PC_Center_Specific_Code": "center_specific_code",
        "PC_Provider_Name": "provider_name",
        "PC_Provider_Catalog_ID": "provider_catalog_id",
        "PC_Provider_Batch_ID": "provider_batch_id",
        "PC_Source_Information": "source_information",
        "PC_Date_Received": "date_received",
        "PC_Quality_Verification": "quality_verification",
        "PC_Culture_Conditions": "culture_conditions",
        "PC_Passage_Number": ("passage_number", False, None, as_int),
        "PC_Transient_Modification": "transient_modification",
        "Date Data Received": ("date_data_received", False, None, util.date_converter),
        "Date Loaded": ("date_loaded", False, None, util.date_converter),
        "Date Publicly Available": ("date_publicly_available", False, None, util.date_converter),
        "Most Recent Update": ("date_updated", False, None, util.date_converter),
    }
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    for row in sheet:
        cells = util.make_row(row)
        initializer = {}
        for ordinal, value in enumerate(cells):
            if ordinal not in cols:
                continue
            col_def = cols[ordinal]
            convert = col_def["converter"]
            fallback = col_def["default"]
            field = col_def["model_field"]

            if convert is not None:
                value = convert(value)
            if value is None and fallback is not None:
                value = fallback
            if value is None and col_def["required"]:
                raise Exception("Field is required: %s, record: %d" % (col_def["column_label"], rows))

            if field != "facility_id":
                initializer[field] = value
                continue
            # the facility id identifies the parent cell rather than being stored
            try:
                parent_cell = PrimaryCell.objects.get(facility_id=value)
            except:
                logger.exception("Primary Cell not found: %r, row: %d", value, rows + start_row + 1)
                raise
            initializer["reagent"] = parent_cell

        try:
            logger.debug("initializer: %r", initializer)
            batch = PrimaryCellBatch(**initializer)
            batch.save()
            logger.debug("primary cell batch created: %r", batch)
            rows += 1
        except Exception:
            logger.exception("Invalid Primary CellBatch initializer: %r, row: %d", initializer, rows + start_row + 1)
            raise
def main(path):
    """
    Read in the smallmolecule batch info.

    Each row of worksheet "sheet 1" becomes one saved SmallMoleculeBatch.
    The facility and salt ids are not stored on the batch; once both have
    been read they are used to fetch the SmallMolecule, which is stored under
    `smallmolecule`.

    Raises:
        Exception: if a required cell is empty; lookup and save failures are
            logged and re-raised.
    """
    sheet_name = "sheet 1"
    start_row = 1
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, start_row])

    def as_int(x):
        return util.convertdata(x, int)

    properties = ("model_field", "required", "default", "converter")
    column_definitions = {
        # NOTE: these db fields are not integers, but reading the values as
        # INT keeps them from being interpreted as floats
        "facility_id": ("facility_id", True, None, as_int),
        "salt_id": ("salt_id", True, None, as_int),
        "facility_batch_id": ("facility_batch_id", True, None, as_int),
        "provider": ("provider", True),
        "provider_catalog_id": "provider_catalog_id",
        "provider_sample_id": "provider_sample_id",
        "chemical_synthesis_reference": "chemical_synthesis_reference",
        "purity": "purity",
        "purity_method": "purity_method",
        "aqueous_solubility": "aqueous_solubility",
        "aqueous_solubility_unit": "aqueous_solubility_unit",
        "Date Data Received": ("date_data_received", False, None, util.date_converter),
        "Date Loaded": ("date_loaded", False, None, util.date_converter),
        "Date Publicly Available": ("date_publicly_available", False, None, util.date_converter),
    }
    # expand every definition into a full dict (model field, required flag,
    # default, converter)
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    # map each sheet column ordinal to its column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(("cols: ", cols)))
    for row in sheet:
        cells = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {"facility_id": None, "salt_id": None}
        for ordinal, value in enumerate(cells):
            if ordinal not in cols:
                continue
            col_def = cols[ordinal]
            logger.debug(str(("read col: ", ordinal, ", ", col_def)))
            convert = col_def["converter"]
            fallback = col_def["default"]
            field = col_def["model_field"]

            logger.debug(str(("raw value", value)))
            if convert is not None:
                value = convert(value)
            if value is None and fallback is not None:
                value = fallback
            if value is None and col_def["required"]:
                raise Exception("Field is required: %s, record: %d" % (col_def["column_label"], rows))
            logger.debug(str(("model_field: ", field, ", value: ", value)))

            if field not in small_molecule_lookup:
                initializer[field] = value
                continue
            small_molecule_lookup[field] = value
            if None in small_molecule_lookup.values():
                # still waiting for the other identifier
                continue
            try:
                molecule = SmallMolecule.objects.get(**small_molecule_lookup)
            except Exception:
                logger.error(
                    str(("sm identifiers not found", small_molecule_lookup, "row", rows + start_row + 2))
                )
                raise
            initializer["smallmolecule"] = molecule

        try:
            logger.debug(str(("initializer: ", initializer)))
            batch = SmallMoleculeBatch(**initializer)
            batch.save()
            logger.debug(str(("smb created:", batch)))
            rows += 1
        except Exception as e:
            logger.error(
                str(("Invalid smallmolecule batch initializer: ", initializer, "row", rows + start_row + 2, e))
            )
            raise
def main(path):
    """
    Read small molecule batch rows from worksheet 'sheet 1' and save one
    SmallMoleculeBatch per row.

    The facility and salt ids are not stored on the batch; once both have
    been read they are used to fetch the SmallMolecule, which is stored as
    the batch's `reagent`.

    Raises:
        Exception: if a required cell is empty; lookup and save failures are
            logged and re-raised.
    """
    sheet_name = 'sheet 1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row])

    def as_int(x):
        return util.convertdata(x, int)

    def as_float(x):
        return util.convertdata(x, float)

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'facility_id': ('facility_id', True, None, as_int),
        'salt_id': ('salt_id', True, None, as_int),
        'facility_batch_id': ('batch_id', True, None, as_int),
        'provider': ('provider_name', False),
        'provider_catalog_id': 'provider_catalog_id',
        'provider_sample_id': 'provider_batch_id',
        'molecular_weight': ('_molecular_weight', False, None, as_float),
        'molecular_formula': '_molecular_formula',
        'chemical_synthesis_reference': '_chemical_synthesis_reference',
        'purity': '_purity',
        'purity_method': '_purity_method',
        'aqueous_solubility': 'aqueous_solubility',
        # FIXME: should warn the user if no unit is provided when
        # aqueous_solubility is provided
        'aqueous_solubility_unit': 'aqueous_solubility_unit',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
    }
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)
    cols = util.find_columns(
        column_definitions, sheet.labels, all_sheet_columns_required=False)

    rows = 0
    for row in sheet:
        cells = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for ordinal, value in enumerate(cells):
            if ordinal not in cols:
                continue
            col_def = cols[ordinal]
            convert = col_def['converter']
            fallback = col_def['default']
            field = col_def['model_field']

            if convert is not None:
                value = convert(value)
            if value is None and fallback is not None:
                value = fallback
            if value is None and col_def['required']:
                raise Exception(
                    'Field is required: %s, record: %d'
                    % (col_def['column_label'], rows))

            if field not in small_molecule_lookup:
                initializer[field] = value
                continue
            small_molecule_lookup[field] = value
            if None in small_molecule_lookup.values():
                # still waiting for the other identifier
                continue
            try:
                molecule = SmallMolecule.objects.get(**small_molecule_lookup)
            except Exception:
                logger.exception(
                    'sm identifiers not found: %r, row: %d',
                    small_molecule_lookup, rows + start_row + 2)
                raise
            initializer['reagent'] = molecule

        try:
            logger.debug(str(('initializer: ', initializer)))
            batch = SmallMoleculeBatch(**initializer)
            batch.save()
            logger.debug(str(('smb created:', batch)))
            rows += 1
        except Exception:
            logger.exception(
                'Invalid smallmolecule batch initializer: %r, row: %d',
                initializer, rows + start_row + 2)
            raise
def main(path):
    """
    Read in the Library and LibraryMapping sheets.

    The 'Library' sheet is loaded with readLibraries(); every row of the
    'LibraryMapping' sheet is then converted into a dict of model field
    values, looking up the referenced SmallMolecule (stored as `reagent`) by
    facility and salt id, and the Library by short name.

    Raises:
        Exception: if a required cell is empty, if no SmallMolecule matches
            the facility/salt ids of a row, or if a row names a library
            short name that was not read from the 'Library' sheet.
    """
    libraries = readLibraries(path, 'Library')
    sheet = iu.readtable([path, 'LibraryMapping'])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'Facility': ('facility_id', False, None, lambda x: util.convertdata(x, int)),
        'Salt': ('salt_id', False, None, lambda x: util.convertdata(x, int)),
        'Batch': ('batch_id', False, None, lambda x: util.convertdata(x, int)),
        'Is Control': ('is_control', False, False, util.bool_converter),
        'Plate': ('plate', False, None, lambda x: util.convertdata(x, int)),
        'Well': 'well',
        'Library Name': 'short_name',
        'Concentration': 'concentration',
        'Concentration Unit': 'concentration_unit',
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    # NOTE(review): these key tuples, and the per-row `initializer` built
    # below, are not consumed anywhere in this function -- confirm whether the
    # step that persists the mapping is missing.
    small_molecule_batch_lookup = ('reagent', 'batch_id')
    library_mapping_lookup = ('smallmolecule_batch', 'library', 'is_control',
                              'plate', 'well', 'concentration',
                              'concentration_unit')

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        # presumably +2 accounts for the header row and 1-based sheet rows
        current_row = rows + 2
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug(str(('read col: ', i, ', ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                # exactly two arguments for the two placeholders; the extra
                # 'row' argument used to turn this into a TypeError
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], current_row))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value

            if model_field in small_molecule_lookup:
                small_molecule_lookup[model_field] = value
                # look the small molecule up once both ids have been read
                if None not in small_molecule_lookup.values():
                    try:
                        sm = SmallMolecule.objects.get(**small_molecule_lookup)
                        initializer['reagent'] = sm
                    except Exception as e:
                        raise Exception(
                            str(('sm facility id not found',
                                 small_molecule_lookup, e, 'row', current_row)))
            elif model_field == 'short_name':
                try:
                    library = libraries[value]
                    initializer['library'] = library
                except Exception as e:
                    raise Exception(
                        str(('library short_name not found', value, e,
                             'row', current_row)))
        # advance the counter so current_row tracks the row being processed
        rows += 1
def readLibraries(path, sheetName):
    """
    Read the sheet `sheetName` of the workbook at `path` and save one Library
    per row.

    Returns:
        dict mapping each saved library's short_name to its Library instance.

    Raises:
        Exception: if a required cell is empty; errors raised while creating
            or saving a Library are logged and re-raised.
    """
    sheet = iu.readtable([path, sheetName])  # Note, skipping the header row by default

    # dict to map spreadsheet fields to the Library fields
    properties = ('model_field', 'required', 'default', 'converter')
    date_parser = lambda x: util.convertdata(x, date)
    column_definitions = {
        'Name': ('name', True),  # TODO use the model to determine if req'd
        'ShortName': ('short_name', True),
        'Library Type': 'type',
        'Date First Plated': ('date_first_plated', False, None, date_parser),
        'Date Data Received': ('date_data_received', False, None, date_parser),
        'Date Loaded': ('date_loaded', False, None, date_parser),
        'Date Publicly Available': ('date_publicly_available', False, None, date_parser),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        # converted like the other boolean sheet columns ('Is Control')
        # instead of storing the raw cell value
        'Is Restricted': ('is_restricted', False, False, util.bool_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    libraries = {}
    for row in sheet:
        logger.debug(str(('row raw: ', row)))
        r = util.make_row(row)
        logger.debug(str(('row: ', r)))
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug(str(('read col: ', i, ', ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            library = Library(**initializer)
            library.save()
            logger.info(str(('library created', library)))
            libraries[library.short_name] = library
            rows += 1
        except Exception:
            logger.error(str(('library initializer problem: ', initializer)))
            # bare raise keeps the original traceback
            raise
    # callers index the result by short name, so the dict must be returned
    return libraries
def main(path):
    """
    Read in the Antibody Batches.

    Each row of 'Sheet1' becomes one saved AntibodyBatch.  The
    'AR_Center_Specific_ID' column is not stored directly: the text after
    'HMSL' is used to fetch the Antibody, which is stored as the batch's
    `reagent`.

    Raises:
        Exception: if a required cell is empty; errors raised while creating
            or saving a batch are logged and re-raised.
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'AR_Center_Specific_ID': ('antibody_facility_id', True, None,
                                  lambda x: x[x.index('HMSL')+4:]),
        'AR_Center_Batch_ID': ('batch_id', True, None, lambda x: util.convertdata(x, int)),
        'AR_Center_Name': 'center_name',
        'AR_Provider_Name': 'provider_name',
        'AR_Provider_Catalog_ ID': 'provider_catalog_id',
        'AR_Provider_Batch_ID': 'provider_batch_id',
        'AR_Antibody_Purity': 'antibody_purity',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug('read col: %d: %s' % (i, column))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            logger.debug('raw value %r' % value)
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], rows))
            logger.debug('model_field: %s, converted value %r' % (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('initializer: %s' % initializer)
            antibody_facility_id = initializer.pop('antibody_facility_id', None)
            if antibody_facility_id:
                try:
                    antibody = Antibody.objects.get(facility_id=antibody_facility_id)
                    initializer['reagent'] = antibody
                except ObjectDoesNotExist:
                    # report the record counter (the same value the "Field is
                    # required" message uses), not the leftover column index
                    # NOTE(review): the batch is still created without a
                    # reagent after this log -- confirm that is intended
                    logger.error('AR_Center_Specific_ID: "%s" does not exist, row: %d'
                                 % (antibody_facility_id, rows))
            antibody_batch = AntibodyBatch(**initializer)
            antibody_batch.save()
            logger.info('antibody batch created: %s' % antibody_batch)
            rows += 1
        except Exception:
            logger.error("Invalid antibody_batch initializer: %s" % initializer)
            raise
def _get_proteins_by_center_ids(center_ids, field_name, record):
    """
    Resolve a ';'-separated string of center ids to Protein objects.

    The text after 'HMSL' in each id is used as the Protein facility_id.
    `field_name` and `record` only label the error message.

    Raises:
        ObjectDoesNotExist: (logged, then re-raised) if an id has no Protein.
    """
    proteins = []
    for center_id in center_ids.split(';'):
        facility_id = center_id[center_id.index('HMSL')+4:]
        try:
            proteins.append(Protein.objects.get(facility_id=facility_id))
        except ObjectDoesNotExist:
            logger.error('%s "%s" does not exist, row: %d'
                         % (field_name, facility_id, record))
            raise
    return proteins


def main(path):
    """
    Read the antibodies on 'Sheet1' of the workbook at `path`.

    Each row creates an Antibody, links its target proteins and other human
    target proteins (given as ';'-separated center ids), and creates a
    default AntibodyBatch with batch_id 0.

    Raises:
        Exception: if a required cell is empty; errors raised while creating
            the records are logged and re-raised.
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'AR_Name': ('name', True),
        'AR_LINCS_ID': 'lincs_id',
        'AR_Alternative_Name': 'alternative_names',
        'AR_Alternative_ID': 'alternative_id',
        'AR_Center_Canonical_ID': (
            'facility_id', True, None, lambda x: x[x.index('HMSL')+4:]),
        'AR_Clone_Name': 'clone_name',
        'AR_RRID': 'rrid',
        'AR_Antibody_Type': 'type',
        'target_protein_center_ids': 'target_protein_center_ids',
        'AR_Non-Protein_Target': 'non_protein_target_name',
        'AR_Target_Organism': 'target_organism',
        'other_target_information': 'other_target_information',
        'other_human_target_protein_center_ids':
            'other_human_target_protein_center_ids',
        'AR_Immunogen': 'immunogen',
        'AR_Immunogen_Sequence': 'immunogen_sequence',
        'AR_Antibody_Species': 'species',
        'AR_Antibody_Clonality': 'clonality',
        'AR_Antibody_Isotype': 'isotype',
        'AR_Antibody_Production_Source_Organism': 'source_organism',
        'AR_Antibody_Production_Details': 'production_details',
        'AR_Antibody_Labeling': 'labeling',
        'AR_Antibody_Labeling_Details': 'labeling_details',
        'AR_Relevant_Citations': 'relevant_citations',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False, util.bool_converter)}

    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    cols = util.find_columns(column_definitions, sheet.labels,
                             all_sheet_columns_required=False)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        logger.debug('row %s - %s' % (rows, row))
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug('read col: %d: %s' % (i, column))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            logger.debug('raw value %r' % value)
            # the text 'None' is treated like an empty cell
            if value is None or value == 'None':
                value = None
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], rows))
            # converters only run on non-empty values
            if value and converter is not None:
                value = converter(value)
            logger.debug('model_field: %s, converted value %r' % (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('row: %s, initializer: %s' % (rows, initializer))

            # the id lists are not Antibody constructor arguments
            target_protein_center_ids = initializer.pop(
                'target_protein_center_ids', None)
            other_human_target_protein_center_ids = initializer.pop(
                'other_human_target_protein_center_ids', None)

            antibody = Antibody.objects.create(**initializer)
            if target_protein_center_ids:
                antibody.target_proteins = _get_proteins_by_center_ids(
                    target_protein_center_ids,
                    'target_protein_center_ids', rows)
            if other_human_target_protein_center_ids:
                antibody.other_human_target_proteins = _get_proteins_by_center_ids(
                    other_human_target_protein_center_ids,
                    'other_human_target_protein_center_ids', rows)
            antibody.save()
            logger.info('antibody created: %s' % antibody)
            rows += 1

            # create a default batch - 0
            AntibodyBatch.objects.create(reagent=antibody, batch_id=0)
        except Exception:
            # the try had no handler; log and re-raise like the sibling importers
            logger.error('Invalid antibody initializer: %s' % initializer)
            raise
def main(path):
    """
    Read in the Antibody.

    Each row of 'Sheet1' becomes one saved Antibody.

    Raises:
        Exception: if a required cell is empty; errors raised while creating
            or saving an Antibody are logged and re-raised.
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 1])  # Note, skipping the header row by default

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'AR_Name': ('name', True),
        'AR_LINCS_ID': 'lincs_id',
        'AR_Alternative_Name': 'alternative_names',
        'AR_Center_ID': ('facility_id', True),
        'AR_Target_Protein': 'target_protein_name',
        'AR_Target_Protein_ID': 'target_protein_uniprot_id',
        'AR_Target_Gene': 'target_gene_name',
        'AR_Target_Gene_ID': 'target_gene_id',
        'AR_Target_Organism': 'target_organism',
        'AR_Immunogen': 'immunogen',
        'AR_Immunogen_Sequence': 'immunogen_sequence',
        'AR_AntibodyClonality': 'antibody_clonality',
        'AR_Source_Organism': 'source_organism',
        'AR_Antibody_Isotype': 'antibody_isotype',
        'AR_Engineering': 'engineering',
        'AR_Antibody_Purity': 'antibody_purity',
        'AR_Antibody_Labeling': 'antibody_labeling',
        'AR_Recommended_Experiment_Type': 'recommended_experiment_type',
        'AR_Relevant_Reference': 'relevant_reference',
        'AR_Specificity': 'specificity',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        # converted with util.bool_converter rather than stored as the raw
        # cell value
        # NOTE(review): assumes cells arrive as text -- confirm
        'Is Restricted': ('is_restricted', False, False, util.bool_converter)}

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug(str(('read col: ', i, ', ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], rows))
            logger.debug(str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            antibody = Antibody(**initializer)
            antibody.save()
            logger.info(str(('antibody created: ', antibody)))
            rows += 1
        except Exception:
            logger.error(str(("Invalid antibody initializer: ", initializer)))
            raise
def main(path):
    """
    Read in the Cell.

    Each row of the 'HMS-LINCS cell line metadata' sheet becomes one saved
    Cell, followed by a default CellBatch with batch_id 0.

    Raises:
        Exception: if a required cell is empty; errors raised while creating
            the records are logged and re-raised.
    """
    sheet_name = 'HMS-LINCS cell line metadata'
    sheet = iu.readtable([path, sheet_name, 1])  # Note, skipping the header row by default

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'Facility ID': ('facility_id', True, None, lambda x: x[x.index('HMSL')+4:]),
        'CL_Name': ('name', True),
        'CL_LINCS_ID': 'lincs_id',
        'CL_Alternate_Name': 'alternative_names',
        'CL_Alternate_ID': 'alternate_id',
        'CL_Center_Specific_ID': 'center_specific_id',
        'MGH_ID': ('mgh_id', False, None, lambda x: util.convertdata(x, int)),
        'Assay': 'assay',
        'CL_Organism': 'organism',
        'CL_Organ': 'organ',
        'CL_Tissue': 'tissue',
        'CL_Cell_Type': 'cell_type',
        'CL_Cell_Type_Detail': 'cell_type_detail',
        'CL_Donor_Sex': 'donor_sex',
        'CL_Donor_Age': ('donor_age_years', False, None, lambda x: util.convertdata(x, int)),
        'CL_Donor_Ethnicity': 'donor_ethnicity',
        'CL_Donor_Health_Status': 'donor_health_status',
        'CL_Disease': 'disease',
        'CL_Disease_Detail': 'disease_detail',
        'CL_Growth_Properties': 'growth_properties',
        'CL_Genetic_Modification': 'genetic_modification',
        'CL_Related_Projects': 'related_projects',
        'CL_Recommended_Culture_Conditions': 'recommended_culture_conditions',
        'CL_Verification_Reference_Profile': 'verification_reference_profile',
        'CL_Known_Mutations': 'mutations_known',
        'CL_Mutations_Citations': 'mutations_citations',
        'CL_Molecular_Features': 'molecular_features',
        'CL_Relevant_Citations': 'relevant_citations',
        'CL_Reference_Source': 'reference_source',
        'CL_Reference_Source_ID': 'reference_source_id',
        'Reference Source URL': 'reference_source_url',
        'Usage Note': 'usage_note',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False, util.bool_converter)}

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels,
                             all_sheet_columns_required=False)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            column = cols[i]

            logger.debug(str(('read col: ', i, ', ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (column['column_label'], rows))
            logger.debug(str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            cell = Cell(**initializer)
            cell.save()
            logger.info(str(('cell created:', cell)))
            rows += 1

            # create a default batch - 0
            CellBatch.objects.create(reagent=cell, batch_id=0)
        except Exception:
            # report through the logger like the rest of the function;
            # bare raise keeps the original traceback
            logger.exception('Invalid Cell, name: %r, initializer: %r',
                             initializer.get('name'), initializer)
            raise
def main(path):
    """
    Read in the smallmolecule batch info.

    Each row of worksheet 'sheet 1' becomes one saved SmallMoleculeBatch.
    The facility and salt ids are not stored on the batch; once both have
    been read they are used to fetch the SmallMolecule, which is stored as
    the batch's `reagent`.

    Raises:
        Exception: if a required cell is empty; lookup and save failures are
            logged and re-raised.
    """
    sheet_name = 'sheet 1'
    start_row = 1
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, start_row])

    def as_int(x):
        return util.convertdata(x, int)

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # NOTE: these db fields are not integers, but reading the values as
        # INT keeps them from being interpreted as floats
        'facility_id': ('facility_id', True, None, as_int),
        'salt_id': ('salt_id', True, None, as_int),
        'facility_batch_id': ('batch_id', True, None, as_int),
        'provider': ('provider_name', True),
        'provider_catalog_id': 'provider_catalog_id',
        'provider_sample_id': 'provider_batch_id',
        'chemical_synthesis_reference': 'chemical_synthesis_reference',
        'purity': 'purity',
        'purity_method': 'purity_method',
        'aqueous_solubility': 'aqueous_solubility',
        # FIXME: should warn the user if no unit is provided when
        # aqueous_solubility is provided
        'aqueous_solubility_unit': 'aqueous_solubility_unit',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
    }
    # expand every definition into a full dict (model field, required flag,
    # default, converter)
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)
    # map each sheet column ordinal to its column definition dict
    cols = util.find_columns(column_definitions, sheet.labels,
                             all_sheet_columns_required=False)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        cells = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for ordinal, value in enumerate(cells):
            if ordinal not in cols:
                continue
            col_def = cols[ordinal]
            logger.debug(str(('read col: ', ordinal, ', ', col_def)))
            convert = col_def['converter']
            fallback = col_def['default']
            field = col_def['model_field']

            logger.debug(str(('raw value', value)))
            if convert is not None:
                value = convert(value)
            if value is None and fallback is not None:
                value = fallback
            if value is None and col_def['required']:
                raise Exception('Field is required: %s, record: %d' % (col_def['column_label'], rows))
            logger.debug(str(('model_field: ', field, ', value: ', value)))

            if field not in small_molecule_lookup:
                initializer[field] = value
                continue
            small_molecule_lookup[field] = value
            if None in small_molecule_lookup.values():
                # still waiting for the other identifier
                continue
            try:
                molecule = SmallMolecule.objects.get(**small_molecule_lookup)
            except Exception:
                logger.error(str(('sm identifiers not found', small_molecule_lookup, 'row', rows + start_row + 2)))
                raise
            initializer['reagent'] = molecule

        try:
            logger.debug(str(('initializer: ', initializer)))
            batch = SmallMoleculeBatch(**initializer)
            batch.save()
            logger.debug(str(('smb created:', batch)))
            rows += 1
        except Exception as e:
            logger.error(str(("Invalid smallmolecule batch initializer: ", initializer, 'row', rows + start_row + 2, e)))
            raise
def main(path):
    """
    Read in the Antibody.

    Reads worksheet 'Sheet1' of the workbook at `path`.  For every row an
    Antibody is created and saved, followed by a default AntibodyBatch with
    batch_id 0.

    If a row names a 'target_protein_lincs_id', the matching Protein is looked
    up and attached as `target_protein`.  A missing Protein is logged as an
    error but does not stop the import; the antibody is then saved without a
    target protein.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised while creating/saving the antibody or its default
        batch is logged and re-raised unchanged.
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 0])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'AR_Name': ('name', True),
        'AR_LINCS_ID': 'lincs_id',
        'AR_Alternative_Name': 'alternative_names',
        # keep only the text following the first 'HMSL'
        'AR_Center_Specific_ID': (
            'facility_id', True, None, lambda x: x[x.index('HMSL')+4:]),
        'AR_Clone_Name': 'clone_name',
        'AR_RRID': 'rrid',
        'AR_Antibody_Type': 'type',
        'target_protein_lincs_id': (
            'target_protein_lincs_id', False, None,
            lambda x: x[x.index('HMSL')+4:] if x else None),
        'AR_Non-Protein_Target': 'non_protein_target_name',
        'AR_Target_Organism': 'target_organism',
        'AR_Immunogen': 'immunogen',
        'AR_Immunogen_Sequence': 'immunogen_sequence',
        'AR_Antibody_Species': 'species',
        'AR_Antibody_Clonality': 'clonality',
        'AR_Antibody_Isotype': 'isotype',
        'AR_Antibody_Production_Source_Organism': 'source_organism',
        'AR_Antibody_Production_Details': 'production_details',
        'AR_Antibody_Labeling': 'labeling',
        'AR_Antibody_Labeling_Details': 'labeling_details',
        'AR_Relevant_Citations': 'relevant_citations',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
        'Is Restricted': (
            'is_restricted', False, False, util.bool_converter),
    }

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        logger.debug('row %s - %s' % (rows, row))
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug('read col: %d: %s' % (i, col))
            required = col['required']
            default = col['default']
            converter = col['converter']
            model_field = col['model_field']

            logger.debug('raw value %r' % value)
            # The literal text 'None' is treated the same as an empty cell.
            if value is None or value == 'None':
                value = None
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col['column_label'], rows))
            # Unlike the other loaders, only truthy values are converted.
            if value and converter is not None:
                value = converter(value)

            logger.debug('model_field: %s, converted value %r'
                         % (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('row: %s, initializer: %s' % (rows, initializer))

            # Not an Antibody field: it only identifies the Protein to link.
            target_protein_lincs_id = initializer.pop(
                'target_protein_lincs_id', None)
            if target_protein_lincs_id:
                try:
                    target_protein = Protein.objects.get(
                        lincs_id=target_protein_lincs_id)
                    initializer['target_protein'] = target_protein
                except ObjectDoesNotExist:
                    # Best effort: report and carry on without the link.
                    # Report the record number, not the last column index.
                    logger.error(
                        'target_protein_lincs_id "%s" does not exist, row: %d'
                        % (target_protein_lincs_id, rows))
            antibody = Antibody(**initializer)
            antibody.save()
            logger.info('antibody created: %s' % antibody)
            rows += 1

            # create a default batch - 0
            AntibodyBatch.objects.create(reagent=antibody, batch_id=0)

        except Exception:
            logger.error("Invalid antibody initializer: %s" % initializer)
            raise
def main(path):
    """
    Read in the cell batch info.

    Reads worksheet 'Sheet1' of the workbook at `path` and saves one CellBatch
    for every row.  The 'Facility ID' column is not stored on the batch; it is
    used to look up the parent Cell, which is stored as the batch's `reagent`.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised by the Cell lookup or by creating/saving the
        batch is logged and re-raised unchanged.
    """
    sheet_name = 'Sheet1'
    start_row = 1
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, start_row])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # keep only the text following the first 'HMSL'
        'Facility ID': (
            'facility_id', True, None, lambda x: x[x.index('HMSL') + 4:]),
        'CL_Batch_ID': (
            'batch_id', True, None, lambda x: util.convertdata(x, int)),
        'CL_Provider_Name': 'provider_name',
        'CL_Provider_Batch_ID': 'provider_batch_id',
        'CL_Provider_Catalog_ID': 'provider_catalog_id',
        'CL_Quality_Verification': 'quality_verification',
        'CL_Transient_Modification': 'transient_modification',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(('read col: ', i, ', ', col)))
            required = col['required']
            default = col['default']
            converter = col['converter']
            model_field = col['model_field']

            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))

            if model_field == 'facility_id':
                try:
                    initializer['reagent'] = Cell.objects.get(
                        facility_id=value)
                except Exception:
                    logger.error(str((
                        "Cell not found", value,
                        'row', rows + start_row + 2)))
                    raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            cell_batch = CellBatch(**initializer)
            cell_batch.save()
            logger.debug(str(('cell created:', cell_batch)))
            rows += 1
        except Exception as e:
            logger.error(str((
                "Invalid CellBatch initializer: ", initializer,
                'row', rows + start_row + 2, e)))
            raise
def main(path):
    """
    Read in the Cell.

    Reads worksheet 'HMS-LINCS cell line metadata' of the workbook at `path`
    and saves one Cell for every row.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised while creating/saving a Cell is logged and
        re-raised unchanged (with its original traceback).
    """
    sheet_name = 'HMS-LINCS cell line metadata'
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # keep only the text following the first 'HMSL'
        'Facility ID': (
            'facility_id', True, None, lambda x: x[x.index('HMSL')+4:]),
        'CL_Name': ('name', True),
        'CL_ID': 'cl_id',
        'CL_Alternate_Name': 'alternate_name',
        'CL_Alternate_ID': 'alternate_id',
        'CL_Center_Name': 'center_name',
        'CL_Center_Specific_ID': 'center_specific_id',
        'MGH_ID': ('mgh_id', False, None, lambda x: util.convertdata(x, int)),
        'Assay': 'assay',
        'CL_Provider_Name': 'provider_name',
        'CL_Provider_Catalog_ID': 'provider_catalog_id',
        'CL_Batch_ID': 'batch_id',
        'CL_Organism': 'organism',
        'CL_Organ': 'organ',
        'CL_Tissue': 'tissue',
        'CL_Cell_Type': 'cell_type',
        'CL_Cell_Type_Detail': 'cell_type_detail',
        'CL_Disease': 'disease',
        'CL_Disease_Detail': 'disease_detail',
        'CL_Growth_Properties': 'growth_properties',
        'CL_Genetic_Modification': 'genetic_modification',
        'CL_Related_Projects': 'related_projects',
        'CL_Recommended_Culture_Conditions': 'recommended_culture_conditions',
        'CL_Verification_Profile': 'verification_profile',
        'CL_Verification_Reference_Profile': 'verification_reference_profile',
        'CL_Mutations_Reference': 'mutations_reference',
        'CL_Mutations_Explicit': 'mutations_explicit',
        'CL_Organism_Gender': 'organism_gender',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
        'Is Restricted': (
            'is_restricted', False, False, util.bool_converter),
    }

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(('read col: ', i, ', ', col)))
            required = col['required']
            default = col['default']
            converter = col['converter']
            model_field = col['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            cell = Cell(**initializer)
            cell.save()
            logger.info(str(('cell created:', cell)))
            rows += 1
        except Exception as e:
            # Report through the logger, with the full initializer, and use a
            # bare raise so the original traceback is kept.
            logger.error(str(("Invalid Cell initializer: ", initializer, e)))
            raise
found=False for key,value in mappingColumnDict.items(): if(value != -1): found=True if(not found): raise Exception('at least one of: ' + str(mappingColumnDict.keys()) + ' must be defined and used in the Data sheet.') # Read the Datasheet, create DataPoint values for mapped column in each row logger.debug(str(('now read rows, save_interval:', save_interval))) loopStart = time.time() pointsSaved = 0 rowsRead = 0 for row in dataSheet: current_row = rowsRead+2 r = util.make_row(row) dataRecord = DataRecord(dataset=dataset ) map_column = mappingColumnDict['Small Molecule Batch'] mapped = False if(map_column > -1): _read_small_molecule_batch(map_column,r,current_row,dataRecord) map_column = mappingColumnDict['Plate'] if(map_column > -1): _read_plate_well(map_column,r,current_row, dataRecord) map_column = mappingColumnDict['Cell'] if(map_column > -1): _read_cell(map_column,r,current_row,dataRecord) map_column = mappingColumnDict['Antibody'] if(map_column > -1): _read_antibody(map_column,r,current_row,dataRecord) map_column = mappingColumnDict['OtherReagent']
def main(path):
    """
    Read in the Protein.

    Reads worksheet "HMS-LINCS Kinases" of the workbook at `path` and saves
    one Protein for every row.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised while creating/saving a Protein is logged and
        re-raised unchanged.
    """
    sheet_name = "HMS-LINCS Kinases"
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ("model_field", "required", "default", "converter")
    column_definitions = {
        "PP_Name": ("name", True),
        # keep only the text following the first "HMSL"
        "PP_LINCS_ID": (
            "lincs_id", True, None, lambda x: x[x.index("HMSL") + 4:]),
        "PP_UniProt_ID": "uniprot_id",
        "PP_Alternate_Name": "alternate_name",
        "PP_Alternate_Name[2]": "alternate_name_2",
        "PP_Provider": "provider",
        "PP_Provider_Catalog_ID": "provider_catalog_id",
        "PP_Batch_ID": "batch_id",
        "PP_Amino_Acid_Sequence": "amino_acid_sequence",
        "PP_Gene_Symbol": "gene_symbol",
        "PP_Gene_ID": "gene_id",
        "PP_Protein_Source": "protein_source",
        "PP_Protein_Form": "protein_form",
        "PP_Protein_Purity": "protein_purity",
        "PP_Protein_Complex": "protein_complex",
        "PP_Isoform": "isoform",
        "PP_Protein_Type": "protein_type",
        "PP_Source_Organism": "source_organism",
        "PP_Reference": "reference",
        "Date Data Received": (
            "date_data_received", False, None, util.date_converter),
        "Date Loaded": ("date_loaded", False, None, util.date_converter),
        "Date Publicly Available": (
            "date_publicly_available", False, None, util.date_converter),
        "Is Restricted": ("is_restricted", False, False),
    }

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(("cols: ", cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(("read col: ", i, ", ", col)))
            required = col["required"]
            default = col["default"]
            converter = col["converter"]
            model_field = col["model_field"]

            # Todo, refactor to a method
            logger.debug(str(("raw value", value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception("Field is required: %s, record: %d"
                                % (col["column_label"], rows))
            logger.debug(
                str(("model_field: ", model_field, ", value: ", value)))
            initializer[model_field] = value
        try:
            logger.debug(str(("initializer: ", initializer)))
            protein = Protein(**initializer)
            protein.save()
            logger.info(str(("protein created: ", protein)))
            rows += 1
        except Exception as e:
            logger.error(
                str(("Invalid protein initializer: ", initializer, e)))
            raise
def main(path):
    """
    Read a dataset workbook: the DataSet, its DataColumns and the Data sheet.

    Steps visible in this function:
      1. build and save a DataSet from read_metadata(path);
      2. build and save one DataColumn per definition returned by
         readDataColumns(path);
      3. map every labelled column of the 'Data' sheet to a meta column, a
         mapping column ('Small Molecule', 'Cell', 'Protein') or a DataColumn;
      4. for every data row, create a DataRecord and resolve its small
         molecule and cell references.

    NOTE(review): as visible here the DataRecord is populated but never saved,
    and `mapped`, `pointsSaved` and `rowsRead` are never read; the remainder
    of the per-row processing appears to be missing from this function --
    confirm against the full file.

    Raises:
        Exception: if none of the mapping columns is present in the Data
            sheet.
        SystemExit: (via sys.exit(-1)) if a Data sheet column cannot be
            matched to any DataColumn.
        Any exception raised while resolving a small molecule or cell is
        reported and re-raised unchanged.
    """
    # read in the two columns of the meta sheet to a dict that defines a
    # DataSet
    metadata = read_metadata(path)
    dataset = DataSet(**metadata)
    dataset.save()

    # read in the data columns sheet to an array of dict's, each dict defines
    # a DataColumn
    dataColumnDefinitions = readDataColumns(path)

    # now that the array of DataColumn dicts is created, use them to create
    # the DataColumn instances
    dataColumns = {}
    for dc in dataColumnDefinitions:
        dc['dataset'] = dataset
        dataColumn = DataColumn(**dc)
        dataColumn.save()
        dataColumns[dataColumn.name] = dataColumn

    # read the Data sheet
    sheetname = 'Data'
    dataSheet = iu.readtable([path, sheetname])

    # First, map the sheet column indices to the DataColumns that were created
    dataColumnList = {}
    # meta columns contain forensic information
    metaColumnDict = {'Well': -1, 'Plate': -1, 'Control Type': -1}
    # what is being studied - at least one is required
    mappingColumnDict = {'Small Molecule': -1, 'Cell': -1, 'Protein': -1}
    # NOTE: this scheme is matching based on the labels between the
    # "Data Column" sheet and the "Data" sheet
    ignored_labels = ('None', 'well_id', 'Exclude')
    for i, label in enumerate(dataSheet.labels):
        if label in ignored_labels or label.strip() == '':
            continue
        if label in metaColumnDict:
            metaColumnDict[label] = i
            continue
        if label in mappingColumnDict:
            mappingColumnDict[label] = i
            continue
        if label in dataColumns:
            # note here "i" is the index to the dict
            dataColumnList[i] = dataColumns[label]
            continue

        # Fall back to matching on the worksheet column letter.
        # NOTE(review): chr(ord('A') + i) only yields a spreadsheet column
        # letter for the first 26 columns.
        columnName = chr(ord('A') + i)
        for column in dataColumns.values():
            if column.worksheet_column == columnName:
                dataColumnList[i] = column
                break
        else:
            print("Error: no datacolumn for  %s" % (label,))
            sys.exit(-1)

    if not any(index != -1 for index in mappingColumnDict.values()):
        raise Exception('at least one of: ' + str(list(mappingColumnDict))
                        + ' must be defined and used in the Data sheet.')

    # Read in the Data sheet, create DataPoint values for mapped column in
    # each row
    pointsSaved = 0
    rowsRead = 0
    for row in dataSheet:
        r = util.make_row(row)
        dataRecord = DataRecord(dataset=dataset)
        mapped = False

        map_column = mappingColumnDict['Small Molecule']
        if map_column > -1:
            # Bound before the try so the handler never hits an unbound (or
            # stale, previous-row) name and masks the real error.
            value = None
            try:
                value = util.convertdata(r[map_column].strip())
                if value is not None and value != '':
                    parts = value.split("-")
                    facility = parts[0]  # TODO: purge "HMSL" from the db
                    salt = parts[1]
                    dataRecord.small_molecule = SmallMolecule.objects.get(
                        facility_id=facility, sm_salt=salt)
                    mapped = True
            except Exception:
                print("Invalid Small Molecule facility id:  %s" % (value,))
                raise

        map_column = mappingColumnDict['Cell']
        if map_column > -1:
            value = None
            try:
                value = util.convertdata(r[map_column].strip())
                if value is not None and value != '':
                    # TODO: purge "HMSL" from the db
                    dataRecord.cell = Cell.objects.get(facility_id=value)
                    mapped = True
            except Exception:
                print("Invalid Cell facility id:  %s" % (value,))
                raise
def main(path):
    """
    Read in the Protein.

    Reads worksheet 'HMS-LINCS Kinases' of the workbook at `path` and saves
    one Protein for every row.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised while creating/saving a Protein is logged and
        re-raised unchanged.
    """
    sheet_name = 'HMS-LINCS Kinases'
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'PP_Name': ('name', True),
        # keep only the text following the first 'HMSL'
        'PP_LINCS_ID': (
            'lincs_id', True, None, lambda x: x[x.index('HMSL')+4:]),
        'PP_UniProt_ID': 'uniprot_id',
        'PP_Alternate_Name': 'alternate_name',
        'PP_Alternate_Name[2]': 'alternate_name_2',
        'PP_Provider': 'provider',
        'PP_Provider_Catalog_ID': 'provider_catalog_id',
        'PP_Batch_ID': 'batch_id',
        'PP_Amino_Acid_Sequence': 'amino_acid_sequence',
        'PP_Gene_Symbol': 'gene_symbol',
        'PP_Gene_ID': 'gene_id',
        'PP_Protein_Source': 'protein_source',
        'PP_Protein_Form': 'protein_form',
        'PP_Mutation': 'mutation',
        # NOTE(review): 'phosphlorylation' looks misspelled; left untouched
        # because it presumably has to match the model field name -- confirm.
        'PP_Phosphorylation_State': 'phosphlorylation',
        'PP_Domain': 'protein_domain',
        'PP_Protein_Purity': 'protein_purity',
        'PP_Protein_Complex': 'protein_complex',
        'PP_Isoform': 'isoform',
        'PP_Protein_Type': 'protein_type',
        'PP_Source_Organism': 'source_organism',
        'PP_Reference': 'reference',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False),
    }

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(('read col: ', i, ', ', col)))
            required = col['required']
            default = col['default']
            converter = col['converter']
            model_field = col['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            protein = Protein(**initializer)
            protein.save()
            logger.info(str(('protein created: ', protein)))
            rows += 1
        except Exception as e:
            logger.error(
                str(("Invalid protein initializer: ", initializer, e)))
            raise
def main(path):
    """
    Read in the OtherReagent.

    Reads worksheet 'Sheet1' of the workbook at `path`.  For every row an
    OtherReagent is created and saved, followed by a default
    OtherReagentBatch with batch_id 0.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised while creating/saving the reagent or its default
        batch is logged and re-raised unchanged.
    """
    sheet_name = 'Sheet1'
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'OR_ID': 'lincs_id',
        'Facility ID': ('facility_id', True),
        'OR_Alternate_ID': 'alternate_id',
        'OR_Primary_Name': ('name', True),
        'OR_Alternate_Name': 'alternative_names',
        'OR_Role': 'role',
        'OR_Reference': 'reference',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False),
    }

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(('read col: ', i, ', ', col)))
            required = col['required']
            default = col['default']
            converter = col['converter']
            model_field = col['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            reagent = OtherReagent(**initializer)
            reagent.save()
            logger.info(str(('OtherReagent created: ', reagent)))
            rows += 1

            # create a default batch - 0
            OtherReagentBatch.objects.create(reagent=reagent, batch_id=0)
        except Exception as e:
            logger.error(
                str(("Invalid OtherReagent initializer: ", initializer, e)))
            raise
def readDataColumns(path):
    """
    Read the 'Data Columns' sheet into a list of DataColumn definitions.

    The sheet is laid out sideways: the first cell of each row names a
    DataColumn property and every following cell holds that property's value
    for one data column.  The label row supplies the 'worksheet_column' value
    of each definition.

    'Protein HMS LINCS ID' and 'Cell HMS LINCS ID' values are converted to int
    and, when non-empty, replaced by the matching Protein / Cell object.  All
    other recognised properties are converted using the type looked up for the
    corresponding DataColumn field.

    Returns:
        list of dict, one per data column, keyed by DataColumn field name.

    Raises:
        Any exception raised while reading a cell is logged and re-raised
        unchanged.
    """
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # Lookup all of the field types of the Datacolumn table.
    # These will be used to validate input type by converting on read
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    labels = {
        'Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Name': 'name',
        'Display Name': 'display_name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
        'Protein HMS LINCS ID': 'protein',
        'Cell HMS LINCS ID': 'cell',
    }
    match_flags = re.M | re.I

    # create an array of dict's, each dict defines a DataColumn
    # first the label row (it contains the worksheet column, it is unique)
    dataColumnDefinitions = [
        {labels['Worksheet Column']: v} for v in dataColumnSheet.labels[1:]]
    logger.debug(str(('========== datacolumns:', dataColumnDefinitions)))

    # for each row, create the dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i, cellText in enumerate(rowAsUnicode[1:]):
            try:
                for key, fieldName in labels.items():
                    # if one of the DataColumn fields, add it to the dict
                    if re.match(key, keyRead, match_flags):
                        if re.match('Protein HMS LINCS ID', keyRead,
                                    match_flags):
                            facility_id = util.convertdata(cellText, int)
                            if facility_id:
                                dataColumnDefinitions[i][fieldName] = \
                                    Protein.objects.get(lincs_id=facility_id)
                        elif re.match('Cell HMS LINCS ID', keyRead,
                                      match_flags):
                            facility_id = util.convertdata(cellText, int)
                            if facility_id:
                                dataColumnDefinitions[i][fieldName] = \
                                    Cell.objects.get(facility_id=facility_id)
                        else:
                            # Use the type from the fieldinformation table
                            # to read in the data for each DC field
                            dataColumnDefinitions[i][fieldName] = \
                                util.convertdata(
                                    cellText,
                                    _typelookup.get(fieldName, None))
                    else:
                        logger.debug(str((
                            '"Data Column definition not used: ', cellText)))
            except Exception as e:
                logger.error(str((
                    'Exception reading data for cell', i, cellText, e)))
                # bare raise keeps the original traceback
                raise
    logger.debug(str(("definitions: ", dataColumnDefinitions)))
    # Callers iterate over the definitions, so they must be returned.
    return dataColumnDefinitions
def main(path):
    """
    Read in the cell batch info.

    Reads worksheet 'Sheet1' of the workbook at `path` and saves one CellBatch
    for every row.  The 'Facility ID' column is not stored on the batch; it is
    used to look up the parent Cell, which is stored as the batch's `reagent`.

    Raises:
        Exception: if a required column has no value for a row.
        Any exception raised by the Cell lookup or by creating/saving the
        batch is logged and re-raised unchanged.
    """
    sheet_name = 'Sheet1'
    start_row = 1
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, start_row])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # keep only the text following the first 'HMSL'
        'Facility ID': (
            'facility_id', True, None, lambda x: x[x.index('HMSL')+4:]),
        'CL_Batch_ID': (
            'batch_id', True, None, lambda x: util.convertdata(x, int)),
        'CL_Provider_Name': 'provider_name',
        'CL_Provider_Batch_ID': 'provider_batch_id',
        'CL_Provider_Catalog_ID': 'provider_catalog_id',
        'CL_Quality_Verification': 'quality_verification',
        'CL_Transient_Modification': 'transient_modification',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(('read col: ', i, ', ', col)))
            required = col['required']
            default = col['default']
            converter = col['converter']
            model_field = col['model_field']

            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d' % (
                    col['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))

            if model_field == 'facility_id':
                try:
                    initializer['reagent'] = Cell.objects.get(
                        facility_id=value)
                except Exception:
                    logger.error(str((
                        "Cell not found", value,
                        'row', rows + start_row + 2)))
                    raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            cell_batch = CellBatch(**initializer)
            cell_batch.save()
            logger.debug(str(('cell created:', cell_batch)))
            rows += 1
        except Exception as e:
            logger.error(str((
                "Invalid CellBatch initializer: ", initializer,
                'row', rows + start_row + 2, e)))
            raise
def read_data(book, col_to_dc_map, first_small_molecule_column, dataset):
    """
    Read the 'Data' sheet of `book` and bulk-create its records and points.

    Every row after the first (label) row becomes a DataRecord for `dataset`;
    every non-empty cell in a column listed in `col_to_dc_map` becomes a
    datapoint built by _create_datapoint.  Records and their datapoints are
    written in batches of 1000 rows through bulk_create_datarecords and
    bulk_create_datapoints, and cleanup_unused_datacolumns(dataset) is called
    at the end.

    Side effect: the positions of any label found in the module-level
    `meta_columns` dict are written back into that dict.

    Args:
        book: workbook object providing sheet_by_name().
        col_to_dc_map: dict mapping a Data sheet column index to the data
            column object for that column.
        first_small_molecule_column: passed through unchanged to
            _read_plate_well.
        dataset: the DataSet the records belong to.
    """
    datarecord_batch = []
    save_interval = 1000

    logger.debug('read the Data sheet')
    data_sheet = book.sheet_by_name('Data')

    for col_index, label in enumerate(data_sheet.row_values(0)):
        logger.debug(
            'find datasheet label %r:%r' % (colname(col_index), label))
        if label in meta_columns:
            meta_columns[label] = col_index
            continue

    logger.debug('meta_columns: %s, datacolumnList: %s'
                 % (meta_columns, col_to_dc_map))
    logger.debug('read the data sheet, save_interval: %d' % save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    for row_index in xrange(data_sheet.nrows - 1):
        # 1-based sheet row, counting the label row
        current_row = row_index + 2
        row = data_sheet.row_values(row_index + 1)

        r = util.make_row(row)
        datarecord = DataRecord(dataset=dataset)

        if meta_columns['Control Type'] > -1:
            datarecord.control_type = util.convertdata(
                r[meta_columns['Control Type']])

        datapoint_batch = []
        small_molecule_datapoint = None
        for col_index, dc in col_to_dc_map.items():
            value = r[col_index]
            logger.debug('reading column %r, %s, val: %r'
                         % (colname(col_index), dc, value))
            value = value.strip()
            value = util.convertdata(value)
            if not value:
                continue
            datapoint = _create_datapoint(dc, dataset, datarecord, value)
            datapoint_batch.append(datapoint)
            pointsSaved += 1
            # remember the first small molecule datapoint of the row
            if (not small_molecule_datapoint
                    and dc.data_type == 'small_molecule'):
                small_molecule_datapoint = datapoint

        if meta_columns['Plate'] > -1:
            _read_plate_well(
                meta_columns['Plate'], r, current_row, datarecord,
                first_small_molecule_column, small_molecule_datapoint,
                datapoint_batch)

        datarecord_batch.append((datarecord, datapoint_batch))
        rows_read += 1

        if rows_read % save_interval == 0:
            bulk_create_datarecords(datarecord_batch)
            logger.debug(
                'datarecord batch created, rows_read: %d , time (s): %d'
                % (rows_read, time.time() - loopStart))
            count = bulk_create_datapoints(datarecord_batch)
            logger.debug('datapoints created in batch: %d ' % count)
            datarecord_batch = []

    # flush whatever is left over from the last (partial) batch
    bulk_create_datarecords(datarecord_batch)
    et = time.time() - loopStart
    logger.debug(
        'final datarecord batch created, rows_read: %d, time (s): %d'
        % (rows_read, et))

    count = bulk_create_datapoints(datarecord_batch)
    logger.debug('created dps %d' % count)

    print('Finished reading, rows_read: %s, points Saved: %s'
          % (rows_read, pointsSaved))
    # An empty Data sheet must not turn the summary into a ZeroDivisionError.
    avg = et / rows_read if rows_read else 0
    print('elapsed: %s avg: %s' % (et, avg))

    cleanup_unused_datacolumns(dataset)
def main(import_file, file_directory, deploy_dir):
    """
    Read in the qc events for batches
    - version 1 - for small molecule batches

    Reads worksheet "Sheet1" of `import_file`.  For every row:
      * each non-empty "file<N>" value is resolved against `file_directory`
        and copied into the deploy directory (keeping its relative sub
        path), replacing any file already deployed under that name;
      * a QCEvent is created and saved;
      * one QCAttachedFile is created and saved per deployed file.

    Args:
        import_file: path of the workbook to read.
        file_directory: directory the "file<N>" values are relative to.
        deploy_dir: destination directory; when empty,
            settings.STATIC_AUTHENTICATED_FILE_DIR is used instead.

    Raises:
        Exception: if a required column has no value, a referenced file does
            not exist, the deploy directory does not exist, or a file could
            not be deployed.
        Any exception raised while creating/saving the QCEvent or its
        attached files is logged and re-raised unchanged.
    """
    sheet_name = "Sheet1"
    start_row = 0
    # Note, skipping the header row by default
    sheet = iu.readtable([import_file, sheet_name, start_row])

    def to_int(x):
        return util.convertdata(x, int)

    properties = ("model_field", "required", "default", "converter")
    column_definitions = {
        "facility_id": ("facility_id_for", True, None, to_int),
        "salt_id": ("salt_id_for", False, None, to_int),
        "batch_id": ("batch_id_for", True, None, to_int),
        "QC event date": ("date", True, None, util.date_converter),
        "outcome": ("outcome", True),
        "comment": "comment",
        "is_restricted": (
            "is_restricted", False, False, util.bool_converter),
        "file1": "file1",
        "file2": "file2",
        "file3": "file3",
        "file4": "file4",
        "file5": "file5",
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column
    # definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(("cols: ", cols)))
    for row in sheet:
        r = util.make_row(row)
        # store each row in a dict
        _dict = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            col = cols[i]

            logger.debug(str(("read col: ", i, ", ", col)))
            required = col["required"]
            default = col["default"]
            converter = col["converter"]
            model_field = col["model_field"]

            logger.debug(str(("raw value", value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception("Field is required: %s, record: %d"
                                % (col["column_label"], rows))
            logger.debug(
                str(("model_field: ", model_field, ", value: ", value)))
            _dict[model_field] = value

        logger.debug(str(("dict: ", _dict)))

        files_to_attach = []
        for file_number in range(10):
            fileprop = _dict.get("file%s" % file_number)
            if not fileprop:
                continue
            filepath = os.path.join(file_directory, fileprop)
            if not os.path.exists(filepath):
                # same sheet-row arithmetic as the error report below
                raise Exception(str((
                    "file does not exist:", filepath,
                    "row", rows + start_row + 2)))
            filename = os.path.basename(filepath)
            # whatever precedes the file name in the sheet value
            relative_path = fileprop[: fileprop.index(filename)]

            # Copy the file into the deploy directory
            dest_dir = deploy_dir
            if not dest_dir:
                dest_dir = settings.STATIC_AUTHENTICATED_FILE_DIR
            if not os.path.isdir(dest_dir):
                raise Exception(str((
                    "no such deploy directory, please create it", dest_dir)))
            if relative_path:
                dest_dir = os.path.join(dest_dir, relative_path)
                if not os.path.exists(dest_dir):
                    os.makedirs(dest_dir)
            deployed_path = os.path.join(dest_dir, filename)

            logger.debug(str(("deploy", filepath, deployed_path)))
            if os.path.exists(deployed_path):
                os.remove(deployed_path)
            copy(filepath, deployed_path)
            if not os.path.isfile(deployed_path):
                raise Exception(str(("could not deploy to", deployed_path)))
            logger.debug(str(("successfully deployed to", deployed_path)))

            files_to_attach.append((filename, relative_path))

        initializer = None
        try:
            # create the qc record
            initializer = {
                key: _dict[key]
                for key in [
                    "facility_id_for", "salt_id_for", "batch_id_for",
                    "outcome", "comment", "date"]
            }
            qc_event = QCEvent(**initializer)
            qc_event.save()
            logger.debug(str(("saved", qc_event)))

            # create attached file records
            for (filename, relative_path) in files_to_attach:
                initializer = {
                    "qc_event": qc_event,
                    "filename": filename,
                    "relative_path": relative_path,
                    "is_restricted": _dict["is_restricted"],
                }
                qc_attached_file = QCAttachedFile(**initializer)
                qc_attached_file.save()
                logger.debug(
                    str(("created qc attached file", qc_attached_file)))

            rows += 1

        except Exception as e:
            logger.error(str((
                "Invalid initializer: ", initializer,
                "row", rows + start_row + 2, e)))
            raise
def main(path):
    """
    Read in the OtherReagent

    Reads 'Sheet1' of the workbook at path through iu.readtable and creates
    one OtherReagent per row from the mapped columns.

    Raises Exception when a required field is empty.  Any exception raised
    while creating or saving an OtherReagent is logged together with the
    initializer and re-raised.
    """
    sheet_name = 'Sheet1'
    # Note, skipping the header row by default
    sheet = iu.readtable([path, sheet_name, 1])

    properties = ('model_field', 'required', 'default', 'converter')
    # NOTE(review): no converter is configured for 'Is Restricted', so the
    # value is stored exactly as util.make_row delivers it -- confirm intended.
    column_definitions = {
        'OR_ID': 'lincs_id',
        'Facility ID': ('facility_id', True),
        'OR_Alternate_ID': 'alternate_id',
        'OR_Primary_Name': ('name', True),
        'OR_Alternate_Name': 'alternative_names',
        'OR_Role': 'role',
        'OR_Reference': 'reference',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False)}

    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            # Separate name so the `properties` tuple above is not clobbered.
            col_properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', col_properties)))
            required = col_properties['required']
            default = col_properties['default']
            converter = col_properties['converter']
            model_field = col_properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col_properties['column_label'], rows))
            logger.debug(str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            reagent = OtherReagent(**initializer)
            reagent.save()
            logger.info(str(('OtherReagent created: ', reagent)))
            rows += 1
        except Exception:
            logger.error(str(("Invalid OtherReagent initializer: ", initializer)))
            raise
def main(path):
    """
    Read in the Antibody Batches

    Reads 'Sheet1' of the workbook at path through iu.readtable and creates
    one AntibodyBatch per row.  The 'AR_Center_Specific_ID' column is used to
    look up the parent Antibody by facility_id; it is not passed to the
    AntibodyBatch itself.

    Raises Exception when a required field is empty.  Any exception raised
    while creating or saving an AntibodyBatch is logged together with the
    initializer and re-raised.
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 0])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # Keeps only the text that follows the 'HMSL' marker.
        'AR_Center_Specific_ID': ('antibody_facility_id', True, None,
                                  lambda x: x[x.index('HMSL') + 4:]),
        'AR_Batch_ID': ('batch_id', True, None,
                        lambda x: util.convertdata(x, int)),
        'AR_Provider_Name': 'provider_name',
        'AR_Provider_Catalog_ ID': 'provider_catalog_id',
        'AR_Provider_Batch_ID': 'provider_batch_id',
        'AR_Antibody_Purity': 'antibody_purity',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            # Separate name so the `properties` tuple above is not clobbered.
            col_properties = cols[i]

            logger.debug('read col: %d: %s' % (i, col_properties))
            required = col_properties['required']
            default = col_properties['default']
            converter = col_properties['converter']
            model_field = col_properties['model_field']

            logger.debug('raw value %r' % value)
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col_properties['column_label'], rows))
            logger.debug('model_field: %s, converted value %r'
                         % (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('initializer: %s' % initializer)

            antibody_facility_id = initializer.pop('antibody_facility_id', None)
            if antibody_facility_id:
                try:
                    antibody = Antibody.objects.get(
                        facility_id=antibody_facility_id)
                    initializer['reagent'] = antibody
                except ObjectDoesNotExist:
                    # Only logged: the batch is still built below, without a
                    # 'reagent' entry.  Report the record counter here, not
                    # the column index left over from the loop above.
                    logger.error(
                        'AR_Center_Specific_ID: "%s" does not exist, row: %d'
                        % (antibody_facility_id, rows))
            antibody_batch = AntibodyBatch(**initializer)
            antibody_batch.save()
            logger.info('antibody batch created: %s' % antibody_batch)
            rows += 1
        except Exception:
            logger.error("Invalid antibody_batch initializer: %s" % initializer)
            raise
def main(path):
    """
    Read in the Antibody

    Reads 'Sheet1' of the workbook at path through iu.readtable and creates
    one Antibody per row, followed by a default AntibodyBatch (batch_id 0)
    for it.  The 'target_protein_lincs_id' column is used to look up the
    target Protein by lincs_id; it is not passed to the Antibody itself.

    Raises Exception when a required field is empty.  Any exception raised
    while creating the records is logged together with the initializer and
    re-raised.
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 0])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'AR_Name': ('name', True),
        'AR_LINCS_ID': 'lincs_id',
        'AR_Alternative_Name': 'alternative_names',
        # Keeps only the text that follows the 'HMSL' marker.
        'AR_Center_Specific_ID': ('facility_id', True, None,
                                  lambda x: x[x.index('HMSL') + 4:]),
        'AR_Clone_Name': 'clone_name',
        'AR_RRID': 'rrid',
        'AR_Antibody_Type': 'type',
        'target_protein_lincs_id': ('target_protein_lincs_id', False, None,
                                    lambda x: x[x.index('HMSL') + 4:] if x else None),
        'AR_Non-Protein_Target': 'non_protein_target_name',
        'AR_Target_Organism': 'target_organism',
        'AR_Immunogen': 'immunogen',
        'AR_Immunogen_Sequence': 'immunogen_sequence',
        'AR_Antibody_Species': 'species',
        'AR_Antibody_Clonality': 'clonality',
        'AR_Antibody_Isotype': 'isotype',
        'AR_Antibody_Production_Source_Organism': 'source_organism',
        'AR_Antibody_Production_Details': 'production_details',
        'AR_Antibody_Labeling': 'labeling',
        'AR_Antibody_Labeling_Details': 'labeling_details',
        'AR_Relevant_Citations': 'relevant_citations',
        'Date Data Received': ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False, util.bool_converter)
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        logger.debug('row %s - %s' % (rows, row))
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            # Separate name so the `properties` tuple above is not clobbered.
            col_properties = cols[i]

            logger.debug('read col: %d: %s' % (i, col_properties))
            required = col_properties['required']
            default = col_properties['default']
            converter = col_properties['converter']
            model_field = col_properties['model_field']

            logger.debug('raw value %r' % value)
            # The literal string 'None' is treated as an empty cell; empty
            # cells fall back to the column default when one is defined.
            if value is None or value == 'None':
                value = None
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col_properties['column_label'], rows))
            # Converters only ever see truthy values, so they are never
            # handed None or an empty string.
            if value and converter is not None:
                value = converter(value)
            logger.debug('model_field: %s, converted value %r'
                         % (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('row: %s, initializer: %s' % (rows, initializer))

            target_protein_lincs_id = initializer.pop(
                'target_protein_lincs_id', None)
            if target_protein_lincs_id:
                try:
                    target_protein = Protein.objects.get(
                        lincs_id=target_protein_lincs_id)
                    initializer['target_protein'] = target_protein
                except ObjectDoesNotExist:
                    # Only logged: the antibody is still created below,
                    # without a 'target_protein' entry.  Report the record
                    # counter here, not the column index left over from the
                    # loop above.
                    logger.error(
                        'target_protein_lincs_id "%s" does not exist, row: %d'
                        % (target_protein_lincs_id, rows))
            antibody = Antibody(**initializer)
            antibody.save()
            logger.info('antibody created: %s' % antibody)
            rows += 1

            # create a default batch - 0
            AntibodyBatch.objects.create(reagent=antibody, batch_id=0)

        except Exception:
            logger.error("Invalid antibody initializer: %s" % initializer)
            raise
continue logger.debug('meta_columns: %s, datacolumnList: %s' % (meta_columns, col_to_dc_map)) logger.debug('read the data sheet, save_interval: %d' % save_interval) loopStart = time.time() pointsSaved = 0 rows_read = 0 col_to_dc_items = col_to_dc_map.items() for i in xrange(data_sheet.nrows - 1): current_row = i + 2 row = data_sheet.row_values(i + 1) r = util.make_row(row) datarecord = DataRecord(dataset=dataset) if meta_columns['Control Type'] > -1: datarecord.control_type = util.convertdata( r[meta_columns['Control Type']]) datapoint_batch = [] small_molecule_datapoint = None for i, dc in col_to_dc_items: value = r[i] logger.debug('reading column %r, %s, val: %r' % (colname(i), dc, value)) value = value.strip() value = util.convertdata(value) if not value:
def main(import_file, file_directory, deploy_dir):
    """
    Read in the qc events for batches
    - version 1 - for small molecule batches

    For every data row of 'Sheet1' in import_file: convert the mapped
    columns, copy any files named in the file columns into the deploy
    directory, then create one QCEvent and one QCAttachedFile per copied file.

    import_file    -- workbook path handed to iu.readtable
    file_directory -- directory the file column values are joined onto to
                      locate the files to attach
    deploy_dir     -- directory the files are copied into; when falsy,
                      settings.STATIC_AUTHENTICATED_FILE_DIR is used instead

    Raises Exception when a required field is empty, a referenced file or the
    deploy directory does not exist, or a copied file cannot be found
    afterwards.  Any exception raised while creating the records is logged
    and re-raised.
    """
    sheet_name = 'Sheet1'
    start_row = 0
    # Note, skipping the header row by default
    sheet = iu.readtable([import_file, sheet_name, start_row])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'facility_id': ('facility_id_for', True, None,
                        lambda x: util.convertdata(x, int)),
        'salt_id': ('salt_id_for', False, None,
                    lambda x: util.convertdata(x, int)),
        'batch_id': ('batch_id_for', True, None,
                     lambda x: util.convertdata(x, int)),
        'QC event date': ('date', True, None, util.date_converter),
        'outcome': ('outcome', True),
        'comment': 'comment',
        'is_restricted': ('is_restricted', False, False, util.bool_converter),
        'file1': 'file1',
        'file2': 'file2',
        'file3': 'file3',
        'file4': 'file4',
        'file5': 'file5',
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        # One row number for every error report about this row, so the
        # messages agree with each other.  The "+ 2" offset is kept from the
        # original failure log -- presumably header row plus 1-based
        # numbering; TODO confirm.
        sheet_row = rows + start_row + 2

        # store each row in a dict
        _dict = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            # Separate name so the `properties` tuple above is not clobbered.
            col_properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', col_properties)))
            required = col_properties['required']
            default = col_properties['default']
            converter = col_properties['converter']
            model_field = col_properties['model_field']

            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col_properties['column_label'], rows))

            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            _dict[model_field] = value

        logger.debug(str(('dict: ', _dict)))

        files_to_attach = []
        # Probe the keys file0 .. file9; keys that were never read are skipped.
        for file_index in range(10):
            filenameProp = 'file%s' % file_index
            if not _dict.get(filenameProp, None):
                continue
            fileprop = _dict[filenameProp]
            filepath = os.path.join(file_directory, fileprop)
            if not os.path.exists(filepath):
                raise Exception(
                    str(('file does not exist:', filepath, 'row', sheet_row)))
            filename = os.path.basename(filepath)
            # Everything in the cell value that precedes the file name
            # (including any trailing separator) is the relative path.
            relative_path = fileprop[:fileprop.index(filename)]

            # Move the file
            dest_dir = deploy_dir
            if not dest_dir:
                dest_dir = settings.STATIC_AUTHENTICATED_FILE_DIR
            if not os.path.isdir(dest_dir):
                raise Exception(
                    str(('no such deploy directory, please create it',
                         dest_dir)))
            if relative_path:
                dest_dir = os.path.join(dest_dir, relative_path)
                if not os.path.exists(dest_dir):
                    os.makedirs(dest_dir)
            deployed_path = os.path.join(dest_dir, filename)

            logger.debug(str(('deploy', filepath, deployed_path)))
            # Replace any previously deployed copy.
            if os.path.exists(deployed_path):
                os.remove(deployed_path)
            copy(filepath, deployed_path)
            if not os.path.isfile(deployed_path):
                raise Exception(str(('could not deploy to', deployed_path)))
            logger.debug(str(('successfully deployed to', deployed_path)))

            files_to_attach.append((filename, relative_path))

        initializer = None
        try:
            # create the qc record
            initializer = {
                key: _dict[key]
                for key in [
                    'facility_id_for', 'salt_id_for', 'batch_id_for',
                    'outcome', 'comment', 'date'
                ]
            }
            qc_event = QCEvent(**initializer)
            qc_event.save()
            logger.debug(str(('saved', qc_event)))

            # create attached file records
            for (filename, relative_path) in files_to_attach:
                initializer = {
                    'qc_event': qc_event,
                    'filename': filename,
                    'relative_path': relative_path,
                    'is_restricted': _dict['is_restricted']
                }
                qc_attached_file = QCAttachedFile(**initializer)
                qc_attached_file.save()
                logger.debug(
                    str(('created qc attached file', qc_attached_file)))

            rows += 1

        except Exception as e:
            # `initializer` holds whichever record was being built when the
            # failure happened.
            logger.error(
                str(("Invalid initializer: ", initializer, 'row', sheet_row, e)))
            raise
def main(path):
    """
    Read in the OtherReagent batches

    Reads 'sheet 1' of the workbook at path through iu.readtable and creates
    one OtherReagentBatch per row.  The 'facility_id' column is used to look
    up the parent OtherReagent; it is not passed to the batch itself.

    Raises Exception when a required field is empty.  Any exception raised
    while creating or saving a batch is logged together with the initializer
    and re-raised.
    """
    sheet_name = 'sheet 1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'facility_id': (
            'facility_id', True, None, lambda x: util.convertdata(x, int)),
        'facility_batch_id': (
            'batch_id', True, None, lambda x: util.convertdata(x, int)),
        'provider': ('provider_name', False),
        'provider_catalog_id': 'provider_catalog_id',
        'provider_sample_id': 'provider_batch_id',
        'Date Data Received': (
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
    }
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    cols = util.find_columns(column_definitions, sheet.labels,
                             all_sheet_columns_required=False)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols:
                continue
            # Separate name so the `properties` tuple above is not clobbered.
            col_properties = cols[i]

            logger.debug('read col: %d: %s' % (i, col_properties))
            required = col_properties['required']
            default = col_properties['default']
            converter = col_properties['converter']
            model_field = col_properties['model_field']

            logger.debug('raw value %r' % value)
            if converter is not None:
                value = converter(value)
            if value is None and default is not None:
                value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d'
                                % (col_properties['column_label'], rows))
            logger.debug('model_field: %s, converted value %r'
                         % (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('initializer: %s' % initializer)

            facility_id = initializer.pop('facility_id', None)
            try:
                other_reagent = OtherReagent.objects.get(facility_id=facility_id)
                initializer['reagent'] = other_reagent
            except ObjectDoesNotExist:
                # Only logged: the batch is still built below, without a
                # 'reagent' entry.  Report the record counter here, not the
                # column index left over from the loop above.
                logger.error('facility_id: "%s" does not exist, row: %d'
                             % (facility_id, rows))
            batch = OtherReagentBatch(**initializer)
            batch.save()
            logger.debug('batch created: %s', batch)
            rows += 1
        except Exception:
            logger.error("Invalid other_reagent_batch initializer: %s" % initializer)
            raise