Example #1
0
def _read_small_molecule_batch(map_column,r,current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        value = util.convertdata(r[map_column].strip())
        if(value != None and value != '' ):
            value = value.split("-")
            if len(value) < 2: 
                raise Exception('Small Molecule (Batch) format is '
                                '#####-###(-#) **Note that (batch) is optional')
            x = value[0]
            facility = util.convertdata(x,int) 
            salt = value[1]
            try:
                dr.smallmolecule = SmallMolecule.objects.get(
                    facility_id=facility, salt_id=salt)
            except Exception, e:
                logger.error(str(('could not locate small molecule:', 
                                  facility,e)))
                raise
            if(len(value)>2):
                dr.batch_id = util.convertdata(value[2],int)
                # TODO: validate that the batch exists?  (would need to
                # do for all types, not just Small Molecule
    except Exception, e:
        logger.error(str((
            "Invalid Small Molecule (or batch) identifiers: ", value, 
            'row',current_row,e)))
        raise    
Example #2
0
def _create_datapoint(dataColumn, dataset, dataRecord, value):
    '''
    Build (but do not save) a DataPoint for the given column, converting
    the raw value to the column's storage type.
    '''
    # constructor arguments common to every branch
    common = dict(datacolumn=dataColumn, dataset=dataset,
                  datarecord=dataRecord)
    # TODO: define allowed "types" for the input sheet 
    # (this is listed in current SS code, but we may want to rework)
    if dataColumn.data_type == 'Numeric':
        if dataColumn.precision != 0:
            # float, TODO: set precision
            return DataPoint(float_value=util.convertdata(value, float),
                             **common)
        return DataPoint(int_value=util.convertdata(value, int), **common)
    if dataColumn.data_type == 'omero_image':
        return DataPoint(int_value=util.convertdata(value, int), **common)
    # ONLY text, for now, we'll need to define the allowed types, next!
    return DataPoint(text_value=util.convertdata(value), **common)
Example #3
0
def main(path):
    """
    Read in the Library and LibraryMapping sheets
    """
    libraries = readLibraries(path, 'Library')

    labels = {
        'Facility': 'facility_id',
        'Salt': 'sm_salt',
        'Batch': 'facility_batch_id',
        'Plate': 'plate',
        'Well': 'well',
        'Library Name': 'short_name',
        'Concentration': 'concentration',
        'Concentration Unit': 'concentration_unit'
    }

    small_molecule_lookup = ('facility_id', 'sm_salt', 'facility_batch_id')
    sheet = iu.readtable([path, 'LibraryMapping'])

    #dict to map spreadsheet fields to terms
    cols = {}
    # first put the label row in (it contains the worksheet column, and its unique)
    for i, label in enumerate(sheet.labels):
        if label in labels:
            cols[labels[label]] = i
        else:
            print 'Note: column label not found:', label
    rows = 0
    for row in sheet:
        r = util.make_row(row)
        # small molecule
        dict = {}
        for field in small_molecule_lookup:
            dict[field] = util.convertdata(r[cols[field]], int)
        try:
            dict['facility_id'] = 'HMSL' + str(
                dict['facility_id']
            )  # TODO: convert all hmsl id's to integers!!
            sm = SmallMolecule.objects.get(**dict)
        except Exception, e:
            print "Invalid small molecule identifiers: ", dict
            raise
        short_name = r[cols['short_name']]
        if short_name not in libraries:
            print "Library not found: ", short_name
            raise
        lm = {}
        lm['concentration'] = util.convertdata(r[cols['concentration']], float)
        lm['concentration_unit'] = util.convertdata(
            r[cols['concentration_unit']], None)
        lm['plate'] = util.convertdata(r[cols['plate']], int)
        lm['well'] = r[cols['well']]
        lm['small_molecule'] = sm
        lm['library'] = libraries[short_name]
        lm = LibraryMapping(**lm)
        lm.save()
        rows += 1
Example #4
0
def _read_plate_well(map_column,r,current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        plate_id=None
        well_id=None
        value = util.convertdata(r[map_column].strip())
        if(value != None and value != '' ):
            plate_id = util.convertdata(value,int)
         
            value = util.convertdata(r[map_column+1].strip())
            if(value != None and value != '' ):
                well_id = value 
            else:
                raise Exception(str((
                    'Must define both plate and well (not just plate), row', 
                    current_row)))
                
            dr.plate = plate_id
            dr.well = well_id
            try:
                # TODO: 
                # What if the plate/well does not correlate to a 
                # librarymapping?  
                # i.e. if this is the plate/well for a cell/protein study?
                # For now, the effect of the following logic is that 
                # plate/well either maps a librarymapping, or is a an 
                # arbitrary plate/well.
                dr.library_mapping = \
                    LibraryMapping.objects.get(plate=plate_id,well=well_id)
                if(dr.smallmolecule != None):
                    if(dr.smallmolecule != None and 
                       dr.library_mapping.smallmolecule_batch != None and 
                       (dr.smallmolecule != 
                           dr.library_mapping.smallmolecule_batch.smallmolecule)):
                        raise Exception(str((
                            'SmallMolecule does not match the '
                            'libraryMapping.smallmolecule_batch.smallmolecule '
                            'pointed to by the plate/well:'
                            ,plate_id,well_id,
                            dr.smallmolecule,
                            dr.library_mapping.smallmolecule_batch.smallmolecule,
                            r,'row',current_row)))
                elif(dr.library_mapping.smallmolecule_batch != None):
                    dr.smallmolecule = \
                        dr.library_mapping.smallmolecule_batch.smallmolecule
            except ObjectDoesNotExist, e:
                logger.warn(str((
                    'No librarymapping defined (plate/well do not point to a '
                    'librarymapping), row', current_row))) 
    except Exception, e:
        logger.error(str(("Invalid plate/well identifiers",plate_id,well_id,r,
            e,'row',current_row,e)))
        raise e
Example #5
0
def _read_plate_well(map_column, r, current_row, dr, small_mol_col,
                     small_molecule_datapoint, datapoint_batch):
    '''
    Read plate and well ids from the row, attach them to the data
    record, and link the LibraryMapping for the plate/well.

    @param map_column index of the plate column (well is the next column)
    @param r row
    @param current_row row number (for error reporting)
    @param dr dataRecord
    @param small_mol_col DataColumn for the implicit small molecule
    @param small_molecule_datapoint existing small molecule DataPoint
        for this record, if any (used for consistency checking)
    @param datapoint_batch list collecting DataPoints to bulk-save;
        appended to when a small molecule datapoint is created here
    @raise Exception if only a plate (no well) is given, or if the
        existing small molecule conflicts with the library mapping's
    '''
    plate_id = None
    well_id = None
    try:
        value = util.convertdata(r[map_column].strip())
        if (value != None and value != ''):
            plate_id = util.convertdata(value, int)

            value = util.convertdata(r[map_column + 1].strip())
            if (value != None and value != ''):
                well_id = value
            else:
                raise Exception(
                    'Must define both plate and well (not just plate), row: %d'
                    % current_row)

            dr.plate = plate_id
            dr.well = well_id
            dr.library_mapping = LibraryMapping.objects.get(plate=plate_id,
                                                            well=well_id)

            # Legacy loading use-case:
            # - if small molecule already specified, check that it is the same
            # - if no small molecule specified yet, associate the plate:well
            # small molecule with a datapoint and the dataset
            if (dr.library_mapping.smallmolecule_batch != None):
                if small_molecule_datapoint and small_molecule_datapoint.reagent_batch:
                    if small_molecule_datapoint.reagent_batch != dr.library_mapping.smallmolecule_batch:
                        raise Exception(
                            ('plate:well entry %s '
                             'does not match small molecule %r, row: %s') %
                            (well_id, dr.library_mapping.smallmolecule_batch,
                             current_row))
                else:
                    dr.dataset.small_molecules.add(
                        dr.library_mapping.smallmolecule_batch)
                    text_value = dr.library_mapping.smallmolecule_batch.reagent.facility_id
                    text_value += '-%s' % dr.library_mapping.smallmolecule_batch.reagent.salt_id
                    # batch_id 0 means "no explicit batch"; omit it
                    if dr.library_mapping.smallmolecule_batch.batch_id != 0:
                        text_value += '-%s' % dr.library_mapping.smallmolecule_batch.batch_id
                    datapoint = DataPoint(
                        datacolumn=small_mol_col,
                        dataset=dr.dataset,
                        datarecord=dr,
                        reagent_batch=dr.library_mapping.smallmolecule_batch,
                        text_value=text_value)
                    datapoint_batch.append(datapoint)
    except Exception:
        logger.exception(('Invalid plate/row information, '
                          'plate: %r, well: %r, data: %s, row_number: %d') %
                         (plate_id, well_id, r, current_row))
        # bug fix: bare 'raise' preserves the original traceback
        # ('raise e' discards it in python 2)
        raise
Example #6
0
def main(path):
    """
    Read in the Library and LibraryMapping sheets
    """
    libraries = readLibraries(path,'Library')
    
    labels = { 'Facility':'facility_id',
               'Salt':'sm_salt',
               'Batch':'facility_batch_id',
               'Plate':'plate',
               'Well':'well',
               'Library Name':'short_name',
               'Concentration': 'concentration',
               'Concentration Unit':'concentration_unit'
               }
    
    small_molecule_lookup = ('facility_id', 'sm_salt', 'facility_batch_id')
    sheet = iu.readtable([path, 'LibraryMapping'])
    
    #dict to map spreadsheet fields to terms
    cols = {}
    # first put the label row in (it contains the worksheet column, and its unique)
    for i,label in enumerate(sheet.labels):
        if label in labels:
            cols[labels[label]] = i
        else:
            print 'Note: column label not found:', label    
    rows = 0
    for row in sheet:
        r = util.make_row(row)
        # small molecule
        dict = {}
        for field in small_molecule_lookup:
            dict[field] = util.convertdata(r[cols[field]],int)
        try:
            dict['facility_id'] = 'HMSL' + str(dict['facility_id']) # TODO: convert all hmsl id's to integers!!
            sm = SmallMolecule.objects.get(**dict)
        except Exception, e:
            print "Invalid small molecule identifiers: ", dict
            raise 
        short_name = r[cols['short_name']]
        if short_name not in libraries:
            print "Library not found: ", short_name
            raise
        lm = {}
        lm['concentration'] = util.convertdata(r[cols['concentration']],float)
        lm['concentration_unit'] = util.convertdata(r[cols['concentration_unit']],None)
        lm['plate'] = util.convertdata(r[cols['plate']], int)
        lm['well'] = r[cols['well']]
        lm['small_molecule'] = sm
        lm['library'] = libraries[short_name]
        lm = LibraryMapping(**lm)
        lm.save()
        rows += 1
Example #7
0
def _read_plate_well(map_column, r, current_row, dr, small_molecule_column,
        small_molecule_datapoint,datapoint_batch):
    '''
    Read plate and well ids from the row, attach them to the data
    record, and link the LibraryMapping for the plate/well.

    @param map_column index of the plate column (well is the next column)
    @param r row
    @param current_row row number (for error reporting)
    @param dr dataRecord
    @param small_molecule_column DataColumn for the implicit small molecule
    @param small_molecule_datapoint existing small molecule DataPoint
        for this record, if any (used for consistency checking)
    @param datapoint_batch list collecting DataPoints to bulk-save
    @raise Exception if only a plate (no well) is given, or if the
        existing small molecule conflicts with the library mapping's
    '''
    plate_id=None
    well_id=None
    try:
        value = util.convertdata(r[map_column].strip())
        if (value != None and value != ''):
            plate_id = util.convertdata(value, int)
         
            value = util.convertdata(r[map_column+1].strip())
            if (value != None and value != '' ):
                well_id = value 
            else:
                raise Exception(
                    'Must define both plate and well (not just plate), row: %d' 
                        % current_row)
                
            dr.plate = plate_id
            dr.well = well_id
            dr.library_mapping = LibraryMapping.objects.get(
                plate=plate_id, well=well_id)
            
            # Legacy loading use-case:
            # - if small molecule already specified, check that it is the same
            # - if no small molecule specified yet, associate the plate:well
            # small molecule with a datapoint and the dataset
            if(dr.library_mapping.smallmolecule_batch != None):
                if small_molecule_datapoint and small_molecule_datapoint.reagent_batch:
                    if small_molecule_datapoint.reagent_batch != dr.library_mapping.smallmolecule_batch:
                        raise Exception((
                            'plate:well entry %s '
                            'does not match small molecule %r, row: %s')
                            % (well_id, dr.library_mapping.smallmolecule_batch, current_row))
                else:
                    dr.dataset.small_molecules.add(dr.library_mapping.smallmolecule_batch)
                    text_value = dr.library_mapping.smallmolecule_batch.reagent.facility_id
                    text_value += '-%s' % dr.library_mapping.smallmolecule_batch.reagent.salt_id
                    # batch_id 0 means "no explicit batch"; omit it
                    if dr.library_mapping.smallmolecule_batch.batch_id != 0:
                        text_value += '-%s' % dr.library_mapping.smallmolecule_batch.batch_id
                    datapoint = DataPoint(datacolumn=small_molecule_column,
                                      dataset = dr.dataset,
                                      datarecord = dr,
                                      reagent_batch=dr.library_mapping.smallmolecule_batch,
                                      text_value=text_value)
                    datapoint_batch.append(datapoint)
    except Exception:
        logger.exception(
            ('Invalid plate/row information, '
            'plate: %r, well: %r, data: %s, row_number: %d')
            % ( plate_id,well_id, r, current_row ))
        # bug fix: bare 'raise' preserves the original traceback
        # ('raise e' discards it in python 2)
        raise
Example #8
0
def _read_protein(map_column,r,current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        value = util.convertdata(r[map_column].strip())
        if(value != None and value != '' ):
            facility_id = r[map_column]
            facility_id = util.convertdata(facility_id,int) 
            dr.protein = Protein.objects.get(lincs_id=facility_id) 
    except Exception, e:
        logger.error(str((
            "Invalid Protein facility id: ", value,'row',current_row, e)))
        raise
Example #9
0
def _read_cell(map_column,r,current_row, dr):
    '''
    @param r row
    @param dr dataRecord
    '''
    try:
        value = util.convertdata(r[map_column].strip())
        facility_id = None
        if(value != None and value != '' ):
            facility_id = util.convertdata(value,int) 
            dr.cell = Cell.objects.get(facility_id=facility_id) 
    except Exception, e:
        logger.error(str(("Invalid Cell facility id: ", facility_id,
                          'row',current_row, e)))
        raise    
Example #10
0
def read_explicit_reagents(book, dataset):
    
    try:
        reagents_sheet = book.sheet_by_name('Reagents')
        for row in range(1,reagents_sheet.nrows):
            facility_batch_id = read_string(reagents_sheet.cell(row,0))
            vals = [
                 util.convertdata(x,int) for x in facility_batch_id.split('-')]
            
            logger.info('facility_batch_id: %r', vals)
            
            if len(vals)>3:
                raise Exception(
                    'Reagent id has too many values: %r', facility_batch_id)
            
            if (len(vals)==3):
                smb = SmallMoleculeBatch.objects.get(
                    reagent__facility_id=vals[0],
                    reagent__salt_id=vals[1],
                    batch_id=vals[2])
                logger.info('small molecule batch found: %r', smb)
                dataset.small_molecules.add(smb)
            else:
                if len(vals)==2:
                    if len(str(vals[1]))==3:
                        smb = SmallMoleculeBatch.objects.get(
                            reagent__facility_id=vals[0],
                            reagent__salt_id=vals[1],
                            batch_id=0)
                        logger.info('small molecule batch found: %r', smb)
                        dataset.small_molecules.add(smb)
                        continue
                    
                    rb = ReagentBatch.objects.get(
                        reagent__facility_id=vals[0],
                        batch_id=vals[1])
                else:
                    rb = ReagentBatch.objects.get(
                        reagent__facility_id=vals[0],
                        batch_id=0)
                if hasattr(rb,'antibodybatch'):
                    logger.info('antibody reagent found: %r', rb)
                    dataset.antibodies.add(rb.antibodybatch)
                elif hasattr(rb, 'cellbatch'):
                    logger.info('cell reagent found: %r', rb)
                    dataset.cells.add(rb.cellbatch)
                elif hasattr(rb, 'otherreagentbatch'):
                    logger.info('other_reagent reagent found: %r', rb)
                    dataset.other_reagents.add(rb.otherreagentbatch)
                elif hasattr(rb, 'primarycellbatch'):
                    logger.info('primary cell reagent found: %r', rb)
                    dataset.primary_cells.add(rb.primarycellbatch)
                elif hasattr(rb, 'proteinbatch'):
                    logger.info('protein reagent found: %r', rb)
                    dataset.proteins.add(rb.proteinbatch)
                else:
                    raise Exception('unknown reagent type: %r', rb)
        dataset.save()
    except XLRDError, e:
        logger.info('no "Reagents" sheet found')
Example #11
0
def read_metadata(path):
    """
    Read in the DataSets, Datacolumns, and Data sheets.  In the Data sheet, rows are DataRecords, and columns are DataPoints
    """
    # Read in the DataSet
    sheetname = 'Meta'
    # Define the Column Names -> model fields mapping
    labels = {'Lead Screener First': 'lead_screener_firstname',
              'Lead Screener Last': 'lead_screener_lastname',
              'Lead Screener Email': 'lead_screener_email',
              'Lab Head First': 'lab_head_firstname',
              'Lab Head Last': 'lab_head_lastname',
              'Lab Head Email': 'lab_head_email',
              'Title': 'title',
              'Facility ID': 'facility_id',
              'Summary': 'summary',
              'Protocol': 'protocol',
              'References': 'protocol_references'}
    
    metaSheet = iu.readtable([path, sheetname]) # Note, skipping the header row by default
    metaData = {}
    for row in metaSheet:
        rowAsUnicode = util.make_row(row)
        for key,value in labels.items():
            if re.match(key, rowAsUnicode[0], re.M|re.I):
                if key == 'Facility ID':
                    metaData[value] = util.convertdata(rowAsUnicode[1],int)
                else:
                    metaData[value] = rowAsUnicode[1]
    # bug fix: '%' formatting needs a tuple of arguments; the original
    # '% [labels, metaData]' raised TypeError when the assert failed
    assert len(metaData) == len(labels), (
        'Meta data sheet does not contain the necessary keys, '
        'expected: %s, read: %s' % (labels, metaData))
    
    return metaData            
def readLibraries(path, sheetName):
    
    sheet = iu.readtable([path, sheetName]) # Note, skipping the header row by default
    # dict to map spreadsheet fields to the Library fields
    properties = ('model_field','required','default','converter')
    date_parser = lambda x : util.convertdata(x,date)
    column_definitions = {'Name': ('name',True), # TODO use the model to determine if req'd
                          'ShortName': ('short_name',True),
                          'Library Type':'type',
                          'Date First Plated': ('date_first_plated',False,None,date_parser),
                          'Date Data Received':('date_data_received',False,None,date_parser),
                          'Date Loaded': ('date_loaded',False,None,date_parser),
                          'Date Publicly Available': ('date_publicly_available',False,None,date_parser),
                          'Most Recent Update': ('date_updated',False,None,util.date_converter),
                          'Is Restricted':('is_restricted',False,False) }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)
    
    rows = 0    
    libraries = {}
    for row in sheet:
        logger.debug(str(('row raw: ',row)))
        r = util.make_row(row)
        logger.debug(str(('row: ',r)))
        initializer = {}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]
            
            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if(converter != None):
                value = converter(value)
            if(value == None ):
                if( default != None ):
                    value = default
            if(value == None and  required == True):
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            library = Library(**initializer)
            library.save()
            logger.info(str(('library created', library)))
            libraries[library.short_name] = library
            rows += 1
        except Exception, e:
            logger.error(str(('library initializer problem: ', initializer)))
            raise e
Example #13
0
def _create_datapoint(datacolumn, dataset, datarecord, value):
    '''
    Build (but do not save) a DataPoint for the given column, converting
    the raw value to the column's storage type; for reagent-typed text
    columns, also delegate to the matching _read_* helper so the reagent
    is resolved and linked.
    '''
    common = dict(datacolumn=datacolumn, dataset=dataset,
                  datarecord=datarecord)
    dtype = datacolumn.data_type

    if dtype == 'Numeric':
        if datacolumn.precision != 0:
            return DataPoint(float_value=util.convertdata(value, float),
                             **common)
        return DataPoint(int_value=util.convertdata(value, int), **common)
    if dtype == 'omero_image':
        return DataPoint(int_value=util.convertdata(value, int), **common)

    logger.debug(
        'create datapoint for %r, datarecord: %s' % (value, datarecord))
    datapoint = DataPoint(text_value=util.convertdata(value), **common)
    # reagent-typed columns get additional resolution via a helper
    readers = {
        'small_molecule': _read_small_molecule,
        'protein': _read_protein,
        'antibody': _read_antibody,
        'other_reagent': _read_other_reagent,
        'cell': _read_cell_batch,
        'primary_cell': _read_primary_cell_batch,
    }
    reader = readers.get(dtype)
    if reader is not None:
        reader(dataset, datapoint)

    return datapoint
Example #14
0
def _create_datapoint(datacolumn, dataset, datarecord, value):
    '''
    Build (but do not save) a DataPoint for the given column, converting
    the raw value to the column's storage type; for reagent-typed text
    columns, also delegate to the matching _read_* helper.
    '''
    dtype = datacolumn.data_type

    if dtype == 'Numeric' and datacolumn.precision != 0:
        return DataPoint(datacolumn=datacolumn, dataset=dataset,
                         datarecord=datarecord,
                         float_value=util.convertdata(value, float))
    if dtype == 'Numeric' or dtype == 'omero_image':
        # integer storage: precision-0 numerics and omero image ids
        return DataPoint(datacolumn=datacolumn, dataset=dataset,
                         datarecord=datarecord,
                         int_value=util.convertdata(value, int))

    logger.debug('create datapoint for %r, datarecord: %s' %
                 (value, datarecord))
    datapoint = DataPoint(datacolumn=datacolumn, dataset=dataset,
                          datarecord=datarecord,
                          text_value=util.convertdata(value))
    # reagent-typed columns get additional resolution via a helper
    if dtype == 'small_molecule':
        _read_small_molecule(dataset, datapoint)
    elif dtype == 'protein':
        _read_protein(dataset, datapoint)
    elif dtype == 'antibody':
        _read_antibody(dataset, datapoint)
    elif dtype == 'other_reagent':
        _read_other_reagent(dataset, datapoint)
    elif dtype == 'cell':
        _read_cell_batch(dataset, datapoint)

    return datapoint
Example #15
0
def readLibraries(path, sheetName):

    sheet = iu.readtable([path, sheetName
                          ])  # Note, skipping the header row by default
    # dict to map spreadsheet fields to the Library fields
    labels = {
        'Name': 'name',
        'ShortName': 'short_name',
        'Date First Plated': 'date_first_plated',
        'Date Data Received': 'date_data_received',
        'Date Loaded': 'date_loaded',
        'Date Publicly Available': 'date_publicly_available'
    }
    date_parser = lambda x: util.convertdata(x, date)
    converters = {
        'date_first_plated': date_parser,
        'date_loaded': date_parser,
        'date_data_recieved': date_parser,
        'date_publicly_available': date_parser
    }
    cols = {}
    # first put the label row in (it contains the worksheet column, and its unique)
    for i, label in enumerate(sheet.labels):
        if label in labels:
            cols[i] = labels[label]
        else:
            print 'Note: column label not found:', label
            raise

    rows = 0
    i = 0

    libraries = {}
    for row in sheet:
        r = util.make_row(row)
        dict = {}
        for i, value in enumerate(r):
            if cols[i] in converters:
                value = converters[cols[i]](value)
            dict[cols[i]] = value
        try:
            print 'create library:', dict
            library = Library(**dict)
            library.save()
            libraries[library.short_name] = library
            rows += 1
        except Exception, e:
            print "Invalid Library, name: ", r[0]
            raise
Example #16
0
def _parse_reagent_batch(text_value):
    '''
    Split text_value on the dash character, convert each element to an
    integer.

    @param text_value identifier of the form "facility_id[-batch_id]"
    @return tuple (facility_id, batch_id, parsed_text); batch_id
        defaults to 0 when no batch component is present
    @raise Exception if more than two dash-separated values are given
    '''
    vals = [util.convertdata(x, int) for x in text_value.split('-')]
    if len(vals) > 2:
        # message typo fixed: 'to many' -> 'too many'
        raise Exception(
            'invalid reagent-batch ID value, too many identifiers: %r' %
            text_value)
    facility_id = vals[0]
    batch_id = 0
    if len(vals) == 2:
        batch_id = vals[1]
    # normalized text form of the parsed identifier
    parsed_text = '-'.join([str(x) for x in vals])
    return (facility_id, batch_id, parsed_text)
Example #17
0
def _parse_reagent_batch(text_value):
    '''
    Split text_value on the dash character, convert each element to an
    integer.

    @param text_value identifier of the form "facility_id[-batch_id]"
    @return tuple (facility_id, batch_id, parsed_text); batch_id
        defaults to 0 when no batch component is present
    @raise Exception if more than two dash-separated values are given
    '''
    vals = [ util.convertdata(x,int) for x in text_value.split('-')]
    if len(vals) > 2:
        # message typo fixed: 'to many' -> 'too many'
        raise Exception(
            'invalid reagent-batch ID value, too many identifiers: %r' 
            % text_value)
    facility_id = vals[0]
    batch_id = 0
    if len(vals) == 2:
        batch_id = vals[1]
    # normalized text form of the parsed identifier
    parsed_text = '-'.join([str(x) for x in vals])
    return (facility_id,batch_id,parsed_text)
Example #18
0
def readLibraries(path, sheetName):
    
    sheet = iu.readtable([path, sheetName]) # Note, skipping the header row by default
    # dict to map spreadsheet fields to the Library fields
    labels = { 'Name': 'name',
               'ShortName': 'short_name',
               'Date First Plated': 'date_first_plated',
               'Date Data Received':'date_data_received',
               'Date Loaded': 'date_loaded',
               'Date Publicly Available': 'date_publicly_available' }
    date_parser = lambda x : util.convertdata(x,date)
    converters = {'date_first_plated': date_parser,
                  'date_loaded': date_parser,
                  'date_data_recieved': date_parser,
                  'date_publicly_available': date_parser }
    cols = {}
    # first put the label row in (it contains the worksheet column, and its unique)
    for i,label in enumerate(sheet.labels):
        if label in labels:
            cols[i] = labels[label]
        else:
            print 'Note: column label not found:', label
            raise
            
    rows = 0    
    i = 0
    
    libraries = {}
    for row in sheet:
        r = util.make_row(row)
        dict = {}
        for i,value in enumerate(r):
            if cols[i] in converters:
                value = converters[cols[i]](value)
            dict[cols[i]]= value
        try:
            print 'create library:', dict
            library = Library(**dict)
            library.save()
            libraries[library.short_name] = library
            rows += 1
        except Exception, e:
            print "Invalid Library, name: ", r[0]
            raise
Example #19
0
def readDataColumns(path):
    """
    Read the 'Data Columns' worksheet and return a list of dicts, one per
    data column definition, keyed by DataColumn model field names.

    @param path: workbook file path
    @return: list of dicts; each dict holds the converted attribute values
        for one DataColumn
    """
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # model field name -> python type converter, derived from the DataColumn model
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)
    
    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    # worksheet row label -> DataColumn model field name
    labels = {'Worksheet Column':'worksheet_column',
              'Display Order':'display_order',
              'Name':'name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Time point':'time_point', 
              'Assay readout type':'readout_type',
              'Comments':'comments'}

    # create an array of dict's, each dict defines a DataColumn    
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and its unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']:v})
    # now, for each row, create the appropriate dictionary entry in the dataColumnDefinitions
    # (the sheet is transposed: each row holds one attribute across all columns)
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i,cellText in enumerate(rowAsUnicode[1:]):
            for key,fieldName in labels.items():
                if re.match(key,keyRead,re.M|re.I): # if the row is one of the DataColumn fields, then add it to the dict
                    dataColumnDefinitions[i][fieldName] = util.convertdata(cellText,_typelookup.get(fieldName, None)) # Note: convert the data to the model field type
                else:
                    logger.debug(str(( '"Data Column definition not used: ', cellText)) ) 
                    pass
    logger.debug(str(("definitions: ", dataColumnDefinitions)) )
    
    return dataColumnDefinitions
Example #20
0
def readDataColumns(path):
        # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)
    
    labels = {'Worksheet Column':'worksheet_column',
              'Name':'name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Time point':'time_point', 
              'Assay readout type':'readout_type',
              'Comments':'comments'}

    # create an array of dict's, each dict defines a DataColumn    
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and its unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']:v})
    # now, for each row, create the appropriate dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i,cellText in enumerate(rowAsUnicode[1:]):
            for key,fieldName in labels.items():
                if re.match(key,keyRead,re.M|re.I): # if the row is one of the DataColumn fields, then add it to the dict
                    dataColumnDefinitions[i][fieldName] = util.convertdata(cellText,_typelookup.get(fieldName, None)) # Note: convert the data to the model field type
                else:
                    pass
                    # print '"Data Column definition not used: ', cellText 
    print "definitions: ", dataColumnDefinitions
    
    return dataColumnDefinitions
def main(path):
    """
    Read the SDF file at *path* and create a SmallMolecule record for each
    molecule entry found.

    Each entry's fields are mapped to model fields via the *labels*
    definitions below; required fields raise if missing, and converters
    coerce the raw text values to the model field types.

    @param path: filesystem path of the SDF file to load
    """
    # map field labels to model fields
    properties = ('model_field','required','default','converter')
    # the 'chemical_name' field packs primary + alternate names, ';'-separated
    get_primary_name = lambda x: x.split(';')[0].strip()
    get_alternate_names = lambda x: ';'.join([x.strip() for x in x.split(';')[1:]])
    
    labels = { s2p.MOLDATAKEY:('molfile',True),
              # NOTE: even though these db field are not integers, 
              # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values
               'facility_reagent_id': ('facility_id',True,None, lambda x: util.convertdata(x[x.index('HMSL')+4:],int)), 
               'salt_id': ('salt_id',True,None, lambda x: util.convertdata(x,int)),
               'lincs_id':('lincs_id',False), #None,lambda x:util.convertdata(x,int)),
               'chemical_name':('name',True),
               'alternative_names':'alternative_names',
               'pubchem_cid':'pubchem_cid',
               'chembl_id':'chembl_id',
               'chebi_id':'chebi_id',
               'inchi':'_inchi',
               'inchi_key':'_inchi_key',
               'smiles': ('_smiles',True),
               'molecular_mass':('_molecular_mass',False,None, lambda x: round(util.convertdata(x, float),2)),
               'molecular_formula':'_molecular_formula',
               'software':'software',
               # 'concentration':'concentration',
               #'well_type':('well_type',False,'experimental'),
               'is_restricted':('is_restricted',False,False,util.bool_converter)}
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    labels = util.fill_in_column_definitions(properties,labels)
    
    assert typecheck.isstring(path)
    with open(path) as fh:
        data = fh.read().decode(DEFAULT_ENCODING)

    records = s2p.parse_sdf(data)
    logger.info(str(('read rows: ', len(records))))
    
    count = 0
    for record in records:
        logger.debug(str(('record', record)))
        initializer = {}
        for key,properties in labels.items():
            logger.debug(str(('look for key: ', key, ', properties: ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']
            
            value = record.get(key)

            # Todo, refactor to a method
            try:
                logger.debug(str(('raw value', value)))
                # order: convert, apply default, null out 'n/a', then
                # enforce required-ness
                if(converter != None):
                    value = converter(value)
                if(value == None ):
                    if( default != None ):
                        value = default
                # 'n/a' is treated as an explicit null
                if(value == 'n/a'): value = None
                if(value == None and  required == True):
                    raise Exception(str(('Field is required: ', key, initializer, 'record:', count)))
                logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
                initializer[model_field] = value
            except Exception, e:
                # log the source location of the failure before re-raising
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]      
                logger.error(str((exc_type, fname, exc_tb.tb_lineno)))
                logger.error(str(('invalid input', e, 'count', count)))
                raise e
        # follows is a kludge, to split up the entered "chemical_name" field, on ';' - TODO: just have two fields that get entered
        if(initializer['name']):
            initializer['alternative_names']=get_alternate_names(initializer['name'])
            initializer['name']=get_primary_name(initializer['name'])
                
        if(logger.isEnabledFor(logging.DEBUG)): logger.debug(str(('initializer: ', initializer)))
        try:
            sm = SmallMolecule(**initializer)
            sm.save()
            logger.info(str(('sm created:', sm)))
            count += 1
        except Exception, e:
            logger.error(str(('save failed for: ', initializer, 'error',e, 'count: ', count)))
            raise e
Example #22
0
     raise Exception('at least one of: ' + str(mappingColumnDict.keys()) + ' must be defined and used in the Data sheet.')
 
 # Read in the Data sheet, create DataPoint values for mapped column in each row
 logger.info(str(('data sheet columns identified, read rows, save_interval:', save_interval)))
 loopStart = time.time()
 pointsSaved = 0
 rowsRead = 0
 for row in dataSheet:
     current_row = rowsRead+2
     r = util.make_row(row)
     dataRecord = DataRecord(dataset=dataset )
     map_column = mappingColumnDict['Small Molecule Batch']
     mapped = False
     if(map_column > -1):
         try:
             value = util.convertdata(r[map_column].strip())
             if(value != None and value != '' ):
                 value = value.split("-")
                 if len(value) < 2: raise Exception('Small Molecule (Batch) format is #####-###(-#) **Note that (batch) is optional')
                 x = value[0]
                 facility = util.convertdata(x,int) 
                 salt = value[1]
                 try:
                     dataRecord.smallmolecule = SmallMolecule.objects.get(facility_id=facility, salt_id=salt)
                 except Exception, e:
                     logger.error(str(('could not locate small molecule:', facility)))
                     raise
                 if(len(value)>2):
                     dataRecord.batch_id = util.convertdata(value[2],int)
                     # TODO: validate that the batch exists?  (would need to do for all types, not just Small Molecule
                 mapped = True
Example #23
0
def main(path):
    """
    Read the 'HMS-LINCS cell line metadata' worksheet and create a Cell
    model instance per row.

    @param path: path of the workbook containing the cell line sheet
    """
    sheet_name = 'HMS-LINCS cell line metadata'
    sheet = iu.readtable([path, sheet_name, 1]) # Note, skipping the header row by default

    # each column definition: (model_field, required, default, converter)
    properties = ('model_field','required','default','converter')
    column_definitions = {
              'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]),
              'CL_Name':('name',True),
              'CL_ID':'cl_id',
              'CL_Alternate_Name':'alternate_name',
              'CL_Alternate_ID':'alternate_id',
              'CL_Center_Name':'center_name',
              'CL_Center_Specific_ID':'center_specific_id',
              'MGH_ID':('mgh_id',False,None,lambda x:util.convertdata(x,int)),
              'Assay':'assay',
              'CL_Provider_Name':'provider_name',
              'CL_Provider_Catalog_ID':'provider_catalog_id',
              'CL_Batch_ID':'batch_id',
              'CL_Organism':'organism',
              'CL_Organ':'organ',
              'CL_Tissue':'tissue',
              'CL_Cell_Type':'cell_type',
              'CL_Cell_Type_Detail':'cell_type_detail',
              'CL_Disease':'disease',
              'CL_Disease_Detail':'disease_detail',
              'CL_Growth_Properties':'growth_properties',
              'CL_Genetic_Modification':'genetic_modification',
              'CL_Related_Projects':'related_projects',
              'CL_Recommended_Culture_Conditions':'recommended_culture_conditions',
              'CL_Verification_Profile':'verification_profile',
              'CL_Verification_Reference_Profile':'verification_reference_profile',
              'CL_Mutations_Reference':'mutations_reference',
              'CL_Mutations_Explicit':'mutations_explicit',
              'CL_Organism_Gender':'organism_gender',
              'Date Data Received':('date_data_received',False,None,util.date_converter),
              'Date Loaded': ('date_loaded',False,None,util.date_converter),
              'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter),
              'Most Recent Update': ('date_updated',False,None,util.date_converter),
              'Is Restricted':('is_restricted',False,False,util.bool_converter)}
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)
    
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)
            
    rows = 0    
    logger.debug(str(('cols: ' , cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i,value in enumerate(r):
            # skip sheet columns with no matching definition
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            # order: convert, apply default, then enforce required-ness
            if(converter != None):
                value = converter(value)
            if(value == None ):
                if( default != None ):
                    value = default
            if(value == None and  required == True):
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            initializer[model_field] = value

        try:
            logger.debug(str(('initializer: ', initializer)))
            cell = Cell(**initializer)
            cell.save()
            logger.info(str(('cell created:', cell)))
            rows += 1
        except Exception, e:
            print "Invalid Cell, name: ", r[0]
            raise e
Example #24
0
def main(path):
    """
    Read in the Library and LibraryMapping sheets
    """
    libraries = readLibraries(path,'Library')
    
    sheet = iu.readtable([path, 'LibraryMapping'])
    properties = ('model_field','required','default','converter')
    column_definitions = {'Facility':('facility_id',False,None, lambda x: util.convertdata(x,int)),
                          'Salt':('salt_id',False,None, lambda x: util.convertdata(x,int)),
                          'Batch':('facility_batch_id',False,None, lambda x: util.convertdata(x,int)),
                          'Is Control':('is_control',False,False,util.bool_converter),
                          'Plate':('plate',False,None, lambda x: util.convertdata(x,int)),
                          'Well':'well',
                          'Library Name':'short_name',
                          'Concentration': 'concentration',
                          'Concentration Unit':'concentration_unit'
                          }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)
    
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)
    
    small_molecule_batch_lookup = ('smallmolecule', 'facility_batch_id')
    library_mapping_lookup = ('smallmolecule_batch','library','is_control','plate','well','concentration','concentration_unit')
    rows = 0    
    logger.debug(str(('cols: ' , cols)))
    for row in sheet:
        current_row = rows + 2
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id':None, 'salt_id':None}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if(converter != None):
                value = converter(value)
            if(value == None ):
                if( default != None ):
                    value = default
            if(value == None and  required == True):
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],'row',current_row))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            
            initializer[model_field] = value
            
            if(model_field in small_molecule_lookup):
                small_molecule_lookup[model_field]=value
                if( None not in small_molecule_lookup.values()):
                    try:
                        sm = SmallMolecule.objects.get(**small_molecule_lookup)
                        initializer['smallmolecule'] = sm
                    except Exception, e:
                        raise Exception(str(('sm facility id not found', small_molecule_lookup,e,'row',current_row)))
            elif(model_field == 'short_name'):
                try:
                    library = libraries[value]
                    initializer['library'] = library
                except Exception, e:
                    raise Exception(str(('library short_name not found', value,e,'row',current_row)))
def main(path):
    """
    Read the small molecule batch worksheet and create a SmallMoleculeBatch
    model instance per row, linked to its SmallMolecule reagent.

    @param path: workbook file path containing 'sheet 1'
    """
    sheet_name = 'sheet 1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row]) # Note, skipping the header row by default

    # each column definition: (model_field, required, default, converter)
    properties = ('model_field','required','default','converter')
    column_definitions = { 
              # NOTE: even though these db field are not integers, 
              # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values
              'facility_id': ('facility_id',True,None, lambda x: util.convertdata(x,int)),
              'salt_id': ('salt_id',True,None, lambda x: util.convertdata(x,int)),
              'facility_batch_id':('batch_id',True,None, lambda x: util.convertdata(x,int)),
              'provider': ('provider_name',True),
              'provider_catalog_id':'provider_catalog_id',
              'provider_sample_id':'provider_batch_id',
              'chemical_synthesis_reference':'chemical_synthesis_reference',
              'purity':'purity',
              'purity_method':'purity_method',
              'aqueous_solubility':'aqueous_solubility',
              # FIXME: should warn the user if no unit is provided when 
              # aqueous_solubility is provided
              'aqueous_solubility_unit':'aqueous_solubility_unit',    
              'Date Data Received':('date_data_received',False,None,util.date_converter),
              'Date Loaded': ('date_loaded',False,None,util.date_converter),
              'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter),
              'Most Recent Update': ('date_updated',False,None,util.date_converter),
              }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)
    
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels,
        all_sheet_columns_required=False)
    
    rows = 0    
    logger.debug(str(('cols: ' , cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id':None, 'salt_id':None}
        for i,value in enumerate(r):
            # skip sheet columns with no matching definition
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            # order: convert, apply default, then enforce required-ness
            if(converter != None):
                value = converter(value)
            if(value == None ):
                if( default != None ):
                    value = default
            if(value == None and  required == True):
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            
            # facility_id + salt_id together identify the SmallMolecule;
            # resolve it once both have been seen and store it as 'reagent'
            if(model_field in small_molecule_lookup):
                small_molecule_lookup[model_field]=value
                if( None not in small_molecule_lookup.values()):
                    try:
                        sm = SmallMolecule.objects.get(**small_molecule_lookup)
                        initializer['reagent'] = sm
                    except Exception, e:
                        logger.error(str(('sm identifiers not found', small_molecule_lookup,'row',rows+start_row+2)))
                        raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            smb = SmallMoleculeBatch(**initializer)
            smb.save()
            logger.debug(str(('smb created:', smb)))
            rows += 1
        except Exception, e:
            logger.error(str(( "Invalid smallmolecule batch initializer: ", initializer, 'row', rows+start_row+2, e)))
            raise
Example #26
0
def read_metadata(meta_sheet):
    """
    Read the dataset metadata key/value rows from *meta_sheet* and return an
    initializer dict mapping model field names to converted values.

    Rows are (label, value) pairs; labels are matched against
    field_definitions, values are converted/defaulted, and required fields
    raise if missing.

    @param meta_sheet: an xlrd-style sheet (supports nrows / row_values)
    @return: dict of model_field -> value
    """
    properties = ('model_field', 'required', 'default', 'converter')
    field_definitions = {
        'Lead Screener First': 'lead_screener_firstname',
        'Lead Screener Last': 'lead_screener_lastname',
        'Lead Screener Email': 'lead_screener_email',
        'Lab Head First': 'lab_head_firstname',
        'Lab Head Last': 'lab_head_lastname',
        'Lab Head Email': 'lab_head_email',
        'Title': 'title',
        'Facility ID': (
            'facility_id', True, None, lambda x: util.convertdata(x, int)),
        'Summary': 'summary',
        'Protocol': 'protocol',
        'References': 'protocol_references',
        'Date Data Received':(
            'date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': (
            'date_updated', False, None, util.date_converter),
        'Is Restricted':('is_restricted', False, False, util.bool_converter),
        'Dataset Type':('dataset_type', False),
        'Bioassay':('bioassay', False),
        'Dataset Keywords':('dataset_keywords', False),
        'Usage Message':('usage_message', False),
        'Dataset Data URL':('dataset_data_url', False),
        'Associated Publication': ('associated_publication', False),
        'Associated Project Summary': ('associated_project_summary', False),
    }
    
    # the metadata labels are in the first column of each row (skip header)
    sheet_labels = []
    for i in xrange(meta_sheet.nrows-1):
        row = meta_sheet.row_values(i+1)
        sheet_labels.append(row[0])

    field_definitions = util.fill_in_column_definitions(
        properties, field_definitions)

    cols = util.find_columns(field_definitions, sheet_labels,
        all_column_definitions_required=False)
    
    initializer = {}
    for i in xrange(meta_sheet.nrows-1):
        row = meta_sheet.row_values(i+1)
        
        properties = cols[i]
        value = row[1]
        logger.debug('Metadata raw value %r' % value)

        required = properties['required']
        default = properties['default']
        converter = properties['converter']
        model_field = properties['model_field']

        # order: convert, apply default, then enforce required-ness
        if converter:
            value = converter(value)
        if not value and default != None:
            value = default
        if not value and required:
            # BUG FIX: 'row' (a list) was passed to the %d specifier, which
            # itself raises TypeError; report the record index instead
            raise Exception(
                'Field is required: %s, record: %d' 
                    % (properties['column_label'], i))
        logger.debug('model_field: %s, value: %r' % ( model_field, value ) )
        initializer[model_field] = value

    return initializer 
Example #27
0
def read_metadata(path):
    """
    Read in the DataSets, Datacolumns, and Data sheets.  In the Data sheet, rows
    are DataRecords, and columns are DataPoints.

    Reads the 'Meta' sheet of key/value rows and returns an initializer
    dict mapping model field names to converted values.

    @param path: workbook file path
    @return: dict of model_field -> value
    """
    # Read in the DataSet
    sheetname = 'Meta'
    # Note, skipping the header row by default
    metaSheet = iu.readtable([path, sheetname]) 

    # Define the Column Names -> model fields mapping
    properties = ('model_field','required','default','converter')
    field_definitions = {'Lead Screener First': 'lead_screener_firstname',
              'Lead Screener Last': 'lead_screener_lastname',
              'Lead Screener Email': 'lead_screener_email',
              'Lab Head First': 'lab_head_firstname',
              'Lab Head Last': 'lab_head_lastname',
              'Lab Head Email': 'lab_head_email',
              'Title': 'title',
              'Facility ID': ('facility_id',True,None, 
                              lambda x: util.convertdata(x,int)),
              'Summary': 'summary',
              'Protocol': 'protocol',
              'References': 'protocol_references',
              'Date Data Received':('date_data_received',False,None,
                                    util.date_converter),
              'Date Loaded': ('date_loaded',False,None,util.date_converter),
              'Date Publicly Available': ('date_publicly_available',False,None,
                                          util.date_converter),
              'Most Recent Update': ('date_updated',False,None,
                                      util.date_converter),
              'Is Restricted':('is_restricted',False,False,util.bool_converter),
              'Dataset Type':('dataset_type',False),
              'Bioassay':('bioassay',False),
              'Dataset Keywords':('dataset_keywords',False),
              'Usage Message':('usage_message',False),
              }
    
    # the metadata labels are in the first column of each row
    sheet_labels = []
    for row in metaSheet:
        rowAsUnicode = util.make_row(row)
        sheet_labels.append(rowAsUnicode[0])

    # convert the definitions to fleshed out dict's, with strategies for 
    # optional, default and converter
    field_definitions = \
        util.fill_in_column_definitions(properties,field_definitions)
    # create a dict mapping the column/row ordinal to the proper definition dict
    cols = util.find_columns(field_definitions, sheet_labels,
                             all_column_definitions_required=False)

    
    initializer = {}
    for i,row in enumerate(metaSheet):
        rowAsUnicode = util.make_row(row)
        properties = cols[i]
        value = rowAsUnicode[1]
        
        logger.debug(str(('read col: ', i, ', ', properties)))
        required = properties['required']
        default = properties['default']
        converter = properties['converter']
        model_field = properties['model_field']

        # Todo, refactor to a method
        logger.debug(str(('raw value', value)))
        # order: convert, apply default, then enforce required-ness
        if(converter != None):
            value = converter(value)
        if(value == None ):
            if( default != None ):
                value = default
        if(value == None and  required == True):
            # BUG FIX: the %d specifier was given 'row' (the raw row object),
            # which itself raises TypeError; report the row ordinal instead
            raise Exception('Field is required: %s, record: %d' % 
                            (properties['column_label'],i))
        logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
        initializer[model_field] = value

    return initializer 
Example #28
0
def main(path):
    """
    Read in the Library and LibraryMapping sheets
    """
    libraries = readLibraries(path, 'Library')

    sheet = iu.readtable([path, 'LibraryMapping'])
    properties = ('model_field', 'required', 'default', 'converter')
    date_parser = lambda x: util.convertdata(x, date)
    column_definitions = {
        'Facility':
        ('facility_id', False, None, lambda x: util.convertdata(x, int)),
        'Salt': ('salt_id', False, None, lambda x: util.convertdata(x, int)),
        'Batch': ('batch_id', False, None, lambda x: util.convertdata(x, int)),
        'Is Control': ('is_control', False, False, util.bool_converter),
        'Plate': ('plate', False, None, lambda x: util.convertdata(x, int)),
        'Well':
        'well',
        'Library Name':
        'short_name',
        'Concentration':
        'concentration',
        'Concentration Unit':
        'concentration_unit'
    }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    small_molecule_batch_lookup = ('reagent', 'batch_id')
    library_mapping_lookup = ('smallmolecule_batch', 'library', 'is_control',
                              'plate', 'well', 'concentration',
                              'concentration_unit')
    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        current_row = rows + 2
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for i, value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if (converter != None):
                value = converter(value)
            if (value == None):
                if (default != None):
                    value = default
            if (value == None and required == True):
                raise Exception(
                    'Field is required: %s, record: %d' %
                    (properties['column_label'], 'row', current_row))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))

            initializer[model_field] = value

            if (model_field in small_molecule_lookup):
                small_molecule_lookup[model_field] = value
                if (None not in small_molecule_lookup.values()):
                    try:
                        sm = SmallMolecule.objects.get(**small_molecule_lookup)
                        initializer['reagent'] = sm
                    except Exception, e:
                        raise Exception(
                            str(('sm facility id not found',
                                 small_molecule_lookup, e, 'row',
                                 current_row)))
            elif (model_field == 'short_name'):
                try:
                    library = libraries[value]
                    initializer['library'] = library
                except Exception, e:
                    raise Exception(
                        str(('library short_name not found', value, e, 'row',
                             current_row)))
Example #29
0
         raise    
 map_column = mappingColumnDict['Cell']
 if(map_column > -1):
     try:
         value = util.convertdata(r[map_column].strip())
         if(value != None and value != '' ):
             facility_id = value
             dataRecord.cell = Cell.objects.get(facility_id=facility_id) # TODO: purge "HMSL" from the db
             mapped = True
     except Exception, e:
         print "Invalid Cell facility id: ", facility_id
         raise    
 map_column = mappingColumnDict['Protein']
 if(map_column > -1):
     try:
         value = util.convertdata(r[map_column].strip())
         if(value != None and value != '' ):
             facility_id = r[map_column]
             dataRecord.protein = Protein.objects.get(lincs_id=facility_id[facility_id.index('HMSL')+4:]) #TODO: purge "HMSL"
             mapped = True
     except Exception, e:
         print "Invalid Protein facility id: ", value
         raise
     
 if(not mapped):
     raise Exception('at least one of: ' + str(mappingColumnDict.keys()) + ' must be defined, missing for row: ' + str(rowsRead+2))
         
 if metaColumnDict['Plate'] > -1 : dataRecord.plate = util.convertdata(r[metaColumnDict['Plate']],int)
 if metaColumnDict['Well'] > -1 : dataRecord.well = util.convertdata(r[metaColumnDict['Well']])
 if metaColumnDict['Control Type'] > -1: dataRecord.control_type = util.convertdata(r[metaColumnDict['Control Type']])
 dataRecord.save()
Example #30
0
     _read_plate_well(map_column,r,current_row, dataRecord)
 map_column = mappingColumnDict['Cell']
 if(map_column > -1):
     _read_cell(map_column,r,current_row,dataRecord)
 map_column = mappingColumnDict['Antibody']
 if(map_column > -1):
     _read_antibody(map_column,r,current_row,dataRecord)
 map_column = mappingColumnDict['OtherReagent']
 if(map_column > -1):
     _read_other_reagent(map_column,r,current_row,dataRecord)
 map_column = mappingColumnDict['Protein']
 if(map_column > -1):
     _read_protein(map_column,r,current_row,dataRecord)
                     
 if metaColumnDict['Control Type'] > -1: 
     dataRecord.control_type = util.convertdata(
         r[metaColumnDict['Control Type']])
     if(dataRecord.control_type is not None and 
             dataRecord.smallmolecule is not None):
         raise Exception(str((
             'Cannot define a control type for a non-control well '
             '(well mapped to a small molecule batch)',
             dataRecord.smallmolecule,dataRecord.control_type, 
             'row',current_row)))
 if metaColumnDict['batch_id'] > -1: 
     temp = util.convertdata(r[metaColumnDict['batch_id']], int)
     if(temp != None):
         if(dataRecord.batch_id is not None and 
                 temp is not None and dataRecord.batch_id != temp):
             raise Exception(str((
                 'batch id field(1) does not match batch id set with '
                 'entity(2):',temp,dataRecord.batch_id)))
Example #31
0
        recognized_label = next(
            (field_name for label, field_name in labels.items() 
                if label_read and label.lower() == label_read.lower() ), None)
        
        if recognized_label:
            
            logger.debug(
                'label: %r, recognized_label: %r' % (label_read, recognized_label))
            
            for j,val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]

                logger.debug('data column %s:%d:%d:%r' 
                    % ( recognized_label, i, j, val))
                
                final_val = util.convertdata(
                    val,type_lookup.get(recognized_label, None)) 
                
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
Example #32
0
def main(path):
    """
    Read in the cell batch info
    """
    sheet_name = 'Sheet1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row
                          ])  # Note, skipping the header row by default

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'Facility ID':
        ('facility_id', True, None, lambda x: x[x.index('HMSL') + 4:]),
        'CL_Batch_ID':
        ('batch_id', True, None, lambda x: util.convertdata(x, int)),
        'CL_Provider_Name':
        'provider_name',
        'CL_Provider_Batch_ID':
        'provider_batch_id',
        'CL_Provider_Catalog_ID':
        'provider_catalog_id',
        'CL_Quality_Verification':
        'quality_verification',
        'CL_Transient_Modification':
        'transient_modification',
        'Date Data Received':
        ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available':
        ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None,
                               util.date_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            logger.debug(str(('raw value', value)))
            if (converter != None):
                value = converter(value)
            if (value == None):
                if (default != None):
                    value = default
            if (value == None and required == True):
                raise Exception('Field is required: %s, record: %d' %
                                (properties['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))

            if model_field == 'facility_id':
                try:
                    cell = Cell.objects.get(facility_id=value)
                    initializer['reagent'] = cell
                except:
                    logger.error(
                        str(("Cell not found", value, 'row',
                             rows + start_row + 2)))
                    raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            cell = CellBatch(**initializer)
            cell.save()
            logger.debug(str(('cell created:', cell)))
            rows += 1
        except Exception, e:
            logger.error(
                str(("Invalid CellBatch initializer: ", initializer, 'row',
                     rows + start_row + 2, e)))
            raise
Example #33
0
def read_datacolumns(book):
    '''
    Read the "Data Columns" sheet of the workbook and match each definition
    to a column on the "Data" sheet (by explicit worksheet column letter, or
    by name/display name, falling back to the default reagent definitions).

    @param book: an open xlrd workbook containing a "Data Columns" sheet and
        a "Data" sheet whose first row holds the column labels
    @return an array of data column definition dicts 
    @raise Exception: if a required definition field ('name', 'data_type')
        is missing for any defined column
    '''

    data_column_sheet = book.sheet_by_name('Data Columns')

    # recognized "Data Columns" row labels -> DataColumn field names; note
    # two label spellings map to the same 'worksheet_column' field
    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }

    dc_definitions = []
    # per-field types of the DataColumn model drive input conversion on read
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    # the sheet is transposed: each ROW is one field (labeled in col 0),
    # each COLUMN 1..n is one data column definition
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)

        if i == 0:
            # first row establishes how many definitions there are
            for val in row_values[1:]:
                dc_definitions.append({})

        label_read = row_values[0]

        # case-insensitive match of the row label to a known field name
        recognized_label = next(
            (field_name for label, field_name in labels.items()
             if label_read and label.lower() == label_read.lower()), None)

        if recognized_label:

            logger.debug('label: %r, recognized_label: %r' %
                         (label_read, recognized_label))

            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]

                logger.debug('data column %s:%d:%d:%r' %
                             (recognized_label, i, j, val))

                # convert the raw cell value using the model field's type
                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))

                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (dc_dict['display_order'] +
                                                    10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    # empty cell: only an error if the field is required
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                            % (recognized_label, colname(j + 1)))
        else:
            logger.debug('unrecognized label in "Data Columns" sheet %r' %
                         label_read)

    # every definition must have all of the required fields populated
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception('required "Data Column" label not defined %r' %
                                label)

    logger.info('find the data columns on the "Data" sheet...')

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []  # column ordinals on the "Data" sheet matched so far
    for i, data_label in enumerate(data_sheet_labels):

        # a blank label terminates the label row
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break

        data_label = data_label.upper()
        col_letter = colname(i)

        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:

                # match by the explicitly declared worksheet column letter
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict

            elif 'name' in dc_dict or 'display_name' in dc_dict:

                # otherwise match by name / display name, and record the
                # worksheet column on the definition
                if (dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict

            if _dict and 'display_order' not in _dict:

                _dict['display_order'] = i + 10
                logger.warn('auto assigning "display_order" for col %r as %d' %
                            (_dict['name'], i + 10))

        if i not in data_labels_found:

            # fall back to the module-level default reagent definitions
            logger.debug(('Data sheet label not found %r,'
                          ' looking in default reagent definitions %s') %
                         (data_label, default_reagent_columns.keys()))

            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    # warn about any non-blank, non-meta labels that matched nothing
    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip() and i not in data_labels_found
        and data_label not in meta_columns
    ]
    if data_labels_not_found:
        logger.warn('data sheet labels not recognized %s' %
                    data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    logger.info('data column definitions found: %s' %
                [x['display_name'] for x in dc_definitions_found])

    return dc_definitions_found
Example #34
0
    logger.debug('read the data sheet, save_interval: %d' % save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    col_to_dc_items = col_to_dc_map.items()

    for i in xrange(data_sheet.nrows - 1):
        current_row = i + 2
        row = data_sheet.row_values(i + 1)

        r = util.make_row(row)
        datarecord = DataRecord(dataset=dataset)

        if meta_columns['Control Type'] > -1:
            datarecord.control_type = util.convertdata(
                r[meta_columns['Control Type']])

        datapoint_batch = []
        small_molecule_datapoint = None
        for i, dc in col_to_dc_items:
            value = r[i]
            logger.debug('reading column %r, %s, val: %r' %
                         (colname(i), dc, value))
            value = value.strip()
            value = util.convertdata(value)
            if not value:
                continue
            datapoint = _create_datapoint(dc, dataset, datarecord, value)
            datapoint_batch.append(datapoint)
            pointsSaved += 1
            if not small_molecule_datapoint and dc.data_type == 'small_molecule':
Example #35
0
def main(path):
    """
    Read in the Antibody Batches
    """
    sheet_name = 'Sheet1'
    sheet = iu.readtable([path, sheet_name, 0])

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'AR_Center_Specific_ID': ('antibody_facility_id', True, None,
                                  lambda x: x[x.index('HMSL') + 4:]),
        'AR_Batch_ID':
        ('batch_id', True, None, lambda x: util.convertdata(x, int)),
        'AR_Provider_Name':
        'provider_name',
        'AR_Provider_Catalog_ ID':
        'provider_catalog_id',
        'AR_Provider_Batch_ID':
        'provider_batch_id',
        'AR_Antibody_Purity':
        'antibody_purity',
        'Date Data Received':
        ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available':
        ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update':
        ('date_updated', False, None, util.date_converter),
    }

    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        r = util.make_row(row)
        dict = {}
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug('read col: %d: %s' % (i, properties))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            logger.debug('raw value %r' % value)
            if (converter != None):
                value = converter(value)
            if (value == None):
                if (default != None):
                    value = default
            if (value == None and required == True):
                raise Exception('Field is required: %s, record: %d' %
                                (properties['column_label'], rows))

            logger.debug('model_field: %s, converted value %r' %
                         (model_field, value))
            initializer[model_field] = value
        try:
            logger.debug('initializer: %s' % initializer)

            antibody_facility_id = initializer.pop('antibody_facility_id',
                                                   None)
            if antibody_facility_id:
                try:
                    antibody = Antibody.objects.get(
                        facility_id=antibody_facility_id)
                    initializer['reagent'] = antibody
                except ObjectDoesNotExist, e:
                    logger.error(
                        'AR_Center_Specific_ID: "%s" does not exist, row: %d' %
                        (antibody_facility_id, i))
            antibody_batch = AntibodyBatch(**initializer)
            antibody_batch.save()
            logger.info('antibody batch created: %s' % antibody_batch)
            rows += 1
        except Exception, e:
            logger.error("Invalid antibody_batch initializer: %s" %
                         initializer)
            raise
Example #36
0
def readDataColumns(path):
    # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # Lookup all of the field types of the Datacolumn table.  
    # These will be used to validate input type by converting on read
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)
    
    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    labels = {'Worksheet Column':'worksheet_column',
              'Display Order':'display_order',
              'Name':'name',
              'Display Name':'display_name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Unit':'unit', 
              'Assay readout type':'readout_type',
              'Comments':'comments',
              'Protein HMS LINCS ID': 'protein', 
              'Cell HMS LINCS ID': 'cell'}

    # create an array of dict's, each dict defines a DataColumn    
    dataColumnDefinitions = []
    #Note we also allow a list of pro
    # first the label row (it contains the worksheet column, it is unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']:v})
        
    logger.debug(str(('========== datacolumns:',dataColumnDefinitions)))
    # for each row, create the dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i,cellText in enumerate(rowAsUnicode[1:]):
            try:
                for key,fieldName in labels.items():
                    # if one of the DataColumn fields, add it to the dict
                    if re.match(key,keyRead,re.M|re.I): 
                        if re.match('Protein HMS LINCS ID', keyRead, re.M|re.I):
                            facility_id = util.convertdata(cellText, int);
                            if facility_id:
                                dataColumnDefinitions[i][fieldName] = \
                                    Protein.objects.get(lincs_id=facility_id) 
                        elif re.match('Cell HMS LINCS ID', keyRead, re.M|re.I):
                            facility_id = util.convertdata(cellText, int);
                            if facility_id:
                                dataColumnDefinitions[i][fieldName] = \
                                    Cell.objects.get(facility_id=facility_id) 
                        else:
                            # Use the type from the fieldinformation table 
                            # to read in the data for each DC field
                            dataColumnDefinitions[i][fieldName] = \
                                util.convertdata(cellText,
                                                 _typelookup.get(fieldName, None)) 
                    else:
                        logger.debug(str((
                            '"Data Column definition not used: ', cellText)) ) 
                        pass
            except Exception, e:
                logger.error(str(('Exception reading data for cell', i, cellText, e)))
                raise e
        logger.debug(str(("definitions: ", dataColumnDefinitions)) )
def main(path):
    """
    Read in the Data Working Group sheets

    Loads the field-information workbook at *path*: deletes ALL existing
    FieldInformation rows, then creates one FieldInformation per sheet row,
    validating any referenced table/field against the Django models.

    @param path: filesystem path to the xls(x) workbook; only the first
        sheet is read
    @raise Exception: if a required field is missing, or a row references an
        unknown table or field
    """
    logger.info("start")
    book = xlrd.open_workbook(path) #open our xls file, there's lots of extra default options in this call, for logging etc. take a look at the docs
 
    #sheet = book.sheets()[0] #book.sheets() returns a list of sheet objects... alternatively...
    #sheet = book.sheet_by_name("qqqq") #we can pull by name
    worksheet = book.sheet_by_index(0) #or by the index it has in excel's sheet collection
    properties = ('model_field','required','default','converter')
    # each value is either a simple model field name, or a tuple matching
    # the "properties" layout above
    column_definitions = {'table':'table',
                          'field':'field',
                          'alias':'alias',
                          'queryset':'queryset',
                          'show in detail':('show_in_detail',True,False,util.bool_converter),
                          'show in list':('show_in_list',True,False,util.bool_converter),
                          'show_as_extra_field':('show_as_extra_field',False,False,util.bool_converter),
                          'is_lincs_field':('is_lincs_field',True,False,util.bool_converter),
                          'is_unrestricted':('is_unrestricted',False,False,util.bool_converter),
                          'order':('order',True,None,lambda x:util.convertdata(x,int)),
                          'use_for_search_index':('use_for_search_index',True,False,util.bool_converter),
                          'Data Working Group version':'dwg_version',
                          'Unique ID':('unique_id',True),
                          'DWG Field Name':'dwg_field_name',
                          'HMS Field Name':'hms_field_name',
                          'Related to':'related_to',
                          'Description':'description',
                          'Importance (1: essential; 2: desirable / recommended; 3: optional)':'importance',
                          'Comments':'comments',
                          'Ontologies / references considered':'ontology_reference',
                          'Link to ontology / reference':'ontology_reference',
                          'Additional Notes (for development)':'additional_notes',
                          }

    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)

    num_rows = worksheet.nrows - 1
    num_cells = worksheet.ncols - 1

    # read the header row (row 0) to collect the column labels
    curr_row = 0 # note zero indexed
    row = worksheet.row(curr_row)
    labels = []
    i = -1
    while i < num_cells:
        i += 1
        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
        # cell_type = worksheet.cell_type(curr_row, curr_cell)
        labels.append(str(worksheet.cell_value(curr_row, i)))

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, labels, all_sheet_columns_required=False)

    # NOTE: destructive - the whole table is replaced by this load
    logger.info('delete current table');
    FieldInformation.objects.all().delete()

    rows = 0
    while curr_row < num_rows:
        curr_row += 1
        # actual_row: 1-based sheet row number, for error reporting
        actual_row = curr_row + 1
        row = worksheet.row(curr_row)
        if(logger.isEnabledFor(logging.DEBUG)): logger.debug(str(('row', row)))
        i = -1
        initializer = {}
        while i < num_cells:
            i += 1
            # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
            #cell_type = worksheet.cell_type(curr_row, curr_cell)
            value = unicode(worksheet.cell_value(curr_row, i))

            if i not in cols: 
                continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if(converter != None):
                logger.debug(str(('using converter',converter,value)))
                value = converter(value)
                logger.debug(str(('converted',value)))
            if(value == None ):
                if( default != None ):
                    value = default
            if(value == None and  required == True):
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],actual_row))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            initializer[model_field] = value

        try:
            logger.debug(str(('initializer: ', initializer)))
            #if((initializer['table'] == None and initializer['queryset'] == None ) or
            # rows without a field definition are skipped, not errors
            if(initializer['field'] == None):
                logger.warn(str(('Note: table entry has no field definition (will be skipped)', initializer, 'current row:', actual_row)))
                continue;
            lfi = FieldInformation(**initializer)
            # check if the table/field exists
            if(lfi.table != None):
                table = models.get_model(APPNAME, lfi.table)
                if( table != None):
                    if(lfi.field not in map(lambda x: x.name,table._meta.fields) ):
                        raise Exception(str(('unknown field: ', lfi.field)))
                else:
                    raise Exception(str(('unknown table', lfi.table )))
            lfi.save()
            logger.info(str(('fieldInformation created:', lfi)))
            rows += 1
        except Exception, e:
            logger.error(str(( "Invalid fieldInformation, initializer so far: ", initializer, 'current row:', actual_row,e)))
            raise e
Example #38
0
def main(path):
    
    # read in the two columns of the meta sheet to a dict that defines a DataSet
    metadata = read_metadata(path)
    dataset = DataSet(**metadata)
    dataset.save()
    
    # read in the data columns sheet to an array of dict's, each dict defines a DataColumn    
    dataColumnDefinitions = readDataColumns(path)
    
    # now that the array of DataColumn dicts is created, use them to create the DataColumn instances
    dataColumns = {}
    for dc in dataColumnDefinitions:
        dc['dataset'] = dataset
        dataColumn = DataColumn(**dc)
        dataColumn.save()
        dataColumns[dataColumn.name] = dataColumn    

    # read the Data sheet
    sheetname = 'Data'
    dataSheet = iu.readtable([path, sheetname])
    
    # First, map the sheet column indices to the DataColumns that were created
    dataColumnList = {}
    metaColumnDict = {'Well':-1, 'Plate':-1, 'Control Type':-1} # meta columns contain forensic information
    mappingColumnDict = {'Small Molecule':-1, 'Cell':-1, 'Protein':-1} # what is being studied - at least one is required
    # NOTE: this scheme is matching based on the labels between the "Data Column" sheet and the "Data" sheet
    for i,label in enumerate(dataSheet.labels):
        if(label == 'None' or label == 'well_id' or label.strip()=='' or label == 'Exclude' ): continue  
        if label in metaColumnDict: 
            metaColumnDict[label] = i
            continue
        if label in mappingColumnDict: 
            mappingColumnDict[label] = i
            continue
        if label in dataColumns:
            dataColumnList[i] = dataColumns[label] # note here "i" is the index to the dict
            
        else:
            #raise Exception("no datacolumn for the label: " + label)
            columnName = chr(ord('A') + i)
            findError = True
            for column in dataColumns.values():
                if(column.worksheet_column == columnName):
                    dataColumnList[i] = column
                    findError = False
                    break
            if findError:    
                print "Error: no datacolumn for ", label
                sys.exit(-1)
    
    found=False
    for key,value in mappingColumnDict.items():
        if(value != -1): 
            found=True
    if(not found):
        raise Exception('at least one of: ' + str(mappingColumnDict.keys()) + ' must be defined and used in the Data sheet.')
    
    # Read in the Data sheet, create DataPoint values for mapped column in each row
    pointsSaved = 0
    rowsRead = 0
    for row in dataSheet:
        r = util.make_row(row)
        dataRecord = DataRecord(dataset=dataset )
        map_column = mappingColumnDict['Small Molecule']
        mapped = False
        if(map_column > -1):
            try:
                value = util.convertdata(r[map_column].strip())
                if(value != None and value != '' ):
                    facility = value.split("-")[0] # TODO: purge "HMSL" from the db
                    salt = value.split("-")[1]
                    dataRecord.small_molecule = SmallMolecule.objects.get(facility_id=facility, sm_salt=salt)
                    mapped = True
            except Exception, e:
                print "Invalid Small Molecule facility id: ", value
                raise    
        map_column = mappingColumnDict['Cell']
        if(map_column > -1):
            try:
                value = util.convertdata(r[map_column].strip())
                if(value != None and value != '' ):
                    facility_id = value
                    dataRecord.cell = Cell.objects.get(facility_id=facility_id) # TODO: purge "HMSL" from the db
                    mapped = True
            except Exception, e:
                print "Invalid Cell facility id: ", facility_id
                raise    
Example #39
0
def main(path):
    """
    Read in the Data Working Group sheets

    Loads the field-information CSV at *path*: deletes ALL existing
    FieldInformation rows, then creates one FieldInformation per CSV row,
    validating any referenced table/field against the Django models.

    @param path: filesystem path to the field information CSV file; the
        first line must hold the column labels
    @raise Exception: if a required field is missing, or a row references an
        unknown table or field
    """
    logger.info(str(('read field information file', path)))

    properties = ('model_field', 'required', 'default', 'converter')
    # each value is either a simple model field name, or a tuple matching
    # the "properties" layout above
    column_definitions = {
        'table':
        'table',
        'field':
        'field',
        'alias':
        'alias',
        'queryset':
        'queryset',
        'show in detail': ('show_in_detail', True, False, util.bool_converter),
        'show in list': ('show_in_list', True, False, util.bool_converter),
        'show_as_extra_field':
        ('show_as_extra_field', False, False, util.bool_converter),
        'is_lincs_field': ('is_lincs_field', True, False, util.bool_converter),
        'is_unrestricted':
        ('is_unrestricted', False, False, util.bool_converter),
        'list_order':
        ('list_order', True, None, lambda x: util.convertdata(x, int)),
        'detail_order':
        ('detail_order', True, None, lambda x: util.convertdata(x, int)),
        'use_for_search_index': ('use_for_search_index', True, False,
                                 util.bool_converter),
        'Data Working Group version':
        'dwg_version',
        'Unique ID': ('unique_id', True),
        'DWG Field Name':
        'dwg_field_name',
        'HMS Field Name':
        'hms_field_name',
        'Related to':
        'related_to',
        'Description':
        'description',
        'Importance (1: essential; 2: desirable / recommended; 3: optional)':
        'importance',
        'Comments':
        'comments',
        'Ontologies / references considered':
        'ontology_reference',
        'Link to ontology / reference':
        'ontology_reference',
        'Additional Notes (for development)':
        'additional_notes',
    }

    # convert the labels to fleshed out dict's, with strategies for
    # optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    with open(path) as f:
        reader = csv.reader(f)

        # first line holds the labels; map column ordinal -> definition dict
        labels = reader.next()
        cols = util.find_columns(column_definitions,
                                 labels,
                                 all_sheet_columns_required=False)

        # NOTE: destructive - the whole table is replaced by this load
        logger.info('delete current table')
        FieldInformation.objects.all().delete()

        for j, row in enumerate(reader):
            logger.debug('row %d: %s', j, row)
            initializer = {}
            for i, value in enumerate(row):

                # columns with no definition are ignored
                if i not in cols:
                    logger.info(str(('column out of range', j + 1, i)))
                    continue
                properties = cols[i]

                logger.debug(str(('read col: ', i, ', ', properties)))
                required = properties['required']
                default = properties['default']
                converter = properties['converter']
                model_field = properties['model_field']

                # Todo, refactor to a method
                logger.debug(str(('raw value', value)))
                if converter:
                    logger.debug(str(('using converter', converter, value)))
                    value = converter(value)
                    logger.debug(str(('converted', value)))
                # Note: must check the value against None, as False is a valid value
                if value is None:
                    if default != None:
                        value = default
                # Note: must check the value against None, as False is a valid value
                if value is None and required is True:
                    raise Exception('Field is required: %s, record: %d' %
                                    (properties['column_label'], j + 1))
                logger.debug(
                    str(('model_field: ', model_field, ', value: ', value)))
                initializer[model_field] = value

            try:
                logger.debug(str(('initializer: ', initializer)))
                # rows without a field definition are skipped, not errors
                if not initializer['field']:
                    logger.warn(
                        str((
                            'Note: table entry has no field definition (will be skipped)',
                            initializer, 'current row:', j + 1)))
                    continue
                lfi = FieldInformation(**initializer)
                # check if the table/field exists
                if lfi.table:
                    table = models.get_model(APPNAME, lfi.table)
                    if table:
                        if lfi.field not in map(lambda x: x.name,
                                                table._meta.fields):
                            raise Exception(str(
                                ('unknown field: ', lfi.field)))
                    else:
                        raise Exception(str(('unknown table', lfi.table)))
                lfi.save()
                logger.info(str(('fieldInformation created:', lfi)))
            except Exception, e:
                logger.error(
                    str(("Invalid fieldInformation, initializer so far: ",
                         initializer, 'current row:', j + 1, e)))
                raise e
Example #40
0
def readLibraries(path, sheetName):
    """
    Read Library records from the given workbook sheet and save them.

    @param path: path to the workbook file
    @param sheetName: name of the sheet containing the library definitions
    @return: dict mapping each saved Library's short_name to the Library
        (previously built but never returned)
    """
    sheet = iu.readtable([path, sheetName
                          ])  # Note, skipping the header row by default
    # dict to map spreadsheet fields to the Library fields
    properties = ('model_field', 'required', 'default', 'converter')
    date_parser = lambda x: util.convertdata(x, date)
    column_definitions = {
        'Name': ('name', True),  # TODO use the model to determine if req'd
        'ShortName': ('short_name', True),
        'Library Type':
        'type',
        'Date First Plated': ('date_first_plated', False, None, date_parser),
        'Date Data Received': ('date_data_received', False, None, date_parser),
        'Date Loaded': ('date_loaded', False, None, date_parser),
        'Date Publicly Available':
        ('date_publicly_available', False, None, date_parser),
        'Most Recent Update':
        ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False)
    }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)
    # create a dict mapping the column ordinal to the proper column definition
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    libraries = {}
    for row in sheet:
        logger.debug(str(('row raw: ', row)))
        r = util.make_row(row)
        logger.debug(str(('row: ', r)))
        initializer = {}
        for i, value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            # Note: must check the value against None, as False is a valid value
            if value is None:
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d' %
                                (properties['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            initializer[model_field] = value
        try:
            library = Library(**initializer)
            library.save()
            logger.info(str(('library created', library)))
            libraries[library.short_name] = library
            rows += 1
        except Exception as e:
            logger.error(str(('library initializer problem: ', initializer)))
            raise e
    # Fix: return the mapping that was built; callers previously received None
    return libraries
Example #41
0
def main(path):
    """
    Read in the Cell
    """
    sheet_name = 'HMS-LINCS cell line metadata'
    sheet = iu.readtable([path, sheet_name, 1]) # Note, skipping the header row by default

    properties = ('model_field','required','default','converter')
    column_definitions = {
              'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]),
              'CL_Name':('name',True),
              'CL_LINCS_ID':'lincs_id',
              'CL_Alternate_Name':'alternative_names',
              'CL_Alternate_ID':'alternate_id',
              'CL_Center_Specific_ID':'center_specific_id',
              'MGH_ID':('mgh_id',False,None,lambda x:util.convertdata(x,int)),
              'Assay':'assay',
              'CL_Organism':'organism',
              'CL_Organ':'organ',
              'CL_Tissue':'tissue',
              'CL_Cell_Type':'cell_type',
              'CL_Cell_Type_Detail':'cell_type_detail',
              'CL_Donor_Sex': 'donor_sex',
              'CL_Donor_Age': ('donor_age_years',False,None,lambda x:util.convertdata(x,int)),
              'CL_Donor_Ethnicity': 'donor_ethnicity',
              'CL_Donor_Health_Status': 'donor_health_status',
              'CL_Disease':'disease',
              'CL_Disease_Detail':'disease_detail',
              'CL_Growth_Properties':'growth_properties',
              'CL_Genetic_Modification':'genetic_modification',
              'CL_Related_Projects':'related_projects',
              'CL_Recommended_Culture_Conditions':'recommended_culture_conditions',
              'CL_Verification_Reference_Profile':'verification_reference_profile',
              'CL_Known_Mutations':'mutations_known',
              'CL_Mutations_Citations':'mutations_citations',
              'CL_Molecular_Features': 'molecular_features',
              'CL_Relevant_Citations': 'relevant_citations',
              'CL_Reference_Source': 'reference_source',
              'CL_Reference_Source_ID': 'reference_source_id',
              'Reference Source URL': 'reference_source_url',
              'Usage Note': 'usage_note',
              
              'Date Data Received':('date_data_received',False,None,util.date_converter),
              'Date Loaded': ('date_loaded',False,None,util.date_converter),
              'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter),
              'Most Recent Update': ('date_updated',False,None,util.date_converter),
              'Is Restricted':('is_restricted',False,False,util.bool_converter)}
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)
    
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False)
            
    rows = 0    
    logger.debug(str(('cols: ' , cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if(converter != None):
                value = converter(value)
            if(value == None ):
                if( default != None ):
                    value = default
            if(value == None and  required == True):
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            initializer[model_field] = value

        try:
            logger.debug(str(('initializer: ', initializer)))
            cell = Cell(**initializer)
            cell.save()
            logger.info(str(('cell created:', cell)))
            rows += 1

            # create a default batch - 0
            CellBatch.objects.create(reagent=cell,batch_id=0)
            
        except Exception, e:
            print "Invalid Cell, name: ", r[0]
            raise e
Example #42
0
def main(import_file,file_directory,deploy_dir):
    """
    Read QC events for batches
    - version 1 - for small molecule batches

    Creates a QCEvent per data row; deploys up to five referenced attachment
    files from file_directory into deploy_dir (or the configured static dir)
    and records each as a QCAttachedFile.

    @param import_file: workbook containing the QC events on 'Sheet1'
    @param file_directory: directory in which the attachment files are found
    @param deploy_dir: destination directory for deployed files; falls back
        to settings.STATIC_AUTHENTICATED_FILE_DIR when empty
    """
    sheet_name = 'Sheet1'
    start_row = 0
    sheet = iu.readtable([import_file, sheet_name, start_row]) # Note, skipping the header row by default

    properties = ('model_field','required','default','converter')
    column_definitions = { 
              'facility_id': ('facility_id_for',True,None, lambda x: util.convertdata(x,int)),
              'salt_id': ('salt_id_for',False,None, lambda x: util.convertdata(x,int)),
              'batch_id':('batch_id_for',True,None, lambda x: util.convertdata(x,int)),
              'QC event date': ('date',True,None,util.date_converter),
              'outcome': ('outcome',True),
              'comment': 'comment',
              'is_restricted':('is_restricted',False,False,util.bool_converter),
              'file1': 'file1',
              'file2': 'file2',
              'file3': 'file3',
              'file4': 'file4',
              'file5': 'file5',
              }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)

    # create a dict mapping the column ordinal to the proper column definition
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ' , cols)))
    for row in sheet:
        r = util.make_row(row)
        # store each row in a dict
        _dict = {}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            # Note: must check the value against None, as False is valid
            if value is None:
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d' % (properties['column_label'],rows))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))
            _dict[model_field] = value

        logger.debug(str(('dict: ', _dict)))

        files_to_attach = []
        # Fix: iterate only the defined attachment columns file1..file5
        # (was range(10), which also probed nonexistent file0 and file6-file9)
        for i in range(1, 6):
            filenameProp = 'file%s' % i
            if _dict.get(filenameProp, None):
                fileprop = _dict[filenameProp]
                filepath = os.path.join(file_directory,fileprop)
                if not os.path.exists(filepath):
                    raise Exception(str(('file does not exist:',filepath,'row',
                        rows+start_row)))
                filename = os.path.basename(filepath)
                # keep any subdirectory portion so it can be recreated on deploy
                relative_path = fileprop[:fileprop.index(filename)]

                # Move the file
                dest_dir = deploy_dir
                if not dest_dir:
                    dest_dir = settings.STATIC_AUTHENTICATED_FILE_DIR
                if not os.path.isdir(dest_dir):
                    raise Exception(str(('no such deploy directory, please create it', dest_dir)))
                if relative_path:
                    dest_dir = os.path.join(dest_dir, relative_path)
                    if not os.path.exists(dest_dir):
                        os.makedirs(dest_dir)
                deployed_path = os.path.join(dest_dir, filename)

                logger.debug(str(('deploy',filepath, deployed_path)))
                # overwrite any previously deployed copy
                if os.path.exists(deployed_path):
                    os.remove(deployed_path)
                copy(filepath,deployed_path)
                if not os.path.isfile (deployed_path):
                    raise Exception(str(('could not deploy to', deployed_path)))
                else:
                    logger.debug(str(('successfully deployed to', deployed_path)))

                files_to_attach.append((filename,relative_path))

        initializer = None
        try:
            # create the qc record
            initializer = {key:_dict[key] for key in 
                ['facility_id_for','salt_id_for','batch_id_for','outcome','comment','date']}
            qc_event = QCEvent(**initializer)
            qc_event.save()
            logger.debug(str(('saved', qc_event)))

            # create attached file records
            for (filename,relative_path) in files_to_attach:
                initializer = {
                    'qc_event':qc_event,
                    'filename':filename,
                    'relative_path':relative_path,
                    'is_restricted':_dict['is_restricted']
                    }
                qc_attached_file = QCAttachedFile(**initializer)
                qc_attached_file.save()
                logger.debug(str(('created qc attached file', qc_attached_file)))

            rows += 1

        except Exception as e:
            # +2: one for the header row, one for the 1-based sheet numbering
            logger.error(str(("Invalid initializer: ", initializer, 'row', 
                rows+start_row+2, e)))
            raise
Example #43
0
def read_data(book, col_to_dc_map, first_small_molecule_column, dataset):
    """
    Read the workbook's 'Data' sheet and bulk-create DataRecords and
    DataPoints for the given dataset, saving in batches of save_interval.

    @param book: xlrd-style workbook (supports sheet_by_name)
    @param col_to_dc_map: dict of column ordinal -> DataColumn
    @param first_small_molecule_column: forwarded to _read_plate_well
    @param dataset: the dataset the created records belong to
    """

    datarecord_batch = []
    save_interval = 1000

    logger.debug('read the Data sheet')
    data_sheet = book.sheet_by_name('Data')
    
    # record the ordinal of each recognized meta column in the module-level
    # meta_columns dict (presumably initialized to label -> -1 elsewhere;
    # see the '> -1' presence checks below -- TODO confirm)
    for i,label in enumerate(data_sheet.row_values(0)):
        logger.debug('find datasheet label %r:%r' % (colname(i), label))
        if label in meta_columns: 
            meta_columns[label] = i
            continue
    
    logger.debug('meta_columns: %s, datacolumnList: %s' 
        % (meta_columns, col_to_dc_map) )
    logger.debug('read the data sheet, save_interval: %d' % save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    for i in xrange(data_sheet.nrows-1):
        # 1-based sheet row number, accounting for the header row
        current_row = i + 2
        row = data_sheet.row_values(i+1)    

        r = util.make_row(row)
        datarecord = DataRecord(dataset=dataset)
        
        # 'Control Type' column is optional; -1 means not present
        if meta_columns['Control Type'] > -1: 
            datarecord.control_type = util.convertdata(
                r[meta_columns['Control Type']])

        datapoint_batch = []
        small_molecule_datapoint = None 
        # NOTE: this loop rebinds 'i' (the outer sheet-row index); the outer
        # value is not used again inside the iteration, so this is safe,
        # but fragile if the loop body is ever reordered
        for i,dc in col_to_dc_map.items():
            value = r[i]
            logger.debug(
                'reading column %r, %s, val: %r' % (colname(i), dc, value))
            value = value.strip()
            value = util.convertdata(value)
            # skip empty cells entirely - no DataPoint is created
            if not value: 
                continue
            datapoint = _create_datapoint(dc, dataset, datarecord, value)
            datapoint_batch.append(datapoint)
            pointsSaved += 1
            # remember the first small-molecule datapoint for plate/well linkage
            if not small_molecule_datapoint and dc.data_type == 'small_molecule':
                small_molecule_datapoint = datapoint
                
        # 'Plate' column is optional; -1 means not present
        if meta_columns['Plate'] > -1:
            _read_plate_well(
                meta_columns['Plate'], r, current_row, datarecord,
                first_small_molecule_column,small_molecule_datapoint,
                datapoint_batch)
        
        
        datarecord_batch.append((datarecord, datapoint_batch))
        rows_read += 1
        
        # flush a full batch: records first, then their datapoints
        if (rows_read % save_interval == 0):
            bulk_create_datarecords(datarecord_batch)
            logger.debug(
                'datarecord batch created, rows_read: %d , time (ms): %d'
                    % (rows_read, time.time()-loopStart ) )
            count = bulk_create_datapoints(datarecord_batch)
            logger.debug('datapoints created in batch: %d ' % count)
            datarecord_batch=[]

    # flush the final (partial) batch
    bulk_create_datarecords(datarecord_batch)
    et = time.time()-loopStart
    logger.debug(
        'final datarecord batch created, rows_read: %d, time (ms): %d' 
            % (rows_read, et))

    count = bulk_create_datapoints(datarecord_batch)
    logger.debug('created dps %d' % count )

    print 'Finished reading, rows_read: ', rows_read, ', points Saved: ', pointsSaved
    # NOTE(review): raises ZeroDivisionError when the sheet has no data rows
    # (rows_read == 0) -- confirm inputs are always non-empty
    print 'elapsed: ', et , 'avg: ', et/rows_read
    
    cleanup_unused_datacolumns(dataset)
Example #44
0
def read_metadata(meta_sheet):
    """
    Read a key/value metadata sheet (field label in column 0, value in
    column 1, one field per row, first row skipped) into an initializer dict.

    @param meta_sheet: xlrd-style sheet (supports nrows / row_values)
    @return: dict mapping model field name -> converted value
    @raise Exception: when a required field has no value
    """
    properties = ('model_field', 'required', 'default', 'converter')
    field_definitions = {
        'Lead Screener First':
        'lead_screener_firstname',
        'Lead Screener Last':
        'lead_screener_lastname',
        'Lead Screener Email':
        'lead_screener_email',
        'Lab Head First':
        'lab_head_firstname',
        'Lab Head Last':
        'lab_head_lastname',
        'Lab Head Email':
        'lab_head_email',
        'Title':
        'title',
        'Facility ID':
        ('facility_id', True, None, lambda x: util.convertdata(x, int)),
        'Summary':
        'summary',
        'Protocol':
        'protocol',
        'References':
        'protocol_references',
        'Date Data Received':
        ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available':
        ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update':
        ('date_updated', False, None, util.date_converter),
        'Is Restricted': ('is_restricted', False, False, util.bool_converter),
        'Dataset Type': ('dataset_type', False),
        'Bioassay': ('bioassay', False),
        'Dataset Keywords': ('dataset_keywords', False),
        'Usage Message': ('usage_message', False),
        'Associated Publication': ('associated_publication', False),
        'Associated Project Summary': ('associated_project_summary', False),
    }

    # collect the field labels from column 0, skipping the first row
    sheet_labels = []
    for i in xrange(meta_sheet.nrows - 1):
        row = meta_sheet.row_values(i + 1)
        sheet_labels.append(row[0])

    field_definitions = util.fill_in_column_definitions(
        properties, field_definitions)

    cols = util.find_columns(field_definitions,
                             sheet_labels,
                             all_column_definitions_required=False)

    initializer = {}
    for i in xrange(meta_sheet.nrows - 1):
        row = meta_sheet.row_values(i + 1)

        # Robustness: skip labels that have no matching field definition
        if i not in cols:
            continue
        properties = cols[i]
        value = row[1]
        logger.debug('Metadata raw value %r' % value)

        required = properties['required']
        default = properties['default']
        converter = properties['converter']
        model_field = properties['model_field']

        if converter:
            value = converter(value)
        if not value and default is not None:
            value = default
        if not value and required:
            # Fix: the message previously interpolated the row list with %d,
            # which raised a TypeError instead of this intended error;
            # report the 1-based record ordinal instead
            raise Exception('Field is required: %s, record: %d' %
                            (properties['column_label'], i + 1))
        logger.debug('model_field: %s, value: %r' % (model_field, value))
        initializer[model_field] = value

    return initializer
Example #45
0
def main(path):
    """
    Read other-reagent batch info from the workbook at path and create
    OtherReagentBatch records, resolving each row's OtherReagent by
    facility_id.

    @param path: workbook path; data is on 'sheet 1' after a header row
    """
    sheet_name = 'sheet 1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row])

    properties = ('model_field','required','default','converter')
    column_definitions = { 
        'facility_id': (
            'facility_id',True,None, lambda x: util.convertdata(x,int)),
        'facility_batch_id':(
            'batch_id',True,None, lambda x: util.convertdata(x,int)),
        'provider': ('provider_name',False),
        'provider_catalog_id':'provider_catalog_id',
        'provider_sample_id':'provider_batch_id',
        'Date Data Received':(
            'date_data_received',False,None,util.date_converter),
        'Date Loaded': ('date_loaded',False,None,util.date_converter),
        'Date Publicly Available': (
            'date_publicly_available',False,None,util.date_converter),
        'Most Recent Update': (
            'date_updated',False,None,util.date_converter),
        }
    column_definitions = util.fill_in_column_definitions(
        properties,column_definitions)

    cols = util.find_columns(column_definitions, sheet.labels,
        all_sheet_columns_required=False)

    rows = 0
    logger.debug('cols: %s' % cols)
    for row in sheet:
        r = util.make_row(row)
        # (removed an unused local that shadowed the builtin 'dict')
        initializer = {}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug('read col: %d: %s' % (i,properties))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            logger.debug('raw value %r' % value)
            if converter is not None:
                value = converter(value)
            # Note: check against None explicitly; False is a valid value
            if value is None:
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d' 
                    % (properties['column_label'],rows))

            logger.debug('model_field: %s, converted value %r'
                % (model_field, value) )
            initializer[model_field] = value
        try:
            logger.debug('initializer: %s' % initializer)

            facility_id = initializer.pop('facility_id',None)
            try:
                other_reagent = OtherReagent.objects.get(facility_id=facility_id)
                initializer['reagent'] = other_reagent
            except ObjectDoesNotExist:
                # Fix: report the record counter; 'i' was the last column
                # ordinal, not a row number.
                # NOTE(review): processing continues without a 'reagent' here,
                # so the save below will likely fail -- confirm intended
                logger.error('facility_id: "%s" does not exist, row: %d' 
                    % (facility_id,rows))
            batch = OtherReagentBatch(**initializer)
            batch.save()
            logger.debug('batch created: %s', batch)
            rows += 1
        except Exception:
            logger.error("Invalid other_reagent_batch initializer: %s" % initializer)
            raise
Example #46
0
def main(path):
    """
    Read in the cell batch info

    Creates a CellBatch for each data row, resolving the parent Cell by
    facility_id.

    @param path: workbook path; data is on 'Sheet1' after a header row
    """
    sheet_name = 'Sheet1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row]) # Note, skipping the header row by default

    properties = ('model_field','required','default','converter')
    column_definitions = { 
              'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]),
              'CL_Batch_ID':('batch_id',True,None,lambda x:util.convertdata(x,int)),
              'CL_Provider_Name':'provider_name',
              'CL_Provider_Batch_ID':'provider_batch_id',
              'CL_Provider_Catalog_ID':'provider_catalog_id',
              'CL_Quality_Verification':'quality_verification',
              'CL_Transient_Modification': 'transient_modification',
              'Date Data Received':('date_data_received',False,None,util.date_converter),
              'Date Loaded': ('date_loaded',False,None,util.date_converter),
              'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter),
              'Most Recent Update': ('date_updated',False,None,util.date_converter),
              }
    # convert the labels to fleshed out dict's, with strategies for optional,
    # default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)

    # create a dict mapping the column ordinal to the proper column definition
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ' , cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            logger.debug(str(('raw value', value)))
            if converter is not None:
                value = converter(value)
            # Note: must check the value against None, as False is valid
            if value is None:
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d' % (
                    properties['column_label'],rows))
            logger.debug(str(('model_field: ' , model_field, ', value: ', value)))

            if model_field == 'facility_id':
                # resolve the parent Cell that this batch belongs to
                try:
                    cell = Cell.objects.get(facility_id=value)
                    initializer['reagent'] = cell
                except Exception:
                    # Fix: narrowed from a bare 'except:' (PEP 8 E722);
                    # still logs and re-raises
                    logger.error(str(("Cell not found", value, 'row',rows+start_row+2)))
                    raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            cell = CellBatch(**initializer)
            cell.save()
            logger.debug(str(('cell created:', cell)))
            rows += 1
        except Exception as e:
            # +2: one for the header row, one for the 1-based sheet numbering
            logger.error(str(( "Invalid CellBatch initializer: ", initializer, 
                'row', rows+start_row+2, e)))
            raise
Example #47
0
def main(path, do_precursors_only):
    """
    Read in the Cell
    """
    sheet_name = 'HMS-LINCS cell line metadata'
    sheet = iu.readtable([path, sheet_name, 1]) # allow for informational header row

    properties = ('model_field','required','default','converter')
    column_definitions = {
        'Facility ID':('facility_id',True,None, lambda x: x[x.index('HMSL')+4:]),
        'CL_Name':('name',True),
        'CL_LINCS_ID':'lincs_id',
        'CL_Alternate_Name':'alternative_names',
        'CL_Alternate_ID':'alternative_id',
        'Precursor_Cell':'precursor_facility_batch_id',
        'CL_Organism':'organism',
        'CL_Organ':'organ',
        'CL_Tissue':'tissue',
        'CL_Cell_Type':'cell_type',
        'CL_Cell_Type_Detail':'cell_type_detail',
        'CL_Donor_Sex': 'donor_sex',
        'CL_Donor_Age': ('donor_age_years',False,None,lambda x:util.convertdata(x,int)),
        'CL_Donor_Ethnicity': 'donor_ethnicity',
        'CL_Donor_Health_Status': 'donor_health_status',
        'CL_Disease':'disease',
        'CL_Disease_Detail':'disease_detail',
        'CL_Production_Details': 'production_details',
        'CL_Genetic_Modification':'genetic_modification',
        'CL_Known_Mutations':'mutations_known',
        'CL_Mutation_Citations':'mutation_citations',
        'CL_Verification_Reference_Profile':'verification_reference_profile',
        'CL_Growth_Properties':'growth_properties',
        'CL_Recommended_Culture_Conditions':'recommended_culture_conditions',
        'CL_Relevant_Citations': 'relevant_citations',
        'Usage Note': 'usage_note',
        'CL_Reference_Source': 'reference_source',
        'Reference Source URL': 'reference_source_url',
        
        'Date Data Received':('date_data_received',False,None,util.date_converter),
        'Date Loaded': ('date_loaded',False,None,util.date_converter),
        'Date Publicly Available': ('date_publicly_available',False,None,util.date_converter),
        'Most Recent Update': ('date_updated',False,None,util.date_converter),
        'Is Restricted':('is_restricted',False,False,util.bool_converter)}
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(properties,column_definitions)
    
    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels, all_sheet_columns_required=False)
            
    rows = 0    
    precursor_map = {}
    precursor_pattern = re.compile(r'HMSL(5\d{4})-(\d+)')
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        for i,value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']
            
            value = convertdata(value)
            if value is not None:
                if converter:
                    try:
                        value = converter(value)
                    except Exception:
                        logger.error('field parse error: %r, value: %r, row: %d',
                            properties['column_label'],value,rows+2)
                        raise 
            if value is None:
                if default is not None:
                    value = default
            if value is None and required:
                raise Exception('Field is required: %s, record: %d' 
                    % (properties['column_label'],rows))

            logger.debug('model_field: %r, value: %r' , model_field, value)
            initializer[model_field] = value
            
        precursor_facility_batch_id = initializer.pop('precursor_facility_batch_id')
        if precursor_facility_batch_id:
            match = precursor_pattern.match(precursor_facility_batch_id)
            if not match:
                raise Exception('Invalid precursor pattern: needs: %s: %r, row: %d'
                    % (precursor_pattern, initializer, rows))
            precursor_map[initializer['facility_id']] = (match.group(1),match.group(2))
        
        if not do_precursors_only:
            try:
                logger.info('initializer: %r', initializer)
                cell = Cell(**initializer)
                cell.save()
                logger.info(str(('cell created:', cell)))
    
                # create a default batch - 0
                CellBatch.objects.create(reagent=cell,batch_id=0)
                
            except Exception, e:
                print "Invalid Cell, name: ", r[0]
                raise e
        
        rows += 1
Example #48
0
def main(import_file, file_directory, deploy_dir):
    """
    Read in the qc events for batches 
    - version 1 - for small molecule batches

    @param import_file workbook containing one QC event per row
    @param file_directory directory where the attachment files named in the
        file1..file5 columns are staged before deployment
    @param deploy_dir destination directory for deployed attachment files;
        when empty, settings.STATIC_AUTHENTICATED_FILE_DIR is used
    """
    sheet_name = 'Sheet1'
    start_row = 0
    sheet = iu.readtable([import_file, sheet_name, start_row
                          ])  # Note, skipping the header row by default

    # Full column definition tuple order: (model_field, required, default,
    # converter); short (string-only) entries are fleshed out below.
    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        'facility_id':
        ('facility_id_for', True, None, lambda x: util.convertdata(x, int)),
        'salt_id':
        ('salt_id_for', False, None, lambda x: util.convertdata(x, int)),
        'batch_id':
        ('batch_id_for', True, None, lambda x: util.convertdata(x, int)),
        'QC event date': ('date', True, None, util.date_converter),
        'outcome': ('outcome', True),
        'comment':
        'comment',
        'is_restricted': ('is_restricted', False, False, util.bool_converter),
        'file1':
        'file1',
        'file2':
        'file2',
        'file3':
        'file3',
        'file4':
        'file4',
        'file5':
        'file5',
    }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions, sheet.labels)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        # store each row in a dict
        _dict = {}
        for i, value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Apply the column's convert/default/required strategy to the
            # raw cell value.
            logger.debug(str(('raw value', value)))
            if (converter != None):
                value = converter(value)
            if (value == None):
                if (default != None):
                    value = default
            if (value == None and required == True):
                raise Exception('Field is required: %s, record: %d' %
                                (properties['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))
            _dict[model_field] = value

        logger.debug(str(('dict: ', _dict)))

        # Deploy each attachment named in the row before creating any database
        # records, so a missing or undeployable file aborts the row early.
        # NOTE(review): range(10) probes file0..file9 but only file1..file5 are
        # declared above; the extra keys simply miss in _dict (harmless).
        files_to_attach = []
        for i in range(10):
            filenameProp = 'file%s' % i
            if _dict.get(filenameProp, None):
                fileprop = _dict[filenameProp]
                filepath = os.path.join(file_directory, fileprop)
                if not os.path.exists(filepath):
                    raise Exception(
                        str(('file does not exist:', filepath, 'row',
                             rows + start_row)))
                filename = os.path.basename(filepath)
                # Any subdirectory prefix in the spreadsheet value is kept and
                # re-created under the deploy directory.
                relative_path = fileprop[:fileprop.index(filename)]

                # Move the file
                dest_dir = deploy_dir
                if not dest_dir:
                    dest_dir = settings.STATIC_AUTHENTICATED_FILE_DIR
                if not os.path.isdir(dest_dir):
                    raise Exception(
                        str(('no such deploy directory, please create it',
                             dest_dir)))
                if relative_path:
                    dest_dir = os.path.join(dest_dir, relative_path)
                    if not os.path.exists(dest_dir):
                        os.makedirs(dest_dir)
                deployed_path = os.path.join(dest_dir, filename)

                # Overwrite any previously deployed copy of the file.
                logger.debug(str(('deploy', filepath, deployed_path)))
                if os.path.exists(deployed_path):
                    os.remove(deployed_path)
                copy(filepath, deployed_path)
                if not os.path.isfile(deployed_path):
                    raise Exception(str(
                        ('could not deploy to', deployed_path)))
                else:
                    logger.debug(
                        str(('successfully deployed to', deployed_path)))

                files_to_attach.append((filename, relative_path))

        initializer = None
        try:
            # create the qc record
            initializer = {
                key: _dict[key]
                for key in [
                    'facility_id_for', 'salt_id_for', 'batch_id_for',
                    'outcome', 'comment', 'date'
                ]
            }
            qc_event = QCEvent(**initializer)
            qc_event.save()
            logger.debug(str(('saved', qc_event)))

            # create attached file records
            for (filename, relative_path) in files_to_attach:
                initializer = {
                    'qc_event': qc_event,
                    'filename': filename,
                    'relative_path': relative_path,
                    'is_restricted': _dict['is_restricted']
                }
                qc_attached_file = QCAttachedFile(**initializer)
                qc_attached_file.save()
                logger.debug(
                    str(('created qc attached file', qc_attached_file)))

            # Only fully-loaded rows are counted.
            rows += 1

        except Exception, e:
            logger.error(
                str(("Invalid initializer: ", initializer, 'row',
                     rows + start_row + 2, e)))
            raise
Example #49
0
def main(path):
    """
    Read in the smallmolecule batch info
    """
    sheet_name = 'sheet 1'
    start_row = 1
    sheet = iu.readtable([path, sheet_name, start_row
                          ])  # Note, skipping the header row by default

    properties = ('model_field', 'required', 'default', 'converter')
    column_definitions = {
        # NOTE: even though these db field are not integers,
        # it is convenient to convert the read in values to INT to make sure they are not interpreted as float values
        'facility_id':
        ('facility_id', True, None, lambda x: util.convertdata(x, int)),
        'salt_id': ('salt_id', True, None, lambda x: util.convertdata(x, int)),
        'facility_batch_id':
        ('batch_id', True, None, lambda x: util.convertdata(x, int)),
        'provider': ('provider_name', True),
        'provider_catalog_id':
        'provider_catalog_id',
        'provider_sample_id':
        'provider_batch_id',
        'chemical_synthesis_reference':
        'chemical_synthesis_reference',
        'purity':
        'purity',
        'purity_method':
        'purity_method',
        'aqueous_solubility':
        'aqueous_solubility',
        # FIXME: should warn the user if no unit is provided when
        # aqueous_solubility is provided
        'aqueous_solubility_unit':
        'aqueous_solubility_unit',
        'Date Data Received':
        ('date_data_received', False, None, util.date_converter),
        'Date Loaded': ('date_loaded', False, None, util.date_converter),
        'Date Publicly Available':
        ('date_publicly_available', False, None, util.date_converter),
        'Most Recent Update': ('date_updated', False, None,
                               util.date_converter),
    }
    # convert the labels to fleshed out dict's, with strategies for optional, default and converter
    column_definitions = util.fill_in_column_definitions(
        properties, column_definitions)

    # create a dict mapping the column ordinal to the proper column definition dict
    cols = util.find_columns(column_definitions,
                             sheet.labels,
                             all_sheet_columns_required=False)

    rows = 0
    logger.debug(str(('cols: ', cols)))
    for row in sheet:
        r = util.make_row(row)
        initializer = {}
        small_molecule_lookup = {'facility_id': None, 'salt_id': None}
        for i, value in enumerate(r):
            if i not in cols: continue
            properties = cols[i]

            logger.debug(str(('read col: ', i, ', ', properties)))
            required = properties['required']
            default = properties['default']
            converter = properties['converter']
            model_field = properties['model_field']

            # Todo, refactor to a method
            logger.debug(str(('raw value', value)))
            if (converter != None):
                value = converter(value)
            if (value == None):
                if (default != None):
                    value = default
            if (value == None and required == True):
                raise Exception('Field is required: %s, record: %d' %
                                (properties['column_label'], rows))
            logger.debug(
                str(('model_field: ', model_field, ', value: ', value)))

            if (model_field in small_molecule_lookup):
                small_molecule_lookup[model_field] = value
                if (None not in small_molecule_lookup.values()):
                    try:
                        sm = SmallMolecule.objects.get(**small_molecule_lookup)
                        initializer['reagent'] = sm
                    except Exception, e:
                        logger.error(
                            str(('sm identifiers not found',
                                 small_molecule_lookup, 'row',
                                 rows + start_row + 2)))
                        raise
            else:
                initializer[model_field] = value
        try:
            logger.debug(str(('initializer: ', initializer)))
            smb = SmallMoleculeBatch(**initializer)
            smb.save()
            logger.debug(str(('smb created:', smb)))
            rows += 1
        except Exception, e:
            logger.error(
                str(("Invalid smallmolecule batch initializer: ", initializer,
                     'row', rows + start_row + 2, e)))
            raise