Ejemplo n.º 1
0
def readDataColumns(path):
        # Read in the DataColumn Sheet
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)
    
    labels = {'Worksheet Column':'worksheet_column',
              'Name':'name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Time point':'time_point', 
              'Assay readout type':'readout_type',
              'Comments':'comments'}

    # create an array of dict's, each dict defines a DataColumn    
    dataColumnDefinitions = []
    # first put the label row in (it contains the worksheet column, and its unique)
    for v in dataColumnSheet.labels[1:]:
        dataColumnDefinitions.append({labels['Worksheet Column']:v})
    # now, for each row, create the appropriate dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        for i,cellText in enumerate(rowAsUnicode[1:]):
            for key,fieldName in labels.items():
                if re.match(key,keyRead,re.M|re.I): # if the row is one of the DataColumn fields, then add it to the dict
                    dataColumnDefinitions[i][fieldName] = util.convertdata(cellText,_typelookup.get(fieldName, None)) # Note: convert the data to the model field type
                else:
                    pass
                    # print '"Data Column definition not used: ', cellText 
    print "definitions: ", dataColumnDefinitions
    
    return dataColumnDefinitions
Ejemplo n.º 2
0
def read_datacolumns(book):
    '''
    Read the data column definitions from the "Data Columns" sheet of `book`
    and match them against the header row of its "Data" sheet.

    On the "Data Columns" sheet the first cell of each row is a field label
    (compared case-insensitively with the known labels) and each following
    cell holds that field's value for one data column.  Definitions are then
    located on the "Data" sheet either by their explicit worksheet column
    letter or, failing that, by name / display name; header labels that match
    no definition are looked up in default_reagent_columns.

    @param book workbook object providing sheet_by_name(); the sheets must
        provide nrows and row_values() (presumably an xlrd Book - verify
        against the caller)
    @return an array of data column definition dicts for the columns found on
        the "Data" sheet, plus the default 'Small Molecule Batch' column when
        no found column has data_type 'small_molecule'
    @raise Exception if a required field ('name', 'data_type') is blank for
        a column, or is not defined on the sheet at all
    '''

    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }
    # Lower-cased once, so each sheet row needs a single dict lookup instead
    # of a scan over every known label.
    labels_by_lower = dict(
        (label.lower(), field_name) for label, field_name in labels.items())

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s', type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)

        if i == 0:
            # one (initially empty) definition per cell after the label cell
            dc_definitions.extend({} for _ in row_values[1:])

        label_read = row_values[0]
        recognized_label = (
            labels_by_lower.get(label_read.lower()) if label_read else None)

        if not recognized_label:
            logger.debug('unrecognized label in "Data Columns" sheet %r',
                         label_read)
            continue

        logger.debug('label: %r, recognized_label: %r',
                     label_read, recognized_label)
        field_type = type_lookup.get(recognized_label, None)

        for j, val in enumerate(row_values[1:]):
            logger.debug('data column %s:%d:%d:%r',
                         recognized_label, i, j, val)

            final_val = util.convertdata(val, field_type)

            if final_val is None:
                if recognized_label in required_labels:
                    # j is offset by one because of the leading label cell
                    raise Exception(
                        'Error, data column field is required: %s, col: %r'
                        % (recognized_label, colname(j + 1)))
                continue

            if recognized_label == 'display_order':
                # add 10 to the order, so default reagent cols can go first
                final_val = final_val + 10
            elif recognized_label == 'name':
                # convert to camel case only if the name contains
                # non-alphanumeric separators
                name_parts = re.split(r'[^a-zA-Z0-9]+', final_val)
                if len(name_parts) > 1:
                    final_val = camel_case_dwg(final_val)

            dc_definitions[j][recognized_label] = final_val

    # A required label whose row is missing altogether is only detectable
    # after the whole sheet has been read.
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception('required "Data Column" label not defined %r' %
                                label)

    logger.info('find the data columns on the "Data" sheet...')

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    # indexes of "Data" sheet columns that were matched to a definition
    data_labels_found = set()
    for i, data_label in enumerate(data_sheet_labels):

        if not data_label or not data_label.strip():
            # the first blank header cell ends the scan of the header row
            logger.info('break on data sheet col %d, blank', i)
            break

        data_label = data_label.upper()
        col_letter = colname(i)

        for dc_dict in dc_definitions:
            matched = None
            if 'worksheet_column' in dc_dict:
                # an explicit column letter takes precedence over the name
                if dc_dict['worksheet_column'].upper() == col_letter:
                    matched = dc_dict

            elif 'name' in dc_dict or 'display_name' in dc_dict:

                if (dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):
                    # remember where the column was found
                    dc_dict['worksheet_column'] = col_letter
                    matched = dc_dict

            if not matched:
                continue

            data_labels_found.add(i)
            dc_definitions_found.append(matched)

            if 'display_order' not in matched:
                matched['display_order'] = i + 10
                logger.warning(
                    'auto assigning "display_order" for col %r as %d',
                    matched['name'], i + 10)

        if i not in data_labels_found:

            logger.debug('Data sheet label not found %r,'
                         ' looking in default reagent definitions %s',
                         data_label, default_reagent_columns.keys())

            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    # NOTE(review): this writes into the shared
                    # default_reagent_columns entry, so the column letter
                    # persists across calls - confirm that is intended.
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.add(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip() and i not in data_labels_found
        and data_label not in meta_columns
    ]
    if data_labels_not_found:
        logger.warning('data sheet labels not recognized %s',
                       data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always
    # created.  .get() is used because default reagent definitions are not
    # validated for 'data_type' in this function.
    has_small_molecule_col = any(
        dc_dict.get('data_type') == 'small_molecule'
        for dc_dict in dc_definitions_found)
    if not has_small_molecule_col:
        # NOTE(review): the shared default definition itself is returned,
        # not a copy.
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    # 'display_name' is optional, so fall back to 'name' rather than failing
    # with a KeyError in a log statement after all the work is done.
    logger.info('data column definitions found: %s',
                [x.get('display_name', x.get('name'))
                 for x in dc_definitions_found])

    return dc_definitions_found