def index_from_get_well_name(well_name, platesize):
    row_index = well_name_row_index(well_name)
    col_index = well_name_col_index(well_name)
    rows = get_rows(platesize)
    cols = get_cols(platesize)
    if row_index >= rows:
        raise ValidationError(
            key='well_name',
            msg='%r, row index: %d, is >= rows: %d, for platesize: %d'
                % (well_name, row_index, rows, platesize))
    if col_index >= cols:
        raise ValidationError(
            key='well_name',
            msg='%r, col index: %d, is >= cols: %d, for platesize: %d'
                % (well_name, col_index, cols, platesize))
    # Column-major fill (fill each column by row, then move to the next column):
    # index = col_index * rows + row_index
    # Row-major fill, as the Screening lab does (fill each row by column):
    index = row_index * cols + col_index
    if index >= platesize:
        raise ValidationError(
            key='well_name',
            msg='%r, index: %d, is >= platesize: %d'
                % (well_name, index, platesize))
    return index

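# A minimal, standalone sketch of the row-major ("fill each row by column")
# indexing used above, assuming a 96-well plate of 8 rows x 12 columns.
# The helper name and the well values below are illustrative only and are not
# used elsewhere in this module.
def _example_row_major_index():
    cols = 12
    # well 'A01' -> row 0, col 0  -> 0 * 12 + 0  = 0
    # well 'B03' -> row 1, col 2  -> 1 * 12 + 2  = 14
    # well 'H12' -> row 7, col 11 -> 7 * 12 + 11 = 95 (last well of the plate)
    return [row * cols + col for (row, col) in [(0, 0), (1, 2), (7, 11)]]
    # -> [0, 14, 95]
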
def create_request_from_job(job_data, raw_data=''):
    if DEBUG_BACKGROUND is True:
        logger.info('create_request_from_job: %r', job_data)
        if raw_data:
            logger.info('raw_data: %r, %d', type(raw_data), len(raw_data))
            if len(raw_data) < 1000:
                logger.info('raw_data: %r', raw_data)
            else:
                logger.info('raw_data(trunc): %r', raw_data[:1000])
    request_factory = RequestFactory()
    path = job_data[JOB.URI]
    method = job_data[JOB.METHOD]
    encoding = job_data[JOB.ENCODING]
    content_type = job_data[JOB.CONTENT_TYPE]
    # For job processing, force response to JSON
    # accept = urllib.unquote(job_data[JOB.HTTP_ACCEPT])
    accept = JSON_MIMETYPE
    comment = job_data[JOB.COMMENT]
    params = job_data[JOB.PARAMS]
    if params:
        params = json.loads(params)
    else:
        params = {}
    params[HEADER_APILOG_COMMENT_CLIENT] = comment
    params[HEADER_APILOG_COMMENT] = comment

    if raw_data:
        if DEBUG_BACKGROUND is True:
            logger.info('add raw data: %d', len(raw_data))
        if MULTIPART_MIMETYPE not in content_type:
            msg = ('content type must contain %r for raw data post: found: %r'
                % (MULTIPART_MIMETYPE, content_type))
            logger.error(msg)
            raise ValidationError(key=JOB.CONTENT_TYPE, msg=msg)
        if method != 'POST':
            errmsg = 'method %r is not %r, required for raw data post' % (
                method, 'POST')
            raise ValidationError(key=JOB.METHOD, msg=errmsg)
    else:
        if DEBUG_BACKGROUND is True:
            logger.info('no raw data to add')

    if DEBUG_BACKGROUND is True:
        logger.info('create_request_from_job content type: %r', content_type)
    request = request_factory.generic(
        method, path, data=raw_data, HTTP_ACCEPT=accept,
        content_type=content_type, **params)
    if DEBUG_BACKGROUND is True:
        logger.info('create_request_from_job: META: %r', request.META)
        logger.info('create_request_from_job: FILES: %r', request.FILES)
    return request

def parse_wells_to_leave_empty(wells_to_leave_empty, plate_size):
    '''
    Parse the wells to leave empty field of the Cherry Pick Request.
    TODO: replace with parse_well_ranges
    '''
    logger.debug('raw wells_to_leave_empty: %r, plate_size: %r',
        wells_to_leave_empty, plate_size)
    ncols = get_cols(plate_size)
    nrows = get_rows(plate_size)
    row_pattern = re.compile(r'row:\s*([a-zA-Z]{1,2})', flags=re.IGNORECASE)
    col_pattern = re.compile(r'col:\s*(\d{1,2})', flags=re.IGNORECASE)
    selections = re.split(r'\s*,\s*', wells_to_leave_empty)
    new_selections = []
    for selection in selections:
        colmatch = col_pattern.match(selection)
        if colmatch:
            col = int(colmatch.group(1))
            if col > ncols:
                raise ValidationError(
                    key='wells_to_leave_empty',
                    msg='column out of range: %d, %r' % (col, selection))
            new_selections.append('Col:%d' % col)
            continue
        rowmatch = row_pattern.match(selection)
        if rowmatch:
            row = letter_to_row_index(rowmatch.group(1))
            if row >= nrows:
                raise ValidationError(
                    key='wells_to_leave_empty',
                    msg='row out of range: %r, %r'
                        % (rowmatch.group(1), selection))
            new_selections.append('Row:%s' % rowmatch.group(1).upper())
            continue
        wellmatch = WELL_NAME_PATTERN.match(selection)
        if wellmatch:
            new_selections.append(selection.upper())
            continue
        raise ValidationError(
            key='wells_to_leave_empty',
            msg='unrecognized pattern: %r' % selection)
    logger.debug('new wells_to_leave_empty selections: %r', new_selections)

    decorated = []
    for wellname in new_selections:
        if 'Col:' in wellname:
            decorated.append((1, int(wellname.split(':')[1]), wellname))
        elif 'Row:' in wellname:
            decorated.append((2, wellname.split(':')[1], wellname))
        else:
            match = WELL_NAME_PATTERN.match(wellname)
            decorated.append((match.group(1), match.group(1), wellname))
    new_wells_to_leave_empty = [x[2] for x in sorted(decorated)]
    logger.debug('wells_to_leave_empty: %r', new_wells_to_leave_empty)
    return new_wells_to_leave_empty

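# Standalone sketch of the decorate-sort-undecorate ordering used above:
# column selections (sort key 1) come first, then row selections (sort key 2),
# then individual wells keyed by their row letter.  The mixed int/str ordering
# relies on Python 2 comparison rules; the selection values are hypothetical.
def _example_wells_to_leave_empty_ordering():
    decorated = [
        ('D', 'D', 'D05'),   # an individual well
        (2, 'B', 'Row:B'),   # a row selection
        (1, 3, 'Col:3'),     # a column selection
    ]
    return [entry[2] for entry in sorted(decorated)]
    # -> ['Col:3', 'Row:B', 'D05']
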
def assay_plate_available_wells(wells_to_leave_empty, plate_size):
    if plate_size not in ALLOWED_PLATE_SIZES:
        raise ValidationError(
            key='plate_size',
            msg=('plate_size: %d for assay_plate_available_wells, '
                 'not in allowed: %r' % (plate_size, ALLOWED_PLATE_SIZES)))
    available_wells = []
    # Parse and sanitize the wells_to_leave_empty (should be already done)
    wells_to_leave_empty_list = []
    if wells_to_leave_empty:
        wells_to_leave_empty_list = parse_wells_to_leave_empty(
            wells_to_leave_empty, plate_size)
    row_specifier = 'Row:%s'
    col_specifier = 'Col:%d'
    for i in range(0, plate_size):
        well_name = well_name_from_index(i, plate_size)
        wellmatch = WELL_NAME_PATTERN.match(well_name)
        row = wellmatch.group(1)
        col = int(wellmatch.group(2))
        if row_specifier % row in wells_to_leave_empty_list:
            continue
        if col_specifier % col in wells_to_leave_empty_list:
            continue
        if well_name in wells_to_leave_empty_list:
            continue
        available_wells.append(well_name)
    return available_wells

def get_value(member_name):
    logger.info('get_value: %r', member_name)
    if member_name.upper() in Collation.ordered_members:
        return Collation.ordered_members.index(member_name.upper())
    else:
        msg = 'must be one of %r' % Collation.ordered_members
        logger.warn('collation ' + msg)
        raise ValidationError(key='collation', msg=msg)

def well_id_plate_number(well_id):
    '''
    Get the plate_number from the well_id
    '''
    match = WELL_ID_PATTERN.match(well_id)
    if not match:
        raise ValidationError(
            key='well_id',
            msg='%r Does not match pattern: %s'
                % (well_id, WELL_ID_PATTERN.pattern))
    return int(match.group(1))

def well_id_name(well_id):
    '''
    Get the well_name from the well_id
    '''
    match = WELL_ID_PATTERN.match(well_id)
    if not match:
        raise ValidationError(
            key='well_id',
            msg='%r Does not match pattern: %s'
                % (well_id, WELL_ID_PATTERN.pattern))
    wellrow = match.group(3).upper()
    wellcol = match.group(4)
    return '%s%s' % (wellrow, str(wellcol).zfill(2))

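# Illustrative only: a hypothetical stand-in for WELL_ID_PATTERN (the real
# pattern is defined elsewhere in this module), shown to make the group usage
# above concrete: group(1) is the plate number, group(3) the row letter(s),
# group(4) the column digits.
def _example_well_id_parsing():
    import re
    _pattern = re.compile(r'^(\d+)(:)([A-Za-z]{1,2})(\d{1,2})$')
    match = _pattern.match('00001:a1')
    plate_number = int(match.group(1))                                       # 1
    well_name = '%s%s' % (match.group(3).upper(), match.group(4).zfill(2))   # 'A01'
    return (plate_number, well_name)
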
def well_row_col(well_name):
    '''
    @return zero based (row_index, col_index)
    '''
    match = WELL_NAME_PATTERN.match(well_name)
    if not match:
        raise ValidationError(
            key='well_name',
            msg='%r does not match pattern: %s'
                % (well_name, WELL_NAME_PATTERN.pattern))
    return (letter_to_row_index(match.group(1)), int(match.group(2)) - 1)

def parse_val(value, key, data_type, options=None):
    """
    All values are read as strings from the input files, so this function
    converts them as directed.
    TODO: validation
    """
    try:
        if (value is None
                or value == '' or value == 'None' or value == u'None'
                or value == 'null' or value == u'n/a'):
            if data_type == 'string':
                return ''
            elif data_type == 'list':
                return []
            else:
                return None
        if data_type == 'string':
            return value
        elif data_type == 'integer':
            # todo: this is a kludge, create an integer from values like "5.0"
            return int(float(value))
        elif data_type == 'date':
            return dateutil.parser.parse(value).date()
        elif data_type == 'datetime':
            return dateutil.parser.parse(value)
        elif data_type == 'boolean':
            if value is True or value is False:
                return value
            value = str(value)
            if (value.lower() == 'true'
                    or value.lower() == 't' or value == '1'):
                return True
            return False
        elif data_type == 'float':
            return float(value)
        elif data_type == 'decimal':
            if isinstance(value, float):
                logger.warn('converting float: %r to decimal: %r',
                    value, Decimal(str(value)))
                value = str(value)
            return Decimal(value)
        elif data_type == 'list':
            if isinstance(value, six.string_types):
                if value.strip():
                    return (value, )  # convert the bare string to a single-item sequence
                else:
                    return []
            return value  # otherwise, better be a list
        else:
            raise Exception('unknown data type: %s: "%s"' % (key, data_type))
    except Exception, e:
        logger.exception('value not parsed %r:%r', key, value)
        raise ValidationError(key=key, msg='parse error: %r' % str(e))

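# Standalone illustration of the two conversions above that are easiest to
# get wrong: integer fields supplied as "5.0", and floats converted to
# Decimal via str() to avoid binary floating-point artifacts.
def _example_parse_val_conversions():
    from decimal import Decimal
    as_integer = int(float('5.0'))   # 5; int('5.0') would raise ValueError
    clean = Decimal(str(0.1))        # Decimal('0.1')
    noisy = Decimal(0.1)             # Decimal('0.1000000000000000055511151231257827...')
    return (as_integer, clean, noisy)
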
def plate_size_from_plate_type(plate_type):
    '''
    Get the plate size from the current plate_type vocabulary:
        eppendorf_384
        costar_96
        abgene_384
        genetix_384
        marsh_384
        nunc_96
        eppendorf_96
    Note: plate_type must end with the plate size integer for this to work:
    FIXME: plate size determined by magic value embedded in plate_types
    '''
    parts = plate_type.split('_')
    if len(parts) != 2:
        raise ValidationError(
            key='plate_type',
            msg='not a recognized type: %r' % plate_type)
    plate_size = int(parts[1])
    if plate_size not in ALLOWED_PLATE_SIZES:
        raise ValidationError(
            key='plate_type',
            msg='plate_size: %d for plate_type: %r, not in allowed: %r'
                % (plate_size, plate_type, ALLOWED_PLATE_SIZES))
    return plate_size

def parse_columns(columns):
    '''
    Parse the Screen Result input file Data Columns sheet into valid API
    Data Columns input.
    '''
    parsed_cols = OrderedDict()
    errors = {}
    for i, column in enumerate(columns):
        parsed_col = {
            'is_derived': False,
            'is_follow_up_data': False,
            'ordinal': i
        }
        logger.debug('parsing column: %r', column['data_worksheet_column'])
        if column['data_worksheet_column'] in parsed_cols:
            raise ValidationError(
                key='data_worksheet_column',
                msg='%r is listed more than once'
                    % column['data_worksheet_column'])
        parsed_cols[column['data_worksheet_column']] = parsed_col
        for key, val in column.items():
            if key == 'is_follow_up_data':
                parsed_col[key] = (val and val.lower() == 'follow up')
            elif key == 'data_type':
                val = default_converter(val)
                # handle validation errors in the api
                if val not in DATA_TYPE_VALUES:
                    # use a separate error key so the 'data_type' key of
                    # parsed_col is not overwritten on the error path
                    error_key = '%s:%s' % (
                        column['data_worksheet_column'], 'data_type')
                    errors[error_key] = (
                        'val: %r must be one of %r' % (val, DATA_TYPE_VALUES))
                parsed_col[key] = val
            elif key == 'assay_readout_type':
                parsed_col[key] = default_converter(val)
            else:
                if key == 'how_derived':
                    parsed_col['is_derived'] = (
                        val is not None and val.strip() != '')
                parsed_col[key] = val
        if parsed_col.get('decimal_places') is not None:
            try:
                error_key = '%s:%s' % (
                    column['data_worksheet_column'], 'decimal_places')
                column['decimal_places'] = parse_val(
                    column['decimal_places'], error_key, 'integer')
            except ValidationError, e:
                errors.update(e.errors)
        logger.debug('parsed_col: %r', parsed_col)
    if errors:
        raise ValidationError(errors={'Data Columns': errors})
    logger.debug('parsed cols: %r', parsed_cols)
    return parsed_cols

def parse_copywell_id(pattern):
    parts = pattern.split('/')
    if len(parts) < 3:
        raise ValidationError(
            key='copywell_id',
            msg='Invalid pattern: must contain '
                '"library_short_name/copy_name/well_id"')
    library_short_name = parts[0]
    copy_name = parts[1]
    _well_id = parts[2]
    plate_number = well_id_plate_number(_well_id)
    well_name = well_id_name(_well_id)
    return (copy_name, plate_number, well_id(plate_number, well_name),
        well_name)

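# Minimal sketch of the parse above, with hypothetical inputs; it assumes
# that well_id(1, 'A01') renders as '00001:A01', matching the zero-padded
# "plate:well" form used elsewhere in this module.  Note that the parsed
# library_short_name is not part of the returned tuple.
def _example_parse_copywell_id():
    copywell_id = 'lib1/A/00001:A01'
    library_short_name, copy_name, _well_id = copywell_id.split('/')[:3]
    plate_number = int(_well_id.split(':')[0])
    well_name = _well_id.split(':')[1]
    return (copy_name, plate_number, _well_id, well_name)
    # -> ('A', 1, '00001:A01', 'A01')
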
def transform(input_matrices, counter, aps, lps):
    assert aps in ALLOWED_MATRIX_SIZES, \
        ('assay_plate_size must be one of %r' % ALLOWED_MATRIX_SIZES)
    assert lps in ALLOWED_MATRIX_SIZES, \
        ('library_plate_size must be one of %r' % ALLOWED_MATRIX_SIZES)

    if aps < lps:
        logger.info('convolute matrices')
        factor = lps / aps
        if factor != 4:
            msg = (
                'Convolute: library_plate_size/assay_plate_size != 4: %d/%d'
                % (lps, aps))
            raise ValidationError({
                'assay_plate_size': msg,
                'library_plate_size': msg
            })
        if len(input_matrices) % 4 != 0:
            msg = ('Convolute: input matrix array must contain a multiple '
                'of 4 members')
            raise ValidationError({
                'assay_plate_size': msg,
                'library_plate_size': msg
            })
        # Create an adjusted counter to match the input:
        # - add a quadrant counter to the right of the plate counter
        new_counter_hash = OrderedDict()
        for key, value in counter.counter_hash.items():
            new_counter_hash[key] = value
            if key == 'plate':
                new_counter_hash['quadrant'] = [0, 1, 2, 3]
        counter96 = Counter(new_counter_hash)
        logger.info('counter96: %r', counter96)
        if counter96.size() != len(input_matrices):
            raise ProgrammingError(
                'input_matrices length (%d) must match '
                'the counter length with 4 quadrants: (%d)'
                % (len(input_matrices), counter96.size()))
        # Create blank output matrices
        convoluted_matrices = [
            lims_utils.create_blank_matrix(lps)
            for x in range(0, len(input_matrices) / 4)
        ]
        # Iterate through the output (384) matrices and find the 96 matrix values
        # NOTE: could also start by iterating through the input matrices
        for index, matrix in enumerate(convoluted_matrices):
            readout = counter.get_readout(index)
            for rownum, row in enumerate(matrix):
                for colnum in range(0, len(row)):
                    input_quadrant = lims_utils.deconvolute_quadrant(
                        lps, aps, rownum, colnum)
                    readout96 = dict(readout, quadrant=input_quadrant)
                    logger.debug(
                        'index: %d, 384 readout: %r, quadrant: %d, 96: %r',
                        index, readout, input_quadrant, readout96)
                    logger.debug('counter96: %r', counter96.counter_hash)
                    input_index = counter96.get_index(readout96)
                    input_row = lims_utils.deconvolute_row(
                        lps, aps, rownum, colnum)
                    input_col = lims_utils.deconvolute_col(
                        lps, aps, rownum, colnum)
                    logger.debug('find: index: %d, cell: [%d][%d]',
                        input_index, input_row, input_col)
                    row[colnum] = \
                        input_matrices[input_index][input_row][input_col]
        return convoluted_matrices

    elif lps < aps:
        logger.info('deconvolute matrices')
        factor = aps / lps
        if factor != 4:
            msg = (
                'Deconvolute: assay_plate_size/library_plate_size != 4: %d/%d'
                % (aps, lps))
            raise ValidationError({
                'assay_plate_size': msg,
                'library_plate_size': msg
            })
        # Create an adjusted counter to match the input
        plates = counter.counter_hash.get('plate')
        logger.info('plates: %r', plates)
        if len(plates) % 4 != 0:
            msg = ('Deconvolute: plate count must be a multiple of 4: %d'
                % len(plates))
            raise ValidationError({'plate_ranges': msg})
        plates_1536 = OrderedDict()
        for i, plate in enumerate(plates):
            plate_number_1536 = i / 4
            if plate_number_1536 not in plates_1536:
                plates_1536[plate_number_1536] = []
            plates_1536[plate_number_1536].append(plate)
        logger.info('plates_1536: %r', plates_1536)
        new_counter_hash = counter.counter_hash.copy()
        new_counter_hash['plate'] = plates_1536.keys()
        counter1536 = Counter(new_counter_hash)
        # Create blank output matrices
        deconvoluted_matrices = [
            None for x in range(0, len(input_matrices) * 4)
        ]
        # Iterate through the input (1536) matrices and find the output 384 matrix values
        for index, matrix in enumerate(input_matrices):
            readout1536 = counter1536.get_readout(index)
            plate1536 = readout1536['plate']
            # Convert each 1536 plate separately, and find the output matrix position
            output_384_matrices = lims_utils.deconvolute_matrices(
                [matrix], aps, lps)
            for quadrant, matrix384 in enumerate(output_384_matrices):
                plate384 = plates_1536[plate1536][quadrant]
                readout384 = dict(readout1536, plate=plate384)
                index384 = counter.get_index(readout384)
                deconvoluted_matrices[index384] = matrix384
        return deconvoluted_matrices
    else:
        return input_matrices

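# Standalone sketch of an interleaved quadrant layout between a smaller
# "source" plate and a 4x larger destination plate, illustrating what the
# lims_utils.deconvolute_quadrant/_row/_col helpers are used for above.
# The quadrant numbering shown here is an assumption for illustration, not
# the library's definition.
def _example_interleaved_quadrants():
    def quadrant(dest_row, dest_col):
        return (dest_row % 2) * 2 + (dest_col % 2)

    def source_cell(dest_row, dest_col):
        return (dest_row // 2, dest_col // 2)

    # Destination wells A01, A02, B01, B02 all map back to source well A01,
    # one per quadrant:
    return [(quadrant(r, c), source_cell(r, c))
        for (r, c) in [(0, 0), (0, 1), (1, 0), (1, 1)]]
    # -> [(0, (0, 0)), (1, (0, 0)), (2, (0, 0)), (3, (0, 0))]
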
def parse_result_row(i, parsed_columns, result_row):
    '''
    Parse the Screen Result input file format into a valid API input format:
    - Convert plate_number and well_name into a well_id
    - Convert the assay_well_control_type input: use the
      ASSAY_WELL_CONTROL_TYPES to map api schema assaywell.control_type
    - Convert the exclude column specifiers into known column letters:
      "all" is converted to a list of all column letters
    - Parse value columns according to the data_type specified:
      - Create default values for positive columns
      - (TODO: validation rules can be moved to API)
      - Verify that PARTITION_POSITIVE_MAPPING values are used
      - Verify that CONFIRMED_POSITIVE_MAPPING values are used
      - Verify that integer values are integers
      - Verify that decimal values can be parsed as float
    '''
    logger.debug(
        'parse result row: %d, %r: %r', i, parsed_columns.keys(), result_row)

    meta_columns = RESULT_VALUE_FIELD_MAP.values()
    parsed_row = {}
    excluded_cols = []
    well_id_errors = []

    meta_key = 'plate_number'
    val = result_row[meta_key]
    logger.debug('plate value to parse: %r', val)
    plate_number = parse_val(val, meta_key, 'integer')
    if plate_number is None:
        well_id_errors.append('%s is required' % meta_key)

    meta_key = 'well_name'
    val = result_row[meta_key]
    if not val:
        well_id_errors.append('%s is required' % meta_key)
    elif WELL_NAME_PATTERN.match(val):
        wellname = val
    else:
        well_id_errors.append(
            'Well_name val %r does not follow the pattern: %r'
            % (val, WELL_NAME_PATTERN.pattern))
    if well_id_errors:
        raise ParseError(errors={'row: %d' % i: well_id_errors})
    parsed_row['well_id'] = \
        '%s:%s' % (str(plate_number).zfill(5), wellname)

    meta_key = 'assay_well_control_type'
    val = result_row.get(meta_key)
    parsed_row[meta_key] = None
    if val is not None:
        if val.lower() in ASSAY_WELL_CONTROL_TYPES:
            parsed_row[meta_key] = \
                ASSAY_WELL_CONTROL_TYPES[val.lower()]
        else:
            msg = ('%s: val %r is not one of the choices: %r'
                % (meta_key, val, ASSAY_WELL_CONTROL_TYPES))
            logger.error(msg)
            raise ValidationError(key=parsed_row['well_id'], msg=msg)

    meta_key = 'exclude'
    val = result_row.get(meta_key)
    if val is not None:
        if val.lower() == 'all':
            excluded_cols = parsed_columns.keys()
        else:
            excluded_cols = [x.strip().upper() for x in val.split(',')]
        unknown_excluded_cols = (
            set(excluded_cols) - set(parsed_columns.keys()))
        if unknown_excluded_cols:
            raise ValidationError(
                key=parsed_row['well_id'],
                msg='unknown excluded cols: %r' % unknown_excluded_cols)
        parsed_row[meta_key] = excluded_cols

    for colname, raw_val in result_row.items():
        logger.debug('colname: %r, raw_val: %r', colname, raw_val)
        if colname in meta_columns:
            continue
        if colname not in parsed_columns:
            # NOTE: this is no longer an error, as the result value sheet may
            # contain extra columns (selected by user on output)
            logger.debug(
                'result value column %r is not in recognized columns: %r',
                colname, parsed_columns.keys())
            parsed_row[colname] = raw_val
            continue
        column = parsed_columns[colname]
        if raw_val is None:
            # 20180315 - verified with DJW, default values for
            # positive indicator columns
            if column['data_type'] == DATA_TYPE.BOOLEAN_POSITIVE:
                raw_val = False
            elif column['data_type'] == DATA_TYPE.PARTITIONED_POSITIVE:
                raw_val = 'NP'
            elif column['data_type'] == DATA_TYPE.CONFIRMED_POSITIVE:
                raw_val = 'NT'
            else:
                continue
        key = '%s-%s' % (parsed_row['well_id'], colname)
        parsed_row[colname] = raw_val
        if column['data_type'] in DATA_TYPE.numeric_types:
            if column['decimal_places'] > 0:
                # parse, to validate only; use decimal for final parsing
                parse_val(raw_val, key, 'float')
            else:
                parsed_row[colname] = parse_val(raw_val, key, 'integer')
        elif column['data_type'] == DATA_TYPE.PARTITIONED_POSITIVE:
            val = raw_val.upper()
            if val not in PARTITION_POSITIVE_MAPPING:
                raise ValidationError(
                    key=key,
                    msg='val: %r must be one of %r'
                        % (raw_val, PARTITION_POSITIVE_MAPPING.keys()))
            parsed_row[colname] = val
        elif column['data_type'] == DATA_TYPE.CONFIRMED_POSITIVE:
            val = raw_val.upper()
            if val not in CONFIRMED_POSITIVE_MAPPING:
                raise ValidationError(
                    key=key,
                    msg='val: %r must be one of %r'
                        % (raw_val, CONFIRMED_POSITIVE_MAPPING.keys()))
            parsed_row[colname] = val
        elif column['data_type'] == DATA_TYPE.BOOLEAN_POSITIVE:
            val = parse_val(raw_val, key, 'boolean')
            parsed_row[colname] = val

    logger.debug('parsed_row: %r', parsed_row)
    return parsed_row

def result_value_field_mapper(header_row, parsed_columns):
    '''
    Parse the Screen Result input file result sheet headers into the valid
    API result value input headers using the RESULT_VALUE_FIELD_MAP
    '''
    if DEBUG_IMPORTER:
        logger.info('map result value header row... %r', parsed_columns.keys())
    mapped_row = []
    header_row = [x for x in header_row]
    for i, value in enumerate(header_row):
        if not value:
def execute_from_python(job_id, sbatch=False):
    '''
    Utility method to invoke from the running server.
    @see settings.BACKGROUND_PROCESSOR
    @param sbatch if true, requires "sbatch_settings" in the
        BACKGROUND_PROCESSOR settings
    @param keep_stdout (for testing) set True to use STDOUT
        (for non-sbatch only)
    '''
    logger.info('using settings.BACKGROUND_PROCESSOR: %r',
        settings.BACKGROUND_PROCESSOR)

    check_settings = set([
        'post_data_directory', 'job_output_directory', 'credential_file',
        'python_environ_script', 'background_process_script'
    ])
    if not check_settings.issubset(set(settings.BACKGROUND_PROCESSOR.keys())):
        raise ValidationError(
            key='settings.BACKGROUND_PROCESSOR',
            msg='missing required entries: %s'
                % (check_settings - set(settings.BACKGROUND_PROCESSOR.keys())))

    job_output_dir = settings.BACKGROUND_PROCESSOR['job_output_directory']
    if not os.path.exists(job_output_dir):
        os.makedirs(job_output_dir)
    credential_file = settings.BACKGROUND_PROCESSOR['credential_file']
    python_environ_script = settings.BACKGROUND_PROCESSOR[
        'python_environ_script']
    if not os.path.exists(python_environ_script):
        raise InformationError(
            key='python_environ_script',
            msg='file does not exist: %r' % python_environ_script)
    # background_process_script = \
    #     settings.BACKGROUND_PROCESSOR['background_process_script']
    background_process_script = os.path.abspath(__file__)
    logger.info('this file: %r', background_process_script)

    output_stdout = '%d.stdout' % job_id
    output_stdout = os.path.abspath(
        os.path.join(job_output_dir, output_stdout))
    output_stderr = '%d.stderr' % job_id
    output_stderr = os.path.abspath(
        os.path.join(job_output_dir, output_stderr))

    run_sh_args = [
        python_environ_script, background_process_script,
        '--job_id', str(job_id), '--c', credential_file
    ]
    full_args = []

    if sbatch is True:
        os.putenv('USER', 'sde4')
        full_args.append('/usr/local/bin/sbatch')
        sbatch_settings = settings.BACKGROUND_PROCESSOR.get('sbatch_settings')
        if sbatch_settings is None:
            raise InformationError(
                key='sbatch_settings',
                msg='missing from the BACKGROUND_PROCESSOR settings')
        sbatch_settings['output'] = output_stdout
        sbatch_settings['error'] = output_stderr
        sbatch_settings['job-name'] = 'ss_{}'.format(job_id)
        sbatch_args = []
        for k, v in sbatch_settings.items():
            sbatch_args.extend(['--%s=%s' % (k, str(v))])
        full_args.extend(sbatch_args)
        full_args.append('-vvv')

    full_args.extend(run_sh_args)
    logger.info('full args: %r', full_args)

    if sbatch is True:
        logger.info('sbatch specified, invoke sbatch and wait for output...')
        logger.info('full command %s: ', ' '.join(full_args))
        try:
            output = \
                subprocess.check_output(full_args, stderr=subprocess.STDOUT)
            logger.info('ran, output: %r', output)
            # TODO: parse the SLURM process ID from the output
            return output
        except subprocess.CalledProcessError, e:
            logger.error('subprocess.CalledProcessError: output: %r', e.output)
            raise

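# Minimal sketch of how the sbatch_settings entries above become command-line
# flags (hypothetical values; real settings come from
# settings.BACKGROUND_PROCESSOR['sbatch_settings']):
def _example_sbatch_args():
    sbatch_settings = {
        'partition': 'short',
        'time': '0:30:00',
        'mem': '4G',
        'output': '/tmp/jobs/42.stdout',
        'error': '/tmp/jobs/42.stderr',
        'job-name': 'ss_42',
    }
    sbatch_args = []
    for k, v in sbatch_settings.items():
        sbatch_args.extend(['--%s=%s' % (k, str(v))])
    return sbatch_args
    # e.g. ['--partition=short', '--time=0:30:00', '--mem=4G',
    #       '--output=/tmp/jobs/42.stdout', '--error=/tmp/jobs/42.stderr',
    #       '--job-name=ss_42']  (dict ordering may vary)
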
def create_output_data(screen_facility_id, fields, result_values):
    '''
    Translate Screen Result data into a data structure ready for
    Serialization:
    {
        'Screen Info': [ [ row1 ], [ row2 ]...],
        'Data Columns': [ [ row1 ], [ row2 ]...],
        'Data': [ [ row1 ], [ row2 ]...],
    }
    @param fields an iterable containing result_value data_column dicts and
        field information dicts for the non-result value columns
    @param result_values an iterable containing result_value dicts
    '''
    logger.info('create screen result data structure for %r',
        screen_facility_id)

    control_type_mapping = {v: k for k, v in ASSAY_WELL_CONTROL_TYPES.items()}

    data = OrderedDict()
    data['Screen Info'] = {'Screen Number': screen_facility_id}
    data_column_structure = []
    data['Data Columns'] = data_column_structure

    datacolumn_labels = DATA_COLUMN_FIELD_MAP.keys()

    data_columns = []
    data_column_names = []
    other_columns = []
    for key, field in fields.items():
        if (field.get('is_datacolumn', False)
                or field.get('data_worksheet_column', None)):
            data_columns.append(key)
            data_column_names.append(field['name'])
        elif (key not in ['well_id', 'plate_number', 'well_name',
                'screen_facility_id', 'assay_well_control_type']
                and key not in RESULT_VALUE_FIELD_MAP.keys()):
            other_columns.append(key)
    data_columns = sorted(data_columns, key=lambda x: fields[x]['ordinal'])
    other_columns = sorted(other_columns, key=lambda x: fields[x]['ordinal'])
    data_column_names_to_col_letter = {
        dc: xl_col_to_name(len(RESULT_VALUE_FIELD_MAP) + i)
        for (i, dc) in enumerate(data_column_names)}
    logger.info('data columns: %r, other_columns: %r',
        data_columns, other_columns)

    # Transpose the field definitions into the output data_column sheet:
    # Row 0 - "Data" Worksheet Column
    # Row 1 - name
    # Row 2 - data_type
    # Row N - other data column fields
    # Column 0 - data column field label
    # Column 1-N - data column values
    header_row = [datacolumn_labels[0]]
    header_row.extend([xl_col_to_name(len(RESULT_VALUE_FIELD_MAP) + i)
        for i in range(len(data_columns))])
    logger.debug('header_row: %r', header_row)

    for i, (sheet_label, sheet_key) in enumerate(
            DATA_COLUMN_FIELD_MAP.items()[1:]):
        row = [sheet_label]
        for j, key in enumerate(data_columns):
            val = fields[key].get(sheet_key, None)
            if sheet_key == 'data_type':
                val = fields[key].get(
                    'assay_data_type', fields[key].get('data_type', None))
            if val:
                if sheet_key == 'is_follow_up_data':
                    if val == True:
                        val = 'Follow up'
                    elif val == False:
                        val = 'Primary'
                elif sheet_key == 'derived_from_columns':
                    if fields[key].get('screen_facility_id', None) \
                            == screen_facility_id:
                        logger.info('Translate derived_from_columns: %r', val)
                        if not set(data_column_names_to_col_letter.keys())\
                                .issuperset(set(val)):
                            raise ValidationError(
                                key='derived_from_columns',
                                msg=('col: %r, values: %r are not in %r'
                                    % (key, val,
                                        data_column_names_to_col_letter.keys())))
                        val = ', '.join([
                            data_column_names_to_col_letter[dc_name]
                            for dc_name in val])
                    else:
                        # Manually serialize using commas
                        val = ', '.join(val)
                row.append(val)
            else:
                row.append(None)
                logger.debug(
                    'Note: sheet key not found in schema field: %r, %r',
                    sheet_key, fields[key])
        logger.debug('data column row: %r', row)
        data_column_structure.append(OrderedDict(zip(header_row, row)))

    def result_value_generator(result_values):
        logger.info('Write the result values sheet')

        header_row = []
        header_row.extend(RESULT_VALUE_FIELD_MAP.keys())
        # TODO: allow column titles to be optional
        header_row.extend([fields[key].get('title', key)
            for key in data_columns])
        header_row.extend(other_columns)

        row_count = 0
        for result_value in result_values:
            row_count += 1
            row = []
            row.extend(result_value['well_id'].split(':'))
            if (result_value.has_key('assay_well_control_type')
                    and result_value['assay_well_control_type']):
                control_type = default_converter(
                    result_value['assay_well_control_type'])
                # note: "empty", "experimental", "buffer" are values that
                # can be found in this column, due to legacy data entry,
                # but they are not valid
                if control_type in control_type_mapping:
                    row.append(control_type_mapping[control_type])
                else:
                    row.append(None)
            else:
                row.append(None)
            excluded_cols = []
            if result_value.has_key('exclude') and result_value['exclude']:
                temp = result_value['exclude']
                if hasattr(temp, 'split'):
                    temp = temp.split(LIST_DELIMITER_SQL_ARRAY)
                logger.debug('excluded data_columns: find %r, in %r',
                    temp, data_columns)
                for data_column_name in temp:
                    excluded_cols.append(get_column_letter(
                        len(RESULT_VALUE_FIELD_MAP) + 1
                        + data_columns.index(data_column_name)))
                excluded_cols = sorted(excluded_cols)
            row.append(','.join(excluded_cols))

            for j, key in enumerate(data_columns):
                if result_value.has_key(key):
                    row.append(result_value[key])
                else:
                    row.append(None)
            # append the non-result value columns to the end of the row
            for j, key in enumerate(other_columns):
                if result_value.has_key(key):
                    row.append(result_value[key])

            if row_count % 10000 == 0:
                logger.info('wrote %d rows', row_count)
            yield OrderedDict(zip(header_row, row))

    data['Data'] = result_value_generator(result_values)
    return data

def parse_result_row(i, parsed_columns, result_row):
    logger.debug('parse result row: %r', result_row)

    meta_columns = RESULT_VALUE_FIELD_MAP.values()
    parsed_row = {}
    excluded_cols = []

    meta_key = 'plate_number'
    val = result_row[meta_key]
    logger.debug('plate value to parse: %r', val)
    plate_number = parse_val(val, meta_key, 'integer')

    meta_key = 'well_name'
    val = result_row[meta_key]
    if WELLNAME_MATCHER.match(val):
        wellname = val
    else:
        raise ParseError(
            key=i,
            msg=('well_name val %r does not follow the pattern: %r'
                % (val, WELLNAME_MATCHER.pattern)))
    parsed_row['well_id'] = \
        '%s:%s' % (str(plate_number).zfill(5), wellname)

    meta_key = 'assay_well_control_type'
    val = result_row.get(meta_key, None)
    parsed_row[meta_key] = None
    if val:
        if val.lower() in ASSAY_WELL_CONTROL_TYPES:
            parsed_row[meta_key] = \
                ASSAY_WELL_CONTROL_TYPES[val.lower()]
        else:
            msg = ('%s: val %r is not one of the choices: %r'
                % (meta_key, val, ASSAY_WELL_CONTROL_TYPES))
            logger.error(msg)
            raise ValidationError(key=parsed_row['well_id'], msg=msg)

    meta_key = 'exclude'
    val = result_row.get(meta_key, None)
    if val:
        if val.lower() == 'all':
            excluded_cols = parsed_columns.keys()
        else:
            excluded_cols = [x.strip().upper() for x in val.split(',')]
        unknown_excluded_cols = (
            set(excluded_cols) - set(parsed_columns.keys()))
        if unknown_excluded_cols:
            raise ValidationError(
                key=parsed_row['well_id'],
                msg='unknown excluded cols: %r' % unknown_excluded_cols)
        parsed_row[meta_key] = excluded_cols

    for colname, raw_val in result_row.items():
        if colname in meta_columns:
            continue
        if colname not in parsed_columns:
            # NOTE: this is no longer an error, as the result value sheet may
            # contain extra columns (selected by user on output)
            logger.debug(
                'result value column %r is not in recognized columns: %r',
                colname, parsed_columns.keys())
            parsed_row[colname] = raw_val
            continue
        column = parsed_columns[colname]
        if (column['data_type'] == 'partition_positive_indicator'
                and not raw_val):
            raw_val = 'NP'
        if (column['data_type'] == 'confirmed_positive_indicator'
                and not raw_val):
            raw_val = 'NT'
        if raw_val is None:
            continue
        key = '%s-%s' % (parsed_row['well_id'], colname)
        parsed_row[colname] = raw_val
        if column['data_type'] == 'numeric':
            if column['decimal_places'] > 0:
                # parse, to validate
                parse_val(raw_val, key, 'float')
            else:
                parsed_row[colname] = parse_val(raw_val, key, 'integer')
        elif column['data_type'] == 'partition_positive_indicator':
            val = raw_val.upper()
            if val not in PARTITION_POSITIVE_MAPPING:
                raise ValidationError(
                    key=key,
                    msg='val: %r must be one of %r'
                        % (raw_val, PARTITION_POSITIVE_MAPPING.keys()))
            parsed_row[colname] = val
        elif column['data_type'] == 'confirmed_positive_indicator':
            val = raw_val.upper()
            if val not in CONFIRMED_POSITIVE_MAPPING:
                raise ValidationError(
                    key=key,
                    msg='val: %r must be one of %r'
                        % (raw_val, CONFIRMED_POSITIVE_MAPPING.keys()))
            parsed_row[colname] = val
        elif column['data_type'] == 'boolean_positive_indicator':
            val = parse_val(raw_val, key, 'boolean')
            parsed_row[colname] = val

    return parsed_row