def normalize_nedm_val(nedm, range_check = True):
    """Normalize the value and error of a NeuronEphysDataMap to standard units and range.

    Args:
        nedm: a neuroelectro.models NeuronEphysDataMap; this function reads its
            ``val``, ``err``, ``ref_text``, ``pk`` and ``ephys_concept_map``
            attributes.
        range_check (bool): when True, a converted mean value for which
            ``check_data_val_range`` returns False is reported on stdout and
            replaced by None.

    Returns:
        dict with the keys 'value' and 'error'. 'value' holds the converted
        mean (None if it failed the range check). 'error' holds the converted
        error term; it stays None when ``nedm.err`` is falsy and is set to
        None when the converted error is negative.
    """
    # initialize output dictionary
    output_dict = dict.fromkeys(['value', 'error'])

    ecm = nedm.ephys_concept_map
    ephys_prop = ecm.ephys_prop
    natural_unit = unicode(ephys_prop.units)

    # Resolve the unit the data was reported in, trying in order: the unit
    # already stored on the concept map, the concept map's header text, the
    # data cell text itself, and finally the property's natural unit.
    found_unit = ecm.identified_unit
    if found_unit is None:
        found_unit = get_units_from_table_header(ecm.ref_text)
    if found_unit is None:
        parsed_nedm = resolve_data_float(nedm.ref_text, initialize_dict = True)
        found_unit = parsed_nedm['units']
        # TODO: a unit found here still needs to be saved back to the ecm
    if found_unit is None:
        found_unit = natural_unit

    # normalize mean value
    conv_mean_value = convert_units(found_unit, natural_unit, nedm.val)
    # NOTE(review): this truthiness test also skips a converted value of
    # exactly 0, so such a value bypasses the range check - confirm intended.
    if conv_mean_value:
        # custom normalization for negative and ratio values
        conv_mean_value = convert_negative_value(conv_mean_value, ephys_prop)
        conv_mean_value = convert_percent_to_ratio(conv_mean_value, ephys_prop, ecm.ref_text)

        # check whether mean value in appropriate range
        if range_check and check_data_val_range(conv_mean_value, ephys_prop) is False:
            print('neuron ephys data map %s, with pk %s out of appropriate range' % (nedm.val, nedm.pk))
            print('%s %s' % (conv_mean_value, ephys_prop))
            conv_mean_value = None
    output_dict['value'] = conv_mean_value

    # normalize error term
    # TODO: address if errors represented as standard deviations
    if nedm.err:
        conv_err_value = convert_units(found_unit, natural_unit, nedm.err)
        if conv_err_value:
            conv_err_value = convert_percent_to_ratio(conv_err_value, ephys_prop, ecm.ref_text)
            # really basic check for error term validity
            if conv_err_value < 0:
                conv_err_value = None
        output_dict['error'] = conv_err_value

    return output_dict
def identify_ephys_units():
    """Assign an identified_unit to every ephys concept map whose text yields one.

    Iterates over all ``m.EphysConceptMap`` objects, runs
    ``get_units_from_table_header`` on each object's ``ref_text`` and, when a
    unit is found, stores it in ``identified_unit`` and saves the object.
    Progress is reported through ``prog``. A failure while saving one object
    is printed and does not stop the loop.
    """
    ecms = m.EphysConceptMap.objects.all()
    ecm_count = ecms.count()
    print("adding units to ephys concept maps")
    for i, ecm in enumerate(ecms):
        prog(i, ecm_count)
        identified_unit = get_units_from_table_header(ecm.ref_text)
        if not identified_unit:
            continue
        try:
            ecm.identified_unit = identified_unit
            ecm.save()
        except Exception as e:
            # Best-effort pass over all rows: report the failure instead of
            # hiding it, then carry on with the next concept map. %r keeps
            # the message ASCII-safe so that printing it cannot itself fail.
            print('could not save identified unit %r for ephys concept map with pk %r: %r'
                  % (identified_unit, getattr(ecm, 'pk', None), e))
def resolve_data_float(data_str, initialize_dict = False):
    """Text-mine a data table cell into mean, error, count, range and unit.

    Args:
        data_str (str): string from a data table cell, typically of the form
            XX +/- YY (ZZ), where XX is the mean value, YY is the error term
            and ZZ is the number of observations. A range such as XX-YY is
            also recognized.
        initialize_dict (bool): indicates whether all dict keys should be
            initialized with None values.

    Returns:
        a dictionary of text-mined data attributes and their values; keys
        that could not be mined are absent (or None if initialize_dict).
        example:
            {'value': 46.5,
             'error': 3.4,
             'num_obs': 5,
             'min_range': 20.4,
             'max_range': 50.4,
             'units': ...}
    """
    key_list = ['value', 'error', 'num_obs', 'min_range', 'max_range', 'units']

    # initialize dict with None values if requested
    data_dict = dict.fromkeys(key_list) if initialize_dict else {}

    # if the input string is mostly non-digit characters it is probably not
    # a data-containing string
    digit_fraction = digit_pct(data_str)
    if digit_fraction < .05:
        try:
            print('Too many elems of string %s are not digits: %.2f'
                  % (data_str.encode("iso-8859-15", "replace"), digit_fraction))
        except Exception:
            # the message is diagnostic only; never fail parsing because the
            # cell text could not be encoded or printed
            pass
        return data_dict

    # map unicode minus / en dash to '-', the ASCII spellings of plus-minus
    # to the plus-minus sign, and thin spaces to plain spaces
    new_str = re.sub(u'\u2212', '-', data_str)
    new_str = re.sub(u'\u2013', '-', new_str)
    new_str = re.sub(u'\\+/-', u'\xb1', new_str)
    new_str = re.sub(u'\\+\\-', u'\xb1', new_str)
    new_str = re.sub(u'\u2009', ' ', new_str)

    # look for a number of observations like '(5)', '(n=5)' or '(N = 5)';
    # the whitespace around '=' is optional
    num_obs_check = re.findall(u'\\([Nn]?\\s*=?\\s*\\d+\\)', new_str)
    if num_obs_check:
        data_dict['num_obs'] = int(re.search(r'\d+', num_obs_check[0]).group(0))
        # remove number of observations instance from the string
        new_str = new_str.replace(num_obs_check[0], '')

    # try to ID unit here, dropping any numbers from the identified unit text
    found_unit = get_units_from_table_header(new_str)
    if found_unit:
        found_unit = re.sub(r"-?\.?\d+\.?\d*", "", found_unit)
        data_dict['units'] = found_unit

    # remove whitespace from the data string as it serves no purpose
    new_str = re.sub(r'\s', '', new_str)

    # split on the plus-minus sign; the '+/-' fallback catches spellings such
    # as '+ / -' that only turn into '+/-' once whitespace has been removed
    if re.search('\xb1', new_str):
        split_str_list = re.split('\xb1', new_str)
    else:
        split_str_list = re.split(r'\+/-', new_str)

    # parse 'error' value as second element after +/- sign
    if len(split_str_list) == 2:
        error_float = str_to_float(split_str_list[1])
        # error_float must be greater than 0
        if error_float and error_float > 0:
            data_dict['error'] = error_float
        # remove the part of the string defined as error
        new_str = split_str_list[0]

    # check the remaining string for a range: two numbers joined by '-',
    # each of which may itself carry a leading minus sign
    range_str_check = re.search(r'-?\.?\d+\.?\d*--?\.?\d+\.?\d*', new_str)
    if range_str_check:
        range_str = range_str_check.group(0)
        minus_count = range_str.count('-')
        range_split_list = range_str.split('-')

        # stay None if the range cannot be interpreted below
        min_range = None
        max_range = None
        if minus_count == 1:
            # 'a-b'
            min_range = str_to_float(range_split_list[0])
            max_range = str_to_float(range_split_list[1])
        elif minus_count == 2:
            if range_str.startswith('-'):
                # '-a-b'
                min_range = str_to_float("-" + range_split_list[1])
                max_range = str_to_float(range_split_list[2])
            else:
                # 'a--b'
                min_range = str_to_float(range_split_list[0])
                max_range = str_to_float("-" + range_split_list[2])
        elif minus_count == 3:
            # '-a--b'
            min_range = str_to_float("-" + range_split_list[1])
            max_range = str_to_float("-" + range_split_list[3])
        else:
            print("Unparsable data range detected in String: '" + range_str + "'. Too many '-' signs.")

        if min_range is not None and max_range is not None:
            # store the bounds in ascending order
            if min_range < max_range:
                data_dict['min_range'] = min_range
                data_dict['max_range'] = max_range
            else:
                data_dict['min_range'] = max_range
                data_dict['max_range'] = min_range
            # prematurely assign a value as the mean of min and max ranges
            data_dict['value'] = np.mean([min_range, max_range])

        # drop the range text literally; it contains '.', which would act as
        # a wildcard if used as an unescaped pattern
        new_str = re.sub(re.escape(range_str), "", new_str)

    # parse 'mean' data value as the first number left in the string
    value_match = re.search(r'-?\.?\d+\.?\d*', new_str)
    if value_match:
        data_dict['value'] = str_to_float(value_match.group(0))

    return data_dict