def reset(self, data_folder, file_name, suffix=_default_suffix,
          variables=None):
    """Reset the aggregator class and clear design.

    All design settings stored on the instance (dependent and independent
    variables, exclusions, computed variables, recodings, subject
    variables and added data) are emptied and the data folder is scanned
    for matching data files.

    Parameters
    ----------
    data_folder : str
        folder which contains the data of the subjects
    file_name : str
        name of the files. All files that start with this name
        will be considered for the analysis (cf. aggregator.data_files)
    suffix : str, optional
        if specified only files that end with this particular
        suffix will be considered (default=.xpd)
    variables : array of str, optional
        array of variable names, process only the specified variables

    Raises
    ------
    RuntimeError
        if a data file defines other variables than the first
        matching data file
    Exception
        if no matching data file is found in the data folder

    """

    self._data_folder = data_folder
    self._file_name = file_name
    self._data_files = []
    self._variables = []
    self._dv = []
    self._dv_txt = []
    self._iv = []
    self._iv_txt = []
    self._exclusions = []
    self._exclusions_txt = []
    self._computes = []
    self._computes_txt = []
    self._recode_txt = []
    self._recode = []
    self._subject_variables = []
    self._last_data = []
    self._added_data = []
    self._added_variables = []
    self._suffix = suffix

    for flname in _os.listdir(_os.path.dirname(self._data_folder + "/")):
        if not (flname.endswith(self._suffix) and
                flname.startswith(self._file_name)):
            continue
        # only the variable names are needed here; the data itself is
        # read again later on demand
        _data, vnames, _subject_info, _comments = read_datafile(
            self._data_folder + "/" + flname, read_variables=variables)
        if len(self._variables) < 1:
            # first matching file defines the reference variable list
            self._variables = vnames
        elif vnames != self._variables:
            # the file name is part of the message (the original format
            # string had no placeholder and dropped it)
            message = u"Different variables in {0}".format(flname)
            message = message + u"\n{0}".format(vnames)
            message = message + u"\ninstead of\n{0}".format(
                self._variables)
            raise RuntimeError(_unicode2str(message))
        self._data_files.append(flname)

    if len(self._data_files) < 1:
        raise Exception("No data files found in {0}".format(
            _unicode2str(self._data_folder)))

    print("found {0} subject_data sets".format(len(self._data_files)))
    print("found {0} variables: {1}".format(
        len(self._variables), [_unicode2str(x) for x in self._variables]))
def _get_variable_id(self, variables, throw_exception=False):
    """Return the position of a variable name in self.variables.

    Parameters
    ----------
    variables : str
        the variable name to look up
    throw_exception : bool, optional
        if True, an unknown name raises a RuntimeError instead of
        resulting in None (default=False)

    Returns
    -------
    var_id : int or None
        index of the first entry of self.variables that equals the
        name, or None if there is no such entry

    """

    matches = (pos for pos, name in enumerate(self.variables)
               if variables == name)
    var_id = next(matches, None)
    if var_id is None and throw_exception:
        raise RuntimeError("Unknown variable name '{0}'".format(
            _unicode2str(variables)))
    return var_id
def print_n_trials(self, variables):
    """Print the number of trials in the combinations of the
    independent variables.

    Notes
    -----
    The function is for instance useful to quickly check the
    experimental design. The currently defined dependent and independent
    variables (self._dv, self._iv) are put back afterwards, also if the
    aggregation fails.

    Parameters
    ----------
    variables : str or list
        A string or a list of strings that represent the names of one
        or more data variables (aggregator.variables)

    """

    old_iv = self._iv
    old_dv = self._dv
    try:
        self.set_dependent_variables("n_trials")
        self.set_independent_variables(variables)
        result, varnames = self.aggregate()
        for row in result:
            # first column is reported as the subject
            print("Subject {0}".format(row[0]))
            for cnt, var in enumerate(varnames):
                if cnt > 0:
                    if isinstance(row[cnt], unicode):
                        _row_data = _unicode2str(row[cnt])
                    else:
                        _row_data = row[cnt]
                    # NOTE(review): var[4:] drops the first four
                    # characters of the name, presumably a prefix added
                    # by aggregate() -- confirm
                    print("\t{0}:\t{1}".format(var[4:], _row_data))
        print("\n")
    finally:
        # restore the design even if one of the calls above raised
        self._dv = old_dv
        self._iv = old_iv
def _add_exclusion(self, relation_syntax):
    """Add an exclusion.

    The syntax is parsed by self._parse_syntax. The parsed rule is
    appended to self._exclusions if its operator is one of
    self._relations; otherwise a RuntimeError is raised.

    """

    parsed = self._parse_syntax(relation_syntax, throw_exception=True)
    if parsed[1] not in self._relations:
        raise RuntimeError("Incorrect exclusion syntax: '{0}'".format(
            _unicode2str(relation_syntax)))
    self._exclusions.append(parsed)
def _add_dependent_variable(self, variable):
    """Add a dependent variable to self._dv.

    Parameters
    ----------
    variable : str
        either "n_trials" or a definition of the form
        "function(variable_name)", where function is one of
        self._dv_functions

    Raises
    ------
    RuntimeError
        if the definition has no "(" part, the variable name is unknown
        or the function is not in self._dv_functions

    """

    if variable == "n_trials":
        self._dv.append([variable, 0])
        return

    tmp = variable.replace(")", "").split("(")
    # explicit check instead of a bare except around tmp[1]
    if len(tmp) < 2:
        raise RuntimeError("Incorrect syntax for DV: '{0}'".format(
            _unicode2str(variable)))
    dv_fnc = tmp[0].strip()
    dv_txt = tmp[1].strip()

    # the variable is resolved first, so an unknown variable is reported
    # before an unknown function
    var_id = self._get_variable_id(dv_txt, True)
    if dv_fnc not in self._dv_functions:
        raise RuntimeError("Unknown function for dependent variable:" +
                           " '{0}'".format(_unicode2str(dv_fnc)))
    self._dv.append([dv_fnc, var_id])
def _get_variable_id(self, variables, throw_exception=False):
    """Return the position of a variable name in self.variables.

    Parameters
    ----------
    variables : str
        the variable name to look up
    throw_exception : bool, optional
        if True, an unknown name raises a RuntimeError instead of
        resulting in None (default=False)

    Returns
    -------
    var_id : int or None
        index of the first entry of self.variables that equals the
        name, or None if there is no such entry

    """

    matches = (pos for pos, name in enumerate(self.variables)
               if variables == name)
    var_id = next(matches, None)
    if var_id is None and throw_exception:
        raise RuntimeError("Unknown variable name '{0}'".format(
            _unicode2str(variables)))
    return var_id
def _add_exclusion(self, relation_syntax):
    """Add an exclusion.

    The syntax is parsed by self._parse_syntax. The parsed rule is
    appended to self._exclusions if its operator is one of
    self._relations; otherwise a RuntimeError is raised.

    """

    parsed = self._parse_syntax(relation_syntax, throw_exception=True)
    if parsed[1] not in self._relations:
        raise RuntimeError("Incorrect exclusion syntax: '{0}'".format(
            _unicode2str(relation_syntax)))
    self._exclusions.append(parsed)
def _add_dependent_variable(self, variable):
    """Add a dependent variable to self._dv.

    Parameters
    ----------
    variable : str
        either "n_trials" or a definition of the form
        "function(variable_name)", where function is one of
        self._dv_functions

    Raises
    ------
    RuntimeError
        if the definition has no "(" part, the variable name is unknown
        or the function is not in self._dv_functions

    """

    if variable == "n_trials":
        self._dv.append([variable, 0])
        return

    tmp = variable.replace(")", "").split("(")
    # explicit check instead of a bare except around tmp[1]
    if len(tmp) < 2:
        raise RuntimeError("Incorrect syntax for DV: '{0}'".format(
            _unicode2str(variable)))
    dv_fnc = tmp[0].strip()
    dv_txt = tmp[1].strip()

    # the variable is resolved first, so an unknown variable is reported
    # before an unknown function
    var_id = self._get_variable_id(dv_txt, True)
    if dv_fnc not in self._dv_functions:
        raise RuntimeError("Unknown function for dependent variable:" +
                           " '{0}'".format(_unicode2str(dv_fnc)))
    self._dv.append([dv_fnc, var_id])
def write_csv_file(filename, data, varnames=None, delimiter=','):
    """Write 2D data array to csv file.

    The first line of the file is a coding comment with the encoding of
    the default locale ("UTF-8" if it cannot be determined), optionally
    followed by one line with the variable names and one line per data
    row.

    Parameters
    ----------
    filename : str
        name (fullpath) of the data file
    data : list of list
        2D array with data (list of list)
    varnames : list of str, optional
        array of strings representing variable names
    delimiter : str, optional
        delimiter character (default=",")

    """

    _sys.stdout.write("write file: {0}".format(filename))
    try:
        _locale_enc = _locale.getdefaultlocale()[1]
    except Exception:
        # best effort: the locale lookup must not prevent writing
        _locale_enc = None
    if _locale_enc is None:
        # getdefaultlocale() may report no encoding without raising
        _locale_enc = "UTF-8"

    with open(filename, 'w') as f:
        header = "# -*- coding: {0} -*-\n".format(_locale_enc)
        f.write(header)
        if varnames is not None:
            for c, v in enumerate(varnames):
                if c > 0:
                    f.write(delimiter)
                f.write(_unicode2str(v))
            f.write("\n")
        cnt = 0
        for row in data:
            for c, v in enumerate(row):
                if c > 0:
                    f.write(delimiter)
                if isinstance(v, unicode):
                    # keep the converted value (the original discarded it)
                    v = _unicode2str(v)
                elif not isinstance(v, str):
                    # e.g. numbers: write their string representation
                    v = str(v)
                f.write(v)
                cnt += 1
            f.write("\n")

    print(" ({0} cells in {1} rows)".format(cnt, len(data)))
def write_csv_file(filename, data, varnames=None, delimiter=','):
    """Write 2D data array to csv file.

    The first line of the file is a coding comment with the encoding of
    the default locale ("UTF-8" if it cannot be determined), optionally
    followed by one line with the variable names and one line per data
    row.

    Parameters
    ----------
    filename : str
        name (fullpath) of the data file
    data : list of list
        2D array with data (list of list)
    varnames : list of str, optional
        array of strings representing variable names
    delimiter : str, optional
        delimiter character (default=",")

    """

    _sys.stdout.write("write file: {0}".format(filename))
    try:
        _locale_enc = _locale.getdefaultlocale()[1]
    except Exception:
        # best effort: the locale lookup must not prevent writing
        _locale_enc = None
    if _locale_enc is None:
        # getdefaultlocale() may report no encoding without raising
        _locale_enc = "UTF-8"

    with open(filename, 'w') as f:
        header = "# -*- coding: {0} -*-\n".format(_locale_enc)
        f.write(header)
        if varnames is not None:
            for c, v in enumerate(varnames):
                if c > 0:
                    f.write(delimiter)
                f.write(_unicode2str(v))
            f.write("\n")
        cnt = 0
        for row in data:
            for c, v in enumerate(row):
                if c > 0:
                    f.write(delimiter)
                if isinstance(v, unicode):
                    # keep the converted value (the original discarded it)
                    v = _unicode2str(v)
                elif not isinstance(v, str):
                    # e.g. numbers: write their string representation
                    v = str(v)
                f.write(v)
                cnt += 1
            f.write("\n")

    print(" ({0} cells in {1} rows)".format(cnt, len(data)))
def _add_compute_variable(self, compute_syntax):
    """Add a new variable to be computed.

    Parameters
    ----------
    compute_syntax : str
        definition of the form "new_name = expression". The expression
        is parsed by self._parse_syntax.

    Raises
    ------
    RuntimeError
        if the definition contains no assignment, the expression cannot
        be parsed or the variable name is already defined

    """

    # mask the comparison operators so that the assignment "=" can be
    # located unambiguously
    masked = compute_syntax.replace("==", "@@").replace("!=", "##")
    # split at the first "=" only; expressions with ">=" or "<=" would
    # otherwise be cut off
    parts = masked.split("=", 1)
    if len(parts) < 2:
        raise RuntimeError("Incorrect compute syntax: '{0}'".format(
            _unicode2str(compute_syntax)))
    variable_name = parts[0].strip()
    syntax = parts[1].strip()
    syntax = syntax.replace("@@", "==")
    # "##" stands for "!=" (the original restored it as "==")
    syntax = syntax.replace("##", "!=")

    variable_def = self._parse_syntax(syntax, throw_exception=True)
    # NOTE(review): with throw_exception=True the visible _parse_syntax
    # raises instead of returning None, so this fallback looks
    # unreachable -- confirm before relying on _parse_operation
    if variable_def is None:
        variable_def = self._parse_operation(syntax, throw_exception=True)

    if self._get_variable_id(variable_name) is not None:
        raise RuntimeError("Variable already defined '{0}'".format(
            _unicode2str(variable_name)))
    self._variables.append(variable_name)
    self._computes.append([variable_name, variable_def])
def _add_compute_variable(self, compute_syntax):
    """Add a new variable to be computed.

    Parameters
    ----------
    compute_syntax : str
        definition of the form "new_name = expression". The expression
        is parsed by self._parse_syntax.

    Raises
    ------
    RuntimeError
        if the definition contains no assignment, the expression cannot
        be parsed or the variable name is already defined

    """

    # mask the comparison operators so that the assignment "=" can be
    # located unambiguously
    masked = compute_syntax.replace("==", "@@").replace("!=", "##")
    # split at the first "=" only; expressions with ">=" or "<=" would
    # otherwise be cut off
    parts = masked.split("=", 1)
    if len(parts) < 2:
        raise RuntimeError("Incorrect compute syntax: '{0}'".format(
            _unicode2str(compute_syntax)))
    variable_name = parts[0].strip()
    syntax = parts[1].strip()
    syntax = syntax.replace("@@", "==")
    # "##" stands for "!=" (the original restored it as "==")
    syntax = syntax.replace("##", "!=")

    variable_def = self._parse_syntax(syntax, throw_exception=True)
    # NOTE(review): with throw_exception=True the visible _parse_syntax
    # raises instead of returning None, so this fallback looks
    # unreachable -- confirm before relying on _parse_operation
    if variable_def is None:
        variable_def = self._parse_operation(syntax, throw_exception=True)

    if self._get_variable_id(variable_name) is not None:
        raise RuntimeError("Variable already defined '{0}'".format(
            _unicode2str(variable_name)))
    self._variables.append(variable_name)
    self._computes.append([variable_name, variable_def])
def _add_variable_recoding(self, recode_syntax):
    """Add a new variable recoding rule.

    The syntax has the form "variable: old=new, old=new, ...". The
    variable id together with the list of [old, new] pairs is appended
    to self._recode. A RuntimeError is raised if the syntax does not
    have this form.

    """

    syntax_error = RuntimeError("Incorrect recoding syntax: '{0}'".format(
        _unicode2str(recode_syntax)))

    parts = recode_syntax.split(":")
    if len(parts) != 2:
        raise syntax_error

    # an unknown variable is reported before any malformed rule
    var_id = self._get_variable_id(parts[0].strip(), True)

    pairs = []
    for rule in parts[1].split(","):
        old_new = rule.split("=")
        if len(old_new) != 2:
            raise syntax_error
        pairs.append([old_new[0].strip(), old_new[1].strip()])

    self._recode.append([var_id, pairs])
def _add_variable_recoding(self, recode_syntax):
    """Add a new variable recoding rule.

    The syntax has the form "variable: old=new, old=new, ...". The
    variable id together with the list of [old, new] pairs is appended
    to self._recode. A RuntimeError is raised if the syntax does not
    have this form.

    """

    syntax_error = RuntimeError("Incorrect recoding syntax: '{0}'".format(
        _unicode2str(recode_syntax)))

    parts = recode_syntax.split(":")
    if len(parts) != 2:
        raise syntax_error

    # an unknown variable is reported before any malformed rule
    var_id = self._get_variable_id(parts[0].strip(), True)

    pairs = []
    for rule in parts[1].split(","):
        old_new = rule.split("=")
        if len(old_new) != 2:
            raise syntax_error
        pairs.append([old_new[0].strip(), old_new[1].strip()])

    self._recode.append([var_id, pairs])
def _parse_syntax(self, syntax, throw_exception):
    """Preprocess relation and operation syntax.

    The first entry of self._relations followed by self._operations that
    occurs in the syntax (not at its very beginning) is used to split
    it.

    Returns relation array [variable id, operator, right-hand text], or
    None if no operator is found and throw_exception is False.

    """

    candidates = _copy(self._relations)
    candidates.extend(self._operations)

    # find(...) > 0: the operator must be preceded by at least one
    # character
    found = next((ro for ro in candidates if syntax.find(ro) > 0), None)

    if found is not None:
        parts = syntax.split(found)
        var_id = self._get_variable_id(parts[0].strip(), True)
        return [var_id, found, parts[1].strip()]

    if throw_exception:
        raise RuntimeError("Incorrect syntax: '{0}'".format(
            _unicode2str(syntax)))
    return None
def _parse_syntax(self, syntax, throw_exception):
    """Preprocess relation and operation syntax.

    The first entry of self._relations followed by self._operations that
    occurs in the syntax (not at its very beginning) is used to split
    it.

    Returns relation array [variable id, operator, right-hand text], or
    None if no operator is found and throw_exception is False.

    """

    candidates = _copy(self._relations)
    candidates.extend(self._operations)

    # find(...) > 0: the operator must be preceded by at least one
    # character
    found = next((ro for ro in candidates if syntax.find(ro) > 0), None)

    if found is not None:
        parts = syntax.split(found)
        var_id = self._get_variable_id(parts[0].strip(), True)
        return [var_id, found, parts[1].strip()]

    if throw_exception:
        raise RuntimeError("Incorrect syntax: '{0}'".format(
            _unicode2str(syntax)))
    return None
def get_data(self, filename, recode_variables=True,
             compute_new_variables=True, exclude_trials=True):
    """Read data from a single Expyriment data file.

    Notes
    -----
    The function can be only applied on data of aggregator.data_files,
    that is, on the files in the defined data folder that start with
    the experiment name. According to the defined design, the result
    contains recoded data together with the new computed variables, and
    the subject variables from the headers of the Expyriment data files.

    Parameters
    ----------
    filename : str
        name of the Expyriment data file
    recode_variables : bool, optional
        set to False if defined variable recodings should not be applied
        (default=True)
    compute_new_variables : bool, optional
        set to False if new defined variables should not be computed
        (default=True)
    exclude_trials : bool, optional
        set to False if exclusion rules should not be applied
        (default=True)

    Returns
    -------
    data : numpy.array
    var_names : list
        list of variable names
    info
        subject info as returned by read_datafile
    comment
        comments in data as returned by read_datafile

    Raises
    ------
    RuntimeError
        if the file is not in the data list or a new variable cannot be
        computed

    """

    # check filename
    if filename not in self._data_files:
        raise RuntimeError("'{0}' is not in the data list\n".format(
            _unicode2str(filename)))

    data, _vnames, subject_info, comments = \
        read_datafile(self._data_folder + "/" + filename)
    print(" reading {0}".format(_unicode2str(filename)))

    # recoding is applied to the raw values, before the conversion to an
    # array of strings
    if recode_variables:
        for var_id, recoding in self._recode:
            for old, new in recoding:
                for row in data:
                    if row[var_id] == old:
                        row[var_id] = new
    data = _np.array(data, dtype='|U99')

    # compute new defined variables and append
    if compute_new_variables:
        for new_var_name, var_def in self._computes:
            operator = var_def[1]
            if operator in self._relations:
                # relations are true or false
                col = _np.zeros([data.shape[0], 1], dtype=int)
                idx = self._find_idx(data, var_def[0], operator,
                                     var_def[2])
                col[idx, 0] = 1
            else:
                # operations
                try:
                    a = _np.float64([data[:, var_def[0]]]).transpose()
                    second_var_id = self._get_variable_id(var_def[2],
                                                          False)
                    if second_var_id is not None:
                        b = _np.float64(
                            [data[:, second_var_id]]).transpose()
                    else:
                        b = _np.float64(var_def[2])
                except Exception:
                    msg = "Error while computing new variable {0}. " + \
                          "Non-number in variables of {1}"
                    # format the message (the original discarded the
                    # result of msg.format)
                    raise RuntimeError(msg.format(new_var_name, filename))
                if operator == "+":
                    col = a + b
                elif operator == "-":
                    col = a - b
                elif operator == "*":
                    col = a * b
                elif operator == "/":
                    col = a / b
                elif operator == "%":
                    col = a % b
                else:
                    # do not silently append a stale or undefined column
                    raise RuntimeError(
                        "Unknown operation '{0}' for new variable "
                        "{1}".format(operator, new_var_name))
            data = _np.concatenate((data, col), axis=1)

    # add subject information
    for sv in self.subject_variables:
        try:
            info = subject_info[sv]
        except Exception:
            # missing subject information is stored as "nan"
            info = "nan"
        col = _np.array([[info for _x in range(data.shape[0])]])
        data = _np.c_[data, col.transpose()]

    # _add_exclusion trials
    if exclude_trials:
        for exl in self._exclusions:
            # each rule is evaluated on the rows that are left after the
            # previous rules
            idx = self._find_idx(data, exl[0], exl[1], exl[2])
            if len(idx) > 0:
                data = _np.delete(data, idx, axis=0)

    var = _copy(self._variables)
    var.extend(self._subject_variables)
    return [data, var, subject_info, comments]
def _find_idx(self, data, column_id, relation, value):
    """Find the indices of elements in a data column.

    Notes
    -----
    It compares the column elements with a value or the elements of a
    second column, if value is a name of a variable. The method deals
    with numerical and string comparisons and throws an exception for
    invalid string comparisons.

    If value has the form "<factor>*std", a row is selected when its
    deviation from the mean of its combination of independent variables
    exceeds factor times the standard deviation of that combination
    (means and standard deviations are taken from self._dv_mean_std).

    Parameters
    ----------
    data : numpy.array
        the data
    column_id : int
        id of column to compare
    relation : str
        relation as string. possible relations:
        "==", "!=", ">", "<", ">=", "<=", "=>", "=<"
    value : numeric or string
        value to find or a variable name

    Returns
    -------
    idx : list or numpy.array
        indices of the rows for which the relation holds

    Raises
    ------
    RuntimeError
        if the relation cannot be applied

    """

    # is value a variable name
    second_var_id = self._get_variable_id(value, False)

    # _add_exclusion
    try:
        col = _np.float64(data[:, column_id])
    except Exception:
        # handling strings
        col = data[:, column_id]
    try:
        if second_var_id is not None:
            val = _np.float64(data[:, second_var_id])
        else:
            val = _np.float64(value)
    except Exception:
        # handling strings
        if second_var_id is not None:
            val = data[:, second_var_id]
        else:
            val = value

    if value.endswith("std") and (value.find("*") > 0):
        # remove relative depending std
        tmp = value.split("*")
        fac = float(tmp[0])
        mean_stds = self._dv_mean_std(data, column_id)
        idx = []
        if relation not in [">", "<", "=>", ">=", "=<", "<="]:
            raise RuntimeError("Incorrect syntax for " +
                               "exception: '{0} {1}'".format(
                                   _unicode2str(relation),
                                   _unicode2str(value)))
        for cnt, row in enumerate(data):
            # find name of combination
            combi_str = self.variables[column_id]
            for iv in self._iv:
                if isinstance(row[iv], unicode):
                    _row_data = _unicode2str(row[iv])
                else:
                    _row_data = row[iv]
                combi_str = combi_str + "_" + \
                    "{0}{1}".format(_unicode2str(self.variables[iv]),
                                    _row_data)
            deviation = float(row[column_id]) - mean_stds[combi_str][0]
            limit = fac * mean_stds[combi_str][1]
            # explicit branches: in the original "a or b and c"
            # expression "=>" and "=<" matched every row
            if relation == ">":
                selected = deviation > limit
            elif relation in ("=>", ">="):
                selected = deviation >= limit
            elif relation == "<":
                selected = deviation < -limit
            else:  # "=<" or "<="
                selected = deviation <= -limit
            if selected:
                idx.append(cnt)
        return idx
    else:
        if relation == "!=":
            comp = (col != val)
        elif relation == "==":
            comp = (col == val)
        elif relation == "<":
            comp = (col < val)
        elif relation == ">":
            comp = (col > val)
        elif relation == "=<" or relation == "<=":
            comp = (col <= val)
        elif relation == "=>" or relation == ">=":
            comp = (col >= val)
        else:
            comp = None  # should never occur
        # a plain bool instead of an element-wise result means that the
        # comparison could not be applied; None is an unknown relation
        if comp is None or isinstance(comp, bool):
            raise RuntimeError(
                "Incorrect syntax for " + "exception: '{0} {1}'".format(
                    _unicode2str(relation), _unicode2str(value)))
        return _np.flatnonzero(comp)
def _find_idx(self, data, column_id, relation, value):
    """Find the indices of elements in a data column.

    Notes
    -----
    It compares the column elements with a value or the elements of a
    second column, if value is a name of a variable. The method deals
    with numerical and string comparisons and throws an exception for
    invalid string comparisons.

    If value has the form "<factor>*std", a row is selected when its
    deviation from the mean of its combination of independent variables
    exceeds factor times the standard deviation of that combination
    (means and standard deviations are taken from self._dv_mean_std).

    Parameters
    ----------
    data : numpy.array
        the data
    column_id : int
        id of column to compare
    relation : str
        relation as string. possible relations:
        "==", "!=", ">", "<", ">=", "<=", "=>", "=<"
    value : numeric or string
        value to find or a variable name

    Returns
    -------
    idx : list or numpy.array
        indices of the rows for which the relation holds

    Raises
    ------
    RuntimeError
        if the relation cannot be applied

    """

    # is value a variable name
    second_var_id = self._get_variable_id(value, False)

    # _add_exclusion
    try:
        col = _np.float64(data[:, column_id])
    except Exception:
        # handling strings
        col = data[:, column_id]
    try:
        if second_var_id is not None:
            val = _np.float64(data[:, second_var_id])
        else:
            val = _np.float64(value)
    except Exception:
        # handling strings
        if second_var_id is not None:
            val = data[:, second_var_id]
        else:
            val = value

    if value.endswith("std") and (value.find("*") > 0):
        # remove relative depending std
        tmp = value.split("*")
        fac = float(tmp[0])
        mean_stds = self._dv_mean_std(data, column_id)
        idx = []
        if relation not in [">", "<", "=>", ">=", "=<", "<="]:
            raise RuntimeError("Incorrect syntax for " +
                               "exception: '{0} {1}'".format(
                                   _unicode2str(relation),
                                   _unicode2str(value)))
        for cnt, row in enumerate(data):
            # find name of combination
            combi_str = self.variables[column_id]
            for iv in self._iv:
                if isinstance(row[iv], unicode):
                    _row_data = _unicode2str(row[iv])
                else:
                    _row_data = row[iv]
                combi_str = combi_str + "_" + \
                    "{0}{1}".format(_unicode2str(self.variables[iv]),
                                    _row_data)
            deviation = float(row[column_id]) - mean_stds[combi_str][0]
            limit = fac * mean_stds[combi_str][1]
            # explicit branches: in the original "a or b and c"
            # expression "=>" and "=<" matched every row
            if relation == ">":
                selected = deviation > limit
            elif relation in ("=>", ">="):
                selected = deviation >= limit
            elif relation == "<":
                selected = deviation < -limit
            else:  # "=<" or "<="
                selected = deviation <= -limit
            if selected:
                idx.append(cnt)
        return idx
    else:
        if relation == "!=":
            comp = (col != val)
        elif relation == "==":
            comp = (col == val)
        elif relation == "<":
            comp = (col < val)
        elif relation == ">":
            comp = (col > val)
        elif relation == "=<" or relation == "<=":
            comp = (col <= val)
        elif relation == "=>" or relation == ">=":
            comp = (col >= val)
        else:
            comp = None  # should never occur
        # a plain bool instead of an element-wise result means that the
        # comparison could not be applied; None is an unknown relation
        if comp is None or isinstance(comp, bool):
            raise RuntimeError(
                "Incorrect syntax for " + "exception: '{0} {1}'".format(
                    _unicode2str(relation), _unicode2str(value)))
        return _np.flatnonzero(comp)
def reset(self, data_folder, file_name, suffix=_default_suffix,
          variables=None):
    """Reset the aggregator class and clear design.

    All design settings stored on the instance (dependent and independent
    variables, exclusions, computed variables, recodings, subject
    variables and added data) are emptied and the data folder is scanned
    for matching data files.

    Parameters
    ----------
    data_folder : str
        folder which contains the data of the subjects
    file_name : str
        name of the files. All files that start with this name
        will be considered for the analysis (cf. aggregator.data_files)
    suffix : str, optional
        if specified only files that end with this particular
        suffix will be considered (default=.xpd)
    variables : array of str, optional
        array of variable names, process only the specified variables

    Raises
    ------
    RuntimeError
        if a data file defines other variables than the first
        matching data file
    Exception
        if no matching data file is found in the data folder

    """

    self._data_folder = data_folder
    self._file_name = file_name
    self._data_files = []
    self._variables = []
    self._dv = []
    self._dv_txt = []
    self._iv = []
    self._iv_txt = []
    self._exclusions = []
    self._exclusions_txt = []
    self._computes = []
    self._computes_txt = []
    self._recode_txt = []
    self._recode = []
    self._subject_variables = []
    self._last_data = []
    self._added_data = []
    self._added_variables = []
    self._suffix = suffix

    for flname in _os.listdir(_os.path.dirname(self._data_folder + "/")):
        if not (flname.endswith(self._suffix) and
                flname.startswith(self._file_name)):
            continue
        # only the variable names are needed here; the data itself is
        # read again later on demand
        _data, vnames, _subject_info, _comments = read_datafile(
            self._data_folder + "/" + flname, read_variables=variables)
        if len(self._variables) < 1:
            # first matching file defines the reference variable list
            self._variables = vnames
        elif vnames != self._variables:
            # the file name is part of the message (the original format
            # string had no placeholder and dropped it)
            message = u"Different variables in {0}".format(flname)
            message = message + u"\n{0}".format(vnames)
            message = message + u"\ninstead of\n{0}".format(
                self._variables)
            raise RuntimeError(_unicode2str(message))
        self._data_files.append(flname)

    if len(self._data_files) < 1:
        raise Exception("No data files found in {0}".format(
            _unicode2str(self._data_folder)))

    print("found {0} subject_data sets".format(len(self._data_files)))
    print("found {0} variables: {1}".format(
        len(self._variables), [_unicode2str(x) for x in self._variables]))
def get_data(self, filename, recode_variables=True,
             compute_new_variables=True, exclude_trials=True):
    """Read data from a single Expyriment data file.

    Notes
    -----
    The function can be only applied on data of aggregator.data_files,
    that is, on the files in the defined data folder that start with
    the experiment name. According to the defined design, the result
    contains recoded data together with the new computed variables, and
    the subject variables from the headers of the Expyriment data files.

    Parameters
    ----------
    filename : str
        name of the Expyriment data file
    recode_variables : bool, optional
        set to False if defined variable recodings should not be applied
        (default=True)
    compute_new_variables : bool, optional
        set to False if new defined variables should not be computed
        (default=True)
    exclude_trials : bool, optional
        set to False if exclusion rules should not be applied
        (default=True)

    Returns
    -------
    data : numpy.array
    var_names : list
        list of variable names
    info
        subject info as returned by read_datafile
    comment
        comments in data as returned by read_datafile

    Raises
    ------
    RuntimeError
        if the file is not in the data list or a new variable cannot be
        computed

    """

    # check filename
    if filename not in self._data_files:
        raise RuntimeError("'{0}' is not in the data list\n".format(
            _unicode2str(filename)))

    data, _vnames, subject_info, comments = \
        read_datafile(self._data_folder + "/" + filename)
    print(" reading {0}".format(_unicode2str(filename)))

    # recoding is applied to the raw values, before the conversion to an
    # array of strings
    if recode_variables:
        for var_id, recoding in self._recode:
            for old, new in recoding:
                for row in data:
                    if row[var_id] == old:
                        row[var_id] = new
    data = _np.array(data, dtype='|S99')

    # compute new defined variables and append
    if compute_new_variables:
        for new_var_name, var_def in self._computes:
            operator = var_def[1]
            if operator in self._relations:
                # relations are true or false
                col = _np.zeros([data.shape[0], 1], dtype=int)
                idx = self._find_idx(data, var_def[0], operator,
                                     var_def[2])
                col[idx, 0] = 1
            else:
                # operations
                try:
                    a = _np.float64([data[:, var_def[0]]]).transpose()
                    second_var_id = self._get_variable_id(var_def[2],
                                                          False)
                    if second_var_id is not None:
                        b = _np.float64(
                            [data[:, second_var_id]]).transpose()
                    else:
                        b = _np.float64(var_def[2])
                except Exception:
                    msg = "Error while computing new variable {0}. " + \
                          "Non-number in variables of {1}"
                    # format the message (the original discarded the
                    # result of msg.format)
                    raise RuntimeError(msg.format(new_var_name, filename))
                if operator == "+":
                    col = a + b
                elif operator == "-":
                    col = a - b
                elif operator == "*":
                    col = a * b
                elif operator == "/":
                    col = a / b
                elif operator == "%":
                    col = a % b
                else:
                    # do not silently append a stale or undefined column
                    raise RuntimeError(
                        "Unknown operation '{0}' for new variable "
                        "{1}".format(operator, new_var_name))
            data = _np.concatenate((data, col), axis=1)

    # add subject information
    for sv in self.subject_variables:
        try:
            info = subject_info[sv]
        except Exception:
            # missing subject information is stored as "nan"
            info = "nan"
        col = _np.array([[info for _x in range(data.shape[0])]])
        data = _np.c_[data, col.transpose()]

    # _add_exclusion trials
    if exclude_trials:
        for exl in self._exclusions:
            # each rule is evaluated on the rows that are left after the
            # previous rules
            idx = self._find_idx(data, exl[0], exl[1], exl[2])
            if len(idx) > 0:
                data = _np.delete(data, idx, axis=0)

    var = _copy(self._variables)
    var.extend(self._subject_variables)
    return [data, var, subject_info, comments]