def get_table():
    """Open the 'workers' table in the human workers database.

    If the table does not exist yet, it is created with two string
    columns, 'pid' and 'dataset'; otherwise the existing table is
    loaded.
    """
    db_file = DATA_PATH.joinpath("human", "workers.db")
    if dbtools.Table.exists(db_file, "workers"):
        logger.info("Loading existing table 'workers'")
        return dbtools.Table(db_file, "workers")
    logger.info("Creating new table 'workers'")
    columns = [('pid', str), ('dataset', str)]
    return dbtools.Table.create(db_file, "workers", columns)
def get_table():
    """Open the 'stability' table in the CPO metadata database.

    If the table does not exist yet, it is created with columns for the
    stimulus name, kappa, number of blocks that fell, stability flag,
    and dataset; otherwise the existing table is loaded.
    """
    db_file = CPO_PATH.joinpath("metadata.db")
    if dbtools.Table.exists(db_file, "stability"):
        logger.info("Loading existing table 'stability'")
        return dbtools.Table(db_file, "stability")
    logger.info("Creating new table 'stability'")
    columns = [
        ('stimulus', str),
        ('kappa', int),
        ('nfell', int),
        ('stable', int),
        ('dataset', str),
    ]
    return dbtools.Table.create(db_file, "stability", columns)
def gen_config(exp, cond, phase, cb): fb, ratio = parse_cond(cond, phase) # load render script script = load_script(exp, cond, phase, cb) script['feedback'] = fb script['ratio'] = ratio script['counterbalance'] = cb # load metadata (e.g. stability) tbl = dbtools.Table(CPO_PATH.joinpath("metadata.db"), "stability") meta = pd.concat(map( lambda s, k: load_meta(tbl, s, k), script.stimulus, script.kappa)) # merge metadata and render script to get the config config = pd.merge(script, meta, on=['stimulus', 'kappa']) config = config.set_index('stimulus').sort().reset_index() # sanity check, to make sure ratios match kappas r2kappa = np.array(map(float, config.ratio)) assert (10**config.kappa == r2kappa).all() # dump dataframe to a dictionary and sort by stimulus trials = config.reset_index(drop=True).T.to_dict().values() trials.sort(cmp=lambda x, y: cmp(x['stimulus'], y['stimulus'])) # make sure there are no nans, because javascript won't load JSON # with nans in it for trial in trials: for key, val in trial.iteritems(): if isinstance(val, float) and np.isnan(val): if key in ("color0", "color1"): trial[key] = None # only return a list if there are multiple trials if len(trials) == 1: trials = trials[0] return trials
def _num_trials(exp):
    """Expected number of experiment (prestim) trials for each
    experiment version.

    Raises ValueError for an unknown experiment name.
    """
    if exp == 'mass_inference-G':
        return 62
    elif exp == 'mass_inference-H':
        return 62
    elif exp == 'mass_inference-I':
        return 32
    raise ValueError("unhandled experiment: %s" % exp)


def find_bad_participants(exp, data):
    """Check participant data to make sure they pass the following
    conditions:

    1. No duplicated trials
    2. They finished the whole experiment
    3. They passed the posttest

    Returns a list of per-participant dictionaries; a participant who
    failed a check has a non-None 'note' explaining why.
    """
    participants = []
    for (assignment, pid), df in data.groupby(['assignment', 'pid']):
        info = {
            'pid': pid,
            'assignment': assignment,
            'note': None,
            'timestamp': None,
            'percent': 0.0,
            'num_failed': np.nan
        }

        # go ahead and add this to our list now -- the dictionary is
        # mutable, so when we update stuff later the dictionary in the
        # list will also be updated
        participants.append(info)

        # get the time they started the experiment (earliest psiturk
        # timestamp, stored in milliseconds)
        times = df['psiturk_time'].copy()
        times.sort_values(inplace=True)
        start_time = pd.to_datetime(
            datetime.fromtimestamp(times.iloc[0] / 1e3))
        info['timestamp'] = start_time

        # add condition/counterbalance
        cond = int(df['condition'].unique())
        cb = int(df['counterbalance'].unique())
        info['condition'] = cond
        info['counterbalance'] = cb

        # check for duplicated entries
        if exp == 'mass_inference-G':
            dupes = df.sort_values(by='psiturk_time')[
                ['mode', 'trial', 'trial_phase']].duplicated().any()
            if dupes:
                logger.warning("%s (%s, %s) has duplicate trials",
                               pid, cond, cb)
                info['note'] = "duplicate_trials"
                continue

        # check to make sure they actually finished
        try:
            prestim = df\
                .set_index(['mode', 'trial', 'trial_phase'])\
                .groupby(level='trial_phase')\
                .get_group('prestim')
        except IndexError:
            if df['trial_phase'].isnull().all():
                # no trial phases were logged at all: treat as zero
                # completed trials (bugfix: previously prestim and
                # num_trials were left unbound here, so logging the
                # incomplete warning below raised NameError)
                incomplete = True
                prestim = []
                num_trials = _num_trials(exp)
            else:
                raise
        else:
            num_trials = _num_trials(exp)
            incomplete = len(prestim) != num_trials
            # NOTE(review): integer division under Python 2, so this
            # truncates -- confirm whether a float percentage is wanted
            info['percent'] = 100 * len(prestim) / num_trials

        if incomplete:
            # bugfix: the message previously hard-coded "/32" even
            # though num_trials may be 62
            logger.warning(
                "%s (%s, %s) is incomplete (completed %d/%d trials [%.1f%%])",
                pid, cond, cb, len(prestim), num_trials, info['percent'])
            info['note'] = "incomplete"
            continue

        # check to see if they passed the posttest
        posttest = df\
            .set_index(['mode', 'trial', 'trial_phase'])\
            .groupby(level=['mode', 'trial_phase'])\
            .get_group(('posttest', 'fall_response'))
        truth = (posttest['nfell'] > 0).astype(float)
        resp = (posttest['response'] > 4).astype(float)
        # a response of exactly 4 is the midpoint and counts neither
        # as correct nor incorrect
        resp[posttest['response'] == 4] = np.nan
        failed = (truth != resp).sum()
        info['num_failed'] = failed
        if failed > 1:
            logger.warning("%s (%s, %s) failed posttest (%d wrong)",
                           pid, cond, cb, failed)
            info['note'] = "failed_posttest"
            continue

        # see if they already did (a version of) the experiment
        dbpath = DATA_PATH.joinpath("human", "workers.db")
        tbl = dbtools.Table(dbpath, "workers")
        datasets = tbl.select("dataset", where=("pid=?", pid))['dataset']
        exps = map(lambda x: path(x).namebase, datasets)
        # the current experiment itself does not count as a repeat
        if exp in exps:
            exps.remove(exp)
        if len(exps) > 0:
            logger.warning("%s (%s, %s) is a repeat worker", pid, cond, cb)
            info['note'] = "repeat_worker"
            continue

    return participants
# NOTE(review): this fragment relies on names defined by the surrounding
# code -- `dataset`, `name`, `data`, `trials`, `extradata`,
# `dataset_type`, and `all_data` -- and on the process_* handlers being
# visible via locals().
pid = name[:-4]  # participant id: filename with its 4-char extension stripped

# load the stimulus-name mapping, if the info file exists
stiminfo_path = os.path.join(dataset, "stimuli-info.csv")
# bugfix: os.path.join returns a plain str, which has no .exists()
# method -- use os.path.exists instead
if os.path.exists(stiminfo_path):
    with open(stiminfo_path, "r") as fh:
        stims = [x for x in fh.read().split("\n") if x != '']
    # skip the header row; the first CSV column is the stimulus name
    stims = sorted([x.split(",")[0] for x in stims[1:]])
    stims = {"stim_%d" % (i + 1): x for i, x in enumerate(stims)}
else:
    stims = None

# order = dataset.split("_")[-1]

# look up this participant's condition and codes in the database
dbpath = os.path.join(dataset, "data.db")
tbl = dbtools.Table(dbpath, "Participants")
row = tbl[int(pid)]
condition = row.condition.values[0]
ccode = row.completion_code.values[0]
vcode = row.validation_code.values[0]

# record the completion/validation codes for this participant
extradata.setdefault('completion_codes', {})[pid] = ccode
extradata.setdefault('validation_codes', {})[pid] = vcode

# dispatch to the process_<dataset_type> handler for this dataset
process = locals()['process_%s' % dataset_type]
newdata, extradata_ = process(pid, data, trials, stims, condition)
all_data.update(newdata)