def process_model_fall(exp, tag, force=False):
    src_dp = DATA_PATH.joinpath("model-raw", "%s_%s.dpkg" % (exp, tag))
    dest_dp = DATA_PATH.joinpath("model", "%s_%s_fall.dpkg" % (exp, tag))

    if not src_dp.exists():
        raise IOError("Package '%s' does not exist" % src_dp.relpath())
    if dest_dp.exists() and not force:
        return

    # load the raw model data
    logger.info("Loading '%s'", src_dp)
    dp = dpkg.DataPackage.load(src_dp)
    dp.load_resources()

    # compute the number of blocks that moved
    resp = process_model_nmoved(dp)

    # the destination datapackage already exists, so just load it and
    # update it
    if dest_dp.exists():
        new_dp = dpkg.DataPackage.load(dest_dp)
        new_dp.bump_minor_version()
        r1 = new_dp.get_resource("model.csv")
        r2 = new_dp.get_resource("metadata")

    # the destination datapackage doesn't exist, so we need to create
    # it from scratch
    else:
        new_dp = dpkg.DataPackage(name=dest_dp.name, licenses=['odc-by'])
        new_dp['version'] = '1.0.0'
        new_dp.add_contributor("Jessica B. Hamrick", "*****@*****.**")
        new_dp.add_contributor("Peter W. Battaglia", "*****@*****.**")
        new_dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**")

        r1 = dpkg.Resource(name="model.csv", fmt="csv", pth="./model.csv")
        r2 = dpkg.Resource(name="metadata", fmt="json")
        r2['mediaformat'] = "application/json"
        new_dp.add_resource(r1)
        new_dp.add_resource(r2)

    # update the resource data
    r1.data = resp
    r2.data = dict(source=src_dp, nfell_min=0, nfell_max=10)

    # create destination folders, if they don't exist
    if not dest_dp.dirname().exists():
        dest_dp.dirname().makedirs_p()

    # save
    new_dp.save(dest_dp.dirname())
    logger.info("Saved to '%s'", dest_dp.relpath())
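# A minimal usage sketch (not part of the original script): the experiment
# and tag values below are hypothetical, and the call assumes the
# module-level DATA_PATH, logger, dpkg, and process_model_nmoved definitions
# that process_model_fall relies on.
#
#   process_model_fall("mass_inference-G", "ipe-sample", force=True)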
def get_table():
    dbpath = DATA_PATH.joinpath("human", "workers.db")
    if not dbtools.Table.exists(dbpath, "workers"):
        logger.info("Creating new table 'workers'")
        tbl = dbtools.Table.create(
            dbpath, "workers", [('pid', str), ('dataset', str)])
    else:
        logger.info("Loading existing table 'workers'")
        tbl = dbtools.Table(dbpath, "workers")
    return tbl
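# A minimal usage sketch (not part of the original script), mirroring the
# select/insert calls that save_stability makes below; the pid and dataset
# values here are hypothetical.
#
#   tbl = get_table()
#   tbl.insert([{'pid': 'A1B2C3D4', 'dataset': 'mass_inference-G.dpkg'}])
#   seen = tbl.select(['pid'], where=("dataset=?", "mass_inference-G.dpkg"))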
def save_stability(exp):
    dp_path = DATA_PATH.joinpath("human", "%s.dpkg" % exp)

    # load the datapackage
    logger.info("Loading '%s'", dp_path.relpath())
    dp = dpkg.DataPackage.load(dp_path)

    # is there a turk.csv file?
    try:
        turk = dp.load_resource("turk.csv")
    except KeyError:
        turk = None

    # is there a participants.csv file?
    try:
        parts = dp.load_resource("participants.csv")
    except KeyError:
        parts = None

    # get the worker ids
    if turk is not None:
        workers = sorted(turk.reset_index()['WorkerId'].unique())
    elif parts is not None:
        workers = sorted(parts['pid'].unique())
    else:
        logger.warning("'%s' is not a Mechanical Turk experiment", exp)
        return

    # create a new dataframe
    df = pd.DataFrame({'pid': workers})
    df['dataset'] = str(dp_path.name)

    # load the table we're saving it to
    tbl = get_table()

    KEY = ['pid']
    tbl_dupes = tbl.select(KEY, where=("dataset=?", dp_path.name))
    tbl_dupes['pid'] = map(str, tbl_dupes['pid'])
    tbl_dupes = set([x[0] for x in tbl_dupes.to_records(index=False).tolist()])
    df_dupes = set([x[0] for x in df[KEY].to_records(index=False).tolist()])

    # get the unique values and the duplicated values, because we will
    # treat them differently
    unique = pd.Index(df_dupes.difference(tbl_dupes), name="pid")
    df_idx = df.set_index(KEY)

    if len(unique) > 0:
        logger.info("Adding %d new items", len(unique))
        tbl.insert(df_idx.ix[unique].reset_index().T.to_dict().values())
def save_stability(exp, tag, force=False):
    dp_path = DATA_PATH.joinpath("model", "%s_%s_fall.dpkg" % (exp, tag))

    # load the stability data
    logger.info("Loading '%s'", dp_path.relpath())
    fb = get_stability(dp_path)
    if fb is None:
        return
    fb['dataset'] = str(dp_path.name)

    # load the table we're saving it to
    tbl = get_table()

    KEY = ['stimulus', 'kappa']
    tbl_dupes = tbl.select(KEY, where=("dataset=?", dp_path.name))
    tbl_dupes['stimulus'] = map(str, tbl_dupes['stimulus'])
    tbl_dupes = set(tbl_dupes.to_records(index=False).tolist())
    fb_dupes = set(fb[KEY].to_records(index=False).tolist())

    # get the unique values and the duplicated values, because we will
    # treat them differently
    unique = fb_dupes.difference(tbl_dupes)
    dupes = fb_dupes.intersection(tbl_dupes)
    fb_idx = fb.set_index(KEY)

    if len(unique) > 0:
        logger.info("Adding %d new items", len(unique))
        tbl.insert(fb_idx.ix[unique].reset_index().T.to_dict().values())

    if len(dupes) > 0 and force:
        logger.info("Updating %d old items", len(dupes))
        for dupe in dupes:
            newvals = fb_idx.ix[[dupe]].reset_index().irow(0).to_dict()
            newvals['kappa'] = float(newvals['kappa'])
            newvals['nfell'] = int(newvals['nfell'])
            newvals['stable'] = bool(newvals['stable'])
            tbl.update(newvals, where=(
                "stimulus=? AND kappa=? AND dataset=?",
                dupe + (str(dp_path.name),)))
def fetch(site_root, filename, experiment, force=False):
    """Download `filename` from `site_root` and save it in the
    human-raw/`experiment` data folder.

    """
    # get the url
    url = path(site_root).joinpath(filename)

    # get the destination to save the data, and don't do anything if
    # it exists already
    dest = DATA_PATH.joinpath("human-raw", experiment, "%s.csv" % url.name)
    if dest.exists() and not force:
        return

    # try to open it
    try:
        handler = urllib2.urlopen(url)
    except IOError as err:
        if getattr(err, 'code', None) == 401:
            logger.error("Server authentication failed.")
            raise err
        else:
            raise

    # download the data
    data = handler.read()
    logger.info("Fetched successfully: %s", url)

    # make the destination folder if it doesn't exist
    if not dest.dirname().exists():
        dest.dirname().makedirs_p()

    # write out the data file
    with open(dest, "w") as fh:
        fh.write(data)
    logger.info("Saved to '%s'", dest.relpath())
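# A minimal usage sketch (not part of the original script); the URL and
# experiment name are hypothetical, and the call assumes the module-level
# DATA_PATH, logger, path, and urllib2 imports used by fetch.
#
#   fetch("http://example.com/experiment/data", "trials",
#         "mass_inference-G", force=True)
#   # -> writes DATA_PATH/human-raw/mass_inference-G/trials.csv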
def find_bad_participants(exp, data):
    """Check participant data to make sure they pass the following
    conditions:

    1. No duplicated trials
    2. They finished the whole experiment
    3. They passed the posttest
    4. They have not already completed another version of the experiment

    Returns a list of participant dictionaries, each including a 'note'
    giving the reason the participant failed (or None if they passed).

    """
    participants = []
    for (assignment, pid), df in data.groupby(['assignment', 'pid']):
        info = {
            'pid': pid,
            'assignment': assignment,
            'note': None,
            'timestamp': None,
            'percent': 0.0,
            'num_failed': np.nan
        }

        # go ahead and add this to our list now -- the dictionary is
        # mutable, so when we update stuff later the dictionary in the
        # list will also be updated
        participants.append(info)

        # get the time they started the experiment
        times = df['psiturk_time'].copy()
        times.sort_values(inplace=True)
        start_time = pd.to_datetime(datetime.fromtimestamp(times.iloc[0] / 1e3))
        info['timestamp'] = start_time

        # add condition/counterbalance
        cond = int(df['condition'].unique())
        cb = int(df['counterbalance'].unique())
        info['condition'] = cond
        info['counterbalance'] = cb

        # check for duplicated entries
        if exp == 'mass_inference-G':
            dupes = df.sort_values(by='psiturk_time')[['mode', 'trial', 'trial_phase']]\
                .duplicated().any()
            if dupes:
                logger.warning("%s (%s, %s) has duplicate trials", pid, cond, cb)
                info['note'] = "duplicate_trials"
                continue

        # check to make sure they actually finished
        try:
            prestim = df\
                .set_index(['mode', 'trial', 'trial_phase'])\
                .groupby(level='trial_phase')\
                .get_group('prestim')
        except IndexError:
            if df['trial_phase'].isnull().all():
                incomplete = True
            else:
                raise
        else:
            if exp == 'mass_inference-G':
                num_trials = 62
            elif exp == 'mass_inference-H':
                num_trials = 62
            elif exp == 'mass_inference-I':
                num_trials = 32
            else:
                raise ValueError("unhandled experiment: %s" % exp)

            incomplete = len(prestim) != num_trials
            info['percent'] = 100 * len(prestim) / num_trials

        if incomplete:
            logger.warning(
                "%s (%s, %s) is incomplete (completed %d/%d trials [%.1f%%])",
                pid, cond, cb, len(prestim), num_trials, info['percent'])
            info['note'] = "incomplete"
            continue

        # check to see if they passed the posttest
        posttest = df\
            .set_index(['mode', 'trial', 'trial_phase'])\
            .groupby(level=['mode', 'trial_phase'])\
            .get_group(('posttest', 'fall_response'))
        truth = (posttest['nfell'] > 0).astype(float)
        resp = (posttest['response'] > 4).astype(float)
        resp[posttest['response'] == 4] = np.nan
        failed = (truth != resp).sum()
        info['num_failed'] = failed
        if failed > 1:
            logger.warning("%s (%s, %s) failed posttest (%d wrong)",
                           pid, cond, cb, failed)
            info['note'] = "failed_posttest"
            continue

        # see if they already did (a version of) the experiment
        dbpath = DATA_PATH.joinpath("human", "workers.db")
        tbl = dbtools.Table(dbpath, "workers")
        datasets = tbl.select("dataset", where=("pid=?", pid))['dataset']
        exps = map(lambda x: path(x).namebase, datasets)
        if exp in exps:
            exps.remove(exp)
        if len(exps) > 0:
            logger.warning("%s (%s, %s) is a repeat worker", pid, cond, cb)
            info['note'] = "repeat_worker"
            continue

    return participants
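# A minimal usage sketch (not part of the original script): `data` stands in
# for the trial-level DataFrame loaded elsewhere in this script, and the
# filter keeps only the entries that find_bad_participants flagged with a note.
#
#   info = find_bad_participants("mass_inference-G", data)
#   failed = [p for p in info if p['note'] is not None]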
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "-e", "--exp",
    required=True,
    help="Experiment version.")
parser.add_argument(
    "-f", "--force",
    action="store_true",
    default=False,
    help="Force all tasks to be put on the queue.")

args = parser.parse_args()

# paths to the data and where we will save it
data_path = DATA_PATH.joinpath("human-raw", args.exp)
dest_path = DATA_PATH.joinpath("human", "%s.dpkg" % args.exp)

# don't do anything if the datapackage already exists
if dest_path.exists() and not args.force:
    sys.exit(0)

# create the directory if it doesn't exist
if not dest_path.dirname().exists():
    dest_path.dirname().makedirs_p()

# load the data
meta, conds, fields = load_meta(data_path)
data, participants = load_data(data_path, conds, fields)
events = load_events(data_path)
data = hashids(data)
import os
import pickle
import re
import warnings
from copy import deepcopy

# External
import numpy as np

# Cogphysics
import cogphysics
import cogphysics.lib.hashtools as ht

# Scenesim
from scenesim.objects.sso import SSO
from scenesim.objects.pso import RBSO

# Local
from snippets import datapackage as dpkg
from mass import DATA_PATH

OLDPATH = DATA_PATH.joinpath("old/old-cogphysics-model-raw")
NEWPATH = DATA_PATH.joinpath("model-raw")

convtable_path = os.path.join(
    cogphysics.RESOURCE_PATH, "cpobj_conv_stability.pkl")
with open(convtable_path, "r") as fh:
    convtable = pickle.load(fh)

conv_cache = {}


def convert_name(oldname):
    global conv_cache
    if oldname in conv_cache:
        newname = conv_cache[oldname]
    else:
def process(exp, tag, overwrite=False):
    name = "%s_%s.dpkg" % (exp, tag)
    dp_path = DATA_PATH.joinpath("model-raw", name)
    if dp_path.exists() and not overwrite:
        return

    params, data = load(exp, tag)
    forces = params['forces']
    noises = params['noises']

    sim_meta = {
        'simulations': params['simulation'],
        'physics': params['physics'],
        'index_names': params['index_names'],
        'index_levels': params['index_levels'],
    }

    force_meta = {'index_names': ['phi', 'stimulus', 'sample']}
    force_meta['index_levels'] = {
        x: params['index_levels'][x] for x in force_meta['index_names']
    }

    noise_meta = {
        'index_names': ['sigma', 'stimulus', 'sample', 'object', 'position']
    }
    noise_meta['index_levels'] = {
        x: params['index_levels'][x] for x in noise_meta['index_names'][:-1]
    }
    noise_meta['index_levels']['position'] = ['x', 'y', 'z']

    # load the existing datapackage and bump the version
    if dp_path.exists():
        dp = dpkg.DataPackage.load(dp_path)
        dp.bump_minor_version()
        dp.clear_resources()

    # create the datapackage
    else:
        dp = dpkg.DataPackage(name=name, licenses=['odc-by'])
        dp['version'] = '1.0.0'
        dp.add_contributor("Jessica B. Hamrick", "*****@*****.**")
        dp.add_contributor("Peter W. Battaglia", "*****@*****.**")
        dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**")

    dp.add_resource(dpkg.Resource(
        name="simulations.npy", fmt="npy",
        data=data, pth="./simulations.npy"))
    dp.add_resource(dpkg.Resource(
        name="forces.npy", fmt="npy",
        data=forces, pth="./forces.npy"))
    dp.add_resource(dpkg.Resource(
        name="noises.npy", fmt="npy",
        data=noises, pth="./noises.npy"))

    sm = dpkg.Resource(name="simulation_metadata", fmt="json", data=sim_meta)
    sm['mediaformat'] = 'application/json'
    dp.add_resource(sm)

    fm = dpkg.Resource(name="force_metadata", fmt="json", data=force_meta)
    fm['mediaformat'] = 'application/json'
    dp.add_resource(fm)

    nm = dpkg.Resource(name="noise_metadata", fmt="json", data=noise_meta)
    nm['mediaformat'] = 'application/json'
    dp.add_resource(nm)

    dp.save(dp_path.dirname())
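# A minimal usage sketch (not part of the original script); the experiment
# and tag values are hypothetical, and the call assumes the module-level
# DATA_PATH, dpkg, and load definitions used by process.
#
#   process("mass_inference-G", "raw-sim", overwrite=False)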
#!/usr/bin/env python

import pandas as pd
import sys
import os

from mass import DATA_PATH

root = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.append(os.path.join(root, 'lib'))
import datapackage as dpkg

data_G_path = DATA_PATH.joinpath("human", "mass_inference-G.dpkg")
data_H_path = DATA_PATH.joinpath("human", "mass_inference-H.dpkg")
data_I_path = DATA_PATH.joinpath("human", "mass_inference-I.dpkg")
data_path = DATA_PATH.joinpath("human", "mass_inference-merged.dpkg")

dp_G = dpkg.DataPackage.load(data_G_path)
dp_H = dpkg.DataPackage.load(data_H_path)
dp_I = dpkg.DataPackage.load(data_I_path)

dp = dpkg.DataPackage(name=data_path.name, licenses=['odc-by'])
dp['version'] = '1.0.0'
dp.add_contributor("Jessica B. Hamrick", "*****@*****.**")
dp.add_contributor("Thomas L. Griffiths", "*****@*****.**")
dp.add_contributor("Peter W. Battaglia", "*****@*****.**")
dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**")

# add event data, and save it as csv
events_G = dp_G.load_resource("events.csv")
events_G['version'] = 'G'
events_H = dp_H.load_resource("events.csv")
import os
import pickle
from datetime import datetime

# External
import numpy as np
import pandas as pd
import yaml
import json
import dbtools
from path import path

# Cogphysics
import cogphysics
import cogphysics.lib.hashtools as ht

# Local
from snippets import datapackage as dpkg
from mass import DATA_PATH

OLDPATH = DATA_PATH.joinpath("old/old-cogphysics-human-raw-reorganized")
NEWPATH = DATA_PATH.joinpath("human")

convtable_path = os.path.join(
    cogphysics.RESOURCE_PATH, "cpobj_conv_stability.pkl")
with open(convtable_path, "r") as fh:
    convtable = pickle.load(fh)

conv_cache = {}


def convert_name(oldname):
    global conv_cache
    if oldname in conv_cache:
        newname = conv_cache[oldname]
    else: