# Create a list of user_ids to exclude based on marginal distribution
            exclude_id_based_on_marginal_distribution = calculate_marginal_distribution_for_each_user(project_short_name)

            # Filter the include_ids removing any that should be excluded based on marginal distributions
            include_ids = [id for id in include_ids if id not in exclude_id_based_on_marginal_distribution]

            gold_standard_data = define_gold_standard_data(project_short_name=project_short_name)
            expert_ids = define_gold_standard_ids(project_short_name=project_short_name)
            #combined_dict = build_combined_dict_keyed_on_composite_key(project_short_name=project_short_name, user_ids_to_include=include_ids, expert_project_short_name=gold_standard_data, expert_user_ids_to_include=expert_ids)

### CLEAN FROM HERE ###

            user_dict = create_individual_dict(project_short_name=project_short_name,user_ids_to_include=include_ids)
            if project_short_name == 'tb2-r2.0':
                ihc.get_gs_data_2r20()
            else:
                gs_dict = create_individual_dict(project_short_name=gold_standard_data, user_ids_to_include=expert_ids)

            list_of_composite_keys_in_both = []
            for k in gs_dict.keys():
                if (k in user_dict.keys()) and (':0:0' in k): #converts data stored by keys back to data stored by task/image
                    list_of_composite_keys_in_both.append(k)
            list_of_composite_keys_in_both.sort()

            for key in list_of_composite_keys_in_both:
                user_answers = pandas.DataFrame(user_dict[key]['ihc'])
                gs_answers = pandas.DataFrame(gs_dict[key]['ihc'])

                user_prop_mean = user_answers.sum(axis=1)['proportion']/len(user_answers.keys())
                expert_prop_mean = gs_answers.sum(axis=1)['proportion']/len(gs_answers.keys())
import IHCtools as ihc
from ihc_settings import s
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats.stats as sss
import scikits.bootstrap as bootstrap

# Re-seed numpy's global random number generator with fresh, unpredictable
# state. Calling np.random.seed() with no argument reseeds the generator that
# the module-level np.random.* functions draw from; constructing a separate
# np.random.RandomState(seed=None) object and discarding it would leave that
# global generator untouched.
np.random.seed()

# Disable truncated printing of arrays: with an infinite threshold numpy never
# abbreviates an array when printing it.
np.set_printoptions(threshold=np.inf)

########### Loading and organising data
### load numpy array of GS data (nImages*nMeasures*nExperts). Also returns imIDs to keep track of what image each row in expert_data_np refers to
datExp, AllimIDs = ihc.get_gs_data_2r20_numpy(s["expertIDs"])
if s["project_short_name"] == "tb2-r2.0":
    imIDs = list(AllimIDs.Utask_ID_PA)
elif s["project_short_name"] == "tb2-r2.0a":
    imIDs = list(AllimIDs.Utask_ID_a)
elif s["project_short_name"] == "tb2-r2.1":
    imIDs = list(AllimIDs.Utask_ID_2dot1)
elif s["project_short_name"] == "tb2-r2.1b":
    imIDs = list(AllimIDs.GStask_ID.astype(int))
else:
    raise Exception("add project to list to get imIDs")
print "number of images included:", len(imIDs)
### load numpy array of user data. Also returns userIDs in a list
(datUser, userIDs) = ihc.get_user_data(imIDs, s["project_short_name"])
### simulate user data from expert data
# (datUser,userIDs) = ihc.simulate_user_data(datExp)