import sys
import joblib
import numpy as np
import pandas as pd

sys.path.append('src')
from data_io import DataLoader
from metrics import tjur_score
# cross_val_predict_and_score and roc_auc_score_per_class are assumed to come
# from project-local modules; their import lines are not shown in this dump.


def classify_subjects_parallel(sub, subs, feature_spaces, model, cv):
    """ Helper function to parallelize the between-subject analysis across
    subjects: for each feature space, fit on all subjects except `sub` and
    validate on `sub`. """
    scores, coefs = [], dict()
    for fs in feature_spaces:

        if not isinstance(fs, (tuple, list)):
            fs = (fs, )

        fs_name = '+'.join(fs)

        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        dl.load_X(feature_set=fs, n_comp=100)
        X_val, y_val = dl.return_Xy()

        other_X, other_y = [], []
        other_subs = [s for s in subs if s != sub]
        for other_sub in other_subs:
            dl = DataLoader(sub=other_sub, log_level=30)
            dl.load_y(strategy_doubles='hard')
            dl.load_X(feature_set=fs, n_comp=100)
            this_X, this_y = dl.return_Xy()
            other_X.append(this_X)
            other_y.append(this_y)

        X = pd.concat(other_X, axis=0)
        y = pd.concat(other_y, axis=0)

        scores_, coefs_, model_ = cross_val_predict_and_score(
            estimator=model,
            X=X,
            y=y,
            cv=cv,
            scoring=roc_auc_score_per_class,
            X_val=X_val,
            y_val=y_val,
            per_class=True,
            return_model=True)
        joblib.dump(model_,
                    f'models/sub-{sub}_type-between_fs-{fs_name}_model.jl')

        dl.log.warning(
            f"sub-{sub} scores: {np.round(scores_, 2)} (fs = {fs_name})")
        scores_df = pd.DataFrame(scores_, columns=['score'])
        scores_df['feature_set'] = fs_name
        scores_df['emotion'] = dl.le.classes_
        scores_df['sub'] = sub
        scores.append(scores_df)

        coefs_df = pd.DataFrame(data=coefs_, columns=X.columns)
        coefs_df['feature_set'] = fs_name
        coefs_df['emotion'] = dl.le.classes_
        coefs_df['sub'] = sub
        coefs[fs_name] = coefs_df

    scores_df = pd.concat(scores, axis=0)
    return scores_df, coefs
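

# Usage sketch (an assumption, not part of the original scripts): the helper
# is designed to be fanned out over subjects with joblib. `subs`,
# `feature_spaces`, `model`, and `cv` stand in for whatever the driver
# script defines.
def _run_between_subject_analysis(subs, feature_spaces, model, cv, n_jobs=4):
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(classify_subjects_parallel)(sub, subs, feature_spaces, model, cv)
        for sub in subs
    )
    # Each call returns (scores_df, coefs); stack the score frames and keep
    # the coefficient dicts per subject
    scores = pd.concat([r[0] for r in results], axis=0)
    coefs = {sub: r[1] for sub, r in zip(subs, results)}
    return scores, coefs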


def classify_subjects_within_parallel(sub, feature_spaces, model, cv):
    """ Helper function to parallelize the within-subject analysis across
    subjects: for each feature space, cross-validate a model on the data
    of `sub` only. """
    scores, preds, coefs = [], [], dict()
    for fs in feature_spaces:

        if not isinstance(fs, (tuple, list)):
            fs = (fs,)
        
        fs_name = '+'.join(fs)

        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        dl.load_X(feature_set=fs, n_comp=100)
        X, y = dl.return_Xy()

        preds_, scores_, coefs_, model_ = cross_val_predict_and_score(
            estimator=model,
            X=X, y=y,
            cv=cv,
            scoring=tjur_score,
            between_sub=False,
            soft=True
        )
        joblib.dump(
            model_,
            f'models/sub-{sub}_analysis-within_split-train_fs-{fs_name}_model.jl')

        dl.log.warning(
            f"sub-{sub} scores: {np.round(scores_, 2)} (fs = {fs_name})")
        scores_df = pd.DataFrame(scores_, columns=['score'])
        scores_df['feature_set'] = fs_name
        scores_df['emotion'] = dl.le.classes_
        scores_df['sub'] = sub
        scores.append(scores_df)

        for i in range(len(preds_)):
            preds_[i]['feature_set'] = fs_name
            preds_[i]['sub'] = sub
            preds_[i]['rep'] = i
        
        preds.append(pd.concat(preds_, axis=0))

        coefs_df = pd.DataFrame(data=coefs_, columns=X.columns)
        coefs_df['feature_set'] = fs_name
        coefs_df['emotion'] = dl.le.classes_
        coefs_df['sub'] = sub
        coefs[fs_name] = coefs_df

    scores = pd.concat(scores, axis=0)
    preds = pd.concat(preds, axis=0)
    return preds, scores, coefs
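

# For reference, a rough sketch of what `tjur_score` (from src/metrics)
# presumably computes, following Tjur's (2009) coefficient of discrimination.
# This is an assumption about the implementation, not a copy of it; it expects
# a one-hot `y_true` and a probability matrix `y_pred`, both of shape
# (n_samples, n_classes).
def _tjur_score_sketch(y_true, y_pred, average='macro'):
    # Per class: mean predicted probability on positive samples minus
    # mean predicted probability on negative samples
    scores = np.array([
        y_pred[y_true[:, c] == 1, c].mean()
        - y_pred[y_true[:, c] == 0, c].mean()
        for c in range(y_true.shape[1])
    ])
    return scores if average is None else scores.mean()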


def classify_fs_parallel(subs, fs, model, cv):
    """ Helper function to parallelize the between-subject analysis across
    feature spaces, pooling the data of all subjects. """

    if not isinstance(fs, (tuple, list)):
        fs = (fs, )

    fs_name = '+'.join(fs)

    X, y = [], []
    for sub in subs:
        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        dl.load_X(feature_set=fs, n_comp=100)
        this_X, this_y = dl.return_Xy()
        X.append(this_X)
        y.append(this_y)

    X = pd.concat(X, axis=0)
    y = pd.concat(y, axis=0)

    preds_, scores_, coefs_, model_ = cross_val_predict_and_score(
        estimator=model, X=X, y=y, cv=cv, scoring=tjur_score, soft=True)

    # The model is fit on data pooled across all subjects, so the file is
    # tagged 'sub-all' rather than with a single (stale) subject ID
    joblib.dump(
        model_,
        f'models/sub-all_analysis-between_split-train_fs-{fs_name}_model.jl')

    dl.log.warning(
        f"between-subject scores: {np.round(scores_, 2)} (fs = {fs_name})")
    scores = pd.DataFrame(scores_, columns=['score'])
    scores['feature_set'] = fs_name
    scores['emotion'] = dl.le.classes_
    scores['sub'] = 'all'  # scores are based on data pooled across subjects

    for i in range(len(preds_)):
        preds_[i]['feature_set'] = fs_name
        preds_[i]['rep'] = i

    preds = pd.concat(preds_, axis=0)

    coefs = pd.DataFrame(data=coefs_, columns=X.columns)
    coefs['feature_set'] = fs_name
    coefs['emotion'] = dl.le.classes_

    return preds, scores, coefs
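

# Usage sketch (again an assumption): unlike the per-subject helpers, this one
# parallelizes over feature spaces, so the joblib loop iterates over
# `feature_spaces` and the three return values are unpacked with zip.
def _run_between_fs_analysis(subs, feature_spaces, model, cv, n_jobs=4):
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(classify_fs_parallel)(subs, fs, model, cv)
        for fs in feature_spaces
    )
    preds, scores, coefs = zip(*results)
    return (pd.concat(preds, axis=0), pd.concat(scores, axis=0),
            pd.concat(coefs, axis=0))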


import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

sys.path.append('src')
from data_io import DataLoader
from metrics import tjur_score

ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(np.arange(6)[:, np.newaxis])

subs = [str(s).zfill(2) for s in range(1, 14) if s != 11]
scores_all = []
for api in ['google', 'azure']:
    df = pd.read_csv(f'data/api-{api}_emoratings.tsv', sep='\t', index_col=0)
    scores = np.zeros((len(subs), 6))
    for i, sub in enumerate(subs):
        dl = DataLoader(sub=sub, log_level=30)
        dl.load_y(strategy_doubles='hard')
        y_api = df.loc[dl.y.index].values
        y_true = ohe.transform(dl.y.values[:, np.newaxis])
        scores[i, :] = tjur_score(y_true, y_api, average=None)

    scores = pd.DataFrame(scores, columns=dl.le.classes_,
                          index=subs).reset_index()
    scores = pd.melt(scores,
                     id_vars='index',
                     value_name='score',
                     var_name='emotion')
    scores = scores.rename({'index': 'sub'}, axis=1)
    scores['api'] = api
    scores_all.append(scores)

scores = pd.concat(scores_all, axis=0)


import sys
import os.path as op

import numpy as np
import pandas as pd

sys.path.append(op.abspath(op.dirname(op.dirname(__file__))))
from data_io import DataLoader
from noise_ceiling import compute_noise_ceiling
from metrics import tjur_score


subs = [str(s).zfill(2) for s in range(1, 14) if s != 11]
ceilings = np.zeros((len(subs), 6))
y_all = []
for i, sub in enumerate(subs):
    dl = DataLoader(sub=sub, log_level=30)
    y_doubles = dl.load_y(return_doubles=True)
    ceilings[i, :] = compute_noise_ceiling(
        y_doubles, soft=True, scoring=tjur_score)
    dl.log.warning(f"Ceiling sub-{sub}: {ceilings[i, :]}")

    # Note to self: between-subject NC only works with 'hard' labels,
    # otherwise you need to deal with two sources of "doubles"/inconsistency
    dl.load_y(return_doubles=False, strategy_doubles='hard')
    y_all.append(dl.y)

# Ceilings per subject
ceilings = pd.DataFrame(ceilings, columns=dl.le.classes_, index=subs)

# Ceiling across subjects
y = pd.concat(y_all, axis=0)
pd.get_dummies(y).to_csv('results/y_all.tsv', sep='\t')
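
# The between-subject ceiling itself is presumably computed downstream from
# results/y_all.tsv. A hedged sketch of that step, assuming
# compute_noise_ceiling accepts the pooled hard labels (stimuli repeated
# across subjects) the same way it accepts per-subject doubles:
ceiling_between = compute_noise_ceiling(y, soft=True, scoring=tjur_score)
dl.log.warning(f"Between-subject ceiling: {ceiling_between}")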