def inter_rater_reliability(bucket,
                            experimentfilter,
                            mldf,
                            pred,
                            outpth,
                            QualityBrightnessThreshold=None,
                            color=None):
    """
    Calculates Cohen's Kappa across raters
    Rhodri Cusack Trinity College Dublin 2018-03-11 [email protected]
    :param bucket: s3 bucket for auto coding to load
    :param experimentfilter: experiment to work on (path in s3 for coded results)
    :return:
    """

    if not color:
        color = 'r'

    s3 = boto3.resource('s3')
    s3bucket = s3.Bucket(bucket)

    # Download each subject and add to dataframe
    df = pd.DataFrame({'man_man_kappa': [], 'man_auto_kappa': []})
    for subjind, codobj in enumerate(
            s3bucket.objects.filter(Prefix=experimentfilter)):
        # Mirrors the summary location used elsewhere (not used in this function)
        outkey = os.path.join('coding_summary', experimentfilter,
                              'summary.pickle')

        man_man_kappa = []
        man_auto_kappa = []
        fn = s3tools.getpath({'S3Bucket': bucket, 'S3ObjectName': codobj.key})
        mldf_thissubj = mldf[mldf.index.str.match(codobj.key)]
        with open(fn, 'rb') as f:
            if QualityBrightnessThreshold is None or (
                    mldf_thissubj['QualityBrightness'] >=
                    QualityBrightnessThreshold).any():
                obj = pickle.load(f)
                m = [[
                    oneperson['code'] for oneperson in c['mancod_allraters']
                    if not oneperson['code'] is None
                ] for c in obj['coding']]
                m = np.array(m)
                a = pred[subjind]
                q = np.concatenate((np.array(a).reshape((-1, 1)), m), axis=1)
                for pair in combinations(range(np.size(m, 1)), 2):
                    man_man_kappa.append(
                        cohen_kappa_score(m[:, pair[0]], m[:, pair[1]]))
                for manind in range(np.size(m, 1)):
                    man_auto_kappa.append(cohen_kappa_score(m[:, manind], a))
                df = df.append(
                    pd.DataFrame(
                        {
                            'man_man_kappa': np.mean(man_man_kappa),
                            'man_auto_kappa': np.mean(man_auto_kappa)
                        },
                        index=[codobj.key]))

    with open(os.path.join(outpth, 'kappa.txt'), 'a') as fout:
        print(df, file=fout)
    return df
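

# A minimal sketch (not part of the original pipeline) of the pairwise-kappa idea
# used above: every pair of manual raters is compared with cohen_kappa_score and
# the mean is reported. The function name and the rater codes are invented for
# illustration only.
def _example_pairwise_kappa():
    from itertools import combinations

    import numpy as np
    from sklearn.metrics import cohen_kappa_score

    # rows = frames, columns = raters (hypothetical codes)
    ratings = np.array([[0, 0, 0],
                        [1, 1, 0],
                        [1, 1, 1],
                        [0, 0, 0],
                        [1, 0, 1]])
    kappas = [
        cohen_kappa_score(ratings[:, i], ratings[:, j])
        for i, j in combinations(range(ratings.shape[1]), 2)
    ]
    return np.mean(kappas)
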
def run_machine_learning(bucket,
                         experimentfilters,
                         outpth,
                         colorscheme=None,
                         possible_codes=None,
                         usemedianforcentering=False):
    """
    Runs leave-one-subject-out machine learning

    :param bucket: bucket to work
    :param experimentfilters: list of paths to coding_summary
    :return:
    """
    # Colorscheme
    if not colorscheme:
        colorscheme = ['r', 'g', 'b']

    # Connect to s3 (the summary files themselves are fetched via s3tools below)
    s3 = boto3.resource('s3')
    s3bucket = s3.Bucket(bucket)

    classifycols = [
        'Pitch', 'Roll', 'Yaw', 'leftPupil', 'rightPupil', 'eyeLeft',
        'eyeRight', 'EyesOpenValue', 'EyesOpenConfidence'
    ]

    # All experiments
    allmldf = pd.DataFrame()

    # Load summary file
    allpred = []
    for spind, sp in enumerate(experimentfilters):
        fn = s3tools.getpath({
            'S3Bucket': bucket,
            'S3ObjectName': os.path.join(sp, 'summary.pickle')
        })
        with open(fn, 'rb') as f:
            obj = pickle.load(f)

            # Drop frames with missing values (no usable face detected)
            for testsubjind, testdata in enumerate(obj):
                obj[testsubjind]['df_dropna'] = testdata['df'].dropna()

            # Centre the classification columns
            for testsubjind, testdata in enumerate(obj):
                if usemedianforcentering:
                    testdata['df_zerocentre'] = testdata['df_dropna'][
                        classifycols].subtract(
                            testdata['df_dropna'][classifycols].median())
                else:
                    testdata['df_zerocentre'] = testdata['df_dropna'][
                        classifycols].subtract(
                            testdata['df_dropna'][classifycols].mean())

            # Leave-one-subject-out classification
            pred = []
            mldf = pd.DataFrame()
            for testsubjind, testsubj in enumerate(obj):
                testlabels = testsubj['df_dropna']['mancod']
                testfeat = testsubj['df_zerocentre']

                # Get TRAIN data from all but one subject
                trainlabels = pd.concat([
                    x[1]['df_dropna']['mancod'] for x in enumerate(obj)
                    if not x[0] == testsubjind
                ])
                trainfeat = pd.concat([
                    x[1]['df_zerocentre'] for x in enumerate(obj)
                    if not x[0] == testsubjind
                ])

                # Run and test the classifier. A quadratic discriminant is used because look/don't-look is a non-linear function of position
                clf = QuadraticDiscriminantAnalysis()
                clf.fit(trainfeat, trainlabels)
                prednonan = clf.predict(testfeat)

                # We've filtered out nans before predicting.
                # Put them back in before storing, so indices of pred correspond to data before filtering out nans
                predallrows = np.ones((testsubj['df'].shape[0]))
                predallrows[testsubj['df'].notnull().all(axis=1)] = prednonan
                pred.append(predallrows)

                # First index (rows) are the "truth" of manual coding in testlabels
                cnf = metrics.confusion_matrix(testlabels,
                                               prednonan,
                                               labels=possible_codes)
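                # With labels=possible_codes, rows of cnf are the manual code and
                # columns the prediction:
                #   cnf[0, 0] correct rejections   cnf[0, 1] false alarms
                #   cnf[1, 0] misses               cnf[1, 1] hits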

                # Adjust for trials in which no face was detected
                # In absence of face, machine coding defaults to option 0 - for experiment 1, no face; for experiment 2, left
                rowisnan = testsubj['df'].isnull().any(axis=1)
                cnf[0, 0] = cnf[0, 0] + (testsubj['df']['mancod'][rowisnan]
                                         == possible_codes[0]).sum()
                cnf[1, 0] = cnf[1, 0] + (testsubj['df']['mancod'][rowisnan]
                                         == possible_codes[1]).sum()

                # Signal detection theory correction: if hits or false alarms are 0, replace with half a trial; likewise cap at n - 0.5 at the maximum end
                fa = cnf[0, 1] if not cnf[0, 1] == 0 else 0.5
                hits = cnf[1, 1] if not cnf[1, 1] == 0 else 0.5
                n0 = cnf[0, :].sum()
                n1 = cnf[1, :].sum()
                hits = hits if not hits == n1 else n1 - 0.5
                fa = fa if not fa == n0 else n0 - 0.5
                hits = hits / n1
                fa = fa / n0
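                # Worked example of the half-trial correction: with n1 = 20 "look"
                # frames and 20 hits (hits == n1), hits becomes 19.5, a hit rate of
                # 0.975 rather than 1.0; with 0 false alarms out of n0 = 30, fa
                # becomes 0.5, a rate of about 0.017 rather than 0. This keeps
                # norm.ppf finite when d-prime is computed later.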

                # Summary statistics
                proponeface = len(testsubj['df_dropna']) / len(testsubj['df'])
                mldf = mldf.append(
                    pd.DataFrame(
                        {
                            'proponeface':
                            proponeface,
                            'score':
                            clf.score(testfeat, testlabels),
                            'fa':
                            fa,
                            'hits':
                            hits,
                            'n0':
                            n0,
                            'n1':
                            n1,
                            'Confidence':
                            testsubj['df']['Confidence'].mean(skipna=True),
                            'QualitySharpness':
                            testsubj['df']['QualitySharpness'].mean(
                                skipna=True),
                            'QualityBrightness':
                            testsubj['df']['QualityBrightness'].mean(
                                skipna=True),
                            'deltat':
                            testsubj['deltat'],
                            'fps':
                            testsubj['fps'],
                            'dur':
                            testsubj['dur'],
                        },
                        index=[testsubj['S3ObjectName']]))
                plt.figure("boundingbox")
                ax = plt.subplot(111)
                bb = testsubj['df'][[
                    'BoundingBoxLeft', 'BoundingBoxTop', 'BoundingBoxWidth',
                    'BoundingBoxHeight'
                ]].mean()
                ax.add_patch(
                    mpatches.Rectangle(
                        (bb[0], bb[1] - bb[3]),
                        bb[2],
                        bb[3],
                        edgecolor=(1 - proponeface, proponeface, 0),
                        fill=False))
                # print(mldf.describe())

            plt.figure("ROC")
            # Add d-prime lines
            for dprime in np.arange(3):
                fan = np.arange(-5, 5 - dprime, 0.1)
                hitsn = fan + dprime
                plt.plot(norm.cdf(fan),
                         norm.cdf(hitsn),
                         linestyle='dashed',
                         color='gray',
                         alpha=0.5)
            # Add hits and fa
            plt.scatter(data=mldf,
                        x='fa',
                        y='hits',
                        s=64 * mldf['proponeface'],
                        color=colorscheme[spind])

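            # d-prime is z(hit rate) - z(false-alarm rate); e.g. hits = 0.84 and
            # fa = 0.16 give norm.ppf(0.84) - norm.ppf(0.16), roughly 2.0.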
            mldf['dprime'] = norm.ppf(mldf['hits']) - norm.ppf(mldf['fa'])
            plt.figure('Confidence-dprime')
            plt.scatter(x='Confidence',
                        y='dprime',
                        data=mldf,
                        color=colorscheme[spind])
            plt.figure('QualitySharpness-dprime')
            plt.scatter(x='QualitySharpness',
                        y='dprime',
                        data=mldf,
                        color=colorscheme[spind])
            plt.figure('QualityBrightness-dprime')
            plt.scatter(x='QualityBrightness',
                        y='dprime',
                        data=mldf,
                        color=colorscheme[spind])
            plt.figure('proponeface-dprime')
            plt.scatter(x='proponeface',
                        y='dprime',
                        data=mldf,
                        color=colorscheme[spind])

            # Store across experiments
            mldf['spind'] = spind
            allmldf = allmldf.append(mldf)
            allpred.append(pred)

    fig = plt.figure("ROC")
    plt.xlabel('False alarm rate')
    plt.ylabel('Hit rate')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    fig.savefig(os.path.join(outpth, 'ROC.pdf'), format='pdf')

    fig = plt.figure("boundingbox")
    plt.xlim([-0.3, 1.3])
    plt.ylim([-0.3, 1.3])
    ax.add_patch(mpatches.Rectangle((0, 0), 1, 1, alpha=0.1))
    fig.savefig(os.path.join(outpth, 'boundingbox.pdf'), format='pdf')

    fig = plt.figure('Confidence-dprime')
    sns.regplot(x='Confidence',
                y='dprime',
                marker="",
                line_kws={'color': '0.5'},
                data=allmldf,
                dropna=True)
    plt.xlabel('Confidence of Face Detection')
    plt.ylabel('d-prime')
    fig.savefig(os.path.join(outpth, 'Confidence-dprime.pdf'), format='pdf')

    fig = plt.figure('QualitySharpness-dprime')
    sns.regplot(x='QualitySharpness',
                y='dprime',
                marker="",
                line_kws={'color': '0.5'},
                data=allmldf,
                dropna=True)
    plt.xlabel('Quality - Sharpness')
    plt.ylabel('d-prime')
    fig.savefig(os.path.join(outpth, 'QualitySharpness-dprime.pdf'),
                format='pdf')

    fig = plt.figure('QualityBrightness-dprime')
    sns.regplot(x='QualityBrightness',
                y='dprime',
                marker="",
                line_kws={'color': '0.5'},
                data=allmldf,
                dropna=True)
    plt.xlabel('Quality - Brightness')
    plt.ylabel('d-prime')
    fig.savefig(os.path.join(outpth, 'QualityBrightness-dprime.pdf'),
                format='pdf')

    fig = plt.figure('proponeface-dprime')
    sns.regplot(x='proponeface',
                y='dprime',
                marker="",
                line_kws={'color': '0.5'},
                data=allmldf,
                dropna=True)
    plt.xlabel('PropOneFace')
    plt.ylabel('d-prime')
    fig.savefig(os.path.join(outpth, 'proponeface-dprime.pdf'), format='pdf')

    plt.show()

    return {'allpred': allpred, 'allmldf': allmldf}
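

# A hedged usage sketch of how run_machine_learning and inter_rater_reliability
# might be chained; the bucket name, prefixes, output path and possible_codes
# below are placeholders, not values from the original project.
#
#     ml = run_machine_learning('my-bucket',
#                               ['coding_summary/experiment1',
#                                'coding_summary/experiment2'],
#                               '/tmp/out',
#                               possible_codes=[0, 1])
#     for spind, expfilter in enumerate(['coding/experiment1',
#                                        'coding/experiment2']):
#         mldf = ml['allmldf'][ml['allmldf']['spind'] == spind]
#         inter_rater_reliability('my-bucket', expfilter, mldf,
#                                 ml['allpred'][spind], '/tmp/out')
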
def process_rekognition_video(bucket, compmsg, doevenifdone=False):
    """
    Extract details from processed video,
    :param bucket: S3 bucket for data
    :param compmsg: SQS message returned by rekogntion
    :param doevenifdone: Do again even if output already present
    :return:
    """

    # Get result from rekognition
    rekognition = boto3.client('rekognition')
    jobid = compmsg['JobId']
    vid = compmsg['Video']

    if vid['S3ObjectName'][-12:] == '_lighter.mp4':
        print("Skipping lighter video %s" % vid['S3ObjectName'])
        return

    # Annotated video filename
    outkey_annotated = "annotated/" + vid['S3ObjectName']
    outfn_annotated = s3tools.getcacheoutpath(outkey_annotated)

    # Coding filename
    outkey_coding = "coding/" + os.path.splitext(
        vid['S3ObjectName'])[0] + '.pickle'
    outfn_coding = os.path.join(Path.home(), ".s3cache-out", outkey_coding)

    s3bucket = boto3.resource('s3').Bucket(vid['S3Bucket'])
    s3client = boto3.client('s3')
    if doevenifdone or 'Contents' not in s3client.list_objects(
            Bucket=vid['S3Bucket'], Prefix=outkey_coding):
        try:
            if jobid is not None:
                response = rekognition.get_face_detection(JobId=jobid)

                assert response[
                    'JobStatus'] == 'SUCCEEDED', "Rekognition job status not SUCCEEDED but %s" % response[
                        'JobStatus']

                allfaces = response['Faces']
                while 'NextToken' in response:
                    response = rekognition.get_face_detection(
                        JobId=jobid, NextToken=response['NextToken'])
                    allfaces.extend(response['Faces'])
                print("%d faces detected" % len(allfaces))
            else:
                allfaces = compmsg['allfaces']

            # Work out what sampling interval Rekognition seems to be using
            ts = [face['Timestamp'] for face in allfaces]
            difft = np.ediff1d(ts)
            difft = [x for x in difft if not x == 0]
            deltat = stats.mode(difft).mode
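            # For illustration: if the face timestamps were [0, 200, 200, 400, 600]
            # (two faces in one sampled frame), the nonzero diffs are [200, 200, 200]
            # and deltat comes out as 200 ms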
            print("Delta t is %f" % deltat)

            # Get the behavioural file that corresponds to the video
            behav = find_behav_for_video(vid)

            if not behav['matches']:
                print("No behavioural file found to correspond to %s" %
                      vid['S3ObjectName'])
                return False

            exp = []
            for rater in behav['matches']:
                pth = s3tools.getpath({
                    'S3Bucket': behav['S3Bucket'],
                    'S3ObjectName': rater
                })
                exp.append(behav['experiment'](pth))

            print(behav)

            # Get the video
            v = videotools.Video(vid)
            if v._pth is None or not os.path.exists(v._pth):
                print("Video not found")
                return False
            else:
                v.open()
                dur = v.get_dur()
                fps = v.get_fps()
                print("Dur %f and FPS %f" % (dur, fps))

                timestamps = [face['Timestamp'] for face in allfaces]

                # Make directory if necessary
                dirname = os.path.dirname(outfn_coding)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                writer = skvideo.io.FFmpegWriter(outfn_annotated)

                facetimes = [face['Timestamp'] for face in allfaces]

                firstface = 0
                lastface = 0

                # Store all coding results
                coding = []

                while v.isopen:
                    # Get frame
                    img = v.get_next_frame()

                    # End of video
                    if img is None:
                        break

                    currtime_ms = np.round(v.currtime * 1000)

                    # Move forwards the first face we need to consider, if appropriate
                    while firstface < len(
                            facetimes) and facetimes[firstface] < (
                                currtime_ms - deltat / 2 +
                                1):  # 1 ms buffer for rounding errors
                        firstface += 1

                    # Add all faces up to the last one we need to consider
                    faces = []
                    for ind in range(firstface, len(facetimes) - 1):
                        if facetimes[ind] >= (currtime_ms + deltat / 2):
                            break
                        faces.append(ind)

                    # Count them and set up colours
                    countfaces = len(faces)
                    cols = [(0, 0, 255, 128)] * countfaces

                    # Mark one or more infant faces in green
                    infantfaces = []
                    infantind = []
                    for i0, faceind in enumerate(faces):
                        if allfaces[faceind]['Face']['AgeRange']['Low'] < 10:
                            infantind.append(
                                i0)  # which of elements in faces are infants
                            infantfaces.append(allfaces[faceind]['Face'])
                            cols[i0] = (255, 0, 0, 128)

                    # If two largely overlapping faces are found, they must be the same one, so delete one
                    if len(infantind) == 2:
                        bb0 = allfaces[faces[
                            infantind[0]]]['Face']['BoundingBox']
                        bb1 = allfaces[faces[
                            infantind[1]]]['Face']['BoundingBox']
                        dx = bb0['Left'] - bb1['Left']
                        mw = 0.5 * (bb0['Width'] + bb1['Width'])
                        dy = bb0['Top'] - bb1['Top']
                        mh = 0.5 * (bb0['Height'] + bb1['Height'])

                        if np.sqrt((dx / mw)**2 + (dy / mh)**2
                                   ) < 0.1:  # shifted by less than 10% of size
                            infantind = [infantind[0]]
                            infantfaces = [infantfaces[0]]
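
                    # Worked example of the overlap test: boxes with Left 0.40 vs
                    # 0.42, Top 0.30 vs 0.31 and mean Width/Height 0.25 give
                    # sqrt((0.02 / 0.25)**2 + (0.01 / 0.25)**2), about 0.09 < 0.1,
                    # so the two detections are treated as the same infant face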

                    # Automatic scoring
                    autocod = exp[0].score_face(infantfaces)
                    if len(infantind) == 1:
                        cols[infantind[0]] = autocod['colour']

                    # Annotate faces
                    img = faceannotation.markfaces(
                        [allfaces[item] for item in faces], img, cols)
                    img = faceannotation.marklandmarks(
                        [allfaces[item] for item in faces], img)
                    img = faceannotation.markeyesclosed(
                        [allfaces[item] for item in faces], img)

                    # Get manual coding average
                    # Annotate manual coding status on border of image
                    mancod_allraters = []
                    coltot = (0, 0, 0, 0)
                    for singlerater in exp:
                        mancod_allraters.append(
                            singlerater.get_mancod_state(currtime_ms))
                        coltot = np.array(coltot) + np.array(
                            mancod_allraters[-1]['colour'])
                    colmean = tuple(map(int, coltot / len(exp)))
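                    # e.g. two raters coloured (255, 0, 0, 128) and (0, 255, 0, 128)
                    # give a mean border colour of (127, 127, 0, 128)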

                    # Outer border, coloured appropriately
                    img = faceannotation.markmanual(img, colmean)

                    # Write annotated frame
                    writer.writeFrame(img)

                    # Store the coding results
                    coding.append({
                        'autocod': autocod,
                        'mancod_allraters': mancod_allraters,
                        'faces': faces,
                        'infantind': infantind
                    })

                writer.close()

                # Frame-by-frame manual coding - modal value across raters
                m = [
                    stats.mode([
                        oneperson['code']
                        for oneperson in c['mancod_allraters']
                        if not oneperson['code'] is None
                    ]) for c in coding
                ]
                m = [x.mode[0] if len(x.mode) >= 1 else None
                     for x in m]  # Correct for frames with no ratings
                m = [0 if x is None else x
                     for x in m]  # No ratings, set code to zero
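                # e.g. rater codes [1, 1, 2] for a frame give the modal value 1;
                # a frame that nobody coded yields None above and is set to 0 here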

                # Frame-by-frame auto coding
                a = [c['autocod']['code'] for c in coding]
                a = [0 if x is None else x
                     for x in a]  # No faces, set code to zero

                # Possible codes and descriptions
                possible_codes = exp[0].possible_codes()

                # Calculate confusion matrix
                conf = metrics.confusion_matrix(m,
                                                a,
                                                labels=possible_codes['code'])
                print(exp[0].possible_codes()['desc'])
                print(conf)

                # Create summary dict
                summary = {
                    'coding': coding,
                    'conf': conf,
                    'possible_codes': possible_codes,
                    'allfaces': allfaces,
                    'behav': behav,
                    'vid': vid,
                    'compmsg': compmsg,
                    'deltat': deltat,
                    'fps': fps,
                    'dur': dur
                }

                # Write coding file
                with open(outfn_coding, 'wb') as f:
                    pickle.dump(summary, f)

                # Write annotated video and coding files to s3
                s3bucket.upload_file(outfn_annotated, outkey_annotated)
                s3bucket.upload_file(outfn_coding, outkey_coding)

                return True
        except ClientError as e:
            if e.response['Error']['Code'] == 'ResourceNotFoundException':
                print("No response from rekognition available for jobid %s" %
                      compmsg['JobId'])
                return False
            else:
                raise
    else:
        print("Not repeating previously annotated %s" % outkey_coding)
        return False
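

# Shape of the completion message this function expects, inferred from how
# compmsg is read above; the field values here are placeholders. When 'JobId'
# is None, the detections are read from compmsg['allfaces'] instead.
#
#     compmsg = {
#         'JobId': '1234567890abcdef',
#         'Video': {'S3Bucket': 'my-bucket',
#                   'S3ObjectName': 'videos/subject01.mp4'},
#     }
#     process_rekognition_video('my-bucket', compmsg)
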
def post_rekognition_summary(bucket, experimentfilter):
    """
    Aggregates data across subjects from auto and manual coding
    Rhodri Cusack Trinity College Dublin 2018-03-11 [email protected]
    :param bucket: s3 bucket for auto coding to load
    :param experimentfilter: experiment to work on (path in s3 for coded results)
    :return:
    """
    # Connect to the s3 bucket containing the coded results
    s3 = boto3.resource('s3')
    s3bucket = s3.Bucket(bucket)

    # Columns to be extracted
    pose_columns = ['Pitch', 'Yaw', 'Roll']
    landmark_columns = [['leftPupil', 'X'], ['rightPupil', 'X'],
                        ['eyeLeft', 'X'], ['eyeRight', 'X']]
    bb_columns = ['Top', 'Left', 'Width', 'Height']

    df = []

    allagerangelow = []
    allagerangehigh = []

    # Download each subject and add to dataframe
    for codobj in s3bucket.objects.filter(Prefix=experimentfilter):
        # This is where the output will be written
        outkey = os.path.join('coding_summary', experimentfilter,
                              'summary.pickle')

        fn = s3tools.getpath({'S3Bucket': bucket, 'S3ObjectName': codobj.key})
        df.append({
            'S3Bucket':
            bucket,
            'S3ObjectName':
            codobj.key,
            'df':
            pd.DataFrame(columns=pose_columns +
                         [x[0] for x in landmark_columns] + ['mancod'])
        })
        with open(fn, 'rb') as f:
            obj = pickle.load(f)

            df[-1]['deltat'] = obj['deltat']
            df[-1]['fps'] = obj['fps']
            df[-1]['dur'] = obj['dur']

            q = [
                [x['faces'][y] for y in x['infantind']] for x in obj['coding']
            ]  # "faces" contains indices within allfaces. "infantind" contains indices within faces. Get indices of infants among allfaces.
            myfaces = [[obj['allfaces'][z1] for z1 in z0]
                       for z0 in q]  # get actual face data from allfaces

            # Get the face details
            allagerangelow.extend(
                [x['Face']['AgeRange']['Low'] for x in obj['allfaces']])
            allagerangehigh.extend(
                [x['Face']['AgeRange']['High'] for x in obj['allfaces']])

            for ind, item in enumerate(myfaces):
                # Build a row
                row = {}

                # One infant face?
                if len(item) == 1:
                    # Pose
                    for col in pose_columns:
                        row[col] = item[0]['Face']['Pose'][col]
                    # Landmarks
                    for col in landmark_columns:
                        row[col[0]] = [
                            x[col[1]] for x in item[0]['Face']['Landmarks']
                            if x['Type'] == col[0]
                        ][0]
                    # Eyes open
                    row['EyesOpenValue'] = int(
                        item[0]['Face']['EyesOpen']['Value'])
                    row['EyesOpenConfidence'] = item[0]['Face']['EyesOpen'][
                        'Confidence']
                    # Bounding box
                    for col in bb_columns:
                        row['BoundingBox' +
                            col] = item[0]['Face']['BoundingBox'][col]
                    # Confidence and Quality
                    row['Confidence'] = item[0]['Face']['Confidence']
                    row['QualitySharpness'] = item[0]['Face']['Quality'][
                        'Sharpness']
                    row['QualityBrightness'] = item[0]['Face']['Quality'][
                        'Brightness']

                # Modal manual code across raters (most common value)
                codes = [
                    x['code'] for x in obj['coding'][ind]['mancod_allraters']
                ]
                row['mancod'] = float(max(set(codes), key=codes.count))
                df[-1]['df'] = df[-1]['df'].append(pd.DataFrame([row]),
                                                   ignore_index=True)

        print(df[-1]['df'].describe())

    # Save result and upload to S3
    fname = s3tools.getcacheoutpath(outkey)
    with open(fname, 'wb') as f:
        pickle.dump(df, f)
    s3 = boto3.resource('s3')
    s3.Bucket(bucket).upload_file(fname, outkey)

    # Done!
    print("All done")

    return ({'agerangelow': allagerangelow, 'agerangehigh': allagerangehigh})
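

# A hedged sketch of the pipeline order implied by these functions (bucket name
# and prefixes are placeholders; see also the usage sketch after
# run_machine_learning above):
#
#     # 1. Per-video annotation and per-frame coding, driven by Rekognition
#     process_rekognition_video('my-bucket', compmsg)
#     # 2. Aggregate the per-frame coding across subjects into summary.pickle
#     post_rekognition_summary('my-bucket', 'coding/experiment1')
#     # 3. Leave-one-subject-out classification and reliability analysis
#     ml = run_machine_learning('my-bucket',
#                               ['coding_summary/coding/experiment1'], '/tmp/out')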