Ejemplo n.º 1
0
    def getPerformanceStatistics(self, project, entities_eval, entity_target, entityType='user', threshold=0.5, goldenQuestionsOnly=True):
        '''
            Compares the annotations (or predictions) of a list of users or
            model states with the annotations of a target user and returns
            accuracy statistics per evaluated entity.

            The per-row values are produced by the SQL templates in
            "StatisticalFormulas_user" / "StatisticalFormulas_model" (defined
            elsewhere); this method accumulates and normalizes them. Reported
            keys, depending on the project's annotation type:
            - image labels ("labels"):
                    num_matches, correct, incorrect, overall_accuracy
            - points ("points"):
                    num_pred, num_target, tp, fp, fn, avg_dist, num_matches,
                    prec, rec, f1
            - bounding boxes ("boundingBoxes"):
                    num_pred, num_target, tp, fp, fn, avg_iou, num_matches,
                    prec, rec, f1
            - segmentation masks ("segmentationMasks"):
                    num_matches, overall_accuracy (agreement over pixels that
                    are non-zero in both masks, averaged over mask pairs) and
                    "per_class" precision, recall and F1 (averaged over the
                    mask pairs in which the class occurs)

            Args:
                project:        project shortname (also the database schema).
                entities_eval:  iterable of user names or model state IDs to
                                evaluate; a single string is treated as one
                                entity.
                entity_target:  user name whose annotations serve as reference.
                entityType:     'user' to evaluate users; any other value
                                evaluates model states (predictions).
                threshold:      geometric requirement passed to the SQL query
                                for an annotation to count as correct:
                                    - points: maximum euclidean distance in
                                      pixels to closest target
                                    - bounding boxes: minimum IoU with best
                                      matching target
                goldenQuestionsOnly: if True, only images with flag
                                'isGoldenQuestion' = True are considered.

            Returns:
                An empty dict if the label class query returned None;
                otherwise a dict with entries
                    'label_classes': {str(id): (idx, name, color)}
                    'per_entity':    {entity: statistics dict as above}

            Raises:
                Exception: if model states are evaluated and the project's
                           annotation and prediction types differ.
        '''
        entityType = entityType.lower()

        # get annotation and prediction types for project
        annoTypes = self.dbConnector.execute('''SELECT annotationType, predictionType
            FROM aide_admin.project WHERE shortname = %s;''',
            (project,),
            1)
        annoType = annoTypes[0]['annotationtype']
        predType = annoTypes[0]['predictiontype']

        if entityType != 'user' and annoType != predType:
            # different combinations of annotation and prediction types are currently not supported
            raise Exception('Statistics for unequal annotation and AI model prediction types are currently not supported.')

        # for segmentation masks: get label classes and their ordinals      #TODO: implement per-class statistics for all types
        lcDef = self.dbConnector.execute(sql.SQL('''
            SELECT id, name, idx, color FROM {id_lc};
        ''').format(id_lc=sql.Identifier(project, 'labelclass')),
        None, 'all')
        if lcDef is None:
            # no label classes defined
            return {}
        labelClasses = {
            str(lc['id']): (lc['idx'], lc['name'], lc['color'])
            for lc in lcDef
        }

        # a bare string would otherwise be split into single characters
        if isinstance(entities_eval, str):
            entities_eval = (entities_eval,)
        else:
            entities_eval = tuple(entities_eval)
        if not entities_eval:
            # nothing to evaluate; an empty tuple would yield an invalid "IN ()" clause
            return {
                'label_classes': labelClasses,
                'per_entity': {}
            }

        isGeometric = annoType in ('points', 'boundingBoxes')

        # compose args list and complete query
        queryArgs = [entity_target, entities_eval]
        if isGeometric:
            queryArgs.append(threshold)
            if annoType == 'points':
                # the points query template takes the threshold twice
                queryArgs.append(threshold)

        if goldenQuestionsOnly:
            sql_goldenQuestion = sql.SQL('''JOIN (
                    SELECT id
                    FROM {id_img}
                    WHERE isGoldenQuestion = true
                ) AS qi
                ON qi.id = q2.image''').format(
                id_img=sql.Identifier(project, 'image')
            )
        else:
            sql_goldenQuestion = sql.SQL('')

        # result tokens: template of accumulators per entity; entries listed
        # in "tokens_normalize" are turned into averages/ratios at the end
        tokens = {}
        tokens_normalize = []
        if annoType == 'labels':
            tokens = {
                'num_matches': 0,
                'correct': 0,
                'incorrect': 0,
                'overall_accuracy': 0.0
            }
            tokens_normalize = ['overall_accuracy']
        elif annoType == 'points':
            tokens = {
                'num_pred': 0,
                'num_target': 0,
                'tp': 0,
                'fp': 0,
                'fn': 0,
                'avg_dist': 0.0
            }
            tokens_normalize = ['avg_dist']
        elif annoType == 'boundingBoxes':
            tokens = {
                'num_pred': 0,
                'num_target': 0,
                'tp': 0,
                'fp': 0,
                'fn': 0,
                'avg_iou': 0.0
            }
            tokens_normalize = ['avg_iou']
        elif annoType == 'segmentationMasks':
            tokens = {
                'num_matches': 0,
                'overall_accuracy': 0.0,
                'per_class': {
                    clID: {
                        'num_matches': 0,
                        'prec': 0.0,
                        'rec': 0.0,
                        'f1': 0.0
                    }
                    for clID in labelClasses
                }
            }
            tokens_normalize = []

        formatArgs = {
            'id_anno': sql.Identifier(project, 'annotation'),
            'id_iu': sql.Identifier(project, 'image_user'),
            'sql_goldenQuestion': sql_goldenQuestion
        }
        if entityType == 'user':
            queryTemplate = getattr(StatisticalFormulas_user, annoType).value
        else:
            queryTemplate = getattr(StatisticalFormulas_model, annoType).value
            formatArgs['id_pred'] = sql.Identifier(project, 'prediction')
        queryStr = sql.SQL(queryTemplate).format(**formatArgs)

        #TODO: update points query (according to bboxes); re-write stats parsing below

        # get stats
        response = {}
        result = self.dbConnector.execute(queryStr, tuple(queryArgs), 'all')
        for b in (result or []):
            if entityType == 'user':
                entity = b['username']
            else:
                entity = str(b['cnnstate'])

            if entity not in response:
                response[entity] = copy.deepcopy(tokens)
                if isGeometric:
                    # counts the rows that contribute to the averaged value
                    response[entity]['num_matches'] = 0
            stats = response[entity]

            if isGeometric:
                # accumulate (instead of overwriting per row) so that the
                # average below is taken over all rows that have targets
                numTarget = b['num_target']
                if numTarget is not None and numTarget > 0:
                    stats['num_matches'] += 1

            if annoType == 'segmentationMasks':
                # decode segmentation masks
                try:
                    mask_target = np.array(base64ToImage(b['q1segmask'], b['q1width'], b['q1height']))
                    mask_source = np.array(base64ToImage(b['q2segmask'], b['q2width'], b['q2height']))

                    if mask_target.shape == mask_source.shape and np.any(mask_target) and np.any(mask_source):

                        # calculate OA over pixels that are labeled in both masks
                        intersection = (mask_target>0) * (mask_source>0)
                        if np.any(intersection):
                            oa = np.mean(mask_target[intersection] == mask_source[intersection])
                            stats['overall_accuracy'] += oa
                            stats['num_matches'] += 1

                        # calculate per-class precision and recall values
                        for clID, lcProps in labelClasses.items():
                            idx = lcProps[0]
                            tp = np.sum((mask_target==idx) * (mask_source==idx))
                            fp = np.sum((mask_target!=idx) * (mask_source==idx))
                            fn = np.sum((mask_target==idx) * (mask_source!=idx))
                            if (tp+fp+fn) > 0:
                                prec, rec, f1 = self._calc_geometric_stats(tp, fp, fn)
                                classStats = stats['per_class'][clID]
                                classStats['num_matches'] += 1
                                classStats['prec'] += prec
                                classStats['rec'] += rec
                                classStats['f1'] += f1

                except Exception as e:
                    # best effort: skip mask pairs that cannot be decoded or compared
                    print(f'TODO: error in segmentation mask statistics calculation ("{str(e)}").')

            else:
                if annoType == 'labels':
                    # classification: evaluate once per row (ignore None)
                    correct = b['label_correct']
                    if correct is True:
                        stats['correct'] += 1
                        stats['num_matches'] += 1
                    elif correct is False:
                        stats['incorrect'] += 1
                        stats['num_matches'] += 1

                for key in tokens:
                    if key in ('correct', 'incorrect'):
                        # already handled above
                        continue
                    if key in b and b[key] is not None:
                        stats[key] += b[key]

        for stats in response.values():
            for t in tokens_normalize:
                if t not in stats:
                    continue
                if t == 'overall_accuracy':
                    numRated = stats['correct'] + stats['incorrect']
                    if numRated > 0:
                        stats[t] = float(stats['correct']) / float(numRated)
                elif isGeometric:
                    if stats['num_matches'] > 0:
                        stats[t] /= stats['num_matches']
                    else:
                        # no row with targets: average is undefined, report default
                        stats[t] = 0.0

            if isGeometric:
                prec, rec, f1 = self._calc_geometric_stats(
                    stats['tp'],
                    stats['fp'],
                    stats['fn']
                )
                stats['prec'] = prec
                stats['rec'] = rec
                stats['f1'] = f1

            elif annoType == 'segmentationMasks':
                # normalize OA (stays at 0.0 if no mask pair could be compared)
                if stats['num_matches'] > 0:
                    stats['overall_accuracy'] /= stats['num_matches']

                # normalize all label class values as well
                for classStats in stats['per_class'].values():
                    numMatches = classStats['num_matches']
                    if numMatches > 0:
                        classStats['prec'] /= numMatches
                        classStats['rec'] /= numMatches
                        classStats['f1'] /= numMatches

        return {
            'label_classes': labelClasses,
            'per_entity': response
            }
Ejemplo n.º 2
0
    def prepareDataDownload(self,
                            project,
                            dataType='annotation',
                            userList=None,
                            dateRange=None,
                            extraFields=None,
                            segmaskFilenameOptions=None,
                            segmaskEncoding='rgb'):
        '''
            Polls the database for project data according to the
            specified restrictions:
            - dataType: "annotation" or "prediction"
            - userList: for type "annotation": None (all users), a single
                        user name or an iterable of user names
            - dateRange: None (all dates) or a sequence with a minimum and
                         maximum timestamp. Values are passed to the SQL
                         function "to_timestamp" (i.e., seconds since epoch).
                         If only one value is given, the current time is
                         used as maximum.
            - extraFields: None (no field) or dict of keywords and bools for
                           additional fields (e.g. browser meta) to be queried.
                           Fields whose value is falsy are left out of the
                           metadata output.
            - segmaskFilenameOptions: customization parameters for segmentation
                                      mask images' file names: dict with keys
                                      "baseName" ("filename" or "id"), "prefix"
                                      and "suffix".
            - segmaskEncoding: encoding of the segmentation mask pixel
                               values ("rgb" or "indexed")

            Creates a file (text file, or zip archive for segmentation
            masks) in sub-folder "aide/downloadRequests/<project>" of this
            machine's temporary directory and returns its file name (without
            directory).
            Note that in some cases (esp. for semantic segmentation),
            the number of queryable entries may be limited due to
            file size and free disk space restrictions. An upper cei-
            ling is specified in the configuration *.ini file ('TODO')

            Neither "extraFields" nor "segmaskFilenameOptions" is modified;
            the function works on copies.

            Raises:
                Exception: if "dataType" is neither "annotation" nor
                           "prediction".
        '''
        import csv

        now = datetime.now(tz=pytz.utc)

        # argument check
        if userList is None:
            userList = []
        elif isinstance(userList, str):
            userList = [userList]
        if dateRange is None:
            dateRange = []
        else:
            dateRange = list(dateRange)
            if len(dateRange) == 1:
                # open-ended range: use current time (as epoch seconds, since
                # values are fed to "to_timestamp") as upper bound
                dateRange.append(now.timestamp())

        if isinstance(extraFields, dict):
            extraFields = dict(extraFields)
            if not isinstance(extraFields.get('meta'), bool):
                extraFields['meta'] = False
        else:
            extraFields = {'meta': False}

        if segmaskFilenameOptions is None:
            segmaskFilenameOptions = {
                'baseName': 'filename',
                'prefix': '',
                'suffix': ''
            }
        else:
            segmaskFilenameOptions = dict(segmaskFilenameOptions)
            if segmaskFilenameOptions.get('baseName') not in ('filename', 'id'):
                segmaskFilenameOptions['baseName'] = 'filename'
            for key in ('prefix', 'suffix'):
                # missing entries default to an empty string
                value = str(segmaskFilenameOptions.get(key, ''))
                for char in self.FILENAMES_PROHIBITED_CHARS:
                    value = value.replace(char, '_')
                segmaskFilenameOptions[key] = value

        # check metadata type: need to deal with segmentation masks separately
        if dataType == 'annotation':
            metaField = 'annotationtype'
        elif dataType == 'prediction':
            metaField = 'predictiontype'
        else:
            raise Exception('Invalid dataType specified ({})'.format(dataType))
        # "metaField" is one of the two constants above, hence safe to format in
        metaType = self.dbConnector.execute(
            '''
                SELECT {} FROM aide_admin.project
                WHERE shortname = %s;
            '''.format(metaField), (project, ), 1)[0][metaField]

        indexedColors = None
        if metaType.lower() == 'segmentationmasks':
            is_segmentation = True
            fileExtension = '.zip'

            # create indexed color palette for segmentation masks
            if segmaskEncoding == 'indexed':
                try:
                    labelClasses = self.dbConnector.execute(
                        sql.SQL('''
                            SELECT idx, color FROM {id_lc} ORDER BY idx ASC;
                        ''').format(
                            id_lc=sql.Identifier(project, 'labelclass')), None,
                        'all')
                    palette = []
                    # NOTE(review): palette entries start at class index 1,
                    # i.e. palette slot 0 holds class 1 — confirm that this
                    # offset is intended for the background class.
                    currentIndex = 1
                    for lc in labelClasses:
                        if lc['idx'] == 0:
                            # background class
                            continue
                        while currentIndex < lc['idx']:
                            # gaps in label classes; fill with zeros
                            palette.extend([0, 0, 0])
                            currentIndex += 1
                        color = lc['color']
                        if color is None:
                            # no color specified; add from defaults
                            #TODO
                            palette.extend([0, 0, 0])
                        else:
                            # convert to RGB format
                            palette.extend(helpers.hexToRGB(color))
                        # this class' slot is filled; advance so that only
                        # real gaps are padded with zeros
                        currentIndex += 1
                    indexedColors = palette

                except Exception:
                    # an error occurred; don't convert segmentation mask to indexed colors
                    indexedColors = None

        else:
            is_segmentation = False
            fileExtension = '.txt'  #TODO: support JSON?

        # prepare output file
        filename = 'aide_query_{}'.format(
            now.strftime('%Y-%m-%d_%H-%M-%S')) + fileExtension
        destPath = os.path.join(self.tempDir, 'aide/downloadRequests', project)
        os.makedirs(destPath, exist_ok=True)
        destPath = os.path.join(destPath, filename)

        # generate query
        queryArgs = []
        tableID = sql.Identifier(project, dataType)
        userStr = sql.SQL('')
        iuStr = sql.SQL('')
        dateStr = sql.SQL('')
        hasUserFilter = False
        queryFields = [
            'filename',
            'isGoldenQuestion',
            'date_image_added',
            'last_requested_image',
            'image_corrupt'  # default image fields
        ]
        if dataType == 'annotation':
            iuStr = sql.SQL('''
                JOIN (SELECT image AS iu_image, username AS iu_username, viewcount, last_checked, last_time_required FROM {id_iu}) AS iu
                ON t.image = iu.iu_image
                AND t.username = iu.iu_username
            ''').format(id_iu=sql.Identifier(project, 'image_user'))
            if len(userList):
                userStr = sql.SQL('WHERE username IN %s')
                queryArgs.append(tuple(userList))
                hasUserFilter = True

            queryFields.extend(
                getattr(QueryStrings_annotation, metaType).value)
            queryFields.extend([
                'username', 'viewcount', 'last_checked', 'last_time_required'
            ])  #TODO: make customizable

        else:
            queryFields.extend(
                getattr(QueryStrings_prediction, metaType).value)

        if len(dateRange):
            if hasUserFilter:
                dateStr = sql.SQL(
                    ' AND timecreated >= to_timestamp(%s) AND timecreated <= to_timestamp(%s)'
                )
            else:
                dateStr = sql.SQL(
                    'WHERE timecreated >= to_timestamp(%s) AND timecreated <= to_timestamp(%s)'
                )
            queryArgs.extend(dateRange)

        if not is_segmentation:
            # join label classes
            lcStr = sql.SQL('''
                JOIN (SELECT id AS lcID, name AS labelclass_name, idx AS labelclass_index
                    FROM {id_lc}
                ) AS lc
                ON label = lc.lcID
            ''').format(id_lc=sql.Identifier(project, 'labelclass'))
            queryFields.extend(['labelclass_name', 'labelclass_index'])
        else:
            lcStr = sql.SQL('')

        # remove duplicate and disabled query fields; keep a stable column
        # order and tolerate disabled fields that were never queried
        disabledFields = {
            key for key, enabled in extraFields.items() if not enabled
        }
        queryFields = [
            field for field in dict.fromkeys(queryFields)
            if field not in disabledFields
        ]

        queryStr = sql.SQL('''
            SELECT * FROM {tableID} AS t
            JOIN (
                SELECT id AS imgID, filename, isGoldenQuestion, date_added AS date_image_added, last_requested AS last_requested_image, corrupt AS image_corrupt
                FROM {id_img}
            ) AS img ON t.image = img.imgID
            {lcStr}
            {iuStr}
            {userStr}
            {dateStr}
        ''').format(tableID=tableID,
                    id_img=sql.Identifier(project, 'image'),
                    lcStr=lcStr,
                    iuStr=iuStr,
                    userStr=userStr,
                    dateStr=dateStr)

        # metadata lines; first line is the header
        # NOTE(review): the header lists "segmentationmask" if it is among the
        # query fields, whereas the rows below skip it — confirm whether
        # consumers rely on this layout before aligning the two.
        metaLines = ['; '.join(queryFields) + '\n']
        rowKeys = [
            field.lower() for field in queryFields
            if field.lower() != 'segmentationmask'
        ]

        # query and process data
        if is_segmentation:
            mainFile = zipfile.ZipFile(destPath, 'w', zipfile.ZIP_DEFLATED)
        else:
            mainFile = open(destPath, 'w', encoding='utf-8')

        # context manager guarantees the output file is closed on errors, too
        with mainFile:
            with self.dbConnector.execute_cursor(queryStr,
                                                 tuple(queryArgs)) as cursor:
                while True:
                    b = cursor.fetchone()
                    if b is None:
                        break

                    if is_segmentation:
                        # convert and store segmentation mask separately
                        if segmaskFilenameOptions['baseName'] == 'id':
                            # image IDs need not be strings (e.g. UUIDs)
                            innerFilename = str(b['image'])
                            parent = ''
                        else:
                            parent, innerFilename = os.path.split(
                                b['filename'])
                        finalFilename = os.path.join(
                            parent,
                            segmaskFilenameOptions['prefix'] + innerFilename +
                            segmaskFilenameOptions['suffix'] + '.tif')
                        segmask_filename = 'segmentation_masks/' + finalFilename

                        segmask = base64ToImage(b['segmentationmask'],
                                                b['width'], b['height'])

                        if indexedColors is not None and len(indexedColors) > 0:
                            # convert to indexed color and add color palette from label classes
                            segmask = segmask.convert('RGB').convert(
                                'P', palette=Image.ADAPTIVE, colors=3)
                            segmask.putpalette(indexedColors)

                        # save
                        bio = io.BytesIO()
                        segmask.save(bio, 'TIFF')
                        mainFile.writestr(segmask_filename, bio.getvalue())

                    # store metadata
                    metaLines.append(
                        ''.join('{}; '.format(b[key]) for key in rowKeys) +
                        '\n')

            metaStr = ''.join(metaLines)
            if is_segmentation:
                mainFile.writestr('query.txt', metaStr)

                # append separate text file for label classes
                labelclassQuery = sql.SQL('''
                    SELECT id, name, color, labelclassgroup, idx AS labelclass_index
                    FROM {id_lc};
                ''').format(id_lc=sql.Identifier(project, 'labelclass'))
                result = self.dbConnector.execute(labelclassQuery, None, 'all')
                lcColumns = ('id', 'name', 'color', 'labelclassgroup',
                             'labelclass_index')
                lcBuffer = io.StringIO()
                # csv writer quotes values containing commas, quotes or line
                # breaks; values are stringified first so that e.g. None is
                # still written as "None"
                lcWriter = csv.writer(lcBuffer, lineterminator='\n')
                lcWriter.writerow(lcColumns)
                for r in (result or []):
                    lcWriter.writerow([str(r[col]) for col in lcColumns])
                mainFile.writestr('labelclasses.csv', lcBuffer.getvalue())
            else:
                mainFile.write(metaStr)

        return filename