Beispiel #1
0
    def __marking_header_setup__(self,workflow_id,task_id,shapes,output_directory):
        """
        - create the csv output files for each workflow/task pairing where the task is a marking
        also write out the header line
        - since different tools (for the same task) can have completely different shapes, these shapes should
        be printed out to different files - hence the multiple output files
        - we will give both a summary file and a detailed report file
        :param workflow_id: id of the workflow this task belongs to
        :param task_id: id of the marking task
        :param shapes: the shapes (e.g. "point","polygon") used by the task's tools
        :param output_directory: directory in which the csv files are created
        """
        for shape in shapes:
            # base the file name on the task id plus a truncated copy of the task instructions
            fname = str(task_id) + self.instructions[workflow_id][task_id]["instruction"][:50]
            # strip characters (such as spaces and extra ","s) that would cause problems in a file name
            fname = helper_functions.csv_string(fname)
            # fname += ".csv"


            # remember the file names (without the directory prefix) for later lookups
            self.file_names[(task_id,shape,"detailed")] = fname + "_" + shape + ".csv"
            self.file_names[(task_id,shape,"summary")] = fname + "_" + shape + "_summary.csv"

            # polygons - since they have an arbitary number of points are handled slightly differently
            if shape == "polygon":
                # detailed file - one row per polygon cluster
                id_ = task_id,shape,"detailed"
                self.csv_files[id_] = open(output_directory+fname+"_"+shape+".csv","wb")
                self.csv_files[id_].write("subject_id,cluster_index,most_likely_tool,area,list_of_xy_polygon_coordinates\n")

                # summary file - one row per subject with one area column per polygon tool
                id_ = task_id,shape,"summary"
                self.csv_files[id_] = open(output_directory+fname+"_"+shape+"_summary.csv","wb")
                # self.csv_files[id_].write("subject_id,\n")
                # self.workflows[workflow_id][1] is the list of marking tasks - keep only the polygon tools
                polygon_tools = [t_index for t_index,t in enumerate(self.workflows[workflow_id][1][task_id]) if t == "polygon"]
                header = "subject_id,"
                for tool_id in polygon_tools:
                    # use the human readable tool label as the column name
                    tool = self.instructions[workflow_id][task_id]["tools"][tool_id]["marking tool"]
                    tool = helper_functions.csv_string(tool)
                    header += "area("+tool+"),"
                self.csv_files[id_].write(header+"\n")

            else:
                id_ = task_id,shape,"detailed"
                # fname += "_"+shape+".csv"
                self.csv_files[id_] = open(output_directory+fname+"_"+shape+".csv","wb")

                # the coordinate columns depend on the shape being marked
                header = "subject_id,cluster_index,most_likely_tool,"
                if shape == "point":
                    header += "x,y,"
                elif shape == "rectangle":
                    # todo - fix this
                    header += "x1,y1,x2,y2,"
                elif shape == "line":
                    header += "x1,y1,x2,y2,"
                elif shape == "ellipse":
                    header += "x1,y1,r1,r2,theta,"

                header += "p(most_likely_tool),p(true_positive),num_users"
                self.csv_files[id_].write(header+"\n")
                # do the summary output else where
                self.__summary_header_setup__(output_directory,workflow_id,fname,task_id,shape)
    def __get_filename__(self,workflow_id,task_id,summary=False,tool_id=None,followup_id=None):
        """
        use the user's instructions to help create a file name to store the results in
        :param workflow_id:
        :param task_id:
        :param summary: if True, append "_summary" to the name
        :param tool_id: set (together with followup_id) for follow up questions to markings
        :param followup_id: index of the follow up question
        :return: a csv file name stripped of problematic characters
        """
        # a tool_id only makes sense together with a followup_id
        assert (tool_id is None) or (followup_id is not None)

        task_instructions = self.instructions[workflow_id][task_id]
        if tool_id is None:
            # a plain classification question - the instructions live directly on the task
            question_text = task_instructions["instruction"]
        else:
            # a follow up question to a marking - the instructions are stored one
            # level deeper, under the tool; some questions are missing their text
            followup = task_instructions["tools"][tool_id]["followup_questions"][followup_id]
            question_text = followup.get("question",  "")

        # truncate the instructions so the file name doesn't get too long
        base = str(task_id) + question_text[:50]
        if summary:
            base = base + "_summary"

        # get rid of any characters (like extra ","s) that could cause problems
        return helper_functions.csv_string(base) + ".csv"
    def __marking_summary_header__(self,workflow_id,task_id,shape):
        """
        setup the summary csv file for a given marking shape
        every shape aggregation will have a summary file - with one line per subject
        DON'T call this for polygons - they need to be handled differently
        :return:
        """
        assert shape != "polygon"

        id_ = task_id,shape,"summary"
        with open(self.file_names[id_],"w") as csv_file:
            # one line per subject - start with the subject id column
            header = "subject_id"

            # extract only the tools which can actually make markings of the desired shape
            # [1] - is the list of marking tasks, i.e. [0] is the list of classification tasks
            # and [2] is survey tasks
            for tool_id,tool_shape in enumerate(self.workflows[workflow_id][1][task_id]):
                # skip tools which don't produce this shape
                if tool_shape != shape:
                    continue

                # the column header is the human readable label given to this tool,
                # i.e. not "tool 0" or "rectangle" but e.g. "zebra"
                tool_label = self.instructions[workflow_id][task_id]["tools"][tool_id]["marking tool"]
                # remove any characters (such as spaces) which shouldn't be in a csv column header
                tool_label = helper_functions.csv_string(tool_label)

                header += ",median(" + tool_label +")"

            # the closing statistics columns, then write the whole header at once
            header += ",mean_probability,median_probability,mean_tool,median_tool\n"
            csv_file.write(header)
    def __polygon_row__(self,workflow_id,task_id,subject_id,aggregations):
        """
        append the results for each polygon cluster of one subject - the outline in pixels,
        the area (as a percentage of image area) and the most likely tool to have created
        this polygon
        :param workflow_id:
        :param task_id:
        :param subject_id:
        :param aggregations:
        :return:
        """
        id_ = task_id,"polygon","detailed"

        with open(self.file_names[id_],"a") as csv_file:
            for p_index,cluster in aggregations["polygon clusters"].items():
                # "all_users" is bookkeeping, not an actual cluster
                if p_index == "all_users":
                    continue

                # the tool with the highest vote share is taken to be the creator
                most_likely_tool,tool_probability = max(cluster["tool_classification"][0].items(), key = lambda x:x[1])
                tool_label = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
                tool_label = helper_functions.csv_string(tool_label)

                # quote the coordinate list so its internal ","s don't break the csv
                fields = [str(subject_id), str(p_index), tool_label, str(cluster["area"]), "\"" +str(cluster["center"]) + "\""]
                csv_file.write(",".join(fields)+"\n")
Beispiel #5
0
    def __single_choice_classification_row__(self,answers,task_id,subject_id,results,cluster_index=None):
        """
        output a row for a classification task which only allowed one answer
        :param answers: maps answer index to its label; the label may itself be a dict
            with a "label" key when the question is a follow up to a marking
        :param task_id: key into self.csv_files - might actually be a subtask id which
            contains the task id, tool and follow up question id
        :param subject_id:
        :param results: (votes,num_users) pair - votes maps answer index to vote share
        :param cluster_index: included in the row when the question is a follow up to a marking
        :return:
        """
        votes,num_users = results
        # nothing to report if nobody voted
        if not votes:
            return
        # since only one choice is allowed, go for the maximum
        most_likely,top_probability = max(votes.items(), key = lambda x:x[1])

        # extract the text corresponding to the most likely answer
        most_likely_label = answers[int(most_likely)]
        # this corresponds to when the question is a follow up - the label is nested
        if isinstance(most_likely_label,dict):
            most_likely_label = most_likely_label["label"]
        most_likely_label = helper_functions.csv_string(most_likely_label)

        # materialise the values once - dict.values() is a view on python 3
        probabilities = list(votes.values())
        entropy = self.__shannon_entropy__(probabilities)

        row = str(subject_id)+","
        if cluster_index is not None:
            row += str(cluster_index) + ","
        row += most_likely_label+","+str(top_probability)+","+str(entropy)+","+str(num_users)+"\n"

        # finally write the stuff out to file
        self.csv_files[task_id].write(row)
Beispiel #6
0
    def __polygon_row__(self,workflow_id,task_id,subject_id,aggregations):
        """
        write one detailed csv row per polygon for the given subject - the outline in
        pixels, the area as a fraction of the image area and the most likely tool to
        have created the polygon
        :param workflow_id:
        :param task_id:
        :param subject_id:
        :param aggregations: aggregation results containing "polygon clusters"
        :return:
        """
        id_ = task_id,"polygon","detailed"

        for p_index,cluster in aggregations["polygon clusters"].items():
            # "all_users" is bookkeeping, not an actual cluster
            if p_index == "all_users":
                continue

            # the tool with the highest vote share is taken to be the creator of the polygon
            tool_classification = cluster["tool_classification"][0].items()
            most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
            tool = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
            tool = helper_functions.csv_string(tool)

            for polygon in cluster["center"]:
                p = geometry.Polygon(polygon)

                # area is reported relative to the overall image area; the coordinate
                # list is quoted so its internal ","s don't break the csv
                row = str(subject_id) + ","+ str(p_index)+ ","+ tool + ","+ str(p.area/float(cluster["image area"])) + ",\"" +str(polygon) + "\""
                self.csv_files[id_].write(row+"\n")
    def __marking_file_setup__(self,output_directory,workflow_id):
        """
        - create the csv output files for each workflow/task pairing where the task is a marking
        also write out the header line
        - since different tools (for the same task) can have completely different shapes, these shapes should
        be printed out to different files - hence the multiple output files
        - we will give both a summary file and a detailed report file
        """
        tasks = self.workflows[workflow_id]

        # iterate over each task and the distinct shapes (not tools) available for each task
        for task_id,tools in tasks['marking'].items():
            for shape in set(tools):
                # the file name is based on the task instructions, stripped of any
                # characters (such as spaces) which should not be in a file name
                fname = helper_functions.csv_string(str(task_id) + self.instructions[workflow_id][task_id]["instruction"][:50])

                # record where both the detailed and the summary file live
                self.file_names[(task_id,shape,"detailed")] = output_directory+"/"+fname + "_" + shape + ".csv"
                self.file_names[(task_id,shape,"summary")] = output_directory+"/"+fname + "_" + shape + "_summary.csv"

                if shape == "polygon":
                    # polygons - with their arbitrary number of points - are handled separately
                    self.__add_polygon_summary_row__()
                    self.__polygon_detailed_setup__(task_id)
                else:
                    # write the headers for the csv summary file
                    self.__marking_summary_header__(workflow_id,task_id,shape)
                    # and for the detailed one
                    self.__marking_detailed_header__(task_id,shape)
Beispiel #8
0
    def __get_filename__(self,workflow_id,task_id,summary=False,tool_id=None,followup_id=None):
        """
        use the user's instructions to help create a file name to store the results in
        :param workflow_id:
        :param task_id:
        :param summary: if True, append "_summary" to the name
        :param tool_id: set (together with followup_id) for follow up questions to markings
        :param followup_id: index of the follow up question
        :return: a csv file name stripped of problematic characters
        """
        # a tool_id only makes sense together with a followup_id
        assert (tool_id is None) or (followup_id is not None)

        # read in the instructions
        # if just a simple classification question
        if tool_id is None:
            instructions = self.instructions[workflow_id][task_id]["instruction"]
        # else a follow up question to a marking - so the instructions are stored in a slightly different spot
        # use .get() since some follow up questions are missing their "question" text
        else:
            instructions = (
                self.instructions[workflow_id][task_id]["tools"][tool_id]
                    ["followup_questions"][followup_id].get("question", "")
            )

        fname = str(task_id) + instructions[:50]
        if summary:
            fname += "_summary"
        # get rid of any characters (like extra ","s) that could cause problems
        fname = helper_functions.csv_string(fname)
        fname += ".csv"

        return fname
    def __survey_file_setup__(self,output_directory,workflow_id):
        """
        set up the csv files for surveys. we will just have one output file per survey task
        :param output_directory:
        :param workflow_id:
        :return:
        """
        tasks = self.workflows[workflow_id]

        for task_id in tasks['survey']:
            instructions = self.instructions[workflow_id][task_id]

            self.file_names[task_id] = output_directory+str(task_id) + ".csv"

            with open(self.file_names[task_id],"w") as csv_file:
                # the fixed leading columns
                header = "subject_id,num_classifications,pielou_score,species,"
                header += "percentage_of_votes_for_species,number_of_votes_for_species"

                # always include these headers for the HWMN follow up question - these columns
                # may be NA in the output but at least we have the header explaining why
                header += ",minimum_number_of_animals,most_likely_number_of_animals,percentage,maximum_number_of_animals"

                # todo - we'll assume, for now, that "how many" is always the first question
                # for followup_id in instructions["questionsOrder"]:
                for followup_id in instructions["questions"]:
                    question = instructions["questions"][followup_id]
                    multiple_answers = question["multiple"]
                    label = question["label"]

                    # "how many" was already covered by the fixed columns above
                    if followup_id == "HWMN":
                        continue

                    # behaviour questions get a fixed stem - covering both spellings
                    if ("behavior" in label) or ("behaviour" in label):
                        stem = "behaviour:"
                    else:
                        stem = helper_functions.csv_string(label)

                    # single-answer questions also get a most_likely column
                    if not multiple_answers:
                        header += ",most_likely(" + stem + ")"

                    # one percentage column per possible answer
                    for answer_id in question["answersOrder"]:
                        header += ",percentage(" + stem + helper_functions.csv_string(question["answers"][answer_id]["label"]) +")"

                csv_file.write(header+"\n")
Beispiel #10
0
    def __classification_summary_row__(self,workflow_id,task_id,subject_id,aggregations,followup_id=None,tool_id = None,cluster_index=None):
        """
        given a result for a specific subject (and possibly a specific cluster within that specific subject)
        add one row of results to the summary file. that row contains
        subject_id,[cluster_index,]most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users
        tool_id/followup_id/cluster_index are only set if we have a follow up to a marking task
        :param workflow_id:
        :param task_id:
        :param subject_id:
        :param aggregations: maps task_id to a (votes,num_users) pair
        :param followup_id:
        :param tool_id:
        :param cluster_index:
        :return:
        """
        # key for accessing the csv output in the dictionary
        id_ = (task_id,tool_id,followup_id,"summary")

        # get what percentage of users voted for each classification
        votes,num_users = aggregations[task_id]

        try:
            most_likely,top_probability = max(votes.items(), key = lambda x:x[1])

            # if tool_id is not None -> we have a follow up question
            # extract the text corresponding to the most likely answer
            # follow up questions for markings have a different structure
            if tool_id is not None:
                answers = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_id]["answers"]
                most_likely_label = answers[int(most_likely)]["label"]
            else:
                most_likely_label = self.instructions[workflow_id][task_id]["answers"][int(most_likely)]

            # and get rid of any bad characters
            most_likely_label = helper_functions.csv_string(most_likely_label)

            # calculate some summary values such as entropy and the mean and median vote share.
            # materialise the values once - this also keeps np.mean/np.median working on
            # python 3, where dict.values() is a view rather than a list
            probabilities = list(votes.values())
            entropy = self.__shannon_entropy__(probabilities)

            mean_p = np.mean(probabilities)
            median_p = np.median(probabilities)

            with open(self.file_names[id_],"a") as results_file:
                results_file.write(str(subject_id)+",")

                if cluster_index is not None:
                    results_file.write(str(cluster_index)+",")

                # write out details regarding the top choice
                # this might not be a useful value if multiple choices are allowed - in which case just ignore it
                results_file.write(str(most_likely_label)+","+str(top_probability))
                # write out some summaries about the distributions of people's answers
                # again entropy probably only makes sense if only one answer is allowed
                # and mean_p and median_p probably only make sense if multiple answers are allowed
                # so people will need to pick and choose what they want
                results_file.write(","+str(entropy)+","+str(mean_p)+","+str(median_p))
                # finally - how many people have seen this subject for this task
                results_file.write(","+str(num_users)+"\n")
        # empty values (max() on an empty dict) should be ignored - but shouldn't happen too often either
        except ValueError:
            pass
Beispiel #11
0
    def __make_files__(self,workflow_id):
        """
        create all of the files necessary for this workflow
        :param workflow_id:
        :return: the output directory in which the files were created
        """
        # close any previously used files (and delete their pointers)
        for f in self.csv_files.values():
            f.close()
        self.csv_files = {}

        # now create a sub directory specific to the workflow
        try:
            workflow_name = self.workflow_names[workflow_id]
        except KeyError:
            # dump the known workflows to help debug the missing id, then re-raise
            warning(self.workflows)
            warning(self.workflow_names)
            raise

        # strip characters which shouldn't appear in a directory name
        workflow_name = helper_functions.csv_string(workflow_name)
        output_directory = "/tmp/"+str(self.project_id)+"/" +str(workflow_id) + "_" + workflow_name + "/"

        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        self.workflow_directories[workflow_id] = output_directory

        classification_tasks,marking_tasks,survey_tasks = self.workflows[workflow_id]

        # go through the classification tasks - they will either be simple c. tasks (one answer allowed)
        # multiple c. tasks (more than one answer allowed) and possibly a follow up question to a marking
        for task_id in classification_tasks:
            # is this task a simple classification task?
            # don't care if the questions allows for multiple answers, or requires a single one
            if classification_tasks[task_id] in ["single","multiple"]:
                self.__classification_header__(output_directory,workflow_id,task_id)

            else:
                # this classification task is actually a follow up to a marking task
                # one output file per (tool, follow up question) pair
                for tool_id in classification_tasks[task_id]:
                    for followup_id,answer_type in enumerate(classification_tasks[task_id][tool_id]):
                        # instructions = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_index]["question"]
                        self.__classification_header__(output_directory,workflow_id,task_id,tool_id,followup_id)
                        # id_ = (task_id,tool_id,followup_index)
                        # if answer_type == "single":
                        #     self.__single_response_csv_header__(output_directory,id_,instructions)
                        # else:
                        #     self.__multi_response_csv_header__(output_directory,id_,instructions)

        # now set things up for the marking tasks
        # one file pair per distinct shape (not per tool)
        for task_id in marking_tasks:
            shapes = set(marking_tasks[task_id])
            self.__marking_header_setup__(workflow_id,task_id,shapes,output_directory)

        # and finally the survey tasks
        for task_id in survey_tasks:
            instructions = self.instructions[workflow_id][task_id]
            self.__survey_header_setup__(output_directory,task_id,instructions)

        return output_directory
Beispiel #12
0
    def __survey_header_setup__(self,output_directory,task_id,instructions):
        """
        create the csv output file for a survey task and write out its header row
        :param output_directory:
        :param task_id:
        :param instructions:
        :return:
        """
        # the detailed results file
        fname = output_directory+str(task_id) + "_survey_detailed.csv"
        self.file_names[(task_id,"detailed")] = fname

        # the fixed leading columns
        header = "subject_id,num_classifications,pielou_score,species,number_of_votes_for_species"

        # todo - we'll assume, for now, that "how many" is always the first question
        for followup_id in instructions["questionsOrder"]:
            question = instructions["questions"][followup_id]
            multiple_answers = question["multiple"]
            label = question["label"]

            if followup_id == "HWMN":
                # the "how many" question gets minimum/most likely/maximum columns
                header += ",minimum_number_of_animals,most_likely_number_of_animals,percentage,maximum_number_of_animals"
                continue

            # behaviour questions get a fixed stem - covering both spellings
            if ("behavior" in label) or ("behaviour" in label):
                stem = "behaviour:"
            else:
                stem = helper_functions.csv_string(label)

            # single-answer questions also get a most_likely column
            if not multiple_answers:
                header += ",most_likely(" + stem + ")"

            # one percentage column per possible answer
            for answer_id in question["answersOrder"]:
                header += ",percentage(" + stem + helper_functions.csv_string(question["answers"][answer_id]["label"]) +")"

        with open(fname,"wb") as f:
            f.write(header+"\n")
Beispiel #13
0
    def __detailed_marking_row__(self,workflow_id,task_id,subject_id,aggregations,shape):
        """
        append one detailed csv row per cluster for the given subject and shape, with
        the structure "subject_id,cluster_index,most_likely_tool,<center params...>,
        p(most_likely_tool),p(true_positive),num_users"
        :param workflow_id:
        :param task_id:
        :param subject_id:
        :param aggregations:
        :param shape:
        :return:
        """
        id_ = (task_id,shape,"detailed")

        # build all rows first so the file is opened only once (the original re-opened
        # it for every cluster); the file is still untouched when there are no clusters
        rows = []
        for cluster_index,cluster in aggregations[shape + " clusters"].items():
            if cluster_index == "all_users":
                continue
            # convert to int - not really sure why but get stored as unicode
            cluster_index = int(cluster_index)

            row = str(subject_id)+","
            # todo for now - always give the cluster index
            row += str(cluster_index)+","

            # extract the most likely tool for this particular marking and convert it to
            # a string label
            # not completely sure why some clusters are missing this value but does seem to happen
            most_likely_tool = cluster["most_likely_tool"]
            # again - not sure why this percentage would be 0, but does seem to happen
            tool_probability = cluster["percentage"]
            assert tool_probability > 0

            # convert the tool into the string label
            tool_str = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
            row += helper_functions.csv_string(tool_str) + ","

            # get the central coordinates next
            for center_param in cluster["center"]:
                if isinstance(center_param,(list,tuple)):
                    # if we have a list, split it up into subpieces
                    for param in center_param:
                        row += str(param) + ","
                else:
                    row += str(center_param) + ","

            # add on how likely the most likely tool was
            row += str(tool_probability) + ","
            # how likely the cluster is to being a true positive and how many users (out of those who saw this
            # subject) actually marked it. For the most part p(true positive) is equal to the percentage
            # of people, so slightly redundant but allows for things like weighted voting and IBCC in the future
            prob_true_positive = cluster["existence"][0]["1"]
            num_users = cluster["existence"][1]
            row += str(prob_true_positive) + "," + str(num_users)

            rows.append(row)

        if rows:
            with open(self.file_names[id_],"a") as csvfile:
                csvfile.write("\n".join(rows)+"\n")
Beispiel #14
0
    def __detailed_classification_file_setup__(self,output_directory,workflow_id,task_id,tool_id=None,followup_id=None):
        """
        create a csv file for the detailed results of a classification task and set up the headers
        :param output_directory:
        :param workflow_id:
        :param task_id:
        :param tool_id: set when the classification is a follow up question to a marking
        :param followup_id: index of the follow up question
        :return:
        """
        # the file name is based on the task label - __get_filename__ makes sure it isn't
        # too long and has no characters which might cause trouble, such as spaces
        fname = self.__get_filename__(workflow_id,task_id,tool_id=tool_id,followup_id=followup_id)

        # record where the detailed results live
        id_ = (task_id,tool_id,followup_id,"detailed")
        self.file_names[id_] = output_directory+fname

        # open the file and add the column headers
        with open(output_directory+fname,"wb") as detailed_results:
            detailed_results.write("subject_id")

            # the answer dictionary is structured differently for follow up questions to markings
            if tool_id is None:
                answer_dict = self.instructions[workflow_id][task_id]["answers"]
            else:
                # a follow up question also gets a column for the cluster id
                detailed_results.write(",cluster_id")

                followup = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_id]
                answer_dict = {answer_key: answer["label"] for answer_key,answer in followup.get("answers", {}).items()}

            # each possible response will have a separate column - this column will be the percentage
            # of people who selected a certain response. This works whether a single response or
            # multiple ones are allowed. sort the keys so the column order is deterministic
            for answer_key in sorted(answer_dict.keys()):
                answer_string = helper_functions.csv_string(answer_dict[answer_key])[:50]
                detailed_results.write(",p("+answer_string+")")

            # the final column gives the number of users
            # for follow up questions - num_users is the number of users with markings in the cluster
            detailed_results.write(",num_users\n")
Beispiel #15
0
    def __marking_row__(self,workflow_id,task_id,subject_id,aggregations,shape):
        """
        write out one detailed csv row per cluster for the given subject and shape
        :param workflow_id:
        :param task_id:
        :param subject_id:
        :param aggregations: clustering results, keyed by e.g. "line clusters"
        :param shape: the shape these clusters belong to
        :return:
        """
        # the detailed csv file was opened earlier and stored under this key
        key = task_id,shape,"detailed"
        for cluster_index,cluster in aggregations[shape + " clusters"].items():
            # "all_users" is bookkeeping, not an actual cluster
            if cluster_index == "all_users":
                continue

            # build up the row bit by bit to have the following structure
            # "subject_id,most_likely_tool,x,y,p(most_likely_tool),p(true_positive),num_users"
            row = str(subject_id)+","
            # todo for now - always give the cluster index
            row += str(cluster_index)+","

            # extract the most likely tool for this particular marking and convert it to
            # a string label
            try:
                tool_classification = cluster["tool_classification"][0].items()
            except KeyError:
                # log the offending shape/cluster before re-raising
                warning(shape)
                warning(cluster)
                raise
            most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
            tool_str = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
            row += helper_functions.csv_string(tool_str) + ","

            # get the central coordinates next
            for center_param in cluster["center"]:
                if isinstance(center_param,list) or isinstance(center_param,tuple):
                    # quote list/tuple parameters so their internal ","s don't break the csv
                    row += "\"" + str(tuple(center_param)) + "\","
                else:
                    row += str(center_param) + ","

            # add on how likely the most likely tool was
            row += str(tool_probability) + ","
            # how likely the cluster is to being a true positive and how many users (out of those who saw this
            # subject) actually marked it. For the most part p(true positive) is equal to the percentage
            # of people, so slightly redundant but allows for things like weighted voting and IBCC in the future
            prob_true_positive = cluster["existence"][0]["1"]
            num_users = cluster["existence"][1]
            row += str(prob_true_positive) + "," + str(num_users)
            self.csv_files[key].write(row+"\n")
Beispiel #16
0
    def __add_summary_row__(self,id_,subject_id,results,answer_dict):
        """
        given a result for a specific subject (and possibly a specific cluster within that specific subject)
        add one row of results to the summary file. that row contains
        subject_id,most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users
        :param id_: key into self.file_names for the summary csv file
        :param subject_id:
        :param results: (votes,num_users) pair - votes maps answer index to vote share
        :param answer_dict: maps answer index to the human readable answer label
        :return:
        """
        votes,num_users = results

        # get the top choice
        try:
            most_likely,top_probability = max(votes.items(), key = lambda x:x[1])
        except ValueError:
            # max() on an empty dict - log the culprit before re-raising
            warning(results)
            raise

        # extract the text corresponding to the most likely answer
        most_likely_label = answer_dict[int(most_likely)]
        # and get rid of any bad characters
        most_likely_label = helper_functions.csv_string(most_likely_label)

        # materialise the vote shares once - this also keeps np.mean/np.median working
        # on python 3, where dict.values() is a view rather than a list
        probabilities = list(votes.values())
        entropy = self.__shannon_entropy__(probabilities)

        mean_p = np.mean(probabilities)
        median_p = np.median(probabilities)

        with open(self.file_names[id_],"a") as results_file:
            results_file.write(str(subject_id)+",")

            # write out details regarding the top choice
            # this might not be a useful value if multiple choices are allowed - in which case just ignore it
            results_file.write(str(most_likely_label)+","+str(top_probability))
            # write out some summaries about the distributions of people's answers
            # again entropy probably only makes sense if only one answer is allowed
            # and mean_p and median_p probably only make sense if multiple answers are allowed
            # so people will need to pick and choose what they want
            results_file.write(","+str(entropy)+","+str(mean_p)+","+str(median_p))
            # finally - how many people have seen this subject for this task
            results_file.write(","+str(num_users)+"\n")
Beispiel #17
0
 def __summary_header_setup__(self,output_directory,workflow_id,fname,task_id,shape):
     """
     all shape aggregation will have a summary file - with one line per subject
     :return:
     """
     # one summary line per subject
     id_ = task_id,shape,"summary"
     self.csv_files[id_] = open(output_directory+fname+"_"+shape+"_summary.csv","wb")
     # collect the column names, then join them into the header at the end
     header_columns = ["subject_id"]
     # only tools whose marking shape matches the requested shape get a column
     for tool_id in sorted(self.instructions[workflow_id][task_id]["tools"].keys()):
         tool_id = int(tool_id)
         # self.workflows[workflow_id][0] holds the classification tasks;
         # index [1] holds the marking tasks, which is what we want here
         if self.workflows[workflow_id][1][task_id][tool_id] == shape:
             tool_label = self.instructions[workflow_id][task_id]["tools"][tool_id]["marking tool"]
             header_columns.append("median(" + helper_functions.csv_string(tool_label) + ")")
     header_columns.append("mean_probability,median_probability,mean_tool,median_tool")
     self.csv_files[id_].write(",".join(header_columns)+"\n")
Beispiel #18
0
    def __classification_header__(self,output_directory,workflow_id,task_id,tool_id=None,followup_id=None):
        """
        create the detailed and summary csv files (with their header rows) for a classification
        task, or for a follow up question to a marking task when tool_id/followup_id are given
        :param output_directory: directory the csv files are written into
        :param workflow_id:
        :param task_id:
        :param tool_id: marking tool index when this is a follow up question, else None
        :param followup_id: follow up question index - required whenever tool_id is given
        """
        assert (tool_id is None) or (followup_id is not None)
        # start with the detailed results
        fname = self.__get_filename__(workflow_id,task_id,tool_id=tool_id,followup_id=followup_id)

        id_ = (workflow_id,task_id,tool_id,followup_id,"detailed")
        self.file_names[id_] = output_directory+fname
        with open(output_directory+fname,"wb") as detailed_results:
            # now write the headers
            detailed_results.write("subject_id")

            if tool_id is not None:
                detailed_results.write(",cluster_id")

            answer_dict = self.instructions[workflow_id][task_id]["answers"]
            for answer_key in sorted(answer_dict.keys()):
                # break this up into multiple lines so we can be sure that the answers are sorted correctly
                # order might not matter in the end, but just to be sure
                answer = answer_dict[answer_key]
                answer_string = helper_functions.csv_string(answer)[:50]
                detailed_results.write(",p("+answer_string+")")

            detailed_results.write(",num_users\n")

        # now setup the summary file
        fname = self.__get_filename__(workflow_id,task_id,summary = True,tool_id=tool_id,followup_id=followup_id)
        id_ = (workflow_id,task_id,tool_id,followup_id,"summary")
        self.file_names[id_] = output_directory+fname
        # keep the summary handle open: it is stored in self.csv_files for later writes,
        # so the original "with" block here was a bug - it closed the file on exit and
        # left a closed handle in self.csv_files (unlike the sibling setup methods,
        # which also keep their summary handles open)
        self.csv_files[id_] = open(output_directory+fname,"wb")
        self.csv_files[id_].write("subject_id,")

        if tool_id is not None:
            self.csv_files[id_].write("cluster_id,")

        self.csv_files[id_].write("most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users\n")
Beispiel #19
0
    def __make_files__(self,workflow_id):
        """
        create all of the files necessary for this workflow
        :param workflow_id:
        :return: the output directory the files were created in
        """
        # delete any reference to previous csv outputs - this means we don't have to worry about using
        # workflow ids in the keys and makes things simpler
        self.file_names = {}

        # now create a sub directory specific to the workflow
        try:
            workflow_name = self.workflow_names[workflow_id]
        except KeyError:
            warning(self.workflows)
            warning(self.workflow_names)
            raise

        # workflow names might have characters (such as spaces) which shouldn't be part of a filename, so clean up the
        # workflow names
        workflow_name = helper_functions.csv_string(workflow_name)
        output_directory = "/tmp/"+str(self.project_id)+"/" +str(workflow_id) + "_" + workflow_name + "/"

        # create the directory if needed - try/except instead of a separate
        # os.path.exists check avoids a race if another process creates the
        # directory in between the check and the makedirs call
        try:
            os.makedirs(output_directory)
        except OSError:
            # an already-existing directory is fine; anything else
            # (e.g. permissions) should propagate
            if not os.path.isdir(output_directory):
                raise
        self.workflow_directories[workflow_id] = output_directory

        # create the csv files for the classification tasks (both simple and follow up ones)
        self.__classification_file_setup__(output_directory,workflow_id)

        # now set things up for the marking tasks
        self.__marking_file_setup__(output_directory,workflow_id)

        self.__survey_file_setup__(output_directory,workflow_id)

        return output_directory
Beispiel #20
0
    def __survey_row__(self,instructions,aggregations):
        """
        for a given workflow, task and subject print one row of aggregations per species found to a csv file
        where the task correspond to a survey task
        :param instructions: survey task instructions - species labels plus follow up questions and answers
        :param aggregations: aggregated results for a single subject
        :return: list of csv row strings (each ending in a newline), one per species reported
        """
        # what we are returning (to be printed out to file elsewhere)
        rows = []

        # in dev - for a small project a few bad aggregations got into the system - so filter them out
        if max(aggregations["num species"]) == 0:
            return []

        # on average, how many species did people see?
        # note - nothing here (or empty or what ever) counts as a species - we just won't give any follow up
        # answer responses
        # .get with a [] default: subjects without this key simply produce no rows
        species_in_subject = aggregations.get("num species in image", [])

        views_of_subject = aggregations["num users"]

        pielou = aggregations["pielou index"]

        # only go through the top X species - where X is the median number of species seen
        for species_id,_ in species_in_subject:
            # "num users" is bookkeeping stored alongside the species entries - skip it
            if species_id == "num users":
                continue

            # how many people voted for this species?
            num_votes = aggregations[species_id]["num votes"]
            percentage = num_votes/float(views_of_subject)

            # extract the species name - just to be sure, make sure that the label is "csv safe"
            species_label = helper_functions.csv_string(instructions["species"][species_id])
            # row starts with a leading comma - the caller presumably prefixes the subject id
            row = "," + str(views_of_subject) + "," + str(pielou) + "," + species_label + "," + str(percentage) + "," + str(num_votes)

            # if there is nothing here - there are no follow up questions so just move on
            # same with FR - fire, NTHNG - nothing
            # NOTE(review): this is a break (not continue), so it ends the whole species loop
            # AND the row built above is never appended to rows - verify this is intentional
            if species_id in ["NTHNGHR","NTHNG","FR"]:
                break

            # do the how many question first
            row += self.__survey_how_many__(instructions,aggregations,species_id)

            # now go through each of the other follow up questions
            # for followup_id in instructions["questionsOrder"]:
            # NOTE(review): iterating questions.keys() rather than questionsOrder - column
            # order depends on dict iteration order; confirm against the header writer
            for ii,followup_id in enumerate(instructions["questions"].keys()):
                followup_question = instructions["questions"][followup_id]

                if followup_question["label"] == "How many?":
                    # this gets dealt with separately
                    continue

                multiple_answers = instructions["questions"][followup_id]["multiple"]

                # this follow up question might not be relevant to the particular species
                if followup_id not in aggregations[species_id]["followup"]:
                    # if we do not allow for multiple answers, include blank columns for top candidate
                    # and corresponding percentage
                    if not multiple_answers:
                        row += ","

                    # add in a blank column for each follow up answer (since none of these answers are relevant)
                    for _ in instructions["questions"][followup_id]["answersOrder"]:
                        row += ","
                else:
                    votes = aggregations[species_id]["followup"][followup_id]

                    # if users are only allowed to pick a single answer - return the most likely answer
                    # but still give the individual break downs

                    if not multiple_answers:
                        votes = aggregations[species_id]["followup"][followup_id]
                        answers =(instructions["questions"][followup_id]["answers"])
                        top_candidate,percent = self.__get_top_survey_followup__(votes,answers)

                        # NOTE(review): percent is computed but deliberately(?) not written -
                        # the blank-column branch above only adds one column, which matches
                        row += "," + str(top_candidate)# + "," + str(percent)

                    # per-answer vote fractions, in the instruction-defined answer order
                    for answer_id in instructions["questions"][followup_id]["answersOrder"]:

                        if answer_id in votes:
                            row += "," + str(votes[answer_id]/float(num_votes))
                        else:
                            row += ",0"

            # print(len(row.split(",")))
            # assert len(row.split(",")) == 58
            rows.append(row+"\n")

        return rows
Beispiel #21
0
    def __survey_row__(self,instructions,aggregations):
        """
        for a given workflow, task and subject print one row of aggregations per species found to a csv file
        where the task correspond to a survey task
        :param instructions: survey task instructions - species labels plus follow up questions and answers
        :param aggregations: aggregated results for a single subject
        :return: list of csv row strings (each ending in a newline), one per species reported
        """
        # what we are returning (to be printed out to file elsewhere)
        rows = []

        # in dev - for a small project a few bad aggregations got into the system - so filter them out
        if max(aggregations["num species"]) == 0:
            return []

        # on average, how many species did people see?
        # note - nothing here (or empty or what ever) counts as a species - we just won't give any follow up
        # answer responses
        species_in_subject = self.__get_species_in_subject(aggregations)

        views_of_subject = aggregations["num users"]

        pielou = self.__calc__pielou__(aggregations)

        # only go through the top X species - where X is the median number of species seen
        for species_id,_ in species_in_subject:
            # "num users" is bookkeeping stored alongside the species entries - skip it
            if species_id == "num users":
                continue

            # how many people voted for this species?
            num_votes = aggregations[species_id]["num votes"]
            # percentage = num_votes/float(views_of_subject)

            # extract the species name - just to be sure, make sure that the label is "csv safe"
            species_label = helper_functions.csv_string(instructions["species"][species_id])
            row = "," + str(views_of_subject) + "," + str(pielou) + "," + species_label + "," + str(num_votes)

            # if there is nothing here - there are no follow up questions so just move on
            # same with FR - fire, NTHNG - nothing
            if species_id in ["NTHNGHR","NTHNG","FR"]:
                break

            # do the how many question first
            row += self.__survey_how_many__(instructions,aggregations,species_id)

            # now go through each of the other follow up questions
            for followup_id in instructions["questionsOrder"]:
                followup_question = instructions["questions"][followup_id]

                if followup_question["label"] == "How many?":
                    # this gets dealt with separately
                    continue

                # this follow up question might not be relevant to the particular species
                if followup_id not in aggregations[species_id]["followup"]:
                    for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                        row += ","
                else:
                    votes = aggregations[species_id]["followup"][followup_id]

                    # if users are only allowed to pick a single answer - return the most likely answer
                    # but still give the individual break downs
                    multiple_answers = instructions["questions"][followup_id]["multiple"]
                    if not multiple_answers:
                        # pass the (answer, count) pairs to the helper WITHOUT rebinding
                        # votes: the original rebound votes to .items() here, which made
                        # the "answer_id in votes" membership test below compare an id
                        # against (key, value) tuples and always report 0 for every answer
                        answers =(instructions["questions"][followup_id]["answers"])
                        top_candidate,percent = self.__get_top_survey_followup__(votes.items(),answers)

                        row += "," + str(top_candidate) + "," + str(percent)

                    # per-answer vote fractions, in the instruction-defined answer order
                    for answer_id in instructions["questions"][followup_id]["answersOrder"]:
                        if answer_id in votes:
                            row += "," + str(votes[answer_id]/float(num_votes))
                        else:
                            row += ",0"

            rows.append(row+"\n")

        return rows