def __set_tags__(self,text):
    # convert to ascii
    try:
        text = text.encode('ascii','ignore')
    except AttributeError:
        warning(text)
        raise

    # good place to check if there is a newline character in the transcription
    # if so, someone tried to transcribe multiple lines at once - this is no longer allowed
    # but there are some legacy transcriptions with \n - such transcriptions are simply ignored
    if "\n" in text:
        return ""

    # the order of the keys matters - we need it to be constant across all use cases
    # we could sort .items() but that would be a rather large statement
    # replace each tag with a single non-standard ascii character (given by chr(num) for some number)
    text = text.strip()

    for key in sorted(self.tags.keys()):
        tag = self.tags[key]
        text = re.sub(tag,chr(key),text)

    # get rid of some other random tags and commands that shouldn't be included at all
    # todo - generalize
    text = re.sub("<br>","",text)
    text = re.sub("<font size=\"1\">","",text)
    text = re.sub("</font>","",text)
    text = re.sub("&nbsp;","",text)
    text = re.sub("&amp;","&",text)
    text = re.sub("\?\?\?","",text)

    return text

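# A minimal sketch (not part of the original module) of the substitution __set_tags__ performs.
# The tag dictionary and sample text below are invented purely for illustration - the real
# mapping lives in self.tags and is defined elsewhere in the codebase.
import re

def _sketch_set_tags(text, tags):
    for key in sorted(tags.keys()):
        text = re.sub(tags[key], chr(key), text)
    return text

# _sketch_set_tags("he<sw-deletion>ll</sw-deletion>o", {200: "<sw-deletion>", 201: "</sw-deletion>"})
# returns "he" + chr(200) + "ll" + chr(201) + "o"
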
def __make_files__(self,workflow_id):
    """
    create all of the files necessary for this workflow
    :param workflow_id:
    :return:
    """
    # close any previously used files (and delete their pointers)
    for f in self.csv_files.values():
        f.close()
    self.csv_files = {}

    # now create a sub directory specific to the workflow
    try:
        workflow_name = self.workflow_names[workflow_id]
    except KeyError:
        warning(self.workflows)
        warning(self.workflow_names)
        raise

    workflow_name = helper_functions.csv_string(workflow_name)
    output_directory = "/tmp/"+str(self.project_id)+"/" +str(workflow_id) + "_" + workflow_name + "/"

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    self.workflow_directories[workflow_id] = output_directory

    classification_tasks,marking_tasks,survey_tasks = self.workflows[workflow_id]

    # go through the classification tasks - they will either be simple c. tasks (one answer allowed),
    # multiple c. tasks (more than one answer allowed), or follow up questions to a marking
    for task_id in classification_tasks:
        # is this task a simple classification task?
        # don't care if the question allows for multiple answers, or requires a single one
        if classification_tasks[task_id] in ["single","multiple"]:
            self.__classification_header__(output_directory,workflow_id,task_id)
        else:
            # this classification task is actually a follow up to a marking task
            for tool_id in classification_tasks[task_id]:
                for followup_id,answer_type in enumerate(classification_tasks[task_id][tool_id]):
                    # instructions = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_index]["question"]
                    self.__classification_header__(output_directory,workflow_id,task_id,tool_id,followup_id)
                    # id_ = (task_id,tool_id,followup_index)
                    # if answer_type == "single":
                    #     self.__single_response_csv_header__(output_directory,id_,instructions)
                    # else:
                    #     self.__multi_response_csv_header__(output_directory,id_,instructions)

    # now set things up for the marking tasks
    for task_id in marking_tasks:
        shapes = set(marking_tasks[task_id])
        self.__marking_header_setup__(workflow_id,task_id,shapes,output_directory)

    # and finally the survey tasks
    for task_id in survey_tasks:
        instructions = self.instructions[workflow_id][task_id]
        self.__survey_header_setup__(output_directory,task_id,instructions)

    return output_directory

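# Hypothetical example (invented task and tool ids) of the two shapes of classification_tasks
# entries handled above:
#   classification_tasks = {
#       "T1": "single",                       # a plain classification question
#       "T4": {0: ["single", "multiple"]}     # follow up questions attached to marking tool 0
#   }
# "T1" goes straight to __classification_header__, while "T4" loops over its tools and their
# follow up questions.
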
def __aggregate__(self,raw_classifications,workflow_id,aggregations):
    # start by looking for empty subjects
    self.to_retire = set()
    for subject_id in raw_classifications["T0"]:
        user_ids,is_subject_empty = zip(*raw_classifications["T0"][subject_id])
        if is_subject_empty != []:
            empty_count = sum([1 for i in is_subject_empty if i == True])

            if empty_count >= 3:
                self.to_retire.add(subject_id)

    blank_retirement = len(self.to_retire)

    non_blanks = []

    # now look to see if everything has been transcribed
    for subject_id in raw_classifications["T3"]:
        user_ids,completely_transcribed = zip(*raw_classifications["T3"][subject_id])

        completely_count = sum([1 for i in completely_transcribed if i == True])
        if completely_count >= 3:
            self.to_retire.add(subject_id)
            non_blanks.append(subject_id)

    # get an updated token
    assert isinstance(self.project,AggregationAPI)
    self.project.__panoptes_connect__()
    token = self.project.token

    for retired_subject in self.to_retire:
        try:
            headers = {"Accept":"application/vnd.api+json; version=1","Content-Type": "application/json", "Authorization":"Bearer "+token}
            params = {"subject_id":retired_subject}
            r = requests.post("https://panoptes.zooniverse.org/api/workflows/"+str(workflow_id)+"/retired_subjects",headers=headers,data=json.dumps(params))
            # rollbar.report_message("results from trying to retire subjects","info",extra_data=r.text)
        except TypeError as e:
            warning(e)
            rollbar.report_exc_info()

    print("we would have retired " + str(len(self.to_retire)))
    print("with non-blanks " + str(len(self.to_retire)-blank_retirement))
    print(str(len(self.to_retire)-blank_retirement))
    self.num_retired = len(self.to_retire)
    self.non_blanks_retired = len(self.to_retire)-blank_retirement

    return aggregations

def __marking_row__(self,workflow_id,task_id,subject_id,aggregations,shape):
    """
    output for line segments
    :param workflow_id:
    :param task_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    key = task_id,shape,"detailed"

    for cluster_index,cluster in aggregations[shape + " clusters"].items():
        if cluster_index == "all_users":
            continue

        # build up the row bit by bit to have the following structure
        # "subject_id,most_likely_tool,x,y,p(most_likely_tool),p(true_positive),num_users"
        row = str(subject_id)+","

        # todo for now - always give the cluster index
        row += str(cluster_index)+","

        # extract the most likely tool for this particular marking and convert it to
        # a string label
        try:
            tool_classification = cluster["tool_classification"][0].items()
        except KeyError:
            warning(shape)
            warning(cluster)
            raise

        most_likely_tool,tool_probability = max(tool_classification, key = lambda x:x[1])
        tool_str = self.instructions[workflow_id][task_id]["tools"][int(most_likely_tool)]["marking tool"]
        row += helper_functions.csv_string(tool_str) + ","

        # get the central coordinates next
        for center_param in cluster["center"]:
            if isinstance(center_param,list) or isinstance(center_param,tuple):
                row += "\"" + str(tuple(center_param)) + "\","
            else:
                row += str(center_param) + ","

        # add on how likely the most likely tool was
        row += str(tool_probability) + ","

        # how likely the cluster is to be a true positive and how many users (out of those who saw this
        # subject) actually marked it. For the most part p(true positive) is equal to the percentage
        # of people, so slightly redundant but allows for things like weighted voting and IBCC in the future
        prob_true_positive = cluster["existence"][0]["1"]
        num_users = cluster["existence"][1]
        row += str(prob_true_positive) + "," + str(num_users)

        self.csv_files[key].write(row+"\n")

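# Illustrative row only (all values invented) for the detailed marking csv written above,
# following the structure described in the comment:
#   subject_id,cluster_index,most_likely_tool,center params...,p(tool),p(true positive),num_users
#   4711,0,underlined_text,102.5,387.0,140.2,143.8,0.91,0.8,5
# tuple-valued center parameters would appear quoted, e.g. "(102.5, 140.2)".
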
def __classification_output__(self,workflow_id,task_id,subject_id,aggregations,shape_id=None,followup_id=None):
    """
    add a row to both the summary and detailed csv output files
    """
    # a dictionary containing the index id of each answer and its corresponding label
    answer_dict = self.instructions[workflow_id][task_id]["answers"]

    # start with the summary file
    id_ = (workflow_id,task_id,shape_id,followup_id,"summary")

    try:
        self.__add_summary_row__(id_,subject_id,aggregations,answer_dict)

        id_ = (workflow_id,task_id,shape_id,followup_id,"detailed")
        self.__add_detailed_row__(id_,subject_id,aggregations,answer_dict)
    except ValueError:
        warning("empty aggregations for workflow id " + str(workflow_id) + " task id " + str(task_id) + " and subject id " + str(subject_id) + " -- skipping")

def __add_summary_row__(self,id_,subject_id,results,answer_dict):
    """
    given a result for a specific subject (and possibly a specific cluster within that specific subject)
    add one row of results to the summary file. that row contains
    subject_id,tool_index,cluster_index,most_likely,p(most_likely),shannon_entropy,mean_agreement,median_agreement,num_users
    tool_index & cluster_index are only there if we have a follow up to a marking task
    :param id_:
    :param subject_id:
    :param results:
    :param answer_dict:
    :return:
    """
    votes,num_users = results

    # get the top choice
    try:
        most_likely,top_probability = max(votes.items(), key = lambda x:x[1])
    except ValueError:
        warning(results)
        raise

    # extract the text corresponding to the most likely answer
    most_likely_label = answer_dict[int(most_likely)]

    # and get rid of any bad characters
    most_likely_label = helper_functions.csv_string(most_likely_label)

    probabilities = votes.values()
    entropy = self.__shannon_entropy__(probabilities)

    mean_p = np.mean(votes.values())
    median_p = np.median(votes.values())

    with open(self.file_names[id_],"a") as results_file:
        results_file.write(str(subject_id)+",")

        # write out details regarding the top choice
        # this might not be a useful value if multiple choices are allowed - in which case just ignore it
        results_file.write(str(most_likely_label)+","+str(top_probability))

        # write out some summaries about the distributions of people's answers
        # again entropy probably only makes sense if only one answer is allowed
        # and mean_p and median_p probably only make sense if multiple answers are allowed
        # so people will need to pick and choose what they want
        results_file.write(","+str(entropy)+","+str(mean_p)+","+str(median_p))

        # finally - how many people have seen this subject for this task
        results_file.write(","+str(num_users)+"\n")

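# __shannon_entropy__ is defined elsewhere in the codebase; below is a minimal sketch of the
# standard formula H = -sum(p * log(p)) it presumably computes over the vote fractions.
# This is an assumption - the project's own implementation may differ (e.g. log base, handling
# of zero probabilities).
import math

def _sketch_shannon_entropy(probabilities):
    return -sum(p * math.log(p) for p in probabilities if p > 0)

# e.g. _sketch_shannon_entropy([0.5, 0.5]) == math.log(2), while a unanimous vote gives 0.0
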
def __cluster__(self,used_shapes,raw_markings,image_dimensions,aggregations):
    """
    :param aggregations: we're working on a subject by subject basis - aggregations is from previous subjects
    """
    if raw_markings == {}:
        warning("skipping")
        return aggregations

    # start by clustering text
    # print("clustering text")
    # cluster_aggregations = {}
    cluster_aggregations = self.text_algorithm.__aggregate__(raw_markings,image_dimensions)
    aggregations = self.__merge_aggregations__(aggregations,cluster_aggregations)

    # print("clustering images")
    image_aggregations = self.image_algorithm.__aggregate__(raw_markings,image_dimensions)
    aggregations = self.__merge_aggregations__(aggregations,image_aggregations)

    return aggregations

def __setup_aligned_text__(self,aligned_text,text_coordinates,user_ids,x1,y1,x2,y2):
    """
    when printing out the individual transcriptions that make up a cluster we need to do a few things
    including sorting them.
    :return:
    """
    # todo - honestly not sure if most of this function is necessary
    new_aligned = []
    for t in aligned_text:
        # todo - figure out if this is necessary or useful
        if t is None:
            warning("text was none - really not sure why but skipping")
            continue

        # put tags back into multicharacter format
        t = self.__reset_tags__(t)
        # instead of chr(24), use "\u0018" - postgres prefers that
        new_aligned.append(t.replace(chr(24),unicode("\u0018")))

    # if the text is horizontal - i.e. the angle of the center is less than 45 degrees
    # sort the aligned text by x coordinates - otherwise sort by DECREASING y coordinates
    # (since 0,0 is at the top left)
    try:
        tan_theta = math.fabs(y1-y2)/math.fabs(x1-x2)
        theta = math.atan(tan_theta)
    except ZeroDivisionError:
        theta = math.pi/2.

    # horizontal
    # pretty sure that X1 < X2 but don't want to make an assumption
    if math.fabs(theta) <= math.pi/4.:
        starting_coordinates = [min(x1,x2) for x1,x2,_,_ in text_coordinates]
    # vertical text
    # not so sure about whether Y1 < Y2 so playing it safe
    else:
        starting_coordinates = [-max(y1,y2) for _,_,y1,y2 in text_coordinates]

    text_and_ids_with_coordinates = zip(starting_coordinates,new_aligned,user_ids)

    # sort
    text_and_ids_with_coordinates.sort(key = lambda x:x[0])
    _,aligned_text,user_id = zip(*text_and_ids_with_coordinates)

    return aligned_text

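# Worked example (invented coordinates) of the horizontal-vs-vertical test above:
# a line from (10, 100) to (200, 110) gives tan(theta) = |100 - 110| / |10 - 200| ~= 0.053,
# so theta ~= 3 degrees <= 45 degrees and the transcriptions are sorted left to right by x;
# a line from (10, 100) to (12, 300) gives theta ~= 89 degrees, so they are sorted
# top to bottom by (negated) y instead.
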
def __cluster__(self,used_shapes,raw_markings,image_dimensions):
    """
    for when I want to see raw classifications in addition to markings
    :param workflow_id:
    :return:
    """
    if raw_markings == {}:
        warning("warning - empty set of images")
        return {}

    # start by clustering text
    print("clustering text")
    cluster_aggregation = self.text_algorithm.__aggregate__(raw_markings,image_dimensions)
    print("clustering images")
    image_aggregation = self.image_algorithm.__aggregate__(raw_markings,image_dimensions)

    cluster_aggregation = self.__merge_aggregations__(cluster_aggregation,image_aggregation)

    return cluster_aggregation

def __line_alignment__(self,lines):
    """
    align the text by using MAFFT
    :param lines:
    :return:
    """
    aligned_text = []

    if len(lines) == 1:
        return lines

    with tempfile.NamedTemporaryFile(suffix=".fasta") as in_file, tempfile.NamedTemporaryFile("r") as out_file:
        for line in lines:
            if isinstance(line,tuple):
                # we have a list of text segments which we should join together
                line = "".join(line)

            # line = unicodedata.normalize('NFKD', line).encode('ascii','ignore')
            # assert isinstance(line,str)
            # for i in range(max_length-len(line)):
            #     fasta_line += "-"

            try:
                in_file.write(">\n"+line+"\n")
            except UnicodeEncodeError:
                warning(line)
                warning(unicodedata.normalize('NFKD', line).encode('ascii','ignore'))
                raise

        in_file.flush()

        # todo - play around with gap penalty --op 0.5
        t = "mafft --op 0.85 --text " + in_file.name + " > " + out_file.name + " 2> /dev/null"
        # t = "mafft --text " + in_file.name + " > " + out_file.name + " 2> /dev/null"
        os.system(t)

        cumulative_line = ""
        for line in out_file.readlines():
            if line == ">\n":
                if cumulative_line != "":
                    aligned_text.append(cumulative_line)
                    cumulative_line = ""
            else:
                cumulative_line += line[:-1]

        if cumulative_line == "":
            warning(lines)
            assert False

        aligned_text.append(cumulative_line)

    # no idea why mafft seems to just include this line in the output
    # also might just be affecting Greg's computer
    if aligned_text[0] == '/usr/lib/mafft/lib/mafft':
        return aligned_text[1:]
    else:
        return aligned_text

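# Rough sketch (invented strings) of the round trip above: each transcription is written as an
# unnamed FASTA record, MAFFT pads the records with "-" so they all have the same length, and
# the padded strings are read back in order. The exact gap placement is up to MAFFT.
#
#   input .fasta             aligned output
#   >                        >
#   the cat sat              the cat sat-----
#   >                        >
#   the ct sat down          the c-t sat down
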
def __make_files__(self,workflow_id):
    """
    create all of the files necessary for this workflow
    :param workflow_id:
    :return:
    """
    # delete any reference to previous csv outputs - this means we don't have to worry about using
    # workflow ids in the keys and makes things simpler
    self.file_names = {}

    # now create a sub directory specific to the workflow
    try:
        workflow_name = self.workflow_names[workflow_id]
    except KeyError:
        warning(self.workflows)
        warning(self.workflow_names)
        raise

    # workflow names might have characters (such as spaces) which shouldn't be part of a filename,
    # so clean up the workflow names
    workflow_name = helper_functions.csv_string(workflow_name)
    output_directory = "/tmp/"+str(self.project_id)+"/" +str(workflow_id) + "_" + workflow_name + "/"

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    self.workflow_directories[workflow_id] = output_directory

    # create the csv files for the classification tasks (both simple and follow up ones)
    self.__classification_file_setup__(output_directory,workflow_id)

    # now set things up for the marking tasks
    self.__marking_file_setup__(output_directory,workflow_id)

    self.__survey_file_setup__(output_directory,workflow_id)

    return output_directory

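# For example (invented ids), project 245 with workflow 121 named "Transcribe Text!" would get
# its csv files under something like
#   /tmp/245/121_Transcribe_Text/
# assuming helper_functions.csv_string strips or replaces the awkward characters - the exact
# sanitisation is defined elsewhere in the codebase.
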
def __aggregate__(self,raw_classifications,workflow,aggregations,workflow_id):
    """
    classification aggregation for annotate/folger means looking for subjects which we can retire
    :param raw_classifications:
    :param workflow:
    :param aggregations:
    :param workflow_id:
    :return:
    """
    if not isinstance(workflow_id, int):
        raise TypeError('workflow_id must be an int')

    to_retire = set()

    # start by looking for empty subjects
    # "T0" really should always be there but we may have a set of classifications (really old ones before
    # the workflow changed) where it is missing - if "T0" isn't there, just skip
    if "T0" in raw_classifications:
        to_retire.update(self.__get_blank_subjects__(raw_classifications))

    # now look to see what has been completely transcribed
    if "T3" in raw_classifications:
        to_retire.update(self.__get_completed_subjects__(raw_classifications))

    # call the Panoptes API to retire these subjects
    # get an updated token
    time_delta = datetime.datetime.now()-self.token_date

    # update every 30 minutes
    if time_delta.seconds > (30*60):
        self.token_date = datetime.datetime.now()
        if not isinstance(self.project, AggregationAPI):
            raise TypeError('self.project must be an AggregationAPI instance')
        self.project.__panoptes_connect__()

    token = self.project.token

    self.total_retired += len(to_retire)

    # need to retire the subjects one by one
    for retired_subject in to_retire:
        self.to_retire.add(retired_subject)

        try:
            headers = {"Accept":"application/vnd.api+json; version=1","Content-Type": "application/json", "Authorization":"Bearer "+token}
            params = {"subject_id":retired_subject}
            r = requests.post("https://panoptes.zooniverse.org/api/workflows/"+str(workflow_id)+"/retired_subjects",headers=headers,data=json.dumps(params))
            r.raise_for_status()
        except TypeError as e:
            warning(e)
            rollbar.report_exc_info()

    # if to_retire != set():
    #     print("total retired so far " + str(self.total_retired))

    # print("we would have retired " + str(len(self.to_retire)))
    # print("with non-blanks " + str(len(self.to_retire)-blank_retirement))
    # print(str(len(self.to_retire)-blank_retirement))
    #
    # self.num_retired = len(self.to_retire)
    # self.non_blanks_retired = len(self.to_retire)-blank_retirement

    return aggregations

            }
        }
    },
    ReplyToAddresses=[
        '*****@*****.**',
    ],
    ReturnPath='*****@*****.**'
)
print("response from emailing results")
print(response)


if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:],"shi:e:d:",["summary","project_id=","environment=","end_date="])
    except getopt.GetoptError:
        warning('transcription.py -i <project_id> -e: <environment> -d: <end_date>')
        sys.exit(2)

    environment = os.environ.get('ENVIRONMENT', 'development')
    project_id = None
    end_date = None
    summary = False

    for opt, arg in opts:
        if opt in ["-i","--project_id"]:
            project_id = int(arg)
        elif opt in ["-e","--environment"]:
            environment = arg
        elif opt in ["-d","--end_date"]:
            end_date = parser.parse(arg)
        elif opt in ["-s","--summary"]:
            summary = True

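# Example invocation (invented values), based only on the getopt option string above:
#   python transcription.py -i 245 -e production -d "2016-06-01" -s
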
def __subject_output__(self,workflow_id,subject_id,aggregations):
    """
    add csv rows for all the output related to this particular workflow/subject_id
    :param workflow_id:
    :param subject_id:
    :param aggregations:
    :return:
    """
    classification_tasks,marking_tasks,survey_tasks = self.workflows[workflow_id]

    for task_id,task_type in classification_tasks.items():
        # a subject might not have results for all tasks
        if task_id not in aggregations:
            continue

        # we have follow up questions
        if isinstance(task_type,dict):
            for tool_id in task_type:
                for followup_index,answer_type in enumerate(task_type[tool_id]):
                    # what sort of shape are we looking for - help us find relevant clusters
                    shape = self.workflows[workflow_id][1][task_id][tool_id]

                    for cluster_index,cluster in aggregations[task_id][shape + " clusters"].items():
                        if cluster_index == "all_users":
                            continue

                        classification = cluster["tool_classification"][0]
                        most_likely_tool,_ = max(classification.items(),key = lambda x:x[1])

                        # only consider clusters which most likely correspond to the correct tool
                        if int(most_likely_tool) != int(tool_id):
                            continue

                        possible_answers = self.instructions[workflow_id][task_id]["tools"][tool_id]["followup_questions"][followup_index]["answers"]

                        if "followup_question" not in aggregations[task_id][shape + " clusters"][cluster_index]:
                            print("missing follow up response")
                            continue

                        try:
                            results = aggregations[task_id][shape + " clusters"][cluster_index]["followup_question"][str(followup_index)]
                        except KeyError:
                            warning(aggregations[task_id][shape + " clusters"][cluster_index])
                            raise

                        id_ = task_id,tool_id,followup_index

                        if answer_type == "single":
                            self.__single_choice_classification_row__(possible_answers,id_,subject_id,results,cluster_index)
                        else:
                            self.__multi_choice_classification_row__(possible_answers,id_,subject_id,results,cluster_index)
        else:
            results = aggregations[task_id]
            self.__classification_output__(workflow_id,task_id,subject_id,results)

    for task_id,possible_shapes in marking_tasks.items():
        for shape in set(possible_shapes):
            # not every task has been done for every aggregation
            if task_id in aggregations:
                if shape == "polygon":
                    self.__polygon_row__(workflow_id,task_id,subject_id,aggregations[task_id])
                    self.__polygon_summary_output__(workflow_id,task_id,subject_id,aggregations[task_id])
                else:
                    self.__marking_row__(workflow_id,task_id,subject_id,aggregations[task_id],shape)
                    self.__shape_summary_output__(workflow_id,task_id,subject_id,aggregations,shape)

    for task_id in survey_tasks:
        instructions = self.instructions[workflow_id][task_id]

        # id_ = (task_id,"summary")
        # with open(self.file_names[id_],"a") as f:
        #     summary_line = self.__survey_summary_row(aggregations)
        #     f.write(str(subject_id)+summary_line)

        id_ = (task_id,"detailed")
        with open(self.file_names[id_],"a") as f:
            detailed_lines = self.__survey_row__(instructions,aggregations)
            for l in detailed_lines:
                f.write(str(subject_id)+l)

def __restructure_json__(self):
    print("restructuring json results")
    workflow_id = self.workflows.keys()[0]

    cur = self.postgres_session.cursor()
    stmt = "select subject_id,aggregation from aggregations where workflow_id = " + str(workflow_id)
    cur.execute(stmt)

    new_json = {}

    subjects_with_results = 0

    for ii,(subject_id,aggregation) in enumerate(cur.fetchall()):
        # if subject_id not in self.classification_alg.to_retire: continue
        try:
            clusters_by_line = {}

            if isinstance(aggregation,str):
                print("converting aggregation from string")
                aggregation = json.loads(aggregation)

            for key,cluster in aggregation["T2"]["text clusters"].items():
                if key == "all_users":
                    continue

                if isinstance(cluster,str):
                    warning("cluster is in string format for some reason")
                    cluster = json.loads(cluster)

                try:
                    # for dev only since we may not have updated every transcription
                    if cluster["cluster members"] == []:
                        continue
                except TypeError:
                    warning(cluster)
                    raise

                index = cluster["set index"]
                # text_y_coord.append((cluster["center"][2],cluster["center"][-1]))

                if index not in clusters_by_line:
                    clusters_by_line[index] = [cluster]
                else:
                    clusters_by_line[index].append(cluster)

            cluster_set_coordinates = {}

            for set_index,cluster_set in clusters_by_line.items():
                # clusters are based on purely horizontal lines so we don't need to take the
                # average or anything like that.
                # todo - figure out what to do with vertical lines, probably keep them completely separate
                cluster_set_coordinates[set_index] = cluster_set[0]["center"][2]

            sorted_sets = sorted(cluster_set_coordinates.items(), key = lambda x:x[1])

            for set_index,_ in sorted_sets:
                cluster_set = clusters_by_line[set_index]

                # now on the (slightly off) chance that there are multiple clusters for this line, sort them
                # by x coordinates
                line = [(cluster["center"][0],cluster["center"][-1]) for cluster in cluster_set]
                line.sort(key = lambda x:x[0])
                _,text = zip(*line)

                text = list(text)

                # for combining the possible multiple clusters for this line into one
                merged_line = ""
                for t in text:
                    # think that storing in postgres converts from str to unicode
                    # for general display, we don't need ord(24) ie skipped characters
                    new_t = t.replace(chr(24),"")
                    merged_line += new_t

                # we seem to occasionally get lines that are just skipped characters (i.e. the string
                # is just chr(24)) - don't report these lines
                if merged_line != "":
                    # is this the first line we've encountered for this subject?
                    if subject_id not in new_json:
                        new_json[subject_id] = {"text":[],"individual transcriptions":[], "accuracy":[], "coordinates" : [],"users_per_line":[]}

                        # add in the metadata
                        metadata = self.__get_subject_metadata__(subject_id)["subjects"][0]["metadata"]
                        new_json[subject_id]["metadata"] = metadata

                        new_json[subject_id]["zooniverse subject id"] = subject_id

                    # add in the line of text
                    new_json[subject_id]["text"].append(merged_line)

                    # now add in the aligned individual transcriptions
                    # use the first cluster we found for this line as a "representative cluster"
                    rep_cluster = cluster_set[0]

                    zooniverse_ids = []
                    for user_id in rep_cluster["cluster members"]:
                        zooniverse_login_name = self.__get_login_name__(user_id)

                        # todo - not sure why None can be returned but does seem to happen
                        if zooniverse_login_name is not None:
                            # new_json[subject_id]["users_per_line"].append(zooniverse_login_name)
                            zooniverse_ids.append(zooniverse_login_name)
                        else:
                            zooniverse_ids.append("None")

                    # todo - if a line is transcribed completely but in distinct separate parts
                    # todo - this may cause trouble
                    new_json[subject_id]["individual transcriptions"].append(rep_cluster["aligned_text"])

                    new_json[subject_id]["users_per_line"].append(zooniverse_ids)

                    # what was the accuracy for this line?
                    accuracy = len([c for c in merged_line if ord(c) != 27])/float(len(merged_line))
                    new_json[subject_id]["accuracy"].append(accuracy)

                    # add in the coordinates
                    # this is only going to work with horizontal lines
                    line_segments = [cluster["center"][:-1] for cluster in cluster_set]
                    x1,x2,y1,y2 = zip(*line_segments)

                    # find the line segments which define the start and end of the line overall
                    x_start = min(x1)
                    x_end = max(x2)

                    start_index = np.argmin(x1)
                    end_index = np.argmax(x2)

                    y_start = y1[start_index]
                    y_end = y1[end_index]

                    new_json[subject_id]["coordinates"].append([x_start,x_end,y_start,y_end])

            # count once per subject
            subjects_with_results += 1
        except KeyError:
            pass

    json.dump(new_json,open("/tmp/"+str(self.project_id)+".json","wb"))

    aws_tar = self.__get_aws_tar_name__()
    print("saving json results")
    with tarfile.open("/tmp/"+aws_tar,mode="w") as t:
        t.add("/tmp/"+str(self.project_id)+".json")

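# A rough sketch (all values invented) of one per-subject entry that __restructure_json__
# builds - the key names come from the code above, everything else is illustrative:
_example_subject_entry = {
    "zooniverse subject id": 1234567,
    "metadata": {"file_name": "page_001.jpg"},            # copied from the Panoptes subject
    "text": ["firste line of texte"],                     # one aggregated string per line
    "individual transcriptions": [["firste line of texte",
                                   "f-rste line of texte"]],  # aligned per-user strings
    "users_per_line": [["login_a", "login_b", "None"]],
    "accuracy": [0.95],        # fraction of characters that are not the chr(27) "unsure" marker
    "coordinates": [[10.0, 410.0, 55.0, 57.0]],           # x_start, x_end, y_start, y_end
}
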
class FolgerClustering(TextClustering):
    def __init__(self, shape, project, param_dict):
        TextClustering.__init__(self, shape, project, param_dict)

        self.folger_safe_tags = dict()

        # for folger the tags in the transcriptions are not actually the tags that folger wants
        for key, tag in self.tags.items():
            self.folger_safe_tags[key] = tag.replace("sw-", "")

        self.total = 0
        self.error = 0

    def __accuracy__(self, s):
        assert isinstance(s, str)
        assert len(s) > 0
        return sum([1 for c in s if c != "-"]) / float(len(s))

    def __reset_tags__(self, text):
        """
        with text, we will have tags represented by a single character (with ord() > 128 to indicate
        that something is special). Convert these back to the full text representation.
        also take care of folger specific stuff
        :param text:
        :return:
        """
        assert type(text) in [str, unicode]

        # reverse_map = {v: k for k, v in self.tags.items()}
        # also go with something different for "not sure"
        # this matters when the function is called on the aggregate text
        # reverse_map[200] = chr(27)
        # and for gaps inserted by MAFFT
        # reverse_map[201] = chr(24)

        ret_text = ""

        for c in text:
            if ord(c) > 128:
                ret_text += self.folger_safe_tags[ord(c)]
            else:
                ret_text += c

        return ret_text

    def __find_completed_components__(self, aligned_text, coordinates):
        """
        go through the aggregated text looking for subsets where at least 3 people have transcribed everything
        :param aligned_text:
        :param coordinates:
        :return:
        """
        completed_indices = []

        for char_index in range(len(aligned_text[0])):
            num_char = len([1 for text in aligned_text if ord(text[char_index]) != 25])

            if num_char >= 3:
                completed_indices.append(char_index)

        starting_points = {}
        ending_points = {}
        # transcription_range = {}

        # find consecutive blocks
        if completed_indices != []:
            # find the contiguous blocks of completed transcriptions
            blocks = [[completed_indices[0]], ]

            for i, char_index in list(enumerate(completed_indices))[1:]:
                # do we have a jump - if so, start a new block
                if completed_indices[i - 1] != (char_index - 1):
                    blocks[-1].append(completed_indices[i - 1])
                    blocks.append([char_index])

            # if the last character started a new block (kinda weird but happens)
            # then we have a block only one character long - skip it
            blocks[-1].append(completed_indices[-1])
            if blocks[-1][0] == blocks[-1][1]:
                blocks = blocks[:-1]

            # technically we can have multiple transcriptions from the same user so
            # instead of user_index, I'll use transcription_index
            # also, technically the same user could transcribe the same piece of text twice (or more)
            # and include those transcriptions in different annotations.
            # Going to assume that doesn't happen.
            for transcription_index, (text, coord) in enumerate(zip(aligned_text, coordinates)):
                x1, x2, y1, y2 = coord

                non_space_characters = [i for (i, c) in enumerate(text) if ord(c) != 25]
                first_char = min(non_space_characters)
                last_char = max(non_space_characters)

                # transcription_range[transcription_index] = (first_char,last_char)

                # look for transcriptions which exactly match up with the completed segment
                # match on either starting OR ending point matching up
                # we'll use these transcriptions to determine where to place the red dots
                # telling people to no longer transcribe that text
                # such transcriptions may not exist - in which case we cannot really do anything
                for b in blocks:
                    b = tuple(b)

                    # does the start of the transcription match up with the start of the completed segment?
                    if b[0] == first_char:
                        if b in starting_points:
                            starting_points[b].append((x1, y1))
                        else:
                            starting_points[b] = [(x1, y1)]

                    # does the end of the transcription match up with the end of the completed segment?
                    if b[1] == last_char:
                        if (first_char, last_char) in ending_points:
                            ending_points[b].append((x2, y2))
                        else:
                            ending_points[b] = [(x2, y2)]

        return starting_points, ending_points

    def __create_clusters__(self, (starting_points, ending_points), aggregated_text, cluster_index, aligned_text, variants, user_ids, text_coordinates):
        """
        take the aggregated text, split it up into completed components and make a result (aggregate)
        cluster for each of those components
        :param aggregated_text:
        :param transcription_range: where (relative to the aggregate text) each transcription string starts and stops -
        useful for differentiating between gap markers before or after the text and gaps inside the text
        :param markings: the original markings - without the tags tokenized
        :return:
        """
        clusters = []

        # go through every segment that is considered done
        for (lb, ub) in starting_points:
            # not sure how likely this is to happen, but just to be sure
            # make sure that we have both a starting and ending point
            if (lb, ub) not in ending_points:
                continue

            new_cluster = {}

            X1, Y1 = zip(*starting_points[(lb, ub)])
            X2, Y2 = zip(*ending_points[(lb, ub)])

            x1 = np.median(X1)
            x2 = np.median(X2)
            y1 = np.median(Y1)
            y2 = np.median(Y2)

            completed_text = self.__reset_tags__(aggregated_text[lb:ub + 1])

            # chr(26) means not enough people have transcribed at a given position
            # but we specifically chose this substring as a substring where all the characters have
            # been transcribed by enough people. So sanity check
            assert chr(26) not in completed_text
            assert isinstance(completed_text, str)

            new_cluster["center"] = (x1, x2, y1, y2, completed_text)

            # new_cluster["cluster members"] = list(user_ids)
            new_cluster["individual points"] = zip(X1, Y1, X2, Y2)
            # print(new_cluster["individual points"])
            # assert False

            new_cluster["set index"] = cluster_index

            new_aligned = []
            for t in aligned_text:
                # todo - figure out if this is necessary or useful
                if t is None:
                    warning("text was none - really not sure why but skipping")
                    continue

                # put tags back into multicharacter format
                t = self.__reset_tags__(t)
                # instead of chr(24), use "\u0018" - postgres prefers that
                new_aligned.append(t.replace(chr(24), unicode("\u0018")))

            # if the text is horizontal - i.e. the angle of the center is less than 45 degrees
            # sort the aligned text by x coordinates - otherwise sort by DECREASING y coordinates
            # (since 0,0 is at the top left)
            try:
                tan_theta = math.fabs(y1 - y2) / math.fabs(x1 - x2)
                theta = math.atan(tan_theta)
            except ZeroDivisionError:
                theta = math.pi / 2.
            # horizontal
            # pretty sure that X1 < X2 but don't want to make an assumption
            if math.fabs(theta) <= math.pi / 4.:
                starting_coordinates = [min(x1, x2) for x1, x2, _, _ in text_coordinates]
            # vertical text
            # not so sure about whether Y1 < Y2 so playing it safe
            else:
                starting_coordinates = [-max(y1, y2) for _, _, y1, y2 in text_coordinates]

            text_and_ids_with_coordinates = zip(starting_coordinates, new_aligned, user_ids)

            # sort
            text_and_ids_with_coordinates.sort(key=lambda x: x[0])
            _, aligned_text, user_id = zip(*text_and_ids_with_coordinates)

            new_cluster["aligned_text"] = aligned_text

            new_cluster["cluster members"] = user_ids
            new_cluster["num users"] = len(new_cluster["cluster members"])

            new_cluster["variants"] = []

            # since a simple spelling mistake can count as a variant, look for cases where at least
            # two people have given the same variant
            variant_count = dict()
            for variant_list in variants:
                for v in variant_list:
                    if v not in variant_count:
                        variant_count[v] = 1
                    else:
                        variant_count[v] += 1

                    if variant_count[v] == 2:
                        new_cluster["variants"].append(v)

            clusters.append(new_cluster)

        return clusters

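# Small worked example (invented variants) of the "at least two people" rule above:
# with variants = [["ye"], ["ye", "yt"], ["ye"]], "ye" reaches a count of 2 on its second
# occurrence and is appended to new_cluster["variants"] exactly once; "yt" is only seen once,
# so it is treated as a likely one-off (e.g. a spelling mistake) and dropped.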