def __agglomerative__clustering__(self,markings,reduced_markings,user_ids):
    """
    Experimental single-linkage agglomerative clustering of transcribed text lines.

    NOTE(review): as written this method never returns. It prints diagnostics and then hits an
    unconditional `assert False` right after the scipy `linkage` call, so everything from the
    cluster-tree walk onwards is currently unreachable.

    :param markings: raw markings; `marking[:-1]` is stored as the line segment coordinates
        (presumably the last element is the transcribed text - confirm against the caller)
    :param reduced_markings: iterable of (intercept, slope, text) triples
    :param user_ids: not used by this method
    :return: (unreachable tail) the capped clusters whose "num users" is at least 3
    :raises IndexError: if no transcription contains "finished" (`l[0]` below)
    :raises AssertionError: always, once execution reaches the `assert False`
    """
    # start by splitting the reduced markings into intercepts, slopes and text
    intercepts,slopes,text = zip(*reduced_markings)

    # deal with special characters in the text and "recombine" the markings
    # text has capital letters used only for special characters/tags
    # while capitalized_text has the original capitalization which is useful for the final aggregate result
    text,capitalized_text = zip(*[self.__set_special_characters__(t) for t in text])
    reduced_markings = zip(intercepts,slopes,text)

    # debugging aid: indices of every transcription that contains the word "finished"
    l = []
    for ii,t in enumerate(text):
        if "finished" in t:
            l.append(ii)
    print "==="

    # normalize the slopes and intercepts
    normalized_intercepts,normalized_slopes = self.__normalize_lines__(intercepts,slopes)
    # relies on Python 2 zip returning a list (len() is taken below)
    pts_list = zip(normalized_intercepts,normalized_slopes)

    # do agglomerative clustering
    # the pandas dataframe seems necessary but not totally sure
    # see http://stackoverflow.com/questions/18952587/use-distance-matrix-in-scipy-cluster-hierarchy-linkage
    labels = range(len(pts_list))
    variables = ["X","Y"]
    df = pd.DataFrame(list(pts_list),columns=variables, index=labels)
    # square matrix of pairwise euclidean distances between the normalized lines
    row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')), columns=labels, index=labels)

    # debugging aid: print the ten transcriptions closest to the first "finished" line
    # (the line itself comes first, at distance 0); raises IndexError when l is empty
    distances = squareform(pdist(df, metric='euclidean'))[l[0]]
    distances = zip(range(len(distances)),distances)
    # in-place sort relies on Python 2 zip returning a list
    distances.sort(key = lambda x:x[1])
    print distances
    for ii,d in distances[:10]:
        print text[ii]
    print "==-----"

    # NOTE(review): scipy's linkage treats a 2-D input as observation vectors, not as a square
    # distance matrix (this is what the stackoverflow question above is about) - confirm whether
    # the condensed pdist output should be passed here instead of row_dist
    agglomerations = linkage(row_dist, method='single')

    print len(pts_list)
    print len(agglomerations)
    # debugging stop - nothing below this line is ever executed
    assert False

    # clusters will be a list representation of the tree that results from merging clusters
    # as determined by agglomerations
    # the first len(markings) entries are the singleton clusters; exactly one entry is appended per
    # merge, which matches scipy's convention that the i-th merge creates cluster number n+i
    clusters = []

    for m,M,raw_transcription in zip(reduced_markings,capitalized_text,markings):
        # coordinates will store the xy coordinates of the line segment
        c = {"cluster members":[m,],"individual transcriptions":[M,], "coordinates":[raw_transcription[:-1]]}
        clusters.append(c)

    # go through the cluster mergers given by the scipy algorithm
    # and decide whether to go with them or not
    for merge in agglomerations:
        rchild_index = int(merge[0])
        lchild_index = int(merge[1])
        print lchild_index,rchild_index

        # None => we have already capped every path in the subtree
        # i.e. on any path from this node down to the root, we will encounter a capped cluster
        # (NOTE(review): "root" in the line above presumably means the leaves - confirm)
        # so if both nodes are capped, just add None and continue
        if (clusters[rchild_index] is None) and (clusters[lchild_index] is None):
            clusters.append(None)
            continue
        # if only one node is None, cap the node that is not None
        elif clusters[rchild_index] is None:
            clusters[lchild_index] = self.__cap_cluster__(clusters[lchild_index])
            clusters.append(None)
            continue
        elif clusters[lchild_index] is None:
            clusters[rchild_index] = self.__cap_cluster__(clusters[rchild_index])
            clusters.append(None)
            continue

        # a capped cluster always has None appended as its parent, so it should never be merged again
        assert "center" not in clusters[lchild_index]
        assert "center" not in clusters[rchild_index]

        # todo - I think this use of cluster members is consistent - but not consistent with code
        # todo - else where - won't cause a bug, but could cause some confusion
        # each cluster member is an (intercept, slope, text) triple - only the text is needed here
        _,_,transcriptions = zip(*(clusters[rchild_index]["cluster members"]))
        _,_,transcriptions_left = zip(*(clusters[lchild_index]["cluster members"]))

        # convert to list
        transcriptions = list(transcriptions)
        transcriptions.extend(transcriptions_left)

        # how well do the transcriptions of the two children agree with each other?
        aligned_transcriptions = self.__line_alignment__(transcriptions)
        accuracy = self.__agreement__(aligned_transcriptions)

        # if the minimum accuracy is reasonably high, then we will want to combine them
        if min(accuracy) >= 0.6:
            # deepcopy so that extending the merged cluster does not modify the right child
            new_cluster = deepcopy(clusters[rchild_index])
            new_cluster["cluster members"].extend(clusters[lchild_index]["cluster members"])
            new_cluster["individual transcriptions"].extend(clusters[lchild_index]["individual transcriptions"])
            new_cluster["coordinates"].extend(clusters[lchild_index]["coordinates"])
            clusters.append(new_cluster)
        else:
            # the accuracy of the combined cluster is low enough that we do not want combine
            # instead, we'll "cap" each of the clusters - by giving it a center value
            # and appending None to the list of clusters
            clusters[rchild_index] = self.__cap_cluster__(clusters[rchild_index])
            clusters[lchild_index] = self.__cap_cluster__(clusters[lchild_index])
            clusters.append(None)

    # keep only the capped clusters (the ones with a "center") that have at least 3 users
    # NOTE(review): a cluster that is still uncapped after the last merge has no "center"
    # and is therefore silently left out - confirm that this is intended
    capped_clusters = []
    for c in clusters:
        if (c is not None) and ("center" in c):
            if c["num users"] >= 3:
                print "***"
                capped_clusters.append(c)
            else:
                # debugging output for the clusters that are being discarded
                print hesse_line_reduction([c["center"],])[0][:2],c["center"][-1]
    return capped_clusters
def __inner_fit__(self,markings,user_ids,tools,reduced_markings):
    """
    Cluster transcribed lines: order them by position, group consecutive lines whose text
    agrees, merge nearby groups and return one aggregate transcription per group.

    NOTE(review): a second definition of `__inner_fit__` with the same signature follows this one
    in the file. If both live in the same class body the later one wins and this one is dead code
    (the only difference is the debugging `print reduced_markings` below) - confirm and delete.

    :param markings: raw markings, index-aligned with user_ids and reduced_markings;
        `marking[-1]` is the (unicode) text and `marking[:-1]` the raw point, which is later
        unpacked as four values named x1,x2,y1,y2
    :param user_ids: the user who made each marking
    :param tools: not used by this method
    :param reduced_markings: (dist, theta, text) triples, one per marking
    :return: (results, 0) where results is a list of dicts with the keys "center",
        "cluster members", "tools" and "num users"
    """
    # we want to first cluster just on dist and theta - ignoring the text contents
    # i.e. cluster just on points, not on text
    print reduced_markings
    dist_l,theta_l,text_l = zip(*reduced_markings)
    reduced_markings_without_text = zip(dist_l,theta_l)
    ordering = self.__fit2__(reduced_markings_without_text,user_ids)

    # use the below 2 to build up each cluster - both are keyed by user
    current_lines = {}
    current_pts = {}
    clusters = []

    for a,b in ordering:
        # a - line values - "intercept" and slope
        # NOTE(review): index() returns the first match, so if two markings reduce to the same
        # (dist, theta) pair the later one is attributed to the earlier marking's user and text
        user_index = reduced_markings_without_text.index(a)
        user = user_ids[user_index]

        # extract the corresponding text and the raw (unmapped) point
        text = markings[user_index][-1]
        raw_pt = markings[user_index][:-1]

        if "\n" in text:
            print "multiline - skipping"
            continue

        # convert from unicode to ascii - non-ascii characters are dropped
        assert isinstance(text,unicode)
        text = text.encode('ascii','ignore')

        # strip out the markup tags, together with the text they enclose where there is a closing tag
        # NOTE(review): ".*" is greedy - with two tagged spans on one line everything between the
        # first opening tag and the last closing tag is removed - confirm that this is intended
        text = re.sub("\[deletion\].*\[/deletion\]","",text)
        text = re.sub(r'\[deletion\].*\[\\deletion\]',"",text)
        text = re.sub("\[illegible\].*\[/illegible\]","",text)
        text = re.sub(r'\[deletionhas\]\[/deletion\]',"",text)
        text = re.sub("\[insertion\].*\[/insertion\]","",text)
        text = re.sub("\[underline\].*\[/underline\]","",text)
        text = re.sub("\[notenglish\].*\[/notenglish\]","",text)
        text = re.sub(r'\[has\]',"",text)
        text = re.sub(r'\(deleted\)',"",text)
        text = re.sub(r'\[deletion\]',"",text)
        text = re.sub("\[insertion\]","",text)

        # todo - find a way to fix this - stupid postgres/json
        # (removes every single quote from the text)
        text = re.sub(r'\'',"",text)

        # do this now, because all of the above substitutions may have created an empty line
        if text == "":
            continue

        # if we have an empty cluster, just add the line
        if current_lines == {}:
            current_lines[user] = text
            # adding the user id is slightly redundant but makes doing the actual clustering easier
            current_pts[user] = (raw_pt,user)
        else:
            # need to see if we want to merge
            # do we already have some text from this user for this current cluster?
            # IMPORTANT
            # VERY IMPORTANT
            # for the simplified transcription, we will assume that we should automatically start a new
            # cluster - i.e. we don't deal with split lines
            if user in current_pts:
                clusters.append((current_lines.values(),current_pts.values()))
                current_lines = {user:text}
                current_pts = {user:(raw_pt,user)}
            else:
                # does adding this line to the cluster make sense?
                users_and_lines = sorted(current_lines.items(),key = lambda x:x[0])
                sorted_users,sorted_lines = zip(*users_and_lines)

                # what would the accuracy be if we added in this new user's line?
                new_lines = list(sorted_lines)
                assert isinstance(sorted_users,tuple)
                new_lines.append(text)

                new_aligned = self.__get_aggregation_lines__(new_lines)
                new_accuracy = self.__agreement__(new_aligned)

                # keep the line if the worst agreement value is still at least 0.6
                if min(new_accuracy) >= 0.6:
                    current_pts[user] = (raw_pt,user)
                    current_lines[user] = text
                else:
                    # otherwise close the current cluster and start a new one with this line
                    clusters.append((current_lines.values(),current_pts.values()))
                    current_lines = {user:text}
                    current_pts = {user:(raw_pt,user)}

    # add the cluster we were still working on (it may be empty, in which case it is removed below)
    clusters.append((current_lines.values(),current_pts.values()))

    # remove any clusters which have only one user
    # (iterate backwards so that popping does not shift the indices still to be visited)
    for cluster_index in range(len(clusters)-1,-1,-1):
        if len(clusters[cluster_index][0]) <= 1:
            clusters.pop(cluster_index)

    if len(clusters) == 0:
        return [],0

    # if we have more than one cluster - some of them might need to be merged
    # after removing "error" cluster
    # to do so - revert back to Hesse format
    if len(clusters) > 1:
        hessen_lines = []

        for cluster_index in range(len(clusters)):
            # represent each cluster by the median of its members' line segments
            lines_segments,users = zip(*clusters[cluster_index][1])
            x1_l, x2_l, y1_l, y2_l = zip(*lines_segments)
            x1,x2,y1,y2 = np.median(x1_l),np.median(x2_l),np.median(y1_l),np.median(y2_l)
            hessen_lines.append(hesse_line_reduction([[x1,x2,y1,y2],])[0])

        slope_l,angle_l = zip(*hessen_lines)
        max_s,min_s = max(slope_l),min(slope_l)
        max_a,min_a = max(angle_l),min(angle_l)

        # normalize values
        # NOTE(review): divides by zero if every cluster has the same value in either dimension
        hessen_lines = [((max_s-s)/(max_s-min_s),(max_a-a)/(max_a-min_a)) for s,a in hessen_lines]
        tree = spatial.KDTree(hessen_lines)

        # to_merge - list of sets of cluster indices which should be combined
        # will_be_merged - every cluster index which must not be copied over as is
        to_merge = []
        will_be_merged = set()

        for l_index in range(len(hessen_lines)-1,-1,-1):
            # look at every cluster within 0.15 (in normalized space) of this one
            for l2_index in tree.query_ball_point(hessen_lines[l_index],0.15):
                # only look at each pair once
                if l2_index > l_index:
                    t_lines = clusters[l_index][0][:]
                    t_lines.extend(clusters[l2_index][0])

                    aligned_text = self.__get_aggregation_lines__(t_lines)
                    accuracy = self.__agreement__(aligned_text)
                    if min(accuracy) >= 0.5:
                        # NOTE(review): both clusters are marked here, before the overlap check
                        # below - if that check rejects the pair and no other pair picks the
                        # clusters up, they are dropped from the results altogether
                        will_be_merged.add(l_index)
                        will_be_merged.add(l2_index)

                        # make sure that there are not any overlapping users
                        users_1 = zip(*clusters[l_index][1])[1]
                        users_2 = zip(*clusters[l2_index][1])[1]
                        if [u for u in users_1 if u in users_2] != []:
                            continue

                        # is merge "relevant" to any other?
                        # NOTE(review): only the first matching set is extended, so a pair which
                        # links two existing sets leaves one cluster index in both of them
                        relevant = False
                        for m_index,m in enumerate(to_merge):
                            if (l_index in m) or (l2_index in m):
                                relevant = True
                                m.add(l_index)
                                m.add(l2_index)
                                break

                        if not relevant:
                            to_merge.append(set([l_index,l2_index]))

        # might be a better way to do this but with multiple popping from list, safer
        # to work with a copy
        new_clusters = []

        for cluster_index in range(len(clusters)):
            if cluster_index not in will_be_merged:
                new_clusters.append(clusters[cluster_index])
        for merged_clusters in to_merge:
            # concatenate the lines and the (point, user) pairs of every cluster in the set
            t_cluster = [[],[]]
            for cluster_index in merged_clusters:
                t_cluster[0].extend(clusters[cluster_index][0])
                t_cluster[1].extend(clusters[cluster_index][1])
            new_clusters.append(t_cluster[:])

        clusters = new_clusters

    # and now, finally, the actual text clustering
    cluster_centers = []
    cluster_pts = []
    cluster_users = []
    cluster_members = []

    for lines,pts_and_users in clusters:
        pts,users = zip(*pts_and_users)
        x1_values,x2_values,y1_values,y2_values = zip(*pts)

        # todo - handle when some of the coordinate values are not numbers -
        # this corresponds to when there are multiple text segments from the same user
        x1 = np.median(x1_values)
        x2 = np.median(x2_values)
        y1 = np.median(y1_values)
        y2 = np.median(y2_values)

        aligned_text = self.__get_aggregation_lines__(lines)
        aggregate_text = ""
        for char_index in range(len(aligned_text[0])):
            char_set = set(text[char_index] for text in aligned_text)
            # get the percentage of votes for each character at this position
            char_vote = {c:sum([1 for text in aligned_text if text[char_index] == c])/float(len(aligned_text)) for c in char_set}
            most_likely_char,vote_percentage = max(char_vote.items(),key=lambda x:x[1])

            # only accept a character if more than 75% of the lines agree on it
            if vote_percentage > 0.75:
                aggregate_text += most_likely_char
            else:
                aggregate_text += "-"

        # "@" is replaced by a space (presumably a placeholder used by the alignment - confirm)
        aggregate_text = re.sub(r'@'," ",aggregate_text)
        cluster_centers.append((x1,x2,y1,y2,aggregate_text))
        # relies on lines and pts_and_users being in the same (per user) order
        cluster_pts.append(zip(pts,lines))
        cluster_users.append(users)

        # try to remove all special characters
        temp_text = []
        for text in aligned_text:
            text = re.sub("@"," ",text)
            temp_text.append(text)

        cluster_members.append(temp_text)

    # note that cluster_pts is not part of the results - "cluster members" are the aligned lines
    results = []
    for center,pts,users,lines in zip(cluster_centers,cluster_pts,cluster_users,cluster_members):
        results.append({"center":center,"cluster members":lines,"tools":[],"num users":len(users)})

    return results,0
def __inner_fit__(self,markings,user_ids,tools,reduced_markings):
    """
    Cluster transcribed lines: order them by position, group consecutive lines whose text
    agrees, merge nearby groups and return one aggregate transcription per group.

    Written for Python 2 (relies on the `unicode` builtin and on byte strings).

    :param markings: raw markings, index-aligned with user_ids and reduced_markings;
        `marking[-1]` is the unicode text and `marking[:-1]` the raw point, which is unpacked
        as four values named x1,x2,y1,y2
    :param user_ids: the user who made each marking
    :param tools: not used by this method
    :param reduced_markings: (dist, theta, text) triples, one per marking
    :return: (results, 0) where results is a list of dicts with the keys "center"
        (x1,x2,y1,y2,aggregate text), "cluster members" (the aligned lines), "tools" (always [])
        and "num users"
    :raises AssertionError: if the text of a marking is not unicode
    """
    def snapshot(lines_by_user,pts_by_user):
        # freeze the cluster being built as (lines, (raw_pt,user) pairs); both lists are built
        # from the same key order so that they are guaranteed to stay index-aligned
        users = list(lines_by_user)
        return [lines_by_user[u] for u in users],[pts_by_user[u] for u in users]

    # markup which is stripped from every transcription - applied one after the other, in this order
    # NOTE(review): ".*" is greedy - with two tagged spans on one line everything between the
    # first opening tag and the last closing tag is removed - confirm that this is intended
    markup_patterns = (
        r'\[deletion\].*\[/deletion\]',
        r'\[deletion\].*\[\\deletion\]',
        r'\[illegible\].*\[/illegible\]',
        r'\[deletionhas\]\[/deletion\]',
        r'\[insertion\].*\[/insertion\]',
        r'\[underline\].*\[/underline\]',
        r'\[notenglish\].*\[/notenglish\]',
        r'\[has\]',
        r'\(deleted\)',
        r'\[deletion\]',
        r'\[insertion\]',
        # todo - find a way to fix this - stupid postgres/json
        r"'",
    )

    # we want to first cluster just on dist and theta - ignoring the text contents
    dist_l,theta_l,_ = zip(*reduced_markings)
    reduced_markings_without_text = list(zip(dist_l,theta_l))
    ordering = self.__fit2__(reduced_markings_without_text,user_ids)

    # use the below 2 to build up each cluster - both are keyed by user
    current_lines = {}
    current_pts = {}
    clusters = []

    for a,_ in ordering:
        # a - line values - "intercept" and slope
        # NOTE(review): index() returns the first match, so if two markings reduce to the same
        # (dist, theta) pair the later one is attributed to the earlier marking's user and text
        user_index = reduced_markings_without_text.index(a)
        user = user_ids[user_index]

        # extract the corresponding text and the raw (unmapped) point
        text = markings[user_index][-1]
        raw_pt = markings[user_index][:-1]

        if "\n" in text:
            print("multiline - skipping")
            continue

        # convert from unicode to ascii - non-ascii characters are dropped
        assert isinstance(text,unicode)
        text = text.encode('ascii','ignore')

        for pattern in markup_patterns:
            text = re.sub(pattern,"",text)

        # do this now, because all of the above substitutions may have created an empty line
        if text == "":
            continue

        if current_lines:
            # do we already have some text from this user for this current cluster?
            # IMPORTANT - for the simplified transcription, we will assume that we should
            # automatically start a new cluster - i.e. we don't deal with split lines
            if user in current_pts:
                fits = False
            else:
                # what would the accuracy be if we added in this new user's line?
                users_and_lines = sorted(current_lines.items(),key = lambda x:x[0])
                new_lines = [line for _,line in users_and_lines]
                new_lines.append(text)

                new_aligned = self.__get_aggregation_lines__(new_lines)
                new_accuracy = self.__agreement__(new_aligned)
                fits = min(new_accuracy) >= 0.6

            if not fits:
                # close the current cluster and start a new one with this line
                clusters.append(snapshot(current_lines,current_pts))
                current_lines = {}
                current_pts = {}

        current_lines[user] = text
        # adding the user id is slightly redundant but makes doing the actual clustering easier
        current_pts[user] = (raw_pt,user)

    # add the cluster we were still working on (it may be empty, in which case it is removed below)
    clusters.append(snapshot(current_lines,current_pts))

    # remove any clusters which have only one user
    clusters = [c for c in clusters if len(c[0]) > 1]

    if not clusters:
        return [],0

    # if we have more than one cluster - some of them might need to be merged
    # after removing "error" cluster
    # to do so - revert back to Hesse format
    if len(clusters) > 1:
        hessen_lines = []
        for _,pts_and_users in clusters:
            # represent each cluster by the median of its members' line segments
            line_segments = [pt for pt,_ in pts_and_users]
            x1_l, x2_l, y1_l, y2_l = zip(*line_segments)
            x1,x2,y1,y2 = np.median(x1_l),np.median(x2_l),np.median(y1_l),np.median(y2_l)
            hessen_lines.append(hesse_line_reduction([[x1,x2,y1,y2],])[0])

        slope_l,angle_l = zip(*hessen_lines)
        max_s,min_s = max(slope_l),min(slope_l)
        max_a,min_a = max(angle_l),min(angle_l)

        # normalize values - if every cluster has the same value in a dimension there is nothing
        # to scale (and dividing would fail), so that dimension is set to 0 for all of them
        s_range = float(max_s-min_s)
        a_range = float(max_a-min_a)
        hessen_lines = [((max_s-s)/s_range if s_range else 0.0,
                         (max_a-a)/a_range if a_range else 0.0) for s,a in hessen_lines]
        tree = spatial.KDTree(hessen_lines)

        # to_merge - list of disjoint sets of cluster indices which should be combined
        # will_be_merged - every cluster index which appears in one of those sets
        to_merge = []
        will_be_merged = set()

        for l_index in range(len(hessen_lines)-1,-1,-1):
            # look at every cluster within 0.15 (in normalized space) of this one
            for l2_index in tree.query_ball_point(hessen_lines[l_index],0.15):
                # only look at each pair once
                if l2_index <= l_index:
                    continue

                t_lines = list(clusters[l_index][0])
                t_lines.extend(clusters[l2_index][0])

                aligned_text = self.__get_aggregation_lines__(t_lines)
                accuracy = self.__agreement__(aligned_text)
                if not (min(accuracy) >= 0.5):
                    continue

                # make sure that there are not any overlapping users - this has to be checked
                # before the clusters are marked as merged, otherwise a rejected pair would be
                # removed from the results without ever being merged into anything
                users_1 = set(u for _,u in clusters[l_index][1])
                if any(u in users_1 for _,u in clusters[l2_index][1]):
                    continue

                will_be_merged.add(l_index)
                will_be_merged.add(l2_index)

                # fold every existing group which touches this pair into one group, so that
                # no cluster index can end up in two groups; the combined group keeps the
                # position of the first group it absorbed
                group = set([l_index,l2_index])
                position = None
                remaining = []
                for m in to_merge:
                    if (l_index in m) or (l2_index in m):
                        group |= m
                        if position is None:
                            position = len(remaining)
                    else:
                        remaining.append(m)
                if position is None:
                    remaining.append(group)
                else:
                    remaining.insert(position,group)
                to_merge = remaining

        # clusters which are not merged are copied over as is, followed by the merged ones
        new_clusters = [c for index,c in enumerate(clusters) if index not in will_be_merged]
        for merged_clusters in to_merge:
            merged_lines,merged_pts = [],[]
            for cluster_index in sorted(merged_clusters):
                merged_lines.extend(clusters[cluster_index][0])
                merged_pts.extend(clusters[cluster_index][1])
            new_clusters.append([merged_lines,merged_pts])

        clusters = new_clusters

    # and now, finally, the actual text clustering
    results = []
    for lines,pts_and_users in clusters:
        pts,users = zip(*pts_and_users)
        x1_values,x2_values,y1_values,y2_values = zip(*pts)

        # todo - handle when some of the coordinate values are not numbers -
        # this corresponds to when there are multiple text segments from the same user
        x1 = np.median(x1_values)
        x2 = np.median(x2_values)
        y1 = np.median(y1_values)
        y2 = np.median(y2_values)

        aligned_text = self.__get_aggregation_lines__(lines)
        num_lines = float(len(aligned_text))

        consensus = []
        for char_index in range(len(aligned_text[0])):
            column = [line[char_index] for line in aligned_text]
            most_likely_char = max(set(column),key=column.count)
            # only accept a character if more than 75% of the lines agree on it
            # (so there can never be a tie between two accepted characters)
            if column.count(most_likely_char)/num_lines > 0.75:
                consensus.append(most_likely_char)
            else:
                consensus.append("-")

        # "@" is replaced by a space (presumably a placeholder used by the alignment - confirm)
        aggregate_text = "".join(consensus).replace("@"," ")
        members = [line.replace("@"," ") for line in aligned_text]

        results.append({"center":(x1,x2,y1,y2,aggregate_text),"cluster members":members,
                        "tools":[],"num users":len(users)})

    return results,0
def __cluster__(self,markings,user_ids,tools,reduced_markings,image_dimensions):
    """
    Cluster transcribed lines: order them by position, group consecutive lines whose text
    agrees, merge nearby groups and return one aggregate transcription per group.

    Written for Python 2 (relies on the `unicode` builtin).

    :param markings: raw markings, index-aligned with user_ids and reduced_markings;
        `marking[-1]` is the unicode text and `marking[:-1]` the raw point, which is unpacked
        as four values named x1,x2,y1,y2 and is used as part of a dict key (so must be hashable)
    :param user_ids: the user who made each marking
    :param tools: not used by this method
    :param reduced_markings: (dist, theta, text) triples, one per marking
    :param image_dimensions: not used by this method
    :return: (results, 0) where results is a list of dicts with the keys "center"
        (x1,x2,y1,y2,aggregate text), "cluster members" ((raw point, aligned line) pairs),
        "tools" (always []), "num users" and "agreement"
    :raises AssertionError: if the text of a marking is not unicode

    Side effect: appends an empty list to self.line_agreement, unless no cluster survives.
    """
    def snapshot(lines_by_user,pts_by_user):
        # freeze the cluster being built as (lines, (raw_pt,user) pairs); both lists are built
        # from the same key order so that they are guaranteed to stay index-aligned
        users = list(lines_by_user)
        return [lines_by_user[u] for u in users],[pts_by_user[u] for u in users]

    # we want to first cluster just on dist and theta - ignoring the text contents
    dist_l,theta_l,_ = zip(*reduced_markings)
    reduced_markings_without_text = list(zip(dist_l,theta_l))
    ordering = self.__preliminarily__clustering__(reduced_markings_without_text,user_ids)

    # use the below 2 to build up each cluster - both are keyed by user
    current_lines = {}
    current_pts = {}
    clusters = []

    # second value returned by __set_special_characters__ for each line, keyed by (raw_pt,user)
    non_fasta_text = {}

    for a,_ in ordering:
        # a - line values - "intercept" and slope
        # NOTE(review): index() returns the first match, so if two markings reduce to the same
        # (dist, theta) pair the later one is attributed to the earlier marking's user and text
        user_index = reduced_markings_without_text.index(a)
        user = user_ids[user_index]

        # extract the corresponding text and the raw (unmapped) point
        text = markings[user_index][-1]
        raw_pt = markings[user_index][:-1]

        # skip lines with new line characters in them
        # Roger has set things up so that new line characters are no longer allowed
        # but we need to be careful with transcriptions submitted before that change
        if "\n" in text:
            print("multiline - skipping")
            continue

        assert isinstance(text,unicode)

        # not sure if it is possible to have empty lines, but just in case
        if text == "":
            continue

        # handle all characters which MAFFT cannot handle and keep a record of where all
        # the tags are in the string
        text, nf_text = self.__set_special_characters__(text)

        # save these values for later use
        non_fasta_text[(raw_pt,user)] = nf_text

        if current_lines:
            # do we already have some text from this user for this current cluster?
            # IMPORTANT - for the simplified transcription, we will assume that we should
            # automatically start a new cluster - i.e. we don't deal with split lines
            if user in current_pts:
                fits = False
            else:
                # take the current set of text lines and add in the new one
                new_lines = list(current_lines.values())
                new_lines.append(text)

                new_aligned = self.__line_alignment__(new_lines)
                new_accuracy = self.__agreement__(new_aligned)
                # todo - we can get two slightly different values for new_accuracy
                # todo - because of slightly different approaches - is one better?
                # todo - we might not need __agreement__, if not, we can remove it

                # is the minimum accuracy after adding in this line still reasonably good?
                fits = min(new_accuracy) >= 0.6

            if not fits:
                # close the current cluster and start a new one with this line
                clusters.append(snapshot(current_lines,current_pts))
                current_lines = {}
                current_pts = {}

        current_lines[user] = text
        # adding the user id is slightly redundant but makes doing the actual clustering easier
        current_pts[user] = (raw_pt,user)

    # make sure to add the final cluster that we were working on at the end
    # (it may be empty, in which case it is removed below)
    clusters.append(snapshot(current_lines,current_pts))

    # remove small clusters - treat those as noise
    # NOTE(review): the original comment said "only one user" but the code has always discarded
    # every cluster with four or fewer lines; that threshold is kept - confirm which is intended
    clusters = [c for c in clusters if len(c[0]) > 4]

    if not clusters:
        return [],0

    # if we have more than one cluster - some of them might need to be merged
    # after removing "error" clusters
    # to do so - revert back to Hesse format
    # todo - maybe only run this if we have removed any error lines
    if len(clusters) > 1:
        hessen_lines = []
        for _,pts_and_users in clusters:
            # represent each cluster by the median of its members' line segments
            line_segments = [pt for pt,_ in pts_and_users]
            x1_l, x2_l, y1_l, y2_l = zip(*line_segments)
            x1,x2,y1,y2 = np.median(x1_l),np.median(x2_l),np.median(y1_l),np.median(y2_l)
            hessen_lines.append(hesse_line_reduction([[x1,x2,y1,y2],])[0])

        slope_l,angle_l = zip(*hessen_lines)
        max_s,min_s = max(slope_l),min(slope_l)
        max_a,min_a = max(angle_l),min(angle_l)

        # normalize values - if every cluster has the same value in a dimension there is nothing
        # to scale (and dividing would fail), so that dimension is set to 0 for all of them
        s_range = float(max_s-min_s)
        a_range = float(max_a-min_a)
        hessen_lines = [((max_s-s)/s_range if s_range else 0.0,
                         (max_a-a)/a_range if a_range else 0.0) for s,a in hessen_lines]
        tree = spatial.KDTree(hessen_lines)

        # to_merge - list of disjoint sets of cluster indices which should be combined
        # will_be_merged - every cluster index which appears in one of those sets
        to_merge = []
        will_be_merged = set()

        for l_index in range(len(hessen_lines)-1,-1,-1):
            # look at every cluster within 0.15 (in normalized space) of this one
            for l2_index in tree.query_ball_point(hessen_lines[l_index],0.15):
                # only look at each pair once
                if l2_index <= l_index:
                    continue

                t_lines = list(clusters[l_index][0])
                t_lines.extend(clusters[l2_index][0])

                aligned_text = self.__line_alignment__(t_lines)
                accuracy = self.__agreement__(aligned_text)
                if not (min(accuracy) >= 0.5):
                    continue

                # make sure that there are not any overlapping users - this has to be checked
                # before the clusters are marked as merged, otherwise a rejected pair would be
                # removed from the results without ever being merged into anything
                users_1 = set(u for _,u in clusters[l_index][1])
                if any(u in users_1 for _,u in clusters[l2_index][1]):
                    continue

                will_be_merged.add(l_index)
                will_be_merged.add(l2_index)

                # fold every existing group which touches this pair into one group, so that
                # no cluster index can end up in two groups; the combined group keeps the
                # position of the first group it absorbed
                group = set([l_index,l2_index])
                position = None
                remaining = []
                for m in to_merge:
                    if (l_index in m) or (l2_index in m):
                        group |= m
                        if position is None:
                            position = len(remaining)
                    else:
                        remaining.append(m)
                if position is None:
                    remaining.append(group)
                else:
                    remaining.insert(position,group)
                to_merge = remaining

        # clusters which are not merged are copied over as is, followed by the merged ones
        new_clusters = [c for index,c in enumerate(clusters) if index not in will_be_merged]
        for merged_clusters in to_merge:
            merged_lines,merged_pts = [],[]
            for cluster_index in sorted(merged_clusters):
                merged_lines.extend(clusters[cluster_index][0])
                merged_pts.extend(clusters[cluster_index][1])
            new_clusters.append([merged_lines,merged_pts])

        clusters = new_clusters

    # and now, finally, the actual text clustering
    # (only reached when at least one cluster survived - see the early return above)
    self.line_agreement.append([])

    results = []
    for lines,pts_and_users in clusters:
        pts,users = zip(*pts_and_users)
        x1_values,x2_values,y1_values,y2_values = zip(*pts)

        # todo - handle when some of the coordinate values are not numbers -
        # todo - this corresponds to when there are multiple text segments from the same user
        # todo - this in turn corresponds to the case where we look for "broken" lines
        # todo - so definitely something down the line
        x1 = np.median(x1_values)
        x2 = np.median(x2_values)
        y1 = np.median(y1_values)
        y2 = np.median(y2_values)

        # align the text
        aligned_text = self.__line_alignment__(lines)
        # align the non-fasta version of the text lines
        nf_aligned_lines = self.__add_alignment_spaces__(aligned_text,non_fasta_text,pts_and_users)

        # aggregate the lines - looking for character spots where there is mostly consensus
        aggregate_text,character_agreement,_ = self.__merge_aligned_text__(nf_aligned_lines)

        # debugging output: the aligned lines followed by their aggregate and a blank line
        for t in aligned_text:
            print(t)
        print(aggregate_text)
        print("")

        # deal with characters that python/postgres has trouble with
        aggregate_text = self.__reset_special_characters__(aggregate_text)

        # and deal with special characters for each individual line
        # (relies on pts and nf_aligned_lines being in the same order)
        pts_and_lines = [(p,self.__reset_special_characters__(l)) for p,l in zip(pts,nf_aligned_lines)]

        # use this if you want to keep track of stats
        # self.line_agreement[-1].append((character_agreement,len(users)))

        results.append({"center":(x1,x2,y1,y2,aggregate_text),"cluster members":pts_and_lines,
                        "tools":[],"num users":len(users),"agreement":character_agreement})

    return results,0