Example #1
    def __agglomerative__clustering__(self,markings,reduced_markings,user_ids):
        """
        TBD
        """
        # start by splitting markings into lines and text and then the lines into slopes and intercepts
        intercepts,slopes,text = zip(*reduced_markings)

        # deal with special characters in the text and "recombine" the markings
        # text has capital letters used only for special characters/tags
        # while capitalized_text has the original capitalization which is useful for the final aggregate result
        text,capitalized_text = zip(*[self.__set_special_characters__(t) for t in text])
        reduced_markings = zip(intercepts,slopes,text)

        # normalize the slopes and intercepts
        normalized_intercepts,normalized_slopes = self.__normalize_lines__(intercepts,slopes)
        pts_list = zip(normalized_intercepts,normalized_slopes)

        # do agglomerative clustering on the normalized (intercept,slope) points
        # see http://stackoverflow.com/questions/18952587/use-distance-matrix-in-scipy-cluster-hierarchy-linkage
        labels = range(len(pts_list))
        variables = ["X","Y"]
        df = pd.DataFrame(list(pts_list),columns=variables, index=labels)

        # note that linkage() expects a condensed distance matrix (the 1-D output
        # of pdist), not the square matrix produced by squareform()
        agglomerations = linkage(pdist(df, metric='euclidean'), method='single')

        # clusters will be a list representation of the tree that results from merging clusters
        # as determined by agglomerations
        clusters = []
        for m,M,raw_transcription in zip(reduced_markings,capitalized_text,markings):
            # coordinates will store the xy coordinates of the line segment
            c = {"cluster members":[m,],"individual transcriptions":[M,], "coordinates":[raw_transcription[:-1]]}
            clusters.append(c)

        # go through each of the cluster mergers proposed by the scipy algorithm
        # and decide whether to accept them or not
        for merge in agglomerations:
            rchild_index = int(merge[0])
            lchild_index = int(merge[1])

            # None => we have already capped every path in this subtree
            # i.e. on any path from this node down to a leaf, we will encounter a capped cluster
            # so if both nodes are capped, just add None and continue
            if (clusters[rchild_index] is None) and (clusters[lchild_index] is None):
                clusters.append(None)
                continue
            # if only one node is None, cap the node that is not None
            elif clusters[rchild_index] is None:
                clusters[lchild_index] = self.__cap_cluster__(clusters[lchild_index])
                clusters.append(None)
                continue
            elif clusters[lchild_index] is None:
                clusters[rchild_index] = self.__cap_cluster__(clusters[rchild_index])
                clusters.append(None)
                continue

            assert "center" not in clusters[lchild_index]
            assert "center" not in clusters[rchild_index]

            # todo - this use of "cluster members" is internally consistent, but differs from
            # todo - its use elsewhere in the code - it won't cause a bug, but could cause some confusion
            _,_,transcriptions = zip(*(clusters[rchild_index]["cluster members"]))
            _,_,transcriptions_left = zip(*(clusters[lchild_index]["cluster members"]))

            # if True in [("surely" in t) for t in transcriptions]:
            #     print rchild_index,lchild_index
            #     print clusters[rchild_index]["cluster members"]
            #     print clusters[lchild_index]["cluster members"]
            #     print
            # elif True in [("surely" in t) for t in transcriptions_left]:
            #     print rchild_index,lchild_index
            #     print clusters[rchild_index]["cluster members"]
            #     print clusters[lchild_index]["cluster members"]
            #     print

            # convert to list
            transcriptions = list(transcriptions)
            transcriptions.extend(transcriptions_left)

            aligned_transcriptions = self.__line_alignment__(transcriptions)
            accuracy = self.__agreement__(aligned_transcriptions)
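            # __agreement__ is assumed to return one agreement score in [0,1] per aligned line;
            # the min() below therefore asks that even the worst line agrees reasonably well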

            # if the minimum accuracy is reasonably high, then we will want to combine them
            if min(accuracy) >= 0.6:
                new_cluster = deepcopy(clusters[rchild_index])
                new_cluster["cluster members"].extend(clusters[lchild_index]["cluster members"])
                new_cluster["individual transcriptions"].extend(clusters[lchild_index]["individual transcriptions"])
                new_cluster["coordinates"].extend(clusters[lchild_index]["coordinates"])
                clusters.append(new_cluster)
            else:
                # the accuracy of the combined cluster is low enough that we do not want to combine them
                # instead, we'll "cap" each of the clusters - by giving it a center value -
                # and append None to the list of clusters
                clusters[rchild_index] = self.__cap_cluster__(clusters[rchild_index])
                clusters[lchild_index] = self.__cap_cluster__(clusters[lchild_index])
                clusters.append(None)

        # keep only the capped clusters that are supported by at least 3 users
        capped_clusters = []
        for c in clusters:
            if (c is not None) and ("center" in c):
                if c["num users"] >= 3:
                    capped_clusters.append(c)

        return capped_clusters
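For reference, here is a minimal standalone sketch of the scipy clustering step used above (the sample points and the 0.2 cut threshold are made up for illustration). It shows the condensed-distance form that linkage() expects and how each row of its output encodes one merge, which is what the loop over agglomerations walks:

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

# four made-up (intercept, slope) points: two tight pairs
pts = np.array([[0.00, 0.10], [0.05, 0.12], [0.90, 0.80], [0.92, 0.85]])

merges = linkage(pdist(pts, metric='euclidean'), method='single')

# each row of merges is (left index, right index, distance, cluster size);
# indices >= len(pts) refer to clusters created by earlier rows
print merges

# cutting the tree at a distance threshold gives flat cluster labels
print fcluster(merges, t=0.2, criterion='distance')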
    def __inner_fit__(self,markings,user_ids,tools,reduced_markings):
        # we want to first cluster just on dist and theta - ignoring the text contents

        # cluster just on points, not on text
        dist_l,theta_l,text_l = zip(*reduced_markings)
        reduced_markings_without_text = zip(dist_l,theta_l)
        ordering = self.__fit2__(reduced_markings_without_text,user_ids)

        # use the two dicts below to build up the current cluster
        current_lines = {}
        current_pts = {}
        clusters = []


        for a,b in ordering:
            # a - line values - "intercept" and slope
            user_index = reduced_markings_without_text.index(a)
            user = user_ids[user_index]
            # extract the corresponding text and the raw (unmapped) point
            text = markings[user_index][-1]
            raw_pt = markings[user_index][:-1]


            if "\n" in text:
                print "multiline - skipping"
                continue



            # convert from unicode to ascii
            assert isinstance(text,unicode)
            text = text.encode('ascii','ignore')

            # todo - can the tag handling below be done better?
            text = re.sub("\[deletion\].*\[/deletion\]","",text)
            text = re.sub(r'\[deletion\].*\[\\deletion\]',"",text)
            text = re.sub("\[illegible\].*\[/illegible\]","",text)
            text = re.sub(r'\[deletionhas\]\[/deletion\]',"",text)
            text = re.sub("\[insertion\].*\[/insertion\]","",text)
            text = re.sub("\[underline\].*\[/underline\]","",text)
            text = re.sub("\[notenglish\].*\[/notenglish\]","",text)
            text = re.sub(r'\[has\]',"",text)
            text = re.sub(r'\(deleted\)',"",text)
            text = re.sub(r'\[deletion\]',"",text)
            text = re.sub("\[insertion\]","",text)

            # todo - find a better way to handle apostrophes - they currently cause problems with postgres/json
            text = re.sub(r'\'',"",text)

            # do this now, because all of the above substitutions may have created an empty line
            if text == "":
                continue

            # if we have an empty cluster, just add the line
            if current_lines == {}:
                current_lines[user] = text #(text,special_characters)

                # adding the user id is slightly redundant but makes doing the actual clustering easier
                current_pts[user] = (raw_pt,user)
            else:
                # need to see if we want to merge
                # do we already have some text from this user for the current cluster?
                # IMPORTANT: for this simplified transcription clustering, we assume a repeated
                # user means we should automatically start a new cluster - i.e. we don't deal with split lines
                if user in current_pts:
                    clusters.append((current_lines.values(),current_pts.values()))
                    current_lines = {user:text} #(text,special_characters)}
                    current_pts = {user:(raw_pt,user)}
                else:
                    # does adding this line to the cluster make sense?
                    # compare against the current accuracy - if we only have 1 line so far,
                    # current accuracy is NA
                    users_and_lines = sorted(current_lines.items(),key = lambda x:x[0])
                    sorted_users,sorted_lines = zip(*users_and_lines)
                    # sorted_lines = zip(*sorted_pts)[-1]

                    # uncomment below if you want to compare the new accuracy against the old
                    # if len(current_lines) > 1:
                    #     aligned_text = self.__get_aggregation_lines__(sorted_lines)
                    #     current_accuracy = self.__agreement__(aligned_text)
                    # else:
                    #     current_accuracy = -1

                    # what would the accuracy be if we added in this new user's line?
                    new_lines = list(sorted_lines)
                    assert isinstance(sorted_users,tuple)
                    # user_index = sorted_users.index(user)

                    # try simply adding the new line to the existing set
                    new_lines.append(text)
                    new_aligned = self.__get_aggregation_lines__(new_lines)
                    new_accuracy = self.__agreement__(new_aligned)

                    if min(new_accuracy) >= 0.6:
                        current_pts[user] = (raw_pt,user)
                        current_lines[user] = text
                    else:
                        clusters.append((current_lines.values(),current_pts.values()))
                        # current_pts = {user:(pt,text)}
                        current_lines = {user:text}
                        current_pts = {user:(raw_pt,user)}

        clusters.append((current_lines.values(),current_pts.values()))

        # remove any clusters which have only one user
        for cluster_index in range(len(clusters)-1,-1,-1):
            if len(clusters[cluster_index][0]) <= 1:
                clusters.pop(cluster_index)

        if len(clusters) == 0:
            return [],0

        # if we have more than one cluster, some of them might need to be merged
        # now that the "error" clusters have been removed
        # to decide, revert back to the Hesse format
        if len(clusters) > 1:

            hessen_lines = []

            for cluster_index in range(len(clusters)):
                lines_segments,users = zip(*clusters[cluster_index][1])
                x1_l, x2_l, y1_l, y2_l = zip(*lines_segments)
                x1,x2,y1,y2 = np.median(x1_l),np.median(x2_l),np.median(y1_l),np.median(y2_l)
                hessen_lines.append(hesse_line_reduction([[x1,x2,y1,y2],])[0])

            # print hessen_lines
            slope_l,angle_l = zip(*hessen_lines)
            # print
            max_s,min_s = max(slope_l),min(slope_l)
            max_a,min_a = max(angle_l),min(angle_l)

            # normalize values
            hessen_lines = [((max_s-s)/(max_s-min_s),(max_a-a)/(max_a-min_a)) for s,a in hessen_lines]
            # print hessen_lines

            tree = spatial.KDTree(hessen_lines)
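            # query_ball_point below returns, for a given line, the indices of all lines
            # within a radius of 0.15 in this normalized (dist,angle) space - those are
            # the candidate pairs for merging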

            to_merge = []
            will_be_merged = set()

            for l_index in range(len(hessen_lines)-1,-1,-1):
                for l2_index in tree.query_ball_point(hessen_lines[l_index],0.15):
                    if l2_index > l_index:
                        t_lines = clusters[l_index][0][:]
                        t_lines.extend(clusters[l2_index][0])

                        aligned_text = self.__get_aggregation_lines__(t_lines)
                        accuracy = self.__agreement__(aligned_text)
                        if min(accuracy) >= 0.5:
                            # make sure that there are not any overlapping users
                            # (check this before flagging the clusters as merged - otherwise a
                            # pair rejected here would still be dropped from new_clusters below)
                            users_1 = zip(*clusters[l_index][1])[1]
                            users_2 = zip(*clusters[l2_index][1])[1]

                            if [u for u in users_1 if u in users_2] != []:
                                continue

                            will_be_merged.add(l_index)
                            will_be_merged.add(l2_index)

                            # is merge "relevant" to any other?
                            relevant = False
                            for m_index,m in enumerate(to_merge):
                                if (l_index in m) or (l2_index in m):
                                    relevant = True
                                    m.add(l_index)
                                    m.add(l2_index)
                                    break

                            if not relevant:
                                to_merge.append(set([l_index,l2_index]))

            # there might be a better way to do this, but with multiple items being
            # removed from the list it is safer to work with a copy
            new_clusters = []

            for cluster_index in range(len(clusters)):
                if cluster_index not in will_be_merged:
                    new_clusters.append(clusters[cluster_index])
            for merged_clusters in to_merge:
                t_cluster = [[],[]]
                for cluster_index in merged_clusters:
                    t_cluster[0].extend(clusters[cluster_index][0])
                    t_cluster[1].extend(clusters[cluster_index][1])
                new_clusters.append(t_cluster[:])

            # print clusters
            clusters = new_clusters

        # and now, finally, the actual text clustering
        cluster_centers = []
        cluster_pts = []
        cluster_users = []

        cluster_members = []

        for lines,pts_and_users in clusters:
            pts,users = zip(*pts_and_users)
            x1_values,x2_values,y1_values,y2_values = zip(*pts)

            # todo - handle when some of the coordinate values are not numbers -
            # this corresponds to when there are multiple text segments from the same user
            x1 = np.median(x1_values)
            x2 = np.median(x2_values)
            y1 = np.median(y1_values)
            y2 = np.median(y2_values)

            aligned_text = self.__get_aggregation_lines__(lines)
            aggregate_text = ""
            for char_index in range(len(aligned_text[0])):
                char_set = set(text[char_index] for text in aligned_text)
                # get the percentage of votes for each character at this position
                char_vote = {c:sum([1 for text in aligned_text if text[char_index] == c])/float(len(aligned_text)) for c in char_set}
                most_likely_char,vote_percentage = max(char_vote.items(),key=lambda x:x[1])

                if vote_percentage > 0.75:
                    aggregate_text += most_likely_char
                else:
                    aggregate_text += "-"

            aggregate_text = re.sub(r'@'," ",aggregate_text)

            cluster_centers.append((x1,x2,y1,y2,aggregate_text))
            cluster_pts.append(zip(pts,lines))
            cluster_users.append(users)

            # try to remove all special characters
            temp_text = []
            for text in aligned_text:
                text = re.sub("@"," ",text)
                temp_text.append(text)

            cluster_members.append(temp_text)

        # results.append({"users":merged_users,"cluster members":merged_points,"tools":merged_tools,"num users":num_users})

        results = []
        for center,pts,users,lines in zip(cluster_centers,cluster_pts,cluster_users,cluster_members):
            results.append({"center":center,"cluster members":lines,"tools":[],"num users":len(users)})

        return results,0
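The per-character voting in the loop above is the heart of the text aggregation. Here is a toy, self-contained illustration of the same idea (the 0.75 threshold comes from the code above; the sample lines are made up and already "aligned" to the same length):

aligned = ["the cat sat",
           "the cat sit",
           "the cat sat"]

aggregate = ""
for i in range(len(aligned[0])):
    # count the votes for each character at this position
    votes = {}
    for line in aligned:
        votes[line[i]] = votes.get(line[i], 0) + 1
    char, count = max(votes.items(), key=lambda x: x[1])
    # accept the character only with more than 75% agreement
    if count / float(len(aligned)) > 0.75:
        aggregate += char
    else:
        aggregate += "-"

print aggregate  # prints "the cat s-t"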
Example #4
    def __cluster__(self,markings,user_ids,tools,reduced_markings,image_dimensions):
        # we want to first cluster just on dist and theta - ignoring the text contents

        # cluster just on points, not on text
        dist_l,theta_l,text_l = zip(*reduced_markings)
        reduced_markings_without_text = zip(dist_l,theta_l)
        ordering = self.__preliminarily__clustering__(reduced_markings_without_text,user_ids)

        # use the two dicts below to build up the current cluster
        current_lines = {}
        current_pts = {}
        clusters = []

        non_fasta_text = {}

        for a,b in ordering:
            # a - line values - "intercept" and slope
            user_index = reduced_markings_without_text.index(a)
            user = user_ids[user_index]
            # extract the corresponding text and the raw (unmapped) point
            text = markings[user_index][-1]
            raw_pt = markings[user_index][:-1]

            # skip lines with new lines characters in them
            # Roger has set things up so that new line characters are no longer allowed
            # but we need to be careful with transcriptions submitted before that change
            if "\n" in text:
                print "multiline - skipping"
                continue

            # convert from unicode to ascii
            assert isinstance(text,unicode)

            # not sure if it is possible to have empty lines, but just in case
            if text == "":
                continue

            # handle all characters which MAFFT cannot handle and keep a record of where all
            # the tags are in the string
            # nf_text (the non-fasta version of the text) keeps the original characters and is
            # used at the end to restore them (since in the meantime capital letters are used
            # for other things)
            text, nf_text = self.__set_special_characters__(text)

            # save these values for later use
            non_fasta_text[(raw_pt,user)] = nf_text

            # if we currently have an empty cluster, just add the line
            if current_lines == {}:
                current_lines[user] = text
                # adding the user id is slightly redundant but makes doing the actual clustering easier
                current_pts[user] = (raw_pt,user)
            else:
                # need to see if we want to merge the text with the existing cluster or start a new one
                # do we already have some text from this user for the current cluster?
                # IMPORTANT: for this simplified transcription clustering, we assume a repeated
                # user means we should automatically start a new cluster - i.e. we don't deal with split lines
                if user in current_pts:
                    clusters.append((current_lines.values(),current_pts.values()))
                    current_lines = {user:text} #(text,special_characters)}
                    current_pts = {user:(raw_pt,user)}
                else:
                    # does adding this line to the cluster make sense?

                    # take the current set of text lines and add in the new one
                    new_lines = current_lines.values()
                    new_lines.append(text)


                    # uncomment below if you want to compare the new accuracy against the old
                    # if len(current_lines) > 1:
                    #     aligned_text = self.__get_aggregation_lines__(sorted_lines)
                    #     current_accuracy = self.__agreement__(aligned_text)
                    # else:
                    #     current_accuracy = -1

                    # what would the accuracy be if we added in this new user's line?
                    new_aligned = self.__line_alignment__(new_lines)
                    new_accuracy = self.__agreement__(new_aligned)
                    # todo - we can get two slightly different values for new_accuracy
                    # todo - because of slightly different approaches - is one better?
                    # todo - we might not need __agreement__, if not, we can remove it
                    # temp1,temp2,new_accuracy = self.__merge_aligned_text__(new_aligned)

                    # if the minimum accuracy resulted by adding in this line is still reasonably good
                    # add the line to the current cluster
                    if min(new_accuracy) >= 0.6:
                        current_pts[user] = (raw_pt,user)
                        current_lines[user] = text
                    else:
                        # otherwise, start a new cluster
                        clusters.append((current_lines.values(),current_pts.values()))
                        current_lines = {user:text}
                        current_pts = {user:(raw_pt,user)}

        # make sure to add the final cluster that we were working on at the end
        clusters.append((current_lines.values(),current_pts.values()))

        # remove any clusters which have too few users (here, four or fewer) - treat those as noise
        for cluster_index in range(len(clusters)-1,-1,-1):
            if len(clusters[cluster_index][0]) <= 4:
                clusters.pop(cluster_index)

        if len(clusters) == 0:
            return [],0

        # if we have more than one cluster - some of them might need to be merged
        # after removing "error" clusters
        # to do so - revert back to Hesse format
        # todo - maybe only run this if we have removed any error lines
        if len(clusters) > 1:

            hessen_lines = []

            for cluster_index in range(len(clusters)):
                lines_segments,users = zip(*clusters[cluster_index][1])
                x1_l, x2_l, y1_l, y2_l = zip(*lines_segments)
                x1,x2,y1,y2 = np.median(x1_l),np.median(x2_l),np.median(y1_l),np.median(y2_l)
                hessen_lines.append(hesse_line_reduction([[x1,x2,y1,y2],])[0])

            # print hessen_lines
            slope_l,angle_l = zip(*hessen_lines)
            max_s,min_s = max(slope_l),min(slope_l)
            max_a,min_a = max(angle_l),min(angle_l)

            # normalize values
            hessen_lines = [((max_s-s)/(max_s-min_s),(max_a-a)/(max_a-min_a)) for s,a in hessen_lines]

            tree = spatial.KDTree(hessen_lines)

            to_merge = []
            will_be_merged = set()

            for l_index in range(len(hessen_lines)-1,-1,-1):
                for l2_index in tree.query_ball_point(hessen_lines[l_index],0.15):
                    if l2_index > l_index:
                        t_lines = clusters[l_index][0][:]
                        t_lines.extend(clusters[l2_index][0])

                        aligned_text = self.__line_alignment__(t_lines)
                        accuracy = self.__agreement__(aligned_text)
                        if min(accuracy) >= 0.5:
                            # make sure that there are not any overlapping users
                            # (check this before flagging the clusters as merged - otherwise a
                            # pair rejected here would still be dropped from new_clusters below)
                            users_1 = zip(*clusters[l_index][1])[1]
                            users_2 = zip(*clusters[l2_index][1])[1]

                            if [u for u in users_1 if u in users_2] != []:
                                continue

                            will_be_merged.add(l_index)
                            will_be_merged.add(l2_index)

                            # is merge "relevant" to any other?
                            relevant = False
                            for m_index,m in enumerate(to_merge):
                                if (l_index in m) or (l2_index in m):
                                    relevant = True
                                    m.add(l_index)
                                    m.add(l2_index)
                                    break

                            if not relevant:
                                to_merge.append(set([l_index,l2_index]))

            # there might be a better way to do this, but with multiple items being
            # removed from the list it is safer to work with a copy
            new_clusters = []

            for cluster_index in range(len(clusters)):
                if cluster_index not in will_be_merged:
                    new_clusters.append(clusters[cluster_index])
            for merged_clusters in to_merge:
                t_cluster = [[],[]]
                for cluster_index in merged_clusters:
                    t_cluster[0].extend(clusters[cluster_index][0])
                    t_cluster[1].extend(clusters[cluster_index][1])
                new_clusters.append(t_cluster[:])

            clusters = new_clusters

        # and now, finally, the actual text clustering
        cluster_centers = []
        cluster_pts = []
        cluster_users = []


        agreement = []
        self.line_agreement.append([])

        for lines,pts_and_users in clusters:
            pts,users = zip(*pts_and_users)
            x1_values,x2_values,y1_values,y2_values = zip(*pts)

            # todo - handle when some of the coordinate values are not numbers -
            # todo - this corresponds to when there are multiple text segments from the same user
            # todo - this in turn corresponds to the case where we look for "broken" lines
            # todo - so definitely something down the line
            x1 = np.median(x1_values)
            x2 = np.median(x2_values)
            y1 = np.median(y1_values)
            y2 = np.median(y2_values)

            # align the text
            aligned_text = self.__line_alignment__(lines)



            # align the non-fasta version of the text lines
            nf_aligned_lines = self.__add_alignment_spaces__(aligned_text,non_fasta_text,pts_and_users)
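            # __add_alignment_spaces__ (assumed from its name and use here) re-inserts the
            # alignment gaps into the non-fasta text, so that nf_aligned_lines line up
            # column-for-column with aligned_text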

            # aggregate the lines - looking for character spots where there is mostly consensus
            aggregate_text,character_agreement,per_user_agreement = self.__merge_aligned_text__(nf_aligned_lines)

            # deal with characters that python/postgres has trouble with
            aggregate_text = self.__reset_special_characters__(aggregate_text)


            cluster_centers.append((x1,x2,y1,y2,aggregate_text))

            # and deal with special characters for each individual line
            temp_pts_lines = []
            for p,l in zip(pts,nf_aligned_lines):
                l = self.__reset_special_characters__(l)
                temp_pts_lines.append((p,l))

            cluster_pts.append(temp_pts_lines)

            cluster_users.append(users)
            agreement.append(character_agreement)
            # cluster_members.append(aligned_text)

            # use this if you want to keep track of stats
            # self.line_agreement[-1].append((character_agreement,len(users)))

        results = []
        for center,pts,users,a in zip(cluster_centers,cluster_pts,cluster_users,agreement):
            results.append({"center":center,"cluster members":pts,"tools":[],"num users":len(users),"agreement":a})

        return results,0
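The merge-candidate search above leans on two helpers: hesse_line_reduction (defined elsewhere in the repo) and scipy's KDTree. As a rough standalone sketch, the segment_to_hesse function below is a hypothetical stand-in for hesse_line_reduction, assuming segments are given as (x1, x2, y1, y2) endpoint pairs as in the code above; it maps each segment's infinite line to a (distance-from-origin, angle) point, after which query_ball_point finds nearby lines:

import math
from scipy import spatial

def segment_to_hesse(x1, x2, y1, y2):
    # hypothetical stand-in for hesse_line_reduction: map the infinite line
    # through (x1,y1) and (x2,y2) to its Hesse normal form (distance, angle)
    dx, dy = x2 - x1, y2 - y1
    length = math.hypot(dx, dy)
    nx, ny = dy / length, -dx / length   # unit normal to the line
    dist = nx * x1 + ny * y1             # signed distance from the origin
    if dist < 0:                         # keep the distance non-negative
        dist, nx, ny = -dist, -nx, -ny
    return dist, math.atan2(ny, nx)

# three made-up segments - the first two lie on (nearly) the same line
segments = [(0, 10, 5, 5), (10, 20, 5, 5.1), (0, 10, 50, 50)]
hessen_lines = [segment_to_hesse(*s) for s in segments]

# in the real code the (dist, angle) values are normalized to [0,1] first,
# so that the 0.15 search radius means the same thing on both axes
tree = spatial.KDTree(hessen_lines)
for i in range(len(hessen_lines)):
    for j in tree.query_ball_point(hessen_lines[i], 0.15):
        if j > i:
            print "merge candidates:", i, j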