def printDiffs(wordcount, clusters):
    """Tally and print the diff items between the top two words of each cluster.

    For every cluster holding more than one member, the members are ranked
    by ``wordcount.get`` in descending order and the two highest-ranked
    members are passed to ``StringDistance.pythonDiff(..., path=True)``.
    Each item yielded by that call is counted, and once all clusters have
    been processed every distinct item is printed with its count, separated
    by a tab, one pair per line.

    Args:
        wordcount: Mapping-like object whose ``get`` method is used as the
            sort key for cluster members (presumably word -> frequency;
            verify against the caller).
        clusters: Mapping whose values are iterables of clusters; each
            cluster must support ``len()`` and iteration.

    Returns:
        None. The result is written to standard output.
    """
    diffCounts = {}
    for clusterSet in clusters.values():
        for cluster in clusterSet:
            # A cluster needs at least two members to be compared.
            if len(cluster) <= 1:
                continue
            ranked = sorted(cluster, key=wordcount.get, reverse=True)
            # NOTE(review): the items returned with path=True are assumed to
            # be hashable, since they are used as dictionary keys below.
            diffs = StringDistance.pythonDiff(ranked[0], ranked[1], path=True)
            for diff in diffs:
                diffCounts[diff] = diffCounts.get(diff, 0) + 1
    # dict.items() exists on both Python 2 and 3; dict.iteritems() was
    # removed in Python 3 and raised AttributeError there.
    for diff, ct in diffCounts.items():
        print(diff, ct, sep='\t')
def printEdits(wordcount, clusters):
    """Tally and print the edit operations between the top two words of each cluster.

    For every cluster holding more than one member, the members are ranked
    by ``wordcount.get`` in descending order and the two highest-ranked
    members are passed to ``StringDistance.levenshtein(..., path=True)``.
    Each item in the first element of that call's result is counted, and
    once all clusters have been processed every distinct item is printed
    with its count, separated by a tab, one pair per line.

    Args:
        wordcount: Mapping-like object whose ``get`` method is used as the
            sort key for cluster members (presumably word -> frequency;
            verify against the caller).
        clusters: Mapping whose values are iterables of clusters; each
            cluster must support ``len()`` and iteration.

    Returns:
        None. The result is written to standard output.
    """
    editCounts = {}
    for clusterSet in clusters.values():
        for cluster in clusterSet:
            # A cluster needs at least two members to be compared.
            if len(cluster) <= 1:
                continue
            ranked = sorted(cluster, key=wordcount.get, reverse=True)
            edits = StringDistance.levenshtein(ranked[0], ranked[1], path=True)
            # Only the first element of the result is iterated; presumably it
            # is the edit path and the rest is the distance -- TODO confirm.
            for edit in edits[0]:
                editCounts[edit] = editCounts.get(edit, 0) + 1
    # dict.items() exists on both Python 2 and 3; dict.iteritems() was
    # removed in Python 3 and raised AttributeError there.
    for edit, ct in editCounts.items():
        print(edit, ct, sep='\t')