def buildEmbeddingsTree(indexMap, embeddings, comparator=None):
    """Plot a hierarchical-clustering dendrogram of pairwise embedding scores.

    Every ordered pair of distinct embeddings is scored with ``comparator``
    (the diagonal is fixed at 0). The scores are scaled by the largest one,
    condensed with ``ssd.squareform``, clustered with ``linkage`` and drawn
    with ``dendrogram``. The figure is displayed with ``plt.show()``.

    Args:
        indexMap: dict whose keys are '/'-separated names (only the last
            path component is used as a leaf label) and whose values are
            used to order the labels. Assumes each value is the position of
            the matching entry in ``embeddings`` - TODO confirm with callers.
        embeddings: container indexable by the integers
            ``0 .. len(embeddings) - 1``.
        comparator: optional callable ``(a, b) -> number`` giving the
            dissimilarity of two embeddings. When omitted,
            ``vectors.euclideanDistance(a, b)
            + 1 / (2 + 2 * vectors.cosineSimilarity(a, b))`` is used.

    Returns:
        None. The only effect is the displayed matplotlib figure.

    Raises:
        ValueError: if fewer than two embeddings are supplied (there is
            nothing to cluster).
    """
    embeddingsCount = len(embeddings)
    if embeddingsCount < 2:
        raise ValueError('at least two embeddings are required to build a tree')

    # Respect a caller-supplied comparator; fall back to the default metric
    # only when none was given.
    if comparator is None:
        comparator = lambda a, b: (
            vectors.euclideanDistance(a, b)
            + 1 / (2 + 2*vectors.cosineSimilarity(a, b)))

    # Build a real list: a lazy map() object would be exhausted by max()
    # before numpy gets to see the values.
    comparisons = [
        comparator(embeddings[i], embeddings[j]) if i != j else 0
        for i, j in itertools.product(range(embeddingsCount), repeat=2)
    ]
    maxComparison = max(comparisons)

    comparisons = numpy.array(comparisons, dtype=float)
    comparisons = comparisons.reshape(embeddingsCount, embeddingsCount)
    # Scale into [0, 1]; skip the division when every score is zero so the
    # matrix is not turned into NaNs.
    if maxComparison != 0:
        comparisons = comparisons / maxComparison

    # linkage() expects the condensed (upper-triangle) form.
    condensed = ssd.squareform(comparisons)
    links = linkage(condensed)

    fig, ax = plt.subplots()
    # Leave room on the right-hand side for the leaf labels.
    fig.subplots_adjust(right=0.8)

    # dendrogram() attaches labels[i] to observation i, so the labels must
    # follow the index order of the embeddings, not alphabetical order.
    orderedKeys = sorted(indexMap, key=lambda key: indexMap[key])
    names = [key.split('/')[-1] for key in orderedKeys]

    dendrogram(
        links,
        leaf_rotation=90.,
        leaf_font_size=8.,
        orientation='right',
        labels=names,
        show_contracted=True,
        show_leaf_counts=True)

    plt.show()
def compareEmbeddings(indexMap, embeddingsList, comparator=None, annotate=False, axisLabels=True):
    """Show a filled contour plot of pairwise embedding comparison scores.

    A square matrix is built where cell ``[row, col]`` holds
    ``comparator(embeddingsList[row], embeddingsList[col])``. Diagonal cells
    (an embedding compared with itself) are not computed; they start as NaN
    and are then replaced by the mean of the strictly positive values in
    their 3x3 neighbourhood. The matrix is drawn with ``plt.contourf`` and
    displayed with ``plt.show()``.

    Args:
        indexMap: dict mapping a file path to an integer. Its length sets
            the matrix size; its values are used as tick positions and the
            file's base name (up to the first '.') as the tick label.
        embeddingsList: container indexable by the integers
            ``0 .. len(indexMap) - 1``.
        comparator: optional callable ``(a, b) -> number``. When omitted,
            ``vectors.cosineSimilarity(a, b)
            + 1 / vectors.euclideanDistance(a, b)`` is used.
            NOTE(review): the default divides by the distance, so two
            distinct entries at distance 0 would divide by zero - confirm
            whether duplicates can occur.
        annotate: when true, write each cell's value times 100 (one
            decimal) onto the plot at the cell's position.
        axisLabels: when true, label both axes with the file names.

    Returns:
        None. The only effect is the displayed matplotlib figure.
    """
    embeddingsCount = len(indexMap)

    if comparator is None:
        comparator = lambda a, b: (
            vectors.cosineSimilarity(a, b)
            + 1 / vectors.euclideanDistance(a, b))

    # Build a concrete float matrix; numpy.reshape cannot consume the lazy
    # iterator that map() returns on Python 3.
    comparisons = numpy.array([
        comparator(embeddingsList[row], embeddingsList[col])
        if row != col else numpy.nan
        for row in range(embeddingsCount)
        for col in range(embeddingsCount)
    ], dtype=float).reshape(embeddingsCount, embeddingsCount)

    # Fill every NaN cell from its neighbourhood. The fill is done in place
    # and in row-major order, so already-filled cells contribute to the
    # cells that follow them.
    nanRows, nanCols = numpy.where(numpy.isnan(comparisons))
    for row, col in zip(nanRows, nanCols):
        # Only the lower bound needs clamping: a negative start would wrap
        # around, while a stop past the edge is truncated by numpy itself.
        neighbours = comparisons[max(row - 1, 0):row + 2,
                                 max(col - 1, 0):col + 2]
        # Keeps strictly positive values only; this also drops NaNs.
        neighbours = neighbours[neighbours > 0]
        if neighbours.size:
            comparisons[row, col] = numpy.mean(neighbours)

    fig, ax = plt.subplots()
    # Leave room at the bottom for the vertical x tick labels.
    fig.subplots_adjust(bottom=0.2)

    if axisLabels:
        filePaths = list(indexMap.keys())
        fileNames = [os.path.basename(filePath).split('.')[0] for filePath in filePaths]
        indices = [indexMap[filePath] for filePath in filePaths]

        plt.xticks(indices, fileNames, size='small', rotation='vertical')
        plt.yticks(indices, fileNames, size='small')

    plt.contourf(comparisons)

    if annotate:
        # contourf draws comparisons[row, col] at x=col, y=row, so the text
        # for a cell has to be anchored at (col, row).
        for (row, col), value in numpy.ndenumerate(comparisons):
            plt.annotate('{0:.1f}'.format(value*100), (col, row))

    plt.show()