if not os.path.exists(distance_matrix_name):
    for l in languages:
        X = np.fromfile(embed_data_dir + l + "_embeddings.raw", dtype=np.float32, count=-1)
        print(X.shape)
        X.resize(X.shape[0] // dim, dim)
        print(X.shape)
        X = X[start:end]
        print(X.shape)
        target_embeddings.append(X)

    # Calculate representational similarity analysis
    x, C = get_dists(target_embeddings, labels=languages, ticklabels=[],
                     distance="cosine", save_dir=result_dir)
    distance_matrix = compute_distance_over_dists(x, C, languages,
                                                  save_dir=result_dir + category)
    print(distance_matrix.shape)

    # Save the result
    with open(distance_matrix_name, 'wb') as handle:
        pickle.dump(distance_matrix, handle)
else:
    # If the distance matrix has already been calculated and you just want to
    # adjust the plot, you can load it directly
    with open(distance_matrix_name, 'rb') as handle:
        distance_matrix = pickle.load(handle)
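# get_dists and compute_distance_over_dists are project helpers whose
# implementations fall outside this excerpt. A minimal sketch of what
# get_dists plausibly computes, assuming it returns one pairwise
# cosine-similarity matrix per language (the function name, its first return
# value, and the numpy-only implementation are assumptions, not the actual
# helper):
def get_dists_sketch(embeddings_per_lang, labels):
    import numpy as np
    C = []
    for X in embeddings_per_lang:
        Xn = X / np.linalg.norm(X, axis=1, keepdims=True)  # L2-normalise rows
        C.append(Xn @ Xn.T)  # pairwise cosine similarities, shape (n, n)
    return labels, C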
print(langdict.keys())

# get words
if len(words) == 0:
    words = list(langdicts[0].keys())

# get vectors
langvectors = []
for word in words:
    langvectors.append(langdict[word])
vectors.append(langvectors)

### Representational Similarity Analysis
# Calculate similarity matrices for all languages
x, C = get_dists(vectors, labels=languages, ticklabels=words,
                 distance="cosine", save_dir=save_dir + name)

# Calculate RSA over all languages
distance_matrix = compute_distance_over_dists(x, C, languages,
                                              save_dir=save_dir + name)

# Save distance matrix
with open(save_dir + "_distancematrix_" + name + ".pickle", 'wb') as handle:
    pickle.dump(distance_matrix, handle)
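# compute_distance_over_dists performs the second-order (RSA) comparison
# between the per-language similarity matrices; its implementation is not
# part of this excerpt. A minimal sketch under the assumption that it
# correlates the flattened upper triangles of the similarity matrices and
# turns the correlation into a distance (the function name and the choice of
# Spearman correlation are assumptions):
def rsa_distance_sketch(C):
    import numpy as np
    from scipy.stats import spearmanr
    n = len(C)
    iu = np.triu_indices(C[0].shape[0], k=1)  # upper triangle, diagonal excluded
    D = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            rho, _ = spearmanr(C[i][iu], C[j][iu])
            D[i, j] = 1 - rho  # high correlation -> small distance
    return D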
words = list(set(langdict1.keys()).union(langdict2.keys()))
print(words)

# get vectors
langvectors = []
for word in words:
    try:
        langvectors.append(langdict1[word])
    except KeyError:
        langvectors.append(langdict2[word])
vectors.append(langvectors)

# Calculate cosine similarities
x, C = get_dists(vectors, labels=languages, ticklabels=words, distance="cosine")
pairs = {}

# Extra analysis for reviewer!
# Check languages with max and min values in examples
# for lang in ["et", "hu", "el", "he", "it", "pt", "es"]:
#     id = languages.index(lang)
#     similarities = np.array(C[id])
#     # Mask the lower part with zeros (including the diagonal)
#     upper_part = np.triu(similarities, 1)
#     # Flatten and remove all zeros
#     flattened = np.matrix.flatten(upper_part)
#     similarities = [x for x in flattened if not x == 0]
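# The commented-out reviewer analysis above collects the upper-triangle
# similarities per language, but the excerpt stops before reporting them.
# A standalone sketch of a possible continuation (the function name and the
# statistics reported are assumptions):
def extreme_similarities_sketch(C, languages, langs=("et", "hu", "el", "he", "it", "pt", "es")):
    import numpy as np
    for lang in langs:
        sims = np.array(C[languages.index(lang)])
        vals = sims[np.triu_indices_from(sims, k=1)]  # strict upper triangle
        print(lang, "min=%.3f max=%.3f" % (vals.min(), vals.max()))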
print("Read vectors for " + lang) with open(data_dir + embeddings_name + "_embeddings." + lang + ".pickle", 'rb') as handle: langdict = pickle.load(handle) langdicts.append(langdict) # get vectors langvectors = [] for word in words["en"]: langvectors.append(langdict[word]) vectors.append(langvectors) # Calculate cosine similarities for selected words x, C = get_dists([langvectors], labels=[lang], ticklabels=words[lang], distance="cosine", save_dir=save_dir) ### PLOTTING data = C[0] mask = np.tri(data.shape[0], k=-1) data = np.ma.array(data, mask=mask) # Plot the heatmap im = ax.imshow(data, cmap="YlOrRd", vmin=0, vmax=1) # Finetuning the plot ax.xaxis.tick_top() ax.xaxis.set_label_position('top') ax.yaxis.tick_right() ax.yaxis.set_label_position('right')
for l in target_langs:
    X = np.fromfile(data_dir + l + "_embeddings.raw", dtype=np.float32, count=-1)
    X.resize(X.shape[0] // dim, dim)
    X = X[start:end]
    target_embeddings.append(X)

# Get English sentences
with open(data_dir + "en_sents.txt") as f:
    content = f.readlines()
content = [x.strip() for x in content]
content = content[start:end]

x, C = get_dists(target_embeddings, labels=target_langs, ticklabels=content, distance="cosine")

# Collect, for every sentence pair, the similarity it receives in each language
pairs = {}
for sent1 in range(0, len(content)):
    for sent2 in range(0, len(content)):
        if sent1 < sent2:
            similarities = np.array([C[lang][sent1][sent2] for lang in range(0, len(C))])
            pairs[(content[sent1], content[sent2])] = similarities

# Variance of each pair's similarity across languages
var = {}
for key in pairs.keys():
    var[key] = np.var(pairs[key])

print("Pairs with variation across languages:")
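# The header above is printed but the listing itself falls outside this
# excerpt; a sketch of reporting the most variable pairs (sorting by variance
# and printing the top 10 are assumptions):
for (s1, s2), v in sorted(var.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print("%.4f  %s | %s" % (v, s1, s2))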