import os
import random
import sys

import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastTextKeyedVectors
from sklearn.decomposition import PCA

# Project helpers such as language_map, get_num_lines, keep_lines, download_url
# and the all_words word lists are assumed to be defined elsewhere in the repository.


def get_percentage(thresholds, lang="nl"):
    """Estimate, per threshold, the fraction of sentence pairs that survive filtering."""
    _, lang_short = language_map(lang)
    path_en = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + ".en"
    path_to = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + "." + lang_short
    sample = 0.1  # probability of including a line pair in the sample
    num_lines = get_num_lines(path_en)
    lines_sampled = 0
    counts = np.zeros(len(thresholds))
    counter = 0
    with open(path_en, "r") as file1, open(path_to) as file2:
        line1 = file1.readline()
        line2 = file2.readline()
        while line1 and line2:
            counter += 1
            if counter % 1e7 == 0:
                print("Read", counter, "out of", num_lines)
            if random.random() < sample:
                lines_sampled += 1
                # Assumes thresholds are ordered so that failing one implies
                # failing all that follow, which is what the early break relies on.
                for i, threshold in enumerate(thresholds):
                    if keep_lines(line1, line2, threshold):
                        counts[i] += 1
                    else:
                        break
            line1 = file1.readline()
            line2 = file2.readline()
    return counts / lines_sampled
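
# Hypothetical usage sketch (not part of the original module): sweep a grid of
# thresholds and report the estimated fraction of sentence pairs that survive
# filtering. The threshold values are examples only; they should be ordered so
# that each later threshold is at least as strict, matching the early break in
# get_percentage. Assumes the helpers above and the OpenSubtitles files exist.
def print_keep_percentages(lang="nl"):
    thresholds = [0.9, 0.7, 0.5, 0.3]
    fractions = get_percentage(thresholds, lang=lang)
    for threshold, fraction in zip(thresholds, fractions):
        print("threshold", threshold, "-> kept", round(100 * fraction, 1), "%")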
# __init__ of the embedding wrapper class (the surrounding class definition is
# not part of this excerpt).
def __init__(self, lang, type="w2v", dim=100, max_vocab=50000):
    self.vector_dic = dict()
    self.filename = ''
    self.pickle_filename = ''
    lang_full, lang_short = language_map(lang)
    self.lang_full = lang_full
    self.lang_short = lang_short
    self.max_vocab = max_vocab
    self.dim = dim
    self.dir = "vector_models/" + lang_full
    if type == "w2v":
        self.get_w2v_embeddings()
    if type == "glove":
        self.get_glove_embeddings()
    if type == "ft":
        if dim != 300:
            print("embedding.dim set to 300; it is the only available dimension!")
            self.dim = 300
        self.get_ft_embeddings()
    # Sanity check: the loaded vectors should match the dimension we ended up with.
    test_vec = next(iter(self.vector_dic.values()))
    if test_vec.size != self.dim:
        print("Vector size different than desired dim =", self.dim, file=sys.stderr)
    if "SOS" not in self.vector_dic:
        self.add_special_words()
def get_fasttext_url(language):
    _, language = language_map(language)
    filename = "cc." + language + ".300.vec.gz"
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/" + filename
    return url, filename
def get_wikipedia2vec_url(language, dim=300):
    _, language = language_map(language)
    filename = language + "wiki_20180420_" + str(dim) + "d.txt.bz2"
    url = "http://wikipedia2vec.s3.amazonaws.com/models/" + language + "/2018-04-20/" + filename
    return url, filename
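
# Illustrative check (not in the original source): print the download locations
# the two URL helpers above construct. For language="nl" (assuming language_map
# maps "nl" to the short code "nl"), they produce
#   https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz
#   http://wikipedia2vec.s3.amazonaws.com/models/nl/2018-04-20/nlwiki_20180420_300d.txt.bz2
def print_vector_urls(language="nl"):
    for get_url in (get_fasttext_url, get_wikipedia2vec_url):
        url, filename = get_url(language)
        print(filename, "<-", url)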
def download_subtitle_files(datasets):
    print("\nDownloading languages\n")
    for language, url in datasets:
        lang_full, lang_short = language_map(language)
        print("Downloading", language)
        if "raw" in url:
            DIR = "data/subtitle_data/raw"
        else:
            DIR = "data/subtitle_data/en_" + lang_short
        if not os.path.exists(DIR):
            os.mkdir(DIR)
        download_url(url, DIR)
def line_shrinkage(thresholds, lang="nl"):
    """Estimate, per threshold, the average length of kept pairs relative to all sampled pairs."""
    _, lang_short = language_map(lang)
    path_en = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + ".en"
    path_to = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + "." + lang_short
    p = 0.1  # sampling probability
    lines_sampled = 0
    num_lines = get_num_lines(path_en)
    original_lengths = 0
    new_lengths = np.zeros(len(thresholds))
    lines_kept = np.zeros(len(thresholds))
    counter = 0
    with open(path_en, "r") as file1, open(path_to) as file2:
        line1 = file1.readline()
        line2 = file2.readline()
        while line1 and line2:
            counter += 1
            if counter % 1e7 == 0:
                print("Read", counter, "out of", num_lines)
            if random.random() < p:
                lines_sampled += 1
                # Approximate pair length as the mean of the two line lengths.
                apr_length = (len(line1) + len(line2)) / 2
                for i, threshold in enumerate(thresholds):
                    if keep_lines(line1, line2, threshold):
                        lines_kept[i] += 1
                        new_lengths[i] += apr_length
                original_lengths += apr_length
            line1 = file1.readline()
            line2 = file2.readline()
    new_lengths /= lines_kept
    original_lengths /= lines_sampled
    return new_lengths / original_lengths
def sample_lines(num_lines, threshold=0.55, only_kept=False, only_discarded=False, lang="nl"):
    """Print a small random sample of kept and discarded sentence pairs for inspection."""
    _, lang_short = language_map(lang)
    path_en = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + ".en"
    path_to = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + "." + lang_short
    kept = 0
    discarded = 0
    with open(path_en, "r") as file1, open(path_to) as file2:
        line1 = file1.readline()
        line2 = file2.readline()
        while line1 and line2 and (kept < num_lines or discarded < num_lines):
            if random.random() < 0.0001:
                # Relative length difference between the two sides of the pair.
                diff = abs(len(line1) - len(line2)) / np.mean((len(line1), len(line2)))
                if keep_lines(line1, line2, threshold):
                    # Only show kept pairs that fall close to the threshold.
                    if kept < num_lines and diff > threshold - 0.1:
                        kept += 1
                        if not only_discarded:
                            print("\nKEEPING")
                            print("Diff", diff)
                            print(line1)
                            print(line2)
                elif discarded < num_lines:
                    discarded += 1
                    if not only_kept:
                        print("\nDISCARDING")
                        print("Diff", diff)
                        print(line1)
                        print(line2)
            line1 = file1.readline()
            line2 = file2.readline()
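
# keep_lines is defined elsewhere in the repository; the sketch below is only an
# illustration of the filter the functions above appear to assume, based on the
# relative length difference that sample_lines computes: a pair is kept when the
# two line lengths differ by less than `threshold` relative to their mean length.
def keep_lines_sketch(line1, line2, threshold):
    mean_length = (len(line1) + len(line2)) / 2
    if mean_length == 0:
        return False
    diff = abs(len(line1) - len(line2)) / mean_length
    return diff < threshold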
def get_subplot_for_data(lang="en"):
    """Plot a 2x2 grid of 2-D PCA projections of topic words per model type and architecture."""
    lang_full, lang_short = language_map(lang)
    fig = plt.figure()
    plot_labels = {
        "w2v": "Word2Vec",
        "ft": "FastText",
        "cbow": "CBOW",
        "sg": "Skip-Gram"
    }
    for i, type in enumerate(["w2v", "ft"]):
        for j, hp in enumerate(["cbow", "sg"]):
            print(type, hp)
            # Load the pretrained vectors for this model type and architecture.
            model_name = type + "_" + lang + "_d100_" + hp + "_st.bin"
            path = "data/vector_models/" + model_name
            if type == "ft":
                wv = FastTextKeyedVectors.load(path)
            else:
                wv = KeyedVectors.load_word2vec_format(path, binary=True)
            words = all_words[lang]
            total_words = []
            for topic in words:
                total_words.extend(topic)
            # Fit PCA on the normalized full vocabulary.
            pca = PCA(n_components=2)
            X = wv[wv.vocab]
            mean = np.mean(X, axis=0)
            var = np.var(X, axis=0)
            X -= mean
            X /= var
            pca.fit(X)
            # Scatter-plot each topic in its own subplot, one point per word.
            subplot_num = i * 2 + (j + 1)
            axis = fig.add_subplot(2, 2, subplot_num)
            for topic in words:
                X = wv[topic]
                X -= mean
                X /= var
                result = pca.transform(X)
                axis.scatter(result[:, 0], result[:, 1], s=5.0)
                for k, word in enumerate(topic):
                    axis.annotate(word, xy=(result[k, 0], result[k, 1]), size=7)
            plt.setp(axis.get_xticklabels(), visible=False)
            plt.setp(axis.get_yticklabels(), visible=False)
            axis.set_title(lang_full.capitalize() + " - " + plot_labels[type] + " - " + plot_labels[hp],
                           fontdict={"fontsize": 12})
    # plt.savefig("Figures/embedding_" + lang_short + ".png")
    plt.show()
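
# Hypothetical driver (not in the original source) tying the pieces together:
# estimate how much data each threshold keeps, inspect a few borderline pairs,
# and plot the embedding spaces. The threshold values are examples, and the data
# files, vector models, and helper functions are assumed to have been set up
# elsewhere in the repository.
if __name__ == "__main__":
    thresholds = [0.9, 0.7, 0.5, 0.3]
    print("Fraction of pairs kept:", get_percentage(thresholds, lang="nl"))
    print("Relative line length after filtering:", line_shrinkage(thresholds, lang="nl"))
    sample_lines(5, threshold=0.55, lang="nl")
    get_subplot_for_data(lang="en")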