Example #1
def get_percentage(thresholds, lang="nl"):
    """Estimate, for each threshold, the fraction of sampled sentence pairs kept."""
    _, lang_short = language_map(lang)

    path_en = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + ".en"
    path_to = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + "." + lang_short

    sample = 0.1  # fraction of line pairs to inspect

    num_lines = get_num_lines(path_en)
    lines_sampled = 0

    counts = np.zeros(len(thresholds))
    counter = 0
    with open(path_en, "r") as file1, open(path_to) as file2:
        line1 = file1.readline()
        line2 = file2.readline()
        while line1 and line2:
            counter += 1
            if counter % 10_000_000 == 0:
                print("Read", counter, "out of", num_lines)

            if random.random() < sample:
                lines_sampled += 1
                for i, threshold in enumerate(thresholds):
                    if keep_lines(line1, line2, threshold):
                        counts[i] += 1
                    else:
                        # Assumes thresholds are ordered so that a pair failing
                        # one threshold also fails all that follow
                        break

            line1 = file1.readline()
            line2 = file2.readline()

    return counts / lines_sampled
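All of the examples on this page are excerpted from the same module and omit their imports and helpers. A minimal sketch of what they assume; the behaviour of language_map is reconstructed from how it is called here, not taken from the source:

import os
import random
import sys

import numpy as np


def language_map(lang):
    # Assumed behaviour: accept a short ISO code or a full language name and
    # return (full_name, short_code), e.g. "nl" -> ("dutch", "nl").
    languages = {"nl": "dutch", "fr": "french", "en": "english"}
    if lang in languages:
        return languages[lang], lang
    codes = {full: short for short, full in languages.items()}
    return lang, codes[lang]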
Example #2
    def __init__(self, lang, type="w2v", dim=100, max_vocab=50000):
        self.vector_dic = dict()
        self.filename = ''
        self.pickle_filename = ''

        lang_full, lang_short = language_map(lang)
        self.lang_full = lang_full
        self.lang_short = lang_short
        self.max_vocab = max_vocab
        self.dim = dim

        self.dir = "vector_models/" + lang_full

        if type == "w2v":
            self.get_w2v_embeddings()

        if type == "glove":
            self.get_glove_embeddings()

        if type == "ft":
            if dim != 300:
                print("embedding.dim set to 300; only dimension available!")
                self.dim = 300

            self.get_ft_embeddings()

        # Compare against self.dim, which the "ft" branch may have raised to 300
        test_vec = next(iter(self.vector_dic.values()))
        if test_vec.size != self.dim:
            print("Vector size different from desired dim =", self.dim,
                  file=sys.stderr)
            
        if "SOS" not in self.vector_dic:
            self.add_special_words()
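The class this __init__ belongs to is not shown in the excerpt. A hypothetical usage, assuming the class is named Embedding (a stand-in, not the project's real name) and that the get_*_embeddings methods fill vector_dic:

embedding = Embedding("nl", type="ft")  # dim is forced to 300 for fastText
vector = embedding.vector_dic.get("huis")
print(embedding.lang_full, embedding.dim, len(embedding.vector_dic))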
Example #3
def get_fasttext_url(language):
    _, language = language_map(language)

    filename = "cc." + language + ".300.vec.gz"
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/" + filename

    return url, filename
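The return values, traced by hand from the concatenation above:

url, filename = get_fasttext_url("nl")
# filename: cc.nl.300.vec.gz
# url: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz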
Example #4
def get_wikipedia2vec_url(language, dim=300):
    _, language = language_map(language)

    filename = language + "wiki_20180420_" + str(dim) + "d.txt.bz2"
    url = "http://wikipedia2vec.s3.amazonaws.com/models/" + language + "/2018-04-20/" + filename

    return url, filename
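And likewise for the Wikipedia2Vec builder:

url, filename = get_wikipedia2vec_url("nl", dim=100)
# filename: nlwiki_20180420_100d.txt.bz2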
Example #5
def download_subtitle_files(datasets):
    print("\nDownloading languages\n")
    for language, url in datasets:
        _, lang_short = language_map(language)
        print("Downloading", language)
        if "raw" in url:
            DIR = "data/subtitle_data/raw"
        else:
            DIR = "data/subtitle_data/en_" + lang_short

        # makedirs also creates missing parent directories; plain mkdir would fail
        os.makedirs(DIR, exist_ok=True)

        download_url(url, DIR)
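A sketch of the expected input: an iterable of (language, url) pairs. The URL below is a placeholder for illustration, and download_url is a project helper not shown here:

datasets = [
    # Placeholder URL; any URL containing "raw" is routed to
    # data/subtitle_data/raw instead of the per-language directory.
    ("nl", "https://example.org/OpenSubtitles/en-nl.txt.zip"),
]
download_subtitle_files(datasets)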
Example #6
def line_shrinkage(thresholds, lang="nl"):
    """Per threshold, the mean length of kept pairs relative to all sampled pairs."""
    _, lang_short = language_map(lang)

    path_en = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + ".en"
    path_to = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + "." + lang_short

    p = 0.1  # sampling probability per line pair

    lines_sampled = 0
    num_lines = get_num_lines(path_en)

    original_lengths = 0

    new_lengths = np.zeros(len(thresholds))
    lines_kept = np.zeros(len(thresholds))

    counter = 0
    with open(path_en, "r") as file1, open(path_to) as file2:
        line1 = file1.readline()
        line2 = file2.readline()
        while line1 and line2:
            counter += 1
            if counter % 10_000_000 == 0:
                print("Read", counter, "out of", num_lines)

            if random.random() < p:
                lines_sampled += 1

                apr_length = (len(line1) + len(line2)) / 2
                for i, threshold in enumerate(thresholds):
                    if keep_lines(line1, line2, threshold):
                        lines_kept[i] += 1
                        new_lengths[i] += apr_length

                original_lengths += apr_length

            line1 = file1.readline()
            line2 = file2.readline()

    # A threshold that keeps no lines at all divides by zero here
    new_lengths /= lines_kept
    original_lengths /= lines_sampled

    return new_lengths / original_lengths
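A usage sketch combining the two sampling passes above. It assumes the OpenSubtitles files for the language pair are already in place; the thresholds are given in descending order because get_percentage's early break presumes that a pair failing one threshold also fails all later ones:

thresholds = np.linspace(0.9, 0.1, 9)
kept_fraction = get_percentage(thresholds, lang="nl")
shrinkage = line_shrinkage(thresholds, lang="nl")
for t, frac, shrink in zip(thresholds, kept_fraction, shrinkage):
    print(f"threshold {t:.1f}: {frac:.1%} kept, length ratio {shrink:.2f}")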
Example #7
def sample_lines(num_lines,
                 threshold=0.55,
                 only_kept=False,
                 only_discarded=False,
                 lang="nl"):
    """Print up to num_lines kept and num_lines discarded pairs for inspection."""
    _, lang_short = language_map(lang)

    path_en = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + ".en"
    path_to = "data/subtitle_data/en_" + lang_short + "/OpenSubtitles.en-" + lang_short + "." + lang_short

    kept = 0
    discarded = 0

    with open(path_en, "r") as file1, open(path_to) as file2:
        line1 = file1.readline()
        line2 = file2.readline()
        while line1 and line2 and (kept < num_lines or discarded < num_lines):
            # Inspect only a tiny random slice of the corpus
            if random.random() < 0.0001:
                # Relative length difference between the two sides of the pair
                diff = abs(len(line1) - len(line2)) / np.mean(
                    (len(line1), len(line2)))
                if keep_lines(line1, line2, threshold):
                    # Only show borderline keeps, i.e. pairs near the threshold
                    if kept < num_lines and diff > threshold - 0.1:
                        kept += 1
                        if not only_discarded:
                            print("\nKEEPING")
                            print("Diff", diff)
                            print(line1)
                            print(line2)
                elif discarded < num_lines:
                    discarded += 1
                    if not only_kept:
                        print("\nDISCARDING")
                        print("Diff", diff)
                        print(line1)
                        print(line2)

            line1 = file1.readline()
            line2 = file2.readline()
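An illustrative call; with only_kept=True it prints up to five borderline pairs that survive the filter:

sample_lines(5, threshold=0.55, only_kept=True, lang="nl")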
Example #8
def get_subplot_for_data(lang="en"):
    """PCA scatter of topic words for Word2Vec/FastText x CBOW/Skip-Gram models."""
    lang_full, lang_short = language_map(lang)
    fig = plt.figure()

    plot_labels = {
        "w2v": "Word2Vec",
        "ft": "FastText",
        "cbow": "CBOW",
        "sg": "Skip-Gram"
    }

    for i, type in enumerate(["w2v", "ft"]):
        for j, hp in enumerate(["cbow", "sg"]):
            print(type, hp)

            # Load the vectors for this model type / training objective
            model_name = type + "_" + lang + "_d100_" + hp + "_st.bin"
            path = "data/vector_models/" + model_name

            if type == "ft":
                wv = FastTextKeyedVectors.load(path)
            else:
                wv = KeyedVectors.load_word2vec_format(path, binary=True)

            words = all_words[lang]

            total_words = []
            for topic in words:
                total_words.extend(topic)

            pca = PCA(n_components=2)

            # Standardise (zero mean, unit variance) before fitting PCA
            X = wv[wv.vocab]
            mean = np.mean(X, axis=0)
            std = np.std(X, axis=0)

            X -= mean
            X /= std
            pca.fit(X)

            # Start subplot
            subplot_num = i * 2 + (j + 1)
            axis = fig.add_subplot(2, 2, subplot_num)

            for topic in words:
                # Apply the same standardisation to the topic words
                X = wv[topic]
                X -= mean
                X /= std
                result = pca.transform(X)

                axis.scatter(result[:, 0], result[:, 1], s=5.0)
                for k, word in enumerate(topic):
                    axis.annotate(word,
                                  xy=(result[k, 0], result[k, 1]),
                                  size=7)

            # Hide the tick labels; raw PCA coordinates carry no meaning
            plt.setp(axis.get_xticklabels(), visible=False)
            plt.setp(axis.get_yticklabels(), visible=False)

            axis.set_title(lang_full.capitalize() + " - " + plot_labels[type] +
                           " - " + plot_labels[hp],
                           fontdict={"fontsize": 12})
    # plt.savefig("Figures/embedding_" + lang_short + ".png")

    plt.show()
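This last example additionally assumes matplotlib, scikit-learn, a pre-4.0 gensim (wv.vocab was removed in gensim 4), and an all_words dict of topic word lists defined elsewhere in the project:

import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastTextKeyedVectors
from sklearn.decomposition import PCA

get_subplot_for_data("en")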