Ejemplo n.º 1
0
def main():
    """
    Run the self-supervised alignment (S4) demo on a single corpus pair.

    Steps:
        1. Load the source and target word vectors and restrict both to
           their common vocabulary.
        2. Select landmark / non-landmark words with ``s4``.
        3. Align the two spaces using the selected landmarks as anchors.
        4. Plot the distribution of the ``cosine`` measure between the two
           vectors of each word, separately for landmarks (blue) and
           non-landmarks (red).
    """
    # Alternative corpus pairs; swap in by replacing the active assignment.
    # wv_source = "wordvectors/latin/corpus1/0.vec"
    # wv_target = "wordvectors/latin/corpus2/0.vec"
    # wv_source = "wordvectors/source/theguardianuk.vec"
    # wv_target = "wordvectors/source/thenewyorktimes_1.vec"
    wv_source = "wordvectors/semeval/latin-corpus1.vec"
    wv_target = "wordvectors/semeval/latin-corpus2.vec"
    # wv_source = "wordvectors/usuk/bnc.vec"
    # wv_target = "wordvectors/usuk/coca_mag.vec"
    # wv_source = "wordvectors/artificial/NYT-0.vec"
    # wv_target = "wordvectors/artificial/NYT-500_random.vec"
    plt.style.use("seaborn")

    # Read WordVectors
    normalized = False
    wv1 = WordVectors(input_file=wv_source, normalized=normalized)
    wv2 = WordVectors(input_file=wv_target, normalized=normalized)

    # Keep only words present in both vocabularies
    wv1, wv2 = intersection(wv1, wv2)

    # The transform returned by s4 is not needed here: the spaces are
    # re-aligned explicitly on the selected landmarks right below.
    landmarks, non_landmarks, _ = s4(wv1,
                                     wv2,
                                     cls_model="nn",
                                     n_targets=100,
                                     n_negatives=100,
                                     rate=1,
                                     t=0.5,
                                     iters=100,
                                     verbose=1,
                                     plot=1)
    wv1, wv2, _ = align(wv1, wv2, anchor_words=landmarks)

    d_l = [cosine(wv1[w], wv2[w]) for w in landmarks]
    d_n = [cosine(wv1[w], wv2[w]) for w in non_landmarks]

    # Label both series: without labels plt.legend() has nothing to show
    # and matplotlib only emits a "no handles with labels" warning.
    sns.distplot(d_l, color="blue", label="landmarks")
    sns.distplot(d_n, color="red", label="non-landmarks")
    plt.legend()
    plt.show()
Ejemplo n.º 2
0
def main():
    """
    Evaluate landmark selection strategies on SemEval-2020 Task 1 data
    (Unsupervised Lexical Semantic Change Detection).

    For every requested language and every alignment method, the two corpus
    embeddings are aligned on the landmarks chosen by that method, and the
    target words are then classified as changed / unchanged with the
    classifier selected via ``--cls``. The experiment shows how the
    classification accuracy is affected by the choice of landmarks.

    Command line arguments:
        --languages  one or more of the SemEval languages
                     (default: english german latin swedish)
        --cls        "cosine" (sweep over fixed thresholds), "cosine-auto"
                     (threshold found by ``threshold_crossvalidation``) or
                     "s4" (self-supervised classifier)

    Prints a Markdown table with mean and max accuracy per method/language.
    """
    np.random.seed(1)

    align_methods = [
        "s4", "noise-aware", "top-10", "bot-10", "global", "top-5", "bot-5"
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument("--languages",
                        nargs="+",
                        help="Languages to use",
                        default=["english", "german", "latin", "swedish"])
    parser.add_argument("--cls",
                        choices=["cosine", "s4", "cosine-auto"],
                        default="cosine",
                        help="Classifier to use")

    args = parser.parse_args()
    languages = args.languages
    classifier = args.cls

    # Per-language keyword arguments for s4 when it is used to SELECT
    # landmarks for the alignment.
    align_params = {
        "english": {
            "n_targets": 100,
            "n_negatives": 50,
            "rate": 1,
            "iters": 100,
        },
        "german": {
            "n_targets": 100,
            "n_negatives": 200,
            "rate": 1,
            "iters": 100,
        },
        "latin": {
            "n_targets": 10,
            "n_negatives": 4,
            "rate": 0.5,
            "iters": 100,
        },
        "swedish": {
            "n_targets": 100,
            "n_negatives": 200,
            "rate": 1,
            "iters": 100,
        },
    }

    # Per-language keyword arguments for s4 when it is used as the
    # CLASSIFIER (--cls s4).
    cls_params = {
        "english": {
            "n_targets": 100,
            "n_negatives": 50,
            "rate": 1,
            "iters": 500,
        },
        "german": {
            "n_targets": 50,
            "n_negatives": 200,
        },
        "latin": {
            "n_targets": 50,
            "n_negatives": 10,
        },
        "swedish": {
            "n_targets": 120,
            "n_negatives": 120,
        },
    }

    # Per-language keyword arguments for threshold_crossvalidation
    # (--cls cosine-auto).
    auto_params = {
        "english": {
            "rate": 1.5,
            "n_fold": 1,
            "n_targets": 50,
            "n_negatives": 100,
        },
        "german": {
            "rate": 1,
            "n_fold": 1,
            "n_targets": 200,
            "n_negatives": 100,
        },
        "latin": {
            "rate": 1,
            "n_targets": 100,
            "n_negatives": 15,
        },
        "swedish": {
            "rate": 1,
            "n_targets": 100,
            "n_negatives": 200,
        },
    }

    normalized = False
    # All result containers are indexed as [align_method][lang]
    accuracies = defaultdict(dict)
    true_positives = defaultdict(dict)
    false_negatives = defaultdict(dict)
    cm = defaultdict(dict)
    for lang in languages:
        thresholds = np.arange(0.1, 1, 0.1)
        # NOTE(review): both tasks read the very same file, so true_ranking
        # ends up holding the same values as y_true. Presumably task 2 should
        # point at a separate graded/ranking truth file -- confirm.
        path_task1 = "data/semeval/truth/%s.txt" % lang
        path_task2 = "data/semeval/truth/%s.txt" % lang

        # Truth files are tab-separated: <target word>\t<value>
        with open(path_task1) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            targets, true_class = zip(*data)
            y_true = np.array(true_class, dtype=int)
        with open(path_task2) as fin:
            data = map(lambda s: s.strip().split("\t"), fin.readlines())
            _, true_ranking = zip(*data)
            true_ranking = np.array(true_ranking, dtype=float)

        corpus1_path = "wordvectors/semeval/%s-corpus1.vec" % lang
        corpus2_path = "wordvectors/semeval/%s-corpus2.vec" % lang
        wv1 = WordVectors(input_file=corpus1_path, normalized=normalized)
        wv2 = WordVectors(input_file=corpus2_path, normalized=normalized)

        c_method = defaultdict(list)
        wv1, wv2 = intersection(wv1, wv2)
        n_words = len(wv1.words)
        prediction = dict()  # store per-word prediction
        for align_method in align_methods:
            accuracies[align_method][lang] = list()
            true_positives[align_method][lang] = list()
            false_negatives[align_method][lang] = list()
            cm[align_method][lang] = np.zeros((2, 2))

            # --- Landmark selection ---------------------------------------
            if align_method == "global":
                landmarks = wv1.words
            elif align_method == "noise-aware":
                Q, alpha, landmarks, non_landmarks = noise_aware(
                    wv1.vectors, wv2.vectors)
                # noise_aware is given raw matrices; map the indices it
                # returns back to words for align()
                landmarks = [wv1.words[i] for i in landmarks]
            elif align_method == "s4":
                landmarks, non_landmarks, Q = s4(
                    wv1,
                    wv2,
                    cls_model="nn",
                    verbose=0,
                    **align_params[lang],
                )
            # "top-N" takes the FIRST N% of the vocabulary. The previous
            # slices (words[int(n * f):]) selected everything EXCEPT that
            # prefix, i.e. the complement of what the method name says.
            elif align_method == "top-10":
                landmarks = wv1.words[:int(n_words * 0.1)]
            elif align_method == "top-5":
                landmarks = wv1.words[:int(n_words * 0.05)]
            elif align_method == "top-50":
                landmarks = wv1.words[:int(n_words * 0.50)]
            # "bot-N" takes the LAST N% of the vocabulary.
            elif align_method == "bot-10":
                landmarks = wv1.words[-int(n_words * 0.1):]
            elif align_method == "bot-5":
                landmarks = wv1.words[-int(n_words * 0.05):]
            elif align_method == "bot-50":
                landmarks = wv1.words[-int(n_words * 0.50):]

            wv1_, wv2_, Q = align(wv1, wv2, anchor_words=landmarks)

            # --- Classification -------------------------------------------
            # Cosine-based classifier: sweep over all fixed thresholds
            if classifier == "cosine":
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words])
                x = get_feature_cdf(x)
                # keep only the scores of the target words
                x = np.array([x[wv1.word_id[i.lower()]] for i in targets])
                p = x.reshape(-1, 1)
                y_pred = vote(p)

                best_acc = 0
                for threshold in thresholds:
                    y_bin = (y_pred > threshold)
                    correct = (y_bin == y_true)

                    accuracy = accuracy_score(y_true, y_bin)
                    # remember per-word correctness of the best threshold
                    if accuracy > best_acc:
                        prediction[align_method] = correct
                        best_acc = accuracy
                    _tn, _fp, fn, tp = confusion_matrix(y_true, y_bin).ravel()
                    cm[align_method][lang] += confusion_matrix(y_true,
                                                               y_bin,
                                                               normalize="all")
                    accuracies[align_method][lang].append(round(accuracy, 2))
                    true_positives[align_method][lang].append(round(tp, 2))
                    false_negatives[align_method][lang].append(round(fn, 2))
            # Cosine-based classifier with an automatically chosen threshold
            elif classifier == "cosine-auto":
                t_cos = threshold_crossvalidation(wv1_,
                                                  wv2_,
                                                  iters=1,
                                                  **auto_params[lang],
                                                  landmarks=landmarks)
                x = np.array([cosine(wv1_[w], wv2_[w]) for w in wv1.words])
                x = get_feature_cdf(x)
                x = np.array([x[wv1.word_id[i.lower()]] for i in targets])
                p = x.reshape(-1, 1)
                y_pred = vote(p)
                y_bin = y_pred > t_cos
                correct = (y_bin == y_true)

                accuracy = accuracy_score(y_true, y_bin)

                accuracies[align_method][lang].append(round(accuracy, 2))

            # Self-supervised classifier trained on the fixed landmarks
            elif classifier == "s4":
                model = s4(wv1_,
                           wv2_,
                           landmarks=landmarks,
                           verbose=0,
                           **cls_params[lang],
                           update_landmarks=False)
                # Concatenate vectors of target words for prediction
                x = np.array([
                    np.concatenate((wv1_[w.lower()], wv2_[w.lower()]))
                    for w in targets
                ])
                y_pred = model.predict(x)
                y_bin = y_pred > 0.5
                correct = (y_bin == y_true)

                accuracy = accuracy_score(y_true, y_bin)
                print(accuracy)
                accuracies[align_method][lang].append(round(accuracy, 2))

            c_method[align_method] = y_pred
            # Rank correlation with the truth; only used by the debug print
            rho, pvalue = spearmanr(true_ranking, y_pred)

            # print(lang, align_method, "acc", accuracies[align_method][lang],
            #                                 "\nranking", round(rho, 2),
            #                                 "landmarks", len(landmarks))

    # --- Report: Markdown table -------------------------------------------
    print("|Method|Language|Mean acc.|Max acc.|")
    print("|------|--------|---------|--------|")
    for method in accuracies:
        print("|", method, end="|")
        for lang in accuracies[method]:
            print(lang,
                  round(np.mean(accuracies[method][lang]), 2),
                  np.max(accuracies[method][lang]),
                  sep="|",
                  end="|\n")
    print()
Ejemplo n.º 3
0
def main():
    """
    Compare alignment strategies on a pair of arXiv category embeddings.

    The following experiments are available:
        - Find most stable words in each arXiv category (cs, math, cond-mat, physics)
        - Find most unstable (changed) words in each category
        - Find stable/unstable words across categories
        - Using different alignment strategies

    Positional command line arguments:
        cat1  path to the word vectors of the first arXiv category
        cat2  path to the word vectors of the second arXiv category

    For each strategy (global, noise-aware, self-supervised) the result of
    ``distribution_of_change`` is printed and stored as one column of a CSV
    file written to ``results/arxiv/<cat1>-<cat2>.csv``.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("cat1", type=str, help="Name of first arXiv category")
    parser.add_argument("cat2", type=str, help="Name of second arXiv category")

    args = parser.parse_args()

    cat1 = args.cat1
    cat2 = args.cat2

    # Use the last path component of each input as its display name
    cat1_name = cat1.split("/")[-1]
    cat2_name = cat2.split("/")[-1]

    path_out = "results/arxiv/"

    wva = WordVectors(input_file=cat1)
    wvb = WordVectors(input_file=cat2)
    wva, wvb = intersection(wva, wvb)
    wva, wvb, Q = align(wva, wvb)
    words = wva.words

    print("-- Common vocab", len(words))
    # each column of this matrix will store a set of results for a method
    # NOTE(review): only columns 0-2 are filled below; the "top" and "bot"
    # columns of the CSV are therefore always written as 0.000.
    out_grid = np.zeros((len(words), 5))

    d = distribution_of_change(wva, wvb)
    print("====== GLOBAL")
    print("=> landmarks", len(wva.words))
    print_table(d, wva.words)
    out_grid[:, 0] = d  # add first column

    print("====== Noise Aware")

    Q, alpha, landmarks, noisy = noise_aware(wva.vectors, wvb.vectors)
    # noise_aware only sees the raw vector matrices, so the landmarks it
    # returns are row indices; align() anchors on words, so map them back.
    landmarks = [wva.words[i] for i in landmarks]
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    print("=> landmarks", len(landmarks))
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 1] = d  # add new column

    print("===== SELF")
    landmarks, nonl, Q = s4(wva, wvb, iters=100, verbose=1)
    wva, wvb, Q = align(wva, wvb, anchor_words=landmarks)
    d = distribution_of_change(wva, wvb)
    print_table(d, wva.words)
    out_grid[:, 2] = d  # last column

    # WRITE-OUT
    # Make sure the output directory exists; open(..., "w") does not create it
    os.makedirs(path_out, exist_ok=True)
    with open(os.path.join(path_out, "%s-%s.csv" % (cat1_name, cat2_name)),
              "w") as fout:
        fout.write("word,global,noise-aware,self,top,bot\n")
        for i, w in enumerate(words):
            fout.write("%s,%.3f,%.3f,%.3f,%.3f,%.3f\n" %
                       (w, out_grid[i][0], out_grid[i][1], out_grid[i][2],
                        out_grid[i][3], out_grid[i][4]))
Ejemplo n.º 4
0
def main():
    """
    Classify UK/US word pairs as similar or dissimilar after aligning the
    UK (BNC) embeddings to the US (COCA) embeddings.

    Command line arguments:
        alignment  landmark selection method used for the alignment
                   (top-5, top-10, bot-5, bot-10, global, noise-aware, s4)
        --rounds   number of times the classification is repeated (default 1)

    Three predictors are evaluated on the dictionary pairs
    (label 0 = similar, 1 = dissimilar):
        - the self-supervised ``s4`` classifier,
        - a fixed threshold on the ``cosine`` measure (0.3, 0.5, 0.7),
        - the noisy / clean split returned by ``noise_aware``.

    Accuracy, precision, recall and F1 are printed as Markdown table rows.
    """
    parser = argparse.ArgumentParser()
    # Required positional argument: argparse ignores ``default`` for those,
    # so none is given (the former default "top" was not a valid choice).
    parser.add_argument("alignment",
                        choices=[
                            'top-5', 'top-10', 'noise-aware', 'bot-5',
                            'bot-10', 'global', 's4'
                        ],
                        help="Method to use in the alignment of UK to US")
    parser.add_argument("--rounds",
                        type=int,
                        default=1,
                        help="No. of rounds to run the classifications")

    args = parser.parse_args()

    path_us = "wordvectors/ukus/coca.vec"
    path_uk = "wordvectors/ukus/bnc.vec"
    path_dict = "data/ukus/dict_similar.txt"
    path_dict_dis = "data/ukus/dict_dissimilar.txt"

    normalized = False

    wv1 = WordVectors(input_file=path_uk, normalized=normalized)
    wv2 = WordVectors(input_file=path_us, normalized=normalized)

    # Common-vocabulary views, used for alignment and classifier training
    wv_uk, wv_us = intersection(wv1, wv2)

    # Load dictionaries of words
    # Similar pairs: "<uk word> <us word>" per line
    with open(path_dict) as fin:
        dico_sim = list(map(lambda s: s.strip().split(" ", 1),
                            fin.readlines()))

    # Dissimilar entries: one word per line, paired with itself
    with open(path_dict_dis) as fin:
        dico_dis = list(map(lambda s: (s.strip(), s.strip()), fin.readlines()))

    # Filter words not in the vocabulary of either UK or US corpora
    dico_sim = [(a, b) for a, b in dico_sim
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico_dis = [(a, b) for a, b in dico_dis
                if a in wv_uk.word_id and b in wv_us.word_id]
    dico = dico_sim + dico_dis
    # Create true labels for terms
    # 0 -> similar | 1 -> dissimilar
    y_true = [0] * len(dico_sim) + [1] * len(dico_dis)

    m = args.alignment
    # Align wordvectors (using any alignment approach)
    # NOTE(review): the align() call after this chain recomputes the
    # alignment from the final ``landmarks`` for every method, so the
    # align() calls inside the branches look redundant. They are kept in
    # case align() has side effects -- confirm before removing.
    if m == "noise-aware":
        Q, alpha, landmarks, noise = noise_aware(wv_uk.vectors, wv_us.vectors)
        landmarks = [wv_uk.words[i] for i in landmarks]
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
    elif m == "global":
        landmarks = wv_us.words
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
        # only the first half of the vocabulary is kept as landmarks
        landmarks = landmarks[:len(landmarks) // 2]
    elif m == "s4":
        landmarks = wv_us.words
        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
        landmarks, non_landmarks, Q = s4(
            wv_uk,
            wv_us,
            cls_model="nn",
            verbose=0,
            iters=100,
            n_targets=100,
            n_negatives=10,
            rate=0.25,
        )

        a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)
    elif m == "top-10":
        landmarks = wv_us.words[:int(len(wv_us.words) * 0.1)]
    elif m == "top-5":
        landmarks = wv_us.words[:int(len(wv_us.words) * 0.05)]
    elif m == "bot-10":
        landmarks = wv_us.words[-int(len(wv_us.words) * 0.1):]
    elif m == 'bot-5':
        landmarks = wv_us.words[-int(len(wv_us.words) * 0.05):]

    a_, b_, Q = align(wv_uk, wv_us, anchor_words=landmarks)

    # Apply the learned transform Q to the FULL UK vocabulary
    wv1_ = WordVectors(words=wv1.words, vectors=np.dot(wv1.vectors, Q))

    test_pairs = dico
    # Train classifier
    self_scores = list()
    cos_scores = list()
    na_scores = list()
    iters = 100

    # Interval to vary cosine thresholds
    cos_thresholds = [0.3, 0.5, 0.7]

    # Run several rounds, if given
    for r in range(args.rounds):
        model = s4(a_,
                   b_,
                   iters=iters,
                   landmarks=landmarks,
                   verbose=0,
                   n_targets=1000,
                   n_negatives=1000,
                   rate=0.25,
                   cls_model="nn",
                   update_landmarks=False)

        # Build the test features. A KeyError here is deliberately NOT
        # swallowed: every test pair was filtered against the common
        # vocabulary above, and silently skipping would leave the feature
        # arrays undefined (or stale from a previous round).
        x = np.array(
            [np.concatenate((wv1_[p[0]], wv2[p[1]])) for p in test_pairs])
        x_cos = np.array(
            [cosine(wv1_[p[0]], wv2[p[1]]) for p in test_pairs])

        # Predict with noise-aware
        # Generate pairs (u, v) and apply noise-aware
        # 0 if pair is clean, 1 if pair is noisy
        v_a = np.array([wv1_[p[0]] for p in test_pairs])
        v_b = np.array([wv2[p[1]] for p in test_pairs])
        Q, alpha, clean, noisy = noise_aware(v_a, v_b)

        y_pred_na = np.zeros((len(test_pairs)))
        for i in noisy:
            y_pred_na[i] = 1

        # Self-supervised classifier metrics
        y_hat = model.predict(x)
        y_pred = (y_hat > 0.5)

        self_acc = accuracy_score(y_true, y_pred)
        self_prec = precision_score(y_true, y_pred)
        self_rec = recall_score(y_true, y_pred)
        self_f1 = f1_score(y_true, y_pred)
        self_scores.append([self_acc, self_prec, self_rec, self_f1])

        # Cosine metrics
        # One row per (round, threshold), appended round-major: row index
        # is r * len(cos_thresholds) + j for threshold number j.
        for t in cos_thresholds:
            y_pred_cos = (x_cos > t)
            cos_acc = round(accuracy_score(y_true, y_pred_cos), 2)
            cos_prec = round(precision_score(y_true, y_pred_cos), 2)
            cos_rec = round(recall_score(y_true, y_pred_cos), 2)
            cos_f1 = round(f1_score(y_true, y_pred_cos), 2)

            cos_scores.append([cos_acc, cos_prec, cos_rec, cos_f1])

        # Noise-Aware metrics
        na_acc = round(accuracy_score(y_true, y_pred_na), 2)
        na_prec = round(precision_score(y_true, y_pred_na), 2)
        na_rec = round(recall_score(y_true, y_pred_na), 2)
        na_f1 = round(f1_score(y_true, y_pred_na), 2)
        na_scores.append([na_acc, na_prec, na_rec, na_f1])

    self_scores = np.array(self_scores)
    cos_scores = np.array(cos_scores)
    na_scores = np.array(na_scores)

    # Print Markdown Table
    n_thresholds = len(cos_thresholds)
    for j, t in enumerate(cos_thresholds):
        print("|COS %.2f" % t, m, sep="|", end="|")
        for i in range(4):
            # Average over rounds for THIS threshold only: its rows are
            # j, j + n_thresholds, j + 2 * n_thresholds, ... (the previous
            # slice [j:] also averaged in the rows of other thresholds).
            print("%.2f" % (round(cos_scores[j::n_thresholds, i].mean(), 2)),
                  end="|",
                  sep=" ")
        print("|")
    print("|")
    print("|S4-D", m, end="|", sep="|")
    for i in range(4):
        print("%.2f +- %.2f" % (round(self_scores[:, i].mean(),
                                      2), round(self_scores[:, i].std(), 2)),
              end="|",
              sep=" ")
    print("|")
    # Only the first round of the noise-aware scores is reported
    print("|Noisy-Pairs", "-", *na_scores[0], sep="|", end="|\n")