Example #1
def propTrain_split(words, labels, tagsInOrder, config):
    config.set_split('propTrain')

    num_data = len(words)
    data_ydist, _, _ = getLabelDist(labels, tagsInOrder)

    min_loss = float('inf')
    best_split = None
    for split in random_split(num_data, config.max_sampling_trials,
                              config.ratio_split):
        train, dev, test = split

        # Label distribution for each split
        train_ydist, train_nlabels, _ = getLabelDist(labels[train],
                                                     tagsInOrder)
        test_ydist, _, _ = getLabelDist(labels[test], tagsInOrder)
        dev_ydist, _, _ = getLabelDist(labels[dev], tagsInOrder)

        # Keep only splits whose train set covers all 41 labels
        if train_nlabels == 41:
            kl_loss = kl(train_ydist, data_ydist)
            if kl_loss < min_loss:
                min_loss = kl_loss
                best_split = split
                plotbar(data_ydist,
                        train_ydist,
                        dev_ydist,
                        test_ydist,
                        train_nlabels,
                        save_filename='{}.png'.format(config.split))
                print('loss: {}'.format(kl_loss))

    assert best_split is not None, 'no split covered all 41 labels'
    train, dev, test = best_split
    write_linebased(words[train], labels[train], config.filenames_split[0])
    write_linebased(words[dev], labels[dev], config.filenames_split[1])
    write_linebased(words[test], labels[test], config.filenames_split[2])
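
A minimal sketch of the selection criterion above, assuming kl is scipy.stats.entropy (as the wrapper in Example #3 suggests) and that getLabelDist returns a normalized frequency vector; the toy distributions here are made up:

import numpy as np
from scipy.stats import entropy as kl

# Hypothetical label distributions: the full dataset vs. a candidate train split
data_ydist = np.array([0.5, 0.3, 0.2])
train_ydist = np.array([0.45, 0.35, 0.2])

# KL(train || data) is 0 only when the split reproduces the full distribution;
# the loop above keeps the split that minimizes this value
print(kl(train_ydist, data_ydist))
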
Example #2
def propAll_split(words, labels, tagsInOrder, config):
    config.set_split('propAll')
    num_data = len(words)
    data_ydist, _, _ = getLabelDist(labels, tagsInOrder)

    min_loss = float('inf')
    best_split = None
    for split in random_split(num_data, config.max_sampling_trials,
                              config.ratio_split):
        train, dev, test = split

        # Label distribution for each split
        train_ydist, train_nlabels, _ = getLabelDist(labels[train],
                                                     tagsInOrder)
        dev_ydist, dev_nlabels, _ = getLabelDist(labels[dev], tagsInOrder)
        test_ydist, test_nlabels, _ = getLabelDist(labels[test], tagsInOrder)

        # Keep only splits where every subset covers all 41 labels
        if np.all(np.array([train_nlabels, test_nlabels, dev_nlabels]) == 41):
            kl1 = kl(dev_ydist, train_ydist)
            kl2 = kl(test_ydist, train_ydist)
            # Penalize both the size of the dev/test divergences and the
            # imbalance between them
            kl_loss = 0.3 * (kl1 + kl2) + 0.7 * abs(kl1 - kl2)
            if kl_loss < min_loss:
                min_loss = kl_loss
                best_split = split
                plotbar(data_ydist,
                        train_ydist,
                        dev_ydist,
                        test_ydist,
                        41,
                        save_filename='{}.png'.format(config.split))
                print("loss: {}".format(kl_loss))
                print("--- kl,(dev, test): {}, {}".format(kl1, kl2))

    assert best_split is not None, 'no split covered all 41 labels'
    train, dev, test = best_split
    write_linebased(words[train], labels[train], config.filenames_split[0])
    write_linebased(words[dev], labels[dev], config.filenames_split[1])
    write_linebased(words[test], labels[test], config.filenames_split[2])
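
The composite loss above can be read in isolation. A small sketch with made-up divergence values, showing how the weights trade total divergence against dev/test imbalance:

def split_loss(kl_dev, kl_test):
    # 0.3 weights the total divergence, 0.7 the asymmetry, so two moderately
    # divergent but balanced subsets beat one near-perfect and one far-off
    return 0.3 * (kl_dev + kl_test) + 0.7 * abs(kl_dev - kl_test)

print(split_loss(0.02, 0.02))   # 0.012: balanced pair wins
print(split_loss(0.001, 0.04))  # ~0.0396: lower total KL, but heavily penalized
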
Example #3
def kl_distance(x, y):
    """
    Wrapper for scipy.stats.entropy
    Parameters
    ----------
    x: Numpy.array
    y: Numpy.array

    Returns
    -------
    float
        Kullback–Leibler divergence
    """
    div = kl(x, y)
    assert div is not None, 'KL divergence is Null'
    return div
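
A quick usage check for the wrapper, assuming kl was imported as from scipy.stats import entropy as kl; the vectors are arbitrary:

import numpy as np
from scipy.stats import entropy as kl

p = np.array([0.6, 0.3, 0.1])
q = np.array([0.5, 0.4, 0.1])
print(kl_distance(p, q))  # ~0.023 nats; exactly 0.0 when p == q
print(kl_distance(p, p))  # 0.0

Note that scipy.stats.entropy returns inf rather than None when q has a zero where p does not, so the assert above is only a loose sanity check.
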
Example #4
    def get_kl_indexes(self, item):
        sentences = item.quote.get_lemmas()
        ntokens = item.quote._nlemmas_by_sentence
        full_text = sum(sentences, Counter())  # merge per-sentence lemma counts
        full_text_nwords = sum(full_text.values())
        # Add-one (Laplace) smoothed unigram distribution over the vocabulary
        full_text_dist = [
            (full_text[key] + 1) / (full_text_nwords + len(full_text.keys()))
            for key in full_text.keys()
        ]

        summary_idx = []
        summary_ntokens = 0
        summary_counts = Counter()

        while summary_ntokens < self.mean_ntokens:
            # Selected sentences keep +inf so argmin never re-picks them
            current_scores = np.full(len(sentences), np.inf)
            for idx, lemmas in enumerate(sentences):
                if idx not in summary_idx:
                    if summary_ntokens > 0:
                        # Score the candidate summary: current summary + sentence
                        lemmas = summary_counts + lemmas
                    # Same add-one smoothing, over the full-text vocabulary
                    dist = [(lemmas[key] + 1) /
                            (sum(lemmas.values()) + len(full_text.keys()))
                            for key in full_text.keys()]
                    current_scores[idx] = kl(full_text_dist, dist)

            # Lowest KL = candidate summary closest to the full-text distribution
            idx_best = np.argmin(current_scores)
            idx_ntokens = ntokens[idx_best]
            # Stop early if adding this sentence would move the summary length
            # further from the target than it already is
            if summary_ntokens > 0 and (
                    abs(self.mean_ntokens - summary_ntokens - idx_ntokens)
                    > abs(self.mean_ntokens - summary_ntokens)):
                return summary_idx
            summary_idx.append(idx_best)
            summary_counts += sentences[idx_best]
            summary_ntokens += idx_ntokens

        if summary_ntokens == 0:
            print(
                f"No summary for {item.uid} | {int(item.quote.ntokens / item.quote.nsents)}\t| {item.quote.cleaned_text}"
            )

        return summary_idx
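
The scoring step above boils down to comparing add-one smoothed unigram distributions with KL. A self-contained sketch using a hypothetical two-sentence document (mirroring the Counter-per-sentence structure that item.quote.get_lemmas() is assumed to return):

from collections import Counter
from scipy.stats import entropy as kl

sentences = [Counter({'cat': 2, 'sat': 1}), Counter({'dog': 1, 'ran': 1})]
full_text = sum(sentences, Counter())  # merged lemma counts
vocab = list(full_text)

def smoothed(counts):
    # Add-one (Laplace) smoothing over the full-text vocabulary
    total = sum(counts.values()) + len(vocab)
    return [(counts[w] + 1) / total for w in vocab]

full_dist = smoothed(full_text)
for i, sent in enumerate(sentences):
    # Lower KL = this sentence's word distribution is closer to the whole text
    print(i, kl(full_dist, smoothed(sent)))
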
Example #5
def kl_divergence(x, y):
    # scipy.stats.entropy treats each column of a 2-D input as a distribution,
    # so transpose to score each row, then average the per-row divergences
    return kl(x.T, y.T).mean()
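
A small demonstration of why the transpose matters here, assuming kl is scipy.stats.entropy, which treats each column of a 2-D input as a distribution (its default axis is 0); the arrays are made up:

import numpy as np
from scipy.stats import entropy as kl

x = np.array([[0.5, 0.5],
              [0.9, 0.1]])  # two row distributions
y = np.array([[0.4, 0.6],
              [0.8, 0.2]])

per_row = kl(x.T, y.T)          # transposing scores each row: shape (2,)
print(per_row, per_row.mean())  # kl_divergence(x, y) returns the mean
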
Example #6
def generate_balance_sets(config):
    if not os.path.exists(config.filename_raw):
        generate_semcor_data(config.filename_raw)

    raw = SemcorDataset(config.filename_raw)

    # One tag per 'B'-prefixed entry in the tag vocabulary, in insertion order
    tagsInOrder = [k for k in config.vocab_tags if k.startswith('B')]

    _words, _labels = [], []
    for word, label, _ in raw:
        _words.append(word)
        _labels.append(label)

    min_loss = float('inf')
    best_seed = None
    for i in range(500):
        # Generate split randomly
        seed_shuffle = random.randint(0, 2**32 - 1)
        words, labels = shuffle(_words, _labels, random_state=seed_shuffle)

        seed1 = random.randint(0, 2**32 - 1)
        X, Xtest, y, ytest = train_test_split(words,
                                              labels,
                                              test_size=config.ratio_split[2],
                                              random_state=seed1)
        # Dev fraction relative to the remaining train + dev pool
        dev_size = config.ratio_split[1] / np.sum(config.ratio_split[:2])

        seed2 = random.randint(0, 2**32 - 1)
        Xtrain, Xdev, ytrain, ydev = train_test_split(X,
                                                      y,
                                                      test_size=dev_size,
                                                      random_state=seed2)

        # Calculate KL divergence of each subset with respect to the original distribution
        ydist_train, nlabels_train, freq_train = getTagDistFromList(
            ytrain, tagsInOrder)
        ydist_test, nlabels_test, freq_test = getTagDistFromList(
            ytest, tagsInOrder)
        ydist_dev, nlabels_dev, freq_dev = getTagDistFromList(
            ydev, tagsInOrder)

        # Keep only splits where every subset covers all 41 labels
        if np.all(np.array([nlabels_train, nlabels_test, nlabels_dev]) == 41):
            kl1 = kl(ydist_test, ydist_train)
            kl2 = kl(ydist_dev, ydist_train)
            # Penalize both the size and the imbalance of the two divergences
            kl_loss = 0.3 * (kl1 + kl2) + 0.7 * abs(kl1 - kl2)

            split_seeds = (seed_shuffle, seed1, seed2)
            if kl_loss < min_loss:
                min_loss = kl_loss
                best_seed = split_seeds

                # print("New best kl loss: {}".format(kl_loss))
                # print("--- seed: {}".format(best_seed))
                # print("--- kls: {}, {}, {}".format(kl1, kl2, kl3))

                print("New best kl loss: {}, {}".format(i, kl_loss))
                print("--- seed: {}".format(best_seed))
                print("--- kl,(test, dev): {}, {}".format(kl1, kl2))

                plotbar(ydist_train, ydist_dev, ydist_test, 41)

    # Rebuild the best split from its seeds, plot its distribution, and write it out
    assert best_seed is not None, 'no split covered all 41 labels'
    words, labels = shuffle(_words, _labels, random_state=best_seed[0])
    X, Xtest, y, ytest = train_test_split(words,
                                          labels,
                                          test_size=config.ratio_split[2],
                                          random_state=best_seed[1])
    dev_size = config.ratio_split[1] / np.sum(config.ratio_split[:2])
    Xtrain, Xdev, ytrain, ydev = train_test_split(X,
                                                  y,
                                                  test_size=dev_size,
                                                  random_state=best_seed[2])

    ydist_train, _, _ = getTagDistFromList(ytrain, tagsInOrder)
    ydist_test, _, _ = getTagDistFromList(ytest, tagsInOrder)
    ydist_dev, _, _ = getTagDistFromList(ydev, tagsInOrder)

    # Bar plot of the final label distributions
    plotbar(ydist_train, ydist_dev, ydist_test, 41, save=True)

    write_linebased(Xtrain, ytrain, config.filenames_split[0])
    write_linebased(Xdev, ydev, config.filenames_split[1])
    write_linebased(Xtest, ytest, config.filenames_split[2])
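
The two-stage split arithmetic above is worth checking in isolation. A sketch with a hypothetical (train, dev, test) ratio of (0.8, 0.1, 0.1) standing in for config.ratio_split:

import numpy as np

ratio_split = (0.8, 0.1, 0.1)

# Stage 1 splits the test fraction off the whole dataset
test_size = ratio_split[2]                           # 0.1

# Stage 2 splits dev out of the remaining train + dev pool, so the dev
# fraction must be rescaled relative to that pool
dev_size = ratio_split[1] / np.sum(ratio_split[:2])  # 0.1 / 0.9 ~ 0.111

print(test_size, dev_size)  # 1000 sentences -> 100 test, 100 dev, 800 train
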