def svm_robust_score(args, data, labels):
    """Score noise-corrupted sentence pairs with a linear SVM.

    Both sentences of every pair are corrupted via ``noise_generator``,
    embedded in a single ``sample_multi`` batch, and the pairwise cosine
    similarity is used as the single feature for ``linear_svm``.  The
    resulting ROC-AUC is appended to a per-model results file.

    NOTE(review): the results filename here is "results_entail_<model>"
    while get_robust_score below writes "results_entail<model>" (no
    underscore) — confirm which spelling downstream tooling expects.
    """
    split_at = int(0.2 * len(data))
    noisy = []
    for _, row in data.iterrows():
        noisy.append(noise_generator(row["sentence1"], args.noise_level, chars))
        noisy.append(noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    vectors = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, noisy, args.model_type),
                  len(noisy)))
    similarities = []
    for even in range(0, len(vectors), 2):
        first, second = vectors[even], vectors[even + 1]
        # An all-zero embedding means the model produced nothing; log its index.
        if not first.any() or not second.any():
            print(even)
        sim = 1 - cosine(first, second)
        # Undefined similarity (zero vector) collapses to a neutral 0.5.
        similarities.append(0.5 if math.isnan(sim) else sim)
    frame = pd.DataFrame(similarities)
    train, test = frame.iloc[split_at:], frame.iloc[:split_at]
    roc_auc = linear_svm(train, test, labels[split_at:], labels[:split_at])

    with open("results_entail_" + args.model_type + ".txt", "at") as f_out:
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
def get_robust_score(args, pairs, true):
    """Evaluate cosine-similarity ROC-AUC of noise-corrupted pairs.

    Only runs when "robust" appears in ``args.mode``; otherwise a no-op.
    Each sentence is corrupted, embedded, and the pairwise cosine
    similarity scored against ``true`` labels, appending the ROC-AUC to
    a per-model results file.
    """
    if "robust" not in args.mode:
        return
    corrupted = []
    for _, row in pairs.iterrows():
        corrupted.append(
            noise_generator(row["sentence1"], args.noise_level, chars))
        corrupted.append(
            noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    embedded = np.vsplit(
        sample_multi(args.save_dir, corrupted, args.model_type),
        len(corrupted))
    pred = []
    for pos in range(0, len(embedded), 2):
        left, right = embedded[pos], embedded[pos + 1]
        # Report indices whose embedding came back all zeros.
        if not left.any() or not right.any():
            print(pos)
        score = 1 - cosine(left, right)
        # Zero vectors make cosine NaN; fall back to a neutral score.
        pred.append(0.5 if math.isnan(score) else score)
    with open("results_entail" + args.model_type + ".txt", "at") as f_out:
        f_out.write(args.mode + ",%.2f,%.3f\n" %
                    (args.noise_level, roc_auc_score(true, pred)))
def samping_sentiment_data(args, data, labels):
    """Embed sentiment sentences and score a linear SVM on them.

    The first 20% of rows become the test split, the rest training; the
    SVM's score is appended to "<model_type>_results_sentiment.txt".
    """
    idx_for_split = int(0.2 * len(data))
    # NOTE(review): relies on a module-level import of sample_multi, unlike
    # sibling functions which import it locally — verify it is in scope.
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, data, args.model_type),
                  len(data)))
    train = results[idx_for_split:]
    test = results[:idx_for_split]
    train_label = labels[idx_for_split:]
    test_label = labels[:idx_for_split]
    # Renamed local from `roc_auc_score`: the old name shadowed sklearn's
    # roc_auc_score function, which other functions in this file call.
    roc_auc = linear_svm(train, test, train_label, test_label)
    with open(args.model_type + "_results_sentiment.txt", "at") as f_out:
        f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, roc_auc))
def svm_robust_score(args, data, labels):
    """Score noisy Quora question pairs by cosine-similarity ROC-AUC.

    NOTE(review): this redefines `svm_robust_score` from earlier in the
    file (the SVM/entailment variant); at import time this later
    definition wins.  Consider renaming one of the two.

    Removed here: ~40 lines of commented-out feature-engineering and
    CatBoost experiments that obscured the live logic.
    """
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                  len(phrases)))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        # Flag all-zero embeddings (model produced nothing for the phrase).
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5  # undefined similarity -> neutral score
    roc_auc = roc_auc_score(labels, pred)
    with open("results_quora" + args.model_type + ".txt", "at") as f_out:
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
# Example #5
# 0
                if lemma in w2v:
                    vector = w2v[lemma]
                vectors.append(vector)
        return np.mean(vectors, axis=0)
    for pair in tqdm(pairs):
        v1 = get_mean_vec(noise_generator(pair["text_1"]))
        v2 = get_mean_vec(noise_generator(pair["text_2"]))
        pred.append(1 - cosine(v1, v2))
    with open("results.txt", "at") as f_out:
        f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
    # print "ROC\t\t=\t%.2f" % roc_auc_score(true, pred)

if "robust" in args.mode:
    pred = []
    phrases = []
    # Corrupt both sides of every pair so all texts embed in a single batch.
    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"]))
        phrases.append(noise_generator(pair["text_2"]))
    from sample import sample_multi
    # One row per phrase; consecutive rows (2i, 2i+1) belong to the same pair.
    results = np.vsplit(sample_multi(args.save_dir, phrases), len(phrases))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        # Cosine similarity of the corrupted pair is the prediction score.
        pred.append(1 - cosine(v1, v2))
    with open("results.txt", "at") as f_out:
        f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
    # print "ROC\t\t=\t%.2f" % roc_auc_score(true, pred)

# print "Class ratio\t=\t%.2f" % (float(len(filter(None, true)))/len(true))
# print "F1\t=\t%.2f" % f1_score(true, pred)
                    type=str,
                    default="save")
parser.add_argument("-t",
                    "--model_type",
                    help="type of model used to train",
                    type=str,
                    default="biSRU")
# argparse exposes --input-dir as args.input_dir.
parser.add_argument("-i", "--input-dir", help="dir to go through")

args = parser.parse_args()

with open(join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
    # Vocabulary written at training time; only the char list is used here.
    chars, _ = cPickle.load(f)

if "robust" in args.mode:
    filenames = []
    phrases = []

    # Each *.txt file in the input dir becomes one phrase (lines space-joined).
    for filename in glob(join(args.input_dir, "*.txt")):
        filenames.append(filename)
        with open(filename, "rt") as f:
            lines = [line.strip() for line in f.readlines()]
            phrases.append(" ".join(lines))

    from sample import sample_multi
    # One embedding row per phrase; vsplit yields one array per input file.
    results = np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                        len(phrases))

    # Persist each embedding next to its source file with a ".rove" suffix.
    for i in range(len(results)):
        np.savetxt(filenames[i] + ".rove", results[i])
# Example #7
# 0
def main(data_dir, template, output, save_dir):
    """Evaluate cluster quality of message embeddings and render a 2-D view.

    Reads a cluster map ('SBEADS.resC') and a message dump ('docs.out')
    from *data_dir*, embeds every message title with ``sample_multi``
    using the model in *save_dir*, prints ROC-AUC / MSE / silhouette
    scores for the natural clustering, and writes a PCA projection into
    *output* rendered through *template*.

    Fixes applied in review: the bare ``except:`` around JSON parsing is
    narrowed to ``except Exception`` (it used to swallow KeyboardInterrupt
    and SystemExit), the builtin-shadowing local ``id`` is renamed, and
    ``np.random.random_integers`` (deprecated and removed from NumPy) is
    replaced by the equivalent ``np.random.randint`` call.
    """
    clusters = {}
    clustered = {}
    print('Load clustermap')
    with open(os.path.join(data_dir, 'SBEADS.resC'), 'r') as fin:
        for line in tqdm(fin):
            # Line format: "<cluster_id>\t<id> <id> ...".
            tab = line.index('\t')
            cluster_id = int(line[:tab])
            ids = [int(t) for t in line[tab + 1:].split()]
            for i in ids:
                clustered[i] = cluster_id
            clusters[cluster_id] = ids

    print('Load messages')
    data = []
    with open(os.path.join(data_dir, 'docs.out'), 'r') as fin:
        for line in tqdm(fin):
            # take care about escape symbols
            filtered = line.replace(r"\'", "'").replace('\\', '/')
            try:
                entry = json.loads(filtered)
                # Renamed from `id` to avoid shadowing the builtin.
                msg_id = int(entry.get('id'))
                title = entry.get('title')
                data.append((msg_id, title))
            except Exception:
                # Malformed lines are reported and skipped (best-effort load).
                print(filtered)

    word2vectors = sample_multi(save_dir, [t[1] for t in data])
    vectors = np.zeros((len(data), word2vectors[0][0].shape[2]))

    # Mean-pool each message's per-word vectors into one row.
    for i in range(len(word2vectors)):
        vectors[i, :] = np.mean(np.squeeze(np.array(word2vectors[i])), axis=0)
    X = vectors

    print('Compute natural w2v clusterization quality.')

    # ATTENTION: very dirty code, just let it works
    n = 100000
    clust_numbers = list(clusters.keys())
    selected = np.random.choice(clust_numbers, n)

    res = []
    # not sure if they are continuous:
    indexes = {t[0]: i for (i, t) in enumerate(data)}
    misscounter = 0
    for i in tqdm(selected):
        tmp = []
        for j in clusters[i]:
            if j in indexes:
                tmp.append(indexes[j])
            else:
                misscounter += 1

        if len(tmp) < 2:
            # bad, let it go
            continue
        # Positive example: two messages from the same cluster.
        pair = np.random.choice(tmp, 2)
        one = X[pair[0], :]
        two = X[pair[1], :]
        sim = 1.0 - spatial.distance.cosine(one, two)
        if np.isnan(sim) or np.isinf(sim):
            sim = 0.0
        res.append((1.0, sim))

        # (Likely) negative example: one cluster member vs. a random message.
        one = np.random.choice(tmp, 1)
        # random_integers(0, len(X)-1, 1) was inclusive of the high bound;
        # randint's exclusive high of len(X) covers the same range.
        two = np.random.randint(0, len(X), 1)
        gnd = 1.0 * (two in tmp)
        one = X[one, :]
        two = X[two, :]
        sim = 1.0 - spatial.distance.cosine(one, two)
        if np.isnan(sim) or np.isinf(sim):
            sim = 0.0

        res.append((gnd, sim))

    res = np.array(res)
    print("Classes ratio:\t%.3f" % (sum(res[:, 0]) / len(res)))
    print("MSE:\t\t%.3f" % mean_squared_error(res[:, 0], res[:, 1]))
    print("AUC:\t\t%.3f" % roc_auc_score(res[:, 0], res[:, 1]))
    # last result was
    # Classes ratio:	0.500
    # MSE:		0.106
    # AUC:		0.968
    # End of ATTENTION

    # Unclustered messages get label -1.
    labels = np.array([clustered.get(t[0], -1) for t in data])
    score = silhouette_score(X, labels, sample_size=1000)
    # it gives me about 0.77
    print('Natural w2v silhouette_score is {}'.format(score))

    idx = (labels > -1)

    score = silhouette_score(X[idx], labels[idx], sample_size=1000)
    # it gives me about 0.87
    print('Without unclustered samples score is {}'.format(score))

    # Preparation for visualization. Unfinished yet
    print('Compute 2d projection')
    pca = PCA(n_components=2)
    X2 = pca.fit_transform(X)
    print('Save the data')

    repack = []
    with open('data.csv', 'w') as fout:
        for i, x in zip([t[0] for t in data], X2):
            q = (x[0], x[1], i, clustered.get(i, -1))
            fout.write('{:.2f},{:.2f},{},{}\n'.format(*q))
            repack.append(q)
    repack = json.dumps(repack)

    # Experiment with coarsed coordinates:
    d1 = len(set(['{:.1f},{:.1f}'.format(x[0], x[1]) for x in X2]))
    d2 = len(set(['{:.2f},{:.2f}'.format(x[0], x[1]) for x in X2]))
    d3 = len(set(['{:.3f},{:.3f}'.format(x[0], x[1]) for x in X2]))
    print('We can coarse the data: ')
    print(d1)
    print(d2)
    print(d3)

    with open(template, 'r') as fin:
        page = Template(fin.read()).render(data=repack)
        with open(output, 'w') as fout:
            fout.write(page)

    # NOTE(review): the second tuple element here is the message *title*,
    # not a cluster label — confirm 'labels.csv' consumers expect that.
    with open('labels.csv', 'w') as fout:
        for i, label in data:
            fout.write('{}\t{}\n'.format(i, label))

    print('Done')
# Example #8
# 0
# king_queen = 1 - cosine(king, queen)
#
# man_king = 1 - cosine(man, king)
# woman_queen = 1 - cosine(woman, queen)
#
# print("Distances")
# print("woman_man:{} \n king_queen:{} \n".format(woman_man, king_queen))
# print("man_king:{} \n woman_queen:{} \n".format(man_king, woman_queen))
#
# print("Queen similarity")
# print(1 - cosine(king - man + woman, queen))

# Smoke test: embed four SST-style sentences and compare within-class vs.
# cross-class cosine similarities.
# NOTE(review): the pos*/neg* names look swapped relative to the sentences'
# actual sentiment — confirm against the original dataset labels.
pos1 = "You have no affinity for most of the characters ."
pos2 = "The characters , cast in impossibly contrived situations , are totally estranged from reality ."

neg1 = "Everybody loves a David and Goliath story , and this one is told almost entirely from David 's point of view ."
neg2 = "Those who want to be jolted out of their gourd should drop everything and run to Ichi."
positive = [pos1, pos2]
negative = [neg1, neg2]
vec = positive + negative
print(len(vec))
# Embed all four sentences in one batch with the biLSTM model.
results = sample_multi(DEFAULT_MODEL, vec, "biLSTM")
# Similarities within the same list and across the two lists.
pos_pos = 1 - cosine(results[0], results[1])
neg_neg = 1 - cosine(results[2], results[3])
pos_neg = 1 - cosine(results[1], results[3])
neg_pos = 1 - cosine(results[2], results[0])

print("pos_pos {}".format(pos_pos))
print("neg {}".format(neg_neg))
print("neg_pos {}".format(neg_pos))
print("pos_neg {}".format(pos_neg))