Example #1
def test_different_signature_dims_flow():
    first_signature = np.array([0.0, 1.0])
    second_signature = np.array([5.0, 3.0, 3.0])
    distance_matrix = np.array([[0.0, 0.5, 0.0], [0.5, 0.0, 0.0],
                                [0.5, 0.0, 0.0]])
    with pytest.raises(ValueError):
        emd_with_flow(first_signature, second_signature, distance_matrix)
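
All of these examples exercise pyemd's emd_with_flow, which takes two equal-length 1-D float64 histograms and a square ground-distance matrix and returns an (emd, flow) pair, where flow is a plain list of lists rather than an ndarray. A minimal runnable sketch mirroring the values of Example #3 below:

import numpy as np
from pyemd import emd_with_flow

first = np.array([0.0, 1.0])
second = np.array([5.0, 3.0])
distance = np.array([[0.0, 0.5], [0.5, 0.0]])

emd_value, flow = emd_with_flow(first, second, distance)
print(emd_value)  # 3.5: 7 units of unmatched mass charged at max(distance) = 0.5 each
print(flow)       # [[0.0, 0.0], [0.0, 1.0]]
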
Example #2
def test_emd_with_flow_validate_square_distance_matrix():
    first_signature = np.array([0.0, 1.0])
    second_signature = np.array([5.0, 3.0])
    distance_matrix = np.array([[0.0, 0.5, 3.0],
                                [0.5, 0.0]])
    with pytest.raises(ValueError):
        emd_with_flow(first_signature, second_signature, distance_matrix)
Example #3
def test_emd_with_flow_1():
    first_signature = np.array([0.0, 1.0])
    second_signature = np.array([5.0, 3.0])
    distance_matrix = np.array([[0.0, 0.5], [0.5, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix),
        (3.5, [[0.0, 0.0], [0.0, 1.0]]))
Example #4
def test_case_2_flow():
    first_signature = np.array([1.0, 1.0])
    second_signature = np.array([1.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0], [1.0, 0.0]])

    assert (emd_with_flow(first_signature, second_signature,
                          distance_matrix) == (0.0, [[1.0, 0.0], [0.0, 1.0]]))
Example #5
def word_mover_score(refs,
                     hyps,
                     idf_dict_ref,
                     idf_dict_hyp,
                     stop_words=[],
                     n_gram=1,
                     remove_subwords=True,
                     batch_size=256):
    preds = []
    for batch_start in range(0, len(refs), batch_size):
        batch_refs = refs[batch_start:batch_start + batch_size]
        batch_hyps = hyps[batch_start:batch_start + batch_size]

        ref_embedding, ref_lens, ref_masks, ref_idf, ref_tokens = get_bert_embedding(
            batch_refs, model, tokenizer, idf_dict_ref)
        hyp_embedding, hyp_lens, hyp_masks, hyp_idf, hyp_tokens = get_bert_embedding(
            batch_hyps, model, tokenizer, idf_dict_hyp)

        ref_embedding = ref_embedding[-1]
        hyp_embedding = hyp_embedding[-1]

        n_batch = len(ref_tokens)
        for i in range(n_batch):
            ref_ids = [
                k for k, w in enumerate(ref_tokens[i])
                if w in stop_words or '##' in w or w in set(string.punctuation)
            ]
            hyp_ids = [
                k for k, w in enumerate(hyp_tokens[i])
                if w in stop_words or '##' in w or w in set(string.punctuation)
            ]

            ref_embedding[i, ref_ids, :] = 0
            hyp_embedding[i, hyp_ids, :] = 0

            ref_idf[i, ref_ids] = 0
            hyp_idf[i, hyp_ids] = 0

        raw = torch.cat([ref_embedding, hyp_embedding], 1)

        raw.div_(torch.norm(raw, dim=-1).unsqueeze(-1) + 1e-30)

        distance_matrix = batched_cdist_l2(raw, raw).double().cpu().numpy()

        for i in range(n_batch):
            c1 = np.zeros(raw.shape[1], dtype=np.float64)
            c2 = np.zeros(raw.shape[1], dtype=np.float64)
            c1[:len(ref_idf[i])] = ref_idf[i]
            c2[len(ref_idf[i]):] = hyp_idf[i]

            c1 = _safe_divide(c1, np.sum(c1))
            c2 = _safe_divide(c2, np.sum(c2))

            dst = distance_matrix[i]
            _, flow = emd_with_flow(c1, c2, dst)
            flow = np.array(flow, dtype=np.float32)
            score = 1 - np.sum(flow * dst)
            preds.append(score)

    return preds
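
The helpers above (get_bert_embedding, batched_cdist_l2, _safe_divide, plus the global model and tokenizer) belong to the surrounding MoverScore project and are not shown. Stripped of the BERT plumbing, the per-sentence scoring step reduces to the following self-contained sketch; the function name and arguments here are illustrative, not part of the original API:

import numpy as np
from pyemd import emd_with_flow

def wmd_score(ref_idf, hyp_idf, pairwise_dist):
    # ref_idf, hyp_idf: 1-D numpy weight vectors; pairwise_dist: float64
    # matrix over the concatenated (ref + hyp) token axis. Mass can only
    # flow from reference slots to hypothesis slots, because each
    # histogram is zero on the other side's slots.
    n = len(ref_idf)
    c1 = np.zeros(pairwise_dist.shape[0])
    c2 = np.zeros(pairwise_dist.shape[0])
    c1[:n] = ref_idf / np.sum(ref_idf)
    c2[n:] = hyp_idf / np.sum(hyp_idf)
    _, flow = emd_with_flow(c1, c2, pairwise_dist)
    return 1.0 - np.sum(np.array(flow) * pairwise_dist)
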
Example #6
def test_emd_with_flow_5():
    first_signature = np.array([3.0, 5.0])
    second_signature = np.array([6.0, 2.0])
    distance_matrix = np.array([[0.0, 0.0], [0.0, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix),
        (0.0, [[3.0, 0.0], [3.0, 2.0]]))
Example #7
        def emd_rep_loss(student_reps, teacher_reps, student_layer_weight,
                         teacher_layer_weight, stu_layer_num, tea_layer_num,
                         loss_func, device):
            student_layer_weight = np.concatenate(
                (student_layer_weight, np.zeros(tea_layer_num)))
            teacher_layer_weight = np.concatenate(
                (np.zeros(stu_layer_num), teacher_layer_weight))
            total_num = stu_layer_num + tea_layer_num
            distance_matrix = torch.zeros([total_num, total_num]).to(device)

            for i in range(stu_layer_num):
                student_rep = student_reps[i]
                for j in range(tea_layer_num):
                    teacher_rep = teacher_reps[j]
                    tmp_loss = loss_func(student_rep, teacher_rep)
                    distance_matrix[i][j + stu_layer_num] = distance_matrix[
                        j + stu_layer_num][i] = tmp_loss

            _, trans_matrix = emd_with_flow(
                student_layer_weight, teacher_layer_weight,
                distance_matrix.detach().cpu().numpy().astype('float64'))

            rep_loss = torch.sum(
                torch.tensor(trans_matrix).to(device) * distance_matrix)

            return rep_loss, trans_matrix, distance_matrix
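
The zero-padding of the two layer-weight vectors is what turns the rectangular student-to-teacher assignment into a square EMD problem: the student histogram carries mass only on the first stu_layer_num slots and the teacher histogram only on the remaining tea_layer_num slots, so every unit of flow has to cross the off-diagonal student/teacher block of distance_matrix. A minimal sketch with assumed uniform weights and random stand-in layer distances:

import numpy as np
from pyemd import emd_with_flow

S, T = 4, 12                                  # hypothetical layer counts
c1 = np.concatenate([np.ones(S) / S, np.zeros(T)])
c2 = np.concatenate([np.zeros(S), np.ones(T) / T])

rng = np.random.default_rng(0)
block = rng.random((S, T))                    # stand-in layer-pair distances
cost = np.zeros((S + T, S + T))
cost[:S, S:] = block                          # same symmetric fill as above
cost[S:, :S] = block.T

_, flow = emd_with_flow(c1, c2, cost)
mapping = np.array(flow)[:S, S:]              # student-to-teacher transport plan
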
Example #8
 def attn_loss(self, student_atts, teacher_atts, return_distance=False):
     if len(teacher_atts[0].shape) == 4 and len(student_atts[0].shape) == 3:
         teacher_atts = [torch.mean(x, dim=1) for x in teacher_atts]
     elif len(student_atts[0].shape) == 4 and len(
             teacher_atts[0].shape) == 4:
         if student_atts[0].shape[1] != teacher_atts[0].shape[1]:
             teacher_atts = [torch.mean(x, dim=1) for x in teacher_atts]
             student_atts = [torch.mean(x, dim=1) for x in student_atts]
     elif len(student_atts[0].shape) == 4 and len(
             teacher_atts[0].shape) == 3:
         student_atts = [torch.mean(x, dim=1) for x in student_atts]
     _s_weight = np.concatenate(
         (self.s_weight, np.zeros_like(self.t_weight)))
     _t_weight = np.concatenate(
         (np.zeros_like(self.s_weight), self.t_weight))
     total_num = self.s_layer_num + self.t_layer_num
     distance_matrix = torch.zeros([total_num, total_num]).to(self.device)
     for i in range(self.s_layer_num):
         student_att = student_atts[i]
         for j in range(self.t_layer_num):
             teacher_att = teacher_atts[j]
             tmp_loss = self.d_method(student_att, teacher_att)
             distance_matrix[i][j + self.s_layer_num] = distance_matrix[
                 j + self.s_layer_num][i] = tmp_loss
     _, trans_matrix = emd_with_flow(
         _s_weight, _t_weight,
         distance_matrix.detach().cpu().numpy().astype('float64'))
     d = torch.sum(
         torch.tensor(trans_matrix).to(self.device) * distance_matrix)
     if return_distance:
         return d, trans_matrix, distance_matrix
     else:
         return d
Example #9
def get_opinion_distance(model, noun_freq_polar1_model, noun_freq_polar1_terms,
                         noun_freq_polar2_model, noun_freq_polar2_terms,
                         filename1, filename2):
    dictionary = Dictionary(
        documents=[noun_freq_polar1_terms, noun_freq_polar2_terms])

    # Compute the euclidean distance between word vectors
    semantic_distance_matrix = compute_semantic_distance_matrix(
        model, noun_freq_polar1_terms, noun_freq_polar2_terms, dictionary,
        filename1, filename2)
    if semantic_distance_matrix is None:
        print("Semantic distance is none")
        return np.nan, np.nan

    # Get normalized frequency and it's polarity
    normalized_freq1, pol1 = get_norm_freq_polarity(noun_freq_polar1_model,
                                                    dictionary)
    normalized_freq2, pol2 = get_norm_freq_polarity(noun_freq_polar2_model,
                                                    dictionary)

    # Change output to d1_terms, d2_terms, d_matrix[len(d1_terms),len(d2_terms)]
    emd_distance, matching_matrix = emd_with_flow(normalized_freq1,
                                                  normalized_freq2,
                                                  semantic_distance_matrix)
    polarity_distance = compute_polarity_distance(normalized_freq1,
                                                  normalized_freq2, dictionary,
                                                  matching_matrix,
                                                  semantic_distance_matrix,
                                                  pol1, pol2)

    return polarity_distance, emd_distance
Example #10
    def get_one_grad(self, id1, id2):
        vn = len(self.all_words)

        vec1 = np.zeros(vn)
        for k, w in enumerate(self.tr_words[id1][0]):
            vec1[self.word_id_map[w]] = self.tr_BOW_X[id1][0][k]

        vec2 = np.zeros(vn)
        for k, w in enumerate(self.tr_words[id2][0]):
            vec2[self.word_id_map[w]] = self.tr_BOW_X[id2][0][k]

        part_grad_A = np.zeros_like(self.A)

        T = emd_with_flow(np.array(vec1), np.array(vec2), self.cw_dist_mat)
        # Accumulate the metric-learning gradient over every positive flow.
        for i1, t1 in enumerate(T[1]):
            for i2, t2 in enumerate(t1):
                if t2 > 0.:
                    x = np.array(self.all_X[i1]) - np.array(self.all_X[i2])
                    const = (t2 / (2 * np.sqrt(np.sum(self.A.dot(x)**2)))
                             if np.sum(x**2) != 0 else 0)
                    for i in range(self.vector_dim):
                        Ax = self.A[i, :].dot(x)
                        for j in range(self.vector_dim):
                            part_grad_A[i, j] += const * 2 * Ax * x[j]
        return part_grad_A
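
The inner double loop is the chain rule written out elementwise: for the learned metric d(x) = ||A x||, the derivative with respect to A[i, j] is (A x)[i] * x[j] / ||A x||, so each positive flow t2 contributes t2 * (A x)[i] * x[j] / ||A x|| to the gradient (const folds in t2 / (2 * ||A x||) and the loop supplies the remaining 2 * Ax * x[j]). The two innermost loops are equivalent to part_grad_A += const * 2 * np.outer(self.A.dot(x), x).
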
Example #11
def word_mover_score(mapping, projection, bias, model, tokenizer, src, hyps, \
                     n_gram=2, layer=8, dropout_rate=0.3, batch_size=256, device='cuda:0'):
    idf_dict_src = defaultdict(lambda: 1.)
    idf_dict_hyp = defaultdict(lambda: 1.)

    preds = []
    for batch_start in range(0, len(src), batch_size):
        batch_src = src[batch_start:batch_start + batch_size]
        batch_hyps = hyps[batch_start:batch_start + batch_size]

        src_embedding, src_lens, src_masks, src_idf, src_tokens = get_bert_embedding(
            batch_src, model, tokenizer, idf_dict_src, device=device)
        hyp_embedding, hyp_lens, hyp_masks, hyp_idf, hyp_tokens = get_bert_embedding(
            batch_hyps, model, tokenizer, idf_dict_hyp, device=device)

        src_embedding = src_embedding[layer]
        hyp_embedding = hyp_embedding[layer]

        src_embedding = cross_lingual_mapping(mapping, src_embedding,
                                              projection, bias[0])

        n_batch = src_embedding.shape[0]

        for i in range(n_batch):
            src_embedding_i = get_ngram_embs(src_embedding[i, :src_lens[i], :],
                                             ngram=n_gram)
            hyp_embedding_i = get_ngram_embs(hyp_embedding[i, :hyp_lens[i], :],
                                             ngram=n_gram)
            src_idf_i = [1] * (src_lens[i] - n_gram + 1)
            hyp_idf_i = [1] * (hyp_lens[i] - n_gram + 1)

            W = torch.cat([src_embedding_i, hyp_embedding_i], 0)
            W.div_(torch.norm(W, dim=-1).unsqueeze(-1))

            c1 = list(src_idf_i) + [0] * len(hyp_idf_i)
            c2 = [0] * len(src_idf_i) + list(hyp_idf_i)

            c1 = c1 / np.sum(c1) + 1e-9
            c2 = c2 / np.sum(c2) + 1e-9

            dist = torch.cdist(W, W, p=2).double().cpu().numpy()
            flow = np.stack(emd_with_flow(c1, c2, dist)[1])

            flow = torch.from_numpy(flow[:len(src_idf_i), len(src_idf_i):])
            dist = torch.from_numpy(dist[:len(src_idf_i), len(src_idf_i):])

            # remove noisy elements in a flow
            flow_flatten = flow.reshape(-1)
            idx = torch.nonzero(flow_flatten)
            threshold = flow_flatten[idx].topk(k=max(
                int(len(idx) * dropout_rate), 1),
                                               dim=0,
                                               largest=False)[0][-1]
            flow[flow < threshold] = 0

            score = (flow * dist).sum()
            preds.append(1 - score)

    return preds
Example #12
def test_emd_with_flow_6():
    first_signature = np.array([1.0, 2.0, 1.0, 2.0])
    second_signature = np.array([2.0, 1.0, 2.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0, 1.0, 2.0], [1.0, 0.0, 2.0, 1.0],
                                [1.0, 2.0, 0.0, 1.0], [2.0, 1.0, 1.0, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix),
        (2.0, [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 1.0]]))
Example #13
def test_emd_with_flow_1():
    first_signature = np.array([0.0, 1.0])
    second_signature = np.array([5.0, 3.0])
    distance_matrix = np.array([[0.0, 0.5],
                                [0.5, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix),
        (3.5, [[0.0, 0.0],
               [0.0, 1.0]])
    )
Example #14
def test_emd_with_flow_5():
    first_signature = np.array([3.0, 5.0])
    second_signature = np.array([6.0, 2.0])
    distance_matrix = np.array([[0.0, 0.0],
                                [0.0, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix),
        (0.0, [[3.0, 0.0],
               [3.0, 2.0]])
    )
Example #15
def get_wmd(s1, s2, dists, w2i, get_flow=False):
    """
    Get WMD for two input sentences
    """
    s1, s2 = s1.split(), s2.split()
    h1, h2, words = get_wmd_histograms(s1, s2, w2i)
    D = dists[np.ix_(words, words)]

    if get_flow:
        return pyemd.emd_with_flow(h1, h2, D)

    return pyemd.emd(h1, h2, D)
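
np.ix_ builds the open-mesh index that extracts the square sub-matrix of precomputed distances restricted to the vocabulary ids that actually occur in the two sentences, which keeps each EMD problem small. A quick illustration with made-up values:

import numpy as np

dists = np.arange(25.0).reshape(5, 5)  # full-vocabulary distance matrix
words = [0, 2, 4]                      # ids present in s1 and s2
D = dists[np.ix_(words, words)]        # 3x3 block of rows/cols 0, 2, 4
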
Example #16
def test_case_6_flow():
    first_signature = np.array([1.0, 2.0, 1.0, 2.0])
    second_signature = np.array([2.0, 1.0, 2.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0, 1.0, 2.0], [1.0, 0.0, 2.0, 1.0],
                                [1.0, 2.0, 0.0, 1.0], [2.0, 1.0, 1.0, 0.0]])
    emd_value, flow = emd_with_flow(first_signature, second_signature,
                                    distance_matrix)
    emd_value = round(emd_value, EMD_PRECISION)
    assert emd_value == 2.0
    flow = np.round(flow, FLOW_PRECISION)
    assert np.array_equal(flow, [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
                                 [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 1.0]])
Example #17
    def emd_rep_loss(student_reps, teacher_reps, student_layer_weight,
                     teacher_layer_weight, stu_layer_num, tea_layer_num,
                     loss_mse, args):
        student_layer_weight = np.concatenate(
            (student_layer_weight, np.zeros(tea_layer_num)))
        teacher_layer_weight = np.concatenate(
            (np.zeros(stu_layer_num), teacher_layer_weight))
        total_num = stu_layer_num + tea_layer_num
        distance_matrix = torch.zeros([total_num, total_num]).to(args.device)

        for i in range(stu_layer_num):
            student_rep = student_reps[i]
            for j in range(tea_layer_num):
                teacher_rep = teacher_reps[j]
                if args.emd_type == "v6":
                    tmp_loss = 1 - torch.sum(student_rep * teacher_rep, dim=2)
                    tmp_loss = tmp_loss.mean()
                elif args.emd_type == "v8":
                    # weight = torch.norm(teacher_rep, dim=2, keepdim=True)
                    # weight = weight / torch.sum(weight, dim=1, keepdim=True)
                    tmp_loss = torch.sum(v8_weight *
                                         (student_rep - teacher_rep)**2,
                                         dim=1).mean()
                else:
                    tmp_loss = loss_mse(student_rep, teacher_rep)
                distance_matrix[i][j + stu_layer_num] = distance_matrix[
                    j + stu_layer_num][i] = tmp_loss

        if args.emd_type == "v10":
            tmp = distance_matrix.detach().cpu().numpy()
            student_layer_weight = tmp.mean(axis=1)[:stu_layer_num]
            teacher_layer_weight = tmp.mean(axis=0)[stu_layer_num:]
            student_layer_weight = sum(
                student_layer_weight) / student_layer_weight
            teacher_layer_weight = sum(
                teacher_layer_weight) / teacher_layer_weight
            student_layer_weight = utils.softmax(student_layer_weight / 20)
            teacher_layer_weight = utils.softmax(teacher_layer_weight / 20)
            student_layer_weight = np.concatenate(
                (student_layer_weight, np.zeros(tea_layer_num)))
            teacher_layer_weight = np.concatenate(
                (np.zeros(stu_layer_num), teacher_layer_weight))

        _, trans_matrix = emd_with_flow(
            student_layer_weight, teacher_layer_weight,
            distance_matrix.detach().cpu().numpy().astype('float64'))

        rep_loss = torch.sum(
            torch.tensor(trans_matrix).to(args.device) * distance_matrix)

        return rep_loss, trans_matrix, distance_matrix
Example #18
def test_emd_with_flow_extra_mass_penalty():
    first_signature = np.array([0.0, 2.0, 1.0, 2.0])
    second_signature = np.array([2.0, 1.0, 2.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0, 1.0, 2.0], [1.0, 0.0, 2.0, 1.0],
                                [1.0, 2.0, 0.0, 1.0], [2.0, 1.0, 1.0, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature,
                      second_signature,
                      distance_matrix,
                      extra_mass_penalty=2.5),
        (4.5, [[0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 1.0]]))
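
The expected value 4.5 decomposes cleanly: the signatures carry total mass 5 and 6, so the 5 matched units are transported at cost 2.0 (the same assignment as in test_emd_with_flow_6) and the 1 unit of unmatched mass is charged extra_mass_penalty = 2.5. A standalone check of that arithmetic:

import numpy as np

distance = np.array([[0.0, 1.0, 1.0, 2.0], [1.0, 0.0, 2.0, 1.0],
                     [1.0, 2.0, 0.0, 1.0], [2.0, 1.0, 1.0, 0.0]])
flow = np.array([[0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
                 [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 1.0]])
transport = np.sum(flow * distance)  # 2.0
extra = (6.0 - 5.0) * 2.5            # 2.5 for the extra unit of mass
assert transport + extra == 4.5
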
Example #19
    def emd_attn_loss(student_attns, teacher_attns, student_layer_weight,
                      teacher_layer_weight, stu_layer_num, tea_layer_num,
                      loss_mse, args):
        student_layer_weight = np.concatenate(
            (student_layer_weight, np.zeros(tea_layer_num)))
        teacher_layer_weight = np.concatenate(
            (np.zeros(stu_layer_num), teacher_layer_weight))
        total_num = stu_layer_num + tea_layer_num
        distance_matrix = torch.zeros([total_num, total_num]).to(args.device)
        if args.emd_type == "v3":
            t_dis_matrix = torch.zeros([total_num, total_num]).to(args.device)

        for i in range(stu_layer_num):
            student_attn = student_attns[i]
            for j in range(tea_layer_num):
                teacher_attn = teacher_attns[j]
                if args.emd_type == "v6":
                    tmp_loss = 1 - torch.sum(teacher_attn * student_attn,
                                             dim=2)
                    tmp_loss = tmp_loss.mean()
                elif args.emd_type == "v8":
                    # weight = torch.norm(teacher_attn, dim=2, keepdim=True)
                    # weight = weight / torch.sum(weight, dim=1, keepdim=True)
                    tmp_loss = torch.sum(v8_weight *
                                         (student_attn - teacher_attn)**2,
                                         dim=1).mean()
                else:
                    tmp_loss = loss_mse(student_attn, teacher_attn)
                if args.emd_type == "v3":
                    t_dis_matrix[i][j + stu_layer_num] = t_dis_matrix[
                        j + stu_layer_num][i] = tmp_loss
                    tmp_loss *= (
                        1 + abs(i / stu_layer_num - j / tea_layer_num) / 5)
                distance_matrix[i][j + stu_layer_num] = distance_matrix[
                    j + stu_layer_num][i] = tmp_loss

        _, trans_matrix = emd_with_flow(
            student_layer_weight, teacher_layer_weight,
            distance_matrix.detach().cpu().numpy().astype('float64'))

        if args.emd_type == "v3":
            attn_loss = torch.sum(
                torch.tensor(trans_matrix).to(args.device) * t_dis_matrix)
        else:
            attn_loss = torch.sum(
                torch.tensor(trans_matrix).to(args.device) * distance_matrix)

        return attn_loss, trans_matrix, distance_matrix
Example #20
def test_emd_with_flow_6():
    first_signature = np.array([1.0, 2.0, 1.0, 2.0])
    second_signature = np.array([2.0, 1.0, 2.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0, 1.0, 2.0],
                                [1.0, 0.0, 2.0, 1.0],
                                [1.0, 2.0, 0.0, 1.0],
                                [2.0, 1.0, 1.0, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix),
        (2.0, [[1.0, 0.0, 0.0, 0.0],
               [1.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0],
               [0.0, 0.0, 1.0, 1.0]])
    )
Example #21
def test_extra_mass_penalty_flow():
    first_signature = np.array([0.0, 2.0, 1.0, 2.0])
    second_signature = np.array([2.0, 1.0, 2.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0, 1.0, 2.0], [1.0, 0.0, 2.0, 1.0],
                                [1.0, 2.0, 0.0, 1.0], [2.0, 1.0, 1.0, 0.0]])
    emd_value, flow = emd_with_flow(first_signature,
                                    second_signature,
                                    distance_matrix,
                                    extra_mass_penalty=2.5)
    emd_value = round(emd_value, EMD_PRECISION)
    assert emd_value == 4.5
    flow = np.round(flow, FLOW_PRECISION)
    assert np.array_equal(flow, [[0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
                                 [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 1.0]])
Example #22
def test_emd_with_flow_extra_mass_penalty():
    first_signature = np.array([0.0, 2.0, 1.0, 2.0])
    second_signature = np.array([2.0, 1.0, 2.0, 1.0])
    distance_matrix = np.array([[0.0, 1.0, 1.0, 2.0],
                                [1.0, 0.0, 2.0, 1.0],
                                [1.0, 2.0, 0.0, 1.0],
                                [2.0, 1.0, 1.0, 0.0]])
    emd_flow_assert(
        emd_with_flow(first_signature, second_signature, distance_matrix,
                      extra_mass_penalty=2.5),
        (4.5, [[0.0, 0.0, 0.0, 0.0],
               [1.0, 1.0, 0.0, 0.0],
               [0.0, 0.0, 1.0, 0.0],
               [0.0, 0.0, 1.0, 1.0]])
    )
Example #23
 def loss(self, student_reps, teacher_reps, return_distance=False):
     _s_weight = np.concatenate(
         (self.s_weight, np.zeros_like(self.t_weight)))
     _t_weight = np.concatenate(
         (np.zeros_like(self.s_weight), self.t_weight))
     total_num = self.s_layer_num + self.t_layer_num
     distance_matrix = torch.zeros([total_num, total_num]).to(self.device)
     for i in range(self.s_layer_num):
         student_rep = student_reps[i]
         for j in range(self.t_layer_num):
             teacher_rep = teacher_reps[j + 1]
             tmp_loss = self.d_method(student_rep, teacher_rep)
             distance_matrix[i][j + self.s_layer_num] = distance_matrix[
                 j + self.s_layer_num][i] = tmp_loss
     _, trans_matrix = emd_with_flow(
         _s_weight, _t_weight,
         distance_matrix.detach().cpu().numpy().astype('float64'))
     d = torch.sum(
         torch.tensor(trans_matrix).to(self.device) * distance_matrix)
     if return_distance:
         return d, trans_matrix, distance_matrix
     else:
         return d
Example #24
import numpy as np
import pandas as pd
from pyemd import emd_with_flow

# Load species seasonal abundance distributions (estimated from eBird data)
abundance_BR = pd.read_table('results/output/seasonalAbundance_BR.csv', sep=",")
abundance_NB = pd.read_table('results/output/seasonalAbundance_NB.csv', sep=",")
abundance_BR = abundance_BR.apply(lambda x: x/sum(x))
abundance_NB = abundance_NB.apply(lambda x: x/sum(x))

# Load matrix of pairwise distance between every hexagons on the grid
distanceMatrix = np.loadtxt('results/output/distanceMatrix.csv', delimiter=";")

# Compute optimal redistribution using the Earth Mover's Distance algorithm 
for s in abundance_BR.columns:
    EMD_results = emd_with_flow(np.array(abundance_BR[s]),
                                np.array(abundance_NB[s]), distanceMatrix)
    # Keep only the rows/columns of the flow where abundance is non-zero
    EMD_results = np.array(EMD_results[1])[np.array(abundance_BR[s]) > 0, :][:, np.array(abundance_NB[s]) > 0]
    # Save simulated migratory connectivity
    np.savetxt("results/output/ORSIM_results_" + s + ".csv", EMD_results,
               delimiter=',')
Example #25
from pyemd import emd_with_flow
import numpy as np
first_histogram = np.array([0, 1.0])
second_histogram = np.array([5.0, 3.0])
distance_matrix = np.array([[0, 2.0], [2.5, 0]])
w_dis, F = emd_with_flow(first_histogram, second_histogram, distance_matrix)
print(w_dis, F)
sumF = np.sum(F)
norm_dis = w_dis / sumF
print(norm_dis)
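
Dividing by the total flow, as above, rescales the raw work into an average cost per unit of mass actually transported. Note that with pyemd's default extra_mass_penalty (-1, meaning the maximum entry of the distance matrix), the |8 - 1| = 7 units of unmatched mass are charged in w_dis but never appear in F, so norm_dis can exceed every entry of the ground-distance matrix (here w_dis should come out to 17.5 while sum(F) is 1).
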
Example #26
    def emd_distill_loss(self, layer_wise_hidden, teacher_layer_wise_hidden):
        # 1. Compute EMD loss
        # 2. Update weight

        student_weight = self.student_weights.copy()  # [|S|]
        teacher_weight = self.teacher_weights.copy()  # [|T|]

        student_weight_hist = np.concatenate(
            (student_weight, np.zeros(self.teacher_layers)))  # [|S|+|T|]
        teacher_weight_hist = np.concatenate(
            (np.zeros(self.student_layers), teacher_weight))  # [|S|+|T|]

        total = self.teacher_layers + self.student_layers
        distance_matrix = torch.zeros([total, total], device=self.dummy.device)

        # Compute distance matrix, shape=(|S|, |T|)
        for i in range(self.student_layers):
            student_hidden = layer_wise_hidden[i]  # [B, L, C]
            for j in range(self.teacher_layers):
                teacher_hidden = teacher_layer_wise_hidden[j]  # [B, L, C]
                # KL Div
                distance_matrix[i][
                    j + self.student_layers] = self.layer_distill_loss(
                        student_hidden, teacher_hidden)
                distance_matrix[
                    j + self.student_layers][i] = self.layer_distill_loss(
                        teacher_hidden, student_hidden)
                # MSE symmetric
                # distance = self.mse(student_hidden, teacher_hidden)
                # distance_matrix[i][j + self.student_layers] = distance
                # distance_matrix[j + self.student_layers][i] = distance

        d_np = distance_matrix.detach().cpu().numpy().astype(
            "float64")  # [|S|+|T|, |S|+|T|]
        _, transfer_matrix = emd_with_flow(student_weight_hist,
                                           teacher_weight_hist, d_np)
        transfer_matrix = np.array(transfer_matrix,
                                   dtype=np.float64)  # [|S|+|T|, |S|+|T|]

        transfer_matrix_torch = torch.tensor(transfer_matrix,
                                             device=self.dummy.device)
        kd_loss = torch.sum(transfer_matrix_torch * distance_matrix)

        # Update weight
        def update_weight(weight, t_mat, num_layers, bias=0):
            # t_mat: [|S|+|T|, |S|+|T|]
            transfer_weight = np.sum(t_mat * d_np, -1)  # [|S|+|T|]
            for idx in range(num_layers):
                weight[idx] = transfer_weight[idx + bias] / weight[idx]
            weight_sum = np.sum(weight)
            for idx in range(num_layers):
                if weight[idx] != 0:
                    weight[idx] = weight_sum / weight[idx]
            weight = np_softmax(weight / self.emd_temperature)
            return weight

        self.student_weights = update_weight(student_weight, transfer_matrix,
                                             self.student_layers)
        self.teacher_weights = update_weight(teacher_weight,
                                             np.transpose(transfer_matrix),
                                             self.teacher_layers,
                                             bias=self.student_layers)

        return kd_loss
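
The update_weight closure implements an inverse-cost reweighting: transfer_weight is the transport cost each layer attracted, the first loop converts it to cost per unit of current weight, the weight_sum / weight inversion hands more weight to layers that were cheap to align, and the temperature-scaled softmax renormalizes, matching the BERT-EMD-style many-to-many layer-mapping update.
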
Example #27
def plot_example(is_flow, reference, translation, device='cuda:0'):

    idf_dict_ref = defaultdict(lambda: 1.)
    idf_dict_hyp = defaultdict(lambda: 1.)

    ref_embedding, ref_lens, ref_masks, ref_idf, ref_tokens = get_bert_embedding(
        [reference], model, tokenizer, idf_dict_ref, device=device)
    hyp_embedding, hyp_lens, hyp_masks, hyp_idf, hyp_tokens = get_bert_embedding(
        [translation], model, tokenizer, idf_dict_hyp, device=device)

    ref_embedding = ref_embedding[-1]
    hyp_embedding = hyp_embedding[-1]

    raw = torch.cat([ref_embedding, hyp_embedding], 1)
    raw.div_(torch.norm(raw, dim=-1).unsqueeze(-1) + 1e-30)

    distance_matrix = batched_cdist_l2(raw, raw)
    masks = torch.cat([ref_masks, hyp_masks], 1)
    masks = torch.einsum('bi,bj->bij', (masks, masks))
    distance_matrix = masks * distance_matrix

    i = 0
    c1 = np.zeros(raw.shape[1], dtype=np.float64)
    c2 = np.zeros(raw.shape[1], dtype=np.float64)
    c1[:len(ref_idf[i])] = ref_idf[i]
    c2[len(ref_idf[i]):] = hyp_idf[i]

    c1 = _safe_divide(c1, np.sum(c1))
    c2 = _safe_divide(c2, np.sum(c2))

    dst = distance_matrix[i].double().cpu().numpy()

    if is_flow:
        _, flow = emd_with_flow(c1, c2, dst)
        new_flow = np.array(flow, dtype=np.float32)
        res = new_flow[:len(ref_tokens[i]),
                       len(ref_idf[i]):(len(ref_idf[i]) + len(hyp_tokens[i]))]
    else:
        res = 1 - dst[:len(ref_tokens[i]),
                      len(ref_idf[i]):(len(ref_idf[i]) + len(hyp_tokens[i]))]

    r_tokens = ref_tokens[i]
    h_tokens = hyp_tokens[i]

    fig, ax = plt.subplots(figsize=(len(r_tokens) * 0.8, len(h_tokens) * 0.8))
    im = ax.imshow(res, cmap='Blues')

    ax.set_xticks(np.arange(len(h_tokens)))
    ax.set_yticks(np.arange(len(r_tokens)))

    ax.set_xticklabels(h_tokens, fontsize=10)
    ax.set_yticklabels(r_tokens, fontsize=10)
    plt.xlabel("System Translation", fontsize=14)
    plt.ylabel("Human Reference", fontsize=14)
    plt.title("Flow Matrix", fontsize=14)

    plt.setp(ax.get_xticklabels(),
             rotation=45,
             ha="right",
             rotation_mode="anchor")

    #    for i in range(len(r_tokens)):
    #        for j in range(len(h_tokens)):
    #            text = ax.text(j, i, '{:.2f}'.format(res[i, j].item()),
    #                           ha="center", va="center", color="k" if res[i, j].item() < 0.6 else "w")
    fig.tight_layout()
    plt.show()
Example #28
def wmdo(wvvecs,
         ref,
         cand,
         ref_lang='en',
         cand_lang='en',
         delta=0.18,
         alpha=0.1):
    '''
    wvvecs: word vectors -- retrieved from load_wv method
    ref: reference translation
    cand: candidate translation
    missing: missing word dictionary -- initialise as {}
    dim: word vector dimension
    delta: weight of fragmentation penalty
    alpha: weight of missing word penalty
    '''

    ref_list = get_input_words(ref)
    cand_list = get_input_words(cand)

    ref = ' '.join(ref_list)
    cand = ' '.join(cand_list)

    common_vectorizer = CountVectorizer().fit(ref_list + cand_list)

    ref_count_vector, cand_count_vector = common_vectorizer.transform(
        [ref, cand])

    ref_count_vector = ref_count_vector.toarray().ravel()
    cand_count_vector = cand_count_vector.toarray().ravel()

    dim = wvvecs[ref_lang].vector_size

    wvoc, missing = create_vocabulary(common_vectorizer, wvvecs, dim, ref_list,
                                      cand_list, ref_lang, cand_lang)

    distance_matrix = cosine_distances(wvoc)
    vocab_words = common_vectorizer.get_feature_names()
    for cand_word_idx, count in enumerate(cand_count_vector):
        if count > 0:
            most_similar_ref_indexes = np.argsort(
                distance_matrix[cand_word_idx])
            for ref_word_index in most_similar_ref_indexes[1:]:
                if ref_count_vector[ref_word_index] > 0:
                    print('{}: {}'.format(vocab_words[cand_word_idx],
                                          vocab_words[ref_word_index]))
                    break

    if np.sum(distance_matrix) == 0.0:
        return 0., {}

    ref_count_vector = ref_count_vector.astype(np.double)
    cand_count_vector = cand_count_vector.astype(np.double)

    ref_count_vector /= ref_count_vector.sum()
    cand_count_vector /= cand_count_vector.sum()

    distance_matrix = distance_matrix.astype(np.double)
    (wmd, flow) = emd_with_flow(ref_count_vector, cand_count_vector,
                                distance_matrix)

    # adding penalty
    ratio = fragmentation(ref_list, cand_list, common_vectorizer, flow)
    if ratio > 1:
        ratio = 1
    penalty = delta * ratio

    # missing words penalty
    missingwords = 0
    for w in cand_list:
        if w not in wvvecs:
            missingwords += 1
    missingratio = missingwords / len(cand_list)
    missing_penalty = alpha * missingratio

    penalty += missing_penalty

    wmd += penalty

    return wmd, missing
Example #29
    def compute_loss(self, results_S, results_T):

        losses_dict = dict()

        total_loss = 0
        if 'logits' in results_T and 'logits' in results_S:
            logits_list_T = results_T['logits']  # list of tensor
            logits_list_S = results_S['logits']  # list of tensor
            total_kd_loss = 0
            if 'logits_mask' in results_S:
                masks_list_S = results_S['logits_mask']
                logits_list_S = select_logits_with_mask(
                    logits_list_S, masks_list_S)  #(mask_sum, num_of_class)
            if 'logits_mask' in results_T:
                masks_list_T = results_T['logits_mask']
                logits_list_T = select_logits_with_mask(
                    logits_list_T, masks_list_T)  #(mask_sum, num_of_class)

            for l_T, l_S in zip(logits_list_T, logits_list_S):
                if self.d_config.temperature_scheduler is not None:
                    temperature = self.d_config.temperature_scheduler(
                        l_S, l_T, self.d_config.temperature)
                else:
                    temperature = self.d_config.temperature
                total_kd_loss += self.kd_loss(l_S, l_T, temperature)

            total_loss += total_kd_loss * self.d_config.kd_loss_weight
            losses_dict['unweighted_kd_loss'] = total_kd_loss

        inters_T = {
            feature: results_T.get(feature, [])
            for feature in FEATURES
        }
        inters_S = {
            feature: results_S.get(feature, [])
            for feature in FEATURES
        }
        inputs_mask_T = results_T.get('inputs_mask', None)
        inputs_mask_S = results_S.get('inputs_mask', None)

        #hidden states and embedding
        feature = self.emd_feature
        emd_loss_weight = self.emd_loss_weight
        loss_type = self.emd_loss_type
        match_loss = MATCH_LOSS_MAP[loss_type]

        feature_maps_S = inters_S[feature][1:]  # list of features
        feature_maps_T = inters_T[feature][1:]  # list of features

        embeddings_S = inters_S[feature][0]
        embeddings_T = inters_T[feature][0]

        assert isinstance(feature_maps_S, (tuple, list))
        assert isinstance(feature_maps_T, (tuple, list))
        assert isinstance(feature_maps_S[0], torch.Tensor)
        assert isinstance(feature_maps_T[0], torch.Tensor)
        assert len(feature_maps_S) == self.layer_num_S - 1
        assert len(feature_maps_T) == self.layer_num_T - 1

        if len(self.projs) > 0:
            assert len(self.projs) == self.layer_num_S
            embeddings_S = self.projs[0](embeddings_S)
            feature_maps_S = [
                proj(s) for proj, s in zip(self.projs[1:], feature_maps_S)
            ]

        feature_num_S = len(feature_maps_S)
        feature_num_T = len(feature_maps_T)
        feature_num_A = feature_num_S + feature_num_T

        distance_matrix = torch.zeros([feature_num_A,
                                       feature_num_A]).to(feature_maps_S[0])
        for s in range(feature_num_S):
            f_S = feature_maps_S[s]
            for t in range(feature_num_T):
                f_T = feature_maps_T[t]
                distance_matrix[s][t + feature_num_S] = distance_matrix[
                    t + feature_num_S][s] = match_loss(f_S,
                                                       f_T,
                                                       mask=inputs_mask_S)

        feature_weight_S = np.concatenate(
            [self.feature_weight_S,
             np.zeros(feature_num_T)])
        feature_weight_T = np.concatenate(
            [np.zeros(feature_num_S), self.feature_weight_T])

        _, trans_matrix = emd_with_flow(
            feature_weight_S, feature_weight_T,
            distance_matrix.detach().cpu().numpy().astype('float64'))
        trans_matrix = torch.tensor(trans_matrix).to(distance_matrix)

        emd_loss = torch.sum(trans_matrix * distance_matrix)

        total_loss += emd_loss * emd_loss_weight

        losses_dict[f'unweighted_{feature}_{loss_type}_emd'] = emd_loss

        if (self.feature_weight_S <= 0).any() or (self.feature_weight_T <=
                                                  0).any():
            logger.info(f"{self.feature_weight_S}")
            logger.info(f"{self.feature_weight_T}")

        if np.isnan(self.feature_weight_S).any() or np.isnan(
                self.feature_weight_T).any():
            import sys
            logger.info(f"{self.feature_weight_S}")
            logger.info(f"{self.feature_weight_T}")
            sys.exit()

        #feature_weight_S = np.copy(self.feature_weight_S)
        #feature_weight_T = np.copy(self.feature_weight_T)
        #self.feature_weight_S, self.feature_weight_T = get_new_feature_weight(
        #    trans_matrix, distance_matrix.detach(), feature_weight_S, feature_weight_T, self.d_config.temperature)

        #embedding matching
        embedding_loss = match_loss(embeddings_S,
                                    embeddings_T,
                                    mask=inputs_mask_S)
        total_loss += embedding_loss * emd_loss_weight  #sharing the same weight
        losses_dict[f'unweighted_embedding_{loss_type}'] = embedding_loss

        if 'losses' in results_S:
            total_hl_loss = 0
            for loss in results_S['losses']:
                # in case of multi-GPU
                total_hl_loss += loss.mean()
            total_loss += total_hl_loss * self.d_config.hard_label_weight
            losses_dict['unweighted_hard_label_loss'] = total_hl_loss
        return total_loss, losses_dict
Example #30
import numpy as np
from pyemd import emd
from pyemd import emd_with_flow
from pyemd import emd_samples

s1 = 8
s2 = 8
np.random.seed(10)

a = np.random.rand(s1)
b = np.random.rand(s2)
d = np.random.rand(s1, s2)

result1 = emd(a, b, d)
result2 = emd_with_flow(a, b, d)
result3 = emd_samples(a, b)

print(result1)
print("\n", result2)
print(result3)
Example #31
def test_emd_with_flow_validate_square_distance_matrix():
    first_signature = np.array([0.0, 1.0])
    second_signature = np.array([5.0, 3.0])
    distance_matrix = np.array([[0.0, 0.5, 3.0], [0.5, 0.0]])
    with pytest.raises(ValueError):
        emd_with_flow(first_signature, second_signature, distance_matrix)
Example #32
import os

import numpy as np
from pyemd import emd_with_flow

os.chdir(
    '/Volumes/Marius_SSD/American-Flyway/Connectivity_NAbirds/Redistribution-model'
)

# Load species seasonal abundance distributions (estimated from eBird data)
abundance_BR = np.loadtxt('Data/STEMs/seasonalAbundance_wlswar_BR.csv',
                          delimiter=';')
abundance_NB = np.loadtxt('Data/STEMs/seasonalAbundance_wlswar_NB.csv',
                          delimiter=';')
abundance_BR = abundance_BR / sum(abundance_BR)
abundance_NB = abundance_NB / sum(abundance_NB)

# Load matrix of pairwise distance between every hexagons on the grid
distanceMatrix = np.loadtxt('ideal-optimal-redistribution/distanceMatrix.csv',
                            delimiter=';')

# Compute optimal redistribution using the Earth Mover's Distance algorithm
EMD_results = emd_with_flow(abundance_BR, abundance_NB, distanceMatrix)
total_flow = np.sum(EMD_results[1])
EMD_results2 = EMD_results[0] / total_flow

print(EMD_results2)

# Save simulated migratory connectivity
np.savetxt("ORSIM-outputs/ORSIMresults_wlswar.csv",
           EMD_results[1],
           delimiter=',')
Example #33
def wmdsimilarity(doc1, doc2, lang1, lang2, vecs, with_flow=False):
    tok1 = list(processing.tokenize(lang1, doc1, include_stopwords=True))
    tok2 = list(processing.tokenize(lang2, doc2, include_stopwords=True))

    dictionary = Dictionary(documents=[tok1, tok2])
    vocab_len = len(dictionary)

    if vocab_len == 1:
        # Both documents consist of a single unique token.
        return 0.0

    # Sets for faster look-up.
    docset1 = set(tok1)
    docset2 = set(tok2)

    # Compute distance matrix.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            # Compute Euclidean distance between word vectors.
            distance_matrix[i, j] = np.sqrt(
                np.sum((vecs[lang1][t1] - vecs[lang2][t2])**2))

    if np.sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        d = np.zeros(vocab_len, dtype=np.double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(tok1)
    d2 = nbow(tok2)

    # Compute WMD.
    if with_flow:
        emd = emd_with_flow(d1, d2, distance_matrix)
        return {
            'tokens': list(dictionary.values()),
            'pdf1': list(d1),
            'pdf2': list(d2),
            'wmd': emd[0],
            'flow': emd[1],
            'dist_matrix': distance_matrix.tolist()
        }
    else:
        return {
            'tokens': list(dictionary.values()),
            'pdf1': list(d1),
            'pdf2': list(d2),
            'wmd': emd(d1, d2, distance_matrix),
            'dist_matrix': distance_matrix.tolist()
        }
Example #34
def wmdo(wvvecs, ref, cand, missing, dim, delta, alpha):
    '''
    wvvecs: word vectors -- retrieved from load_wv method
    ref: reference translation
    cand: candidate translation
    missing: missing word dictionary -- initialise as {}
    dim: word vector dimension
    delta: weight of fragmentation penalty
    alpha: weight of missing word penalty
    '''
    ref_list = [w.lower() for w in word_tokenize(ref)]
    cand_list = [w.lower() for w in word_tokenize(cand)]

    vc = CountVectorizer().fit(ref_list + cand_list)

    v_obj, v_cap = vc.transform([ref, cand])

    v_obj = v_obj.toarray().ravel()
    v_cap = v_cap.toarray().ravel()

    # need to deal with missing words
    wvoc = []
    for w in vc.get_feature_names():
        if w in wvvecs:
            wvoc.append(wvvecs[w])
        else:
            if w not in missing:
                missing[w] = np.zeros(dim)
            wvoc.append(missing[w])

    distance_matrix = cosine_distances(wvoc)

    if np.sum(distance_matrix) == 0.0:
        return float('inf')

    v_obj = v_obj.astype(np.double)
    v_cap = v_cap.astype(np.double)

    v_obj /= v_obj.sum()
    v_cap /= v_cap.sum()

    distance_matrix = distance_matrix.astype(np.double)
    (wmd, flow) = emd_with_flow(v_obj, v_cap, distance_matrix)

    # adding penalty
    penalty = 0

    ratio = fragmentation(ref_list, cand_list, vc, flow)
    if ratio > 1:
        ratio = 1
    penalty = delta * ratio

    # missing words penalty
    missingwords = 0
    for w in cand_list:
        if w not in wvvecs:
            missingwords += 1
    missingratio = missingwords / len(cand_list)
    missing_penalty = alpha * missingratio

    penalty += missing_penalty

    wmd += penalty

    return wmd
Example #35
def test_larger_signatures_1():
    first_signature = np.array([0.0, 1.0, 2.0])
    second_signature = np.array([5.0, 3.0])
    distance_matrix = np.array([[0.0, 0.5], [0.5, 0.0]])
    with pytest.raises(ValueError):
        emd_with_flow(first_signature, second_signature, distance_matrix)