Example #1
def calculate_similarity(sentences, cache=None):
    """
    """
    tensor, len_sent = generate_tensor(sentences)

    cm = CosineSimilarity(dim=0)
    return cm.forward(tensor[len_sent[0] - 1, 0],
                      tensor[len_sent[1] - 1, 1]), tensor, len_sent
Example #2
def calculate_similarity(sentences, model, params, dico):
    """
    """
    sent_to_bpe = [sent for sent, lan in sentences]

    lan_sent = [lan for sent, lan in sentences]

    file = open("input_file", "w", encoding="utf-8")
    for sent in sent_to_bpe:
        file.write(sent + '\n')
    file.close()

    print('tokenizing and lowercasing data')
    process = subprocess.Popen(
        ".cat input_file | tools/tokenize.sh fr | python tools/lowercase_and_remove_accent.py > prep_input",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    # communicate() waits for the command to finish and collects its output
    (output, err) = process.communicate()
    p_status = process.wait()
    # Show the output of the command that was executed
    print("Command tokenize output: ", output)

    print('executing bpe')
    # NOTE: "local" is not defined in this snippet; presumably a module-level
    # flag that selects the locally built fastBPE binary.
    process = subprocess.Popen(
        "./tools/fastBPE" + ("_local" if local else "") +
        "/fast applybpe output_file prep_input codes_xnli_15",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    # communicate() waits for the command to finish and collects its output
    (output, err) = process.communicate()
    p_status = process.wait()
    # Show the output of the command that was executed
    print("Command fast output: ", output)

    print("reading file")
    f = open("output_file", "r")
    sentences = [x.rstrip('\n') for x in f]
    f.close()
    print(sentences)

    #    sentences = bpe.apply(sent_to_bpe)

    # len_sent = [len(sent) for sent in sentences]
    sentences = zip(sentences, lan_sent)

    tensor, len_sent = generate_embedding_tensors(sentences, model, params,
                                                  dico)
    cm = CosineSimilarity(dim=0)
    return cm.forward(tensor[len_sent[0] - 1, 0], tensor[len_sent[1] - 1, 1])
Example #3
def cosine(x_dialogues):
    cos = CosineSimilarity(dim=-1)
    x = x_dialogues
    y = torch.narrow(x, dim=0, start=1, length=x.size(0) - 1)
    y = torch.cat([y, torch.ones(1, y.size(1))], dim=0)
    # drop the last value as it is just a comparison to the 1-vector
    scores = cos(x, y)[:-1].mean(dim=-1)
    return scores
Example #4
    def __init__(self, args, vocab):
        super(ParaModel, self).__init__()

        self.args = args
        self.vocab = vocab
        self.gpu = args.gpu

        self.cosine = CosineSimilarity()
Example #5
def similarity(model, dataset, i, j):
    sim = CosineSimilarity()
    a_img = dataset[i][0]
    b_img = dataset[j][0]
    imshow(a_img)
    imshow(b_img)
    a = model.get_embedding(a_img.unsqueeze(0).to(device))
    b = model.get_embedding(b_img.unsqueeze(0).to(device))
    print(f"sim(a,b) = {sim(a, b).item()}")
Example #6
def try_untrained():
    # dataset = TripletImageFolder(FOOD101_IMG_PATH)
    dataset = TripletImageFolder(OWN_IMG_PATH)
    embed = EmbeddingResnet().to(device)
    sim = CosineSimilarity()
    a, b, c = map(lambda x: x.unsqueeze(0).to(device), dataset[2])
    print(f"sim(a,b) = {sim(embed(a), embed(b)).item()}")
    print(f"sim(a,c) = {sim(embed(a), embed(c)).item()}")
    print(f"sim(b,c) = {sim(embed(b), embed(c)).item()}")
Example #7
    def __init__(self, args, vocab, sp_file='simile-mrt/sim/sim.sp.30k.model'):
        super(ParaModel, self).__init__()

        self.args = args
        self.vocab = vocab
        self.gpu = args.gpu

        self.cosine = CosineSimilarity()

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(sp_file)
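Example #8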
    def test_query_support_distances(self):
        # Create some dummy data with easily verifiable distances
        q = 1  # 1 query per class
        k = 3  # 3 way classification
        d = 2  # embedding dimension of two

        query = torch.zeros([q * k, d], dtype=torch.double)
        query[0] = torch.Tensor([0, 0])
        query[1] = torch.Tensor([0, 1])
        query[2] = torch.Tensor([1, 0])
        support = torch.zeros([k, d], dtype=torch.double)
        support[0] = torch.Tensor([1, 1])
        support[1] = torch.Tensor([1, 2])
        support[2] = torch.Tensor([2, 2])

        distances = pairwise_distances(query, support, 'l2')
        self.assertEqual(distances.shape, (q * k, k),
                         'Output should have shape (q * k, k).')

        # Calculate squared distances by iterating through all query-support pairs
        for i, q_ in enumerate(query):
            for j, s_ in enumerate(support):
                self.assertEqual(
                    (q_ - s_).pow(2).sum(), distances[i, j].item(),
                    'The jth column of the ith row should be the squared distance between the '
                    'ith query sample and the jth support sample')

        # Create some dummy data with easily verifiable distances
        q = 1  # 1 query per class
        k = 3  # 3 way classification
        d = 2  # embedding dimension of two
        query = torch.zeros([q * k, d], dtype=torch.double)
        query[0] = torch.Tensor([1, 0])
        query[1] = torch.Tensor([0, 1])
        query[2] = torch.Tensor([1, 1])
        support = torch.zeros([k, d], dtype=torch.double)
        support[0] = torch.Tensor([1, 1])
        support[1] = torch.Tensor([-1, -1])
        support[2] = torch.Tensor([0, 2])

        distances = pairwise_distances(query, support, 'cosine')

        # Calculate distances by iterating through all query-support pairs
        for i, q_ in enumerate(query):
            for j, s_ in enumerate(support):
                self.assertTrue(
                    torch.isclose(1 - CosineSimilarity(dim=0)(q_, s_),
                                  distances[i, j],
                                  atol=2e-8),
                    'The jth column of the ith row should be the cosine distance between the '
                    'ith query sample and the jth support sample')
Example #9
    def compute_cosine_matrix(self, A, B):
        similarity = CosineSimilarity()
        # similarity = PairwiseDistance()  # this reassignment would override the cosine metric
        print(A.size(), B.size())
        num_A, _ = A.size()
        num_B, _ = B.size()
        _matrix = np.zeros((num_A, num_B))
        for i in range(num_A):
            vA = A[i, :].unsqueeze(0)
            vA = vA.cuda()
            for j in range(num_B):
                vB = B[j, :].unsqueeze(0)
                vB = vB.cuda()
                value = similarity(vA, vB)
                # print(value)
                _matrix[i, j] = value.item()
        return _matrix
Example #10
def try_trained():
    # dataset = TripletImageFolder(FOOD101_IMG_PATH)
    dataset = ImageFolder(FOOD101_IMG_PATH,
                          transform=transforms.Compose([
                              transforms.Resize((512, 512)),
                              transforms.ToTensor(),
                              transforms.Normalize((0.486, 0.459, 0.408),
                                                   (0.229, 0.224, 0.225))
                          ]))
    model = torch.load(MODEL_PATH, pickle_module=dill)
    sim = CosineSimilarity()
    a, b, c = map(lambda x: x.unsqueeze(0).to(device),
                  [dataset[95000][0], dataset[93003][0], dataset[93004][0]])
    a, b, c = map(model.get_embedding, (a, b, c))
    print(f"sim(a,b) = {sim(a, b).item()}")
    print(f"sim(a,c) = {sim(a, c).item()}")
    print(f"sim(b,c) = {sim(b, c).item()}")
Example #11
    def __init__(self, data, args, vocab, vocab_fr):
        super(ParaModel, self).__init__()

        self.data = data
        self.args = args
        self.gpu = args.gpu
        self.save_interval = args.save_interval
        if "report_interval" in args:
            self.report_interval = args.report_interval
        else:
            self.report_interval = args.save_interval

        self.vocab = vocab
        self.rev_vocab = {v: k for k, v in vocab.items()}
        self.vocab_fr = vocab_fr
        self.ngrams = args.ngrams

        self.delta = args.delta
        self.pool = args.pool

        self.dropout = args.dropout
        self.share_encoder = args.share_encoder
        self.share_vocab = args.share_vocab
        self.scramble_rate = args.scramble_rate
        self.zero_unk = args.zero_unk

        self.batchsize = args.batchsize
        self.max_megabatch_size = args.megabatch_size
        self.curr_megabatch_size = 1
        self.megabatch = []
        self.megabatch_anneal = args.megabatch_anneal
        self.increment = False

        self.sim_loss = nn.MarginRankingLoss(margin=self.delta)
        self.cosine = CosineSimilarity()

        self.embedding = nn.Embedding(len(self.vocab), self.args.dim)
        if self.vocab_fr is not None:
            self.embedding_fr = nn.Embedding(len(self.vocab_fr), self.args.dim)

        self.sp = None
        if args.sp_model:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(args.sp_model)
Example #12
    def __init__(self, src_emb, tgt_emb, mapping, params):
        """
        Initialize trainer script.
        """
        self.src_emb = src_emb
        self.tgt_emb = tgt_emb
        self.src_dico = params.src_dico
        self.tgt_dico = getattr(params, 'tgt_dico', None)
        self.mapping = mapping
        self.params = params

        # optimizers
        if hasattr(params, 'mapping_optimizer'):
            optim_fn, optim_params = get_optimizer(params.mapping_optimizer)
            self.mapping_optimizer = optim_fn(mapping.parameters(),
                                              **optim_params)
        self.criterion = CosineSimilarity()  # ContrastiveLoss(margin=0.0, measure='cosine')
        self.criterionRCSLS = RCSLS()

        # best validation score
        self.best_valid_metric = -1e12

        self.decrease_lr = False
Example #13
    def __init__(self, vocab: Vocabulary,
                 char_embedder: TextFieldEmbedder,
                 word_embedder: TextFieldEmbedder,
                 tokens_encoder: Seq2SeqEncoder,
                 model_args,
                 inp_drop_rate: float = 0.5,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        :param vocab: vocabulary from train and dev dataset
        :param char_embedder: character embedding + cnn encoder
        :param word_embedder: word embedding
        :param tokens_encoder: Bi-LSTM backbone for split
        :param model_args: model arguments
        :param inp_drop_rate: input dropout rate
        """
        super(FollowUpSnippetModel, self).__init__(vocab, regularizer)

        self.tokens_encoder = tokens_encoder

        self.projection_layer = torch.nn.Linear(
            in_features=word_embedder.get_output_dim() + 1 + char_embedder.get_output_dim(),
            out_features=self.tokens_encoder.get_input_dim(),
            bias=False)

        # integer to mark field, 0 or 1
        self.num_classes = 2
        self.num_conflicts = 2

        self._non_linear = torch.nn.PReLU()

        self.hidden_size = int(self.tokens_encoder.get_output_dim() / 2)

        self.policy_net = PolicyNet(self.tokens_encoder.get_output_dim() * 3,
                                    self.num_classes)

        self.token_field_embedding = word_embedder
        self.char_field_embedding = char_embedder

        self._scaled_value = 1.0
        self._self_attention = CosineMatrixAttention()

        self.margin_loss = MarginRankingLoss(margin=model_args.margin)

        # calculate span similarity
        self.cosine_similar = CosineSimilarity(dim=0)

        if inp_drop_rate > 0:
            self._variational_dropout = InputVariationalDropout(p=inp_drop_rate)
        else:
            self._variational_dropout = lambda x: x

        self.metrics = {
            "bleu": BLEUScore(),
            "reward": RewardScore(),
            "symbol": SymbolScore(),
            "reward_var": RewardScore(),
            "overall": RewardScore()
        }

        initializer(self)
Example #14
if __name__ == '__main__':
    import cv2
    import dlib
    import matplotlib.pyplot as plt
    from torch.nn.modules.distance import PairwiseDistance, CosineSimilarity

    detector = dlib.get_frontal_face_detector()
    shaper = dlib.shape_predictor(
        './models/shape_predictor_68_face_landmarks.dat')

    model = InceptionResnetV1('casia-webface', device='cuda')
    model.eval()

    pwd = PairwiseDistance().cuda()
    cos = CosineSimilarity().cuda()

    image1 = cv2.imread(
        '../Data/LFW/lfw-deepfunneled/Abdullah/Abdullah_0001.jpg')
    image1 = image1[:, :, ::-1]
    faces1 = detector(image1)
    alined1 = dlib.get_face_chip(image1, shaper(image1, faces1[0]), size=160)
    inp1 = prewhiten(torch.tensor(alined1, dtype=torch.float32))
    inp1 = inp1.permute(2, 0, 1).unsqueeze(0)
    out1 = model(inp1.cuda())

    image2 = cv2.imread(
        '../Data/LFW/lfw-deepfunneled/Abdullah/Abdullah_0003.jpg')
    image2 = image2[:, :, ::-1]
    faces2 = detector(image2)
    alined2 = dlib.get_face_chip(image2, shaper(image2, faces2[0]), size=160)
    # Assumed continuation (the snippet is cut off here): embed image2 the same
    # way as image1 and compare the two embeddings.
    inp2 = prewhiten(torch.tensor(alined2, dtype=torch.float32))
    inp2 = inp2.permute(2, 0, 1).unsqueeze(0)
    out2 = model(inp2.cuda())

    print('pairwise distance:', pwd(out1, out2).item())
    print('cosine similarity:', cos(out1, out2).item())
Example #15
    def __init__(self, args, device):
        super(CosineCoherence, self).__init__()
        self.seed = args.seed
        self.cos = CosineSimilarity(dim=-1)
        self.emb = GloveEmbedding(args)
        self.device = device
Example #16
"""
@author: tonyparker
Credits to [1] G. Lample *, A. Conneau * [*Cross-lingual Language Model Pretraining*](https://arxiv.org/abs/1901.07291)
"""
import os
import torch
import numpy.matlib
import numpy as np

from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

import subprocess
from torch.nn.modules.distance import CosineSimilarity

cm = CosineSimilarity(dim=0)

# Initialize the model
model_path = './mlm_tlm_xnli15_1024.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                  reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
Example #17
import tqdm
import numpy as np
import rouge
import edlib
import os
import pandas as pd
import re
import glob
import spacy
import sys
sys.path.append("./BERT/pytorch-pretrained-BERT-master")
sys.path.append("./BERT")
from pytorch_pretrained_bert import BertTokenizer, BertModel
from wmd import WMD
from torch.nn.modules.distance import CosineSimilarity

torch_emb_sim = CosineSimilarity()

from bert_score import score as bert_score

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)


def _clean_text(txt):
    return txt.lower()


class CFRInstance(object):
    def __init__(
        self,
        original_context: str,
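Example #18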
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.similarity = CosineSimilarity(dim=1)
Example #19
    def __init__(self):
        super().__init__()
        self.cossim = CosineSimilarity()