Example #1
    def __init__(self, config):

        super(DependencyBaselineModel, self).__init__()
        self.device = config[DEVICE]
        self.hid_dim = config[LSTM_HID] * 2
        self.num_lstm_pointer = 1
        self.config = config

        self.ptr_criterion = nn.CrossEntropyLoss(reduction="sum",
                                                 ignore_index=-100)

        self.link_predictor = CosineSimilarity(dim=2)

        self.root_clf = nn.Sequential(nn.Linear(self.hid_dim, self.hid_dim),
                                      nn.GELU(), nn.Linear(self.hid_dim, 1))

        self.edu_embed_model = EduEmbeddingModel(config)
        self.lm_decoder = LMDecodingModel(config)

        if not self.config[USE_SEP_ENCODER]:
            self.pointer_net = PointerNet(self.hid_dim,
                                          self.hid_dim,
                                          self.num_lstm_pointer,
                                          self.config[DROPOUT])

        self.root_embed = nn.Parameter(th.rand(1, self.hid_dim),
                                       requires_grad=True)
        self.alpha = 0.5
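For context, `CosineSimilarity(dim=2)` as used for `self.link_predictor` reduces the third axis of two broadcastable tensors. A minimal sketch of that behaviour (the `(batch, num_edus, hid_dim)` shapes are assumptions, not taken from the original model):

import torch
from torch.nn import CosineSimilarity

link_predictor = CosineSimilarity(dim=2)
heads = torch.randn(4, 10, 512)        # assumed: (batch, num_edus, hid_dim)
dependents = torch.randn(4, 10, 512)
scores = link_predictor(heads, dependents)
print(scores.shape)                    # torch.Size([4, 10]) -- one score per EDU position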
Example #2
def evaluate_embeddings(embedding, vocab: Vocabulary):
    cosine = CosineSimilarity(dim=0)

    simlex999 = read_simlex999()
    sims_pred = []
    oov_count = 0
    for word1, word2, sim in simlex999:
        word1_id = vocab.get_token_index(word1, 'token_in')
        if word1_id == 1:
            sims_pred.append(0.)
            oov_count += 1
            continue
        word2_id = vocab.get_token_index(word2, 'token_in')
        if word2_id == 1:
            sims_pred.append(0.)
            oov_count += 1
            continue

        sim_pred = cosine(embedding.weight[word1_id],
                          embedding.weight[word2_id]).item()
        sims_pred.append(sim_pred)

    assert len(sims_pred) == len(simlex999)
    print('# of OOV words: {} / {}'.format(oov_count, len(simlex999)))

    return spearmanr(sims_pred, [sim for _, _, sim in simlex999])
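As a quick sanity check of what `CosineSimilarity(dim=0)` computes on two 1-D embedding rows, the following sketch (random vectors, not SimLex-999 data) compares it against the normalized dot product:

import torch
from torch.nn import CosineSimilarity

a, b = torch.randn(300), torch.randn(300)
cos = CosineSimilarity(dim=0)
manual = (a @ b) / (a.norm() * b.norm())   # cosine similarity written out by hand
assert torch.allclose(cos(a, b), manual, atol=1e-5)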
Example #3
    def __init__(self, data):
        super(Bert_Comparing, self).__init__()

        self.question_bert_embedding = BertCharEmbedding(data.bert_path, data.requires_grad)
        self.path_bert_embedding = BertCharEmbedding(data.bert_path, data.requires_grad)
        self.args = data
        self.similarity = CosineSimilarity(dim=1)
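The forward pass is not shown above. As an illustration only (shapes are assumptions), `CosineSimilarity(dim=1)` scores a batch of question embeddings against a batch of path embeddings row by row:

import torch
from torch.nn import CosineSimilarity

similarity = CosineSimilarity(dim=1)
question_vecs = torch.randn(32, 768)    # assumed pooled BERT embeddings
path_vecs = torch.randn(32, 768)
scores = similarity(question_vecs, path_vecs)   # shape (32,), one score per question/path pair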
Example #4
def computeTask_(index, symptom, combinedOutputFolder, meanEmb,
                 similarityThreshold):

    symptomToken = tokenizer.encode(symptom)[1]

    cos = CosineSimilarity(dim=1, eps=1e-6)

    filename = os.path.join(combinedOutputFolder, f"{index+6}.pkl")
    subDict = pickle.load(open(filename, 'rb'))

    IDList = subDict['id']
    tokenList = subDict['token']
    embList = subDict['emb']

    arrA = torch.from_numpy(meanEmb.reshape(1, -1))
    arrB = torch.from_numpy(embList)

    sim = cos(arrA, arrB).cpu().numpy().reshape(-1)
    sim = np.round(sim, 4)

    # Indices of embeddings whose similarity to the mean embedding exceeds the threshold.
    keep = np.where(sim > similarityThreshold)[0]

    tokenList_ = tokenList[keep]
    IDList_ = IDList[keep]
    simList = sim[keep]

    out = [(x, y, z) for x, y, z in zip(tokenList_, simList, IDList_)]

    return out
Example #5
def create_classes_data_frame(dataset_name, distance="cosine", tsne_dimension=2):
    """Create a new classes dataframe for the specified dataset. The dataset must be registered in the project settings.
    The data frame is pickled before function return, to prevent re-calculating things.

    Args:
      dataset_name: the name of the dataset
      distance: which distance function to be used for nearest neighbor computation. Either 'cosine' or 'pairwise' (Default value = "cosine")
      tsne_dimension: the dimensions for the lower dimensional vector projections (Default value = 2)

    Returns:
      a pandas DataFrame with "class", "vector" (document embeddings), "distances" (all-pairs distances) and "tsne" columns

    """
    dataset_dir = DATA_SOURCES[dataset_name]["images"]
    paths = classes_set(dataset_dir)
    classes = pd.DataFrame(columns=["class", "vector", "tsne"])
    classes["class"] = sorted(list(paths))
    tqdm.pandas(desc="Removing special characters.")
    classes["class"] = classes["class"].progress_apply(lambda cls: " ".join(re.split(r"[_\-]", cls)))
    tqdm.pandas(desc="Applying full clean.")
    classes["class"] = classes["class"].progress_apply(full_clean)
    tqdm.pandas(desc="Creating document vectors.")
    vectors = torch.tensor(np.vstack(classes["class"].progress_apply(document_vector)))
    classes["vector"] = list(vectors)  # one embedding tensor per row
    p_dist = PairwiseDistance(p=2) if distance == "pairwise" else CosineSimilarity()
    distances = p_dist(  # distance from every node to every node
        vectors.repeat_interleave(vectors.shape[0], 0),  # each index repeated num_edges times
        vectors.repeat(vectors.shape[0], 1),  # the index range repeated num_edges times
    ).reshape(
        vectors.shape[0], -1
    )  # 2D matrix with shape [vectors.shape[0], vectors.shape[0]]
    classes["distances"] = list(distances)  # row i holds the distances from class i to every class
    classes["tsne"] = list(torch.tensor(TSNE(n_components=tsne_dimension).fit_transform(vectors)))
    pickle.dump(classes, open(os.path.join(dataset_dir, "classes.pickle"), "wb"))
    return classes
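The `repeat_interleave`/`repeat` pairing above is a compact way to turn a row-wise metric into an all-pairs matrix. A small self-contained check (random vectors, cosine case only) showing it matches the Gram matrix of L2-normalized rows:

import torch
import torch.nn.functional as F
from torch.nn import CosineSimilarity

cos = CosineSimilarity(dim=1)
v = torch.randn(5, 8)
n = v.shape[0]
dense = cos(v.repeat_interleave(n, 0), v.repeat(n, 1)).reshape(n, n)
vn = F.normalize(v, dim=1)              # L2-normalize each row
assert torch.allclose(dense, vn @ vn.T, atol=1e-5)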
Example #6
    def __init__(self, input_size: int, dropout: float = None):
        super().__init__(input_size,
                         dropout,
                         key='cosine-regression',
                         supports_compressed_streamlines=False,
                         loss_description='negative cosine similarity')

        # Loss will be applied on the last dimension.
        self.loss = CosineSimilarity(dim=-1)
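A minimal sketch of how a loss stored this way could be applied; the (batch, 3) shapes and the final negation are assumptions based on the "negative cosine similarity" description above, not the original training loop:

import torch
from torch.nn import CosineSimilarity

loss_fn = CosineSimilarity(dim=-1)
pred = torch.randn(4, 3, requires_grad=True)    # assumed: predicted direction vectors
target = torch.randn(4, 3)
loss = -loss_fn(pred, target).mean()            # negative cosine similarity, averaged over the batch
loss.backward()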
Example #7
def getSimilarWords(tokenizer, combinedOutputFolder, symptom, meanEmb,
                    similarityThreshold=0.3, numThreshold=150000, numComp=10000):

    output = []

    symptomToken = tokenizer.encode(symptom)[1]

    fileList = os.listdir(combinedOutputFolder)

    cos = CosineSimilarity(dim=1, eps=1e-6)

    examineCount = 0

    for i in tqdm(range(len(fileList))):

        if examineCount >= numThreshold:
            break

        filename = os.path.join(combinedOutputFolder, f"{i}.pkl")
        subDict = pickle.load(open(filename, 'rb'))

        IDList = subDict['id']
        tokenList = subDict['token']
        embList = subDict['emb']

        # Cast to float32 on the target device (the original cast to torch.cuda.FloatTensor).
        arrA = torch.from_numpy(meanEmb.reshape(1, -1)).to(device, dtype=torch.float32)
        arrB = torch.from_numpy(embList).to(device, dtype=torch.float32)

        sim = cos(arrA, arrB).cpu().numpy().reshape(-1)

        del arrA
        del arrB

        sim = np.round(sim, 4)

        # Indices of embeddings whose similarity to the mean embedding exceeds the threshold.
        keep = np.where(sim > similarityThreshold)[0]

        tokenList_ = tokenList[keep]
        IDList_ = IDList[keep]
        simList = sim[keep]

        out = [(x, y, z) for x, y, z in zip(tokenList_, simList, IDList_)]

        output += out

        examineCount += numComp

    return output
Example #8
def loss_function(origin, target, random_1, random_2, random_3, random_4):
    cos = CosineSimilarity(dim=1, eps=1e-6)
    sim_1 = cos(origin, target).unsqueeze(1)  #batch_size * 1
    sim_2 = cos(origin, random_1).unsqueeze(1)
    sim_3 = cos(origin, random_2).unsqueeze(1)
    sim_4 = cos(origin, random_3).unsqueeze(1)
    sim_5 = cos(origin, random_4).unsqueeze(1)
    sim = torch.cat((sim_1, sim_2, sim_3, sim_4, sim_5),
                    dim=1)  #batch_size * compare_size
    logSoft = LogSoftmax(dim=1)
    output = torch.mean(logSoft(sim)[:, 0])
    return -output
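For intuition, the loss above is the cross-entropy over the five similarity "logits" with the true target at index 0. A quick check using the `loss_function` defined above (random tensors, illustrative only):

import torch
import torch.nn.functional as F
from torch.nn import CosineSimilarity

torch.manual_seed(0)
origin, target = torch.randn(8, 16), torch.randn(8, 16)
randoms = [torch.randn(8, 16) for _ in range(4)]

cos = CosineSimilarity(dim=1, eps=1e-6)
sims = torch.stack([cos(origin, t) for t in [target] + randoms], dim=1)
ce = F.cross_entropy(sims, torch.zeros(8, dtype=torch.long))   # class 0 = true target
assert torch.allclose(loss_function(origin, target, *randoms), ce, atol=1e-5)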
Example #9
def get_synonyms(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Given a token, return a list of top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_synonyms)
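The loop over the vocabulary can be replaced by a single vectorized call. A sketch (the helper name is ours, and it assumes `embedding.weight` has shape (vocab_size, dim)):

import torch
import torch.nn.functional as F

def top_k_similar(token_vec, weight, k=10):
    # One cosine computation against every row of the embedding matrix.
    sims = F.cosine_similarity(token_vec.unsqueeze(0).expand_as(weight), weight, dim=1)
    return sims.topk(k)   # (similarity values, vocabulary indices)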
Example #10
    def __init__(self, input_size: int, dropout: float = None):
        # Prepare the dropout, Relu, loop:
        super().__init__(dropout)

        # Layers
        hidden_size = ceil(input_size / 2)
        h1 = Linear(input_size, hidden_size)
        h2 = Linear(hidden_size, 3)
        self.layers = [h1, h2]

        # Loss will be applied on the last dimension.
        self.loss = CosineSimilarity(dim=-1)
Example #11
    def __init__(self, tokenizer, **kw):
        super().__init__(**kw)
        self.encoder = BertModel.from_pretrained('prajjwal1/bert-tiny')
        self.decoder = AutoModelForCausalLM.from_pretrained('prajjwal1/bert-tiny')
        self.decoder.config.is_decoder = True
        self.tokenizer = tokenizer

        self.encoder.resize_token_embeddings(len(self.tokenizer))
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.cs = CosineSimilarity()
Example #12
    def __init__(
        self,
        collection_name: str,
        stopwords_list,
        text_transformer: TextTransformer,
        weighting_model: str = "tw-idf",
    ):
        self.collection = Collection(collection_name, stopwords_list,
                                     weighting_model, text_transformer)
        self.weighting_model = weighting_model
        self.stopwords = stopwords_list
        self.__text_transformer = text_transformer
        self.__cos = CosineSimilarity(dim=0, eps=1e-6)
Example #13
class TripletMarginCosLoss(Function):
    """Triplet loss function.
    """
    def __init__(self, margin):
        super(TripletMarginCosLoss, self).__init__()
        self.margin = margin
        self.pdist = CosineSimilarity(dim=1, eps=1e-6)  # cosine similarity (higher = more similar)

    def forward(self, anchor, positive, negative):
        d_p = self.pdist.forward(anchor, positive)
        d_n = self.pdist.forward(anchor, negative)

        dist_hinge = torch.clamp(self.margin - d_p + d_n, min=0.0)
        # loss = torch.sum(dist_hinge)
        loss = torch.mean(dist_hinge)
        return loss
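Illustrative usage with random embeddings (batch size and dimension are arbitrary); `forward` is called directly since the class subclasses `Function` rather than `nn.Module`:

import torch

criterion = TripletMarginCosLoss(margin=0.2)
anchor, positive, negative = (torch.randn(16, 128) for _ in range(3))
loss = criterion.forward(anchor, positive, negative)   # scalar hinge loss on cosine similarities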
Example #14
    def get_synonyms(self, token: str, num_synonyms: int = 30):
        """Given a token, return a list of top N most similar words to the token."""
        vocab = self._model.vocab
        embedding = self._model.embedding_target

        token_id = vocab.get_token_index(token, 'token_target')
        token_vec = embedding.weight[token_id]
        cosine = CosineSimilarity(dim=0)
        sims = Counter()

        for index, token in vocab.get_index_to_token_vocabulary(
                'token_target').items():
            sim = cosine(token_vec, embedding.weight[index]).item()
            sims[token] = sim

        return sims.most_common(num_synonyms)
Example #15
    def __init__(self, args, text_data, writer, summary_dir, out):
        self.args = args
        self.text_data = text_data
        self.writer = writer
        self.summary_dir = summary_dir
        self.out = out
        self.stopwords = set(stopwords.words('english'))
        self.mode = None
        self.samples = None
        self.cosine_similarity = CosineSimilarity(dim=1, eps=1e-6)
        self.global_step = 0

        if self.args.attack == 'deepfool':
            self.attack = DeepFool(args=self.args, num_classes=2, max_iters=20)
        else:
            print('Attack {} not recognized'.format(self.args.attack))
Example #16
    def __init__(self, vocab_size, embedding_dim):
        """Initializes model layers.

        Args:
            vocab_size (int): Number of tokens in corpus. This is used to init embeddings.
            embedding_dim (int): Dimension of embedding vector.
        """
        super().__init__()

        self._embedding_dim = embedding_dim

        self.encoder_in = Encoder(vocab_size, embedding_dim)
        self.encoder_out = Encoder(vocab_size, embedding_dim)

        self.linear = Linear(embedding_dim, embedding_dim, bias=False)
        self.similarity = CosineSimilarity(dim=2)
        self.softmax = Softmax(dim=2)
Example #17
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, self.batch_Size, -1)
        embedded = self.dropout(embedded)
        ### batch_size*(embed_size+hidden_size)
        output = F.relu(embedded)
        output, hidden = self.gru(output, hidden)
        ### hidden (1, batch_size, hidden_size)
        ### out (seq_len, batch_size, hidden_size)
        ### attn_applied (batch_size, 1, hidden_size)

        #output_hidden = torch.cat((output[0], hidden[0]), 1)
        ### using output to match all the encoder_outputs
        ### output[0] (batch_size, hidden_size) hidden[0] (batch_size, hidden_size)
        ### output_hidden (batch_size, 2*hidden_size)
        cos = CosineSimilarity(dim=2, eps=1e-7)
        similar = cos(output.repeat(self.max_length, 1, 1), encoder_outputs)
        ### out (seq_len, batch_size, hidden_size)
        ### encoder out_put (seq_len, batch_size, hidden_size)
        #print('similar.shape', similar.shape)
        #print('similar', similar)
        ### similar (max_len, hidden)
        #attn_weights = F.softmax(
        #    self.attn(similar), dim=1)
        attn_weights = F.softmax(similar, dim=0)
        #print('attn_weights.shape', attn_weights)
        attn_weights = torch.t(attn_weights)
        #print('attn_weights.shape', attn_weights)
        ### attn_weights (batch_size*max_length)
        threeDattn_weights = attn_weights.unsqueeze(0)
        ### threeDattn_weights (1, 3, 300) (1, batch_size, max_length)
        #threeDencoder_outputs = encoder_outputs.unsqueeze(0)
        ### threeDencoder_outputs (300, 3, 300) (maxLen, batch_size, hidden_size)
        threeDattn_weights = threeDattn_weights.permute(1, 0, 2)
        threeDencoder_outputs = encoder_outputs
        threeDencoder_outputs = threeDencoder_outputs.permute(1, 0, 2)

        attn_applied = torch.bmm(threeDattn_weights, threeDencoder_outputs)
        ### (3,1,300)x(3,300,300) = (3,1,300)
        ### embedded (seq_len, batch_size, feature)

        IntoLinear = torch.cat((output[0], attn_applied.squeeze(1)), 1)
        ### output[0] (batch_size, hidden_size) attn_applied (batch_size, hidden_size)
        #IntoLinear (batch_size, 2*hidden_size)

        output = F.log_softmax(self.out(IntoLinear), dim=1)
        return output, hidden, attn_weights
Example #18
def discovery(embedding, vocab, chord_a, chord_b, chord_c, num_output=10):
    a_id = vocab.get_token_index(chord_a)
    b_id = vocab.get_token_index(chord_b)
    c_id = vocab.get_token_index(chord_c)
    vec_a = embedding.weight[a_id]
    vec_b = embedding.weight[b_id]
    vec_c = embedding.weight[c_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    vec = vec_b - vec_a + vec_c

    for index, token in vocab.get_index_to_token_vocabulary().items():
        sim = cosine(vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_output)
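Hypothetical usage of the analogy query above (the chord names are placeholders; the real vocabulary may differ): "chord_a is to chord_b as chord_c is to ?".

# e.g. "C is to G as F is to ?" -- the analogous chord should appear near the top of the list
for chord, score in discovery(embedding, vocab, 'C', 'G', 'F', num_output=5):
    print(chord, round(score, 3))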
Example #19
def evaluate(model, eval_feature_path, enrollment_path, eval_path,
             annotation_path):
    model.eval()
    _, grp_embeddings = enrollment(model, eval_feature_path, enrollment_path)
    indices, _ = readTestPaths(eval_path)  # maps group ID -> list of file names
    cosine_similarity = CosineSimilarity(dim=1)
    cos_similarity = {}
    for key, value in indices.items():  # group ID -> list of file names
        for path in value:
            out = calculateOneEmbedding(model, eval_feature_path,
                                        path)  # test embedding
            cosine = cosine_similarity(
                out, grp_embeddings[key]
            )  # grp_embeddings[key] is the corresponding enroll embeddings
            cos_similarity[path] = max(cosine).item()  # keep the largest similarity; higher is better
    accuracy, threshold = acc(cos_similarity, annotation_path)
    print('ACCURACY: ', accuracy)
    return accuracy, threshold
Example #20
def verify(model, data_loader, sub_name, test=False):
    model.eval()

    cos_sim = CosineSimilarity(dim=1)
    cosine_similarity = torch.Tensor([])
    true_similarity = torch.Tensor([])
    if not test:
        for i, (x, y) in enumerate(data_loader):
            img1 = x[0].to(device)
            img2 = x[1].to(device)
            y = y.to("cpu")

            out1 = model(img1)[0].to("cpu")
            out2 = model(img2)[0].to("cpu")

            cosine_similarity = torch.cat(
                (cosine_similarity.detach(), cos_sim(out1, out2).detach()), 0)
            true_similarity = torch.cat((true_similarity, y), 0)

            del x, y, img1, img2, out1, out2
            torch.cuda.empty_cache()
        model.train()
        try:
            AUC = roc_auc_score(
                true_similarity.type(torch.DoubleTensor),
                cosine_similarity.type(torch.DoubleTensor).detach().numpy())
            return AUC
        except Exception as e:
            print(e)
            return -1
    else:
        for i, (x) in enumerate(data_loader):
            img1 = x[0].to(device)
            img2 = x[1].to(device)

            out1 = model(img1)[0].to("cpu")
            out2 = model(img2)[0].to("cpu")

            cosine_similarity = torch.cat(
                (cosine_similarity.detach(), cos_sim(out1, out2).detach()), 0)
            if i % 1000 == 0:
                print("Verification", i, end='\r')
        model.train()
        return write_submission(sub_name, cosine_similarity)
Example #21
class TestEncoder:
    cosine = CosineSimilarity(dim=-1)

    # The base model will take longer than the small model, which triggers a test timing error.
    # Turn off deadlines to avoid this.
    @settings(deadline=None)
    @given(sphereize=booleans())
    def test_encoder(self, inputs: List[str], inputs_filepath: Path,
                     encoder: Encoder, sphereize: bool) -> None:
        # The relative ranking should not change if sphereize is True/False, so run tests with both.
        encoder._sphereize = sphereize

        # Run three distinct tests, which should cover all use cases of Encoder:
        #  1. A List[str] input where batch_size is not None.
        embeddings = encoder(inputs, batch_size=len(inputs))
        embeddings = torch.from_numpy(embeddings)
        # These are hard-coded examples that should have the highest cosine similarity.
        assert torch.topk(self.cosine(embeddings[2], embeddings),
                          k=2)[-1][-1].item() == 3
        assert torch.topk(self.cosine(embeddings[6], embeddings),
                          k=2)[-1][-1].item() == 7

        #  2. A str input where batch_size is None. Check that the expected UserWarning is raised.
        embeddings = []
        for text in inputs:
            if sphereize:
                with pytest.warns(UserWarning):
                    embeddings.append(encoder(text, batch_size=None))
            else:
                embeddings.append(encoder(text, batch_size=None))
        embeddings = torch.as_tensor(embeddings).squeeze(1)
        assert torch.topk(self.cosine(embeddings[2], embeddings),
                          k=2)[-1][-1].item() == 3
        assert torch.topk(self.cosine(embeddings[6], embeddings),
                          k=2)[-1][-1].item() == 7

        #  3. A filepath input that points to file with one example per line.
        embeddings = encoder(inputs_filepath, batch_size=len(inputs))
        embeddings = torch.from_numpy(embeddings)
        assert torch.topk(self.cosine(embeddings[2], embeddings),
                          k=2)[-1][-1].item() == 3
        assert torch.topk(self.cosine(embeddings[6], embeddings),
                          k=2)[-1][-1].item() == 7
Example #22
    def forward(self, x1, x2, labels):
        """
        Args:
            x: feature matrix with shape (batch_size, feat_dim).
            labels: ground truth labels with shape (batch_size).
        """
        #         batch_size = x1.shape[0]
        #         dist = torch.zeros(batch_size)
        #         for i in range(batch_size):
        #             dist[i] = torch.norm(x1[i]-x2[i])

        dist = CosineSimilarity(dim=1)(x1, x2)

        # total_loss = labels*dist + (1-labels)*(self.margin-dist) # for euclidean distance
        total_loss = (1 - labels) * dist + labels * (self.margin - dist)

        loss = total_loss.mean()

        return loss
Example #23
def get_top_k(query_embedding, queried_embeddings, k, distance):
    """Returns the distances and indices of the k nearest embeddings in the `queried_embeddings` tensor to the
    `query_embedding` tensor.

    Args:
      query_embedding: tensor with the embedding of the query image.
      queried_embeddings: tensor with the stacked embeddings of the queried dataset.
      k: the number of most similar images to be returned.
      distance: which distance function to be used for nearest neighbor computation. Either 'cosine' or 'pairwise'

    Returns:
      the closest k embeddings in the `embeddings` tensor to the `query_embedding`. A 2-tuple with shape `[k]`
      tensor with their distances and indices are returned (respectively).

    """
    p_dist = PairwiseDistance(
        p=2) if distance == "pairwise" else CosineSimilarity()
    distances = p_dist(queried_embeddings, query_embedding)
    # For cosine similarity the nearest neighbors have the largest values;
    # for pairwise (L2) distance they have the smallest.
    return torch.topk(distances, k, largest=(distance != "pairwise"))
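A minimal sketch of calling it (shapes are assumptions; the query is kept 2-D so that, on a PyTorch version where `nn.CosineSimilarity` broadcasts its inputs, it is compared against every queried row):

import torch

queried = torch.randn(1000, 256)    # assumed stacked dataset embeddings
query = torch.randn(1, 256)         # single query embedding
values, indices = get_top_k(query, queried, k=5, distance="cosine")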
Example #24
def verify(model, data_loader, sub_name, test=False):
    model.eval()

    cos_sim = CosineSimilarity(dim=1)
    cosine_similarity = torch.Tensor([])
    true_similarity = torch.Tensor([])
    if not test:
        for i, (x, y) in enumerate(data_loader):
            img1 = x[0].to(device)
            img2 = x[1].to(device)
            y = y.to("cpu")

            out1 = model(img1).to("cpu")
            out2 = model(img2).to("cpu")

            cosine_similarity = torch.cat(
                (cosine_similarity.detach(), cos_sim(out1, out2).detach()), 0)
            true_similarity = torch.cat((true_similarity, y), 0)

            del x, y, img1, img2, out1, out2
            torch.cuda.empty_cache()
            if i % 10 == 0:
                print("Verification on validation set:",
                      i * batchsize,
                      end='\r')

        AUC = roc_auc_score(true_similarity,
                            cosine_similarity.detach().numpy())
        return AUC
    else:
        for i, (x) in enumerate(data_loader):
            img1 = x[0].to(device)
            img2 = x[1].to(device)

            out1 = model(img1).to("cpu")
            out2 = model(img2).to("cpu")

            cosine_similarity = torch.cat(
                (cosine_similarity.detach(), cos_sim(out1, out2).detach()), 0)
            if i % 10 == 0:
                print("Verification on test set:", i * batchsize, end='\r')
        return write_submission(sub_name, cosine_similarity)
Example #25
def get_related(token: str,
                embedding: Model,
                vocab: Vocabulary,
                num_related: int = 20):
    """Given a token, return a list of top 20 most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    # embedding.weight is the embedding lookup matrix (it may hold pretrained vectors).
    token_vec = embedding.weight[token_id]
    # dim=0 computes a plain cosine similarity between two 1-D vectors.
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary(
            'token_in').items():
        # Cosine similarity of our token vector with every other word vector in the vocabulary
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim  #save the value of cosine similarity

    return sims.most_common(num_related)
Example #26
def get_eigenvector_decomposition_magnitude_indv(eigenvectors, eigenvalues, X,
                                                 correction_mean):
    '''
    Magnitude of the cosine similarity between each mean-corrected sample and
    each eigenvector, whitened by the square root of the corresponding eigenvalue.
    '''
    whitened_cos_dists = []
    ranks = []

    with torch.no_grad():
        # Correct by pre-calculated authentic data mean
        X = X - correction_mean.repeat(X.size(0), 1)

        cos = CosineSimilarity(dim=1)
        for i in range(eigenvectors.size(0)):
            ranks.append(i)
            v = eigenvectors[i]
            v_repeat = v.repeat(X.size(0), 1)
            abs_cos_dist = torch.abs(cos(X, v_repeat))
            whitened_abs_cos_dist = abs_cos_dist / (eigenvalues[i]**0.5)
            whitened_cos_dists.append(whitened_abs_cos_dist)
        whitened_cos_dists = torch.stack(whitened_cos_dists, dim=1)

    return ranks, whitened_cos_dists
Example #27
def predict(model, test_feature_path, enrollment_path, test_path, threshold):
    model.cpu()
    _, grp_embeddings = enrollment(model, test_feature_path, enrollment_path)
    indices, _ = readTestPaths(test_path)  # maps group ID -> list of file names
    cosine_similarity = CosineSimilarity(dim=1)
    cos_similarity = {}
    for key, value in indices.items():  # group ID -> list of file names
        for path in value:
            out = calculateOneEmbedding(model, test_feature_path, path)
            cosine = cosine_similarity(out, grp_embeddings[key])
            cos_similarity[path] = max(cosine).item()
    groupid = []
    fileid = []
    ismember = []
    results = {}
    for idx, (k, v) in enumerate(cos_similarity.items()):
        groupid.append(idx // 100)
        fileid.append(k)
        ismember.append('Y' if v > threshold else 'N')
    results['GroupID'] = groupid
    results['FileID'] = fileid
    results['IsMember'] = ismember
    results = pd.DataFrame(results)
    results.to_csv('results.csv', index=False)
Example #28
def evaluate_embeddings(embedding, vocab: Vocabulary):
    cosine = CosineSimilarity(dim=0)
    simlex999 = read_simlex999()
    sims_pred = []
    oov_count = 0
    for word1, word2, sim in simlex999:
        word1_id = vocab.get_token_index(word1, 'token_in')  # ID of word1
        if word1_id == 1:  # index 1 means the word is out of vocabulary (OOV)
            sims_pred.append(0.)
            oov_count += 1
            continue
        word2_id = vocab.get_token_index(word2, 'token_in')  # ID of word2
        if word2_id == 1:
            sims_pred.append(0.)
            oov_count += 1
            continue

        # Cosine similarity between word1 and word2 is the predicted similarity.
        sim_pred = cosine(embedding.weight[word1_id],
                          embedding.weight[word2_id]).item()
        sims_pred.append(sim_pred)

    assert len(sims_pred) == len(simlex999)  # one prediction per SimLex-999 pair
    print('# of OOV words: {} / {}'.format(oov_count, len(simlex999)))
    print(pearsonr(sims_pred, [sim for _, _, sim in simlex999]))
    # scipy.stats.spearmanr compares the two sets of similarities: it returns the
    # Spearman rank-order correlation coefficient and the p-value for testing non-correlation.
    return spearmanr(sims_pred, [sim for _, _, sim in simlex999])
Example #29
import numpy as np
from itertools import islice
from collections import deque
import matplotlib
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import proj3d
import matplotlib.cm as cm
from torch.nn import CosineSimilarity
from sty import fg, bg, ef, rs, RgbFg
from sklearn.preprocessing import MinMaxScaler
import syntok.segmenter as segmenter
from docx import Document
from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings

document = Document()  ## Create a python-docx document
cos = CosineSimilarity(dim=1, eps=1e-6)

sent_level = False
dynamic = True
graph = False
doc_embeddings = []
scores = []

stacked_embeddings = DocumentPoolEmbeddings([
    WordEmbeddings('en'),
    #WordEmbeddings('glove'),
    #WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
    #BertEmbeddings('bert-base-cased'),
    #FlairEmbeddings('news-forward-fast'),
    #FlairEmbeddings('news-backward-fast'),
    #OpenAIGPTEmbeddings()
])
Example #30
def predict(model,
            eval_dataloader,
            output_dir,
            eval_features,
            args,
            cur_train_mean_loss=None,
            logger=None,
            compute_metrics=True,
            eval_script_path='../MeasEval/eval/measeval-eval.py',
            only_parts=''):

    only_parts = [part for part in only_parts.split('+') if part]
    model.eval()
    syns = sorted(model.local_config['syns'])
    device = torch.device(
        'cuda') if model.local_config['use_cuda'] else torch.device('cpu')

    metrics = defaultdict(float)
    nb_eval_steps = 0

    syns_preds = []

    for batch_id, batch in enumerate(
            tqdm(eval_dataloader,
                 total=len(eval_dataloader),
                 desc='validation ... ')):

        batch = tuple([elem.to(device) for elem in batch])

        input_ids, input_mask, token_type_ids, b_syn_labels, b_positions = batch

        with torch.no_grad():
            loss, syn_logits = model(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=input_mask,
                                     input_labels={
                                         'syn_labels': b_syn_labels,
                                         'positions': b_positions
                                     })

        if compute_metrics:
            for key, value in loss.items():
                metrics[f'eval_{key}_loss'] += value.mean().item()

        nb_eval_steps += 1
        if model.local_config['loss'] != 'cosine_similarity':
            syns_preds.append(syn_logits.detach().cpu().numpy())
        else:
            syns_preds.append(CosineSimilarity()(
                syn_logits[0], syn_logits[1]).detach().cpu().numpy())

    syns_scores = np.concatenate(syns_preds,
                                 axis=0)  # n_examples x 2 or n_examples
    if syns_scores.shape[-1] != 1 and model.local_config[
            'loss'] != 'cosine_similarity':
        syns_preds = np.argmax(syns_scores, axis=1)  # n_examples
    elif model.local_config['loss'] == 'cosine_similarity':
        syns_preds = np.zeros(syns_scores.shape, dtype=int)
        syns_preds[syns_scores >= 0.5] = 1
    else:
        syns_preds = np.zeros(syns_scores.shape, dtype=int)
        if model.local_config['train_scd']:
            syns_preds[syns_scores >= 3.0] = 1
        else:
            syns_preds[syns_scores > 0.5] = 1

    predictions = defaultdict(lambda: defaultdict(list))
    golds = defaultdict(lambda: defaultdict(list))
    scores = defaultdict(lambda: defaultdict(list))
    gold_scores = defaultdict(lambda: defaultdict(list))
    lemmas = defaultdict(lambda: defaultdict(list))

    syn_ids_to_label = {0: 'F', 1: 'T'}
    for ex_id, (ex_feature, ex_syn_preds, ex_scores) in enumerate(
            zip(eval_features, syns_preds, syns_scores)):
        example = ex_feature.example
        docId = example.docId
        posInDoc = int(docId.split('.')[-1])
        docId = '.'.join(docId.split('.')[:-1])
        syn_pred = syn_ids_to_label[ex_syn_preds.item()]
        predictions[docId][posInDoc].append(syn_pred)
        golds[docId][posInDoc].append(example.label)
        # scores for positive class
        if model.local_config['loss'] == 'cosine_similarity':
            scores[docId][posInDoc].append(ex_scores)
        elif len(ex_scores) > 1:
            scores[docId][posInDoc].append(softmax(ex_scores)[-1])
        else:
            scores[docId][posInDoc].append(ex_scores[0])
        gold_scores[docId][posInDoc].append(example.score)
        lemmas[docId][posInDoc].append((example.lemma, example.grp))

    if os.path.exists(output_dir):
        os.system(f'rm -r {output_dir}/*')
    else:
        os.makedirs(output_dir, exist_ok=True)

    print(f'saving predictions for: {only_parts}')
    for docId, doc_preds in predictions.items():
        doc_scores = scores[docId]
        if len(only_parts) > 0 and all([
                f'{docId.split(".")[1]}.score' not in part
                for part in only_parts
        ]):
            continue
        print(f'saving predictions for part: {docId}')
        prediction = [{
            'id': f'{docId}.{pos}',
            'tag': 'F' if 'F' in doc_preds[pos] else 'T'
        } for pos in sorted(doc_preds)]
        prediction_file = os.path.join(output_dir, docId)
        json.dump(prediction, open(prediction_file, 'w'))
        prediction = [{
            'id': f'{docId}.{pos}',
            'score': [str(x) for x in doc_scores[pos]]
        } for pos in sorted(doc_preds)]
        prediction_file = os.path.join(output_dir, f'{docId}.scores')
        json.dump(prediction, open(prediction_file, 'w'))

    if compute_metrics:
        for key in metrics:
            metrics[key] /= nb_eval_steps
        mean_non_english = []
        for docId, doc_preds in predictions.items():
            if 'scd' in docId:
                doc_golds = gold_scores[docId]
                doc_lemmas = lemmas[docId]
                doc_scores = scores[docId]

                keys = sorted(list(doc_golds.keys()))
                # print(doc_lemmas)
                unique_lemmas = sorted(
                    set([
                        doc_lemmas[key][0][0] for key in keys
                        if doc_lemmas[key][0][1] == 'COMPARE'
                    ]))
                y_true, y_pred = [], []
                y_sent_true, y_sent_pred = [], []
                for unique_lemma in unique_lemmas:
                    unique_lemma_keys = [
                        key for key in keys
                        if doc_lemmas[key][0][0] == unique_lemma
                        and doc_lemmas[key][0][1] == 'COMPARE'
                    ]
                    unique_word_scores_pred = [
                        np.array(doc_scores[key]).mean()
                        for key in unique_lemma_keys
                    ]
                    unique_word_scores_true = [
                        doc_golds[key][0] for key in unique_lemma_keys
                    ]
                    y_true.append(np.array(unique_word_scores_true).mean())
                    y_pred.append(np.array(unique_word_scores_pred).mean())
                    y_sent_true.extend(unique_word_scores_true)
                    y_sent_pred.extend(unique_word_scores_pred)
                # print(y_true, y_pred)
                # metrics[f'spearman.{docId}.score'], _ = spearmanr(y_true, y_pred)
                # metrics[f'spearman.{docId}.pairwise'], _ = spearmanr(y_sent_true, y_sent_pred)
                metrics[f'spearman.{docId}.wordwise.score'], _ = spearmanr(
                    y_true, y_pred)
                metrics[f'spearman.{docId}.score'], _ = spearmanr(
                    y_sent_true, y_sent_pred)
                doc_golds = golds[docId]
                keys = list(doc_golds.keys())
                doc_golds = [doc_golds[key][0] for key in keys]
                doc_preds = [
                    'F' if 'F' in doc_preds[key] else 'T' for key in keys
                ]
                metrics[f'{docId}.accuracy'] = accuracy_score(
                    doc_golds, doc_preds)
            else:
                doc_golds = golds[docId]
                keys = list(doc_golds.keys())
                doc_golds = [doc_golds[key][0] for key in keys]
                doc_preds = [
                    'F' if 'F' in doc_preds[key] else 'T' for key in keys
                ]
                metrics[f'accuracy.{docId}.score'] = accuracy_score(
                    doc_golds, doc_preds)
                if 'en-en' not in docId:
                    mean_non_english.append(metrics[f'accuracy.{docId}.score'])
        if mean_non_english:
            metrics[f'accuracy.{docId.split(".")[0]}.nen-nen.score'] = sum(
                mean_non_english) / len(mean_non_english)

        if cur_train_mean_loss is not None:
            metrics.update(cur_train_mean_loss)
    else:
        metrics = {}

    model.train()

    return metrics