Code Example #1
    def eval(self, write_flag=False):
        with torch.no_grad():

            self.n.eval()

            start_time = time.time()
            for i, (x, target) in enumerate(self.eval_data_loader):
                # measure data loading time
                # print("data time: " + str(time.time() - start_time))

                # compute output
                x = x.to(DEVICE)
                target = target.to(DEVICE)
                output = self.n(x)
                predictions = output.data.squeeze_(1).squeeze_().cpu().numpy()
                # predictions = output.data.max(1)[1].squeeze_(1).squeeze_(0).cpu().numpy()
                if i == 0:
                    predictions_all = predictions
                else:
                    predictions_all = np.concatenate((predictions_all, predictions))

                if i == 0:
                    gts_all = target.data.squeeze_().cpu().numpy()
                else:
                    gts_all = np.concatenate((gts_all, target.data.squeeze_().cpu().numpy()))

            acc = jaccard(predictions_all, gts_all)
            print('Validation set = Acc: ' + str(acc) + ' | time: ' + str(time.time() - start_time))
        if write_flag:
            ffname = opt.outd + 'UNet_val_accuracies.txt'
            with open(ffname, 'a') as f:
                f.write(str(acc) + '\n')
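
Here `jaccard(predictions_all, gts_all)` scores dense segmentation outputs against ground-truth masks. The project's own `jaccard` is not shown; a minimal sketch of such a mask-IoU metric, assuming soft predictions and binary ground-truth masks, could look like this:

import numpy as np

def jaccard(pred, gt, threshold=0.5):
    # Binarize the soft predictions, then compute intersection-over-union
    # of the foreground pixels over the whole array.
    p = (pred > threshold).astype(bool)
    g = gt.astype(bool)
    intersection = np.logical_and(p, g).sum()
    union = np.logical_or(p, g).sum()
    return intersection / union if union > 0 else 1.0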
Code Example #2
def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    
    if idx_end < idx_start:
        idx_end = idx_start
    
    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "

    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    if sentiment_val != "neutral" and verbose:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
Code Example #3
File: anchors.py  Project: XingLiuJia/retina_net
    def encode_argmax(self, target):
        if len(target) == 0:
            return torch.zeros(self.anchors.shape[0], 5)

        ious = jaccard(target[:, :4], corner_form(self.anchors))
        max_iou, iou_idxs = ious.max(dim=0)

        if (max_iou >= self.argmax_pos_thresh).sum() == 0:
            return torch.zeros(self.anchors.shape[0], 5)

        boxes = center_form(target[:, :4])[iou_idxs]

        xy = 10 * (boxes[:, :2] - self.anchors[:, :2]) / self.anchors[:, 2:]
        wh = 5 * torch.log(boxes[:, 2:] / self.anchors[:, 2:])

        target_boxes = torch.cat([xy, wh], dim=1)

        labels = torch.zeros(target_boxes.shape[0], 1)

        labels[max_iou >= self.argmax_pos_thresh,
               0] = target[:,
                           -1][iou_idxs[max_iou >= self.argmax_pos_thresh]] + 1
        labels[(max_iou > self.argmax_neg_thresh)
               & (max_iou < self.argmax_pos_thresh)] = -1

        # If it doesn't have a high enough threshold, still give it a label if it is the nearest anchor
        _, idxs = ious.max(dim=1)
        labels[idxs, 0] = target[:, -1] + 1

        return torch.cat([target_boxes, labels], dim=1)
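
`jaccard(target[:, :4], corner_form(self.anchors))` above computes the pairwise IoU matrix between ground-truth boxes and anchors, both in corner form. The repository's helper is not shown; a minimal sketch assuming (x1, y1, x2, y2) corner boxes:

import torch

def jaccard(boxes_a, boxes_b):
    # boxes_a: [N, 4], boxes_b: [M, 4], both in corner form (x1, y1, x2, y2).
    # Returns an [N, M] matrix of pairwise intersection-over-union values.
    max_xy = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])
    min_xy = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])
    inter_wh = (max_xy - min_xy).clamp(min=0)
    inter = inter_wh[..., 0] * inter_wh[..., 1]
    area_a = ((boxes_a[:, 2] - boxes_a[:, 0]) *
              (boxes_a[:, 3] - boxes_a[:, 1]))[:, None]
    area_b = ((boxes_b[:, 2] - boxes_b[:, 0]) *
              (boxes_b[:, 3] - boxes_b[:, 1]))[None, :]
    return inter / (area_a + area_b - inter)

With this shape convention, `ious.max(dim=0)` in `encode_argmax` yields, for each anchor, its best-matching ground-truth box.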
Code Example #4
def eval_fn(model, dataset):
    """Evaluate on the eval dataset and return the jaccard metric."""

    # `data` (the raw eval examples fed to TweetDataset) is assumed to be
    # defined at module level; the tf.data pipeline gets its own name so that
    # it does not shadow it.
    tf_data = tf.data.Dataset.from_generator(
        dataset.TweetDataset(data, config.TOKENIZER, config.MAX_LEN).gen,
        output_types=dataset.gen_str).batch(config.VALID_BATCH_SIZE)

    def get_text(text, pred):

        pred_texts = []
        orig_texts = []
        text = text.numpy()
        pred = tf.argmax(pred, axis=1).numpy()

        for t, p in zip(text, pred):
            orig_texts.append(t.decode("utf-8"))
            t = config.TOKENIZER.encode(orig_texts[-1]).offsets
            i, j = p[0], p[1]
            pred_texts.append(orig_texts[-1][t[i][0]:t[j][1]])

        return orig_texts, pred_texts

    scores = []
    for i, (batch, _) in tqdm(enumerate(tf_data)):
        orig_text = batch["orig"]
        ext_text = batch["ext"]
        preds = model.predict(batch)
        targets, pred_texts = get_text(orig_text, preds)
        scores.extend(utils.jaccard(t, p) for t, p in zip(targets, pred_texts))
    score = sum(scores) / len(scores)

    print("Total jaccard score : ", score)
Code Example #5
def calculate_jaccard_score(original_tweet,
                            target_string,
                            sentiment_val,
                            idx_start,
                            idx_end,
                            offsets,
                            verbose=False):
    """
    Calculate the jaccard score from the predicted span and the actual span for a batch of tweets
    """

    # A span's end index has to be greater than or equal to its start index
    # If this doesn't hold, the end index is set to equal the start index (the span is a single token)
    if idx_end < idx_start:
        idx_end = idx_start

    # Combine into a string the tokens that belong to the predicted span
    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]:offsets[ix][1]]
        # If the token is not the last token in the tweet, and the ending offset of the current token is less
        # than the beginning offset of the following token, add a space.
        # Basically, add a space when the next token (word piece) corresponds to a new word
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    # Set the predicted output as the original tweet when the tweet's sentiment is "neutral", or the tweet only contains one word
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    # Calculate the jaccard score between the predicted span, and the actual span
    # The IOU (intersection over union) approach is detailed in the utils module's `jaccard` function:
    # https://www.kaggle.com/abhishek/utils
    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
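
The `utils.jaccard` referenced in the comment above is the word-level Jaccard score from the linked Kaggle utils module; a minimal sketch consistent with that reference:

def jaccard(str1, str2):
    # Word-level Jaccard similarity over lowercased word sets:
    # |intersection| / (|A| + |B| - |intersection|)
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

For example, jaccard("the cat sat", "the cat") evaluates to 2/3.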
Code Example #6
def cal_jaccard(tweet, target, idx_start, idx_end, input_offsets):
    if idx_end < idx_start:
        idx_end = idx_start
    output = ""
    for idx in range(idx_start, idx_end + 1):
        output += tweet[input_offsets[idx][0]: input_offsets[idx][1]]
    jac = utils.jaccard(target, output)
    return jac, output
Code Example #7
    def validation_step(self, batch, batch_idx):
        target_text = batch['selected_text']

        preds = self.test_step(batch, batch_idx)
        preds_text = preds["preds"]
        jaccard_score = [
            jaccard(p, t) for p, t in zip(preds_text, target_text)
        ]

        return {"jaccard_score": jaccard_score}
Code Example #8
def calculate_jaccard_score(original_context, target_string, question_val,
                            idx_start, idx_end):

    if idx_end < idx_start:
        idx_end = idx_start

    filtered_output = original_context[idx_start:idx_end + 1]

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
Code Example #9
def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets_start, 
    offsets_end,
    verbose=False):

    offsets = list(zip(offsets_start, offsets_end))
    
    if idx_end < idx_start:
        idx_end = idx_start
    
    filtered_output = ""
    original_tweet_sp = " ".join(original_tweet.split())
    for ix in range(idx_start, idx_end + 1):
        if offsets[ix][0] == 0 and offsets[ix][1] == 0:
            continue
        filtered_output += original_tweet_sp[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "

    filtered_output = filtered_output.replace(" .", ".")
    filtered_output = filtered_output.replace(" ?", "?")
    filtered_output = filtered_output.replace(" !", "!")
    filtered_output = filtered_output.replace(" ,", ",")
    filtered_output = filtered_output.replace(" ' ", "'")
    filtered_output = filtered_output.replace(" n't", "n't")
    filtered_output = filtered_output.replace(" 'm", "'m")
    filtered_output = filtered_output.replace(" do not", " don't")
    filtered_output = filtered_output.replace(" 's", "'s")
    filtered_output = filtered_output.replace(" 've", "'ve")
    filtered_output = filtered_output.replace(" 're", "'re")

    if sentiment_val == "neutral":
        filtered_output = original_tweet

    if sentiment_val != "neutral" and verbose:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac
Code Example #10
    def sup_forward(self, x, y, d_index, hyperparameters):

        self.sup.eval()

        # Encoding content image.
        content, _ = self.gen.encode(x)

        # Forwarding on supervised model.
        y_pred = self.sup(content, only_prediction=True)

        # Computing metrics.
        pred = y_pred.data.max(1)[1].squeeze_(1).squeeze_(0).cpu().numpy()

        jacc, jacc_cup = jaccard(pred, y.cpu().squeeze(0).numpy())

        return jacc, jacc_cup, pred, content
Code Example #11
File: trainer.py  Project: lconet/CoDAGANs
    def sup_forward(self, x, y, d_index, hyperparameters):

        self.sup.eval()

        # Encoding content image.
        one_hot_x = torch.cat([x, self.one_hot_img[d_index, 0].unsqueeze(0)],
                              1)
        hidden, _ = self.gen.encode(one_hot_x)

        # Forwarding on supervised model.
        y_pred = self.sup(hidden, only_prediction=True)

        # Computing metrics.
        pred = y_pred.data.max(1)[1].squeeze_(1).squeeze_(0).cpu().numpy()

        jacc = jaccard(pred, y.cpu().squeeze(0).numpy())

        return jacc, pred, hidden
Code Example #12
def test(X):
    jaccard_scores = []
    for article in X:
        scores = []
        for sentence1, sentence2 in zip(article, article[1:]):
            stopped_sentence1 = utils.remove_stop_words(sentence1)
            stemmed_sentence1 = utils.stem_tokens(stopped_sentence1)
            stopped_sentence2 = utils.remove_stop_words(sentence2)
            stemmed_sentence2 = utils.stem_tokens(stopped_sentence2)

            scores.append(utils.jaccard(stemmed_sentence1, stemmed_sentence2))

        if scores:
            jaccard_scores.append(np.average(scores))
        else:
            jaccard_scores.append(0.1)

    return np.array(jaccard_scores).reshape(len(jaccard_scores), 1)
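
In this example `utils.jaccard` receives token lists (stop-word-filtered, stemmed sentences) rather than raw strings; a plausible set-based sketch, assuming the helper treats its arguments as token collections:

def jaccard(tokens1, tokens2):
    # Set-based Jaccard similarity over two token collections.
    a, b = set(tokens1), set(tokens2)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)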
Code Example #13
File: anchors.py  Project: XingLiuJia/retina_net
    def encode_bipartite(self, target):
        if len(target) == 0:
            return torch.zeros(self.anchors.shape[0], 5)

        ious = jaccard(target[:, :4], corner_form(self.anchors))

        max_iou, iou_idxs = ious.max(dim=1)

        target_boxes = torch.zeros(self.anchors.shape[0], 5)

        anchors = self.anchors[iou_idxs]
        cf = center_form(target[:, :4])
        xy = 10 * (cf[:, :2] - anchors[:, :2]) / anchors[:, 2:]
        wh = 5 * torch.log(cf[:, 2:] / anchors[:, 2:])
        encoded = torch.cat([xy, wh], dim=1)

        target_boxes[iou_idxs] = torch.cat(
            [encoded, target[:, -1].unsqueeze(1) + 1], dim=1)
        return target_boxes
Code Example #14
File: ingestion.py  Project: imclab/cluster-explorer
    def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
        docs = self.corpus.all_docs()

        # new_doc_ids is used to keep from recomputing already known similarities.
        # None is special signal to compute on all doc pairs.
        if new_doc_ids is None:
            new_doc_ids = docs.keys()
    
        with get_similarity_writer(self.corpus.id) as writer:
            i = 0
            for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
                similarity = jaccard(docs[x], docs[y])
                if similarity >= min_similarity:
                    writer.write(x, y, similarity)
                
                i += 1
                if i % 10000000 == 0:
                    writer.flush()
                    sys.stdout.write('.')
                    sys.stdout.flush()
Code Example #15
def calculate_jaccard_score(original_tweet,
                            target_string,
                            sentiment_val,
                            idx_start,
                            idx_end,
                            offsets,
                            verbose=False):

    if idx_end < idx_start:
        idx_end = idx_start

    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]:offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
Code Example #16
    def encode(self, boxes_list):
        '''Convert ground-truth boxes to location offsets and labels.
        :param boxes_list: list of tensors [[num_boxes, 4], ...]; the list length is the batch size
        :return: ious_list: one entry per anchor (list length equals the number of anchors),
                            each tensor is [batchsize, 1, h, w]
                 offsets_list: each tensor is [batchsize, 4, h, w]
        '''
        ious_list = []
        offsets_list = []
        for anchor_idx in range(self.num_anchor):
            yxyx = self._buffers['yxyx_%d' % anchor_idx]
            yxhw = self._buffers['yxhw_%d' % anchor_idx]
            anchor_size = self.anchor_sizes[anchor_idx]
            ious = []
            offsets = []
            for boxes in boxes_list:
                flat_yxyx = th.reshape(yxyx, [-1, 4])
                iou = ut.jaccard(flat_yxyx, boxes)
                iou, idx = th.max(iou, -1)
                iou = th.reshape(iou, [yxyx.shape[0], yxyx.shape[1]])
                ious.append(iou)

                boxes_yxhw = ut.yxyx_to_yxhw(boxes)
                flat_yxhw = th.reshape(yxhw, [-1, 4])
                expand_yxhw = flat_yxhw.unsqueeze(1).expand(
                    flat_yxhw.size(0), boxes_yxhw.size(0), 4)
                offset_array = boxes_yxhw - expand_yxhw
                offset = [offset_array[i, idx[i]] for i in range(idx.size(0))]
                offset = th.stack(offset) / anchor_size
                offset = th.reshape(offset, yxyx.shape)
                offsets.append(offset)
            ious = th.stack(ious, dim=0)
            offsets = th.stack(offsets, dim=0).permute([0, 3, 1, 2])
            ious_list.append(ious)
            offsets_list.append(offsets)
        return ious_list, offsets_list
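
The encoder above relies on `ut.yxyx_to_yxhw` to move between corner-form and center-form boxes. The project's helper is not shown; a minimal sketch assuming (y1, x1, y2, x2) corner boxes:

import torch as th

def yxyx_to_yxhw(boxes):
    # Corner form (y1, x1, y2, x2) -> center form (cy, cx, h, w).
    yx = (boxes[:, :2] + boxes[:, 2:]) / 2.0
    hw = boxes[:, 2:] - boxes[:, :2]
    return th.cat([yx, hw], dim=1)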
Code Example #17
    def test(self, write_flag=False):
        with torch.no_grad():

            self.n.eval()
            acc = []
            start_time = time.time()
            for i, (x, target) in enumerate(self.test_data_loader):
                # measure data loading time
                # print("data time: " + str(time.time() - start_time))

                # compute output
                x = x.to(DEVICE)
                target = target.to(DEVICE)
                gt = target.data.squeeze_().cpu().numpy()
                output = self.n(x)
                output = nn.functional.interpolate(output, size=target.shape, mode='bilinear')
                prediction = output.data.squeeze_(1).squeeze_().cpu().numpy()
                acc.append(jaccard(prediction, gt))

            print('Test set = Acc: ' + str(np.mean(acc)) + ' | time: ' + str(time.time() - start_time))
        if write_flag:
            ffname = opt.outd + 'UNet_accuracies.txt'
            with open(ffname, 'a') as f:
                f.write(str(np.mean(acc)) + '\n')
Code Example #18
import codecs

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import wikipedia
import subprocess as sub  # for running BASH scripts within Python
import matplotlib.pyplot as plt
import numpy as np

lemma = nltk.WordNetLemmatizer()
relArticles = findRelevantArticles("Heart Attack")
articlefilelist = []
wordslist = ['../STEMI_words', '../NSTEMI_words', '../WIKI_words']

for article in relArticles:
    articlefilename = "content_" + str(article) + ".txt"
    with codecs.open(articlefilename, 'w', 'utf-8') as outfile:
        content = wikipedia.page(article).content
        # lemmatize word by word (iterating over the raw string would
        # lemmatize individual characters)
        content = set(lemma.lemmatize(word) for word in content.split())
        for word in content:
            print(word, file=outfile)
    articlefilelist.append(articlefilename)

for piece in wordslist:
    articlefilelist.append(piece)

matrix = np.matrix([[jaccard(i, j) for i in articlefilelist]
                    for j in articlefilelist])
print(matrix)

with open('jaccardVals', 'w') as outfile:
    print(matrix, file=outfile)
Code Example #19
def main():

    # tf flag
    flags = tf.flags
    flags.DEFINE_string(
        "test_data_txt",
        'F:/data_info/VAE_liver/set_5/TFrecord/fold_1/test.txt',
        "test data txt")
    flags.DEFINE_string(
        "indir",
        'G:/experiment_result/liver/VAE/set_5/down/64/alpha_0.1/fold_1/VAE/axis_5/beta_7',
        "input dir")
    flags.DEFINE_string(
        "outdir",
        'G:/experiment_result/liver/VAE/set_5/down/64/alpha_0.1/fold_1/VAE/axis_5/beta_7/rec',
        "outdir")
    flags.DEFINE_integer("model_index", 3300, "index of model")
    flags.DEFINE_string("gpu_index", "0", "GPU-index")
    flags.DEFINE_float("beta", 1.0, "hyperparameter beta")
    flags.DEFINE_integer("num_of_test", 75, "number of test data")
    flags.DEFINE_integer("batch_size", 1, "batch size")
    flags.DEFINE_integer("latent_dim", 5, "latent dim")
    flags.DEFINE_list("image_size", [56, 72, 88, 1], "image size")
    FLAGS = flags.FLAGS

    # check folder
    if not (os.path.exists(FLAGS.outdir)):
        os.makedirs(FLAGS.outdir)

    # read list
    test_data_list = io.load_list(FLAGS.test_data_txt)

    # test step
    test_step = FLAGS.num_of_test // FLAGS.batch_size
    if FLAGS.num_of_test % FLAGS.batch_size != 0:
        test_step += 1

    # load test data
    test_set = tf.data.TFRecordDataset(test_data_list, compression_type='GZIP')
    test_set = test_set.map(
        lambda x: utils._parse_function(x, image_size=FLAGS.image_size),
        num_parallel_calls=os.cpu_count())
    test_set = test_set.batch(FLAGS.batch_size)
    test_iter = test_set.make_one_shot_iterator()
    test_data = test_iter.get_next()

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config(index=FLAGS.gpu_index)) as sess:
        # set network
        kwargs = {
            'sess': sess,
            'outdir': FLAGS.outdir,
            'beta': FLAGS.beta,
            'latent_dim': FLAGS.latent_dim,
            'batch_size': FLAGS.batch_size,
            'image_size': FLAGS.image_size,
            'encoder': encoder_resblock_bn,
            'decoder': decoder_resblock_bn,
            'downsampling': down_sampling,
            'upsampling': up_sampling,
            'is_training': False,
            'is_down': False
        }
        VAE = Variational_Autoencoder(**kwargs)

        sess.run(init_op)

        # testing
        VAE.restore_model(
            os.path.join(FLAGS.indir, 'model',
                         'model_{}'.format(FLAGS.model_index)))
        tbar = tqdm(range(test_step), ascii=True)
        preds = []
        ori = []
        ji = []
        for k in tbar:
            test_data_batch = sess.run(test_data)
            ori_single = test_data_batch
            preds_single = VAE.reconstruction_image(ori_single)
            preds_single = preds_single[0, :, :, :, 0]
            ori_single = ori_single[0, :, :, :, 0]

            preds.append(preds_single)
            ori.append(ori_single)

            # # label
            ji = []
            for j in range(len(preds)):

                # EUDT
                eudt_image = sitk.GetImageFromArray(preds[j])
                eudt_image.SetSpacing([1, 1, 1])
                eudt_image.SetOrigin([0, 0, 0])

                label = np.where(preds[j] > 0.5, 0, 1)
                # label = np.where(preds[j] > 0.5, 1, 0.5)
                label = label.astype(np.int16)
                label_image = sitk.GetImageFromArray(label)
                label_image.SetSpacing([1, 1, 1])
                label_image.SetOrigin([0, 0, 0])

                ori_label = np.where(ori[j] > 0.5, 0, 1)
                ori_label_image = sitk.GetImageFromArray(ori_label)
                ori_label_image.SetSpacing([1, 1, 1])
                ori_label_image.SetOrigin([0, 0, 0])

                # # calculate ji
                ji.append([utils.jaccard(label, ori_label)])

                # output image
                io.write_mhd_and_raw(
                    label_image, '{}.mhd'.format(
                        os.path.join(FLAGS.outdir, 'label',
                                     'recon_{}'.format(j))))

        generalization = np.mean(ji)
        print('generalization = %f' % generalization)

        # # output csv file
        with open(os.path.join(
                FLAGS.outdir,
                'generalization_{}.csv'.format(FLAGS.model_index)),
                  'w',
                  newline='') as file:
            writer = csv.writer(file)
            writer.writerows(ji)
            writer.writerow(['generalization= ', generalization])
Code Example #20
def eval_fn(data_loader, model, device):
    model.eval()

    fin_outputs_start = []
    fin_outputs_end = []
    fin_tweet_tokens = []
    fin_padding_lens = []
    fin_orig_selected = []
    fin_orig_sentiment = []
    fin_orig_tweet = []
    fin_tweet_token_ids = []

    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        tweet_tokens = d['tweet_tokens']
        padding_len = d['padding_len']
        orig_sentiment = d['orig_sentiment']
        orig_selected = d['orig_selected']
        orig_tweet = d['orig_tweet']

        #move everything to appropriate device
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)

        targets_start = targets_start.to(device, dtype=torch.float)
        targets_end = targets_end.to(device, dtype=torch.float)

        o1, o2 = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        # we are not calculating the loss for validation, so it was removed; if you want to calculate it, feel free to do so.

        fin_outputs_end.append(torch.sigmoid(o2).cpu().detach().numpy())
        fin_outputs_start.append(torch.sigmoid(o1).cpu().detach().numpy())
        fin_padding_lens.extend(padding_len.cpu().detach().numpy().tolist())

        fin_tweet_tokens.extend(tweet_tokens)
        fin_orig_sentiment.extend(orig_sentiment)
        fin_orig_selected.extend(orig_selected)
        fin_orig_tweet.extend(orig_tweet)

    # NOTE: how you select the final text is important; now the fun begins
    fin_outputs_start = np.vstack(fin_outputs_start)
    fin_outputs_end = np.vstack(fin_outputs_end)

    threshold = 0.2
    jaccards = []
    #iterate over each prediction
    for j in range(len(fin_tweet_tokens)):
        target_string = fin_orig_selected[j]
        tweet_tokens = fin_tweet_tokens[j]
        padding_len = fin_padding_lens[j]
        original_tweet = fin_orig_tweet[j]
        sentiment = fin_orig_sentiment[j]

        if padding_len > 0:
            mask_start = fin_outputs_start[j, :][:-padding_len] >= threshold
            mask_end = fin_outputs_end[j, :][:-padding_len] >= threshold

        else:
            mask_start = fin_outputs_start[j, :] >= threshold
            mask_end = fin_outputs_end[j, :] >= threshold

        mask = [0] * len(mask_start)
        idx_start = np.nonzero(mask_start)[0]
        idx_end = np.nonzero(mask_end)[0]

        if len(idx_start) > 0:
            idx_start = idx_start[0]
            if len(idx_end) > 0:
                idx_end = idx_end[0]
            else:
                idx_end = idx_start
        else:
            idx_start = 0
            idx_end = 0

        for mj in range(idx_start, idx_end + 1):
            mask[mj] = 1

        # a word of the original tweet is included in the output_tokens list if it falls inside the predicted span
        output_tokens = [
            x for p, x in enumerate(tweet_tokens.split()) if mask[p] == 1
        ]

        # the code above can also leave [CLS] and [SEP] in output_tokens, so remove them
        output_tokens = [
            x for x in output_tokens if x not in ("[CLS]", "[SEP]")
        ]

        final_output = ""
        for ot in output_tokens:
            #make your own rules if you want

            # ----------------- rules start here ----------------
            # if a word was split into word pieces (identified by "##"), glue the piece back
            # onto the previous word, e.g. "youtube" -> "you" + "##tube" -> "youtube"
            if ot.startswith("##"):
                final_output = final_output + ot[2:]

            elif len(ot) == 1 and ot in string.punctuation:
                final_output = final_output + ot

            else:
                final_output = final_output + " " + ot
            # ---------- rules end here ----------

        final_output = final_output.strip()

        if sentiment == 'neutral' or len(original_tweet.split()) < 4:
            final_output = original_tweet

        jac = utils.jaccard(target_string.strip(), final_output.strip())
        jaccards.append(jac)

    mean_jac = np.mean(jaccards)
    return mean_jac
Code Example #21
def main():
    parser = argparse.ArgumentParser(
        description='py, test_data_txt, ground_truth_txt, outdir')

    parser.add_argument('--ground_truth_txt', '-i1', default='')

    parser.add_argument('--model', '-i2', default='./model_{}'.format(50000))

    parser.add_argument('--outdir', '-i3', default='')

    args = parser.parse_args()

    # check folder
    if not (os.path.exists(args.outdir)):
        os.makedirs(args.outdir)

    # tf flag
    flags = tf.flags
    flags.DEFINE_float("beta", 0.1, "hyperparameter beta")
    flags.DEFINE_integer("num_of_generate", 1000, "number of generate data")
    flags.DEFINE_integer("batch_size", 1, "batch size")
    flags.DEFINE_integer("latent_dim", 2, "latent dim")
    flags.DEFINE_list("image_size", [512, 512, 1], "image size")
    FLAGS = flags.FLAGS

    # load ground truth
    ground_truth = io.load_matrix_data(args.ground_truth_txt, 'int32')
    print(ground_truth.shape)

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config) as sess:

        # set network
        kwargs = {
            'sess': sess,
            'outdir': args.outdir,
            'beta': FLAGS.beta,
            'latent_dim': FLAGS.latent_dim,
            'batch_size': FLAGS.batch_size,
            'image_size': FLAGS.image_size,
            'encoder': cnn_encoder,
            'decoder': cnn_decoder
        }
        VAE = Variational_Autoencoder(**kwargs)

        sess.run(init_op)

        # testing
        VAE.restore_model(args.model)
        tbar = tqdm(range(FLAGS.num_of_generate), ascii=True)
        specificity = []
        for i in tbar:
            sample_z = np.random.normal(0, 1.0, (1, FLAGS.latent_dim))
            generate_data = VAE.generate_sample(sample_z)
            generate_data = generate_data[0, :, :, 0]

            # EUDT
            eudt_image = sitk.GetImageFromArray(generate_data)
            eudt_image.SetSpacing([1, 1])
            eudt_image.SetOrigin([0, 0])

            # label
            label = np.where(generate_data > 0, 0, 1)
            label_image = sitk.GetImageFromArray(label)
            label_image.SetSpacing([1, 1])
            label_image.SetOrigin([0, 0])

            # calculate ji
            case_max_ji = 0.
            for image_index in range(ground_truth.shape[0]):
                ji = utils.jaccard(label, ground_truth[image_index])
                if ji > case_max_ji:
                    case_max_ji = ji
            specificity.append([case_max_ji])

            # output image
            io.write_mhd_and_raw(
                eudt_image,
                '{}.mhd'.format(os.path.join(args.outdir, 'EUDT', str(i + 1))))
            io.write_mhd_and_raw(
                label_image,
                '{}.mhd'.format(os.path.join(args.outdir, 'label',
                                             str(i + 1))))

    print('specificity = %f' % np.mean(specificity))

    # output csv file
    with open(os.path.join(args.outdir, 'specificity.csv'), 'w',
              newline='') as file:
        writer = csv.writer(file)
        writer.writerows(specificity)
        writer.writerow(['specificity:', np.mean(specificity)])
Code Example #22
    def __getitem__(self, index):
        """
        Return some image with its meta information and labeled annotations.

        Parameters
        ----------
        index : int
            The index of the image to be returned.

        Returns
        -------
        image : Tensor
            The image at self.images[index], after any optional transforms have been
            applied, as a (w, h, 3) Tensor in the range [0., 1.].
        image_info : dict
            A dictionary object containing meta information about the image.
        target : Tensor
            A Tensor representing the target output of the YOLOv2 network which was
            used to initialise the dataset object.

        """
        dataset, img = self.data[index]
        data = np.load(os.path.join(self.raw_dir[dataset], img + '.npz'))
        signal = data['signal']
        samp_rate = data['samp_rate']
        N_fft = data['N_fft']
        N_overlap = data['N_overlap']
        signal = signal[0] + 1.j * signal[1]
        stft, _, _ = self.stft(signal,
                               N_fft=N_fft,
                               N_overlap=N_overlap,
                               samp_rate=samp_rate)
        if self.mode == 'spectrogram':
            data = np.abs(stft)**2
        elif self.mode == 'spectrogram_db':
            data = 10. * np.log10(np.abs(stft)**2)
        elif self.mode == 'spectrogram_ap':
            data = [np.abs(stft)**2, np.angle(stft)]
        elif self.mode == 'spectrogram_ap_db':
            data = [10. * np.log10(np.abs(stft)**2), np.angle(stft)]
        elif self.mode == 'stft':
            data = np.abs(stft)
        elif self.mode == 'stft_iq':
            data = [stft.real, stft.imag]
        elif self.mode == 'stft_ap':
            data = [np.abs(stft), np.angle(stft)]
        else:
            raise ValueError(
                'Unknown mode. Use one of spectrogram, spectrogram_db, '
                'spectrogram_ap, spectrogram_ap_db, stft, stft_iq or stft_ap.')

        data = torch.tensor(data, dtype=torch.float32)
        if data.ndim == 2:
            data = data[None]
        data = (data - torch.tensor(self.mu)[:, None, None]) / torch.tensor(
            self.sigma)[:, None, None]

        data_info = {
            'id': img,
            'width': data.shape[2],
            'height': data.shape[1],
            'dataset': self.dataset[dataset]
        }

        if self.do_transforms:
            pass
        data_info['padding'] = [0., 0., 0., 0.]
        data_info['scale'] = [1., 1.]

        assert (data.size()[1:] == self.image_size[index]).all()

        if self.return_targets:
            annotations = get_annotations(self.annotations_dir[dataset], img)
            random.shuffle(annotations)
            target = [
                np.zeros((self.grid_sizes[i][index,
                                             1], self.grid_sizes[i][index, 0],
                          self.num_anchors[i] * self.num_features),
                         dtype=np.float32) for i in range(self.num_detectors)
            ]
            cell_dims = np.array([[self.strides[i], self.strides[i]]
                                  for i in range(self.num_detectors)])

            anchors = [torch.zeros((n, 4)) for n in self.num_anchors]
            for i, (a, t) in enumerate(zip(anchors, target)):
                a[:, 2:] = self.anchors[i].clone()
                t[:, np.arange(self.grid_sizes[i][index][0]), 0::self.num_features] = \
                    np.arange(self.grid_sizes[i][index][0])[None, :, None] + 0.5
                t[:, :, 1::self.num_features] = np.arange(
                    self.grid_sizes[i][index][1])[:, None, None] + 0.5
                t[:, :, 2::self.num_features] = a[:, 2]
                t[:, :, 3::self.num_features] = a[:, 3]

            # For each object in image.
            for annotation in annotations:
                name, height, width, xmin, ymin, xmax, ymax, truncated, difficult = annotation
                if (self.skip_truncated and truncated) or (self.skip_difficult
                                                           and difficult):
                    continue
                if name not in self.classes:
                    continue
                if self.do_transforms:
                    pass
                xmin = np.clip(xmin, a_min=1, a_max=self.image_size[index, 0])
                xmax = np.clip(xmax, a_min=1, a_max=self.image_size[index, 0])
                ymin = np.clip(ymin, a_min=1, a_max=self.image_size[index, 1])
                ymax = np.clip(ymax, a_min=1, a_max=self.image_size[index, 1])
                xmin, xmax, ymin, ymax = np.round(xmin), np.round(
                    xmax), np.round(ymin), np.round(ymax)
                if xmax == xmin or ymax == ymin:
                    continue
                xmin /= cell_dims[:, 0]
                xmax /= cell_dims[:, 0]
                ymin /= cell_dims[:, 1]
                ymax /= cell_dims[:, 1]
                if all(xmax - xmin < (SMALL_THRESHOLD * cell_dims[:, 0])):
                    continue
                if all(ymax - ymin < (SMALL_THRESHOLD * cell_dims[:, 1])):
                    continue
                idx = np.floor((xmax + xmin) / 2.), np.floor(
                    (ymax + ymin) / 2.)
                idx = np.array(idx, dtype=int).T

                ground_truth = torch.tensor([xmin, ymin, xmax, ymax],
                                            dtype=torch.float32).t()
                anchors = [
                    torch.zeros((self.num_anchors[i], 4))
                    for i in range(self.num_detectors)
                ]
                for i in range(self.num_detectors):
                    anchors[i][:, 2:] = self.anchors[i].clone()
                    anchors[i][:, 0::2] += xmin[i]
                    anchors[i][:, 1::2] += ymin[i]
                anchors = torch.cat(anchors)
                ious = jaccard(ground_truth, anchors)
                if ious.max() < IOU_MATCH_THRESHOLD:
                    continue
                max_iou = 0.
                cumsum_detectors = np.cumsum([0] + self.num_anchors)
                for i in range(self.num_detectors):
                    if ious[i,
                            cumsum_detectors[i]:cumsum_detectors[i + 1]].max(
                            ) > max_iou:
                        l = i
                        d = ious[
                            i,
                            cumsum_detectors[i]:cumsum_detectors[i +
                                                                 1]].argmax()
                        max_iou = ious[
                            i,
                            cumsum_detectors[i]:cumsum_detectors[i + 1]].max()

                target[l][idx[l][1], idx[l][0],
                          d * self.num_features + 0] = (xmin[l] + xmax[l]) / 2.
                target[l][idx[l][1], idx[l][0],
                          d * self.num_features + 1] = (ymin[l] + ymax[l]) / 2.
                target[l][idx[l][1], idx[l][0],
                          d * self.num_features + 2] = xmax[l] - xmin[l]
                target[l][idx[l][1], idx[l][0],
                          d * self.num_features + 3] = ymax[l] - ymin[l]
                target[l][idx[l][1], idx[l][0], d * self.num_features + 4] = 1.
                target[l][idx[l][1], idx[l][0], d * self.num_features + 5:(d + 1) * self.num_features] = \
                    self.encode_categorical(name)

            target = [
                torch.tensor(target[i]).permute(2, 0, 1)
                for i in range(self.num_detectors)
            ]

            return data, data_info, target
        else:
            return data, data_info
Code Example #23
def find_best_anchors(classes,
                      root_dir,
                      dataset,
                      k=5,
                      max_iter=20,
                      skip_truncated=True,
                      init=(13, 13),
                      weighted=True,
                      multi_scale=False,
                      device='cuda'):

    annotations_dir = [os.path.join(r, 'Annotations') for r in root_dir]
    sets_dir = [os.path.join(r, 'ImageSets', 'Main') for r in root_dir]

    images = []

    for d in range(len(dataset)):
        for cls in classes:
            file = os.path.join(sets_dir[d],
                                '{}_{}.txt'.format(cls, dataset[d]))
            with open(file) as f:
                for line in f:
                    image_desc = line.split()
                    if image_desc[1] == '1':
                        images.append((d, image_desc[0]))

    images = list(set(images))
    bboxes = []

    for image in images:
        annotations = get_annotations(annotations_dir[image[0]], image[1])
        for annotation in annotations:
            name, height, width, xmin, ymin, xmax, ymax, truncated, difficult = annotation
            if skip_truncated and truncated:
                continue
            width = (xmax - xmin) / width
            height = (ymax - ymin) / height
            if multi_scale:
                for i in [2. * d + 1 for d in range(4, 10)]:
                    bboxes.append([0., 0., i * width, i * height])
            else:
                bboxes.append([0., 0., 13. * width, 13. * height])

    bboxes = torch.tensor(bboxes, dtype=torch.float64, device=device)
    # anchors = [[0, 0, 3, 3],
    #            [0, 0, 4, 3],
    #            [0, 0, 5, 3],
    #            [0, 0, 4, 4],
    #            [0, 0, 5, 4],
    #            [0, 0, 5, 5],
    #            [0, 0, 6, 5],
    #            [0, 0, 10, 5],
    #            [0, 0, 13, 5]]
    anchors = torch.tensor(([0., 0., init[0], init[1]] * np.random.random(
        (k, 4))).astype(dtype=np.float64),
                           device=device)
    # anchors = torch.tensor(anchors, dtype=torch.float64, device=device)

    for _ in range(max_iter):
        ious = jaccard(bboxes, anchors)
        iou_max, idx = torch.max(ious, dim=1)
        for i in range(k):
            if weighted:
                weights = (torch.tensor([1.], device=device) -
                           iou_max[idx == i, None])**10
                anchors[i] = torch.sum(bboxes[idx == i] * weights,
                                       dim=0) / torch.sum(
                                           weights)  # Weighted k-means

            else:
                anchors[i] = torch.mean(bboxes[idx == i],
                                        dim=0)  # Normal k-means

        sort = torch.argsort(anchors[:, 2], dim=0)
        anchors = anchors[sort]

    return anchors[:, 2:]
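
A hypothetical call, assuming VOC-style annotation folders under root_dir (the paths and class names below are illustrative only):

# Hypothetical usage: estimate k=5 anchor-box priors on a 13x13 grid.
anchors = find_best_anchors(classes=['car', 'person'],
                            root_dir=['VOCdevkit/VOC2007'],
                            dataset=['trainval'],
                            k=5,
                            device='cpu')
print(anchors)  # [k, 2] tensor of anchor (width, height) in grid-cell units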
Code Example #24
options, args = parser.parse_args()

command_line_arguments = [command.strip().lower() for command in options.pipeline.split()]
pipeline = [getattr(tech, command) for command in command_line_arguments]

if os.path.isdir(options.input):
	input_filenames = [os.path.join(options.input,filename.strip().lower()) for filename in os.listdir(options.input)]
	input_filenames = [filename for filename in input_filenames if filename.endswith('.txt')]
else:
	input_filenames  = [filename.strip().lower() for filename in options.input.split()]
data = {basename(filename):tech.filestream_to_word_list(open(filename,'rb')) for filename in input_filenames}
#Analysis methods are pairwise

#Calculate Jaccard similarity

keys = list(data.keys())
jaccard_similarity = np.zeros((len(keys), len(keys)))
for j in range(jaccard_similarity.shape[1]):
    for i in range(j):
        jaccard_similarity[i, j] = tech.jaccard(data[keys[i]], data[keys[j]])

jaccard_similarity += jaccard_similarity.transpose()
jaccard_similarity[np.diag_indices_from(jaccard_similarity)] = 1

np.savetxt('../data/jaccard_similarity.tsv',jaccard_similarity,fmt='%.04f',header = ' '.join(keys))

fig, ax  = plt.subplots()
ax = sns.heatmap(jaccard_similarity, annot=True, fmt='.02f', square = True,
					  xticklabels = keys, yticklabels=keys)
plt.tight_layout()
plt.savefig('../graphs/jaccard_similarity.png')
Code Example #25
def experiment(data, box, cv, output):
    """
    Write the results of an experiment.
        This function will run an experiment for a specific dataset for a bounding box. 
        There will be CV runs of randomized experiments run and the outputs will be 
        written to a file. 

        Parameters
        ----------
        data : string
            Dataset name.
            
        box : string 
            Bounding box on the file name.
        cv : int 
            Number of cross validation runs. 
            
        output : string
            Path of the .npz output file that the experiment results are written to.
                    
        Returns
        -------
        None
            
        Raises
        ------
        ValueError
            If the percent poison exceeds the number of samples in the requested data.
    """
    #data, box, cv, output = 'conn-bench-sonar-mines-rocks', '1', 5, 'results/test.npz'

    # load normal and adversarial data
    path_adversarial_data = 'data/attacks/' + data + '_[xiao][' + box + '].csv'
    df_normal = pd.read_csv('data/clean/' + data + '.csv', header=None).values
    df_adversarial = pd.read_csv(path_adversarial_data, header=None).values

    # separate out the normal and adversarial data
    Xn, yn = df_normal[:, :-1], df_normal[:, -1]
    Xa, ya = df_adversarial[:, :-1], df_adversarial[:, -1]

    # change the labels from +/-1 to [0,1]
    ya[ya == -1], yn[yn == -1] = 0, 0

    # calculate the ratios of data that would be used for training and hold out
    p0, p1 = 1. / cv, (1. - 1. / cv)
    N = len(Xn)
    # calculate the total number of training and testing samples and set the number of
    # features that are going to be selected
    Ntr, Nte = int(p1 * N), int(p0 * N)
    n_selected_features = int(Xn.shape[1] * SEL_PERCENT) + 1

    # zero the results out : err_jaccard and err_kuncheva are 9x4 matrices
    err_jaccard, err_kuncheva = np.zeros((NPR, NALG)), np.zeros((NPR, NALG))
    # For M3 (KNN classification error) analysis: err_KNN_norm has just one row (1x4) because it only contains normal data
    # err_KNN_pois is a 9x4 matrix
    err_KNN_norm, err_KNN_pois = np.zeros((1, NALG)), np.zeros((NPR, NALG))

    # Empty lists that will hold feature sets for all npr
    MIM_fset = []
    MIFS_fset = []
    MRMR_fset = []
    JMI_fset = []

    # creating list of empty lists
    for n in range(NPR):
        MIM_fset.append([])
        MIFS_fset.append([])
        MRMR_fset.append([])
        JMI_fset.append([])

    # run `cv` randomized experiments. note this is not performing cross-validation, rather
    # we are going to use randomized splits of the data.
    for _ in range(cv):
        # shuffle up the data for the experiment then split the data into a training and
        # testing dataset
        i = np.random.permutation(N)
        Xtrk, ytrk, Xtek, ytek = Xn[i][:Ntr], yn[i][:Ntr], Xn[i][Nte:], yn[i][
            Nte:]
        # run feature selection on the baseline dataset without an adversarial data. this
        # will serve as the baseline. use a parallel assignment to speed things up.
        sf_base_jmi, sf_base_mim, sf_base_mrmr, sf_base_mifs = run_feature_selection(
            Xtrk, ytrk, n_selected_features)

        Xtr_mim = Xtrk[:, sf_base_mim]
        Xtr_mifs = Xtrk[:, sf_base_mifs]
        Xtr_mrmr = Xtrk[:, sf_base_mrmr]
        Xtr_jmi = Xtrk[:, sf_base_jmi]

        Xte_mim = Xtek[:, sf_base_mim]
        Xte_mifs = Xtek[:, sf_base_mifs]
        Xte_mrmr = Xtek[:, sf_base_mrmr]
        Xte_jmi = Xtek[:, sf_base_jmi]

        # err_KNN_norm table gives us the classification accuracy score of feature selection
        # algorithms performed on untainted data, that can be used for further analysis
        err_KNN_norm[0, 0] += err_KNN_classification(Xtr_mim, ytrk, Xte_mim,
                                                     ytek)
        err_KNN_norm[0, 1] += err_KNN_classification(Xtr_mifs, ytrk, Xte_mifs,
                                                     ytek)
        err_KNN_norm[0, 2] += err_KNN_classification(Xtr_mrmr, ytrk, Xte_mrmr,
                                                     ytek)
        err_KNN_norm[0, 3] += err_KNN_classification(Xtr_jmi, ytrk, Xte_jmi,
                                                     ytek)

        # loop over the number of poisoning ratios that we need to evaluate
        for n in range(NPR):

            # calculate the number of poisoned samples needed to make sure the
            # poisoning ratio is correct in the training data. e.g., with N=100
            # samples and a 20% poisoning ratio, the 20% is taken relative to the
            # training-set size, so it is not simply 20 samples.
            Np = int(len(ytrk) * POI_RNG[n] + 1)
            if Np >= len(ya):
                # shouldn't happen but catch the case where we are requesting more poison
                # data samples than are available. NEED TO BE CAREFUL WHEN WE ARE CREATING
                # THE ADVERSARIAL DATA
                raise ValueError(
                    'Number of poison data requested is larger than the available data.'
                )

            # find the number of normal samples (i.e., not poisoned) samples in the
            # training data. then create the randomized data set that has Nn normal data
            # samples and Np adversarial samples in the training data
            Nn = len(ytrk) - Np
            idx_normal, idx_adversarial = np.random.permutation(len(ytrk))[:Nn], \
                                          np.random.permutation(len(ya))[:Np]
            Xtrk_poisoned, ytrk_poisoned = np.concatenate((Xtrk[idx_normal], Xa[idx_adversarial])), \
                                           np.concatenate((ytrk[idx_normal], ya[idx_adversarial]))

            # run feature selection with the training data that has adversarial samples
            sf_adv_jmi, sf_adv_mim, sf_adv_mrmr, sf_adv_mifs = run_feature_selection(
                Xtrk_poisoned, ytrk_poisoned, n_selected_features)

            Xtrk_poisoned_MIM = Xtrk_poisoned[:, sf_adv_mim]
            Xtrk_poisoned_MIFS = Xtrk_poisoned[:, sf_adv_mifs]
            Xtrk_poisoned_MRMR = Xtrk_poisoned[:, sf_adv_mrmr]
            Xtrk_poisoned_JMI = Xtrk_poisoned[:, sf_adv_jmi]

            Xtest_MIM = Xtek[:, sf_adv_mim]
            Xtest_MIFS = Xtek[:, sf_adv_mifs]
            Xtest_MRMR = Xtek[:, sf_adv_mrmr]
            Xtest_JMI = Xtek[:, sf_adv_jmi]

            # calculate the accumulated jaccard and kuncheva performances for each of the
            # feature selection algorithms
            err_jaccard[n, 0] += jaccard(sf_adv_mim, sf_base_mim)
            err_jaccard[n, 1] += jaccard(sf_adv_mifs, sf_base_mifs)
            err_jaccard[n, 2] += jaccard(sf_adv_mrmr, sf_base_mrmr)
            err_jaccard[n, 3] += jaccard(sf_adv_jmi, sf_base_jmi)

            err_kuncheva[n, 0] += kuncheva(sf_adv_mim, sf_base_mim,
                                           Xtrk.shape[1])
            err_kuncheva[n, 1] += kuncheva(sf_adv_mifs, sf_base_mifs,
                                           Xtrk.shape[1])
            err_kuncheva[n, 2] += kuncheva(sf_adv_mrmr, sf_base_mrmr,
                                           Xtrk.shape[1])
            err_kuncheva[n, 3] += kuncheva(sf_adv_jmi, sf_base_jmi,
                                           Xtrk.shape[1])

            # err_KNN_pois table gives the classification accuracy score of feature selection
            # algorithms performed on poisoned data
            err_KNN_pois[n,
                         0] += err_KNN_classification(Xtrk_poisoned_MIM,
                                                      ytrk_poisoned, Xtest_MIM,
                                                      ytek)
            err_KNN_pois[n,
                         1] += err_KNN_classification(Xtrk_poisoned_MIFS,
                                                      ytrk_poisoned,
                                                      Xtest_MIFS, ytek)
            err_KNN_pois[n,
                         2] += err_KNN_classification(Xtrk_poisoned_MRMR,
                                                      ytrk_poisoned,
                                                      Xtest_MRMR, ytek)
            err_KNN_pois[n,
                         3] += err_KNN_classification(Xtrk_poisoned_JMI,
                                                      ytrk_poisoned, Xtest_JMI,
                                                      ytek)

            # Store the feature sets selected under poisoning for each algorithm
            MIM_fset[n].append(sf_adv_mim)
            MIFS_fset[n].append(sf_adv_mifs)
            MRMR_fset[n].append(sf_adv_mrmr)
            JMI_fset[n].append(sf_adv_jmi)

    MIM_stability_score = comb_kuncheva(MIM_fset, 2, cv, Xtrk.shape[1])
    MIFS_stability_score = comb_kuncheva(MIFS_fset, 2, cv, Xtrk.shape[1])
    MRMR_stability_score = comb_kuncheva(MRMR_fset, 2, cv, Xtrk.shape[1])
    JMI_stability_score = comb_kuncheva(JMI_fset, 2, cv, Xtrk.shape[1])

    feature_stability = np.column_stack(
        (MIM_stability_score, MIFS_stability_score, MRMR_stability_score,
         JMI_stability_score))

    # scale the kuncheva and jaccard statistics by 1.0/cv then write the output file
    err_jaccard, err_kuncheva = err_jaccard / cv, err_kuncheva / cv
    err_KNN_pois, err_KNN_norm = err_KNN_pois / cv, err_KNN_norm / cv
    np.savez(output,
             M1=feature_stability,
             err_jaccard=err_jaccard,
             M2=err_kuncheva,
             M3_pois=err_KNN_pois,
             M3_norm=err_KNN_norm)

    return None
Code Example #26
File: darknet.py  Project: LlewellynS96/darktorch
    def loss(self, predictions, targets, stats):
        assert type(predictions) == list
        loss = {}
        for i, (p, t) in enumerate(zip(predictions, targets)):
            assert p.shape == t.shape

            l = {}
            batch_size = t.shape[0]

            t = t.permute(0, 2, 3, 1)
            p = p.permute(0, 2, 3, 1)

            t = t.contiguous().view(batch_size, -1, self.num_features)
            p = p.contiguous().view(batch_size, -1, self.num_features)

            img_idx = torch.arange(batch_size,
                                   dtype=torch.float,
                                   device=self.device)
            img_idx = img_idx.reshape(-1, 1) * p.shape[2]
            t[:, :, 0] += 2. * img_idx
            p[:, :, 0] += 2. * img_idx
            img_idx = torch.arange(batch_size,
                                   dtype=torch.float,
                                   device=self.device)
            img_idx = img_idx.reshape(-1, 1) * p.shape[1]
            t[:, :, 1] += 2. * img_idx
            p[:, :, 1] += 2. * img_idx

            t = t.contiguous().view(-1, self.num_features)
            p = p.contiguous().view(-1, self.num_features)

            obj_mask = torch.nonzero(t[:, 4]).flatten()
            num_obj = len(obj_mask)

            if obj_mask.numel() > 0:
                p_xyxy = xywh2xyxy(p[:, :4].detach())
                t_xyxy = xywh2xyxy(t[obj_mask, :4])

                all_ious = jaccard(p_xyxy, t_xyxy)
                ious, _ = torch.max(all_ious, dim=1)
                stats['avg_obj_iou'].append(
                    all_ious[obj_mask].diag().mean().item())

                mask = torch.nonzero(ious > self.noobj_iou_threshold).squeeze()
                t[mask, 4] = 1.
                noobj_mask = torch.nonzero(t[:, 4] == 0.).squeeze()

                l['coord'] = nn.MSELoss(reduction='sum')(p[obj_mask, 0],
                                                         t[obj_mask, 0])
                l['coord'] += nn.MSELoss(reduction='sum')(p[obj_mask, 1],
                                                          t[obj_mask, 1])
                l['coord'] += nn.MSELoss(reduction='sum')(torch.sqrt(
                    p[obj_mask, 2]), torch.sqrt(t[obj_mask, 2]))
                l['coord'] += nn.MSELoss(reduction='sum')(torch.sqrt(
                    p[obj_mask, 3]), torch.sqrt(t[obj_mask, 3]))
                l['coord'] *= LAMBDA_COORD / batch_size

                if self.iteration * self.batch_size < 12800:
                    l['bias'] = nn.MSELoss(reduction='sum')(p[noobj_mask, 0],
                                                            t[noobj_mask, 0])
                    l['bias'] += nn.MSELoss(reduction='sum')(p[noobj_mask, 1],
                                                             t[noobj_mask, 1])
                    l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(
                        p[noobj_mask, 2]), torch.sqrt(t[noobj_mask, 2]))
                    l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(
                        p[noobj_mask, 3]), torch.sqrt(t[noobj_mask, 3]))

                    l['bias'] *= 0.1 / batch_size

                p[obj_mask, 5:] = F.log_softmax(p[obj_mask, 5:], dim=-1)
                t_long = torch.argmax(t[obj_mask, 5:], dim=1)
                if USE_CROSS_ENTROPY:
                    l['class'] = nn.NLLLoss(reduction='sum')(p[obj_mask, 5:],
                                                             t_long)
                else:
                    l['class'] = nn.MSELoss(reduction='sum')(torch.exp(
                        p[obj_mask, 5:]), t[obj_mask, 5:])
                l['class'] *= LAMBDA_CLASS / batch_size
                stats['avg_class'].append(
                    torch.exp(p[obj_mask, 5 + t_long]).mean().item())

                # l['object'] = nn.MSELoss(reduction='sum')(p[obj_mask, 4],
                #                                           all_ious[obj_mask, torch.arange(num_obj)].detach())
                l['object'] = nn.MSELoss(reduction='sum')(p[obj_mask, 4],
                                                          t[obj_mask, 4])
                l['object'] *= LAMBDA_OBJ / batch_size
                stats['avg_pobj'].append(p[obj_mask, 4].mean().item())

                l['no_object'] = nn.MSELoss(reduction='sum')(p[noobj_mask, 4],
                                                             t[noobj_mask, 4])
                l['no_object'] *= LAMBDA_NOOBJ / batch_size
                stats['avg_pnoobj'].append(p[noobj_mask, 4].mean().item())
            else:
                l['object'] = torch.tensor([0.], device=self.device)
                l['coord'] = torch.tensor([0.], device=self.device)
                l['class'] = torch.tensor([0.], device=self.device)
                l['no_object'] = LAMBDA_NOOBJ / batch_size * nn.MSELoss(
                    reduction='sum')(p[:, 4], t[:, 4])
                if self.iteration * self.batch_size < 12800:
                    l['bias'] = nn.MSELoss(reduction='sum')(p[:, 0], t[:, 0])
                    l['bias'] += nn.MSELoss(reduction='sum')(p[:, 1], t[:, 1])
                    l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[:,
                                                                          2]),
                                                             torch.sqrt(t[:,
                                                                          2]))
                    l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[:,
                                                                          3]),
                                                             torch.sqrt(t[:,
                                                                          3]))
                    l['bias'] *= 0.1 / batch_size

            l['total'] = (l['coord'] + l['class'] + l['object'] +
                          l['no_object'])
            for k, v in l.items():
                try:
                    loss[k] = loss[k] + v
                except KeyError:
                    loss[k] = v

        return loss, stats
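
The `jaccard` call above is assumed to return the full pairwise IoU matrix between predicted and target boxes in corner form (x1, y1, x2, y2). A minimal sketch of such a helper, under that assumption (the name `pairwise_iou` is hypothetical, not necessarily the project's implementation):

import torch

def pairwise_iou(boxes_a, boxes_b):
    # boxes_a: (N, 4), boxes_b: (M, 4), both in corner form.
    # Intersection corners, broadcast to (N, M, 2).
    lt = torch.max(boxes_a[:, None, :2], boxes_b[None, :, :2])
    rb = torch.min(boxes_a[:, None, 2:], boxes_b[None, :, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]                      # (N, M)
    area_a = ((boxes_a[:, 2] - boxes_a[:, 0]) *
              (boxes_a[:, 3] - boxes_a[:, 1]))[:, None]  # (N, 1)
    area_b = ((boxes_b[:, 2] - boxes_b[:, 0]) *
              (boxes_b[:, 3] - boxes_b[:, 1]))[None, :]  # (1, M)
    return inter / (area_a + area_b - inter)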
Code example #28
0
def main(argv):

    # turn off log messages
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.FATAL)

    # check folder
    if not os.path.exists(FLAGS.dir):
        raise Exception("model directory does not exist!")
    if not os.path.exists(os.path.join(FLAGS.dir, 'generalization')):
        os.makedirs(os.path.join(FLAGS.dir, 'generalization'))

    # get tfrecord list
    test_data_list = glob.glob(FLAGS.indir + '/*')

    # test step
    test_step = FLAGS.num_of_test // FLAGS.batch_size
    if FLAGS.num_of_test % FLAGS.batch_size != 0:
        test_step += 1

    # load test data
    test_set = tf.data.TFRecordDataset(test_data_list)
    test_set = test_set.map(lambda x: utils._parse_function_val_test(
        x, image_size=FLAGS.image_size),
                            num_parallel_calls=os.cpu_count())
    test_set = test_set.batch(FLAGS.batch_size)
    test_iter = test_set.make_one_shot_iterator()
    test_data = test_iter.get_next()

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config(index=FLAGS.gpu_index)) as sess:
        # set network
        kwargs = {
            'sess': sess,
            'latent_dim': FLAGS.latent_dim,
            'scale_lambda': FLAGS.scale_lambda,
            'scale_kappa': FLAGS.scale_kappa,
            'scale_psi': FLAGS.scale_psi,
            'image_size': FLAGS.image_size,
            'points_num': FLAGS.points_num,
            'k_size': FLAGS.k_size,
            'encoder_layer': encoder_layer,
            'points_encoder_layer': points_encoder_layer,
            'generator_layer': generator_layer,
            'discriminator_layer': discriminator_layer,
            'code_discriminator_layer': code_discriminator_layer,
            'is_training': False
        }

        Model = conditional_alphaGAN(**kwargs)

        sess.run(init_op)

        # print parameters
        utils.cal_parameter()

        # test
        Model.restore_model(FLAGS.dir +
                            '/model/model_{}'.format(FLAGS.model_index))
        tbar = tqdm(range(test_step), ascii=True)
        for i in tbar:
            test_image_batch, test_points_batch, test_label_batch = sess.run(
                test_data)
            reconstruction_batch = Model.reconstruction(
                test_image_batch, test_points_batch)

            # dilation of points
            test_points_batch = tf.keras.layers.MaxPooling3D(
                pool_size=5, strides=1, padding='same')(test_points_batch)
            test_points_batch = test_points_batch.eval()
            test_points_batch = test_points_batch * 2  # scaling

            if i == 0:
                test_label = np.asarray(test_label_batch)
                reconstruction = np.asarray(reconstruction_batch)[0]
                points = np.asarray(test_points_batch)
            else:
                test_label = np.concatenate(
                    (test_label, np.asarray(test_label_batch)), axis=0)
                reconstruction = np.concatenate(
                    (reconstruction, np.asarray(reconstruction_batch)[0]),
                    axis=0)
                points = np.concatenate((points, np.array(test_points_batch)),
                                        axis=0)

        # calculate Jaccard Index and output images
        generalization = []
        tbar = tqdm(range(reconstruction.shape[0]), ascii=True)
        for i in tbar:
            test_label_single = test_label[i][:, :, :, 0]
            reconstruction_single = reconstruction[i][:, :, :, 0]
            points_single = points[i][:, :, :, 0]

            # label
            rec_label = np.where(reconstruction_single > 0.5, 0, 1)
            rec_label = rec_label.astype(np.int8)

            # calculate ji
            generalization.append(
                [utils.jaccard(rec_label, test_label_single)])

            # label and points
            label_and_points = rec_label + points_single

            label_and_points = label_and_points.astype(np.int8)

            # output image
            io.write_mhd_and_raw(reconstruction_single,
                                 '{}.mhd'.format(
                                     os.path.join(FLAGS.dir, 'generalization',
                                                  'logodds',
                                                  'generate_{}'.format(i))),
                                 spacing=[1, 1, 1],
                                 origin=[0, 0, 0],
                                 compress=True)
            io.write_mhd_and_raw(rec_label,
                                 '{}.mhd'.format(
                                     os.path.join(FLAGS.dir, 'generalization',
                                                  'predict',
                                                  'recon_{}'.format(i))),
                                 spacing=[1, 1, 1],
                                 origin=[0, 0, 0],
                                 compress=True)
            io.write_mhd_and_raw(label_and_points,
                                 '{}.mhd'.format(
                                     os.path.join(FLAGS.dir, 'generalization',
                                                  'label_and_points',
                                                  'generate_{}'.format(i))),
                                 spacing=[1, 1, 1],
                                 origin=[0, 0, 0],
                                 compress=True)

        print('generalization = %f' % np.mean(generalization))

        # write csv
        io.write_csv(
            generalization,
            os.path.join(FLAGS.dir, 'generalization',
                         'generalization_val_{}.csv'.format(
                             FLAGS.model_index)), 'generalization')
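
`utils.jaccard(rec_label, test_label_single)` above is assumed to compute the voxel-wise Jaccard index between two binary label volumes. A minimal NumPy sketch under that assumption:

import numpy as np

def binary_jaccard(label_a, label_b):
    # Voxel-wise Jaccard index |A and B| / |A or B| over binary masks.
    a = np.asarray(label_a).astype(bool)
    b = np.asarray(label_b).astype(bool)
    union = np.logical_or(a, b).sum()
    if union == 0:
        return 1.0  # convention for two empty masks (an assumption here)
    return np.logical_and(a, b).sum() / union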
Code example #29
0
File: predict_gen.py Project: yuki3-18/beta-VAE
def main():
    parser = argparse.ArgumentParser(
        description='py, test_data_txt, model, outdir')

    parser.add_argument('--test_data_txt', '-i1', default='')

    parser.add_argument('--model', '-i2', default='./model_{}'.format(50000))

    parser.add_argument('--outdir', '-i3', default='')

    args = parser.parse_args()

    # check folder
    if not (os.path.exists(args.outdir)):
        os.makedirs(args.outdir)

    # tf flag
    flags = tf.flags
    flags.DEFINE_float("beta", 0.1, "hyperparameter beta")
    flags.DEFINE_integer("num_of_test", 100, "number of test data")
    flags.DEFINE_integer("batch_size", 1, "batch size")
    flags.DEFINE_integer("latent_dim", 2, "latent dim")
    flags.DEFINE_list("image_size", [512, 512, 1], "image size")
    FLAGS = flags.FLAGS

    # read list
    test_data_list = io.load_list(args.test_data_txt)

    # test step
    test_step = FLAGS.num_of_test // FLAGS.batch_size
    if FLAGS.num_of_test % FLAGS.batch_size != 0:
        test_step += 1

    # load test data
    test_set = tf.data.TFRecordDataset(test_data_list)
    test_set = test_set.map(
        lambda x: _parse_function(x, image_size=FLAGS.image_size),
        num_parallel_calls=os.cpu_count())
    test_set = test_set.batch(FLAGS.batch_size)
    test_iter = test_set.make_one_shot_iterator()
    test_data = test_iter.get_next()

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config) as sess:

        # set network
        kwargs = {
            'sess': sess,
            'outdir': args.outdir,
            'beta': FLAGS.beta,
            'latent_dim': FLAGS.latent_dim,
            'batch_size': FLAGS.batch_size,
            'image_size': FLAGS.image_size,
            'encoder': cnn_encoder,
            'decoder': cnn_decoder
        }
        VAE = Variational_Autoencoder(**kwargs)

        sess.run(init_op)

        # testing
        VAE.restore_model(args.model)
        tbar = tqdm(range(test_step), ascii=True)
        preds = []
        ori = []
        for k in tbar:
            test_data_batch = sess.run(test_data)
            ori_single = test_data_batch
            preds_single = VAE.reconstruction_image(ori_single)
            preds_single = preds_single[0, :, :, 0]
            ori_single = ori_single[0, :, :, 0]

            preds.append(preds_single)
            ori.append(ori_single)

        # label
        ji = []
        for j in range(len(preds)):

            # EUDT
            eudt_image = sitk.GetImageFromArray(preds[j])
            eudt_image.SetSpacing([1, 1])
            eudt_image.SetOrigin([0, 0])

            label = np.where(preds[j] > 0, 0, 1)
            label_image = sitk.GetImageFromArray(label)
            label_image.SetSpacing([1, 1])
            label_image.SetOrigin([0, 0])

            ori_label = np.where(ori[j] > 0, 0, 1)
            ori_label_image = sitk.GetImageFromArray(ori_label)
            ori_label_image.SetSpacing([1, 1])
            ori_label_image.SetOrigin([0, 0])

            # calculate ji
            ji.append(utils.jaccard(label, ori_label))

            # output image
            io.write_mhd_and_raw(
                eudt_image, '{}.mhd'.format(
                    os.path.join(args.outdir, 'EUDT', 'recon_{}'.format(j))))
            io.write_mhd_and_raw(
                label_image, '{}.mhd'.format(
                    os.path.join(args.outdir, 'label', 'recon_{}'.format(j))))

    generalization = np.mean(ji)
    print('generalization = %f' % generalization)

    # output csv file
    with open(os.path.join(args.outdir, 'generalization.csv'), 'w',
              newline='') as file:
        writer = csv.writer(file)
        writer.writerows([[v] for v in ji])  # one Jaccard value per row
        writer.writerow(['generalization= ', generalization])
Code example #30
0
def evaluate(epoch, dataloader, model, criterion, device, prefix=None):
    model.eval()
    with torch.no_grad():
        jaccard_sum, loss_sum, count = 0.0, 0.0, 0.0
        K = 5
        samples = {
            'positive': {
                'best': [],
                'worst': [],
            },
            'negative': {
                'best': [],
                'worst': [],
            },
            'neutral': {
                'best': [],
                'worst': [],
            },
        }

        pbar = tqdm(
            desc='{}Eval Batch'.format('' if prefix is None else prefix + ' '),
            total=len(dataloader),
            leave=False)

        for i, batch in enumerate(dataloader):

            tweet = batch['tweet'].to(device)
            selection = batch['selection'].long().to(device)
            raw_selection = batch['raw_selection']
            raw_tweet = batch['raw_tweet']
            sentiment = batch['sentiment']
            start = batch['start'].long().to(device)
            end = batch['end'].long().to(device)

            pos = batch['pos'].long().to(device)
            offsets = batch['offsets']

            non_pad_elements = selection.shape[1] - \
                (selection == -1).sum(dim=1)
            y_hat_start, y_hat_end = model(tweet, pos)

            loss = criterion(y_hat_start, start, y_hat_end, end, selection)

            loss_sum += loss.data.item()

            y_hat_start = torch.argmax(y_hat_start, dim=1)
            y_hat_end = torch.argmax(y_hat_end, dim=1)

            final = []

            for j, t in enumerate(tweet):
                s = offsets[j][y_hat_start[j]][0]
                e = offsets[j][y_hat_end[j]][1]
                final.append(raw_tweet[j][s:e])

            for j, raw in enumerate(raw_selection):
                selection_output = final[j]
                jacc = jaccard(raw, selection_output)
                jaccard_sum += jacc / tweet.shape[0]

                if len(samples[sentiment[j]]['best']) < K:
                    samples[sentiment[j]]['best'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                elif jacc > samples[sentiment[j]]['best'][0][0]:
                    samples[sentiment[j]]['best'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                    samples[sentiment[j]]['best'].sort(key=lambda x: x[0])
                    samples[sentiment[j]]['best'].pop(0)

                if len(samples[sentiment[j]]['worst']) < K:
                    samples[sentiment[j]]['worst'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                elif jacc < samples[sentiment[j]]['worst'][-1][0]:
                    samples[sentiment[j]]['worst'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                    samples[sentiment[j]]['worst'].sort(key=lambda x: x[0])
                    samples[sentiment[j]]['worst'].pop(-1)

            count += 1
            pbar.update()
        pbar.clear()
        pbar.close()
        return loss_sum / count, jaccard_sum / count, samples
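
The `jaccard(raw, selection_output)` call compares two strings; for span-selection scoring it is commonly the word-level Jaccard over whitespace-split token sets. A sketch under that assumption (hypothetical; the imported `jaccard` may differ):

def word_jaccard(str1, str2):
    # Jaccard similarity over lowercased word sets.
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a & b
    denom = len(a) + len(b) - len(c)
    return len(c) / denom if denom > 0 else 0.0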
Code example #31
0
def train_fn(model, selected_model, dataloaders_dict, criterion, optimizer,
             num_epochs, filename):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0

            tk0 = tqdm(dataloaders_dict[phase],
                       total=len(dataloaders_dict[phase]))
            for data in tk0:
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].to(device)
                end_idx = data['end_idx'].to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    if selected_model == 'LSTM':
                        start_logits, end_logits = model(ids)
                    elif selected_model == 'RoBERTa':
                        start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx,
                                     end_idx)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    epoch_loss += loss.item() * len(ids)
                    start_idx = start_idx.cpu().detach().numpy()
                    end_idx = end_idx.cpu().detach().numpy()
                    start_logits = torch.softmax(start_logits,
                                                 dim=1).cpu().detach().numpy()
                    end_logits = torch.softmax(end_logits,
                                               dim=1).cpu().detach().numpy()

                    for i in range(len(ids)):
                        start_pred = np.argmax(start_logits[i])
                        end_pred = np.argmax(end_logits[i])
                        pred = utils.get_selected_text(tweet[i], start_pred,
                                                       end_pred, offsets[i])
                        true = utils.get_selected_text(tweet[i], start_idx[i],
                                                       end_idx[i], offsets[i])
                        jaccard_score = utils.jaccard(pred, true)
                        epoch_jaccard += jaccard_score
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_jaccard = epoch_jaccard / len(
                dataloaders_dict[phase].dataset)

            print(
                'Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                    epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))
    torch.save(model.state_dict(), filename)
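
The `criterion(start_logits, end_logits, start_idx, end_idx)` signature suggests independent losses on the start and end heads. A minimal sketch of such a criterion, assuming plain cross-entropy on each head (hypothetical; the actual criterion is constructed outside this snippet):

import torch.nn as nn

def span_criterion(start_logits, end_logits, start_idx, end_idx):
    # Cross-entropy on each position head, summed.
    ce = nn.CrossEntropyLoss()
    return ce(start_logits, start_idx) + ce(end_logits, end_idx)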
Code example #32
0
def eval_fn(data_loader, model, device):
    model.eval()
    fin_output_start = []
    fin_output_end = []
    fin_padding_lens = []
    fin_tweet_tokens = []
    fin_orig_sentiment = []
    fin_orig_selected = []
    fin_orig_tweet = []

    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        tweet_tokens = d["tweet_tokens"]
        padding_len = d["padding_len"]
        orig_sentiment = d["orig_sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)

        o1, o2 = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        fin_output_start.append(torch.sigmoid(o1).cpu().detach().numpy())
        fin_output_end.append(torch.sigmoid(o2).cpu().detach().numpy())
        fin_padding_lens.extend(padding_len.cpu().detach().numpy().tolist())

        fin_tweet_tokens.extend(tweet_tokens)
        fin_orig_sentiment.extend(orig_sentiment)
        fin_orig_selected.extend(orig_selected)
        fin_orig_tweet.extend(orig_tweet) 

    fin_output_start = np.vstack(fin_output_start)
    fin_output_end = np.vstack(fin_output_end)

    threshold = 0.2
    jaccards = []
    for j in range(len(fin_tweet_tokens)):
        target_string = fin_orig_selected[j]
        target_tokens = fin_tweet_tokens[j]
        padding_len = fin_padding_lens[j]
        original_tweet = fin_orig_tweet[j]
        sentiment = fin_orig_sentiment[j]

        if padding_len > 0:
            mask_start = fin_output_start[j, :][:-padding_len] >= threshold
            mask_end = fin_output_end[j, :][:-padding_len] >= threshold
        else:
            mask_start = fin_output_start[j, :] >= threshold
            mask_end = fin_output_end[j, :] >= threshold
        mask = [0] * len(mask_start)
        idx_start = np.nonzero(mask_start)[0]
        idx_end = np.nonzero(mask_end)[0]

        if len(idx_start) > 0:
            idx_start = idx_start[0]
            if len(idx_end) > 0:
                idx_end = idx_end[0]
            else:
                idx_end = idx_start

        else:
            idx_start = 0
            idx_end = 0

        for mj in range(idx_start, idx_end + 1):
            mask[mj] = 1

        output_tokens = [x for p, x in enumerate(target_tokens.split()) if mask[p] == 1]
        output_tokens = [x for x in output_tokens if x not in ("[CLS]", "[SEP]")]

        final_output = ""
        for ot in output_tokens:
            if ot.startswith("##"):
                final_output = final_output + ot[2:]
            elif len(ot) == 1 and ot in string.punctuation:
                final_output = final_output + ot
            else:
                final_output = final_output + " " + ot
        final_output = final_output.strip()

        if sentiment == "neutral" or len(original_tweet.split()) < 4:
            final_output = original_tweet

        jac = utils.jaccard(target_string.strip(), final_output.strip())
        jaccards.append(jac)
    mean_jac = np.mean(jaccards)
    return mean_jac
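
For reference, the subword-merging loop above reassembles WordPiece tokens into plain text. A small standalone run of the same logic (illustrative input only):

import string

tokens = ["great", "##ness", ",", "truly"]
final_output = ""
for ot in tokens:
    if ot.startswith("##"):
        final_output += ot[2:]
    elif len(ot) == 1 and ot in string.punctuation:
        final_output += ot
    else:
        final_output += " " + ot
print(final_output.strip())  # -> "greatness, truly"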

#--- CALCULATE JACCARD SIMILARITY

source_rubric = [[source for source in sources]
                 for _ in sources]


filenames = ['jaccard-similarity-%s' % disease for disease in keywords]
filenames += ['jaccard-similarities.json']


if not all([os.path.isfile(filename) for filename in filenames]):
    jaccard_matrices = {disease: np.zeros((len(sources), len(sources)))
                        for disease in keywords}
    for disease in keywords:
        jaccard_matrices[disease] = np.array(
            [[tech.jaccard(corpus[sources[i]][disease],
                           corpus[sources[j]][disease])
              for i in range(len(sources))]
             for j in range(len(sources))])

        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.imshow(jaccard_matrices[disease], interpolation='nearest',
                        aspect='equal', vmin=0, vmax=1)

        ax.set_xticks(range(len(sources)))
        ax.set_yticks(range(len(sources)))

        ax.set_xticklabels(list(map(tech.format, sources)))
        ax.set_yticklabels(list(map(tech.format, sources)))
        cbar = plt.colorbar(cax)
        cbar.set_label(tech.format('Jaccard Similarity'))