def one_same_one_different(self):
        """Check jaccard() on masks that share one cell and differ in one cell.

        ``a`` marks a single cell. Each comparison mask marks that same cell
        plus exactly one neighbour, so under the usual |A & B| / |A | B|
        definition the expected index is 1 / 2 = 0.5.

        NOTE(review): this method has no ``test_`` prefix, so default
        pytest/unittest discovery will presumably not collect it -- confirm
        how it is invoked.
        """
        a = np.zeros((10, 10))
        b = np.array(a, copy=True)
        a[0, 0] = 1

        # Neighbour to the right of the shared cell. Indexing the row
        # explicitly matters: ``b[0:2] = 1`` would mark two entire rows
        # (20 cells) instead of two cells.
        b[0, 0:2] = 1
        assert np.isclose(jaccard(a, b), 0.5)

        # Neighbour below the shared cell; assert against this mask rather
        # than re-checking ``b``.
        b_downwards = np.zeros_like(a)
        b_downwards[:2, 0] = 1
        assert np.isclose(jaccard(a, b_downwards), 0.5)
Exemple #2
0
def mostSimilar(songs):
    """
        Finds the two songs that are the most
        similar according to their jaccard indices

        Every unordered pair of distinct songs is scored exactly once with
        jac.jaccard. A score replaces the current best only when it is
        strictly greater, so on ties the first pair in the dictionary's
        iteration order is kept.

        Parameters:
            songs (dict{string:set}): dictionary with
                         song ids and scores
                        (after being converted into a set of
                        tuples)
        Return:
            tuple (most, mostSongs) containing the highest jaccard
                    index, and a list with the ids of the
                    two songs that produced it. If no pair scores
                    above 0 (for example with fewer than two songs)
                    the result is (0, []).
    """
    from itertools import combinations

    most = 0
    mostSongs = []

    # Score each unordered pair once instead of visiting both (a, b) and
    # (b, a). This assumes jac.jaccard is symmetric, as the Jaccard index
    # is by definition.
    for song1, song2 in combinations(songs, 2):
        val = jac.jaccard(songs[song1], songs[song2])
        if val > most:
            most = val
            mostSongs = [song1, song2]

    return (most, mostSongs)
def eval_fn(data_loader, model, device, tokenizer=TOKENIZER):
    """Run the model over ``data_loader`` and return the mean Jaccard score.

    Each batch ``d`` must provide the keys ``'ids'``, ``'token_type_ids'``,
    ``'mask'``, ``'orig_selected'`` and ``'padding_len'``. The model is called
    as ``model(ids=..., mask=..., token_type_ids=...)`` and must return two
    outputs (start scores, end scores); a sigmoid is applied to both.

    For every sample the first three positions and the trailing
    ``padding_len`` positions are dropped, the highest-scoring start and end
    positions are selected, the token ids between them (inclusive) are
    decoded with ``tokenizer`` and compared to the stripped reference text
    with ``jaccard``.

    :param data_loader: iterable of batches as described above
    :param model: model to evaluate; switched to eval mode here
    :param device: device the input tensors are moved to
    :param tokenizer: object with ``decode(ids, skip_special_tokens=True)``
    :return: ``np.mean`` of the per-sample Jaccard scores
    """
    model.eval()
    all_ids = []
    start_idx = []
    end_idx = []
    orig_selected = []
    padding_len = []

    # Evaluation only: no_grad avoids building an autograd graph for every
    # forward pass.
    with torch.no_grad():
        for d in data_loader:
            ids = d['ids'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)

            o1, o2 = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            all_ids.append(ids.cpu().detach().numpy())
            start_idx.append(torch.sigmoid(o1).cpu().detach().numpy())
            end_idx.append(torch.sigmoid(o2).cpu().detach().numpy())
            orig_selected.extend(d['orig_selected'])
            padding_len.extend(d['padding_len'])

    start_idx = np.vstack(start_idx)
    end_idx = np.vstack(end_idx)
    all_ids = np.vstack(all_ids)

    jaccards = []

    for i in range(len(start_idx)):
        # Use an explicit stop index: a negative-offset slice such as
        # ``[3:-0]`` is empty, which would discard samples without padding.
        stop = max(0, len(start_idx[i]) - int(padding_len[i]))
        start_logits = start_idx[i][3:stop]
        end_logits = end_idx[i][3:stop]
        this_id = all_ids[i][3:stop]

        if len(start_logits) > 0 and len(end_logits) > 0:
            # Start and end are chosen independently, so the pair with the
            # largest start + end score is simply the two argmaxes
            # (first occurrence on ties).
            idx1 = int(np.argmax(start_logits))
            idx2 = int(np.argmax(end_logits))
            this_id = this_id[idx1:idx2 + 1]
        else:
            # Nothing left after trimming: predict an empty span.
            this_id = this_id[:0]

        predicted_text = tokenizer.decode(this_id, skip_special_tokens=True)
        predicted_text = predicted_text.strip()
        sel_text = orig_selected[i].strip()

        jaccards.append(jaccard(predicted_text, sel_text))

    return np.mean(jaccards)
Exemple #4
0
    def calc_IOU(self):
        """ calculate the Intersection over Union AKA Jaccard Index
        between the agent trajectory and the original state

        https://en.wikipedia.org/wiki/Jaccard_index

        Cells of ``self._state`` equal to -1 are taken as the agent
        trajectory; every other cell is ignored. ``self._state`` is not
        modified.

        :return: 0.0 when the state contains no trajectory cell, otherwise
            the value returned by ``jaccard`` for the flattened boolean
            trajectory mask and ``self.original_state``
        """
        # Build the mask by comparison instead of zeroing entries in the
        # result of ravel(): ravel() may return a view, in which case
        # in-place masking would overwrite self._state itself.
        state = self._state.ravel() == -1
        if not state.any():  # no agent trajectory
            print(" no state trajectory found")
            return 0.0
        return jaccard(state, self.original_state)
Exemple #5
0
    def calc_IOU(self):
        """Return the Intersection over Union (Jaccard index) between the
        image of the agent's trajectory and ``self.original_state``.

        https://en.wikipedia.org/wiki/Jaccard_index
        """
        # Only the first ``self.cnt`` agent nodes are rendered; the image is
        # created from scratch (img=None) with val=-1.
        visited_nodes = self._agent_nodes[:self.cnt]
        trajectory_img = locations_to_img(visited_nodes,
                                          self.observation_dims,
                                          img=None,
                                          val=-1)

        # jaccard() yields three values here; only the first one is the IoU.
        scores = jaccard(trajectory_img, self.original_state)
        iou, _, _ = scores
        return iou
Exemple #6
0
    def __init__(
            self,
            directory=None,
            viz=False,
            task=False,
            files_list=None,
            observation_dims=(27, 27, 27),
            multiscale=False,  # FIXME automatic dimensions
            max_num_frames=20,
            saveGif=False,
            saveVideo=False):  # FIXME hardcoded max num frames!
        """
        Set up the environment and start its first episode.

        :param directory: passed to FilesListCubeNPY together with
            files_list to locate the input data
        :param viz: visualization setting, stored unchanged in self.viz
        :param task: training flag, stored in self.task
        :param files_list: passed to FilesListCubeNPY
        :param observation_dims: shape of the observation; must have 2 or 3
            entries - defaults (27,27,27)
        :param multiscale: currently ignored, self.multiscale is forced to
            False
        :param max_num_frames: maximum number of frames (steps) per episode;
            also used as the location history length
        :param saveGif: option to save display as gif
        :param saveVideo: stored in self.saveVideo
        :raises ValueError: if observation_dims has neither 2 nor 3 entries
        """
        super(Brain_Env, self).__init__()

        print(
            "warning! max num frames hard coded to {}!".format(max_num_frames),
            flush=True)

        # inits stat counters
        self.reset_stat()

        # counter to limit number of steps per episodes
        self.cnt = 0
        # maximum number of frames (steps) per episodes
        self.max_num_frames = max_num_frames
        # stores information: terminal, score, distError
        self.info = None
        # option to save display as gif
        self.saveGif = saveGif
        self.saveVideo = saveVideo
        # training flag
        self.task = task
        # image dimension (2D/3D)
        self.observation_dims = observation_dims
        self.dims = len(self.observation_dims)
        # multi-scale agent
        self.multiscale = multiscale
        # FIXME force multiscale false for now
        self.multiscale = False

        # init env dimensions
        if self.dims == 2:
            self.width, self.height = observation_dims
        elif self.dims == 3:
            self.width, self.height, self.depth = observation_dims
        else:
            raise ValueError(
                "observation_dims must have 2 or 3 entries, got {!r}".format(
                    observation_dims))

        with _ALE_LOCK:
            self.rng = get_rng(self)
            # TODO: understand this viz setup
            # visualization setup
            #     if isinstance(viz, six.string_types):  # check if viz is a string
            #         assert os.path.isdir(viz), viz
            #         viz = 0
            #     if isinstance(viz, int):
            #         viz = float(viz)
            self.viz = viz
        #     if self.viz and isinstance(self.viz, float):
        #         self.viewer = None
        #         self.gif_buffer = []
        # stat counter to store current score or accumlated reward
        self.current_episode_score = StatCounter()
        # get action space and minimal action set
        self.action_space = spaces.Discrete(6)  # change number actions here
        self.actions = self.action_space.n
        self.observation_space = spaces.Box(low=0,
                                            high=255,
                                            shape=self.observation_dims,
                                            dtype=np.uint8)
        # history buffer for storing last locations to check oscilations
        self._history_length = max_num_frames
        # TODO initialize _observation_bounds limits from input image coordinates
        self._observation_bounds = ObservationBounds(0, 0, 0, 0, 0, 0)
        # add your data loader here
        # TODO: look into returnLandmarks
        # if self.task == 'play':
        #     self.files = filesListBrainMRLandmark(directory, files_list,
        #                                           returnLandmarks=False)
        # else:
        #     self.files = filesListBrainMRLandmark(directory, files_list,
        #                                           returnLandmarks=True)
        self.files = FilesListCubeNPY(directory, files_list)

        # self.files = filesListFetalUSLandmark(directory,files_list)
        # self.files = filesListCardioMRLandmark(directory,files_list)
        # prepare file sampler
        self.filepath = None
        self.file_sampler = self.files.sample_circular()  # returns generator
        # reset buffer, terminal, counters, and init new_random_game
        # we put this here so that init_player in DQN.py doesn't try to update_history
        self._clear_history()  # init arrays
        self._restart_episode()
        # self.viz = True  # FIXME viz should default False
        # np.shape returns a tuple; normalise observation_dims so that a
        # list-valued argument does not fail the comparison spuriously.
        assert np.shape(self._state) == tuple(self.observation_dims)
        # sanity check: a state compared with itself must score 1
        assert np.isclose(jaccard(self.original_state, self.original_state), 1)
Exemple #7
0
# Item-based recommendation script: builds a binary user-item matrix from the
# training split, derives a URL-by-URL similarity table with jaccard(), and
# prepares a per-record result table for the test split.
data_tr, data_te = trainTestSplit(n=3)

# === Collect the distinct user IPs and visited URLs of the training set ===
ipTrain = list(set(data_tr['realIP']))
urlTrain = list(set(data_tr['fullURL']))

# === Build the user-item matrix (rows: IPs, columns: URLs, 1 = visited) ===
te = pd.DataFrame(0, index=ipTrain, columns=urlTrain)
for i in data_tr.index:
    te.loc[data_tr.loc[i, 'realIP'], data_tr.loc[i, 'fullURL']] = 1
te.sum().sum()   # original note: same as the number of records in data_tr (holds only if (IP, URL) pairs are unique); result is discarded
# te.to_excel('./te.xlsx')


# === Build the item similarity matrix ===
cor = jaccard(te)  # Jaccard similarity coefficient
cor = pd.DataFrame(cor, index=urlTrain, columns=urlTrain)
# cor.to_excel('./cor.xlsx')

# Build a dict mapping each test-set user (IP) to the URLs they browsed
ipTest = list(set(data_te['realIP']))
dic_te = {ip: list(data_te.loc[data_te['realIP'] == ip, 'fullURL']) for ip in ipTest}

# Result table: one row per test-set record, filled in below
rem = pd.DataFrame(index=range(len(data_te)), columns=['IP', 'url', 'rec1','rec2','rec3','rec4','rec5','recall','precision','R','anum'])
rem['IP'] = list(data_te['realIP'])
rem['url'] = list(data_te['fullURL'])
# URL frequency counts over the output of data_process()
index = data_process()['fullURL'].value_counts()
for i in rem.index:
    # NOTE(review): rnum is initialised but not used in the visible part of
    # this loop -- the loop body appears to continue beyond this excerpt.
    rnum = 0  # number of recommended items the user is actually interested in
    anum = len(dic_te[rem.loc[i, 'IP']])  # number of URLs the current user actually visited, i.e. pages the user is really interested in
    rem.loc[i, 'anum'] = anum
Exemple #8
0
    s = MLStripper()
    s.feed(html)
    return s.get_data()

# Revision ids of the two Wikipedia revisions to compare, taken from the
# command line.
idFirst = sys.argv[1]
idSecond = sys.argv[2]

# Both queries use the same options and differ only in 'revids', so the
# second dict is derived from the first instead of being repeated.
parameters = {'format' : 'json', 'action' : 'query', 'revids' : idFirst, 'prop' : 'extracts', 'rvprop' : 'content', 'continue' : '', "exsectionformat" : "plain", "redirects" : ""}
parametersSecond = dict(parameters, revids=idSecond)

# A timeout keeps the script from hanging indefinitely on a stalled
# connection; https avoids the redirect from the plain-http endpoint.
r = requests.get('https://en.wikipedia.org/w/api.php', params=parameters, timeout=30)
rSecond = requests.get('https://en.wikipedia.org/w/api.php', params=parametersSecond, timeout=30)

# Fail with a clear HTTP error instead of trying to parse an error page.
r.raise_for_status()
rSecond.raise_for_status()

data = r.json()
dataSecond = rSecond.json()

def toPlainText(rdata, jsonFile, txtFile):
    """Save an API response as JSON and its page extracts as plain text.

    ``rdata`` is dumped unchanged to ``jsonFile``. The ``'extract'`` entries
    of all pages under ``rdata['query']['pages']`` are joined with single
    spaces, passed through ``strip_tags`` and written to ``txtFile`` as
    UTF-8.
    """
    with open(jsonFile, 'w') as raw_out:
        json.dump(rdata, raw_out)

    with codecs.open(txtFile, 'w', 'utf-8') as text_out:
        pages = rdata['query']['pages']
        extracts = [page['extract'] for page in pages.values()]
        text_out.write(strip_tags(' '.join(extracts)))

# Persist both revisions as raw JSON and as plain text.
toPlainText(data, 'firstRev.json', 'firstRev.txt')
toPlainText(dataSecond, 'secondRev.json', 'secondRev.txt')

# Append the score for the two text files as one line. An explicit write()
# is used because the ``print >>file`` statement form only works on
# Python 2; this produces the same line on Python 2 and Python 3.
with open('jaccardData.txt', 'a') as outfile:
    outfile.write(str(jaccard('firstRev.txt', 'secondRev.txt')) + '\n')
 def one_different(self):
     """Expect jaccard() to be 0 for a mask with one marked cell versus an
     all-zero mask."""
     single = np.zeros((10, 10))
     # Copy taken before the cell is marked, so it stays all zeros.
     empty = np.array(single, copy=True)
     single[0, 0] = 1
     score = jaccard(single, empty)
     assert np.isclose(score, 0)
Exemple #10
0
 def all_zeros(self):
     """Expect jaccard() of two all-zero masks to be NaN."""
     blank = np.zeros((10, 10))
     blank_twin = np.array(blank, copy=True)
     result = jaccard(blank, blank_twin)
     assert np.isnan(result)
Exemple #11
0
 def identical_ones(self):
     """Expect jaccard() of two identical all-ones masks to be 1."""
     filled = np.ones((10, 10))
     filled_twin = np.array(filled, copy=True)
     result = jaccard(filled, filled_twin)
     assert np.isclose(result, 1)
Exemple #12
0
 def test_jaccard_same(self):
     """Expect the environment's original state to score 1 against itself."""
     env = self.env
     score = jaccard(env.original_state, env.original_state)
     assert np.isclose(score, 1)
Exemple #13
0
    def __init__(self, directory=None, viz=False, task=False, files_list=None,
                 observation_dims=(27, 27, 27), multiscale=False,  # FIXME automatic dimensions
                 max_num_frames=0, saveGif=False, saveVideo=False):  # FIXME hardcoded max num frames!
        """
        Set up the environment and start its first episode.

        :param directory: passed to FilesListCubeNPY together with
            files_list to locate the input data
        :param viz: visualization setting, stored unchanged in self.viz
        :param task: training flag, stored in self.task
        :param files_list: passed to FilesListCubeNPY
        :param observation_dims: shape of the observation; must have 2 or 3
            entries - defaults (27,27,27)
        :param multiscale: currently ignored, self.multiscale is forced to
            False
        :param max_num_frames: maximum number of frames (steps) per episode
        :param saveGif: option to save display as gif
        :param saveVideo: stored in self.saveVideo
        :raises ValueError: if observation_dims has neither 2 nor 3 entries
        """
        super(Brain_Env, self).__init__()

        # inits stat counters
        self.reset_stat()

        # counter to limit number of steps per episodes
        self.cnt = 0
        # maximum number of frames (steps) per episodes
        self.max_num_frames = max_num_frames
        # stores information: terminal, score, distError
        self.info = None
        # option to save display as gif
        self.saveGif = saveGif
        self.saveVideo = saveVideo
        # training flag
        self.task = task
        # image dimension (2D/3D)
        self.observation_dims = observation_dims
        self.dims = len(self.observation_dims)
        # multi-scale agent
        self.multiscale = multiscale
        # FIXME force multiscale false for now
        self.multiscale = False

        # init env dimensions
        if self.dims == 2:
            self.width, self.height = observation_dims
        elif self.dims == 3:
            self.width, self.height, self.depth = observation_dims
        else:
            raise ValueError(
                "observation_dims must have 2 or 3 entries, got {!r}".format(
                    observation_dims))

        with THREAD_LOCKER:
            self.rng = get_rng(self)
        self.viz = viz

        print("viz {} gif {} video {}".format(self.viz, self.saveGif, self.saveVideo))


        # get action space and minimal action set
        self.action_space = spaces.Discrete(6)  # change number actions here
        self.actions = self.action_space.n
        # The declared bounds are [-1, 1]; an unsigned 8-bit dtype cannot
        # represent the lower bound, so a floating-point dtype is used.
        self.observation_space = spaces.Box(low=-1., high=1.,
                                            shape=self.observation_dims,
                                            dtype=np.float32)
        # TODO initialize _observation_bounds limits from input image coordinates
        # -1 to compensate for 0 indexing
        # NOTE(review): indexing observation_dims[2] raises IndexError for a
        # 2-entry observation_dims, although the check above accepts 2D --
        # confirm whether 2D is meant to be supported here.
        self._observation_bounds = ObservationBounds(0,
                                                     self.observation_dims[0] - 1,
                                                     0,
                                                     self.observation_dims[1] - 1,
                                                     0,
                                                     self.observation_dims[2] - 1)

        self.files = FilesListCubeNPY(directory, files_list)

        # self.files = filesListFetalUSLandmark(directory,files_list)
        # self.files = filesListCardioMRLandmark(directory,files_list)
        # prepare file sampler
        self.filepath = None
        self.file_sampler = self.files.sample_circular()  # returns generator
        # reset buffer, terminal, counters, and init new_random_game
        # we put this here so that init_player in DQN.py doesn't try to update_history
        self._clear_history()  # init arrays
        self._restart_episode()
        # np.shape returns a tuple; normalise observation_dims so that a
        # list-valued argument does not fail the comparison spuriously.
        assert np.shape(self._state) == tuple(self.observation_dims)
        # test jaccard: a state compared with itself must score 1
        assert np.isclose(jaccard(self.original_state, self.original_state)[0], 1)