def one_same_one_different(self):
    a = np.zeros((10, 10))
    b = np.array(a, copy=True)
    a[0, 0] = 1
    b[0, 0:2] = 1  # shares a's single pixel and adds one more to the right
    assert np.isclose(jaccard(a, b), 0.5)
    b_downwards = np.zeros_like(a)
    b_downwards[:2, 0] = 1  # same overlap, extending downwards instead
    assert np.isclose(jaccard(a, b_downwards), 0.5)
def mostSimilar(songs):
    """
    Finds the two songs that are the most similar according to their
    Jaccard indices.

    Parameters:
        songs (dict{string: set}): dictionary mapping song ids to scores
            (after being converted into a set of tuples)

    Return:
        tuple containing the highest Jaccard index and the ids of the
        two songs that produced it
    """
    most = 0
    mostSongs = []
    # compare every song to one another
    for song1 in songs:
        for song2 in songs:
            if song1 == song2:
                continue
            score1 = songs[song1]
            score2 = songs[song2]
            val = jac.jaccard(score1, score2)
            if val > most:
                most = val
                mostSongs = [song1, song2]
    return (most, mostSongs)
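# `jac.jaccard` is imported from elsewhere and is not shown; a minimal sketch
# of a set-based Jaccard index consistent with how mostSimilar calls it (the
# module name `jac` and the empty-input behavior are assumptions):

# jac.py
def jaccard(s1, s2):
    """Jaccard index of two sets: |s1 & s2| / |s1 | s2|."""
    union = s1 | s2
    if not union:
        return 0.0  # assumption: define J(empty, empty) as 0
    return len(s1 & s2) / len(union)

# e.g. jaccard({('a', 1), ('b', 2)}, {('a', 1)}) == 0.5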
def eval_fn(data_loader, model, device, tokenizer=TOKENIZER):
    model.eval()
    all_ids = []
    start_idx = []
    end_idx = []
    orig_selected = []
    padding_len = []
    for d in data_loader:
        ids = d['ids']
        token_type_ids = d['token_type_ids']
        mask = d['mask']
        selected_text = d['orig_selected']
        pad_len = d['padding_len']
        targets_start = d['targets_start']
        targets_end = d['targets_end']

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.float)
        targets_end = targets_end.to(device, dtype=torch.float)

        o1, o2 = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        all_ids.append(ids.cpu().detach().numpy())
        start_idx.append(torch.sigmoid(o1).cpu().detach().numpy())
        end_idx.append(torch.sigmoid(o2).cpu().detach().numpy())
        orig_selected.extend(selected_text)
        padding_len.extend(pad_len)

    start_idx = np.vstack(start_idx)
    end_idx = np.vstack(end_idx)
    all_ids = np.vstack(all_ids)

    jaccards = []
    for i in range(len(start_idx)):
        # drop the leading special tokens and the trailing padding
        start_logits = start_idx[i][3:-padding_len[i]]
        end_logits = end_idx[i][3:-padding_len[i]]
        this_id = all_ids[i][3:-padding_len[i]]
        # pick the (start, end) pair with the highest combined score
        idx1 = idx2 = None
        max_sum = 0
        for ii, s in enumerate(start_logits):
            for jj, e in enumerate(end_logits):
                if s + e > max_sum:
                    max_sum = s + e
                    idx1 = ii
                    idx2 = jj
        this_id = this_id[idx1:idx2 + 1]
        predicted_text = tokenizer.decode(this_id, skip_special_tokens=True)
        predicted_text = predicted_text.strip()
        sel_text = orig_selected[i].strip()
        jaccards.append(jaccard(predicted_text, sel_text))
    return np.mean(jaccards)
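# `jaccard(predicted_text, sel_text)` above compares two strings. Its body is
# not shown; a plausible word-level implementation, matching the usual
# definition of this span-extraction metric (this exact body is an assumption):
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings."""
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    # assumes at least one string is non-empty, otherwise this divides by zero
    return float(len(c)) / (len(a) + len(b) - len(c))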
def calc_IOU(self):
    """calculate the Intersection over Union AKA Jaccard Index between two images
    https://en.wikipedia.org/wiki/Jaccard_index
    """
    # flatten bc jaccard_similarity_score expects 1D arrays
    state = self._state.ravel()
    state[state != -1] = 0  # mask out non-agent trajectory
    state = state.astype(bool)  # everything non-zero => True
    if not state.any():  # no agent trajectory
        print(" no state trajectory found")
        iou = 0.0
    else:
        iou = jaccard(state, self.original_state)
    return iou
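# The comment above mentions jaccard_similarity_score, which was removed from
# scikit-learn; if the `jaccard` helper wraps scikit-learn (an assumption), the
# modern equivalent is sklearn.metrics.jaccard_score on flat binary arrays:
import numpy as np
from sklearn.metrics import jaccard_score

a = np.array([1, 1, 0, 0], dtype=bool)
b = np.array([1, 0, 1, 0], dtype=bool)
print(jaccard_score(a, b))  # 1 shared positive out of 3 marked => ~0.333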
def calc_IOU(self):
    """calculate the Intersection over Union AKA Jaccard Index between two images
    https://en.wikipedia.org/wiki/Jaccard_index
    """
    # rebuild the agent-trajectory image from the nodes visited so far
    agent_trajectory = locations_to_img(self._agent_nodes[:self.cnt],
                                        self.observation_dims,
                                        img=None, val=-1)
    iou, _, _ = jaccard(agent_trajectory, self.original_state)
    return iou
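# Unlike the scalar `jaccard` used elsewhere in this section, this variant is
# unpacked into three values. A sketch consistent with that call -- the
# foreground convention (val == -1, matching locations_to_img above) and the
# extra return values are assumptions:
import numpy as np

def jaccard(img1, img2):
    """Jaccard index of two images where -1 marks foreground.

    Returns (iou, intersection_count, union_count); NaN when both are empty.
    """
    m1 = np.asarray(img1) == -1
    m2 = np.asarray(img2) == -1
    intersection = np.logical_and(m1, m2).sum()
    union = np.logical_or(m1, m2).sum()
    iou = np.nan if union == 0 else intersection / union
    return iou, intersection, union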
def __init__(self,
             directory=None,
             viz=False,
             task=False,
             files_list=None,
             observation_dims=(27, 27, 27),
             multiscale=False,  # FIXME automatic dimensions
             max_num_frames=20,  # FIXME hardcoded max num frames!
             saveGif=False,
             saveVideo=False):
    """
    :param train_directory: environment or game name
    :param viz: visualization
        set to 0 to disable
        set to +ve number to be the delay between frames to show
        set to a string to be the directory for storing frames
    :param observation_dims: shape of the frame cropped from the image to
        feed it to dqn (d, w, h) - defaults (27, 27, 27)
    :param nullop_start: start with random number of null ops
    :param location_history_length: consider loss of lives as end of
        episode (useful for training)
    :param max_num_frames: maximum number of frames per episode.
    """
    super(Brain_Env, self).__init__()
    print("warning! max num frames hard coded to {}!".format(max_num_frames),
          flush=True)
    # init stat counters
    self.reset_stat()
    # counter to limit number of steps per episode
    self.cnt = 0
    # maximum number of frames (steps) per episode
    self.max_num_frames = max_num_frames
    # stores information: terminal, score, distError
    self.info = None
    # options to save display as gif or video
    self.saveGif = saveGif
    self.saveVideo = saveVideo
    # training flag
    self.task = task
    # image dimension (2D/3D)
    self.observation_dims = observation_dims
    self.dims = len(self.observation_dims)
    # multi-scale agent
    self.multiscale = multiscale
    # FIXME force multiscale false for now
    self.multiscale = False
    # init env dimensions
    if self.dims == 2:
        self.width, self.height = observation_dims
    elif self.dims == 3:
        self.width, self.height, self.depth = observation_dims
    else:
        raise ValueError

    with _ALE_LOCK:
        self.rng = get_rng(self)
    # TODO: understand this viz setup
    self.viz = viz
    # stat counter to store current score or accumulated reward
    self.current_episode_score = StatCounter()
    # get action space and minimal action set
    self.action_space = spaces.Discrete(6)  # change number of actions here
    self.actions = self.action_space.n
    self.observation_space = spaces.Box(low=0, high=255,
                                        shape=self.observation_dims,
                                        dtype=np.uint8)
    # history buffer for storing last locations to check oscillations
    self._history_length = max_num_frames
    # TODO initialize _observation_bounds limits from input image coordinates
    self._observation_bounds = ObservationBounds(0, 0, 0, 0, 0, 0)
    # add your data loader here
    # TODO: look into returnLandmarks
    self.files = FilesListCubeNPY(directory, files_list)
    # self.files = filesListFetalUSLandmark(directory, files_list)
    # self.files = filesListCardioMRLandmark(directory, files_list)
    # prepare file sampler
    self.filepath = None
    self.file_sampler = self.files.sample_circular()  # returns generator
    # reset buffer, terminal, counters, and init new_random_game;
    # placed here so that init_player in DQN.py doesn't try to update_history
    self._clear_history()  # init arrays
    self._restart_episode()
    assert np.shape(self._state) == self.observation_dims
    # sanity check: an image compared with itself has Jaccard index 1
    assert np.isclose(jaccard(self.original_state, self.original_state), 1)
data_tr, data_te = trainTestSplit(n=3)

# === extract the training-set users' IPs and visited URLs ===
ipTrain = list(set(data_tr['realIP']))
urlTrain = list(set(data_tr['fullURL']))

# === build the user-item matrix ===
te = pd.DataFrame(0, index=ipTrain, columns=urlTrain)
for i in data_tr.index:
    te.loc[data_tr.loc[i, 'realIP'], data_tr.loc[i, 'fullURL']] = 1
te.sum().sum()  # should equal the number of records in data_tr
# te.to_excel('./te.xlsx')

# === build the item similarity matrix ===
cor = jaccard(te)  # Jaccard similarity coefficient
cor = pd.DataFrame(cor, index=urlTrain, columns=urlTrain)
# cor.to_excel('./cor.xlsx')

# build a dictionary of the URLs each test-set user visited
ipTest = list(set(data_te['realIP']))
dic_te = {ip: list(data_te.loc[data_te['realIP'] == ip, 'fullURL'])
          for ip in ipTest}

rem = pd.DataFrame(index=range(len(data_te)),
                   columns=['IP', 'url', 'rec1', 'rec2', 'rec3', 'rec4', 'rec5',
                            'recall', 'precision', 'R', 'anum'])
rem['IP'] = list(data_te['realIP'])
rem['url'] = list(data_te['fullURL'])
index = data_process()['fullURL'].value_counts()
for i in rem.index:
    rnum = 0  # number of recommended items the user is actually interested in
    # number of pages this user actually visited, i.e. the number of pages
    # the user is actually interested in
    anum = len(dic_te[rem.loc[i, 'IP']])
    rem.loc[i, 'anum'] = anum
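# `jaccard(te)` must return an item-by-item similarity matrix from the 0/1
# user-item DataFrame, since `cor` is indexed by urlTrain on both axes. One
# way to get that (the body is an assumption, using SciPy's pairwise Jaccard
# distance between the matrix columns):
import numpy as np
from scipy.spatial.distance import pdist, squareform

def jaccard(df):
    """Pairwise Jaccard similarity between the columns (items) of a 0/1 DataFrame."""
    # pdist computes Jaccard *distances* between rows, so transpose first
    dist = pdist(df.values.T.astype(bool), metric='jaccard')
    return 1.0 - squareform(dist)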
import sys
import json
import codecs
import requests


def strip_tags(html):
    # MLStripper is an HTMLParser subclass defined earlier in the script
    # (not shown); it collects the text content while discarding tags
    s = MLStripper()
    s.feed(html)
    return s.get_data()


idFirst = sys.argv[1]
idSecond = sys.argv[2]

parameters = {'format': 'json',
              'action': 'query',
              'revids': idFirst,
              'prop': 'extracts',
              'rvprop': 'content',
              'continue': '',
              'exsectionformat': 'plain',
              'redirects': ''}
parametersSecond = {'format': 'json',
                    'action': 'query',
                    'revids': idSecond,
                    'prop': 'extracts',
                    'rvprop': 'content',
                    'continue': '',
                    'exsectionformat': 'plain',
                    'redirects': ''}

r = requests.get('http://en.wikipedia.org/w/api.php', params=parameters)
rSecond = requests.get('http://en.wikipedia.org/w/api.php', params=parametersSecond)
data = r.json()
dataSecond = rSecond.json()


def toPlainText(rdata, jsonFile, txtFile):
    with open(jsonFile, 'w') as outfile:
        json.dump(rdata, outfile)
    with codecs.open(txtFile, 'w', 'utf-8') as file2:
        ids = rdata['query']['pages'].keys()
        text = ' '.join([rdata['query']['pages'][idx]['extract'] for idx in ids])
        text = strip_tags(text)
        file2.write(text)


toPlainText(data, 'firstRev.json', 'firstRev.txt')
toPlainText(dataSecond, 'secondRev.json', 'secondRev.txt')

with open('jaccardData.txt', 'a') as outfile:
    print(jaccard('firstRev.txt', 'secondRev.txt'), file=outfile)
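# `jaccard` here is called with two filenames rather than strings or arrays; a
# minimal sketch consistent with that signature (reading each file and
# comparing word sets is an assumption):
def jaccard(path1, path2):
    """Word-set Jaccard index of the contents of two text files."""
    with codecs.open(path1, 'r', 'utf-8') as f1, \
            codecs.open(path2, 'r', 'utf-8') as f2:
        words1 = set(f1.read().split())
        words2 = set(f2.read().split())
    union = words1 | words2
    return len(words1 & words2) / len(union) if union else 0.0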
def one_different(self):
    a = np.zeros((10, 10))
    b = np.array(a, copy=True)
    a[0, 0] = 1
    assert np.isclose(jaccard(a, b), 0)
def all_zeros(self):
    a = np.zeros((10, 10))
    a_copy = np.array(a, copy=True)
    assert np.isnan(jaccard(a, a_copy))
def identical_ones(self):
    a = np.ones((10, 10))
    a_copy = np.array(a, copy=True)
    assert np.isclose(jaccard(a, a_copy), 1)
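# The tests above (one_same_one_different, one_different, all_zeros,
# identical_ones) pin down the expected behavior of the array-based `jaccard`:
# intersection over union of the nonzero entries, NaN when both masks are
# empty. A minimal sketch that satisfies them -- the implementation under test
# is not shown, so this body is an assumption:
import numpy as np

def jaccard(a, b):
    """Jaccard index of two arrays treated as binary masks (nonzero => True)."""
    m1 = np.asarray(a).astype(bool)
    m2 = np.asarray(b).astype(bool)
    union = np.logical_or(m1, m2).sum()
    if union == 0:
        return np.nan  # both masks empty: the index is undefined
    return np.logical_and(m1, m2).sum() / union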
def test_jaccard_same(self):
    assert np.isclose(jaccard(self.env.original_state,
                              self.env.original_state), 1)
def __init__(self,
             directory=None,
             viz=False,
             task=False,
             files_list=None,
             observation_dims=(27, 27, 27),
             multiscale=False,  # FIXME automatic dimensions
             max_num_frames=0,  # FIXME hardcoded max num frames!
             saveGif=False,
             saveVideo=False):
    """
    :param train_directory: environment or game name
    :param viz: visualization
        set to 0 to disable
        set to +ve number to be the delay between frames to show
        set to a string to be the directory for storing frames
    :param observation_dims: shape of the frame cropped from the image to
        feed it to dqn (d, w, h) - defaults (27, 27, 27)
    :param nullop_start: start with random number of null ops
    :param location_history_length: consider loss of lives as end of
        episode (useful for training)
    :param max_num_frames: maximum number of frames per episode.
    """
    super(Brain_Env, self).__init__()
    # init stat counters
    self.reset_stat()
    # counter to limit number of steps per episode
    self.cnt = 0
    # maximum number of frames (steps) per episode
    self.max_num_frames = max_num_frames
    # stores information: terminal, score, distError
    self.info = None
    # options to save display as gif or video
    self.saveGif = saveGif
    self.saveVideo = saveVideo
    # training flag
    self.task = task
    # image dimension (2D/3D)
    self.observation_dims = observation_dims
    self.dims = len(self.observation_dims)
    # multi-scale agent
    self.multiscale = multiscale
    # FIXME force multiscale false for now
    self.multiscale = False
    # init env dimensions
    if self.dims == 2:
        self.width, self.height = observation_dims
    elif self.dims == 3:
        self.width, self.height, self.depth = observation_dims
    else:
        raise ValueError

    with THREAD_LOCKER:
        self.rng = get_rng(self)
    self.viz = viz
    print("viz {} gif {} video {}".format(self.viz, self.saveGif, self.saveVideo))
    # get action space and minimal action set
    self.action_space = spaces.Discrete(6)  # change number of actions here
    self.actions = self.action_space.n
    self.observation_space = spaces.Box(low=-1., high=1.,
                                        shape=self.observation_dims,
                                        dtype=np.uint8)
    # history buffer for storing last locations to check oscillations
    # TODO initialize _observation_bounds limits from input image coordinates
    # -1 to compensate for 0 indexing
    self._observation_bounds = ObservationBounds(0, self.observation_dims[0] - 1,
                                                 0, self.observation_dims[1] - 1,
                                                 0, self.observation_dims[2] - 1)
    self.files = FilesListCubeNPY(directory, files_list)
    # self.files = filesListFetalUSLandmark(directory, files_list)
    # self.files = filesListCardioMRLandmark(directory, files_list)
    # prepare file sampler
    self.filepath = None
    self.file_sampler = self.files.sample_circular()  # returns generator
    # reset buffer, terminal, counters, and init new_random_game;
    # placed here so that init_player in DQN.py doesn't try to update_history
    self._clear_history()  # init arrays
    self._restart_episode()
    assert np.shape(self._state) == self.observation_dims
    # test jaccard: this variant returns a tuple, so take element [0]
    assert np.isclose(jaccard(self.original_state, self.original_state)[0], 1)