def _solve(g, name, params):
    from grb_lazy import solve_problem as solve
    print()
    print('*** STARTING A NEW PROBLEM ***')
    print()
    print('name:', name)
    print('params:', params)
    #
    nontriv_sccs = sum(1 for sc in strongly_connected_components(g) if len(sc) > 1)
    assert nontriv_sccs == 1, nontriv_sccs
    assert g.number_of_selfloops() == 0
    #
    stats = Stats(name=name, params=params, is_optimal=True, cost=None,
                  ILP=0, node=0, iter=0, time=None)
    start = time()
    elims, cost, cycle_matrix = solve(g, stats)
    end = time()
    #
    stats.time = end - start
    #
    stats.cost = cost
    print_stats(g, stats)
    fname = (name + '_' + params).replace(' ', '_')
    serialize(cycle_matrix, TMP_DIR + 'cycle_matrix_' + fname + '.pkl.gz')
    serialize(elims, TMP_DIR + 'solution_' + fname + '.pkl.gz')
    #
    return stats
def main():
    print('\n loading the dataset ... \n')
    print('\n done \n')
    # args.distributed = args.world_size > 1
    model = AttenVgg(input_size=args.img_size, num_class=args.num_classes)
    model = loadpartweight(model)
    LR = Learning_rate_generater('step', [40, 50], 120)
    opt = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)
    # plot network
    # vizNet(model, args.modeldir)
    # if args.distributed:
    #     dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
    #                             world_size=args.world_size, rank=0)
    #     model.cuda()
    #     model = torch.nn.parallel.DistributedDataParallel(model)
    model = torch.nn.DataParallel(model).cuda()
    trainloader, valloader = get_data(args)
    criterion = torch.nn.CrossEntropyLoss().cuda()
    if args.evaluate:
        evaluate(valloader, model, criterion)
        return
    if not os.path.exists(args.modeldir):
        os.mkdir(args.modeldir)
    stats = Stats(args.modeldir, start_epoch=0)
    best_prec1 = 0  # best validation accuracy seen so far
    for epoch in range(args.epochs):
        # if args.distributed:
        #     train_sampler.set_epoch(epoch)
        adjust_learning_rate(opt, LR.lr_factor, epoch)
        trainObj, top1, top2 = train(trainloader, model, criterion, opt, epoch)
        valObj, prec1, prec2 = evaluate(valloader, model, criterion)
        stats._update(trainObj, top1, top2, valObj, prec1, prec2)
        filename = []
        if args.store_per_epoch:
            filename.append(
                os.path.join(args.modeldir, 'net-epoch-%s.pth.tar' % (epoch + 1)))
        else:
            filename.append(os.path.join(args.modeldir, 'checkpoint.pth.tar'))
        filename.append(os.path.join(args.modeldir, 'model_best.pth.tar'))
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': opt.state_dict()
            }, is_best, filename)
        plot_curve(stats, args.modeldir, True)
        sio.savemat(os.path.join(args.modeldir, 'stats.mat'), {'data': stats})
def track_popularity_analysis(playlists):
    """
    Analyze how popularity is distributed within the tracks in the dataset.
    It takes as input a list of playlists and returns:
    - Dict with the track IDs as keys and the track popularities as values
    - Int representing the minimum track popularity
    - Int representing the maximum track popularity
    """
    DictTrackPop = get_track_popularity(playlists)
    # Get the most popular track
    top_tPI = max(DictTrackPop.items(), key=operator.itemgetter(1))[0]
    logging.info("Most popular track <{}>, with tPI: {}".format(
        top_tPI, DictTrackPop[top_tPI]))
    logging.info("Top tPI divided by number of playlists: {}".format(
        DictTrackPop[top_tPI] / len(playlists)))
    # Min-max normalization
    min_pop, max_pop = min(DictTrackPop.values()), max(DictTrackPop.values())
    DictTrackPop = {
        k: (v - min_pop) / (max_pop - min_pop)
        for k, v in DictTrackPop.items()
    }
    # Group into 10 bins according to tPI
    DictTrackPopGroup = {}
    for track in DictTrackPop:
        group = int(np.floor(DictTrackPop[track] * 10))
        if group not in DictTrackPopGroup:
            DictTrackPopGroup[group] = 0
        DictTrackPopGroup[group] += 1
    # Compute Shannon and Simpson Indexes
    idx = Stats()
    logging.info("Tracks with tPI in [0.0, 0.1) (%): {}".format(
        DictTrackPopGroup[0] * 100 / len(DictTrackPop)))
    logging.info("Shannon Diversity Index: {}".format(
        idx.shannon_di(DictTrackPopGroup)))
    logging.info("Simpson Diversity Index: {}".format(
        idx.simpson_di(DictTrackPopGroup)))
    return DictTrackPop, min_pop, max_pop
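# Hedged sketch, not the project's code: Stats.shannon_di and Stats.simpson_di
# are not shown in this snippet, so the helpers below only illustrate the
# standard diversity indexes such methods typically compute over a dict of
# group counts (here, the ten tPI bins built above).
import math


def shannon_di_sketch(group_counts):
    """Shannon index: H = -sum(p_i * ln(p_i)) over the group proportions."""
    n = float(sum(group_counts.values()))
    return -sum((c / n) * math.log(c / n)
                for c in group_counts.values() if c > 0)


def simpson_di_sketch(group_counts):
    """Simpson index: D = sum(n_i * (n_i - 1)) / (N * (N - 1))."""
    n = float(sum(group_counts.values()))
    return sum(c * (c - 1) for c in group_counts.values()) / (n * (n - 1))


# Example with hypothetical bin counts:
# shannon_di_sketch({0: 900, 1: 50, 2: 30, 3: 20})  # ~0.43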
def __init__(self, mi=None, args=None):
    if args is None:
        self.args = ArgsProvider(
            call_from=self,
            define_params=self._params(),
            on_get_params=self._init)
    else:
        self.args = args
        self._init(args)
    # Accumulated errors.
    self.stats = defaultdict(lambda: Stats())
    self._cb = {}
    if mi is not None:
        self.model_interface = mi
def train(model, dataloader, optimizer, scheduler, params):
    print("Starting training...")
    best_val_loss = 100
    # print(params.save_dir, params.tag)
    stats = Stats(params.save_dir, params.tag)
    for epoch in range(params.epoch_num):
        loss_avg = RunningAverage()
        train_data = tqdm(
            dataloader.data_iterator(data_type='train', batch_size=params.batch_size),
            total=(dataloader.size()[0] // params.batch_size))
        optimizer.zero_grad()
        model.zero_grad()
        for data, labels in train_data:
            model.train()
            data = torch.tensor(data, dtype=torch.long).to(params.device)
            labels = torch.tensor(labels, dtype=torch.long).to(params.device)
            batch_masks = (data != 0)
            output = model(data, attention_mask=batch_masks, labels=labels)
            loss = torch.mean(output[0])
            loss.backward()
            # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
            torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()
            # update the average loss
            loss_avg.update(loss.item())
            train_data.set_postfix(type='TRAIN', epoch=epoch,
                                   loss='{:05.3f}'.format(loss_avg()))
        metrics = validate(model, dataloader, params)
        print('After {} epochs: F1={}, Loss={}'.format(epoch, metrics.f1(), metrics.loss))
        stats.update(metrics, epoch, loss_avg())
        stats.save()
        if epoch % params.save_freq == 0 and params.save_checkpoints:
            save_checkpoint({'epoch': epoch,
                             'state_dict': model.state_dict(),
                             'optim_dict': optimizer.state_dict()},
                            is_best=False,
                            tag=params.tag,
                            epoch=epoch,
                            score=metrics.f1(),
                            checkpoint=params.save_dir)
        if metrics.loss < best_val_loss:
            best_val_loss = metrics.loss
            save_checkpoint({'epoch': epoch,
                             'state_dict': model.state_dict(),
                             'optim_dict': optimizer.state_dict()},
                            is_best=True,
                            tag=params.tag,
                            epoch='generic',
                            score='epic',
                            checkpoint=params.save_dir)
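# Hedged sketch (assumption, not the project's code): RunningAverage is not
# defined in this snippet. The loop above only needs an object that
# accumulates scalar values via update() and returns the current mean when
# called, for example:
class RunningAverageSketch:
    """Keeps a running mean of a stream of scalar values."""

    def __init__(self):
        self.total = 0.0
        self.steps = 0

    def update(self, value):
        self.total += value
        self.steps += 1

    def __call__(self):
        return self.total / float(self.steps) if self.steps else 0.0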
env_eval, _ = make_env(args.env,
                       args.max_episode_steps,
                       add_downsampling=False,
                       downsampling_tiles_w=None,
                       downsampling_tiles_h=None,
                       downsampling_pix_values=None,
                       atari_frameskip=args.atari_frameskip)
eval_fn = get_evaluate_fn(env_eval=env_eval,
                          preproc_obs_fn=preproc_obs_fn,
                          policy_NN=call_model,
                          args=args)
process = psutil.Process()
memory_usage_fn = lambda: process.memory_info().rss
stats = Stats(use_tensorboard=args.use_tensorboard, log_path=log_path)
experience_keys = ["observations", "target_policy"]
if args.compute_value:
    experience_keys.append("returns")
experience_replay = ExperienceReplay(keys=experience_keys,
                                     capacity=args.replay_capacity)
run_episode_fn = get_episode_fn(
    actor=high_level_actor if args.hierarchical else low_level_actor,
    planner=high_level_planner if args.hierarchical else low_level_planner,
    train_fn=train_fn,
    dataset=experience_replay,
    add_returns=args.compute_value,
    stats=stats,
    memory_usage_fn=memory_usage_fn,
hypers: Dict[str, Tuple[float, float]] = {
    "bostonHousing": (0.0001, 0.1),
    "concrete": (0.001, 0.1),
    "energy": (0.0001, 0.1),
    "kin8nm": (0.1, 0.1),
    "power-plant": (0.1, 1.0),
    "wine-quality-red": (0.001, 0.01),
    "yacht": (0.0001, 1.0),
    "naval-propulsion-plant": (0.001, 1.0),
    "protein-tertiary-structure": (0.0001, 0.1),
}

for name in hypers:
    print(name)
    (lengthscale, tau) = hypers[name]
    stats = Stats()
    for run in range(args.runs):
        train, val, test = get_pbp_sets(name, args.batch_size, get_val=args.val)
        # l = prior lengthscale, tau = model precision, N = dataset instances
        # weight regularizer = l^2 / (tau N)
        wr = lengthscale**2.0 / (tau * len(train.dataset))
        # dropout regularizer = 2 / tau N
        dr = 2 / (tau * len(train.dataset))
        for (x, y) in train:
            break
        h_dim = 50 if "protein" not in name else 100
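# Worked example of the two regularizer formulas above (illustrative numbers,
# not taken from the hyperparameter table): with prior lengthscale l = 1e-2,
# model precision tau = 0.1 and N = 1000 training instances,
#   wr = l^2 / (tau * N) = 1e-4 / 100 = 1e-06
#   dr = 2 / (tau * N)   = 2 / 100    = 0.02
example_lengthscale, example_tau, example_n = 1e-2, 0.1, 1000
wr_example = example_lengthscale**2.0 / (example_tau * example_n)  # 1e-06
dr_example = 2 / (example_tau * example_n)                         # 0.02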
        parse_chunk_async(file_name, temp_dir, start, end)

        total = dict(
            (count_name, LogCounter(count_name)) for count_name in count_names
        )
        stats.waiting()
        for (temp_file_name, job_pid, job_time) in queue:
            stats.received_job_result()
            start_reduce_time = time.time()
            mapper = pickle.load(open(temp_file_name, 'rb'))
            for (count_name, counter) in mapper.get_counters().iteritems():
                total[count_name].add_counter(counter)
            os.remove(temp_file_name)
            stats.job_report(job_pid, job_time, time.time() - start_reduce_time)
            stats.waiting()
    finally:
        shutil.rmtree(temp_dir)
    for name in count_names:
        print total[name].report()


if __name__ == '__main__':
    args = parse_argv()
    stats = Stats(args.log_file, log_each_job=args.log_each_job)
    parse_file(args.filename, args.cores, args.jobs_per_core, stats)
    stats.report_master_stats()
# low_level_planner = CountbasedRolloutIW(generate_successor_fn=low_level_tree_actor.generate_successor, width=low_level_width, features_name="low_level_features")
low_level_planner.add_stop_fn(lambda tree: not interactions.within_budget())

abstract_tree_actor = AbstractTreeActor(low_level_planner=low_level_planner,
                                        low_level_tree_actor=low_level_tree_actor)

# high_level_planner = BFS(generate_successor_fn=abstract_tree_actor.generate_successor, features_name="high_level_features")
# high_level_planner = IW(generate_successor_fn=abstract_tree_actor.generate_successor, width=high_level_width, features_name="high_level_features")
high_level_planner = CountbasedRolloutIW(generate_successor_fn=abstract_tree_actor.generate_successor,
                                         width=high_level_width,
                                         features_name="high_level_features")
high_level_planner.add_stop_fn(lambda tree: not interactions.within_budget())

abstract_tree = abstract_tree_actor.reset()
episode_done = False
stats = Stats()
abstract_tree_actor.render_tree(abstract_tree, size=None)

while not episode_done:
    interactions.reset_budget()
    high_level_planner.initialize(tree=abstract_tree)
    high_level_planner.plan(tree=abstract_tree)
    abstract_tree_actor.compute_returns(abstract_tree,
                                        discount_factor=discount_factor,
                                        add_value=False)
    Q = compute_node_Q(node=abstract_tree.root.low_level_tree.root,
                       n_actions=env.action_space.n,
                       discount_factor=discount_factor,
                       add_value=False)
    low_level_policy = softmax(Q, temp=0)
    a = sample_pmf(low_level_policy)
    abstract_tree_nodes = len(abstract_tree)
class PlaylistDI(object):
    """
    Perform playlist diversity analysis.
    """

    def __init__(self, dataset, min_tracks):
        """ """
        # Initialize classes
        self.pp = PreProcessing()
        self.st = Stats()
        # Define input files
        IN_DIR = os.path.join("../data", dataset)
        playlists_file = os.path.join(IN_DIR, "playlists.tsv")
        tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
        glove_embs = os.path.join("../data", "tag_embeds", dataset + ".txt")
        lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")
        # Import data
        DictEmbeds, self.DictKeyEmbeds = self.pp.import_embeddings(glove_embs)
        self.a_idx = self.pp.create_annoy_idx(DictEmbeds)
        self.DictTrackTag = self.pp.import_track_tags(lastfm_tags)
        self.playlists = self.pp.filter_playlists(
            self.pp.import_playlists(playlists_file, min_tracks))
        self.DictTrack = self.pp.import_tracklist(tracklist_file)
        # Define variables
        self.low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
        self.high_pDI_playlists = os.path.join(IN_DIR, 'high_pDI_playlists.tsv')
        self.rand_tracks_playlist = []

    def tag_distance(self, word1, word2):
        """
        Compute the pairwise cosine distance between two tags using the Annoy
        index. If a tag is not in the index, return -1.
        """
        word1 = self.pp.norm_str(word1)
        word2 = self.pp.norm_str(word2)
        try:
            dist = self.a_idx.get_distance(self.DictKeyEmbeds[word1],
                                           self.DictKeyEmbeds[word2])
        except KeyError:
            dist = -1
        return dist

    def TT_distance(self, v1, v2):
        """
        Compute the Track-Tag distance between two tracks.
        If the tracks do not have the same number of tags, return -1.
        """
        if not v1 or not v2:
            return -1
        max_len = max(len(v1), len(v2))
        # TODO: improve propagation when information is incomplete
        if len(v1) < max_len:
            return -1
        elif len(v2) < max_len:
            return -1
        s = 0
        for i in range(max_len):
            max_weight = max(v1[i][1], v2[i][1])
            if max_weight == 0:
                s += 0
                max_len += -1
            else:
                dist = self.tag_distance(v1[i][0], v2[i][0])
                if dist == -1:
                    s += 0
                    max_len += -1
                else:
                    s += ((v1[i][1] + v2[i][1]) / float(2 * max_weight) * dist)
        if max_len == 0:
            return -1
        return (s / float(max_len))

    def log_results(self, results):
        """
        Print the results of the diversity analysis.
        """
        logging.info("Mean pDI: {}".format(np.mean(results)))
        logging.info("Std pDI: {}".format(np.std(results)))
        logging.info("Max pDI: {}".format(max(results)))
        logging.info("Min pDI: {}".format(min(results)))
        logging.info("Gini pDI: {}".format(gini(np.array(results))))
        logging.info("QCD pDI: {}".format(self.st.qcd(results)))

    def analyze_playlist(self):
        """
        Analyze the diversity of the playlists from the dataset.
        """
        logging.info("Analyzing Playlists...")
        pDI = []
        pDI_idx = []
        playlist_analyzed = 0
        for c, playlist in enumerate(self.playlists):
            playlist_track_tags = []
            playlist_tracks_tags_count = 0
            for track in playlist:
                track = str(track).strip()
                try:
                    # Continue if the track has at least one tag associated
                    if self.DictTrackTag[self.DictTrack[track]]:
                        playlist_track_tags.append(
                            self.DictTrackTag[self.DictTrack[track]])
                        playlist_tracks_tags_count += 1
                        # Get random tracks for evaluation
                        if random.randint(0, 9) > 5:
                            self.rand_tracks_playlist.append(
                                self.DictTrackTag[self.DictTrack[track]])
                # Skip if the track has no tags associated
                except KeyError:
                    pass
            # Skip playlists without complete information
            if playlist_tracks_tags_count >= int(1 * len(playlist)):
                playlist_analyzed += 1
                pDI_sum = 0
                tracks_comb = list(
                    itertools.combinations(playlist_track_tags, 2))
                for track_tags in tracks_comb:
                    dist = self.TT_distance(track_tags[0], track_tags[1])
                    if dist == -1:
                        tracks_comb.remove(track_tags)
                    else:
                        pDI_sum += dist
                if pDI_sum == 0:
                    pass
                else:
                    pDI.append(pDI_sum / float(len(tracks_comb)))
                    pDI_idx.append(c)
        self.log_results(pDI)
        logging.info("Playlists analyzed: {}/{}".format(
            playlist_analyzed, len(self.playlists)))
        return pDI, pDI_idx

    def analyze_random_playlist(self):
        """
        Analyze the diversity of random playlists created with tracks from
        the dataset.
        """
        logging.info("Analyzing Random Playlists...")
        playlist_len_mean = int(np.mean([len(x) for x in self.playlists]))
        k = 0
        while k < 1:
            # Shuffle tracks at each iteration
            rand_tracks_playlist = random.sample(
                self.rand_tracks_playlist, len(self.rand_tracks_playlist))
            rand_pDI = []
            random_playlists = [
                rand_tracks_playlist[x:x + playlist_len_mean]
                for x in range(0, len(rand_tracks_playlist), playlist_len_mean)
            ]
            for el in random_playlists:
                rand_pDI_sum = 0
                tracks_comb = list(itertools.combinations(el, 2))
                for track_tags in tracks_comb:
                    dist = self.TT_distance(track_tags[0], track_tags[1])
                    if dist == -1:
                        tracks_comb.remove(track_tags)
                    else:
                        rand_pDI_sum += dist
                if tracks_comb:
                    if rand_pDI_sum == 0:
                        pass
                    else:
                        rand_pDI.append(rand_pDI_sum / float(len(tracks_comb)))
            self.log_results(rand_pDI)
            k += 1

    def write_playlist_qualia(self, pDI, pDI_idx):
        """
        Write out the files with the playlists used for the qualitative
        analysis.
        """
        dist_10pct = int(0.1 * len(pDI))
        # Write the most similar playlists
        with open(self.low_pDI_playlists, 'w+') as outf:
            _writer = csv.writer(outf, delimiter='\t')
            for idx in sorted(range(len(pDI)),
                              key=lambda i: pDI[i],
                              reverse=False)[:dist_10pct]:
                row = [pDI[idx], self.playlists[pDI_idx[idx]]]
                _writer.writerow(row)
        # Write the least similar playlists
        with open(self.high_pDI_playlists, 'w+') as outf:
            _writer = csv.writer(outf, delimiter='\t')
            for idx in sorted(range(len(pDI)),
                              key=lambda i: pDI[i],
                              reverse=True)[:dist_10pct]:
                row = [pDI[idx], self.playlists[pDI_idx[idx]]]
                _writer.writerow(row)

    def run(self):
        """
        Main function. It performs the diversity analysis on the playlists
        from the dataset, then on random playlists. Finally, it writes the
        files with the playlists used for the qualitative analysis.
        """
        pDI, pDI_idx = self.analyze_playlist()
        self.analyze_random_playlist()
        self.write_playlist_qualia(pDI, pDI_idx)
def policy(state, q_values, stats):
    epsilon = stats.epsilon(state)
    actions = q_values[state]
    if np.random.uniform() > epsilon:
        # greedy - exploit
        return np.argmax(actions)
    # random action - explore
    return np.random.randint(4)


print('Episodes: %d' % EPISODES)
print('Biased environment: ', use_bias_env)
stats = Stats()
q_values = QValue()
win = [0]
lose = [0]
draw = [0]
wi, lo, dr, has_ace = 0, 0, 0, 0
for ep in tqdm(range(EPISODES)):
    blackjack = Blackjack_Biased(1000) if use_bias_env else Blackjack(1000)
    # get start state
    state = blackjack.reset()
    action = policy(state, q_values, stats)
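# Hedged sketch (assumption): Stats.epsilon(state) is not shown in this
# snippet. A common choice for tabular control is a visit-count-based
# schedule, which the helper below only illustrates.
def epsilon_sketch(state_visits, n0=100.0):
    """epsilon(s) = n0 / (n0 + N(s)): decays as the state is visited more."""
    return n0 / (n0 + state_visits)

# e.g. epsilon_sketch(0) == 1.0 (fully exploratory), epsilon_sketch(900) == 0.1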
"wine-quality-red": (2.5, 3.0, 3.5), "yacht": (0.25, 0.5, 0.75), "naval-propulsion-plant": (30000, 40000, 50000), "protein-tertiary-structure": (0.025, 0.05, 0.075), } for name in taus: print(name) best_rmse = float("inf") best_stats = None best_model = None best_hypers = (0.0, 0.0) for lengthscale in lengthscales: for tau in taus[name]: stats = Stats() for run in range(args.runs): train, _, test = get_pbp_sets(name, args.batch_size, get_val=False) # l = prior lengthscale, tau = model precision, N = dataset instances # weight regularizer = l^2 / (tau N) wr = lengthscale**2.0 / (tau * len(train.dataset)) # dropout regularizer = 2 / tau N dr = 2 / (tau * len(train.dataset)) for (x, y) in train: break h_dim = 50 if "protein" not in name else 100
def register_stats(self, env):
    """ Creates charts/stats that we want to show. """
    env.stats['rewards'] = Stats(PlotType.REGULAR)
    env.stats['fails'] = Stats(PlotType.REGULAR)
    env.stats['heatmap'] = MatrixStats(env.height, env.width, 0)
def run(dataset, print_ex):
    """
    Analyze the 10% of playlists with the highest and the 10% with the
    lowest diversity index.
    """
    pp = PreProcessing()
    st = Stats()
    # Define input files
    IN_DIR = os.path.join("../data", dataset)
    tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
    lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")
    # Import data
    DictTrackTag = pp.import_track_tags(lastfm_tags)
    DictTrack = pp.import_tracklist(tracklist_file)
    low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
    high_pDI_playlists = os.path.join(IN_DIR, 'high_pDI_playlists.tsv')
    results_pd = []
    for input_file in [low_pDI_playlists, high_pDI_playlists]:
        # Initialize variables
        tag_no = []
        tag_common = []
        ratio_tag_track = []
        artist_no = []
        tracks_no = []
        ratio_track_art = []
        distances = []
        print_c = 0
        playlist_c = 0
        with open(input_file, 'r') as inf:
            _reader = csv.reader(inf, delimiter='\t')
            # Iterate over playlists
            for row in _reader:
                playlist_c += 1
                dist, playlist = row
                playlist = eval(playlist)
                distances.append(float(dist))
                artistnames = set()
                total_tags = set()
                tags_list = []
                # Print playlist info
                if print_c < print_ex:
                    logging.info("Printing info new playlist...")
                    logging.info("Playlist pDI: {}".format(dist))
                    logging.info("Playlist Tracks:")
                # Iterate over playlist tracks
                for track in playlist:
                    track = str(track)
                    try:
                        artistname, trackname = DictTrack[track].split("|")
                    except ValueError:
                        continue
                    artistnames.add(artistname)
                    tags_tracks = set()
                    if DictTrack[track] in DictTrackTag:
                        for tag in DictTrackTag[DictTrack[track]]:
                            total_tags.add(pp.norm_str(tag[0]))
                            tags_tracks.add(pp.norm_str(tag[0]))
                        tags_list.append(tags_tracks)
                        if print_c < print_ex:
                            logging.info("{} {}".format(
                                DictTrack[track],
                                DictTrackTag[DictTrack[track]]))
                    else:
                        tags_list.append(set())
                        continue
                # Print playlist stats
                if print_c < print_ex:
                    logging.info("No. unique tags: {}".format(len(total_tags)))
                    logging.info("No. unique tags for tracks: {}".format(
                        len(total_tags) / float(len(playlist))))
                    logging.info("No. unique artists: {}".format(
                        len(artistnames)))
                    logging.info("No. unique tracks: {}".format(len(playlist)))
                    logging.info("No. unique tracks for artists: {}".format(
                        len(playlist) / float(len(artistnames))))
                    print_c += 1
                tag_no.append(len(total_tags))
                ratio_tag_track.append(len(total_tags) / float(len(playlist)))
                artist_no.append(len(artistnames))
                tracks_no.append(len(playlist))
                ratio_track_art.append(len(playlist) / float(len(artistnames)))
                tag_common.append(set.intersection(*tags_list))
        common_tags = round(
            len([x for x in tag_common if len(x) > 1]) * 100 / float(playlist_c))
        single_artists = round(
            len([x for x in artist_no if x == 1]) * 100 / float(playlist_c))
        # Print playlist dataset qualitative analysis results
        logging.info("")
        logging.info(
            "## Qualitative analysis of playlists from {} file ##".format(
                input_file))
        logging.info("Average pDI: {}".format(np.mean(distances)))
        logging.info("Average tag count: {}".format(round(np.mean(tag_no))))
        logging.info("Common tags (%): {}".format(common_tags))
        logging.info("Average tag over tracks: {}".format(
            round(np.mean(ratio_tag_track))))
        logging.info("Average artist count: {}".format(
            round(np.mean(artist_no))))
        logging.info("Single-artist (%): {}".format(single_artists))
        logging.info("Average tracks count: {}".format(
            round(np.mean(tracks_no))))
        logging.info("Average tracks over artists: {}".format(
            round(np.mean(ratio_track_art))))
        # Store results for computing the Percentage Difference
        results = [
            np.mean(distances),
            round(np.mean(tag_no)), common_tags,
            round(np.mean(ratio_tag_track)),
            round(np.mean(artist_no)), single_artists,
            round(np.mean(tracks_no)),
            round(np.mean(ratio_track_art))
        ]
        results_pd.append(results)
    logging.info("")
    logging.info("## Percentage Difference (PD) ##")
    for c in range(0, 8):
        if c not in [2, 5]:
            logging.info(st.pdiff(results_pd[0][c], results_pd[1][c]))
        else:
            logging.info(abs(results_pd[0][c] - results_pd[1][c]))
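# Hedged sketch (assumption): Stats.pdiff is not defined in this snippet.
# A common definition of the percentage difference between two values a and b
# is the absolute difference relative to their mean, illustrated below.
def pdiff_sketch(a, b):
    """|a - b| / ((a + b) / 2) * 100"""
    return abs(a - b) / ((a + b) / 2.0) * 100.0

# e.g. pdiff_sketch(8.0, 10.0) == 2.0 / 9.0 * 100 ≈ 22.2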