Example no. 1
    def mp_progress(func, iterable, processes=10, scale=10):
        gensim = utils.import_module("gensim")
        mp = utils.import_module("multiprocessing.pool")
        chunks = list(gensim.utils.chunkize(iterable, processes * scale))
        pool = mp.Pool(processes)
        ret = []

        for chunk in utils.tqdm(chunks):
            ret.extend(pool.map(func, chunk))

        return ret
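The helper above chunks the input so the tqdm bar advances once per chunk while a multiprocessing pool maps the work. Below is a minimal, self-contained sketch of the same pattern using only the standard library plus tqdm; the chunk size and the _square worker are illustrative, not taken from the example.

from multiprocessing import Pool

from tqdm import tqdm


def _square(x):  # top-level function so it can be pickled by multiprocessing
    return x * x


def mp_progress_sketch(func, items, processes=4, chunk_len=100):
    # split the input into fixed-size chunks so the bar advances once per chunk
    chunks = [items[i:i + chunk_len] for i in range(0, len(items), chunk_len)]
    results = []
    with Pool(processes) as pool:  # the context manager closes the pool afterwards
        for chunk in tqdm(chunks, desc="mapping"):
            results.extend(pool.map(func, chunk))
    return results


if __name__ == "__main__":
    print(mp_progress_sketch(_square, list(range(1000)))[:5])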
Example no. 2
def train_model(agent, dataObj, ep_count=20, batch_size=32, evaluate_every=5):
    start_ep = agent.episode + 0
    end_ep = start_ep + ep_count
    for i in range(ep_count):
        if (i + 1) % evaluate_every == 0:
            evaluate_model(agent, dataObj, batch_size=batch_size)

        start_time = getChiTimeNow()
        epsilon_start = agent.epsilon
        total_profit = 0
        data_length = len(dataObj.trainDF) - 1
        avg_loss_array = []
        agent.reset()

        state = agent.getState(dataObj, 0)
        try:
            for t in tqdm(range(data_length),
                          total=data_length,
                          leave=True,
                          desc='Episode {}/{}'.format(agent.episode, end_ep)):
                reward = 0

                # select an action
                action = agent.act(state)

                open_pnl = calcOpenPnl(agent, dataObj, t + 1)
                day_pnl = evaluateAction(action, agent, dataObj, t + 1)
                reward = open_pnl + day_pnl
                total_profit += reward

                next_state = agent.getState(dataObj, t + 1)
                done = (t == data_length - 1)
                agent.remember(state, action, reward, next_state, done)

                if len(agent.memory) > batch_size:
                    # train every batch_size
                    if t % batch_size == 0:
                        loss = agent.train_experience_replay(batch_size)
                        avg_loss_array.append(loss)

                state = next_state

            endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                                'train', start_time)
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit. Ending current episode.')
            endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                                'train', start_time)
            raise
        except:
            print('Unknown error...Ending current episode.')
            endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                                'train', start_time)
            raise
Example no. 3
    def metric(self, name):
        if name == 'vali':
            data = self.data.vali_batch
        elif name == 'test':
            data = self.data.test_batch
        elif name == 'train':
            data = self.data.train_batch
        else:
            raise Exception(f'unknown name: {name}')

        cnt = 0
        max_phase = 11
        full_hitrate_per_phase = [[] for _ in range(max_phase)]
        full_ndcg_per_phase = [[] for _ in range(max_phase)]
        half_hitrate_per_phase = [[] for _ in range(max_phase)]
        half_ndcg_per_phase = [[] for _ in range(max_phase)]
        pbar = tqdm(desc='predicting...', leave=False)
        loss_list = []
        for mv in self.model.metric(data):
            for i in range(len(mv.ndcg)):
                pbar.update(1)
                # if mv.future_seq[0][0] == 0: continue
                phase = mv.phase[i]
                item_deg = self.data.dp.item_deg_per_phase[phase][
                    mv.true_item[i]]
                mid_deg = self.data.dp.mid_deg_per_phase[phase]
                if item_deg <= mid_deg:
                    half_hitrate_per_phase[phase].append(mv.hit_rate[i])
                    half_ndcg_per_phase[phase].append(mv.ndcg[i])
                full_hitrate_per_phase[phase].append(mv.hit_rate[i])
                full_ndcg_per_phase[phase].append(mv.ndcg[i])

            loss_list.append(mv.loss)
            cnt += 1
            if args.run_test and cnt > 10:
                break
        pbar.close()
        result = np.zeros(4, dtype=float)
        for p in range(max_phase):
            if half_hitrate_per_phase[p]:
                m1 = np.mean(full_hitrate_per_phase[p])
                m2 = np.mean(full_ndcg_per_phase[p])
                m3 = np.mean(half_hitrate_per_phase[p])
                m4 = np.mean(half_ndcg_per_phase[p])
                m = np.array([m1, m2, m3, m4])
                if args.show_detail:
                    print(
                        f'phase: {p}, vali: {format_metric(m)}, full num: {len(full_hitrate_per_phase[p])}, half num: {len(half_hitrate_per_phase[p])}'
                    )

                result += m

        loss_mean = np.mean(loss_list)
        return result, format_metric(result), loss_mean
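Because the number of predictions is not known in advance, the method above creates a bare tqdm bar and calls update(1) per item while accumulating values into per-phase buckets. A stripped-down sketch of that manual-update pattern; the phase/value records are made up.

import numpy as np
from tqdm import tqdm


def mean_per_phase(records, max_phase=11):
    # records is an iterable of (phase, value) pairs of unknown length,
    # so the bar is updated manually instead of wrapping an iterable
    values_per_phase = [[] for _ in range(max_phase)]
    pbar = tqdm(desc='collecting...', leave=False)
    for phase, value in records:
        pbar.update(1)
        values_per_phase[phase].append(value)
    pbar.close()
    return [np.mean(v) if v else 0.0 for v in values_per_phase]


print(mean_per_phase([(0, 1.0), (0, 3.0), (2, 5.0)]))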
Example no. 4
def lddmm_matching(I,
                   J,
                   m=None,
                   lddmm_steps=1000,
                   lddmm_integration_steps=10,
                   reg_weight=1e-1,
                   learning_rate_pose=2e-2,
                   fluid_params=[1.0, .1, .01],
                   progress_bar=True):
    """Matching image I to J via LDDMM"""
    if m is None:
        defsh = [I.shape[0], 3] + list(I.shape[2:])
        m = torch.zeros(defsh, dtype=I.dtype).to(I.device)
    do_regridding = m.shape[2:] != I.shape[2:]
    J = J.to(I.device)
    matchterms = []
    regterms = []
    losses = []
    metric = lm.FluidMetric(fluid_params)
    m.requires_grad_()
    pb = range(lddmm_steps)
    if progress_bar: pb = tqdm(pb)
    for mit in pb:
        if m.grad is not None:
            m.grad.detach_()
            m.grad.zero_()
        m.requires_grad_()
        h = lm.expmap(metric, m, num_steps=lddmm_integration_steps)
        if do_regridding:  # only regrid when the momentum grid differs from the image grid
            h = lm.regrid(h, shape=I.shape[2:], displacement=True)
        Idef = lm.interp(I, h)
        regterm = (metric.sharp(m) * m).mean()
        matchterm = mse_loss(Idef, J)
        matchterms.append(matchterm.detach().item())
        regterms.append(regterm.detach().item())
        loss = matchterm + reg_weight * regterm
        loss.backward()
        loss.detach_()
        with torch.no_grad():
            #v = metric.sharp(m)
            #regterm = (v*m).mean()#.detach()
            #del v
            #losses.append(loss.detach()+ .5*reg_weight*regterm)
            losses.append(loss.detach())
            p = metric.flat(m.grad).detach()
            if torch.isnan(losses[-1]).item():
                print(f"loss is NaN at iter {mit}")
                break
            #if mit > 0 and losses[-1].item() > losses[-2].item():
            #    print(f"loss increased at iter {mit}")
            #p.add_(reg_weight/np.prod(m.shape[1:]), m)
            m.add_(p, alpha=-learning_rate_pose)
    return m.detach(), [l.item() for l in losses], matchterms, regterms
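The loop above is a hand-rolled gradient descent: clear the gradient, rebuild the loss, call backward(), then update the parameter in place inside torch.no_grad(). A minimal toy version of that loop follows; the quadratic objective, learning rate, and step count are arbitrary.

import torch
from tqdm import tqdm

x = torch.tensor([5.0], requires_grad=True)
losses = []
for _ in tqdm(range(200), desc="descent"):
    if x.grad is not None:
        x.grad.detach_()
        x.grad.zero_()
    loss = (x ** 2).sum()              # toy objective with its minimum at 0
    loss.backward()
    with torch.no_grad():
        losses.append(loss.item())
        x.add_(x.grad, alpha=-0.05)    # in-place SGD step
print(x.item(), losses[-1])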
Example no. 5
def nearest_neighbors(args, samples, dataset):
    dataset_bow = [create_bag_of_words(dataset[i]["string"])
                   for i in range(len(dataset))]
    samples_bow = [create_bag_of_words(sample) for sample in samples]
    knn = []
    progress = utils.tqdm(total=len(samples), desc="finding knn")
    for sample, sample_bow in zip(samples, samples_bow):
        progress.update(1)
        sims = [(bow_cosine_similarity(sample_bow, dataset_bow[i]),
                 dataset[i]["string"]) for i in range(len(dataset))]
        sims.sort(key=lambda x: x[0], reverse=True)
        knn.append([x[1] for x in sims[:args.nearest_neighbors]])
    return knn
Example no. 6
 def generate_from(self, zs):
     self.model.train(False)
     res = []
     progress = utils.tqdm(total=len(zs), desc="generating")
     for i in range(0, len(zs), self.batch_size):
         z = zs[i:i + self.batch_size]
         progress.update(len(z))
         z = z.to(self.device)
         gens, probs = self._generate_step(z)
         res.extend(list(zip(gens, probs)))
     progress.close()
     gens, probs = list(zip(*res))
     return list(zip(*gens)), list(zip(*probs))
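Here the bar is created with the total number of items but advanced by the size of each slice, so it stays accurate even when the last batch is short. The same idiom in isolation; the batch size and the doubling step are arbitrary.

from tqdm import tqdm


def process_in_batches(items, batch_size=8):
    progress = tqdm(total=len(items), desc="generating")
    results = []
    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        progress.update(len(batch))  # advance by however many items this batch holds
        results.extend(x * 2 for x in batch)
    progress.close()
    return results


print(process_in_batches(list(range(20)))[:5])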
Example no. 7
    def dump_features_all_item(self, name):
        if name == 'vali':
            data = self.data.vali_batch
        elif name == 'test':
            data = self.data.test_batch
        elif name == 'train':
            data = self.data.train_batch
        else:
            raise Exception(f'unknown name: {name}')

        users = []
        items = []
        logits = []

        from run_for_fuse import all_res
        fn_list = all_res.keys()

        user2items = None
        for fn in fn_list:
            _users, _items_list, _ = utils.load_pkl(
                f'{utils.for_fuse_dir}/{fn}_{name}')
            if user2items is None:
                user2items = {}
                for _u, _items in zip(_users, _items_list):
                    user2items[_u] = set(_items)
            else:
                assert set(user2items.keys()) == set(_users)
                for _u, _items in zip(_users, _items_list):
                    user2items[_u] |= set(_items)

        pbar = tqdm(desc=f'dump {name}, predicting...', leave=False)
        for pv in self.model.predict(data):
            pbar.update(1)
            users.extend(pv.user.tolist())
            for i in range(len(pv.user)):
                user = pv.user[i]
                _items_i = sorted(user2items[user])
                items.append(_items_i)
                logits.append(pv.all_scores[i, _items_i].tolist())

        pbar.close()

        feat = [users, items, logits]

        fn = f'{utils.for_fuse_dir}/union_{args.msg}_{name}'

        print(f'{utils.get_time_str()} dump file {fn}')
        utils.save_pkl(feat, fn)
        print(f'{utils.get_time_str()} dump file {fn} over')

        return fn
Example no. 8
def load():
    print('loading h5 store')
    store = pd.HDFStore(conf.wd + conf.data + '.h5', mode='r')
    full = store['df']

    print('sampling')

    #either use t (timestep) and wnd (window) or tstart and tend
    if conf.t is not None and conf.wnd is not None:
        t = np.searchsorted(full.index, pd.Timestamp(conf.t))
        tstart = full.index[t - 1]
        tend = full.index[t + conf.wnd - 1]
    else:
        tstart = pd.Timestamp(conf.tstart)
        tend = pd.Timestamp(conf.tend)

    print('Loading range', tstart, '-', tend)

    # assumes 0-indexed
    if conf.gridXmin == 0 and conf.gridXmax == conf.gridXdim - 1 and conf.gridYmin == 0 and conf.gridYmax == conf.gridYdim - 1:
        used_data_slice = full.loc[slice(tstart, tend), :]
    else:
        used_data_slice = pd.DataFrame()

        # tqdm for progress bar
        #only if gridXmin and gridXmax are defined. Otherwise full dataset is used
        for i in tqdm(range(conf.gridXmin, conf.gridXmax + 1)):
            x = full.loc[slice(tstart, tend),
                         slice(i * conf.gridYdim +
                               conf.gridYmin, i * conf.gridYdim +
                               conf.gridYmax)]
            used_data_slice = used_data_slice.merge(x,
                                                    how='outer',
                                                    left_index=True,
                                                    right_index=True,
                                                    copy=False)

    print('sample size: {} = {}%'.format(
        used_data_slice.shape[0] * used_data_slice.shape[1],
        (used_data_slice.shape[0] * used_data_slice.shape[1] /
         (full.shape[0] * full.shape[1])) * 100))
    store.close()

    if conf.diff:
        #Calculates the difference of a DataFrame element compared with another element in the DataFrame (default is the element in the same column of the previous row).
        used_data_slice = used_data_slice.diff().shift(-1)
    #hack: drop those snp tickers that screw up the scale
    #p.drop(p.columns[np.where((p > 200).any())[0]], axis=1, inplace=True)

    used_data_slice.fillna(0, inplace=True)
    return used_data_slice
Example no. 9
def eval(tokenizer: Tokenizer, model: GPT2LMHeadModel, dataset: MyDataset,
         args: TrainingArguments):
    model.eval()
    loss = 0
    iterator = build_data_iterator(tokenizer, dataset, args.eval_batch_size,
                                   args.block_size)
    for ids, attention_mask in tqdm(iterator, desc='eval'):
        ids = ids.to(args.device)
        with torch.no_grad():
            loss += model(ids,
                          attention_mask=attention_mask.to(args.device),
                          labels=ids)[0].item()
    model.train()
    return loss / len(iterator)
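The eval loop above simply wraps the data iterator in tqdm and averages a scalar loss under torch.no_grad(). A self-contained sketch of that evaluation pattern with a placeholder linear model and random batches:

import torch
from tqdm import tqdm


def evaluate(model, batches, device="cpu"):
    # average a scalar loss over the batches without tracking gradients
    model.eval()
    total, n = 0.0, 0
    with torch.no_grad():
        for x, y in tqdm(batches, desc="eval"):
            x, y = x.to(device), y.to(device)
            total += torch.nn.functional.mse_loss(model(x), y).item()
            n += 1
    model.train()
    return total / max(n, 1)


model = torch.nn.Linear(4, 1)
batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(5)]
print(evaluate(model, batches))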
Example no. 10
def batch_average(dataloader, **kwargs):
    """Compute the average using streaming batches from a dataloader along a given dimension"""
    avg = None
    sumsizes = 0
    for (i, img) in tqdm(dataloader, 'image avg'):
        sz = img.shape[0]
        avi = img.to('cuda').mean(**kwargs)
        if avg is None:
            avg = avi
        else:
            # add similar-sized numbers using this running average
            avg = avg*(sumsizes/(sumsizes+sz)) + avi*(sz/(sumsizes+sz))
        sumsizes += sz
    return avg
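The running average above weights the old mean by sumsizes/(sumsizes+sz) and the new batch mean by sz/(sumsizes+sz), which reproduces the exact mean over everything seen so far. A quick pure-Python check of that weighting:

def running_mean(batches):
    # same weighting as batch_average above: old mean weighted by n/(n+k),
    # new batch mean weighted by k/(n+k)
    avg, n = None, 0
    for batch in batches:
        k = len(batch)
        m = sum(batch) / k
        avg = m if avg is None else avg * (n / (n + k)) + m * (k / (n + k))
        n += k
    return avg


# matches the plain mean over all values
print(running_mean([[1, 2, 3], [4, 5]]), sum([1, 2, 3, 4, 5]) / 5)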
Example no. 11
def expo(args):
    def filename_fn(args):
        rs = 'N({}, {})'.format(args.radius, args.sigma)
        return rs

    def fpath(fname):
        _fpath = os.path.join(args.output_dir, fname)
        return _fpath

    length = 5 * args.radius
    linspace, data = SyntheticDataset.grid_data(args.num_points, length=length)

    #    loader = dataset[args.dataset](args)
    #    trainData = loader.train
    #    for batch_idx, samples in enumerate(trainData):
    #        data,labels = samples[DatasetType.InD]

    plt.xlim(-1 * length, length)
    plt.ylim(-1 * length, length)

    for scale in tqdm([1, 2, 3, 4]):
        sigma = scale * args.sigma

        scale_args = deepcopy(args)
        scale_args.sigma = sigma
        fname = filename_fn(scale_args)

        checkpoint_dir = os.path.join(args.work_dir, 'checkpoints')
        saver = Saver(checkpoint_dir)  # makes directory if already not present
        payload = saver.load(hash_args(
            scale_args))  #hash_args(scale_args) generates the hex string

        def run_and_save(scale_args):
            export = main(scale_args)  #Model creation??

            payload = export['model']
            saver.save(hash_args(scale_args), payload)
            return payload

        export = payload or run_and_save(scale_args)

        with torch.no_grad():
            scores = inference(export, data)
            np_x = data.cpu().numpy()
            for key in scores:
                score = scores[key].cpu().numpy()
                plot_pcolormesh(np_x, linspace, score)
                score_fname = '{}_{}'.format(fname, key)
                plt.title(score_fname)
                flush_plot(plt, fpath(score_fname) + '.png')
Example no. 12
    def train_loop(self):
        brk = 0
        vali_best_w = -1
        for ep in range(args.epochs):
            pbar = tqdm(total=args.nb_vali_step, desc='training', leave=False)
            try:
                train_v = []
                t0 = time.time()
                for _ in range(args.nb_vali_step):
                    # dict
                    v = self.model.fit()
                    train_v.append(v)
                    pbar.update(1)
            finally:
                pbar.close()
            train_time = time.time() - t0
            train_msg = dict_mean(train_v)

            vali_v, vali_str, vali_loss = self.metric('vali')

            vali_w = np.sum(
                [v * w for v, w in zip(vali_v, self.metric_weights)])

            if vali_w > vali_best_w:
                vali_best_w = vali_w
                self.best_vali = vali_v
                self.model.save(0)
                brk = 0
            else:
                brk += 1
            red = (brk == 0)

            msg = f'#{ep + 1}/{args.epochs} {train_msg}, brk: {brk}, vali: {vali_str}, {vali_loss:.4f}'
            if args.show_test and args.nb_test > 0:
                _, test_str, test_loss = self.metric('test')
                msg = f'{msg}, test: {test_str}'
            vali_time = time.time() - t0 - train_time
            msg = f'{msg}, time: {train_time:.0f}s,{vali_time:.0f}s'

            args.log.log(msg, red=red)

            if ep < args.min_train_epochs:
                brk = 0
            if brk >= args.early_stopping:
                break
        if args.epochs == 0:
            self.model.save(0)

        self.model.restore(0)
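Besides the try/finally that guarantees the progress bar is closed, the loop above implements early stopping with a patience counter (brk) that resets on improvement and during warm-up epochs. A compact sketch of just that patience logic, with made-up validation scores:

def early_stop_demo(val_scores, patience=3, min_epochs=2):
    best, brk = float('-inf'), 0
    for ep, score in enumerate(val_scores):
        if score > best:
            best, brk = score, 0   # improvement: reset the counter
        else:
            brk += 1               # no improvement
        if ep < min_epochs:
            brk = 0                # never stop during the warm-up epochs
        if brk >= patience:
            return ep, best        # stop after `patience` stale epochs
    return len(val_scores) - 1, best


print(early_stop_demo([0.1, 0.3, 0.3, 0.29, 0.28, 0.27, 0.5]))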
Example no. 13
 def build_graph_item_item(self):
     from tqdm import tqdm
     G_forward = [defaultdict(int) for _ in range(args.nb_items)]
     G_backward = [defaultdict(int) for _ in range(args.nb_items)]
     nb_edges = 0
     for u, item_list in tqdm(enumerate(self.user2item_seq),
                              desc='build edges'):
         n = len(item_list)
         for i in range(1, n):
             a, b = item_list[i - 1], item_list[i]
             if a >= 3 and b >= 3:
                 G_forward[a][b] += 1
                 G_backward[b][a] += 1
                 if G_forward[a][b] == args.gnn_min_edge_cnt:
                     nb_edges += 1
     args.update(nb_edges=nb_edges)
     neighbors = [[], []]
     maxn = args.gnn_adj_length
     for item in tqdm(range(args.nb_items), desc='sample neighbors'):
         nxt_forward = self.sample_neighbors(G_forward[item], maxn)
         nxt_backward = self.sample_neighbors(G_backward[item], maxn)
         neighbors[0].append(nxt_forward)
         neighbors[1].append(nxt_backward)
     self.neighbors = neighbors
Example no. 14
 def search(self, queries):
     with torch.no_grad():
         neighbors = []
         num_queries = len(queries)
         for i in utils.tqdm(range(0, num_queries, self.batch_size),
                             desc="searching nearest neighbors"):
             x = queries[i:i + self.batch_size]
             x = torch.stack([self.tensorize_bow(s)
                              for s in x]).to(self.device)
             logits = torch.matmul(self.tensors, x.t()).t()
             idxs = torch.sort(logits, 1, True)[1][:, :self.num_neighbors]
             idxs = idxs.cpu().tolist()
             neighbors.extend([[self.sents[j] for j in idx]
                               for idx in idxs])
         return neighbors
Example no. 15
def affine_matching(I,
                    J,
                    A=None,
                    T=None,
                    affine_steps=100,
                    reg_weightA=1e2,
                    reg_weightT=1e1,
                    learning_rate_A=1e-4,
                    learning_rate_T=1e-2,
                    progress_bar=True):
    """Matching image I to J via affine transform"""
    if A is None:
        A = torch.zeros((I.shape[0], 3, 3), dtype=I.dtype).to(I.device)
    if T is None:
        T = torch.zeros((I.shape[0], 3), dtype=I.dtype).to(I.device)
    J = J.to(I.device)
    losses = []
    I.requires_grad_(False)
    J.requires_grad_(False)
    steps = range(affine_steps)
    eye = torch.eye(3).view(1, 3, 3).type(I.dtype).to(I.device)
    if progress_bar: steps = tqdm(steps)
    for mit in steps:
        A.requires_grad_(True)
        T.requires_grad_(True)
        if A.grad is not None and T.grad is not None:
            A.grad.detach_()
            A.grad.zero_()
            T.grad.detach_()
            T.grad.zero_()
        Idef = lm.affine_interp(I, A + eye, T)
        regtermA = mse_loss(A, torch.zeros_like(A))  # L2 penalty keeping A near zero (A + eye near the identity)
        regtermT = mse_loss(T, torch.zeros_like(T))  # L2 penalty keeping the translation T small
        loss = mse_loss(
            Idef,
            J) + .5 * reg_weightA * regtermA + .5 * reg_weightT * regtermT
        loss.backward()
        loss.detach_()
        with torch.no_grad():
            losses.append(loss)
            #if torch.isnan(losses[-1]).item():
            #print(f"loss is NaN at iter {mit}")
            #break
            #if mit > 0 and losses[-1].item() > losses[-2].item():
            #print(f"loss increased at iter {mit}")
            A.add_(A.grad, alpha=-learning_rate_A)
            T.add_(T.grad, alpha=-learning_rate_T)
    return A.detach(), T.detach(), [l.item() for l in losses]
Example no. 16
def evaluate_model(agent, dataObj, debug=False, batch_size=32):
    print('Evaluating Model')
    start_time = getChiTimeNow()
    epsilon_start = np.nan
    total_profit = 0
    data_length = len(dataObj.trainDF) - 1
    avg_loss_array = []
    agent.reset()

    state = agent.getState(dataObj, 0)

    try:
        for t in tqdm(range(data_length)):
            reward = 0
            # select an action
            action = agent.act(state, is_eval=True)

            open_pnl = calcOpenPnl(agent, dataObj, t + 1)
            day_pnl = evaluateAction(action, agent, dataObj, t + 1)
            reward = open_pnl + day_pnl
            total_profit += reward

            next_state = agent.getState(dataObj, t + 1)

            done = (t == data_length - 1)

            #         agent.memory.append((state, action, reward, next_state, done)) # don't know why this line was here instead of the below
            agent.remember(state, action, reward, next_state, done)

            if len(agent.memory) > batch_size:
                # train every batch_size
                if t % batch_size == 0:
                    loss = agent.evaluate_experience_replay(batch_size)
                    avg_loss_array.append(loss)

            state = next_state
        endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                   'eval', start_time)
    except (KeyboardInterrupt, SystemExit):
        print('KeyboardInterrupt or SystemExit. Ending current episode.')
        endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                            'eval', start_time)
        raise
    except:
        print('Unknown error...Ending current episode.')
        endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                            'eval', start_time)
        raise
Example no. 17
 def yc(self, frac=1):
     pbar = tqdm(desc='read data', total=self.N)
     f = open(f'{data_home}/yc_1_{frac}/data.txt', 'r')
     for line in f:
         pbar.update(1)
         line = line[:-1]
         sid, vid_list_str = line.split()
         vid_list = []
         for vid in vid_list_str.split(','):
             vid, cls, ts = vid.split(':')
             cls = int(cls)  # cls: 0, 1, 2, ...
             ts = int(ts)
             vid_list.append([vid, cls, ts])
         yield vid_list
     f.close()
     pbar.close()
Example no. 18
    def train(self, dataloader):
        self.global_step = 0
        self.model.train(True)
        optimizer = self.optimizer_cls(list(self.trainable_params()))
        self.progress = utils.tqdm(total=len(dataloader.dataset),
                                   disable=not self.show_progress)
        if self.kld_annealing is not None:
            kld_scale = 0.0
        else:
            kld_scale = 1.0

        for eidx in range(1, self.epochs + 1):
            self.local_step = 0
            stats_cum = collections.defaultdict(float)
            for batch in dataloader:
                optimizer.zero_grad()
                batch_size, x, lens, targets = self.prepare_batch(batch)
                self.global_step += batch_size
                self.local_step += batch_size
                self.progress.update(batch_size)
                ret = self.model(x, lens)
                logits, loss_kld = ret.get("pass"), ret.get("loss")
                loss = self.calculate_celoss(logits, targets)
                if loss_kld is not None:
                    loss += kld_scale * loss_kld.mean()
                loss.backward()
                optimizer.step()

                stats = {"loss": loss.item()}
                if loss_kld is not None:
                    stats["loss-kld"] = kld_scale * loss_kld.mean().item()
                    stats["kld-anneal"] = kld_scale
                for k, v in stats.items():
                    stats_cum[f"{k}-cum"] += v * batch_size
                desc = self.report_stats(stats)
                self.progress.set_description(desc)
                self.report_samples(batch.get("string"),
                                    logits.max(2)[1], lens)
            stats_cum = {k: v / self.local_step for k, v in stats_cum.items()}
            desc = self.report_stats(stats_cum)
            logging.info(f"[{eidx}] {desc}")
            if self.kld_annealing is not None:
                kld_scale += self.kld_annealing
                kld_scale = min(1.0, kld_scale)

            if eidx % self.save_period == 0:
                self.snapshot(eidx)
Example no. 19
def automatic_image_disambiguation(features,
                                   queries,
                                   select_clusters,
                                   gamma=1.0,
                                   k=200,
                                   n_clusters=None,
                                   max_clusters=10,
                                   show_progress=False):
    """ Automatic Image Disambiguation (our method) based on clustering of directions and directed boni.
    
    features - n-by-d matrix containing d-dimensional features of n samples.
    
    queries - Dictionary mapping query IDs to dictionaries with keys 'relevant' and 'img_id'. 'img_id' gives the ID of the query
              image and 'relevant' points to a list of IDs of images relevant for this query.
    
    select_clusters - Callback function taking a query dictionary with keys 'relevant' and 'img_id' and a list of lists of images
                      for each cluster as arguments and returning a list of indices of selected clusters.
    
    gamma - Controls the effect of the cluster selection. For gamma < 1.0, the direction of samples must match the selected direction
            more exactly for those samples being adjusted, while for very large gamma, even samples in the orthogonal direction will
            be assigned a highly adjusted distance.
    
    k - The number of baseline retrieval results to be used for the initial clustering step.
    
    n_clusters - The number of clusters (image senses) to be shown to the user for selection of the relevant clusters. If set to None,
                 the number of clusters will be determined heuristically.
    
    max_clusters - Maximum number of clusters. Has only an effect if n_clusters is None.
    
    show_progress - If True, a progress bar will be shown (requires tqdm).
    
    Returns: re-ranked retrieval results as dictionary mapping query IDs to tuples consisting of an ordered list of retrieved image IDs
             and a corresponding list of adjusted distances to the query.
    """

    # Baseline retrieval
    retrievals = baseline_retrieval(features, queries, select_clusters)

    ret_it = tqdm(
        retrievals.items(), desc='AID', total=len(retrievals),
        leave=False) if show_progress else retrievals.items()

    with Pool(initializer=_init_pool,
              initargs=(features, queries, select_clusters, gamma, k,
                        n_clusters, max_clusters)) as p:
        return dict(p.imap_unordered(_aid_worker, ret_it, 10))
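Two idioms are combined above: the iterable is wrapped in tqdm only when show_progress is set, and it is then fed to Pool.imap_unordered so results come back in whatever order the workers finish. A self-contained sketch of the same combination; the _work function and the data are invented.

from multiprocessing import Pool

from tqdm import tqdm


def _work(item):  # top-level so it can be pickled by multiprocessing
    key, value = item
    return key, value * value


def run(data, show_progress=False):
    items = data.items()
    if show_progress:  # only wrap when a bar is wanted
        items = tqdm(items, desc='work', total=len(data), leave=False)
    with Pool() as pool:
        # imap_unordered yields results as workers finish, in arbitrary order
        return dict(pool.imap_unordered(_work, items, 10))


if __name__ == '__main__':
    print(run({i: i for i in range(100)}, show_progress=True))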
Example no. 20
 def encode(self, dataloader):
     with torch.no_grad():
         self.model.train(False)
         self.step = 0
         progress = utils.tqdm(
             total=len(dataloader.dataset),
             desc=f"encoding distribution",
         )
         means, stds = [], []
         for batch in dataloader:
             batch_size, (w, l, i, lens) = self.prepare_batch(batch)
             self.step += batch_size
             progress.update(batch_size)
             mean, std = self.model.encode(w, l, i, lens)
             means.append(mean.cpu())
             stds.append(std.cpu())
         progress.close()
         return torch.cat(means, 0), torch.cat(stds, 0)
Example no. 21
    def generate(self, num_samples):
        self.model.train(False)
        z = self.sample_z(num_samples)
        samples = []
        progress = utils.tqdm(total=num_samples, desc="generating")

        for i in range(0, num_samples, self.batch_size):
            z_batch = z[i:i + self.batch_size]
            progress.update(z_batch.size(0))
            x = z.new(z_batch.size(0), 1).fill_(self.bos_idx).long()
            x, lens = self.model.decode(z_batch, x,
                eos_idx=self.eos_idx,
                max_len=self.max_len
            )
            x, lens = x.cpu().tolist(), lens.cpu().tolist()
            for sent, l in zip(x, lens):
                samples.append(self.to_sent(sent[:l]))

        return samples
Example no. 22
def hard_cluster_selection(features,
                           queries,
                           select_clusters,
                           k=200,
                           n_clusters=None,
                           max_clusters=10,
                           show_progress=False):
    """ Hard Cluster Selection as used by CLUE, but on the clusters determined by AID (our method). """

    # Baseline retrieval
    retrievals = baseline_retrieval(features, queries, select_clusters)

    ret_it = tqdm(retrievals.items(),
                  desc='Hard-Select',
                  total=len(retrievals),
                  leave=False) if show_progress else retrievals.items()

    with Pool(initializer=_init_pool,
              initargs=(features, queries, select_clusters, 1.0, k, n_clusters,
                        max_clusters)) as p:
        return dict(p.imap_unordered(_hs_worker, ret_it, 10))
Example no. 23
    def build_graph_user_item(self):
        from tqdm import tqdm
        user2item = [defaultdict(int) for _ in range(args.nb_users)]
        item2user = [defaultdict(int) for _ in range(args.nb_items)]
        for user, item_list in tqdm(enumerate(self.user2item_seq),
                                    desc='build edges'):
            for item in item_list:
                if item >= 3:
                    user2item[user][item] += 1
                    item2user[item][user] += 1

        maxn = args.gnn_adj_length
        user_neighbors = []
        for user in range(args.nb_users):
            items = self.sample_neighbors(user2item[user], maxn)
            user_neighbors.append(items)

        item_neighbors = []
        for item in range(args.nb_items):
            users = self.sample_neighbors(item2user[item], maxn)
            item_neighbors.append(users)

        self.neighbors = [user_neighbors, item_neighbors]
Example no. 24
    def train(self, dataset, device):
        self.model.train()
        # dataset_iter = iter(dataset)
        # np.random.shuffle(dataset)
        N = sum(1 for _ in deepcopy(dataset))
        # print("len of datasets: ", N)
        loss_total = 0
        i = 0
        self.optimizer.zero_grad()
        adjs, atoms, proteins, labels = [], [], [], []
        # TODO: progress bar
        for _ in tqdm(range(N), ascii=True):
            data = next(dataset)
            i = i+1
            atom, adj, protein, label = data
            # TODO: move the tensors to the GPU
            if torch.cuda.is_available():
                atom, adj, protein, label = atom.cuda(), adj.cuda(), protein.cuda(), label.cuda()
            adjs.append(adj)
            atoms.append(atom)
            proteins.append(protein)
            labels.append(label)
            if i % 8 == 0 or i == N:
                data_pack = pack(atoms, adjs, proteins, labels, device)
                loss = self.model(data_pack)
                # loss = loss / self.batch
                loss.backward()
                # torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10)
                adjs, atoms, proteins, labels = [], [], [], []
            else:
                continue
            if i % self.batch == 0 or i == N:
                self.optimizer.step()
                self.optimizer.zero_grad()
            loss_total += loss.item()

        return loss_total
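The trainer above accumulates gradients over several packed mini-batches and only then calls optimizer.step(). A minimal gradient-accumulation sketch with a placeholder linear model; the batch sizes and accumulation interval are arbitrary.

import torch
from tqdm import tqdm

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
data = [(torch.randn(4, 10), torch.randn(4, 1)) for _ in range(32)]

accum_steps = 8
optimizer.zero_grad()
for i, (x, y) in enumerate(tqdm(data, ascii=True), start=1):
    loss = torch.nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()                      # gradients add up across iterations
    if i % accum_steps == 0 or i == len(data):
        optimizer.step()                 # apply the accumulated gradient
        optimizer.zero_grad()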
Example no. 25
    def dump_features(self, name):
        if name == 'vali':
            data = self.data.vali_batch
        elif name == 'test':
            data = self.data.test_batch
        elif name == 'train':
            data = self.data.train_batch
        else:
            raise Exception(f'unknown name: {name}')

        users = []
        items = []
        logits = []

        pbar = tqdm(desc=f'dump {name}, predicting...', leave=False)
        for pv in self.model.predict(data):
            pbar.update(1)
            users.extend(pv.user.tolist())
            _items = pv.top_items.tolist()
            _scores = pv.top_scores.tolist()
            items.extend(_items)
            logits.extend(_scores)

            if args.run_test and pbar.n > 10:
                break

        pbar.close()

        feat = [users, items, logits]

        fn = f'{utils.for_fuse_dir}/{args.msg}_{name}'

        print(f'{utils.get_time_str()} dump file {fn}')
        utils.save_pkl(feat, fn)
        print(f'{utils.get_time_str()} dump file {fn} over')

        return fn
Example no. 26
    def build_adj(self, G, M):
        # M: number of adj per node
        N = args.nb_nodes
        # adj shape: [N, M]
        adj = [None] * N
        adj[0] = [0] * M

        w = [None] * N
        w[0] = [0] * M

        rdm = np.random.RandomState(555)
        pbar = tqdm(total=N - 1, desc='building adj')
        for node in range(1, N):
            pbar.update(1)
            adj_list = G.get_adj(node)
            if len(adj_list) > M:
                adj_list = rdm.choice(adj_list, size=M, replace=False).tolist()
            mask = [0] * (M - len(adj_list))
            adj_list = adj_list[:] + mask
            adj[node] = adj_list
            w_list = [G.edge_cnt.get((node, x), 0) for x in adj_list]
            w[node] = w_list
        pbar.close()
        return [adj, w]
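Each adjacency list above is normalised to a fixed width M: sampled without replacement when it is too long, padded with zeros when it is too short. The core of that step in isolation; the function name and seed are illustrative.

import numpy as np


def pad_or_sample(neighbors, M, seed=555):
    rdm = np.random.RandomState(seed)
    if len(neighbors) > M:
        # too many neighbors: keep a random subset of exactly M
        return rdm.choice(neighbors, size=M, replace=False).tolist()
    # too few: pad with zeros up to width M
    return list(neighbors) + [0] * (M - len(neighbors))


print(pad_or_sample([4, 7, 9], 5))        # padded with zeros
print(pad_or_sample(list(range(10)), 5))  # down-sampled to 5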
Example no. 27
        change_points['c'] = np.random.rand(len(change_points))
    else:
        change_points['c'] = np.repeat(conf.infChangeFrac, len(change_points))

    ranges = np.concatenate(([0], change_points.i))
    ranges = np.concatenate((ranges, [conf.Nsample]))
    print(ranges)

    df = pd.DataFrame()
    model = None
    assign_model = None
    assigns = None
    from utils import tqdm

    #create data according to a certain model until next change point
    for i in tqdm(range(1, len(ranges))):

        model, assign_model, assigns = generate_model(
            change_points.corr_coef[i - 2] if i > 1 else INITAL_CORR_COEF,
            ranges[i - 1] - ranges[i - 2], model, assign_model, assigns,
            change_points.t[i - 2] if i > 1 else '',
            change_points.c[i - 2] if i > 1 else 1)
        dat = generate_data(model, assigns, ranges[i] - ranges[i - 1])

        df = df.append(pd.DataFrame(dat), ignore_index=True)

    start = pd.to_datetime('2000-01-01 00:00')
    df.index = pd.DatetimeIndex(
        [start + pd.to_timedelta(5 * i, unit='m') for i in df.index], name='i')

    change_points['time'] = df.index[change_points['i']]
Example no. 28
 def on_run_started(self, dataloader):
     ret = super(PredictorWithProgress, self).on_run_started(dataloader)
     self.progress = utils.tqdm(total=len(dataloader.dataset),
                                desc="predicting",
                                disable=not self.show_progress)
     return ret
Example no. 29
parser.add_argument('--save_every', '-se', type=int, default=5)
parser.add_argument('--device', '-d', type=str, default=None)
args = parser.parse_args()

if not args.device:
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SiameseNet(mode='train', device=args.device)
datagen = DataLoader(Pairloader(split='train'), shuffle=True)
bce_loss = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(args.epochs):
    epoch_loss = 0.0

    with tqdm(datagen) as t:
        for i, batch in enumerate(t):

            t.set_description('EPOCH: %i'%(epoch+1))

            data1, data2, label = batch[0][0].to(device=args.device), batch[0][1].to(device=args.device), batch[1].to(device=args.device)

            optimizer.zero_grad()
            output = model(data1, data2)
            loss = bce_loss(output, label)
            loss.backward()
            optimizer.step()

            epoch_loss+=loss.item()
            t.set_postfix(loss=epoch_loss/(i+1))
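The loop above uses tqdm as a context manager and refreshes its caption every step: set_description for the epoch label and set_postfix for the running mean loss. The same idiom on a dummy loop with a fake, decreasing loss:

import time
from tqdm import tqdm

for epoch in range(2):
    running = 0.0
    with tqdm(range(50)) as t:
        for i, _ in enumerate(t):
            t.set_description('EPOCH: %i' % (epoch + 1))
            time.sleep(0.01)            # stand-in for a training step
            loss = 1.0 / (i + 1)        # fake, decreasing loss
            running += loss
            t.set_postfix(loss=running / (i + 1))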
Example no. 30
def loadAndProcessData(comb_row,
                       config_dir,
                       day_chg_incs,
                       minute_incs,
                       minute_dir=tdm_dir + 'Minute_Files/'):
    """
    1. Loads the minute files from the external hard drive
    2. Creates, saves, and returns all_minutesDF, sec_guideDF, and dailyDF
    """
    other_secs = comb_row[[
        col for col in comb_row.index if (col[:3] == 'Sec' and col != 'Sec1')
    ]].values
    other_secs = [i for i in other_secs if type(i) == str]
    print('No pre-loaded data found. Loading data for ' + comb_row.Sec1 +
          ' and ' + ','.join(other_secs))
    sec1_minuteDF = readCSV(minute_dir + comb_row.Sec1 + '.csv')
    sec1_minuteDF = sec1_minuteDF.loc[
        (sec1_minuteDF.Date >= comb_row.TrainStartDate)
        & (sec1_minuteDF.Date <= comb_row.ValEndDate)].reset_index(drop=True)

    print('Loading minuteDF for ' + comb_row.Sec2)
    other_secs_minuteDF = readCSV(minute_dir + comb_row.Sec2 + '.csv')[[
        'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A',
        'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]]
    for sec in other_secs[1:]:
        print('Loading minuteDF for ' + sec)
        other_secs_minuteDF = other_secs_minuteDF.append(
            readCSV(minute_dir + sec + '.csv')[[
                'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B',
                'L_A', 'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb',
                'M_TickImb'
            ]],
            ignore_index=True)
    other_secs_minuteDF = other_secs_minuteDF.loc[
        (other_secs_minuteDF.Date >= comb_row.TrainStartDate)
        & (other_secs_minuteDF.Date <= comb_row.ValEndDate)].reset_index(
            drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) +
          ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("readCSVs complete. Subsetting dates...")

    # [x] subset for dates
    dates_in_common = set(sec1_minuteDF.Date.unique())
    for sec in other_secs:
        print(sec)
        other_sec_dates = set(other_secs_minuteDF.loc[
            other_secs_minuteDF.Product == sec].Date.unique())
        print('removing', [
            str(d)
            for d in sorted(list(dates_in_common.difference(other_sec_dates)))
        ])
        dates_in_common = dates_in_common.intersection(other_sec_dates)
        print(len(dates_in_common), 'dates_in_common')
    sec1_dates_to_remove = set(
        sec1_minuteDF.Date.unique()).difference(dates_in_common)
    print(str(len(dates_in_common)) + ' dates_in_common')

    if len(sec1_dates_to_remove) > 0:
        print('- removing ' + str(len(sec1_dates_to_remove)) + ' dates from ' +
              comb_row.Sec1)
        sec1_minuteDF = sec1_minuteDF.loc[sec1_minuteDF.Date.isin(
            dates_in_common)].reset_index(drop=True)
    for sec in other_secs:
        sec_dates_to_remove = set(
            other_secs_minuteDF.loc[other_secs_minuteDF.Product == sec].Date.
            unique()).difference(dates_in_common)
        if len(sec_dates_to_remove) > 0:
            print('- removing ' + str(len(sec_dates_to_remove)) +
                  ' dates from ' + sec)
    other_secs_minuteDF = other_secs_minuteDF.loc[
        other_secs_minuteDF.Date.isin(dates_in_common)].reset_index(drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) +
          ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("Date subset complete. Determining each day's Open/Closes...")

    # [x] determine each day's open and close
    dailyDF = pd.DataFrame(columns=['Date', 'Open', 'Close'])
    dailyDF['Date'] = sec1_minuteDF.Date.unique()
    for i in tqdm(range(len(dailyDF))):
        date = dailyDF.loc[i].Date
        lastOpen = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.min()
        firstClose = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.max()
        other_sec_date_subDF = other_secs_minuteDF.loc[other_secs_minuteDF.Date
                                                       == date]
        for sec in other_secs:
            lastOpen = max(
                lastOpen, other_sec_date_subDF.loc[other_sec_date_subDF.Product
                                                   == sec].Minute.min())
            firstClose = min(
                firstClose, other_sec_date_subDF.loc[
                    other_sec_date_subDF.Product == sec].Minute.max())
        dailyDF.loc[i, 'Open'] = lastOpen
        dailyDF.loc[i, 'Close'] = firstClose
    dailyDF.Open = dailyDF.Open.dt.strftime(date_format='%H:%M')
    dailyDF.Close = dailyDF.Close.dt.strftime(date_format='%H:%M')
    dailyDF.to_csv(config_dir + 'Data/daily_summary.csv', index=False)
    print(
        "Each day's Open/Closes determination complete. Creating all_minutesDF..."
    )

    # [x] create all_minutesDF
    all_minutesDF = pd.DataFrame(columns=['Date', 'Minute'])
    # enumerate minutes
    for i in range(len(dailyDF)):
        open_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
            dailyDF.loc[i].Open,
            format='%Y-%m-%d %H:%M')
        close_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
            dailyDF.loc[i].Close,
            format='%Y-%m-%d %H:%M')
        minute_range = pd.date_range(start=open_dt, end=close_dt, freq='T')
        day_minutesDF = pd.DataFrame({
            'Date': minute_range.date,
            'Minute': minute_range.values
        })
        all_minutesDF = all_minutesDF.append(day_minutesDF, ignore_index=True)

    #populate minute data
    col_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A', 'Count',
        'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]
    first_minute_populate_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A'
    ]
    for sec_num in range(1, len(other_secs) + 2):
        sec_cols = [col_stem + str(sec_num) for col_stem in col_stems]
        for sec_col in sec_cols:
            all_minutesDF[sec_col] = np.nan
    all_minutesDF[[c + '1' for c in col_stems
                   ]] = pd.merge(all_minutesDF[['Minute']],
                                 sec1_minuteDF[['Minute'] + col_stems],
                                 on='Minute',
                                 how='left')[col_stems]
    print('Merging into all_minutesDF...')
    for sec_num in range(2, len(other_secs) + 2):
        other_sec = other_secs[sec_num - 2]
        all_minutesDF[[c + str(sec_num) for c in col_stems]] = pd.merge(
            all_minutesDF[['Minute']],
            other_secs_minuteDF[['Minute'] + col_stems].loc[
                other_secs_minuteDF.Product == other_sec],
            on='Minute',
            how='left')[col_stems]
    print('Getting the first datapoint of each day...')

    #get first datapoint of each day
    for i in tqdm(range(len(dailyDF))):
        date = dailyDF.loc[i].Date
        if date in dates_in_common:
            open_dt = pd.to_datetime(
                dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
                dailyDF.loc[i].Open,
                format='%Y-%m-%d %H:%M')
            sec1_last_row = sec1_minuteDF.loc[(sec1_minuteDF.Date == date) & (
                sec1_minuteDF.Minute <= open_dt)].iloc[-1]

            if sec1_last_row.Minute < open_dt:
                if (open_dt - sec1_last_row.Minute).seconds / 60 > 20:
                    raise ValueError(
                        'Too much time has elapsed. ' + comb_row.Sec1 +
                        ' open quote is stale at ' +
                        open_dt.strftime(format='%Y-%m-%d %H:%M') + ' by ' +
                        str((open_dt - sec1_last_row.Minute).seconds / 60) +
                        ' minutes.')
                else:
                    all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                      [c + '1' for c in col_stems]] = 0
                    all_minutesDF.loc[
                        all_minutesDF.Minute == open_dt,
                        [c + '1' for c in first_minute_populate_stems
                         ]] = sec1_last_row[first_minute_populate_stems]

            other_secs_subDF = other_secs_minuteDF.loc[
                (other_secs_minuteDF.Date == date)
                & (other_secs_minuteDF.Minute <= open_dt)]
            for sec_num in range(2, len(other_secs) + 2):
                other_sec = other_secs[sec_num - 2]
                other_sec_last_row = other_secs_subDF.loc[
                    other_secs_subDF.Product == other_sec].iloc[-1]
                if other_sec_last_row.Minute < open_dt:
                    if (open_dt - other_sec_last_row.Minute).seconds / 60 > 20:
                        raise ValueError(
                            "Too much time has elapsed. " + other_sec +
                            " open quote is stale at " +
                            open_dt.strftime(format='%Y-%m-%d %H:%M') + ' by ' +
                            str((open_dt - other_sec_last_row.Minute).seconds /
                                60) + ' minutes.')
                    else:
                        all_minutesDF.loc[
                            all_minutesDF.Minute == open_dt,
                            [c + str(sec_num) for c in col_stems]] = 0
                        all_minutesDF.loc[all_minutesDF.Minute == open_dt, [
                            c + str(sec_num)
                            for c in first_minute_populate_stems
                        ]] = other_sec_last_row[first_minute_populate_stems]

    print('Saving all_minutesDF...')
    all_minutesDF.to_csv(config_dir + 'Data/all_minutes.csv', index=False)
    print('Save complete.')
    sec_guideDF = pd.DataFrame({'Sec': [comb_row.Sec1] + list(other_secs)})
    sec_guideDF.to_csv(config_dir + 'Data/sec_guide.csv', index=False)
    return all_minutesDF, dailyDF, sec_guideDF