def run_all_in_one_experiment(query_matcher, workers=1):
    """Run ``save_all_in_one`` once per perplexity value of a t-SNE sweep.

    One parameter tuple is built for each of the 100 values in
    ``np.linspace(1, 101, 100)``; iterations and learning rate are drawn from
    single-element lists, so they are effectively fixed. The last element of
    every tuple is the output PNG path.

    Args:
        query_matcher: Passed through as the first element of every parameter
            tuple. It must be hashable because the tuples are de-duplicated
            with ``set``.
        workers: ``1`` runs the jobs serially in this process and skips jobs
            whose output file already exists; a value above ``1`` dispatches
            them to a ``multiprocessing`` pool of that size. Other values do
            nothing.
    """
    perplexity_range = np.linspace(1, 101, 100)
    iterations_range = [int(1e6)]
    learning_rates_range = [1]

    combinations = []
    for idx in perplexity_range:
        # NOTE(review): `idx` is the perplexity value itself, not a counter.
        pr = idx
        ir = np.random.choice(iterations_range)
        lrr = np.random.choice(learning_rates_range)
        identifier = constuct_identifier("tsne", pr, ir, lrr)
        file_name = f"trash/tsne_full_5_top15_fine_grained/{identifier}.png"
        # NOTE(review): the meaning of the positional flags (False, 15, ...)
        # is defined by save_all_in_one, which is not visible here.
        combinations.append((query_matcher, idx, pr, ir, lrr, False, 15, False,
                             False, False, file_name))

    # De-duplicate once so the progress-bar total matches what is iterated.
    unique_combinations = set(combinations)

    if workers == 1:
        for params in tqdm(unique_combinations,
                           total=len(unique_combinations)):
            if os.path.exists(params[-1]):
                continue
            save_all_in_one(params)
    elif workers > 1:
        # NOTE(review): unlike the serial path, existing output files are not
        # skipped here; confirm whether that asymmetry is intended.
        # The context manager tears the pool down once all results have been
        # consumed, so worker processes are not leaked.
        with mp.Pool(workers) as pool:
            results = pool.imap(
                save_all_in_one,
                tqdm(unique_combinations, total=len(unique_combinations)))
            _ = list(results)
Exemple #2
0
def pretrain(model, dataset, device, run_id, args):
    """Pre-train ``model`` on ``dataset`` with a ``SoftLogitLoss`` criterion.

    Args:
        model: Network called as ``model(x[:, 0], x[:, 1])`` on every batch.
        dataset: Dataset yielding ``(x, y)`` pairs.
        device: CUDA device that the criterion and the batches are moved to.
        run_id: Accepted for interface compatibility; not used here.
        args: Namespace providing ``optimizer``, ``lr``, ``wd``, ``momentum``,
            ``batch_size``, ``num_workers`` and ``pretrain_epochs``.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd, momentum=args.momentum)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd, betas=(0.9, 0.98), eps=1e-09,
                               amsgrad=True)
    else:
        raise ValueError('Invalid optimizer!')

    criterion = SoftLogitLoss().cuda(device)
    data_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers,
                             shuffle=True, pin_memory=True, drop_last=True)

    model.train()
    for epoch in range(args.pretrain_epochs):
        losses = []
        # adjust_learning_rate(optimizer, args.lr, epoch, args.pretrain_epochs, args)
        with tqdm(data_loader, desc=f'EPOCH [{epoch + 1}/{args.pretrain_epochs}]') as progress_bar:
            for x, y in progress_bar:
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)

                out = model(x[:, 0], x[:, 1])
                loss = criterion(out, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())

                # No accuracy is computed in this loop. The former 'Acc' entry
                # was np.mean([]) == nan (plus a RuntimeWarning), so only the
                # running mean loss is reported.
                progress_bar.set_postfix({'Loss': np.mean(losses)})
Exemple #3
0
def main():
    """Browse the BIWI / 300W-LP samples with their pose angles drawn on top.

    Every sample is shown in an OpenCV window; any key advances to the next
    sample and ``q`` stops the loop.
    """
    data_dir = "/media/omnisky/D4T/huli/work/headpose/data"
    file_name = "/media/omnisky/D4T/huli/work/headpose/data/file_name_biwi_300w_lp_no_mask20210212.txt"
    data = BIWI_Pose_300W_LP(data_dir, file_name, transform=None)

    font = cv2.FONT_HERSHEY_SIMPLEX
    text_colour = (255, 255, 0)

    for sample_idx in tqdm(range(len(data))):
        sample = data.__getitem__(sample_idx)
        _imgs, _labels, val_const_labels, _names, orial_img, _box = sample

        # The dataset image is converted from RGB to OpenCV's BGR order.
        img = cv2.cvtColor(np.array(orial_img), cv2.COLOR_RGB2BGR)

        # NOTE(review): tdx is derived from shape[0] and tdy from shape[1];
        # for non-square images confirm this matches what draw_axis expects.
        utils.draw_axis(img,
                        val_const_labels[0],
                        val_const_labels[1],
                        val_const_labels[2],
                        tdx=(img.shape[0]) // 2,
                        tdy=(img.shape[1]) // 2,
                        size=50)

        captions = (
            "yan:{:.3f}".format(val_const_labels[0]),
            "pitch:{:.3f}".format(val_const_labels[1]),
            "roll:{:.3f}".format(val_const_labels[2]),
        )
        # One caption per text row, 20 px apart, starting at y=20.
        for row, caption in enumerate(captions, start=1):
            cv2.putText(img, caption, (0, 20 * row), font, 0.5, text_colour, 2)

        cv2.imshow("qew", img)
        if cv2.waitKey(0) == ord('q'):
            break
 def run(self) -> None:
     """Extract bigram keywords for every queued document and store them.

     Batches are pulled from ``self.provider`` until an empty batch is
     returned. Each document's summary, page text and processed text are
     passed to KeyBERT in a single call, and the three keyword lists are
     queued as a ``$set`` update. Documents whose extraction raises are
     printed and skipped.
     """
     extractor = KeyBERT("xlm-r-distilroberta-base-paraphrase-v1")
     english_stop_words = stopwords.words("english")
     while True:
         batch = self.provider.get_records()
         if len(batch) == 0:
             break
         bulk_op = websites_db.initialize_unordered_bulk_op()
         for record in tqdm(batch, desc="thread", leave=False):
             page_text = record["page_text"].replace("\n", " ").strip()
             summary = record["xl_summary"]
             processed_text = " ".join(record["processed_text"])
             record_id = record["_id"]
             try:
                 keyword_lists = extractor.extract_keywords(
                     [summary, page_text, processed_text],
                     keyphrase_ngram_range=(2, 2),
                     stop_words=english_stop_words)
                 # Exactly three results are expected, one per input text.
                 summary_keywords, text_keywords, processed_keywords = keyword_lists
             except Exception as error:
                 print(error)
                 continue
             new_fields = {
                 "summary_keywords": summary_keywords,
                 "text_keywords": text_keywords,
                 "processed_keywords": processed_keywords
             }
             bulk_op.find({"_id": record_id}).update_one({"$set": new_fields})
         bulk_op.execute()
Exemple #5
0
    def run(self) -> None:
        """Encode each document's processed text as a mean FastText vector.

        Batches are pulled from ``self.provider`` until an empty batch is
        returned. For every document the word vectors of its
        ``processed_text`` tokens are averaged and queued as a ``$set``
        update together with a version marker. Documents that fail are
        printed and skipped.
        """
        wiki_ft_model = FastText.load("./data/fasttext_300.model")
        while True:
            urls = self.provider.get_records()
            if len(urls) == 0:
                break
            bulk = websites_db.initialize_unordered_bulk_op()
            for document in tqdm(urls, desc="thread", leave=False):
                # Reset per document: if the lookup below raises, the handler
                # must not hit an unbound name or print the previous
                # document's text.
                processed_text = None
                try:
                    processed_text = document["processed_text"]
                    doc_id = document["_id"]
                    encoded_processed_text = np.mean(
                        [wiki_ft_model.wv[vec] for vec in processed_text],
                        axis=0)
                    # NOTE(review): for a single token the mean is wrapped in
                    # a list before the float conversion below; confirm this
                    # branch is still needed.
                    if len(processed_text) == 1:
                        encoded_processed_text = [encoded_processed_text]

                    encoded = [float(x) for x in encoded_processed_text]
                    bulk.find({
                        "_id": doc_id
                    }).update_one({
                        "$set": {
                            "encoded_processed_text": encoded,
                            "encoded_processed_text_version": 1
                        }
                    })
                except Exception as ex:
                    print(ex, processed_text)
            bulk.execute()
Exemple #6
0
def show_valid():
    """Browse the AFLW2000 validation samples with pose angles drawn on top.

    Every sample is shown in an OpenCV window; any key advances to the next
    sample and ``q`` stops the loop.
    """
    val_data_dir = "data/AFLW2000/"
    val_filename_list = "data/AFLW2000/filename_list.txt"
    valid_pose_dataset = AFLW2000(val_data_dir, val_filename_list, None)

    font = cv2.FONT_HERSHEY_SIMPLEX
    text_colour = (255, 255, 0)

    for sample_idx in tqdm(range(len(valid_pose_dataset))):
        sample = valid_pose_dataset.__getitem__(sample_idx)
        img, _labels, cont_labels, _fil_name = sample

        # The dataset image is converted from RGB to OpenCV's BGR order.
        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

        # NOTE(review): tdx is derived from shape[0] and tdy from shape[1];
        # for non-square images confirm this matches what draw_axis expects.
        utils.draw_axis(img,
                        cont_labels[0],
                        cont_labels[1],
                        cont_labels[2],
                        tdx=(img.shape[0]) // 2,
                        tdy=(img.shape[1]) // 2,
                        size=50)

        captions = (
            "yan:{:.3f}".format(cont_labels[0]),
            "pitch:{:.3f}".format(cont_labels[1]),
            "roll:{:.3f}".format(cont_labels[2]),
        )
        # One caption per text row, 20 px apart, starting at y=20.
        for row, caption in enumerate(captions, start=1):
            cv2.putText(img, caption, (0, 20 * row), font, 0.5, text_colour, 2)

        cv2.imshow("qew", img)
        if cv2.waitKey(0) == ord('q'):
            break
Exemple #7
0
def evaluate(classifier, dataset, device, args):
    """Run ``classifier`` over ``dataset`` and collect scores and targets.

    Args:
        classifier: Model called as ``classifier(x1, x2)``.
        dataset: Dataset yielding 4-tuples; items 0 and 2 are the two inputs,
            item 1 is the target and item 3 is ignored.
        device: CUDA device the inputs are moved to.
        args: Namespace providing ``batch_size``, ``num_workers`` and
            ``num_seq``.

    Returns:
        ``(scores, targets)`` as numpy arrays concatenated over all batches.
        Scores are reshaped to ``batch_size * num_seq`` rows per batch and
        targets are flattened.
    """
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        num_workers=args.num_workers,
                        shuffle=True,
                        pin_memory=True,
                        drop_last=True)

    score_chunks, target_chunks = [], []

    classifier.eval()
    with torch.no_grad():
        for first_view, labels, second_view, _ in tqdm(loader, desc='EVAL'):
            first_view = first_view.cuda(device, non_blocking=True)
            second_view = second_view.cuda(device, non_blocking=True)

            logits = classifier(first_view, second_view)
            flat_logits = logits.view(args.batch_size * args.num_seq, -1)
            score_chunks.append(flat_logits.cpu().numpy())
            # Targets are never moved to the GPU, so no .cpu() is needed.
            target_chunks.append(labels.view(-1).numpy())

    all_scores = np.concatenate(score_chunks, axis=0)
    all_targets = np.concatenate(target_chunks, axis=0)
    return all_scores, all_targets
Exemple #8
0
def pretrain(run_id, model, dataset, device, args):
    """Pre-train ``model``, which returns its own loss from ``model(x1, x2)``.

    Args:
        run_id: Accepted for interface compatibility; not used here.
        model: Network whose forward pass on two inputs returns the loss.
        dataset: Dataset yielding 4-tuples; items 0 and 2 are the two inputs,
            the other two are ignored.
        device: CUDA device the inputs are moved to.
        args: Namespace providing ``optimizer``, ``lr``, ``wd``, ``momentum``,
            ``use_dist``, ``batch_size``, ``num_workers`` and
            ``pretrain_epochs``.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wd,
                              momentum=args.momentum)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd,
                               betas=(0.9, 0.98),
                               eps=1e-09,
                               amsgrad=True)
    else:
        raise ValueError('Invalid optimizer!')

    # With a distributed sampler the sampler does the shuffling, so the
    # loader's own shuffle must be off; otherwise the loader shuffles itself.
    sampler = DistributedSampler(dataset, shuffle=True) if args.use_dist else None
    data_loader = DataLoader(dataset,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             shuffle=(sampler is None),
                             pin_memory=True,
                             drop_last=True,
                             sampler=sampler)

    model.train()
    for epoch in range(args.pretrain_epochs):
        losses = []
        if args.use_dist:
            # Gives the distributed sampler a different ordering every epoch.
            data_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, args.lr, epoch, args.pretrain_epochs,
                             args)
        with tqdm(data_loader,
                  desc=f'EPOCH [{epoch + 1}/{args.pretrain_epochs}]'
                  ) as progress_bar:
            for x1, _, x2, __ in progress_bar:
                x1 = x1.cuda(device, non_blocking=True)
                x2 = x2.cuda(device, non_blocking=True)

                loss = model(x1, x2)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())

                # No accuracy is computed in this loop. The former 'Acc' entry
                # was np.mean([]) == nan (plus a RuntimeWarning), so only the
                # running mean loss is reported.
                progress_bar.set_postfix({'Loss': np.mean(losses)})
Exemple #9
0
    def metric_F1(k_all_db_result, all_class_counts_list, k=10, use_class_k=True, weightbool=False, weight=1, silent=False):
        """Add F-score, precision and recall columns to a query-result frame.

        Each row of ``k_all_db_result`` is unpacked into
        ``(name, class_label, ids, distance_values, clabels)`` and truncated
        to the first ``use_k`` results before being handed to
        ``Evaluator.confusion_matrix_vals``.

        Args:
            k_all_db_result: DataFrame of query results; modified in place.
            all_class_counts_list: Mapping from class label to class size;
                used as the per-row cutoff when ``use_class_k`` is true.
            k: Fixed cutoff used when ``use_class_k`` is false.
            use_class_k: Choose the cutoff per class instead of using ``k``.
            weightbool: Use the weighted F-score formula with ``weight``.
            weight: Weight applied to precision in the weighted formula.
            silent: Suppress the progress bar.

        Returns:
            The same DataFrame with ``F1score``, ``F1precision`` and
            ``F1recall`` columns set.
        """
        def confusion_for_row(name, class_label, ids, distance_values, clabels):
            cutoff = k
            if use_class_k:
                cutoff = all_class_counts_list.get(class_label)
            return Evaluator.confusion_matrix_vals(name, class_label, ids[:cutoff], distance_values[:cutoff], clabels[:cutoff], all_class_counts_list)

        rows = k_all_db_result.to_numpy()
        if not silent:
            rows = tqdm(rows)
        cm_vals = np.array([confusion_for_row(*row) for row in rows])
        TP, FP, TN, FN = (cm_vals[:, column] for column in range(4))

        # precision = proportion of returned class from all returned items
        # recall = proportion of returned class from all class members in database
        # Zero denominators yield nan/inf, which nan_to_num then replaces.
        with np.errstate(divide='ignore', invalid='ignore'):
            precision = np.nan_to_num(TP / (TP + FP))
            recall = np.nan_to_num(TP / (TP + FN))
            if weightbool:
                F1scores = np.nan_to_num((1 + np.square(weight)) * ((precision * recall) / ((np.square(weight) * precision) + recall)))
            else:
                F1scores = np.nan_to_num(2 * ((precision * recall) / (precision + recall)))

        k_all_db_result['F1score'] = F1scores
        k_all_db_result['F1precision'] = precision
        k_all_db_result['F1recall'] = recall
        return k_all_db_result
def process_mit_arrhythmia(data_path):
    """Load every record under ``data_path`` and print a summary of each.

    A record is any base name that has a ``.dat`` file in ``data_path``. For
    each record the signal and its ``atr`` annotation are read, both channels
    are run through ``ecg.ecg`` and the first channel's shape, the annotation
    samples/symbols and the detected R-peaks are printed.

    Args:
        data_path: Directory containing the record files.
    """
    record_ids = [
        file_name.split('.')[0]
        for file_name in os.listdir(data_path)
        if file_name.endswith('.dat')
    ]

    for idx in tqdm(record_ids):
        record_path = os.path.join(data_path, idx)
        record = wfdb.rdrecord(record_path)
        annotation = wfdb.rdann(record_path, 'atr')

        signal_ch1 = record.p_signal[:, 0]
        # The second channel is column 1; reading column 0 again made both
        # "channels" identical.
        signal_ch2 = record.p_signal[:, 1]

        ecg_ch1 = ecg.ecg(signal=signal_ch1,
                          sampling_rate=record.fs,
                          show=False)
        ecg_ch2 = ecg.ecg(signal=signal_ch2,
                          sampling_rate=record.fs,
                          show=False)

        # Smooth signals (not consumed yet by the code below).
        signal_smoothed_ch1 = ecg_ch1['filtered']
        signal_smoothed_ch2 = ecg_ch2['filtered']

        # Reading r-peaks
        r_peaks = ecg_ch1['rpeaks']

        # Reading annotations. `symbol` and `sample` are labels and values respectively.
        ann_symbol = annotation.symbol
        ann_sample = annotation.sample

        print(signal_ch1.shape, ann_sample, ann_symbol, r_peaks)
Exemple #11
0
def finetune(classifier, dataset, device, args):
    """Fine-tune ``classifier`` on a random subset of ``dataset``.

    ``args.finetune_mode`` selects the parameter groups:

    * ``'freeze'``: parameters whose name contains ``encoder`` or ``agg``
      get ``requires_grad = False``; only the rest are optimised.
    * ``'smaller'``: all parameters are optimised, but those ``encoder`` /
      ``agg`` parameters use ``args.lr / 10``.
    * anything else: all parameters are optimised at the default rate.

    A fraction ``args.finetune_ratio`` of the dataset indices is drawn once
    and reused for every epoch through a ``SubsetRandomSampler``.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    def is_backbone(param_name):
        return 'encoder' in param_name or 'agg' in param_name

    mode = args.finetune_mode
    param_groups = []
    if mode == 'freeze':
        print('[INFO] Finetune classifier only for the last layer...')
        for param_name, weight in classifier.named_parameters():
            if is_backbone(param_name):
                weight.requires_grad = False
            else:
                param_groups.append({'params': weight})
    elif mode == 'smaller':
        print('[INFO] Finetune the whole classifier where the backbone have a smaller lr...')
        for param_name, weight in classifier.named_parameters():
            group = {'params': weight}
            if is_backbone(param_name):
                group['lr'] = args.lr / 10
            param_groups.append(group)
    else:
        print('[INFO] Finetune the whole classifier...')
        param_groups.extend(
            {'params': weight} for _, weight in classifier.named_parameters())

    optimizer_name = args.optimizer
    if optimizer_name == 'sgd':
        optimizer = optim.SGD(param_groups, lr=args.lr, weight_decay=args.wd,
                              momentum=args.momentum)
    elif optimizer_name == 'adam':
        optimizer = optim.Adam(param_groups, lr=args.lr, weight_decay=args.wd,
                               betas=(0.9, 0.98), eps=1e-09, amsgrad=True)
    else:
        raise ValueError('Invalid optimizer!')

    criterion = nn.CrossEntropyLoss().cuda(device)

    # Draw the fine-tuning subset once: shuffle all indices, keep a prefix.
    sampled_indices = np.arange(len(dataset))
    np.random.shuffle(sampled_indices)
    subset_size = int(len(sampled_indices) * args.finetune_ratio)
    sampled_indices = sampled_indices[:subset_size]
    data_loader = DataLoader(dataset,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             shuffle=False,
                             pin_memory=True,
                             drop_last=True,
                             sampler=SubsetRandomSampler(sampled_indices))

    classifier.train()
    for epoch in range(args.finetune_epochs):
        losses, accuracies = [], []
        epoch_label = f'EPOCH [{epoch + 1}/{args.finetune_epochs}]'
        with tqdm(data_loader, desc=epoch_label) as progress_bar:
            for x, y in progress_bar:
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)

                # Merge the first two axes of x so each item is one sample.
                flat_x = x.view(x.shape[0] * x.shape[1], *x.shape[2:])
                flat_y = y.view(-1)
                out = classifier(flat_x)
                loss = criterion(out, flat_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())
                accuracies.append(logits_accuracy(out, flat_y, topk=(1,))[0])

                progress_bar.set_postfix({'Loss': np.mean(losses), 'Acc': np.mean(accuracies)})
Exemple #12
0
    def init(self, desc, total):
        """Create this object's progress bar and reset its bookkeeping.

        The bar is drawn at this object's position with the colour stored
        for that position. When a description is given and the position is
        falsy, the description is also passed to the logger's ``wait``.
        """
        slot = self.__pos
        bar_options = dict(desc=desc,
                           leave=False,
                           total=total,
                           colour=self.__COLOR_LIST[slot],
                           dynamic_ncols=True,
                           position=slot)
        self.__tq = tqdm(**bar_options)

        announce = desc and not self.__pos
        if announce:
            self.__logger.wait(desc)

        self.sub_bars = []
        self.prog = 0
Exemple #13
0
def find_loop_number(public_key: int, max_loops=100000000) -> int:
    """Return how many ``loop`` applications turn 1 into ``public_key``.

    Starting from 1, ``loop(value, SUBJECT_NUMBER)`` is applied repeatedly.
    The 1-based count of the first application that yields ``public_key`` is
    returned. ``None`` is returned when no match occurs within
    ``max_loops - 1`` applications.
    """
    value = 1
    for loop_no in tqdm(range(1, max_loops)):
        value = loop(value, SUBJECT_NUMBER)
        if value != public_key:
            continue
        return loop_no

    # Search space exhausted without a match.
    return None
Exemple #14
0
    def __init__(self, data_path, num_seq, subject_list: List, label_dim=0, modal='eeg', transform=None):
        """Load the selected subjects' recordings into memory.

        Every file under ``data_path`` is treated as one subject's ``.mat``
        file with ``'data'`` and ``'labels'`` entries. The signals are cut
        into windows of ``self.sampling_rate`` samples and optionally grouped
        into sequences of ``num_seq`` windows. The results are stored in
        ``self.data`` and ``self.labels``.

        Args:
            data_path: Directory holding one file per subject. It must
                contain exactly ``self.num_subject`` entries.
            num_seq: Number of windows per sequence; ``0`` keeps single
                windows.
            subject_list: Indices into the sorted file list selecting which
                subjects to load.
            label_dim: Stored on the instance; not used in this method.
            modal: Channel group to keep: ``'eeg'``, ``'eog'`` or ``'emg'``.
            transform: Stored on the instance; not used in this method.
        """
        self.label_dim = label_dim
        self.transform = transform

        assert modal in ['eeg', 'emg', 'eog']

        # Sorted so that subject_list indices refer to a stable file order.
        files = sorted(os.listdir(data_path))
        # num_subject (and sampling_rate below) are not set in this method;
        # presumably class attributes -- TODO confirm.
        assert len(files) == self.num_subject
        files = [files[i] for i in subject_list]

        all_data = []
        all_labels = []
        for a_file in tqdm(files):
            data = sio.loadmat(os.path.join(data_path, a_file))
            subject_data = data['data']  # trial x channel x data
            subject_label = data['labels']  # trial x label (valence, arousal, dominance, liking)
            # subject_data = tensor_standardize(subject_data, dim=-1)

            # Keep only the channel range belonging to the chosen modality.
            if modal == 'eeg':
                subject_data = subject_data[:, :32, :]
            elif modal == 'eog':
                subject_data = subject_data[:, 32: 36, :]
            elif modal == 'emg':
                subject_data = subject_data[:, 36:, :]
            else:
                # Only reachable when the assert above is stripped (python -O).
                raise ValueError

            # Split the time axis into windows of sampling_rate samples each.
            subject_data = subject_data.reshape(*subject_data.shape[:2], subject_data.shape[-1] // self.sampling_rate,
                                                self.sampling_rate)  # (trial, channel, num_sec, time_len)
            subject_data = np.swapaxes(subject_data, 1, 2)  # (trial, num_sec, channel, time_len)

            if num_seq == 0:
                # Insert a length-1 sequence axis so both branches have the same rank.
                subject_data = np.expand_dims(subject_data, axis=2)
            else:
                # Drop trailing windows that do not fill a whole sequence,
                # then group the windows into sequences of num_seq.
                if subject_data.shape[1] % num_seq != 0:
                    subject_data = subject_data[:, :subject_data.shape[1] // num_seq * num_seq]
                subject_data = subject_data.reshape(subject_data.shape[0], subject_data.shape[1] // num_seq, num_seq,
                                                    *subject_data.shape[-2:])

            # Repeat each trial's label along axes 1 and 2 of the data so
            # that every window carries its trial's label.
            subject_label = np.repeat(np.expand_dims(subject_label, axis=1), subject_data.shape[1], axis=1)
            subject_label = np.repeat(np.expand_dims(subject_label, axis=2), subject_data.shape[2], axis=2)

            # Merge the first two axes so that axis 0 indexes samples.
            subject_data = subject_data.reshape(subject_data.shape[0] * subject_data.shape[1], *subject_data.shape[2:])
            subject_label = subject_label.reshape(subject_label.shape[0] * subject_label.shape[1],
                                                  *subject_label.shape[2:])

            all_data.append(subject_data)
            all_labels.append(subject_label)
        all_data = np.concatenate(all_data, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)

        if num_seq == 0:
            # Removes the length-1 sequence axis added above. np.squeeze also
            # removes any other axis of size 1.
            all_data = np.squeeze(all_data)
            # all_labels = np.squeeze(all_labels)

        self.data = all_data
        self.labels = all_labels
    def download(self):
        """Convert every ``time_step``-th graph snapshot to a pickled tensor.

        The time steps are taken from element 1 of the graph manager's
        ``get_time_steps()`` result, sorted, and subsampled with a stride of
        ``self.time_step``. Each selected step is exported into
        ``self.raw_dir``.
        """
        manager = self.graph_manager
        ordered_steps = sorted(self.graph_manager.get_time_steps()[1])
        selected_steps = ordered_steps[::self.time_step]

        for step in tqdm(selected_steps):
            manager.graph_to_torch_tensor(step,
                                          self.attrs_dict,
                                          self.raw_dir,
                                          to_pickle=True)
def main():
    """Partition a network with a label-encoding genetic algorithm.

    Usage: ``<script> <network> <num components>``. The optimiser runs for
    200 steps while the best cost and the population diversity of each step
    are recorded. The best encoding seen overall is then turned into a set of
    edges to remove. Diversity and cost curves are plotted and the
    partitioned network is visualised.
    """
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <network> <num components>')
        return

    M, layout = old_read_network_file(sys.argv[1])
    G = nx.Graph(M)
    rand = np.random.default_rng(0)  # fixed seed

    # Earlier experiments (kept for reference) used PartitioningObjective with
    # NextEdgesToRm, and ChakrabortySatoObjective with NextChakrabortySatoGen,
    # in place of the label-based objective below.
    n_comps = int(sys.argv[2])
    n_labels = n_comps
    print(f'Searching for {n_comps} components.')
    objective = LabelObjective(G, n_comps)
    next_generation = NextLabelGen(n_labels, rand)
    initial_population = new_label_pop(rand, len(G), 50, n_labels)
    optimizer = ga.GAOptimizer(objective, next_generation, initial_population,
                               True, 2)

    n_steps = 200
    costs = np.zeros(n_steps)
    diversities = np.zeros(n_steps)
    global_best: Tuple = None  # type: ignore
    pbar = tqdm(range(n_steps))
    for step in pbar:
        population = optimizer.step()
        step_best = min(population, key=lambda pair: pair[0])
        if global_best is None or step_best[0] < global_best[0]:
            global_best = step_best
        costs[step] = step_best[0]
        # Diversity = share of distinct encodings in the population.
        distinct_encodings = {tuple(pair[1]) for pair in population}
        diversities[step] = len(distinct_encodings) / len(population)
        pbar.set_description('Cost: {:.3f}'.format(step_best[0]))

    to_remove = partitioning.label_partition(G, global_best[1])
    partitioned = nx.Graph(G)
    partitioned.remove_edges_from(to_remove)
    print('Cost:', global_best[0])

    plt.title('Diversity')
    plt.plot(diversities)
    plt.figure()
    plt.title('Cost')
    plt.plot(costs)
    plt.figure()
    visualize_network(partitioned, layout, 'Partitioned via Label GA')
 def run(self) -> None:
     """Run the text pipeline over every queued document's page text.

     Batches are pulled from ``self.provider`` until an empty batch is
     returned. The pipeline output of each document is stored as
     ``processed_text`` together with a version marker.
     """
     text_pipeline = Pipeline(stopwords=set(stopwords.words("english")))
     while True:
         batch = self.provider.get_records()
         if len(batch) == 0:
             break
         bulk_op = websites_db.initialize_unordered_bulk_op()
         for record in tqdm(batch, desc="thread", leave=False):
             raw_text = record["page_text"]
             record_id = record["_id"]
             tokens = list(text_pipeline.pipe(raw_text))
             new_fields = {
                 "processed_text": tokens,
                 "processed_text_version": 3
             }
             bulk_op.find({"_id": record_id}).update_one({"$set": new_fields})
         bulk_op.execute()
Exemple #18
0
def predict_vol_from_np(net, device, nparray, threshold=True, p_threshold=0.5):
    """ Predicts a full-volume segmentation, slice by slice, for a volume
    given as an array.

    @params:
    net : pytorch convnet model; must expose ``n_classes``.
    device : pytorch device the slices are moved to for the forward pass.
    nparray : 3-D volume indexed as [H, W, Z]; each ``[:, :, idx]`` slice is
        fed to the net as a float32 tensor of shape (1, 1, H, W).
    threshold : whether or not to threshold the output.
    p_threshold : probability above which prediction is considered True.

    @return:
    pred_volume : float numpy array w/ shape: [n_classes, H, W, Z]. Holds
        probabilities, or 0.0/1.0 values when ``threshold`` is set.
    """
    net.eval()
    volume = nparray
    vol_shape = volume.shape
    n_cts = volume.shape[-1]

    pred_volume = torch.empty(net.n_classes, vol_shape[0], vol_shape[1],
                              vol_shape[2])

    with tqdm(
            total=n_cts,  # progress bar
            desc='Predicting Volume',
            unit='scans',
            ascii=True,
            leave=False,
            bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}') as pbar:

        with torch.no_grad():
            for idx in range(n_cts):
                # Add batch and channel axes: (H, W) -> (1, 1, H, W).
                ct = torch.Tensor(volume[:, :, idx]).unsqueeze(0).unsqueeze(0)
                ct = ct.to(device=device, dtype=torch.float32)

                pred = net(ct)  # output shape: (1, Classes, H, W)
                pred = torch.squeeze(pred)  # out shape: (Classes, H, W)

                # Multi-class nets get a softmax over the class axis,
                # single-class nets a sigmoid.
                if net.n_classes > 1:
                    pred = F.softmax(pred, dim=0)
                else:
                    pred = torch.sigmoid(pred)

                pred_volume[:, :, :, idx] = pred

                pbar.update()

        if threshold:
            pred_volume = pred_volume > p_threshold

    return pred_volume.numpy().astype(float)
def main():
    """Save the point cloud of every annotation of every listed scene.

    For each scene id a ``realsense/feature`` directory is created under the
    scene folder if it does not exist. The scene's annotations are read from
    ``TRAIN_DATASET`` at index ``idx % 100``, and each one is written to a
    zero-padded ``.npy`` file.
    """
    for idx in tqdm(scene_list):
        scene_name = f'scene_{str(idx).zfill(4)}'
        dir_name = osp.join(root_path, scene_name, 'realsense', 'feature')
        if not osp.exists(dir_name):
            os.makedirs(dir_name)

        one_scene_data = TRAIN_DATASET.__getitem__(idx % 100)
        for view_id, ann_data in enumerate(one_scene_data):
            # Each view is given a block of 16 consecutive file ids.
            file_id = view_id * 16 + ann_data['annid_offset']
            target_path = osp.join(dir_name, f'{str(file_id).zfill(4)}.npy')
            np.save(target_path, ann_data['point_clouds'])
Exemple #20
0
def play_game_of_life_again(tile_rules: Dict[int, List[complex]],
                            days: int,
                            compare: Dict[int, int] = {},
                            debug=False):
    """Advance a ``Floor`` by ``days`` steps and return the final floor.

    Args:
        tile_rules: Passed to ``Floor`` to build the initial state.
        days: Number of steps to simulate.
        compare: Optional mapping from day number to the expected black-tile
            count for that day. A mismatch raises ``AssertionError``. The
            default dict is shared between calls but only ever read.
        debug: Print the black-tile count after every day.
    """
    floor = Floor(tile_rules)

    for day in tqdm(range(1, days + 1)):
        floor = floor.step()
        blacks = count_blacks(floor)

        if debug:
            print(f"Day {day}: {blacks}")

        if day not in compare.keys():
            continue
        assert compare[day] == blacks, f"{compare[day]} != {blacks}"

    return floor
Exemple #21
0
def main():
    """Build ``yoohoo.csv`` from ``kp250raw.csv`` plus per-movie reviews.

    For every row of the input file the movie id is extracted from its logo
    URL, and director, screenwriter and actors are joined into one crew
    string. Semicolons in crew and plot are replaced by commas, and the
    reviews are fetched with ``load_reviews``. Output rows are numbered
    from 1.
    """
    df = pd.DataFrame(columns=[
        "ID", "Title", "Year", "Crew", "Plot", "Rating", "Country", "Reviews"
    ])

    kp250 = pd.read_csv("kp250raw.csv")
    # iterrows() is a generator, so the total is passed explicitly to let the
    # progress bar show completion.
    for i, mov in tqdm(kp250.iterrows(), total=len(kp250)):
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        # NOTE(review): the "." before "jpg" is unescaped and matches any
        # character; left as is to keep matching behaviour unchanged.
        kp_id = re.findall(r"_(\d+).jpg", mov["url_logo"])[0]
        df.loc[i + 1] = [
            kp_id, mov["movie"], mov["year"],
            ", ".join([mov["director"], mov["screenwriter"],
                       mov["actors"]]).replace(";", ","),
            mov["overview"].replace(";", ","), f'{mov["rating_ball"]:.2f}',
            mov["country"],
            load_reviews(kp_id)
        ]

    df.to_csv("yoohoo.csv")
 def run(self) -> None:
     """Extract the plain page text from every queued document's HTML.

     Batches are pulled from ``self.provider`` until an empty batch is
     returned. Each document's HTML is parsed with BeautifulSoup, and the
     text produced by ``PagesParser.get_pure_page_text`` is stored as
     ``page_text`` together with a version marker.
     """
     page_parser = PagesParser()
     while True:
         batch = self.provider.get_records()
         if len(batch) == 0:
             break
         bulk_op = websites_db.initialize_unordered_bulk_op()
         for record in tqdm(batch, desc="thread", leave=False):
             markup = record["html"]
             record_id = record["_id"]
             soup = BeautifulSoup(markup, "html.parser")
             new_fields = {
                 "page_text": page_parser.get_pure_page_text(soup),
                 "text_generation_version": 2
             }
             bulk_op.find({"_id": record_id}).update_one({"$set": new_fields})
         bulk_op.execute()
Exemple #23
0
    def train(self):
        """Train ``self.model`` for ``self.num_of_epoch`` epochs.

        After every epoch the model is evaluated on the ``val`` and ``train``
        loaders. Every fifth epoch, and after the last one, the mean-loss
        curve is saved as a PNG and the model weights are written to disk.
        """
        losses = []
        self.__load_to_device()
        for epoch in range(1, self.num_of_epoch + 1):
            losses_epoch = []
            for data, label, _ in tqdm(self.loader_data["train"]):
                data = data.float().to(self.device)
                data.requires_grad = False
                label = label.long().to(self.device)
                label.requires_grad = False

                # forward
                output_batch = self.model(data)
                loss_batch = self.loss(output_batch, label)

                # backward
                self.optimizer.zero_grad()
                loss_batch.backward()
                self.optimizer.step()
                # Store a plain float: keeping the tensor would keep every
                # batch's autograd graph alive until the end of the epoch.
                losses_epoch.append(loss_batch.item())
            # evaluate every epoch
            self.evaluate(
                epoch,
                save_score=True,
                loader_name=["val", "train"],
                fail_case_file="output_train_not_sm_thucth_xsub/result_fail.txt",
                pass_case_file="output_train_not_sm_thucth_xsub/result_pass.txt"
            )

            # draw loss chart every 5-epoch
            losses.append(
                torch.tensor(losses_epoch, dtype=torch.float).mean().item())
            if epoch % 5 == 0 or epoch == self.num_of_epoch:
                plt.plot(losses)
                plt.xlabel('epoch')
                plt.ylabel('loss')
                plt.savefig(
                    "output_train_not_sm_thucth_xsub/loss/losses{}.png".format(epoch))
                torch.save(self.model.state_dict(),
                           "output_train_not_sm_thucth_xsub/model.pt")
 def run(self) -> None:
     """Summarise every queued document's page text with an XLNet model.

     Batches are pulled from ``self.provider`` until an empty batch is
     returned. Text from the first "↑" character onwards is dropped before
     summarising, unless that character is the very first one. The summary
     is stored as ``xl_summary``.
     """
     summarizer = TransformerSummarizer(transformer_type="XLNet",
                                        transformer_model_key="xlnet-base-cased")
     while True:
         batch = self.provider.get_records()
         if len(batch) == 0:
             break
         bulk_op = websites_db.initialize_unordered_bulk_op()
         for record in tqdm(batch, desc="thread", leave=False):
             text = record["page_text"]
             marker_at = text.find("↑")
             # Strictly positive: a marker at index 0 would leave no text.
             if marker_at > 0:
                 text = text[:marker_at]
             record_id = record["_id"]
             summary_parts = summarizer(text, min_length=60, max_length=120)
             xl_summary = "".join(summary_parts)
             bulk_op.find({"_id": record_id}).update_one(
                 {"$set": {"xl_summary": xl_summary}})
         bulk_op.execute()
Exemple #25
0
def __download_sources():
    """Bring the local crawler sources in line with the latest index.

    Crawlers missing from the latest index are removed from the current
    index. Every crawler in the latest index is recorded in the current
    index and is downloaded when it has a newer version or when its file
    exists in neither the user nor the local data directory. Downloads run
    on the shared executor; a failed download is logged and does not stop
    the others.
    """
    # Collect first, delete afterwards: the dict must not change size while
    # it is being iterated.
    tbd_sids = [
        sid for sid in __current_index['crawlers']
        if sid not in __latest_index['crawlers']
    ]
    for sid in tbd_sids:
        del __current_index['crawlers'][sid]

    futures: Dict[str, Future] = {}
    for sid, latest in __latest_index['crawlers'].items():
        current = __current_index['crawlers'].get(sid)
        has_new_version = not current or current['version'] < latest['version']
        __current_index['crawlers'][sid] = latest
        user_file = (__user_data_path / str(latest['file_path'])).is_file()
        local_file = (__local_data_path / str(latest['file_path'])).is_file()
        if has_new_version or not (user_file or local_file):
            future = __executor.submit(__download_data, latest['url'])
            futures[sid] = future

    if not futures:
        return

    bar = tqdm(desc='Updating sources', total=len(futures), unit='file')
    if os.getenv('debug_mode') == 'yes':
        bar.update = lambda n=1: None  # Hide in debug mode
    bar.clear()

    for sid, future in futures.items():
        try:
            data = future.result()
            __save_source_data(sid, data)
        except Exception as e:
            # Best effort: one failed source must not abort the rest.
            # `warning` replaces the deprecated `warn` alias.
            logger.warning('Failed to download source file. Error: %s', e)
        finally:
            bar.update()

    bar.clear()
    bar.close()
Exemple #26
0
def parse(dir_path: str) -> PreDataList:
  """Collect one ``PreData`` item per entry found below ``dir_path``.

  For every folder returned by ``get_subfolders`` the file ``OATA_CSV_NAME``
  is loaded, and each of its entries is converted into a ``PreData`` item.
  The items are returned sorted by (speaker, subfolder, entry id).

  Args:
    dir_path: directory to scan.

  Returns:
    A ``PreDataList`` holding the sorted items.

  Raises:
    FileNotFoundError: if ``dir_path`` does not exist.
  """
  if not os.path.exists(dir_path):
    # The console message is kept for anyone relying on it; the exception now
    # carries the same information instead of being empty.
    print("Directory not found:", dir_path)
    raise FileNotFoundError(f"Directory not found: {dir_path}")

  lang = Language.ENG
  keyed_items: List[Tuple[Tuple, PreData]] = []

  for subfolder in tqdm(get_subfolders(dir_path)):
    data_path = os.path.join(subfolder, OATA_CSV_NAME)
    entries = cast_as(Entries.load(Entry, data_path), Entries)
    for entry in entries.items():
      # Every gender value other than "m" is mapped to FEMALE.
      gender = Gender.MALE if entry.gender == "m" else Gender.FEMALE

      symbols = text_to_symbols(entry.text, lang)
      wav_path = os.path.join(subfolder, AUDIO_FOLDER_NAME, entry.wav)
      data = PreData(
        name=entry.entry_id,
        speaker_name=entry.speaker,
        lang=lang,
        wav_path=wav_path,
        gender=gender,
        text=entry.text,
        symbols=symbols,
        # The entry's single accent is repeated once per symbol.
        accents=[entry.accent] * len(symbols),
      )
      sorting_keys = entry.speaker, subfolder, entry.entry_id
      keyed_items.append((sorting_keys, data))

  # Sort on the key tuple only, so the PreData items are never compared.
  keyed_items.sort(key=lambda pair: pair[0])

  return PreDataList([item for _, item in keyed_items])
Exemple #27
0

if __name__ == "__main__":

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data, vocab, max_seq = get_data(max_length=300)
    seq, token_to_id, id_to_token = process_data(data, vocab, max_seq)
    seq = torch.from_numpy(seq).to(device)

    # Targets are the inputs shifted left by one position, with a column of
    # zeros appended at the end.
    x = seq
    # The zero column is allocated directly on `device`: hstack cannot
    # combine a CPU tensor with a CUDA tensor.
    y = torch.hstack((x[:, 1:],
                      torch.zeros(x.shape[0], 1, dtype=torch.int32,
                                  device=device))).to(device)

    # One mask per sequence, built from the <PAD> token id.
    mask = []
    for i, s in enumerate(tqdm(x, desc="Creating masks")):
        mask.append(create_mask(s, token_to_id["<PAD>"]))

    mask = torch.from_numpy(np.array(mask)).to(device)

    # Candidate hyper-parameter values.
    embedding_sizes = [32, 64, 128, 512]
    heads = [1, 2, 4, 8]
    no_stacked_layers = [3, 4, 5, 6]

    # NOTE(review): neither file handle is closed in the visible part of the
    # script - confirm they are closed further down.
    metrics = open('metrics.csv', 'w')
    metrics.write(
        "EMBEDDING_SIZE, HEADS, NUMBER OF LAYERS, EPOCH, TRAIN_LOSS, TRAIN_PERP, TEST_LOSS, TEST_PERP\n"
    )
    generations = open("generations.csv", "w")
    generations.write(
        "EMBEDDING_SIZE, HEADS, NUMBER_OF_LAYERS, EPOCH, AVG_SIM, SAMPLE\n")
Exemple #28
0
    def __init__(self, data_path, num_seq, subject_list: List, label_dim=0):
        """Load the selected subject files into ``self.data`` / ``self.labels``.

        Each trial is cut into windows of ``SEED_SAMPLING_RATE`` points. With
        ``num_seq == 0`` every window becomes one sample; otherwise
        ``num_seq`` consecutive windows are grouped into one sample and any
        leftover windows at the end of a trial are discarded.

        Args:
            data_path: directory with the subject files; it must contain
                exactly ``SEED_NUM_SUBJECT`` entries.
            num_seq: number of windows per sample, or 0 for single windows.
            subject_list: indices into the sorted directory listing that
                select the subjects to load.
            label_dim: accepted but not used by this constructor.
        """
        file_names = sorted(os.listdir(data_path))
        assert len(file_names) == SEED_NUM_SUBJECT
        selected = [file_names[idx] for idx in subject_list]

        data_per_subject = []
        labels_per_subject = []
        for file_name in tqdm(selected):
            mat = sio.loadmat(os.path.join(data_path, file_name))
            # Every key that does not start with '__' is treated as a trial.
            trial_keys = [k for k in mat.keys() if not k.startswith('__')]
            assert len(trial_keys) == len(SEED_LABELS)

            trials = []
            trial_labels = []
            for trial_idx, key in enumerate(trial_keys):
                # Drop the last, redundant point of the recording.
                # Assumes the array is (channel, time) - TODO confirm.
                samples = mat[key][:, :-1]
                assert samples.shape[1] % SEED_SAMPLING_RATE == 0

                num_windows = samples.shape[1] // SEED_SAMPLING_RATE
                windows = samples.reshape(samples.shape[0], num_windows,
                                          SEED_SAMPLING_RATE)
                # -> (num_windows, channel, time_len)
                windows = windows.swapaxes(0, 1)

                if num_seq == 0:
                    windows = np.expand_dims(windows, axis=1)
                else:
                    # Keep only as many windows as fill whole sequences.
                    usable = windows.shape[0] // num_seq * num_seq
                    windows = windows[:usable]
                    windows = windows.reshape(usable // num_seq, num_seq,
                                              *windows.shape[1:])

                # One label per (sample, position in sequence).
                label_block = np.full(shape=windows.shape[:2],
                                      fill_value=SEED_LABELS[trial_idx])

                # Final shape: (num_sample, num_seq, channel, time_len)
                trials.append(windows)
                trial_labels.append(label_block)

            data_per_subject.append(np.concatenate(trials, axis=0))
            labels_per_subject.append(np.concatenate(trial_labels, axis=0))

        stacked_data = np.concatenate(data_per_subject, axis=0)
        stacked_labels = np.concatenate(labels_per_subject, axis=0)

        if num_seq == 0:
            # Only the data is squeezed; the labels keep their shape.
            stacked_data = np.squeeze(stacked_data)

        print(stacked_data.shape)
        print(stacked_labels.shape)

        self.data = stacked_data
        self.labels = stacked_labels
Exemple #29
0
    def __init__(self, data_path, num_seq, subject_list: List, label_dim=0):
        """Load the selected subject files into ``self.data`` / ``self.labels``.

        Every trial in ``joined_data`` is cut into windows of
        ``AMIGOS_SAMPLING_RATE`` points, standardized, and grouped into
        samples of ``num_seq`` windows. Trials that contain NaN, are too short
        to fill one sample, or have an empty label are dropped with a warning.
        A subject left without any trial is skipped with a warning.

        Args:
            data_path: directory with the subject files; it must contain
                exactly ``AMIGOS_NUM_SUBJECT`` entries.
            num_seq: number of windows per sample. Must be non-zero, because
                it is used as a divisor.
            subject_list: indices into the sorted directory listing that
                select the subjects to load.
            label_dim: stored unchanged on ``self.label_dim``.

        Raises:
            ValueError: from ``np.concatenate`` when no trial of any selected
                subject survives.
        """
        self.label_dim = label_dim

        files = sorted(os.listdir(data_path))
        assert len(files) == AMIGOS_NUM_SUBJECT
        files = [files[i] for i in subject_list]

        all_data = []
        all_labels = []
        for a_file in tqdm(files):
            data = sio.loadmat(os.path.join(data_path, a_file))

            subject_data = []
            subject_label = []
            for i in range(data['joined_data'].shape[1]):
                trial_data = data['joined_data'][0, i]
                trial_label = data['labels_selfassessment'][0, i]
                # Discard trailing points that do not fill a whole window.
                trial_data = trial_data[:trial_data.shape[0] //
                                        AMIGOS_SAMPLING_RATE *
                                        AMIGOS_SAMPLING_RATE]
                trial_data = trial_data.reshape(
                    trial_data.shape[0] // AMIGOS_SAMPLING_RATE,
                    AMIGOS_SAMPLING_RATE, trial_data.shape[-1])
                # -> (num_windows, last axis of the raw trial, time_len)
                trial_data = np.swapaxes(trial_data, 1, 2)

                if np.isnan(trial_data).any():
                    warnings.warn(
                        f"The array of {a_file} - {i} contains {np.sum(np.isnan(trial_data))} NaN of total {np.prod(trial_data.shape)} points, dropped."
                    )
                    continue

                # Discard trailing windows that do not fill a whole sample.
                if trial_data.shape[0] % num_seq != 0:
                    trial_data = trial_data[:trial_data.shape[0] // num_seq *
                                            num_seq]

                # Standardize with the mean/std taken over the window axis.
                # NOTE(review): a zero std yields inf/NaN here - confirm
                # that constant signals cannot occur.
                mean_value = np.expand_dims(trial_data.mean(axis=0), axis=0)
                std_value = np.expand_dims(trial_data.std(axis=0), axis=0)
                trial_data = (trial_data - mean_value) / std_value

                trial_data = trial_data.reshape(trial_data.shape[0] // num_seq,
                                                num_seq, *trial_data.shape[1:])

                if 0 in trial_data.shape:
                    warnings.warn(
                        f"The array of shape {data['joined_data'][0, i].shape} is too small, dropped."
                    )
                    continue

                # Repeat the label so that there is one copy per
                # (sample, position in sequence).
                trial_label = np.repeat(trial_label,
                                        trial_data.shape[1],
                                        axis=0)
                trial_label = np.repeat(np.expand_dims(trial_label, axis=0),
                                        trial_data.shape[0],
                                        axis=0)

                if 0 in trial_label.shape:
                    warnings.warn(
                        f"The label of {a_file} - {i} is malfunctioned, dropped."
                    )
                    continue

                subject_data.append(trial_data)
                subject_label.append(trial_label)

            if not subject_data:
                # np.concatenate rejects an empty list; without this guard a
                # single subject with no usable trial aborts the whole load.
                warnings.warn(
                    f"All trials of {a_file} were dropped, subject skipped.")
                continue

            subject_data = np.concatenate(subject_data, axis=0)
            subject_label = np.concatenate(subject_label, axis=0)

            all_data.append(subject_data)
            all_labels.append(subject_label)
        all_data = np.concatenate(all_data, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        print(all_data.shape)
        print(all_labels.shape)

        self.data = all_data
        self.labels = all_labels
Exemple #30
0
    def __init__(self, data_path, num_seq, subject_list: List, label_dim=0):
        """Load the selected subject files into ``self.data`` / ``self.labels``.

        The recording of every trial is cut into windows of
        ``DEAP_SAMPLING_RATE`` points. With ``num_seq == 0`` every window
        becomes one sample; otherwise ``num_seq`` consecutive windows form one
        sample and leftover windows at the end of a trial are discarded. The
        trial labels are repeated so that each sample position has a copy.

        Args:
            data_path: directory with the subject files; it must contain
                exactly ``DEAP_NUM_SUBJECT`` entries.
            num_seq: number of windows per sample, or 0 for single windows.
            subject_list: indices into the sorted directory listing that
                select the subjects to load.
            label_dim: stored unchanged on ``self.label_dim``.
        """
        self.label_dim = label_dim

        file_names = sorted(os.listdir(data_path))
        assert len(file_names) == DEAP_NUM_SUBJECT
        selected = [file_names[idx] for idx in subject_list]

        data_per_subject = []
        labels_per_subject = []
        for file_name in tqdm(selected):
            mat = sio.loadmat(os.path.join(data_path, file_name))
            eeg = mat['data']  # trial x channel x data
            # trial x label (valence, arousal, dominance, liking)
            labels = mat['labels']

            num_windows = eeg.shape[-1] // DEAP_SAMPLING_RATE
            # (trial, channel, num_sec, time_len)
            eeg = eeg.reshape(*eeg.shape[:2], num_windows, DEAP_SAMPLING_RATE)
            # (trial, num_sec, channel, time_len)
            eeg = eeg.swapaxes(1, 2)

            if num_seq == 0:
                eeg = np.expand_dims(eeg, axis=2)
            else:
                # Keep only as many windows as fill whole sequences.
                usable = eeg.shape[1] // num_seq * num_seq
                eeg = eeg[:, :usable]
                eeg = eeg.reshape(eeg.shape[0], usable // num_seq, num_seq,
                                  *eeg.shape[-2:])

            # Copy each trial's labels along the two new sample axes.
            labels = np.expand_dims(labels, axis=1)
            labels = np.repeat(labels, eeg.shape[1], axis=1)
            labels = np.expand_dims(labels, axis=2)
            labels = np.repeat(labels, eeg.shape[2], axis=2)

            # Merge the first two axes so samples of all trials line up.
            eeg = eeg.reshape(eeg.shape[0] * eeg.shape[1], *eeg.shape[2:])
            labels = labels.reshape(labels.shape[0] * labels.shape[1],
                                    *labels.shape[2:])

            data_per_subject.append(eeg)
            labels_per_subject.append(labels)

        all_data = np.concatenate(data_per_subject, axis=0)
        all_labels = np.concatenate(labels_per_subject, axis=0)

        if num_seq == 0:
            # Only the data is squeezed; the labels keep their shape.
            all_data = np.squeeze(all_data)

        print(all_data.shape)
        print(all_labels.shape)

        self.data = all_data
        self.labels = all_labels