def run_all_in_one_experiment(query_matcher, workers=1):
    """Run ``save_all_in_one`` once per perplexity value of a t-SNE sweep.

    One parameter tuple is built for each of 100 perplexities evenly spaced
    in [1, 101]; the iteration count and learning rate are drawn from
    single-element lists. The last element of every tuple is the output path
    under ``trash/tsne_full_5_top15_fine_grained/``.

    Args:
        query_matcher: Passed through as the first element of every tuple;
            it must be hashable because the tuples are de-duplicated.
        workers: ``1`` runs sequentially and skips tuples whose output file
            already exists; ``> 1`` fans the work out to a process pool.
            Values below 1 do nothing.
    """
    perplexity_range = np.linspace(1, 101, 100)
    iterations_range = [int(1e6)]
    learning_rates_range = [1]

    combinations = []
    for idx in perplexity_range:
        pr = idx
        ir = np.random.choice(iterations_range)
        lrr = np.random.choice(learning_rates_range)
        identifier = constuct_identifier("tsne", pr, ir, lrr)
        file_name = f"trash/tsne_full_5_top15_fine_grained/{identifier}.png"
        combinations.append((query_matcher, idx, pr, ir, lrr, False, 15,
                             False, False, False, file_name))

    # De-duplicate while keeping insertion order, so the run order is
    # deterministic and the progress-bar total matches what is iterated.
    unique_combinations = list(dict.fromkeys(combinations))

    if workers == 1:
        for params in tqdm(unique_combinations, total=len(unique_combinations)):
            if os.path.exists(params[-1]):
                continue
            save_all_in_one(params)

    if workers > 1:
        # The context manager guarantees the worker processes are torn down,
        # even when a task raises.
        with mp.Pool(workers) as pool:
            results = pool.imap(save_all_in_one, unique_combinations)
            # Wrap the *results* so the bar advances on completion rather
            # than on task submission.
            for _ in tqdm(results, total=len(unique_combinations)):
                pass
def pretrain(model, dataset, device, run_id, args):
    """Pre-train ``model`` on ``dataset`` with ``SoftLogitLoss``.

    Each batch ``x`` is split along dimension 1 and the model is called as
    ``model(x[:, 0], x[:, 1])``; the output is scored against ``y``.

    Args:
        model: Module to optimise; left in training mode.
        dataset: Dataset yielding ``(x, y)`` pairs.
        device: CUDA device handed to ``.cuda(...)``.
        run_id: Accepted for interface compatibility; not used here.
        args: Namespace providing ``optimizer``, ``lr``, ``wd``, ``momentum``
            (SGD only), ``batch_size``, ``num_workers`` and
            ``pretrain_epochs``.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              weight_decay=args.wd, momentum=args.momentum)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.wd, betas=(0.9, 0.98),
                               eps=1e-09, amsgrad=True)
    else:
        raise ValueError('Invalid optimizer!')

    criterion = SoftLogitLoss().cuda(device)
    data_loader = DataLoader(dataset, batch_size=args.batch_size,
                             num_workers=args.num_workers, shuffle=True,
                             pin_memory=True, drop_last=True)

    model.train()
    for epoch in range(args.pretrain_epochs):
        losses = []
        # Learning-rate scheduling is currently disabled:
        # adjust_learning_rate(optimizer, args.lr, epoch, args.pretrain_epochs, args)
        with tqdm(data_loader,
                  desc=f'EPOCH [{epoch + 1}/{args.pretrain_epochs}]') as progress_bar:
            for x, y in progress_bar:
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)

                out = model(x[:, 0], x[:, 1])
                loss = criterion(out, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())
                # No accuracy is computed in this loop, so only the running
                # loss is reported. Averaging an always-empty accuracy list
                # produced `nan` and a RuntimeWarning on every batch.
                progress_bar.set_postfix({'Loss': np.mean(losses)})
def main():
    """Step through the BIWI / 300W-LP samples and display their pose axes.

    For every sample the original image is converted to BGR, the pose axes
    are drawn at the image centre, the three angle values are printed in the
    top-left corner and the frame is shown until a key is pressed. Pressing
    ``q`` stops the viewer.
    """
    data_dir = "/media/omnisky/D4T/huli/work/headpose/data"
    file_name = "/media/omnisky/D4T/huli/work/headpose/data/file_name_biwi_300w_lp_no_mask20210212.txt"
    data = BIWI_Pose_300W_LP(data_dir, file_name, transform=None)

    try:
        for i in tqdm(range(len(data))):
            # Only the continuous labels and the untransformed image are
            # needed; the dataset still has to return a 6-tuple.
            _, _, val_const_labels, _, orial_img, _ = data[i]

            img = cv2.cvtColor(np.array(orial_img), cv2.COLOR_RGB2BGR)
            # img.shape is (rows, cols, channels): the x origin must come
            # from the column count and the y origin from the row count.
            # NOTE(review): assumes utils.draw_axis takes tdx/tdy as OpenCV
            # (x, y) pixel coordinates -- confirm against utils.
            height, width = img.shape[:2]
            utils.draw_axis(img, val_const_labels[0], val_const_labels[1],
                            val_const_labels[2], tdx=width // 2,
                            tdy=height // 2, size=50)

            captions = (
                "yan:{:.3f}".format(val_const_labels[0]),
                "pitch:{:.3f}".format(val_const_labels[1]),
                "roll:{:.3f}".format(val_const_labels[2]),
            )
            for row, caption in enumerate(captions, start=1):
                cv2.putText(img, caption, (0, 20 * row),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)

            cv2.imshow("qew", img)
            if cv2.waitKey(0) == ord('q'):
                break
    finally:
        # Release the HighGUI window on normal exit, on 'q' and on errors.
        cv2.destroyAllWindows()
def run(self) -> None:
    """Extract keywords for batches of documents until the provider is empty.

    For every document three texts are passed to KeyBERT in one call -- the
    stored summary, the flattened page text and the joined processed tokens
    -- and the resulting keyword lists are written back with an unordered
    bulk update. Documents whose extraction raises are reported on stdout
    and skipped.
    """
    keywords_model = KeyBERT("xlm-r-distilroberta-base-paraphrase-v1")
    stop_words = stopwords.words("english")

    while True:
        urls = self.provider.get_records()
        if len(urls) == 0:
            break

        bulk = websites_db.initialize_unordered_bulk_op()
        queued = 0
        for document in tqdm(urls, desc="thread", leave=False):
            page_text = document["page_text"].replace("\n", " ").strip()
            summary = document["xl_summary"]
            processed_text = " ".join(document["processed_text"])
            doc_id = document["_id"]

            try:
                summary_keywords, text_keywords, processed_keywords = keywords_model.extract_keywords(
                    [summary, page_text, processed_text],
                    keyphrase_ngram_range=(2, 2),
                    stop_words=stop_words)
            except Exception as ex:
                # Best effort: one bad document must not abort the batch.
                print(ex)
                continue

            bulk.find({
                "_id": doc_id
            }).update_one({
                "$set": {
                    "summary_keywords": summary_keywords,
                    "text_keywords": text_keywords,
                    "processed_keywords": processed_keywords
                }
            })
            queued += 1

        # Executing a bulk with no queued operations raises, which would
        # kill the worker when every document in the batch was skipped.
        if queued:
            bulk.execute()
def run(self) -> None:
    """Encode each document's tokens as the mean of their FastText vectors.

    Batches are pulled from the provider until it returns none. The mean
    vector is stored as a plain list of floats under
    ``encoded_processed_text`` together with
    ``encoded_processed_text_version = 1``. Documents that fail are reported
    on stdout and skipped.
    """
    wiki_ft_model = FastText.load("./data/fasttext_300.model")

    while True:
        urls = self.provider.get_records()
        if len(urls) == 0:
            break

        bulk = websites_db.initialize_unordered_bulk_op()
        queued = 0
        for document in tqdm(urls, desc="thread", leave=False):
            # Bound before the try so the error report below can never hit
            # an unbound (or stale, previous-iteration) name.
            processed_text = None
            try:
                processed_text = document["processed_text"]
                doc_id = document["_id"]
                # Averaging over axis 0 yields a 1-D vector even for a
                # single token (assuming wv[token] is a 1-D vector), so no
                # special case is needed. The former `len == 1` wrapping
                # turned the vector into a list holding one array, and
                # float() then failed for every single-token document.
                mean_vector = np.mean(
                    [wiki_ft_model.wv[token] for token in processed_text],
                    axis=0)
                encoded = [float(x) for x in mean_vector]
                bulk.find({
                    "_id": doc_id
                }).update_one({
                    "$set": {
                        "encoded_processed_text": encoded,
                        "encoded_processed_text_version": 1
                    }
                })
                queued += 1
            except Exception as ex:
                # Best effort: report and move on to the next document.
                print(ex, processed_text)

        # An empty bulk raises on execute(); skip it when nothing was queued.
        if queued:
            bulk.execute()
def show_valid():
    """Step through the AFLW2000 validation samples and display their pose axes.

    Each image is converted to BGR, annotated with the pose axes at the image
    centre and with the three continuous angle values, then shown until a key
    is pressed. Pressing ``q`` stops the viewer.
    """
    val_data_dir = "data/AFLW2000/"
    val_filename_list = "data/AFLW2000/filename_list.txt"
    valid_pose_dataset = AFLW2000(val_data_dir, val_filename_list, None)

    try:
        for i in tqdm(range(len(valid_pose_dataset))):
            img, _, cont_labels, _ = valid_pose_dataset[i]
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

            # img.shape is (rows, cols, channels): the x origin must come
            # from the column count and the y origin from the row count.
            # NOTE(review): assumes utils.draw_axis takes tdx/tdy as OpenCV
            # (x, y) pixel coordinates -- confirm against utils.
            height, width = img.shape[:2]
            utils.draw_axis(img, cont_labels[0], cont_labels[1],
                            cont_labels[2], tdx=width // 2, tdy=height // 2,
                            size=50)

            captions = (
                "yan:{:.3f}".format(cont_labels[0]),
                "pitch:{:.3f}".format(cont_labels[1]),
                "roll:{:.3f}".format(cont_labels[2]),
            )
            for row, caption in enumerate(captions, start=1):
                cv2.putText(img, caption, (0, 20 * row),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)

            cv2.imshow("qew", img)
            if cv2.waitKey(0) == ord('q'):
                break
    finally:
        # Release the HighGUI window on normal exit, on 'q' and on errors.
        cv2.destroyAllWindows()
def evaluate(classifier, dataset, device, args):
    """Collect classifier scores and targets over ``dataset``.

    The classifier is put in eval mode and run without gradients on the
    ``(x1, x2)`` pair of every batch. Because the loader uses
    ``drop_last=True``, a trailing partial batch is not evaluated; that is
    what lets every output be viewed as ``batch_size * num_seq`` rows.

    Args:
        classifier: Callable module invoked as ``classifier(x1, x2)``.
        dataset: Dataset yielding 4-tuples ``(x1, y, x2, _)``.
        device: CUDA device for the inputs.
        args: Namespace with ``batch_size``, ``num_workers`` and ``num_seq``.

    Returns:
        ``(scores, targets)`` as NumPy arrays concatenated along axis 0.
    """
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
    )

    score_chunks, target_chunks = [], []
    classifier.eval()
    with torch.no_grad():
        for first_view, labels, second_view, _ in tqdm(loader, desc='EVAL'):
            first_view = first_view.cuda(device, non_blocking=True)
            second_view = second_view.cuda(device, non_blocking=True)

            logits = classifier(first_view, second_view)
            flat_logits = logits.view(args.batch_size * args.num_seq, -1)
            score_chunks.append(flat_logits.cpu().numpy())
            target_chunks.append(labels.view(-1).numpy())

    scores = np.concatenate(score_chunks, axis=0)
    targets = np.concatenate(target_chunks, axis=0)
    return scores, targets
def pretrain(run_id, model, dataset, device, args):
    """Pre-train ``model``, whose forward pass returns the loss directly.

    Args:
        run_id: Accepted for interface compatibility; not used here.
        model: Module called as ``model(x1, x2)`` and expected to return a
            scalar loss; left in training mode.
        dataset: Dataset yielding 4-tuples ``(x1, _, x2, _)``.
        device: CUDA device for the inputs.
        args: Namespace providing ``optimizer``, ``lr``, ``wd``, ``momentum``
            (SGD only), ``use_dist``, ``batch_size``, ``num_workers`` and
            ``pretrain_epochs``.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              weight_decay=args.wd, momentum=args.momentum)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.wd, betas=(0.9, 0.98),
                               eps=1e-09, amsgrad=True)
    else:
        raise ValueError('Invalid optimizer!')

    if args.use_dist:
        # The distributed sampler does the shuffling, so the loader itself
        # must not shuffle.
        sampler = DistributedSampler(dataset, shuffle=True)
        data_loader = DataLoader(dataset, batch_size=args.batch_size,
                                 num_workers=args.num_workers, shuffle=False,
                                 pin_memory=True, drop_last=True,
                                 sampler=sampler)
    else:
        data_loader = DataLoader(dataset, batch_size=args.batch_size,
                                 num_workers=args.num_workers, shuffle=True,
                                 pin_memory=True, drop_last=True)

    model.train()
    for epoch in range(args.pretrain_epochs):
        losses = []
        if args.use_dist:
            # Re-seed the sampler so each epoch sees a different ordering.
            data_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, args.lr, epoch, args.pretrain_epochs, args)

        with tqdm(data_loader,
                  desc=f'EPOCH [{epoch + 1}/{args.pretrain_epochs}]') as progress_bar:
            for x1, _, x2, __ in progress_bar:
                x1 = x1.cuda(device, non_blocking=True)
                x2 = x2.cuda(device, non_blocking=True)

                loss = model(x1, x2)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.item())
                # No accuracy is computed in this loop, so only the running
                # loss is reported. Averaging an always-empty accuracy list
                # produced `nan` and a RuntimeWarning on every batch.
                progress_bar.set_postfix({'Loss': np.mean(losses)})
def metric_F1(k_all_db_result, all_class_counts_list, k=10, use_class_k=True, weightbool=False, weight=1, silent=False):
    """Add per-query precision, recall and F-score columns to a result frame.

    Every row of ``k_all_db_result`` is unpacked as
    ``(name, class_label, ids, distance_values, clabels)``; the three
    sequences are cut to the evaluation depth and handed to
    ``Evaluator.confusion_matrix_vals``, whose first, second and fourth
    values are used as TP, FP and FN.

    Args:
        k_all_db_result: DataFrame of query results; modified in place.
        all_class_counts_list: Mapping from class label to class size.
        k: Evaluation depth used when ``use_class_k`` is false.
        use_class_k: Use the size of the query's class as depth instead of
            ``k``.
        weightbool: Compute the weighted F-beta score with ``beta = weight``
            instead of plain F1.
        weight: Beta for the weighted score.
        silent: Suppress the progress bar.

    Returns:
        The same DataFrame with ``F1score``, ``F1precision`` and
        ``F1recall`` columns set. Undefined ratios (0/0) are reported as 0.
    """
    def prepare_params(name, class_label, ids, distance_values, clabels):
        use_k = all_class_counts_list.get(class_label) if use_class_k else k
        return Evaluator.confusion_matrix_vals(name, class_label, ids[:use_k],
                                               distance_values[:use_k],
                                               clabels[:use_k],
                                               all_class_counts_list)

    rows = k_all_db_result.to_numpy()
    data_generator = rows if silent else tqdm(rows)
    cm_vals = np.array([prepare_params(*params) for params in data_generator])
    if cm_vals.size == 0:
        # Keep the 2-D layout so the column slicing below works for an
        # empty result frame.
        cm_vals = np.empty((0, 4))
    TP, FP, FN = cm_vals[:, 0], cm_vals[:, 1], cm_vals[:, 3]

    # Plain F1 is the beta = 1 case of F-beta, so one formula covers both.
    beta_sq = np.square(weight) if weightbool else 1

    # precision = proportion of returned class from all returned items
    # recall = proportion of returned class from all class members in database
    # All three divisions may hit 0/0, so they share the errstate guard.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = np.nan_to_num(TP / (TP + FP))
        recall = np.nan_to_num(TP / (TP + FN))
        F1scores = np.nan_to_num(
            (1 + beta_sq) * ((precision * recall) / ((beta_sq * precision) + recall)))

    k_all_db_result['F1score'] = F1scores
    k_all_db_result['F1precision'] = precision
    k_all_db_result['F1recall'] = recall
    return k_all_db_result
def process_mit_arrhythmia(data_path):
    """Load every record under ``data_path`` and print a per-record summary.

    Record ids are the names of the ``.dat`` files up to their first dot.
    For each record both signal channels are run through ``ecg.ecg`` and the
    shape of channel 1, the annotation sample positions, the annotation
    symbols and the detected R-peaks of channel 1 are printed.

    Args:
        data_path: Directory holding the WFDB record files.
    """
    record_ids = [name.split('.')[0]
                  for name in os.listdir(data_path)
                  if name.endswith('.dat')]

    for idx in tqdm(record_ids):
        record_path = os.path.join(data_path, idx)
        record = wfdb.rdrecord(record_path)
        annotation = wfdb.rdann(record_path, 'atr')

        signal_ch1 = record.p_signal[:, 0]
        # Second lead lives in column 1; reading column 0 twice silently
        # duplicated channel 1.
        # NOTE(review): assumes every record has at least two channels.
        signal_ch2 = record.p_signal[:, 1]

        ecg_ch1 = ecg.ecg(signal=signal_ch1, sampling_rate=record.fs, show=False)
        ecg_ch2 = ecg.ecg(signal=signal_ch2, sampling_rate=record.fs, show=False)

        # Smooth signals (not consumed yet; kept for the next processing step)
        signal_smoothed_ch1 = ecg_ch1['filtered']
        signal_smoothed_ch2 = ecg_ch2['filtered']

        # Reading r-peaks
        r_peaks = ecg_ch1['rpeaks']

        # Reading annotations. `symbol` and `sample` are labels and values respectively.
        ann_symbol = annotation.symbol
        ann_sample = annotation.sample

        print(signal_ch1.shape, ann_sample, ann_symbol, r_peaks)
def finetune(classifier, dataset, device, args):
    """Fine-tune ``classifier`` on a random subset of ``dataset``.

    ``args.finetune_mode`` selects how parameters whose name contains
    ``'encoder'`` or ``'agg'`` are treated:

    * ``'freeze'``  -- they get ``requires_grad = False`` and are left out
      of the optimiser;
    * ``'smaller'`` -- they are trained with ``args.lr / 10``;
    * anything else -- they are trained like every other parameter.

    A fraction ``args.finetune_ratio`` of the dataset indices is drawn once
    and reused for every epoch through a ``SubsetRandomSampler``.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    mode = args.finetune_mode
    if mode == 'freeze':
        print('[INFO] Finetune classifier only for the last layer...')
    elif mode == 'smaller':
        print('[INFO] Finetune the whole classifier where the backbone have a smaller lr...')
    else:
        print('[INFO] Finetune the whole classifier...')

    param_groups = []
    for param_name, param in classifier.named_parameters():
        in_backbone = 'encoder' in param_name or 'agg' in param_name
        if in_backbone and mode == 'freeze':
            param.requires_grad = False
        elif in_backbone and mode == 'smaller':
            param_groups.append({'params': param, 'lr': args.lr / 10})
        else:
            param_groups.append({'params': param})

    if args.optimizer == 'sgd':
        optimizer = optim.SGD(param_groups, lr=args.lr, weight_decay=args.wd,
                              momentum=args.momentum)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(param_groups, lr=args.lr, weight_decay=args.wd,
                               betas=(0.9, 0.98), eps=1e-09, amsgrad=True)
    else:
        raise ValueError('Invalid optimizer!')

    criterion = nn.CrossEntropyLoss().cuda(device)

    # Draw the fine-tuning subset once; the sampler reshuffles it per epoch.
    subset = np.arange(len(dataset))
    np.random.shuffle(subset)
    subset = subset[:int(len(subset) * args.finetune_ratio)]

    loader = DataLoader(dataset, batch_size=args.batch_size,
                        num_workers=args.num_workers, shuffle=False,
                        pin_memory=True, drop_last=True,
                        sampler=SubsetRandomSampler(subset))

    classifier.train()
    for epoch in range(args.finetune_epochs):
        epoch_losses = []
        epoch_accs = []
        with tqdm(loader, desc=f'EPOCH [{epoch + 1}/{args.finetune_epochs}]') as progress_bar:
            for x, y in progress_bar:
                x = x.cuda(device, non_blocking=True)
                y = y.cuda(device, non_blocking=True)

                # Merge the first two dimensions of the input and flatten the
                # targets to match.
                x = x.view(x.shape[0] * x.shape[1], *x.shape[2:])
                flat_y = y.view(-1)

                out = classifier(x)
                loss = criterion(out, flat_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_losses.append(loss.item())
                epoch_accs.append(logits_accuracy(out, flat_y, topk=(1,))[0])

                progress_bar.set_postfix({'Loss': np.mean(epoch_losses),
                                          'Acc': np.mean(epoch_accs)})
def init(self, desc, total):
    """Create the underlying tqdm bar and reset progress bookkeeping.

    The bar is placed at this object's position and coloured from the
    class colour list at the same index. When a description is given and
    the position is falsy, the description is also sent to the logger's
    ``wait`` method.

    Args:
        desc: Description shown next to the bar.
        total: Expected number of steps.
    """
    position = self.__pos
    bar_options = dict(
        desc=desc,
        leave=False,
        total=total,
        colour=self.__COLOR_LIST[position],
        dynamic_ncols=True,
        position=position,
    )
    self.__tq = tqdm(**bar_options)

    announce = bool(desc) and not position
    if announce:
        self.__logger.wait(desc)

    self.sub_bars = []
    self.prog = 0
def find_loop_number(public_key: int, max_loops=100000000) -> "Optional[int]":
    """Find how many applications of ``loop`` turn 1 into ``public_key``.

    Starting from 1, ``loop(value, SUBJECT_NUMBER)`` is applied repeatedly
    and the first iteration count whose result equals ``public_key`` is
    returned.

    Args:
        public_key: Value to reach.
        max_loops: Largest loop count to try (inclusive).

    Returns:
        The matching loop count, or ``None`` when no count up to
        ``max_loops`` produces ``public_key``.
    """
    start_number = 1
    # `+ 1` so that max_loops itself is tried; the previous upper bound
    # stopped one iteration short.
    for loop_no in tqdm(range(1, max_loops + 1)):
        start_number = loop(start_number, SUBJECT_NUMBER)
        if start_number == public_key:
            return loop_no
    return None
def __init__(self, data_path, num_seq, subject_list: List, label_dim=0, modal='eeg', transform=None):
    """Load the selected subjects' ``.mat`` files into memory.

    Per subject, one modality's channels are selected (``'eeg'``: first 32,
    ``'eog'``: 32..35, ``'emg'``: 36 onward), the signal is cut into windows
    of ``self.sampling_rate`` samples, and consecutive windows are grouped
    into sequences of ``num_seq`` (leftover windows are discarded). With
    ``num_seq == 0`` every window is its own sample and singleton
    dimensions are squeezed out of the data at the end. Labels are repeated
    so that their first two axes match the data.

    Args:
        data_path: Directory with one file per subject; the number of files
            must equal ``self.num_subject``.
        num_seq: Windows per sequence, or 0 for no grouping.
        subject_list: Indices into the sorted file list.
        label_dim: Stored as ``self.label_dim``.
        modal: One of ``'eeg'``, ``'emg'``, ``'eog'``.
        transform: Stored as ``self.transform``.
    """
    self.label_dim = label_dim
    self.transform = transform

    assert modal in ['eeg', 'emg', 'eog']

    files = sorted(os.listdir(data_path))
    assert len(files) == self.num_subject
    files = [files[i] for i in subject_list]

    # Channel ranges occupied by each modality.
    channel_ranges = {
        'eeg': slice(None, 32),
        'eog': slice(32, 36),
        'emg': slice(36, None),
    }

    data_parts = []
    label_parts = []
    for a_file in tqdm(files):
        mat = sio.loadmat(os.path.join(data_path, a_file))
        signals = mat['data']  # trial x channel x data
        labels = mat['labels']  # trial x label (valence, arousal, dominance, liking)
        # signals = tensor_standardize(signals, dim=-1)

        if modal not in channel_ranges:
            raise ValueError
        signals = signals[:, channel_ranges[modal], :]

        rate = self.sampling_rate
        n_trials, n_channels = signals.shape[:2]
        # (trial, channel, num_sec, time_len)
        signals = signals.reshape(n_trials, n_channels,
                                  signals.shape[-1] // rate, rate)
        # (trial, num_sec, channel, time_len)
        signals = np.swapaxes(signals, 1, 2)

        if num_seq == 0:
            signals = np.expand_dims(signals, axis=2)
        else:
            # Drop trailing windows that do not fill a whole sequence.
            n_groups = signals.shape[1] // num_seq
            signals = signals[:, :n_groups * num_seq]
            signals = signals.reshape(signals.shape[0], n_groups, num_seq,
                                      *signals.shape[-2:])

        # Broadcast the per-trial label over the two new leading axes.
        labels = np.expand_dims(labels, axis=1)
        labels = np.repeat(labels, signals.shape[1], axis=1)
        labels = np.expand_dims(labels, axis=2)
        labels = np.repeat(labels, signals.shape[2], axis=2)

        # Merge the first two axes into one sample axis.
        signals = signals.reshape(signals.shape[0] * signals.shape[1],
                                  *signals.shape[2:])
        labels = labels.reshape(labels.shape[0] * labels.shape[1],
                                *labels.shape[2:])

        data_parts.append(signals)
        label_parts.append(labels)

    merged_data = np.concatenate(data_parts, axis=0)
    merged_labels = np.concatenate(label_parts, axis=0)

    if num_seq == 0:
        merged_data = np.squeeze(merged_data)
        # merged_labels = np.squeeze(merged_labels)

    self.data = merged_data
    self.labels = merged_labels
def download(self):
    """Export every ``self.time_step``-th time step as a pickled tensor.

    The time steps come from the second element of
    ``graph_manager.get_time_steps()``, sorted ascending; each selected step
    is passed to ``graph_to_torch_tensor`` together with ``self.attrs_dict``
    and ``self.raw_dir``.
    """
    manager = self.graph_manager
    ordered_steps = sorted(manager.get_time_steps()[1])
    selected_steps = ordered_steps[::self.time_step]
    for step in tqdm(selected_steps):
        manager.graph_to_torch_tensor(step, self.attrs_dict, self.raw_dir,
                                      to_pickle=True)
def main():
    """Partition a network into components with the label-based GA.

    Command line: ``<network> <num components>``. The optimiser runs for 200
    steps while the best cost and the population diversity (fraction of
    distinct encodings) are recorded per step. Afterwards the edges selected
    by the best encoding are removed from a copy of the graph, the two
    curves are plotted and the partitioned network is visualised.
    """
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <network> <num components>')
        return

    M, layout = old_read_network_file(sys.argv[1])
    G = nx.Graph(M)
    rand = np.random.default_rng(0)

    # Alternative objective/optimizer setups kept for reference:
    # objective = PartitioningObjective(G)
    # optimizer = ga.GAOptimizer(objective,
    #                            NextEdgesToRm(rand),
    #                            new_to_rm_pop(len(G.edges), 20, rand),
    #                            True, 1)  # it's like 4x faster with only one core
    # optimizer = ga.GAOptimizer(ChakrabortySatoObjective(G),
    #                            NextChakrabortySatoGen(rand, G),
    #                            new_chakraborty_sato_pop(rand, G, 50),
    #                            True, 5)

    n_comps = int(sys.argv[2])
    n_labels = n_comps
    print(f'Searching for {n_comps} components.')
    optimizer = ga.GAOptimizer(LabelObjective(G, n_comps),
                               NextLabelGen(n_labels, rand),
                               new_label_pop(rand, len(G), 50, n_labels),
                               True, 2)

    n_steps = 200
    costs = np.zeros(n_steps)
    diversities = np.zeros(n_steps)
    global_best: Tuple = None  # type: ignore

    pbar = tqdm(range(n_steps))
    for step in pbar:
        population = optimizer.step()
        step_best = min(population, key=lambda x: x[0])
        if global_best is None or step_best[0] < global_best[0]:
            global_best = step_best

        costs[step] = step_best[0]
        distinct_encodings = {tuple(member[1]) for member in population}
        diversities[step] = len(distinct_encodings) / len(population)
        pbar.set_description('Cost: {:.3f}'.format(step_best[0]))

    # partitioned = objective.partition(global_best[1])
    # partitioned = chakraborty_sato_partition(G, global_best[1])
    to_remove = partitioning.label_partition(G, global_best[1])
    partitioned = nx.Graph(G)
    partitioned.remove_edges_from(to_remove)
    print('Cost:', global_best[0])

    plt.title('Diversity')
    plt.plot(diversities)

    plt.figure()
    plt.title('Cost')
    plt.plot(costs)

    plt.figure()
    visualize_network(partitioned, layout, 'Partitioned via Label GA')
def run(self) -> None:
    """Tokenise page text for batches of documents until none are left.

    Each document's ``page_text`` is run through the text pipeline and the
    resulting token list is stored as ``processed_text`` together with
    ``processed_text_version = 3`` via an unordered bulk update.
    """
    english_stopwords = set(stopwords.words("english"))
    pipeline = Pipeline(stopwords=english_stopwords)

    while True:
        batch = self.provider.get_records()
        if len(batch) == 0:
            return

        bulk = websites_db.initialize_unordered_bulk_op()
        for document in tqdm(batch, desc="thread", leave=False):
            page_text = document["page_text"]
            doc_id = document["_id"]
            tokens = list(pipeline.pipe(page_text))
            update = {"$set": {"processed_text": tokens, "processed_text_version": 3}}
            bulk.find({"_id": doc_id}).update_one(update)
        bulk.execute()
def predict_vol_from_np(net, device, nparray, threshold=True, p_threshold=0.5):
    """
    Takes a volume as an array and predicts a full-volume segmentation on a
    CNN model, one slice along the last axis at a time.

    @params:
        net         : pytorch convnet model exposing `n_classes`.
        device      : pytorch device for computation.
        nparray     : input volume indexed as [H, W, Z].
        threshold   : boolean for whether or not to threshold the output.
        p_threshold : probability above which prediction is considered True.
    @return:
        pred_volume : a float prediction volume w/ shape:
                      [n_classes, H, W, Z]
    """
    net.eval()
    volume = nparray
    vol_shape = volume.shape
    n_cts = vol_shape[-1]
    pred_volume = torch.empty(net.n_classes, vol_shape[0], vol_shape[1],
                              vol_shape[2])

    with tqdm(
            total=n_cts,  # progress bar
            desc='Predicting Volume',
            unit='scans',
            ascii=True,
            leave=False,
            bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}') as pbar, torch.no_grad():
        for idx in range(n_cts):
            # Add batch and channel axes: (H, W) -> (1, 1, H, W)
            ct = torch.Tensor(volume[:, :, idx]).unsqueeze(0).unsqueeze(0)
            ct = ct.to(device=device, dtype=torch.float32)

            pred = net(ct)  # output shape: (1, Classes, H, W)
            # Drop only the batch axis. A bare squeeze() would also remove a
            # class or spatial axis of size 1.
            pred = pred.squeeze(0)  # out shape: (Classes, H, W)

            if net.n_classes > 1:
                pred = F.softmax(pred, dim=0)
            else:
                pred = torch.sigmoid(pred)

            pred_volume[:, :, :, idx] = pred
            pbar.update()

    if threshold:
        pred_volume = pred_volume > p_threshold

    return pred_volume.numpy().astype(float)
def main():
    """Dump the point clouds of every scene in ``scene_list`` to ``.npy``.

    For scene ``idx`` the output directory is
    ``<root_path>/scene_XXXX/realsense/feature`` (created when missing). The
    data comes from ``TRAIN_DATASET`` item ``idx % 100``; every view is
    saved under a zero-padded file id of ``view_id * 16 + annid_offset``.
    """
    for scene_idx in tqdm(scene_list):
        scene_name = 'scene_{}'.format(str(scene_idx).zfill(4))
        feature_dir = osp.join(root_path, scene_name, 'realsense', 'feature')
        if not osp.exists(feature_dir):
            os.makedirs(feature_dir)

        dataset_idx = scene_idx % 100
        scene_views = TRAIN_DATASET[dataset_idx]
        for view_id, view in enumerate(scene_views):
            file_id = view_id * 16 + view['annid_offset']
            target_path = osp.join(feature_dir, f'{str(file_id).zfill(4)}.npy')
            # os.remove(target_path)
            np.save(target_path, view['point_clouds'])
def play_game_of_life_again(tile_rules: Dict[int, List[complex]], days: int, compare: Dict[int, int] = None, debug=False):
    """Advance the tile floor by ``days`` steps and return the final floor.

    Args:
        tile_rules: Passed to ``Floor`` to build the initial floor.
        days: Number of steps to simulate.
        compare: Optional mapping ``day -> expected black-tile count``; each
            listed day is checked after its step. ``None`` (the default)
            disables checking.
        debug: Print the black-tile count after every day.

    Returns:
        The floor after the last step.

    Raises:
        AssertionError: If a day listed in ``compare`` has a different count.
    """
    # `None` instead of a shared `{}` default avoids the mutable-default trap.
    expected = {} if compare is None else compare

    floor = Floor(tile_rules)
    for day in tqdm(range(1, days + 1)):
        floor = floor.step()

        check_day = day in expected
        if not (debug or check_day):
            # Nobody needs the count today; skip counting the tiles.
            continue

        blacks = count_blacks(floor)
        if debug:
            print(f"Day {day}: {blacks}")
        if check_day:
            assert expected[day] == blacks, f"{expected[day]} != {blacks}"
    return floor
def main():
    """Convert ``kp250raw.csv`` into ``yoohoo.csv`` with reviews attached.

    For every input row the numeric id is extracted from ``url_logo``, crew
    fields are merged into one comma-separated string, semicolons are
    replaced by commas in the crew and plot fields, the rating is formatted
    with two decimals and the reviews are fetched with ``load_reviews``.
    Output rows are indexed from the input index plus one.
    """
    columns = [
        "ID", "Title", "Year", "Crew", "Plot", "Rating", "Country", "Reviews"
    ]
    kp250 = pd.read_csv("kp250raw.csv")

    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    # The dot is escaped so that only a literal ".jpg" suffix matches.
    logo_id_pattern = re.compile(r"_(\d+)\.jpg")

    index = []
    rows = []
    for i, mov in tqdm(kp250.iterrows(), total=len(kp250)):
        kp_id = logo_id_pattern.findall(mov["url_logo"])[0]
        crew = ", ".join(
            [mov["director"], mov["screenwriter"], mov["actors"]])
        index.append(i + 1)
        rows.append([
            kp_id,
            mov["movie"],
            mov["year"],
            crew.replace(";", ","),
            mov["overview"].replace(";", ","),
            f'{mov["rating_ball"]:.2f}',
            mov["country"],
            load_reviews(kp_id),
        ])

    # Build the frame once; growing it row by row with `.loc` re-allocates
    # on every insertion.
    df = pd.DataFrame(rows, columns=columns, index=index)
    df.to_csv("yoohoo.csv")
def run(self) -> None:
    """Extract plain page text from stored HTML, batch by batch.

    Each document's ``html`` is parsed with BeautifulSoup's ``html.parser``
    and reduced to text by ``PagesParser.get_pure_page_text``; the result is
    stored as ``page_text`` with ``text_generation_version = 2``. Processing
    stops when the provider returns no records.
    """
    parser = PagesParser()

    records = self.provider.get_records()
    while len(records) != 0:
        bulk = websites_db.initialize_unordered_bulk_op()
        for document in tqdm(records, desc="thread", leave=False):
            soup = BeautifulSoup(document["html"], "html.parser")
            selector = {"_id": document["_id"]}
            changes = {
                "$set": {
                    "page_text": parser.get_pure_page_text(soup),
                    "text_generation_version": 2
                }
            }
            bulk.find(selector).update_one(changes)
        bulk.execute()
        records = self.provider.get_records()
def train(self):
    """Train the model for ``self.num_of_epoch`` epochs.

    After every epoch the model is evaluated on the ``val`` and ``train``
    loaders. Every fifth epoch, and on the last one, the per-epoch mean loss
    curve is written to ``output_train_not_sm_thucth_xsub/loss/`` and the
    model weights are checkpointed to
    ``output_train_not_sm_thucth_xsub/model.pt``.
    """
    losses = []
    self.__load_to_device()

    for epoch in range(1, self.num_of_epoch + 1):
        losses_epoch = []
        for data, label, _ in tqdm(self.loader_data["train"]):
            data = data.float().to(self.device)
            data.requires_grad = False
            label = label.long().to(self.device)
            label.requires_grad = False

            # forward
            output_batch = self.model(data)
            loss_batch = self.loss(output_batch, label)

            # backward
            self.optimizer.zero_grad()
            loss_batch.backward()
            self.optimizer.step()

            # Store a plain float: keeping the tensor would keep its whole
            # autograd graph alive for the rest of the epoch.
            losses_epoch.append(loss_batch.item())

        # evaluate every epoch
        self.evaluate(
            epoch,
            save_score=True,
            loader_name=["val", "train"],
            fail_case_file="output_train_not_sm_thucth_xsub/result_fail.txt",
            pass_case_file="output_train_not_sm_thucth_xsub/result_pass.txt"
        )

        losses.append(
            torch.mean(torch.tensor(losses_epoch, dtype=torch.float)).item())

        # draw loss chart every 5-epoch
        if epoch % 5 == 0 or epoch == self.num_of_epoch:
            # Start from a clean figure so earlier curves are not redrawn
            # underneath the current one.
            plt.clf()
            plt.plot(losses)
            plt.xlabel('epoch')
            plt.ylabel('loss')
            plt.savefig(
                "output_train_not_sm_thucth_xsub/loss/losses{}.png".format(epoch))
            # NOTE(review): the original indentation was lost; the checkpoint
            # is written together with the chart, which always includes the
            # final epoch.
            torch.save(self.model.state_dict(),
                       "output_train_not_sm_thucth_xsub/model.pt")
def run(self) -> None:
    """Summarise page text with an XLNet summariser, batch by batch.

    Text after the first ``"↑"`` is discarded when that character occurs at
    a position greater than zero. The summary pieces returned by the model
    are joined and stored as ``xl_summary``. Processing stops when the
    provider returns no records.
    """
    model = TransformerSummarizer(transformer_type="XLNet",
                                  transformer_model_key="xlnet-base-cased")

    while True:
        pending = self.provider.get_records()
        if len(pending) == 0:
            break

        bulk = websites_db.initialize_unordered_bulk_op()
        for document in tqdm(pending, desc="thread", leave=False):
            text = document["page_text"]
            marker_at = text.find("↑")
            if marker_at > 0:
                text = text[:marker_at]
            doc_id = document["_id"]

            summary_parts = model(text, min_length=60, max_length=120)
            summary = "".join(summary_parts)
            bulk.find({"_id": doc_id}).update_one(
                {"$set": {"xl_summary": summary}})
        bulk.execute()
def __download_sources():
    """Bring the local crawler sources in line with the latest index.

    Crawlers missing from the latest index are removed from the current
    index. Every crawler in the latest index is copied into the current
    index, and its source file is downloaded when its version is newer or
    when the file exists neither in the user nor in the local data
    directory. Download failures are logged and do not stop the remaining
    downloads.
    """
    # Collect first, delete afterwards: the dict must not change size while
    # it is being iterated.
    tbd_sids = [
        sid for sid in __current_index['crawlers']
        if sid not in __latest_index['crawlers']
    ]
    for sid in tbd_sids:
        del __current_index['crawlers'][sid]

    futures: Dict[str, Future] = {}
    for sid, latest in __latest_index['crawlers'].items():
        current = __current_index['crawlers'].get(sid)
        has_new_version = not current or current['version'] < latest['version']
        __current_index['crawlers'][sid] = latest

        user_file = (__user_data_path / str(latest['file_path'])).is_file()
        local_file = (__local_data_path / str(latest['file_path'])).is_file()
        if has_new_version or not (user_file or local_file):
            futures[sid] = __executor.submit(__download_data, latest['url'])

    if not futures:
        return

    bar = tqdm(desc='Updating sources', total=len(futures), unit='file')
    if os.getenv('debug_mode') == 'yes':
        bar.update = lambda n=1: None  # Hide in debug mode
    bar.clear()

    for sid, future in futures.items():
        try:
            data = future.result()
            __save_source_data(sid, data)
        except Exception as e:
            # `Logger.warn` is a deprecated alias that newer Python versions
            # no longer provide.
            # NOTE(review): assumes `logger` is a standard logging.Logger.
            logger.warning('Failed to download source file. Error: %s', e)
        finally:
            bar.update()

    bar.clear()
    bar.close()
def parse(dir_path: str) -> PreDataList:
    """Read every subfolder's entry CSV and return the entries as ``PreData``.

    Each entry's text is converted to symbols for English, its accent is
    repeated once per symbol, and its wav path is resolved inside the
    subfolder's audio directory. The result is sorted by
    ``(speaker, subfolder, entry_id)``.

    Args:
        dir_path: Root directory containing the subfolders.

    Returns:
        A ``PreDataList`` with one item per entry.

    Raises:
        FileNotFoundError: If ``dir_path`` does not exist.
    """
    if not os.path.exists(dir_path):
        print("Directory not found:", dir_path)
        # A specific, message-carrying exception; still caught by callers
        # that handle `Exception`.
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    lang = Language.ENG
    tmp: List[Tuple[Tuple, PreData]] = []

    for subfolder in tqdm(get_subfolders(dir_path)):
        data_path = os.path.join(subfolder, OATA_CSV_NAME)
        entries = cast_as(Entries.load(Entry, data_path), Entries)
        for entry in entries.items():
            # Anything other than "m" is treated as female.
            gender = Gender.MALE if entry.gender == "m" else Gender.FEMALE
            symbols = text_to_symbols(entry.text, lang)
            wav_path = os.path.join(subfolder, AUDIO_FOLDER_NAME, entry.wav)
            data = PreData(
                name=entry.entry_id,
                speaker_name=entry.speaker,
                lang=lang,
                wav_path=wav_path,
                gender=gender,
                text=entry.text,
                symbols=symbols,
                accents=[entry.accent] * len(symbols),
            )
            sorting_keys = entry.speaker, subfolder, entry.entry_id
            tmp.append((sorting_keys, data))

    # Sort on the key tuple only, so PreData objects are never compared.
    tmp.sort(key=lambda x: x[0])
    return PreDataList([x for _, x in tmp])
if __name__ == "__main__":
    # Script entry: load and encode the data, build the next-token targets
    # and padding masks, then open the two CSV logs and write their headers.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data, vocab, max_seq = get_data(max_length=300)
    seq, token_to_id, id_to_token = process_data(data, vocab, max_seq)
    seq = torch.from_numpy(seq).to(device)

    x = seq
    # Targets are the inputs shifted left by one, with a zero column appended.
    # The padding column must live on the same device as `x`; a CPU tensor
    # here makes hstack fail as soon as CUDA is used.
    pad_column = torch.zeros(x.shape[0], 1, dtype=torch.int32, device=device)
    y = torch.hstack((x[:, 1:], pad_column)).to(device)

    pad_id = token_to_id["<PAD>"]
    mask = [create_mask(s, pad_id) for s in tqdm(x, desc="Creating masks")]
    mask = torch.from_numpy(np.array(mask)).to(device)

    # Hyper-parameter grids.
    embedding_sizes = [32, 64, 128, 512]
    heads = [1, 2, 4, 8]
    no_stacked_layers = [3, 4, 5, 6]

    # NOTE(review): both files are left open here; the code that writes the
    # data rows and closes them is not visible in this excerpt.
    metrics = open('metrics.csv', 'w')
    metrics.write(
        "EMBEDDING_SIZE, HEADS, NUMBER OF LAYERS, EPOCH, TRAIN_LOSS, TRAIN_PERP, TEST_LOSS, TEST_PERP\n"
    )
    generations = open("generations.csv", "w")
    generations.write(
        "EMBEDDING_SIZE, HEADS, NUMBER_OF_LAYERS, EPOCH, AVG_SIM, SAMPLE\n")
def __init__(self, data_path, num_seq, subject_list: List, label_dim=0):
    """Load the selected SEED subject files into memory.

    Every non-dunder key of a ``.mat`` file is one trial. A trial loses its
    last sample, is cut into windows of ``SEED_SAMPLING_RATE`` samples and,
    unless ``num_seq`` is 0, consecutive windows are grouped into sequences
    of ``num_seq`` (leftover windows are discarded). The trial label is
    ``SEED_LABELS[i]`` for the i-th trial. With ``num_seq == 0`` singleton
    dimensions are squeezed out of the data at the end.

    Args:
        data_path: Directory with one file per subject; the number of files
            must equal ``SEED_NUM_SUBJECT``.
        num_seq: Windows per sequence, or 0 for no grouping.
        subject_list: Indices into the sorted file list.
        label_dim: Accepted but not used by this constructor.
    """
    files = sorted(os.listdir(data_path))
    assert len(files) == SEED_NUM_SUBJECT
    files = [files[i] for i in subject_list]

    data_parts = []
    label_parts = []

    # Enumerate all files
    for a_file in tqdm(files):
        mat = sio.loadmat(os.path.join(data_path, a_file))
        # Each file contains 15 consecutive trials
        movie_ids = [key for key in mat.keys() if not key.startswith('__')]
        assert len(movie_ids) == len(SEED_LABELS)

        trials = []
        trial_labels = []
        for i, key in enumerate(movie_ids):
            trial = mat[key][:, :-1]  # remove the last redundant point
            # trial = tensor_standardize(trial, dim=-1)
            n_channels, n_points = trial.shape[0], trial.shape[1]
            assert n_points % SEED_SAMPLING_RATE == 0

            trial = trial.reshape(n_channels,
                                  n_points // SEED_SAMPLING_RATE,
                                  SEED_SAMPLING_RATE)
            # Shape: (num_seq, channel, time_len)
            trial = np.swapaxes(trial, 0, 1)

            if num_seq == 0:
                trial = np.expand_dims(trial, axis=1)
            else:
                # Drop trailing windows that do not fill a whole sequence.
                n_groups = trial.shape[0] // num_seq
                trial = trial[:n_groups * num_seq]
                trial = trial.reshape(n_groups, num_seq, *trial.shape[1:])

            # Final shape: (num_sample, num_seq, channel, time_len)
            trials.append(trial)
            trial_labels.append(
                np.full(shape=trial.shape[:2], fill_value=SEED_LABELS[i]))

        data_parts.append(np.concatenate(trials, axis=0))
        label_parts.append(np.concatenate(trial_labels, axis=0))

    merged_data = np.concatenate(data_parts, axis=0)
    merged_labels = np.concatenate(label_parts, axis=0)

    if num_seq == 0:
        merged_data = np.squeeze(merged_data)
        # merged_labels = np.squeeze(merged_labels)

    print(merged_data.shape)
    print(merged_labels.shape)

    self.data = merged_data
    self.labels = merged_labels
def __init__(self, data_path, num_seq, subject_list: List, label_dim=0):
    """Load the selected AMIGOS subject files into memory.

    Every trial in ``joined_data`` is cut into windows of
    ``AMIGOS_SAMPLING_RATE`` samples, truncated to a multiple of
    ``num_seq`` windows, standardised per position across windows and
    grouped into sequences of ``num_seq`` windows. The trial's
    self-assessment label is repeated so that its first two axes match the
    data. Trials are dropped (with a warning) when they contain NaN, are
    too short for a single sequence, or have an empty label.

    Args:
        data_path: Directory with one file per subject; the number of files
            must equal ``AMIGOS_NUM_SUBJECT``.
        num_seq: Windows per sequence; must be non-zero.
        subject_list: Indices into the sorted file list.
        label_dim: Stored as ``self.label_dim``.
    """
    self.label_dim = label_dim

    files = sorted(os.listdir(data_path))
    assert len(files) == AMIGOS_NUM_SUBJECT
    files = [files[i] for i in subject_list]

    all_data = []
    all_labels = []
    for a_file in tqdm(files):
        data = sio.loadmat(os.path.join(data_path, a_file))
        subject_data = []
        subject_label = []
        for i in range(data['joined_data'].shape[1]):
            trial_data = data['joined_data'][0, i]
            trial_label = data['labels_selfassessment'][0, i]

            # Keep whole windows only, then split into
            # (window, channel, sample-in-window).
            n_windows = trial_data.shape[0] // AMIGOS_SAMPLING_RATE
            trial_data = trial_data[:n_windows * AMIGOS_SAMPLING_RATE]
            trial_data = trial_data.reshape(n_windows, AMIGOS_SAMPLING_RATE,
                                            trial_data.shape[-1])
            trial_data = np.swapaxes(trial_data, 1, 2)

            if np.isnan(trial_data).any():
                warnings.warn(
                    f"The array of {a_file} - {i} contains {np.sum(np.isnan(trial_data))} NaN of total {np.prod(trial_data.shape)} points, dropped."
                )
                # trial_data[np.isnan(trial_data)] = 0
                continue

            if trial_data.shape[0] % num_seq != 0:
                trial_data = trial_data[:trial_data.shape[0] // num_seq * num_seq]

            # Drop too-short trials *before* standardising: taking the mean
            # of an empty array only produced NaN and a RuntimeWarning.
            if 0 in trial_data.shape:
                warnings.warn(
                    f"The array of shape {data['joined_data'][0, i].shape} is too small, dropped."
                )
                continue

            # Standardize
            mean_value = np.expand_dims(trial_data.mean(axis=0), axis=0)
            std_value = np.expand_dims(trial_data.std(axis=0), axis=0)
            # A constant position has zero deviation; dividing by it would
            # inject NaN into a trial that just passed the NaN filter.
            std_value = np.where(std_value == 0, 1, std_value)
            trial_data = (trial_data - mean_value) / std_value

            trial_data = trial_data.reshape(trial_data.shape[0] // num_seq,
                                            num_seq, *trial_data.shape[1:])

            trial_label = np.repeat(trial_label, trial_data.shape[1], axis=0)
            trial_label = np.repeat(np.expand_dims(trial_label, axis=0),
                                    trial_data.shape[0], axis=0)
            if 0 in trial_label.shape:
                warnings.warn(
                    f"The label of {a_file} - {i} is malfunctioned, dropped."
                )
                continue

            subject_data.append(trial_data)
            subject_label.append(trial_label)

        subject_data = np.concatenate(subject_data, axis=0)
        subject_label = np.concatenate(subject_label, axis=0)
        all_data.append(subject_data)
        all_labels.append(subject_label)

    all_data = np.concatenate(all_data, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    print(all_data.shape)
    print(all_labels.shape)

    self.data = all_data
    self.labels = all_labels
def __init__(self, data_path, num_seq, subject_list: List, label_dim=0):
    """Load the selected DEAP subject files into memory.

    Per subject, the signal is cut into windows of ``DEAP_SAMPLING_RATE``
    samples and consecutive windows are grouped into sequences of
    ``num_seq`` (leftover windows are discarded). With ``num_seq == 0``
    every window is its own sample and singleton dimensions are squeezed
    out of the data at the end. Labels are repeated so that their first two
    axes match the data.

    Args:
        data_path: Directory with one file per subject; the number of files
            must equal ``DEAP_NUM_SUBJECT``.
        num_seq: Windows per sequence, or 0 for no grouping.
        subject_list: Indices into the sorted file list.
        label_dim: Stored as ``self.label_dim``.
    """
    self.label_dim = label_dim

    files = sorted(os.listdir(data_path))
    assert len(files) == DEAP_NUM_SUBJECT
    files = [files[i] for i in subject_list]

    data_parts = []
    label_parts = []
    for a_file in tqdm(files):
        mat = sio.loadmat(os.path.join(data_path, a_file))
        signals = mat['data']  # trial x channel x data
        labels = mat['labels']  # trial x label (valence, arousal, dominance, liking)
        # signals = tensor_standardize(signals, dim=-1)

        n_trials, n_channels = signals.shape[:2]
        # (trial, channel, num_sec, time_len)
        signals = signals.reshape(n_trials, n_channels,
                                  signals.shape[-1] // DEAP_SAMPLING_RATE,
                                  DEAP_SAMPLING_RATE)
        # (trial, num_sec, channel, time_len)
        signals = np.swapaxes(signals, 1, 2)

        if num_seq == 0:
            signals = np.expand_dims(signals, axis=2)
        else:
            # Drop trailing windows that do not fill a whole sequence.
            n_groups = signals.shape[1] // num_seq
            signals = signals[:, :n_groups * num_seq]
            signals = signals.reshape(signals.shape[0], n_groups, num_seq,
                                      *signals.shape[-2:])

        # Broadcast the per-trial label over the two new leading axes.
        labels = np.expand_dims(labels, axis=1)
        labels = np.repeat(labels, signals.shape[1], axis=1)
        labels = np.expand_dims(labels, axis=2)
        labels = np.repeat(labels, signals.shape[2], axis=2)

        # Merge the first two axes into one sample axis.
        signals = signals.reshape(signals.shape[0] * signals.shape[1],
                                  *signals.shape[2:])
        labels = labels.reshape(labels.shape[0] * labels.shape[1],
                                *labels.shape[2:])

        data_parts.append(signals)
        label_parts.append(labels)

    merged_data = np.concatenate(data_parts, axis=0)
    merged_labels = np.concatenate(label_parts, axis=0)

    if num_seq == 0:
        merged_data = np.squeeze(merged_data)
        # merged_labels = np.squeeze(merged_labels)

    print(merged_data.shape)
    print(merged_labels.shape)

    self.data = merged_data
    self.labels = merged_labels