def get_proba(self, sentences):
    """Return class probabilities for ``sentences``.

    For the ``'bert'`` model the rows of ``sentences`` are converted to BERT
    features in a process pool, a ``BertForSequenceClassification`` model is
    loaded from ``'bert/'`` and the soft-maxed logits of every batch are
    collected.  Any other model delegates to ``self.model.predict_proba``.

    Parameters
    ----------
    sentences
        For ``'bert'``: an object exposing ``iterrows()`` and ``len()``
        (presumably a pandas DataFrame -- TODO confirm).  Otherwise whatever
        ``self.model.predict_proba`` accepts.

    Returns
    -------
    numpy.ndarray or None
        One row of probabilities per example (soft-max over ``dim=1``), or
        ``None`` when the data loader yields no batch.  For non-BERT models,
        the return value of ``self.model.predict_proba``.
    """
    if self.model_name != 'bert':
        return self.model.predict_proba(sentences)

    examples_for_processing = [
        (row, self.MAX_SEQ_LENGTH, self.bert_tokenizer)
        for _, row in sentences.iterrows()
    ]

    # Leave one core free, but never ask for zero workers: Pool(0) raises
    # ValueError on a single-core machine.
    process_count = max(1, cpu_count() - 1)
    print(f'Preparing to convert {len(sentences)} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        features = list(
            tqdm_notebook(
                p.imap(self.convert_example_to_bert_feature,
                       examples_for_processing),
                total=len(sentences)))

    all_input_ids, all_input_mask, all_segment_ids = self.get_bert_tensors(
        features)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=self.EVAL_BATCH_SIZE)

    model = BertForSequenceClassification.from_pretrained(
        'bert/', cache_dir='cache/', num_labels=len(self.classes))
    model.to(self.device)
    # Inference must run in eval mode, otherwise dropout stays active and the
    # returned probabilities are noisy and non-deterministic.
    model.eval()

    batch_probas = []
    for input_ids, input_mask, segment_ids in tqdm_notebook(
            eval_dataloader, desc="Predicting"):
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)

        prob = torch.nn.functional.softmax(logits, dim=1)
        batch_probas.append(prob.detach().cpu().numpy())

    if not batch_probas:
        # Same result as before for an empty loader.
        return None
    # One concatenation instead of re-copying the growing array per batch.
    return np.concatenate(batch_probas, axis=0)
def pos_analysis(df, group_cols=None, round_decimal=1):
    """Print part-of-speech statistics for a tagged corpus.

    Without ``group_cols`` one line is printed per part of speech with the
    mean number of entries per row of ``df['pos']`` carrying that tag.
    With ``group_cols`` the rows are grouped first; per part of speech the
    mean number of *distinct* tokens per group is printed, followed by the
    same figure divided by the group's row count.

    Parameters
    ----------
    df
        Frame with a ``pos`` column whose entries are sequences of pairs with
        the tag at index 1, and (for the grouped variant) a ``tokens`` column
        aligned with ``pos``.
    group_cols
        Column(s) handed to ``df.groupby``; ``None`` selects the ungrouped
        variant.
    round_decimal
        Number of decimals in the printed figures.
    """
    # Assumes nltk universal pos-tagging and that df['pos'] holds the tags.
    # The analysis is restricted to the POS used in the paper.
    tag_labels = {
        'NOUN': 'Nouns',
        'PRON': 'Pronouns',
        'ADJ': 'Adjectives',
        'ADP': 'Adpositions',
        'VERB': 'Verbs',
    }

    if group_cols is None:
        for tag, label in tag_labels.items():
            per_row = df.pos.apply(
                lambda tagged: len([pair[0] for pair in tagged
                                    if pair[1] == tag]))
            print(label, per_row.mean().round(round_decimal))
        return

    grouped = df.groupby(group_cols)
    vocab_per_group = []
    rows_per_group = []
    for _, frame in tqdm_notebook(grouped):
        vocab = defaultdict(set)
        rows_per_group.append(len(frame))
        for row_tokens, row_tags in zip(frame.tokens, frame.pos):
            for token, tagged in zip(row_tokens, row_tags):
                vocab[tagged[1]].add(token)
        vocab_per_group.append(vocab)

    for tag, label in tag_labels.items():
        unique_counts = []
        unique_counts_norm = []
        for vocab, n_rows in zip(vocab_per_group, rows_per_group):
            unique_counts.append(len(vocab[tag]))
            unique_counts_norm.append(unique_counts[-1] / n_rows)
        print(label,
              '{:.{}f}'.format(np.mean(unique_counts), round_decimal),
              '{:.{}f}'.format(np.mean(unique_counts_norm), round_decimal))
def outlier_dbscan(data):
    """Flag per-class outliers with DBSCAN and plot their counts.

    For every distinct ``TRAIN_CLASS`` the complete (NaN-free) rows of the
    feature columns are normalised with ``Normalizer`` and clustered with
    ``DBSCAN(eps=.05, min_samples=10)``.  The cluster label is written to
    ``data['OUTLIER']`` and afterwards collapsed to ``0`` (label >= 0) or
    ``-1`` (anything else -- this includes values for which ``y >= 0`` is
    false, such as NaN).  A bar chart of the ``-1`` count per class is shown.

    Parameters
    ----------
    data
        Frame with ``TRAIN_CLASS``, ``x`` and the feature columns listed
        below.  It is modified in place.

    Returns
    -------
    The same ``data`` object with the ``OUTLIER`` column set.
    """
    index_names = ('wet', 'green', 'bright', 'ARVI', 'SAVI', 'NDBI', 'mNDWI',
                   'NDWI', 'mNDVI', 'NDVI')
    band_names = ('B12', 'B11', 'B8', 'B4', 'B3', 'B2')
    # Same columns, same order: index means, index medians (p50), band means,
    # band medians.
    columns = ([name + '_mean' for name in index_names] +
               [name + '_p50' for name in index_names] +
               ['S2_' + band + 'mean' for band in band_names] +
               ['S2_' + band + 'med' for band in band_names])

    class_values = data.TRAIN_CLASS.unique()
    for pos in tqdm_notebook(range(len(class_values)),
                             desc='Processing Clustering Outlier data'):
        in_class = data.TRAIN_CLASS == class_values[pos]
        members = data.loc[in_class, columns].dropna()
        scaler = Normalizer()
        clustering = DBSCAN(eps=.05, min_samples=10).fit(
            scaler.fit_transform(members))
        members['label'] = clustering.labels_
        data.loc[members.index, 'OUTLIER'] = members.label

    data['OUTLIER'] = data.OUTLIER.apply(lambda y: 0 if y >= 0 else -1)

    flagged = data.loc[data.OUTLIER < 0, ['x', 'TRAIN_CLASS']]
    counts = flagged.groupby('TRAIN_CLASS').agg('count')
    data_outlier = counts.rename(columns={'x': 'COUNT_OUTLIER'}).reset_index()

    fig = px.bar(data_outlier,
                 x="TRAIN_CLASS",
                 y="COUNT_OUTLIER",
                 title="OUTLIER")
    fig.show()
    return data
def _build_tqdm_iterator(iterable, verbose, **kwargs):
    """
    Wrap ``iterable`` in a tqdm progress bar when ``verbose`` is truthy.

    Parameters
    ----------
    iterable
        The object to iterate over.
    verbose
        Falsy: ``iterable`` is returned untouched.  Truthy: it is wrapped in
        ``tqdm_notebook`` inside a Jupyter kernel and in ``tqdm`` otherwise.
    **kwargs
        Forwarded to the tqdm constructor, e.g. ``total`` (length of the
        iterator, helps in cases where tqdm is not detecting the total
        length).

    Returns
    -------
    ``iterable`` itself or a tqdm wrapper around it.
    """

    def _isnotebook():
        # ``get_ipython`` only exists as a name inside IPython sessions; a
        # NameError therefore means a standard Python interpreter.
        try:
            shell = get_ipython().__class__.__name__
        except NameError:
            return False
        # 'ZMQInteractiveShell' is a Jupyter notebook or qtconsole; a
        # terminal IPython ('TerminalInteractiveShell') and anything else
        # count as "not a notebook".
        return shell == 'ZMQInteractiveShell'

    if not verbose:
        return iterable
    bar_factory = tqdm_notebook if _isnotebook() else tqdm
    return bar_factory(iterable, **kwargs)
def rfe_cat(train_x, train_y, valid_x, valid_y, min_):
    """Run CatBoost recursive feature elimination for several target sizes.

    For every target size from ``min_`` up to 35 a fresh
    ``CatBoostClassifier`` selects that many of the features ``'0-34'`` by
    SHAP values, and the final model is scored with the micro-averaged F1 on
    the validation pool.  The best score, its feature count and its feature
    names are printed.

    Parameters
    ----------
    train_x, train_y, valid_x, valid_y
        Training / validation data handed to ``Pool``; column 0 is declared
        categorical.
    min_
        Smallest number of features to try.

    Returns
    -------
    The selected feature names of the first trial reaching the best F1.

    Raises
    ------
    ValueError
        From ``max`` when no trial ran (``min_ >= 36``).
    """
    train_pool = Pool(train_x, train_y, cat_features=[0])
    valid_pool = Pool(valid_x, valid_y, cat_features=[0])

    # One (f1, number of features, selected names) tuple per trial.
    trials = []
    print('Start Recursive Feature Elimination')
    for n_keep in tqdm_notebook(range(min_, 36),
                                desc='Iterating Feature Elimination'):
        model = CatBoostClassifier(iterations=50,
                                   random_seed=1234,
                                   used_ram_limit='10gb')
        summary = model.select_features(
            train_pool,
            eval_set=valid_pool,
            features_for_select='0-34',
            num_features_to_select=n_keep,
            steps=2,
            algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
            shap_calc_type=EShapCalcType.Regular,
            train_final_model=True,
            logging_level='Silent',
        )
        predictions = model.predict(valid_pool).tolist()
        score = f1_score(valid_y, predictions, average='micro')
        trials.append((score, n_keep, summary['selected_features_names']))

    # ``max`` keeps the first maximal trial, matching ``list.index(max(...))``.
    best_score, best_count, best_names = max(trials, key=lambda t: t[0])
    print('Best F-1 score: ', best_score)
    print('Best Number feature: ', best_count)
    print('Selected of Feature names: \n', best_names)
    return best_names
def get_data(path, train=True):
    """Load every image under ``path/image`` (and its mask) as float arrays.

    Images are read in grayscale, resized to the module-level
    ``(im_height, im_width)`` and scaled by 1/255.  With ``train`` the
    polygon annotation ``<first 6 chars of the file name>_gtFine_polygons.json``
    is read from ``path/annotation``, rasterised with ``create_mask`` and
    treated the same way.

    Parameters
    ----------
    path : str
        Directory containing ``image/`` and, for training, ``annotation/``.
    train : bool
        Whether masks are loaded and returned as well.

    Returns
    -------
    ``X`` of shape ``(n, im_height, im_width, 1)``, or ``(X, y)`` with ``y``
    of the same shape when ``train`` is true.
    """
    ids = next(os.walk(path + "/image"))[2]
    # Resize to the size the buffers are allocated with; the previous
    # hard-coded (128, 128, 1) only worked when im_height == im_width == 128.
    target_shape = (im_height, im_width, 1)
    X = np.zeros((len(ids),) + target_shape, dtype=np.float32)
    if train:
        y = np.zeros((len(ids),) + target_shape, dtype=np.float32)
    print('Getting and resizing images ... ')
    for n, id_ in tqdm_notebook(enumerate(ids), total=len(ids)):
        # Load images
        img = load_img(path + '/image/' + id_, color_mode="grayscale")
        x_img = img_to_array(img)
        x_img = resize(x_img,
                       target_shape,
                       mode='constant',
                       preserve_range=True)

        # Load annotation
        if train:
            an_ = id_[:6] + "_gtFine_polygons.json"
            # Read from the ``path`` argument: the module-level ``path_train``
            # used before ignored the directory the caller asked for.
            with open(path + "/annotation/" + an_) as f:
                data = json.load(f)
            mask = img_to_array(create_mask(data))
            mask = resize(mask,
                          target_shape,
                          mode='constant',
                          preserve_range=True)

        # Save images
        X[n, ..., 0] = x_img.squeeze() / 255
        if train:
            y[n] = mask / 255
    print('Done!')
    if train:
        return X, y
    return X
def evaluate_on_dataset(model,
                        data_loader,
                        criterion,
                        device,
                        detailed=True,
                        kl_div=True):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Parameters
    ----------
    model
        Called as ``model(img)``; put into eval mode here.
    data_loader
        Yields dict batches with ``'image'`` and ``'label'`` tensors.
    criterion
        Called as ``criterion(logits, labels)``.
    device
        Device the batch tensors are moved to.
    detailed : bool
        Whether per-example confidences are collected.
    kl_div : bool
        True: the model output is treated as log-soft-max and exponentiated.
        False: it is treated as raw logits and soft-maxed over the last axis.

    Returns
    -------
    (float, numpy.ndarray or list)
        Batch-size weighted average loss and the stacked confidences (an
        empty list when ``detailed`` is false).
    """
    epoch_loss = AverageMeter()
    model.eval()
    epoch_confidence = []
    # Evaluation needs no autograd graph.  Without no_grad the collected
    # tensors require grad and ``.numpy()`` below raises a RuntimeError.
    with torch.no_grad():
        for batch in tqdm_notebook(data_loader):
            img = batch['image'].to(device)
            labels = batch['label'].to(device)  # emotion_distribution
            logits = model(img)

            # Calculate loss
            loss = criterion(logits, labels)

            if detailed:
                if kl_div:
                    # logits are log-soft-max
                    epoch_confidence.append(torch.exp(logits).cpu())
                else:
                    # logits are pure logits
                    epoch_confidence.append(F.softmax(logits, dim=-1).cpu())

            b_size = len(labels)
            epoch_loss.update(loss.item(), b_size)

    if detailed:
        epoch_confidence = torch.cat(epoch_confidence).numpy()
    return epoch_loss.avg, epoch_confidence
def evaluate_on_dataset(model,
                        data_loader,
                        use_vision,
                        criterion,
                        device,
                        detailed=True):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Parameters
    ----------
    model
        Called as ``model(tokens, img)`` when ``use_vision`` is truthy and as
        ``model(tokens)`` otherwise; put into eval mode here.
    data_loader
        Yields dict batches with ``'emotion'`` and ``'tokens'`` tensors, plus
        ``'image'`` when ``use_vision`` is set.
    use_vision : bool
        Whether the image is fed to the model as well.
    criterion
        Called as ``criterion(logits, labels)``.
    device
        Device the batch tensors are moved to.
    detailed : bool
        Whether the per-example soft-max confidences are collected.

    Returns
    -------
    (float, float, numpy.ndarray or list)
        Batch-size weighted average loss, average arg-max accuracy and the
        stacked confidences (an empty list when ``detailed`` is false).
    """
    epoch_loss = AverageMeter()
    epoch_acc = AverageMeter()
    model.eval()
    epoch_confidence = []
    # Evaluation needs no autograd graph.  Without no_grad the collected
    # tensors require grad and ``.numpy()`` below raises a RuntimeError.
    with torch.no_grad():
        for batch in tqdm_notebook(data_loader):
            labels = batch['emotion'].to(device)
            tokens = batch['tokens'].to(device)

            if use_vision:
                img = batch['image'].to(device)
                logits = model(tokens, img)
            else:
                logits = model(tokens)

            # Calculate loss
            loss = criterion(logits, labels)
            guessed_correct = logits.argmax(1) == labels
            acc = torch.mean(guessed_correct.double())

            if detailed:
                epoch_confidence.append(F.softmax(logits, dim=-1).cpu())

            b_size = len(labels)
            epoch_loss.update(loss.item(), b_size)
            epoch_acc.update(acc.item(), b_size)

    if detailed:
        epoch_confidence = torch.cat(epoch_confidence).numpy()
    return epoch_loss.avg, epoch_acc.avg, epoch_confidence
def single_epoch_train(model, data_loader, use_vision, criterion, optimizer,
                       device):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a dict with ``'emotion'`` (labels) and ``'tokens'``
    tensors, plus ``'image'`` when ``use_vision`` is truthy.  The model is
    called with the tokens (and the image), the loss is back-propagated and
    the optimizer stepped once per batch.

    Returns
    -------
    (float, float)
        Batch-size weighted average loss and arg-max accuracy of the epoch.
    """
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    model.train()

    for batch in tqdm_notebook(data_loader):
        labels = batch['emotion'].to(device)
        tokens = batch['tokens'].to(device)

        if use_vision:
            model_inputs = (tokens, batch['image'].to(device))
        else:
            model_inputs = (tokens, )
        logits = model(*model_inputs)

        # Loss and accuracy of this batch.
        loss = criterion(logits, labels)
        hits = logits.argmax(1) == labels
        acc = torch.mean(hits.double())

        # Back-propagation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        n_examples = len(labels)
        loss_meter.update(loss.item(), n_examples)
        acc_meter.update(acc.item(), n_examples)

    return loss_meter.avg, acc_meter.avg
def get_data(path, train=False):
    """Load the single image at ``path`` as a normalised batch of one.

    The image is read in grayscale, resized to ``(128, 128, 1)``, divided by
    255 and stored in a ``(1, im_height, im_width, 1)`` float32 array
    (``im_height`` / ``im_width`` are module-level names).

    Parameters
    ----------
    path
        Image file handed to ``load_img``.
    train : bool
        When true a target array ``y`` is allocated and returned too.

    Returns
    -------
    ``X``, or ``(X, y)`` when ``train`` is true.
    """
    ids = [1]  # a single placeholder entry: exactly one image is loaded
    X = np.zeros((1, im_height, im_width, 1), dtype=np.float32)
    if train:
        y = np.zeros((len(ids), im_height, im_width, 1), dtype=np.float32)
    print("Getting and resizing images ... ")

    for slot, _ in tqdm_notebook(enumerate(ids), total=len(ids)):
        # Load images
        img = load_img(path, color_mode="grayscale")
        print(img)
        pixels = resize(img_to_array(img), (128, 128, 1),
                        mode="constant",
                        preserve_range=True)

        # Save images
        X[slot, ..., 0] = pixels.squeeze() / 255
        if train:
            # NOTE(review): ``mask`` is not defined inside this function; it
            # presumably has to exist at module level, otherwise this line
            # raises NameError -- TODO confirm.
            y[slot] = mask / 255

    print("Done!")
    return (X, y) if train else X
def _default_bar_func_mapping():
    """Return the mapping from progress-bar name to wrapper factory.

    Every value takes a dict of keyword arguments and returns a one-argument
    callable that wraps an iterable: with ``tqdm``, with ``tqdm_notebook``,
    or -- for the ``'False'`` and ``'None'`` keys -- with the builtin
    ``iter`` (no progress bar; the arguments are ignored).
    """

    def _console_bar(args):
        return lambda x: tqdm(x, **args)

    def _notebook_bar(args):
        return lambda x: tqdm_notebook(x, **args)

    def _no_bar(args):
        return iter

    return {
        'tqdm': _console_bar,
        'tqdm_notebook': _notebook_bar,
        'False': _no_bar,
        'None': _no_bar,
    }
def get_neocr_dicts(xml_dir):
    """Convert the XML annotations in ``xml_dir`` to a list of record dicts.

    Every ``*.xml`` file (in sorted order) yields one record with the image
    path (joined with the module-level ``img_dir``), a running ``image_id``,
    the size of the image as read from disk and one annotation per object.
    Each annotation holds the axis-aligned bounding box of the object's
    polygon, in ``BoxMode.XYXY_ABS``, plus a rectangular segmentation.

    Parameters
    ----------
    xml_dir : str
        Directory containing the annotation files.

    Returns
    -------
    list of dict

    Raises
    ------
    FileNotFoundError
        If the image referenced by an annotation cannot be read.
    """
    xml_files = sorted(glob.glob(f'{xml_dir}/*.xml'))

    dataset_dicts = []
    for idx, xml_file in enumerate(tqdm_notebook(xml_files)):
        # Load XML format to Dict; the context manager closes the handle
        # instead of leaking one per annotation file.
        with open(xml_file) as xml_handle:
            doc = xmltodict.parse(xml_handle.read())
        annotation = doc['annotation']

        filename = os.path.join(img_dir, annotation['filename'])
        image = cv2.imread(filename)
        if image is None:
            # cv2.imread signals failure by returning None; fail with a
            # message naming the file rather than an AttributeError.
            raise FileNotFoundError(
                'Could not read image {!r} referenced by {!r}'.format(
                    filename, xml_file))
        height, width = image.shape[:2]

        record = {
            "file_name": filename,
            "image_id": idx,
            # different from annotation['properties']['height'] / ['width']
            "height": height,
            "width": width,
        }

        # A single object is parsed as a dict rather than a list.
        ann_objects = annotation['object']
        if not isinstance(ann_objects, list):
            ann_objects = [ann_objects]

        objs = []
        for ann_object in ann_objects:
            # Bounding box of this object's polygon; floats are used because
            # int could not be dumped to a json file.
            corners = np.array([[float(pt['x']), float(pt['y'])]
                                for pt in ann_object['polygon']['pt']])
            x_min, y_min = np.min(corners, axis=0)
            x_max, y_max = np.max(corners, axis=0)

            objs.append({
                "bbox": [x_min, y_min, x_max, y_max],
                "bbox_mode": BoxMode.XYXY_ABS,
                "category_id": 0,
                # Specify coordinates so that it goes around the boundary.
                "segmentation": [[
                    x_min, y_min, x_min, y_max, x_max, y_max, x_max, y_min
                ]],
            })

        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts
def modeling(list_of_df, base_estimator):
    """Fit one independent copy of ``base_estimator`` per data frame.

    Parameters
    ----------
    list_of_df
        Indexable collection of frames; each holds the features plus the
        ``self.target`` column.
    base_estimator
        Estimator with a ``fit(X, y)`` method; it is deep-copied for every
        frame, so the object passed in is never fitted itself.

    Returns
    -------
    list
        The fitted copies, in the order of ``list_of_df``.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably this function
    # is defined inside a method and picks ``self`` up from the enclosing
    # scope -- confirm.
    fitted_models = []
    for idx in tqdm_notebook(range(len(list_of_df))):
        frame = list_of_df[idx]
        features = frame.drop(self.target, axis=1)
        target = frame[self.target]

        estimator = copy.deepcopy(base_estimator)
        estimator.fit(features, target)
        fitted_models.append(estimator)
    return fitted_models
def __call__(self, current_size, max_size=None):
    """Update the progress bar.

    Parameters
    ----------
    current_size
        Value handed to the bar's ``update``.
    max_size
        When given, stored as ``self.max_size``; used as the bar's total the
        first time the bar is created.
    """
    if max_size is not None:
        self.max_size = max_size

    # The bar is created lazily on the first call, in bytes with unit scaling.
    if self.pb is None:
        bar_options = dict(total=self.max_size, unit="B", unit_scale=True)
        self.pb = tqdm_notebook(**bar_options)

    # NOTE(review): tqdm's ``update`` adds an increment; if callers pass a
    # cumulative size here the bar over-counts -- confirm the caller contract.
    self.pb.update(current_size)
def get_timeseries(df, y_index, time_steps):
    """Cut a 2-D series into sliding windows and next-step targets.

    Window ``i`` covers rows ``i .. i + time_steps - 1`` of ``df``; its
    target is column ``y_index`` of the row right after the window.

    Parameters
    ----------
    df
        2-D data supporting ``df[i:j]`` and ``df[row, col]`` indexing
        (presumably a numpy array -- TODO confirm).
    y_index
        Column used as target.
    time_steps
        Number of rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` of shape ``(rows - time_steps, time_steps, columns)`` and ``y``
        of shape ``(rows - time_steps,)``.
    """
    n_windows = df.shape[0] - time_steps
    n_columns = df.shape[1]
    x = np.zeros((n_windows, time_steps, n_columns))
    y = np.zeros((n_windows, ))

    for start in tqdm_notebook(range(n_windows)):
        stop = time_steps + start
        x[start] = df[start:stop]
        y[start] = df[stop, y_index]
    return x, y
def iterate_geom_(data_riox, train_vector):
    """Clip ``data_riox`` with every geometry of ``train_vector``.

    Each geometry is passed to ``preprocessing_tif_vector``; the resulting
    frame gets a ``TRAIN_CLASS`` column holding the matching
    ``train_vector.id`` and all frames are stacked, most recent first (the
    order the previous ``append`` chain produced).

    Geometries that fail are skipped (best effort), but a warning naming the
    geometry and the error is emitted instead of hiding the failure.

    Parameters
    ----------
    data_riox
        Raster handed to ``preprocessing_tif_vector``.
    train_vector
        Object with ``shape``, ``geometry`` and ``id`` indexable by
        ``0 .. shape[0] - 1``.

    Returns
    -------
    pandas.DataFrame
        The stacked frames, or an empty frame when nothing was clipped.
    """
    import warnings

    clipped_frames = []
    range_max = train_vector.shape[0]
    for i in tqdm_notebook(range(range_max),
                           desc='Processing clipping raster with vector'):
        try:
            r_t = preprocessing_tif_vector(data_riox, train_vector.geometry[i])
            r_t['TRAIN_CLASS'] = train_vector.id[i]
        except Exception as exc:
            # The bare ``except: pass`` used before also swallowed
            # KeyboardInterrupt and hid every real error.
            warnings.warn('Skipping geometry {}: {!r}'.format(i, exc))
            continue
        clipped_frames.append(r_t)

    if not clipped_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0 (and the old handler hid the
    # resulting AttributeError); a single concat also avoids re-copying the
    # accumulated frame on every iteration.
    return pd.concat(clipped_frames[::-1])
def get_scores(data_loader, data_references):
    """Compute corpus-level BLEU-1 of the generated outputs.

    Every batch ``(v, inp)`` is run through ``evaluate(inp, 30)``.  For each
    example the output string is split on whitespace and truncated to the
    integer mean length of that example's references
    (``data_references[v[i]]``).

    Parameters
    ----------
    data_loader
        Yields ``(v, inp)`` batches; ``v[i]`` keys into ``data_references``.
    data_references
        Mapping from key to the list of references of an example.

    Returns
    -------
    dict
        ``{'BLEU1': score}``.
    """
    references = []
    candidates = []
    for _, (keys, inputs) in notebook.tqdm_notebook(enumerate(data_loader)):
        outputs = evaluate(inputs.to(device=DEVICE), 30)
        for row in range(inputs.shape[0]):
            refs = data_references[keys[row]]
            mean_ref_len = sum(len(ref) for ref in refs) // len(refs)
            candidates.append(outputs[row].split()[:mean_ref_len])
            references.append(refs)

    bleu1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0, 0))
    return {'BLEU1': bleu1}
def read_BSSR1_scores_from_file(enrollees_id_filepath, users_id_filepath,
                                path):
    """Read BSSR1 similarity files into one DataFrame.

    Parameters
    ----------
    enrollees_id_filepath, users_id_filepath
        XML files describing the enrollees (columns) and the users (rows).
    path
        Glob pattern matching the similarity files.

    Returns
    -------
    (pandas.DataFrame, ElementTree, ElementTree, list)
        The score frame (string values, indexed by the users' ``subject_id``,
        one column per enrollee ``subject_id``), both parsed XML trees and
        the enrollee column names.

    Raises
    ------
    AssertionError
        If a file does not hold exactly the announced number of scores, or
        that number is not 6000.
    ValueError
        If a similarity file has no matching entry in the users XML.
    """
    import os

    # parse the XML files
    enrollees = ET.parse(enrollees_id_filepath)
    users = ET.parse(users_id_filepath)

    rows = []
    # iterate ``glob.iglob(path)`` directly instead if the tqdm package is
    # not installed
    for filepath in tqdm_notebook(glob.glob(path)):
        # basename also works for the back-slash paths glob yields on Windows
        file_name = os.path.basename(filepath)
        with open(filepath, 'r') as score_file:
            read_data = np.array(score_file.read().split('\n'))

        # ``np.str`` was removed in NumPy 1.24; the builtin is equivalent.
        sims = read_data[2:-2].astype(str)
        n_cmp = int(read_data[1])
        assert sims.shape[0] == n_cmp
        assert sims.shape[0] == 6000

        # "The order of the elements in the similarity file are fixed for all
        # similarity files in the tree. They are not sorted on similarity
        # value. The order corresponds to the entries in the enrollees.xml
        # file."

        # grab the subject_id of current user
        user = users.find("./*[@name='{}']".format(file_name))
        if user is None:
            raise ValueError(
                'No user entry named {!r} in {!r}'.format(
                    file_name, users_id_filepath))
        subject_id = user.attrib['subject_id']

        # Build the row as a plain list: ``np.insert`` casts the inserted id
        # to the fixed-width string dtype of ``sims`` and may truncate it.
        rows.append([subject_id] + sims.tolist())

    # extract the column names for later indexing
    column_names = [e.attrib['subject_id'] for e in enrollees.findall("./*")]
    column_names_ex = ['subject_id'] + column_names

    # convert to pandas dataframe, indexed by subject_id
    df = pd.DataFrame(rows, columns=column_names_ex)
    df = df.set_index('subject_id')

    return (df, enrollees, users, column_names)
def progressbar(*args, **kwargs):
    """Uses tqdm progressbar. This function exists for wrapping purposes only.
    Original docstring follows:
    ----------------------------------------
    %s
    %s
    """
    # NOTE: the docstring text is deliberately left as it was -- its two
    # ``%s`` placeholders are presumably filled in elsewhere via
    # %-formatting, so extra text there could break that step.
    #
    # All arguments are forwarded unchanged.  The notebook bar is tried first
    # when ``preferences.General.nb_progressbar`` is set; if it cannot be
    # created the console bar is returned instead.
    if preferences.General.nb_progressbar:
        try:
            return tqdm_notebook(*args, **kwargs)
        except Exception:
            # Deliberate best-effort fallback.  ``Exception`` instead of a
            # bare ``except`` so KeyboardInterrupt / SystemExit propagate.
            pass
    return tqdm(*args, **kwargs)
def __call__(self, current_size, max_size=None):
    """Update the progress bar.

    Parameters
    ----------
    current_size
        Value handed to the bar's ``update``.
    max_size
        When given, stored as ``self.max_size``; used as the bar's total the
        first time the bar is created.
    """
    if max_size is not None:
        self.max_size = max_size

    # The bar is created lazily on the first call, configured from the
    # instance's unit / description / position settings.
    if self.pb is None:
        bar_options = dict(
            total=self.max_size,
            unit=self.unit,
            unit_scale=self.unit_scale,
            desc=self.desc,
            position=self.position,
            dynamic_ncols=True,
        )
        self.pb = tqdm_notebook(**bar_options)

    # NOTE(review): tqdm's ``update`` adds an increment; if callers pass a
    # cumulative size here the bar over-counts -- confirm the caller contract.
    self.pb.update(current_size)
def load_data(test_size=0.2):
    """Extract features from the audio files and split them for training.

    Every ``/content/Actor_*/*.wav`` file is considered.  The third
    ``-``-separated field of the file name is looked up in the module-level
    ``emotions`` mapping; files whose emotion is not in ``observed_emotions``
    are skipped.  Only MFCC features are extracted.

    Parameters
    ----------
    test_size
        Passed through to ``train_test_split``.

    Returns
    -------
    The result of ``train_test_split`` (fixed ``random_state=9``).
    """
    features, labels = [], []
    for wav_path in tqdm_notebook(glob.glob("/content/Actor_*/*.wav")):
        emotion_code = os.path.basename(wav_path).split("-")[2]
        emotion = emotions[emotion_code]
        if emotion in observed_emotions:
            features.append(
                extract_feature(wav_path, mfcc=True, chroma=False, mel=False))
            labels.append(emotion)
    return train_test_split(np.array(features),
                            labels,
                            test_size=test_size,
                            random_state=9)
def _progress_register(self,
                       amount_of_work,
                       description='',
                       stage=0,
                       tqdm_args=None):
    """ Register a unit of work whose progress can be shown in a progress bar.

    Nothing happens when ``self.show_progress`` is falsy.

    Parameters
    ----------
    amount_of_work : int
        Amount of steps the underlying algorithm has to perform.
    description : str, optional
        This string will be displayed in the progress bar widget.
    stage : int, optional, default=0
        If the algorithm has multiple different stages (eg. calculate means
        in the first pass over the data, calculate covariances in the second),
        one needs to estimate different times of arrival.
    tqdm_args : dict, optional
        Additional keyword arguments for the tqdm constructor.

    Raises
    ------
    ValueError
        If ``amount_of_work`` is not an integer.
    """
    if not self.show_progress:
        return

    extra_args = {} if tqdm_args is None else tqdm_args

    if not isinstance(amount_of_work, Integral):
        raise ValueError(
            'amount_of_work has to be of integer type. But is {}'.format(
                type(amount_of_work)))

    # If there is not enough work to justify the overhead of a progress bar,
    # no bar is created and ``None`` is registered for the stage instead.
    pg = None
    if amount_of_work > ProgressReporterMixin._pg_threshold:
        args = dict(total=amount_of_work,
                    desc=description,
                    dynamic_ncols=True,
                    **extra_args)
        if _attached_to_ipy_notebook_with_widgets():
            from tqdm.notebook import tqdm_notebook
            pg = tqdm_notebook(leave=False, **args)
        else:
            import tqdm
            pg = tqdm.tqdm(leave=True, **args)

    self._prog_rep_progressbars[stage] = pg
    self._prog_rep_descriptions[stage] = description
    assert stage in self._prog_rep_progressbars
    assert stage in self._prog_rep_descriptions
def evaluate(model, tokenizer, eval_dataset, batch_size):
    """Compute the masked-LM loss and perplexity of ``model`` on a dataset.

    :param model: Newly trained Bert model
    :param tokenizer: Newly trained Bert tokenizer
    :param eval_dataset: Dataset to evaluate; must expose
        ``positions_to_mask``, which is indexed by batch number
    :param batch_size: More flexible than training, the user can get away
        with picking a higher batch_size
    :return: dict with the ``'perplexity'`` of the dataset and the mean
        ``'eval_loss'`` per batch
    :raises ZeroDivisionError: if the data loader yields no batch
    """
    eval_sampler = SequentialSampler(eval_dataset)  # Same order sampling
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)
    positions_to_mask = eval_dataset.positions_to_mask

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", batch_size)

    # Run on whatever device the model lives on instead of a hard-coded
    # 'cuda'; fall back to 'cuda' for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None \
        else torch.device('cuda')

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    # Evaluation loop
    # NOTE(review): ``positions_to_mask`` is indexed per *batch*, not per
    # example -- confirm this matches how the dataset builds it.
    for step, batch in enumerate(
            tqdm_notebook(eval_dataloader, desc='Evaluating')):
        inputs, labels = custom_mask_tokens(batch, tokenizer,
                                            positions_to_mask[step])
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels)
        lm_loss = outputs[0]
        # ``mean`` collapses a per-replica loss vector to a scalar.
        eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss)).item()

    result = {'perplexity': perplexity, 'eval_loss': eval_loss}
    return result
def bgp(self, labelled=None):
    """Run the global propagation loop for ``self.global_max_itr`` rounds.

    Each round resets ``self.max`` to 1, calls ``local_propag`` for every
    document (followed by ``suppress`` for documents that are not marked
    unlabeled when ``self.is_labeled`` is set), then ``global_propag``.
    After the round the top topics are printed unless ``self.silence`` is
    set, and ``self.eval_func`` -- if any -- is called with this object.

    Parameters
    ----------
    labelled
        Not used by this method.
    """
    iteration = 0
    while iteration < self.global_max_itr:
        self.max = 1

        doc_indices = tqdm_notebook(
            range(self.ndocs),
            disable=self.silence,
            ascii=True,
            desc=f'docs processed (itr {iteration})')
        for doc in doc_indices:
            self.local_propag(doc)
            if self.is_labeled and not self.unlabeled[doc]:
                self.suppress(doc)

        self.global_propag()
        iteration += 1

        if not self.silence:
            self.print_top_topics()
        if self.eval_func:
            self.eval_func(self)
def make_progress_bar(*args, **kwargs):
    """Wrap an iterable in a notebook progress bar when that is possible.

    All arguments are forwarded to ``tqdm_notebook``.  If importing or
    creating the bar fails for any reason, a warning is logged and the first
    positional argument is returned as-is, so a plain loop still works.

    Returns
    -------
    iterable or tqdm_notebook
        tqdm_notebook based progress bar or simple iterable
    """
    try:
        from tqdm.notebook import tqdm_notebook as notebook_bar
        return notebook_bar(*args, **kwargs)
    except Exception:
        logging.warning("No prerequisites installed for interactive progress bar, continuing without one.")
        return args[0]
def single_epoch_train(model, data_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a dict with ``'image'`` and ``'label'`` tensors.  The loss
    ``criterion(model(image), label)`` is back-propagated and the optimizer
    stepped once per batch.

    Returns
    -------
    float
        Batch-size weighted average loss of the epoch.
    """
    loss_meter = AverageMeter()
    model.train()

    for batch in tqdm_notebook(data_loader):
        images = batch['image'].to(device)
        targets = batch['label'].to(device)  # emotion_distribution

        # Forward pass and loss.
        loss = criterion(model(images), targets)

        # Back-propagation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_meter.update(loss.item(), len(targets))

    return loss_meter.avg
def progress_bar(x: iter, verbose: bool = True, **kwargs) -> callable:
    """
    Wrap an iterable in a tqdm progress bar.

    Inside Jupyter (``which_environment() == 'jupyter'``) ``tqdm_notebook``
    is used, otherwise plain ``tqdm``.

    Parameters
    -----------
    x: iterable
        some iterable to pass to tqdm function
    verbose: bool, (default=True)
        Provide feedback (if False, no progress bar is produced and ``x`` is
        returned unchanged)
    kwargs:
        additional keyword arguments for tqdm

    Returns
    -------
    ``x``, tqdm or tqdm_notebook, depending on ``verbose`` and environment
    """
    if not verbose:
        return x
    bar_factory = tqdm_notebook if which_environment() == 'jupyter' else tqdm
    return bar_factory(x, **kwargs)
def _init_supervised_matrices(self):
    """Initialise the matrices, then seed ``log_B`` from the documents.

    After ``_init_matrices`` every document not marked unlabeled is passed to
    ``suppress``.  Then, per word, ``log_B[i]`` is set to the log-sum-exp
    over the documents containing the word of (log count + row-normalised
    ``log_A``).  Words occurring in no document get a row of ones.  Finally
    ``log_B`` is normalised over axis 0, smoothed with ``self.beta`` and the
    top topics are printed.
    """
    print('oi')
    self._init_matrices()

    for doc in range(self.ndocs):
        if self.unlabeled[doc]:
            continue
        self.suppress(doc)

    for word in tqdm_notebook(range(self.nwords),
                              ascii=True,
                              desc='initialing.[]: '):
        # Documents in which this word occurs.
        doc_ids = list(self.X[:, word].nonzero()[0])

        # Word w_i does not belong to any document of the training set.
        if not doc_ids:
            self.log_B[word] = np.ones(self.n_components)
            continue

        log_counts = np.log(self.X[doc_ids, word].toarray())
        log_doc_topics = self.log_A[doc_ids]
        log_doc_topics = log_doc_topics - logsumexp(
            log_doc_topics, axis=1, keepdims=True)
        self.log_B[word] = logsumexp(log_counts + log_doc_topics, axis=0)

    self.log_B = self.log_B - logsumexp(self.log_B, axis=0)
    self.log_B = np.log(self.beta + np.exp(self.log_B))
    self.print_top_topics()
def validation(model, dataloader, multi):
    """Return the mean PSNR of ``model`` over ``dataloader``.

    Each batch is a dict with ``'input_b1'`` and ``'target_s1'`` tensors,
    plus ``'input_b2'`` and ``'input_b3'`` when ``multi`` is truthy.  With
    ``multi`` the model receives the three inputs as one tuple and the first
    of its three outputs is scored; otherwise it receives ``'input_b1'``
    alone.  Output and target are converted with ``tensor_to_rgb`` before
    ``compute_psnr``.

    Returns
    -------
    Sum of the per-batch PSNR divided by the number of batches.  An empty
    ``dataloader`` raises ``NameError`` (the batch index is never bound).
    """
    psnr_sum = 0
    for batch_idx, images in tqdm_notebook(enumerate(dataloader)):
        with torch.no_grad():
            net_input = Variable(images['input_b1'].cuda())
            target = Variable(images['target_s1'].cuda())

            if multi:
                input_b2 = Variable(images['input_b2'].cuda())
                input_b3 = Variable(images['input_b3'].cuda())
                prediction, _, _ = model((net_input, input_b2, input_b3))
            else:
                prediction = model(net_input)

        prediction_rgb = tensor_to_rgb(prediction)
        target_rgb = tensor_to_rgb(target)

        # compute psnr using function from utils
        psnr_sum += compute_psnr(target_rgb, prediction_rgb)

    return psnr_sum / (batch_idx + 1)
days_range[0], days_range[-1])) start_time = time.time() tweet = got.manager.TweetManager.getTweets(tweetCriteria) print("Collecting data end.. {0:0.2f} Minutes".format( (time.time() - start_time)/60)) print("=== Total num of tweets is {} ===".format(len(tweet))) # 원하는 변수 골라서 저장하기 # initialize tweet_list = [] for index in tqdm_notebook(tweet): # 메타데이터 목록 username = index.username link = index.permalink content = index.text # print(content) tweet_date = index.date.strftime("%Y-%m-%d") tweet_time = index.date.strftime("%H:%M:%S") retweets = index.retweets favorites = index.favorites # 결과 합치기 info_list = [tweet_date, tweet_time, username, content, link, retweets, favorites] tweet_list.append(info_list)