def plot_expected_lengths(lengths, batch_sizes, choose_length, markers=(), n_batches=10000):
    """Plot the distribution of chosen batch lengths for several batch sizes.

    For every entry of ``batch_sizes`` the function samples ``n_batches``
    random batches from ``lengths`` with ``np.random.choice``, passes each
    batch to ``choose_length``, rounds the result up and draws a histogram of
    the rounded values on its own subplot. The mean of the rounded values is
    marked with a green vertical line.

    Args:
        lengths: Sequence lengths to sample from.
        batch_sizes: Batch sizes to simulate; one subplot row per entry.
        choose_length: Callable that receives the sampled lengths of one
            batch and returns a number (the length chosen for that batch).
        markers: x positions at which a red line is drawn from the top of the
            first subplot down to the bottom of the last one.
        n_batches: Number of random batches simulated per batch size.

    Returns:
        dict mapping each batch size to the mean of its rounded-up lengths.
    """
    # squeeze=False keeps a 2-D array of axes even for a single batch size;
    # without it plt.subplots returns a bare Axes and axarr[i] would fail.
    _fig, axes_grid = plt.subplots(len(batch_sizes), 1, figsize=(14, 20),
                                   sharex=True, squeeze=False)
    axarr = axes_grid[:, 0]
    expected_lengths = {}

    for i, batch_size in enumerate(batch_sizes):
        maxs = []
        for _ in tqdm(range(n_batches), disable=False):
            val = choose_length(np.random.choice(lengths, batch_size))
            maxs.append(math.ceil(val))

        pd.Series(maxs).plot.hist(bins=50, ax=axarr[i], density=True,
                                  color='black', edgecolor='white', alpha=0.1)
        expected = np.mean(maxs)
        expected_lengths[batch_size] = expected

        # Remember the histogram's y-limit: the marker line below is drawn
        # far taller than the plot and the limit is restored right after.
        max_y = axarr[i].get_ylim()[1]
        axarr[i].vlines([expected], 0, 1e3, 'limegreen', lw=4)
        axarr[i].set_ylim([0, max_y])
        axarr[i].set_xlim([0, max(lengths)])
        axarr[i].set_ylabel(f'batch_size={batch_size}', rotation=0)
        axarr[i].yaxis.set_label_coords(-0.1, 0.45)
        axarr[i].set_yticks([])

    for marker in markers:
        con = ConnectionPatch(xyA=(marker, axarr[0].get_ylim()[1]),
                              xyB=(marker, 0),
                              coordsA='data', coordsB='data',
                              axesA=axarr[0], axesB=axarr[-1],
                              color='red', lw=4)
        axarr[0].add_artist(con)

    # Raise the first axes so the connection patches it owns are drawn on top.
    axarr[0].set_zorder(1)
    axarr[0].set_title(f'Expected sequence lengths with various batch sizes (n per batch = {n_batches})')
    plt.subplots_adjust(hspace=0)

    return expected_lengths
def load_embeddings(path):
    """Load an embedding mapping from a pickle file or a text file.

    Paths containing ``.pkl`` or ``.pickle`` are unpickled and returned
    as-is. Any other path is read as UTF-8 text (undecodable bytes are
    ignored); each line is split on single spaces and handed to
    ``get_coefs``, and the resulting pairs are collected into a dict.
    """
    looks_pickled = any(marker in path for marker in ('.pkl', '.pickle'))
    if looks_pickled:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    with open(path, encoding="utf8", errors='ignore') as handle:
        pairs = (get_coefs(*row.strip().split(' ')) for row in tqdm(handle))
        return dict(pairs)
def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4, enable_checkpoint_ensemble=True):
    """Train ``model`` and return its predictions on ``test``.

    Each epoch trains on ``train`` with Adam, then predicts the whole test
    set. The learning rate is multiplied by 0.6 per epoch via ``LambdaLR``.

    Args:
        model: Module called as ``model(*inputs)``.
        train: Dataset whose items are ``(*inputs, target)``.
        test: Dataset whose items hold only the inputs.
        loss_fn: Callable ``loss_fn(y_pred, y_batch)`` returning a scalar
            loss tensor (``.backward()`` and ``.item()`` are called on it).
        output_dim: Number of columns in the prediction matrix.
        lr: Initial learning rate.
        batch_size: Batch size for both loaders.
        n_epochs: Number of training epochs.
        enable_checkpoint_ensemble: When true, return the average of the
            per-epoch test predictions weighted by ``2 ** epoch``; otherwise
            return the predictions of the last epoch.

    Returns:
        numpy array of shape ``(len(test), output_dim)``.
    """
    # One parameter group per parameter. torch.optim accepts a list of dicts
    # so individual groups can carry their own options (e.g. a different lr).
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()  # training mode; model.eval() below is for inference
        avg_loss = 0.
        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(*x_batch)
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        # Step the scheduler only after the epoch's optimizer steps. Since
        # PyTorch 1.1, stepping it first skips the first learning-rate value.
        scheduler.step()

        model.eval()
        test_preds = np.zeros((len(test), output_dim))
        # no_grad avoids building an autograd graph during inference.
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())
                test_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds
def load_embeddings(path):
    """Read a text embedding file into a dict, skipping lines that fail.

    Each line is stripped, split on single spaces and passed to
    ``get_coefs``; the returned pairs are collected into a dict. A line for
    which ``get_coefs`` raises is reported with ``print`` and skipped, so a
    few malformed rows do not abort the load.

    The file is opened as UTF-8 explicitly, so decoding does not depend on
    the platform's default encoding.
    """
    emb_arr = []
    with open(path, encoding='utf8') as f:
        for line in tqdm(f):
            try:
                emb_arr.append(get_coefs(*line.strip().split(' ')))
            except Exception as e:
                # Deliberate best effort: report the bad line and carry on.
                print(e)
    return dict(emb_arr)
def __init__(self, description="Processing", total=100):
    """Create the progress bar wrapper.

    A tqdm bar is built with a fixed-width description column. In a
    notebook the bar is always enabled; otherwise ``disable=None`` is passed
    so tqdm decides for itself. If tqdm ends up disabled, no bar is kept
    (``self._tqdm`` is ``None``).

    Args:
        description: Text shown in front of the bar.
        total: Value the bar counts up to.
    """
    disable_flag = None if not is_notebook() else False
    bar = tqdm(
        disable=disable_flag,
        bar_format=" {desc:20.20} |{bar}| {percentage:3.0f}% [{elapsed}<{remaining}]",
    )
    bar.desc = description
    bar.total = total

    # Drop the bar entirely when tqdm reports itself as disabled.
    self._tqdm = None if bar.disable else bar
    self._value = 0
    self._total = total
def LoopThroughTime(self, Animate=False):
    """Run the time evolution over the whole simulated time range.

    Builds ``self.time_store`` from 0 up to (excluding)
    ``SimulationParams['MaxTime']`` in steps of ``self.dt``, initialises the
    matrices and calls ``evolve_ts`` once per time point. With ``Animate``
    set, the concentrations are re-plotted and re-displayed after every step.
    """
    max_time = self.SimulationParams['MaxTime']
    self.time_store = np.arange(0, max_time, self.dt)
    self.InitializeMatrices()

    for _ in tqdm(self.time_store):
        self.evolve_ts()
        if not Animate:
            continue
        # Refresh the displayed figure in place, then pause briefly.
        self.PlotConcentrations()
        display.clear_output(wait=True)
        display.display(plt.gcf())
        time.sleep(0.1)
def get_AUCs_pancan(pickle_path, cancertypes):
    """Collect ROC and PR AUC for every cancer type and save them as CSV.

    ``get_AUC(cancertype, pickle_path)`` is called per cancer type; element 0
    of its result is stored as 'ROC AUC' and element 1 as 'PR AUC'. The table
    is written to ``<pickle_path>/AUCs.txt`` and returned.
    """
    auc_table = pd.DataFrame(index=cancertypes, columns=['ROC AUC', 'PR AUC'])

    progress = tqdm(cancertypes)
    for cancertype in progress:
        progress.set_description("Processing %s" % cancertype)
        scores = get_AUC(cancertype, pickle_path)
        roc_auc = scores[0]
        pr_auc = scores[1]
        auc_table.loc[cancertype, 'ROC AUC'] = roc_auc
        auc_table.loc[cancertype, 'PR AUC'] = pr_auc

    out_file = pickle_path + '/AUCs.txt'
    auc_table.to_csv(out_file)
    return auc_table
def build_timeseries(mat, y_col_index, time_steps):
    """Turn a 2-D matrix into sliding-window samples and next-step targets.

    Sample ``i`` is the window ``mat[i:i + time_steps]``; its target is the
    value in column ``y_col_index`` of the row right after the window. The
    number of samples is ``len(mat) - time_steps``.

    Returns:
        ``(x, y)`` with shapes ``(samples, time_steps, columns)`` and
        ``(samples,)``.
    """
    n_samples = mat.shape[0] - time_steps
    n_columns = mat.shape[1]

    x = np.zeros((n_samples, time_steps, n_columns))
    y = np.zeros((x.shape[0],))

    for start in tqdm(range(n_samples)):
        stop = time_steps + start
        x[start] = mat[start:stop]
        y[start] = mat[stop, y_col_index]

    print("length of time-series i/o {} {}".format(x.shape, y.shape))
    return x, y
def build_vocab(sentences, verbose=True):
    """Count how often each word occurs.

    :param sentences: list of list of words
    :param verbose: show a progress bar when true
    :return: dictionary of words and their count
    """
    counts = {}
    for sentence in tqdm(sentences, disable=(not verbose)):
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    return counts
def convert_lines(example, max_seq_length, tokenizer):
    """Tokenize texts into fixed-length id sequences.

    Each text is tokenized, truncated so that it fits together with the
    added ``[CLS]`` and ``[SEP]`` tokens, converted to ids and right-padded
    with zeros.

    Args:
        example: Iterable of texts.
        max_seq_length: Total length of every output row, including the two
            special tokens.
        tokenizer: Object providing ``tokenize(text)`` (returning a list)
            and ``convert_tokens_to_ids(tokens)``.

    Returns:
        numpy array built from the list of id rows.
    """
    # Two positions are reserved for [CLS] and [SEP].
    token_budget = max_seq_length - 2
    all_tokens = []
    for text in tqdm(example):
        tokens = tokenizer.tokenize(text)[:token_budget]
        token_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
        padding = [0] * (token_budget - len(tokens))
        all_tokens.append(token_ids + padding)
    return np.array(all_tokens)
def get_per_slide_probs_pancan(pickle_path, cancertypes):
    """Stack per-slide probabilities of all cancer types and save them.

    ``get_per_slide_probs(cancertype, pickle_path)`` is called per cancer
    type; a 'cancertype' column holding the upper-cased name is added to each
    result. The frames are concatenated, written to
    ``<pickle_path>/per_slide_probs.txt`` and returned.

    Returns:
        The concatenated DataFrame (empty when ``cancertypes`` is empty).
    """
    frames = []
    pbar = tqdm(cancertypes)
    for cancertype in pbar:
        pbar.set_description("Processing %s" % cancertype)
        tmp = get_per_slide_probs(cancertype, pickle_path)
        tmp['cancertype'] = cancertype.upper()
        frames.append(tmp)

    # Concatenate once: growing a frame inside the loop copies all earlier
    # rows on every iteration. pd.concat rejects an empty list, hence the guard.
    per_slide_probs = pd.concat(frames) if frames else pd.DataFrame()
    per_slide_probs.to_csv(pickle_path + '/per_slide_probs.txt')
    return per_slide_probs
def eval_model(
    model: nn.Module,
    valid_loader: DataLoader,
    device: torch.device = torch.device('cuda')
) -> Dict[str, float]:
    """Score a model on validation data.

    Parameters
    ----------
    model : nn.Module
        Model for prediction; called with input ids, attention mask and
        token type ids, and expected to return three outputs.
    valid_loader : DataLoader
        Yields ``(inputs, examples)`` pairs of validation data.
    device : torch.device, optional
        Device for computation.

    Returns
    -------
    dict
        Whatever ``Result.score()`` returns. The original documentation
        describes `long_score`, `short_score` and `overall_score` entries.
    """
    model.to(device)
    model.eval()

    result = Result()
    with torch.no_grad():
        for inputs, examples in tqdm(valid_loader):
            input_ids, attention_mask, token_type_ids = inputs
            y_preds = model(
                input_ids.to(device),
                attention_mask.to(device),
                token_type_ids.to(device),
            )
            # Only the third output (class predictions) is scored.
            cpu_outputs = [p.detach().cpu() for p in y_preds]
            _, _, class_preds = cpu_outputs
            result.update(examples, class_preds.numpy())

    return result.score()
def get_metrics_pancan(pickle_path, cancertypes):
    """Stack the metrics of all cancer types into one table and save it.

    ``get_metrics(cancertype, pickle_path)`` is called per cancer type. Each
    result is transposed, its index is moved into a 'metric' column and a
    'cancertype' column holding the upper-cased name is added. The frames are
    concatenated, written to ``<pickle_path>/metrics.txt`` and returned.

    Returns:
        The concatenated DataFrame (empty when ``cancertypes`` is empty).
    """
    frames = []
    pbar = tqdm(cancertypes)
    for cancertype in pbar:
        pbar.set_description("Processing %s" % cancertype)
        tmp = get_metrics(cancertype, pickle_path)
        tmp = tmp.T.reset_index().rename(columns={'index': 'metric'})
        tmp['cancertype'] = cancertype.upper()
        frames.append(tmp)

    # Concatenate once: growing a frame inside the loop copies all earlier
    # rows on every iteration. pd.concat rejects an empty list, hence the guard.
    metrics = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    metrics.to_csv(pickle_path + '/metrics.txt')
    return metrics
def key_phrase(self, addword, result):
    """Merge keyword rows into combined key-phrase rows.

    For every entry ``w`` of ``addword``, groups of ``result`` (grouped by
    the 'index' column) that contain exactly two rows whose '關鍵字'
    (keyword) value is in ``w`` receive one new row whose keyword is the
    concatenation of ``w`` and whose 'Value' is 1. All rows of those groups,
    and every row whose keyword is in ``w``, are then removed from
    ``result``. The new rows are appended at the end.

    NOTE(review): the hard-coded 2 presumably assumes each ``w`` holds two
    words — confirm against the caller.
    NOTE(review): with an empty ``addword`` the 'filter' column is never
    created, so the ``drop`` below looks like it would raise — confirm.
    """
    sub_list = []
    for w in tqdm(addword, desc='add key phrase>>>'):
        # Flag rows whose keyword is one of the phrase's words. On the first
        # iteration this adds a column to the caller's DataFrame.
        result['filter'] = result.關鍵字.isin(w)
        # Number of flagged rows per 'index' group.
        id_ = result.groupby('index')['filter'].sum()
        # Keep the groups that have exactly two flagged rows.
        id_ = id_[id_ == 2].index
        sub = pd.DataFrame({'關鍵字': ''.join(w), 'Value': 1, 'index': id_})
        sub_list.extend(sub.values.tolist())
        # Drop the matched groups and every row carrying one of the words.
        result = result[(result['index'].isin(id_) == False) & (result.關鍵字.isin(w) == False)]
    result = result.drop(columns='filter')
    sub_list = pd.DataFrame(sub_list)
    sub_list.columns = ['關鍵字', 'Value', 'index']
    result = pd.concat([result, sub_list])
    return result
def calculate_toxicity(model, test_data):
    """Run ``model`` over ``test_data`` on the CPU and return sigmoid scores.

    The texts are converted with ``convert_data`` using a BERT tokenizer
    loaded from a fixed local path, padded to 220 positions and fed to the
    model one sample at a time. For every batch the first output column is
    passed through a sigmoid.

    Args:
        model: Callable taking ``(input_ids, attention_mask=..., labels=None)``
            and returning a 2-D tensor.
        test_data: Input accepted by ``convert_data``.

    Returns:
        list with one 1-D numpy array of scores per batch.
    """
    batch_size = 1
    max_bert_length = 220
    seed_everything(1235)
    device = torch.device('cpu')
    tqdm.pandas()

    bert_model_path = "./service/uncased_L-12_H-768_A-12/"
    base_tokenizer = BertTokenizer.from_pretrained(bert_model_path,
                                                   cache_dir=None,
                                                   do_lower_case=True)
    converted_text = convert_data(test_data, max_bert_length, base_tokenizer)

    bert_test_set = torch.tensor(pad_sequences(converted_text,
                                               maxlen=max_bert_length,
                                               padding='post'),
                                 dtype=torch.long)
    bert_test_dataset = torch.utils.data.TensorDataset(bert_test_set)
    bert_test_loader = torch.utils.data.DataLoader(bert_test_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False)
    tk2 = tqdm(enumerate(bert_test_loader),
               total=len(bert_test_loader),
               leave=False)

    output_preds = []
    # Inference only: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for _, batch in tk2:
            tsrs = trim_tensors(batch)
            x_batch, = tuple(t.to(device) for t in tsrs)
            # Positions with id 0 are padding and are masked out.
            y_pred = model(x_batch,
                           attention_mask=(x_batch > 0),
                           labels=None)
            scores = torch.sigmoid(y_pred[:, 0].detach().cpu().squeeze())
            output_preds.append(scores.numpy().ravel())
    return output_preds
def get_per_slide_evaluation_metrics_for_many_thresholds_for_all_labels(
        per_slide_average_predictions,
        label_names,
        per_slide_average_thresholds=np.arange(0, 1.002, .001)):
    """Compute threshold-wise evaluation metrics for every label.

    For each label, the columns ``label`` and ``label + '_pred'`` are passed
    to ``get_per_slide_evaluation_metrics_for_many_thresholds``; its result
    is indexed by 'per_slide_average_threshold'. The per-label frames are
    joined side by side, keyed by label name.
    """
    def _metrics_for(label):
        # Metrics of a single label, indexed by threshold.
        label_columns = per_slide_average_predictions[[label, label + '_pred']]
        label_metrics = get_per_slide_evaluation_metrics_for_many_thresholds(
            label_columns,
            label,
            per_slide_average_thresholds=per_slide_average_thresholds)
        return label_metrics.set_index('per_slide_average_threshold')

    per_label_frames = [_metrics_for(label) for label in tqdm(label_names)]
    return pd.concat(per_label_frames, axis=1, keys=label_names)
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by an embedding index.

    Prints the share of distinct words and the share of all word
    occurrences for which ``embeddings_index[word]`` succeeds.

    Args:
        vocab: Mapping of word to occurrence count.
        embeddings_index: Object supporting ``embeddings_index[word]`` and
            raising ``KeyError`` for unknown words.

    Returns:
        List of ``(word, count)`` pairs for the out-of-vocabulary words,
        sorted by descending count.
    """
    oov = {}
    known_words = 0
    known_occurrences = 0
    oov_occurrences = 0

    for word in tqdm(vocab):
        count = vocab[word]
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; anything else
            # (including KeyboardInterrupt) must propagate.
            oov[word] = count
            oov_occurrences += count
        else:
            known_words += 1
            known_occurrences += count

    # Guard the ratios so an empty vocabulary reports 0% instead of crashing.
    total_occurrences = known_occurrences + oov_occurrences
    vocab_ratio = known_words / len(vocab) if len(vocab) else 0.0
    text_ratio = known_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for {:.2%} of all text'.format(text_ratio))

    # Ascending sort followed by reversal keeps the original tie ordering.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x
def Word_Cloud(self, word_vector, n_key, data=None, dictionary=None):
    """Reshape keyword scores into a long table for a word cloud.

    With ``dictionary`` given, the first ``n_key`` ``(id, value)`` pairs of
    every entry of ``word_vector`` are mapped to
    ``[dictionary[id], value]`` and flattened into rows with columns
    '關鍵字' (keyword), 'Value' and 'index' (the position of the entry). If
    ``data`` is also given, it is joined column-wise in front of that table.

    Without ``dictionary``, ``word_vector`` is wrapped directly in a
    DataFrame with columns '分群類別' (cluster category), '關鍵字' (keyword)
    and 'Value'.
    """
    if dictionary is not None:
        # TFIDF_Vector/ TEXTRANK
        result = pd.Series(word_vector).apply(
            lambda x: [[dictionary[w[0]], w[1]] for w in x[:n_key]])
        result_dt = []
        id_ = 0
        for row in tqdm(result, desc='transform to wordcloud>>>'):
            sub = pd.DataFrame(row)
            # Tag every keyword row with the position of its source entry.
            sub['index'] = id_
            result_dt.extend(sub.values.tolist())
            id_ += 1
        result_dt = pd.DataFrame(result_dt)
        result_dt.columns = ['關鍵字', 'Value', 'index']
        result = result_dt
        if data is not None:
            result = pd.concat([data, result], axis=1).reset_index(drop=True)
    else:
        # LDA
        result = pd.DataFrame(word_vector,
                              columns=['分群類別', '關鍵字', 'Value'])
    return result
def perform_regularised_cv(train, y_colname, grid, high_card_cols, folds=5, metric=mean_absolute_error):
    '''Performs grid search crossfold validation with support for regularised mean encoding

    Inputs:
        train: Input data set (a DataFrame; rows are selected with ``iloc``)
        y_colname: target column name
        grid: Set of hyperparameters over which the model is to be tuned
        high_card_cols: categorical columns you want to consider for mean encoding
        folds: Number of folds to be used for cross validation
        metric: Error metric forwarded to ``train_model``

    Outputs:
        all_scores: list of ``(params, mean error over the folds)`` tuples,
            one per parameter combination
    '''
    kf = KFold(folds, random_state=0, shuffle=True)
    param_grid = ParameterGrid(grid)

    # The splitter is seeded, so every call yields the same folds; compute
    # them once instead of once per parameter combination.
    fold_indices = list(kf.split(train))

    all_scores = []  # Store all scores
    for params in tqdm(param_grid):
        errors = []
        for train_idx, test_idx in fold_indices:
            # Non in-place reset: mutating an iloc slice in place can trigger
            # pandas' SettingWithCopyWarning.
            kf_train = train.iloc[train_idx, :].reset_index(drop=True)
            kf_test = train.iloc[test_idx, :].reset_index(drop=True)

            _, error, _, _ = train_model(params,
                                         kf_train,
                                         kf_test,
                                         y_colname,
                                         high_card_cols,
                                         valid=True,
                                         metric=metric)
            errors.append(error)

        avg_score = np.mean(errors)  # Average scores of all KFold
        all_scores.append((params, avg_score))

    return all_scores
def load_embeddings(path):
    """Read a UTF-8 text embedding file into a dict.

    Every line is stripped, split on single spaces and passed to
    ``get_coefs``; the pairs it returns become the dict's items.
    """
    with open(path, encoding='utf8') as handle:
        entries = (
            get_coefs(*row.strip().split(' '))
            for row in tqdm(handle)
        )
        embeddings = dict(entries)
    return embeddings
opt_level='O1', verbosity=0) model.zero_grad() model = model.train() tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case) convert_func = functools.partial(convert_data, tokenizer=tokenizer, max_seq_len=max_seq_len, max_question_len=max_question_len, doc_stride=doc_stride) data_reader = JsonChunkReader(DATA_PATH, convert_func, chunksize=chunksize) global_step = 0 for examples in tqdm(data_reader, total=int(np.ceil(train_size / chunksize))): train_dataset = TextDataset(examples) train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) for x_batch, y_batch in train_loader: x_batch, attention_mask, token_type_ids = x_batch y_batch = (y.to(device) for y in y_batch) y_pred = model(x_batch.to(device), attention_mask=attention_mask.to(device), token_type_ids=token_type_ids.to(device)) loss = loss_fn(y_pred, y_batch) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward()
def train_model(model, train, val, y_val, test, loss_fn, output_dim=3, lr=0.00001,
                batch_size=32, n_epochs=2, enable_checkpoint_ensemble=True):
    """Train ``model`` and return its last-epoch validation and test predictions.

    After every epoch the model predicts ``val`` and ``test``; the F1 score
    of the validation predictions is computed with ``calculate_F1``. Whenever
    the score beats the best one so far, the whole model object is saved to
    ``"<epoch>net.pkl"`` in the current directory.

    Args:
        model: Module called with the first input tensor of a batch.
        train: Dataset whose items are ``(*inputs, target)``; only the first
            input is fed to the model.
        val: Validation dataset; only the first element of each batch is used.
        y_val: Ground truth passed to ``calculate_F1``.
        test: Test dataset; only the first element of each batch is used.
        loss_fn: Callable ``loss_fn(y_pred, y_batch)`` returning a scalar
            loss tensor.
        output_dim: Number of columns in the prediction matrices.
        lr: Learning rate.
        batch_size: Batch size for all loaders.
        n_epochs: Number of training epochs.
        enable_checkpoint_ensemble: Accepted but not used by this function.

    Returns:
        ``(val_preds, test_preds)`` of the last epoch as numpy arrays.
    """
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    max_f1_score = 0
    for epoch in range(n_epochs):
        start_time = time.time()

        # No optimizer.step() here: stepping before any backward pass of the
        # epoch would re-apply the previous epoch's last gradients.
        model.train()
        avg_loss = 0.
        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(x_batch[0])
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        val_preds = np.zeros((len(val), output_dim))
        test_preds = np.zeros((len(test), output_dim))
        # no_grad avoids building an autograd graph during inference.
        with torch.no_grad():
            for i, x_batch in enumerate(val_loader):
                y_pred = sigmoid(model(x_batch[0]).detach().cpu().numpy())
                val_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred
            for i, x_batch in enumerate(test_loader):
                y_pred = sigmoid(model(x_batch[0]).detach().cpu().numpy())
                test_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

        f1_score = calculate_F1(y_val, val_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t f1={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, avg_loss, f1_score, elapsed_time))

        if f1_score > max_f1_score:
            max_f1_score = f1_score
            torch.save(model, str(epoch) + "net.pkl")  # save the entire network

    return val_preds, test_preds
lens = np.array(lens) max_len = min(int(np.percentile(lens, self.percentile)), MAX_LEN) texts = torch.tensor(sequence.pad_sequences(texts, maxlen=max_len), dtype=torch.long) if self.test: return texts return texts, torch.tensor(target, dtype=torch.float32) train_collate = Collator(percentile=100) train_dataset = TextDataset(x_train, lengths, y_train_torch.numpy()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=train_collate) n_repeats = 10 start_time = time.time() for _ in range(n_repeats): for batch in tqdm(train_loader): pass method1_time = (time.time() - start_time) / n_repeats class SequenceDataset(torch.utils.data.Dataset): """ Dataset using sequence bucketing to pad each batch individually. Arguments: sequences (list): A list of variable length tokens (e. g. from keras tokenizer.texts_to_sequences) choose_length (function): A function which receives a numpy array of sequence lengths of one batch as input and returns the length this batch should be padded to. other_features (list, optional): A list of tensors with other features that should be fed to the NN alongside the sequences. labels (Tensor, optional): A tensor with labels for the samples. indices (np.array, optional): A numpy array consisting of indices to iterate over. shuffle (bool): Whether to shuffle the dataset or not. Default false. batch_size (int): Batch size of the samples. Default 512.
def load_embeddings(path):
    """Read a UTF-8 text embedding file into ``{word: float32 vector}``.

    On each line the first space-separated field is the word and the
    remaining fields are converted to a float32 numpy array.
    """
    embeddings = {}
    with open(path, encoding="utf8") as handle:
        for row in tqdm(handle):
            word, *values = row.strip().split(' ')
            embeddings[word] = np.asarray(values, dtype='float32')
    return embeddings
vocab[word] = 1 return vocab tic = time.time() glove_embeddings = load_embeddings(GLOVE_EMBEDDING_PATH) print(f'loaded {len(glove_embeddings)} word vectors in {time.time()-tic}s') vocab = build_vocab(list(train['comment_text'].apply(lambda x: x.split()))) oov = check_coverage(vocab, glove_embeddings) oov[:10] oov[:10] import string latin_similar = "’'‘ÆÐƎƏƐƔIJŊŒẞÞǷȜæðǝəɛɣijŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊIJĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịijĵķƙĸĺļłľŀʼnńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ" white_list = string.ascii_letters + string.digits + latin_similar + ' ' white_list += "'" glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1]) glove_symbols = ''.join([c for c in glove_chars if not c in white_list]) glove_symbols jigsaw_chars = build_vocab(list(train["comment_text"])) jigsaw_symbols = ''.join([c for c in jigsaw_chars if not c in white_list]) jigsaw_symbols symbols_to_delete = ''.join( [c for c in jigsaw_symbols if not c in glove_symbols]) symbols_to_delete symbols_to_isolate = ''.join([c for c in jigsaw_symbols if c in glove_symbols]) symbols_to_isolate isolate_dict = {ord(c): f' {c} ' for c in symbols_to_isolate} remove_dict = {ord(c): f'' for c in symbols_to_delete} def handle_punctuation(x):
# This will remove stopwords and punctuation. # Use token.text to return strings, which we'll need for Gensim. doc = [ token.text for token in doc if token.is_stop != True and token.is_punct != True ] return doc # The add_pipe function appends our functions to the default pipeline. nlp.add_pipe(lemmatizer, name='lemmatizer', after='ner') nlp.add_pipe(remove_stopwords, name="stopwords", last=True) doc_list = [] # Iterates through each article in the corpus. for doc in tqdm(newest_doc): # Passes that article through the pipeline and adds to a new list. pr = nlp(doc) doc_list.append(pr) # Creates, which is a mapping of word IDs to words. words = corpora.Dictionary(doc_list) # Turns each document into a bag of words. corpus = [words.doc2bow(doc) for doc in doc_list] lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=words, num_topics=10, random_state=2, update_every=1,
def _class_weighted_mean(loss, y_batch):
    """Return the mean of ``loss`` after weighting rows by their first target.

    Rows whose first target column is above 0.5 get weight 0.9 and rows
    below 0.5 get weight 0.1. A value of exactly 0.5 keeps weight 0.
    """
    weights = torch.zeros(y_batch.size()).cuda()
    weights[y_batch[:, 0] > 0.5] = 0.9
    weights[y_batch[:, 0] < 0.5] = 0.1
    return (loss * weights).mean()


def train_model(model, train, val, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4, enable_checkpoint_ensemble=True,
                validation_frequency=30):
    """Train ``model`` on CUDA with periodic validation and scalar logging.

    Training and validation metrics are written through the module-level
    ``tbx`` writer. Every ``validation_frequency`` training batches the whole
    validation set is evaluated. After each epoch the test set is predicted.

    Args:
        model: Module called with one input tensor.
        train: Dataset of ``(input, target)`` pairs.
        val: Dataset of ``(input, target)`` pairs.
        test: Dataset whose batches are unpacked into the model call.
        loss_fn: Callable ``loss_fn(y_pred, y_batch)``; its result is
            multiplied by a weight tensor shaped like ``y_batch`` before
            averaging (assumes an unreduced loss — TODO confirm).
        output_dim: Number of columns in the prediction matrix.
        lr: Initial learning rate (multiplied by 0.6 after every epoch).
        batch_size: Batch size for all loaders; also the increment of the
            logging step counter.
        n_epochs: Number of training epochs.
        enable_checkpoint_ensemble: When true, return the average of the
            per-epoch test predictions weighted by ``2 ** epoch``; otherwise
            return the predictions of the last epoch.
        validation_frequency: Number of training batches between validations.

    Returns:
        numpy array of shape ``(len(test), output_dim)``.
    """
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    step = 0

    for epoch in range(n_epochs):
        start_time = time.time()
        epoch_loss = 0.
        batches = 0

        for data in tqdm(train_loader, disable=False):
            # train (mode is reset per batch because validation switches it)
            model.train()
            x_batch = data[0].to("cuda")
            y_batch = data[1].to("cuda")

            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            loss_weighted = _class_weighted_mean(loss, y_batch)

            optimizer.zero_grad()
            loss_weighted.backward()
            optimizer.step()

            batch_loss = loss_weighted.item()
            epoch_loss += batch_loss / len(train_loader)

            with torch.no_grad():
                acc, acc_toxic, acc_nontoxic = accuracy(y_pred[:, 0], y_batch[:, 0])

            tbx.add_scalar('train/loss', batch_loss, step)
            tbx.add_scalar('train/acc', acc, step)
            tbx.add_scalar('train/acc_toxic', acc_toxic, step)
            tbx.add_scalar('train/acc_nontoxic', acc_nontoxic, step)

            batches += 1
            step += batch_size

            if batches % validation_frequency == 0:
                # validation
                model.eval()
                with torch.no_grad():
                    val_acc = 0
                    val_acc_toxic = 0
                    val_acc_nontoxic = 0
                    val_loss = 0
                    for x_y in val_loader:
                        x_batch = x_y[0].cuda()
                        y_batch = x_y[1].cuda()

                        y_pred = model(x_batch)
                        loss = loss_fn(y_pred, y_batch)
                        loss_weighted = _class_weighted_mean(loss, y_batch)

                        # Compute the three accuracies once per batch.
                        batch_acc, batch_acc_toxic, batch_acc_nontoxic = accuracy(
                            y_pred[:, 0], y_batch[:, 0])

                        val_loss += loss_weighted.item() / len(val_loader)
                        val_acc += batch_acc / len(val_loader)
                        val_acc_toxic += batch_acc_toxic / len(val_loader)
                        val_acc_nontoxic += batch_acc_nontoxic / len(val_loader)

                tbx.add_scalar('val/loss', val_loss, step)
                tbx.add_scalar('val/acc', val_acc, step)
                tbx.add_scalar('val/acc_toxic', val_acc_toxic, step)
                tbx.add_scalar('val/acc_nontoxic', val_acc_nontoxic, step)

        # Step the scheduler only after the epoch's optimizer steps. Since
        # PyTorch 1.1, stepping it first skips the first learning-rate value.
        scheduler.step()

        # test
        model.eval()
        test_preds = np.zeros((len(test), output_dim))
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())
                test_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, epoch_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds
def __init__(self, description, total=100):
    """Create a tqdm bar with a fixed-width description column.

    Args:
        description: Text shown in front of the bar.
        total: Value the bar counts up to.
    """
    bar = tqdm(
        bar_format=" {desc:20.20} |{bar}| {percentage:3.0f}% [{elapsed}<{remaining}]",
    )
    bar.desc = description
    bar.total = total
    self._tqdm = bar
import matplotlib.pyplot as plt
import pyLDAvis.gensim
import seaborn as sns
import time
import warnings

# Build the bag-of-words corpus from the lemmatised documents.
dictionary = gensim.corpora.Dictionary(Paper_Lemma)
bow_corpus = [dictionary.doc2bow(doc) for doc in Paper_Lemma]

Lda = models.LdaMulticore
coherenceList_umass = []
coherenceList_cv = []
# Candidate topic counts: 30, 32, ..., 48.
num_topics_list = np.arange(30, 50, 2)

# Fit one LDA model per candidate topic count and record two coherence
# scores ('u_mass' and 'c_v') for each.
for num_topics in tqdm(num_topics_list):
    warnings.filterwarnings("ignore")
    lda = Lda(bow_corpus, num_topics = num_topics, id2word = dictionary, minimum_probability = 0, passes = 20)
    cm = CoherenceModel(model = lda, corpus = bow_corpus, dictionary = dictionary, coherence = 'u_mass')
    coherenceList_umass.append(cm.get_coherence())
    cm_cv = CoherenceModel(model = lda, corpus = bow_corpus, texts = Paper_Lemma, dictionary = dictionary, coherence = 'c_v')
    coherenceList_cv.append(cm_cv.get_coherence())

# Visualisation data for the last model fitted in the loop above.
vis = pyLDAvis.gensim.prepare(lda, bow_corpus, dictionary)

# Re-weight the corpus with TF-IDF and fit the final model.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# NOTE(review): max_cv is not defined in this section; presumably the index
# of the best 'c_v' coherence in coherenceList_cv — confirm where it is set.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = num_topics_list[max_cv], alpha = 0.1, eta = 0.01, random_state = 123, id2word = dictionary, passes = 2, workers = 2)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
def eval_model(
    qa_model: nn.Module,
    classifier_model: nn.Module,
    valid_loader: DataLoader,
    device: torch.device = torch.device('cuda')
) -> Dict[str, float]:
    """Compute validation score with a classifier gate in front of the QA model.

    Every validation batch is first run through ``classifier_model``.
    Examples whose predicted class is not 'UNKNOWN' are buffered for
    ``qa_model``; the others are buffered as rejects. A buffer is flushed
    into the result once it holds at least ``batch_size`` examples, and
    whatever remains is flushed after the loop. Rejected examples are
    recorded with start/end index -1 and zero logits and class predictions.

    NOTE(review): ``batch_size`` is not a parameter or local of this
    function; it is presumably a module-level global — confirm.

    Parameters
    ----------
    qa_model : nn.Module
        Model producing start, end and class predictions.
    classifier_model : nn.Module
        Model whose third output is used to decide whether an example is
        forwarded to ``qa_model``.
    valid_loader : DataLoader
        Data loader of validation data; yields ``(inputs, examples)``.
    device : torch.device, optional
        Device for computation.

    Returns
    -------
    dict
        Whatever ``Result.score()`` returns. The original documentation
        describes these entries:
        `long_score`: score of long answers
        `short_score`: score of short answers
        `overall_score`: score of the competition metric
    """
    qa_model.to(device)
    #qa_model.half()
    qa_model.eval()
    classifier_model.to(device)
    classifier_model.eval()
    class_labels = ['LONG', 'NO', 'SHORT', 'UNKNOWN', 'YES']
    unknown_label = class_labels.index('UNKNOWN')
    with torch.no_grad():
        result = Result()
        # Buffers for examples the classifier rejected / passed on.
        classifier_rejects = ExampleBatch()
        classifier_forwards = ExampleBatch()
        for inputs, examples in tqdm(valid_loader):
            input_ids, attention_mask, token_type_ids = inputs
            # The classifier is called without token_type_ids.
            y_preds1 = classifier_model(input_ids.to(device),
                                        attention_mask.to(device))
            _, _, classifier_preds = (p.detach().cpu() for p in y_preds1)
            # True where the classifier predicts anything but 'UNKNOWN'.
            has_pred = (torch.argmax(classifier_preds,
                                     dim=1)) != unknown_label
            # NOTE(review): prints the mask for every batch; looks like
            # leftover debug output — confirm.
            print(has_pred)
            neg_pred = ~has_pred
            # NOTE(review): with exactly one selected row, squeeze() yields
            # a 0-dim index and the batch dimension is dropped — confirm
            # that ExampleBatch.update copes with that.
            qa_inputs = [
                element[(has_pred != 0).nonzero().squeeze()]
                for element in inputs
            ]
            qa_examples = (np.array(examples)[has_pred.numpy()]).tolist()
            if qa_examples:
                classifier_forwards.update(qa_inputs, qa_examples)
            reject_inputs = [
                element[(neg_pred != 0).nonzero().squeeze()]
                for element in inputs
            ]
            reject_examples = (np.array(examples)[neg_pred.numpy()]).tolist()
            if reject_examples:
                classifier_rejects.update(reject_inputs, reject_examples)
            # Flush the forward buffer through the QA model once it is full.
            if len(classifier_forwards.examples) >= batch_size:
                qa_inputs = classifier_forwards.inputs
                qa_examples = classifier_forwards.examples
                qa_input_ids, qa_attention_mask, qa_token_type_ids = qa_inputs
                y_preds = qa_model(qa_input_ids.to(device),
                                   qa_attention_mask.to(device),
                                   qa_token_type_ids.to(device))
                start_preds, end_preds, class_preds = (p.detach().cpu()
                                                       for p in y_preds)
                start_logits, start_index = torch.max(start_preds, dim=1)
                end_logits, end_index = torch.max(end_preds, dim=1)
                # span logits minus the cls logits seems to be close to the best
                cls_logits = start_preds[:, 0] + end_preds[:, 0]  # '[CLS]' logits
                logits = start_logits + end_logits - cls_logits  # (batch_size,)
                indices = torch.stack(
                    (start_index, end_index)).transpose(0, 1)  # (batch_size, 2)
                result.update(qa_examples, logits.numpy(), indices.numpy(),
                              class_preds.numpy())
                classifier_forwards.clear()
            # Flush the reject buffer with placeholder predictions once full.
            if len(classifier_rejects.examples) >= batch_size:
                reject_examples = classifier_rejects.examples
                start_index = torch.full([len(classifier_rejects.examples)], -1)
                end_index = torch.full([len(classifier_rejects.examples)], -1)
                # NOTE(review): indices stays a torch tensor here, whereas
                # the QA path passes a numpy array — confirm this is fine.
                indices = torch.stack((start_index, end_index)).transpose(1, 0)
                result.update(reject_examples,
                              np.zeros(len(classifier_rejects.examples)),
                              indices,
                              np.zeros(len(classifier_rejects.examples)))
                classifier_rejects.clear()
        # Flush whatever is left in the forward buffer after the last batch.
        if classifier_forwards.examples:
            qa_inputs = classifier_forwards.inputs
            qa_examples = classifier_forwards.examples
            qa_input_ids, qa_attention_mask, qa_token_type_ids = qa_inputs
            y_preds = qa_model(qa_input_ids.to(device),
                               qa_attention_mask.to(device),
                               qa_token_type_ids.to(device))
            start_preds, end_preds, class_preds = (p.detach().cpu()
                                                   for p in y_preds)
            start_logits, start_index = torch.max(start_preds, dim=1)
            end_logits, end_index = torch.max(end_preds, dim=1)
            # span logits minus the cls logits seems to be close to the best
            cls_logits = start_preds[:, 0] + end_preds[:, 0]  # '[CLS]' logits
            logits = start_logits + end_logits - cls_logits  # (batch_size,)
            indices = torch.stack(
                (start_index, end_index)).transpose(1, 0)  # (batch_size, 2)
            result.update(qa_examples, logits.numpy(), indices.numpy(),
                          class_preds.numpy())
        # Flush whatever is left in the reject buffer.
        if classifier_rejects.examples:
            start_index = torch.full([len(classifier_rejects.examples)], -1)
            end_index = torch.full([len(classifier_rejects.examples)], -1)
            indices = torch.stack((start_index,
                                   end_index)).transpose(1, 0)
            result.update(classifier_rejects.examples,
                          np.zeros(len(classifier_rejects.examples)),
                          indices,
                          np.zeros(len(classifier_rejects.examples)))
    return result.score()