def plot_expected_lengths(lengths, batch_sizes, choose_length, markers=(), n_batches=10000):
    fig, axarr = plt.subplots(len(batch_sizes), 1, figsize=(14, 20), sharex=True)
    expected_lengths = {}
    
    for i, batch_size in enumerate(batch_sizes):
        maxs = []

        for _ in tqdm(range(n_batches), disable=False):
            val = choose_length(np.random.choice(lengths, batch_size))
            maxs.append(math.ceil(val))

        pd.Series(maxs).plot.hist(bins=50, ax=axarr[i], density=True, color='black', edgecolor='white', alpha=0.1)
        expected = np.mean(maxs)
        expected_lengths[batch_size] = expected
        
        max_y = axarr[i].get_ylim()[1]
        
        axarr[i].vlines([expected], 0, 1e3, 'limegreen', lw=4)
        axarr[i].set_ylim([0, max_y])
        axarr[i].set_xlim([0, max(lengths)])
        axarr[i].set_ylabel(f'batch_size={batch_size}', rotation=0)
        axarr[i].yaxis.set_label_coords(-0.1, 0.45)
        axarr[i].set_yticks([])

    for marker in markers:
        con = ConnectionPatch(xyA=(marker, axarr[0].get_ylim()[1]), xyB=(marker, 0), coordsA='data', 
                              coordsB='data', axesA=axarr[0], axesB=axarr[-1], color='red', lw=4)
        axarr[0].add_artist(con)
    
    axarr[0].set_zorder(1)
    axarr[0].set_title(f'Expected sequence lengths with various batch sizes (n batches = {n_batches})')
    plt.subplots_adjust(hspace=0)
    
    return expected_lengths
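A hedged usage sketch (the lognormal lengths below are illustrative toy data, not the notebook's real corpus); `choose_length` decides how far each sampled batch is padded, e.g. the batch maximum vs. a percentile:

lengths = np.random.lognormal(mean=4.0, sigma=0.5, size=100_000).astype(int)
expected = plot_expected_lengths(
    lengths,
    batch_sizes=[16, 32, 64, 128],
    choose_length=lambda ls: np.percentile(ls, 95),  # pad to the batch's 95th percentile
    markers=[int(np.percentile(lengths, 95))],       # global 95th percentile for reference
    n_batches=2000,
)
print(expected)  # mean padded length per batch size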
def load_embeddings(path):
    if path.endswith(('.pkl', '.pickle')):
        with open(path, 'rb') as f:
            return pickle.load(f)
    else:
        with open(path, encoding="utf8", errors='ignore') as f:
            return dict(
                get_coefs(*line.strip().split(' ')) for line in tqdm(f))
Example #3
def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)
    # For example, per-parameter-group options (PyTorch docs example; note the
    # docs use SGD there -- Adam itself takes no `momentum` argument):
    # optim.SGD([
    #             {'params': model.base.parameters()},
    #             {'params': model.classifier.parameters(), 'lr': 1e-3}
    #         ], lr=1e-2, momentum=0.9)
    
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        start_time = time.time()
        
        scheduler.step()  # decay lr by 0.6x each epoch (stepped at epoch start, pre-1.1 PyTorch style)
        
        model.train()  # training mode; model.eval() is used below for evaluation
        avg_loss = 0.
        
        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(*x_batch)  # with a single input tensor, equivalent to model(x_batch[0])
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        model.eval()
        test_preds = np.zeros((len(test), output_dim))
    
        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)    
    else:
        test_preds = all_test_preds[-1]
        
    return test_preds
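The checkpoint ensemble weights grow geometrically, so later (better-trained) epochs dominate. A quick worked example of the weighting:

# With n_epochs=4, checkpoint_weights == [1, 2, 4, 8] and np.average normalises:
preds = [np.full((2, 1), p) for p in (0.1, 0.2, 0.4, 0.8)]
ens = np.average(preds, weights=[1, 2, 4, 8], axis=0)
# (1*0.1 + 2*0.2 + 4*0.4 + 8*0.8) / 15 == 0.5667 -> dominated by the last epoch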
def load_embeddings(path):
    with open(path) as f:
        emb_arr = []
        for line in tqdm(f):
            try:
                emb_arr.append(get_coefs(*line.strip().split(' ')))
            except Exception as e:
                print(e)
                               
        return dict(emb_arr)
Example #5
    def __init__(self, description="Processing", total=100):
        self._tqdm = tqdm(
            disable=False if is_notebook() else None,
            bar_format=" {desc:20.20} |{bar}| {percentage:3.0f}% [{elapsed}<{remaining}]")
        self._tqdm.desc = description
        self._tqdm.total = total
        if self._tqdm.disable:
            self._tqdm = None
            self._value = 0
            self._total = total
Example #6
    def LoopThroughTime(self, Animate=False):
        # Now, start the time evolution calculation...
        self.time_store = np.arange(0, self.SimulationParams['MaxTime'], self.dt)

        self.InitializeMatrices()
        for t in tqdm(self.time_store):
            self.evolve_ts()
            if Animate:
                self.PlotConcentrations()
                display.clear_output(wait=True)
                display.display(plt.gcf())
                time.sleep(0.1)
Example #7
def get_AUCs_pancan(pickle_path, cancertypes):
    #     cancertypes = [s.split('/')[-1][15:-4] for s in glob.glob(pickle_path+'/run_cnn_output_*.pkl')]
    AUCs = pd.DataFrame(index=cancertypes, columns=['ROC AUC', 'PR AUC'])

    pbar = tqdm(cancertypes)
    for cancertype in pbar:
        pbar.set_description("Processing %s" % cancertype)
        x = get_AUC(cancertype, pickle_path)
        AUCs.loc[cancertype, 'ROC AUC'] = x[0]
        AUCs.loc[cancertype, 'PR AUC'] = x[1]
    AUCs.to_csv(pickle_path + '/AUCs.txt')
    return AUCs
Example #8
def build_timeseries(mat, y_col_index, time_steps):
    # total number of time-series samples is len(mat) - time_steps
    dim_0 = mat.shape[0] - time_steps
    dim_1 = mat.shape[1]
    x = np.zeros((dim_0, time_steps, dim_1))
    y = np.zeros((x.shape[0],))

    for i in tqdm(range(dim_0)):
        x[i] = mat[i:time_steps + i]
        y[i] = mat[time_steps + i, y_col_index]
    print("length of time-series i/o {} {}".format(x.shape, y.shape))
    return x, y
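A small hedged example of the windowing (toy data, not from the original notebook):

mat = np.arange(20, dtype=float).reshape(10, 2)      # 10 rows, 2 features
x, y = build_timeseries(mat, y_col_index=0, time_steps=3)
# x.shape == (7, 3, 2); y[i] is feature 0 of the row right after window i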
def build_vocab(sentences, verbose=True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable=(not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab
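A one-line sanity check of the counter:

# build_vocab([['the', 'cat'], ['the']]) == {'the': 2, 'cat': 1}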
Example #10
def convert_lines(example, max_seq_length, tokenizer):
    max_seq_length -= 2
    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1  # counts truncated texts (diagnostic only; never returned)
        one_token = tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + tokens_a +
            ["[SEP]"]) + [0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens)
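A hedged usage sketch; it assumes a BERT-style tokenizer exposing tokenize() and convert_tokens_to_ids(), as in pytorch-pretrained-bert / transformers:

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# ids = convert_lines(['an example sentence'], max_seq_length=32, tokenizer=tokenizer)
# ids.shape == (1, 32): [CLS] token ids ... [SEP] followed by 0-padding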
Example #11
def get_per_slide_probs_pancan(pickle_path, cancertypes):
    #     cancertypes = [s.split('/')[-1][15:-4] for s in glob.glob(pickle_path+'/run_cnn_output_*.pkl')]
    per_slide_probs = pd.DataFrame()

    pbar = tqdm(cancertypes)

    for cancertype in pbar:
        pbar.set_description("Processing %s" % cancertype)
        tmp = get_per_slide_probs(cancertype, pickle_path)
        tmp['cancertype'] = cancertype.upper()
        per_slide_probs = pd.concat([per_slide_probs, tmp])

    per_slide_probs.to_csv(pickle_path + '/per_slide_probs.txt')
    return per_slide_probs
Example #12
def eval_model(
    model: nn.Module,
    valid_loader: DataLoader,
    device: torch.device = torch.device('cuda')
) -> Dict[str, float]:
    """Compute validation score.

    Parameters
    ----------
    model : nn.Module
        Model for prediction.
    valid_loader : DataLoader
        Data loader of validation data.
    device : torch.device, optional
        Device for computation.

    Returns
    -------
    dict
        Scores of validation data.
        `long_score`: score of long answers
        `short_score`: score of short answers
        `overall_score`: score of the competition metric
    """
    model.to(device)
    # model.half()
    model.eval()
    with torch.no_grad():
        result = Result()
        for inputs, examples in tqdm(valid_loader):
            input_ids, attention_mask, token_type_ids = inputs
            y_preds = model(input_ids.to(device),
                            attention_mask.to(device),
                            token_type_ids.to(device))

            # only the answer-type head is used here; the span heads are ignored
            _, _, class_preds = (p.detach().cpu() for p in y_preds)
            result.update(examples, class_preds.numpy())

    return result.score()
Example #13
def get_metrics_pancan(pickle_path, cancertypes):
    #     cancertypes = [s.split('/')[-1][15:-4] for s in glob.glob(pickle_path+'/run_cnn_output_*.pkl')]
    metrics = pd.DataFrame()

    pbar = tqdm(cancertypes)

    for cancertype in pbar:
        pbar.set_description("Processing %s" % cancertype)
        tmp = get_metrics(cancertype, pickle_path)
        tmp = tmp.T.reset_index().rename(columns={'index': 'metric'})
        tmp['cancertype'] = cancertype.upper()
        metrics = pd.concat([metrics, tmp], sort=False)

    metrics.to_csv(pickle_path + '/metrics.txt')
    return metrics
    def key_phrase(self, addword, result):
        sub_list = []
        for w in tqdm(addword, desc='add key phrase>>>'):
            result['filter'] = result.關鍵字.isin(w)
            id_ = result.groupby('index')['filter'].sum()
            id_ = id_[id_ == 2].index
            sub = pd.DataFrame({'關鍵字': ''.join(w), 'Value': 1, 'index': id_})
            sub_list.extend(sub.values.tolist())
            result = result[~result['index'].isin(id_)
                            & ~result.關鍵字.isin(w)]
        result = result.drop(columns='filter')
        sub_list = pd.DataFrame(sub_list)
        sub_list.columns = ['關鍵字', 'Value', 'index']
        result = pd.concat([result, sub_list])
        return result
Example #15
def calculate_toxicity(model, test_data):

    batch_size = 1
    max_bert_length = 220
    pytorch_conversion = False

    seed_everything(1235)
    device = torch.device('cpu')
    tqdm.pandas()

    bert_model_path = "./service/uncased_L-12_H-768_A-12/"
    base_tokenizer = BertTokenizer.from_pretrained(bert_model_path,
                                                   cache_dir=None,
                                                   do_lower_case=True)
    converted_text = convert_data(test_data, max_bert_length, base_tokenizer)
    bert_test_lengths = torch.from_numpy(
        np.array([len(x) for x in converted_text]))
    bert_test_set = torch.tensor(pad_sequences(converted_text,
                                               maxlen=max_bert_length,
                                               padding='post'),
                                 dtype=torch.long)

    bert_test_dataset = torch.utils.data.TensorDataset(bert_test_set)
    bert_test_loader = torch.utils.data.DataLoader(bert_test_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False)
    tk2 = tqdm(enumerate(bert_test_loader),
               total=len(bert_test_loader),
               leave=False)

    output_preds = []
    for i, batch in tk2:
        tsrs = trim_tensors(batch)
        x_batch, = tuple(t.to(device) for t in tsrs)
        y_pred = model(x_batch.to(device),
                       attention_mask=(x_batch > 0).to(device),
                       labels=None)
        # logits -> probabilities; detach and move to cpu before numpy conversion
        y_pred = torch.sigmoid(
            y_pred[:, 0].detach().cpu().squeeze()).numpy().ravel()
        output_preds.append(y_pred)

    return output_preds
Example #16
def get_per_slide_evaluation_metrics_for_many_thresholds_for_all_labels(
    per_slide_average_predictions,
    label_names,
    per_slide_average_thresholds=np.arange(0, 1.002, .001)):

    per_slide_evaluation_metrics_df = []
    for label in tqdm(label_names):
        per_slide_evaluation_metrics = get_per_slide_evaluation_metrics_for_many_thresholds(
            per_slide_average_predictions[[label, label + '_pred']],
            label,
            per_slide_average_thresholds=per_slide_average_thresholds)

        per_slide_evaluation_metrics_df.append(
            per_slide_evaluation_metrics.set_index(
                'per_slide_average_threshold'))

    per_slide_evaluation_metrics_df = pd.concat(
        per_slide_evaluation_metrics_df, axis=1, keys=label_names)
    return per_slide_evaluation_metrics_df
def check_coverage(vocab, embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except KeyError:
            oov[word] = vocab[word]
            i += vocab[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1), reverse=True)

    return sorted_x
    def Word_Cloud(self, word_vector, n_key, data=None, dictionary=None):
        if dictionary is not None:  # TFIDF_Vector/ TEXTRANK
            result = pd.Series(word_vector).apply(
                lambda x: [[dictionary[w[0]], w[1]] for w in x[:n_key]])

            result_dt = []
            id_ = 0
            for row in tqdm(result, desc='transform to wordcloud>>>'):
                sub = pd.DataFrame(row)
                sub['index'] = id_
                result_dt.extend(sub.values.tolist())
                id_ += 1
            result_dt = pd.DataFrame(result_dt)
            result_dt.columns = ['關鍵字', 'Value', 'index']
            result = result_dt
            if data is not None:
                result = pd.concat([data, result],
                                   axis=1).reset_index(drop=True)
        else:  # LDA
            result = pd.DataFrame(word_vector,
                                  columns=['分群類別', '關鍵字', 'Value'])
        return result
def perform_regularised_cv(train,
                           y_colname,
                           grid,
                           high_card_cols,
                           folds=5,
                           metric=mean_absolute_error):
    '''Performs grid-search cross-validation with support for regularised mean encoding
    Inputs:
        train: input data set
        y_colname: target column name
        grid: set of hyperparameters over which the model is to be tuned
        high_card_cols: high-cardinality categorical columns to mean-encode
        folds: number of folds to be used for cross-validation
        metric: sklearn-style scoring function (default mean_absolute_error)
    Outputs:
        all_scores: list of (params, average score) tuples
    '''
    kf = KFold(folds, random_state=0, shuffle=True)
    param_grid = ParameterGrid(grid)
    all_scores = []  #Store all scores
    for params in tqdm(param_grid):
        errors = []
        for train_idx, test_idx in kf.split(train):
            # Split data into train and test
            kf_train, kf_test = train.iloc[train_idx, :], train.iloc[
                test_idx, :]
            kf_train.reset_index(inplace=True, drop=True)
            kf_test.reset_index(inplace=True, drop=True)
            _, error, _, _ = train_model(params,
                                         kf_train,
                                         kf_test,
                                         y_colname,
                                         high_card_cols,
                                         valid=True,
                                         metric=metric)
            errors.append(error)
        avg_score = np.mean(errors)  #Average scores of all KFold
        all_scores.append((params, avg_score))
    return all_scores
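A hedged usage sketch (`train_df`, the column names, and the grid values are made up; `train_model` is the project's own helper assumed to be in scope):

# grid = {'n_estimators': [200, 500], 'max_depth': [4, 8]}
# scores = perform_regularised_cv(train_df, 'price', grid,
#                                 high_card_cols=['zipcode'], folds=5)
# best_params, best_mae = min(scores, key=lambda s: s[1])  # lower MAE is better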
def load_embeddings(path):
    with open(path, encoding='utf8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))
Example #21
# (reconstructed call; the snippet is truncated above, and amp.scale_loss below
#  implies NVIDIA apex mixed-precision initialisation)
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level='O1',
                                  verbosity=0)
model.zero_grad()
model = model.train()

tokenizer = BertTokenizer.from_pretrained(bert_model,
                                          do_lower_case=do_lower_case)
convert_func = functools.partial(convert_data,
                                 tokenizer=tokenizer,
                                 max_seq_len=max_seq_len,
                                 max_question_len=max_question_len,
                                 doc_stride=doc_stride)
data_reader = JsonChunkReader(DATA_PATH, convert_func, chunksize=chunksize)

global_step = 0
for examples in tqdm(data_reader, total=int(np.ceil(train_size / chunksize))):
    train_dataset = TextDataset(examples)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    for x_batch, y_batch in train_loader:
        x_batch, attention_mask, token_type_ids = x_batch
        y_batch = tuple(y.to(device) for y in y_batch)  # materialise; a bare generator could only be consumed once

        y_pred = model(x_batch.to(device),
                       attention_mask=attention_mask.to(device),
                       token_type_ids=token_type_ids.to(device))
        loss = loss_fn(y_pred, y_batch)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
def train_model(model,
                train,
                val,
                y_val,
                test,
                loss_fn,
                output_dim=3,
                lr=0.00001,
                batch_size=32,
                n_epochs=2,
                enable_checkpoint_ensemble=True):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(val,
                                             batch_size=batch_size,
                                             shuffle=False)
    test_loader = torch.utils.data.DataLoader(test,
                                              batch_size=batch_size,
                                              shuffle=False)
    max_f1_score = 0

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.

        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]
            y_pred = model(x_batch[0])
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        val_preds = np.zeros((len(val), output_dim))
        test_preds = np.zeros((len(test), output_dim))

        for i, x_batch in enumerate(val_loader):
            y_pred = sigmoid(model(x_batch[0]).detach().cpu().numpy())

            val_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(x_batch[0]).detach().cpu().numpy())

            test_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

        f1_score = calculate_F1(y_val, val_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t f1={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, avg_loss, f1_score, elapsed_time))
        if f1_score > max_f1_score:
            max_f1_score = f1_score
            torch.save(model, str(epoch) + "net.pkl")  # save the entire model (not just the state_dict)

    return val_preds, test_preds
        lens = np.array(lens)
        max_len = min(int(np.percentile(lens, self.percentile)), MAX_LEN)
        texts = torch.tensor(sequence.pad_sequences(texts, maxlen=max_len), dtype=torch.long)
        
        if self.test:
            return texts
        
        return texts, torch.tensor(target, dtype=torch.float32)
train_collate = Collator(percentile=100)
train_dataset = TextDataset(x_train, lengths, y_train_torch.numpy())
train_loader  = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=train_collate)
n_repeats = 10

start_time = time.time()
for _ in range(n_repeats):
    for batch in tqdm(train_loader):
        pass
method1_time = (time.time() - start_time) / n_repeats
class SequenceDataset(torch.utils.data.Dataset):
    """
    Dataset using sequence bucketing to pad each batch individually.
    
    Arguments:
        sequences (list): A list of variable length tokens (e. g. from keras tokenizer.texts_to_sequences)
        choose_length (function): A function which receives a numpy array of sequence lengths of one batch as input
                                  and returns the length this batch should be padded to.
        other_features (list, optional): A list of tensors with other features that should be fed to the NN alongside the sequences.
        labels (Tensor, optional): A tensor with labels for the samples.
        indices (np.array, optional): A numpy array consisting of indices to iterate over. 
        shuffle (bool): Whether to shuffle the dataset or not.  Default false.
        batch_size (int): Batch size of the samples. Default 512.
    """
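The class body is cut off in this listing; below is a minimal, hypothetical sketch of the per-batch ("sequence bucketing") padding the docstring describes, not the class's actual implementation:

def pad_batch(seqs, choose_length):
    # pad one batch only as far as choose_length says, instead of a global max
    batch_len = int(choose_length(np.array([len(s) for s in seqs])))
    out = np.zeros((len(seqs), batch_len), dtype=np.int64)
    for i, s in enumerate(seqs):
        trunc = s[:batch_len]
        out[i, :len(trunc)] = trunc
    return torch.from_numpy(out)

# e.g. pad_batch(batch_of_token_id_lists, choose_length=lambda ls: ls.max())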
def load_embeddings(path):
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    with open(path, encoding="utf8") as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))


tic = time.time()
glove_embeddings = load_embeddings(GLOVE_EMBEDDING_PATH)
print(f'loaded {len(glove_embeddings)} word vectors in {time.time() - tic:.1f}s')
vocab = build_vocab(list(train['comment_text'].apply(lambda x: x.split())))
oov = check_coverage(vocab, glove_embeddings)
oov[:10]
import string
latin_similar = "’'‘ÆÐƎƏƐƔIJŊŒẞÞǷȜæðǝəɛɣijŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊIJĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịijĵķƙĸĺļłľŀʼnńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
white_list = string.ascii_letters + string.digits + latin_similar + ' '
white_list += "'"
glove_chars = ''.join([c for c in tqdm(glove_embeddings) if len(c) == 1])
glove_symbols = ''.join([c for c in glove_chars if not c in white_list])
glove_symbols
jigsaw_chars = build_vocab(list(train["comment_text"]))
jigsaw_symbols = ''.join([c for c in jigsaw_chars if not c in white_list])
jigsaw_symbols
symbols_to_delete = ''.join(
    [c for c in jigsaw_symbols if not c in glove_symbols])
symbols_to_delete
symbols_to_isolate = ''.join([c for c in jigsaw_symbols if c in glove_symbols])
symbols_to_isolate
isolate_dict = {ord(c): f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c): '' for c in symbols_to_delete}


def handle_punctuation(x):
    # NOTE: the original body is cut off here; a plausible completion, assuming
    # the two translate tables above are meant for str.translate:
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x
Example #26
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [
        token.text for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return doc


# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer, name='lemmatizer', after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm(newest_doc):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

# Creates a Dictionary, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=words,
                                            num_topics=10,
                                            random_state=2,
                                            update_every=1)
Example #27
def train_model(model, train, val, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True,
                validation_frequency=30):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    step = 0

    for epoch in range(n_epochs):
        start_time = time.time()

        scheduler.step()  # lr decay at epoch start (pre-1.1 PyTorch convention)

        epoch_loss = 0.

        batches = 0

        for data in tqdm(train_loader, disable=False):
            # train
            model.train()
            x_batch = data[0]
            y_batch = data[1]
            x_batch = x_batch.to("cuda")
            y_batch = y_batch.to("cuda")

            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)

            weights = torch.zeros(y_batch.size()).cuda()
            weights[y_batch[:, 0] > 0.5] = 0.9
            weights[y_batch[:, 0] < 0.5] = 0.1
            loss_weighted = (loss * weights).mean()

            optimizer.zero_grad()
            loss_weighted.backward()

            optimizer.step()
            epoch_loss += loss_weighted.item() / len(train_loader)
            batch_loss = loss_weighted.item()
            with torch.no_grad():
                acc, acc_toxic, acc_nontoxic = accuracy(y_pred[:, 0], y_batch[:, 0])
            tbx.add_scalar('train/loss', batch_loss, step)
            tbx.add_scalar('train/acc', acc, step)
            tbx.add_scalar('train/acc_toxic', acc_toxic, step)
            tbx.add_scalar('train/acc_nontoxic', acc_nontoxic, step)

            batches += 1
            step += batch_size
            if batches % validation_frequency == 0:
                # validation
                model.eval()
                with torch.no_grad():
                    val_acc = 0
                    val_acc_toxic = 0
                    val_acc_nontoxic = 0
                    val_loss = 0
                    for x_y in val_loader:
                        x_batch = x_y[0]
                        y_batch = x_y[1]
                        x_batch = x_batch.cuda()
                        y_batch = y_batch.cuda()
                        y_pred = model(x_batch)
                        loss = loss_fn(y_pred, y_batch)
                        weights = torch.zeros(y_batch.size()).cuda()
                        weights[y_batch[:, 0] > 0.5] = 0.9
                        weights[y_batch[:, 0] < 0.5] = 0.1
                        loss_weighted = (loss * weights).mean()
                        val_loss += loss_weighted.item() / len(val_loader)
                        val_acc += accuracy(y_pred[:, 0], y_batch[:, 0])[0] / len(val_loader)
                        val_acc_toxic += accuracy(y_pred[:, 0], y_batch[:, 0])[1] / len(val_loader)
                        val_acc_nontoxic += accuracy(y_pred[:, 0], y_batch[:, 0])[2] / len(val_loader)
                    tbx.add_scalar('val/loss', val_loss, step)
                    tbx.add_scalar('val/acc', val_acc, step)
                    tbx.add_scalar('val/acc_toxic', val_acc_toxic, step)
                    tbx.add_scalar('val/acc_nontoxic', val_acc_nontoxic, step)

        # test
        model.eval()
        test_preds = np.zeros((len(test), output_dim))

        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(x_batch[0].to("cuda")).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, epoch_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds
Example #28
    def __init__(self, description, total=100):
        self._tqdm = tqdm(
            bar_format=" {desc:20.20} |{bar}| {percentage:3.0f}% [{elapsed}<{remaining}]")
        self._tqdm.desc = description
        self._tqdm.total = total
import matplotlib.pyplot as plt
import pyLDAvis.gensim
import seaborn as sns
import time
import warnings


dictionary = gensim.corpora.Dictionary(Paper_Lemma)
bow_corpus = [dictionary.doc2bow(doc) for doc in Paper_Lemma]

Lda = models.LdaMulticore
coherenceList_umass = []
coherenceList_cv = []

num_topics_list = np.arange(30, 50, 2)
for num_topics in tqdm(num_topics_list):
    warnings.filterwarnings("ignore")
    lda = Lda(bow_corpus, num_topics=num_topics, id2word=dictionary,
              minimum_probability=0, passes=20)
    cm = CoherenceModel(model=lda, corpus=bow_corpus, dictionary=dictionary,
                        coherence='u_mass')
    coherenceList_umass.append(cm.get_coherence())
    cm_cv = CoherenceModel(model=lda, corpus=bow_corpus, texts=Paper_Lemma,
                           dictionary=dictionary, coherence='c_v')
    coherenceList_cv.append(cm_cv.get_coherence())
    vis = pyLDAvis.gensim.prepare(lda, bow_corpus, dictionary)

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]


max_cv = int(np.argmax(coherenceList_cv))  # assumption: pick the topic count with the best c_v coherence
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics_list[max_cv],
                                             alpha=0.1, eta=0.01, random_state=123,
                                             id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
Example #30
def eval_model(
    qa_model: nn.Module,
    classifier_model: nn.Module,
    valid_loader: DataLoader,
    device: torch.device = torch.device('cuda')
) -> Dict[str, float]:
    """Compute validation score.

    Parameters
    ----------
    qa_model : nn.Module
        QA (span-prediction) model.
    classifier_model : nn.Module
        Answer-type classifier used to pre-filter examples.
    valid_loader : DataLoader
        Data loader of validation data.
    device : torch.device, optional
        Device for computation.

    Returns
    -------
    dict
        Scores of validation data.
        `long_score`: score of long answers
        `short_score`: score of short answers
        `overall_score`: score of the competition metric
    """
    qa_model.to(device)
    #qa_model.half()
    qa_model.eval()
    classifier_model.to(device)
    classifier_model.eval()
    class_labels = ['LONG', 'NO', 'SHORT', 'UNKNOWN', 'YES']
    unknown_label = class_labels.index('UNKNOWN')
    with torch.no_grad():
        result = Result()
        classifier_rejects = ExampleBatch()
        classifier_forwards = ExampleBatch()
        for inputs, examples in tqdm(valid_loader):
            input_ids, attention_mask, token_type_ids = inputs
            y_preds1 = classifier_model(input_ids.to(device),
                                        attention_mask.to(device))
            _, _, classifier_preds = (p.detach().cpu() for p in y_preds1)

            has_pred = torch.argmax(classifier_preds, dim=1) != unknown_label
            neg_pred = ~has_pred
            qa_inputs = [
                element[(has_pred != 0).nonzero().squeeze()]
                for element in inputs
            ]
            qa_examples = (np.array(examples)[has_pred.numpy()]).tolist()
            if qa_examples:
                classifier_forwards.update(qa_inputs, qa_examples)
            reject_inputs = [
                element[(neg_pred != 0).nonzero().squeeze()]
                for element in inputs
            ]
            reject_examples = (np.array(examples)[neg_pred.numpy()]).tolist()
            if reject_examples:
                classifier_rejects.update(reject_inputs, reject_examples)

            if len(classifier_forwards.examples) >= batch_size:
                qa_inputs = classifier_forwards.inputs
                qa_examples = classifier_forwards.examples
                qa_input_ids, qa_attention_mask, qa_token_type_ids = qa_inputs

                y_preds = qa_model(qa_input_ids.to(device),
                                   qa_attention_mask.to(device),
                                   qa_token_type_ids.to(device))

                start_preds, end_preds, class_preds = (p.detach().cpu()
                                                       for p in y_preds)
                start_logits, start_index = torch.max(start_preds, dim=1)
                end_logits, end_index = torch.max(end_preds, dim=1)

                # span logits minus the cls logits seems to be close to the best
                cls_logits = start_preds[:, 0] + end_preds[:, 0]  # '[CLS]' logits
                logits = start_logits + end_logits - cls_logits  # (batch_size,)
                indices = torch.stack(
                    (start_index, end_index)).transpose(0, 1)  # (batch_size, 2)
                result.update(qa_examples, logits.numpy(), indices.numpy(),
                              class_preds.numpy())
                classifier_forwards.clear()
            if len(classifier_rejects.examples) >= batch_size:
                reject_examples = classifier_rejects.examples
                start_index = torch.full([len(classifier_rejects.examples)],
                                         -1)
                end_index = torch.full([len(classifier_rejects.examples)], -1)
                indices = torch.stack((start_index, end_index)).transpose(1, 0)
                result.update(reject_examples,
                              np.zeros(len(classifier_rejects.examples)),
                              indices.numpy(),
                              np.zeros(len(classifier_rejects.examples)))
                classifier_rejects.clear()
        if classifier_forwards.examples:
            qa_inputs = classifier_forwards.inputs
            qa_examples = classifier_forwards.examples
            qa_input_ids, qa_attention_mask, qa_token_type_ids = qa_inputs

            y_preds = qa_model(qa_input_ids.to(device),
                               qa_attention_mask.to(device),
                               qa_token_type_ids.to(device))

            start_preds, end_preds, class_preds = (p.detach().cpu()
                                                   for p in y_preds)
            start_logits, start_index = torch.max(start_preds, dim=1)
            end_logits, end_index = torch.max(end_preds, dim=1)

            # span logits minus the cls logits seems to be close to the best
            cls_logits = start_preds[:, 0] + end_preds[:, 0]  # '[CLS]' logits
            logits = start_logits + end_logits - cls_logits  # (batch_size,)
            indices = torch.stack(
                (start_index, end_index)).transpose(1, 0)  # (batch_size, 2)
            result.update(qa_examples, logits.numpy(), indices.numpy(),
                          class_preds.numpy())
        if classifier_rejects.examples:
            start_index = torch.full([len(classifier_rejects.examples)], -1)
            end_index = torch.full([len(classifier_rejects.examples)], -1)
            indices = torch.stack((start_index, end_index)).transpose(1, 0)
            result.update(classifier_rejects.examples,
                          np.zeros(len(classifier_rejects.examples)), indices.numpy(),
                          np.zeros(len(classifier_rejects.examples)))
    return result.score()