Example #1
 def train(self):
     # reduce learning rate
     reduce_lr = ReduceLROnPlateau(
         monitor='val_acc',
         factor=0.2,
         patience=5,
         verbose=1,
     )
     # Model Checkpoint
     cpt_save = ModelCheckpoint('weight.h5',
                                save_best_only=True,
                                monitor='val_acc',
                                mode='max')
     (X_train, y_train, X_val, y_val, X_test, y_test) = get_data()
     print(X_train.shape)
     print("Training......")
     self.model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    callbacks=[cpt_save, reduce_lr],
                    verbose=1,
                    epochs=self.num_epochs,
                    shuffle=True,
                    batch_size=self.batch_size)
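The train() method above assumes a surrounding class and the Keras callback imports that the excerpt omits; a minimal scaffold sketch, assuming the tf.keras callback module (the class name, constructor signature, and defaults are invented for illustration, not from the source):

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

class Classifier:
    def __init__(self, model, num_epochs=50, batch_size=32):
        # train() reads these attributes when it calls self.model.fit
        self.model = model
        self.num_epochs = num_epochs
        self.batch_size = batch_size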
Example #2
    print('time_taken: {time} sec'.format(time=(end_time - start_time)))
    print('accuracy: {acc}'.format(acc=accuracy))

    #print(svm_clf.dual_coef_)

    print('\nValidation on test data:')
    res = svm_clf.predict(Test_X)
    accuracy = svm_clf.score(Test_X, Test_Y[:, 0]) * 100
    print('validation accuracy: {acc}'.format(acc=accuracy))

    print('\n\n')


# use __file__ (not the string '__file__') so the path resolves relative to this script
base = os.path.dirname(os.path.abspath(__file__)) + '/mails'
Train_X, Train_Y, Test_X, Test_Y = cd.get_data(base,
                                               split=0.70,
                                               lengthfrac=0.1)

#run_logistic_regression(Train_X, Train_Y, Test_X, Test_Y)
#run_ann(Train_X, Train_Y, Test_X, Test_Y)
run_svm_linear(Train_X, Train_Y, Test_X, Test_Y)
run_svm(Train_X, Train_Y, Test_X, Test_Y)

plt.show()
'''
features = ['about', 'above', 'account', 'act', 'activity', 'address', 'adobe', 'after', 'agreement', 'aimee', 'align', 'all', 'allen', 'also', 'am', 'america', 'ami', 'an', 'and', 'anita', 'any', 'aol', 'april', 'are', 'as', 'at', 'attached', 'available', 'back', 'based', 'be', 'because', 'been', 'before', 'being', 'below', 'best', 'bgcolor', 'biz', 'bob', 'body', 'border', 'both', 'br', 'brenda', 'brian', 'bryan', 'business', 'but', 'buy', 'buyback', 'by', 'call', 'calls', 'camp', 'can', 'cash', 'cc', 'cd', 'center', 'ces', 'cf', 'change', 'changes', 'charge', 'china', 'chokshi', 'cialis', 'click', 'clynes', 'color', 'com', 'companies', 'company', 'computron', 'contact', 'content', 'contract', 'contracts', 'corp', 'could', 'counterparty', 'country', 'cs', 'currently', 'daily', 'daren', 'darial', 'data', 'date', 'day', 'days', 'de', 'deal', 'deals', 'dec', 'delivery', 'desk', 'details', 'did', 'div', 'do', 'does', 'dollars', 'don', 'down', 'drugs', 'due', 'each', 'eastrans', 'easy', 'ect', 'effective', 'email', 'ena', 'energy', 'enron', 'entex', 'face', 'farmer', 'feb', 'february', 'file', 'first', 'flow', 'following', 'font', 'fontfont', 'for', 'forward', 'forwarded', 'free', 'friday', 'from', 'ftar', 'full', 'fund', 'future', 'fyi', 'gary', 'gas', 'generic', 'george', 'get', 'gif', 'give', 'global', 'go', 'gold', 'great', 'group', 'had', 'has', 'have', 'he', 'health', 'height', 'help', 'here', 'hi', 'high', 'his', 'home', 'hotlist', 'hou', 'how', 'howard', 'hpl', 'hplc', 'href', 'hsc', 'html', 'htmlimg', 'http', 'id', 'if', 'images', 'img', 'in', 'inc', 'increase', 'index', 'info', 'information', 'international', 'internet', 'into', 'investment', 'is', 'issue', 'issues', 'it', 'its', 'jackie', 'jan', 'january', 'jpg', 'julie', 'just', 'keep', 'know', 'last', 'let', 'life', 'like', 'limited', 'line', 'link', 'lisa', 'list', 'll', 'lloyd', 'long', 'look', 'looking', 'lose', 'loss', 'low', 'luong', 'made', 'mail', 'make', 'management', 'many', 'mar', 'march', 'market', 'mary', 'may', 'me', 'meds', 'meeting', 'melissa', 'message', 'meter', 'meters', 'methanol', 'meyers', 'mg', 'microsoft', 'midcon', 'million', 'mmbtu', 'monday', 'money', 'month', 'moopid', 'more', 'morris', 'most', 'ms', 'much', 'music', 'my', 'name', 'natural', 'nbsp', 'nd', 'need', 'needed', 'needs', 'net', 'new', 'news', 'next', 'no', 'nom', 'nomination', 'noms', 'north', 'not', 'note', 'now', 'number', 'of', 'off', 'offer', 'offers', 'office', 'on', 'once', 'one', 'online', 'only', 'operations', 'or', 'order', 'other', 'our', 'out', 'over', 'own', 'pain', 'paliourg', 'pat', 'path', 'pec', 'people', 'per', 'pg', 'photoshop', 'php', 'pills', 'pipeline', 'place', 'plant', 'please', 'pm', 'point', 'pops', 'prescription', 'price', 'prices', 'private', 'pro', 'problem', 'product', 'production', 'products', 'professional', 'purchase', 'put', 'quality', 'questions', 'ranch', 'rates', 're', 'receipt', 'receive', 'reliantenergy', 'remove', 'removed', 'reply', 'report', 'request', 'required', 'results', 'retail', 'right', 'risk', 'robert', 'sale', 'sales', 'same', 'save', 'scheduled', 'section', 'securities', 'security', 'see', 'send', 'sent', 'service', 'services', 'set', 'shares', 'she', 'shipping', 'should', 'show', 'since', 'sitara', 'site', 'size', 'smith', 'so', 'software', 'some', 'someone', 'soon', 'spam', 'special', 'src', 'statements', 'stella', 'still', 'stock', 'stocks', 'stop', 'strong', 'subject', 'such', 'suite', 'super', 'support', 'sure', 'susan', 'system', 'table', 'take', 'taylor', 'td', 'team', 'texas', 'th', 
'than', 'thank', 'thanks', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'think', 'this', 'through', 'thu', 'ticket', 'tickets', 'time', 'to', 'today', 'tom', 'top', 'total', 'tr', 'transfer', 'transport', 'two', 'unify', 'united', 'until', 'up', 'us', 'use', 'valero', 'valium', 'vance', 've', 'very', 'via', 'viagra', 'visit', 'volume', 'volumes', 'want', 'was', 'we', 'web', 'week', 'weight', 'well', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'width', 'will', 'windows', 'with', 'within', 'without', 'work', 'works', 'world', 'worldwide', 'would', 'www', 'xanax', 'xls', 'xlssubject', 'xp', 'year', 'you', 'your']
resultFile = open("words.csv",'wb')
wr = csv.writer(resultFile, dialect='excel')
wr.writerow(features)
np.savetxt("features.csv",theta,delimiter=",")
'''
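The script fragment above depends on module-level imports it does not show; a plausible header, assuming `cd` aliases the project's data-loading module (all names below are guesses, not from the source):

import os
import csv

import numpy as np
import matplotlib.pyplot as plt

import create_data as cd  # assumed module exposing get_data(base, split, lengthfrac)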
Example #3
def main(**kwargs):
    if kwargs["seed"] != -1:
        utils.set_seed(kwargs["seed"])

    kwargs['num_labels'] = 1
    config_class, model_class, tokenizer_class, templates_class, max_sequence_len = utils.MODEL_CLASSES[
        "NLI"]
    kwargs['max_sequence_len'] = max_sequence_len
    config = config_class.from_pretrained(kwargs['model_name_or_path'])
    config.update(kwargs)
    tokenizer = tokenizer_class.from_pretrained(kwargs['model_name_or_path'])
    templates = templates_class()

    if kwargs['debugging']:
        train_relations = [
            'per:positive_impression', 'per:employee_or_member_of'
        ]  # , 'per:place_of_birth', 'per:visited_place']
        dev_relations = ['per:acquaintance', 'per:alumni']
    else:
        data_splits = json.load(open("data_v2/data_splits.json"))
        train_relations = data_splits[kwargs['data_split']]["train"][0]
        dev_relations = data_splits[kwargs['data_split']]["dev"][0]

    train_dataset = get_data(tokenizer, train_relations, templates, **kwargs)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=kwargs['gpu_batch_size'], shuffle=True)

    dev_dataset = get_data(tokenizer, dev_relations, templates, **kwargs)
    dev_dataloader = torch.utils.data.DataLoader(
        dev_dataset, batch_size=kwargs['gpu_batch_size'], shuffle=False)

    # load model
    model = model_class.from_pretrained(kwargs['model_name_or_path'],
                                        config=config)
    model.to(kwargs['device'])

    # optimization vars
    gradient_accumulation_steps = kwargs["effective_batch_size"] // \
        kwargs["gpu_batch_size"]
    total_optimization_steps = kwargs["num_epochs"] * \
        (len(train_dataloader) // gradient_accumulation_steps)
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=kwargs["learning_rate"])

    if kwargs['warmup_proportion'] > 0:
        num_warmup_steps = total_optimization_steps * kwargs[
            'warmup_proportion']
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=total_optimization_steps)
        # scheduler.verbose = True

    if kwargs["fp16"]:
        scaler = torch.cuda.amp.GradScaler()

    logger.info("******** Training ********")
    logger.info(f"    Num samples: {len(train_dataset)}")
    logger.info(f"    Num epochs: {kwargs['num_epochs']}")
    logger.info(f"    Batch size: {kwargs['effective_batch_size']}")
    logger.info(f"    Total optimization steps: {total_optimization_steps}")

    best_f1 = 0
    for epoch in range(kwargs['num_epochs']):
        logger.info(f"EPOCH: {epoch+1}")
        total_loss = 0
        optimizer.zero_grad()
        model.train()

        pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        for step, batch in pbar:
            batch = utils.batch_to_device(batch, kwargs['device'])
            input_ids, attention_mask, labels, samples = batch
            input_ids = input_ids.squeeze()
            attention_mask = attention_mask.squeeze()

            if kwargs['fp16']:
                with torch.cuda.amp.autocast():
                    per_sample_loss = model.calculate_loss(
                        input_ids, attention_mask, labels)
                    if kwargs['pos_sample_weight'] > 1:
                        sample_weight = labels * kwargs['pos_sample_weight']
                        sample_weight = torch.clamp(sample_weight, min=1.0)
                        per_sample_loss = per_sample_loss * sample_weight
                    loss = torch.sum(per_sample_loss)
                    loss = loss / gradient_accumulation_steps
                scaler.scale(loss).backward()
                total_loss += loss.item()

                if ((step + 1) % gradient_accumulation_steps) == 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   kwargs["max_grad_norm"])
                    scaler.step(optimizer)
                    scaler.update()
                    if kwargs['warmup_proportion'] > 0:
                        scheduler.step()
                    optimizer.zero_grad()
            else:
                per_sample_loss = model.calculate_loss(input_ids,
                                                       attention_mask, labels)
                if kwargs['pos_sample_weight'] > 1:
                    sample_weight = labels * kwargs['pos_sample_weight']
                    sample_weight = torch.clamp(sample_weight, min=1.0)
                    per_sample_loss = per_sample_loss * sample_weight
                loss = torch.sum(per_sample_loss)
                loss = loss / gradient_accumulation_steps
                loss.backward()
                total_loss += loss.item()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               kwargs["max_grad_norm"])
                if ((step + 1) % gradient_accumulation_steps) == 0:
                    optimizer.step()
                    if kwargs['warmup_proportion'] > 0:
                        scheduler.step()
                    optimizer.zero_grad()

            desc = f"TRAIN LOSS: {total_loss/(step+1):0.4f}"
            pbar.set_description(desc)

        tp, fp, fn, tn = 0, 0, 0, 0
        for batch in dev_dataloader:
            batch = utils.batch_to_device(batch, kwargs['device'])
            input_ids, attention_mask, labels, samples = batch
            input_ids = input_ids.squeeze()
            attention_mask = attention_mask.squeeze()
            with torch.no_grad():
                preds = model.predict(input_ids, attention_mask)
            for l, p in zip(labels.squeeze(), preds.squeeze()):
                if l == 1:
                    if p == 1:
                        tp += 1
                    else:
                        fn += 1
                else:
                    if p == 1:
                        fp += 1
                    else:
                        tn += 1

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 1
        f1 = 2*precision*recall / \
            (precision+recall) if (precision+recall) > 0 else 0
        logger.info(f"**DEV**    TP: {tp} - FP: {fp} - FN: {fn} - TN: {tn}")
        logger.info(f"**DEV**    PR: {precision} - RE: {recall} - F1: {f1}")
        if f1 > best_f1:
            best_f1 = f1
            if kwargs['output_dir']:
                output_dir = os.path.join(kwargs['output_dir'],
                                          f"F1-{best_f1:0.2f}")
                model.save_pretrained(output_dir)

    if kwargs['output_dir']:
        output_dir = os.path.join(kwargs['output_dir'], f"F1-{f1:0.2f}_final")
        model.save_pretrained(output_dir)
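main(**kwargs) expects a fully populated keyword dictionary; a hypothetical invocation, with key names taken from the lookups in the function body and placeholder values that are assumptions only:

if __name__ == "__main__":
    main(
        seed=42,
        model_name_or_path="roberta-large-mnli",
        debugging=True,
        data_split="split_1",          # placeholder key of data_v2/data_splits.json
        gpu_batch_size=8,
        effective_batch_size=32,
        num_epochs=3,
        learning_rate=1e-5,
        warmup_proportion=0.1,
        fp16=False,
        pos_sample_weight=1,
        max_grad_norm=1.0,
        device="cuda",
        output_dir="checkpoints",
    )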
Example #4
 def evaluate(self):
     (X_train, y_train, X_val, y_val, X_test, y_test) = get_data()
     score = self.model.evaluate(X_test, y_test)
     return score
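A one-line usage sketch for evaluate(), assuming an instance of the same class as in Example #1 (the variable name is invented):

score = classifier.evaluate()  # whatever model.evaluate returns: a loss, or [loss, metrics]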
Example #5
    x_train = scale.transform(x_train)
    x_valid = scale.transform(x_valid)

    nn_train, nn_test = stacking_reg("", x_train, y_train, x_valid, "nn")
    return nn_train, nn_test, "nn_reg"


##########################################################################################################

##################################################### Get data ##############################################

###########################################################################################################
from create_data import get_data
if __name__ == "__main__":
    np.random.seed(1)
    x_train, x_valid, y_train, train, test = get_data()

    train_id = train["item_id"].values
    test_id = test["item_id"].values

    folds = 5
    seed = 1
    kf = KFold(x_train.shape[0],
               n_folds=folds,
               shuffle=True,
               random_state=seed)

############################################# Select model ###############################################
    #
    #
    #
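The KFold(x_train.shape[0], n_folds=...) call above uses the long-removed sklearn.cross_validation API; on a current scikit-learn the equivalent split would look roughly like this (a sketch under that assumption, not part of the original project):

from sklearn.model_selection import KFold

kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
for fold, (train_idx, valid_idx) in enumerate(kf.split(x_train)):
    # slice the per-fold data, e.g. x_train[train_idx] and x_train[valid_idx]
    pass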