def train(self):
    # Reduce the learning rate when validation accuracy plateaus.
    # Note: newer tf.keras builds name this metric 'val_accuracy'.
    reduce_lr = ReduceLROnPlateau(
        monitor='val_acc',
        factor=0.2,
        patience=5,
        verbose=1,
    )
    # Checkpoint the best weights seen so far.
    cpt_save = ModelCheckpoint('weight.h5',
                               save_best_only=True,
                               monitor='val_acc',
                               mode='max')

    (X_train, y_train, X_val, y_val, X_test, y_test) = get_data()
    print(X_train.shape)
    print("Training......")
    self.model.fit(X_train, y_train,
                   validation_data=(X_val, y_val),
                   callbacks=[cpt_save, reduce_lr],
                   verbose=1,
                   epochs=self.num_epochs,
                   shuffle=True,
                   batch_size=self.batch_size)
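# The callbacks above come from Keras; a minimal import sketch, assuming
# the tensorflow.keras distribution (the standalone `keras` package
# exposes the same names under the same module path):
#
#     from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau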
print('time_taken: {time} sec'.format(time=(end_time - start_time)))
print('accuracy: {acc}'.format(acc=accuracy))
# print(svm_clf.dual_coef_)

print('\nValidation on test data:')
res = svm_clf.predict(Test_X)
accuracy = svm_clf.score(Test_X, Test_Y[:, 0]) * 100
print('validation accuracy: {acc}'.format(acc=accuracy))
print('\n\n')


base = os.path.dirname(os.path.abspath('__file__')) + '/mails'
Train_X, Train_Y, Test_X, Test_Y = cd.get_data(base, split=0.70, lengthfrac=0.1)
# run_logistic_regression(Train_X, Train_Y, Test_X, Test_Y)
# run_ann(Train_X, Train_Y, Test_X, Test_Y)
run_svm_linear(Train_X, Train_Y, Test_X, Test_Y)
run_svm(Train_X, Train_Y, Test_X, Test_Y)
plt.show()

'''
features = [
    'about', 'above', 'account', 'act', 'activity', 'address', 'adobe',
    'after', 'agreement', 'aimee', 'align', 'all', 'allen', 'also', 'am',
    'america', 'ami', 'an', 'and', 'anita', 'any', 'aol', 'april', 'are',
    'as', 'at', 'attached', 'available', 'back', 'based', 'be', 'because',
    'been', 'before', 'being', 'below', 'best', 'bgcolor', 'biz', 'bob',
    'body', 'border', 'both', 'br', 'brenda', 'brian', 'bryan', 'business',
    'but', 'buy', 'buyback', 'by', 'call', 'calls', 'camp', 'can', 'cash',
    'cc', 'cd', 'center', 'ces', 'cf', 'change', 'changes', 'charge',
    'china', 'chokshi', 'cialis', 'click', 'clynes', 'color', 'com',
    'companies', 'company', 'computron', 'contact', 'content', 'contract',
    'contracts', 'corp', 'could', 'counterparty', 'country', 'cs',
    'currently', 'daily', 'daren', 'darial', 'data', 'date', 'day', 'days',
    'de', 'deal', 'deals', 'dec', 'delivery', 'desk', 'details', 'did',
    'div', 'do', 'does', 'dollars', 'don', 'down', 'drugs', 'due', 'each',
    'eastrans', 'easy', 'ect', 'effective', 'email', 'ena', 'energy',
    'enron', 'entex', 'face', 'farmer', 'feb', 'february', 'file', 'first',
    'flow', 'following', 'font', 'fontfont', 'for', 'forward', 'forwarded',
    'free', 'friday', 'from', 'ftar', 'full', 'fund', 'future', 'fyi',
    'gary', 'gas', 'generic', 'george', 'get', 'gif', 'give', 'global',
    'go', 'gold', 'great', 'group', 'had', 'has', 'have', 'he', 'health',
    'height', 'help', 'here', 'hi', 'high', 'his', 'home', 'hotlist',
    'hou', 'how', 'howard', 'hpl', 'hplc', 'href', 'hsc', 'html',
    'htmlimg', 'http', 'id', 'if', 'images', 'img', 'in', 'inc',
    'increase', 'index', 'info', 'information', 'international',
    'internet', 'into', 'investment', 'is', 'issue', 'issues', 'it',
    'its', 'jackie', 'jan', 'january', 'jpg', 'julie', 'just', 'keep',
    'know', 'last', 'let', 'life', 'like', 'limited', 'line', 'link',
    'lisa', 'list', 'll', 'lloyd', 'long', 'look', 'looking', 'lose',
    'loss', 'low', 'luong', 'made', 'mail', 'make', 'management', 'many',
    'mar', 'march', 'market', 'mary', 'may', 'me', 'meds', 'meeting',
    'melissa', 'message', 'meter', 'meters', 'methanol', 'meyers', 'mg',
    'microsoft', 'midcon', 'million', 'mmbtu', 'monday', 'money', 'month',
    'moopid', 'more', 'morris', 'most', 'ms', 'much', 'music', 'my',
    'name', 'natural', 'nbsp', 'nd', 'need', 'needed', 'needs', 'net',
    'new', 'news', 'next', 'no', 'nom', 'nomination', 'noms', 'north',
    'not', 'note', 'now', 'number', 'of', 'off', 'offer', 'offers',
    'office', 'on', 'once', 'one', 'online', 'only', 'operations', 'or',
    'order', 'other', 'our', 'out', 'over', 'own', 'pain', 'paliourg',
    'pat', 'path', 'pec', 'people', 'per', 'pg', 'photoshop', 'php',
    'pills', 'pipeline', 'place', 'plant', 'please', 'pm', 'point',
    'pops', 'prescription', 'price', 'prices', 'private', 'pro',
    'problem', 'product', 'production', 'products', 'professional',
    'purchase', 'put', 'quality', 'questions', 'ranch', 'rates', 're',
    'receipt', 'receive', 'reliantenergy', 'remove', 'removed', 'reply',
    'report', 'request', 'required', 'results', 'retail', 'right', 'risk',
    'robert', 'sale', 'sales', 'same', 'save', 'scheduled', 'section',
    'securities', 'security', 'see', 'send', 'sent', 'service',
    'services', 'set', 'shares', 'she', 'shipping', 'should', 'show',
    'since', 'sitara', 'site', 'size', 'smith', 'so', 'software', 'some',
    'someone', 'soon', 'spam', 'special', 'src', 'statements', 'stella',
    'still', 'stock', 'stocks', 'stop', 'strong', 'subject', 'such',
    'suite', 'super', 'support', 'sure', 'susan', 'system', 'table',
    'take', 'taylor', 'td', 'team', 'texas', 'th', 'than', 'thank',
    'thanks', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'think', 'this', 'through', 'thu', 'ticket', 'tickets',
    'time', 'to', 'today', 'tom', 'top', 'total', 'tr', 'transfer',
    'transport', 'two', 'unify', 'united', 'until', 'up', 'us', 'use',
    'valero', 'valium', 'vance', 've', 'very', 'via', 'viagra', 'visit',
    'volume', 'volumes', 'want', 'was', 'we', 'web', 'week', 'weight',
    'well', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
    'width', 'will', 'windows', 'with', 'within', 'without', 'work',
    'works', 'world', 'worldwide', 'would', 'www', 'xanax', 'xls',
    'xlssubject', 'xp', 'year', 'you', 'your']
resultFile = open("words.csv", 'wb')
wr = csv.writer(resultFile, dialect='excel')
wr.writerow(features)
np.savetxt("features.csv", theta, delimiter=",")
'''
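# The commented-out export above opens the CSV in binary mode ('wb'), which
# only works under Python 2. A minimal Python 3 sketch of the same export,
# reusing the `features` list and assuming `theta` is defined as in the
# original snippet:
#
#     import csv
#     import numpy as np
#
#     # Python 3 csv.writer expects a text-mode file; newline='' avoids
#     # blank rows on Windows.
#     with open("words.csv", "w", newline="") as result_file:
#         csv.writer(result_file, dialect="excel").writerow(features)
#
#     np.savetxt("features.csv", theta, delimiter=",")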
def main(**kwargs):
    if kwargs["seed"] != -1:
        utils.set_seed(kwargs["seed"])

    kwargs['num_labels'] = 1
    config_class, model_class, tokenizer_class, templates_class, max_sequence_len = \
        utils.MODEL_CLASSES["NLI"]
    kwargs['max_sequence_len'] = max_sequence_len

    config = config_class.from_pretrained(kwargs['model_name_or_path'])
    config.update(kwargs)
    tokenizer = tokenizer_class.from_pretrained(kwargs['model_name_or_path'])
    templates = templates_class()

    if kwargs['debugging']:
        train_relations = ['per:positive_impression', 'per:employee_or_member_of']
        # , 'per:place_of_birth', 'per:visited_place']
        dev_relations = ['per:acquaintance', 'per:alumni']
    else:
        data_splits = json.load(open("data_v2/data_splits.json"))
        train_relations = data_splits[kwargs['data_split']]["train"][0]
        dev_relations = data_splits[kwargs['data_split']]["dev"][0]

    train_dataset = get_data(tokenizer, train_relations, templates, **kwargs)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=kwargs['gpu_batch_size'], shuffle=True)
    dev_dataset = get_data(tokenizer, dev_relations, templates, **kwargs)
    dev_dataloader = torch.utils.data.DataLoader(
        dev_dataset, batch_size=kwargs['gpu_batch_size'], shuffle=False)

    # Load model
    model = model_class.from_pretrained(kwargs['model_name_or_path'], config=config)
    model.to(kwargs['device'])

    # Optimization vars: integer number of micro-batches per optimizer step.
    gradient_accumulation_steps = kwargs["effective_batch_size"] // kwargs["gpu_batch_size"]
    total_optimization_steps = kwargs["num_epochs"] * \
        (len(train_dataloader) // gradient_accumulation_steps)
    optimizer = torch.optim.AdamW(model.parameters(), lr=kwargs["learning_rate"])
    if kwargs['warmup_proportion'] > 0:
        num_warmup_steps = int(total_optimization_steps * kwargs['warmup_proportion'])
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=total_optimization_steps)
        # scheduler.verbose = True
    if kwargs["fp16"]:
        scaler = torch.cuda.amp.GradScaler()

    logger.info("******** Training ********")
    logger.info(f"  Num samples: {len(train_dataset)}")
    logger.info(f"  Num epochs: {kwargs['num_epochs']}")
    logger.info(f"  Batch size: {kwargs['effective_batch_size']}")
    logger.info(f"  Total optimization steps: {total_optimization_steps}")

    best_f1 = 0
    for epoch in range(kwargs['num_epochs']):
        logger.info(f"EPOCH: {epoch+1}")
        total_loss = 0
        optimizer.zero_grad()
        model.train()
        pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        for step, batch in pbar:
            batch = utils.batch_to_device(batch, kwargs['device'])
            input_ids, attention_mask, labels, samples = batch
            input_ids = input_ids.squeeze()
            attention_mask = attention_mask.squeeze()

            if kwargs['fp16']:
                with torch.cuda.amp.autocast():
                    per_sample_loss = model.calculate_loss(input_ids, attention_mask, labels)
                    if kwargs['pos_sample_weight'] > 1:
                        # Up-weight positive samples; clamp keeps negatives at 1.0.
                        sample_weight = labels * kwargs['pos_sample_weight']
                        sample_weight = torch.clamp(sample_weight, min=1.0)
                        per_sample_loss = per_sample_loss * sample_weight
                    loss = torch.sum(per_sample_loss)
                    loss = loss / gradient_accumulation_steps
                scaler.scale(loss).backward()
                total_loss += loss.item()
                if ((step + 1) % gradient_accumulation_steps) == 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), kwargs["max_grad_norm"])
                    scaler.step(optimizer)
                    scaler.update()
                    if kwargs['warmup_proportion'] > 0:
                        scheduler.step()
                    optimizer.zero_grad()
            else:
                per_sample_loss = model.calculate_loss(input_ids, attention_mask, labels)
                if kwargs['pos_sample_weight'] > 1:
                    sample_weight = labels * kwargs['pos_sample_weight']
                    sample_weight = torch.clamp(sample_weight, min=1.0)
                    per_sample_loss = per_sample_loss * sample_weight
                loss = torch.sum(per_sample_loss)
                loss = loss / gradient_accumulation_steps
                loss.backward()
                total_loss += loss.item()
                torch.nn.utils.clip_grad_norm_(model.parameters(), kwargs["max_grad_norm"])
                if ((step + 1) % gradient_accumulation_steps) == 0:
                    optimizer.step()
                    if kwargs['warmup_proportion'] > 0:
                        scheduler.step()
                    optimizer.zero_grad()

            desc = f"TRAIN LOSS: {total_loss/(step+1):0.4f}"
            pbar.set_description(desc)

        # Dev evaluation
        model.eval()
        tp, fp, fn, tn = 0, 0, 0, 0
        for batch in dev_dataloader:
            batch = utils.batch_to_device(batch, kwargs['device'])
            input_ids, attention_mask, labels, samples = batch
            input_ids = input_ids.squeeze()
            attention_mask = attention_mask.squeeze()
            with torch.no_grad():
                preds = model.predict(input_ids, attention_mask)
            for l, p in zip(labels.squeeze(), preds.squeeze()):
                if l == 1:
                    if p == 1:
                        tp += 1
                    else:
                        fn += 1
                else:
                    if p == 1:
                        fp += 1
                    else:
                        tn += 1

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 1
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        logger.info(f"**DEV** TP: {tp} - FP: {fp} - FN: {fn} - TN: {tn}")
        logger.info(f"**DEV** PR: {precision} - RE: {recall} - F1: {f1}")

        if f1 > best_f1:
            best_f1 = f1
            if kwargs['output_dir']:
                output_dir = os.path.join(kwargs['output_dir'], f"F1-{best_f1:0.2f}")
                model.save_pretrained(output_dir)

    if kwargs['output_dir']:
        output_dir = os.path.join(kwargs['output_dir'], f"F1-{f1:0.2f}_final")
        model.save_pretrained(output_dir)
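# A hypothetical invocation sketch for main(). Only the keyword names come
# from the function body above; every value below is an assumption, not the
# repository's actual configuration:
#
#     if __name__ == "__main__":
#         main(
#             seed=42,
#             model_name_or_path="roberta-large-mnli",
#             debugging=True,
#             data_split="split_1",
#             gpu_batch_size=8,
#             effective_batch_size=32,  # gives 32 // 8 = 4 accumulation steps
#             num_epochs=3,
#             learning_rate=2e-5,
#             warmup_proportion=0.1,
#             fp16=False,
#             pos_sample_weight=1,
#             max_grad_norm=1.0,
#             device="cuda" if torch.cuda.is_available() else "cpu",
#             output_dir="checkpoints",
#         )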
def evaluate(self):
    (X_train, y_train, X_val, y_val, X_test, y_test) = get_data()
    score = self.model.evaluate(X_test, y_test)
    return score
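# Hypothetical usage of evaluate(); `trainer` and the metric unpacking are
# assumptions. Keras returns the loss followed by whatever metrics the model
# was compiled with, so this assumes metrics=['accuracy']:
#
#     loss, acc = trainer.evaluate()
#     print('test loss: {0:.4f}, test accuracy: {1:.4f}'.format(loss, acc))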
    x_train = scale.transform(x_train)
    x_valid = scale.transform(x_valid)
    nn_train, nn_test = stacking_reg("", x_train, y_train, x_valid, "nn")
    return nn_train, nn_test, "nn_reg"


##########################################################################
#                               Load data                                #
##########################################################################
from create_data import get_data

if __name__ == "__main__":
    np.random.seed(1)
    x_train, x_valid, y_train, train, test = get_data()
    train_id = train["item_id"].values
    test_id = test["item_id"].values

    folds = 5
    seed = 1
    # Pre-0.18 scikit-learn cross-validation API; the modern equivalent is
    # KFold(n_splits=folds, shuffle=True, random_state=seed).
    kf = KFold(x_train.shape[0], n_folds=folds, shuffle=True, random_state=seed)

    ############################ Model selection ############################
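# For reference, a minimal sketch of the same 5-fold split under the modern
# scikit-learn API; the fold-loop body is illustrative, not part of the
# original script:
#
#     from sklearn.model_selection import KFold
#
#     kf = KFold(n_splits=5, shuffle=True, random_state=1)
#     for fold, (train_idx, valid_idx) in enumerate(kf.split(x_train)):
#         # Row indices for this fold's train/validation partitions.
#         X_tr, X_va = x_train[train_idx], x_train[valid_idx]
#         y_tr, y_va = y_train[train_idx], y_train[valid_idx]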