def benchmarkClicked(self, wrapper):
    if wrapper._benchmark.get('isOther', False):
        path = QFileDialog.getOpenFileName(
            None,
            "Open benchmark",
            join(dirname(__file__), "benchmarks"),
            "Benchmark files (*.hgr)"
        )[0]
        if path:
            outputPath = QFileDialog.getExistingDirectory(
                None,
                "Select placement engine output directory",
                dirname(__file__),
            )
        else:
            outputPath = None
    else:
        path = wrapper._benchmark['path']
        outputPath = dirname(path)

    if path and outputPath:
        app.setOverrideCursor(Qt.WaitCursor)
        results = load_data(path, outputPath)
        app.restoreOverrideCursor()
        if results[0] == -1:
            QMessageBox.critical(
                None,
                f"{basename(path)}",
                f"Could not load benchmark.\n\nReason: {results[1]}",
            )
        else:
            placements = results[1]
            progress = QProgressDialog(
                labelText="Generating visualizations...",
                cancelButtonText=None,
                minimum=0,
                maximum=len(placements),
                flags=Qt.WindowStaysOnTopHint,
            )
            progress.setWindowTitle(f"{basename(path)}")
            progress.setMinimumDuration(0)
            progress.setValue(0)
            progress.forceShow()
            app.processEvents()

            self.pws = []
            for placement in placements:
                app.setOverrideCursor(Qt.WaitCursor)
                pw = PlacementWindow(placement, app)
                pw.show()
                pw.plot()
                self.pws.append(pw)
                progress.setValue(progress.value() + 1)
                app.processEvents()
                app.restoreOverrideCursor()
def main():
    args = load_args()
    print(args)
    torch.manual_seed(args.seed)
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    X_train, y_train, X_val, y_val, _ = load_data(dataset=args.dataset)
    mask_train, mask_val, _ = load_masks(args.dataset)
    X_train *= mask_train
    X_val *= mask_val
    y_train = torch.from_numpy(y_train)
    y_val = torch.from_numpy(y_val)

    # Hold out the last 20% of the training set for validation.
    nb_train = int(0.8 * X_train.shape[0])
    train_dset = TensorDataset(X_train[:nb_train].permute(0, 2, 1), y_train[:nb_train])
    val_dset = TensorDataset(X_train[nb_train:].permute(0, 2, 1), y_train[nb_train:])

    loader_args = {}
    if args.use_cuda:
        loader_args = {'num_workers': 1, 'pin_memory': True}
    train_loader = DataLoader(
        train_dset, batch_size=args.batch_size, shuffle=False, **loader_args)
    val_loader = DataLoader(
        val_dset, batch_size=args.batch_size, shuffle=False, **loader_args)

    model = SeqAttention(
        768, 2, args.n_filters, args.len_motifs, args.subsamplings,
        kernel_args=args.kernel_params, alpha=args.weight_decay, eps=args.eps,
        heads=args.heads, out_size=args.out_size, max_iter=args.max_iter,
        fit_bias=False)
    print(model)
    print(len(train_dset))

    print("Initializing...")
    tic = timer()
    if args.use_cuda:
        model.cuda()
    n_samples = 3000
    if args.n_filters[-1] > 256:
        n_samples //= args.n_filters[-1] // 256
    model.unsup_train(train_loader, args.sampling_patches,
                      n_samples=n_samples, wb=args.wb, use_cuda=args.use_cuda)
    toc = timer()
    print("Finished feature learning, elapsed time: {:.2f}s".format(toc - tic))

    print("Encoding...")
    Xtr, ytr = model.predict(train_loader, only_repr=True, use_cuda=args.use_cuda)
    preprocess(Xtr)
    print(Xtr.shape)
    Xval = []
    yval = []
    X, y = model.predict(val_loader, only_repr=True, use_cuda=args.use_cuda)
    preprocess(X)
    Xval.append(X)
    yval.append(y)

    # Regularization grid: 1 / 2**k for k = 1..14.
    search_grid = 2. ** np.arange(1, 15)
    search_grid = 1. / search_grid
    best_score = -np.inf

    clf = model.classifier
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    if Xtr.shape[-1] > 20000:
        optimizer = torch.optim.Adam(clf.parameters(), lr=0.01)
        epochs = 800
    else:
        optimizer = torch.optim.LBFGS(
            clf.parameters(), lr=1.0, max_eval=10, history_size=10,
            tolerance_grad=1e-05, tolerance_change=1e-05)
        epochs = 100
    torch.cuda.empty_cache()

    print("Start cross validation")
    for alpha in search_grid:
        tic = timer()
        clf.fit(Xtr, ytr, criterion, reg=alpha, epochs=epochs,
                optimizer=optimizer, use_cuda=args.use_cuda)
        toc = timer()
        scores = []
        for X, y in zip(Xval, yval):
            if args.use_cuda:
                X = X.cuda()
            score = clf.score(X, y)
            scores.append(score)
        score = np.mean(scores)
        print("CV alpha={}, acc={:.2f}, ts={:.2f}s".format(alpha, score * 100., toc - tic))
        if score > best_score:
            best_score = score
            best_alpha = alpha
            best_weight = copy.deepcopy(clf.state_dict())
    clf.load_state_dict(best_weight)
    print("Finished, elapsed time: {:.2f}s".format(toc - tic))

    test_dset = TensorDataset(X_val.permute(0, 2, 1), y_val)
    test_loader = DataLoader(
        test_dset, batch_size=args.batch_size, shuffle=False)
    Xte, y_true = model.predict(test_loader, only_repr=True, use_cuda=args.use_cuda)
    preprocess(Xte)
    if args.use_cuda:
        Xte = Xte.cuda()
    with torch.no_grad():
        y_pred = clf(Xte).cpu()
    scores = accuracy(y_pred, y_true)
    print(scores)

    if args.save_logs:
        print('Saving logs...')
        data = {
            # 'title': title,
            'score': scores,
            'best_param': best_alpha,
            'val_score': best_score,
            'args': args
        }
        np.save(os.path.join(args.outdir, f"seed_{args.seed}_results.npy"), data)
        # torch.save(
        #     {'args': args,
        #      'state_dict': model.state_dict()},
        #     args.outdir + '/model.pkl')
    return
if _name_ == "_main_": print("YOOO") #get arguments usage is python main.py --arg-1 arg1value , .. --arg-n argnvalue args = get_args() print(args) #check if there is a gpu in case sets it as device cuda = torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") #device = torch.device('cpu') #here something is loading the (possibly already sharded) data structures: # if not os.path.exists(os.path.join(args.data_dir,'ISDdata.pt')): # sampled_data = load_raw_data(data_folder=args.data_dir) # else: # sampled_data = None ISD, SD, S, I , D = load_data(args.data_dir) #generete the datset objects from the datasets structures and the cv_out, here using None indicates that we use the last two seq_len as test set dataset_train = ReplenishmentDataset(ISD, D, SD, I, S, args.seq_len, cv=None, train=True) #dataset_train.debug() train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, num_workers=48) dataset_test = ReplenishmentDataset(ISD, D, SD, I, S, args.seq_len, cv=None, train=False) test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=args.batch_size, shuffle=False, num_workers=16) print('Final train sampled %:', len(dataset_train)/len(dataset_train.ISD)) #print(len(dataset_test.ISD) - len(dataset_train) - len(dataset_test)) # for ii, data in enumerate(train_loader): # pass #print(dataset_train.emb_dict) # TODO: Rewrite gathering of seq/stat embed keys so that we don't need to force get item to set dataset's seq/stat_int_dict
import json
import time

import pandas as pd

from modules.clustering import Executor
from loaders import load_data, dump_clusters

if __name__ == '__main__':
    data = list(zip(load_data('entities_title.json'),
                    load_data('keyphrases_title.json'),
                    load_data('keyphrases_text.json')))
    try:
        labels_true = pd.factorize(load_data('labels_true.json'))
    except Exception:
        print('No labels')

    with open('best_params.json', mode='r') as fp:
        best_params = json.load(fp)
    a, b, c, thr = best_params[0].values()
    config = dict(a=a, b=b, c=c, thr=thr)
    e = Executor(config)

    st = time.time()
    for article_id, article_dict in enumerate(data):
def main():
    args = load_args()
    print(args)
    torch.manual_seed(args.seed)
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    X_train, y_train, X_val, y_val, _ = load_data(dataset=args.dataset)
    mask_train, mask_val, _ = load_masks(args.dataset)
    X_train *= mask_train
    X_val *= mask_val
    y_train = torch.from_numpy(y_train)
    y_val = torch.from_numpy(y_val)

    # Hold out the last 20% of the training set for validation.
    nb_train = int(0.8 * X_train.shape[0])
    train_dset = TensorDataset(X_train[:nb_train].permute(0, 2, 1), y_train[:nb_train])
    val_dset = TensorDataset(X_train[nb_train:].permute(0, 2, 1), y_train[nb_train:])

    loader_args = {}
    if args.use_cuda:
        loader_args = {'num_workers': 1, 'pin_memory': True}
    init_loader = DataLoader(
        train_dset, batch_size=args.batch_size, shuffle=False, **loader_args)
    train_loader = DataLoader(
        train_dset, batch_size=args.batch_size, shuffle=True, **loader_args)
    val_loader = DataLoader(
        val_dset, batch_size=args.batch_size, shuffle=False, **loader_args)

    model = SeqAttention(
        768, 2, args.n_filters, args.len_motifs, args.subsamplings,
        kernel_args=args.kernel_params, alpha=args.weight_decay, eps=args.eps,
        heads=args.heads, out_size=args.out_size, max_iter=args.max_iter)
    print(model)
    print(len(train_dset))

    print("Initializing...")
    tic = timer()
    if args.use_cuda:
        model.cuda()
    n_samples = 3000
    if args.n_filters[-1] > 256:
        n_samples //= args.n_filters[-1] // 256
    model.unsup_train(init_loader, args.sampling_patches,
                      n_samples=n_samples, use_cuda=args.use_cuda)

    criterion_clf = nn.CrossEntropyLoss(reduction='sum')
    if args.n_filters[-1] * args.out_size * args.heads < 30000:
        optimizer_clf = None
        epochs_clf = 20
    else:
        print("low ram optimizer clf")
        optimizer_clf = optim.Adam(model.classifier.parameters(), lr=0.01)
        epochs_clf = 100
    model.train_classifier(init_loader, criterion_clf, epochs=epochs_clf * 5,
                           optimizer=optimizer_clf, use_cuda=args.use_cuda)
    toc = timer()
    print("Finished, elapsed time: {:.2f}s".format(toc - tic))

    criterion = nn.CrossEntropyLoss()
    # epoch_loss, epoch_acc = eval_epoch_list(
    #     model, val_loader, criterion, use_cuda=args.use_cuda)
    # criterion = nn.CrossEntropyLoss()

    if args.alternating:
        optimizer = optim.Adam(model.feature_parameters(), lr=args.lr)
        lr_scheduler = ReduceLROnPlateau(
            optimizer, factor=0.5, patience=5, min_lr=1e-4)
    else:
        weight_decay = args.weight_decay / args.batch_size
        optimizer = optim.Adam([
            {'params': model.feature_parameters()},
            {'params': model.classifier.parameters(), 'weight_decay': weight_decay}
        ], lr=args.lr)
        lr_scheduler = StepLR(optimizer, 30, gamma=0.5)

    print("Start training...")
    tic = timer()
    epoch_loss = None
    best_loss = float('inf')
    for epoch in range(args.epochs):
        print('Epoch {}/{}'.format(epoch + 1, args.epochs))
        print('-' * 10)
        if args.alternating:
            model.eval()
            tic_c = timer()
            model.train_classifier(train_loader, criterion_clf, epochs=epochs_clf,
                                   optimizer=optimizer_clf, use_cuda=args.use_cuda)
            toc_c = timer()
            print("Classifier trained. Time: {:.2f}s".format(toc_c - tic_c))
        print("current LR: {}".format(
            optimizer.param_groups[0]['lr']))
        train_loss, train_acc = train_epoch(
            model, train_loader, criterion, optimizer, use_cuda=args.use_cuda)
        val_loss, val_acc = eval_epoch_list(
            model, [val_loader], criterion, use_cuda=args.use_cuda)
        if isinstance(lr_scheduler, ReduceLROnPlateau):
            lr_scheduler.step(val_loss)
        else:
            lr_scheduler.step()
        if val_loss < best_loss:
            best_loss = val_loss
            best_acc = val_acc
            best_epoch = epoch + 1
            best_weights = copy.deepcopy(model.state_dict())

    toc = timer()
    training_time = (toc - tic) / 60
    print("Training finished, elapsed time: {:.2f}s".format(toc - tic))
    model.load_state_dict(best_weights)

    print("Testing...")
    test_dset = TensorDataset(X_val.permute(0, 2, 1), y_val)
    test_loader = DataLoader(
        test_dset, batch_size=args.batch_size, shuffle=False)
    y_pred, y_true = model.predict(
        test_loader, use_cuda=args.use_cuda)
    scores = accuracy(y_pred, y_true)
    print(scores)

    if args.save_logs:
        print('Saving logs...')
        data = {
            # 'title': title,
            'score': scores,
            'best_epoch': best_epoch,
            'best_loss': best_loss,
            'val_score': best_acc,
            'args': args
        }
        np.save(os.path.join(args.outdir, f"seed_{args.seed}_results.npy"), data)
        # torch.save(
        #     {'args': args,
        #      'state_dict': model.state_dict()},
        #     args.outdir + '/model.pkl')
    return
import json

import numpy as np
from tqdm import tqdm

from modules.preprocessing import Preprocessor
from config import DATASET, MAX_NUM_ARTICLES
from loaders import make_data_path, load_data

if __name__ == '__main__':
    articles = load_data('articles.json')
    if DATASET == 'lsir':
        # Subsample a fixed set of articles reproducibly, then re-seed from entropy.
        np.random.seed(42)
        articles = np.random.choice(articles, MAX_NUM_ARTICLES, replace=False).tolist()
        np.random.seed()

    with open(make_data_path('processed_articles'), mode='w') as fp:
        for article in tqdm(articles):
            processed_article = {
                'entities_title': Preprocessor.get_entities_spacy(article['title']),
                'keyphrases_title': Preprocessor.get_keyphrases_pke(article['title']),
                'keyphrases_text': Preprocessor.get_keyphrases_pke(article['text']),
            }
            # Write one JSON object per line.
            json.dump(processed_article, fp)
            fp.write('\n')