def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--sample_dir', required=True)
    parser.add_argument('--market_sample_path', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--er_months', default=ER_MONTHS)
    parser.add_argument('--ev_months', default=EV_MONTHS)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)
    market_samples = utils.read_samples(args.market_sample_path)
    er_months = [int(m) for m in args.er_months.split(',')]
    ev_months = [int(m) for m in args.ev_months.split(',')]

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    for i, ticker in enumerate(tickers):
        assert ticker.find('^') == -1  # ^GSPC should not be in tickers.
        logging.info('%d/%d: %s' % (i + 1, len(tickers), ticker))
        stock_sample_path = '%s/%s.csv' % (args.sample_dir, ticker)
        if not path.isfile(stock_sample_path):
            logging.warning('Input file does not exist: %s' % stock_sample_path)
            continue
        # The output format is no longer csv. Use txt instead.
        output_path = '%s/%s.txt' % (args.output_dir, ticker)
        if path.isfile(output_path) and not args.overwrite:
            logging.warning('Output file exists: %s, skipping' % output_path)
            continue
        stock_samples = utils.read_samples(stock_sample_path)
        compute_features(stock_samples, market_samples, er_months, ev_months,
                         output_path)
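# NOTE: the snippets in this section all lean on a `read_samples` /
# `utils.read_samples` helper whose definition is not included. Below is a
# minimal sketch of one plausible CSV-backed implementation; the real helper
# may return numpy arrays or richer records, so treat this as an assumption.
import csv

def read_samples(path):
    """Parse a CSV file into a list of rows of floats (illustrative only)."""
    samples = []
    with open(path, 'r') as fp:
        for row in csv.reader(fp):
            samples.append([float(x) for x in row])
    return samples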
def approximate_function():
    samples = read_samples(sample_file)
    random.shuffle(samples)
    train_data = samples[:int(0.6 * len(samples))]
    val_data = samples[int(0.6 * len(samples)):int(0.8 * len(samples))]
    test_data = samples[int(0.8 * len(samples)):]

    model = Rd_difference_approximator()
    if GPU:
        model = model.cuda()

    R_2 = {'num_dims': 2, 'bounds': [(-1, 1), (-1, 1)]}
    domain = Bounded_Rd(R_2['num_dims'], R_2['bounds'])
    approximator = SingleSampleFunctionApproximator(
        train_data, model=model, model_file='square-single.model')
    optimizer = optim.Adam(model.parameters(), lr=1e-2)
    criterion = nn.MSELoss()
    approximator.approximate(val_data, optimizer, criterion, NUM_EPOCHS)
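# NOTE: `Rd_difference_approximator` is referenced above but not defined in
# this excerpt. The stand-in below is a hypothetical PyTorch module with the
# same interface (2-D input, scalar output); the real architecture is unknown.
import torch.nn as nn

class Rd_difference_approximator(nn.Module):
    def __init__(self, num_dims=2, hidden=64):
        super().__init__()
        # Small MLP mapping a point in R^d to a scalar function value.
        self.net = nn.Sequential(
            nn.Linear(num_dims, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        return self.net(x)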
def main():
    true_path = sys.argv[1]
    samples_path = sys.argv[2]
    base_output_path = sys.argv[3]
    appendix_output_path = sys.argv[4]
    runs = int(sys.argv[5])

    global samples
    samples = utils.read_samples(samples_path)
    samples = np.copy(samples[1000:])

    matches, errors = evaluate_filtered_match_iterative(
        true_path, base_output_path, appendix_output_path, runs)
    print 'Filtered matches:', matches
    print 'Perfect:', np.sum([(x[0] == 24 and x[1] == 24) for x in matches])
    print 'Errors:', errors, np.average(errors)
def main():
    # Read data.
    samples = read_samples(DATA_DIR)
    _ = samples.pop(0)  # drop the header row
    samples = [append_path(line) for line in samples]

    # Create train, validation and test sets.
    train_set, test_set = train_test_split(samples, test_size=0.1)
    train_set, valid_set = train_test_split(train_set, test_size=0.1)

    # Create data iterators to load and preprocess images.
    train_iterator = ImageGenerator(train_set, batch_size=128, corr=0.2)
    valid_iterator = CenterImageGenerator(valid_set, batch_size=128)
    test_iterator = CenterImageGenerator(test_set, batch_size=128)

    model = comma_model()
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2,
                                   verbose=1, mode='auto')
    filepath = 'comma.{epoch:02d}-{val_loss:.2f}.hdf5'
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                                 save_best_only=True, save_weights_only=False,
                                 mode='auto', period=1)

    print('Training comma.ai model')
    model.fit_generator(generator=train_iterator,
                        epochs=10,
                        validation_data=valid_iterator,
                        steps_per_epoch=len(train_iterator),
                        validation_steps=len(valid_iterator),
                        callbacks=[checkpoint, early_stopping])
    print('Done.')

    results = model.evaluate_generator(generator=test_iterator,
                                       steps=len(test_iterator))
    print('Test mse: {}'.format(results))
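# NOTE: `ImageGenerator` / `CenterImageGenerator` are project-specific and not
# shown here. Below is a minimal sketch of the same idea using
# `keras.utils.Sequence`, assuming each sample is an (image_path,
# steering_angle) pair; the loader is illustrative, not the project's actual
# preprocessing.
import numpy as np
from keras.utils import Sequence
from keras.preprocessing.image import load_img, img_to_array

def _load_image(path):
    # Read an RGB frame and scale pixel values to [0, 1].
    return img_to_array(load_img(path)) / 255.0

class CenterImageGeneratorSketch(Sequence):
    def __init__(self, samples, batch_size=128):
        self.samples = samples
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.samples) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch = self.samples[idx * self.batch_size:(idx + 1) * self.batch_size]
        images = np.array([_load_image(p) for p, _ in batch])
        angles = np.array([a for _, a in batch])
        return images, angles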
        for i in range(0, len(test_data) * len(ref_x), len(ref_x)):
            predictions = [x[0].detach().numpy()
                           for x in pair_outputs[i:i + len(ref_x)]]
            predictions_reference = [x[1].detach()
                                     for x in pair_outputs[i:i + len(ref_x)]]
            weights = [1 - abs((x - y) / y)
                       for x, y in zip(predictions_reference, ref_y)]
            mean_prediction = np.average(predictions, weights=weights)
            outputs.append(mean_prediction)
        return outputs

    def evaluate(self, test_data, criterion):
        predictions = self.predict(test_data)
        print(criterion(Variable(torch.FloatTensor(predictions)),
                        Variable(torch.FloatTensor([x[1] for x in test_data]))))

def plot_figure(inputs, predictions, outfile):
    pass

if __name__ == '__main__':
    siamese_model = load_pytorch_model('square-siamese.model')
    single_model = load_pytorch_model('square-single.model')
    ood_samples = read_samples('square.csv')
    samples = read_samples('square.csv')

    R_2 = {'num_dims': 2, 'bounds': [(-1, 1), (-1, 1)]}
    domain = Bounded_Rd(R_2['num_dims'], R_2['bounds'])
    approximator = SamplePairCoApproximator(
        samples, domain, differential_model=siamese_model)
    approximator_single = SingleSampleFunctionApproximator(
        samples, model=single_model)

    criterion = nn.MSELoss()
    approximator.evaluate(ood_samples, criterion)
    approximator_single.evaluate(ood_samples, criterion)

    siamese_predictions = approximator.predict(ood_samples)
    single_predictions = approximator_single.predict(ood_samples)
    plot_figure(ood_samples, siamese_predictions, 'results-siamese.png')
    plot_figure(ood_samples, single_predictions, 'results-single.png')
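# NOTE: `plot_figure` is a stub above. One possible matplotlib rendering,
# assuming each sample in `inputs` is a ((x1, x2), y) pair as implied by the
# `x[1]` indexing in `evaluate`; the intended plot is not specified, so this
# is only a sketch.
import numpy as np
import matplotlib.pyplot as plt

def plot_figure_sketch(inputs, predictions, outfile):
    # Scatter the 2-D sample points, colored by predicted value.
    points = np.array([s[0] for s in inputs], dtype=float)
    plt.figure()
    sc = plt.scatter(points[:, 0], points[:, 1], c=predictions,
                     cmap='viridis', s=10)
    plt.colorbar(sc, label='prediction')
    plt.savefig(outfile)
    plt.close()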
if u.proxy_hours_left() < 60:
    print ">>> Proxy near end of lifetime, renewing."
    u.proxy_renew()
u.copy_jecs()

all_samples = []
for i in range(5000):
    data = {"samples": [], "last_updated": None}

    # Read the instructions file. If a new sample is found, add it to the
    # list; for existing samples, try to update params (xsec, kfact, etc.).
    for samp in u.read_samples(instructions):
        if samp not in all_samples:
            if DO_TEST:
                samp["specialdir_test"] = True
                print ">>> You have specified DO_TEST, so final samples will end up in snt/test/!"
            s = Samples.Sample(**samp)
            all_samples.append(s)
        else:
            all_samples[all_samples.index(samp)].update_params(samp)

    # for isample, s in enumerate(all_samples):
    #     s.nuke()
    # sys.exit()

    for isample, s in enumerate(all_samples):
        stat = s.get_status()
def train_model(args: dict, hparams: dict):
    file = args.dataset_filepath
    # truncation = args.truncation
    seed_val = hparams["seed_val"]
    device = utils.get_device(device_no=args.device_no)

    saves_dir = "saves/"
    Path(saves_dir).mkdir(parents=True, exist_ok=True)
    time = datetime.datetime.now()
    saves_path = os.path.join(saves_dir, utils.get_filename(time))
    Path(saves_path).mkdir(parents=True, exist_ok=True)

    log_path = os.path.join(saves_path, "training.log")
    logging.basicConfig(filename=log_path,
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger()
    logger.info("File: " + str(file))
    logger.info("Parameters: " + str(args))
    logger.info("Hyperparameters: " + str(hparams))
    # logger.info("Truncation: " + truncation)

    # Load the BERT tokenizer.
    logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)

    samples = utils.read_samples(file)
    article_type_map = {}
    if args.lcr:
        samples = [[val[0].lower() + " [SEP] " + val[1].lower() + " [SEP] " + val[2].lower(),
                    val[3], val[4]] for val in samples]
    else:
        samples = [[val[0].lower() + " [SEP] " + val[1].lower(),
                    val[2], val[3]] for val in samples]
    for s in samples:
        article_type_map[s[0]] = s[2]

    # samples = samples[:100]
    # if args.binary_classifier:
    #     samples = utils.read_pairwise(file, args.data_1, args.data_2, dataset_amount=args.dataset_amount)
    # else:
    #     samples = utils.read_and_sample(file, dataset_amount=args.dataset_amount)

    no_of_labels = len(set([val[1] for val in samples]))
    logger.info("No of unique labels: " + str(no_of_labels))

    train_size = int(0.9 * len(samples))
    val_size = len(samples) - train_size
    random.shuffle(samples)
    train_samples = samples[:train_size]
    val_samples = samples[train_size:]
    train_samples_text = [val[0] for val in train_samples]
    train_samples_label = [val[1] for val in train_samples]
    val_samples_text = [val[0] for val in val_samples]
    val_samples_label = [val[1] for val in val_samples]

    max_len = 0
    # For every sentence...
    for text in train_samples_text + val_samples_text:
        # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
        input_id = tokenizer(text, add_special_tokens=True)
        # Update the maximum sentence length.
        max_len = max(max_len, len(input_id['input_ids']))
    logger.info('Max text length: ' + str(max_len))
    max_len = pow(2, math.ceil(math.log2(max_len)))
    max_len = min(512, max_len)

    batch_size = args.batch_size
    (train_input_ids, train_attention_masks,
     train_samples_label_tensor) = make_smart_batches(train_samples_text,
                                                      train_samples_label,
                                                      batch_size, logger,
                                                      tokenizer, max_len)
    (val_input_ids, val_attention_masks,
     val_samples_label_tensor) = make_smart_batches(val_samples_text,
                                                    val_samples_label,
                                                    batch_size, logger,
                                                    tokenizer, max_len)

    logger.info('{:>5,} training samples'.format(train_size))
    logger.info('{:>5,} validation samples'.format(val_size))

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",         # The 12-layer BERT model with an uncased vocab.
        num_labels=no_of_labels,     # 2 for binary classification; more for multi-class.
        output_attentions=False,     # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden states.
    )
    model = model.to(device=device)
    # model.cuda(device=device)

    optimizer = AdamW(
        model.parameters(),
        lr=args.learning_rate,       # default is 5e-5; our notebook had 2e-5
        eps=hparams["adam_epsilon"]  # default is 1e-8
    )

    epochs = args.n_epochs
    total_steps = len(train_input_ids) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []
    correct_counts = {"domestic": 0, "international": 0}
    total_counts = {"domestic": 0, "international": 0}

    for epoch_i in range(0, epochs):
        logger.info("")
        logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        logger.info('Training...')

        total_train_loss = 0
        model.train()

        step = 0
        for batch in zip(train_input_ids, train_attention_masks,
                         train_samples_label_tensor):
            if step % 40 == 0 and not step == 0:
                logger.info('  Batch {:>5,} of {:>5,}.'.format(
                    step, len(train_input_ids)))

            b_input_ids = batch[0].to(device=device)
            b_input_mask = batch[1].to(device=device)
            b_labels = batch[2].to(device=device)
            # Converting labels to float32 because I was getting some runtime error.
            # Not sure why we need to make labels float32. Keeping it Long or
            # int64 works in the case of headlines.
            # b_labels = batch[2].to(device=device, dtype=torch.float32)

            model.zero_grad()
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            total_train_loss += loss.detach().cpu().numpy()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            step += 1

        avg_train_loss = total_train_loss / len(train_input_ids)
        logger.info("")
        logger.info("Average training loss: {0:.2f}".format(avg_train_loss))

        logger.info("")
        logger.info("Running Validation...")
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0

        for batch in zip(val_input_ids, val_attention_masks,
                         val_samples_label_tensor):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                (loss, logits) = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)
            total_eval_loss += loss.detach().cpu().numpy()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
            for idx in range(batch_size):
                pred = np.argmax(logits[idx]) == label_ids[idx]
                tokens = tokenizer.convert_ids_to_tokens(b_input_ids[idx])

        avg_val_accuracy = total_eval_accuracy / len(val_input_ids)
        logger.info("Accuracy: {0:.2f}".format(avg_val_accuracy))
        avg_val_loss = total_eval_loss / len(val_input_ids)
        logger.info("Validation Loss: {0:.2f}".format(avg_val_loss))

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
        })

        model_save_path = os.path.join(saves_path,
                                       "model_" + str(epoch_i + 1) + "epochs")
        torch.save(model, model_save_path)

    logger.info("")
    logger.info("Training complete!")

    handlers = logger.handlers[:]
    for handler in handlers:
        handler.close()
        logger.removeHandler(handler)
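# NOTE: `flat_accuracy` is called during validation but not defined in this
# excerpt. The definition below is the one commonly used in the BERT
# fine-tuning tutorials this script appears to follow; treat it as an
# assumption.
import numpy as np

def flat_accuracy(preds, labels):
    # Fraction of predictions whose argmax class matches the gold label.
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)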
    'epochs': 2000,
    'heldout': 100
}
batch_size = config['batch_size']

if len(sys.argv) < 4:
    print "usage: python train_relu.py samples_path output_path topics seed"
    sys.exit()
samples_path = sys.argv[1]
output_path = sys.argv[2]
K = int(sys.argv[3])
if len(sys.argv) > 4:
    config['seed'] = int(sys.argv[4])
np.random.seed(config['seed'])

samples = utils.read_samples(samples_path)
N = samples.shape[1]
samples_heldout = np.copy(samples[:config['heldout']])
samples = np.copy(samples[config['heldout']:])
np.random.shuffle(samples)
S = samples.shape[0]                  # number of samples
S_heldout = samples_heldout.shape[0]  # number of heldout samples
A = utils.get_initial_A_gaussian(N, K)

sys.stdout = open(output_path + '/' + 'log.txt', 'w')
print config

# Train.
for epoch in range(config['epochs']):
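# NOTE: `utils.get_initial_A_gaussian(N, K)` is not shown. A plausible sketch,
# assuming it draws an N x K matrix of small i.i.d. Gaussian entries; the real
# initializer (and its scale) may differ.
import numpy as np

def get_initial_A_gaussian(N, K, scale=0.1):
    return scale * np.random.randn(N, K)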
def main(instructions=None, params=None, do_one_iteration=False):
    if not instructions:
        return
    if not params:
        params = __import__('params')

    data_json = "data.json"
    actions_fname = os.path.abspath(__file__).rsplit("/", 1)[0] + "/actions.txt"
    u.copy_jecs()
    logger_name = u.setup_logger()
    logger = logging.getLogger(logger_name)

    time_stats = []
    if os.path.isfile(data_json):
        with open(data_json, "r") as fhin:
            data = json.load(fhin)
            if "time_stats" in data:
                time_stats = data["time_stats"]

    all_samples = []
    for i in range(5000):
        if u.proxy_hours_left() < 60 and not params.FORSAKE_HEAVENLY_PROXY:
            u.proxy_renew()

        data = {"samples": [], "last_updated": None, "time_stats": time_stats}

        # Read the instructions file. If a new sample is found, add it to the
        # list; for existing samples, try to update params (xsec, kfact, etc.).
        for samp in u.read_samples(instructions):
            samp["params"] = params
            if samp not in all_samples:
                s = Samples.Sample(**samp)
                all_samples.append(s)
            else:
                all_samples[all_samples.index(samp)].update_params(samp)

        n_done = 0
        n_samples = len(all_samples)
        for isample, s in enumerate(all_samples):
            try:
                stat = s.get_status()
                typ = s.get_type()

                # Grab actions from a text file and act on them, consuming
                # (removing) them if successful.
                for dataset_name, action in u.get_actions(
                        actions_fname=actions_fname, dataset_name=s["dataset"]):
                    if s.handle_action(action):
                        u.consume_actions(dataset_name=s["dataset"],
                                          action=action,
                                          actions_fname=actions_fname)

                if not s.pass_tsa_prechecks():
                    continue

                if typ == "CMS3":
                    if stat == "new":
                        s.crab_submit()
                    elif stat == "crab":
                        s.crab_parse_status()
                        if s.is_crab_done():
                            s.make_miniaod_map()
                            s.make_merging_chunks()
                            s.submit_merge_jobs()
                    elif stat == "postprocessing":
                        if s.is_merging_done():
                            if s.check_output():
                                s.make_metadata()
                                s.copy_files()
                        else:
                            s.submit_merge_jobs()
                    elif stat == "done":
                        s.do_done_stuff()
                        n_done += 1
                elif typ == "BABY":
                    if stat == "new":
                        s.set_baby_inputs()
                        s.submit_baby_jobs()
                    elif stat == "condor" or stat == "postprocessing":
                        if params.open_datasets:
                            s.check_new_merged_for_babies()
                        if not params.open_datasets and s.is_babymaking_done():
                            s.set_status("done")
                        else:
                            s.sweep_babies()
                            s.submit_baby_jobs()
                    elif stat == "done":
                        if params.open_datasets:
                            s.check_new_merged_for_babies()
                        else:
                            s.do_done_stuff()
                            n_done += 1

                s.save()
                data["samples"].append(s.get_slimmed_dict())
            except Exception as err:
                logger.info("send an (angry?) email to Nick with the Traceback below!!")
                logger.info(traceback.format_exc())

        breakdown_crab = u.sum_dicts([
            samp["crab"]["breakdown"] for samp in data["samples"]
            if "crab" in samp and "breakdown" in samp["crab"]
        ])
        # breakdown_baby = u.sum_dicts([{"baby_"+key: samp["baby"].get(key, 0) for key in ["running", "sweepRooted"]} for samp in data["samples"] if samp["type"] == "BABY"])
        breakdown_baby = u.sum_dicts([{
            "running_babies": samp["baby"]["running"],
            "sweepRooted_babies": samp["baby"]["sweepRooted"]
        } for samp in data["samples"] if samp["type"] == "BABY"])
        tot_breakdown = u.sum_dicts([breakdown_crab, breakdown_baby])

        data["last_updated"] = u.get_timestamp()
        data["time_stats"].append((u.get_timestamp(), tot_breakdown))
        data["log"] = u.get_last_n_lines(fname=params.log_file, N=100)
        with open(data_json, "w") as fhout:
            data["samples"] = sorted(
                data["samples"],
                key=lambda x: x.get("status", "done") == "done")
            json.dump(data, fhout, sort_keys=True, indent=4)
        u.copy_json(params)

        if params.exit_when_done and (n_done == n_samples):
            print ">>> All %i samples are done. Exiting." % n_samples
            break

        if not do_one_iteration:
            sleep_time = 60 if i < 2 else 2 * 600
            logger.debug("sleeping for %i seconds..." % sleep_time)
            u.smart_sleep(sleep_time,
                          files_to_watch=[actions_fname, instructions])
        else:
            break
# Pick the proxy.
proxy_file_dict = {}
if not params.FORSAKE_HEAVENLY_PROXY:
    proxy_file_dict = {"proxy": u.get_proxy_file()}
else:
    print ">>> You have chosen to forsake your heavenly proxy. Be wary of prompts for your password."

# Check write permissions.
print BLUE, "Checking write permissions to UCSD...", ENDC
out = crabCommand('checkwrite', site="T2_US_UCSD", **proxy_file_dict)
print "Done. Status: %s" % out["status"]
print

# Take the first dataset name in instructions.txt.
print BLUE, "Taking the first sample in instructions.txt. If it's not a FullSim MC sample, then you're going to have a bad time!", ENDC
sample = u.read_samples()[0]
dataset_name = sample["dataset"]
gtag = sample["gtag"]
print " --> %s" % dataset_name
print

# Find the smallest MINIAOD file.
filelist = dis.query(dataset_name, detail=True, typ="files")
filelist = filelist["response"]["payload"]
filelist = sorted(filelist, key=lambda x: x.get("sizeGB", 999.0))
smallest_filename = filelist[0]["name"]
print BLUE, "Smallest file", ENDC
print " --> %s" % smallest_filename
print
def main(instructions=None, params=None, do_one_iteration=False):
    if not instructions:
        return
    if not params:
        params = __import__('params')

    data_json = "data.json"
    actions_fname = os.path.abspath(__file__).rsplit("/", 1)[0] + "/actions.txt"
    u.copy_jecs()
    logger_name = u.setup_logger()
    logger = logging.getLogger(logger_name)

    time_stats = []
    if os.path.isfile(data_json):
        with open(data_json, "r") as fhin:
            data = json.load(fhin)
            if "time_stats" in data:
                time_stats = data["time_stats"]

    all_samples = []
    for i in range(5000):
        if u.proxy_hours_left() < 60 and not params.FORSAKE_HEAVENLY_PROXY:
            u.proxy_renew()

        data = {"samples": [], "last_updated": None, "time_stats": time_stats}

        # Read the instructions file. If a new sample is found, add it to the
        # list; for existing samples, try to update params (xsec, kfact, etc.).
        for samp in u.read_samples(instructions):
            samp["params"] = params
            if samp not in all_samples:
                s = Samples.Sample(**samp)
                all_samples.append(s)
            else:
                all_samples[all_samples.index(samp)].update_params(samp)

        n_done = 0
        n_samples = len(all_samples)
        for isample, s in enumerate(all_samples):
            try:
                stat = s.get_status()
                typ = s.get_type()

                # Grab actions from a text file and act on them, consuming
                # (removing) them if successful.
                for dataset_name, action in u.get_actions(
                        actions_fname=actions_fname, dataset_name=s["dataset"]):
                    if s.handle_action(action):
                        u.consume_actions(dataset_name=s["dataset"],
                                          action=action,
                                          actions_fname=actions_fname)

                if not s.pass_tsa_prechecks():
                    continue

                if typ == "CMS3":
                    if stat == "new":
                        s.crab_submit()
                    elif stat == "crab":
                        s.crab_parse_status()
                        if s.is_crab_done():
                            s.make_miniaod_map()
                            s.make_merging_chunks()
                            s.submit_merge_jobs()
                    elif stat == "postprocessing":
                        if s.is_merging_done():
                            if s.check_output():
                                s.make_metadata()
                                s.copy_files()
                        else:
                            s.submit_merge_jobs()
                    elif stat == "done":
                        s.do_done_stuff()
                        n_done += 1
                elif typ == "BABY":
                    if stat == "new":
                        s.set_baby_inputs()
                        s.submit_baby_jobs()
                    elif stat == "condor" or stat == "postprocessing":
                        if params.open_datasets:
                            s.check_new_merged_for_babies()
                        if not params.open_datasets and s.is_babymaking_done():
                            s.set_status("done")
                        else:
                            # s.sweep_babies()
                            s.sweep_babies_parallel()
                            s.submit_baby_jobs()
                    elif stat == "done":
                        if params.open_datasets:
                            s.check_new_merged_for_babies()
                        else:
                            s.do_done_stuff()
                            n_done += 1

                s.save()
                data["samples"].append(s.get_slimmed_dict())
            except Exception as err:
                logger.info("send an (angry?) email to Nick with the Traceback below!!")
                logger.info(traceback.format_exc())

        breakdown_crab = u.sum_dicts([
            samp["crab"]["breakdown"] for samp in data["samples"]
            if "crab" in samp and "breakdown" in samp["crab"]
        ])
        # breakdown_baby = u.sum_dicts([{"baby_"+key: samp["baby"].get(key, 0) for key in ["running", "sweepRooted"]} for samp in data["samples"] if samp["type"] == "BABY"])
        breakdown_baby = u.sum_dicts([{
            "running_babies": samp["baby"]["running"],
            "sweepRooted_babies": samp["baby"]["sweepRooted"]
        } for samp in data["samples"] if samp["type"] == "BABY"])
        tot_breakdown = u.sum_dicts([breakdown_crab, breakdown_baby])

        data["last_updated"] = u.get_timestamp()
        data["time_stats"].append((u.get_timestamp(), tot_breakdown))
        data["log"] = u.get_last_n_lines(fname=params.log_file, N=100)
        with open(data_json, "w") as fhout:
            data["samples"] = sorted(
                data["samples"],
                key=lambda x: x.get("status", "done") == "done")
            json.dump(data, fhout, sort_keys=True, indent=4)
        u.copy_json(params)

        if params.exit_when_done and (n_done == n_samples):
            print ">>> All %i samples are done. Exiting." % n_samples
            break

        if not do_one_iteration:
            sleep_time = 60 if i < 2 else 2 * 600
            logger.debug("sleeping for %i seconds..." % sleep_time)
            u.smart_sleep(sleep_time,
                          files_to_watch=[actions_fname, instructions])
        else:
            break
# Pick the proxy.
proxy_file_dict = {}
if not params.FORSAKE_HEAVENLY_PROXY:
    proxy_file_dict = {"proxy": u.get_proxy_file()}
else:
    print ">>> You have chosen to forsake your heavenly proxy. Be wary of prompts for your password."

# Check write permissions.
print BLUE, "Checking write permissions to UCSD...", ENDC
out = crabCommand('checkwrite', site="T2_US_UCSD", **proxy_file_dict)
print "Done. Status: %s" % out["status"]
print

# Take the first dataset name in instructions.txt.
print BLUE, "Taking the first sample in instructions.txt. If it's not a FullSim MC sample, then you're going to have a bad time!", ENDC
sample = u.read_samples()[0]
dataset_name = sample["dataset"]
gtag = sample["gtag"]
print " --> %s" % dataset_name
print

# Find the smallest MINIAOD file.
filelist = dis.query(dataset_name, detail=True, typ="files")
filelist = filelist["response"]["payload"]
filelist = sorted(filelist, key=lambda x: x.get("sizeGB", 999.0))
smallest_filename = filelist[0]["name"]
print BLUE, "Smallest file", ENDC
print " --> %s" % smallest_filename
print

# Use xrootd to get that file.
def train_model(args: dict, hparams: dict):
    file = args.dataset_filepath
    # truncation = args.truncation
    seed_val = hparams["seed_val"]
    device = utils.get_device(device_no=args.device_no)

    saves_dir = "saves/bert/"
    Path(saves_dir).mkdir(parents=True, exist_ok=True)
    time = datetime.datetime.now()
    saves_path = os.path.join(saves_dir, utils.get_filename(time))
    Path(saves_path).mkdir(parents=True, exist_ok=True)

    log_path = os.path.join(saves_path, "training.log")
    summary_filename = os.path.join(saves_path, "tensorboard_summary")
    writer = SummaryWriter(summary_filename)
    logging.basicConfig(filename=log_path,
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger()
    logger.info("File: " + str(file))
    logger.info("Parameters: " + str(args))
    logger.info("Hyperparameters: " + str(hparams))
    # logger.info("Truncation: " + truncation)

    # Load the BERT tokenizer.
    logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)

    samples = utils.read_samples(file)
    article_type_map = {}
    if not args.nonpair_data:
        if args.lcr:
            samples = [[val[0].lower() + " [SEP] " + val[1].lower() + " [SEP] " + val[2].lower(),
                        val[3], val[4]] for val in samples]
        else:
            samples = [[val[0].lower() + " [SEP] " + val[1].lower(),
                        val[2], val[3]] for val in samples]

    if args.group_by_domestic:
        # Encode the article type as a numeric flag: 0 = domestic, 1 = international.
        samples_new = []
        for s in samples:
            # article_type_map[s[0]] = s[2]
            if s[2] == "domestic":
                samples_new.append([s[0], s[1], 0])
            else:
                samples_new.append([s[0], s[1], 1])
        samples = samples_new

    # samples = samples[:100]
    # if args.binary_classifier:
    #     samples = utils.read_pairwise(file, args.data_1, args.data_2, dataset_amount=args.dataset_amount)
    # else:
    #     samples = utils.read_and_sample(file, dataset_amount=args.dataset_amount)

    no_of_labels = len(set([val[1] for val in samples]))
    logger.info("No of unique labels: " + str(no_of_labels))

    # train_size = int(0.9 * len(samples))
    # val_size = len(samples) - train_size
    # random.shuffle(samples)
    # train_samples = samples[:train_size]
    # val_samples = samples[train_size:]
    # train_samples_text = [val[0] for val in train_samples]
    # train_samples_label = [val[1] for val in train_samples]
    # val_samples_text = [val[0] for val in val_samples]
    # val_samples_label = [val[1] for val in val_samples]

    samples_text = [val[0] for val in samples]
    samples_label = [val[1] for val in samples]
    if args.group_by_domestic:
        samples_article_type = [val[2] for val in samples]

    max_len = 0
    input_ids = []
    attention_masks = []

    # For every sentence, find the longest tokenized length.
    for text in samples_text:
        input_id = tokenizer(text, add_special_tokens=True)
        # Update the maximum sentence length.
        max_len = max(max_len, len(input_id['input_ids']))
    logger.info('Max text length: ' + str(max_len))
    max_len = pow(2, math.ceil(math.log2(max_len)))
    max_len = min(512, max_len)

    for text in samples_text:
        input_id = tokenizer(text, add_special_tokens=True)
        if len(input_id['input_ids']) > 512:
            if args.truncation == "tail-only":
                # Keep [CLS] plus the last 511 tokens.
                input_id['input_ids'] = [tokenizer.cls_token_id] + input_id['input_ids'][-511:]
            elif args.truncation == "head-and-tail":
                # Keep [CLS], the first 128 body tokens, the last 382 tokens, and [SEP].
                input_id['input_ids'] = ([tokenizer.cls_token_id]
                                         + input_id['input_ids'][1:129]
                                         + input_id['input_ids'][-382:]
                                         + [tokenizer.sep_token_id])
            else:
                # Head-only: keep the first 511 tokens and close with [SEP].
                input_id['input_ids'] = input_id['input_ids'][:511] + [tokenizer.sep_token_id]
            input_ids.append(torch.tensor(input_id['input_ids']).view(1, -1))
            attention_masks.append(
                torch.ones([1, len(input_id['input_ids'])], dtype=torch.long))
        else:
            encoded_dict = tokenizer(
                text,                        # Sentence to encode.
                add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
                max_length=max_len,          # Pad & truncate all sentences.
                pad_to_max_length=True,
                return_attention_mask=True,  # Construct attention masks.
                return_tensors='pt',         # Return PyTorch tensors.
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])

    batch_size = args.batch_size
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(samples_label)
    if args.group_by_domestic:
        samples_article_type_tensor = torch.tensor(samples_article_type)

    # Combine the training inputs into a TensorDataset.
    if args.group_by_domestic:
        dataset = TensorDataset(input_ids, attention_masks, labels,
                                samples_article_type_tensor)
    else:
        dataset = TensorDataset(input_ids, attention_masks, labels)

    # (train_input_ids, train_attention_masks, train_samples_label_tensor) = make_smart_batches(train_samples_text, train_samples_label, batch_size, logger, tokenizer, max_len)
    # (val_input_ids, val_attention_masks, val_samples_label_tensor) = make_smart_batches(val_samples_text, val_samples_label, batch_size, logger, tokenizer, max_len)

    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    logger.info('{:>5,} training samples'.format(train_size))
    logger.info('{:>5,} validation samples'.format(val_size))

    train_dataloader = DataLoader(
        train_dataset,                         # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly.
        batch_size=batch_size                  # Train with this batch size.
    )
    validation_dataloader = DataLoader(
        val_dataset,                             # The validation samples.
        sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size                    # Evaluate with this batch size.
    )

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",         # The 12-layer BERT model with an uncased vocab.
        num_labels=no_of_labels,     # 2 for binary classification; more for multi-class.
        output_attentions=False,     # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden states.
    )
    model = model.to(device=device)
    # model.cuda(device=device)

    optimizer = AdamW(
        model.parameters(),
        lr=args.learning_rate,       # default is 5e-5; our notebook had 2e-5
        eps=hparams["adam_epsilon"]  # default is 1e-8
    )

    epochs = args.n_epochs
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []
    best_stats = {
        'epoch': 0,
        'training_loss': -sys.maxsize,
        'training_accuracy': -sys.maxsize,
        'validation_loss': -sys.maxsize,
        'validation_accuracy': -sys.maxsize,
    }

    for epoch_i in range(0, epochs):
        # Stop early if validation accuracy has not improved for two epochs.
        if len(training_stats) > 2:
            if training_stats[-1]['validation_accuracy'] <= training_stats[-2]['validation_accuracy'] \
                    and training_stats[-2]['validation_accuracy'] <= training_stats[-3]['validation_accuracy']:
                break

        correct_counts = {"domestic": 0, "international": 0}
        total_counts = {"domestic": 0, "international": 0}

        logger.info("")
        logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        logger.info('Training...')

        total_train_loss = 0
        total_train_accuracy = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                logger.info('  Batch {:>5,} of {:>5,}.'.format(
                    step, len(train_dataloader)))

            b_input_ids = batch[0].to(device=device)
            b_input_mask = batch[1].to(device=device)
            b_labels = batch[2].to(device=device)
            # Converting labels to float32 because I was getting some runtime error.
            # Not sure why we need to make labels float32. Keeping it Long or
            # int64 works in the case of headlines.
            # b_labels = batch[2].to(device=device, dtype=torch.float32)

            model.zero_grad()
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            total_train_accuracy += flat_accuracy(logits, label_ids)
            total_train_loss += loss.detach().cpu().numpy()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_accuracy = total_train_accuracy / len(train_dataloader)
        logger.info("")
        logger.info("Average training accuracy: {0:.2f}".format(avg_train_accuracy))
        avg_train_loss = total_train_loss / len(train_dataloader)
        logger.info("")
        logger.info("Average training loss: {0:.2f}".format(avg_train_loss))

        logger.info("")
        logger.info("Running Validation...")
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            if args.group_by_domestic:
                b_article_types = batch[3].to(device)

            with torch.no_grad():
                (loss, logits) = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)
            total_eval_loss += loss.detach().cpu().numpy()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)

            if args.group_by_domestic:
                for idx in range(len(b_labels)):
                    pred = np.argmax(logits[idx]) == label_ids[idx]
                    if b_article_types[idx] == 0:
                        if pred == True:
                            correct_counts["domestic"] += 1
                        total_counts["domestic"] += 1

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        logger.info("Avg validation accuracy: {0:.2f}".format(avg_val_accuracy))
        if args.group_by_domestic:
            avg_val_accuracy_domestic = (correct_counts["domestic"] /
                                         total_counts["domestic"])
            logger.info("Domestic validation accuracy: {0:.2f}".format(
                avg_val_accuracy_domestic))
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        logger.info("Validation Loss: {0:.2f}".format(avg_val_loss))
{0:.2f}".format(avg_val_loss)) training_stats.append({ 'epoch': epoch_i + 1, 'training_loss': avg_train_loss, 'training_accuracy': avg_train_accuracy, 'validation_loss': avg_val_loss, 'validation_accuracy': avg_val_accuracy, }) if avg_val_accuracy > best_stats['validation_accuracy']: best_stats = { 'epoch': epoch_i + 1, 'training_loss': avg_train_loss, 'training_accuracy': avg_train_accuracy, 'validation_loss': avg_val_loss, 'validation_accuracy': avg_val_accuracy, } writer.add_scalars( 'losses_and_accuracies', { 'training_loss': avg_train_loss, 'training_accuracy': avg_train_accuracy, 'validation_loss': avg_val_loss, 'validation_accuracy': avg_val_accuracy, }, epoch_i + 1) model_save_path = os.path.join(saves_path, "model_" + str(epoch_i + 1) + "epochs") torch.save(model, model_save_path) logger.info("") logger.info("Training complete!") logger.info("Best stats") logger.info("training_accuracy: {}".format(best_stats['training_loss'])) logger.info("training_loss: {}".format(best_stats['training_loss'])) logger.info("validation_accuracy: {}".format( best_stats['validation_accuracy'])) logger.info("validation_loss: {}".format(best_stats['validation_loss'])) handlers = logger.handlers[:] for handler in handlers: handler.close() logger.removeHandler(handler)