def concatenate_page(base_dir, manuscript, page, columns, rows):
    ''' Concatenate image blocks into a single page (still jpg). '''
    # How concatenating blocks into a row looks in shell:
    #   montage -mode concatenate -tile x1 `ls -1cr add_ms_24686_f044r_5_*` row_0.jpg
    for row in range(rows + 1):
        row_filename = J(base_dir, manuscript, page, 'row_{0}.jpg'.format(row))
        if os.path.exists(row_filename):
            continue
        glob_name = '{0}_{1}_*.jpg'.format(J(base_dir, manuscript, page, page), row)
        row_blocks = sorted(glob.glob(glob_name), key=natural_keys)
        cmd = ('montage -mode concatenate -tile x1'.split()
               + row_blocks + [row_filename])
        call(cmd)
        put('.')

    # How concatenating rows into a page looks in shell:
    #   montage -mode concatenate -tile 1x `ls -1cr row_*` add_ms_24686_f044r.jpg
    page_filename = J(base_dir, manuscript, page) + '.jpg'
    if os.path.exists(page_filename):
        return
    glob_name = '{0}_*.jpg'.format(J(base_dir, manuscript, page, 'row'))
    row_files = sorted(glob.glob(glob_name), key=natural_keys)
    cmd = ('montage -mode concatenate -tile 1x'.split()
           + row_files + [page_filename])
    call(cmd)
    put('\n')

def convert_pages(base_dir, manuscript, pages):
    ''' Convert manuscript page images (JPG) into per-page PDFs. '''
    for i, page in enumerate(pages):
        input_name = J(base_dir, manuscript, '{0}.jpg'.format(page))
        output_name = J(base_dir, manuscript, '{0}.pdf'.format(page))
        if os.path.exists(output_name):
            continue
        print('Converting page {0} ({1}/{2})'.format(page, i + 1, len(pages)))
        cmd = ['convert', input_name, output_name]
        call(cmd)

def load_specifications(specification_dir):
    """Loads experiment specifications from a specified directory.

    Args:
        specification_dir (str): The directory containing experiment
            specifications.

    Returns:
        list(dict): A list of experiment specification JSONs.
    """
    assert E(specification_dir), \
        "Specification directory {} does not exist".format(specification_dir)

    specification_jsons = glob.glob(J(specification_dir, '*.json'))
    logger.info("Loading experiment specifications...")
    if not specification_jsons:
        logger.warning(
            "Could not find any experiment specifications in {}".format(
                specification_dir))

    specs = []
    for spec_path in specification_jsons:
        with open(spec_path, 'r') as f:
            specs.append(json.load(f))

    logger.info("Found {} experiment specifications".format(len(specs)))
    return specs

def fold_pages(base_dir, manuscript, pages, output_name):
    ''' Fold PDF pages into one document by repeatedly concatenating the
        accumulated output with the next page via pdftk. '''
    tmp_name = J(base_dir, manuscript + '.pdf.tmp')
    pdfs = ['{0}.pdf'.format(page) for page in pages]
    for i, pdf in enumerate(pdfs):
        print('Folding page {0} ({1}/{2})'.format(pdf, i + 1, len(pages)))
        pdf_name = J(base_dir, manuscript, pdf)
        if os.path.exists(output_name):
            # pdftk <accumulated output> <next page> cat output <tmp>
            cmd = ['pdftk', output_name, pdf_name, 'cat', 'output', tmp_name]
            call(cmd)
            os.unlink(output_name)
            os.rename(tmp_name, output_name)
        else:
            shutil.copy2(pdf_name, output_name)

def convert_manuscript(resolution, base_dir, manuscript, pages):
    ''' Convert manuscript pages and fold them into a single PDF. '''
    convert_pages(base_dir, manuscript, pages)
    suffix = '-p{0}-r{1}.pdf'.format(len(pages), resolution)
    output_name = J(base_dir, manuscript + suffix)
    fold_pages(base_dir, manuscript, pages, output_name)

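# A minimal usage sketch (not part of the original module): it assumes the
# block downloads for each page have already been fetched into
# base_dir/<manuscript>/<page>/, and the directory, manuscript name, page ids
# and block counts below are hypothetical placeholders.
#
#     pages = ['f044r', 'f044v']
#     for page in pages:
#         concatenate_page('data/300', 'add_ms_24686', page, columns=8, rows=10)
#     convert_manuscript(300, 'data/300', 'add_ms_24686', pages)
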
def start_server(name):
    try:
        if name == SERVER_REALMD:
            process_name = J(MANGOS_DIR, REALMD_BIN)
        elif name == SERVER_WORLDD:
            process_name = J(MANGOS_DIR, MANGOSD_BIN)
        count = int(
            os.popen("ps ax | grep %s | grep -v grep | wc -l" %
                     process_name).read().strip())
        if count >= 1:
            logger.warn(
                'Requested start, but the server appears to be running '
                'already (%s matching processes).' % count)
            return
    except Exception, e:
        logger.error('%s' % e)
        mail_admins(traceback.format_exc())

def parameter_count(spec, experiment_directory):
    # spec, experiment_directory = args
    # Unpack some of the specification information
    try:
        spec = set_spec_default_values(spec)
        algorithm = spec["algorithm"]
        batch_size = spec['batch_size']
        bptt_len = spec['bptt_len']
        spec['device'] = 'cpu'
        device = 'cpu'
        hmm_hidden = spec['hmm_hidden']
        max_step = spec['max_step']
        name = spec['name']
        sequence_dependence = spec['sequence_dependence']
        vocab = spec['vocab']
        # Unpack additional arguments <here>
    except KeyError:
        print("Invalid experiment specification: {}".format(spec))
        raise

    logging.basicConfig(level=logging.DEBUG)
    # filename=J(experiment_directory, 'out.log'),
    # filemode='w')
    logger = logging.getLogger('exp_runner')
    logger.info("Starting the parameter counter!")
    logger.info(str(spec))

    # Create the directory
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
    else:
        assert c.EXPERIMENT_RUNNER_SHOULD_OVERWRITE, \
            "Experiment directory {} already exists".format(experiment_directory)

    # Choose sequence model type
    if algorithm == 'transformer':
        sequence_model = TransformerXL(**spec)
    elif algorithm == 'lstm':
        sequence_model = LSTMModel(**spec)
    elif algorithm == 'cnn':
        sequence_model = GatedCNN(**spec)
    else:
        print(spec)

    # Model
    model = sequence_model.get_model()

    # Count the total number of parameter values across all tensors.
    pp = 0
    for p in list(model.parameters()):
        nn = 1
        for s in list(p.size()):
            nn = nn * s
        pp += nn
    print(pp)
    np.save(J(experiment_directory, 'parameters.npy'), [pp])

def replace_practitioner(combined_dataset_path, id1, id2):
    """Merges all files with id1 into files with id2"""
    files = list(
        filter(lambda x: x.endswith('json'), os.listdir(combined_dataset_path)))

    # First, find the object for the id2 practitioner.
    prac_object = None
    for fname in files:
        if fname.replace('practitioner', '').startswith(id2):
            with open(J(combined_dataset_path, fname), 'r', encoding='utf-8') as f:
                prac_object = json.loads(f.read())['entry'][0]['entry'][1]
            break

    # Replace it all
    for fname in files:
        if fname.replace('practitioner', '').startswith(id1):
            new_obj = None
            with open(J(combined_dataset_path, fname), 'r', encoding='utf-8') as f:
                old = f.read()
            id_replaced = old.replace(id1, id2)
            new_obj = json.loads(id_replaced)
            for x in range(len(new_obj['entry'])):
                new_obj['entry'][x]['entry'][1] = prac_object

            new_path = fname.replace(id1, id2)
            if os.path.exists(J(combined_dataset_path, new_path)):
                # Merge into the existing id2 file.
                complete_obj = None
                with open(J(combined_dataset_path, new_path), 'r',
                          encoding='utf-8') as f:
                    complete_obj = json.loads(f.read())
                complete_obj['entry'].extend(new_obj['entry'])
                with open(J(combined_dataset_path, new_path), 'w',
                          encoding='utf-8') as f:
                    f.write(json.dumps(complete_obj, indent=2))
            else:
                with open(J(combined_dataset_path, new_path), 'w',
                          encoding='utf-8') as f:
                    f.write(json.dumps(new_obj, indent=2))
            os.remove(J(combined_dataset_path, fname))

def group_count_pracs(combined_dataset_path):
    filename_regex = r'practitioner(.+)_.*'
    pracs = {}
    files = list(
        filter(lambda x: x.endswith('json'), os.listdir(combined_dataset_path)))
    for file in files:
        with open(J(combined_dataset_path, file), 'r') as f:
            useless_bundle = json.loads(f.read())
        prac_id = re.match(filename_regex, file).group(1)
        if prac_id not in pracs:
            pracs[prac_id] = 0
        pracs[prac_id] += len(useless_bundle['entry'])
    return pracs

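# A minimal usage sketch (assumed, not part of the original module): prints
# how many bundle entries each practitioner id accounts for. The dataset path
# below is a hypothetical placeholder.
#
#     counts = group_count_pracs('combined_dataset')
#     for prac_id, n_entries in sorted(counts.items(), key=lambda kv: -kv[1]):
#         print('{}: {} entries'.format(prac_id, n_entries))
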
def setup_config():
    cfg = ConfigParser(CFG_DEFAULTS)
    conf_file = J(WORK_DIR, 'checker.conf')
    cfg.read(conf_file)
    if not cfg.has_section('checker'):
        cfg.add_section('checker')
    if not cfg.has_section('mangos'):
        cfg.add_section('mangos')
    fp = open(conf_file, 'wt')
    cfg.write(fp)
    fp.close()
    return cfg

def rst2html(rst, theme=None, opts=None):
    rst_opts = default_rst_opts.copy()
    if opts:
        rst_opts.update(opts)
    rst_opts['template'] = 'var/themes/template.txt'

    stylesheets = ['basic.css']
    if theme:
        stylesheets.append('%s/%s.css' % (theme, theme))
    rst_opts['stylesheet'] = ','.join(
        [J('var/themes/', p) for p in stylesheets])

    out = publish_string(rst, writer_name='html', settings_overrides=rst_opts)
    return out

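# A minimal usage sketch (assumed): renders a small reStructuredText snippet
# to HTML. The theme name and output filename are hypothetical, and
# `default_rst_opts` must already be defined in this module.
#
#     html = rst2html("Title\n=====\n\nHello, *world*.", theme='plain')
#     with open('out.html', 'wb') as f:
#         f.write(html)
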
def main(specification_dir, out_dir, num_gpus, exps_per_gpu):
    """Run the experiment orchestrator."""
    # 1. Load the specifications
    specs = load_specifications(specification_dir)

    # 2. Create the output directory
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if os.listdir(out_dir):
        logger.warning(
            "The output directory {} is not empty. "
            "Are you sure you want to continue?".format(out_dir))
        # time.sleep(3)

    # 3. Create the workers with specific environment variables
    num_workers = num_gpus * exps_per_gpu
    with NonDaemonPool(num_workers) as pool:
        logger.info("Created {} workers".format(num_workers))

        # Create the available device queue.
        m = multiprocessing.Manager()
        available_devices = m.Queue()
        for g in range(num_gpus):
            for _ in range(exps_per_gpu):
                available_devices.put(g)

        # 4. Create and distribute the workload
        workload = list(
            sorted([(spec, J(out_dir, spec["name"]), available_devices)
                    for spec in specs],
                   key=lambda x: (1 + 10000 * x[0]['depth']) * x[0]['width']))
        logger.info("Running {} jobs across {} GPUs".format(
            len(workload), num_gpus))

        # 5. Launch the workers.
        logger.info("Launching the workers using `run_experiment`.")
        list(pool.imap_unordered(launch_experiment_on_device, workload))
        # pool.join()
        logger.info("Success, all experiments completed!")

def main(specification_dir, start, end, info):
    # First construct the cartesian product
    if info:
        print_info()
        return

    vals = c.HYPERPARAMETERS.values()
    product = itertools.product(*vals)
    product_to_dict = [{
        k: v[i] for i, k in enumerate(c.HYPERPARAMETERS)
    } for v in product]

    # Create the specification directory
    if not E(specification_dir):
        os.makedirs(specification_dir)
    else:
        if os.listdir(specification_dir):
            logger.warning(
                "Specification directory is not empty; "
                "are you sure you want to write into it?")

    logger.info("Making specifications.")
    for i, spec in enumerate(product_to_dict):
        # Set the name
        file_name = "{}_{}".format(spec['algorithm'], i)
        spec["name"] = file_name

        alg = spec["algorithm"]
        for k in c.ALGORITHM_SPECIFIC_PARAMETERS[alg]:
            spec[k] = c.ALGORITHM_SPECIFIC_PARAMETERS[alg][k]
        spec["embedding_dim"] = spec["width"]

        for key, value in c.DEFAULT_VALUES_SPEC.items():
            if key not in spec:
                spec[key] = value

        with open(J(specification_dir, file_name + ".json"), "w") as f:
            f.write(json.dumps(spec))

    logger.info("Specifications complete.")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output_directory",
                        type=str,
                        help="The experiment output directory")
    args = parser.parse_args()

    assert E(args.output_directory), \
        "Output directory {} does not exist".format(args.output_directory)

    experiments = glob.glob(J(args.output_directory, "*"))
    for ex in tqdm.tqdm(experiments):
        try:
            losses = np.load(J(ex, "losses.npy"))
            test_perplexity, test_acc = zip(
                *np.load(J(ex, "test_performance.npy")))
            train_perplexity, train_acc = zip(
                *np.load(J(ex, "train_performance.npy")))
        except FileNotFoundError:
            continue

        ls = moving_average(losses, 50)
        plt.scatter(range(len(ls)), ls, s=1)
        plt.title("Loss")
        plt.savefig(J(ex, 'losses.png'))
        plt.clf()

        plt.figure()
        plt.plot(train_perplexity, label="Train")
        plt.plot(test_perplexity, label="Test")
        plt.legend()
        plt.title("Perplexity")
        plt.savefig(J(ex, 'perplexity.png'))
        plt.clf()

        plt.figure()
        plt.plot(train_acc, label="Train")
        plt.plot(test_acc, label="Test")
        plt.legend()
        plt.title("Accuracy")
        plt.savefig(J(ex, 'accuracy.png'))
        plt.clf()

window = 10
epochs = 20
training = False

files = [
    "rd1_train.csv",
    "rd1_testA.csv",
    # "rd2_train.csv",
    # "rd1_testB.csv",
]

if __name__ == '__main__':
    rpt_list = []
    for f in files:
        df = pd.read_csv(J(data_path, f), index_col=0)
        rpt_list += [[c for c in t.strip().split()] for t in df.desc.tolist()]
    print("number of rpt:", len(rpt_list))

    # logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    # logging.root.setLevel(level=logging.INFO)

    wv_model = Word2Vec(size=embedding_size,
                        min_count=1,
                        sg=1,
                        workers=4,
                        window=window)
    wv_model.build_vocab(rpt_list)
    if training:
        wv_model.train(rpt_list,
import yaml
import pytest

from collections import defaultdict
from os import pardir
from os.path import join as J
from os.path import dirname, realpath

from pyBabyMaker.babymaker import BabyVariable, BabyMaker, BabyConfigParser, \
    BabyVariableResolver
from pyBabyMaker.base import UniqueList
from pyBabyMaker.io.NestedYAMLLoader import NestedYAMLLoader
from pyBabyMaker.io.TupleDump import PyTupleDump

PWD = dirname(realpath(__file__))
PARDIR = J(PWD, pardir)
SAMPLE_YAML = J(PARDIR, 'samples', 'sample-babymaker.yml')
SAMPLE_ROOT = '../samples/sample.root'
SAMPLE_FRIEND = '../samples/sample_friend.root'
SAMPLE_TMPL = J(PARDIR, 'pyBabyMaker', 'cpp_templates', 'babymaker.cpp')
SAMPLE_CPP = J(PARDIR, 'samples', 'sample-babymaker.cpp')


######################
# Variable container #
######################

def test_BabyVariable_default():
    var = BabyVariable('stuff')
    assert var.input is False

def read_dataset(dataset_path):
    files = list(filter(lambda x: x.endswith('json'), os.listdir(dataset_path)))
    organizations = {}
    practitioners = {}
    patients = {}
    encounters = {}
    observations = {}
    for file in files:
        print('Reading file', file)
        with open(J(dataset_path, file), 'r') as f:
            bundle_group = json.loads(f.read())['entry']
        for bundle in bundle_group:
            local_organizations = list(
                filter(lambda x: x['resource']['resourceType'] == 'Organization',
                       bundle['entry']))
            local_practitioners = list(
                filter(lambda x: x['resource']['resourceType'] == 'Practitioner',
                       bundle['entry']))
            local_patients = list(
                filter(lambda x: x['resource']['resourceType'] == 'Patient',
                       bundle['entry']))
            local_encounters = list(
                filter(lambda x: x['resource']['resourceType'] == 'Encounter',
                       bundle['entry']))
            local_observations = list(
                filter(lambda x: x['resource']['resourceType'] == 'Observation',
                       bundle['entry']))

            if len(local_organizations) > 1:
                print("Oh no. More than one organization in bundle.")
            if len(local_practitioners) > 1:
                print("Oh no. More than one practitioner in bundle.")
            if len(local_patients) > 1:
                print("Oh no. More than one patient in bundle.")

            local_organization: Organization = Organization(
                local_organizations[0])
            local_practitioner: Practitioner = Practitioner(
                local_practitioners[0])
            local_patient: Patient = Patient(local_patients[0])
            local_encounters: List[Encounter] = list(
                map(Encounter, local_encounters))
            local_observations: List[Observation] = list(
                map(Observation, local_observations))

            if local_organization.id not in organizations:
                organizations[local_organization.id] = local_organization
            else:
                local_organization = organizations[local_organization.id]

            if local_practitioner.id not in practitioners:
                practitioners[local_practitioner.id] = local_practitioner
            else:
                local_practitioner = practitioners[local_practitioner.id]

            if local_patient.id not in patients:
                patients[local_patient.id] = local_patient
            else:
                print("Oh no. Duplicate patient.")
                local_patient = patients[local_patient.id]

            # Cross-link the patient with its organization, practitioner,
            # observations and encounters.
            local_patient.organization = local_organization.id
            local_patient.practitioner = local_practitioner.id
            local_patient.observations = list(
                map(lambda x: x.id, local_observations))
            local_patient.encounters = list(
                map(lambda x: x.id, local_encounters))

            local_organization.patients.append(local_patient.id)
            local_practitioner.patients.append(local_patient.id)
            if local_practitioner.id not in local_organization.practitioners:
                local_organization.practitioners.append(local_practitioner.id)
            if local_organization.id not in local_practitioner.organizations:
                local_practitioner.organizations.append(local_organization.id)

            for local_encounter in local_encounters:
                if local_encounter.id in encounters:
                    print("Oh no. Duplicate encounter.")
                if local_encounter.patient != local_patient.id:
                    print("Oh no. Encounter patient id does not match with "
                          "bundle patient.")
                encounters[local_encounter.id] = local_encounter

            local_encounters_ids = list(map(lambda x: x.id, local_encounters))
            for local_observation in local_observations:
                if local_observation.id in observations:
                    print("Oh no. Duplicate observation.")
                if local_observation.patient != local_patient.id:
                    print("Oh no. Observation patient id does not match with "
                          "bundle patient.")
                if local_observation.encounter not in local_encounters_ids:
                    print("Oh no. Observation encounter not found in bundle.")
                observations[local_observation.id] = local_observation

    print("Read data:\n{} Organizations\n{} Practitioners\n{} Patients\n"
          "{} Encounters\n{} Observations".format(
              len(organizations), len(practitioners), len(patients),
              len(encounters), len(observations)))
    return {
        'organizations': organizations,
        'practitioners': practitioners,
        'patients': patients,
        'encounters': encounters,
        'observations': observations,
    }

"folder_id", "fold_start", } if __name__ == "__main__": if len(sys.argv) < 3: print("args: dst src") sys.exit(-1) dst = sys.argv[1] path = sys.argv[2] assert '.json' not in path src = glob(J(path, "**", "config.json"))[0] print("SRC:", src) print("DST:", dst) with open(src, 'r', encoding='utf-8') as f: conf_src = json.load(f) with open(dst, 'r', encoding='utf-8') as f: conf_dst = json.load(f) for k, v in conf_src.items(): if k in conf_dst and k not in ignore and v != conf_dst[k]: print("CHANGE {}: {} -> {}".format(k, conf_dst[k], v)) conf_dst[k] = v
import os
import sys
from glob2 import glob
from os.path import join as J
import shutil
import pandas as pd

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("please give path and n!")
        sys.exit(-1)
    root_path = sys.argv[1]
    n = int(sys.argv[2])
    files = glob(J(root_path, "**", "info.csv"))
    for f in files:
        df = pd.read_csv(f)
        fds = df.iloc[:, 0].tolist()
        if len(fds) <= n:
            continue
        path = os.path.split(f)[0]
        # Keep the first n folders listed in info.csv, remove the rest.
        for t in fds[n:]:
            fd = J(path, t)
            print("REMOVE", fd)
            try:
                shutil.rmtree(fd)
            except Exception as e:
                print(e)

import os
from os.path import join as J

from time import time, sleep
import datetime
from redis import Redis
import smtplib
from smtplib import SMTPSenderRefused
import logging
import traceback
import logging.handlers
import cPickle as pickle
from subprocess import PIPE, Popen
from multiprocessing import Process
from ConfigParser import ConfigParser, NoSectionError

WORK_DIR = J(os.environ['HOME'], '.mangop')

################## default settings for checker.conf ##############
#
CFG_DEFAULTS = {
    'time_to_wakeup': 90,
    'mangos_dir': '/home/mangos/bin/used_rev/bin/',
    'mangos_log_dir': '/var/log/mangos/',
    'run_socket_path': J(WORK_DIR, 'run.sock'),
    'mangosd_bin': 'mangosd',
    'realmd_bin': 'realmd',
    'redis_port': 6379,
    'redis_host': 'localhost',
    'smtp_host': 'localhost',
    'smtp_from': '*****@*****.**',
def main(args):
    for name in args.names:
        download_manuscript(args.pages, args.resolution,
                            J(args.base_dir, str(args.resolution)), name)

def train(self,
          F,
          model,
          dl_tr,
          dl_val=None,
          forward_batch_fun=None,
          get_loss_fun=None,
          eval_fun=None,
          step_fun=None,
          hold_best_model=False,
          optimizer=None,
          verbose=1,
          stop_cond=None,
          lr_scheduler=None,
          **kws):
    self.best_score = None
    self.best_epoch = None
    self.best_model = None
    self.swa_model = None
    self.n_avg = 0
    if lr_scheduler is not None:
        assert optimizer is not None
    if forward_batch_fun is None:
        forward_batch_fun = _forward_batch
    if get_loss_fun is None:
        get_loss_fun = _get_loss
    if eval_fun is None:
        eval_fun = _eval_model
    if step_fun is None:
        step_fun = _train_step
    old_flag = copy.deepcopy(F.__dict__)
    dl_val = dl_tr if dl_val is None else dl_val

    ignore_keys = None
    if F.not_save_keys_file is not None:
        with open(F.not_save_keys_file, 'r') as f:
            ignore_keys = [
                t.strip() for t in f.readlines() if len(t.strip()) > 0
            ]

    if optimizer is not None:
        F.optimizer = optimizer.__class__.__name__
    else:
        if F.optimizer.lower() == "sgd":
            optimizer = torch.optim.SGD(lr=F.lr,
                                        params=model.parameters(),
                                        momentum=F.momentum,
                                        weight_decay=F.weight_decay)
        elif F.optimizer.lower() == "adam":
            optimizer = torch.optim.Adam(lr=F.lr,
                                         params=model.parameters(),
                                         weight_decay=F.weight_decay)
        elif F.optimizer.lower() == "adamw":
            optimizer = torch.optim.AdamW(lr=F.lr,
                                          params=model.parameters(),
                                          weight_decay=F.weight_decay)
        else:
            print("optimizer not found or not supported!")
            sys.exit(-1)

    if F.resume_path is not None:
        optimizer.load_state_dict(
            torch.load(J(F.resume_path, "optimizer.pth")))
        model.load_state_dict(torch.load(J(F.resume_path, "model.pth")),
                              strict=False)
        F.start_epoch = json.load(
            open(J(F.resume_path, "info.json"), mode='r',
                 encoding='utf-8'))['epoch'] + 1
        if lr_scheduler is not None:
            lr_scheduler.load_state_dict(
                torch.load(J(F.resume_path, "lr_scheduler.pth")))

    L = Logger(verbose=verbose)
    if F.folder_id is None:
        F.folder_id = "={}=".format(get_name())
    else:
        F.folder_id = "={}_{}=".format(get_name(), F.folder_id)
    if F.resume_path is not None:
        logging_path = J(F.logging_path,
                         "={}=".format(F.resume_path.split('=')[1]))
    else:
        logging_path = J(F.logging_path, F.folder_id)
    if F.enable_logging:
        if not os.path.exists(logging_path):
            os.mkdir(logging_path)
        L.add_file_handler(J(logging_path, 'log.txt'), mode='a')
    if F.resume_path is not None:
        saving_path = os.path.split(F.resume_path)[0]
    else:
        saving_path = J(F.saving_path, F.folder_id)
    if F.enable_saving and not os.path.exists(saving_path):
        os.mkdir(saving_path)
    F.logging_path = logging_path
    F.saving_path = saving_path
    for k, v in F.__dict__.items():
        L.info("{} = {}".format(k, v))
    if F.enable_saving:
        save_config(F, J(F.saving_path, 'config.json'))
    if F.use_swa:
        swa_scheduler = SWALR(optimizer, swa_lr=F.swa_lr)

    with get_logger(logging_path) as L2, \
            get_saver(saving_path,
                      num_best=F.save_num_best,
                      mode=F.save_mode,
                      every_epochs=F.save_every_epochs) as S:
        L2.disabled = not F.enable_logging
        S.disabled = not F.enable_saving
        _best = -np.inf if F.higher_better else np.inf
        _num = 0
        _best_epoch = 1
        for epoch in range(F.start_epoch, F.epochs + 1):
            model.train()
            for it, batch in enumerate(dl_tr):
                loss, sc = step_fun(F, model, optimizer, batch,
                                    forward_batch_fun, get_loss_fun, **kws)
                L.debug("[{}/{}][{}/{}] - {}".format(
                    epoch, F.epochs, it + 1, len(dl_tr), " - ".join(
                        ["{}: {:.3f}".format(k, v) for k, v in sc.items()])))
                L2.write(data=sc, step=(epoch - 1) * len(dl_tr) + it + 1)

            if not F.use_swa or epoch < F.swa_start:
                model.eval()
                score = eval_fun(F, model, dl_val, forward_batch_fun, **kws)
            else:
                self.update_parameters(model)
                self.swa_model.eval()
                score = eval_fun(F, self.swa_model, dl_val,
                                 forward_batch_fun, **kws)
                swa_scheduler.step()
                score['lr'] = swa_scheduler.get_lr()[0]
            if lr_scheduler is not None and epoch < F.swa_start:
                if lr_scheduler.__class__.__name__ == "ReduceLROnPlateau":
                    lr_scheduler.step(score[F.primary_score])
                else:
                    lr_scheduler.step()
                score['lr'] = lr_scheduler.get_lr()[0]
            L.info("[{}/{}][{}/{}] - {}".format(
                epoch, F.epochs, len(dl_tr), len(dl_tr), " - ".join(
                    ["{}: {:.3f}".format(k, v) for k, v in score.items()])))
            L2.write(data=score, step=epoch * len(dl_tr))

            save_state = {
                F.save_model_name:
                model if not F.use_swa or epoch < F.swa_start else self.swa_model,
                'optimizer': optimizer
            }
            if lr_scheduler is not None:
                save_state['lr_scheduler'] = lr_scheduler
            save_info = {'epoch': epoch, **score}
            S.check(
                save_state,
                cost=-score[F.primary_score]
                if F.higher_better else score[F.primary_score],
                epoch=epoch,
                info=save_info,
                ignore_keys=ignore_keys,
            )
            if F.enable_saving and F.save_last:
                S.save_model(
                    save_state,
                    "last",
                    info=save_info,
                    ignore_keys=ignore_keys,
                )

            if (F.higher_better and score[F.primary_score] > _best) or (
                    not F.higher_better and score[F.primary_score] < _best):
                _best = score[F.primary_score]
                _num = 0
                _best_epoch = epoch
                self.best_score = score
                self.best_epoch = _best_epoch
                if hold_best_model:
                    self.best_model = copy.deepcopy(model)
            else:
                _num += 1
            if F.early_stop and _num == F.early_stop_num:
                L.info(
                    '>>>>>>>> Meet early-stopping, the best score is {} on epoch {} <<<<<<<<'
                    .format(_best, _best_epoch))
                break
            if stop_cond is not None and stop_cond(score):
                L.info(
                    '>>>>>>>> Meet cond-stopping, the best score is {} on epoch {} <<<<<<<<'
                    .format(_best, _best_epoch))
                break
        if F.early_stop and _num < F.early_stop_num:
            L.info(
                '>>>>>>>> Did not meet early-stopping! The best score is {} on epoch {} <<<<<<<<'
                .format(_best, _best_epoch))
        if stop_cond is not None and not stop_cond(score):
            L.info('>>>>>>>> Did not meet cond-stopping! <<<<<<<<')
    L.clear()
    for k, v in old_flag.items():
        setattr(F, k, v)

def download_page(resolution, base_dir, manuscript, page):
    ''' Download single page into base_dir/manuscript/page directory.

        There will be a bunch of block files that you will need to
        concatenate later.
    '''
    mkpath(J(base_dir, manuscript, page))

    # First download an image block that is out of range to see how such an
    # image looks (this is used to detect edges later).
    nil_block = _session.get(
        URL_IMAGE_BLOCK.format(manuscript_and_page=page,
                               resolution=resolution,
                               column=999,
                               row=999))

    column, row = 0, 0
    max_column, max_row = 0, 0
    while True:
        filename = J(base_dir, manuscript, page,
                     '{0}_{1}_{2}.jpg'.format(page, row, column))
        # print('Getting block {0}x{1}'.format(row, column))
        url = URL_IMAGE_BLOCK.format(manuscript_and_page=page,
                                     resolution=resolution,
                                     column=column,
                                     row=row)
        try:
            download_block(url, filename, nil_block)
        except BlockAlreadyDownloaded:
            max_row = max(row, max_row)
            max_column = max(column, max_column)
            column += 1
            put('.')
            continue
        except BlockInvalid:
            put('\n')
            # We are out of range
            if column == 0:
                # The end of the page
                print('End of the page')
                print('Page {0} has size row x column = {1} x {2}'.format(
                    page, max_row, max_column))
                break
            else:
                # The end of the row: reset column, increment row
                column = 0
                row += 1
                continue
        except BlockMaxRetriesReached:
            put('X')
        else:
            put('.')
            # Update page size
            max_row = max(row, max_row)
            max_column = max(column, max_column)

        # Go to next column
        column += 1
    return max_column, max_row

def grid_search(
    F,
    T,
    model,
    dl_tr,
    dl_val,
    params,
    fast=True,
    out_path='.',
    verbose=-1,
    **kws,
):
    assert type(params) == dict
    for k in params:
        assert k in F.__dict__
    F.enable_logging = False
    F.enable_saving = False

    # Start from the median value of every searched parameter.
    mid_params = {k: np.median(v) for k, v in params.items()}
    best_params = copy.deepcopy(mid_params)
    curr_params = copy.deepcopy(mid_params)
    for k, v in mid_params.items():
        setattr(F, k, v)

    recd = []
    _best = -np.inf if F.higher_better else np.inf
    for key, values in params.items():
        for v in values:
            print(bcolors.OKGREEN +
                  "****** training when {} = {}".format(key, v) + bcolors.ENDC)
            curr_params[key] = v
            setattr(F, key, v)
            T.train(
                F,
                model,
                dl_tr,
                dl_val,
                verbose=verbose,
                **kws,
            )
            curr_params['score'] = T.best_score[F.primary_score]
            recd.append(copy.deepcopy(curr_params))
            print(bcolors.OKGREEN +
                  ">>>>>> finish when {} = {}, best = {}".format(
                      key, v, curr_params['score']) + bcolors.ENDC)
            if (F.higher_better and T.best_score[F.primary_score] > _best) or (
                    not F.higher_better and
                    T.best_score[F.primary_score] < _best):
                _best = T.best_score[F.primary_score]
                best_params[key] = v
        # Fix the best value found for this parameter before moving on.
        curr_params[key] = best_params[key]
        setattr(F, key, best_params[key])

    print("=" * 60)
    for k, v in best_params.items():
        print(bcolors.OKCYAN + "{} = {}".format(k, v) + bcolors.ENDC)
    print(bcolors.OKCYAN + "best = {}".format(_best) + bcolors.ENDC)
    recd = pd.DataFrame(recd).sort_values(
        'score', ascending=not F.higher_better).reset_index(drop=True)
    recd.to_csv(J(out_path, "{}_record.csv".format(get_name())))

import os
import sys
from os.path import join as J


def we_are_frozen():
    # All of the modules are built-in to the interpreter, e.g., by py2exe
    return hasattr(sys, "frozen")


def module_path():
    encoding = sys.getfilesystemencoding()
    if we_are_frozen():
        return os.path.dirname(sys.executable)
    return os.path.dirname(__file__)


ASSETS_DIR = os.path.abspath(J(os.path.dirname(__file__), '..', 'assets'))

J = os.path.join
E = os.path.exists

BASE_DIR = os.environ.get('MINERL_OUTPUT_ROOT',
                          os.path.expanduser(J('~', 'minerl.data')))
RENDERERS_DIR = os.path.expanduser(J('~', 'renderers'))

NUM_MINECRAFTS = 28

OUTPUT_DIR = J(BASE_DIR, 'output')
DOWNLOAD_DIR = J(BASE_DIR, 'downloaded_sync')
BUCKET_NAME = 'pizza-party'

import os
import sys
import json
import numpy as np
from glob2 import glob
from os.path import join as J

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("please give path!")
        sys.exit(-1)
    root_path = sys.argv[1]
    files = glob(J(root_path, "**", "info.json"))
    d = []
    for f in files:
        with open(f) as ff:
            d.append(json.load(ff))
    # Average every metric across all found info.json files.
    mean_score = {k: np.mean([t[k] for t in d]) for k in d[0].keys()}
    with open(J(root_path, "mean_score.json"), 'w') as f:
        json.dump(mean_score, f, indent=4)
    for k, v in mean_score.items():
        print("{} = {}".format(k, v))

def run_experiment(spec, experiment_directory):
    """Runs an experiment based on the desired experiment specification.

    This process will record the desired response variables and write them
    to the experiment directory.

    Args:
        spec (dict): The JSON object specifying the experiment to run.
        experiment_directory (str): The directory path to which to write
            the response variables.
    """
    # spec, experiment_directory = args
    # Unpack some of the specification information
    try:
        spec = set_spec_default_values(spec)
        algorithm = spec["algorithm"]
        batch_size = spec['batch_size']
        bptt_len = spec['bptt_len']
        device = spec['device']
        hmm_hidden = spec['hmm_hidden']
        max_step = spec['max_step']
        name = spec['name']
        sequence_dependence = spec['sequence_dependence']
        vocab = spec['vocab']
        # Unpack additional arguments <here>
    except KeyError:
        print("Invalid experiment specification: {}".format(spec))
        raise

    logging.basicConfig(level=logging.DEBUG)
    # filename=J(experiment_directory, 'out.log'),
    # filemode='w')
    logger = logging.getLogger('exp_runner')
    logger.info("Starting the experiment!")
    logger.info(str(spec))

    # Create the directory
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
    else:
        assert c.EXPERIMENT_RUNNER_SHOULD_OVERWRITE, \
            "Experiment directory {} already exists".format(experiment_directory)

    # Output a copy of the experiment specification
    with open(J(experiment_directory, 'params.json'), 'w') as f:
        json.dump(spec, f)

    # Choose sequence model type
    if algorithm == 'transformer':
        sequence_model = TransformerXL(**spec)
    elif algorithm == 'lstm':
        sequence_model = LSTMModel(**spec)
    elif algorithm == 'cnn':
        sequence_model = GatedCNN(**spec)
    else:
        print(spec)

    # TODO: loop over training files/algorithm specification
    ROOT_PATH = 'generated_data'
    DATA_FILE = 'V{}_hmm_hidden_{}_lag_{}_vocab_{}.txt'.format(
        c.DATA_GENERATION_VERSION, hmm_hidden, sequence_dependence, vocab)
    train_file = 'train_' + DATA_FILE
    test_file = 'test_' + DATA_FILE

    device = torch.device(device)

    # Create dataset iterators
    train_iter, test_iter = torchtext_batch_iterators(
        ROOT_PATH, train_file, test_file,
        batch_size=batch_size, bptt_len=bptt_len, device=device,
        batch_first=True, repeat=False)
    train_perplex_iter, test_perplex_iter = torchtext_batch_iterators(
        ROOT_PATH, train_file, test_file,
        batch_size=batch_size, bptt_len=bptt_len, device=device,
        batch_first=True, repeat=False)

    # Model
    model = sequence_model.get_model()
    optimizer = sequence_model.get_optimizer()
    scheduler = sequence_model.get_scheduler()

    max_step = spec['max_step']
    eval_steps = spec["eval_steps"]

    train_step = 0
    train_loss = 0
    best_val_loss = None
    losses = []
    test_performance = []
    train_performance = []
    step_to_performance = []
    num_steps = 0

    # Training Loop
    tqdm_out = TqdmLogger(logger, level=logging.INFO)
    progress = tqdm.tqdm(total=max_step)
    try:
        for epoch in itertools.count(start=1):
            model.train()
            mems = tuple()
            print()
            for train_step, batch in enumerate(train_iter):
                num_steps += 1
                progress.update()
                loss = sequence_model.train_step(batch.text, batch.target,
                                                 mems=mems)
                losses.append(loss)
                progress.set_description("Loss {:.4f}".format(loss))

                # Update scheduler
                sequence_model.update_scheduler(num_steps)

                if num_steps % 500 == 0:
                    progress.write("Saving loss performance!")
                    np.save(J(experiment_directory, 'losses.npy'), losses)
                    np.save(J(experiment_directory, 'test_performance.npy'),
                            test_performance)
                    np.save(J(experiment_directory, 'train_performance.npy'),
                            train_performance)
                    np.save(J(experiment_directory, 'step_to_performance.npy'),
                            step_to_performance)

                if num_steps % 1000 == 0:
                    # Calculate perplexity
                    progress.write("-" * 100)
                    progress.write("Model Performance:")
                    test_performance.append(
                        evaluate_model(sequence_model, test_perplex_iter, 2000,
                                       vocab))
                    train_performance.append(
                        evaluate_model(sequence_model, train_perplex_iter,
                                       1000, vocab))
                    step_to_performance.append(num_steps)
                    progress.write(
                        "Test (Perplex, Accuracy): {:.6f}, {:.6f}".format(
                            *test_performance[-1]))
                    progress.write(
                        "Train (Perplex, Accuracy): {:.6f}, {:.6f}".format(
                            *train_performance[-1]))
                    progress.write("Average loss (past 1000): {}".format(
                        np.mean(losses[-1000:])))

                if num_steps >= max_step:
                    break

            if num_steps >= max_step:
                progress.write('-' * 100)
                progress.write('End of training')
                break

            # if val_loss is None or val_loss < best_val_loss:
            #     best_val_loss = val_loss
            #     # TODO: save the best performing model so far (and its stats)

    except KeyboardInterrupt:
        logger.info('-' * 100)
        logger.info('Exiting from training early')
        raise

    lab = df.label.map(lambda t: func(t, NUM_LABEL))
    lab = pd.DataFrame(np.stack(lab.values), index=lab.index)
    ret['label'] = lab
    df.drop('label', inplace=True, axis=1)
    if "label2" in df.columns:
        lab = df.label2.map(lambda t: func(t, NUM_LABEL2))
        lab = pd.DataFrame(np.stack(lab.values), index=lab.index)
        ret['label2'] = lab
        df.drop('label2', inplace=True, axis=1)
    ret['desc'] = df
    return ret


print("preprocessing data ...")
ret = process_data(J(data_path, rd1_train_name))
ret['desc'].to_csv(J(out_path, "rd1_train.csv"))
ret['label'].to_csv(J(out_path, "rd1_train_label.csv"))

ret = process_data(J(data_path, rd1_testA_name))
ret['desc'].to_csv(J(out_path, "rd1_testA.csv"))

ret = process_data(J(data_path, rd1_testB_name))
ret['desc'].to_csv(J(out_path, "rd1_testB.csv"))

ret = process_data(J(data_path, rd2_train_name))
ret['desc'].to_csv(J(out_path, "rd2_train.csv"))
ret['label'].to_csv(J(out_path, "rd2_train_label.csv"))
ret['label2'].to_csv(J(out_path, "rd2_train_label2.csv"))

try:
def getdatalist(train=True):
    if train:
        return open(trainlist, 'r').readlines()
    else:
        return open(testlist, 'r').readlines()


def getMapper(idxpath="/data/keshav/ucf/ucflist/classInd.txt"):
    indexes = [*map(lambda x: x.strip(), open(idxpath, 'r').readlines())]
    return bidict({y: torch.tensor([int(x) - 1])
                   for x, y in map(lambda i: i.split(), indexes)})


mapper = getMapper()


def randomSequenceChunk(x, n):
    start = random.randint(0, len(x) - n)
    end = start + n
    return x[start:end]


# x is a single instance from open(testlist, 'r').readlines()
getactualtestpath = lambda testpath: J(datapath, testpath.strip().replace('.avi', ''))
getactualtrainpath = lambda trainpath: J(datapath, trainpath.split('.avi ')[0])
getframesfrompath = lambda x, n, pathgetter: randomSequenceChunk(
    order([*Path(pathgetter(x)).glob("*.jpg")]), n)
getactualpath = {True: getactualtrainpath, False: getactualtestpath}


# path: an instance from trainlist or testlist.
# Returns n random frames in sequence from a video.
def getXorY(path, n=10, train=True):
    frames = getframesfrompath(path, n, getactualpath.get(train))
    label = mapper.get(frames[0].parent.parent.name)
    return data(frames=frames, label=label)

        seq_pad_meth=F.seq_pad_meth,
        seq_mask_ratio=0,
        seq_rep_prob=0,
        token_range=token_range,
    ),
)

model = crt_model(F).to(device)
if F.pretrain_model_file is not None:
    if not os.path.isdir(F.pretrain_model_file):
        model.load_state_dict(torch.load(F.pretrain_model_file), strict=False)
    else:
        model.load_state_dict(
            torch.load(glob(J(F.pretrain_model_file, "**", "last",
                              "model.pth"))[0]),
            strict=False)

# base_opt = torch.optim.AdamW(lr=F.lr, params=model.parameters(),
#                              weight_decay=F.weight_decay)
# lookahead = Lookahead(base_opt, k=5, alpha=0.5)
# lr_scheduler = LambdaLR(base_opt, lr_lambda=lambda epoch: warmup_only(epoch))
# lr_scheduler = CosineAnnealingWarmRestarts(base_opt, T_0=F.T_0, T_mult=1)

T.train(
    F,
    model,
    dl_tr,
    dl_val,
    forward_batch_fun=forward_batch_fun,
    hold_best_model=False,
    stop_cond=lambda sc: sc['val_score'] > F.val_score_limit,