def get_whatrequires(pkg, yum_conf):
    """
    Write list of packages.

    Write packages that require the current package to a file using
    dnf repoquery --whatrequires and --recursive commands.
    """
    # clean up dnf cache to avoid 'no more mirrors repo' error
    try:
        subprocess.check_output([
            'dnf', '--config', yum_conf, '--releasever', 'clear', 'clean', 'all'
        ])
    except subprocess.CalledProcessError as err:
        util.print_warning("Unable to clean dnf repo: {}, {}".format(pkg, err))
        return

    try:
        out = subprocess.check_output([
            'dnf', 'repoquery', '--config', yum_conf, '--releasever', 'clear',
            '--archlist=src', '--recursive', '--queryformat=%{NAME}',
            '--whatrequires', pkg
        ]).decode('utf-8')
    except subprocess.CalledProcessError as err:
        util.print_warning(
            "dnf repoquery whatrequires for {} failed with: {}".format(pkg, err))
        return

    util.write_out(
        'whatrequires', '# This file contains recursive sources that '
        'require this package\n' + out)
def attempt_key_import(keyid):
    print(SEPT)
    ig = InputGetter(
        '\nDo you want to attempt to import keyid {}: (y/N) '.format(keyid))
    import_key_answer = ig.get_answer()
    if import_key_answer in [None, False]:
        return False
    with cli_gpg_ctx() as ctx:
        err, _ = ctx.import_key(keyid)
        if err is not None:
            print_error(err.strerror)
            return False
        err, key_content = ctx.export_key(keyid)
        if err is not None:
            print_error(err.strerror)
            return False
        key_fullpath = PUBKEY_PATH.format(keyid)
        util.write_out(key_fullpath, key_content)
        print('\n')
        print_success('Public key id: {} was imported'.format(keyid))
        err, content = ctx.display_keyinfo(key_fullpath)
        if err is not None:
            print_error('Unable to parse {}, will be removed'.format(key_fullpath))
            os.unlink(key_fullpath)
            return False
        print('\n', '\n'.join(content.split('\n')[:10]))
        ig = InputGetter(message='\nDo you want to keep this key: (Y/n) ',
                         default='y')
        if ig.get_answer() is True:
            return True
        else:
            os.unlink(key_fullpath)
            return False
def compute_loss(self, minibatch, processed_minibatches, minimum_updates):
    (original_aa_string, actual_coords_list, _) = minibatch
    emissions, _backbone_atoms_padded, _batch_sizes = \
        self._get_network_emissions(original_aa_string)
    actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(actual_coords_list)
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    if isinstance(_batch_sizes[0], int):
        _batch_sizes = torch.tensor(_batch_sizes)
    emissions_actual, _ = \
        calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                 _batch_sizes,
                                                 self.use_gpu)
    drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                              actual_coords_list_padded,
                                              _batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    # Weight the dRMSD term at 0.4, ramping it up linearly over the first
    # 40% of minimum_updates.
    multiplier = 0.4
    if processed_minibatches < minimum_updates * (40 / 100):
        multiplier = processed_minibatches / minimum_updates
    normalized_angular_loss = angular_loss / 5
    normalized_drmsd_avg = drmsd_avg / 100
    return (normalized_drmsd_avg * multiplier) + \
        (normalized_angular_loss * (1 - multiplier))
def main(): parser = argparse.ArgumentParser(description="OpenProtein version 0.1") parser.add_argument('--silent', dest='silent', action='store_true', help='Dont print verbose debug statements.') parser.add_argument('--hide-ui', dest='hide_ui', action='store_true', default=False, help='Hide loss graph and ' 'visualization UI while training goes on.') parser.add_argument('--evaluate-on-test', dest='evaluate_on_test', action='store_true', default=False, help='Run model of test data.') parser.add_argument('--use-gpu', dest='use_gpu', action='store_true', default=False, help='Use GPU.') parser.add_argument( '--eval-interval', dest='eval_interval', type=int, default=10, help='Evaluate model on validation set every n minibatches.') parser.add_argument('--min-updates', dest='minimum_updates', type=int, default=100, help='Minimum number of minibatch iterations.') parser.add_argument('--minibatch-size', dest='minibatch_size', type=int, default=8, help='Size of each minibatch.') parser.add_argument('--experiment-id', dest='experiment_id', type=str, default="example", help='Which experiment to run.') args, _ = parser.parse_known_args() if args.hide_ui: write_out("Live plot deactivated, see output folder for plot.") use_gpu = args.use_gpu if use_gpu and not torch.cuda.is_available(): write_out("Error: --use-gpu was set, but no GPU is available.") sys.exit(1) if not args.hide_ui: # start web server start_dashboard_server() experiment = importlib.import_module("experiments." + args.experiment_id) experiment.run_experiment(parser, use_gpu)
def check_regression(pkg_dir, skip_tests, test_round):
    """Check the build log for test regressions using the count module."""
    if skip_tests:
        return

    log_path = os.path.join(pkg_dir, 'results', 'build.log')
    result = count.parse_log(log_path)
    if len(result) == 0 or result[0:2] == ',0':
        log_path = os.path.join(pkg_dir, 'results', f"round{test_round}-build.log")
        result = count.parse_log(log_path)

    titles = [('Package', 'package name', 1),
              ('Total', 'total tests', 1),
              ('Pass', 'total passing', 1),
              ('Fail', 'total failing', 0),
              ('Skip', 'tests skipped', 0),
              ('XFail', 'expected fail', 0)]

    res_str = ""
    for line in result.strip('\n').split('\n'):
        s_line = line.split(',')
        for idx, title in enumerate(titles):
            if s_line[idx]:
                if (s_line[idx] != '0') or (title[2] > 0):
                    print("{}: {}".format(title[1], s_line[idx]))
                res_str += "{} : {}\n".format(title[0], s_line[idx])

    util.write_out(os.path.join(pkg_dir, "testresults"), res_str)
def compute_loss(self, minibatch):
    (original_aa_string, actual_coords_list, _) = minibatch
    emissions, _backbone_atoms_padded, _batch_sizes = \
        self._get_network_emissions(original_aa_string)
    actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(actual_coords_list)
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    if isinstance(_batch_sizes[0], int):
        _batch_sizes = torch.tensor(_batch_sizes)
    emissions_actual, _ = \
        calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                 _batch_sizes,
                                                 self.use_gpu)
    # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
    #                                           actual_coords_list_padded,
    #                                           batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        # drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    return angular_loss  # + drmsd_avg
def write_upstream(sha, tarfile, mode="w"):
    """Write the upstream hash to the upstream file."""
    write_out(os.path.join(build.download_path, "upstream"),
              os.path.join(sha, tarfile) + "\n", mode=mode)
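# Hypothetical usage sketch (values assumed, not from the original source): the call
# records a single "<sha>/<tarfile>" line, overwriting the file in the default mode.
#   write_upstream("0f2e4d...", "pkg-1.2.3.tar.gz", mode="w")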
def attempt_key_import(keyid, key_fullpath):
    """Ask user to import key."""
    global IMPORTED
    print(SEPT)
    ig = InputGetter('\nDo you want to attempt to import keyid {}: (y/N) '.format(keyid))
    import_key_answer = ig.get_answer()
    if import_key_answer in [None, False]:
        return False
    with cli_gpg_ctx() as ctx:
        err, _ = ctx.import_key(keyid)
        if err is not None:
            util.print_error(err.strerror)
            return False
        err, key_content = ctx.export_key(keyid)
        if err is not None:
            util.print_error(err.strerror)
            return False
        util.write_out(key_fullpath, key_content)
        print('\n')
        util.print_success('Public key id: {} was imported'.format(keyid))
        err, content = ctx.display_keyinfo(key_fullpath)
        if err is not None:
            util.print_error('Unable to parse {}, will be removed'.format(key_fullpath))
            os.unlink(key_fullpath)
            return False
        print("\n", content)
        ig = InputGetter(message='\nDo you want to keep this key: (Y/n) ', default='y')
        if ig.get_answer() is True:
            IMPORTED = content
            return True
        else:
            os.unlink(key_fullpath)
            return False
def compute_loss(self, minibatch):
    (original_aa_string, actual_coords_list, _) = minibatch

    if any(np.isnan(x.cpu().detach().numpy()).any() for x in original_aa_string) or \
            any(np.isnan(x.cpu().detach().numpy()).any() for x in actual_coords_list):
        return None

    emissions, _backbone_atoms_padded, _batch_sizes = \
        self._get_network_emissions(original_aa_string)
    assert not np.isnan(emissions.cpu().detach().numpy()).any()
    actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn \
        .pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(actual_coords_list))
    assert not np.isnan(actual_coords_list_padded.cpu().detach().numpy()).any()
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    emissions_actual, _ = \
        calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                 batch_sizes_coords,
                                                 self.use_gpu)
    # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
    #                                           actual_coords_list_padded,
    #                                           batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        # drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    return angular_loss  # + drmsd_avg
def __iter__(self):
    data_class_map = {0: [], 1: [], 2: [], 3: []}

    for idx in self.sampler:
        data_class_map[self.dataset[idx][4]].append(idx)

    num_each_class = int(self.batch_size / 4)
    max_class_size = max([len(data_class_map[0]),
                          len(data_class_map[1]),
                          len(data_class_map[2]),
                          len(data_class_map[3])])
    batch_num = int(max_class_size / num_each_class)
    if max_class_size % num_each_class != 0:
        batch_num += 1

    batch_relative_offset = (1.0 / float(batch_num)) / 2.0
    batches = []
    for _ in range(batch_num):
        batch = []
        for _class_id, data_rows in data_class_map.items():
            int_offset = int(batch_relative_offset * len(data_rows))
            batch.extend(sample_at_index(data_rows, int_offset, num_each_class))
        batch_relative_offset += 1.0 / float(batch_num)
        batches.append(batch)

    random.shuffle(batches)

    for batch in batches:
        write_out("Using minibatch from RandomBatchClassBalancedSequentialSampler")
        yield batch
def write_default_conf_file(name, description):
    """Write default configuration file with description to file name."""
    config_files.add(name)
    filename = os.path.join(path, name)
    if os.path.isfile(filename):
        return

    write_out(filename, wrapper.fill(description) + "\n")
def run_on_everything():
    """Stupidly long exhaustive search of each timestamp.

    Note: Since it takes more than a second to exhaust a timestamp (currently),
    this loop will not finish before needing more than a 4 byte timestamp.
    """
    for unix_time in range(0x00000000, 0xffffffff + 1):
        success = distribute(unix_time)
        if not success:
            write_out("ErrorLog.txt", "Failed on {}".format(unix_time))
def merge_samples_to_minibatch(samples):
    samples_list = []
    for s in samples:
        samples_list.append(s)
    # sort according to length of aa sequence
    samples_list.sort(key=lambda x: len(x[7]), reverse=True)
    aa_list, labels_list, remapped_labels_list_crf_hmm, remapped_labels_list_crf_marg, \
        prot_type_list, prot_topology_list, prot_name, original_aa_string, \
        original_label_string = zip(*samples_list)
    write_out(prot_type_list)
    return aa_list, labels_list, remapped_labels_list_crf_hmm, remapped_labels_list_crf_marg, \
        prot_type_list, prot_topology_list, prot_name, original_aa_string, \
        original_label_string
def construct_model(model_parameters, embedding_size, use_gpu, minibatch_size):
    model_type = model_parameters["architecture"]
    mixture_size = model_parameters["output_size"]
    dropout = model_parameters["dropout"]
    model = None
    soft_max_to_angle = models.soft_to_angle(mixture_size)
    if model_type == "rnn":
        model = ExampleModel(embedding_size, minibatch_size, use_gpu,
                             dropout=dropout, mixture_size=mixture_size,
                             hidden_size=model_parameters["hidden_size"])
    elif model_type == "cnn" or model_type == "cnn_angles":
        num_layers = model_parameters["layers"]
        channels = [embedding_size] + model_parameters["channels"][:num_layers - 1] + [mixture_size]
        kernels = model_parameters["kernel"] * num_layers
        paddings = model_parameters["padding"] * num_layers
        stride = model_parameters["stride"] * num_layers
        dilation = model_parameters["dilation"] * num_layers
        spatial_dropout = model_parameters["spatial_dropout"]
        layers = []
        for i in range(num_layers):
            params = (channels[i], channels[i + 1], kernels[i], paddings[i],
                      stride[i], dilation[i])
            layers.append(params)
        if model_type == "cnn_angles":
            soft_max_to_angle = None
            model = CNNBaseModelAngles(embedding_size, layers, minibatch_size,
                                       use_gpu, mixture_size=mixture_size)
        else:
            model = CNNBaseModel(embedding_size, layers, minibatch_size, use_gpu,
                                 dropout=dropout, mixture_size=mixture_size,
                                 spatial_dropout=spatial_dropout)
    elif model_type == "resnet":
        resnet_type = model_parameters["resnet_type"]
        kernel = model_parameters["kernel"]
        padding = model_parameters["padding"]
        stride = model_parameters["stride"]
        droprate = model_parameters["dropout"] * 5
        parameters = {
            "input_channels": embedding_size,
            "out_channels": mixture_size,
            "kernel": kernel,
            "padding": padding,
            "stride": stride,
            "use_gpu": use_gpu,
            "droprate": droprate
        }
        model_func = PResNet.name_dict.get(resnet_type, None)
        if model_func is None:
            write_out('RESNET TYPE NOT SUPPORTED, PLEASE USE A SUPPORTED TYPE '
                      '[resnet18, resnet34, resnet50, resnet101, resnet152] '
                      'BY SPECIFYING "resnet_type" IN CONFIG FILE')
            exit()
        model = model_func(**parameters)
    else:
        write_out("MODEL TYPE NOT RECOGNIZED, PLEASE USE A SUPPORTED ARCHITECTURE "
                  "IN CONFIG FILE [cnn, cnn_angles, resnet, rnn]")
        exit()
    return openprotein.BaseModel(use_gpu, mixture_size, model, soft_max_to_angle)
def track_best_hash(oven):
    """Watches the output queue and reports the best hash found for each process.

    The global best is stored.
    """
    best = ("1", "1", "1")
    for attempt in iter(oven.get, STOP):
        print(attempt)
        if attempt[0] < best[0]:
            print("NEW BEST")
            best = attempt
            write_out("best_found.txt", best)
def embed(data, batch_sizes, device):
    # one-hot encoding
    start_compute_embed = time.time()
    prot_aa_list = data.unsqueeze(1)
    embed_tensor = torch.zeros(prot_aa_list.size(0), 21,
                               prot_aa_list.size(2)).to(device)  # 21 classes
    # prot_aa_list.to(device)  # should already be embedded.
    input_sequences = embed_tensor.scatter_(1, prot_aa_list.data, 1).transpose(1, 2)
    end = time.time()
    write_out("Embed time:", end - start_compute_embed)
    packed_input_sequences = rnn_utils.pack_padded_sequence(
        input_sequences, batch_sizes)
    return packed_input_sequences
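# Minimal sketch of the scatter_-based one-hot encoding used above (toy values assumed,
# not from the original source):
#   import torch
#   data = torch.tensor([[3, 7], [1, 0]])                # (seq_len=2, batch=2) residue ids in [0, 20]
#   idx = data.unsqueeze(1)                              # (2, 1, 2)
#   one_hot = torch.zeros(2, 21, 2).scatter_(1, idx, 1)  # one-hot along the class dimension
#   one_hot = one_hot.transpose(1, 2)                    # (seq_len, batch, 21), ready for packing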
def __init__(self, pubkey=None, home=None):
    _gpghome = home
    if _gpghome is None:
        _gpghome = tempfile.mkdtemp(prefix='tmp.gpghome')
    os.environ['GNUPGHOME'] = _gpghome
    self.args = ['gpg', '--homedir', _gpghome]
    util.write_out(os.path.join(_gpghome, 'gpg.conf'), GNUPGCONF)
    if pubkey is not None:
        args = self.args + ['--import', pubkey]
        output, err, code = self.exec_cmd(args)
        if code == -9:
            raise Exception('Command {} timeout after {} seconds'.format(
                ' '.join(args), CMD_TIMEOUT))
        elif code != 0:
            raise Exception(err.decode('utf-8'))
    self._home = _gpghome
def main(): parser = argparse.ArgumentParser(description="OpenProtein version 0.1") parser.add_argument('--no_force_pre_processing_overwrite', dest='no_force_pre_processing_overwrite', action='store_false', help='Force overwrite existing preprocessed files', default=True) args, _unknown = parser.parse_known_args() uge_gpu = False if torch.cuda.is_available(): write_out("CUDA is available, using GPU") uge_gpu = True process_raw_data( uge_gpu, force_pre_processing_overwrite=args.force_pre_processing_overwrite)
def embed(self, original_aa_string):
    data, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
        torch.nn.utils.rnn.pack_sequence(original_aa_string))

    # one-hot encoding
    start_compute_embed = time.time()
    prot_aa_list = data.unsqueeze(1)
    embed_tensor = torch.zeros(prot_aa_list.size(0), 21,
                               prot_aa_list.size(2))  # 21 classes
    if self.use_gpu:
        prot_aa_list = prot_aa_list.cuda()
        embed_tensor = embed_tensor.cuda()
    input_sequences = embed_tensor.scatter_(1, prot_aa_list.data, 1).transpose(1, 2)
    end = time.time()
    write_out("Embed time:", end - start_compute_embed)
    packed_input_sequences = rnn_utils.pack_padded_sequence(
        input_sequences, batch_sizes)
    return packed_input_sequences
def check_regression(pkg_dir):
    """Check the build log for test regressions using the count module."""
    if config.config_opts['skip_tests']:
        return

    result = count.parse_log(os.path.join(pkg_dir, "results/build.log"))
    titles = [('Package', 'package name', 1),
              ('Total', 'total tests', 1),
              ('Pass', 'total passing', 1),
              ('Fail', 'total failing', 0),
              ('Skip', 'tests skipped', 0),
              ('XFail', 'expected fail', 0)]

    res_str = ""
    for line in result.strip('\n').split('\n'):
        s_line = line.split(',')
        for idx, title in enumerate(titles):
            if s_line[idx]:
                if (s_line[idx] != '0') or (title[2] > 0):
                    print("{}: {}".format(title[1], s_line[idx]))
                res_str += "{} : {}\n".format(title[0], s_line[idx])

    util.write_out(os.path.join(pkg_dir, "testresults"), res_str)
def compute_loss(self, training_minibatch):
    _, labels_list, remapped_labels_list_crf_hmm, remapped_labels_list_crf_marg, \
        _prot_type_list, _prot_topology_list, _prot_name_list, original_aa_string, \
        _original_label_string = training_minibatch
    minibatch_size = len(labels_list)
    if self.model_mode == TMHMM3Mode.LSTM_CRF_MARG:
        labels_to_use = remapped_labels_list_crf_marg
    elif self.model_mode == TMHMM3Mode.LSTM_CRF_HMM:
        labels_to_use = remapped_labels_list_crf_hmm
    else:
        labels_to_use = labels_list
    input_sequences = [
        autograd.Variable(x) for x in self.embed(original_aa_string)
    ]
    actual_labels = torch.nn.utils.rnn.pad_sequence(
        [autograd.Variable(l) for l in labels_to_use])
    emissions, batch_sizes = self._get_network_emissions(input_sequences)
    if self.model_mode == TMHMM3Mode.LSTM:
        prediction = emissions.transpose(0, 1).contiguous().view(
            -1, emissions.size(-1))
        target = actual_labels.transpose(0, 1).contiguous().view(-1, 1)
        losses = -torch.gather(
            nn.functional.log_softmax(prediction), dim=1,
            index=target).view(*actual_labels.transpose(0, 1).size())
        mask_expand = torch.range(0, batch_sizes.data.max() - 1).long() \
            .unsqueeze(0).expand(batch_sizes.size(0), batch_sizes.data.max())
        if self.use_gpu:
            mask_expand = mask_expand.cuda()
            batch_sizes = batch_sizes.cuda()
        mask = mask_expand < batch_sizes.unsqueeze(1).expand_as(mask_expand)
        loss = (losses * mask.float()).sum() / batch_sizes.float().sum()
    else:
        mask = (self.batch_sizes_to_mask(batch_sizes))
        loss = -1 * self.crf_model(emissions, actual_labels,
                                   mask=mask) / minibatch_size
        if float(loss) > 100000:
            # if loss is this large, an invalid transition must have been found
            for idx, batch_size in enumerate(batch_sizes):
                last_label = None
                for i in range(batch_size):
                    label = int(actual_labels[i][idx])
                    write_out(str(label) + ",", end='')
                    if last_label is not None and (last_label, label) \
                            not in self.allowed_transitions:
                        write_out("Error: invalid transition found")
                        write_out((last_label, label))
                        sys.exit(1)
                    last_label = label
                write_out(" ")
    return loss
def __iter__(self):
    data = []
    for idx in self.sampler:
        data.append(idx)

    batch_num = int(len(data) / self.batch_size)
    if len(data) % self.batch_size != 0:
        batch_num += 1

    batch_order = list(range(batch_num))
    random.shuffle(batch_order)

    batch = []
    for batch_id in batch_order:
        write_out("Accessing minibatch #" + str(batch_id))
        for i in range(self.batch_size):
            if i + (batch_id * self.batch_size) < len(data):
                batch.append(data[i + (batch_id * self.batch_size)])
        yield batch
        batch = []
def calculate_partitions(partitions_count, cluster_partitions, types):
    partition_distribution = torch.ones(
        (partitions_count, len(torch.unique(types))), dtype=torch.long)
    partition_assignments = torch.zeros(cluster_partitions.shape[0],
                                        dtype=torch.long)

    for i in torch.unique(cluster_partitions):
        cluster_positions = (cluster_partitions == i).nonzero()
        cluster_types = types[cluster_positions]
        unique_types_in_cluster, type_count = torch.unique(cluster_types,
                                                           return_counts=True)
        tmp_distribution = partition_distribution.clone()
        tmp_distribution[:, unique_types_in_cluster] += type_count
        relative_distribution = partition_distribution.double() / tmp_distribution.double()
        min_relative_distribution_group = torch.argmin(
            torch.sum(relative_distribution, dim=1))
        partition_distribution[min_relative_distribution_group,
                               unique_types_in_cluster] += type_count
        partition_assignments[cluster_positions] = min_relative_distribution_group

    write_out("Loaded data into the following partitions")
    write_out("[[ TM SP+TM SP Glob]")
    write_out(partition_distribution - torch.ones(partition_distribution.shape,
                                                  dtype=torch.long))
    return partition_assignments
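# Hypothetical usage sketch for calculate_partitions (toy tensors, not from the original
# source): each cluster of sequences is assigned to whichever partition is currently most
# under-represented for that cluster's class labels, keeping partitions class-balanced.
#   import torch
#   cluster_partitions = torch.tensor([0, 0, 1, 1, 2, 3])  # cluster id per sequence
#   types = torch.tensor([0, 1, 0, 2, 3, 3])               # class label per sequence (0-3)
#   assignments = calculate_partitions(5, cluster_partitions, types)  # partition id per sequence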
def compute_loss(self, minibatch):
    (original_aa_string, actual_coords_list, _, pssms, token) = minibatch

    emissions, _backbone_atoms_padded, _batch_sizes = self._get_network_emissions(
        original_aa_string, pssms, token)
    actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn.pad_packed_sequence(
        torch.nn.utils.rnn.pack_sequence(actual_coords_list))
    if self.use_gpu:
        actual_coords_list_padded = actual_coords_list_padded.cuda()
    start = time.time()
    emissions_actual, _ = calculate_dihedral_angles_over_minibatch(
        actual_coords_list_padded, batch_sizes_coords, self.use_gpu)
    drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                              actual_coords_list_padded,
                                              _batch_sizes)
    write_out("Angle calculation time:", time.time() - start)
    if self.use_gpu:
        emissions_actual = emissions_actual.cuda()
        drmsd_avg = drmsd_avg.cuda()
    angular_loss = calc_angular_difference(emissions, emissions_actual)

    return angular_loss, drmsd_avg
def embed(self, original_aa_string):
    max_len = max([s.size(0) for s in original_aa_string])
    seqs = []
    for tensor in original_aa_string:
        padding_to_add = torch.zeros(max_len - tensor.size(0)).int()
        seqs.append(torch.cat((tensor, padding_to_add)))
    data = torch.stack(seqs).transpose(0, 1)

    # one-hot encoding
    start_compute_embed = time.time()
    arange_tensor = torch.arange(21).int().repeat(
        len(original_aa_string), 1).unsqueeze(0).repeat(max_len, 1, 1)
    data_tensor = data.unsqueeze(2).repeat(1, 1, 21)
    embed_tensor = (arange_tensor == data_tensor).float()

    if self.use_gpu:
        embed_tensor = embed_tensor.cuda()
    end = time.time()
    write_out("Embed time:", end - start_compute_embed)
    return embed_tensor
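# Sketch of the broadcast-comparison one-hot used above (toy shapes assumed, not from
# the original source):
#   import torch
#   data = torch.tensor([[3], [1]])                                      # (max_len=2, batch=1)
#   arange = torch.arange(21).repeat(1, 1).unsqueeze(0).repeat(2, 1, 1)  # (2, 1, 21)
#   one_hot = (arange == data.unsqueeze(2).repeat(1, 1, 21)).float()     # (2, 1, 21)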
def write_prep(conf, workingdir, content):
    """Write metadata to the local workingdir when --prep-only is used."""
    if conf.urlban:
        used_url = re.sub(conf.urlban, "localhost", content.url)
    else:
        used_url = content.url

    print()
    print("Exiting after prep due to --prep-only flag")
    print()
    print("Results under ./workingdir")
    print("Source (./workingdir/{})".format(content.tarball_prefix))
    print("Name (./workingdir/name) :", content.name)
    print("Version (./workingdir/version) :", content.version)
    print("URL (./workingdir/source0) :", used_url)

    write_out(os.path.join(workingdir, "name"), content.name)
    write_out(os.path.join(workingdir, "version"), content.version)
    write_out(os.path.join(workingdir, "source0"), used_url)
def train_model(data_set_identifier, model, train_loader, validation_loader,
                learning_rate, minibatch_size=64, eval_interval=50, hide_ui=False,
                use_gpu=False, minimum_updates=1000, optimizer_type='adam',
                restart=False):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    validation_dataset_size = validation_loader.dataset.__len__()

    if use_gpu:
        model = model.cuda()

    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    elif optimizer_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    if restart:
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=32)

    sample_num = list()
    train_loss_values = list()
    train_drmsd_values = list()
    validation_loss_values = list()
    validation_angles_loss_values = list()

    best_model_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    best_json_data = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
        # for i in range(2):
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        drmsd_tracker = np.zeros(0)
        for _minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            start_compute_loss = time.time()
            loss, drmsd_avg = model.compute_loss(training_minibatch)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            drmsd_tracker = np.append(drmsd_tracker, float(drmsd_avg))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            if restart:
                scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

            # for every eval_interval samples, plot performance on the validation set
            if minibatches_proccesed % eval_interval == 0:
                write_out("Testing model on validation set...")

                train_loss = float(loss_tracker.mean())
                train_drmsd = float(drmsd_tracker.mean())
                loss_tracker = np.zeros(0)
                drmsd_tracker = np.zeros(0)
                validation_loss, json_data, _, validation_angles_loss = \
                    model.evaluate_model(validation_loader)

                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)
                    best_json_data = json_data

                write_out("Validation loss:", validation_loss, "Train loss:",
                          train_loss, "Train drmsd:", train_drmsd)
                write_out("Best model so far (validation loss): ", best_model_loss,
                          "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)

                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                train_drmsd_values.append(train_drmsd)
                validation_loss_values.append(validation_loss)
                validation_angles_loss_values.append(validation_angles_loss)

                json_data["validation_dataset_size"] = validation_dataset_size
                json_data["sample_num"] = sample_num
                json_data["train_loss_values"] = train_loss_values
                json_data["train_drmsd_values"] = train_drmsd_values
                json_data["validation_loss_values"] = validation_loss_values
                json_data['validation_angles_loss_values'] = validation_angles_loss_values

                write_out(json_data)

                if not hide_ui:
                    res = requests.post('http://localhost:5000/graph', json=json_data)
                    if res.ok:
                        print(res.json())

                if minibatches_proccesed > minimum_updates and minibatches_proccesed \
                        >= best_model_minibatch_time + minimum_updates:
                    stopping_condition_met = True
                    break

    write_result_summary(best_model_loss)
    write_result_summary(json.dumps(best_json_data))
    return best_model_path
def commit_to_git(config, name, success):
    """Update package's git tree for autospec managed changes."""
    path = config.download_path
    call("git init", stdout=subprocess.DEVNULL, cwd=path)

    # This config is used for setting the remote URI, so it is optional.
    if config.git_uri:
        try:
            call("git config --get remote.origin.url", cwd=path)
        except subprocess.CalledProcessError:
            upstream_uri = config.git_uri % {'NAME': name}
            call("git remote add origin %s" % upstream_uri, cwd=path)

    for config_file in config.config_files:
        call("git add %s" % config_file, cwd=path, check=False)
    for unit in config.sources["unit"]:
        call("git add %s" % unit, cwd=path)
    call("git add Makefile", cwd=path)
    call("git add upstream", cwd=path)
    call("bash -c 'shopt -s failglob; git add *.spec'", cwd=path)
    call("git add %s.tmpfiles" % name, check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add %s.sysusers" % name, check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add prep_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add pypi.json", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add build_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add make_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add install_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add install_append", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add series", check=False, stderr=subprocess.DEVNULL, cwd=path)

    # Add/remove version specific patch lists
    for filename in glob.glob('series.*'):
        base, version = filename.split('.', 1)
        if version in config.versions:
            call("git add {}".format(filename), check=False, stderr=subprocess.DEVNULL, cwd=path)
        else:
            call("git rm {}".format(filename), check=False, stderr=subprocess.DEVNULL, cwd=path)

    call("bash -c 'shopt -s failglob; git add -f *.asc'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.sig'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.sha256'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.sign'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.pkey'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure32", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure64", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure_avx2", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure_avx512", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add make_check_command", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add *.patch'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add *.nopatch'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    for item in config.transforms.values():
        call("git add {}".format(item), check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add release", cwd=path)
    call("git add symbols", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add symbols32", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add used_libs", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add used_libs32", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add testresults", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add profile_payload", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add options.conf", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure_misses", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add whatrequires", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add description", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add attrs", check=False, stderr=subprocess.DEVNULL, cwd=path)

    # remove deprecated config files
    call("git rm make_install_append", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm prep_append", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm use_clang", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm use_lto", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm use_avx2", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm fast-math", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm broken_c++", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm skip_test_suite", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm optimize_size", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm asneeded", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm broken_parallel_build", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm pgo", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm unit_tests_must_pass", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm funroll-loops", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm keepstatic", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm allow_test_failures", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm no_autostart", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm insecure_build", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm conservative_flags", check=False, stderr=subprocess.DEVNULL, cwd=path)

    # add a gitignore
    ignorelist = [
        ".*~",
        "*~",
        "*.info",
        "*.mod",
        "*.swp",
        ".repo-index",
        "*.log",
        "build.log.round*",
        "*.tar.*",
        "*.tgz",
        "!*.tar.*.*",
        "*.zip",
        "*.jar",
        "*.pom",
        "*.xml",
        "commitmsg",
        "results/",
        "rpms/",
        "for-review.txt",
        ""
    ]
    write_out(os.path.join(path, '.gitignore'), '\n'.join(ignorelist))
    call("git add .gitignore", check=False, stderr=subprocess.DEVNULL, cwd=path)

    if success == 0:
        return

    call("git commit -a -F commitmsg ", cwd=path)
    call("rm commitmsg", cwd=path)
parser.add_argument('--hide-ui', dest='hide_ui', action='store_true', default=False,
                    help='Hide loss graph and visualization UI while training goes on.')
parser.add_argument('--evaluate-on-test', dest='evaluate_on_test', action='store_true',
                    default=False, help='Run model on test data.')
parser.add_argument('--eval-interval', dest='eval_interval', type=int, default=5,
                    help='Evaluate model on validation set every n minibatches.')
parser.add_argument('--min-updates', dest='minimum_updates', type=int, default=5000,
                    help='Minimum number of minibatch iterations.')
parser.add_argument('--minibatch-size', dest='minibatch_size', type=int, default=1,
                    help='Size of each minibatch.')
parser.add_argument('--learning-rate', dest='learning_rate', type=float, default=0.01,
                    help='Learning rate to use during training.')
args, unknown = parser.parse_known_args()

if args.hide_ui:
    write_out("Live plot deactivated, see output folder for plot.")

use_gpu = False
if torch.cuda.is_available():
    write_out("CUDA is available, using GPU")
    use_gpu = True

# start web server
start_dashboard_server()

process_raw_data(use_gpu, force_pre_processing_overwrite=False)

training_file = "data/preprocessed/sample.txt.hdf5"
validation_file = "data/preprocessed/sample.txt.hdf5"
testing_file = "data/preprocessed/testing.hdf5"
def train_model(data_set_identifier, train_file, val_file, learning_rate,
                minibatch_size, name):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size, name)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size,
                                                 use_evolutionary=True)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size,
                                                      use_evolutionary=True)
    validation_dataset_size = validation_loader.dataset.__len__()
    train_dataset_size = train_loader.dataset.__len__()

    embedding_size = 21
    if configs.run_params["use_evolutionary"]:
        embedding_size = 42

    # Load in existing model if given as argument
    if args.model is not None:
        model_path = "output/models/" + args.model + ".model"
        model = load_model_from_disk(model_path, use_gpu)
    else:
        # else construct new model from config file
        model = construct_model(configs.model_params, embedding_size, use_gpu,
                                minibatch_size)

    # optimizer parameters
    betas = tuple(configs.run_params["betas"])
    weight_decay = configs.run_params["weight_decay"]
    angle_lr = configs.run_params["angles_lr"]

    if configs.model_params['architecture'] == 'cnn_angles':
        optimizer = optim.Adam(model.parameters(), betas=betas, lr=learning_rate,
                               weight_decay=weight_decay)
    else:
        optimizer = optim.Adam([
            {'params': model.model.parameters(), 'lr': learning_rate},
            {'params': model.soft_to_angle.parameters(), 'lr': angle_lr}],
            betas=betas, weight_decay=weight_decay)

    # print number of trainable parameters
    print_number_of_parameters(model)

    # For creating a summary table of the model (does not work on ExampleModel!)
    if configs.run_params["print_model_summary"]:
        if configs.model_params["architecture"] != 'rnn':
            summary(model, configs.run_params["max_sequence_length"], 2)
        else:
            write_out("DETAILED MODEL SUMMARY IS NOT SUPPORTED FOR RNN MODELS")

    if use_gpu:
        model = model.cuda()  # TODO: is soft_to_angle.parameters() included here?

    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()
    break_point_values = list()
    breakpoints = configs.run_params['breakpoints']

    best_model_loss = 1e20
    best_model_train_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0
    loss_atoms = configs.run_params["loss_atoms"]
    start_time = time.time()
    max_time = configs.run_params["max_time"]

    C_epochs = configs.run_params["c_epochs"]  # TODO: Change to parameter
    C_batch_updates = C_epochs

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        start_time_n_minibatches = time.time()
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            training_minibatch = list(training_minibatch)
            primary_sequence, tertiary_positions, mask, p_id = training_minibatch[:-1]

            # Update C
            C = 1.0 if minibatches_proccesed >= C_batch_updates \
                else float(minibatches_proccesed) / C_batch_updates

            # One-hot encode amino string and concatenate PSSM values.
            amino_acids, batch_sizes = one_hot_encode(primary_sequence, 21, use_gpu)
            if configs.run_params["use_evolutionary"]:
                evolutionary = training_minibatch[-1]
                evolutionary, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
                    torch.nn.utils.rnn.pack_sequence(evolutionary))
                if use_gpu:
                    evolutionary = evolutionary.cuda()
                amino_acids = torch.cat(
                    (amino_acids, evolutionary.view(-1, len(batch_sizes), 21)), 2)

            start_compute_loss = time.time()
            if configs.run_params["only_angular_loss"]:
                # raise NotImplementedError("Only_angular_loss function has not been
                # implemented correctly yet.")
                loss = model.compute_angular_loss((amino_acids, batch_sizes),
                                                  tertiary_positions, mask)
            else:
                loss = model.compute_loss((amino_acids, batch_sizes),
                                          tertiary_positions, mask,
                                          C=C, loss_atoms=loss_atoms)
            if C != 1:
                write_out("C:", C)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # for every eval_interval samples, plot performance on the validation set
            if minibatches_proccesed % configs.run_params["eval_interval"] == 0:
                model.eval()
                write_out("Testing model on validation set...")

                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(
                    validation_loader, model, use_gpu, loss_atoms,
                    configs.run_params["use_evolutionary"])
                prim = data_total[0][0]
                pos = data_total[0][1]
                pos_pred = data_total[0][3]
                mask = data_total[0][4]

                pos = apply_mask(pos, mask)
                angles_pred = data_total[0][2]
                angles_pred = apply_mask(angles_pred, mask, size=3)
                pos_pred = apply_mask(pos_pred, mask)
                prim = torch.masked_select(prim, mask)

                if use_gpu:
                    pos = pos.cuda()
                    pos_pred = pos_pred.cuda()
                angles = calculate_dihedral_angels(pos, use_gpu)
                # angles_pred = calculate_dihedral_angels(pos_pred, use_gpu)
                # angles_pred = data_total[0][2]
                # Use angles output from model - calculate_dihedral_angels(pos_pred, use_gpu)
                write_to_pdb(get_structure_from_angles(prim, angles), "test")
                write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")

                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)
                if train_loss < best_model_train_loss:
                    best_model_train_loss = train_loss
                    best_model_train_path = write_model_to_disk(model, model_type="train")

                write_out("Validation loss:", validation_loss, "Train loss:", train_loss)
                write_out("Best model so far (validation loss): ", best_model_loss,
                          "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Best model train stored at " + best_model_train_path)
                write_out("Minibatches processed:", minibatches_proccesed)

                end_time_n_minibatches = time.time()
                n_minibatches_time_used = end_time_n_minibatches - start_time_n_minibatches
                minibatches_left = configs.run_params["max_updates"] - minibatches_proccesed
                seconds_left = int(n_minibatches_time_used *
                                   (minibatches_left / configs.run_params["eval_interval"]))
                m, s = divmod(seconds_left, 60)
                h, m = divmod(m, 60)
                write_out("Estimated time until maximum number of updates:",
                          '{:d}:{:02d}:{:02d}'.format(h, m, s))

                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                if breakpoints and minibatches_proccesed > breakpoints[0]:
                    break_point_values.append(drmsd_avg)
                    breakpoints = breakpoints[1:]

                data = {}
                data["pdb_data_pred"] = open("output/protein_test_pred.pdb", "r").read()
                data["pdb_data_true"] = open("output/protein_test.pdb", "r").read()
                data["validation_dataset_size"] = validation_dataset_size
                data["sample_num"] = sample_num
                data["train_loss_values"] = train_loss_values
                data["break_point_values"] = break_point_values
                data["validation_loss_values"] = validation_loss_values
                data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:, 1]])
                data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1, 2]])
                data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:, 1]])
                data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1, 2]])
                data["drmsd_avg"] = drmsd_avg_values
                data["rmsd_avg"] = rmsd_avg_values

                if not configs.run_params["hide_ui"]:
                    res = requests.post('http://localhost:5000/graph', json=data)
                    if res.ok:
                        print(res.json())

                # Save run data
                write_run_to_disk(data)

                # Check if maximum time is reached.
                start_time_n_minibatches = time.time()
                time_used = time.time() - start_time
                time_condition = (max_time is not None and time_used > max_time)
                max_update_condition = minibatches_proccesed >= configs.run_params["max_updates"]
                min_update_condition = (minibatches_proccesed > configs.run_params["min_updates"]
                                        and minibatches_proccesed > best_model_minibatch_time * 2)
                model.train()

                # Checking for stop conditions
                if time_condition or max_update_condition or min_update_condition:
                    stopping_condition_met = True
                    break

    write_out("Best validation model found after", best_model_minibatch_time, "minibatches.")
    write_result_summary(best_model_loss)
    return best_model_path