def compute_target(answers_dset, ans2label, name, cache_root='data/cache'):
    """Augment answers_dset with soft score as label

    ***answers_dset should be preprocessed***

    Write result into a cache file
    """
    target = []
    for ans_entry in answers_dset:
        answers = ans_entry['answers']
        answer_count = {}
        for answer in answers:
            answer_ = answer['answer']
            answer_count[answer_] = answer_count.get(answer_, 0) + 1

        labels = []
        scores = []
        for answer in answer_count:
            if answer not in ans2label:
                continue
            labels.append(ans2label[answer])
            score = get_score(answer_count[answer])
            scores.append(score)

        target.append({
            'question_id': ans_entry['question_id'],
            'image_id': ans_entry['image_id'],
            'labels': labels,
            'scores': scores
        })

    utils.create_dir(cache_root)
    cache_file = os.path.join(cache_root, name + '_target.pkl')
    cPickle.dump(target, open(cache_file, 'wb'))
    return target
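# compute_target above relies on a get_score helper that is not shown; a
# minimal sketch, assuming the common VQA soft-score scheme (0.3 per agreeing
# annotator, capped at 1.0). The original helper may differ:
def get_score(occurences):
    """Soft accuracy for an answer given how many annotators chose it."""
    if occurences == 0:
        return 0.0
    elif occurences == 1:
        return 0.3
    elif occurences == 2:
        return 0.6
    elif occurences == 3:
        return 0.9
    return 1.0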
def prepare_output_path(output_dir):
    """ Creates a path for the certifications exports directory """
    if not output_dir:
        output_dir = 'exports'
    output_path = os.path.join(output_dir, 'certifications')
    utils.create_dir(output_path)
    return output_path
def _read(self):
    """ Read the json file and parse it. """
    from src.app import App
    do_later = ["app_path", "icons_path", "icons"]
    try:
        with open(self._db_file, 'r') as db_obj:
            data = json.load(db_obj)
            for key, value in data.items():
                if key not in do_later:
                    setattr(self, key, value)
    except (FileNotFoundError, ValueError, KeyError):
        Logger.error("Application file is broken: {}".format(self._db_file))
        return  # `data` is unusable past this point

    self._parse_paths(data["app_path"], "app_path")
    self._parse_paths(data["icons_path"], "icons_path")
    self._parse_icons(data["icons"])

    if len(App.get("only")) == 1 and App.path():
        self.app_path.append(App.path())

    found = self.icons and self.app_path
    if self.force_create_folder and found:
        for icon_path in self.icons_path:
            create_dir(str(icon_path))
        self.dont_install = False
    else:
        self.dont_install = not (found and self.icons_path)

    # NWJS special case
    if self.get_type() == "nwjs" and not self.dont_install:
        self.dont_install = not App.get("nwjs")
def prepare_output_path(output_path):
    """ Set output_path and create a content dir if needed """
    if not output_path:
        output_path = 'exports/gitbook'
    content_path = os.path.join(output_path, 'content')
    utils.create_dir(content_path)
    return output_path
def reproduce_taxonomic_classifier_testset(
        pretrained_models: PretrainedModels,
        batch_size: int = 64,
        output_dir: str = "./results/taxonomic") -> None:
    print("Test Taxonomic-Classifier...")
    create_dir(output_dir)

    # Get taxonomic testset
    dataset = SequenceReadingsDataset(test_type="taxonomic")
    dataloader = DataLoader(dataset,
                            num_workers=4,
                            shuffle=False,
                            batch_size=batch_size)

    # Create result DataFrame for taxonomic classification
    results = pd.DataFrame(
        data={
            "seq_id": dataset.sequence_id,
            "aa": dataset.aa_sequence,
            "stop_codons": dataset.contains_stop,
            "species": dataset.label_species
        })

    # Taxonomic testset classification
    logits, results["tax_pred"] = predict(pretrained_models.taxonomic,
                                          dataloader,
                                          pretrained_models.tokenizer,
                                          pretrained_models.device)
    np.save(os.path.join(output_dir, "logits"), logits)

    # Confusion matrix
    plot_confusion_heatmap(results["species"], results["tax_pred"],
                           os.path.join(output_dir, "taxonomic_conf-matrix.png"),
                           ["Actual Class"], ["Prediction"],
                           normalize=True)

    # ROC
    plot_roc(logits,
             results["species"].to_numpy(),
             save_fig=os.path.join(output_dir, "taxonomic_ROC.png"))

    # Accuracy
    confusion_matrix = pd.crosstab(results["species"],
                                   results["tax_pred"],
                                   normalize=True).to_numpy()
    accuracy_to_stdout(confusion_matrix, {
        "1": "Bacteria",
        "0": "Virus",
        "2": "Human"
    })

    # Save results
    results.to_hdf(os.path.join(output_dir, "result_dataframe.h5"),
                   key='classification_results',
                   mode='w',
                   format='table')
    print("... finished. Results are saved in {}\n".format(output_dir))
def test_dir_created(self):
    '''test that a tmp dir is created'''
    utils.create_dir('tmp')
    assert os.path.exists('tmp')
    shutil.rmtree('tmp')
def export_local_files(item_path, io_paths):
    """ Export local files to the new directory """
    if io_paths:
        output_path = os.path.join(io_paths["output"], "artifacts", item_path)
        input_path = os.path.join(io_paths["input"], item_path)
        utils.create_dir(os.path.dirname(output_path))
        if not os.path.exists(output_path) or not filecmp.cmp(input_path, output_path):
            shutil.copy(input_path, output_path)
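# Hypothetical call shape for export_local_files above; the paths are
# illustrative, only the "input"/"output" keys are actually read by the
# function body:
io_paths = {"input": "data/components", "output": "exports/site"}
export_local_files("diagrams/network.png", io_paths)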
def inventory(certification, exports_dir, output_dir):
    """ Creates an inventory for a specific certification """
    certs_dir = os.path.join(exports_dir, 'certifications')
    utils.create_dir(output_dir)
    cert_path = verify_certification_path(certification, certs_dir)
    if cert_path:
        output_path = inventory_builder.create_inventory(cert_path, output_dir)
        click.echo('Inventory yaml created at `{0}`'.format(output_path))
def certs(certification, data_dir, output_dir):
    """ Create certification yamls """
    utils.create_dir(output_dir)
    certs_dir = os.path.join(data_dir, 'certifications')
    if verify_certification_path(certification, certs_dir):
        output_path = yamls_to_certification.create_yaml_certifications(
            certification, data_dir, output_dir
        )
        click.echo('Certification created in: `{0}`'.format(output_path))
def _download_testsets(self):
    """ Downloads the test sets needed to reproduce the evaluation. """
    download_urls = {
        "frame": "https://zenodo.org/record/4306248/files/refseq.tar.gz",
        "taxonomic": "https://zenodo.org/record/4306240/files/uniprot.tar.gz",
        "SRR": "https://redmine.f4.htw-berlin.de/owncloud/index.php/s/NoXtz6ezSZHPB6T/download",
        "inORF": "https://redmine.f4.htw-berlin.de/owncloud/index.php/s/REkM3Zi5K8n9QW2/download"
    }

    def reporthook(count: int, block_size: int, total_size: int) -> None:
        global start_time
        if count == 0:
            start_time = time.time()
            return
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        speed = int(progress_size / (1024 * duration))
        percent = min(int(count * block_size * 100 / total_size), 100)
        sys.stdout.write("\r %d%% | %d MB | %d KB/s" %
                         (percent, progress_size / (1024 * 1024), speed))
        sys.stdout.flush()

    for key in download_urls:
        # Skip test sets that are already present on disk
        if key == "frame" and os.path.isfile(
                "./data/refseq/refseq_ds_all_off-frames_fb_DNA_test.fasta"):
            continue
        if key == "taxonomic" and os.path.isfile(
                "./data/uniprot/uniprot_swiss-prot_vbh_p100d_w_test.fasta"):
            continue
        if key == "SRR" and os.path.isfile(
                "./data/srr/SRR2940986_filtered.fasta"):
            continue
        if key == "inORF" and os.path.isfile(
                "./data/inORF/inORF_unique.fasta"):
            continue

        # Download
        dir_path = "./data"
        create_dir(dir_path)
        file_name = download_urls[key].split("/")[-1]
        file_path = os.path.join(dir_path, file_name)
        urlretrieve(download_urls[key],
                    filename=file_path,
                    reporthook=reporthook)
        print(" - {} successfully downloaded".format(file_name))

        # Unzip
        shutil.unpack_archive(file_path, extract_dir=dir_path, format="gztar")

        # Remove downloaded archive
        os.remove(file_path)
def get_file_path(output_dir, system_key, component_key=None):
    """ Creates the path for the directory that will contain the component
    if it doesn't exist and returns the file path of component yaml """
    filepath = os.path.join(output_dir, system_key)
    filename = 'system.yaml'
    if component_key:
        filepath = os.path.join(filepath, component_key)
        filename = 'component.yaml'
    utils.create_dir(filepath)
    return os.path.join(filepath, filename)
def init_project(output_dir):
    """ Create a new control masonry project template """
    if not output_dir:
        output_dir = 'data'
    output_container, _ = os.path.split(output_dir)
    utils.create_dir(output_container)
    template_dir = get_template_dir()
    copy_to_path = os.path.join(os.getcwd(), output_dir)
    shutil.copytree(template_dir, copy_to_path)
    return output_dir
def generate_data_I(self, ds, save_to_dir=None, prefix='test'):
    """ Generates augmented images using the imgaug library.
    The results are persisted to disk. """
    create_dir(save_to_dir)
    for imgs_batch, _ in ds.as_numpy_iterator():
        batches = UnnormalizedBatch(images=(imgs_batch * 255).astype(np.uint8))
        # Create 5 augmented variants of each batch
        images_aug = [next(seq.augment_batches(batches, background=True)).images_aug
                      for _ in range(5)]
        for i, images in enumerate(images_aug):
            for img in images:
                imageio.imwrite("%s/%s_%d_%d.png" % (str(save_to_dir), prefix,
                                                     i, random.randint(0, 1000)),
                                img)
def prepare_locally_stored_files(element, io_paths):
    """ Prepare the files by moving locally stored files to the `artifacts`
    directory and linking filepaths to that directory """
    item_path = element['url']
    if not ('http://' in item_path or 'https://' in item_path):
        element['url'] = os.path.join('/artifacts', item_path).replace('\\', '/')
        if io_paths:
            output_path = os.path.join(io_paths['output'], 'artifacts', item_path)
            input_path = os.path.join(io_paths['input'], item_path)
            utils.create_dir(os.path.dirname(output_path))
            if not os.path.exists(output_path) or not filecmp.cmp(input_path, output_path):
                shutil.copy(input_path, output_path)
def _download_pretrained_model(self) -> None:
    """ Downloads the pretrained classification models for a certain model type. """
    download_urls = {
        "ProtBert": {
            "source": [
                "https://s3.amazonaws.com/models.huggingface.co/bert/Rostlab/prot_bert/config.json",
                "https://cdn.huggingface.co/Rostlab/prot_bert/pytorch_model.bin",
                "https://cdn.huggingface.co/Rostlab/prot_bert/vocab.txt"
            ],
            "frame": [
                "https://zenodo.org/record/4306420/files/metadata.json",
                "https://zenodo.org/record/4306420/files/state_dict.pth"
            ],
            "taxonomic": [
                "https://zenodo.org/record/4306499/files/metadata.json",
                "https://zenodo.org/record/4306499/files/state_dict.pth"
            ]
        }
    }

    def reporthook(count: int, block_size: int, total_size: int) -> None:
        global start_time
        if count == 0:
            start_time = time.time()
            return
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        speed = int(progress_size / (1024 * duration))
        percent = min(int(count * block_size * 100 / total_size), 100)
        sys.stdout.write("\r %d%% | %d MB | %d KB/s" %
                         (percent, progress_size / (1024 * 1024), speed))
        sys.stdout.flush()

    if not os.path.isdir(self.path):
        url_dict = download_urls[self.type]
        for key in url_dict:
            subfolder_path = os.path.join(self.path, key)
            create_dir(subfolder_path)
            if os.listdir(subfolder_path):
                print("There are already files in {}.".format(subfolder_path))
            else:
                print("Downloading pre-trained models")
                for url in url_dict[key]:
                    file_name = url.split("/")[-1]
                    urlretrieve(url,
                                filename=os.path.join(subfolder_path, file_name),
                                reporthook=reporthook)
                    print(" - {} successfully downloaded".format(file_name))
def collect_references(self, references, output_base_path, relative_base_path):
    for reference in utils.inplace_gen(references):
        path = reference.get('path', 'NONE')
        file_import_path = os.path.join(self.component_directory, path)
        is_local = not ('http://' in file_import_path or 'https://' in file_import_path)
        if os.path.exists(file_import_path) and is_local:
            # Create dir and copy file
            file_output_path = os.path.join(output_base_path, path)
            utils.create_dir(os.path.dirname(file_output_path))
            shutil.copy(file_import_path, file_output_path)
            # Rename path
            file_relative_path = os.path.join(relative_base_path, path)
            reference['path'] = file_relative_path
def test_dir_replaced(self):
    '''test that a tmp dir is replaced if it exists'''
    os.mkdir('tmp')
    os.mkdir('tmp/misc')
    assert os.path.exists('tmp/misc')
    utils.create_dir('tmp')
    assert not os.path.exists('tmp/misc')
    shutil.rmtree('tmp')
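# A minimal sketch of a create_dir consistent with the two tests above (the
# directory is created if missing and replaced if it already exists); the
# actual utils.create_dir implementation may differ:
import os
import shutil


def create_dir(path):
    """Create `path` as a fresh directory, replacing any existing one."""
    if os.path.exists(path):
        shutil.rmtree(path)  # drop the old directory and its contents
    os.makedirs(path)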
def generate_data_K(self, ds, save_to_dir=None, prefix='test', data_gen=None):
    """ Generates augmented images using the Keras ImageDataGenerator class.
    The results are persisted to disk. """
    if save_to_dir is None:
        save_to_dir = self.processed_dir
    else:
        create_dir(save_to_dir)
    img_gen = ImageDataGenerator(**data_gen)
    # Create by default 5 new augmented pictures for each original image.
    for img, _ in ds.as_numpy_iterator():
        img_flow = img_gen.flow(img,
                                batch_size=32,
                                save_to_dir=str(save_to_dir),
                                save_prefix=prefix)
        # Pulling from the flow writes the augmented images to save_to_dir
        for _ in range(5):
            next(img_flow)
def concat_markdowns(markdown_path, output_path):
    """ Add markdown content files to the gitbook directory and make the
    summary file the base summary string in order to join the markdown
    summary with the gitbook generated in this file. """
    for filename in glob.iglob(os.path.join(markdown_path, "*", "*")):
        # Get the output file path and create the directory before copying
        output_filepath = os.path.join(
            output_path,
            filename.replace(os.path.join(markdown_path, ""), ""))
        output_dir = os.path.dirname(output_filepath)
        utils.create_dir(output_dir)
        shutil.copy(filename, output_filepath)
    summary_path = os.path.join(markdown_path, "SUMMARY.md")
    with open(summary_path, "r") as summary_file:
        main_summary = summary_file.read()
    return main_summary
def export_references(self, references, export_dir):
    """ Given a list of references in either list or dict format, determine
    which references were saved locally and save those to the appropriate
    location in the export directory """
    if not export_dir:
        return references
    relative_base_path = os.path.join(self.system_key, self.component_key)
    output_base_path = os.path.join(export_dir, relative_base_path)
    utils.create_dir(output_base_path)
    self.collect_references(
        references=references,
        output_base_path=output_base_path,
        relative_base_path=relative_base_path
    )
    return references
def docs(export_format, certification, exports_dir, data_dir, output_dir):
    """ Create certification documentation """
    certs_dir = os.path.join(exports_dir, 'certifications')
    cert_path = verify_certification_path(certification, certs_dir)
    markdown_dir = os.path.join(data_dir, 'markdowns')
    if cert_path:
        if export_format == 'gitbook':
            gitbook_output_dir = os.path.join(output_dir, 'gitbook')
            gitbook_markdown_dir = os.path.join(markdown_dir, 'gitbook')
            utils.create_dir(os.path.join(gitbook_output_dir, 'content'))
            output_path = certifications_to_gitbook.create_gitbook_documentation(
                cert_path, gitbook_output_dir, gitbook_markdown_dir
            )
            click.echo('Gitbook Files Created in `{0}`'.format(output_path))
        else:
            click.echo('{0} format is not supported yet...'.format(export_format))
def main(input_img_path, ann_path, output_pred_path, model_path, is_batch, model_type):
    """ Predict images. """
    logger = logging.getLogger(__name__)

    # Create the directory if it doesn't exist; otherwise remove items inside it.
    create_dir(Path(output_pred_path))

    # Collect image paths
    input_img_dir = Path(input_img_path)
    imgs_path_test = sorted([
        i.absolute() for i in (input_img_dir / 'test').glob("*.png")
        if i.is_file()
    ])

    # Annotation path
    ann_path_dir = Path(ann_path)
    ann_test_path = ann_path_dir / 'test' / ann_file_name

    imgs_path_test = create_random_list_of_size(imgs_path_test,
                                                len(imgs_path_test) * 3)
    mask = Mask(output_pred_path)
    # calculate_iou_metric(model, data_generator_test, mask)

    if model_type == 'unet':
        # Load pre-trained model
        data_generator_test = GDXrayDataGenerator(imgs_path_test,
                                                  ann_test_path,
                                                  labels,
                                                  n_classes,
                                                  batch_size=batch_size,
                                                  dim=dim)
        model = Unet(dim,
                     n_classes,
                     n_filters=n_filters,
                     pretrained_weights=model_path)
        predict_unet(model, data_generator_test, mask)
    elif model_type == 'contours':
        data_generator_test = GDXrayDataGenerator(imgs_path_test,
                                                  ann_test_path,
                                                  labels,
                                                  n_classes,
                                                  batch_size=batch_size,
                                                  dim=dim,
                                                  task='binary')
        predict_contours_batch(data_generator_test, mask)
def create_backup_dir(self):
    """Create a backup directory for an application (application_name)."""
    backup_dir = path.join(BACKUP_FOLDER, self.app.name,
                           strftime(BACKUP_FILE_FORMAT), "")
    exists = True
    new_backup_dir = backup_dir
    i = 1
    while exists:
        if path.exists(new_backup_dir):
            new_backup_dir = backup_dir + "_" + str(i)
        if not path.isdir(new_backup_dir):
            Logger.debug("Create new backup folder "
                         "for {}".format(self.app.name))
            create_dir(new_backup_dir)
            exists = False
        i += 1
    self._backup_dir = new_backup_dir
def prepare_local_files(component_dict, ref_key, components_path, output_dir):
    """ Prepare references by saving files referenced locally to the
    certifications repository """
    relative_base_path = os.path.join(component_dict['system'], '')
    output_base_path = os.path.join(output_dir, relative_base_path)
    import_base_path = os.path.join(components_path, relative_base_path)
    utils.create_dir(output_base_path)
    for reference in inplace_gen(component_dict.get(ref_key)):
        path = reference.get('url', 'NONE')
        file_import_path = os.path.join(import_base_path, path)
        is_local = not ('http://' in file_import_path or 'https://' in file_import_path)
        if os.path.exists(file_import_path) and is_local:
            # Create dir and copy file
            file_output_path = os.path.join(output_base_path, path)
            utils.create_dir(os.path.dirname(file_output_path))
            shutil.copy(file_import_path, file_output_path)
            # Rename url
            file_relative_path = os.path.join(relative_base_path, path)
            reference['url'] = file_relative_path
    return component_dict.get(ref_key)
def main():
    path = create_dir()
    url = "https://brasil.diplo.de/br-de/service/matriculaconsular/" \
          "2222228?fbclid=IwAR0MojqudTlgMKQjZG7nNqp9gr3-QSKyiiRdY0jeeBY336zm_3yqr_Oc_nc"
    content = scraping_de(url)
    for c in content:
        save_file(pathname=path,
                  filename=c["file_name"],
                  content=requests.get(c["xls_url"]).content)
def generate_hed_dataset(input_flist, output_dir):
    if os.path.isfile(input_flist):
        flist = numpy.genfromtxt(input_flist, dtype=str, encoding='utf-8')
        create_dir(output_dir)
        for path in flist:
            path = os.path.normpath(path)
            outfile = os.path.join(output_dir,
                                   path.split("/")[-3],
                                   path.split("/")[-2],
                                   os.path.basename(path))
            Path(os.path.dirname(outfile)).mkdir(parents=True, exist_ok=True)
            # Load the image as a normalized CHW float tensor (BGR channel order)
            tensorInput = torch.FloatTensor(
                numpy.array(PIL.Image.open(path).convert('RGB'))[:, :, ::-1]
                .transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))
            tensorOutput = estimate(tensorInput)
            # Save the first channel of the edge map as an 8-bit image
            PIL.Image.fromarray(
                (tensorOutput.clamp(0.0, 1.0).numpy()
                 .transpose(1, 2, 0)[:, :, 0] * 255.0).astype(numpy.uint8)
            ).save(outfile)
def create_ans2label(occurence, name, cache_root='data/cache'):
    """Note that this will also create label2ans.pkl at the same time

    occurence: dict {answer -> whatever}
    name: prefix of the output file
    cache_root: str
    """
    ans2label = {}
    label2ans = []
    label = 0
    for answer in occurence:
        label2ans.append(answer)
        ans2label[answer] = label
        label += 1

    utils.create_dir(cache_root)
    cache_file = os.path.join(cache_root, name + '_ans2label.pkl')
    cPickle.dump(ans2label, open(cache_file, 'wb'))
    cache_file = os.path.join(cache_root, name + '_label2ans.pkl')
    cPickle.dump(label2ans, open(cache_file, 'wb'))
    return ans2label
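# A toy end-to-end run of the two VQA helpers above (create_ans2label here,
# compute_target from earlier in this section); the answer strings and the
# single-entry dataset are made up for illustration:
answers_dset = [{
    'question_id': 1,
    'image_id': 42,
    'answers': [{'answer': 'yes'}] * 7 + [{'answer': 'no'}] * 3,
}]
ans2label = create_ans2label({'yes': 7, 'no': 3}, 'toy')
targets = compute_target(answers_dset, ans2label, 'toy')
# targets[0]['labels'] == [0, 1]; each score comes from get_score(count)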
def export_references(self, references, export_dir):
    """ Given a list of references in either list or dict format, determine
    which references were saved locally and save those to the appropriate
    location in the export directory """
    if not export_dir:
        return references
    relative_base_path = os.path.join(self.system_key, self.component_key)
    output_base_path = os.path.join(export_dir, relative_base_path)
    utils.create_dir(output_base_path)
    for reference in utils.inplace_gen(references):
        path = reference.get('path', 'NONE')
        file_import_path = os.path.join(self.component_directory, path)
        is_local = not ('http://' in file_import_path or 'https://' in file_import_path)
        if os.path.exists(file_import_path) and is_local:
            # Create dir and copy file
            file_output_path = os.path.join(output_base_path, path)
            utils.create_dir(os.path.dirname(file_output_path))
            shutil.copy(file_import_path, file_output_path)
            # Rename path
            file_relative_path = os.path.join(relative_base_path, path)
            reference['path'] = file_relative_path
    return references
def process(args, model, eval_loader):
    model_path = args.input + '/model_epoch%s.pth' % (args.epoch)
    print('loading %s' % model_path)
    print(torch.cuda.current_device())
    model_data = torch.load(model_path)

    # Commented out because not using multi-gpu or distributed training
    # model = nn.DataParallel(model).cuda()
    model = model.to(args.device)
    model.load_state_dict(model_data.get('model_state', model_data))
    model.train(False)

    logits, qIds = get_logits(args, model, eval_loader, args.device)
    results = make_json(logits, qIds, eval_loader)
    # results = make_json_with_logits(logits, qIds)
    model_label = '%s%s%d_%s' % (args.model, args.op, args.num_hid, args.label)

    if args.logits:
        utils.create_dir('logits/' + model_label)
        torch.save(logits, 'logits/' + model_label + '/logits%d.pth' % args.index)

    utils.create_dir(args.output)
    model_label += 'epoch%s' % args.epoch

    # out_file = args.output + '/' + args.input.split('/')[-1] + '.json'
    # with open(args.output + '/%s_%s.pkl' % (args.split, model_label), 'wb') as f:
    #     pickle.dump(results, f, protocol=2)
    with open(args.output + '/%s_%s.json' % (args.split, model_label), 'w') as f:
        json.dump(results, f)

    if args.model == 'cti':
        results = make_json_with_logits(logits, qIds)
        with open('results/%s_%s_logits.pkl' % (args.model, args.split), 'wb') as f:
            pickle.dump(results, f)
def randomTune(config):
    # 'LR': 0.0001,         # learning rate
    # 'D2G_LR': 0.1,        # discriminator/generator learning rate ratio
    # 'BETA1': 0.0,         # adam optimizer beta1
    # 'BETA2': 0.9,         # adam optimizer beta2
    # 'L1_LOSS_WEIGHT': 1,  # l1 loss weight
    # 'FM_LOSS_WEIGHT': 10, # feature-matching loss weight
    config.MAX_STEPS = 1500
    config.EVAL_INTERVAL = 80
    config.MAX_EPOCHES = 1
    # config.MAX_STEPS = 3
    experiments = 50
    for i in range(experiments):
        # Sample from a uniform distribution on a log scale
        # config.LR = 10 ** np.random.uniform(-3, -5)   # learning rate candidates in (0.001 to 0.00001)
        # config.D2G_LR = 10 ** np.random.uniform(-2, 0)  # candidates in (0.01 to 1)
        # config.LR = 0.0001
        # config.D2G_LR = 0.1
        # config.PATH = './checkpoints/places2_tune_%d_%f%f_' % (i, config.LR, config.D2G_LR)
        # logdir = config.PATH + ('/log_%s_%s' % (config.LR, config.D2G_LR))
        create_dir(config.PATH)
        if TRAIN_LOSS:
            # if config.MODEL == 1:
            #     config.L1_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
            #     config.FM_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1.5)
            #     config.ADV_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
            if config.MODEL != 1:
                # config.L1_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
                # config.FM_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1.5)
                config.STYLE_LOSS_WEIGHT = np.random.uniform(10, 400)  # style-loss weight in (10, 400)
                # config.CONTENT_LOSS_WEIGHT = 2 * 10 ** np.random.uniform(0, 2)
                # config.INPAINT_ADV_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
        model = EdgeConnect(config)
        model.load()
        # config.print()
        print('\nEx %d: learning_rate:%f D_Learning_rate: %f:' %
              (i, config.LR, config.D2G_LR))
        if TRAIN_LOSS:
            if config.MODEL == 1:
                print('Ex %d: L1:%f FM: %f ADV: %f:' %
                      (i, config.L1_LOSS_WEIGHT, config.FM_LOSS_WEIGHT,
                       config.ADV_LOSS_WEIGHT))
            if config.MODEL != 1:
                print('Ex %d: L1:%f FM: %f STYLE: %f CONTENT: %f ADV: %f:' %
                      (i, config.L1_LOSS_WEIGHT, config.FM_LOSS_WEIGHT,
                       config.STYLE_LOSS_WEIGHT, config.CONTENT_LOSS_WEIGHT,
                       config.INPAINT_ADV_LOSS_WEIGHT))
        model.train()
    os._exit(0)
    # v7w
    parser.add_argument('--use_feature', default='bottom', type=str,
                        help='use bottom-up feature or grid feature')
    # SAN
    parser.add_argument('--num_stacks', default=2, type=int,
                        help='num of stacks in Stack Attention Networks')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()

    utils.create_dir(args.output)
    logger = utils.Logger(os.path.join(args.output, 'log.txt'))
    logger.write(args.__repr__())

    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    device = torch.device("cuda:" + str(args.gpu) if args.gpu >= 0 else "cpu")
    args.device = device

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(args.gpu)

    dictionary = Dictionary.load_from_file('data_v7w/dictionary.pkl')
    train_dset = V7WDataset('train', args, dictionary, adaptive=True,
                            max_boxes=args.max_boxes,
                            question_len=args.question_len)
    val_dset = V7WDataset('val', args, dictionary, adaptive=True,
                          max_boxes=args.max_boxes,
def reproduce_frame(pretrained_models: PretrainedModels,
                    batch_size: int = 64,
                    output_dir: str = "./results/frame"):
    print("Test Frame-Classifier...")
    create_dir(output_dir)
    model = pretrained_models.frame

    # Frame Dataset/Dataloader
    dataset = SequenceReadingsDataset(test_type="frame")
    dataloader = DataLoader(dataset,
                            num_workers=4,
                            shuffle=False,
                            batch_size=batch_size,
                            collate_fn=collate_tensors)

    # Create result DataFrame
    results = pd.DataFrame(
        data={
            "seq_id": dataset.sequence_id,
            "dna": dataset.dna_sequence,
            "aa": dataset.aa_sequence,
            "stop_codons": dataset.contains_stop,
            "frame": dataset.label_frame
        })

    # Frame classification
    logits, results["frame_pred"] = predict(model, dataloader,
                                            pretrained_models.tokenizer,
                                            pretrained_models.device)
    np.save(os.path.join(output_dir, "frame_logits"), logits)

    # Confusion matrix
    plot_confusion_heatmap(results["frame"], results["frame_pred"],
                           os.path.join(output_dir, "frame_conf-matrix.png"),
                           ["Actual Frame"], ["Frame Prediction"],
                           normalize=True)

    # ROC
    plot_roc(logits,
             results["frame"].to_numpy(),
             save_fig=os.path.join(output_dir, "frame_ROC.png"))

    # Accuracy
    confusion_matrix = pd.crosstab(results["frame"],
                                   results["frame_pred"],
                                   normalize=True).to_numpy()
    frame_classes = {
        "0": "on-frame",
        "1": "offset by one base",
        "2": "offset by two bases",
        "3": "reverse-complementary",
        "4": "reverse-complementary and offset by one base",
        "5": "reverse-complementary and offset by two bases"
    }
    accuracy_to_stdout(confusion_matrix, frame_classes)

    # Correct frames
    results["aa_shifted"] = frame_correction(results["dna"], results["frame_pred"])
    dataloader = DataLoader(Frame_Dataset(results["aa_shifted"]),
                            num_workers=4,
                            shuffle=False,
                            batch_size=batch_size)

    # Rerun frame classification
    _, results["aa_shifted_frame_pred"] = predict(model, dataloader,
                                                  pretrained_models.tokenizer,
                                                  pretrained_models.device)

    # Evaluate frame re-classification
    plot_confusion_heatmap(results["frame"], results["aa_shifted_frame_pred"],
                           os.path.join(output_dir, "shifted_frame_conf-matrix.png"),
                           ["Actual Frame"], ["Frame Prediction"],
                           normalize=True)

    # Save results
    results.to_hdf(os.path.join(output_dir, "results_dataframe.h5"),
                   key='classification_results',
                   mode='w',
                   format='table')
    print("... finished. Results are saved in {}\n".format(output_dir))
def randomTune(config):
    # 'LR': 0.0001,         # learning rate
    # 'D2G_LR': 0.1,        # discriminator/generator learning rate ratio
    # 'BETA1': 0.0,         # adam optimizer beta1
    # 'BETA2': 0.9,         # adam optimizer beta2
    # 'L1_LOSS_WEIGHT': 1,  # l1 loss weight
    # 'FM_LOSS_WEIGHT': 10, # feature-matching loss weight
    config.MAX_STEPS = 3200
    config.EVAL_INTERVAL = 80
    config.MAX_EPOCHES = 10
    # config.MAX_STEPS = 3
    config.BATCH_SIZE = 16
    experiments = 50
    for i in range(experiments):
        # Sample from a uniform distribution on a log scale
        # config.LR = 10 ** np.random.uniform(-3, -5)   # learning rate candidates in (0.001 to 0.00001)
        # config.D2G_LR = 10 ** np.random.uniform(-2, 0)  # candidates in (0.01 to 1)
        # config.LR = 0.0001
        # config.D2G_LR = 0.1
        # config.PATH = './checkpoints/tune_parameters/places2_tune_%d_%f%f_' % (i, config.LR, config.D2G_LR)
        # logdir = config.PATH + ('/log_%s_%s' % (config.LR, config.D2G_LR))
        if TRAIN_LOSS:
            # if config.MODEL == 1:
            #     config.L1_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
            #     config.FM_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1.5)
            #     config.ADV_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
            #     config.STYLE_LOSS_WEIGHT = np.random.uniform(0, 300)
            #     config.CONTENT_LOSS_WEIGHT = 2 * 10 ** np.random.uniform(0, 2)
            #     config.INPAINT_ADV_LOSS_WEIGHT = 10 ** np.random.uniform(-1, 1)
            # if config.MODEL != 1:
            # Sample loss-weight candidates on a log scale in (0.1 to 300)
            max_number = math.log(300, 10)
            config.L1_LOSS_WEIGHT = 10**np.random.uniform(-1, max_number)
            config.FM_LOSS_WEIGHT = 10**np.random.uniform(-1, max_number)
            # config.GRADIENT_LOSS_WEIGHT = 10 ** np.random.uniform(-1, max_number)
            config.STYLE_LOSS_WEIGHT = 10**np.random.uniform(-1, max_number)
            config.CONTENT_LOSS_WEIGHT = 10**np.random.uniform(-1, max_number)
            config.INPAINT_ADV_LOSS_WEIGHT = 10**np.random.uniform(-1, max_number)
        config.PATH = './checkpoints/tune_parameters/ex%d_L1_%f_ADV_%f_Style_%f_Perc_%f_Grad_%f_FM_%f' % (
            i, config.L1_LOSS_WEIGHT, config.INPAINT_ADV_LOSS_WEIGHT,
            config.STYLE_LOSS_WEIGHT, config.CONTENT_LOSS_WEIGHT,
            config.GRADIENT_LOSS_WEIGHT, config.FM_LOSS_WEIGHT)
        create_dir(config.PATH)
        model = CLFNet(config)
        model.load()
        # config.print()
        # print('\nEx %d: learning_rate:%f D_Learning_rate: %f:' % (i, config.LR, config.D2G_LR))
        if TRAIN_LOSS:
            print('Ex %d - L1:%f FM: %f STYLE: %f CONTENT: %f ADV: %f: GRAD: %f' %
                  (i, config.L1_LOSS_WEIGHT, config.FM_LOSS_WEIGHT,
                   config.STYLE_LOSS_WEIGHT, config.CONTENT_LOSS_WEIGHT,
                   config.INPAINT_ADV_LOSS_WEIGHT, config.GRADIENT_LOSS_WEIGHT))
        # if config.MODEL == 1:
        #     print('Ex %d: L1:%f FM: %f ADV: %f:' % (
        #         i, config.L1_LOSS_WEIGHT, config.FM_LOSS_WEIGHT, config.ADV_LOSS_WEIGHT))
        model.train()
    os._exit(0)
import numpy as np
from main import main
import multiprocessing
import os
from src.utils import create_config, create_dir, init_config
import yaml
from shutil import copyfile

debug = False

if __name__ == '__main__':
    # init
    multiprocessing.set_start_method('spawn')
    checkpoints_path = './checkpoints/cell'  # model checkpoints path
    create_dir(checkpoints_path)
    config_path = os.path.join(checkpoints_path, 'config.yml')
    create_config(config_path)
    init_config(checkpoints_path, debug, EPOCH=50, INTERVAL=1000)

    # pre-train
    main(0, config_path)

    # train config
    create_config(config_path)
    init_config(checkpoints_path, debug,
                EPOCH=50, INTERVAL=1000, EVAL_INTERVAL_EPOCH=0.1)
    copyfile('checkpoints/cell/EdgeDetect_pre.pth',
             'checkpoints/cell/EdgeDetect.pth')

    # train
    main(1, config_path)
else:
    # split into sets based on folds
    val_df = df[df.fold == int(args.val_fold)]
    train_df = df[df.fold != int(args.val_fold)]
    args.run = f"{args.run}.fold_{args.val_fold:.0f}"

assert val_df.shape[0] + train_df.shape[0] == df.shape[0]
print(f"* Training set size: {train_df.shape[0]}")
print(f"* Validation set size: {val_df.shape[0]}")
# endregion

# region: prepare paths
td_dir = f"/app/.tensorboard/{args.run}"
create_dir(td_dir, remove=True)
checkpoint_path = f"{c['WORK_DIR']}/models/{args.run}"
create_dir(f"{c['WORK_DIR']}/models", remove=False)
# endregion

# region: problem type-dependent params
assert ds_meta["args"]["labels_mode"] in ["multilabel", "multiclass"]
if ds_meta["args"]["labels_mode"] == "multiclass":
    final_activation = "softmax"
    loss = "categorical_crossentropy"
elif ds_meta["args"]["labels_mode"] == "multilabel":
    final_activation = "sigmoid"
def temp_dir():
    create_dir(TEST_TEMP_DIR)
def prepare_output_path(output_path):
    """ Set output_path and create the export dir if needed """
    if not output_path:
        output_path = 'exports/inventory'
    utils.create_dir(output_path)
    return output_path
def get_file_path(system, name, output_dir):
    """ Creates the path for the directory that will contain the component
    if it doesn't exist and returns the file path of component yaml """
    output_path = os.path.join(output_dir, system)
    utils.create_dir(output_path)
    return os.path.join(output_path, '{0}.yaml'.format(slugify(name)))
# Parse configuration
args = parse_args()

# Set computational device
if args.cpu is True:
    device = "cpu"
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Calculations will be executed on: ", device)

# Create the output folder: either the user-defined name, or a subfolder of
# the results directory named after the input file
if args.output is None:
    output_path = os.path.join(
        "./results/", args.input.split(".fasta")[-2].split("/")[-1])
    create_dir(output_path)
    args.output = output_path
else:
    create_dir(args.output)

# Initialize pretrained models
pretrained_models = PretrainedModels(args.pretrained_model, device=device)

# Prepare data
dataset = SequenceReadingsDataset(args.input)
print("Dataset loaded ({} items)".format(len(dataset)))
dataloader = DataLoader(dataset,
                        num_workers=4,
                        shuffle=False,
                        batch_size=args.batch_size,
                        collate_fn=collate_tensors)
def train(args, model, train_loader, eval_loader, num_epochs, output,
          opt=None, s_epoch=0):
    device = args.device
    lr_default = args.lr
    lr_decay_step = 2
    lr_decay_rate = .25
    # Same decay schedule with or without an eval loader
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 0
    grad_clip = args.clip_norm

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=lr_default) if opt is None else opt

    # Initial loss function
    criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

    logger = utils.Logger(os.path.join(output, 'log.txt'))
    logger.write(args.__repr__())
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' %
                 (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    # Create trainer
    trainer = Trainer(args, model, criterion, optim)
    update_freq = int(args.update_freq)
    wall_time_start = time.time()

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        num_updates = 0
        t = time.time()
        N = len(train_loader.dataset)
        num_batches = int(N / args.batch_size + 1)
        if epoch < len(gradual_warmup_steps):
            trainer.optimizer.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.8f' %
                         trainer.optimizer.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.8f' %
                         trainer.optimizer.param_groups[0]['lr'])
        else:
            logger.write('lr: %.8f' % trainer.optimizer.param_groups[0]['lr'])

        for i, (v, b, q, a, ans_mc, ans_gt) in enumerate(train_loader):
            v = v.to(device)
            b = b.to(device)
            q = q.to(device)
            a = a.to(device)
            ans_mc = ans_mc.to(device)

            # Clone each sample to 4 samples
            v = v.unsqueeze(1).expand(v.size(0), 4, v.size(1),
                                      v.size(2)).contiguous().view(
                                          v.size(0) * 4, v.size(1), v.size(2))
            q = q.unsqueeze(1).expand(q.size(0), 4,
                                      q.size(1)).contiguous().view(
                                          q.size(0) * 4, q.size(1))
            ans_mc = ans_mc.view(ans_mc.size(0) * ans_mc.size(1), ans_mc.size(2))
            a = a.view(ans_mc.size(0), 1)
            labels = torch.cat([a, 1 - a], 1)
            labels = labels.to(device)
            sample = [v, b, q, labels, ans_mc]

            if i < num_batches - 1 and (i + 1) % update_freq > 0:
                trainer.train_step(sample, update_params=False)
            else:
                loss, grad_norm, batch_score = trainer.train_step(
                    sample, update_params=True)
                total_norm += grad_norm
                count_norm += 1
                total_loss += loss.item()
                train_score += batch_score
                num_updates += 1
                if num_updates % int(args.print_interval / update_freq) == 0:
                    print("Iter: {}, Loss {:.4f}, Norm: {:.4f}, Total norm: {:.4f}, "
                          "Num updates: {}, Wall time: {:.2f}, ETA: {}".format(
                              i + 1, total_loss / (num_updates + 1), grad_norm,
                              total_norm, num_updates,
                              time.time() - wall_time_start,
                              utils.time_since(t, i / num_batches)))

        total_loss /= num_updates
        train_score = 100 * train_score / (num_updates * args.batch_size)

        if eval_loader is not None:
            print("Evaluating...")
            trainer.model.train(False)
            eval_score, bound = evaluate(model, eval_loader, args)
            trainer.model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))

        # Save per epoch
        if epoch >= saving_epoch:
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, trainer.optimizer)
            # Save best epoch
            if eval_loader is not None and eval_score > best_eval_score:
                model_path = os.path.join(output, 'model_epoch_best.pth')
                utils.save_model(model_path, model, epoch, trainer.optimizer)
                best_eval_score = eval_score