def sample_cv(dataset, model_name, num_folds, fold, use_cosmic, num_signatures,
              shuffle_seed, random_seed, max_iterations, epsilon, out_dir):
    if fold >= num_folds:
        raise ValueError('num_folds is {} but fold is {}'.format(num_folds, fold))

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, '
              'using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), str(shuffle_seed), str(num_folds), str(fold))
    os.makedirs(out_dir, exist_ok=True)

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} {} {} {} already exists'.format(
            dataset_name, model_name, num_folds, fold, use_cosmic,
            num_signatures, shuffle_seed, random_seed))
        return

    train_data, test_data = split_train_test_sample_cv(dataset, num_folds, fold, shuffle_seed)
    model, train_ll, test_ll = train_test_stickysig(
        train_data, test_data, num_signatures, signatures, random_seed, epsilon, max_iterations)

    # Numpy arrays are not JSON-serializable, convert them to lists first
    parameters = model.get_params()
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {
        'log-likelihood-train': train_ll,
        'log-likelihood-test': test_ll,
        'parameters': parameters
    }
    save_json(out_file, out)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, required=True)
    parser.add_argument("--dset_name", type=str, default="anet", choices=["anet", "yc2"])
    parser.add_argument("--cache", type=str, default="./cache")
    parser.add_argument("--min_word_count", type=int, default=5)
    parser.add_argument("--raw_glove_path", type=str, help="downloaded glove vectors path")
    opt = parser.parse_args()

    if not os.path.exists(opt.cache):
        os.makedirs(opt.cache)

    # load, merge, clean, split data
    train_data = load_json(opt.train_path)
    all_sentences = flat_list_of_lists([v["sentences"] for k, v in train_data.items()])
    all_sentences = [nltk.tokenize.word_tokenize(sen.lower()) for sen in all_sentences]

    word2idx = build_vocab_idx(all_sentences, opt.min_word_count)
    print("[Info] Dumping the processed data to json file", opt.cache)
    word2idx_path = os.path.join(opt.cache, "{}_word2idx.json".format(opt.dset_name))
    save_json(word2idx, word2idx_path, save_pretty=True)
    print("[Info] Finished.")

    vocab_glove_path = os.path.join(opt.cache, "{}_vocab_glove.pt".format(opt.dset_name))
    extract_glove(word2idx, opt.raw_glove_path, vocab_glove_path)
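# flat_list_of_lists above is assumed to flatten one nesting level, roughly as
# follows (a sketch, not necessarily the project's exact helper):
def flat_list_of_lists(l):
    """[[1, 2], [3]] --> [1, 2, 3]"""
    return [item for sublist in l for item in sublist]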
def get_joints_labels_and_images(self) -> Tuple[dict, dict]:
    """Returns the dictionary containing the bounding box of the image
    and the dictionary containing image information.

    Returns:
        Tuple[dict, dict]: joints, image_dict
            image_dict
                - `name` - Image name in the form of
                    `youtube/VIDEO_ID/video/frames/FRAME_ID.png`.
                - `width` - Width of the image.
                - `height` - Height of the image.
                - `id` - Image ID.
            joints
                - `joints` - 21 joints, containing bounding box limits as vertices.
                - `is_left` - Binary value indicating a right/left hand side.
                - `image_id` - ID of the corresponding entry in `images`.
                - `id` - Annotation ID (an image can contain multiple hands).
    """
    data_json_path = os.path.join(self.root_dir, f"youtube_{self.split}.json")
    joints_path = os.path.join(self.root_dir, f"youtube_{self.split}_joints.json")
    images_json_path = os.path.join(self.root_dir, f"youtube_{self.split}_images.json")
    if os.path.exists(joints_path) and os.path.exists(images_json_path):
        return read_json(joints_path), read_json(images_json_path)
    else:
        data_json = read_json(data_json_path)
        images_dict = data_json["images"]
        save_json(images_dict, images_json_path)
        annotations_dict = data_json["annotations"]
        joints = self.get_joints_from_annotations(annotations_dict)
        save_json(joints, joints_path)
        return joints, images_dict
def dump_index(self, index, patch_idx):
    """Simply saves index as a json file under the export directory.

    Args:
        index (dict): dictionary to dump as json
        patch_idx: patch identifier used to build the index path
    """
    index_path = self._get_index_path(patch_idx)
    save_json(path=index_path, jsonFile=index)
def main(args):
    root = args['--root']
    experiment = build_experiment(load_yaml(args['--cfg']))
    bar = Bar("Patch directory", max=len(experiment.test_set))

    iqa_metrics = defaultdict(list)
    for patch_idx in patches_subset_from(experiment.test_set):
        patch_directory = os.path.join(root, patch_idx)
        if not os.path.isdir(patch_directory):
            # Some patches aren't predicted by ESTARFM as it requires a sample before and one after
            continue
        for date in os.listdir(patch_directory):
            # Load predicted bands
            date_directory = os.path.join(patch_directory, date)
            files_paths = [os.path.join(date_directory, band) for band in os.listdir(date_directory)]
            predicted_bands = load_in_multiband_raster(files_paths)

            # Load groundtruth bands
            target_directory = os.path.join(args['--target'], patch_idx, 'landsat', date)
            target_files_paths = [os.path.join(target_directory, band) for band in os.listdir(target_directory)]
            target_bands = load_in_multiband_raster(target_files_paths)

            # Compute PSNR and SSIM by band
            patch_bands_iqa = defaultdict(list)
            for src, tgt in zip(predicted_bands, target_bands):
                data_range = np.max([src, tgt])
                src = src.clip(min=np.finfo(np.float16).eps) / data_range
                tgt = tgt.clip(min=np.finfo(np.float16).eps) / data_range
                patch_bands_iqa['psnr'] += [metrics.psnr(tgt, src)]
                patch_bands_iqa['ssim'] += [metrics.ssim(tgt, src)]

            # Record bandwise values
            iqa_metrics['psnr'] += [patch_bands_iqa['psnr']]
            iqa_metrics['ssim'] += [patch_bands_iqa['ssim']]

            # Compute bandwise spectral angle mapper
            predicted_patch = np.dstack(predicted_bands).astype(np.float32)
            target_patch = np.dstack(target_bands).astype(np.float32)
            sam = metrics.sam(target_patch, predicted_patch).mean(axis=(0, 1))
            iqa_metrics['sam'] += [sam]

        # Log running averages
        avg_psnr = np.mean(iqa_metrics['psnr'])
        avg_ssim = np.mean(iqa_metrics['ssim'])
        avg_sam = np.mean(iqa_metrics['sam'])
        bar.suffix = "PSNR = {:.2f} | SSIM = {:.4f} | SAM = {:.6f}".format(avg_psnr, avg_ssim, avg_sam)
        bar.next()

    # Make bandwise average output dictionary
    bandwise_avg_psnr = np.asarray(iqa_metrics['psnr']).mean(axis=0).astype(np.float64)
    bandwise_avg_ssim = np.asarray(iqa_metrics['ssim']).mean(axis=0).astype(np.float64)
    bandwise_avg_sam = np.asarray(iqa_metrics['sam']).mean(axis=0).astype(np.float64)
    avg_iqa_metrics = {'test_psnr': bandwise_avg_psnr.tolist(),
                       'test_ssim': bandwise_avg_ssim.tolist(),
                       'test_sam': bandwise_avg_sam.tolist()}
    os.makedirs(args['--o'], exist_ok=True)
    dump_path = os.path.join(args['--o'], "test_scores_starfm.json")
    save_json(dump_path, avg_iqa_metrics)
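# For reference, an equivalent bandwise PSNR/SSIM computation with scikit-image;
# the project's `metrics` module is assumed to wrap something similar, so this
# is a sketch rather than the actual implementation:
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

def bandwise_iqa(src, tgt):
    """PSNR/SSIM for a single band pair already normalized to [0, 1]."""
    psnr = peak_signal_noise_ratio(tgt, src, data_range=1.0)
    ssim = structural_similarity(tgt, src, data_range=1.0)
    return psnr, ssim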
def leave_one_chromosome_out(dataset, model_name, chromosome, use_cosmic, num_signatures,
                             random_seed, max_iterations, epsilon, out_dir):
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    all_chromosomes = [str(i) for i in range(1, 23)]
    all_chromosomes.extend(['X', 'Y'])
    chromosome_name = all_chromosomes[chromosome]

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, '
              'using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), chromosome_name)
    os.makedirs(out_dir, exist_ok=True)

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} {} already exists'.format(
            dataset_name, model_name, chromosome, use_cosmic, num_signatures, random_seed))
        return

    train_data, test_data = split_train_test_loco(dataset, chromosome)
    model, train_ll, test_ll = train_test_stickysig(
        train_data, test_data, num_signatures, signatures, random_seed, epsilon, max_iterations)

    # Numpy arrays are not JSON-serializable, convert them to lists first
    parameters = model.get_params()
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {
        'log-likelihood-train': train_ll,
        'log-likelihood-test': test_ll,
        'parameters': parameters
    }
    save_json(out_file, out)
def _write_filtering_values_to_file(self, file_path, name):
    self._update_filtering_values()
    filter_values = {k: v for k, v in self.filter_values.items()
                     if v != "" and pd.notnull(v)}
    filter_values.pop("min_date", None)
    filter_values.pop("max_date", None)
    values = load_json(file_path)
    values[name] = filter_values
    save_json(file_path, values)
    return values
def log_metrics(self, metrics, step=None):
    # If in testing mode, log output scores as a json file
    if self.test:
        epoch = metrics['epoch']
        dump_path = os.path.join(self.log_dir, f"test_scores_epoch={epoch}.json")
        save_json(dump_path, metrics)
    # Else, usual tensorboard logging mode
    else:
        super().log_metrics(metrics, step)
def create_base_model():
    os.makedirs(os.path.join(ROOT_DIR, 'data/simulated-data'), exist_ok=True)
    base_model = load_json(
        os.path.join(
            ROOT_DIR,
            'experiments/trained_models/MSK-ALL/denovo/mix_010clusters_006signatures/314179seed.json'))
    save_json(os.path.join(ROOT_DIR, 'data/simulated-data/base_model'), base_model)
def save_stats(df: pd.DataFrame, config: dict):
    stats: Dict[str, Any] = {}
    data_dir = Path(config["dir"])
    data_path = data_dir / config["name"]
    stats_path = data_dir / config["stats"]
    stats["line_count"] = len(df)
    stats["size"] = total_size(data_path)
    stats["dtypes"] = {k: str(v) for k, v in df.dtypes.to_dict().items()}
    stats["nuniques"] = {c: df[c].nunique() for c in df.columns}
    utils.save_json(stats, stats_path)
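# Note: the snippets in this file come from different codebases, so save_json's
# argument order varies (some variants take (path, obj), others (obj, path)).
# A minimal sketch of the (obj, path) variant assumed by save_stats above --
# illustrative only, not the actual utils module:
import json

def save_json(data, file_path, save_pretty=False):
    """Serialize `data` to `file_path` as JSON."""
    with open(file_path, "w") as f:
        if save_pretty:
            json.dump(data, f, indent=4, sort_keys=True)
        else:
            json.dump(data, f)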
def process_config(config_path, override_dotmap=None, exp_name_suffix=None):
    """
    Processes config file:
        1) Converts it to a DotMap
        2) Creates experiments path and required subdirs
        3) Sets up logging
    """
    config_json = load_json(config_path)
    config = DotMap(config_json)

    if override_dotmap is not None:
        config.update(override_dotmap)
    if exp_name_suffix is not None:
        config.exp_name = f'{config.exp_name}_{exp_name_suffix}'

    print("Loaded configuration: ")
    pprint(config)

    print()
    print(" *************************************** ")
    print(" Running experiment {}".format(config.exp_name))
    print(" *************************************** ")
    print()

    exp_base = config.exp_base
    exp_dir = os.path.join(exp_base, "experiments", config.exp_name)

    # create some important directories to be used for the experiment
    config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    config.log_dir = os.path.join(exp_dir, "logs/")
    config.summary_dir = os.path.join(exp_dir, "summaries/")
    config.exp_dir = exp_dir

    # will not create if already existing
    makedirs([
        config.checkpoint_dir,
        config.log_dir,
        config.summary_dir,
    ])

    # save config to experiment dir
    config_out = os.path.join(exp_dir, 'config.json')
    save_json(config.toDict(), config_out)

    # setup logging in the project
    setup_logging(config.log_dir)
    logging.getLogger().info("Configurations and directories successfully set up.")
    return config
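# Hypothetical usage of process_config; the config path and suffix below are
# illustrative assumptions, not part of the original project:
if __name__ == "__main__":
    config = process_config("configs/example.json", exp_name_suffix="debug")
    print(config.checkpoint_dir)  # <exp_base>/experiments/<exp_name>_debug/checkpoints/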
def prepare_prediction_dir(trained_models_dir, prediction_dir):
    datasets = os.listdir(trained_models_dir)
    prediction_dir = os.path.join(prediction_dir, 'prediction')
    for dataset in datasets:
        print(dataset)
        dataset_dir = os.path.join(trained_models_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(os.path.join(dataset_dir, signature_learning)):
                dataset_path = os.path.join(prediction_dir, dataset, signature_learning, model)
                os.makedirs(dataset_path, exist_ok=True)

                # Dump the raw per-sample, per-chromosome data as JSON
                data, _ = get_data_by_model_name(dataset, model)
                json_data = {}
                for sample, sample_data in data.items():
                    json_data[sample] = {}
                    for chrom, chrom_data in sample_data.items():
                        json_data[sample][chrom] = {}
                        json_data[sample][chrom]['Sequence'] = chrom_data['Sequence'].tolist()
                        json_data[sample][chrom]['StrandInfo'] = chrom_data['StrandInfo'].tolist()
                save_json(os.path.join(dataset_path, 'data'), json_data)
                del json_data

                # Predict hidden variables for every trained run
                for num_sigs in os.listdir(os.path.join(dataset_dir, signature_learning, model)):
                    num_sig_dir = os.path.join(dataset_path, num_sigs)
                    os.makedirs(num_sig_dir, exist_ok=True)
                    experiment_dir = os.path.join(dataset_dir, signature_learning, model, num_sigs)
                    runs = os.listdir(experiment_dir)
                    for run in runs:
                        model_parameters = load_json(os.path.join(experiment_dir, run))['parameters']
                        if not model_parameters['e'][0][0] >= 0:
                            print('There was a bug in run {}'.format(os.path.join(experiment_dir, run)))
                        prediction = predict_hidden_variables(data, model_parameters)
                        save_json(os.path.join(num_sig_dir, run), prepare_data_to_json(prediction))
        print('\n')
def build_word_dict(config, min_freq=5):
    cnt = 0
    word_cnt = collections.Counter()
    attr_cnt = collections.Counter()
    for line in read_json_lines(config.train_data):
        we = WikiEntity(line)
        box = we.get_box()
        for a in box.keys():
            for w in box[a].split():
                if config.to_lower:
                    w = w.lower()
                word_cnt[w] += 1
            if config.to_lower:
                a = a.lower()
            attr_cnt[a] += 1
        desc = we.get_desc()
        for w in desc.split():
            if config.to_lower:
                w = w.lower()
            word_cnt[w] += 1
        cnt += 1
        if cnt % 10000 == 0:
            print('\rprocessing: {}'.format(cnt), end='')
    print()

    # Give special tokens counts just under 1e9, offset by their target ids, so
    # that most_common() lists them first and in id order
    word_cnt[config.pad] = attr_cnt[config.pad] = 1e9 - config.pad_id
    word_cnt[config.unk] = attr_cnt[config.unk] = 1e9 - config.unk_id
    word_cnt[config.sos] = attr_cnt[config.sos] = 1e9 - config.sos_id
    word_cnt[config.eos] = attr_cnt[config.eos] = 1e9 - config.eos_id
    word_cnt[config.num] = attr_cnt[config.num] = 1e9 - config.num_id
    word_cnt[config.time] = attr_cnt[config.time] = 1e9 - config.time_id
    print('number of words in word counter: {}'.format(len(word_cnt)))
    print('number of words in attribute counter: {}'.format(len(attr_cnt)))

    word_dict = {}
    for word, cnt in word_cnt.most_common():
        if cnt < min_freq:
            break
        word_dict[word] = len(word_dict)
    save_json(word_dict, config.word_dict)

    attr_dict = {}
    for attr, _ in attr_cnt.most_common():
        attr_dict[attr] = len(attr_dict)
    save_json(attr_dict, config.attr_dict)
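# Minimal illustration of the 1e9 - id trick above: Counter.most_common sorts
# by count descending, so the inflated counts pin special tokens to the front
# of the vocabulary in id order (the token names here are illustrative):
import collections
demo_cnt = collections.Counter({'the': 7, 'of': 5})
demo_cnt['<pad>'] = 1e9 - 0  # pad_id = 0
demo_cnt['<unk>'] = 1e9 - 1  # unk_id = 1
assert [w for w, _ in demo_cnt.most_common()][:2] == ['<pad>', '<unk>']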
def _process_config(config_json, override_dotmap=None):
    """
    Processes config file:
        1) Converts it to a DotMap
        2) Creates experiments path and required subdirs
        3) Sets up logging
    """
    config = DotMap(config_json)
    if override_dotmap is not None:
        config.update(override_dotmap)

    print("Loaded configuration: ")
    pprint(config)

    print()
    print(" *************************************** ")
    print(" Running experiment {}".format(config.exp_name))
    print(" *************************************** ")
    print()

    exp_base = config.exp_base
    timestamp = strftime('%Y-%m-%d--%H_%M_%S', localtime())
    exp_dir = os.path.join(exp_base, "experiments", config.exp_name, timestamp)

    # create some important directories to be used for the experiment
    config.summary_dir = os.path.join(exp_dir, "summaries/")
    config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    config.out_dir = os.path.join(exp_dir, "out/")
    config.log_dir = os.path.join(exp_dir, "logs/")
    makedirs([
        config.summary_dir, config.checkpoint_dir, config.out_dir, config.log_dir
    ])

    # save config to experiment dir
    config_out = os.path.join(exp_dir, 'config.json')
    save_json(config.toDict(), config_out)

    # setup logging in the project
    setup_logging(config.log_dir)
    logging.getLogger().info("Configurations and directories successfully set up.")
    return config
def train_model(dataset, model_name, use_cosmic, num_signatures, random_seed,
                max_iterations, epsilon, out_dir):
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, '
              'using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name, str(num_signatures))
    os.makedirs(out_dir, exist_ok=True)

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} already exists'.format(
            dataset_name, model_name, use_cosmic, num_signatures, random_seed))
        return

    model, ll = train_stickysig(dataset, num_signatures, signatures, random_seed,
                                epsilon, max_iterations)

    # Numpy arrays are not JSON-serializable, convert them to lists first
    parameters = model.get_params()
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {'log-likelihood': ll, 'parameters': parameters}
    save_json(out_file, out)
def add_to_drop_data(self, event):
    col_index = self.table_view.currentIndex().column()
    row_index = self.table_view.currentIndex().row()
    column = self.table_data_sorted.columns[col_index]
    content = self.table_data_sorted.iloc[row_index, col_index]
    try:
        drop_data = load_json(self.config["paths"]["drop_data"])
        values = drop_data.get(column, None)
        if values:
            drop_data[column] = values + [content]
        else:
            drop_data[column] = [content]
        save_json(self.config["paths"]["drop_data"], drop_data)
        self.drop_data_added_signal.emit()
    except Exception as e:
        print(e)
        show_warning("Drop data addition failure", "Something went wrong")
def save_initial_data_to_json():
    skills_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "skills.json")
    save_json(skills_json_path, SKILLS_DATA_STRUCTURES)
    print(f"Saved skills to file {skills_json_path}")

    events_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "events.json")
    save_json(events_json_path, EVENTS_DATA_STRUCTURES)
    print(f"Saved events to file {events_json_path}")

    jobs_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "jobs.json")
    save_json(jobs_json_path, JOB_DATA_STRUCTURES)
    print(f"Saved jobs to file {jobs_json_path}")

    departments_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "departments.json")
    save_json(departments_json_path, DEPARTMENTS_DATA_SCTRUCTURES)
    print(f"Saved departments to file {departments_json_path}")
import os

from src.generate.students.random_students import generate_multiple_random_students
from src.utils import save_json
from data.departments import DEPARTMENTS_DATA_SCTRUCTURES
from data.events import EVENTS_DATA_STRUCTURES
from data.jobs import JOB_DATA_STRUCTURES

cwd = os.getcwd()

GENERATED_STUDENTS_COUNT = 30
STEM_DEPARTMENTS = DEPARTMENTS_DATA_SCTRUCTURES[:8]
STEM_EVENTS = EVENTS_DATA_STRUCTURES[:5]
STEM_JOBS = JOB_DATA_STRUCTURES[:12]
SAVED_JSON_FILE = os.path.join(cwd, "saved_data/students/stem_students.json")

if __name__ == "__main__":
    students = generate_multiple_random_students(
        STEM_DEPARTMENTS, STEM_EVENTS, STEM_JOBS,
        students_count=GENERATED_STUDENTS_COUNT)
    save_json(SAVED_JSON_FILE, students)
# ===============================
# === Make submission
# ===============================
sample_submission = pd.read_csv(input_dir / "sample_submission.csv")
submission_df = make_submission(test_preds, sample_submission)

# ===============================
# === Save
# ===============================
config["eval_results"] = dict()
for k, v in evals_results.items():
    config["eval_results"][k] = v
save_path = output_dir / "output.json"
save_json(config, save_path)

plot_feature_importance(feature_importance, output_dir / "feature_importance.png")

np.save(output_dir / "oof_preds.npy", oof_preds)
np.save(output_dir / "test_preds.npy", test_preds)
submission_df.to_csv(output_dir / "submission.csv", index=False)
save_pickle(models, output_dir / "model.pkl")

slack_notify(config_name + " finished\n" + str(config))
def main(): """ Main eval loop: Iterates over all evaluation samples and saves the corresponding predictions as json and zip file. This is the format expected at https://competitions.codalab.org/competitions/21238#learn_the_details-overview """ parser = argparse.ArgumentParser( description="Evaluation on Freihand eval set.") parser.add_argument("-key", type=str, help="Add comet key of experiment to restore.") parser.add_argument( "-resnet_size", type=str, help="Resnet sizes", choices=["18", "34", "50", "101", "152"], default=50, ) parser.add_argument("--heatmap", action="store_true", help="Choose Resnet", default=False) parser.add_argument( "--palm_trained", action="store_true", help="Use when palm is regressed during training.", default=False, ) parser.add_argument( "-split", type=str, help="For debugging select val split", default="test", choices=["test", "val"], ) parser.add_argument("-checkpoint", type=str, help="selectign checkpoint", default="") args = parser.parse_args() model = load_model(args.key, args.resnet_size, args.heatmap, args.checkpoint) if args.split == "val": print( "DEBUG MODE ACTIVATED.\n Evaluation pipeline is executed on validation set" ) train_param = edict(read_json(TRAINING_CONFIG_PATH)) train_param.augmentation_flags.resize = True train_param.augmentation_flags.crop = True # train_param.augmentation_params.crop_margin = 1.5 train_param.augmentation_params.crop_box_jitter = [0.0, 0.0] augmenter = SampleAugmenter(train_param.augmentation_flags, train_param.augmentation_params) # Normalization for BGR mode. # transform = transforms.Compose( # [ # transforms.ToTensor(), # transforms.Normalize( # (0.485, 0.456, 0.406)[::-1], (0.229, 0.224, 0.225)[::-1] # ), # ] # ) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ]) data = F_DB(FREIHAND_DATA, split=args.split) xyz_pred = [] debug_mean = [] with torch.no_grad(): for i in tqdm(range(len(data))): joints3d_normalized = normalize_joints( model_refined_inference(model, data[i], augmenter, transform, args.palm_trained)) if args.split == "val": # DEBUG CODE: joints3d = joints3d_normalized * data.scale[data.indices[i] % 32560] debug_mean.append( torch.mean(torch.abs(joints3d - data[i]["joints3D"]))) else: joints3d = joints3d_normalized * data.scale[data.indices[i]] xyz_pred.append(JOINTS.ait_to_freihand(joints3d).tolist()) if args.split == "val": # DEBUG CODE: print( f"MAE 3d\nMean : {np.mean(debug_mean)}\nMax: { np.max(debug_mean)}" "\nMedian: { np.median(debug_mean)}") exit() verts = np.zeros((len(xyz_pred), 778, 3)).tolist() save_json([xyz_pred, verts], f"{args.key}_pred.json") subprocess.call( ["zip", "-j", f"{args.key}_pred.zip", f"{args.key}_pred.json"]) subprocess.call(["rm", f"{args.key}_pred.json"])
def evaluation():
    """evaluation"""
    print('********************** loading corpus **********************')
    s_lc = time.time()
    data_generator = DataGen(config)
    queries = read_query(config)
    print("loading corpus time (h):", (time.time() - s_lc) / 3600)

    print('********************** loading model **********************')
    s_lm = time.time()
    model_onehop_bert = ModelOneHop()
    param_dict = load_checkpoint(config.onehop_bert_path)
    load_param_into_net(model_onehop_bert, param_dict)
    model_twohop_bert = ModelTwoHop()
    param_dict2 = load_checkpoint(config.twohop_bert_path)
    load_param_into_net(model_twohop_bert, param_dict2)
    onehop = OneHopBert(config, model_onehop_bert)
    twohop = TwoHopBert(config, model_twohop_bert)
    print("loading model time (h):", (time.time() - s_lm) / 3600)

    print('********************** evaluation **********************')
    s_tr = time.time()
    f_dev = open(config.dev_path, 'rb')
    dev_data = json.load(f_dev)
    q_gold = {}
    q_2id = {}
    for onedata in dev_data:
        if onedata["question"] not in q_gold:
            q_gold[onedata["question"]] = [
                get_new_title(get_raw_title(item)) for item in onedata['path']
            ]
            q_2id[onedata["question"]] = onedata['_id']

    val, true_count, count, step = 0, 0, 0, 0
    batch_queries = split_queries(config, queries)[:-1]
    output_path = []
    for _, batch in enumerate(batch_queries):
        print("###step###: ", step)
        query = batch[0]
        temp_dict = {}
        temp_dict['q_id'] = q_2id[query]
        temp_dict['question'] = query
        gold_path = q_gold[query]

        # One-hop retrieval: run the batch through the model in chunks of 8
        # and concatenate the outputs
        input_ids_1, token_type_ids_1, input_mask_1 = data_generator.convert_onehop_to_features(batch)
        start = 0
        TOTAL = len(input_ids_1)
        split_chunk = 8
        while start < TOTAL:
            end = min(start + split_chunk - 1, TOTAL - 1)
            chunk_len = end - start + 1
            input_ids_1_ = Tensor(input_ids_1[start:start + chunk_len], mstype.int32)
            token_type_ids_1_ = Tensor(token_type_ids_1[start:start + chunk_len], mstype.int32)
            input_mask_1_ = Tensor(input_mask_1[start:start + chunk_len], mstype.int32)
            cls_out = onehop(input_ids_1_, token_type_ids_1_, input_mask_1_)
            if start == 0:
                out = cls_out
            else:
                out = P.Concat(0)((out, cls_out))
            start = end + 1
        out = P.Squeeze(1)(out)
        onehop_prob, onehop_index = P.TopK(sorted=True)(out, config.topk)
        onehop_prob = P.Softmax()(onehop_prob)
        sample, path_raw, last_out = data_generator.get_samples(query, onehop_index, onehop_prob)

        # Two-hop retrieval, same chunked inference pattern
        input_ids_2, token_type_ids_2, input_mask_2 = data_generator.convert_twohop_to_features(sample)
        start_2 = 0
        TOTAL_2 = len(input_ids_2)
        split_chunk = 8
        while start_2 < TOTAL_2:
            end_2 = min(start_2 + split_chunk - 1, TOTAL_2 - 1)
            chunk_len = end_2 - start_2 + 1
            input_ids_2_ = Tensor(input_ids_2[start_2:start_2 + chunk_len], mstype.int32)
            token_type_ids_2_ = Tensor(token_type_ids_2[start_2:start_2 + chunk_len], mstype.int32)
            input_mask_2_ = Tensor(input_mask_2[start_2:start_2 + chunk_len], mstype.int32)
            cls_out = twohop(input_ids_2_, token_type_ids_2_, input_mask_2_)
            if start_2 == 0:
                out_2 = cls_out
            else:
                out_2 = P.Concat(0)((out_2, cls_out))
            start_2 = end_2 + 1
        out_2 = P.Softmax()(out_2)
        last_out = Tensor(last_out, mstype.float32)
        out_2 = P.Mul()(out_2, last_out)

        val, true_count, topk_titles = eval_output(out_2, last_out, path_raw, gold_path,
                                                   val, true_count)
        temp_dict['topk_titles'] = topk_titles
        output_path.append(temp_dict)
        count += 1
        print("val:", val)
        print("count:", count)
        print("true count:", true_count)
        if count:
            print("PEM:", val / count)
        if true_count:
            print("true top8 PEM:", val / true_count)
        step += 1

    save_json(output_path, config.save_path, config.save_name)
    print("evaluation time (h):", (time.time() - s_tr) / 3600)
def run_train(sess, model, train_data, valid_data, saver, evaluator, summary_writer=None):
    flag = 0
    best_valid_result = 0.0
    valid_log_history = defaultdict(list)
    global_step = 0
    for i in range(config.num_epoch):
        logger.info(log_title('Train Epoch: {}'.format(i + 1)))
        steps = 0
        total_loss = 0.0
        total_accu = 0.0
        batch_iter = tqdm(list(make_batch_iter(list(zip(*train_data)),
                                               config.batch_size, shuffle=True)))
        for batch in batch_iter:
            topic, topic_len, triple, triple_len, src, src_len, tgt, tgt_len = make_batch_data(batch)
            _, loss, accu, global_step, summary = sess.run(
                [model.train_op, model.loss, model.accu, model.global_step, model.summary],
                feed_dict={
                    model.batch_size: len(topic),
                    model.topic: topic,
                    model.topic_len: topic_len,
                    model.triple: triple,
                    model.triple_len: triple_len,
                    model.src: src,
                    model.src_len: src_len,
                    model.tgt: tgt,
                    model.tgt_len: tgt_len,
                    model.training: True
                })
            steps += 1
            total_loss += loss
            total_accu += accu
            batch_iter.set_description('loss: {:>.4f} accuracy: {:>.4f}'.format(loss, accu))
            if global_step % args.log_steps == 0 and summary_writer is not None:
                summary_writer.add_summary(summary, global_step)
            if global_step % args.save_steps == 0:
                # evaluate saved models after pre-train epochs
                if i < args.pre_train_epochs:
                    saver.save(sess, config.model_file, global_step=global_step)
                else:
                    predicted_ids, valid_loss, valid_accu = run_evaluate(sess, model, valid_data)
                    logger.info('valid loss: {:>.4f}, valid accuracy: {:>.4f}'.format(
                        valid_loss, valid_accu))
                    save_outputs(predicted_ids, config.id_2_word,
                                 config.valid_data, config.valid_outputs)
                    valid_results = evaluator.evaluate(config.valid_data, config.valid_outputs,
                                                       config.to_lower)
                    # early stop
                    if valid_results['BLEU 4'] >= best_valid_result:
                        flag = 0
                        best_valid_result = valid_results['BLEU 4']
                        logger.info('saving model-{}'.format(global_step))
                        saver.save(sess, config.model_file, global_step=global_step)
                        save_json(valid_results, config.valid_results)
                    elif flag < args.early_stop:
                        flag += 1
                    elif args.early_stop:
                        return valid_log_history
                    for key, value in valid_results.items():
                        valid_log_history[key].append(value)
                    valid_log_history['loss'].append(valid_loss)
                    valid_log_history['accuracy'].append(valid_accu)
                    valid_log_history['global_step'].append(int(global_step))
        logger.info('train loss: {:>.4f}, train accuracy: {:>.4f}'.format(
            total_loss / steps, total_accu / steps))
    saver.save(sess, config.model_file, global_step=global_step)
    return valid_log_history
x="importance", y="feature", data=feature_importance.sort_values("mean_importance", ascending=False), ) plt.title("Model Features") plt.tight_layout() plt.savefig(output / "feature_importance.png") # =============================== # === Make submission # =============================== sample_submission = pd.read_csv(input_dir / "sample_submission.csv") submission_df = make_submission(test_pred, sample_submission) # =============================== # === Save # =============================== save_path = output / "output.json" output_dict["feature_importance"] = dict() output_dict["feature_importance"] = feature_importance_dict save_json(output_dict, save_path) np.save(output / "oof_preds.npy", oof_pred) np.save(output / "test_preds.npy", test_pred) config_name = args.output.split("/")[-1] submission_df.to_csv(output / f"{config_name}_sub.csv", index=False)
def eval_language_metrics(checkpoint, eval_data_loader, opt, model=None, eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting it to `test` would be cheating
    0, run inference
    1, get METEOR, BLEU1-4, CIDEr scores
    2, get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_json(json_res, res_filepath, save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]
        }
    else:  # yc2
        reference_files_map = {"val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}

    # COCO language evaluation
    eval_references = reference_files_map[eval_mode]
    lang_filepath = res_filepath.replace(".json", "_lang.json")
    eval_cmd = ["python", "para-evaluate.py", "-s", res_filepath, "-o", lang_filepath,
                "-v", "-r"] + eval_references
    subprocess.call(eval_cmd, cwd=opt.eval_tool_dir)

    # basic stats
    stat_filepath = res_filepath.replace(".json", "_stat.json")
    eval_stat_cmd = ["python", "get_caption_stat.py", "-s", res_filepath,
                     "-r", eval_references[0], "-o", stat_filepath, "-v"]
    subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

    # repetition evaluation
    rep_filepath = res_filepath.replace(".json", "_rep.json")
    eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", res_filepath,
                    "-r", eval_references[0], "-o", rep_filepath]
    subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

    # save results
    logger.info("Finished eval {}.".format(eval_mode))
    metric_filepaths = [lang_filepath, stat_filepath, rep_filepath]
    all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
    all_metrics_filepath = res_filepath.replace(".json", "_all_metrics.json")
    save_json(all_metrics, all_metrics_filepath, save_pretty=True)
    return all_metrics, [res_filepath, all_metrics_filepath]
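# merge_dicts is assumed to be a shallow dict-union helper along these lines
# (a sketch; later files' keys win on collision, the actual utility may differ):
def merge_dicts(list_dicts):
    merged_dict = {}
    for d in list_dicts:
        merged_dict.update(d)
    return merged_dict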
parser.add_argument("-ei", "--exp_id", default=None) parser.add_argument("-mf", "--metrics_flag", default=1, type=int) args = parser.parse_args() exp_list = [] for exp_group_name in args.exp_group_list: exp_list += exp_configs.EXP_GROUPS[exp_group_name] # loop over experiments for exp_dict in exp_list: exp_id = ut.hash_dict(exp_dict) if args.exp_id is not None and args.exp_id != exp_id: continue savedir = args.savedir_base + "/%s/" % exp_id os.makedirs(savedir, exist_ok=True) ut.save_json(savedir + "/exp_dict.json", exp_dict) # check if experiment exists if args.reset: if os.path.exists(savedir + "/score_list.pkl"): os.remove(savedir + "/score_list.pkl") if os.path.exists(savedir + "/run_dict.pkl"): os.remove(savedir + "/run_dict.pkl") # do trainval trainval(exp_dict=exp_dict, savedir=savedir, datadir=args.datadir, metrics_flag=args.metrics_flag)
def main():
    os.makedirs(config.temp_dir, exist_ok=True)
    os.makedirs(config.result_dir, exist_ok=True)
    os.makedirs(config.train_log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)
    init_logger(logging.INFO, 'temp.log.txt', 'w')

    logger.info('preparing data...')
    config.word_2_id, config.id_2_word = read_json_dict(config.vocab_dict)
    config.vocab_size = min(config.vocab_size, len(config.word_2_id))
    config.oov_vocab_size = min(config.oov_vocab_size, len(config.word_2_id) - config.vocab_size)

    embedding_matrix = None
    if args.do_train:
        if os.path.exists(config.glove_file):
            logger.info('loading embedding matrix from file: {}'.format(config.glove_file))
            embedding_matrix, config.word_em_size = load_glove_embedding(
                config.glove_file, list(config.word_2_id.keys()))
            logger.info('shape of embedding matrix: {}'.format(embedding_matrix.shape))
    else:
        if os.path.exists(config.glove_file):
            with open(config.glove_file, 'r', encoding='utf-8') as fin:
                line = fin.readline()
                config.word_em_size = len(line.strip().split()) - 1

    data_reader = DataReader(config)
    evaluator = Evaluator('tgt')

    logger.info('building model...')
    model = get_model(config, embedding_matrix)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        logger.info('loading data...')
        train_data = data_reader.read_train_data()
        valid_data = data_reader.read_valid_data()

        logger.info(log_title('Trainable Variables'))
        for v in tf.trainable_variables():
            logger.info(v)

        logger.info(log_title('Gradients'))
        for g in model.gradients:
            logger.info(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                logger.info('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir, sess.graph)
            valid_log_history = run_train(sess, model, train_data, valid_data,
                                          saver, evaluator, train_writer)
            save_json(valid_log_history,
                      os.path.join(config.result_dir, config.current_model,
                                   'valid_log_history.json'))

    if args.do_eval:
        logger.info('loading data...')
        valid_data = data_reader.read_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, valid_loss, valid_accu = run_evaluate(sess, model, valid_data)
                logger.info('average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'.format(
                    valid_loss, valid_accu))

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_word,
                             config.valid_data, config.valid_outputs)
                results = evaluator.evaluate(config.valid_data, config.valid_outputs,
                                             config.to_lower)
                save_json(results, config.valid_results)
            else:
                logger.info('model not found!')

    if args.do_test:
        logger.info('loading data...')
        test_data = data_reader.read_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids = run_test(sess, model, test_data)

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_word,
                             config.test_data, config.test_outputs)
                results = evaluator.evaluate(config.test_data, config.test_outputs,
                                             config.to_lower)
                save_json(results, config.test_results)
            else:
                logger.info('model not found!')
def main():
    os.makedirs(config.temp_dir, exist_ok=True)
    os.makedirs(config.result_dir, exist_ok=True)
    os.makedirs(config.train_log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)
    init_logger(logging.INFO)

    logger.info('loading dict...')
    config.src_2_id, config.id_2_src = read_json_dict(config.src_vocab_dict)
    config.src_vocab_size = min(config.src_vocab_size, len(config.src_2_id))
    config.tgt_2_id, config.id_2_tgt = read_json_dict(config.tgt_vocab_dict)
    config.tgt_vocab_size = min(config.tgt_vocab_size, len(config.tgt_2_id))

    data_reader = DataReader(config)
    evaluator = Evaluator('tgt')

    logger.info('building model...')
    model = get_model(config)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        logger.info('loading data...')
        train_data = data_reader.load_train_data()
        valid_data = data_reader.load_valid_data()

        logger.info(log_title('Trainable Variables'))
        for v in tf.trainable_variables():
            logger.info(v)

        logger.info(log_title('Gradients'))
        for g in model.gradients:
            logger.info(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                logger.info('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir, sess.graph)
            valid_log_history = run_train(sess, model, train_data, valid_data,
                                          saver, evaluator, train_writer)
            save_json(valid_log_history,
                      os.path.join(config.result_dir, config.current_model,
                                   'valid_log_history.json'))

    if args.do_eval:
        logger.info('loading data...')
        valid_data = data_reader.load_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, valid_loss, valid_accu = run_evaluate(sess, model, valid_data)
                logger.info('average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'.format(
                    valid_loss, valid_accu))

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_tgt,
                             config.valid_data, config.valid_outputs)
                results = evaluator.evaluate(config.valid_data, config.valid_outputs,
                                             config.to_lower)
                save_json(results, config.valid_results)
            else:
                logger.info('model not found!')

    if args.do_test:
        logger.info('loading data...')
        test_data = data_reader.load_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids = run_test(sess, model, test_data)

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_tgt,
                             config.test_data, config.test_outputs)
                results = evaluator.evaluate(config.test_data, config.test_outputs,
                                             config.to_lower)
                save_json(results, config.test_results)
            else:
                logger.info('model not found!')
def simulate(num_clusters, num_signatures, num_samples, random_seed):
    np.random.seed(random_seed)
    base_model = get_model(
        load_json(os.path.join(ROOT_DIR, 'data', 'simulated-data', 'base_model.json'))['parameters'])
    if num_clusters > base_model.num_clusters:
        raise ValueError('num_clusters cannot be larger than base_model.num_clusters ({})'.format(
            base_model.num_clusters))
    if num_signatures > base_model.num_topics:
        raise ValueError('num_signatures cannot be larger than base_model.num_topics ({})'.format(
            base_model.num_topics))

    msk_data, _ = get_data('MSK-ALL')
    msk_sizes = np.sum(msk_data, 1).astype('int')

    # Subsample clusters and signatures from the base model, renormalizing the weights
    clusters = np.random.choice(base_model.num_clusters, size=num_clusters,
                                replace=False, p=base_model.w)
    pi = base_model.pi[clusters]
    w = base_model.w[clusters]
    w /= w.sum()
    prob_sig = np.dot(w, pi)
    signatures = np.random.choice(base_model.num_topics, size=num_signatures,
                                  replace=False, p=prob_sig)
    pi = pi[:, signatures]
    pi /= pi.sum(1, keepdims=True)
    e = base_model.e[signatures]

    model = Mix(num_clusters, num_signatures, init_params={'w': w, 'pi': pi, 'e': e})
    sample_sizes = np.random.choice(msk_sizes, num_samples)
    clusters, signatures, mutations = model.sample(sample_sizes)

    curr_dir = os.path.join(ROOT_DIR, 'data', 'simulated-data',
                            '{}_{}_{}_{}'.format(num_clusters, num_signatures,
                                                 num_samples, random_seed))
    os.makedirs(curr_dir, exist_ok=True)

    # Save model, base data
    save_json(os.path.join(curr_dir, 'full_simulated'),
              {'clusters': clusters, 'signatures': signatures, 'mutations': mutations})
    parameters = model.get_params()
    parameters['w'] = parameters['w'].tolist()
    parameters['pi'] = parameters['pi'].tolist()
    parameters['e'] = parameters['e'].tolist()
    save_json(os.path.join(curr_dir, 'model'), parameters)

    # Transform the basic data into a mutation count matrix
    mutation_mat = np.zeros((num_samples, 96), dtype='int')
    for i in range(num_samples):
        a, b = np.unique(mutations[i], return_counts=True)
        mutation_mat[i, a] = b
    np.save(os.path.join(curr_dir, 'mutations'), mutation_mat)
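# Illustrative check of the counting step above: np.unique with
# return_counts=True fills one row of the 96-channel mutation count matrix.
import numpy as np
demo_row = np.zeros(96, dtype='int')
vals, counts = np.unique(np.array([3, 3, 95, 7]), return_counts=True)
demo_row[vals] = counts
assert demo_row[3] == 2 and demo_row[7] == 1 and demo_row[95] == 1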
if __name__ == "__main__":
    t_s = time.time()
    config = ThinkRetrieverConfig()
    pool = Pool(processes=config.device_num)
    results = []
    for device_id in range(config.device_num):
        results.append(pool.apply_async(evaluation, (device_id,)))
    print("Waiting for all subprocesses to finish...")
    pool.close()
    pool.join()

    # Aggregate the metrics returned by each device's evaluation process
    val_all, true_count_all, count_all = 0, 0, 0
    output_path_all = []
    for res in results:
        output = res.get()
        val_all += output['val']
        count_all += output['count']
        true_count_all += output['true_count']
        output_path_all += output['path']
    print("val:", val_all)
    print("count:", count_all)
    print("true count:", true_count_all)
    print("PEM:", val_all / count_all)
    print("true top8 PEM:", val_all / true_count_all)
    save_json(output_path_all, config.save_path, config.save_name)
    print("evaluation time (h):", (time.time() - t_s) / 3600)
def main():
    parser = argparse.ArgumentParser(description="translate.py")
    parser.add_argument("--eval_splits", type=str, nargs="+", default=["val", ],
                        choices=["val", "test"],
                        help="evaluate on val/test set, yc2 only has val")
    parser.add_argument("--res_dir", required=True, help="path to dir containing model .pt file")
    parser.add_argument("--batch_size", type=int, default=100, help="batch size")

    # beam search configs
    parser.add_argument("--use_beam", action="store_true",
                        help="use beam search, otherwise greedy search")
    parser.add_argument("--beam_size", type=int, default=2, help="beam size")
    parser.add_argument("--n_best", type=int, default=1,
                        help="stop searching when get n_best from beam search")
    parser.add_argument("--min_sen_len", type=int, default=5,
                        help="minimum length of the decoded sentences")
    parser.add_argument("--max_sen_len", type=int, default=30,
                        help="maximum length of the decoded sentences")
    parser.add_argument("--block_ngram_repeat", type=int, default=0,
                        help="block repetition of ngrams during decoding")
    parser.add_argument("--length_penalty_name", default="none",
                        choices=["none", "wu", "avg"], help="length penalty to use")
    parser.add_argument("--length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter (higher = longer generation)")
    parser.add_argument("--eval_tool_dir", type=str, default="./densevid_eval")
    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--seed", default=2019, type=int)
    parser.add_argument("--debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    # add some of the train configs
    train_opt = checkpoint["opt"]  # EDict(load_json(os.path.join(opt.res_dir, "model.cfg.json")))
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    print("train_opt", train_opt)

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name,
        opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir, "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]}
    else:  # yc2
        reference_files_map = {"val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}

    for eval_mode in opt.eval_splits:
        print("Start evaluating {}".format(eval_mode))
        # add 10 at max_n_sen to make the inference stage use all the segments
        eval_data_loader = get_data_loader(opt, eval_mode=eval_mode)
        eval_references = reference_files_map[eval_mode]

        # setup model
        translator = Translator(opt, checkpoint)

        pred_file = os.path.join(opt.res_dir, "{}_pred_{}.json".format(decoding_strategy, eval_mode))
        pred_file = os.path.abspath(pred_file)
        if not os.path.exists(pred_file):
            json_res = run_translate(eval_data_loader, translator, opt=opt)
            save_json(json_res, pred_file, save_pretty=True)
        else:
            print("Using existing prediction file at {}".format(pred_file))

        # COCO language evaluation
        lang_file = pred_file.replace(".json", "_lang.json")
        eval_command = ["python", "para-evaluate.py", "-s", pred_file, "-o", lang_file,
                        "-v", "-r"] + eval_references
        subprocess.call(eval_command, cwd=opt.eval_tool_dir)

        # basic stats
        stat_filepath = pred_file.replace(".json", "_stat.json")
        eval_stat_cmd = ["python", "get_caption_stat.py", "-s", pred_file,
                         "-r", eval_references[0], "-o", stat_filepath, "-v"]
        subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

        # repetition evaluation
        rep_filepath = pred_file.replace(".json", "_rep.json")
        eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", pred_file,
                        "-r", eval_references[0], "-o", rep_filepath]
        subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

        metric_filepaths = [lang_file, stat_filepath, rep_filepath]
        all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
        all_metrics_filepath = pred_file.replace(".json", "_all_metrics.json")
        save_json(all_metrics, all_metrics_filepath, save_pretty=True)

        print("pred_file {} lang_file {}".format(pred_file, lang_file))
        print("[Info] Finished {}.".format(eval_mode))