def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, required=True)
    parser.add_argument("--dset_name", type=str, default="anet", choices=["anet", "yc2"])
    parser.add_argument("--cache", type=str, default="./cache")
    parser.add_argument("--min_word_count", type=int, default=5)
    parser.add_argument("--raw_glove_path", type=str, help="downloaded glove vectors path")
    opt = parser.parse_args()

    if not os.path.exists(opt.cache):
        os.makedirs(opt.cache)

    # load, merge, clean, split data
    train_data = load_json(opt.train_path)
    all_sentences = flat_list_of_lists([v["sentences"] for k, v in train_data.items()])
    all_sentences = [nltk.tokenize.word_tokenize(sen.lower()) for sen in all_sentences]

    word2idx = build_vocab_idx(all_sentences, opt.min_word_count)
    print("[Info] Dumping the processed data to json file", opt.cache)
    word2idx_path = os.path.join(opt.cache, "{}_word2idx.json".format(opt.dset_name))
    save_json(word2idx, word2idx_path, save_pretty=True)
    print("[Info] Finished.")

    vocab_glove_path = os.path.join(opt.cache, "{}_vocab_glove.pt".format(opt.dset_name))
    extract_glove(word2idx, opt.raw_glove_path, vocab_glove_path)

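# `flat_list_of_lists` and `build_vocab_idx` are not defined in this snippet.
# A minimal sketch of the assumed behavior (flatten nested sentence lists;
# keep tokens occurring at least `min_word_count` times). The special-token
# names are illustrative assumptions, not this repo's actual vocabulary layout.
from collections import Counter


def flat_list_of_lists(list_of_lists):
    """Flatten a list of lists into a single list."""
    return [item for sublist in list_of_lists for item in sublist]


def build_vocab_idx(tokenized_sentences, min_word_count):
    """Map each sufficiently frequent word to an integer index."""
    counter = Counter(w for sen in tokenized_sentences for w in sen)
    word2idx = {"[PAD]": 0, "[UNK]": 1}  # assumed special tokens
    for word, count in counter.items():
        if count >= min_word_count:
            word2idx[word] = len(word2idx)
    return word2idx
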
def load_transform_data(data_path):
    data = load_json(data_path)
    transformed_data = []
    for v_id, cap in data.items():
        cap["v_id"] = v_id
        transformed_data.append(cap)
    return transformed_data

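# `load_json` / `save_json` are used throughout this file but defined
# elsewhere (e.g. a utils module). A minimal sketch of the assumed behavior;
# `save_pretty` is inferred from the call sites. Note that the call sites here
# come from different projects and disagree on argument order (some pass the
# path first, some the data first); this sketch follows the (data, path) order
# used with `save_pretty` above.
import json


def load_json(file_path):
    """Read a JSON file and return the parsed object."""
    with open(file_path, "r") as f:
        return json.load(f)


def save_json(data, file_path, save_pretty=False):
    """Write `data` to `file_path` as JSON, optionally indented."""
    with open(file_path, "w") as f:
        if save_pretty:
            json.dump(data, f, indent=4, sort_keys=True)
        else:
            json.dump(data, f)
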
def __init__(self, root, transform=None):
    self.root = root
    self.transform = transform
    index_path = os.path.join(root, 'index.json')
    self.index = load_json(index_path)
    self._modis_path = self._get_paths('modis')
    self._landsat_path = self._get_paths('landsat')

def __init__(self, dset_name, data_dir, video_feature_dir, duration_file, word2idx_path,
             max_t_len, max_v_len, max_n_sen, mode="train", recurrent=True, untied=False):
    self.dset_name = dset_name
    self.word2idx = load_json(word2idx_path)
    self.idx2word = {int(v): k for k, v in self.word2idx.items()}
    self.data_dir = data_dir  # contains the training data
    self.video_feature_dir = video_feature_dir  # a set of .h5 files
    self.duration_file = duration_file
    self.frame_to_second = self._load_duration()
    self.max_seq_len = max_v_len + max_t_len
    self.max_v_len = max_v_len
    self.max_t_len = max_t_len  # sentence
    self.max_n_sen = max_n_sen

    self.mode = mode
    self.recurrent = recurrent
    self.untied = untied
    assert not (self.recurrent and self.untied), "recurrent and untied cannot both be True"

    # data entries
    self.data = None
    self.set_data_mode(mode=mode)
    self.missing_video_names = []
    self.fix_missing()

    self.num_sens = None  # number of sentences for each video, set in self._load_data()

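# Hypothetical instantiation sketch for the dataset above. The class name
# `CaptionDataset`, the paths, and the length limits are all illustrative
# assumptions; only the parameter list comes from the __init__ signature.
dataset = CaptionDataset(  # assumed class name
    dset_name="anet",
    data_dir="./data",
    video_feature_dir="./video_features",
    duration_file="./data/durations.csv",
    word2idx_path="./cache/anet_word2idx.json",
    max_t_len=22, max_v_len=100, max_n_sen=6,
    mode="train", recurrent=True, untied=False,
)
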
def _load_data(self, data_path):
    logging.info("Loading data from {}".format(data_path))
    raw_data = load_json(data_path)
    data = []
    for k, line in tqdm(raw_data.items()):
        line["name"] = k
        line["timestamps"] = line["timestamps"][:self.max_n_sen]
        line["sentences"] = line["sentences"][:self.max_n_sen]
        data.append(line)

    if self.recurrent:  # recurrent: one example per video
        self.data = data
    else:  # non-recurrent: one example per single sentence
        single_sentence_data = []
        for d in sorted(data, key=lambda x: x["name"]):
            num_sen = min(self.max_n_sen, len(d["sentences"]))
            single_sentence_data.extend([{
                "duration": d["duration"],
                "name": d["name"],
                "timestamp": d["timestamps"][idx],
                "sentence": d["sentences"][idx]
            } for idx in range(num_sen)])
        self.data = single_sentence_data

    logging.info("Loading complete! {} examples".format(len(self)))

def process_loco(loco_dir='experiments/LOCO'):
    chromosomes = [str(i) for i in range(1, 23)]
    chromosomes.extend(['X', 'Y'])
    chromosomes = np.array(chromosomes)
    experiment_string = '{} with {} signatures'
    datasets = os.listdir(loco_dir)
    for dataset in datasets:
        print(dataset + ':')
        dataset_dir = os.path.join(loco_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(os.path.join(dataset_dir, signature_learning)):
                for num_sigs in os.listdir(os.path.join(dataset_dir, signature_learning, model)):
                    if signature_learning == 'denovo':
                        signatures_string = str(num_sigs)
                    else:
                        signatures_string = '{} known cosmic'.format(num_sigs)
                    curr_experiment_string = experiment_string.format(model, signatures_string)
                    experiment_dir = os.path.join(dataset_dir, signature_learning, model, num_sigs)
                    folds = np.array(os.listdir(experiment_dir))
                    no_dir_folds = [fold for fold in chromosomes if fold not in folds]
                    no_run_folds = []
                    experiment_score = 0
                    for fold in folds:
                        runs = os.listdir(os.path.join(experiment_dir, fold))
                        num_runs = len(runs)
                        if num_runs == 0:
                            no_run_folds.append(fold)
                            continue
                        train_scores = np.zeros(num_runs)
                        test_scores = np.zeros(num_runs)
                        for i, run in enumerate(runs):
                            file_path = os.path.join(experiment_dir, fold, run)
                            run_dict = load_json(file_path)
                            for s in run_dict['log-likelihood-train'].values():
                                train_scores[i] += s
                            for s in run_dict['log-likelihood-test'].values():
                                test_scores[i] += s
                        # Decide which run to use according to the log-likelihood on the train data
                        best_run = np.argmax(train_scores)
                        experiment_score += test_scores[best_run]
                    if len(no_run_folds) > 0 or len(no_dir_folds) > 0:
                        print(curr_experiment_string +
                              ' completely missing folds {} and missing runs for {}'.format(
                                  no_dir_folds, no_run_folds))
                    else:
                        print(curr_experiment_string + ' score is {}'.format(experiment_score))
        print('\n')

def read_mm(datadir, name):
    """ name can be one of {ts, diagnoses, labels, flat}. """
    info = load_json(Path(datadir) / (name + '_info.json'))
    dat_path = Path(datadir) / (name + '.dat')
    data = np.memmap(dat_path, dtype=np.float32, shape=tuple(info['shape']))
    return data, info

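# Usage sketch: `data` is a memory-mapped view backed by the .dat file, so
# slices are read lazily from disk. The directory path is an assumption based
# on the __main__ block further below; the shape comment is illustrative.
ts_data, ts_info = read_mm('data/eICU_mmap', 'ts')
print(ts_info['shape'])               # e.g. (num_patients, num_timesteps, num_features)
first_patient = np.array(ts_data[0])  # materialize a single record in memory
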
def train_model(args):
    params = load_json(args.param_path)
    serialization_dir = args.serialization_dir
    if not serialization_dir:
        param_filename = os.path.splitext(os.path.split(args.param_path)[1])[0]
        serialization_dir = os.path.join('checkpoints', param_filename)
    # `func` is not defined in this snippet; it is presumably the wrapped
    # training entry point from an enclosing scope (e.g. a decorator or dispatcher).
    return func(params, serialization_dir, args.recover, args.force)

def load_db_local_file(filename):
    try:
        print('Loading data from local file', filename)
        return utils.load_json(filename)
    except Exception as e:
        print('File not found at the specified path', filename)
        print('Error:', e)
        return -1

def _write_filtering_values_to_file(self, file_path, name):
    self._update_filtering_values()
    filter_values = {k: v for k, v in self.filter_values.items()
                     if v != "" and pd.notnull(v)}
    filter_values.pop("min_date", None)
    filter_values.pop("max_date", None)
    values = load_json(file_path)
    values[name] = filter_values
    save_json(file_path, values)
    return values

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", default="configurations/config.json",
                        help="Path to configuration.")
    args = parser.parse_args()

    print(f"Reading configuration from {args.config}")
    config = load_json(args.config)

    app = QApplication(sys.argv)
    mw = MainWindow(config)
    mw.show()
    app.exec()

def get_best_run(experiment_dir):
    runs = os.listdir(experiment_dir)
    best_score = -np.inf
    if len(runs) == 0:
        return ''
    best_run = runs[0]
    for run in runs:
        total_score = load_json(os.path.join(experiment_dir, run))['log-likelihood']
        if total_score > best_score:
            best_score = total_score
            best_run = run
    return os.path.join(experiment_dir, best_run)

def create_base_model():
    try:
        os.makedirs(os.path.join(ROOT_DIR, 'data/simulated-data'))
    except OSError:
        pass
    base_model = load_json(os.path.join(
        ROOT_DIR,
        'experiments/trained_models/MSK-ALL/denovo/mix_010clusters_006signatures/314179seed.json'))
    save_json(os.path.join(ROOT_DIR, 'data/simulated-data/base_model'), base_model)

def process_config(config_path, custom_split=None, custom_gpu_device=None, override_dotmap=None):
    config_json = load_json(config_path)
    if custom_split is not None:
        config_json['data_params']['split'] = str(custom_split)
    if custom_gpu_device is not None:
        config_json['gpu_device'] = int(custom_gpu_device)
    # so we don't need to manually change folders
    config_json['exp_name'] = '{}_{}'.format(
        config_json['exp_name'], config_json['data_params']['split'])
    return _process_config(config_json, override_dotmap=override_dotmap)

def plot_sig_correlations(dataset, num_sigs, beta_loss=2, plot_title=True, save_plot=True):
    cosmic_signatures = get_cosmic_signatures()
    mix_dir = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/denovo'.format(dataset))
    scores_dict = get_best_model(mix_dir, return_params=True)
    BIC_scores = scores_dict['BIC_scores']
    num_signatures = scores_dict['num_signatures']
    model_paths = scores_dict['model_paths']
    signatures_dict = {}

    # MIX signatures
    indices = num_signatures == num_sigs
    best_model_path = model_paths[indices][np.argmin(BIC_scores[indices])]
    e = np.array(load_json(best_model_path)['parameters']['e'])
    signatures_dict['MIX'] = e.copy()

    # NMF signatures
    data, _ = get_data(dataset)
    e = learn_NMF(data, num_sigs, beta_loss=beta_loss)
    signatures_dict['NMF'] = e.copy()

    # clustered-NMF signatures (if needed)
    if dataset == 'MSK-ALL':
        data, _ = get_data('clustered-MSK-ALL')
        e = learn_NMF(data, num_sigs, beta_loss=beta_loss)
        signatures_dict['clustered-NMF'] = e.copy()

    plt.rcParams.update({'font.size': 12})
    x_axis = np.array([str(i + 1) for i in range(num_sigs)])
    plt.axhline(0.80, color='grey', linestyle='--', label='_nolegend_')
    legends = []
    for i, model in enumerate(signatures_dict.keys()):
        legends.append(model)
        e = signatures_dict[model]
        sigs, corrs = get_signatures_correlations(e, cosmic_signatures)
        sigs = sigs[np.argsort(-corrs)]
        corrs = corrs[np.argsort(-corrs)]
        curr_x_axis = x_axis
        color = 'C{}'.format(i)
        plt.plot(curr_x_axis, corrs, '.-', color=color)
        # use a separate index so the annotation loop does not shadow `i`
        for j in range(len(sigs)):
            plt.annotate(str(sigs[j] + 1), (j, corrs[j] + 0.002), color=color)
        print('{} - {} - {} - {}'.format(model, sigs.tolist(), corrs.tolist(), sum(corrs)))
    plt.yticks(np.arange(2, 6) * 0.2)
    plt.ylabel('Cosine similarity', fontsize='large')
    plt.xlabel('Rank of signature', fontsize='large')
    plt.legend(legends, loc='lower left')
    if plot_title:
        plt.title('{} signatures'.format(num_sigs))
    if save_plot:
        plt.savefig(os.path.join(ROOT_DIR, 'results', 'signatures_similarity',
                                 '{}-signatures.pdf'.format(num_sigs)))

def prepare_prediction_dir(trained_models_dir, prediction_dir):
    datasets = os.listdir(trained_models_dir)
    prediction_dir = os.path.join(prediction_dir, 'prediction')
    for dataset in datasets:
        print(dataset)
        dataset_dir = os.path.join(trained_models_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(os.path.join(dataset_dir, signature_learning)):
                dataset_path = os.path.join(prediction_dir, dataset, signature_learning, model)
                try:
                    os.makedirs(dataset_path)
                except OSError:
                    pass
                data, _ = get_data_by_model_name(dataset, model)
                json_data = {}
                for sample, sample_data in data.items():
                    json_data[sample] = {}
                    for chrom, chrom_data in sample_data.items():
                        json_data[sample][chrom] = {}
                        json_data[sample][chrom]['Sequence'] = chrom_data['Sequence'].tolist()
                        json_data[sample][chrom]['StrandInfo'] = chrom_data['StrandInfo'].tolist()
                save_json(os.path.join(dataset_path, 'data'), json_data)
                del json_data
                for num_sigs in os.listdir(os.path.join(dataset_dir, signature_learning, model)):
                    num_sig_dir = os.path.join(dataset_path, num_sigs)
                    try:
                        os.makedirs(num_sig_dir)
                    except OSError:
                        pass
                    experiment_dir = os.path.join(dataset_dir, signature_learning, model, num_sigs)
                    runs = os.listdir(experiment_dir)
                    for run in runs:
                        model_parameters = load_json(os.path.join(experiment_dir, run))['parameters']
                        # also catches NaN: comparison with NaN is always False
                        if not model_parameters['e'][0][0] >= 0:
                            print('There was a bug in run {}'.format(os.path.join(experiment_dir, run)))
                        prediction = predict_hidden_variables(data, model_parameters)
                        save_json(os.path.join(num_sig_dir, run), prepare_data_to_json(prediction))
    print('\n')

def process_config(config_path, override_dotmap=None, exp_name_suffix=None):
    """
    Processes config file:
        1) Converts it to a DotMap
        2) Creates experiments path and required subdirs
        3) Sets up logging
    """
    config_json = load_json(config_path)
    config = DotMap(config_json)

    if override_dotmap is not None:
        config.update(override_dotmap)
    if exp_name_suffix is not None:
        config.exp_name = f'{config.exp_name}_{exp_name_suffix}'

    print("Loaded configuration: ")
    pprint(config)
    print()
    print(" *************************************** ")
    print(" Running experiment {}".format(config.exp_name))
    print(" *************************************** ")
    print()

    exp_base = config.exp_base
    exp_dir = os.path.join(exp_base, "experiments", config.exp_name)

    # create some important directories to be used for the experiment
    config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    config.log_dir = os.path.join(exp_dir, "logs/")
    config.summary_dir = os.path.join(exp_dir, "summaries/")
    config.exp_dir = exp_dir

    # will not create if already existing
    makedirs([
        config.checkpoint_dir,
        config.log_dir,
        config.summary_dir,
    ])

    # save config to experiment dir
    config_out = os.path.join(exp_dir, 'config.json')
    save_json(config.toDict(), config_out)

    # set up logging in the project
    setup_logging(config.log_dir)
    logging.getLogger().info("Configurations and directories successfully set up.")

    return config

def get_best_model(dataset_dir, return_model=False, return_params=False):
    if len(os.listdir(dataset_dir)) == 0:
        return None
    dataset = os.path.split(os.path.split(dataset_dir)[0])[-1]
    data, _ = get_data(dataset)
    num_data_points = np.sum(data)
    models = []
    BIC_scores = []
    sigs = []
    clusters = []
    for model in os.listdir(dataset_dir):
        experiment_dir = os.path.join(dataset_dir, model)
        best_run = get_best_run(experiment_dir)
        if len(best_run) > 0:
            best_score = load_json(best_run)['log-likelihood']
            num_sigs = int(model.split('_')[2][:3])
            num_clusters = int(model.split('_')[1][:3])
            num_params = (num_clusters - 1) + (num_sigs - 1) * num_clusters + (96 - 1) * num_sigs
            models.append(best_run)
            clusters.append(num_clusters)
            sigs.append(num_sigs)
            # BIC = ln(n) * k - 2 * ln(L)
            BIC_scores.append(np.log(num_data_points) * num_params - 2 * best_score)

    models = np.array(models)
    BIC_scores = np.array(BIC_scores)
    sigs = np.array(sigs, dtype='int')
    clusters = np.array(clusters, dtype='int')

    best_model = models[np.argmin(BIC_scores)]
    if return_model:
        return get_model(load_json(best_model)['parameters'])
    if return_params:
        return {'BIC_scores': BIC_scores, 'num_clusters': clusters,
                'model_paths': models, 'num_signatures': sigs}
    return best_model

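# Sanity-check sketch of the BIC computation above: `num_params` counts the
# free parameters of a mixture with `num_clusters` cluster weights, per-cluster
# signature exposures, and `num_sigs` signatures over 96 mutation categories
# (each probability vector loses one degree of freedom). The numbers below are
# made up for illustration.
def bic(log_likelihood, num_params, num_data_points):
    return np.log(num_data_points) * num_params - 2 * log_likelihood


k = (10 - 1) + (6 - 1) * 10 + (96 - 1) * 6  # e.g. 10 clusters, 6 signatures
print(bic(log_likelihood=-1.2e6, num_params=k, num_data_points=3.5e5))
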
def _get_file_name_root(self, coordinate, date):
    """Loads metadata to extract the naming root of band files
    (e.g. 'MCD43A4.A2018001.h18v04.006.2018010031310' in the docstring example)

    Args:
        coordinate (tuple[int]): modis coordinate as (horizontal tile, vertical tile)
        date (str): date formatted as yyyy-mm-dd

    Returns:
        type: str
    """
    infos_path = self.get_path_to_infos(coordinate, date)
    meta_data = load_json(infos_path)
    producer_granule_id = meta_data['producer_granule_id']
    file_name_root = '.'.join(producer_granule_id.split('.')[:-1])
    return file_name_root

def load_pretrained_model(self):
    base_dir = self.config.pretrain_model.exp_dir
    checkpoint_name = self.config.pretrain_model.checkpoint_name

    config_path = os.path.join(base_dir, 'config.json')
    config_json = load_json(config_path)
    config = DotMap(config_json)

    SystemClass = globals()[config.system]
    system = SystemClass(config)
    checkpoint_file = os.path.join(base_dir, 'checkpoints', checkpoint_name)
    checkpoint = torch.load(checkpoint_file, map_location=self.device)
    system.load_state_dict(checkpoint['state_dict'])

    # freeze the pretrained encoder
    encoder = system.model.eval()
    for param in encoder.parameters():
        param.requires_grad = False

    return encoder, config

def add_to_drop_data(self, event):
    col_index = self.table_view.currentIndex().column()
    row_index = self.table_view.currentIndex().row()
    column = self.table_data_sorted.columns[col_index]
    content = self.table_data_sorted.iloc[row_index, col_index]
    try:
        drop_data = load_json(self.config["paths"]["drop_data"])
        values = drop_data.get(column, None)
        if values:
            drop_data[column] = values + [content]
        else:
            drop_data[column] = [content]
        save_json(self.config["paths"]["drop_data"], drop_data)
        self.drop_data_added_signal.emit()
    except Exception as e:
        print(e)
        show_warning("Drop data addition failure", "Something went wrong")

def read_params_from_file(arg_dict, overwrite=False):
    """ Read params defined in config_file (paths.py by default). """
    if '/' not in arg_dict['config_file']:
        config_path = Path(sys.path[0]) / arg_dict['config_file']
    else:
        config_path = Path(arg_dict['config_file'])
    data = load_json(config_path)
    arg_dict.pop('config_file')

    if not overwrite:
        for key, value in data.items():
            if isinstance(value, list) and (key in arg_dict):
                for v in value:
                    arg_dict[key].append(v)
            elif (key not in arg_dict) or (arg_dict[key] is None):
                arg_dict[key] = value
    else:
        for key, value in data.items():
            arg_dict[key] = value

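# Usage sketch: merge defaults from a JSON config into a CLI-style arg dict
# without clobbering explicitly set values. The file name and keys below are
# illustrative assumptions; the function mutates `args` in place.
args = {'config_file': 'paths.json', 'data_dir': None, 'tags': ['a']}
read_params_from_file(args)  # fills data_dir, extends the tags list
read_params_from_file({'config_file': 'paths.json'}, overwrite=True)  # config wins everywhere
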
def train_model(dataset, num_clusters, use_cosmic, num_signatures, random_seed,
                max_iterations, epsilon, out_dir):
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    dataset_name = dataset
    data, active_signatures = get_data(dataset)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, '
              'using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    model_name = 'mix_' + str(num_clusters).zfill(3) + 'clusters' + '_' + \
                 str(num_signatures).zfill(3) + 'signatures'
    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name)
    try:
        os.makedirs(out_dir)
    except OSError:
        pass

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = out_dir + "/" + str(random_seed) + 'seed'
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} already exists'.format(
            dataset_name, model_name, use_cosmic, num_signatures, random_seed))
        save_json(out_file, load_json(out_file + '.json'))
        return

    model, ll = train_mix(data, num_clusters, num_signatures, signatures,
                          random_seed, epsilon, max_iterations)
    parameters = model.get_params()
    parameters['w'] = parameters['w'].tolist()
    parameters['pi'] = parameters['pi'].tolist()
    parameters['e'] = parameters['e'].tolist()
    out = {'log-likelihood': ll, 'parameters': parameters}
    save_json(out_file, out)

def setup_index(self, patch_idx, patch_bounds):
    """Initializes generation index as described above (this is the
    to-be `index.json`). If it already exists, loads it instead.

    Returns:
        type: dict
    """
    index_path = self._get_index_path(patch_idx)
    if os.path.exists(index_path):
        index = load_json(index_path)
    else:
        index = {
            'features': {
                'patch_idx': patch_idx,
                'patch_bounds': patch_bounds,
                'horizon': 0
            },
            'files': dict()
        }
    return index

def process_trained_models(trained_models_dir='experiments/trained_models'):
    datasets = os.listdir(trained_models_dir)
    for dataset in datasets:
        print(dataset + ':')
        dataset_dir = os.path.join(trained_models_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(os.path.join(dataset_dir, signature_learning)):
                for num_sigs in os.listdir(os.path.join(dataset_dir, signature_learning, model)):
                    experiment_dir = os.path.join(dataset_dir, signature_learning, model, num_sigs)
                    runs = os.listdir(experiment_dir)
                    print(experiment_dir)
                    for run in runs:
                        scores = load_json(os.path.join(experiment_dir, run))['log-likelihood']
                        total_score = 0
                        for score in scores.values():
                            total_score += score
                        print(run, total_score)
        print('\n')

def __init__(self, weights_file, classes_file):
    self.classes = load_json(classes_file)
    # allow_pickle is required on newer NumPy to load pickled object arrays
    self.weights = np.load(weights_file, encoding='latin1', allow_pickle=True).item()

def eval_language_metrics(checkpoint, eval_data_loader, opt, model=None, eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting it to `test` would be cheating
    0, run inference
    1, get METEOR, BLEU1-4, CIDEr scores
    2, get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_json(json_res, res_filepath, save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]
        }
    else:  # yc2
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]
        }

    # COCO language evaluation
    eval_references = reference_files_map[eval_mode]
    lang_filepath = res_filepath.replace(".json", "_lang.json")
    eval_cmd = ["python", "para-evaluate.py", "-s", res_filepath, "-o", lang_filepath,
                "-v", "-r"] + eval_references
    subprocess.call(eval_cmd, cwd=opt.eval_tool_dir)

    # basic stats
    stat_filepath = res_filepath.replace(".json", "_stat.json")
    eval_stat_cmd = ["python", "get_caption_stat.py", "-s", res_filepath,
                     "-r", eval_references[0], "-o", stat_filepath, "-v"]
    subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

    # repetition evaluation
    rep_filepath = res_filepath.replace(".json", "_rep.json")
    eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", res_filepath,
                    "-r", eval_references[0], "-o", rep_filepath]
    subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

    # save results
    logger.info("Finished eval {}.".format(eval_mode))
    metric_filepaths = [lang_filepath, stat_filepath, rep_filepath]
    all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
    all_metrics_filepath = res_filepath.replace(".json", "_all_metrics.json")
    save_json(all_metrics, all_metrics_filepath, save_pretty=True)
    return all_metrics, [res_filepath, all_metrics_filepath]

    # (tail of a conversion function whose beginning is not shown, presumably `convert_into_mmap`)
    info['columns'] = list(df)[1:]
    write_json(info, Path(save_dir) / f'{csv_name}_info.json')
    print(info)


if __name__ == '__main__':
    paths = load_json('paths.json')
    data_dir = paths['eICU_path']
    save_dir = paths['data_dir']
    print(f'Load eICU processed data from {data_dir}')
    print(f'Saving mmap data in {save_dir}')
    print('--' * 30)
    Path(save_dir).mkdir(exist_ok=True)

    print('** Converting time series **')
    convert_timeseries_into_mmap(data_dir, save_dir)

    for csv_name in ['flat', 'diagnoses', 'labels']:
        print(f'** Converting {csv_name} **')
        convert_into_mmap(data_dir, save_dir, csv_name)
        print('--' * 30)

    print(f'Done! Saved data in {save_dir}')

def compare_panel_clusters():
    np.random.seed(1359)
    # full_dataset, panel_dataset = 'BRCA-panel-full', 'BRCA-panel'
    full_dataset, panel_dataset = 'nature2019-full', 'nature2019-panel'
    full_data, active_signatures = get_data(full_dataset)
    panel_data, _ = get_data(panel_dataset)
    signatures = get_cosmic_signatures()[active_signatures]
    num_samples = len(full_data)

    full_data_exposures = stack_nnls(full_data, signatures)
    full_data_exposures_dists = cosine_similarity(full_data_exposures)

    corrs = []
    models = []
    relations = []
    for model in ['MIX', 'SigMA']:
        if model == 'MIX':
            d = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/refit'.format('BRCA-panel'))
            mix = get_model(load_json(get_best_model(d))['parameters'])
            clusters = np.argmax(mix.soft_cluster(full_data), 1)
        elif model == 'SigMA':
            # d = os.path.join(ROOT_DIR, 'data/ICGC-BRCA/out-sigma-brca-panel.tsv')
            d = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(d, sep='\t')
            # in case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(d, sep=',')
            clusters = all_df['categ'].values
            unique_clusters = np.unique(clusters)
            cluster_to_num = {}
            for i, c in enumerate(unique_clusters):
                cluster_to_num[c] = i
            clusters = np.array([cluster_to_num[c] for c in clusters])
        else:
            raise ValueError('Unknown model: {}'.format(model))

        dists_in_clusters = []
        dists_out_clusters = []
        for i in range(num_samples):
            for j in range(i + 1, num_samples):
                if clusters[i] == clusters[j]:
                    dists_in_clusters.append(full_data_exposures_dists[i, j])
                else:
                    dists_out_clusters.append(full_data_exposures_dists[i, j])

        dists_in_clusters = np.array(dists_in_clusters)
        dists_out_clusters = np.array(dists_out_clusters)
        dists_in_clusters = np.random.choice(dists_in_clusters, 200, replace=False)
        dists_out_clusters = np.random.choice(dists_out_clusters, 200, replace=False)
        corrs.extend(dists_in_clusters)
        corrs.extend(dists_out_clusters)
        models.extend([model] * len(dists_out_clusters) * 2)
        relations.extend(['Intra-cluster pairs'] * len(dists_out_clusters))
        relations.extend(['Inter-cluster pairs'] * len(dists_out_clusters))
        print(model, len(np.unique(clusters)))
        print(ranksums(dists_in_clusters, dists_out_clusters),
              np.mean(dists_in_clusters), np.mean(dists_out_clusters))

    df = {'Cosine similarity': corrs, 'model': models, 'relation': relations}
    df = pd.DataFrame(df)
    sns.violinplot(x='relation', y='Cosine similarity', hue='model', data=df,
                   split=True, inner='stick')
    plt.xlabel('')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'clusters_quality', 'clusters_quality.pdf'))

def main():
    parser = argparse.ArgumentParser(description="translate.py")
    parser.add_argument("--eval_splits", type=str, nargs="+", default=["val"],
                        choices=["val", "test"],
                        help="evaluate on val/test set, yc2 only has val")
    parser.add_argument("--res_dir", required=True, help="path to dir containing model .pt file")
    parser.add_argument("--batch_size", type=int, default=100, help="batch size")

    # beam search configs
    parser.add_argument("--use_beam", action="store_true",
                        help="use beam search, otherwise greedy search")
    parser.add_argument("--beam_size", type=int, default=2, help="beam size")
    parser.add_argument("--n_best", type=int, default=1,
                        help="stop searching when get n_best from beam search")
    parser.add_argument("--min_sen_len", type=int, default=5,
                        help="minimum length of the decoded sentences")
    parser.add_argument("--max_sen_len", type=int, default=30,
                        help="maximum length of the decoded sentences")
    parser.add_argument("--block_ngram_repeat", type=int, default=0,
                        help="block repetition of ngrams during decoding")
    parser.add_argument("--length_penalty_name", default="none",
                        choices=["none", "wu", "avg"], help="length penalty to use")
    parser.add_argument("--length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter (higher = longer generation)")
    parser.add_argument("--eval_tool_dir", type=str, default="./densevid_eval")
    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--seed", default=2019, type=int)
    parser.add_argument("--debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    # add some of the train configs
    train_opt = checkpoint["opt"]  # EDict(load_json(os.path.join(opt.res_dir, "model.cfg.json")))
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    print("train_opt", train_opt)

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name,
        opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir, "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]}
    else:  # yc2
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}

    for eval_mode in opt.eval_splits:
        print("Start evaluating {}".format(eval_mode))

        # add 10 at max_n_sen to make the inference stage use all the segments
        eval_data_loader = get_data_loader(opt, eval_mode=eval_mode)
        eval_references = reference_files_map[eval_mode]

        # setup model
        translator = Translator(opt, checkpoint)

        pred_file = os.path.join(opt.res_dir, "{}_pred_{}.json".format(decoding_strategy, eval_mode))
        pred_file = os.path.abspath(pred_file)
        if not os.path.exists(pred_file):
            json_res = run_translate(eval_data_loader, translator, opt=opt)
            save_json(json_res, pred_file, save_pretty=True)
        else:
            print("Using existing prediction file at {}".format(pred_file))

        # COCO language evaluation
        lang_file = pred_file.replace(".json", "_lang.json")
        eval_command = ["python", "para-evaluate.py", "-s", pred_file, "-o", lang_file,
                        "-v", "-r"] + eval_references
        subprocess.call(eval_command, cwd=opt.eval_tool_dir)

        # basic stats
        stat_filepath = pred_file.replace(".json", "_stat.json")
        eval_stat_cmd = ["python", "get_caption_stat.py", "-s", pred_file,
                         "-r", eval_references[0], "-o", stat_filepath, "-v"]
        subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

        # repetition evaluation
        rep_filepath = pred_file.replace(".json", "_rep.json")
        eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", pred_file,
                        "-r", eval_references[0], "-o", rep_filepath]
        subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

        metric_filepaths = [lang_file, stat_filepath, rep_filepath]
        all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
        all_metrics_filepath = pred_file.replace(".json", "_all_metrics.json")
        save_json(all_metrics, all_metrics_filepath, save_pretty=True)

        print("pred_file {} lang_file {}".format(pred_file, lang_file))
        print("[Info] Finished {}.".format(eval_mode))