def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, required=True)
    parser.add_argument("--dset_name",
                        type=str,
                        default="anet",
                        choices=["anet", "yc2"])
    parser.add_argument("--cache", type=str, default="./cache")
    parser.add_argument("--min_word_count", type=int, default=5)
    parser.add_argument("--raw_glove_path",
                        type=str,
                        help="downloaded glove vectors path")

    opt = parser.parse_args()
    if not os.path.exists(opt.cache):
        os.makedirs(opt.cache)

    # load, merge, clean, split data
    train_data = load_json(opt.train_path)
    all_sentences = flat_list_of_lists(
        [v["sentences"] for k, v in train_data.items()])
    all_sentences = [
        nltk.tokenize.word_tokenize(sen.lower()) for sen in all_sentences
    ]
    word2idx = build_vocab_idx(all_sentences, opt.min_word_count)
    print("[Info] Dumping the processed data to json file", opt.cache)
    word2idx_path = os.path.join(opt.cache,
                                 "{}_word2idx.json".format(opt.dset_name))
    save_json(word2idx, word2idx_path, save_pretty=True)
    print("[Info] Finish.")

    vocab_glove_path = os.path.join(opt.cache,
                                    "{}_vocab_glove.pt".format(opt.dset_name))
    extract_glove(word2idx, opt.raw_glove_path, vocab_glove_path)
def load_transform_data(data_path):
    data = load_json(data_path)
    transformed_data = []
    for v_id, cap in data.items():
        cap["v_id"] = v_id
        transformed_data.append(cap)
    return transformed_data
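
A minimal usage sketch (the input path is hypothetical); each value dict gains
its original key under "v_id":

entries = load_transform_data("captions/train.json")
print(entries[0]["v_id"])
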
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        index_path = os.path.join(root, 'index.json')
        self.index = load_json(index_path)
        self._modis_path = self._get_paths('modis')
        self._landsat_path = self._get_paths('landsat')
Example #4
    def __init__(self, dset_name, data_dir, video_feature_dir, duration_file, word2idx_path,
                 max_t_len, max_v_len, max_n_sen, mode="train", recurrent=True, untied=False):
        self.dset_name = dset_name
        self.word2idx = load_json(word2idx_path)
        self.idx2word = {int(v): k for k, v in self.word2idx.items()}
        self.data_dir = data_dir  # containing training data
        self.video_feature_dir = video_feature_dir  # a set of .h5 files
        self.duration_file = duration_file
        self.frame_to_second = self._load_duration()
        self.max_seq_len = max_v_len + max_t_len
        self.max_v_len = max_v_len
        self.max_t_len = max_t_len  # max text length per sentence
        self.max_n_sen = max_n_sen

        self.mode = mode
        self.recurrent = recurrent
        self.untied = untied
        assert not (self.recurrent and self.untied), "recurrent and untied cannot both be True"

        # data entries
        self.data = None
        self.set_data_mode(mode=mode)
        self.missing_video_names = []
        self.fix_missing()

        self.num_sens = None  # number of sentences for each video, set in self._load_data()
    def _load_data(self, data_path):
        logging.info("Loading data from {}".format(data_path))
        raw_data = load_json(data_path)
        data = []
        for k, line in tqdm(raw_data.items()):
            line["name"] = k
            line["timestamps"] = line["timestamps"][:self.max_n_sen]
            line["sentences"] = line["sentences"][:self.max_n_sen]
            data.append(line)

        if self.recurrent:  # recurrent
            self.data = data
        else:  # non-recurrent single sentence
            single_sentence_data = []
            for d in sorted(data, key=lambda x: x['name']):
                num_sen = min(self.max_n_sen, len(d["sentences"]))
                single_sentence_data.extend([{
                    "duration": d["duration"],
                    "name": d["name"],
                    "timestamp": d["timestamps"][idx],
                    "sentence": d["sentences"][idx]
                } for idx in range(num_sen)])
            self.data = single_sentence_data

        logging.info("Loading complete! {} examples".format(len(self)))
Example #6
def process_loco(loco_dir='experiments/LOCO'):
    chromosomes = [str(i) for i in range(1, 23)]
    chromosomes.extend(['X', 'Y'])
    chromosomes = np.array(chromosomes)

    experiment_string = '{} with {} signatures'

    datasets = os.listdir(loco_dir)
    for dataset in datasets:
        print(dataset + ':')
        dataset_dir = os.path.join(loco_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(
                    os.path.join(dataset_dir, signature_learning)):
                for num_sigs in os.listdir(
                        os.path.join(dataset_dir, signature_learning, model)):
                    if signature_learning == 'denovo':
                        signatures_string = str(num_sigs)
                    else:
                        signatures_string = '{} known cosmic'.format(num_sigs)
                    curr_experiment_string = experiment_string.format(
                        model, signatures_string)
                    experiment_dir = os.path.join(dataset_dir,
                                                  signature_learning, model,
                                                  num_sigs)
                    folds = np.array(os.listdir(experiment_dir))
                    no_dir_folds = [
                        fold for fold in chromosomes if fold not in folds
                    ]
                    no_run_folds = []
                    experiment_score = 0
                    for fold in folds:
                        runs = os.listdir(os.path.join(experiment_dir, fold))
                        num_runs = len(runs)
                        if num_runs == 0:
                            no_run_folds.append(fold)
                            continue
                        train_scores = np.zeros(num_runs)
                        test_scores = np.zeros(num_runs)
                        for i, run in enumerate(runs):
                            file_path = os.path.join(experiment_dir, fold, run)
                            run_dict = load_json(file_path)
                            for s in run_dict['log-likelihood-train'].values():
                                train_scores[i] += s
                            for s in run_dict['log-likelihood-test'].values():
                                test_scores[i] += s

                        # decide which run to use according to the log-likelihood on the training data
                        best_run = np.argmax(train_scores)
                        experiment_score += test_scores[best_run]
                    if len(no_run_folds) > 0 or len(no_dir_folds) > 0:
                        print(
                            curr_experiment_string +
                            ' completely missing folds {} and missing runs for {}'
                            .format(no_dir_folds, no_run_folds))
                    else:
                        print(curr_experiment_string +
                              ' score is {}'.format(experiment_score))
        print('\n')
Example #7
def read_mm(datadir, name):
    """
    name can be one of {ts, diagnoses, labels, flat}.
    """
    info = load_json(Path(datadir) / (name + '_info.json'))
    dat_path = Path(datadir) / (name + '.dat')
    data = np.memmap(dat_path, dtype=np.float32, shape=tuple(info['shape']))
    return data, info
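
A minimal usage sketch (the data directory is hypothetical); the returned
array is memory-mapped, so it is read lazily from disk:

labels, info = read_mm("./mmap_data", "labels")
print(labels.shape, tuple(info["shape"]))
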
Example #8
def train_model(args):
    params = load_json(args.param_path)
    serialization_dir = args.serialization_dir
    if not serialization_dir:
        param_filename = os.path.splitext(os.path.split(args.param_path)[1])[0]
        serialization_dir = os.path.join('checkpoints', param_filename)
    # `func` is assumed to be supplied by the enclosing scope (e.g. the
    # decorated training entry point this snippet was excerpted from)
    return func(params, serialization_dir, args.recover,
                args.force)
Example #9
def load_db_local_file(filename):
    try:
        print('Loading data from local file', filename)
        return utils.load_json(filename)
    except Exception as e:
        print('Could not find the file at the specified path', filename)
        print('Error:', e)
        return -1
Example #10
    def _write_filtering_values_to_file(self, file_path, name):
        self._update_filtering_values()
        filter_values = {k: v for k, v in self.filter_values.items() if v != "" and pd.notnull(v)}
        filter_values.pop("min_date", None)
        filter_values.pop("max_date", None)
        values = load_json(file_path)
        values[name] = filter_values
        save_json(file_path, values)
        return values
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", default="configurations/config.json", help="Path to configuration.")
    args = parser.parse_args()
    print(f"Reading configuration from {args.config}")
    config = load_json(args.config)
    app = QApplication(sys.argv)
    mw = MainWindow(config)
    mw.show()
    app.exec()
Example #12
def get_best_run(experiment_dir):
    runs = os.listdir(experiment_dir)
    best_score = -np.inf
    if len(runs) == 0:
        return ''
    best_run = runs[0]
    for run in runs:
        total_score = load_json(os.path.join(experiment_dir, run))['log-likelihood']
        if total_score > best_score:
            best_score = total_score
            best_run = run
    return os.path.join(experiment_dir, best_run)
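
A minimal usage sketch (the directory layout is hypothetical); each run file
is a JSON with a scalar 'log-likelihood' entry:

best = get_best_run("experiments/trained_models/MSK-ALL/denovo/mix_010clusters_006signatures")
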
Example #13
def create_base_model():
    try:
        os.makedirs(os.path.join(ROOT_DIR, 'data/simulated-data'))
    except OSError:
        pass
    base_model = load_json(
        os.path.join(
            ROOT_DIR,
            'experiments/trained_models/MSK-ALL/denovo/mix_010clusters_006signatures/314179seed.json'
        ))
    save_json(os.path.join(ROOT_DIR, 'data/simulated-data/base_model'),
              base_model)
Example #14
def process_config(config_path,
                   custom_split=None,
                   custom_gpu_device=None,
                   override_dotmap=None):
    config_json = load_json(config_path)
    if custom_split is not None:
        config_json['data_params']['split'] = str(custom_split)
    if custom_gpu_device is not None:
        config_json['gpu_device'] = int(custom_gpu_device)
    # so we don't need to manually change folders
    config_json['exp_name'] = '{}_{}'.format(
        config_json['exp_name'], config_json['data_params']['split'])
    return _process_config(config_json, override_dotmap=override_dotmap)
Example #15
def plot_sig_correlations(dataset, num_sigs, beta_loss=2, plot_title=True, save_plot=True):
    cosmic_signatures = get_cosmic_signatures()
    mix_dir = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/denovo'.format(dataset))
    scores_dict = get_best_model(mix_dir, return_params=True)
    BIC_scores = scores_dict['BIC_scores']
    num_signatures = scores_dict['num_signatures']
    model_paths = scores_dict['model_paths']
    signatures_dict = {}

    # MIX signatures
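    # boolean mask selecting the trained models with the requested signature count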
    indices = num_signatures == num_sigs
    best_model_path = model_paths[indices][np.argmin(BIC_scores[indices])]
    e = np.array(load_json(best_model_path)['parameters']['e'])
    signatures_dict['MIX'] = e.copy()

    # NMF signatures
    data, _ = get_data(dataset)
    e = learn_NMF(data, num_sigs, beta_loss=beta_loss)
    signatures_dict['NMF'] = e.copy()

    # clustered-NMF signatures (if needed)
    if dataset == 'MSK-ALL':
        data, _ = get_data('clustered-MSK-ALL')
        e = learn_NMF(data, num_sigs, beta_loss=beta_loss)
        signatures_dict['clustered-NMF'] = e.copy()

    plt.rcParams.update({'font.size': 12})
    x_axis = np.array([str(i + 1) for i in range(num_sigs)])
    plt.axhline(0.80, color='grey', linestyle='--', label='_nolegend_')
    legends = []
    for i, model in enumerate(signatures_dict.keys()):
        legends.append(model)
        e = signatures_dict[model]
        sigs, corrs = get_signatures_correlations(e, cosmic_signatures)
        sigs = sigs[np.argsort(-corrs)]
        corrs = corrs[np.argsort(-corrs)]
        curr_x_axis = x_axis
        color = 'C{}'.format(i)
        plt.plot(curr_x_axis, corrs, '.-', color=color)
        for j in range(len(sigs)):
            plt.annotate(str(sigs[j] + 1), (j, corrs[j] + 0.002), color=color)
        print('{} - {} - {} - {}'.format(model, sigs.tolist(), corrs.tolist(), sum(corrs)))

    plt.yticks(np.arange(2, 6) * 0.2)
    plt.ylabel('Cosine similarity', fontsize='large')
    plt.xlabel('Rank of signature', fontsize='large')
    plt.legend(legends, loc='lower left')
    if plot_title:
        plt.title('{} signatures'.format(num_sigs))
    if save_plot:
        plt.savefig(os.path.join(ROOT_DIR, 'results', 'signatures_similarity', '{}-signatures.pdf'.format(num_sigs)))
Example #16
def prepare_prediction_dir(trained_models_dir, prediction_dir):
    datasets = os.listdir(trained_models_dir)
    prediction_dir = os.path.join(prediction_dir, 'prediction')
    for dataset in datasets:
        print(dataset)
        dataset_dir = os.path.join(trained_models_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(
                    os.path.join(dataset_dir, signature_learning)):
                dataset_path = os.path.join(prediction_dir, dataset,
                                            signature_learning, model)
                try:
                    os.makedirs(dataset_path)
                except OSError:
                    pass
                data, _ = get_data_by_model_name(dataset, model)
                json_data = {}
                for sample, sample_data in data.items():
                    json_data[sample] = {}
                    for chrom, chrom_data in sample_data.items():
                        json_data[sample][chrom] = {}
                        json_data[sample][chrom]['Sequence'] = chrom_data[
                            'Sequence'].tolist()
                        json_data[sample][chrom]['StrandInfo'] = chrom_data[
                            'StrandInfo'].tolist()

                save_json(os.path.join(dataset_path, 'data'), json_data)
                del json_data
                for num_sigs in os.listdir(
                        os.path.join(dataset_dir, signature_learning, model)):
                    num_sig_dir = os.path.join(dataset_path, num_sigs)
                    try:
                        os.makedirs(num_sig_dir)
                    except OSError:
                        pass
                    experiment_dir = os.path.join(dataset_dir,
                                                  signature_learning, model,
                                                  num_sigs)
                    runs = os.listdir(experiment_dir)
                    for run in runs:
                        model_parameters = load_json(
                            os.path.join(experiment_dir, run))['parameters']
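                        # `not x >= 0` is also True when x is NaN, not just when x < 0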
                        if not model_parameters['e'][0][0] >= 0:
                            print('There was a bug in run {}'.format(
                                os.path.join(experiment_dir, run)))
                        prediction = predict_hidden_variables(
                            data, model_parameters)
                        save_json(os.path.join(num_sig_dir, run),
                                  prepare_data_to_json(prediction))

        print('\n')
def process_config(config_path, override_dotmap=None, exp_name_suffix=None):
    """
    Processes config file:
        1) Converts it to a DotMap
        2) Creates experiments path and required subdirs
        3) Sets up logging
    """
    config_json = load_json(config_path)
    config = DotMap(config_json)
    if override_dotmap is not None:
        config.update(override_dotmap)

    if exp_name_suffix is not None:
        config.exp_name = f'{config.exp_name}_{exp_name_suffix}'

    print("Loaded configuration: ")
    pprint(config)

    print()
    print(" *************************************** ")
    print("      Running experiment {}".format(config.exp_name))
    print(" *************************************** ")
    print()

    exp_base = config.exp_base
    exp_dir = os.path.join(exp_base, "experiments", config.exp_name)

    # create some important directories to be used for the experiment.
    config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    config.log_dir = os.path.join(exp_dir, "logs/")
    config.summary_dir = os.path.join(exp_dir, "summaries/")
    config.exp_dir = exp_dir

    # will not create if already existing
    makedirs([
        config.checkpoint_dir, 
        config.log_dir,
        config.summary_dir,
    ])

    # save config to experiment dir
    config_out = os.path.join(exp_dir, 'config.json')
    save_json(config.toDict(), config_out)

    # setup logging in the project
    setup_logging(config.log_dir)

    logging.getLogger().info(
        "Configurations and directories successfully set up.")

    return config
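
A minimal usage sketch (the config path and suffix are hypothetical):

config = process_config("configurations/config.json", exp_name_suffix="seed0")
print(config.checkpoint_dir)  # <exp_base>/experiments/<exp_name>_seed0/checkpoints/
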
Example #18
def get_best_model(dataset_dir, return_model=False, return_params=False):
    if len(os.listdir(dataset_dir)) == 0:
        return None
    dataset = os.path.split(os.path.split(dataset_dir)[0])[-1]
    data, _ = get_data(dataset)
    num_data_points = np.sum(data)

    models = []
    BIC_scores = []
    sigs = []
    clusters = []
    for model in os.listdir(dataset_dir):
        experiment_dir = os.path.join(dataset_dir, model)
        best_run = get_best_run(experiment_dir)
        if len(best_run) > 0:
            best_score = load_json(best_run)['log-likelihood']
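            # model directory names look like 'mix_010clusters_006signatures'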
            num_sigs = int(model.split('_')[2][:3])
            num_clusters = int(model.split('_')[1][:3])
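            # free parameters: cluster mixture weights, per-cluster signature
            # exposures, and the signatures themselves (96 mutation categories each)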
            num_params = (num_clusters - 1) + (num_sigs - 1) * num_clusters + (96 - 1) * num_sigs
            models.append(best_run)
            clusters.append(num_clusters)
            sigs.append(num_sigs)
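            # BIC = ln(num_data_points) * num_params - 2 * log-likelihood (lower is better)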
            BIC_scores.append(np.log(num_data_points) * num_params - 2 * best_score)

    models = np.array(models)
    BIC_scores = np.array(BIC_scores)
    sigs = np.array(sigs, dtype='int')
    clusters = np.array(clusters, dtype='int')
    best_model = models[np.argmin(BIC_scores)]

    if return_model:
        return get_model(load_json(best_model)['parameters'])

    if return_params:
        return {'BIC_scores': BIC_scores, 'num_clusters': clusters, 'model_paths': models, 'num_signatures': sigs}

    return best_model
    def _get_file_name_root(self, coordinate, date):
        """Loads metadata to extact naming root of band files
        (e.g. 'MCD43A4.A2018001.h18v04.006.2018010031310' in docstring example)

        Args:
            coordinate (tuple[int]): modis coordinate as (horizontal tile, vertical tile)
            date (str): date formatted as yyyy-mm-dd

        Returns:
            type: str
        """
        infos_path = self.get_path_to_infos(coordinate, date)
        meta_data = load_json(infos_path)
        producer_granule_id = meta_data['producer_granule_id']
        file_name_root = '.'.join(producer_granule_id.split('.')[:-1])
        return file_name_root
Example #20
    def load_pretrained_model(self):
        base_dir = self.config.pretrain_model.exp_dir
        checkpoint_name = self.config.pretrain_model.checkpoint_name

        config_path = os.path.join(base_dir, 'config.json')
        config_json = load_json(config_path)
        config = DotMap(config_json)

        SystemClass = globals()[config.system]
        system = SystemClass(config)
        checkpoint_file = os.path.join(base_dir, 'checkpoints', checkpoint_name)
        checkpoint = torch.load(checkpoint_file, map_location=self.device)
        system.load_state_dict(checkpoint['state_dict'])

        encoder = system.model.eval()
        for param in encoder.parameters():
            param.requires_grad = False

        return encoder, config
    def add_to_drop_data(self, event):
        col_index = self.table_view.currentIndex().column()
        row_index = self.table_view.currentIndex().row()
        column = self.table_data_sorted.columns[col_index]
        content = self.table_data_sorted.iloc[row_index, col_index]

        try:
            drop_data = load_json(self.config["paths"]["drop_data"])
            values = drop_data.get(column, None)
            if values:
                drop_data[column] = values + [content]
            else:
                drop_data[column] = [content]
            save_json(self.config["paths"]["drop_data"], drop_data)
            self.drop_data_added_signal.emit()

        except Exception as e:
            print(e)
            show_warning("Drop data addition failure", "Something went wrong")
Example #22
def read_params_from_file(arg_dict, overwrite=False):
    """
    Read params defined in config_file (paths.py by default).
    """
    if '/' not in arg_dict['config_file']:
        config_path = Path(sys.path[0]) / arg_dict['config_file']
    else:
        config_path = Path(arg_dict['config_file'])

    data = load_json(config_path)
    arg_dict.pop('config_file')

    if not overwrite:
        for key, value in data.items():
            if isinstance(value, list) and (key in arg_dict):
                for v in value:
                    arg_dict[key].append(v)
            elif (key not in arg_dict) or (arg_dict[key] is None):
                arg_dict[key] = value
    else:
        for key, value in data.items():
            arg_dict[key] = value
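
A minimal usage sketch (the keys and file name are hypothetical); the dict is
updated in place, so nothing is returned:

args = {"config_file": "paths.json", "data_dir": None}
read_params_from_file(args)  # fills 'data_dir' if the JSON defines it
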
Example #23
def train_model(dataset, num_clusters, use_cosmic, num_signatures, random_seed, max_iterations, epsilon, out_dir):
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    dataset_name = dataset
    data, active_signatures = get_data(dataset)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    model_name = 'mix_' + str(num_clusters).zfill(3) + 'clusters' + '_' + str(num_signatures).zfill(3) + 'signatures'
    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name)

    try:
        os.makedirs(out_dir)
    except OSError:
        pass

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed) + 'seed')
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} already exists'.format(
            dataset_name, model_name, use_cosmic, num_signatures, random_seed))
        save_json(out_file, load_json(out_file + '.json'))
        return

    model, ll = train_mix(data, num_clusters, num_signatures, signatures, random_seed, epsilon, max_iterations)
    parameters = model.get_params()

    parameters['w'] = parameters['w'].tolist()
    parameters['pi'] = parameters['pi'].tolist()
    parameters['e'] = parameters['e'].tolist()

    out = {'log-likelihood': ll, 'parameters': parameters}
    save_json(out_file, out)
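
A minimal usage sketch matching the model name format above (the
hyperparameter values are hypothetical):

train_model('MSK-ALL', num_clusters=10, use_cosmic=False, num_signatures=6,
            random_seed=314179, max_iterations=1000, epsilon=1e-6,
            out_dir='experiments/trained_models')
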
    def setup_index(self, patch_idx, patch_bounds):
        """Initializes generation index as described above (this is the
        to be `index.json`)

        If already exists, loads it instead

        Returns:
            type: dict
        """
        index_path = self._get_index_path(patch_idx)

        if os.path.exists(index_path):
            index = load_json(path=index_path)
        else:
            index = {
                'features': {
                    'patch_idx': patch_idx,
                    'patch_bounds': patch_bounds,
                    'horizon': 0
                },
                'files': dict()
            }
        return index
Example #25
def process_trained_models(trained_models_dir='experiments/trained_models'):
    datasets = os.listdir(trained_models_dir)
    for dataset in datasets:
        print(dataset + ':')
        dataset_dir = os.path.join(trained_models_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(
                    os.path.join(dataset_dir, signature_learning)):
                for num_sigs in os.listdir(
                        os.path.join(dataset_dir, signature_learning, model)):
                    experiment_dir = os.path.join(dataset_dir,
                                                  signature_learning, model,
                                                  num_sigs)
                    runs = os.listdir(experiment_dir)
                    print(experiment_dir)
                    for run in runs:
                        scores = load_json(os.path.join(experiment_dir,
                                                        run))['log-likelihood']
                        total_score = sum(scores.values())
                        print(run, total_score)
        print('\n')
Example #26
    def __init__(self, weights_file, classes_file):
        self.classes = load_json(classes_file)
        # allow_pickle is required on NumPy >= 1.16.3 to load pickled object arrays
        self.weights = np.load(weights_file, encoding='latin1', allow_pickle=True).item()
Example #27
def eval_language_metrics(checkpoint,
                          eval_data_loader,
                          opt,
                          model=None,
                          eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting to `test` is cheating
    0, run inference
    1, Get METEOR, BLEU1-4, CIDEr scores
    2, Get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_json(json_res, res_filepath, save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_val_1_para.json",
                    "anet_entities_val_2_para.json"
                ]
            ],
            "test": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_test_1_para.json",
                    "anet_entities_test_2_para.json"
                ]
            ]
        }
    else:  # yc2
        reference_files_map = {
            "val":
            [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]
        }

    # COCO language evaluation
    eval_references = reference_files_map[eval_mode]
    lang_filepath = res_filepath.replace(".json", "_lang.json")
    eval_cmd = [
        "python", "para-evaluate.py", "-s", res_filepath, "-o", lang_filepath,
        "-v", "-r"
    ] + eval_references
    subprocess.call(eval_cmd, cwd=opt.eval_tool_dir)

    # basic stats
    stat_filepath = res_filepath.replace(".json", "_stat.json")
    eval_stat_cmd = [
        "python", "get_caption_stat.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", stat_filepath, "-v"
    ]
    subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

    # repetition evaluation
    rep_filepath = res_filepath.replace(".json", "_rep.json")
    eval_rep_cmd = [
        "python", "evaluateRepetition.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", rep_filepath
    ]
    subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

    # save results
    logger.info("Finished eval {}.".format(eval_mode))
    metric_filepaths = [lang_filepath, stat_filepath, rep_filepath]
    all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])

    all_metrics_filepath = res_filepath.replace(".json", "_all_metrics.json")
    save_json(all_metrics, all_metrics_filepath, save_pretty=True)
    return all_metrics, [res_filepath, all_metrics_filepath]
Example #28
    info['columns'] = list(df)[1:]

    write_json(info, Path(save_dir) / f'{csv_name}_info.json')
    print(info)


def read_mm(datadir, name):
    """
    name can be one of {ts, diagnoses, labels, flat}.
    """
    info = load_json(Path(datadir) / (name + '_info.json'))
    dat_path = Path(datadir) / (name + '.dat')
    data = np.memmap(dat_path, dtype=np.float32, shape=tuple(info['shape']))
    return data, info


if __name__ == '__main__':
    paths = load_json('paths.json')
    data_dir = paths['eICU_path']
    save_dir = paths['data_dir']
    print(f'Load eICU processed data from {data_dir}')
    print(f'Saving mmap data in {save_dir}')
    print('--' * 30)
    Path(save_dir).mkdir(exist_ok=True)
    print('** Converting time series **')
    convert_timeseries_into_mmap(data_dir, save_dir)
    for csv_name in ['flat', 'diagnoses', 'labels']:
        print(f'** Converting {csv_name} **')
        convert_into_mmap(data_dir, save_dir, csv_name)
    print('--' * 30)
    print(f'Done! Saved data in {save_dir}')
Example #29
def compare_panel_clusters():
    np.random.seed(1359)
    # full_dataset, panel_dataset = 'BRCA-panel-full', 'BRCA-panel'
    full_dataset, panel_dataset = 'nature2019-full', 'nature2019-panel'
    full_data, active_signatures = get_data(full_dataset)
    panel_data, _ = get_data(panel_dataset)
    signatures = get_cosmic_signatures()[active_signatures]
    num_samples = len(full_data)

    full_data_exposures = stack_nnls(full_data, signatures)

    full_data_exposures_dists = cosine_similarity(full_data_exposures)
    corrs = []
    models = []
    relations = []

    for model in ['MIX', 'SigMA']:
        if model == 'MIX':
            d = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/refit'.format('BRCA-panel'))

            mix = get_model(load_json(get_best_model(d))['parameters'])
            clusters = np.argmax(mix.soft_cluster(full_data), 1)

        elif model == 'SigMA':
            # d = os.path.join(ROOT_DIR, 'data/ICGC-BRCA/out-sigma-brca-panel.tsv')
            d = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(d, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(d, sep=',')
            clusters = all_df['categ'].values
            unique_clusters = np.unique(clusters)
            cluster_to_num = {}
            for i, c in enumerate(unique_clusters):
                cluster_to_num[c] = i

            clusters = np.array([cluster_to_num[c] for c in clusters])

        else:
            raise ValueError('Unknown model: {}'.format(model))

        dists_in_clusters = []
        dists_out_clusters = []
        for i in range(num_samples):
            for j in range(i + 1, num_samples):
                if clusters[i] == clusters[j]:
                    dists_in_clusters.append(full_data_exposures_dists[i, j])
                else:
                    dists_out_clusters.append(full_data_exposures_dists[i, j])

        dists_in_clusters = np.array(dists_in_clusters)
        dists_out_clusters = np.array(dists_out_clusters)

        dists_in_clusters = np.random.choice(dists_in_clusters, 200, replace=False)
        dists_out_clusters = np.random.choice(dists_out_clusters, 200, replace=False)
        corrs.extend(dists_in_clusters)
        corrs.extend(dists_out_clusters)
        models.extend([model] * len(dists_out_clusters) * 2)
        relations.extend(['Intra-cluster pairs'] * len(dists_out_clusters))
        relations.extend(['Inter-cluster pairs'] * len(dists_out_clusters))

        print(model, len(np.unique(clusters)))
        print(ranksums(dists_in_clusters, dists_out_clusters), np.mean(dists_in_clusters), np.mean(dists_out_clusters))

    df = {'Cosine similarity': corrs, 'model': models, 'relation': relations}
    df = pd.DataFrame(df)
    sns.violinplot(x='relation', y='Cosine similarity', hue='model', data=df, split=True, inner='stick')

    plt.xlabel('')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'clusters_quality', 'clusters_quality.pdf'))
def main():
    parser = argparse.ArgumentParser(description="translate.py")

    parser.add_argument("--eval_splits", type=str, nargs="+", default=["val", ],
                        choices=["val", "test"], help="evaluate on val/test set, yc2 only has val")
    parser.add_argument("--res_dir", required=True, help="path to dir containing model .pt file")
    parser.add_argument("--batch_size", type=int, default=100, help="batch size")

    # beam search configs
    parser.add_argument("--use_beam", action="store_true", help="use beam search, otherwise greedy search")
    parser.add_argument("--beam_size", type=int, default=2, help="beam size")
    parser.add_argument("--n_best", type=int, default=1, help="stop searching when get n_best from beam search")
    parser.add_argument("--min_sen_len", type=int, default=5, help="minimum length of the decoded sentences")
    parser.add_argument("--max_sen_len", type=int, default=30, help="maximum length of the decoded sentences")
    parser.add_argument("--block_ngram_repeat", type=int, default=0, help="block repetition of ngrams during decoding.")
    parser.add_argument("--length_penalty_name", default="none",
                        choices=["none", "wu", "avg"], help="length penalty to use.")
    parser.add_argument("--length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter (higher = longer generation)")
    parser.add_argument("--eval_tool_dir", type=str, default="./densevid_eval")

    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--seed", default=2019, type=int)
    parser.add_argument("--debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    # add some of the train configs
    train_opt = checkpoint["opt"]  # EDict(load_json(os.path.join(opt.res_dir, "model.cfg.json")))
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    print("train_opt", train_opt)

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name, opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir, "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]}
    else:  # yc2
        reference_files_map = {"val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}
    for eval_mode in opt.eval_splits:
        print("Start evaluating {}".format(eval_mode))
        # add 10 to max_n_sen to make the inference stage use all the segments
        eval_data_loader = get_data_loader(opt, eval_mode=eval_mode)
        eval_references = reference_files_map[eval_mode]

        # setup model
        translator = Translator(opt, checkpoint)

        pred_file = os.path.join(opt.res_dir, "{}_pred_{}.json".format(decoding_strategy, eval_mode))
        pred_file = os.path.abspath(pred_file)
        if not os.path.exists(pred_file):
            json_res = run_translate(eval_data_loader, translator, opt=opt)
            save_json(json_res, pred_file, save_pretty=True)
        else:
            print("Using existing prediction file at {}".format(pred_file))

        # COCO language evaluation
        lang_file = pred_file.replace(".json", "_lang.json")
        eval_command = ["python", "para-evaluate.py", "-s", pred_file, "-o", lang_file,
                        "-v", "-r"] + eval_references
        subprocess.call(eval_command, cwd=opt.eval_tool_dir)

        # basic stats
        stat_filepath = pred_file.replace(".json", "_stat.json")
        eval_stat_cmd = ["python", "get_caption_stat.py", "-s", pred_file, "-r", eval_references[0],
                         "-o", stat_filepath, "-v"]
        subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

        # repetition evaluation
        rep_filepath = pred_file.replace(".json", "_rep.json")
        eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", pred_file,
                        "-r", eval_references[0], "-o", rep_filepath]
        subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

        metric_filepaths = [lang_file, stat_filepath, rep_filepath]
        all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
        all_metrics_filepath = pred_file.replace(".json", "_all_metrics.json")
        save_json(all_metrics, all_metrics_filepath, save_pretty=True)

        print("pred_file {} lang_file {}".format(pred_file, lang_file))
        print("[Info] Finished {}.".format(eval_mode))