Code Example #1
File: main.py Project: itaysason/StickySig
def sample_cv(dataset, model_name, num_folds, fold, use_cosmic, num_signatures,
              shuffle_seed, random_seed, max_iterations, epsilon, out_dir):

    if fold >= num_folds:
        raise ValueError('num_folds is {} but fold is {}'.format(
            num_folds, fold))

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print(
            'use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'
            .format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), str(shuffle_seed),
                           str(num_folds), str(fold))

    try:
        os.makedirs(out_dir)
    except OSError:
        pass

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = out_dir + "/" + str(random_seed)
    if os.path.isfile(out_file + '.json'):
        print(
            'Experiment with parameters {} {} {} {} {} {} {} {} already exist'.
            format(dataset_name, model_name, num_folds, fold, use_cosmic,
                   num_signatures, shuffle_seed, random_seed))
        return

    train_data, test_data = split_train_test_sample_cv(dataset, num_folds,
                                                       fold, shuffle_seed)

    model, train_ll, test_ll = train_test_stickysig(train_data, test_data,
                                                    num_signatures, signatures,
                                                    random_seed, epsilon,
                                                    max_iterations)
    parameters = model.get_params()

    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {
        'log-likelihood-train': train_ll,
        'log-likelihood-test': test_ll,
        'parameters': parameters
    }
    save_json(out_file, out)
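
All of the snippets on this page delegate the actual write to a project-specific `save_json` helper, and its signature is not consistent across projects: here it is called as `save_json(path, obj)`, while other examples below use `save_json(obj, path, save_pretty=True)` or keyword arguments. A minimal sketch of such a helper, assuming it simply wraps `json.dump` (the real helpers may also append a `.json` extension, as the existence check above suggests), could look like this:

import json


def save_json(file_path, data, save_pretty=False):
    # Hypothetical helper: serialize `data` to `file_path` as JSON.
    # Argument order and extension handling vary between the projects shown here.
    with open(file_path, 'w', encoding='utf-8') as f:
        if save_pretty:
            json.dump(data, f, indent=4, ensure_ascii=False)
        else:
            json.dump(data, f)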
Code Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_path", type=str, required=True)
    parser.add_argument("--dset_name",
                        type=str,
                        default="anet",
                        choices=["anet", "yc2"])
    parser.add_argument("--cache", type=str, default="./cache")
    parser.add_argument("--min_word_count", type=int, default=5)
    parser.add_argument("--raw_glove_path",
                        type=str,
                        help="downloaded glove vectors path")

    opt = parser.parse_args()
    if not os.path.exists(opt.cache):
        os.makedirs(opt.cache)

    # load, merge, clean, split data
    train_data = load_json(opt.train_path)
    all_sentences = flat_list_of_lists(
        [v["sentences"] for k, v in train_data.items()])
    all_sentences = [
        nltk.tokenize.word_tokenize(sen.lower()) for sen in all_sentences
    ]
    word2idx = build_vocab_idx(all_sentences, opt.min_word_count)
    print("[Info] Dumping the processed data to json file", opt.cache)
    word2idx_path = os.path.join(opt.cache,
                                 "{}_word2idx.json".format(opt.dset_name))
    save_json(word2idx, word2idx_path, save_pretty=True)
    print("[Info] Finish.")

    vocab_glove_path = os.path.join(opt.cache,
                                    "{}_vocab_glove.pt".format(opt.dset_name))
    extract_glove(word2idx, opt.raw_glove_path, vocab_glove_path)
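
`build_vocab_idx` is not shown in this snippet; a plausible sketch, assuming it keeps tokens that occur at least `min_word_count` times and assigns consecutive indices (the special tokens below are an assumption, not taken from the original code), is:

from collections import Counter


def build_vocab_idx(tokenized_sentences, min_word_count):
    # Count token frequencies over all tokenized sentences.
    counter = Counter(w for sen in tokenized_sentences for w in sen)
    word2idx = {"<pad>": 0, "<unk>": 1}  # assumed special tokens
    for word, count in counter.most_common():
        if count < min_word_count:
            break  # most_common() is sorted by count, so we can stop early
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    return word2idx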
Code Example #3
    def get_joints_labels_and_images(self) -> Tuple[dict, dict]:
        """Returns the dictionary conatinign the bound box of the image and dictionary
        containig image information.

        Returns:
            Tuple[dict, dict]: joints, image_dict
                image_dict
                    - `name` - Image name in the form
                        of `youtube/VIDEO_ID/video/frames/FRAME_ID.png`.
                    - `width` - Width of the image.
                    - `height` - Height of the image.
                    - `id` - Image ID.
                joints
                    - `joints` - 21 joints, containing bounding box limits as vertices.
                    - `is_left` - Binary value indicating whether it is a left or right hand.
                    - `image_id` - ID to the corresponding entry in `images`.
                    - `id` - Annotation ID (an image can contain multiple hands).
        """
        data_json_path = os.path.join(self.root_dir,
                                      f"youtube_{self.split}.json")
        joints_path = os.path.join(self.root_dir,
                                   f"youtube_{self.split}_joints.json")
        images_json_path = os.path.join(self.root_dir,
                                        f"youtube_{self.split}_images.json")
        if os.path.exists(joints_path) and os.path.exists(images_json_path):
            return read_json(joints_path), read_json(images_json_path)
        else:
            data_json = read_json(data_json_path)
            images_dict = data_json["images"]
            save_json(images_dict, images_json_path)
            annotations_dict = data_json["annotations"]
            joints = self.get_joints_from_annotations(annotations_dict)
            save_json(joints, joints_path)
            return joints, images_dict
Code Example #4
    def dump_index(self, index, patch_idx):
        """Simply saves index as json file under export directory

        Args:
            index (dict): dictionnary to dump as json
        """
        index_path = self._get_index_path(patch_idx)
        save_json(path=index_path, jsonFile=index)
Code Example #5
def main(args):
    root = args['--root']
    experiment = build_experiment(load_yaml(args['--cfg']))
    bar = Bar("Patch directory", max=len(experiment.test_set))
    iqa_metrics = defaultdict(list)

    for patch_idx in patches_subset_from(experiment.test_set):
        patch_directory = os.path.join(root, patch_idx)
        if not os.path.isdir(patch_directory):
            # Some patches aren't predicted by ESTARFM as it requires a sample before and one after
            continue

        for date in os.listdir(patch_directory):
            # Load predicted bands
            date_directory = os.path.join(patch_directory, date)
            files_paths = [os.path.join(date_directory, band) for band in os.listdir(date_directory)]
            predicted_bands = load_in_multiband_raster(files_paths)

            # Load groundtruth bands
            target_directory = os.path.join(args['--target'], patch_idx, 'landsat', date)
            target_files_paths = [os.path.join(target_directory, band) for band in os.listdir(target_directory)]
            target_bands = load_in_multiband_raster(target_files_paths)

            # Compute PSNR and SSIM by band
            patch_bands_iqa = defaultdict(list)
            for src, tgt in zip(predicted_bands, target_bands):
                data_range = np.max([src, tgt])
                src = src.clip(min=np.finfo(np.float16).eps) / data_range
                tgt = tgt.clip(min=np.finfo(np.float16).eps) / data_range
                patch_bands_iqa['psnr'] += [metrics.psnr(tgt, src)]
                patch_bands_iqa['ssim'] += [metrics.ssim(tgt, src)]

            # Record bandwise value
            iqa_metrics['psnr'] += [patch_bands_iqa['psnr']]
            iqa_metrics['ssim'] += [patch_bands_iqa['ssim']]

            # Compute bandwise spectral angle mapper
            predicted_patch = np.dstack(predicted_bands).astype(np.float32)
            target_patch = np.dstack(target_bands).astype(np.float32)
            sam = metrics.sam(target_patch, predicted_patch).mean(axis=(0, 1))
            iqa_metrics['sam'] += [sam]

        # Log running averages
        avg_psnr, avg_ssim, avg_sam = np.mean(iqa_metrics['psnr']), np.mean(iqa_metrics['ssim']), np.mean(iqa_metrics['sam'])
        bar.suffix = "PSNR = {:.2f} | SSIM = {:.4f} | SAM = {:.6f}".format(avg_psnr, avg_ssim, avg_sam)
        bar.next()

    # Make bandwise average output dictionary
    bandwise_avg_psnr = np.asarray(iqa_metrics['psnr']).mean(axis=0).astype(np.float64)
    bandwise_avg_ssim = np.asarray(iqa_metrics['ssim']).mean(axis=0).astype(np.float64)
    bandwise_avg_sam = np.asarray(iqa_metrics['sam']).mean(axis=0).astype(np.float64)

    avg_iqa_metrics = {'test_psnr': bandwise_avg_psnr.tolist(),
                       'test_ssim': bandwise_avg_ssim.tolist(),
                       'test_sam': bandwise_avg_sam.tolist()}
    os.makedirs(args['--o'], exist_ok=True)
    dump_path = os.path.join(args['--o'], "test_scores_starfm.json")
    save_json(dump_path, avg_iqa_metrics)
Code Example #6
File: main.py Project: itaysason/StickySig
def leave_one_chromosome_out(dataset, model_name, chromosome, use_cosmic,
                             num_signatures, random_seed, max_iterations,
                             epsilon, out_dir):
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'

    all_chromosomes = [str(i) for i in range(1, 23)]
    all_chromosomes.extend(['X', 'Y'])
    chromosome_name = all_chromosomes[chromosome]

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print(
            'use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'
            .format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), chromosome_name)

    try:
        os.makedirs(out_dir)
    except OSError:
        pass

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = out_dir + "/" + str(random_seed)
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} {} already exist'.
              format(dataset_name, model_name, chromosome, use_cosmic,
                     num_signatures, random_seed))
        return

    train_data, test_data = split_train_test_loco(dataset, chromosome)

    model, train_ll, test_ll = train_test_stickysig(train_data, test_data,
                                                    num_signatures, signatures,
                                                    random_seed, epsilon,
                                                    max_iterations)
    parameters = model.get_params()

    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {
        'log-likelihood-train': train_ll,
        'log-likelihood-test': test_ll,
        'parameters': parameters
    }
    save_json(out_file, out)
Code Example #7
 def _write_filtering_values_to_file(self, file_path, name):
     self._update_filtering_values()
     filter_values = {k: v for k, v in self.filter_values.items() if v != "" and pd.notnull(v)}
     filter_values.pop("min_date", None)
     filter_values.pop("max_date", None)
     values = load_json(file_path)
     values[name] = filter_values
     save_json(file_path, values)
     return values
Code Example #8
File: loggers.py Project: shahineb/ci-hackathon
 def log_metrics(self, metrics, step=None):
     # If on testing mode, log output score as json file
     if self.test:
         epoch = metrics['epoch']
         dump_path = os.path.join(self.log_dir,
                                  f"test_scores_epoch={epoch}.json")
         save_json(dump_path, metrics)
     # Else, usual tensorboard logging mode
     else:
         super().log_metrics(metrics, step)
Code Example #9
File: data_simulation.py Project: itaysason/Mix-MMM
def create_base_model():
    try:
        os.makedirs(os.path.join(ROOT_DIR, 'data/simulated-data'))
    except OSError:
        pass
    base_model = load_json(
        os.path.join(
            ROOT_DIR,
            'experiments/trained_models/MSK-ALL/denovo/mix_010clusters_006signatures/314179seed.json'
        ))
    save_json(os.path.join(ROOT_DIR, 'data/simulated-data/base_model'),
              base_model)
Code Example #10
def save_stats(df: pd.DataFrame, config: dict):
    stats: Dict[str, Any] = {}

    data_dir = Path(config["dir"])
    data_path = data_dir / config["name"]
    stats_path = data_dir / config["stats"]

    stats["line_count"] = len(df)
    stats["size"] = total_size(data_path)
    stats["dtypes"] = {k: str(v) for k, v in df.dtypes.to_dict().items()}
    stats["nuniques"] = {c: df[c].nunique() for c in df.columns}

    utils.save_json(stats, stats_path)
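
A minimal call of `save_stats`, assuming the data file named in the config already exists on disk (so that `total_size` can measure it) and that `utils.save_json(obj, path)` takes the object first, as in the snippet above; the paths are hypothetical:

import pandas as pd

df = pd.DataFrame({"user": ["a", "b", "b"], "value": [1, 2, 3]})
config = {
    "dir": "data/processed",        # directory that holds the dataset
    "name": "events.parquet",       # data file measured by total_size()
    "stats": "events_stats.json",   # output file for the stats JSON
}
save_stats(df, config)  # records line_count, size, dtypes and nuniques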
Code Example #11
def process_config(config_path, override_dotmap=None, exp_name_suffix=None):
    """
    Processes config file:
        1) Converts it to a DotMap
        2) Creates experiments path and required subdirs
        3) Sets up logging
    """
    config_json = load_json(config_path)
    config = DotMap(config_json)
    if override_dotmap is not None:
        config.update(override_dotmap)

    if exp_name_suffix is not None:
        config.exp_name = f'{config.exp_name}_{exp_name_suffix}'

    print("Loaded configuration: ")
    pprint(config)

    print()
    print(" *************************************** ")
    print("      Running experiment {}".format(config.exp_name))
    print(" *************************************** ")
    print()

    exp_base = config.exp_base
    exp_dir = os.path.join(exp_base, "experiments", config.exp_name)

    # create some important directories to be used for the experiment.
    config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    config.log_dir = os.path.join(exp_dir, "logs/")
    config.summary_dir = os.path.join(exp_dir, "summaries/")
    config.exp_dir = exp_dir

    # will not create if already existing
    makedirs([
        config.checkpoint_dir, 
        config.log_dir,
        config.summary_dir,
    ])

    # save config to experiment dir
    config_out = os.path.join(exp_dir, 'config.json')
    save_json(config.toDict(), config_out)

    # setup logging in the project
    setup_logging(config.log_dir)

    logging.getLogger().info(
        "Configurations and directories successfully set up.")

    return config
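
A typical call of `process_config`, assuming a JSON config that defines at least `exp_name` and `exp_base` (the path and suffix below are hypothetical):

config = process_config("configs/baseline.json", exp_name_suffix="seed42")
# After this call, config.checkpoint_dir, config.log_dir and config.summary_dir
# exist on disk and the resolved config has been written to <exp_dir>/config.json.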
Code Example #12
File: main.py Project: itaysason/StickySig
def prepare_prediction_dir(trained_models_dir, prediction_dir):
    datasets = os.listdir(trained_models_dir)
    prediction_dir = os.path.join(prediction_dir, 'prediction')
    for dataset in datasets:
        print(dataset)
        dataset_dir = os.path.join(trained_models_dir, dataset)
        for signature_learning in os.listdir(dataset_dir):
            for model in os.listdir(
                    os.path.join(dataset_dir, signature_learning)):
                dataset_path = os.path.join(prediction_dir, dataset,
                                            signature_learning, model)
                try:
                    os.makedirs(dataset_path)
                except OSError:
                    pass
                data, _ = get_data_by_model_name(dataset, model)
                json_data = {}
                for sample, sample_data in data.items():
                    json_data[sample] = {}
                    for chrom, chrom_data in sample_data.items():
                        json_data[sample][chrom] = {}
                        json_data[sample][chrom]['Sequence'] = chrom_data[
                            'Sequence'].tolist()
                        json_data[sample][chrom]['StrandInfo'] = chrom_data[
                            'StrandInfo'].tolist()

                save_json(os.path.join(dataset_path, 'data'), json_data)
                del json_data
                for num_sigs in os.listdir(
                        os.path.join(dataset_dir, signature_learning, model)):
                    num_sig_dir = os.path.join(dataset_path, num_sigs)
                    try:
                        os.makedirs(num_sig_dir)
                    except OSError:
                        pass
                    experiment_dir = os.path.join(dataset_dir,
                                                  signature_learning, model,
                                                  num_sigs)
                    runs = os.listdir(experiment_dir)
                    for run in runs:
                        model_parameters = load_json(
                            os.path.join(experiment_dir, run))['parameters']
                        if not model_parameters['e'][0][0] >= 0:
                            print('There was a bug in run {}'.format(
                                os.path.join(experiment_dir, run)))
                        prediction = predict_hidden_variables(
                            data, model_parameters)
                        save_json(os.path.join(num_sig_dir, run),
                                  prepare_data_to_json(prediction))

        print('\n')
Code Example #13
def build_word_dict(config, min_freq=5):
    cnt = 0
    word_cnt = collections.Counter()
    attr_cnt = collections.Counter()

    for line in read_json_lines(config.train_data):
        we = WikiEntity(line)

        box = we.get_box()
        for a in box.keys():
            for w in box[a].split():
                if config.to_lower:
                    w = w.lower()
                word_cnt[w] += 1
            if config.to_lower:
                a = a.lower()
            attr_cnt[a] += 1

        desc = we.get_desc()
        for w in desc.split():
            if config.to_lower:
                w = w.lower()
            word_cnt[w] += 1

        cnt += 1
        if cnt % 10000 == 0:
            print('\rprocessing: {}'.format(cnt), end='')
    print()

    word_cnt[config.pad] = attr_cnt[config.pad] = 1e9 - config.pad_id
    word_cnt[config.unk] = attr_cnt[config.unk] = 1e9 - config.unk_id
    word_cnt[config.sos] = attr_cnt[config.sos] = 1e9 - config.sos_id
    word_cnt[config.eos] = attr_cnt[config.eos] = 1e9 - config.eos_id
    word_cnt[config.num] = attr_cnt[config.num] = 1e9 - config.num_id
    word_cnt[config.time] = attr_cnt[config.time] = 1e9 - config.time_id
    print('number of words in word counter: {}'.format(len(word_cnt)))
    print('number of words in attribute counter: {}'.format(len(attr_cnt)))

    word_dict = {}
    for word, cnt in word_cnt.most_common():
        if cnt < min_freq:
            break
        word_dict[word] = len(word_dict)
    save_json(word_dict, config.word_dict)

    attr_dict = {}
    for attr, _ in attr_cnt.most_common():
        attr_dict[attr] = len(attr_dict)
    save_json(attr_dict, config.attr_dict)
Code Example #14
File: setup.py Project: mhw32/meta-inference-public
def _process_config(config_json, override_dotmap=None):
    """
    Processes config file:
        1) Converts it to a DotMap
        2) Creates experiments path and required subdirs
        3) Sets up logging
    """
    config = DotMap(config_json)
    if override_dotmap is not None:
        config.update(override_dotmap)

    print("Loaded configuration: ")
    pprint(config)

    print()
    print(" *************************************** ")
    print("      Running experiment {}".format(config.exp_name))
    print(" *************************************** ")
    print()

    exp_base = config.exp_base
    timestamp = strftime('%Y-%m-%d--%H_%M_%S', localtime())
    exp_dir = os.path.join(exp_base, "experiments", config.exp_name, timestamp)

    # create some important directories to be used for the experiment.
    config.summary_dir = os.path.join(exp_dir, "summaries/")
    config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    config.out_dir = os.path.join(exp_dir, "out/")
    config.log_dir = os.path.join(exp_dir, "logs/")

    makedirs([
        config.summary_dir, config.checkpoint_dir, config.out_dir,
        config.log_dir
    ])

    # save config to experiment dir
    config_out = os.path.join(exp_dir, 'config.json')
    save_json(config.toDict(), config_out)

    # setup logging in the project
    setup_logging(config.log_dir)

    logging.getLogger().info(
        "Configurations and directories successfully set up.")

    return config
Code Example #15
File: main.py Project: itaysason/StickySig
def train_model(dataset, model_name, use_cosmic, num_signatures, random_seed,
                max_iterations, epsilon, out_dir):
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print(
            'use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'
            .format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures))

    try:
        os.makedirs(out_dir)
    except OSError:
        pass

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = out_dir + "/" + str(random_seed)
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} already exist'.format(
            dataset_name, model_name, use_cosmic, num_signatures, random_seed))
        return

    model, ll = train_stickysig(dataset, num_signatures, signatures,
                                random_seed, epsilon, max_iterations)
    parameters = model.get_params()

    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {'log-likelihood': ll, 'parameters': parameters}
    save_json(out_file, out)
Code Example #16
    def add_to_drop_data(self, event):
        col_index = self.table_view.currentIndex().column()
        row_index = self.table_view.currentIndex().row()
        column = self.table_data_sorted.columns[col_index]
        content = self.table_data_sorted.iloc[row_index, col_index]

        try:
            drop_data = load_json(self.config["paths"]["drop_data"])
            values = drop_data.get(column, None)
            if values:
                drop_data[column] = values + [content]
            else:
                drop_data[column] = [content]
            save_json(self.config["paths"]["drop_data"], drop_data)
            self.drop_data_added_signal.emit()

        except Exception as e:
            print(e)
            show_warning("Drop data addition failure", "Something went wrong")
Code Example #17
File: initial_data.py Project: SunSunHack/ml-rec-dev
def save_initial_data_to_json():
    skills_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "skills.json")
    save_json(skills_json_path, SKILLS_DATA_STRUCTURES)
    print(f"Saved skills to file {skills_json_path}")

    events_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "events.json")
    save_json(events_json_path, EVENTS_DATA_STRUCTURES)
    print(f"Saved events to file {events_json_path}")

    jobs_json_path = os.path.join(SAVED_DATA_FOLDER_PATH, "jobs.json")
    save_json(jobs_json_path, JOB_DATA_STRUCTURES)
    print(f"Saved jobs to file {jobs_json_path}")

    departments_json_path = os.path.join(SAVED_DATA_FOLDER_PATH,
                                         "departments.json")
    save_json(departments_json_path, DEPARTMENTS_DATA_SCTRUCTURES)
    print(f"Saved departments to file {departments_json_path}")
Code Example #18
import os
from src.generate.students.random_students import generate_multiple_random_students
from src.utils import save_json
from data.departments import DEPARTMENTS_DATA_SCTRUCTURES
from data.events import EVENTS_DATA_STRUCTURES
from data.jobs import JOB_DATA_STRUCTURES


cwd = os.getcwd()

GENERATED_STUDENTS_COUNT = 30

STEM_DEPARTMENTS = DEPARTMENTS_DATA_SCTRUCTURES[:8]
STEM_EVENTS = EVENTS_DATA_STRUCTURES[:5]
STEM_JOBS = JOB_DATA_STRUCTURES[:12]

SAVED_JSON_FILE = os.path.join(cwd, "saved_data/students/stem_students.json")

if __name__ == "__main__":
    students = generate_multiple_random_students(STEM_DEPARTMENTS, STEM_EVENTS, STEM_JOBS, students_count=GENERATED_STUDENTS_COUNT)
    save_json(SAVED_JSON_FILE, students)
Code Example #19
    # ===============================
    # === Make submission
    # ===============================

    sample_submission = pd.read_csv(input_dir / "sample_submission.csv")
    submission_df = make_submission(test_preds, sample_submission)

    # ===============================
    # === Save
    # ===============================

    config["eval_results"] = dict()
    for k, v in evals_results.items():
        config["eval_results"][k] = v
    save_path = output_dir / "output.json"
    save_json(config, save_path)

    plot_feature_importance(feature_importance,
                            output_dir / "feature_importance.png")

    np.save(output_dir / "oof_preds.npy", oof_preds)

    np.save(output_dir / "test_preds.npy", test_preds)

    submission_df.to_csv(output_dir / "submission.csv", index=False)

    save_pickle(models, output_dir / "model.pkl")

    slack_notify(config_name + "終わったぞ\n" + str(config))  # "終わったぞ" ≈ "finished"
Code Example #20
def main():
    """
    Main eval loop: Iterates over all evaluation samples and saves the corresponding
    predictions as json and zip file. This is the format expected at
    https://competitions.codalab.org/competitions/21238#learn_the_details-overview
    """
    parser = argparse.ArgumentParser(
        description="Evaluation on Freihand eval set.")
    parser.add_argument("-key",
                        type=str,
                        help="Add comet key of experiment to restore.")
    parser.add_argument(
        "-resnet_size",
        type=str,
        help="Resnet sizes",
        choices=["18", "34", "50", "101", "152"],
        default="50",  # string default so it matches type=str and the choices list
    )
    parser.add_argument("--heatmap",
                        action="store_true",
                        help="Choose Resnet",
                        default=False)
    parser.add_argument(
        "--palm_trained",
        action="store_true",
        help="Use when palm is regressed during training.",
        default=False,
    )
    parser.add_argument(
        "-split",
        type=str,
        help="For debugging select val split",
        default="test",
        choices=["test", "val"],
    )
    parser.add_argument("-checkpoint",
                        type=str,
                        help="selectign checkpoint",
                        default="")
    args = parser.parse_args()
    model = load_model(args.key, args.resnet_size, args.heatmap,
                       args.checkpoint)
    if args.split == "val":
        print(
            "DEBUG MODE ACTIVATED.\n Evaluation pipeline is executed on validation set"
        )
    train_param = edict(read_json(TRAINING_CONFIG_PATH))
    train_param.augmentation_flags.resize = True
    train_param.augmentation_flags.crop = True
    # train_param.augmentation_params.crop_margin = 1.5
    train_param.augmentation_params.crop_box_jitter = [0.0, 0.0]
    augmenter = SampleAugmenter(train_param.augmentation_flags,
                                train_param.augmentation_params)
    # Normalization for BGR mode.
    # transform = transforms.Compose(
    #     [
    #         transforms.ToTensor(),
    #         transforms.Normalize(
    #             (0.485, 0.456, 0.406)[::-1], (0.229, 0.224, 0.225)[::-1]
    #         ),
    #     ]
    # )
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    data = F_DB(FREIHAND_DATA, split=args.split)
    xyz_pred = []
    debug_mean = []
    with torch.no_grad():
        for i in tqdm(range(len(data))):
            joints3d_normalized = normalize_joints(
                model_refined_inference(model, data[i], augmenter, transform,
                                        args.palm_trained))
            if args.split == "val":
                # DEBUG CODE:
                joints3d = joints3d_normalized * data.scale[data.indices[i] %
                                                            32560]
                debug_mean.append(
                    torch.mean(torch.abs(joints3d - data[i]["joints3D"])))
            else:
                joints3d = joints3d_normalized * data.scale[data.indices[i]]

            xyz_pred.append(JOINTS.ait_to_freihand(joints3d).tolist())

    if args.split == "val":
        # DEBUG CODE:
        print(
            f"MAE 3d\nMean : {np.mean(debug_mean)}\nMax: {np.max(debug_mean)}"
            f"\nMedian: {np.median(debug_mean)}")
        exit()

    verts = np.zeros((len(xyz_pred), 778, 3)).tolist()
    save_json([xyz_pred, verts], f"{args.key}_pred.json")
    subprocess.call(
        ["zip", "-j", f"{args.key}_pred.zip", f"{args.key}_pred.json"])
    subprocess.call(["rm", f"{args.key}_pred.json"])
Code Example #21
def evaluation():
    """evaluation"""
    print('********************** loading corpus ********************** ')
    s_lc = time.time()
    data_generator = DataGen(config)
    queries = read_query(config)
    print("loading corpus time (h):", (time.time() - s_lc) / 3600)
    print('********************** loading model ********************** ')
    s_lm = time.time()

    model_onehop_bert = ModelOneHop()
    param_dict = load_checkpoint(config.onehop_bert_path)
    load_param_into_net(model_onehop_bert, param_dict)
    model_twohop_bert = ModelTwoHop()
    param_dict2 = load_checkpoint(config.twohop_bert_path)
    load_param_into_net(model_twohop_bert, param_dict2)
    onehop = OneHopBert(config, model_onehop_bert)
    twohop = TwoHopBert(config, model_twohop_bert)

    print("loading model time (h):", (time.time() - s_lm) / 3600)
    print('********************** evaluation ********************** ')
    s_tr = time.time()

    f_dev = open(config.dev_path, 'rb')
    dev_data = json.load(f_dev)
    q_gold = {}
    q_2id = {}
    for onedata in dev_data:
        if onedata["question"] not in q_gold:
            q_gold[onedata["question"]] = [
                get_new_title(get_raw_title(item)) for item in onedata['path']
            ]
            q_2id[onedata["question"]] = onedata['_id']
    val, true_count, count, step = 0, 0, 0, 0
    batch_queries = split_queries(config, queries)[:-1]
    output_path = []
    for _, batch in enumerate(batch_queries):
        print("###step###: ", step)
        query = batch[0]
        temp_dict = {}
        temp_dict['q_id'] = q_2id[query]
        temp_dict['question'] = query
        gold_path = q_gold[query]
        input_ids_1, token_type_ids_1, input_mask_1 = data_generator.convert_onehop_to_features(
            batch)
        start = 0
        TOTAL = len(input_ids_1)
        split_chunk = 8
        while start < TOTAL:
            end = min(start + split_chunk - 1, TOTAL - 1)
            chunk_len = end - start + 1
            input_ids_1_ = input_ids_1[start:start + chunk_len]
            input_ids_1_ = Tensor(input_ids_1_, mstype.int32)
            token_type_ids_1_ = token_type_ids_1[start:start + chunk_len]
            token_type_ids_1_ = Tensor(token_type_ids_1_, mstype.int32)
            input_mask_1_ = input_mask_1[start:start + chunk_len]
            input_mask_1_ = Tensor(input_mask_1_, mstype.int32)
            cls_out = onehop(input_ids_1_, token_type_ids_1_, input_mask_1_)
            if start == 0:
                out = cls_out
            else:
                out = P.Concat(0)((out, cls_out))
            start = end + 1
        out = P.Squeeze(1)(out)
        onehop_prob, onehop_index = P.TopK(sorted=True)(out, config.topk)
        onehop_prob = P.Softmax()(onehop_prob)
        sample, path_raw, last_out = data_generator.get_samples(
            query, onehop_index, onehop_prob)
        input_ids_2, token_type_ids_2, input_mask_2 = data_generator.convert_twohop_to_features(
            sample)
        start_2 = 0
        TOTAL_2 = len(input_ids_2)
        split_chunk = 8
        while start_2 < TOTAL_2:
            end_2 = min(start_2 + split_chunk - 1, TOTAL_2 - 1)
            chunk_len = end_2 - start_2 + 1
            input_ids_2_ = input_ids_2[start_2:start_2 + chunk_len]
            input_ids_2_ = Tensor(input_ids_2_, mstype.int32)
            token_type_ids_2_ = token_type_ids_2[start_2:start_2 + chunk_len]
            token_type_ids_2_ = Tensor(token_type_ids_2_, mstype.int32)
            input_mask_2_ = input_mask_2[start_2:start_2 + chunk_len]
            input_mask_2_ = Tensor(input_mask_2_, mstype.int32)
            cls_out = twohop(input_ids_2_, token_type_ids_2_, input_mask_2_)
            if start_2 == 0:
                out_2 = cls_out
            else:
                out_2 = P.Concat(0)((out_2, cls_out))
            start_2 = end_2 + 1
        out_2 = P.Softmax()(out_2)
        last_out = Tensor(last_out, mstype.float32)
        out_2 = P.Mul()(out_2, last_out)
        val, true_count, topk_titles = eval_output(out_2, last_out, path_raw,
                                                   gold_path, val, true_count)
        temp_dict['topk_titles'] = topk_titles
        output_path.append(temp_dict)
        count += 1
        print("val:", val)
        print("count:", count)
        print("true count:", true_count)
        if count:
            print("PEM:", val / count)
        if true_count:
            print("true top8 PEM:", val / true_count)
        step += 1
    save_json(output_path, config.save_path, config.save_name)
    print("evaluation time (h):", (time.time() - s_tr) / 3600)
Code Example #22
def run_train(sess,
              model,
              train_data,
              valid_data,
              saver,
              evaluator,
              summary_writer=None):
    flag = 0
    best_valid_result = 0.0
    valid_log_history = defaultdict(list)
    global_step = 0
    for i in range(config.num_epoch):
        logger.info(log_title('Train Epoch: {}'.format(i + 1)))
        steps = 0
        total_loss = 0.0
        total_accu = 0.0
        batch_iter = tqdm(
            list(
                make_batch_iter(list(zip(*train_data)),
                                config.batch_size,
                                shuffle=True)))
        for batch in batch_iter:
            topic, topic_len, triple, triple_len, src, src_len, tgt, tgt_len = make_batch_data(
                batch)

            _, loss, accu, global_step, summary = sess.run(
                [
                    model.train_op, model.loss, model.accu, model.global_step,
                    model.summary
                ],
                feed_dict={
                    model.batch_size: len(topic),
                    model.topic: topic,
                    model.topic_len: topic_len,
                    model.triple: triple,
                    model.triple_len: triple_len,
                    model.src: src,
                    model.src_len: src_len,
                    model.tgt: tgt,
                    model.tgt_len: tgt_len,
                    model.training: True
                })

            steps += 1
            total_loss += loss
            total_accu += accu
            batch_iter.set_description(
                'loss: {:>.4f} accuracy: {:>.4f}'.format(loss, accu))
            if global_step % args.log_steps == 0 and summary_writer is not None:
                summary_writer.add_summary(summary, global_step)
            if global_step % args.save_steps == 0:
                # during the pre-train epochs only save checkpoints; evaluate afterwards
                if i < args.pre_train_epochs:
                    saver.save(sess,
                               config.model_file,
                               global_step=global_step)
                else:
                    predicted_ids, valid_loss, valid_accu = run_evaluate(
                        sess, model, valid_data)
                    logger.info(
                        'valid loss: {:>.4f}, valid accuracy: {:>.4f}'.format(
                            valid_loss, valid_accu))

                    save_outputs(predicted_ids, config.id_2_word,
                                 config.valid_data, config.valid_outputs)
                    valid_results = evaluator.evaluate(config.valid_data,
                                                       config.valid_outputs,
                                                       config.to_lower)

                    # early stop
                    if valid_results['BLEU 4'] >= best_valid_result:
                        flag = 0
                        best_valid_result = valid_results['BLEU 4']
                        logger.info('saving model-{}'.format(global_step))
                        saver.save(sess,
                                   config.model_file,
                                   global_step=global_step)
                        save_json(valid_results, config.valid_results)
                    elif flag < args.early_stop:
                        flag += 1
                    elif args.early_stop:
                        return valid_log_history

                    for key, value in valid_results.items():
                        valid_log_history[key].append(value)
                    valid_log_history['loss'].append(valid_loss)
                    valid_log_history['accuracy'].append(valid_accu)
                    valid_log_history['global_step'].append(int(global_step))
        logger.info('train loss: {:>.4f}, train accuracy: {:>.4f}'.format(
            total_loss / steps, total_accu / steps))
    saver.save(sess, config.model_file, global_step=global_step)

    return valid_log_history
Code Example #23
        x="importance",
        y="feature",
        data=feature_importance.sort_values("mean_importance", ascending=False),
    )
    plt.title("Model Features")
    plt.tight_layout()
    plt.savefig(output / "feature_importance.png")

    # ===============================
    # === Make submission
    # ===============================

    sample_submission = pd.read_csv(input_dir / "sample_submission.csv")
    submission_df = make_submission(test_pred, sample_submission)

    # ===============================
    # === Save
    # ===============================

    save_path = output / "output.json"
    output_dict["feature_importance"] = dict()
    output_dict["feature_importance"] = feature_importance_dict
    save_json(output_dict, save_path)

    np.save(output / "oof_preds.npy", oof_pred)

    np.save(output / "test_preds.npy", test_pred)

    config_name = args.output.split("/")[-1]
    submission_df.to_csv(output / f"{config_name}_sub.csv", index=False)
Code Example #24
def eval_language_metrics(checkpoint,
                          eval_data_loader,
                          opt,
                          model=None,
                          eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting to `test` is cheating
    0, run inference
    1, Get METEOR, BLEU1-4, CIDEr scores
    2, Get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_json(json_res, res_filepath, save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_val_1_para.json",
                    "anet_entities_val_2_para.json"
                ]
            ],
            "test": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_test_1_para.json",
                    "anet_entities_test_2_para.json"
                ]
            ]
        }
    else:  # yc2
        reference_files_map = {
            "val":
            [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]
        }

    # COCO language evaluation
    eval_references = reference_files_map[eval_mode]
    lang_filepath = res_filepath.replace(".json", "_lang.json")
    eval_cmd = [
        "python", "para-evaluate.py", "-s", res_filepath, "-o", lang_filepath,
        "-v", "-r"
    ] + eval_references
    subprocess.call(eval_cmd, cwd=opt.eval_tool_dir)

    # basic stats
    stat_filepath = res_filepath.replace(".json", "_stat.json")
    eval_stat_cmd = [
        "python", "get_caption_stat.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", stat_filepath, "-v"
    ]
    subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

    # repetition evaluation
    rep_filepath = res_filepath.replace(".json", "_rep.json")
    eval_rep_cmd = [
        "python", "evaluateRepetition.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", rep_filepath
    ]
    subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

    # save results
    logger.info("Finished eval {}.".format(eval_mode))
    metric_filepaths = [lang_filepath, stat_filepath, rep_filepath]
    all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])

    all_metrics_filepath = res_filepath.replace(".json", "_all_metrics.json")
    save_json(all_metrics, all_metrics_filepath, save_pretty=True)
    return all_metrics, [res_filepath, all_metrics_filepath]
Code Example #25
    parser.add_argument("-ei", "--exp_id", default=None)
    parser.add_argument("-mf", "--metrics_flag", default=1, type=int)

    args = parser.parse_args()
    exp_list = []
    for exp_group_name in args.exp_group_list:
        exp_list += exp_configs.EXP_GROUPS[exp_group_name]

    # loop over experiments
    for exp_dict in exp_list:
        exp_id = ut.hash_dict(exp_dict)

        if args.exp_id is not None and args.exp_id != exp_id:
            continue

        savedir = args.savedir_base + "/%s/" % exp_id
        os.makedirs(savedir, exist_ok=True)
        ut.save_json(savedir + "/exp_dict.json", exp_dict)

        # check if experiment exists
        if args.reset:
            if os.path.exists(savedir + "/score_list.pkl"):
                os.remove(savedir + "/score_list.pkl")
            if os.path.exists(savedir + "/run_dict.pkl"):
                os.remove(savedir + "/run_dict.pkl")

        # do trainval
        trainval(exp_dict=exp_dict,
                 savedir=savedir,
                 datadir=args.datadir,
                 metrics_flag=args.metrics_flag)
Code Example #26
def main():
    os.makedirs(config.temp_dir, exist_ok=True)
    os.makedirs(config.result_dir, exist_ok=True)
    os.makedirs(config.train_log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)
    init_logger(logging.INFO, 'temp.log.txt', 'w')

    logger.info('preparing data...')
    config.word_2_id, config.id_2_word = read_json_dict(config.vocab_dict)
    config.vocab_size = min(config.vocab_size, len(config.word_2_id))
    config.oov_vocab_size = min(config.oov_vocab_size,
                                len(config.word_2_id) - config.vocab_size)

    embedding_matrix = None
    if args.do_train:
        if os.path.exists(config.glove_file):
            logger.info('loading embedding matrix from file: {}'.format(
                config.glove_file))
            embedding_matrix, config.word_em_size = load_glove_embedding(
                config.glove_file, list(config.word_2_id.keys()))
            logger.info('shape of embedding matrix: {}'.format(
                embedding_matrix.shape))
    else:
        if os.path.exists(config.glove_file):
            with open(config.glove_file, 'r', encoding='utf-8') as fin:
                line = fin.readline()
                config.word_em_size = len(line.strip().split()) - 1

    data_reader = DataReader(config)
    evaluator = Evaluator('tgt')

    logger.info('building model...')
    model = get_model(config, embedding_matrix)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        logger.info('loading data...')
        train_data = data_reader.read_train_data()
        valid_data = data_reader.read_valid_data()

        logger.info(log_title('Trainable Variables'))
        for v in tf.trainable_variables():
            logger.info(v)

        logger.info(log_title('Gradients'))
        for g in model.gradients:
            logger.info(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                logger.info('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir,
                                                 sess.graph)

            valid_log_history = run_train(sess, model, train_data, valid_data,
                                          saver, evaluator, train_writer)
            save_json(
                valid_log_history,
                os.path.join(config.result_dir, config.current_model,
                             'valid_log_history.json'))

    if args.do_eval:
        logger.info('loading data...')
        valid_data = data_reader.read_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, valid_loss, valid_accu = run_evaluate(
                    sess, model, valid_data)
                logger.info(
                    'average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'
                    .format(valid_loss, valid_accu))

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_word,
                             config.valid_data, config.valid_outputs)
                results = evaluator.evaluate(config.valid_data,
                                             config.valid_outputs,
                                             config.to_lower)
                save_json(results, config.valid_results)
            else:
                logger.info('model not found!')

    if args.do_test:
        logger.info('loading data...')
        test_data = data_reader.read_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids = run_test(sess, model, test_data)

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_word, config.test_data,
                             config.test_outputs)
                results = evaluator.evaluate(config.test_data,
                                             config.test_outputs,
                                             config.to_lower)
                save_json(results, config.test_results)
            else:
                logger.info('model not found!')
Code Example #27
def main():
    os.makedirs(config.temp_dir, exist_ok=True)
    os.makedirs(config.result_dir, exist_ok=True)
    os.makedirs(config.train_log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)
    init_logger(logging.INFO)

    logger.info('loading dict...')
    config.src_2_id, config.id_2_src = read_json_dict(config.src_vocab_dict)
    config.src_vocab_size = min(config.src_vocab_size, len(config.src_2_id))
    config.tgt_2_id, config.id_2_tgt = read_json_dict(config.tgt_vocab_dict)
    config.tgt_vocab_size = min(config.tgt_vocab_size, len(config.tgt_2_id))

    data_reader = DataReader(config)
    evaluator = Evaluator('tgt')

    logger.info('building model...')
    model = get_model(config)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        logger.info('loading data...')
        train_data = data_reader.load_train_data()
        valid_data = data_reader.load_valid_data()

        logger.info(log_title('Trainable Variables'))
        for v in tf.trainable_variables():
            logger.info(v)

        logger.info(log_title('Gradients'))
        for g in model.gradients:
            logger.info(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                logger.info('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir,
                                                 sess.graph)
            valid_log_history = run_train(sess, model, train_data, valid_data,
                                          saver, evaluator, train_writer)
            save_json(
                valid_log_history,
                os.path.join(config.result_dir, config.current_model,
                             'valid_log_history.json'))

    if args.do_eval:
        logger.info('loading data...')
        valid_data = data_reader.load_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, valid_loss, valid_accu = run_evaluate(
                    sess, model, valid_data)
                logger.info(
                    'average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'
                    .format(valid_loss, valid_accu))

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_tgt, config.valid_data,
                             config.valid_outputs)
                results = evaluator.evaluate(config.valid_data,
                                             config.valid_outputs,
                                             config.to_lower)
                save_json(results, config.valid_results)
            else:
                logger.info('model not found!')

    if args.do_test:
        logger.info('loading data...')
        test_data = data_reader.load_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(
                    os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids = run_test(sess, model, test_data)

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_tgt, config.test_data,
                             config.test_outputs)
                results = evaluator.evaluate(config.test_data,
                                             config.test_outputs,
                                             config.to_lower)
                save_json(results, config.test_results)
            else:
                logger.info('model not found!')
Code Example #28
File: data_simulation.py Project: itaysason/Mix-MMM
def simulate(num_clusters, num_signatures, num_samples, random_seed):
    np.random.seed(random_seed)
    base_model = get_model(
        load_json(
            os.path.join(ROOT_DIR, 'data', 'simulated-data',
                         'base_model.json'))['parameters'])
    if num_clusters > base_model.num_clusters:
        raise ValueError(
            'num_clusters cannot be larger than base_model.num_clusters ({})'.
            format(base_model.num_clusters))
    if num_signatures > base_model.num_topics:
        raise ValueError(
            'num_clusters cannot be larger than base_model.num_topics ({})'.
            format(base_model.num_topics))

    msk_data, _ = get_data('MSK-ALL')
    msk_sizes = np.sum(msk_data, 1).astype('int')

    clusters = np.random.choice(base_model.num_clusters,
                                size=num_clusters,
                                replace=False,
                                p=base_model.w)
    pi = base_model.pi[clusters]
    w = base_model.w[clusters]
    w /= w.sum()
    prob_sig = np.dot(w, pi)
    signatures = np.random.choice(base_model.num_topics,
                                  size=num_signatures,
                                  replace=False,
                                  p=prob_sig)

    pi = pi[:, signatures]
    pi /= pi.sum(1, keepdims=True)
    e = base_model.e[signatures]
    model = Mix(num_clusters,
                num_signatures,
                init_params={
                    'w': w,
                    'pi': pi,
                    'e': e
                })
    sample_sizes = np.random.choice(msk_sizes, num_samples)
    clusters, signatures, mutations = model.sample(sample_sizes)

    curr_dir = os.path.join(
        ROOT_DIR, 'data', 'simulated-data',
        '{}_{}_{}_{}'.format(num_clusters, num_signatures, num_samples,
                             random_seed))
    try:
        os.makedirs(curr_dir)
    except OSError:
        pass

    # Save model, base data
    save_json(os.path.join(curr_dir, 'full_simulated'), {
        'clusters': clusters,
        'signatures': signatures,
        'mutations': mutations
    })
    parameters = model.get_params()

    parameters['w'] = parameters['w'].tolist()
    parameters['pi'] = parameters['pi'].tolist()
    parameters['e'] = parameters['e'].tolist()

    save_json(os.path.join(curr_dir, 'model'), parameters)

    # Transform the basic data into mutation matrix
    mutation_mat = np.zeros((num_samples, 96), dtype='int')
    for i in range(num_samples):
        a, b = np.unique(mutations[i], return_counts=True)
        mutation_mat[i, a] = b

    np.save(os.path.join(curr_dir, 'mutations'), mutation_mat)
Code Example #29

if __name__ == "__main__":
    t_s = time.time()
    config = ThinkRetrieverConfig()
    pool = Pool(processes=config.device_num)
    results = []
    for device_id in range(config.device_num):
        results.append(pool.apply_async(evaluation, (device_id, )))

    print("Waiting for all subprocess done...")

    pool.close()
    pool.join()

    val_all, true_count_all, count_all = 0, 0, 0
    output_path_all = []
    for res in results:
        output = res.get()
        val_all += output['val']
        count_all += output['count']
        true_count_all += output['true_count']
        output_path_all += output['path']
    print("val:", val_all)
    print("count:", count_all)
    print("true count:", true_count_all)
    print("PEM:", val_all / count_all)
    print("true top8 PEM:", val_all / true_count_all)
    save_json(output_path_all, config.save_path, config.save_name)
    print("evaluation time (h):", (time.time() - t_s) / 3600)
Code Example #30
def main():
    parser = argparse.ArgumentParser(description="translate.py")

    parser.add_argument("--eval_splits", type=str, nargs="+", default=["val", ],
                        choices=["val", "test"], help="evaluate on val/test set, yc2 only has val")
    parser.add_argument("--res_dir", required=True, help="path to dir containing model .pt file")
    parser.add_argument("--batch_size", type=int, default=100, help="batch size")

    # beam search configs
    parser.add_argument("--use_beam", action="store_true", help="use beam search, otherwise greedy search")
    parser.add_argument("--beam_size", type=int, default=2, help="beam size")
    parser.add_argument("--n_best", type=int, default=1, help="stop searching when get n_best from beam search")
    parser.add_argument("--min_sen_len", type=int, default=5, help="minimum length of the decoded sentences")
    parser.add_argument("--max_sen_len", type=int, default=30, help="maximum length of the decoded sentences")
    parser.add_argument("--block_ngram_repeat", type=int, default=0, help="block repetition of ngrams during decoding.")
    parser.add_argument("--length_penalty_name", default="none",
                        choices=["none", "wu", "avg"], help="length penalty to use.")
    parser.add_argument("--length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter (higher = longer generation)")
    parser.add_argument("--eval_tool_dir", type=str, default="./densevid_eval")

    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--seed", default=2019, type=int)
    parser.add_argument("--debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    # add some of the train configs
    train_opt = checkpoint["opt"]  # EDict(load_json(os.path.join(opt.res_dir, "model.cfg.json")))
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    print("train_opt", train_opt)

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name, opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir, "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]}
    else:  # yc2
        reference_files_map = {"val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}
    for eval_mode in opt.eval_splits:
        print("Start evaluating {}".format(eval_mode))
        # add 10 at max_n_sen to make the inference stage use all the segments
        eval_data_loader = get_data_loader(opt, eval_mode=eval_mode)
        eval_references = reference_files_map[eval_mode]

        # setup model
        translator = Translator(opt, checkpoint)

        pred_file = os.path.join(opt.res_dir, "{}_pred_{}.json".format(decoding_strategy, eval_mode))
        pred_file = os.path.abspath(pred_file)
        if not os.path.exists(pred_file):
            json_res = run_translate(eval_data_loader, translator, opt=opt)
            save_json(json_res, pred_file, save_pretty=True)
        else:
            print("Using existing prediction file at {}".format(pred_file))

        # COCO language evaluation
        lang_file = pred_file.replace(".json", "_lang.json")
        eval_command = ["python", "para-evaluate.py", "-s", pred_file, "-o", lang_file,
                        "-v", "-r"] + eval_references
        subprocess.call(eval_command, cwd=opt.eval_tool_dir)

        # basic stats
        stat_filepath = pred_file.replace(".json", "_stat.json")
        eval_stat_cmd = ["python", "get_caption_stat.py", "-s", pred_file, "-r", eval_references[0],
                         "-o", stat_filepath, "-v"]
        subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

        # repetition evaluation
        rep_filepath = pred_file.replace(".json", "_rep.json")
        eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", pred_file,
                        "-r", eval_references[0], "-o", rep_filepath]
        subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

        metric_filepaths = [lang_file, stat_filepath, rep_filepath]
        all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
        all_metrics_filepath = pred_file.replace(".json", "_all_metrics.json")
        save_json(all_metrics, all_metrics_filepath, save_pretty=True)

        print("pred_file {} lang_file {}".format(pred_file, lang_file))
        print("[Info] Finished {}.".format(eval_mode))