def check_lstm(monkeypatch, use_granular=False, **kwargs):
    monkeypatch.setattr('builtins.input', lambda _: "0")
    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode="oracle",
                            embedding_fp=embedding_fp,
                            prior_included=[1, 3],
                            prior_excluded=[2, 4],
                            **kwargs)
    if use_granular:
        # Two loops of training and classification.
        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)

        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)
    else:
        reviewer.review()
    check_log(reviewer._logger._log_dict)
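
The granular branch above unrolls two rounds of reviewer.review() by hand. A minimal sketch that generalizes those rounds into a loop, assuming only the reviewer API already used in this example (granular_review itself is a hypothetical helper, not part of asreview):

def granular_review(reviewer, n_rounds=2, batch_size=1):
    # Each round: fit the model, record its predictions, query the next
    # record(s), look up their labels, and store the decisions.
    for _ in range(n_rounds):
        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(batch_size)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)
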
Example 2
def check_model(monkeypatch=None,
                use_granular=False,
                state_file=h5_state_file,
                continue_from_state=False,
                mode="simulate",
                data_fp=data_fp,
                state_checker=check_state,
                prior_idx=[1, 2, 3, 4],
                **kwargs):
    if not continue_from_state:
        try:
            if state_file is not None:
                os.unlink(state_file)
        except OSError:
            pass

    if monkeypatch is not None:
        monkeypatch.setattr('builtins.input', lambda _: "0")
    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode=mode,
                            embedding_fp=embedding_fp,
                            prior_idx=prior_idx,
                            state_file=state_file,
                            **kwargs)
    if use_granular:
        with open_state(state_file) as state:
            # Two loops of training and classification.
            reviewer.train()
            reviewer.log_probabilities(state)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, state)

            reviewer.train()
            reviewer.log_probabilities(state)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, state)
    else:
        with open_state(state_file) as state:
            if state_file is None:
                state.set_labels(reviewer.y)
                init_idx, init_labels = reviewer._prior_knowledge()
                reviewer.query_i = 0
                reviewer.train_idx = np.array([], dtype=int)
                reviewer.classify(init_idx,
                                  init_labels,
                                  state,
                                  method="initial")

            reviewer._do_review(state)
            if state_file is None:
                print(state._state_dict)
                check_state(state)

    if state_file is not None:
        with open_state(state_file, read_only=True) as state:
            state_checker(state)
Example 3
def check_model(monkeypatch=None,
                use_granular=False,
                log_file=h5_log_file,
                continue_from_log=False,
                mode="oracle",
                **kwargs):
    if not continue_from_log:
        try:
            if log_file is not None:
                os.unlink(log_file)
        except OSError:
            pass

    if monkeypatch is not None:
        monkeypatch.setattr('builtins.input', lambda _: "0")
    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode=mode,
                            embedding_fp=embedding_fp,
                            prior_included=[1, 3],
                            prior_excluded=[2, 4],
                            log_file=log_file,
                            **kwargs)
    if use_granular:
        with open_logger(log_file) as logger:
            # Two loops of training and classification.
            reviewer.train()
            reviewer.log_probabilities(logger)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, logger)

            reviewer.train()
            reviewer.log_probabilities(logger)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, logger)
    else:
        with open_logger(log_file) as logger:
            if log_file is None:
                logger.set_labels(reviewer.y)
                init_idx, init_labels = reviewer._prior_knowledge()
                reviewer.query_i = 0
                reviewer.train_idx = np.array([], dtype=int)
                reviewer.classify(init_idx,
                                  init_labels,
                                  logger,
                                  method="initial")

            reviewer._do_review(logger)
            if log_file is None:
                print(logger._log_dict)
                check_log(logger)

    if log_file is not None:
        with open_logger(log_file, read_only=True) as logger:
            check_log(logger)
Example 4
def test_state_continue_h5():
    inter_file = os.path.join(state_dir, "test_1_inst.h5")
    if not os.path.isfile(inter_file):
        reviewer = get_reviewer(
            data_fp, mode="simulate", model="nb", embedding_fp=embedding_fp,
            prior_idx=[1, 2, 3, 4], state_file=inter_file,
            n_instances=1, n_queries=1)
        reviewer.review()
    copyfile(inter_file, h5_state_file)
    check_model(mode="simulate", model="nb", state_file=h5_state_file,
                continue_from_state=True, n_instances=1, n_queries=2)
Example 5
def test_model_seed():
    n_test = 4
    seed = 192874123
    last_train_idx = None
    for _ in range(n_test):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="rf",
                                query_strategy="random",
                                state_file=None,
                                init_seed=seed,
                                seed=seed,
                                n_prior_excluded=1,
                                n_prior_included=1)
        reviewer.review()
        if last_train_idx is None:
            last_train_idx = reviewer.train_idx
        assert np.all(last_train_idx == reviewer.train_idx)
Example 6
def test_no_seed():
    n_test_max = 100
    as_data = ASReviewData.from_file(data_fp)
    n_priored = np.zeros(len(as_data), dtype=int)

    for _ in range(n_test_max):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                state_file=None,
                                init_seed=None,
                                n_prior_excluded=1,
                                n_prior_included=1)
        assert len(reviewer.start_idx) == 2
        n_priored[reviewer.start_idx] += 1
        if np.all(n_priored > 0):
            return
    raise ValueError(f"Error getting all priors in {n_test_max} iterations.")
Example 7
def test_state_continue_json():
    inter_file = Path(state_dir, "test_1_inst.json")
    if not inter_file.is_file():
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                embedding_fp=embedding_fp,
                                prior_idx=[1, 2, 3, 4],
                                state_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()

    copyfile(inter_file, json_state_file)
    check_model(model="nb",
                state_file=json_state_file,
                continue_from_state=True,
                n_instances=1,
                n_queries=2)
Example 8
def test_log_continue_h5():
    inter_file = os.path.join(log_dir, "test_1_inst.h5")
    if not os.path.isfile(inter_file):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                embedding_fp=embedding_fp,
                                prior_included=[1, 3],
                                prior_excluded=[2, 4],
                                log_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()
    copyfile(inter_file, h5_log_file)
    check_model(mode="simulate",
                model="nb",
                log_file=h5_log_file,
                continue_from_log=True,
                n_instances=1,
                n_queries=2)
Example 9
def test_init_seed():
    base_start_idx = None
    n_test = 4
    seeds = np.random.randint(0, 2**63, 5)
    for _ in range(n_test):
        all_start_idx = []
        for seed in seeds:
            reviewer = get_reviewer(data_fp,
                                    mode="simulate",
                                    model="nb",
                                    state_file=None,
                                    init_seed=seed,
                                    n_prior_excluded=1,
                                    n_prior_included=1)
            assert len(reviewer.start_idx) == 2
            all_start_idx.append(reviewer.start_idx)
        if base_start_idx is None:
            base_start_idx = all_start_idx
            continue

        assert np.all(np.array(base_start_idx) == np.array(all_start_idx))
Example 10
def check_lstm(use_granular=False, **kwargs):
    # start the review process.
    reviewer = get_reviewer(data_fp, mode="simulate",
                            embedding_fp=embedding_fp,
                            prior_included=[1, 3], prior_excluded=[2, 4],
                            **kwargs)
    if use_granular:
        # Two loops of training and classification.
        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)

        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)
    else:
        reviewer.review()
    check_log(reviewer._logger._log_dict)
Example 11
    def execute(self, param, data_name, i_run):
        split_param = get_split_param(param)
        state_file = get_state_file_name(self.trials_dir, data_name, i_run)
        try:
            os.remove(state_file)
        except FileNotFoundError:
            pass

        start_idx = self.get_cached_priors(data_name, i_run)

        reviewer = get_reviewer(data_fp_from_name(self.data_dir, data_name),
                                mode='simulate',
                                model=self.model_name,
                                query_strategy=self.query_name,
                                balance_strategy=self.balance_name,
                                feature_extraction=self.feature_name,
                                n_instances=self.n_instances,
                                n_papers=self.n_papers,
                                state_file=state_file,
                                prior_idx=start_idx,
                                **split_param)

        reviewer.review()
Example 12
def test_state_continue_h5(tmpdir):
    inter_file = Path(STATE_DIR, "test_1_inst.h5")

    if not inter_file.is_file():
        reviewer = get_reviewer(DATA_FP,
                                mode="simulate",
                                model="nb",
                                embedding_fp=EMBEDDING_FP,
                                prior_idx=[1, 2, 3, 4],
                                state_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()

    # copy state file to tmp dir for changes
    tmp_h5_state_fp = Path(tmpdir, "tmp_state.h5")
    copyfile(inter_file, tmp_h5_state_fp)

    check_model(model="nb",
                state_file=tmp_h5_state_fp,
                continue_from_state=True,
                n_instances=1,
                n_queries=2)
Example 13
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(lock_file,
                    blocking=False,
                    lock_name="training",
                    project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing and
        # reading to the same files at the same time.
        with SQLiteLock(lock_file,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id) as lock:
            # Get all labels since the last run. If there are no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx,
                              inclusions,
                              state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        with SQLiteLock(lock_file,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id) as lock:
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
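
The docstring above says only one model may train at a time; the non-blocking "training" lock is what enforces that. A minimal sketch of just this pattern, reusing SQLiteLock and get_lock_path exactly as they appear in the example (train_once and run_training are hypothetical names):

def train_once(project_id, run_training):
    # Assumes SQLiteLock and get_lock_path from the example above.
    lock_file = get_lock_path(project_id)
    with SQLiteLock(lock_file,
                    blocking=False,
                    lock_name="training",
                    project_id=project_id) as lock:
        if not lock.locked():
            # Another training run holds the lock; skip instead of waiting.
            return False
        run_training()
        return True
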
Example 14
def test_dataset_not_found():
    # A missing dataset file is expected to raise; asserting the error here
    # is an assumption (FileNotFoundError and pytest are not shown in the
    # original snippet).
    with pytest.raises(FileNotFoundError):
        reviewer = get_reviewer("doesnt_exist.csv", mode="simulate")
        reviewer.review()
Example 15
def test_dataset_from_benchmark_group():
    reviewer = get_reviewer("benchmark:Cohen_2006_ACEInhibitors",
                            mode="simulate")
    reviewer.review()
Example 16
def test_dataset_from_url():
    reviewer = get_reviewer(DATA_FP_URL, mode="simulate")
    reviewer.review()
Example 17
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing and
        # reading to the same files at the same time.
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:
            # Get all labels since the last run. If there are no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        try:
            del asr_kwargs["abstract_only"]
        except KeyError:
            pass

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(
                query_idx, inclusions, state, method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and output the proba's
        # important: pool is sorted on query
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:

            # read the pool
            current_pool = read_pool(project_id)

            # diff pool and new_query_ind
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
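
The pool update above preserves the reviewer's query order (the pool is sorted on query) while dropping records that have already received a label. A standalone sketch of that filtering step, with made-up positional indices in place of convert_id_to_idx output:

# Hypothetical values; in the code above these come from reviewer.query()
# and convert_id_to_idx().
new_query_idx = [5, 2, 9, 0]          # reviewer's ranking over all records
current_pool_idx = frozenset({2, 0})  # records still awaiting a label

# frozenset membership is O(1), so the ranking is preserved at linear cost.
new_pool_idx = [x for x in new_query_idx if x in current_pool_idx]
assert new_pool_idx == [2, 0]
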