def get_reviewer(dataset,
                 mode="simulate",
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_idx=None,
                 prior_record_id=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 state_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 seed=None,
                 included_dataset=None,
                 excluded_dataset=None,
                 prior_dataset=None,
                 new=False,
                 **kwargs):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.

    The settings are built from the arguments and then passed through
    ``ASReviewSettings.from_file(config_file)``. When ``state_file`` is
    given, an empty state receives these settings and the settings stored
    in the state are used from then on. Finally, each of ``n_queries``,
    ``n_papers``, ``model_param``, ``query_param``, ``balance_param`` and
    ``feature_param`` that is not None overrides the matching setting.

    Arguments
    ---------
    dataset, included_dataset, excluded_dataset, prior_dataset, new:
        Forwarded to ``create_as_data`` to build the data object. The three
        ``*_dataset`` arguments default to an empty list.
    mode:
        Review mode; must be in ``AVAILABLE_REVIEW_CLASSES``. Only
        "simulate" and "minimal" are constructed by this function.
    prior_idx, prior_record_id:
        Prior knowledge, either as indices or as record ids (converted with
        ``convert_id_to_idx``). Supplying both non-empty is an error.
    seed:
        Passed to ``get_random_state``; the result is shared by the
        classifier, query, balance and feature models.
    embedding_fp:
        Only used for models whose name starts with "lstm-".
    **kwargs:
        Forwarded unchanged to the review class.

    Returns
    -------
    ReviewSimulate or MinimalReview:
        The initialised reviewer.

    Raises
    ------
    ValueError:
        If the mode is unknown or not handled here, if the data contains no
        records, or if both kinds of prior knowledge are supplied.
    """
    # Fail fast: reject an unknown mode before the dataset is loaded or the
    # state file is opened.
    if mode not in AVAILABLE_REVIEW_CLASSES:
        raise ValueError(f"Unknown mode '{mode}'.")

    # None replaces the former mutable ``[]`` defaults; each call gets its
    # own fresh list.
    included_dataset = [] if included_dataset is None else included_dataset
    excluded_dataset = [] if excluded_dataset is None else excluded_dataset
    prior_dataset = [] if prior_dataset is None else prior_dataset

    as_data = create_as_data(dataset,
                             included_dataset,
                             excluded_dataset,
                             prior_dataset,
                             new=new)

    if len(as_data) == 0:
        raise ValueError("Supply at least one dataset"
                         " with at least one record.")

    cli_settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_papers=n_papers,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    feature_extraction=feature_extraction,
                                    mode=mode,
                                    data_fp=None)
    cli_settings.from_file(config_file)

    if state_file is not None:
        with open_state(state_file) as state:
            # A fresh state adopts the current settings; an existing state
            # keeps its own, which then take precedence.
            if state.is_empty():
                state.settings = cli_settings
            settings = state.settings
    else:
        settings = cli_settings

    # Explicitly supplied arguments win over whatever the state contained.
    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers
    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    logging.info(f"Start review in '{mode}' mode.")
    logging.debug(settings)

    # Initialize models.
    random_state = get_random_state(seed)
    train_model = get_classifier(settings.model,
                                 **settings.model_param,
                                 random_state=random_state)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param,
                                  random_state=random_state)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param,
                                      random_state=random_state)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param,
                                      random_state=random_state)

    # LSTM models need embedding matrices.
    if train_model.name.startswith("lstm-"):
        texts = as_data.texts
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # Prior knowledge: indices and record ids are mutually exclusive.
    if prior_idx is not None and prior_record_id is not None and \
            len(prior_idx) > 0 and len(prior_record_id) > 0:
        raise ValueError(
            "Not possible to provide both prior_idx and prior_record_id")
    if prior_record_id is not None and len(prior_record_id) > 0:
        prior_idx = convert_id_to_idx(as_data, prior_record_id)

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(as_data,
                                  model=train_model,
                                  query_model=query_model,
                                  balance_model=balance_model,
                                  feature_model=feature_model,
                                  n_papers=settings.n_papers,
                                  n_instances=settings.n_instances,
                                  n_queries=settings.n_queries,
                                  prior_idx=prior_idx,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  state_file=state_file,
                                  **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(as_data,
                                 model=train_model,
                                 query_model=query_model,
                                 balance_model=balance_model,
                                 feature_model=feature_model,
                                 n_papers=settings.n_papers,
                                 n_instances=settings.n_instances,
                                 n_queries=settings.n_queries,
                                 state_file=state_file,
                                 **kwargs)
    else:
        # Reached for a mode that is registered but not constructed here.
        raise ValueError("Error finding mode, should never come here...")

    return reviewer
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 log_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 abstract_only=False,
                 extra_dataset=None,
                 **kwargs
                 ):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.

    The settings are built from the arguments and then passed through
    ``ASReviewSettings.from_file(config_file)``. When ``log_file`` is given,
    an empty log receives these settings and the settings stored in the log
    are used from then on. Finally, each of ``n_queries``, ``n_papers``,
    ``model_param``, ``query_param``, ``balance_param`` and
    ``feature_param`` that is not None overrides the matching setting.

    Arguments
    ---------
    dataset:
        Key of ``DEMO_DATASETS`` (replaced by the mapped value) or anything
        accepted by ``ASReviewData.from_file``.
    mode:
        One of "simulate", "oracle" or "minimal"; must also be in
        ``AVAILABLE_REVIEW_CLASSES``.
    prior_included, prior_excluded:
        Prior knowledge supplied by the caller. Priors found in the data
        (``as_data.get_priors()``) are appended to a copy, so the caller's
        sequences are never modified.
    extra_dataset:
        Forwarded to ``ASReviewData.from_file``; defaults to an empty list.
    embedding_fp:
        Only used for models whose name starts with "lstm-".
    **kwargs:
        Forwarded unchanged to the review class.

    Returns
    -------
    ReviewSimulate, ReviewOracle or MinimalReview:
        The initialised reviewer.

    Raises
    ------
    ValueError:
        If the mode is unknown or not handled here.
    """
    # None replaces the former mutable ``[]`` default.
    if extra_dataset is None:
        extra_dataset = []

    # Find the URL of the datasets if the dataset is an example dataset.
    if dataset in DEMO_DATASETS:
        dataset = DEMO_DATASETS[dataset]

    cli_settings = ASReviewSettings(
        model=model,
        n_instances=n_instances,
        n_queries=n_queries,
        n_papers=n_papers,
        n_prior_included=n_prior_included,
        n_prior_excluded=n_prior_excluded,
        query_strategy=query_strategy,
        balance_strategy=balance_strategy,
        feature_extraction=feature_extraction,
        mode=mode,
        data_fp=dataset,
        abstract_only=abstract_only)
    cli_settings.from_file(config_file)

    if log_file is not None:
        with open_logger(log_file) as logger:
            # A fresh log adopts the current settings; an existing log
            # keeps its own, which then take precedence.
            if logger.is_empty():
                logger.add_settings(cli_settings)
            settings = logger.settings
    else:
        settings = cli_settings

    # Explicitly supplied arguments win over whatever the log contained.
    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers
    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    # Check if mode is valid
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    as_data = ASReviewData.from_file(dataset,
                                     extra_dataset=extra_dataset,
                                     abstract_only=settings.abstract_only)
    texts = as_data.texts
    y = as_data.labels

    # Merge priors from the data into a *copy* of the caller's priors; the
    # original code extended the caller's list in place.
    data_prior_included, data_prior_excluded = as_data.get_priors()
    if len(data_prior_included) != 0:
        prior_included = [] if prior_included is None else list(prior_included)
        prior_included.extend(data_prior_included.tolist())
    if len(data_prior_excluded) != 0:
        prior_excluded = [] if prior_excluded is None else list(prior_excluded)
        prior_excluded.extend(data_prior_excluded.tolist())

    if as_data.final_labels is not None:
        with open_logger(log_file) as logger:
            logger.set_final_labels(as_data.final_labels)

    train_model = get_model(settings.model, **settings.model_param)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param)

    X = feature_model.fit_transform(texts, as_data.title, as_data.abstract)

    # LSTM models need embedding matrices.
    if train_model.name.startswith("lstm-"):
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(
            X, y,
            model=train_model,
            query_model=query_model,
            balance_model=balance_model,
            feature_model=feature_model,
            n_papers=settings.n_papers,
            n_instances=settings.n_instances,
            n_queries=settings.n_queries,
            verbose=verbose,
            prior_included=prior_included,
            prior_excluded=prior_excluded,
            n_prior_included=settings.n_prior_included,
            n_prior_excluded=settings.n_prior_excluded,
            log_file=log_file,
            final_labels=as_data.final_labels,
            data_fp=dataset,
            **kwargs)
    elif mode == "oracle":
        reviewer = ReviewOracle(
            X,
            model=train_model,
            query_model=query_model,
            balance_model=balance_model,
            feature_model=feature_model,
            as_data=as_data,
            n_papers=settings.n_papers,
            n_instances=settings.n_instances,
            n_queries=settings.n_queries,
            verbose=verbose,
            prior_included=prior_included,
            prior_excluded=prior_excluded,
            log_file=log_file,
            data_fp=dataset,
            **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(
            X,
            # Pass the constructed classifier, as the other modes do; the
            # original passed ``model`` (the model name) here.
            model=train_model,
            query_model=query_model,
            balance_model=balance_model,
            feature_model=feature_model,
            n_papers=settings.n_papers,
            n_instances=settings.n_instances,
            n_queries=settings.n_queries,
            verbose=verbose,
            prior_included=prior_included,
            prior_excluded=prior_excluded,
            log_file=log_file,
            data_fp=dataset,
            **kwargs)
    else:
        # Reached for a mode that is registered but not constructed here.
        raise ValueError("Error finding mode, should never come here...")

    return reviewer
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 log_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 abstract_only=False,
                 **kwargs):
    """Construct the reviewer object that matches ``mode``.

    See __main__.py for a description of the arguments.

    Settings come from the arguments (passed through
    ``ASReviewSettings.from_file(config_file)``) or, when ``log_file`` holds
    settings already, from that log. Any of ``n_queries``, ``n_papers``,
    ``model_param``, ``query_param`` and ``balance_param`` that is not None
    then overrides the matching setting.

    Raises ValueError when ``mode`` is not in ``AVAILABLE_REVIEW_CLASSES``
    or is not one of "simulate", "oracle" or "minimal".
    """
    # Example datasets are referred to by key; swap in the mapped location.
    if dataset in DEMO_DATASETS:
        dataset = DEMO_DATASETS[dataset]

    cli_settings = ASReviewSettings(
        model=model,
        n_instances=n_instances,
        n_queries=n_queries,
        n_papers=n_papers,
        n_prior_included=n_prior_included,
        n_prior_excluded=n_prior_excluded,
        query_strategy=query_strategy,
        balance_strategy=balance_strategy,
        mode=mode,
        data_fp=dataset,
        abstract_only=abstract_only,
    )
    cli_settings.from_file(config_file)

    if log_file is None:
        settings = cli_settings
        logger = None
    else:
        with open_logger(log_file) as logger:
            # Seed an empty log with the current settings, then use
            # whatever the log holds.
            if logger.is_empty():
                logger.add_settings(cli_settings)
            settings = logger.settings

    # Arguments that were explicitly supplied override the settings.
    overrides = (
        ("n_queries", n_queries),
        ("n_papers", n_papers),
        ("model_param", model_param),
        ("query_param", query_param),
        ("balance_param", balance_param),
    )
    for attr_name, attr_value in overrides:
        if attr_value is not None:
            setattr(settings, attr_name, attr_value)

    model = settings.model

    # Reject unknown modes before any data is read.
    if mode not in AVAILABLE_REVIEW_CLASSES:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.info(f"Start review in '{mode}' mode.")
    logging.debug(settings)

    as_data = ASReviewData.from_file(
        dataset, abstract_only=settings.abstract_only)
    _, texts, labels = as_data.get_data()

    final_labels = as_data.final_labels
    if final_labels is not None:
        with open_logger(log_file) as logger:
            logger.set_final_labels(as_data.final_labels)

    # Build the model wrapper, then derive features, labels and the model.
    model_inst = get_model_class(model)(param=settings.model_param,
                                        embedding_fp=embedding_fp)
    X, y = model_inst.get_Xy(texts, labels)
    model_fn = model_inst.model()
    settings.fit_kwargs = model_inst.fit_kwargs()
    settings.query_kwargs = {}

    # Pick query strategy
    query_fn, query_str = get_query_with_settings(settings)
    logging.info(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_with_settings(settings)
    logging.info(f"Using {train_method} method to obtain training data.")

    # Initialize the review class.
    if mode not in ("simulate", "oracle", "minimal"):
        raise ValueError("Error finding mode, should never come here...")

    # Keyword arguments that all three review classes receive.
    shared_kwargs = dict(
        model=model_fn,
        query_strategy=query_fn,
        train_data_fn=train_data_fn,
        n_papers=settings.n_papers,
        n_instances=settings.n_instances,
        n_queries=settings.n_queries,
        verbose=verbose,
        prior_included=prior_included,
        prior_excluded=prior_excluded,
        fit_kwargs=settings.fit_kwargs,
        balance_kwargs=settings.balance_kwargs,
        query_kwargs=settings.query_kwargs,
        log_file=log_file,
    )

    if mode == "simulate":
        return ReviewSimulate(
            X, y,
            n_prior_included=settings.n_prior_included,
            n_prior_excluded=settings.n_prior_excluded,
            final_labels=as_data.final_labels,
            **shared_kwargs,
            **kwargs)
    if mode == "oracle":
        return ReviewOracle(X, as_data=as_data, **shared_kwargs, **kwargs)
    return MinimalReview(X, **shared_kwargs, **kwargs)
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_queries=1,
                 embedding_fp=None,
                 verbose=1,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 src_log_fp=None,
                 **kwargs
                 ):
    """Get a review object from arguments.

    Settings are read from the log at ``src_log_fp`` when it is given;
    otherwise they are built from the arguments and ``config_file``. The
    model named in the settings decides how the data is turned into
    features: "lstm_base" and "lstm_pool" use word embeddings, while "nb",
    "svc" and "svm" use a count-vectorizer + TF-IDF pipeline.

    Arguments
    ---------
    dataset:
        Key of ``DEMO_DATASETS`` (replaced by the mapped value), a pickle
        file holding ``(X, y, embedding_matrix[, extra])``, or anything
        accepted by ``ASReviewData.from_file``.
    mode:
        One of "simulate", "oracle" or "minimal"; must also be in
        ``AVAILABLE_REVIEW_CLASSES``.
    embedding_fp:
        Embedding file for the LSTM models. When None, a path under
        ``get_data_home()`` is used and the embedding is downloaded if that
        file does not exist.
    verbose:
        Truthy values enable progress messages on stdout.
    **kwargs:
        Forwarded unchanged to the review class.

    Returns
    -------
    ReviewSimulate, ReviewOracle or MinimalReview:
        The initialised reviewer, with the settings added to its logger.

    Raises
    ------
    ValueError:
        If the mode or model is unknown, the pickle does not contain a 3-
        or 4-tuple, or "oracle" mode is combined with a pickle dataset.
    """
    # Find the URL of the datasets if the dataset is an example dataset.
    if dataset in DEMO_DATASETS:
        dataset = DEMO_DATASETS[dataset]

    if src_log_fp is not None:
        logger = Logger(log_fp=src_log_fp)
        settings = logger.settings
    else:
        logger = None
        settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    mode=mode,
                                    data_fp=dataset)
        # NOTE(review): applied to freshly built settings only; confirm
        # that settings restored from a log must not be touched by the
        # config file.
        settings.from_file(config_file)

    model_name = settings.model
    is_rnn = model_name in ["lstm_base", "lstm_pool"]

    # Check if mode is valid
    if mode not in AVAILABLE_REVIEW_CLASSES:
        raise ValueError(f"Unknown mode '{mode}'.")
    if verbose:
        print(f"Start review in '{mode}' mode.")
        # Previously printed unconditionally; now honours ``verbose`` like
        # the other progress messages.
        print(f"Model: '{model_name}'")

    if is_pickle(dataset):
        # A pickle only carries features and labels, not the records the
        # oracle has to present. The original code failed later with an
        # UnboundLocalError on ``as_data``; fail early and clearly instead.
        if mode == "oracle":
            raise ValueError(
                "Oracle mode cannot be used with a pickle dataset.")

        # SECURITY: unpickling can execute arbitrary code. Only load pickle
        # files from trusted sources.
        with open(dataset, 'rb') as f:
            data_obj = pickle.load(f)
        if isinstance(data_obj, tuple) and len(data_obj) == 3:
            X, y, embedding_matrix = data_obj
        elif isinstance(data_obj, tuple) and len(data_obj) == 4:
            X, y, embedding_matrix, _ = data_obj
        else:
            raise ValueError("Incorrect pickle object.")
    else:
        as_data = ASReviewData.from_file(dataset)
        _, texts, labels = as_data.get_data()

        # Build the feature matrix for the chosen model family.
        if is_rnn:
            if embedding_fp is None:
                embedding_fp = Path(get_data_home(),
                                    EMBEDDING_EN["name"]).expanduser()
                if not embedding_fp.exists():
                    print("Warning: will start to download large "
                          "embedding file in 10 seconds.")
                    # Grace period so the user can abort the download.
                    time.sleep(10)
                    download_embedding(verbose=verbose)

            # create features and labels
            X, word_index = text_to_features(texts)
            y = labels
            embedding = load_embedding(embedding_fp, word_index=word_index)
            embedding_matrix = sample_embedding(embedding, word_index)
        elif model_name.lower() in ['nb', 'svc', 'svm']:
            from sklearn.pipeline import Pipeline
            from sklearn.feature_extraction.text import TfidfTransformer
            from sklearn.feature_extraction.text import CountVectorizer

            text_clf = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])
            X = text_clf.fit_transform(texts)
            y = labels

    settings.fit_kwargs = {}
    settings.query_kwargs = {}

    # Create the classifier.
    if is_rnn:
        if model_name == "lstm_base":
            model_kwargs = lstm_base_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_base_model
        elif model_name == "lstm_pool":
            model_kwargs = lstm_pool_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_pool_model
        else:
            raise ValueError(f"Unknown model {model_name}")

        settings.fit_kwargs = lstm_fit_defaults(settings, verbose)
        settings.query_kwargs['verbose'] = verbose

        # create the model
        classifier = KerasClassifier(
            create_lstm_model(embedding_matrix=embedding_matrix,
                              **model_kwargs),
            verbose=verbose)
    elif model_name.lower() == 'nb':
        from asreview.models import create_nb_model

        classifier = create_nb_model()
    elif model_name.lower() in ['svm', 'svc']:
        from asreview.models import create_svc_model

        classifier = create_svc_model()
    else:
        raise ValueError('Model not found.')

    # Pick query strategy
    query_fn, query_str = get_query_strategy(settings)
    if verbose:
        print(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_strategy(settings)
    if verbose:
        print(f"Using {train_method} method to obtain training data.")

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(X, y,
                                  model=classifier,
                                  query_strategy=query_fn,
                                  train_data_fn=train_data_fn,
                                  n_instances=settings.n_instances,
                                  n_queries=settings.n_queries,
                                  verbose=verbose,
                                  prior_included=prior_included,
                                  prior_excluded=prior_excluded,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  fit_kwargs=settings.fit_kwargs,
                                  balance_kwargs=settings.balance_kwargs,
                                  query_kwargs=settings.query_kwargs,
                                  logger=logger,
                                  **kwargs)
    elif mode == "oracle":
        reviewer = ReviewOracle(X,
                                model=classifier,
                                query_strategy=query_fn,
                                as_data=as_data,
                                train_data_fn=train_data_fn,
                                n_instances=settings.n_instances,
                                n_queries=settings.n_queries,
                                verbose=verbose,
                                prior_included=prior_included,
                                prior_excluded=prior_excluded,
                                fit_kwargs=settings.fit_kwargs,
                                balance_kwargs=settings.balance_kwargs,
                                query_kwargs=settings.query_kwargs,
                                logger=logger,
                                **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(X,
                                 model=classifier,
                                 query_strategy=query_fn,
                                 train_data_fn=train_data_fn,
                                 n_instances=settings.n_instances,
                                 n_queries=settings.n_queries,
                                 verbose=verbose,
                                 prior_included=prior_included,
                                 prior_excluded=prior_excluded,
                                 fit_kwargs=settings.fit_kwargs,
                                 balance_kwargs=settings.balance_kwargs,
                                 query_kwargs=settings.query_kwargs,
                                 logger=logger,
                                 **kwargs)
    else:
        # Reached for a mode that is registered but not constructed here.
        raise ValueError("Error finding mode, should never come here...")

    reviewer._logger.add_settings(settings)
    return reviewer