import json
import os

# assumes `datasets` (a mapping of dataset name -> ANDData) and DATA_DIR
# are defined earlier in the script
all_titles_dict = {}
for dataset_name, anddata in datasets.items():
    # this random seed matches the current default
    # and preserves the train/val/test split
    anddata.random_seed = 1111
    (
        train_signatures,
        val_signatures,
        test_signatures,
    ) = anddata.split_cluster_signatures()

    # a different random seed to get a random subset of pairs from train/val/test
    # this tries to get more diversity in the training data instead of just repeating
    # the same exact pairs
    anddata.random_seed = 12455678
    if dataset_name == "medline":
        train_pairs, val_pairs, test_pairs = anddata.fixed_pairs()
    else:
        train_pairs, val_pairs, test_pairs = anddata.split_pairs(
            train_signatures, val_signatures, test_signatures
        )
        train_pairs = train_pairs[:25000]

    # collect the distinct signature ids that appear in each split's pairs
    train_sigs = set()
    val_sigs = set()
    test_sigs = set()
    for i in train_pairs:
        train_sigs.update([i[0], i[1]])
    for i in val_pairs:
        val_sigs.update([i[0], i[1]])
    for i in test_pairs:
        test_sigs.update([i[0], i[1]])

    with open(os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json")) as f:
        papers_dict = json.load(f)

    # the original snippet is truncated here; a plausible completion (an
    # assumption) maps each paper id to its title for this dataset
    titles_dict = {
        paper_id: paper["title"] for paper_id, paper in papers_dict.items()
    }
    all_titles_dict[dataset_name] = titles_dict
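# For context, each pair produced above is a (signature_id_1, signature_id_2, label)
# triple, which is why the loops index i[0] and i[1]. A minimal, self-contained
# illustration of the set-collection step (made-up ids, not real data):
example_pairs = [("sig_a", "sig_b", 1), ("sig_a", "sig_c", 0)]
example_sigs = set()
for pair in example_pairs:
    example_sigs.update([pair[0], pair[1]])
assert example_sigs == {"sig_a", "sig_b", "sig_c"}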
from typing import Optional, Tuple, Union

import numpy as np

# NOTE: this function is excerpted from a larger module; ANDData, FeaturizationInfo,
# DEFAULT_CHUNK_SIZE, TupleOfArrays, logger, and many_pairs_featurize are defined
# elsewhere in that module/package.
def featurize(
    dataset: ANDData,
    featurizer_info: FeaturizationInfo,
    n_jobs: int = 1,
    use_cache: bool = False,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    nameless_featurizer_info: Optional[FeaturizationInfo] = None,
    nan_value: float = np.nan,
    delete_training_data: bool = False,
) -> Union[Tuple[TupleOfArrays, TupleOfArrays, TupleOfArrays], TupleOfArrays]:
    """
    Featurizes the input dataset

    Parameters
    ----------
    dataset: ANDData
        the dataset containing the relevant data
    featurizer_info: FeaturizationInfo
        the FeaturizationInfo object containing the listing of features to use
        and featurizer version
    n_jobs: int
        the number of cpus to use
    use_cache: bool
        whether or not to write to/read from the features cache
    chunk_size: int
        the chunk size for multiprocessing
    nameless_featurizer_info: FeaturizationInfo
        the FeaturizationInfo for creating the features that do not use any name features,
        these will not be computed if this is None
    nan_value: float
        the value to replace nans with
    delete_training_data: bool
        whether to delete some suspicious training examples

    Returns
    -------
    train/val/test features and labels if mode is 'train',
    features and labels for all pairs if mode is 'inference'
    """
    if dataset.mode == "inference":
        logger.info("featurizing all pairs")
        all_pairs = dataset.all_pairs()
        all_features = many_pairs_featurize(
            all_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            False,
        )
        logger.info("featurized all pairs")
        return all_features
    else:
        if dataset.train_pairs is None:
            if dataset.train_blocks is not None:
                (
                    train_signatures,
                    val_signatures,
                    test_signatures,
                ) = dataset.split_cluster_signatures_fixed()
            elif dataset.train_signatures is not None:
                (
                    train_signatures,
                    val_signatures,
                    test_signatures,
                ) = dataset.split_data_signatures_fixed()
            else:
                (
                    train_signatures,
                    val_signatures,
                    test_signatures,
                ) = dataset.split_cluster_signatures()  # type: ignore

            train_pairs, val_pairs, test_pairs = dataset.split_pairs(
                train_signatures, val_signatures, test_signatures
            )
        else:
            train_pairs, val_pairs, test_pairs = dataset.fixed_pairs()

        logger.info("featurizing train")
        train_features = many_pairs_featurize(
            train_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            delete_training_data,
        )
        logger.info("featurized train, featurizing val")
        val_features = many_pairs_featurize(
            val_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            False,
        )
        logger.info("featurized val, featurizing test")
        test_features = many_pairs_featurize(
            test_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            False,
        )
        logger.info("featurized test")
        return train_features, val_features, test_features
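# Usage sketch (illustration only): the import path and the ANDData constructor
# arguments below are assumptions for demonstration; adjust them to the actual
# package layout and data files.
if __name__ == "__main__":
    from s2and.data import ANDData  # assumed import path

    example_dataset = ANDData(
        signatures="signatures.json",  # hypothetical input files
        papers="papers.json",
        clusters="clusters.json",
        name="example",
        mode="train",
    )
    featurization_info = FeaturizationInfo()
    train, val, test = featurize(example_dataset, featurization_info, n_jobs=4)
    X_train, y_train = train[0], train[1]
    print(X_train.shape, y_train.shape)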