def setUp(self): super().setUp() self.dummy_dataset = ANDData( "tests/dummy/signatures.json", "tests/dummy/papers.json", clusters="tests/dummy/clusters.json", name="dummy", load_name_counts=True, ) features_to_use = [ "name_similarity", "affiliation_similarity", "email_similarity", "coauthor_similarity", "venue_similarity", "year_diff", "title_similarity", "reference_features", "misc_features", "name_counts", "journal_similarity", "advanced_name_similarity", ] self.dummy_featurizer = FeaturizationInfo(features_to_use=features_to_use)
def setUp(self): super().setUp() self.dummy_dataset = ANDData( "tests/dummy/signatures.json", "tests/dummy/papers.json", clusters="tests/dummy/clusters.json", cluster_seeds="tests/dummy/cluster_seeds.json", name="dummy", load_name_counts=True, ) features_to_use = [ "year_diff", "misc_features", ] featurizer_info = FeaturizationInfo(features_to_use=features_to_use) np.random.seed(1) X_random = np.random.random((10, 6)) y_random = np.random.randint(0, 6, 10) self.dummy_clusterer = Clusterer( featurizer_info=featurizer_info, classifier=lgb.LGBMClassifier(random_state=1, data_random_seed=1, feature_fraction_seed=1).fit( X_random, y_random), n_jobs=1, use_cache=False, use_default_constraints_as_supervision=False, )
def setUp(self): super().setUp() self.dummy_dataset = ANDData( "tests/dummy/signatures.json", "tests/dummy/papers.json", clusters="tests/dummy/clusters.json", name="dummy", load_name_counts=False, )
def main( experiment_name: str, dont_use_nameless_model: bool, dont_use_rules: bool, dont_use_monotone_constraints: bool, exclude_augmented: bool, single_dataset: str, feature_groups_to_skip: List[str], n_jobs: int, random_seed: int, ): """ This script is used to train and dump a model trained on all the datasets """ DATA_DIR = CONFIG["internal_data_dir"] USE_NAMELESS_MODEL = not dont_use_nameless_model USE_RULES = not dont_use_rules USE_AUGMENTATION = not exclude_augmented USE_MONOTONE_CONSTRAINTS = not dont_use_monotone_constraints N_JOBS = n_jobs for feature_group in feature_groups_to_skip: FEATURES_TO_USE.remove(feature_group) NAMELESS_FEATURES_TO_USE = [ feature_name for feature_name in FEATURES_TO_USE if feature_name not in {"name_similarity", "advanced_name_similarity", "name_counts"} ] FEATURIZER_INFO = FeaturizationInfo(features_to_use=FEATURES_TO_USE, featurizer_version=FEATURIZER_VERSION) NAMELESS_FEATURIZER_INFO = FeaturizationInfo( features_to_use=NAMELESS_FEATURES_TO_USE, featurizer_version=FEATURIZER_VERSION) MONOTONE_CONSTRAINTS = FEATURIZER_INFO.lightgbm_monotone_constraints NAMELESS_MONOTONE_CONSTRAINTS = NAMELESS_FEATURIZER_INFO.lightgbm_monotone_constraints SOURCE_DATASET_NAMES = [ "aminer", "arnetminer", "inspire", "kisti", "orcid", "pubmed", "qian", "zbmath" ] PAIRWISE_ONLY_DATASETS = {"medline", "augmented"} if USE_AUGMENTATION: SOURCE_DATASET_NAMES.append("augmented") if single_dataset != "": SOURCE_DATASET_NAMES = [single_dataset] datasets = {} for dataset_name in tqdm( SOURCE_DATASET_NAMES, desc="Processing datasets and fitting base models"): logger.info(f"processing dataset {dataset_name}") clusters_path: Optional[str] = None if dataset_name not in PAIRWISE_ONLY_DATASETS: clusters_path = os.path.join(DATA_DIR, dataset_name, dataset_name + "_clusters.json") train_pairs_path = None val_pairs_path = None test_pairs_path = None else: train_pairs_path = os.path.join(DATA_DIR, dataset_name, "train_pairs.csv") val_pairs_path = os.path.join(DATA_DIR, dataset_name, "val_pairs.csv") if not os.path.exists(val_pairs_path): val_pairs_path = None test_pairs_path = os.path.join(DATA_DIR, dataset_name, "test_pairs.csv") logger.info(f"loading dataset {dataset_name}") anddata = ANDData( signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"), papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"), name=dataset_name, mode="train", specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"), clusters=clusters_path, block_type=BLOCK_TYPE, train_pairs=train_pairs_path, val_pairs=val_pairs_path, test_pairs=test_pairs_path, train_pairs_size=N_TRAIN_PAIRS_SIZE, val_pairs_size=N_VAL_TEST_SIZE, test_pairs_size=N_VAL_TEST_SIZE, preprocess=True, random_seed=random_seed if random_seed is not None else 1111, ) logger.info(f"featurizing {dataset_name}") train, val, _ = featurize( anddata, FEATURIZER_INFO, n_jobs=N_JOBS, use_cache=True, chunk_size=100, nameless_featurizer_info=NAMELESS_FEATURIZER_INFO, nan_value=np.nan, ) X_train, y_train, nameless_X_train = train X_val, y_val, nameless_X_val = val dataset: Dict[Any, Any] = {} dataset["anddata"] = anddata dataset["X_train"] = X_train dataset["y_train"] = y_train dataset["X_val"] = X_val dataset["y_val"] = y_val dataset["nameless_X_train"] = nameless_X_train dataset["nameless_X_val"] = nameless_X_val datasets[dataset_name] = dataset anddatas = [ datasets[dataset_name]["anddata"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in PAIRWISE_ONLY_DATASETS ] 
X_train = np.vstack([ datasets[dataset_name]["X_train"] for dataset_name in SOURCE_DATASET_NAMES ]) y_train = np.hstack([ datasets[dataset_name]["y_train"] for dataset_name in SOURCE_DATASET_NAMES ]) X_val = np.vstack([ datasets[dataset_name]["X_val"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in {"augmented"} ]) y_val = np.hstack([ datasets[dataset_name]["y_val"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in {"augmented"} ]) nameless_X_train = np.vstack([ datasets[dataset_name]["nameless_X_train"] for dataset_name in SOURCE_DATASET_NAMES ]) nameless_X_val = np.vstack([ datasets[dataset_name]["nameless_X_val"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in {"augmented"} ]) logger.info("fitting pairwise") union_classifier = PairwiseModeler( n_iter=N_ITER, monotone_constraints=MONOTONE_CONSTRAINTS if USE_MONOTONE_CONSTRAINTS else None, random_state=random_seed if random_seed is not None else 42, ) union_classifier.fit(X_train, y_train, X_val, y_val) nameless_union_classifier = None if USE_NAMELESS_MODEL: logger.info("nameless fitting pairwise for " + str(SOURCE_DATASET_NAMES)) nameless_union_classifier = PairwiseModeler( n_iter=N_ITER, monotone_constraints=NAMELESS_MONOTONE_CONSTRAINTS if USE_MONOTONE_CONSTRAINTS else None, random_state=random_seed if random_seed is not None else 42, ) nameless_union_classifier.fit(nameless_X_train, y_train, nameless_X_val, y_val) logger.info("nameless pairwise fit for " + str(SOURCE_DATASET_NAMES)) logger.info("fitting clusterer for") clusterer = Clusterer( FEATURIZER_INFO, union_classifier.classifier, cluster_model=FastCluster(), search_space=search_space, n_jobs=N_JOBS, nameless_classifier=nameless_union_classifier.classifier if nameless_union_classifier is not None else None, nameless_featurizer_info=NAMELESS_FEATURIZER_INFO if nameless_union_classifier is not None else None, use_default_constraints_as_supervision=USE_RULES, use_cache=True, random_state=random_seed if random_seed is not None else 42, ) clusterer.fit(anddatas) print( "best clustering parameters:", clusterer.best_params, ) # now working on the blocks CLAIMS_DATA_DIR = os.path.join(CONFIG["internal_data_dir"], "claims") BLOCK_DATASETS_DIR = os.path.join(CLAIMS_DATA_DIR, "block_datasets") with open(os.path.join(CLAIMS_DATA_DIR, "claims_pairs_remapped.json")) as _json_file: claims_pairs = json.load(_json_file) logger.info("Claims pairs loaded") clusterer.batch_size = 10000000 block_keys = sorted( filter( lambda x: not x.endswith(".json") and not x.endswith(".pickle") and not x.endswith(".py") and not x.endswith( ".vscode") and not x.endswith(".csv"), os.listdir(BLOCK_DATASETS_DIR), ), key=lambda x: os.path.getsize( os.path.join(os.path.join(BLOCK_DATASETS_DIR, x), "claims_signatures.json")), ) # these had errors when manually evaluating for block_key in ["t_xiao", "m_dagostino", "s_tunster", "n_smith"]: block_keys.remove(block_key) # let's only keep the first ~130 for speed purposes block_keys = block_keys[:130] logger.info("starting transfer experiment main, loading name counts") with open(cached_path(NAME_COUNTS_PATH), "rb") as f: ( first_dict, last_dict, first_last_dict, last_first_initial_dict, ) = pickle.load(f) name_counts = { "first_dict": first_dict, "last_dict": last_dict, "first_last_dict": first_last_dict, "last_first_initial_dict": last_first_initial_dict, } logger.info("loaded name counts") results_dict = {} for block_key in tqdm(block_keys): results = {} block_dir = os.path.join(BLOCK_DATASETS_DIR, block_key) 
logger.info(f"Loading dataset {block_key}") claims_dataset = ANDData( signatures=os.path.join(block_dir, "claims_signatures.json"), papers=os.path.join(block_dir, "claims_papers.json"), mode="inference", specter_embeddings=os.path.join(block_dir, "claims_specter.pickle"), block_type="s2", name=block_key.replace(" ", "_"), n_jobs=n_jobs, load_name_counts=name_counts, ) logger.info("Dataset loaded") result = claims_eval( claims_dataset, clusterer, claims_pairs, os.path.join(BLOCK_DATASETS_DIR, claims_dataset.name), output_shap=False, optional_name=experiment_name, ) results[block_key.replace(" ", "_")] = result logger.info(f"Claims eval output: {result}") with open( os.path.join( BLOCK_DATASETS_DIR, claims_dataset.name, f"results_{experiment_name}.json", ), "w", ) as _json_file: json.dump(results, _json_file) results_dict.update(results) pd.DataFrame(results_dict).T.to_csv( os.path.join(BLOCK_DATASETS_DIR, f"{experiment_name}.csv"))
def main( experiment_name: str, dont_use_nameless_model: bool, n_jobs: int, dont_use_monotone_constraints: bool, linkage: str, use_dbscan: bool, negative_one_for_nan: bool, random_seed: int, inspire_split: int, inspire_only: bool, aminer_only: bool, ): USE_NAMELESS_MODEL = not dont_use_nameless_model N_JOBS = n_jobs USE_MONOTONE_CONSTRAINTS = not dont_use_monotone_constraints logger.info((f"USE_NAMELESS_MODEL={USE_NAMELESS_MODEL}, " f"N_JOBS={N_JOBS}, " f"USE_MONOTONE_CONSTRAINTS={USE_MONOTONE_CONSTRAINTS}, " f"linkage={linkage}, " f"use_dbscan={use_dbscan}, " f"negative_one_for_nan={negative_one_for_nan}, " f"random_seed={random_seed}")) if inspire_only: DATASET_NAMES = ["inspire"] elif aminer_only: DATASET_NAMES = ["aminer"] else: DATASET_NAMES = [ "kisti", "pubmed", "medline", ] FIXED_BLOCK = ["aminer"] FIXED_SIGNATURE = ["inspire"] if negative_one_for_nan: MONOTONE_CONSTRAINTS = None NAMELESS_MONOTONE_CONSTRAINTS = None NAN_VALUE = -1 else: MONOTONE_CONSTRAINTS = FEATURIZER_INFO.lightgbm_monotone_constraints NAMELESS_MONOTONE_CONSTRAINTS = NAMELESS_FEATURIZER_INFO.lightgbm_monotone_constraints NAN_VALUE = np.nan with open(cached_path(NAME_COUNTS_PATH), "rb") as f: ( first_dict, last_dict, first_last_dict, last_first_initial_dict, ) = pickle.load(f) name_counts = { "first_dict": first_dict, "last_dict": last_dict, "first_last_dict": first_last_dict, "last_first_initial_dict": last_first_initial_dict, } logger.info("loaded name counts") datasets: Dict[str, Any] = {} for dataset_name in tqdm( DATASET_NAMES, desc="Processing datasets and fitting base models"): logger.info("") logger.info(f"processing dataset {dataset_name}") clusters_path: Optional[str] = None train_blocks: Optional[str] = None val_blocks: Optional[str] = None test_blocks: Optional[str] = None train_pairs_path: Optional[str] = None val_pairs_path: Optional[str] = None test_pairs_path: Optional[str] = None train_signatures: Optional[str] = None val_signatures: Optional[str] = None test_signatures: Optional[str] = None if dataset_name in FIXED_BLOCK: logger.info("FIXED BLOCK") train_blocks_fname: str = "train_keys.json" val_blocks_fname: str = "val_keys.json" test_blocks_fname: str = "test_keys.json" logger.info( f"File names, FIXED BLOCK {train_blocks_fname, val_blocks_fname, test_blocks_fname}" ) clusters_path = os.path.join(DATA_DIR, dataset_name, dataset_name + "_clusters.json") train_blocks = os.path.join(DATA_DIR, dataset_name, train_blocks_fname) if not os.path.exists( os.path.join(DATA_DIR, dataset_name, val_blocks_fname)): val_blocks = None test_blocks = os.path.join(DATA_DIR, dataset_name, test_blocks_fname) elif dataset_name in FIXED_SIGNATURE: train_sign_fname: str = "train_keys_" + str( inspire_split) + ".json" val_sign_fname: str = "val_keys_" + str(inspire_split) + ".json" test_sign_fname: str = "test_keys_" + str(inspire_split) + ".json" logger.info( f"File names, FIXED_SIGNATURE {train_sign_fname, val_sign_fname, test_sign_fname}" ) clusters_path = os.path.join(DATA_DIR, dataset_name, dataset_name + "_clusters.json") train_signatures = os.path.join(DATA_DIR, dataset_name, train_sign_fname) if not os.path.exists( os.path.join(DATA_DIR, dataset_name, val_sign_fname)): val_signatures = None test_signatures = os.path.join(DATA_DIR, dataset_name, test_sign_fname) elif dataset_name not in PAIRWISE_ONLY_DATASETS: logger.info("CLUSTER with random split") clusters_path = os.path.join(DATA_DIR, dataset_name, dataset_name + "_clusters.json") else: logger.info("Pairwise model") train_pairs_path = os.path.join(DATA_DIR, 
dataset_name, "train_pairs.csv") val_pairs_path = os.path.join(DATA_DIR, dataset_name, "val_pairs.csv") if not os.path.exists(val_pairs_path): val_pairs_path = None test_pairs_path = os.path.join(DATA_DIR, dataset_name, "test_pairs.csv") logger.info(f"loading dataset {dataset_name}") if dataset_name == "inspire" or dataset_name == "kisti": unit_of_data_split = "signatures" else: unit_of_data_split = "blocks" if dataset_name == "kisti": train_ratio = 0.4 val_ratio = 0.1 test_ratio = 0.5 else: train_ratio = 0.8 val_ratio = 0.1 test_ratio = 0.1 logger.info(f"ratios {train_ratio, val_ratio, test_ratio}") logger.info(f"block keys {train_blocks, val_blocks, test_blocks}") logger.info( f"signature keys {train_signatures, val_signatures, test_signatures}" ) anddata = ANDData( signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"), papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"), name=dataset_name, mode="train", specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"), clusters=clusters_path, block_type=BLOCK_TYPE, train_pairs=train_pairs_path, val_pairs=val_pairs_path, test_pairs=test_pairs_path, train_pairs_size=N_TRAIN_PAIRS_SIZE, val_pairs_size=N_VAL_TEST_SIZE, test_pairs_size=N_VAL_TEST_SIZE, n_jobs=N_JOBS, load_name_counts=name_counts, preprocess=PREPROCESS, random_seed=random_seed, train_blocks=train_blocks, val_blocks=val_blocks, test_blocks=test_blocks, train_signatures=train_signatures, val_signatures=val_signatures, test_signatures=test_signatures, train_ratio=train_ratio, val_ratio=val_ratio, test_ratio=test_ratio, unit_of_data_split=unit_of_data_split, ) logger.info(f"dataset {dataset_name} loaded") logger.info(f"featurizing {dataset_name}") train, val, test = featurize( anddata, FEATURIZER_INFO, n_jobs=N_JOBS, use_cache=USE_CACHE, chunk_size=DEFAULT_CHUNK_SIZE, nameless_featurizer_info=NAMELESS_FEATURIZER_INFO, nan_value=NAN_VALUE) # type: ignore X_train, y_train, nameless_X_train = train X_val, y_val, nameless_X_val = val assert test is not None X_test, y_test, nameless_X_test = test logger.info(f"dataset {dataset_name} featurized") pairwise_modeler: Optional[PairwiseModeler] = None nameless_pairwise_modeler = None cluster: Optional[Clusterer] = None logger.info(f"fitting pairwise for {dataset_name}") pairwise_modeler = PairwiseModeler( n_iter=N_ITER, monotone_constraints=MONOTONE_CONSTRAINTS if USE_MONOTONE_CONSTRAINTS else None, random_state=random_seed, ) pairwise_modeler.fit(X_train, y_train, X_val, y_val) logger.info(f"pairwise fit for {dataset_name}") if USE_NAMELESS_MODEL: logger.info(f"nameless fitting pairwise for {dataset_name}") nameless_pairwise_modeler = PairwiseModeler( n_iter=N_ITER, monotone_constraints=NAMELESS_MONOTONE_CONSTRAINTS if USE_MONOTONE_CONSTRAINTS else None, random_state=random_seed, ) nameless_pairwise_modeler.fit(nameless_X_train, y_train, nameless_X_val, y_val) logger.info(f"nameless pairwise fit for {dataset_name}") distances_for_sparsity = [ 1 - pred[1] for pred in pairwise_modeler.predict_proba(X_train) ] threshold = np.percentile(distances_for_sparsity, [10, 20, 30, 40, 50, 60, 70, 80, 90]) logger.info(f"Thresholds {threshold}") if dataset_name not in PAIRWISE_ONLY_DATASETS: logger.info(f"fitting clusterer for {dataset_name}") cluster = Clusterer( FEATURIZER_INFO, pairwise_modeler.classifier, cluster_model=FastCluster(linkage=linkage) if not use_dbscan else DBSCAN(min_samples=1, metric="precomputed"), search_space=search_space, n_jobs=N_JOBS, 
use_cache=USE_CACHE, nameless_classifier=nameless_pairwise_modeler.classifier if nameless_pairwise_modeler is not None else None, nameless_featurizer_info=NAMELESS_FEATURIZER_INFO, random_state=random_seed, use_default_constraints_as_supervision=False, ) cluster.fit(anddata) logger.info(f"clusterer fit for {dataset_name}") logger.info(f"{dataset_name} best clustering parameters: " + str(cluster.best_params)) dataset: Dict[str, Any] = {} dataset["anddata"] = anddata dataset["X_train"] = X_train dataset["y_train"] = y_train dataset["X_val"] = X_val dataset["y_val"] = y_val dataset["X_test"] = X_test dataset["y_test"] = y_test dataset["pairwise_modeler"] = pairwise_modeler dataset["nameless_X_train"] = nameless_X_train dataset["nameless_X_val"] = nameless_X_val dataset["nameless_X_test"] = nameless_X_test dataset["nameless_pairwise_modeler"] = nameless_pairwise_modeler dataset["clusterer"] = cluster dataset["name"] = anddata.name datasets[dataset_name] = dataset logger.info("") logger.info("making evaluation grids") b3_f1_grid = [["" for j in range(len(DATASET_NAMES) + 1)] for i in range(len(DATASET_NAMES) + 1)] for i in range(max(len(DATASET_NAMES), len(DATASET_NAMES))): if i < len(DATASET_NAMES): b3_f1_grid[0][i + 1] = DATASET_NAMES[i] if i < len(DATASET_NAMES): b3_f1_grid[i + 1][0] = DATASET_NAMES[i] pairwise_auroc_grid = copy.deepcopy(b3_f1_grid) # makes a copy of the grid pairwise_f1_classification_grid = copy.deepcopy( b3_f1_grid) # makes a copy of the grid pairwise_average_precisision_grid = copy.deepcopy( b3_f1_grid) # makes a copy of the grid pairwise_macro_f1_grid = copy.deepcopy( b3_f1_grid) # makes a copy of the grid # transfer of individual models logger.info("starting individual model evaluation") for _, source_dataset in tqdm(datasets.items(), desc="Evaluating individual models"): logger.info("") logger.info( f"evaluating source {source_dataset['name']} target {source_dataset['name']}" ) pairwise_metrics, cluster_metrics, _ = sota_helper( source_dataset, experiment_name, random_seed) b3_f1_grid[DATASET_NAMES.index(source_dataset["name"]) + 1][DATASET_NAMES.index(source_dataset["name"]) + 1] = cluster_metrics["B3 (P, R, F1)"][2] pairwise_macro_f1_grid[ DATASET_NAMES.index(source_dataset["name"]) + 1][DATASET_NAMES.index(source_dataset["name"]) + 1] = cluster_metrics["Cluster Macro (P, R, F1)"][2] pairwise_auroc_grid[DATASET_NAMES.index(source_dataset["name"]) + 1][DATASET_NAMES.index(source_dataset["name"]) + 1] = pairwise_metrics["AUROC"] pairwise_f1_classification_grid[ DATASET_NAMES.index(source_dataset["name"]) + 1][DATASET_NAMES.index(source_dataset["name"]) + 1] = pairwise_metrics["F1"] pairwise_average_precisision_grid[ DATASET_NAMES.index(source_dataset["name"]) + 1][DATASET_NAMES.index(source_dataset["name"]) + 1] = pairwise_metrics["Average Precision"] logger.info("finished individual model evaluation") # union logger.info("") logger.info("writing results to disk") print("B3 F1:") b3_df = pd.DataFrame(b3_f1_grid) print(b3_df) print() print("Pairwise Macro F1 (cluster):") pairwise_macro_f1_df = pd.DataFrame(pairwise_macro_f1_grid) print(pairwise_macro_f1_df) print() print("Pairwise AUROC:") pairwise_df = pd.DataFrame(pairwise_auroc_grid) print(pairwise_df) print() print("Pairwise classification F1:") pairwise_classification_f1_df = pd.DataFrame( pairwise_f1_classification_grid) print(pairwise_classification_f1_df) print() print("Pairwise AP:") pairwise_ap_df = pd.DataFrame(pairwise_average_precisision_grid) print(pairwise_ap_df) print() with open( os.path.join( 
DATA_DIR, "experiments", experiment_name, "sota", f"seed_{random_seed}", "metrics", "full_grid.json", ), "w", ) as _json_file: json.dump( { "b3": b3_f1_grid, "pairwisef1": pairwise_macro_f1_grid, "auroc": pairwise_auroc_grid, "classificationf1": pairwise_f1_classification_grid, "averageprecision": pairwise_average_precisision_grid, }, _json_file, ) b3_df.to_csv( os.path.join( DATA_DIR, "experiments", experiment_name, "sota", f"seed_{random_seed}", "metrics", "b3.csv", ), index=False, ) pairwise_macro_f1_df.to_csv( os.path.join( DATA_DIR, "experiments", experiment_name, "sota", f"seed_{random_seed}", "metrics", "pair_macro_f1_cluster.csv", ), index=False, ) pairwise_df.to_csv( os.path.join( DATA_DIR, "experiments", experiment_name, "sota", f"seed_{random_seed}", "metrics", "pairwise_auc.csv", ), index=False, ) pairwise_classification_f1_df.to_csv( os.path.join( DATA_DIR, "experiments", experiment_name, "sota", f"seed_{random_seed}", "metrics", "classification_f1.csv", ), index=False, ) pairwise_ap_df.to_csv( os.path.join( DATA_DIR, "experiments", experiment_name, "sota", f"seed_{random_seed}", "metrics", "average_precision.csv", ), index=False, ) return ( b3_f1_grid, pairwise_macro_f1_grid, pairwise_auroc_grid, pairwise_f1_classification_grid, pairwise_average_precisision_grid, )
def main( max_train_positives_per_dataset: int, max_val_positives_per_dataset: int, max_test_positives_per_dataset: int, negatives_multiplier: float, drop_abstract_prob: float, drop_affiliations_prob: float, drop_references_prob: float, drop_first_name_prob: float, drop_venue_journal_prob: float, drop_coauthors_prob: float, translate_title_prob: float, ): """ This script creates the extra "augmentation" dataset from the existing datasets, by randomly removing features, to simulate real usage better """ random.seed(1111) augmentation_pairs = pd.read_csv( os.path.join(AUGMENTATION_DIR, "source_tuples.csv")).to_dict("records") with open(os.path.join(AUGMENTATION_DIR, "title_only_specters.pickle"), "rb") as _pickle_file: title_only_specter = pickle.load(_pickle_file) datasets: Dict[str, Any] = {} for dataset_name in tqdm( SOURCE_DATASET_NAMES, desc="Processing datasets and fitting base models"): logger.info("") logger.info(f"processing dataset {dataset_name}") logger.info(f"loading dataset {dataset_name}") anddata = ANDData( signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"), papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"), name=dataset_name, mode="inference", specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"), block_type="s2", n_jobs=25, load_name_counts=False, preprocess=False, ) logger.info(f"dataset {dataset_name} loaded") datasets[dataset_name] = anddata full_papers = {} full_signatures = {} full_specter_keys = [] full_specter_D = [] train_pairs = [] val_pairs = [] test_pairs = [] pair_counts: Dict[str, Dict[str, Dict[int, int]]] = defaultdict( lambda: defaultdict(lambda: defaultdict(int))) for row in augmentation_pairs: split = row["split"] dataset_name = row["dataset_name"] signature_id_1 = row["signature_id_1"] signature_id_2 = row["signature_id_2"] label = row["label"] count_value = pair_counts[dataset_name][split][label] max_value = (max_train_positives_per_dataset if split == "train" else max_val_positives_per_dataset if split == "val" else max_test_positives_per_dataset) * (negatives_multiplier if label == 0 else 1.0) if count_value >= max_value or dataset_name not in SOURCE_DATASET_NAMES: continue pair_counts[dataset_name][split][label] += 1 pair = (dataset_name + "___" + str(signature_id_1), dataset_name + "___" + str(signature_id_2), label) if split == "train": train_pairs.append(pair) elif split == "val": val_pairs.append(pair) elif split == "test": test_pairs.append(pair) logger.info( f"Total pairs (train, val, test): {len(train_pairs)}, {len(val_pairs)}, {len(test_pairs)}" ) pair_counts_dict: Dict[str, Dict[str, Dict[int, int]]] = {} for dataset, d1 in pair_counts.items(): pair_counts_dict[dataset] = {} for split, d2 in d1.items(): pair_counts_dict[dataset][split] = {} for label, count in d2.items(): pair_counts_dict[dataset][split][label] = count logger.info(pair_counts_dict) all_signatures = set( [item for sublist in train_pairs for item in sublist[:2]] + [item for sublist in val_pairs for item in sublist[:2]] + [item for sublist in test_pairs for item in sublist[:2]]) reference_papers_to_add = set() for signature in all_signatures: original_dataset, original_signature_id = signature.split("___") original_signature = datasets[original_dataset].signatures[ original_signature_id] original_paper = datasets[original_dataset].papers[str( original_signature.paper_id)] original_references = [(original_dataset, paper_id) for paper_id in original_paper.references] new_signature_id = 
signature new_references = [ copy.deepcopy(reference) for reference in original_references ] coin_flip = random.uniform(0, 1) if coin_flip < drop_abstract_prob: new_has_abstract = False full_specter_keys.append(str(original_signature.paper_id)) full_specter_D.append( title_only_specter[original_dataset + "_" + str(original_signature.paper_id)]) else: new_has_abstract = original_paper.has_abstract full_specter_keys.append(str(original_signature.paper_id)) full_specter_D.append( datasets[original_dataset].specter_embeddings[str( original_signature.paper_id)]) coin_flip = random.uniform(0, 1) if coin_flip < drop_references_prob: new_references = [] else: reference_papers_to_add.update(new_references) new_references = [reference[1] for reference in new_references] coin_flip = random.uniform(0, 1) if coin_flip < drop_affiliations_prob: new_affiliations = [] else: new_affiliations = original_signature.author_info_affiliations coin_flip = random.uniform(0, 1) if coin_flip < drop_venue_journal_prob: new_venue = None new_journal_name = None else: new_venue = original_paper.venue new_journal_name = original_paper.journal_name coin_flip = random.uniform(0, 1) if coin_flip < drop_first_name_prob: new_first = (original_signature.author_info_first[0] if original_signature.author_info_first is not None and len(original_signature.author_info_first) > 0 else original_signature.author_info_first) else: new_first = original_signature.author_info_first coin_flip = random.uniform(0, 1) if coin_flip < drop_coauthors_prob: new_paper_authors = [ author for author in original_paper.authors if author.position == original_signature.author_info_position ] else: new_paper_authors = original_paper.authors coin_flip = random.uniform(0, 1) if coin_flip < translate_title_prob: new_title = translate(original_paper.title) else: new_title = original_paper.title new_signature = original_signature._replace( author_info_first=new_first, author_info_affiliations=new_affiliations, signature_id=new_signature_id, author_info_first_normalized=None, author_info_first_normalized_without_apostrophe=None, author_info_middle_normalized=None, author_info_middle_normalized_without_apostrophe=None, author_info_last_normalized=None, author_info_suffix_normalized=None, author_info_coauthors=None, author_info_coauthor_blocks=None, ) new_paper = original_paper._replace( venue=new_venue, journal_name=new_journal_name, references=new_references, title=new_title, has_abstract=new_has_abstract, authors=new_paper_authors, ) new_signature_dict = dict(new_signature._asdict()) new_signature_dict["author_info"] = {} keys_to_delete = [] for key, value in new_signature_dict.items(): if key.startswith("author_info_"): keys_to_delete.append(key) new_signature_dict["author_info"][key[12:]] = value for key in keys_to_delete: del new_signature_dict[key] full_signatures[signature] = new_signature_dict full_papers[str(new_paper.paper_id)] = dict(new_paper._asdict()) full_papers[str(new_paper.paper_id)]["authors"] = [ dict(author._asdict()) for author in full_papers[str(new_paper.paper_id)]["authors"] ] # we currently don't need the actual abstract, but just need to know if it exists or not if full_papers[str(new_paper.paper_id)]["has_abstract"]: full_papers[str(new_paper.paper_id)]["abstract"] = "EXISTS" else: full_papers[str(new_paper.paper_id)]["abstract"] = "" logger.info(f"Adding {len(reference_papers_to_add)} reference papers") reference_papers_added = 0 for dataset_name, paper_id in reference_papers_to_add: if str(paper_id) not in full_papers and str( 
paper_id) in datasets[dataset_name].papers: full_papers[str(paper_id)] = dict( datasets[dataset_name].papers[str(paper_id)]._asdict()) full_papers[str(paper_id)]["authors"] = [ dict(author._asdict()) for author in full_papers[str(paper_id)]["authors"] ] if full_papers[str(paper_id)]["has_abstract"]: full_papers[str(paper_id)]["abstract"] = "EXISTS" else: full_papers[str(paper_id)]["abstract"] = "" reference_papers_added += 1 logger.info(f"Added {reference_papers_added} reference papers") logger.info(f"Dumping {len(full_papers)} papers") with open(os.path.join(AUGMENTATION_DIR, "augmented_papers.json"), "w") as _json_file: json.dump(full_papers, _json_file) logger.info(f"Dumping {len(full_signatures)} signatures") with open(os.path.join(AUGMENTATION_DIR, "augmented_signatures.json"), "w") as _json_file: json.dump(full_signatures, _json_file) full_specter_D_np = np.array(full_specter_D) logger.info( f"Dumping {full_specter_D_np.shape, len(full_specter_keys)} specter") with open(os.path.join(AUGMENTATION_DIR, "augmented_specter.pickle"), "wb") as _pickle_file: pickle.dump((full_specter_D_np, full_specter_keys), _pickle_file, protocol=pickle.HIGHEST_PROTOCOL) train_pairs_df = pd.DataFrame(train_pairs, columns=["pair1", "pair2", "label"]) train_pairs_df["label"] = train_pairs_df["label"].apply( lambda x: "YES" if x == 1 else "NO") val_pairs_df = pd.DataFrame(val_pairs, columns=["pair1", "pair2", "label"]) val_pairs_df["label"] = val_pairs_df["label"].apply(lambda x: "YES" if x == 1 else "NO") test_pairs_df = pd.DataFrame(test_pairs, columns=["pairs1", "pair2", "label"]) test_pairs_df["label"] = test_pairs_df["label"].apply(lambda x: "YES" if x == 1 else "NO") logger.info("Writing pairs csvs") train_pairs_df.to_csv(os.path.join(AUGMENTATION_DIR, "train_pairs.csv"), index=False, header=True) val_pairs_df.to_csv(os.path.join(AUGMENTATION_DIR, "val_pairs.csv"), index=False, header=True) test_pairs_df.to_csv(os.path.join(AUGMENTATION_DIR, "test_pairs.csv"), index=False, header=True) logger.info("Done.")
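# Hedged usage sketch (an assumption, not part of the augmentation script above): how the
# files dumped above could be loaded back as a pairwise-only dataset, mirroring how the
# "augmented" dataset is consumed by the training scripts in this section. The module path
# and the AUGMENTATION_DIR value are assumptions for illustration.
import os

from s2and.data import ANDData  # assumed module layout

AUGMENTATION_DIR = "data/augmented"  # hypothetical location of the dumped files

augmented_data = ANDData(
    signatures=os.path.join(AUGMENTATION_DIR, "augmented_signatures.json"),
    papers=os.path.join(AUGMENTATION_DIR, "augmented_papers.json"),
    specter_embeddings=os.path.join(AUGMENTATION_DIR, "augmented_specter.pickle"),
    name="augmented",
    mode="train",
    block_type="s2",
    # no clusters file: this dataset is pairwise-only, so the fixed pair CSVs are supplied
    train_pairs=os.path.join(AUGMENTATION_DIR, "train_pairs.csv"),
    val_pairs=os.path.join(AUGMENTATION_DIR, "val_pairs.csv"),
    test_pairs=os.path.join(AUGMENTATION_DIR, "test_pairs.csv"),
    load_name_counts=False,
    preprocess=True,
)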
else: train_pairs_path = os.path.join(DATA_DIR, dataset_name, "train_pairs.csv") val_pairs_path = os.path.join(DATA_DIR, dataset_name, "val_pairs.csv") if not os.path.exists(val_pairs_path): val_pairs_path = None test_pairs_path = os.path.join(DATA_DIR, dataset_name, "test_pairs.csv") anddata = ANDData( signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"), papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"), name=dataset_name, mode="train", specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"), clusters=clusters_path, block_type="s2", train_pairs=train_pairs_path, val_pairs=val_pairs_path, test_pairs=test_pairs_path, train_pairs_size=N_TRAIN_PAIRS_SIZE, val_pairs_size=N_VAL_TEST_SIZE, test_pairs_size=N_VAL_TEST_SIZE, n_jobs=N_JOBS, load_name_counts=False, preprocess=False, ) datasets[dataset_name] = anddata tuples = [] all_titles_dict = {} for dataset_name, anddata in datasets.items(): # this random seed matches the current default
class TestData(unittest.TestCase): def setUp(self): super().setUp() self.dummy_dataset = ANDData( "tests/dummy/signatures.json", "tests/dummy/papers.json", clusters="tests/dummy/clusters.json", name="dummy", load_name_counts=True, ) features_to_use = [ "name_similarity", "affiliation_similarity", "email_similarity", "coauthor_similarity", "venue_similarity", "year_diff", "title_similarity", "reference_features", "misc_features", "name_counts", "journal_similarity", "advanced_name_similarity", ] self.dummy_featurizer = FeaturizationInfo(features_to_use=features_to_use) def check_features_array_equal(self, array_1, array_2): assert len(array_1) == len(array_2) for i in range(len(array_1)): both_nan = np.isnan(array_1[i]) and np.isnan(array_2[i]) if not both_nan: self.assertAlmostEqual(array_1[i], array_2[i], msg=i) def test_featurizer(self): test_pairs = [ ("3", "0", 0), ("3", "1", 0), ("3", "2", 0), ("3", "2", -1), ] features, labels, _ = many_pairs_featurize( test_pairs, self.dummy_dataset, self.dummy_featurizer, 2, False, 1, nan_value=-1 ) expected_features_1 = [ 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.2, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0, 4.0, 0.0, 0.03067484662576687, -1.0, -1.0, -1.0, -1.0, 0.0, -1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 82081.0, 12.0, 807.0, 1.0, -1.0, -1.0, -1.0, 0.7777777777777778, 0.8, 0.7777777777777778, 0.5407407407407407, ] expected_features_2 = [ 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.2, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0, 6.0, 0.02857142857142857, 0.09615384615384616, 0.25757575757575757, 0.34615384615384615, 0.8181818181818182, 0.2222222222222222, 0.0, 0.5, 1.0, 2.0, 2.0, 1.0, 2.0, 23425.0, 12.0, 807.0, 1.0, 82081.0, 20.0, -1.0, 0.7777777777777778, 0.8, 0.7777777777777778, 0.5407407407407407, ] expected_features_3 = [ 0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.2, -1.0, -1.0, -1.0, -1.0, -1.0, 0.0, 6.0, 0.0, 0.058823529411764705, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 23425.0, 12.0, 807.0, 1.0, 82081.0, 20.0, -1.0, 0.7777777777777778, 0.8, 0.7777777777777778, 0.5407407407407407, ] self.check_features_array_equal(list(features[0, :]), expected_features_1) self.check_features_array_equal(list(features[1, :]), expected_features_2) self.check_features_array_equal(list(features[2, :]), expected_features_3) self.assertEqual(features[3, 0], -LARGE_INTEGER) def test_get_constraint(self): first_constraint = self.dummy_dataset.get_constraint("0", "8", high_value=100) assert first_constraint == 100 middle_constraint = self.dummy_dataset.get_constraint("6", "8", high_value=100) assert middle_constraint == 100 no_constraint = self.dummy_dataset.get_constraint("0", "1") assert no_constraint is None
class TestData(unittest.TestCase): def setUp(self): super().setUp() self.qian_dataset = ANDData( "tests/qian/signatures.json", # "tests/qian/papers.json", {}, clusters="tests/qian/clusters.json", name="qian", load_name_counts=False, preprocess=False, ) self.dummy_dataset = ANDData( "tests/dummy/signatures.json", # "tests/dummy/papers.json", {}, clusters="tests/dummy/clusters.json", name="dummy", load_name_counts=False, preprocess=False, ) def test_split_pairs_within_blocks(self): # Test random sampling within blocks self.qian_dataset.pair_sampling_block = True self.qian_dataset.pair_sampling_balanced_classes = False self.qian_dataset.pair_sampling_balanced_homonym_synonym = False self.qian_dataset.train_pairs_size = 1000 self.qian_dataset.val_pairs_size = 500 self.qian_dataset.test_pairs_size = 500 self.qian_dataset.random_seed = 1111 ( train_block_dict, val_block_dict, test_block_dict, ) = self.qian_dataset.split_cluster_signatures() train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs( train_block_dict, val_block_dict, test_block_dict) assert len(train_pairs) == 1000 and len(val_pairs) == 500 and len( test_pairs) == 500 assert (train_pairs[0] == ("5259", "5270", 1) and val_pairs[0] == ("3830", "3847", 1) and test_pairs[0] == ("1050", "1063", 1)) # Test balanced pos/neg sampling within blocks self.qian_dataset.pair_sampling_block = True self.qian_dataset.pair_sampling_balanced_classes = True self.qian_dataset.pair_sampling_balanced_homonym_synonym = False train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs( train_block_dict, val_block_dict, test_block_dict) assert sum([int(pair[2]) for pair in train_pairs]) == 500 assert len(train_pairs) == 1000 and len(val_pairs) == 500 and len( test_pairs) == 500 assert (train_pairs[0] == ("5694", "5702", 1) and val_pairs[0] == ("781", "787", 1) and test_pairs[0] == ("2428", "2581", 0)) # Test balanced pos/neg and homonym/synonym sampling within blocks self.qian_dataset.pair_sampling_block = True self.qian_dataset.pair_sampling_balanced_classes = True self.qian_dataset.pair_sampling_balanced_homonym_synonym = True train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs( train_block_dict, val_block_dict, test_block_dict) assert sum([int(pair[2]) for pair in train_pairs]) == 500 assert len(train_pairs) == 1000 and len(val_pairs) == 429 and len( test_pairs) == 376 assert (train_pairs[0] == ("4389", "4493", 0) and val_pairs[0] == ("621", "636", 0) and test_pairs[0] == ("2550", "2622", 0)) # Test adding the all test pairs flag to the test above self.qian_dataset.all_test_pairs_flag = True train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs( train_block_dict, val_block_dict, test_block_dict) assert len(train_pairs ) == 1000, len(val_pairs) == 429 and len(test_pairs) == 7244 def test_blocks(self): original_blocks = self.dummy_dataset.get_original_blocks() s2_blocks = self.dummy_dataset.get_s2_blocks() expected_original_blocks = { "a sattar": ["0", "1", "2"], "a konovalov": ["3", "4", "5", "6", "7", "8"], } expected_s2_blocks = { "a sattary": ["0", "1", "2"], "a konovalov": ["3", "4", "5", "6", "7", "8"], } self.dummy_dataset.block_type = "s2" s2_blocks_2 = self.dummy_dataset.get_blocks() self.dummy_dataset.block_type = "original" original_blocks_2 = self.dummy_dataset.get_blocks() self.dummy_dataset.block_type = "dummy" with pytest.raises(Exception): blocks = self.dummy_dataset.get_blocks() self.dummy_dataset.block_type = "s2" assert original_blocks == expected_original_blocks assert original_blocks_2 == 
expected_original_blocks assert s2_blocks == expected_s2_blocks assert s2_blocks_2 == expected_s2_blocks def test_initialization(self): with pytest.raises(Exception): dataset = ANDData( signatures={}, papers={}, clusters={}, name="", mode="train", train_blocks=[], block_type="s2", load_name_counts=False, preprocess=False, ) with pytest.raises(Exception): dataset = ANDData( signatures={}, papers={}, clusters={}, name="", mode="train", unit_of_data_split="blocks", pair_sampling_block=False, load_name_counts=False, preprocess=False, ) with pytest.raises(Exception): dataset = ANDData( signatures={}, papers={}, name="", mode="train", clusters={}, train_pairs=[], load_name_counts=False, preprocess=False, ) with pytest.raises(Exception): dataset = ANDData( signatures={}, papers={}, name="", mode="train", clusters=None, train_pairs=None, train_blocks=None, load_name_counts=False, preprocess=False, ) with pytest.raises(Exception): dataset = ANDData( signatures={}, papers={}, name="", mode="train", train_blocks=[], train_pairs=[], load_name_counts=False, preprocess=False, ) with pytest.raises(Exception): dataset = ANDData( signatures={}, papers={}, name="", mode="train", train_blocks=[], clusters=None, load_name_counts=False, preprocess=False, ) dataset = ANDData(signatures={}, papers={}, name="", mode="inference", load_name_counts=False, preprocess=False) assert dataset.signature_to_cluster_id is None dataset = ANDData(signatures={}, papers={}, name="", mode="inference", load_name_counts=False, preprocess=False) assert dataset.pair_sampling_block assert not dataset.pair_sampling_balanced_classes assert not dataset.pair_sampling_balanced_homonym_synonym assert dataset.all_test_pairs_flag assert dataset.block_type == "s2" with pytest.raises(Exception): dataset = ANDData(signatures={}, papers={}, clusters={}, name="", mode="dummy", load_name_counts=False, preprocess=False) def test_construct_cluster_to_signatures(self): cluster_to_signatures = self.dummy_dataset.construct_cluster_to_signatures( { "a": ["0", "1"], "b": ["3", "4"] }) expected_cluster_to_signatures = {"1": ["0", "1"], "3": ["3", "4"]} assert cluster_to_signatures == expected_cluster_to_signatures
def main(): """ This script is used to train and dump a model trained on all the datasets """ datasets = {} for dataset_name in tqdm( SOURCE_DATASET_NAMES, desc="Processing datasets and fitting base models"): logger.info(f"processing dataset {dataset_name}") clusters_path: Optional[str] = None if dataset_name not in PAIRWISE_ONLY_DATASETS: clusters_path = os.path.join(DATA_DIR, dataset_name, dataset_name + "_clusters.json") train_pairs_path = None val_pairs_path = None test_pairs_path = None else: train_pairs_path = os.path.join(DATA_DIR, dataset_name, "train_pairs.csv") val_pairs_path = os.path.join(DATA_DIR, dataset_name, "val_pairs.csv") if not os.path.exists(val_pairs_path): val_pairs_path = None test_pairs_path = os.path.join(DATA_DIR, dataset_name, "test_pairs.csv") logger.info(f"loading dataset {dataset_name}") anddata = ANDData( signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"), papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"), name=dataset_name, mode="train", specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"), clusters=clusters_path, block_type=BLOCK_TYPE, train_pairs=train_pairs_path, val_pairs=val_pairs_path, test_pairs=test_pairs_path, train_pairs_size=N_TRAIN_PAIRS_SIZE, val_pairs_size=N_VAL_TEST_SIZE, test_pairs_size=N_VAL_TEST_SIZE, preprocess=True, ) logger.info(f"featurizing {dataset_name}") train, val, test = featurize( anddata, FEATURIZER_INFO, n_jobs=N_JOBS, use_cache=True, chunk_size=100, nameless_featurizer_info=NAMELESS_FEATURIZER_INFO, nan_value=NAN_VALUE, ) X_train, y_train, nameless_X_train = train X_val, y_val, nameless_X_val = val X_test, y_test, nameless_X_test = test dataset = {} dataset["anddata"] = anddata dataset["X_train"] = X_train dataset["y_train"] = y_train dataset["X_val"] = X_val dataset["y_val"] = y_val dataset["X_test"] = X_test dataset["y_test"] = y_test dataset["nameless_X_train"] = nameless_X_train dataset["nameless_X_val"] = nameless_X_val dataset["nameless_X_test"] = nameless_X_test dataset["name"] = anddata.name datasets[dataset_name] = dataset anddatas = [ datasets[dataset_name]["anddata"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in PAIRWISE_ONLY_DATASETS ] X_train = np.vstack([ datasets[dataset_name]["X_train"] for dataset_name in SOURCE_DATASET_NAMES ]) y_train = np.hstack([ datasets[dataset_name]["y_train"] for dataset_name in SOURCE_DATASET_NAMES ]) X_val = np.vstack([ datasets[dataset_name]["X_val"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in {"augmented"} ]) y_val = np.hstack([ datasets[dataset_name]["y_val"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in {"augmented"} ]) nameless_X_train = np.vstack([ datasets[dataset_name]["nameless_X_train"] for dataset_name in SOURCE_DATASET_NAMES ]) nameless_X_val = np.vstack([ datasets[dataset_name]["nameless_X_val"] for dataset_name in SOURCE_DATASET_NAMES if dataset_name not in {"augmented"} ]) logger.info("fitting pairwise") union_classifier = PairwiseModeler( n_iter=N_ITER, monotone_constraints=MONOTONE_CONSTRAINTS) union_classifier.fit(X_train, y_train, X_val, y_val) nameless_union_classifier = None if USE_NAMELESS_MODEL: logger.info("nameless fitting pairwise for " + str(SOURCE_DATASET_NAMES)) nameless_union_classifier = PairwiseModeler( n_iter=N_ITER, monotone_constraints=NAMELESS_MONOTONE_CONSTRAINTS, ) nameless_union_classifier.fit(nameless_X_train, y_train, nameless_X_val, y_val) logger.info("nameless pairwise fit for " + 
str(SOURCE_DATASET_NAMES)) logger.info("fitting clusterer for") union_clusterer = Clusterer( FEATURIZER_INFO, union_classifier.classifier, cluster_model=FastCluster(), search_space=search_space, n_jobs=N_JOBS, nameless_classifier=nameless_union_classifier.classifier if nameless_union_classifier is not None else None, nameless_featurizer_info=NAMELESS_FEATURIZER_INFO if nameless_union_classifier is not None else None, ) union_clusterer.fit(anddatas) print( "best clustering parameters:", union_clusterer.best_params, ) models = {} models["clusterer"] = union_clusterer with open( f"full_union_model_script_dump_average_{FEATURIZER_VERSION}.pickle", "wb", ) as _pickle_file: pickle.dump(models, _pickle_file) logger.info("Done.")
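# Hedged usage sketch (an assumption, not part of the script above): reloading the pickle
# dumped by main() and reusing the fitted clusterer. The models["clusterer"] layout and the
# attributes mutated below follow the claims-evaluation script later in this section; the
# path is hypothetical because FEATURIZER_VERSION is defined elsewhere.
import pickle

model_path = "full_union_model_script_dump_average_<FEATURIZER_VERSION>.pickle"  # hypothetical
with open(model_path, "rb") as _pickle_file:
    models = pickle.load(_pickle_file)

clusterer = models["clusterer"]
clusterer.n_jobs = 4          # these attributes are mutated the same way before evaluation
clusterer.use_cache = False
print("best clustering parameters:", clusterer.best_params)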
def featurize(
    dataset: ANDData,
    featurizer_info: FeaturizationInfo,
    n_jobs: int = 1,
    use_cache: bool = False,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    nameless_featurizer_info: Optional[FeaturizationInfo] = None,
    nan_value: float = np.nan,
    delete_training_data: bool = False,
) -> Union[Tuple[TupleOfArrays, TupleOfArrays, TupleOfArrays], TupleOfArrays]:
    """
    Featurizes the input dataset.

    Parameters
    ----------
    dataset: ANDData
        the dataset containing the relevant data
    featurizer_info: FeaturizationInfo
        the FeaturizationInfo object containing the listing of features to use
        and the featurizer version
    n_jobs: int
        the number of cpus to use
    use_cache: bool
        whether or not to write to/read from the features cache
    chunk_size: int
        the chunk size for multiprocessing
    nameless_featurizer_info: FeaturizationInfo
        the FeaturizationInfo for creating the features that do not use any name features;
        these will not be computed if this is None
    nan_value: float
        the value to replace nans with
    delete_training_data: bool
        whether to delete some suspicious training examples

    Returns
    -------
    train/val/test features and labels if mode is 'train',
    features and labels for all pairs if mode is 'inference'
    """
    if dataset.mode == "inference":
        logger.info("featurizing all pairs")
        all_pairs = dataset.all_pairs()
        all_features = many_pairs_featurize(
            all_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            False,
        )
        logger.info("featurized all pairs")
        return all_features
    else:
        if dataset.train_pairs is None:
            if dataset.train_blocks is not None:
                (
                    train_signatures,
                    val_signatures,
                    test_signatures,
                ) = dataset.split_cluster_signatures_fixed()
            elif dataset.train_signatures is not None:
                (
                    train_signatures,
                    val_signatures,
                    test_signatures,
                ) = dataset.split_data_signatures_fixed()
            else:
                (
                    train_signatures,
                    val_signatures,
                    test_signatures,
                ) = dataset.split_cluster_signatures()  # type: ignore

            train_pairs, val_pairs, test_pairs = dataset.split_pairs(
                train_signatures, val_signatures, test_signatures
            )
        else:
            train_pairs, val_pairs, test_pairs = dataset.fixed_pairs()

        logger.info("featurizing train")
        train_features = many_pairs_featurize(
            train_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            delete_training_data,
        )
        logger.info("featurized train, featurizing val")
        val_features = many_pairs_featurize(
            val_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            False,
        )
        logger.info("featurized val, featurizing test")
        test_features = many_pairs_featurize(
            test_pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            False,
        )
        logger.info("featurized test")
        return train_features, val_features, test_features
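# Hedged usage sketch for featurize() above (an assumption, not code from this repo's
# scripts): in "inference" mode it returns a single (features, labels, nameless_features)
# tuple covering every candidate pair, while "train" mode returns train/val/test tuples.
# Module paths and the tests/dummy fixture paths mirror the tests in this section.
import numpy as np

from s2and.data import ANDData                       # assumed module layout
from s2and.featurizer import FeaturizationInfo, featurize

dataset = ANDData(
    "tests/dummy/signatures.json",
    "tests/dummy/papers.json",
    name="dummy",
    mode="inference",
    load_name_counts=False,
)
featurizer_info = FeaturizationInfo(features_to_use=["year_diff", "misc_features"])

# one tuple for all candidate pairs, since mode == "inference"
X_all, labels, _ = featurize(dataset, featurizer_info, n_jobs=1, use_cache=False)
print(X_all.shape)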
class TestClusterer(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            clusters="tests/dummy/clusters.json",
            cluster_seeds="tests/dummy/cluster_seeds.json",
            name="dummy",
            load_name_counts=True,
        )

        features_to_use = [
            "year_diff",
            "misc_features",
        ]
        featurizer_info = FeaturizationInfo(features_to_use=features_to_use)
        np.random.seed(1)
        X_random = np.random.random((10, 6))
        y_random = np.random.randint(0, 6, 10)
        self.dummy_clusterer = Clusterer(
            featurizer_info=featurizer_info,
            classifier=lgb.LGBMClassifier(
                random_state=1, data_random_seed=1, feature_fraction_seed=1
            ).fit(X_random, y_random),
            n_jobs=1,
            use_cache=False,
            use_default_constraints_as_supervision=False,
        )

    def test_get_constraints(self):
        block = {
            "a sattar": ["0", "1", "2"],
        }
        constraint_1 = self.dummy_dataset.get_constraint("0", "1", low_value=0, high_value=2)
        constraint_2 = self.dummy_dataset.get_constraint("1", "0", low_value=0, high_value=2)
        constraint_3 = self.dummy_dataset.get_constraint("1", "2", low_value=0, high_value=2)
        constraint_4 = self.dummy_dataset.get_constraint("2", "1", low_value=0, high_value=2)

        self.assertIs(constraint_1, LARGE_DISTANCE)
        self.assertIs(constraint_2, LARGE_DISTANCE)
        self.assertIs(constraint_3, 0)
        self.assertIs(constraint_4, 0)

    def test_make_distance_matrix_fastcluster(self):
        block = {
            "a sattar": ["0", "1", "2"],
        }
        partial_supervision = {("0", "1"): 1.1, ("1", "2"): 1e-6}
        distance_matrices = self.dummy_clusterer.make_distance_matrices(
            block_dict=block,
            dataset=self.dummy_dataset,
            partial_supervision=partial_supervision,
        )
        distance_matrix = distance_matrices["a sattar"]
        self.assertEqual(distance_matrix[0], np.float16(1.1))
        self.assertEqual(distance_matrix[1], np.float16(0.3))
        self.assertEqual(distance_matrix[2], np.float16(1e-6))

        distance_matrices = self.dummy_clusterer.make_distance_matrices(
            block_dict=block,
            dataset=self.dummy_dataset,
            partial_supervision={},
        )
        distance_matrix = distance_matrices["a sattar"]
        self.assertEqual(distance_matrix[0], np.float16(0.3))
        self.assertEqual(distance_matrix[1], np.float16(0.3))
        self.assertEqual(distance_matrix[2], np.float16(0.3))
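# Hedged note on the test above: make_distance_matrices appears to return a condensed
# (upper-triangular) distance vector per block, as fastcluster consumes. The sketch below
# only illustrates the condensed indexing the assertions rely on; it does not call s2and,
# and uses scipy.spatial.distance.squareform to expand the vector for inspection.
import numpy as np
from scipy.spatial.distance import squareform

# condensed entries for signatures ["0", "1", "2"] in pair order (0,1), (0,2), (1,2)
condensed = np.array([1.1, 0.3, 1e-6])
square = squareform(condensed)
print(square[0, 1])  # 1.1   -> partial supervision for ("0", "1")
print(square[1, 2])  # 1e-6  -> partial supervision for ("1", "2")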
def main(model_path: str, n_jobs: int = 20, use_constraints: bool = True):
    """
    This script is for evaluating a model on the Semantic Scholar corrections data.
    It clusters each block for which we have pairwise corrections data (and for which
    the data has already been pulled from Semantic Scholar), runs clustering, and
    prints metrics.
    """
    with open(os.path.join(DATA_DIR, "claims_pairs_remapped.json")) as _json_file:
        claims_pairs = json.load(_json_file)
    logger.info("Claims pairs loaded")

    with open(model_path, "rb") as _pickle_file:
        models = pickle.load(_pickle_file)
    clusterer = models["clusterer"]
    clusterer.n_jobs = n_jobs
    clusterer.use_cache = True
    clusterer.use_default_constraints_as_supervision = use_constraints
    clusterer.batch_size = 10000000

    logger.info(f"Linkage type: {clusterer.cluster_model.linkage}")
    logger.info(f"EPS: {clusterer.cluster_model.eps}")
    logger.info(f"Use constraints: {clusterer.use_default_constraints_as_supervision}")
    logger.info(f"Featurizer version: {clusterer.featurizer_info.featurizer_version}")

    block_keys = sorted(
        filter(
            lambda x: not x.endswith(".json")
            and not x.endswith(".pickle")
            and not x.endswith(".py")
            and not x.endswith(".vscode")
            and not x.endswith(".csv"),
            os.listdir(BLOCK_DATASETS_DIR),
        ),
        key=lambda x: os.path.getsize(
            os.path.join(BLOCK_DATASETS_DIR, x, "claims_signatures.json")
        ),
    )

    logger.info("starting transfer experiment main, loading name counts")
    with open(cached_path(NAME_COUNTS_PATH), "rb") as f:
        (
            first_dict,
            last_dict,
            first_last_dict,
            last_first_initial_dict,
        ) = pickle.load(f)
    name_counts = {
        "first_dict": first_dict,
        "last_dict": last_dict,
        "first_last_dict": first_last_dict,
        "last_first_initial_dict": last_first_initial_dict,
    }
    logger.info("loaded name counts")

    for block_key in tqdm(block_keys):
        results = {}
        block_dir = os.path.join(BLOCK_DATASETS_DIR, block_key)
        logger.info(f"Loading dataset {block_key}")
        dataset = ANDData(
            signatures=os.path.join(block_dir, "claims_signatures.json"),
            papers=os.path.join(block_dir, "claims_papers.json"),
            mode="inference",
            specter_embeddings=os.path.join(block_dir, "claims_specter.pickle"),
            block_type="s2",
            name=block_key.replace(" ", "_"),
            n_jobs=n_jobs,
            load_name_counts=name_counts,
        )
        logger.info("Dataset loaded")

        result = claims_eval(
            dataset,
            clusterer,
            claims_pairs,
            os.path.join(BLOCK_DATASETS_DIR, dataset.name),
            output_shap=False,
        )
        results[block_key.replace(" ", "_")] = result
        logger.info(f"Claims eval output: {result}")

        with open(
            os.path.join(
                BLOCK_DATASETS_DIR,
                dataset.name,
                f"results_{clusterer.featurizer_info.featurizer_version}.json",
            ),
            "w",
        ) as _json_file:
            json.dump(results, _json_file)

    logger.info("Done.")