Example #1
0
    def setUp(self):
        """Create the fixtures shared by the tests in this case.

        Sets ``self.dummy_dataset`` to an ``ANDData`` built from the files
        under ``tests/dummy`` (with name counts loaded) and
        ``self.dummy_featurizer`` to a ``FeaturizationInfo`` configured with
        the twelve feature groups listed below.
        """
        super().setUp()

        # Keyword options for the dummy dataset, kept apart from the two
        # positional file paths (signatures, papers).
        dataset_options = {
            "clusters": "tests/dummy/clusters.json",
            "name": "dummy",
            "load_name_counts": True,
        }
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            **dataset_options,
        )

        self.dummy_featurizer = FeaturizationInfo(
            features_to_use=[
                "name_similarity",
                "affiliation_similarity",
                "email_similarity",
                "coauthor_similarity",
                "venue_similarity",
                "year_diff",
                "title_similarity",
                "reference_features",
                "misc_features",
                "name_counts",
                "journal_similarity",
                "advanced_name_similarity",
            ]
        )
Example #2
0
    def setUp(self):
        """Create the dummy dataset and a clusterer backed by a toy classifier.

        ``self.dummy_dataset`` is loaded from ``tests/dummy`` including cluster
        seeds and name counts. ``self.dummy_clusterer`` wraps a LightGBM
        classifier fitted on seeded random data, so the fixture is
        reproducible from run to run.
        """
        super().setUp()

        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            clusters="tests/dummy/clusters.json",
            cluster_seeds="tests/dummy/cluster_seeds.json",
            name="dummy",
            load_name_counts=True,
        )

        info = FeaturizationInfo(features_to_use=["year_diff", "misc_features"])

        # Seed numpy before drawing anything: the feature matrix is drawn
        # first and the labels second, and that order determines the values.
        np.random.seed(1)
        random_features = np.random.random((10, 6))
        random_labels = np.random.randint(0, 6, 10)

        # Use the value returned by fit() as the classifier handed to Clusterer.
        toy_classifier = lgb.LGBMClassifier(
            random_state=1,
            data_random_seed=1,
            feature_fraction_seed=1,
        ).fit(random_features, random_labels)

        self.dummy_clusterer = Clusterer(
            featurizer_info=info,
            classifier=toy_classifier,
            n_jobs=1,
            use_cache=False,
            use_default_constraints_as_supervision=False,
        )
Example #3
0
 def setUp(self):
     """Load the ``tests/dummy`` dataset (name counts disabled) as ``self.dummy_dataset``."""
     super().setUp()

     signatures_file = "tests/dummy/signatures.json"
     papers_file = "tests/dummy/papers.json"
     clusters_file = "tests/dummy/clusters.json"

     self.dummy_dataset = ANDData(
         signatures_file,
         papers_file,
         clusters=clusters_file,
         name="dummy",
         load_name_counts=False,
     )
Example #4
0
 def setUp(self):
     """Load the ``qian`` and ``dummy`` test datasets without papers.

     Both datasets receive an empty dict in the papers position instead of
     their ``papers.json`` file, and are built with name counts and
     preprocessing switched off.
     """
     super().setUp()

     # Options that are identical for both datasets.
     shared_options = {"load_name_counts": False, "preprocess": False}

     # Second positional argument: an empty dict stands in for
     # "tests/qian/papers.json".
     self.qian_dataset = ANDData(
         "tests/qian/signatures.json",
         {},
         clusters="tests/qian/clusters.json",
         name="qian",
         **shared_options,
     )

     # Likewise, an empty dict stands in for "tests/dummy/papers.json".
     self.dummy_dataset = ANDData(
         "tests/dummy/signatures.json",
         {},
         clusters="tests/dummy/clusters.json",
         name="dummy",
         **shared_options,
     )
Example #5
0
def main(
    experiment_name: str,
    dont_use_nameless_model: bool,
    dont_use_rules: bool,
    dont_use_monotone_constraints: bool,
    exclude_augmented: bool,
    single_dataset: str,
    feature_groups_to_skip: List[str],
    n_jobs: int,
    random_seed: int,
):
    """
    Train a union model on the source datasets and evaluate it on the claims blocks.

    The function featurizes every source dataset, fits one pairwise model on
    the stacked training pairs (plus an optional "nameless" model that omits
    the name-based feature groups), fits a clusterer on the datasets that have
    cluster labels, and then runs ``claims_eval`` on each claims block. It
    writes one ``results_<experiment_name>.json`` per block and a summary
    ``<experiment_name>.csv`` under the block datasets directory.

    Args:
        experiment_name: Used in the names of the result files and passed to
            ``claims_eval`` as ``optional_name``.
        dont_use_nameless_model: If True, no nameless pairwise model is fitted.
        dont_use_rules: If True, the clusterer is built with
            ``use_default_constraints_as_supervision=False``.
        dont_use_monotone_constraints: If True, the pairwise models are fitted
            without monotone constraints.
        exclude_augmented: If True, the "augmented" dataset is not added to the
            source datasets.
        single_dataset: If non-empty, train on this dataset only.
        feature_groups_to_skip: Feature groups removed from ``FEATURES_TO_USE``
            for this run. Each must be present in that list.
        n_jobs: Parallelism passed to featurization, clustering and data loading.
        random_seed: Seed for data splitting and models. If None, 1111 is
            used for the datasets and 42 for the models.

    Raises:
        ValueError: If an entry of ``feature_groups_to_skip`` is not a known
            feature group.
    """
    DATA_DIR = CONFIG["internal_data_dir"]
    USE_NAMELESS_MODEL = not dont_use_nameless_model
    USE_RULES = not dont_use_rules
    USE_AUGMENTATION = not exclude_augmented
    USE_MONOTONE_CONSTRAINTS = not dont_use_monotone_constraints
    N_JOBS = n_jobs

    # Fallback seeds used when no seed is supplied.
    data_seed = random_seed if random_seed is not None else 1111
    model_seed = random_seed if random_seed is not None else 42

    # Work on a copy: removing entries from the module-level FEATURES_TO_USE
    # would leak this run's configuration into every later user of the constant
    # (and make a second call with the same arguments raise ValueError).
    features_to_use = list(FEATURES_TO_USE)
    for feature_group in feature_groups_to_skip:
        features_to_use.remove(feature_group)

    name_feature_groups = {"name_similarity", "advanced_name_similarity", "name_counts"}
    NAMELESS_FEATURES_TO_USE = [
        feature_name for feature_name in features_to_use
        if feature_name not in name_feature_groups
    ]

    FEATURIZER_INFO = FeaturizationInfo(features_to_use=features_to_use,
                                        featurizer_version=FEATURIZER_VERSION)
    NAMELESS_FEATURIZER_INFO = FeaturizationInfo(
        features_to_use=NAMELESS_FEATURES_TO_USE,
        featurizer_version=FEATURIZER_VERSION)

    MONOTONE_CONSTRAINTS = FEATURIZER_INFO.lightgbm_monotone_constraints
    NAMELESS_MONOTONE_CONSTRAINTS = NAMELESS_FEATURIZER_INFO.lightgbm_monotone_constraints

    SOURCE_DATASET_NAMES = [
        "aminer", "arnetminer", "inspire", "kisti", "orcid", "pubmed", "qian",
        "zbmath"
    ]
    PAIRWISE_ONLY_DATASETS = {"medline", "augmented"}

    if USE_AUGMENTATION:
        SOURCE_DATASET_NAMES.append("augmented")

    if single_dataset != "":
        SOURCE_DATASET_NAMES = [single_dataset]

    datasets = {}
    for dataset_name in tqdm(
            SOURCE_DATASET_NAMES,
            desc="Processing datasets and fitting base models"):
        logger.info(f"processing dataset {dataset_name}")
        dataset_dir = os.path.join(DATA_DIR, dataset_name)

        clusters_path: Optional[str] = None
        train_pairs_path: Optional[str] = None
        val_pairs_path: Optional[str] = None
        test_pairs_path: Optional[str] = None
        if dataset_name not in PAIRWISE_ONLY_DATASETS:
            clusters_path = os.path.join(dataset_dir,
                                         dataset_name + "_clusters.json")
        else:
            # Pairwise-only datasets ship explicit pair files instead of clusters.
            train_pairs_path = os.path.join(dataset_dir, "train_pairs.csv")
            val_pairs_path = os.path.join(dataset_dir, "val_pairs.csv")
            if not os.path.exists(val_pairs_path):
                val_pairs_path = None
            test_pairs_path = os.path.join(dataset_dir, "test_pairs.csv")

        logger.info(f"loading dataset {dataset_name}")
        anddata = ANDData(
            signatures=os.path.join(dataset_dir,
                                    dataset_name + "_signatures.json"),
            papers=os.path.join(dataset_dir, dataset_name + "_papers.json"),
            name=dataset_name,
            mode="train",
            specter_embeddings=os.path.join(dataset_dir,
                                            dataset_name + "_specter.pickle"),
            clusters=clusters_path,
            block_type=BLOCK_TYPE,
            train_pairs=train_pairs_path,
            val_pairs=val_pairs_path,
            test_pairs=test_pairs_path,
            train_pairs_size=N_TRAIN_PAIRS_SIZE,
            val_pairs_size=N_VAL_TEST_SIZE,
            test_pairs_size=N_VAL_TEST_SIZE,
            preprocess=True,
            random_seed=data_seed,
        )

        logger.info(f"featurizing {dataset_name}")
        train, val, _ = featurize(
            anddata,
            FEATURIZER_INFO,
            n_jobs=N_JOBS,
            use_cache=True,
            chunk_size=100,
            nameless_featurizer_info=NAMELESS_FEATURIZER_INFO,
            nan_value=np.nan,
        )
        X_train, y_train, nameless_X_train = train
        X_val, y_val, nameless_X_val = val

        dataset: Dict[Any, Any] = {
            "anddata": anddata,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "nameless_X_train": nameless_X_train,
            "nameless_X_val": nameless_X_val,
        }
        datasets[dataset_name] = dataset

    # Only datasets with cluster labels can be used to fit the clusterer.
    anddatas = [
        datasets[dataset_name]["anddata"]
        for dataset_name in SOURCE_DATASET_NAMES
        if dataset_name not in PAIRWISE_ONLY_DATASETS
    ]

    # The "augmented" dataset contributes to training but not to validation.
    val_dataset_names = [
        dataset_name for dataset_name in SOURCE_DATASET_NAMES
        if dataset_name not in {"augmented"}
    ]

    X_train = np.vstack([
        datasets[dataset_name]["X_train"]
        for dataset_name in SOURCE_DATASET_NAMES
    ])
    y_train = np.hstack([
        datasets[dataset_name]["y_train"]
        for dataset_name in SOURCE_DATASET_NAMES
    ])
    X_val = np.vstack(
        [datasets[dataset_name]["X_val"] for dataset_name in val_dataset_names])
    y_val = np.hstack(
        [datasets[dataset_name]["y_val"] for dataset_name in val_dataset_names])

    nameless_X_train = np.vstack([
        datasets[dataset_name]["nameless_X_train"]
        for dataset_name in SOURCE_DATASET_NAMES
    ])
    nameless_X_val = np.vstack([
        datasets[dataset_name]["nameless_X_val"]
        for dataset_name in val_dataset_names
    ])

    logger.info("fitting pairwise")
    union_classifier = PairwiseModeler(
        n_iter=N_ITER,
        monotone_constraints=MONOTONE_CONSTRAINTS
        if USE_MONOTONE_CONSTRAINTS else None,
        random_state=model_seed,
    )
    union_classifier.fit(X_train, y_train, X_val, y_val)

    nameless_union_classifier = None
    if USE_NAMELESS_MODEL:
        logger.info("nameless fitting pairwise for " +
                    str(SOURCE_DATASET_NAMES))
        nameless_union_classifier = PairwiseModeler(
            n_iter=N_ITER,
            monotone_constraints=NAMELESS_MONOTONE_CONSTRAINTS
            if USE_MONOTONE_CONSTRAINTS else None,
            random_state=model_seed,
        )
        nameless_union_classifier.fit(nameless_X_train, y_train,
                                      nameless_X_val, y_val)
        logger.info("nameless pairwise fit for " + str(SOURCE_DATASET_NAMES))

    logger.info("fitting clusterer for")
    clusterer = Clusterer(
        FEATURIZER_INFO,
        union_classifier.classifier,
        cluster_model=FastCluster(),
        search_space=search_space,
        n_jobs=N_JOBS,
        nameless_classifier=nameless_union_classifier.classifier
        if nameless_union_classifier is not None else None,
        nameless_featurizer_info=NAMELESS_FEATURIZER_INFO
        if nameless_union_classifier is not None else None,
        use_default_constraints_as_supervision=USE_RULES,
        use_cache=True,
        random_state=model_seed,
    )
    clusterer.fit(anddatas)
    print(
        "best clustering parameters:",
        clusterer.best_params,
    )

    # now working on the blocks
    CLAIMS_DATA_DIR = os.path.join(CONFIG["internal_data_dir"], "claims")
    BLOCK_DATASETS_DIR = os.path.join(CLAIMS_DATA_DIR, "block_datasets")

    with open(os.path.join(CLAIMS_DATA_DIR,
                           "claims_pairs_remapped.json")) as _json_file:
        claims_pairs = json.load(_json_file)
    logger.info("Claims pairs loaded")

    clusterer.batch_size = 10000000

    # Directory entries with these suffixes are not block directories.
    non_block_suffixes = (".json", ".pickle", ".py", ".vscode", ".csv")
    # these had errors when manually evaluating
    excluded_blocks = {"t_xiao", "m_dagostino", "s_tunster", "n_smith"}
    # Filtering (instead of list.remove) keeps the run alive when one of the
    # excluded blocks is simply not present in this data directory.
    block_keys = sorted(
        (entry for entry in os.listdir(BLOCK_DATASETS_DIR)
         if not entry.endswith(non_block_suffixes)
         and entry not in excluded_blocks),
        key=lambda entry: os.path.getsize(
            os.path.join(BLOCK_DATASETS_DIR, entry, "claims_signatures.json")),
    )

    # let's only keep the first ~130 for speed purposes
    # (block_keys is sorted by signature-file size, so these are the smallest)
    block_keys = block_keys[:130]

    logger.info("starting transfer experiment main, loading name counts")
    # NOTE: pickle.load can execute arbitrary code; NAME_COUNTS_PATH must point
    # at a trusted artifact.
    with open(cached_path(NAME_COUNTS_PATH), "rb") as f:
        (
            first_dict,
            last_dict,
            first_last_dict,
            last_first_initial_dict,
        ) = pickle.load(f)
    name_counts = {
        "first_dict": first_dict,
        "last_dict": last_dict,
        "first_last_dict": first_last_dict,
        "last_first_initial_dict": last_first_initial_dict,
    }
    logger.info("loaded name counts")

    results_dict = {}
    for block_key in tqdm(block_keys):
        block_name = block_key.replace(" ", "_")
        block_dir = os.path.join(BLOCK_DATASETS_DIR, block_key)
        logger.info(f"Loading dataset {block_key}")
        claims_dataset = ANDData(
            signatures=os.path.join(block_dir, "claims_signatures.json"),
            papers=os.path.join(block_dir, "claims_papers.json"),
            mode="inference",
            specter_embeddings=os.path.join(block_dir,
                                            "claims_specter.pickle"),
            block_type="s2",
            name=block_name,
            n_jobs=n_jobs,
            load_name_counts=name_counts,
        )
        logger.info("Dataset loaded")

        # Output location is derived from the dataset's own name attribute.
        output_dir = os.path.join(BLOCK_DATASETS_DIR, claims_dataset.name)
        result = claims_eval(
            claims_dataset,
            clusterer,
            claims_pairs,
            output_dir,
            output_shap=False,
            optional_name=experiment_name,
        )
        results = {block_name: result}
        logger.info(f"Claims eval output: {result}")

        # Persist per-block results right away so partial runs are not lost.
        with open(
                os.path.join(output_dir, f"results_{experiment_name}.json"),
                "w",
        ) as _json_file:
            json.dump(results, _json_file)
        results_dict.update(results)

    pd.DataFrame(results_dict).T.to_csv(
        os.path.join(BLOCK_DATASETS_DIR, f"{experiment_name}.csv"))
Example #6
0
def main(
    experiment_name: str,
    dont_use_nameless_model: bool,
    n_jobs: int,
    dont_use_monotone_constraints: bool,
    linkage: str,
    use_dbscan: bool,
    negative_one_for_nan: bool,
    random_seed: int,
    inspire_split: int,
    inspire_only: bool,
    aminer_only: bool,
):
    """
    Fit and evaluate one model per dataset, then write the metric grids to disk.

    For every selected dataset the function loads an ``ANDData``, featurizes
    it, fits a pairwise model (plus an optional nameless one) and, unless the
    dataset is pairwise-only, a clusterer. Each dataset is then evaluated on
    itself through ``sota_helper``. The metrics are printed and written as one
    JSON file and five CSV files under
    ``DATA_DIR/experiments/<experiment_name>/sota/seed_<random_seed>/metrics``.

    Args:
        experiment_name: Name of the experiment; part of the output path and
            passed to ``sota_helper``.
        dont_use_nameless_model: If True, no nameless pairwise model is fitted.
        n_jobs: Parallelism passed to data loading, featurization and clustering.
        dont_use_monotone_constraints: If True, the pairwise models are fitted
            without monotone constraints.
        linkage: Linkage passed to ``FastCluster`` when DBSCAN is not used.
        use_dbscan: If True, cluster with ``DBSCAN`` instead of ``FastCluster``.
        negative_one_for_nan: If True, missing feature values are encoded as
            -1 and monotone constraints are disabled.
        random_seed: Seed passed to the datasets and models; also part of the
            output path.
        inspire_split: Index used to build the train/val/test key file names
            for datasets in the fixed-signature group.
        inspire_only: If True, run on "inspire" only.
        aminer_only: If True (and ``inspire_only`` is False), run on "aminer" only.

    Returns:
        A tuple of five grids (lists of lists): B3 F1, cluster macro F1,
        pairwise AUROC, pairwise classification F1 and pairwise average
        precision. Row 0 and column 0 hold the dataset names; only the
        diagonal cells are filled.
    """
    USE_NAMELESS_MODEL = not dont_use_nameless_model
    N_JOBS = n_jobs
    USE_MONOTONE_CONSTRAINTS = not dont_use_monotone_constraints
    logger.info((f"USE_NAMELESS_MODEL={USE_NAMELESS_MODEL}, "
                 f"N_JOBS={N_JOBS}, "
                 f"USE_MONOTONE_CONSTRAINTS={USE_MONOTONE_CONSTRAINTS}, "
                 f"linkage={linkage}, "
                 f"use_dbscan={use_dbscan}, "
                 f"negative_one_for_nan={negative_one_for_nan}, "
                 f"random_seed={random_seed}"))

    if inspire_only:
        DATASET_NAMES = ["inspire"]
    elif aminer_only:
        DATASET_NAMES = ["aminer"]
    else:
        DATASET_NAMES = [
            "kisti",
            "pubmed",
            "medline",
        ]

    # Datasets whose train/val/test split comes from key files on disk:
    # block keys for FIXED_BLOCK, signature keys for FIXED_SIGNATURE.
    FIXED_BLOCK = ["aminer"]
    FIXED_SIGNATURE = ["inspire"]

    if negative_one_for_nan:
        MONOTONE_CONSTRAINTS = None
        NAMELESS_MONOTONE_CONSTRAINTS = None
        NAN_VALUE = -1
    else:
        MONOTONE_CONSTRAINTS = FEATURIZER_INFO.lightgbm_monotone_constraints
        NAMELESS_MONOTONE_CONSTRAINTS = NAMELESS_FEATURIZER_INFO.lightgbm_monotone_constraints
        NAN_VALUE = np.nan

    # NOTE: pickle.load can execute arbitrary code; NAME_COUNTS_PATH must point
    # at a trusted artifact.
    with open(cached_path(NAME_COUNTS_PATH), "rb") as f:
        (
            first_dict,
            last_dict,
            first_last_dict,
            last_first_initial_dict,
        ) = pickle.load(f)
    name_counts = {
        "first_dict": first_dict,
        "last_dict": last_dict,
        "first_last_dict": first_last_dict,
        "last_first_initial_dict": last_first_initial_dict,
    }
    logger.info("loaded name counts")

    datasets: Dict[str, Any] = {}

    for dataset_name in tqdm(
            DATASET_NAMES, desc="Processing datasets and fitting base models"):
        logger.info("")
        logger.info(f"processing dataset {dataset_name}")
        dataset_dir = os.path.join(DATA_DIR, dataset_name)

        clusters_path: Optional[str] = None
        train_blocks: Optional[str] = None
        val_blocks: Optional[str] = None
        test_blocks: Optional[str] = None
        train_pairs_path: Optional[str] = None
        val_pairs_path: Optional[str] = None
        test_pairs_path: Optional[str] = None
        train_signatures: Optional[str] = None
        val_signatures: Optional[str] = None
        test_signatures: Optional[str] = None

        if dataset_name in FIXED_BLOCK:
            logger.info("FIXED BLOCK")
            train_blocks_fname: str = "train_keys.json"
            val_blocks_fname: str = "val_keys.json"
            test_blocks_fname: str = "test_keys.json"

            logger.info(
                f"File names, FIXED BLOCK {train_blocks_fname, val_blocks_fname, test_blocks_fname}"
            )
            clusters_path = os.path.join(dataset_dir,
                                         dataset_name + "_clusters.json")
            train_blocks = os.path.join(dataset_dir, train_blocks_fname)
            # Use the validation keys when the file exists. Previously the
            # existence check could only assign None to an already-None
            # variable, so an existing val_keys.json was silently ignored.
            val_blocks = os.path.join(dataset_dir, val_blocks_fname)
            if not os.path.exists(val_blocks):
                val_blocks = None
            test_blocks = os.path.join(dataset_dir, test_blocks_fname)

        elif dataset_name in FIXED_SIGNATURE:
            train_sign_fname: str = "train_keys_" + str(
                inspire_split) + ".json"
            val_sign_fname: str = "val_keys_" + str(inspire_split) + ".json"
            test_sign_fname: str = "test_keys_" + str(inspire_split) + ".json"

            logger.info(
                f"File names, FIXED_SIGNATURE {train_sign_fname, val_sign_fname, test_sign_fname}"
            )
            clusters_path = os.path.join(dataset_dir,
                                         dataset_name + "_clusters.json")
            train_signatures = os.path.join(dataset_dir, train_sign_fname)
            # Same fix as for the block keys: only fall back to None when the
            # validation key file is actually missing.
            val_signatures = os.path.join(dataset_dir, val_sign_fname)
            if not os.path.exists(val_signatures):
                val_signatures = None
            test_signatures = os.path.join(dataset_dir, test_sign_fname)

        elif dataset_name not in PAIRWISE_ONLY_DATASETS:
            logger.info("CLUSTER with random split")
            clusters_path = os.path.join(dataset_dir,
                                         dataset_name + "_clusters.json")

        else:
            logger.info("Pairwise model")
            train_pairs_path = os.path.join(dataset_dir, "train_pairs.csv")
            val_pairs_path = os.path.join(dataset_dir, "val_pairs.csv")
            if not os.path.exists(val_pairs_path):
                val_pairs_path = None
            test_pairs_path = os.path.join(dataset_dir, "test_pairs.csv")

        logger.info(f"loading dataset {dataset_name}")

        if dataset_name in ("inspire", "kisti"):
            unit_of_data_split = "signatures"
        else:
            unit_of_data_split = "blocks"

        # kisti uses a 40/10/50 split; every other dataset uses 80/10/10.
        if dataset_name == "kisti":
            train_ratio, val_ratio, test_ratio = 0.4, 0.1, 0.5
        else:
            train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

        logger.info(f"ratios {train_ratio, val_ratio, test_ratio}")
        logger.info(f"block keys {train_blocks, val_blocks, test_blocks}")
        logger.info(
            f"signature keys {train_signatures, val_signatures, test_signatures}"
        )

        anddata = ANDData(
            signatures=os.path.join(dataset_dir,
                                    dataset_name + "_signatures.json"),
            papers=os.path.join(dataset_dir, dataset_name + "_papers.json"),
            name=dataset_name,
            mode="train",
            specter_embeddings=os.path.join(dataset_dir,
                                            dataset_name + "_specter.pickle"),
            clusters=clusters_path,
            block_type=BLOCK_TYPE,
            train_pairs=train_pairs_path,
            val_pairs=val_pairs_path,
            test_pairs=test_pairs_path,
            train_pairs_size=N_TRAIN_PAIRS_SIZE,
            val_pairs_size=N_VAL_TEST_SIZE,
            test_pairs_size=N_VAL_TEST_SIZE,
            n_jobs=N_JOBS,
            load_name_counts=name_counts,
            preprocess=PREPROCESS,
            random_seed=random_seed,
            train_blocks=train_blocks,
            val_blocks=val_blocks,
            test_blocks=test_blocks,
            train_signatures=train_signatures,
            val_signatures=val_signatures,
            test_signatures=test_signatures,
            train_ratio=train_ratio,
            val_ratio=val_ratio,
            test_ratio=test_ratio,
            unit_of_data_split=unit_of_data_split,
        )
        logger.info(f"dataset {dataset_name} loaded")

        logger.info(f"featurizing {dataset_name}")
        train, val, test = featurize(
            anddata,
            FEATURIZER_INFO,
            n_jobs=N_JOBS,
            use_cache=USE_CACHE,
            chunk_size=DEFAULT_CHUNK_SIZE,
            nameless_featurizer_info=NAMELESS_FEATURIZER_INFO,
            nan_value=NAN_VALUE)  # type: ignore
        X_train, y_train, nameless_X_train = train
        X_val, y_val, nameless_X_val = val
        assert test is not None
        X_test, y_test, nameless_X_test = test
        logger.info(f"dataset {dataset_name} featurized")

        nameless_pairwise_modeler = None
        cluster: Optional[Clusterer] = None
        logger.info(f"fitting pairwise for {dataset_name}")
        pairwise_modeler = PairwiseModeler(
            n_iter=N_ITER,
            monotone_constraints=MONOTONE_CONSTRAINTS
            if USE_MONOTONE_CONSTRAINTS else None,
            random_state=random_seed,
        )
        pairwise_modeler.fit(X_train, y_train, X_val, y_val)
        logger.info(f"pairwise fit for {dataset_name}")

        if USE_NAMELESS_MODEL:
            logger.info(f"nameless fitting pairwise for {dataset_name}")
            nameless_pairwise_modeler = PairwiseModeler(
                n_iter=N_ITER,
                monotone_constraints=NAMELESS_MONOTONE_CONSTRAINTS
                if USE_MONOTONE_CONSTRAINTS else None,
                random_state=random_seed,
            )
            nameless_pairwise_modeler.fit(nameless_X_train, y_train,
                                          nameless_X_val, y_val)
            logger.info(f"nameless pairwise fit for {dataset_name}")

        # Deciles of the predicted training distances; logged for information only.
        distances_for_sparsity = [
            1 - pred[1] for pred in pairwise_modeler.predict_proba(X_train)
        ]
        threshold = np.percentile(distances_for_sparsity,
                                  [10, 20, 30, 40, 50, 60, 70, 80, 90])
        logger.info(f"Thresholds {threshold}")

        if dataset_name not in PAIRWISE_ONLY_DATASETS:
            logger.info(f"fitting clusterer for {dataset_name}")
            cluster = Clusterer(
                FEATURIZER_INFO,
                pairwise_modeler.classifier,
                cluster_model=FastCluster(linkage=linkage) if not use_dbscan
                else DBSCAN(min_samples=1, metric="precomputed"),
                search_space=search_space,
                n_jobs=N_JOBS,
                use_cache=USE_CACHE,
                nameless_classifier=nameless_pairwise_modeler.classifier
                if nameless_pairwise_modeler is not None else None,
                nameless_featurizer_info=NAMELESS_FEATURIZER_INFO,
                random_state=random_seed,
                use_default_constraints_as_supervision=False,
            )
            cluster.fit(anddata)
            logger.info(f"clusterer fit for {dataset_name}")
            logger.info(f"{dataset_name} best clustering parameters: " +
                        str(cluster.best_params))

        dataset: Dict[str, Any] = {
            "anddata": anddata,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "y_test": y_test,
            "pairwise_modeler": pairwise_modeler,
            "nameless_X_train": nameless_X_train,
            "nameless_X_val": nameless_X_val,
            "nameless_X_test": nameless_X_test,
            "nameless_pairwise_modeler": nameless_pairwise_modeler,
            "clusterer": cluster,
            "name": anddata.name,
        }
        datasets[dataset_name] = dataset

    logger.info("")
    logger.info("making evaluation grids")

    # Square grid with one extra row and column for the dataset-name headers.
    grid_size = len(DATASET_NAMES) + 1
    b3_f1_grid = [["" for _ in range(grid_size)] for _ in range(grid_size)]
    for position, header_name in enumerate(DATASET_NAMES, start=1):
        b3_f1_grid[0][position] = header_name
        b3_f1_grid[position][0] = header_name

    # Each metric gets its own independent copy of the labelled, empty grid.
    pairwise_auroc_grid = copy.deepcopy(b3_f1_grid)
    pairwise_f1_classification_grid = copy.deepcopy(b3_f1_grid)
    pairwise_average_precisision_grid = copy.deepcopy(b3_f1_grid)
    pairwise_macro_f1_grid = copy.deepcopy(b3_f1_grid)

    # transfer of individual models
    logger.info("starting individual model evaluation")
    for source_dataset in tqdm(datasets.values(),
                               desc="Evaluating individual models"):
        logger.info("")
        logger.info(
            f"evaluating source {source_dataset['name']} target {source_dataset['name']}"
        )
        pairwise_metrics, cluster_metrics, _ = sota_helper(
            source_dataset, experiment_name, random_seed)

        # Source and target are the same dataset, so only the diagonal is filled.
        cell = DATASET_NAMES.index(source_dataset["name"]) + 1
        b3_f1_grid[cell][cell] = cluster_metrics["B3 (P, R, F1)"][2]
        pairwise_macro_f1_grid[cell][cell] = cluster_metrics[
            "Cluster Macro (P, R, F1)"][2]
        pairwise_auroc_grid[cell][cell] = pairwise_metrics["AUROC"]
        pairwise_f1_classification_grid[cell][cell] = pairwise_metrics["F1"]
        pairwise_average_precisision_grid[cell][cell] = pairwise_metrics[
            "Average Precision"]
    logger.info("finished individual model evaluation")

    # union
    logger.info("")
    logger.info("writing results to disk")

    b3_df = pd.DataFrame(b3_f1_grid)
    pairwise_macro_f1_df = pd.DataFrame(pairwise_macro_f1_grid)
    pairwise_df = pd.DataFrame(pairwise_auroc_grid)
    pairwise_classification_f1_df = pd.DataFrame(
        pairwise_f1_classification_grid)
    pairwise_ap_df = pd.DataFrame(pairwise_average_precisision_grid)

    for title, frame in (
        ("B3 F1:", b3_df),
        ("Pairwise Macro F1 (cluster):", pairwise_macro_f1_df),
        ("Pairwise AUROC:", pairwise_df),
        ("Pairwise classification F1:", pairwise_classification_f1_df),
        ("Pairwise AP:", pairwise_ap_df),
    ):
        print(title)
        print(frame)
        print()

    metrics_dir = os.path.join(
        DATA_DIR,
        "experiments",
        experiment_name,
        "sota",
        f"seed_{random_seed}",
        "metrics",
    )
    # Make sure the output directory exists so the run does not fail at the
    # very end, after all models have been trained.
    os.makedirs(metrics_dir, exist_ok=True)

    with open(os.path.join(metrics_dir, "full_grid.json"), "w") as _json_file:
        json.dump(
            {
                "b3": b3_f1_grid,
                "pairwisef1": pairwise_macro_f1_grid,
                "auroc": pairwise_auroc_grid,
                "classificationf1": pairwise_f1_classification_grid,
                "averageprecision": pairwise_average_precisision_grid,
            },
            _json_file,
        )

    for frame, file_name in (
        (b3_df, "b3.csv"),
        (pairwise_macro_f1_df, "pair_macro_f1_cluster.csv"),
        (pairwise_df, "pairwise_auc.csv"),
        (pairwise_classification_f1_df, "classification_f1.csv"),
        (pairwise_ap_df, "average_precision.csv"),
    ):
        frame.to_csv(os.path.join(metrics_dir, file_name), index=False)

    return (
        b3_f1_grid,
        pairwise_macro_f1_grid,
        pairwise_auroc_grid,
        pairwise_f1_classification_grid,
        pairwise_average_precisision_grid,
    )
def _paper_to_json_dict(paper) -> Dict[str, Any]:
    """Convert a paper record (anything exposing ``_asdict()``) into a plain dict for JSON dumping."""
    paper_dict = dict(paper._asdict())
    paper_dict["authors"] = [dict(author._asdict()) for author in paper_dict["authors"]]
    # we currently don't need the actual abstract, but just need to know if it exists or not
    paper_dict["abstract"] = "EXISTS" if paper_dict["has_abstract"] else ""
    return paper_dict


def main(
    max_train_positives_per_dataset: int,
    max_val_positives_per_dataset: int,
    max_test_positives_per_dataset: int,
    negatives_multiplier: float,
    drop_abstract_prob: float,
    drop_affiliations_prob: float,
    drop_references_prob: float,
    drop_first_name_prob: float,
    drop_venue_journal_prob: float,
    drop_coauthors_prob: float,
    translate_title_prob: float,
):
    """
    This script creates the extra "augmentation" dataset from the existing datasets, by randomly removing features,
    to simulate real usage better

    Inputs are read from AUGMENTATION_DIR ("source_tuples.csv" and "title_only_specters.pickle") and from the
    per-dataset files under DATA_DIR. Outputs are written to AUGMENTATION_DIR: "augmented_papers.json",
    "augmented_signatures.json", "augmented_specter.pickle" and "{train,val,test}_pairs.csv".

    Parameters
    ----------
    max_train_positives_per_dataset: cap on label-1 train pairs kept per dataset
    max_val_positives_per_dataset: cap on label-1 val pairs kept per dataset
    max_test_positives_per_dataset: cap on label-1 pairs kept per dataset for any split other than train/val
    negatives_multiplier: the cap for label-0 pairs is the corresponding positives cap times this value
    drop_abstract_prob: probability of marking a paper as having no abstract and using its title-only
        specter embedding instead of the dataset's embedding
    drop_affiliations_prob: probability of emptying a signature's affiliations
    drop_references_prob: probability of emptying a paper's references
    drop_first_name_prob: probability of truncating the author's first name to its first character
    drop_venue_journal_prob: probability of clearing both venue and journal name
    drop_coauthors_prob: probability of keeping only the author at the signature's own position
    translate_title_prob: probability of replacing the title with `translate(title)`

    Returns
    -------
    None
    """
    random.seed(1111)
    augmentation_pairs = pd.read_csv(os.path.join(AUGMENTATION_DIR, "source_tuples.csv")).to_dict("records")
    # NOTE(review): pickle.load executes arbitrary code; only run this on trusted, locally generated files
    with open(os.path.join(AUGMENTATION_DIR, "title_only_specters.pickle"), "rb") as _pickle_file:
        title_only_specter = pickle.load(_pickle_file)

    datasets: Dict[str, Any] = {}
    for dataset_name in tqdm(SOURCE_DATASET_NAMES, desc="Processing datasets and fitting base models"):
        logger.info("")
        logger.info(f"processing dataset {dataset_name}")
        logger.info(f"loading dataset {dataset_name}")
        anddata = ANDData(
            signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"),
            papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"),
            name=dataset_name,
            mode="inference",
            specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"),
            block_type="s2",
            n_jobs=25,
            load_name_counts=False,
            preprocess=False,
        )
        logger.info(f"dataset {dataset_name} loaded")
        datasets[dataset_name] = anddata

    full_papers: Dict[str, Any] = {}
    full_signatures: Dict[str, Any] = {}
    full_specter_keys = []
    full_specter_D = []
    train_pairs = []
    val_pairs = []
    test_pairs = []
    # pair_counts[dataset][split][label] -> number of pairs accepted so far
    pair_counts: Dict[str, Dict[str, Dict[int, int]]] = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for row in augmentation_pairs:
        split = row["split"]
        dataset_name = row["dataset_name"]
        signature_id_1 = row["signature_id_1"]
        signature_id_2 = row["signature_id_2"]
        label = row["label"]

        count_value = pair_counts[dataset_name][split][label]
        if split == "train":
            max_positives = max_train_positives_per_dataset
        elif split == "val":
            max_positives = max_val_positives_per_dataset
        else:
            max_positives = max_test_positives_per_dataset
        max_value = max_positives * (negatives_multiplier if label == 0 else 1.0)
        if count_value >= max_value or dataset_name not in SOURCE_DATASET_NAMES:
            continue

        pair_counts[dataset_name][split][label] += 1

        # signature ids are prefixed with the dataset name so they stay unique in the merged dataset
        pair = (dataset_name + "___" + str(signature_id_1), dataset_name + "___" + str(signature_id_2), label)

        if split == "train":
            train_pairs.append(pair)
        elif split == "val":
            val_pairs.append(pair)
        elif split == "test":
            test_pairs.append(pair)

    logger.info(f"Total pairs (train, val, test): {len(train_pairs)}, {len(val_pairs)}, {len(test_pairs)}")
    # convert the nested defaultdicts into plain dicts so the log line is readable
    pair_counts_dict: Dict[str, Dict[str, Dict[int, int]]] = {
        dataset: {split: dict(label_counts) for split, label_counts in split_counts.items()}
        for dataset, split_counts in pair_counts.items()
    }
    logger.info(pair_counts_dict)

    all_signatures = {
        signature_id
        for pairs in (train_pairs, val_pairs, test_pairs)
        for pair in pairs
        for signature_id in pair[:2]
    }
    reference_papers_to_add = set()
    # Iterate in sorted order: set iteration order over strings changes between interpreter runs
    # (hash randomization), which would pair each signature with different coin flips despite the fixed seed.
    for signature in sorted(all_signatures):
        original_dataset, original_signature_id = signature.split("___", 1)
        original_signature = datasets[original_dataset].signatures[original_signature_id]
        paper_id_str = str(original_signature.paper_id)
        original_paper = datasets[original_dataset].papers[paper_id_str]

        new_signature_id = signature
        # references are tracked as (dataset, paper_id) so the referenced papers can be looked up later
        new_references = [(original_dataset, paper_id) for paper_id in original_paper.references]

        # NOTE: the order of the random draws below determines the generated dataset; do not reorder them
        full_specter_keys.append(paper_id_str)
        coin_flip = random.uniform(0, 1)
        if coin_flip < drop_abstract_prob:
            new_has_abstract = False
            full_specter_D.append(title_only_specter[original_dataset + "_" + paper_id_str])
        else:
            new_has_abstract = original_paper.has_abstract
            full_specter_D.append(datasets[original_dataset].specter_embeddings[paper_id_str])

        coin_flip = random.uniform(0, 1)
        if coin_flip < drop_references_prob:
            new_references = []
        else:
            reference_papers_to_add.update(new_references)
            new_references = [reference[1] for reference in new_references]

        coin_flip = random.uniform(0, 1)
        if coin_flip < drop_affiliations_prob:
            new_affiliations = []
        else:
            new_affiliations = original_signature.author_info_affiliations

        coin_flip = random.uniform(0, 1)
        if coin_flip < drop_venue_journal_prob:
            new_venue = None
            new_journal_name = None
        else:
            new_venue = original_paper.venue
            new_journal_name = original_paper.journal_name

        coin_flip = random.uniform(0, 1)
        new_first = original_signature.author_info_first
        if coin_flip < drop_first_name_prob and new_first is not None and len(new_first) > 0:
            # keep only the first character of the first name
            new_first = new_first[0]

        coin_flip = random.uniform(0, 1)
        if coin_flip < drop_coauthors_prob:
            new_paper_authors = [
                author for author in original_paper.authors if author.position == original_signature.author_info_position
            ]
        else:
            new_paper_authors = original_paper.authors

        coin_flip = random.uniform(0, 1)
        if coin_flip < translate_title_prob:
            new_title = translate(original_paper.title)
        else:
            new_title = original_paper.title

        # derived (normalized / coauthor) fields are reset to None since the raw fields may have changed
        new_signature = original_signature._replace(
            author_info_first=new_first,
            author_info_affiliations=new_affiliations,
            signature_id=new_signature_id,
            author_info_first_normalized=None,
            author_info_first_normalized_without_apostrophe=None,
            author_info_middle_normalized=None,
            author_info_middle_normalized_without_apostrophe=None,
            author_info_last_normalized=None,
            author_info_suffix_normalized=None,
            author_info_coauthors=None,
            author_info_coauthor_blocks=None,
        )
        new_paper = original_paper._replace(
            venue=new_venue,
            journal_name=new_journal_name,
            references=new_references,
            title=new_title,
            has_abstract=new_has_abstract,
            authors=new_paper_authors,
        )

        # fold the flat "author_info_*" fields into a nested "author_info" dict
        author_info_prefix = "author_info_"
        signature_fields = new_signature._asdict()
        new_signature_dict = {
            key: value for key, value in signature_fields.items() if not key.startswith(author_info_prefix)
        }
        new_signature_dict["author_info"] = {
            key[len(author_info_prefix):]: value
            for key, value in signature_fields.items()
            if key.startswith(author_info_prefix)
        }

        full_signatures[signature] = new_signature_dict
        full_papers[str(new_paper.paper_id)] = _paper_to_json_dict(new_paper)

    logger.info(f"Adding {len(reference_papers_to_add)} reference papers")
    reference_papers_added = 0
    # sorted for the same reproducibility reason as above; paper ids are compared as strings
    for dataset_name, paper_id in sorted(reference_papers_to_add, key=lambda ref: (ref[0], str(ref[1]))):
        paper_key = str(paper_id)
        source_papers = datasets[dataset_name].papers
        if paper_key not in full_papers and paper_key in source_papers:
            full_papers[paper_key] = _paper_to_json_dict(source_papers[paper_key])
            reference_papers_added += 1
    logger.info(f"Added {reference_papers_added} reference papers")

    logger.info(f"Dumping {len(full_papers)} papers")
    with open(os.path.join(AUGMENTATION_DIR, "augmented_papers.json"), "w") as _json_file:
        json.dump(full_papers, _json_file)

    logger.info(f"Dumping {len(full_signatures)} signatures")
    with open(os.path.join(AUGMENTATION_DIR, "augmented_signatures.json"), "w") as _json_file:
        json.dump(full_signatures, _json_file)

    full_specter_D_np = np.array(full_specter_D)
    logger.info(f"Dumping {full_specter_D_np.shape, len(full_specter_keys)} specter")
    with open(os.path.join(AUGMENTATION_DIR, "augmented_specter.pickle"), "wb") as _pickle_file:
        pickle.dump((full_specter_D_np, full_specter_keys), _pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

    logger.info("Writing pairs csvs")
    # One shared column list for all three splits (the test split previously used "pairs1" by mistake).
    pair_columns = ["pair1", "pair2", "label"]
    for pairs, file_name in (
        (train_pairs, "train_pairs.csv"),
        (val_pairs, "val_pairs.csv"),
        (test_pairs, "test_pairs.csv"),
    ):
        pairs_df = pd.DataFrame(pairs, columns=pair_columns)
        pairs_df["label"] = pairs_df["label"].apply(lambda x: "YES" if x == 1 else "NO")
        pairs_df.to_csv(os.path.join(AUGMENTATION_DIR, file_name), index=False, header=True)
    logger.info("Done.")
    else:
        train_pairs_path = os.path.join(DATA_DIR, dataset_name, "train_pairs.csv")
        val_pairs_path = os.path.join(DATA_DIR, dataset_name, "val_pairs.csv")
        if not os.path.exists(val_pairs_path):
            val_pairs_path = None
        test_pairs_path = os.path.join(DATA_DIR, dataset_name, "test_pairs.csv")

    anddata = ANDData(
        signatures=os.path.join(DATA_DIR, dataset_name, dataset_name + "_signatures.json"),
        papers=os.path.join(DATA_DIR, dataset_name, dataset_name + "_papers.json"),
        name=dataset_name,
        mode="train",
        specter_embeddings=os.path.join(DATA_DIR, dataset_name, dataset_name + "_specter.pickle"),
        clusters=clusters_path,
        block_type="s2",
        train_pairs=train_pairs_path,
        val_pairs=val_pairs_path,
        test_pairs=test_pairs_path,
        train_pairs_size=N_TRAIN_PAIRS_SIZE,
        val_pairs_size=N_VAL_TEST_SIZE,
        test_pairs_size=N_VAL_TEST_SIZE,
        n_jobs=N_JOBS,
        load_name_counts=False,
        preprocess=False,
    )

    datasets[dataset_name] = anddata

tuples = []
all_titles_dict = {}
for dataset_name, anddata in datasets.items():
    # this random seed matches the current default
Example #9
0
class TestData(unittest.TestCase):
    """Featurization and constraint checks run against the dummy dataset under tests/dummy."""

    def setUp(self):
        """Load the dummy dataset and build a featurizer over the feature groups under test."""
        super().setUp()
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            clusters="tests/dummy/clusters.json",
            name="dummy",
            load_name_counts=True,
        )

        feature_groups = [
            "name_similarity",
            "affiliation_similarity",
            "email_similarity",
            "coauthor_similarity",
            "venue_similarity",
            "year_diff",
            "title_similarity",
            "reference_features",
            "misc_features",
            "name_counts",
            "journal_similarity",
            "advanced_name_similarity",
        ]
        self.dummy_featurizer = FeaturizationInfo(features_to_use=feature_groups)

    def check_features_array_equal(self, array_1, array_2):
        """Assert two feature vectors match element-wise, treating a NaN on both sides as equal."""
        assert len(array_1) == len(array_2)
        for position, (left, right) in enumerate(zip(array_1, array_2)):
            if np.isnan(left) and np.isnan(right):
                continue
            # the element index is used as the failure message to locate the mismatch
            self.assertAlmostEqual(left, right, msg=position)

    def test_featurizer(self):
        """Featurize four pairs and compare the first three rows against stored expected values."""
        pairs_to_featurize = [
            ("3", "0", 0),
            ("3", "1", 0),
            ("3", "2", 0),
            ("3", "2", -1),
        ]
        features, _labels, _ = many_pairs_featurize(
            pairs_to_featurize, self.dummy_dataset, self.dummy_featurizer, 2, False, 1, nan_value=-1
        )

        # fmt: off
        expected_rows = [
            [
                0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.2, -1.0, -1.0, -1.0,
                -1.0, -1.0, 0.0, 4.0, 0.0, 0.03067484662576687, -1.0, -1.0, -1.0, -1.0,
                0.0, -1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 82081.0, 12.0, 807.0,
                1.0, -1.0, -1.0, -1.0, 0.7777777777777778, 0.8, 0.7777777777777778, 0.5407407407407407,
            ],
            [
                0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.2, -1.0, -1.0, -1.0,
                -1.0, -1.0, 0.0, 6.0, 0.02857142857142857, 0.09615384615384616, 0.25757575757575757,
                0.34615384615384615, 0.8181818181818182, 0.2222222222222222,
                0.0, 0.5, 1.0, 2.0, 2.0, 1.0, 2.0, 23425.0, 12.0, 807.0,
                1.0, 82081.0, 20.0, -1.0, 0.7777777777777778, 0.8, 0.7777777777777778, 0.5407407407407407,
            ],
            [
                0.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.2, -1.0, -1.0, -1.0,
                -1.0, -1.0, 0.0, 6.0, 0.0, 0.058823529411764705, -1.0, -1.0, -1.0, -1.0,
                1.0, -1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 23425.0, 12.0, 807.0,
                1.0, 82081.0, 20.0, -1.0, 0.7777777777777778, 0.8, 0.7777777777777778, 0.5407407407407407,
            ],
        ]
        # fmt: on
        for row_index, expected in enumerate(expected_rows):
            self.check_features_array_equal(list(features[row_index, :]), expected)
        # the fourth pair carries label -1; its first feature is expected to be -LARGE_INTEGER
        self.assertEqual(features[3, 0], -LARGE_INTEGER)

    def test_get_constraint(self):
        """Check the constraint values returned for three signature pairs of the dummy dataset."""
        self.assertEqual(self.dummy_dataset.get_constraint("0", "8", high_value=100), 100)
        self.assertEqual(self.dummy_dataset.get_constraint("6", "8", high_value=100), 100)
        self.assertIsNone(self.dummy_dataset.get_constraint("0", "1"))
Example #10
0
class TestData(unittest.TestCase):
    """Tests for ANDData pair splitting, blocking, constructor validation and cluster mapping."""

    def setUp(self):
        """Load the qian and dummy datasets with empty papers, without name counts or preprocessing."""
        super().setUp()
        self.qian_dataset = ANDData(
            "tests/qian/signatures.json",
            # "tests/qian/papers.json",
            {},
            clusters="tests/qian/clusters.json",
            name="qian",
            load_name_counts=False,
            preprocess=False,
        )
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            # "tests/dummy/papers.json",
            {},
            clusters="tests/dummy/clusters.json",
            name="dummy",
            load_name_counts=False,
            preprocess=False,
        )

    def test_split_pairs_within_blocks(self):
        """Check split sizes and first sampled pairs under each pair-sampling configuration."""
        # Test random sampling within blocks
        self.qian_dataset.pair_sampling_block = True
        self.qian_dataset.pair_sampling_balanced_classes = False
        self.qian_dataset.pair_sampling_balanced_homonym_synonym = False
        self.qian_dataset.train_pairs_size = 1000
        self.qian_dataset.val_pairs_size = 500
        self.qian_dataset.test_pairs_size = 500
        self.qian_dataset.random_seed = 1111
        (
            train_block_dict,
            val_block_dict,
            test_block_dict,
        ) = self.qian_dataset.split_cluster_signatures()
        train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs(
            train_block_dict, val_block_dict, test_block_dict
        )

        assert len(train_pairs) == 1000 and len(val_pairs) == 500 and len(test_pairs) == 500
        assert train_pairs[0] == ("5259", "5270", 1)
        assert val_pairs[0] == ("3830", "3847", 1)
        assert test_pairs[0] == ("1050", "1063", 1)

        # Test balanced pos/neg sampling within blocks
        self.qian_dataset.pair_sampling_block = True
        self.qian_dataset.pair_sampling_balanced_classes = True
        self.qian_dataset.pair_sampling_balanced_homonym_synonym = False
        train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs(
            train_block_dict, val_block_dict, test_block_dict
        )
        assert sum(int(pair[2]) for pair in train_pairs) == 500
        assert len(train_pairs) == 1000 and len(val_pairs) == 500 and len(test_pairs) == 500
        assert train_pairs[0] == ("5694", "5702", 1)
        assert val_pairs[0] == ("781", "787", 1)
        assert test_pairs[0] == ("2428", "2581", 0)

        # Test balanced pos/neg and homonym/synonym sampling within blocks
        self.qian_dataset.pair_sampling_block = True
        self.qian_dataset.pair_sampling_balanced_classes = True
        self.qian_dataset.pair_sampling_balanced_homonym_synonym = True
        train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs(
            train_block_dict, val_block_dict, test_block_dict
        )
        assert sum(int(pair[2]) for pair in train_pairs) == 500
        assert len(train_pairs) == 1000 and len(val_pairs) == 429 and len(test_pairs) == 376
        assert train_pairs[0] == ("4389", "4493", 0)
        assert val_pairs[0] == ("621", "636", 0)
        assert test_pairs[0] == ("2550", "2622", 0)

        # Test adding the all test pairs flag to the test above
        self.qian_dataset.all_test_pairs_flag = True
        train_pairs, val_pairs, test_pairs = self.qian_dataset.split_pairs(
            train_block_dict, val_block_dict, test_block_dict
        )
        # These were previously written as `assert a, b and c`, which turned the val/test
        # comparisons into the assertion *message*, so they were never actually checked.
        assert len(train_pairs) == 1000
        assert len(val_pairs) == 429
        assert len(test_pairs) == 7244

    def test_blocks(self):
        """Check original and s2 blocks, both via the direct getters and via `block_type` dispatch."""
        original_blocks = self.dummy_dataset.get_original_blocks()
        s2_blocks = self.dummy_dataset.get_s2_blocks()

        expected_original_blocks = {
            "a sattar": ["0", "1", "2"],
            "a konovalov": ["3", "4", "5", "6", "7", "8"],
        }
        expected_s2_blocks = {
            "a sattary": ["0", "1", "2"],
            "a konovalov": ["3", "4", "5", "6", "7", "8"],
        }

        self.dummy_dataset.block_type = "s2"
        s2_blocks_2 = self.dummy_dataset.get_blocks()
        self.dummy_dataset.block_type = "original"
        original_blocks_2 = self.dummy_dataset.get_blocks()
        # an unknown block type is expected to raise
        self.dummy_dataset.block_type = "dummy"
        with pytest.raises(Exception):
            self.dummy_dataset.get_blocks()
        self.dummy_dataset.block_type = "s2"

        assert original_blocks == expected_original_blocks
        assert original_blocks_2 == expected_original_blocks
        assert s2_blocks == expected_s2_blocks
        assert s2_blocks_2 == expected_s2_blocks

    def test_initialization(self):
        """Check that inconsistent train-mode arguments raise and that inference-mode defaults hold."""
        # Each entry is a set of extra keyword arguments that, in train mode, is expected to be rejected.
        rejected_train_arguments = [
            dict(clusters={}, train_blocks=[], block_type="s2"),
            dict(clusters={}, unit_of_data_split="blocks", pair_sampling_block=False),
            dict(clusters={}, train_pairs=[]),
            dict(clusters=None, train_pairs=None, train_blocks=None),
            dict(train_blocks=[], train_pairs=[]),
            dict(train_blocks=[], clusters=None),
        ]
        for extra_arguments in rejected_train_arguments:
            with pytest.raises(Exception):
                ANDData(
                    signatures={},
                    papers={},
                    name="",
                    mode="train",
                    load_name_counts=False,
                    preprocess=False,
                    **extra_arguments,
                )

        dataset = ANDData(
            signatures={},
            papers={},
            name="",
            mode="inference",
            load_name_counts=False,
            preprocess=False,
        )
        assert dataset.signature_to_cluster_id is None

        dataset = ANDData(
            signatures={},
            papers={},
            name="",
            mode="inference",
            load_name_counts=False,
            preprocess=False,
        )
        assert dataset.pair_sampling_block
        assert not dataset.pair_sampling_balanced_classes
        assert not dataset.pair_sampling_balanced_homonym_synonym
        assert dataset.all_test_pairs_flag
        assert dataset.block_type == "s2"

        # an unknown mode is expected to raise
        with pytest.raises(Exception):
            ANDData(
                signatures={},
                papers={},
                clusters={},
                name="",
                mode="dummy",
                load_name_counts=False,
                preprocess=False,
            )

    def test_construct_cluster_to_signatures(self):
        """Check the cluster -> signatures mapping built from a block -> signatures dict."""
        cluster_to_signatures = self.dummy_dataset.construct_cluster_to_signatures(
            {"a": ["0", "1"], "b": ["3", "4"]}
        )
        expected_cluster_to_signatures = {"1": ["0", "1"], "3": ["3", "4"]}
        assert cluster_to_signatures == expected_cluster_to_signatures
Example #11
0
    def test_initialization(self):
        """Verify ANDData rejects inconsistent train-mode arguments and sets inference-mode defaults."""
        # Extra keyword arguments for each train-mode construction that must raise, in the original order.
        failing_train_kwargs = (
            {"clusters": {}, "train_blocks": [], "block_type": "s2"},
            {"clusters": {}, "unit_of_data_split": "blocks", "pair_sampling_block": False},
            {"clusters": {}, "train_pairs": []},
            {"clusters": None, "train_pairs": None, "train_blocks": None},
            {"train_blocks": [], "train_pairs": []},
            {"train_blocks": [], "clusters": None},
        )
        for extra_kwargs in failing_train_kwargs:
            with pytest.raises(Exception):
                rejected = ANDData(
                    signatures={},
                    papers={},
                    name="",
                    mode="train",
                    load_name_counts=False,
                    preprocess=False,
                    **extra_kwargs,
                )

        # Inference mode without clusters: no signature -> cluster mapping is built.
        inference_dataset = ANDData(
            signatures={}, papers={}, name="", mode="inference", load_name_counts=False, preprocess=False
        )
        assert inference_dataset.signature_to_cluster_id is None

        # A second inference-mode instance is used to check the default sampling/blocking settings.
        inference_dataset = ANDData(
            signatures={}, papers={}, name="", mode="inference", load_name_counts=False, preprocess=False
        )
        assert inference_dataset.pair_sampling_block
        assert not inference_dataset.pair_sampling_balanced_classes
        assert not inference_dataset.pair_sampling_balanced_homonym_synonym
        assert inference_dataset.all_test_pairs_flag
        assert inference_dataset.block_type == "s2"

        # An unrecognised mode must raise.
        with pytest.raises(Exception):
            rejected = ANDData(
                signatures={},
                papers={},
                clusters={},
                name="",
                mode="dummy",
                load_name_counts=False,
                preprocess=False,
            )
Example #12
0
def main():
    """
    Fit a pairwise model (and optionally a nameless pairwise model) plus a clusterer on the
    union of every dataset listed in SOURCE_DATASET_NAMES, then pickle the fitted clusterer
    into the current working directory.
    """
    datasets = {}
    progress = tqdm(SOURCE_DATASET_NAMES, desc="Processing datasets and fitting base models")
    for dataset_name in progress:
        logger.info(f"processing dataset {dataset_name}")
        dataset_dir = os.path.join(DATA_DIR, dataset_name)

        # Datasets either come with clusters or with fixed pair files, never both.
        clusters_path: Optional[str] = None
        train_pairs_path = val_pairs_path = test_pairs_path = None
        if dataset_name in PAIRWISE_ONLY_DATASETS:
            train_pairs_path = os.path.join(dataset_dir, "train_pairs.csv")
            candidate_val_path = os.path.join(dataset_dir, "val_pairs.csv")
            # the validation pairs file is optional
            val_pairs_path = candidate_val_path if os.path.exists(candidate_val_path) else None
            test_pairs_path = os.path.join(dataset_dir, "test_pairs.csv")
        else:
            clusters_path = os.path.join(dataset_dir, f"{dataset_name}_clusters.json")

        logger.info(f"loading dataset {dataset_name}")
        anddata = ANDData(
            signatures=os.path.join(dataset_dir, f"{dataset_name}_signatures.json"),
            papers=os.path.join(dataset_dir, f"{dataset_name}_papers.json"),
            name=dataset_name,
            mode="train",
            specter_embeddings=os.path.join(dataset_dir, f"{dataset_name}_specter.pickle"),
            clusters=clusters_path,
            block_type=BLOCK_TYPE,
            train_pairs=train_pairs_path,
            val_pairs=val_pairs_path,
            test_pairs=test_pairs_path,
            train_pairs_size=N_TRAIN_PAIRS_SIZE,
            val_pairs_size=N_VAL_TEST_SIZE,
            test_pairs_size=N_VAL_TEST_SIZE,
            preprocess=True,
        )

        logger.info(f"featurizing {dataset_name}")
        splits = featurize(
            anddata,
            FEATURIZER_INFO,
            n_jobs=N_JOBS,
            use_cache=True,
            chunk_size=100,
            nameless_featurizer_info=NAMELESS_FEATURIZER_INFO,
            nan_value=NAN_VALUE,
        )
        # each split unpacks to (features, labels, nameless features)
        (
            (X_train, y_train, nameless_X_train),
            (X_val, y_val, nameless_X_val),
            (X_test, y_test, nameless_X_test),
        ) = splits

        datasets[dataset_name] = {
            "anddata": anddata,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "y_test": y_test,
            "nameless_X_train": nameless_X_train,
            "nameless_X_val": nameless_X_val,
            "nameless_X_test": nameless_X_test,
            "name": anddata.name,
        }

    # only datasets that have clusters are handed to the clusterer
    anddatas = [
        datasets[dataset_name]["anddata"]
        for dataset_name in SOURCE_DATASET_NAMES
        if dataset_name not in PAIRWISE_ONLY_DATASETS
    ]

    def _gather(key, skip=frozenset()):
        # per-dataset arrays stored under `key`, in SOURCE_DATASET_NAMES order
        return [datasets[name][key] for name in SOURCE_DATASET_NAMES if name not in skip]

    # the "augmented" dataset contributes to training but not to validation
    val_excluded = {"augmented"}
    X_train = np.vstack(_gather("X_train"))
    y_train = np.hstack(_gather("y_train"))
    X_val = np.vstack(_gather("X_val", val_excluded))
    y_val = np.hstack(_gather("y_val", val_excluded))

    nameless_X_train = np.vstack(_gather("nameless_X_train"))
    nameless_X_val = np.vstack(_gather("nameless_X_val", val_excluded))

    logger.info("fitting pairwise")
    union_classifier = PairwiseModeler(n_iter=N_ITER, monotone_constraints=MONOTONE_CONSTRAINTS)
    union_classifier.fit(X_train, y_train, X_val, y_val)

    nameless_union_classifier = None
    if USE_NAMELESS_MODEL:
        logger.info(f"nameless fitting pairwise for {SOURCE_DATASET_NAMES!s}")
        nameless_union_classifier = PairwiseModeler(
            n_iter=N_ITER,
            monotone_constraints=NAMELESS_MONOTONE_CONSTRAINTS,
        )
        nameless_union_classifier.fit(nameless_X_train, y_train, nameless_X_val, y_val)
        logger.info(f"nameless pairwise fit for {SOURCE_DATASET_NAMES!s}")

    # the nameless classifier and its featurizer info are passed together or not at all
    if nameless_union_classifier is None:
        nameless_classifier = None
        nameless_info = None
    else:
        nameless_classifier = nameless_union_classifier.classifier
        nameless_info = NAMELESS_FEATURIZER_INFO

    logger.info("fitting clusterer for")
    union_clusterer = Clusterer(
        FEATURIZER_INFO,
        union_classifier.classifier,
        cluster_model=FastCluster(),
        search_space=search_space,
        n_jobs=N_JOBS,
        nameless_classifier=nameless_classifier,
        nameless_featurizer_info=nameless_info,
    )
    union_clusterer.fit(anddatas)
    print("best clustering parameters:", union_clusterer.best_params)

    models = {"clusterer": union_clusterer}
    dump_path = f"full_union_model_script_dump_average_{FEATURIZER_VERSION}.pickle"
    with open(dump_path, "wb") as _pickle_file:
        pickle.dump(models, _pickle_file)
    logger.info("Done.")
Example #13
0
def featurize(
    dataset: ANDData,
    featurizer_info: FeaturizationInfo,
    n_jobs: int = 1,
    use_cache: bool = False,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    nameless_featurizer_info: Optional[FeaturizationInfo] = None,
    nan_value: float = np.nan,
    delete_training_data: bool = False,
) -> Union[Tuple[TupleOfArrays, TupleOfArrays, TupleOfArrays], TupleOfArrays]:
    """
    Turns the signature pairs of a dataset into feature arrays

    Parameters
    ----------
    dataset: ANDData
        the dataset to draw the pairs from
    featurizer_info: FeaturizationInfo
        the FeaturizationInfo object listing the features to use
        and the featurizer version
    n_jobs: int
        the number of cpus to use
    use_cache: bool
        whether or not to write to/read from the features cache
    chunk_size: int
        the chunk size for multiprocessing
    nameless_featurizer_info: FeaturizationInfo
        the FeaturizationInfo for the features that do not use any name features,
        these will not be computed if this is None
    nan_value: float
        the value to replace nans with
    delete_training_data: bool
        whether to delete some suspicious training examples; only forwarded
        for the train pairs, never for val/test/inference pairs

    Returns
    -------
    a (train, val, test) tuple of featurization outputs if mode is not 'inference',
    a single featurization output covering all pairs if mode is 'inference'
    """

    def _featurize_pairs(pairs, delete_suspicious):
        # every split shares the same settings; only the pairs and the delete flag vary
        return many_pairs_featurize(
            pairs,
            dataset,
            featurizer_info,
            n_jobs,
            use_cache,
            chunk_size,
            nameless_featurizer_info,
            nan_value,
            delete_suspicious,
        )

    if dataset.mode == "inference":
        logger.info("featurizing all pairs")
        all_features = _featurize_pairs(dataset.all_pairs(), False)
        logger.info("featurized all pairs")
        return all_features

    if dataset.train_pairs is not None:
        # pairs were supplied up front, so no signature split is needed
        train_pairs, val_pairs, test_pairs = dataset.fixed_pairs()
    else:
        # pick the split strategy: fixed blocks, then fixed signatures, then the default split
        if dataset.train_blocks is not None:
            split_signatures = dataset.split_cluster_signatures_fixed
        elif dataset.train_signatures is not None:
            split_signatures = dataset.split_data_signatures_fixed
        else:
            split_signatures = dataset.split_cluster_signatures
        train_signatures, val_signatures, test_signatures = split_signatures()  # type: ignore
        train_pairs, val_pairs, test_pairs = dataset.split_pairs(train_signatures, val_signatures, test_signatures)

    logger.info("featurizing train")
    train_features = _featurize_pairs(train_pairs, delete_training_data)
    logger.info("featurized train, featurizing val")
    val_features = _featurize_pairs(val_pairs, False)
    logger.info("featurized val, featurizing test")
    test_features = _featurize_pairs(test_pairs, False)
    logger.info("featurized test")
    return train_features, val_features, test_features
Example #14
0
class TestClusterer(unittest.TestCase):
    """Tests for pairwise constraints and distance-matrix construction on the dummy dataset."""

    def setUp(self):
        """Load the dummy dataset and build a clusterer around a classifier fit on seeded random data."""
        super().setUp()
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            clusters="tests/dummy/clusters.json",
            cluster_seeds="tests/dummy/cluster_seeds.json",
            name="dummy",
            load_name_counts=True,
        )

        features_to_use = [
            "year_diff",
            "misc_features",
        ]
        featurizer_info = FeaturizationInfo(features_to_use=features_to_use)
        # Seed numpy so the random training data (and hence the fitted classifier) is reproducible.
        np.random.seed(1)
        X_random = np.random.random((10, 6))
        y_random = np.random.randint(0, 6, 10)
        self.dummy_clusterer = Clusterer(
            featurizer_info=featurizer_info,
            classifier=lgb.LGBMClassifier(random_state=1,
                                          data_random_seed=1,
                                          feature_fraction_seed=1).fit(
                                              X_random, y_random),
            n_jobs=1,
            use_cache=False,
            use_default_constraints_as_supervision=False,
        )

    def test_get_constraints(self):
        """get_constraint must return the same value regardless of argument order."""
        constraint_1 = self.dummy_dataset.get_constraint("0",
                                                         "1",
                                                         low_value=0,
                                                         high_value=2)
        constraint_2 = self.dummy_dataset.get_constraint("1",
                                                         "0",
                                                         low_value=0,
                                                         high_value=2)
        constraint_3 = self.dummy_dataset.get_constraint("1",
                                                         "2",
                                                         low_value=0,
                                                         high_value=2)
        constraint_4 = self.dummy_dataset.get_constraint("2",
                                                         "1",
                                                         low_value=0,
                                                         high_value=2)

        # Compare by value: identity checks on numbers only pass by accident of
        # object caching and say nothing about the constraint actually returned.
        self.assertEqual(constraint_1, LARGE_DISTANCE)
        self.assertEqual(constraint_2, LARGE_DISTANCE)
        self.assertEqual(constraint_3, 0)
        self.assertEqual(constraint_4, 0)

    def test_make_distance_matrix_fastcluster(self):
        """Partial supervision must override the model distance for the supervised pairs only."""
        block = {
            "a sattar": ["0", "1", "2"],
        }
        partial_supervision = {("0", "1"): 1.1, ("1", "2"): 1e-6}
        distance_matrices = self.dummy_clusterer.make_distance_matrices(
            block_dict=block,
            dataset=self.dummy_dataset,
            partial_supervision=partial_supervision,
        )
        distance_matrix = distance_matrices["a sattar"]
        # Entries 0 and 2 are expected to carry the supervised values for
        # ("0", "1") and ("1", "2"); entry 1 has no supervision.
        # NOTE(review): 0.3 is presumably the seeded classifier's distance - confirm.
        self.assertEqual(distance_matrix[0], np.float16(1.1))
        self.assertEqual(distance_matrix[1], np.float16(0.3))
        self.assertEqual(distance_matrix[2], np.float16(1e-6))

        # Without any supervision every entry is expected to equal the same 0.3 value.
        distance_matrices = self.dummy_clusterer.make_distance_matrices(
            block_dict=block,
            dataset=self.dummy_dataset,
            partial_supervision={},
        )
        distance_matrix = distance_matrices["a sattar"]
        self.assertEqual(distance_matrix[0], np.float16(0.3))
        self.assertEqual(distance_matrix[1], np.float16(0.3))
        self.assertEqual(distance_matrix[2], np.float16(0.3))
Example #15
0
def main(model_path: str, n_jobs: int = 20, use_constraints: bool = True):
    """
    This script is for evaluating a model on the Semantic Scholar corrections data.
    It clusters each block for which we have pairwise corrections data (and the data is already
    pulled from Semantic Scholar for), and runs clustering and prints metrics out

    Parameters
    ----------
    model_path: str
        path to a pickle file holding a dict with the fitted clusterer under the "clusterer" key
    n_jobs: int
        the number of cpus to use, set on both the clusterer and each loaded dataset
    use_constraints: bool
        value assigned to the clusterer's use_default_constraints_as_supervision

    Returns
    -------
    nothing, one results json file is written per block
    """
    with open(os.path.join(DATA_DIR, "claims_pairs_remapped.json")) as _json_file:
        claims_pairs = json.load(_json_file)
    logger.info("Claims pairs loaded")

    # NOTE: pickle.load can execute arbitrary code; only pass model files from a trusted source.
    with open(model_path, "rb") as _pickle_file:
        models = pickle.load(_pickle_file)
    clusterer = models["clusterer"]

    clusterer.n_jobs = n_jobs
    clusterer.use_cache = True
    clusterer.use_default_constraints_as_supervision = use_constraints
    clusterer.batch_size = 10000000
    logger.info(f"Linkage type: {clusterer.cluster_model.linkage}")
    logger.info(f"EPS: {clusterer.cluster_model.eps}")
    logger.info(f"Use constraints: {clusterer.use_default_constraints_as_supervision}")
    logger.info(f"Featurizer version: {clusterer.featurizer_info.featurizer_version}")

    # Everything in BLOCK_DATASETS_DIR is treated as a block directory unless its name
    # ends with one of these suffixes; blocks are processed from smallest signatures file up.
    non_block_suffixes = (".json", ".pickle", ".py", ".vscode", ".csv")
    block_keys = sorted(
        (entry for entry in os.listdir(BLOCK_DATASETS_DIR) if not entry.endswith(non_block_suffixes)),
        key=lambda entry: os.path.getsize(os.path.join(BLOCK_DATASETS_DIR, entry, "claims_signatures.json")),
    )

    logger.info("starting transfer experiment main, loading name counts")
    with open(cached_path(NAME_COUNTS_PATH), "rb") as f:
        (
            first_dict,
            last_dict,
            first_last_dict,
            last_first_initial_dict,
        ) = pickle.load(f)
    # Loaded once and handed to every ANDData below via load_name_counts.
    name_counts = {
        "first_dict": first_dict,
        "last_dict": last_dict,
        "first_last_dict": first_last_dict,
        "last_first_initial_dict": last_first_initial_dict,
    }
    logger.info("loaded name counts")

    for block_key in tqdm(block_keys):
        block_dir = os.path.join(BLOCK_DATASETS_DIR, block_key)
        dataset_key = block_key.replace(" ", "_")
        logger.info(f"Loading dataset {block_key}")
        dataset = ANDData(
            signatures=os.path.join(block_dir, "claims_signatures.json"),
            papers=os.path.join(block_dir, "claims_papers.json"),
            mode="inference",
            specter_embeddings=os.path.join(block_dir, "claims_specter.pickle"),
            block_type="s2",
            name=dataset_key,
            n_jobs=n_jobs,
            load_name_counts=name_counts,
        )
        logger.info("Dataset loaded")

        # NOTE(review): inputs are read from block_dir (raw block key) while outputs go to
        # BLOCK_DATASETS_DIR/dataset.name (spaces replaced by underscores); these differ when
        # a block key contains a space - confirm the output directory exists in that case.
        output_dir = os.path.join(BLOCK_DATASETS_DIR, dataset.name)
        result = claims_eval(
            dataset,
            clusterer,
            claims_pairs,
            output_dir,
            output_shap=False,
        )
        # The results dict is rebuilt per block, so each file holds only its own block's result.
        results = {dataset_key: result}
        logger.info(f"Claims eval output: {result}")

        results_path = os.path.join(
            BLOCK_DATASETS_DIR,
            dataset.name,
            f"results_{clusterer.featurizer_info.featurizer_version}.json",
        )
        with open(results_path, "w") as _json_file:
            json.dump(results, _json_file)
    logger.info("Done.")