def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
        """
        Run CoreNLP over a sentence.
        :param sentence: a single sentence
        :param properties: additional properties for CoreNLP
        :return: parsing result
        """
        # The same input sentence can result in different annotations depending on the CoreNLP properties specified.
        # We therefore use a cache identifier for the sentence which includes the annotation properties.
        sent_cache_identifier = get_dict_hash(
            {
                "sentence": sentence,
                "properties": properties
            }, shorten=False)

        if sent_cache_identifier not in self.cache:
            # Kludge ahead: We want to cache the parsed sentence provided by CoreNLP, but also want to work with it in
            # a convenient format. A convenient format is the default format (protobuf-based), but that's not
            # pickle-able for the cache. We therefore convert the protobuf-format back into a bytestring and cache that.
            # When reading from the cache, we reassemble the protobuf object.
            req_properties = {"outputFormat": "serialized"}
            if properties is not None:
                req_properties.update(properties)
            doc = self.client.annotate(sentence, properties=req_properties)
            stream = writeToDelimitedString(doc)
            buf = stream.getvalue()
            stream.close()
            self.cache[sent_cache_identifier] = buf
        else:
            buf = self.cache[sent_cache_identifier]
            doc = Document()
            parseFromDelimitedString(doc, buf)

        return doc
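
# Illustrative sketch: the cache round-trip used in parse_sentence() above, i.e. flattening the
# protobuf Document to a bytestring for pickling and reassembling it afterwards. Assumes the
# protobuf helpers are importable from stanza.protobuf; depending on the environment they may
# instead be provided by the corenlp_protobuf package.
def _roundtrip_document_sketch(doc):
    from stanza.protobuf import Document, parseFromDelimitedString, writeToDelimitedString

    # serialize: protobuf object -> bytes (this is what gets stored in the cache)
    stream = writeToDelimitedString(doc)
    buf = stream.getvalue()
    stream.close()

    # deserialize: bytes -> fresh protobuf object (this is what a cache hit reconstructs)
    restored = Document()
    parseFromDelimitedString(restored, buf)
    return restored
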
    def transform(self, X: Tuple):
        dataset, pairs, labels, unique_mentions = X

        if self.use_cache:
            # We want to cache feature transformation outputs similar to what is asked for / proposed here:
            # (1) https://mail.python.org/pipermail/scikit-learn/2017-August/001828.html
            # (2) https://gist.github.com/jnothman/019d594d197c98a3d6192fa0cb19c850
            # We cannot implement the caching 1:1 as in the github gist because our feature extractors have constructor
            # parameters which change the output of transform(), i.e. we want one cache for each set of parameters. To
            # do this conveniently, we take the __dict__ of a feature extractor, remove irrelevant entries and hash the
            # result. Irrelevant entries are the features to select (read-only modification) and any data-dependent
            # attributes ending with an underscore (see https://scikit-learn.org/stable/developers/develop.html#estimated-attributes)
            attrs = copy.deepcopy(self.__dict__)
            attrs = {k: v for k, v in attrs.items() if not k.endswith("_") and k not in ["name", "features_to_select"]}
            cache_key = get_dict_hash(attrs)
            cache_location = Path(tempfile.gettempdir()) / f"feature_{self.name}_{cache_key}"
            memory = Memory(cache_location, verbose=0)

            feature_matrix = memory.cache(self._transform)(
                dataset, FeatureExtractorMixin.from_np_array_back_to_list_of_tuples(pairs), unique_mentions)
        else:
            feature_matrix = self._transform(
                dataset, FeatureExtractorMixin.from_np_array_back_to_list_of_tuples(pairs), unique_mentions)

        # filter feature matrix according to feature selection
        if self.features_to_select:
            all_feature_names = self._get_plain_names_of_all_features()

            # sanity check: we can only select what we can extract
            for fname in self.features_to_select:
                if fname not in all_feature_names:
                    raise ValueError("Cannot select unknown feature name: " + fname)

            mask = np.array([fname in self.features_to_select for fname in all_feature_names])
            filtered_feature_matrix = feature_matrix[:, mask]
            return filtered_feature_matrix
        else:
            return feature_matrix
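
# Illustrative sketch of the per-parameter-set caching used in transform() above: hash the
# constructor parameters (ignoring fitted attributes that end in "_") and give every parameter set
# its own joblib Memory location. hashlib/json stand in for the project's get_dict_hash helper;
# the extractor name and parameters in the usage comment are hypothetical.
def _cache_for_params_sketch(name: str, params: dict):
    import hashlib
    import json
    import tempfile
    from pathlib import Path

    from joblib import Memory

    # drop data-dependent attributes (sklearn convention: trailing underscore) and read-only entries
    relevant = {k: v for k, v in params.items()
                if not k.endswith("_") and k not in ("name", "features_to_select")}
    key = hashlib.sha1(json.dumps(relevant, sort_keys=True, default=str).encode()).hexdigest()
    return Memory(Path(tempfile.gettempdir()) / f"feature_{name}_{key}", verbose=0)
# usage: memory = _cache_for_params_sketch("lemma_overlap", {"lowercase": True})
#        cached_transform = memory.cache(some_expensive_transform)  # one cache per parameter set
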
def feature_selection(config_data: Dict,
                      config_global: Dict,
                      logger: Logger):
    """
    Runs feature selection on the EVALUATION split.
    Uses seven runs (different random seeds) of 6-fold cross-validation for recursive feature elimination with a
    Random Forest mention classifier to find the most useful features.
    :param config_data:
    :param config_global:
    :param logger:
    :return:
    """
    serialization_dir = config_global[RUN_WORKING_DIR]

    eval_data_path = config_data["eval_data_path"]
    oracle_mention_pair_generation = config_data["oracle_mention_pair_generation"]

    data = load_data(eval_data_path)
    X, y = get_X_and_y_for_pipeline(logger,
                                    data,
                                    doc_partitioning=None,
                                    oracle_mention_pair_generation=oracle_mention_pair_generation)

    config_base = {
              "classifier": {_TYPE: "RandomForest",
                             _KWARGS: {"n_estimators": 100}},
              "features": {
                  "extractors": get_feature_extractors_config_with_all_and_defaults(),
                  "selected_features": None
              },
              "pairs": config_data["pairs"]
              }

    def run_rfecv_iteration(random_seed: int,
                            n_splits: int = 6) -> Tuple[List[str], np.ndarray, np.ndarray]:
        # RFECV needs X to be a matrix-like of shape (n_samples, n_features). This means we cannot use our pipeline as is,
        # because our X's are not matrix-like. So we run our pipeline up to the point where we input the feature matrix +
        # labels into the mention pair classifier, and feed that to RFECV. To do that, we need to chop up the pipeline.
        config = copy.deepcopy(config_base)
        config["random_seed"] = random_seed

        pipeline, scoring = instantiate_pipeline(logger,
                                                 config,
                                                 with_clustering=False,
                                                 scorer_should_return_single_scalar=True,
                                                 serialization_dir=serialization_dir / "pipeline")

        # remove the classifier at the end of the pipeline
        classifier_wrapper = pipeline.steps.pop(-1)[1]  # type: PredictOnTransformClassifierWrapper
        assert type(classifier_wrapper) is PredictOnTransformClassifierWrapper
        random_forest_clf = classifier_wrapper.classifier_

        # obtain feature matrix and labels
        conflated_X = pipeline.fit_transform(X, y)
        actual_X, actual_y = classifier_wrapper._take_apart_X(conflated_X)

        cv = KFold(n_splits=n_splits, random_state=random_seed, shuffle=True)

        # We set min_impurity_decrease depending on the number of instances to obtain a useful feature selection result.
        # min_impurity_decrease was determined based on a series of manual experiments with a varying number of features
        # producing random and zero values. For 1e3 instances, values between 1e-7 and 1e-1 were tested, and 0.0015
        # produced plots closest to the optimal expected result (i.e. a significant peak around the number of non-garbage
        # features). Similar experiments were conducted for 1e4 and 1e5 instances. We interpolate between these data points.
        num_instances = len(actual_y)
        xp = np.log10([1e3, 1e5])
        fp = np.log10([0.0015, 0.00025])
        min_impurity_decrease = 10**np.interp(np.log10(num_instances), xp, fp)
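        # Example: for roughly 1e4 instances, this log-log interpolation yields a min_impurity_decrease
        # of roughly 6.1e-4, i.e. between the two anchor values above.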
        random_forest_clf.set_params(min_impurity_decrease=min_impurity_decrease)

        logger.info("Running feature selection...")
        selector = RFECV(estimator=random_forest_clf,
                         n_jobs=config_global[MAX_CORES],
                         cv=cv,
                         scoring="f1_weighted",  # use f1_weighted because we have very imbalanced data
                         verbose=1)
        selector.fit(actual_X, actual_y)
        logger.info("Done.")

        feature_names = get_feature_names_from_pipeline(pipeline)
        support = selector.support_
        grid_scores = selector.grid_scores_
        assert len(support) == len(feature_names)

        return feature_names, support, grid_scores

    # When using oracle mention pair generation, a randomly determined subset of all mention pairs is used. This has a
    # big influence on the results. We therefore make sure to run multiple RFECV iterations with different random seeds for
    # the mention pair generation and aggregate those.
    results = []
    for seed in range(7):
        results.append(run_rfecv_iteration(seed))
    feature_names, supports, grid_scores = list(zip(*results))

    # assert that all results are compatible
    assert len(set(len(s) for s in supports)) == 1
    assert len(set(get_dict_hash(fn) for fn in feature_names)) == 1

    # collect selections in DataFrame
    selections = pd.DataFrame(np.vstack(supports).transpose(), index=pd.Index(feature_names[0], name="feature-name"))
    selected_features = selections.loc[selections.mean(axis=1) > 0.5].index.values

    # write to file(s)
    selections.to_csv(str(serialization_dir / "selected_features_unaggregated.csv"))
    with (serialization_dir / "selected_features.txt").open("w") as f:
        f.write("\n".join(selected_features))
    logger.info("Selected features: " + "\n".join(selected_features))

    # collect scores
    df_grid_scores = []
    for m in grid_scores:
        # number of features and CV-score for that number of features
        x_and_y = np.vstack([np.arange(1, len(m) + 1), m]).transpose()
        df_grid_scores.append(x_and_y)
    df_grid_scores = pd.DataFrame(np.vstack(df_grid_scores))
    df_grid_scores.columns = ["num-features", "weighted-f1"]

    df_grid_scores.to_csv(str(serialization_dir / "grid_scores.csv"))

    # plot feature selection results
    plot_destination = serialization_dir / "rfecv_plot.png"
    ax = sns.lineplot(x="num-features", y="weighted-f1", data=df_grid_scores)
    fig = ax.get_figure()
    fig.savefig(str(plot_destination))
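
# Illustrative sketch of the aggregation in feature_selection() above: a feature is kept when it was
# selected in more than half of the RFECV runs, i.e. selections.mean(axis=1) > 0.5 acts as a majority
# vote over the boolean support masks. The feature names and supports below are made up.
def _majority_vote_sketch():
    supports = [np.array([True, True, False]),   # e.g. RFECV run with seed 0
                np.array([True, False, False]),  # e.g. RFECV run with seed 1
                np.array([True, True, True])]    # e.g. RFECV run with seed 2
    selections = pd.DataFrame(np.vstack(supports).transpose(),
                              index=pd.Index(["f1", "f2", "f3"], name="feature-name"))
    # f1 selected in 3/3 runs, f2 in 2/3, f3 in 1/3 -> f1 and f2 survive the majority vote
    return selections.loc[selections.mean(axis=1) > 0.5].index.values
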
def set_up_dir_structure(config):
    """
    Preparation steps to get a directory structure like the following:
    <working_dir>
    ├── <config_name>
    │   └── <config_hash>
    │       ├── 00__<pipeline_stage_name>
    │       ├── 01__<other_pipeline_stage_name>
    │       ├── ...
    │       └── <timestamp of run>
    │           └── log_events.log
    └── global
    :param config: yaml config
    :return:
    """
    config_id = get_dict_hash(config)

    # Store the config's own identifier and the current timestamp in the global config
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    config_global = config["global"]
    config_global[ID] = config_id
    config_global[TIMESTAMP] = timestamp

    # use specified working dir, resolved against the working directory of the shell
    working_dir = Path.cwd() / Path(config_global["working_dir"])

    # set up a shared working dir used by all runs (for cached datasets etc.)
    global_working_dir = working_dir / "global"

    # set up a config-name (!) specific directory ("pipeline" if no name is given) - this is just for better grouping of runs
    config_name = config_global.get("config_name", "pipeline")
    config_name_working_dir = working_dir / get_filename_safe_string(
        config_name)

    # set up a config-specific directory
    config_working_dir = config_name_working_dir / config_id

    # set up a run-specific directory - some conditional changes depending on whether we run inside Slurm or not
    run_working_dir_filename_parts = []
    config_global[TASK_ID] = None
    config_global[JOB_ID] = None
    config_global[JOB_ID_RAW] = None
    slurm_job_id = slurmee.get_job_id()
    job_array_info = slurmee.get_job_array_info()

    # if running on slurm:
    if slurm_job_id is not None:
        # use as many CPU cores as are available on the node
        config_global[MAX_CORES] = slurmee.get_cpus_on_node()

        # if running inside job array
        if job_array_info is not None:
            root_job_id = job_array_info["array_job_id"]
            task_id = job_array_info["task_id"]
            run_working_dir_filename_parts += [
                str(root_job_id), f"{task_id:0>2}"
            ]
            config_global[JOB_ID] = root_job_id
            config_global[TASK_ID] = task_id
            config_global[JOB_ID_RAW] = root_job_id + task_id
        else:
            run_working_dir_filename_parts += [str(slurm_job_id)]
            config_global[JOB_ID] = slurm_job_id
            config_global[JOB_ID_RAW] = slurm_job_id
    run_working_dir_filename_parts += [timestamp]
    run_working_dir_filename = "_".join(run_working_dir_filename_parts)
    run_working_dir = config_working_dir / run_working_dir_filename

    # create folders
    for directory in [global_working_dir, run_working_dir]:
        directory.mkdir(parents=True, exist_ok=True)

    config_global[GLOBAL_WORKING_DIR] = global_working_dir
    config_global[CONFIG_NAME_WORKING_DIR] = config_name_working_dir
    config_global[CONFIG_WORKING_DIR] = config_working_dir
    config_global[RUN_WORKING_DIR] = run_working_dir

    # ...redirect destination for the logging file handler
    if "logging" not in config_global:
        config_global["logging"] = {}
    config_global["logging"]["path"] = run_working_dir / "log_events.log"

    return config
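
# Illustrative usage sketch for set_up_dir_structure(): "working_dir" and "config_name" are the
# global entries read above; the concrete values here are hypothetical. Calling this creates the
# directories and returns the config with the resolved paths filled in.
def _dir_structure_usage_sketch():
    config = {"global": {"working_dir": "experiments", "config_name": "baseline"}}
    config = set_up_dir_structure(config)
    # e.g. experiments/baseline/<config_hash>/<timestamp>
    return config["global"][RUN_WORKING_DIR]
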
def instantiate_pipeline(logger: Logger,
                         config: Dict,
                         with_clustering: bool = False,
                         use_caching: bool = False,
                         scorer_should_return_single_scalar: bool = False,
                         serialization_dir: Optional[Path] = None) -> Tuple[Pipeline, Callable]:
    """
    Uses the entries of a config dictionary to instantiate a scikit-learn pipeline. Additionally returns the scoring
    function to use.
    :param logger:
    :param config: config dictionary
    :param with_clustering: If True, the pipeline will include agglomerative clustering; if False, only mention pair
                            classification is done. The scoring function depends on this choice.
    :param use_caching: Whether fit() calls for all pipeline steps, and transform() calls for features should be cached.
    :param scorer_should_return_single_scalar: If True, the scoring function will return only a single metric as a
                                               scalar. This is useful for running cross-validation. If False, more
                                               metrics are returned as a pd.Series.
    :param serialization_dir: optional serialization dir, only used for debugging
    :return: sklearn pipeline and the scoring function to use for evaluation
    """
    random_seed = config.pop("random_seed")
    random_state = check_random_state(random_seed)

    pairs_config = config.pop("pairs")
    feature_extractors_config = config.pop("features")
    classifier_config = config.pop("classifier")

    # We make use of joblib's caching feature implemented for pipelines in sklearn. joblib only checks if it has seen
    # a pipeline transformer's function arguments before, so we need to make sure to create separate caches when
    # mention pair sampler, feature or classifier config parameters are changed. We use config dict hashes for that.
    if use_caching:
        config_hashes = [get_dict_hash(pairs_config), get_dict_hash(feature_extractors_config)]
        if with_clustering:
            # clustering additionally depends on the classifier config
            config_hashes += [get_dict_hash(classifier_config)]
        pipeline_cache = Path(tempfile.gettempdir()) / ("pipeline_" + "_".join(config_hashes))
        memory = str(pipeline_cache)
    else:
        memory = None

    # instantiate some bits
    feature_extractors = instantiate_feature_extractors(feature_extractors_config, use_caching)
    classifier, classifier_fit_params = instantiate_classifier(classifier_config, random_state)

    # instantiate mention pair generator stage, which shares parameters with the mention pair scorer (if we use that)
    mpg_training_config = pairs_config.pop("mpg_training")
    mpg_prediction_config = pairs_config.pop("mpg_prediction")
    if pairs_config:
        raise ValueError("Leftover 'pairs' config entries: " + pprint.pformat(pairs_config))

    if with_clustering and mpg_prediction_config is not None:
        # Reasoning: Our mention pair generation parameters only affect the number and distribution of pairs; the
        # number and distribution of mentions is unchanged. Tweaking the mention pair generation process is therefore
        # only useful when the evaluation is done directly on the pairs, not on the mentions. For clustering, we
        # evaluate based on mentions and need distances between all mention pairs, therefore it does not make any
        # sense to use tweaked mention pair sampling at prediction time.
        raise ValueError("'mpg_prediction' cannot be used with clustering!")

    pair_generation_stage = MentionPairGeneratorStage(mpg_training_config,
                                                      mpg_prediction_config,
                                                      random_state=random_state,
                                                      serialization_dir=serialization_dir / "mpg" if serialization_dir is not None else None)
    if with_clustering:
        # using only the most discriminating metric (LEA) is faster when running cross-validation
        scorer = CrossDocCorefScoring(only_lea_f1_for_cv=scorer_should_return_single_scalar)
    else:
        scorer = MentionPairScoring(mpg_prediction_config,
                                    return_neg_log_loss_for_cv=scorer_should_return_single_scalar,
                                    serialization_dir=serialization_dir / "scorer" if serialization_dir is not None else None)

    # Now it's time to assemble the pipeline.
    # For reference, this is the sequence of calls on a pipeline with stages [a, b]:
    #   training (fit):
    #     a: fit
    #     a: transform
    #     b: fit
    #   predicting:
    #     a: transform
    #     b: predict

    # Combine all feature extractors in a feature union, then remove the mean and normalize the variance.
    feature_extraction_pipeline_steps = [
        ("features", FeatureUnion(feature_extractors)),
        ("scaling", StandardScaler())
    ]

    # This section merges the boolean label of each mention pair with the feature matrix and passes it to the
    # classifier.
    mention_pair_distance_pipeline_steps = [
        ("join_labels_and_feature_matrix", FeatureUnion([
            ("get_labels_from_X", FunctionTransformer(get_mention_pair_labels_from_X)),
            ("feature_extraction", Pipeline(feature_extraction_pipeline_steps)),
        ])),
        (CLASSIFIER_PIPELINE_STEP_NAME, PredictOnTransformClassifierWrapper(classifier, classifier_fit_params)),
    ]

    if with_clustering:
        # create clustering step
        clustering_config = config.pop("clustering")
        clustering = ScipyClustering.from_params(clustering_config)

        # When clustering, we start with generating pairs, then we classify those pairs (see above), but we additionally
        # need to retain the two mention identifiers of each mention pair. Mention pair identifiers and their distance
        # (between 0 and 1) are merged with a FeatureUnion. This "feature matrix" is pulled apart in the clustering step
        # where mentions are clustered agglomeratively according to their pairwise distances.
        pipeline = Pipeline([
            ("pair_generation", pair_generation_stage),
            ("mention_pair_distance_with_identifiers", FeatureUnion([
                ("get_mention_pair_identifiers_from_X", FunctionTransformer(get_mention_pair_identifiers_from_X)),
                ("mention_pair_distance", Pipeline(mention_pair_distance_pipeline_steps))
            ])),
            (CLUSTERING_PIPELINE_STEP_NAME, clustering)
        ], memory=memory)
    else:
        if "clustering" in config:
            logger.warning("Clustering configuration will not be used.")
            config.pop("clustering")

        # In the simplified case, we only need to pass the generated mention pairs to the classification pipeline part.
        pipeline = Pipeline([
            ("pair_generation", pair_generation_stage),
            *mention_pair_distance_pipeline_steps
        ], memory=memory)

    if config:
        raise ValueError("Leftover config entries: " + pprint.pformat(config))

    return pipeline, scorer
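
# Illustrative sketch of the config dict consumed by instantiate_pipeline(), reconstructed from the
# pop() calls above. Concrete values are hypothetical; "clustering" is only read when
# with_clustering=True, and "mpg_prediction" must stay None in that case.
def _pipeline_config_sketch() -> Dict:
    return {
        "random_seed": 0,
        "classifier": {_TYPE: "RandomForest", _KWARGS: {"n_estimators": 100}},
        "features": {
            "extractors": get_feature_extractors_config_with_all_and_defaults(),
            "selected_features": None,
        },
        "pairs": {
            "mpg_training": None,    # hypothetical: project-specific mention pair sampler options go here
            "mpg_prediction": None,  # must stay None when evaluating via clustering (see the check above)
        },
        # "clustering": {...}        # only consumed when with_clustering=True
    }
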
def perform_prediction_analysis(dataset: Dataset, outcomes: List[pd.DataFrame],
                                num_samples_per_quadrant: int,
                                serialization_dir: Path) -> None:
    """
    Given outcomes from mention pair classifications, computes detailed confusion matrices per link type. Also picks one
    run and samples several instances for each quadrant of the 2x2 confusion matrix and prints those for manual analysis.
    :param dataset: evaluation dataset
    :param outcomes: list of dataframes containing evaluated pairs with predicted and gold labels, one for each run
    :param num_samples_per_quadrant: number of instances sampled per confusion matrix quadrant
    :param serialization_dir: directory to write the analysis results to
    :return:
    """

    # assert that all passed outcome dataframes are compatible
    df_lengths = [len(df) for df in outcomes]
    assert len(set(df_lengths)) == 1

    # check sameness of a-doc-ids and b-mention-ids; if one of those two mismatches, we have a problem anyway
    a_doc_id_hashes = [get_dict_hash(df[IDX_A_DOC].values) for df in outcomes]
    b_mention_id_hashes = [
        get_dict_hash(df[IDX_B_MENTION].values) for df in outcomes
    ]
    assert len(set(a_doc_id_hashes)) == 1
    assert len(set(b_mention_id_hashes)) == 1

    # All dataframes contain the same mention indices for each pair. We only need to keep these once, then we can drop
    # the mention index columns from the outcomes of each run.
    index_df = outcomes[0][[
        IDX_A_DOC, IDX_A_MENTION, IDX_B_DOC, IDX_B_MENTION
    ]].copy()
    for outcomes_df in outcomes:
        outcomes_df.drop(
            columns=[IDX_A_DOC, IDX_A_MENTION, IDX_B_DOC, IDX_B_MENTION],
            inplace=True)

    # In the mention pair index dataframe, label each pair with its type: cross-topic, cross-subtopic,
    # within-subtopic, within-document.
    # First, convert docs to usable format:
    docs = dataset.documents
    docs = pd.concat([
        docs.index.to_frame()[[TOPIC_ID, SUBTOPIC]].reset_index(drop=True),
        docs[DOCUMENT_ID].reset_index(drop=True)
    ], axis=1)

    # Merging resets the index to the default. We want to keep it intact, so that we can concat index_df and the
    # outcomes again later.
    index_df_index = index_df.index
    index_df = index_df.merge(docs,
                              left_on=IDX_A_DOC,
                              right_on=DOCUMENT_ID,
                              how="left")
    index_df = index_df.drop(columns=[DOCUMENT_ID]).rename(
        columns={
            TOPIC_ID: "a-topic-id",
            SUBTOPIC: "a-subtopic"
        })
    index_df = index_df.merge(docs,
                              left_on=IDX_B_DOC,
                              right_on=DOCUMENT_ID,
                              how="left")
    index_df = index_df.drop(columns=[DOCUMENT_ID]).rename(
        columns={
            TOPIC_ID: "b-topic-id",
            SUBTOPIC: "b-subtopic"
        })
    index_df.index = index_df_index

    topic_match = (index_df["a-topic-id"] == index_df["b-topic-id"])
    subtopic_match = (index_df["a-subtopic"] == index_df["b-subtopic"])
    document_match = (index_df[IDX_A_DOC] == index_df[IDX_B_DOC])
    index_df.loc[~topic_match, PAIR_TYPE] = CT
    index_df.loc[topic_match & ~subtopic_match, PAIR_TYPE] = CS
    index_df.loc[topic_match & subtopic_match & ~document_match,
                 PAIR_TYPE] = WS
    index_df.loc[topic_match & subtopic_match & document_match, PAIR_TYPE] = WD
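    # The three boolean comparisons above map to the four link types as follows:
    #   topic_match  subtopic_match  document_match  ->  pair type
    #   False        (any)           (any)               CT (cross-topic)
    #   True         False           (any)               CS (cross-subtopic)
    #   True         True            False               WS (within-subtopic, cross-document)
    #   True         True            True                WD (within-document)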

    # For each run, label each pair with true positive, false positive, etc.
    for outcome_df in outcomes:
        outcome_df.loc[outcome_df[LABEL] & outcome_df[PREDICTION],
                       QUADRANT] = TP
        outcome_df.loc[outcome_df[LABEL] & ~outcome_df[PREDICTION],
                       QUADRANT] = FN
        outcome_df.loc[~outcome_df[LABEL] & outcome_df[PREDICTION],
                       QUADRANT] = FP
        outcome_df.loc[~outcome_df[LABEL] & ~outcome_df[PREDICTION],
                       QUADRANT] = TN

    _create_confusion_matrices(index_df, outcomes, serialization_dir)
    _print_prediction_pairs(index_df, outcomes[0], dataset,
                            num_samples_per_quadrant, serialization_dir)