Example #1
    # Lower case the first and last names
    df["FIRST_NAME"] = df["FIRST_NAME"].apply(lambda x: str(x).lower())
    df["LAST_NAME"] = df["LAST_NAME"].apply(lambda x: str(x).lower())

    first_names = set(df["FIRST_NAME"].values)
    last_names = set(df["LAST_NAME"].values)

    sample_with_names = preprocess_parallel(has_name,
                                            all_sentences,
                                            chunksize=1000,
                                            names=first_names | last_names)

    print(len(sample_with_names), len(all_sentences))

    subject_id_to_patient_info = get_subject_id_to_patient_info("medcat")

    name_to_subject_id = {}
    for subject_id, patient_info in subject_id_to_patient_info.items():
        name_to_subject_id.setdefault(patient_info.FIRST_NAME.lower(),
                                      []).append(subject_id)
        name_to_subject_id.setdefault(patient_info.LAST_NAME.lower(),
                                      []).append(subject_id)

    found_sentences = 0
    for sample_sentence, sample_names in tqdm(sample_with_names):
        entities = [x for x in get_entities(sample_sentence)]
        subject_ids = list(
            set([
                subject_id for name in sample_names
                if name in name_to_subject_id
                for subject_id in name_to_subject_id[name]
            ]))

def main(model: gensim.models.KeyedVectors, condition_type: str,
         metrics_output_path: str):
    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=50,
                                                max_count=500000)

    condition_code_to_index: Dict[str, int] = dict(
        zip(set_to_use, range(len(set_to_use))))

    mean_condition_embeddings = []
    max_condition_embeddings = []
    all_condition_embeddings = []

    for condition in set_to_use:
        desc = condition_code_to_description[condition]
        condition_embeddings = get_embedding(model, desc)

        mean_embedding = normalize(
            np.mean(condition_embeddings, axis=0, keepdims=True))
        mean_condition_embeddings.append(mean_embedding)

        max_embedding = normalize(
            np.max(condition_embeddings, axis=0, keepdims=True))
        max_condition_embeddings.append(max_embedding)

        all_condition_embeddings.append(normalize(condition_embeddings))

    mean_condition_embeddings = np.concatenate(
        mean_condition_embeddings,
        axis=0)  # Shape = (Num_Conditions, Embedding Size)
    max_condition_embeddings = np.concatenate(
        max_condition_embeddings,
        axis=0)  # Shape = (Num_Conditions, Embedding Size)

    mean_differential_sim, max_differential_sim, all_pair_differential_sim = [], [], []

    for subject_id, patient_info in tqdm(subject_id_to_patient_info.items()):
        name = patient_info.FIRST_NAME + " " + patient_info.LAST_NAME
        name_embeddings = get_embedding(model, name)

        mean_name_embedding = normalize(
            name_embeddings.mean(0))  # Shape = (Embedding Size,)
        max_name_embedding = normalize(
            name_embeddings.max(0))  # Shape = (Embedding Size,)

        mean_similarities = mean_condition_embeddings @ mean_name_embedding
        max_similarities = max_condition_embeddings @ max_name_embedding

        name_embeddings = normalize(name_embeddings)

        all_pair_similarities = []
        for condition_embeddings in all_condition_embeddings:
            similarity_matrix = condition_embeddings @ name_embeddings.T
            all_pair_similarities.append(np.max(similarity_matrix))

        condition_labels = get_condition_labels_as_vector(
            patient_info.CONDITIONS, condition_code_to_index)
        if sum(condition_labels) == 0: continue
        mean_differential_sim.append(
            differential_score(condition_labels, mean_similarities))
        max_differential_sim.append(
            differential_score(condition_labels, max_similarities))
        all_pair_differential_sim.append(
            differential_score(condition_labels, all_pair_similarities))

    print(f"Mean Mean Pos-Neg {np.average(mean_differential_sim)}")
    print(f"SD Mean Pos-Neg {np.std(mean_differential_sim)}")
    print(f"Mean Max Pos-Neg {np.average(max_differential_sim)}")
    print(f"SD Max Pos-Neg {np.std(max_differential_sim)}")
    print(f"Mean All Pair Pos-Neg {np.average(all_pair_differential_sim)}")
    print(f"SD All Pair Pos-Neg {np.std(all_pair_differential_sim)}")
    from experiments.MLM.common import mean_std_as_string
    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Mean Sim", mean_differential_sim))
        f.write(mean_std_as_string("Max Sim", max_differential_sim))
        f.write(mean_std_as_string("All Pair Sim", all_pair_differential_sim))
Example #3
def train_and_evaluate(
    model: BertModel,
    tokenizer: BertTokenizer,
    condition_type: str,
    sampling_bin: int,
    n: int,
    metrics_output_path: str,
):
    """Train and evaluate the model on N conditions.
    @param n is the number of conditions to sample from the bin.
    """
    ### Get Relevant Data

    subject_id_to_patient_info = get_subject_id_to_patient_info(condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count, min_count=0, max_count=500000)

    binned_conditions = get_frequency_bins(condition_code_to_count, condition_type)

    subject_ids = sorted(list(subject_id_to_patient_info.keys()))
    train_subject_ids, test_subject_ids = train_test_split(
        subject_ids, train_size=0.5, random_state=2021, shuffle=True
    )

    ### Filter conditions in each bin so we have at least one positive training example
    ### and one positive test example.
    ### Otherwise, we can't train an LR model or calculate roc_auc_score.

    train_set_conditions = get_non_zero_count_conditions(
        set_to_use, train_subject_ids, subject_id_to_patient_info
    )
    test_set_conditions = get_non_zero_count_conditions(
        set_to_use, test_subject_ids, subject_id_to_patient_info
    )
    binned_conditions = [set(bin_) & train_set_conditions & test_set_conditions for bin_ in binned_conditions]
    binned_conditions = [sorted(list(bin_)) for bin_ in binned_conditions]

    ### Sample condition in selected bin

    condition_bin = binned_conditions[sampling_bin]
    np.random.seed(2021)
    sampled_conditions = np.random.choice(condition_bin, size=n, replace=False)

    ## Train a Classifier for Each Condition

    auc_score_list, precision_at_10_list = [], []
    for condition in tqdm(sampled_conditions):
        desc = condition_code_to_description[condition]

        ## Get all train templates and labels for all train patients, for this condition
        train_templates = []
        train_labels = []
        for subject_id in train_subject_ids:
            patient_info = subject_id_to_patient_info[subject_id]
            template = generate_name_condition_template(
                patient_info.FIRST_NAME, patient_info.LAST_NAME, patient_info.GENDER, desc
            )
            label = condition in patient_info.CONDITIONS

            train_templates.append(template)
            train_labels.append(label)

        ## Resample to Upsample positive examples

        negative_indices = [i for i, x in enumerate(train_labels) if x == 0]
        positive_indices = [i for i, x in enumerate(train_labels) if x == 1]

        # We set replace=False in another file; does this matter?
        positive_indices = resample(
            positive_indices, replace=True, n_samples=len(negative_indices), random_state=2021
        )
        total_indices = negative_indices + positive_indices

        train_templates = [train_templates[i] for i in total_indices]
        train_labels = [train_labels[i] for i in total_indices]

        ## Train the probing classifier (an MLP here)

        train_embeddings = get_cls_embeddings(model, tokenizer, train_templates, disable_tqdm=True)
        clf = MLPClassifier(hidden_layer_sizes=(128,), random_state=2021).fit(train_embeddings, train_labels)

        ## Get all test templates and labels for all test patients, for this condition

        test_templates = []
        test_labels = []

        for subject_id in test_subject_ids:
            patient_info = subject_id_to_patient_info[subject_id]
            template = generate_name_condition_template(
                patient_info.FIRST_NAME, patient_info.LAST_NAME, patient_info.GENDER, desc
            )
            label = condition in patient_info.CONDITIONS

            test_templates.append(template)
            test_labels.append(label)

        ## Get Embeddings for all test patients, and make prediction with LR model

        test_embeddings = get_cls_embeddings(model, tokenizer, test_templates, disable_tqdm=True)
        test_predictions = clf.predict_proba(test_embeddings)[:, 1]

        auc_score = roc_auc_score(test_labels, test_predictions)
        precision_at_10 = precision_at_k(test_labels, test_predictions, k=10)

        auc_score_list.append(auc_score)
        precision_at_10_list.append(precision_at_10)

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Model AUC", auc_score_list))
        f.write(mean_std_as_string("Model P@K", precision_at_10_list))
Example #4
def main(model, tokenizer: BertTokenizerFast, condition_type: str,
         metrics_output_path: str):
    """Compute the BERT representations + cosine similarities."""

    ## Get Relevant data

    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=500000)

    condition_code_to_index: Dict[str, int] = dict(
        zip(set_to_use, range(len(set_to_use))))

    mean_differential_sim, max_differential_sim, all_pair_differential_sim = [], [], []

    ## For each patient and condition, get a template, pass through BERT and return similarities

    all_subject_ids = sorted(list(subject_id_to_patient_info.keys()))
    all_subject_ids = sorted(
        resample(all_subject_ids,
                 replace=False,
                 n_samples=10000,
                 random_state=2021))

    for subject_id in tqdm(all_subject_ids):
        patient_info = subject_id_to_patient_info[subject_id]
        templates = []
        for condition in set_to_use:
            desc = condition_code_to_description[condition]
            templates.append(
                generate_template(patient_info.FIRST_NAME,
                                  patient_info.LAST_NAME, patient_info.GENDER,
                                  desc))

        name = patient_info.FIRST_NAME + " " + patient_info.LAST_NAME
        name_length = len(tokenizer.tokenize(name))

        ## The following indices may change if we change the template structure.
        ## They assume the structure: [CLS] {title} {name} is a yo patient with {condition} [SEP]
        example_template = tokenizer.tokenize(templates[0])
        name_start_index = 2  # Name Starts after [CLS] {title}
        name_end_index = name_start_index + name_length
        condition_start_index = example_template.index("patient") + 2
        condition_end_index = -1

        assert (tokenizer.convert_tokens_to_string(
            example_template[name_start_index:name_end_index]) == " ".join(
                name.lower().split())), breakpoint()
        assert (tokenizer.convert_tokens_to_string(
            example_template[condition_start_index:condition_end_index]) ==
                " ".join(condition_code_to_description[
                    set_to_use[0]].lower().split())), breakpoint()

        ## Pass all templates to BERT and return similarities

        mean_similarities, max_similarities, all_pair_similarities = get_name_condition_similarities(
            model,
            tokenizer,
            templates,
            name_start_index,
            name_end_index,
            condition_start_index,
            condition_end_index,
        )

        condition_labels = get_condition_labels_as_vector(
            patient_info.CONDITIONS, condition_code_to_index)

        mean_differential_sim.append(
            differential_score(condition_labels, mean_similarities))
        max_differential_sim.append(
            differential_score(condition_labels, max_similarities))
        all_pair_differential_sim.append(
            differential_score(condition_labels, all_pair_similarities))

    print(f"Mean Mean Pos-Neg {np.average(mean_differential_sim)}")
    print(f"SD Mean Pos-Neg {np.std(mean_differential_sim)}")
    print(f"Mean Max Pos-Neg {np.average(max_differential_sim)}")
    print(f"SD Max Pos-Neg {np.std(max_differential_sim)}")
    print(f"Mean All Pair Pos-Neg {np.average(all_pair_differential_sim)}")
    print(f"SD All Pair Pos-Neg {np.std(all_pair_differential_sim)}")

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Mean Sim", mean_differential_sim))
        f.write(mean_std_as_string("Max Sim", max_differential_sim))
        f.write(mean_std_as_string("All Pair Sim", all_pair_differential_sim))
def run_probe(
    model: BertModel,
    tokenizer: BertTokenizerFast,
    condition_type: str,
    template_mode: str,
    prober: str,
    metrics_output_path: str,
):
    """Train and evaluate the model trained on the data.

    Args:
        condition_type: icd9 or Stanza
        template_mode: One of [name_and_condition, condition_only].
                        Specifies whether the name should be included in the template
        prober: LR or MLP
    """

    ### Get Relevant Data

    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=500000)

    condition_code_to_index: Dict[str, int] = dict(
        zip(set_to_use, range(len(set_to_use))))

    ## Divide patients into train and test group

    all_subject_ids = sorted(list(subject_id_to_patient_info.keys()))

    ## Sample 10K subjects because running all ~27K takes too long
    n_samples = min(len(all_subject_ids), 10000)
    all_subject_ids = sorted(
        resample(all_subject_ids,
                 replace=False,
                 n_samples=n_samples,
                 random_state=2021))

    train_subject_ids, test_subject_ids = train_test_split(all_subject_ids,
                                                           random_state=2021,
                                                           test_size=0.5,
                                                           shuffle=True)

    print(f"Train Subject Ids : {len(train_subject_ids)}")
    print(f"Test Subject Ids : {len(test_subject_ids)}")

    ## Get training example by generating template for all train patients and all conditions

    subject_condition_templates = []
    subject_condition_labels = []

    for subject_id in train_subject_ids:
        patient_info = subject_id_to_patient_info[subject_id]
        for condition in set_to_use:
            desc = condition_code_to_description[condition]
            if template_mode == "name_and_condition":
                template = generate_name_condition_template(
                    patient_info.FIRST_NAME, patient_info.LAST_NAME,
                    patient_info.GENDER, desc)
            elif template_mode == "condition_only":
                template = generate_condition_only_template(desc)
            else:
                raise NotImplementedError(f"{template_mode} is not available")
            subject_condition_templates.append(template)

        condition_labels = get_condition_labels_as_vector(
            patient_info.CONDITIONS, condition_code_to_index)
        subject_condition_labels += list(condition_labels)

    ## Downsample negative labels since most patients only have few positive conditions

    negative_indices = [
        i for i, x in enumerate(subject_condition_labels) if x == 0
    ]
    positive_indices = [
        i for i, x in enumerate(subject_condition_labels) if x == 1
    ]

    negative_indices = resample(negative_indices,
                                replace=False,
                                n_samples=len(positive_indices),
                                random_state=2021)
    total_indices = negative_indices + positive_indices

    train_templates = [subject_condition_templates[i] for i in total_indices]
    train_labels = [subject_condition_labels[i] for i in total_indices]

    print(len(train_templates))

    ## Get the [CLS] token embedding for each template and train the probing classifier (LR or MLP)

    train_cls_embeddings = get_cls_embeddings(model, tokenizer,
                                              train_templates)

    print(f"Training {prober} Model")
    if prober == "LR":
        classifier = LogisticRegression(random_state=2021, max_iter=10000).fit(
            train_cls_embeddings, train_labels)
    elif prober == "MLP":
        classifier = MLPClassifier(hidden_layer_sizes=(128, ),
                                   random_state=2021).fit(
                                       train_cls_embeddings, train_labels)
    else:
        raise NotImplementedError(f"{prober} not implemented")
    print(f"{prober} Model Trained")

    ## Get templates and labels for test set patients

    auc_scores, paks = [], []

    for subject_id in tqdm(test_subject_ids):
        test_templates = []
        patient_info = subject_id_to_patient_info[subject_id]
        for condition in set_to_use:
            desc = condition_code_to_description[condition]
            if template_mode == "name_and_condition":
                template = generate_name_condition_template(
                    patient_info.FIRST_NAME, patient_info.LAST_NAME,
                    patient_info.GENDER, desc)
            elif template_mode == "condition_only":
                template = generate_condition_only_template(desc)
            else:
                raise NotImplementedError(f"{template_mode} is not available")
            test_templates.append(template)

        condition_labels = get_condition_labels_as_vector(
            patient_info.CONDITIONS, condition_code_to_index)
        test_labels = list(condition_labels)

        test_cls_embeddings = get_cls_embeddings(model,
                                                 tokenizer,
                                                 test_templates,
                                                 disable_tqdm=True)
        test_predictions = classifier.predict_proba(test_cls_embeddings)[:, 1]

        try:
            model_auc = roc_auc_score(test_labels, test_predictions)
            model_precision_at_k = precision_at_k(test_labels,
                                                  test_predictions,
                                                  k=10)

            auc_scores.append(model_auc)
            paks.append(model_precision_at_k)
        except ValueError:
            # roc_auc_score fails if the test labels contain only one class for this patient
            continue

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Model AUC", auc_scores))
        f.write(mean_std_as_string("Model P@K", paks))
def evaluate(
    model: BertForMaskedLM,
    tokenizer: BertTokenizer,
    condition_type: str,
    template_idx: int,
    max_count: int,
    metrics_output_path: str,
):
    """
    Evaluate how well the model can predict the conditions associated with patient names
    (via a masked language modelling task on templates).

    For each patient,
        For each condition,
            create template with name filled and condition masked.\\
            get MLM probability distribution for [MASK] tokens.\\
            get average probability of condition's wordpieces.\\
        Compute ROC / P@10 against true condition labels.
    Report Average over patients.

    Ideally, we want a higher average probability for conditions the patient has than for
    conditions they don't have.

    We also include a condition-only baseline (where the template doesn't contain the patient
    name). The algorithm above remains the same.

    ### Args:
        condition_type: Which conditions to load for patients. Currently takes values in [icd9, medcat]
        template_idx: Which template to use for probing the model.
    """

    ### Load relevant data

    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=max_count)

    print(len(set_to_use))
    condition_code_to_index: Dict[str, int] = dict(
        zip(set_to_use, range(len(set_to_use))))

    ### Get list of unique condition lengths (in wordpieces) to generate templates

    condition_code_to_wordpiece_ids: Dict[str, List[str]] = {
        condition: tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(condition_code_to_description[condition]))
        for condition in set_to_use
    }  ## Time Saving Measure since we only need condition wordpiece ids moving forward

    condition_wordpiece_lengths = [
        len(condition_code_to_wordpiece_ids[condition])
        for condition in set_to_use
    ]

    from collections import Counter

    print(sorted(list(Counter(condition_wordpiece_lengths).items())))
    set_to_use_lengths = np.array(condition_wordpiece_lengths)

    condition_wordpiece_lengths: List[int] = sorted(
        list(set(condition_wordpiece_lengths)))  ## Keep Unique Lengths only

    ### Get Condition Frequency counts

    condition_baseline_counts = np.array(
        get_condition_counts_as_vector(condition_code_to_count,
                                       condition_code_to_index))
    condition_baseline_freq = condition_baseline_counts / np.sum(
        condition_baseline_counts)

    ### Get Condition only template logits

    ## Generate Template for each unique condition wordpiece length
    condition_only_templates = [
        condition_only_template(length)
        for length in condition_wordpiece_lengths
    ]

    logits = get_logits_from_templates(model,
                                       tokenizer,
                                       condition_only_templates,
                                       normalize=True)
    logits = {
        length: logit
        for length, logit in zip(condition_wordpiece_lengths, logits)
    }

    # The start index is always 1 here; it is computed anyway to keep the code consistent with the per-patient case below.
    start_indices = [
        tokenizer.tokenize(template).index("[MASK]")
        for template in condition_only_templates
    ]
    start_indices = {
        length: start_index
        for length, start_index in zip(condition_wordpiece_lengths,
                                       start_indices)
    }

    condition_only_logits: List[float] = []
    for condition in set_to_use:
        condition_wp_ids = condition_code_to_wordpiece_ids[condition]
        condition_length = len(condition_wp_ids)
        condition_only_logits.append(
            get_average_predicted_score(logits[condition_length],
                                        condition_wp_ids,
                                        start_indices[condition_length]))

    condition_only_logits = normalize_logits(condition_only_logits,
                                             condition_baseline_freq,
                                             condition_wordpiece_lengths,
                                             set_to_use_lengths)

    ### Get Subject Specific Condition logits

    k = 10
    rocs, precisions_at_k, spearmans = {}, {}, {}
    for length in condition_wordpiece_lengths + ["all"]:
        rocs[length] = {"baseline": [], "condition_only": [], "model": []}
        precisions_at_k[length] = {
            "baseline": [],
            "condition_only": [],
            "model": []
        }
        spearmans[length] = {"baseline": [], "condition_only": [], "model": []}

        if length == "all":
            rocs[length]["bin_prob"] = []
            precisions_at_k[length]["bin_prob"] = []
            spearmans[length]["bin_prob"] = []

    patient_to_run = [
        sum(
            get_condition_labels_as_vector(patient_info.CONDITIONS,
                                           condition_code_to_index)) > 0
        for subject_id, patient_info in subject_id_to_patient_info.items()
    ]

    print(sum(patient_to_run))

    condition_bin_prob_baselines = np.zeros_like(condition_baseline_freq)
    for length in condition_wordpiece_lengths:
        mask = set_to_use_lengths == length
        condition_bin_prob_baselines[mask] = condition_baseline_freq[
            mask].mean()

    for subject_id, patient_info in tqdm(subject_id_to_patient_info.items()):
        condition_labels = get_condition_labels_as_vector(
            patient_info.CONDITIONS, condition_code_to_index)
        condition_labels = np.array(condition_labels)

        if condition_labels.sum() == 0:
            continue  ## Skip if patient is negative for all conditions

        ## Generate Template for each unique condition wordpiece length
        templates = []
        for length in condition_wordpiece_lengths:
            template = name_with_condition_template(patient_info.FIRST_NAME,
                                                    patient_info.LAST_NAME,
                                                    patient_info.GENDER,
                                                    length, template_idx)
            templates.append(template)

        ## Get logits for all templates
        logits = get_logits_from_templates(model,
                                           tokenizer,
                                           templates,
                                           normalize=True)
        logits = {
            length: logit
            for length, logit in zip(condition_wordpiece_lengths, logits)
        }

        ## Get Start index for (masked) condition in each template
        ## The [MASK] start index should be the same for every template here (only the masked span length differs), but compute it per template for safety.
        start_indices = [
            tokenizer.tokenize(template).index("[MASK]")
            for template in templates
        ]
        start_indices = {
            length: start_index
            for length, start_index in zip(condition_wordpiece_lengths,
                                           start_indices)
        }

        ## For each condition, get corresponding logit array and then compute average score
        condition_subject_logits = []
        for condition in set_to_use:
            condition_wp_ids = condition_code_to_wordpiece_ids[condition]
            condition_length = len(condition_wp_ids)
            condition_subject_logits.append(
                get_average_predicted_score(logits[condition_length],
                                            condition_wp_ids,
                                            start_indices[condition_length]))

        condition_subject_logits = normalize_logits(
            condition_subject_logits, condition_baseline_freq,
            condition_wordpiece_lengths, set_to_use_lengths)

        for length in condition_wordpiece_lengths + ["all"]:
            mask = (set_to_use_lengths == length if length != "all" else
                    np.full_like(set_to_use_lengths, True, dtype=bool))
            length_condition_baseline_counts = condition_baseline_counts[mask]
            length_condition_only_logits = condition_only_logits[mask]
            length_condition_subject_logits = condition_subject_logits[mask]
            length_condition_labels = condition_labels[mask]

            if length_condition_labels.sum() == 0 or mask.sum() < 2:
                ## Skip if the patient has no positive condition in this bin,
                ## or the bin contains fewer than two conditions
                continue

            ### Calculate and store metrics for this patient
            _baseline_roc = roc_auc_score(length_condition_labels,
                                          length_condition_baseline_counts)
            _condition_only_roc = roc_auc_score(length_condition_labels,
                                                length_condition_only_logits)
            _model_roc = roc_auc_score(length_condition_labels,
                                       length_condition_subject_logits)

            rocs[length]["baseline"].append(_baseline_roc)
            rocs[length]["condition_only"].append(_condition_only_roc)
            rocs[length]["model"].append(_model_roc)

            _baseline_spearman = spearmanr(
                length_condition_baseline_counts,
                length_condition_baseline_counts).correlation
            _condition_only_spearman = spearmanr(
                length_condition_baseline_counts,
                length_condition_only_logits).correlation
            _model_spearman = spearmanr(
                length_condition_baseline_counts,
                length_condition_subject_logits).correlation

            spearmans[length]["baseline"].append(_baseline_spearman)
            spearmans[length]["condition_only"].append(
                _condition_only_spearman)
            spearmans[length]["model"].append(_model_spearman)

            _model_precision_at_k = precision_at_k(
                length_condition_labels, length_condition_subject_logits, k)
            _condition_only_precision_at_k = precision_at_k(
                length_condition_labels, length_condition_only_logits, k)
            _baseline_precision_at_k = precision_at_k(
                length_condition_labels, length_condition_baseline_counts, k)

            precisions_at_k[length]["baseline"].append(
                _baseline_precision_at_k)
            precisions_at_k[length]["condition_only"].append(
                _condition_only_precision_at_k)
            precisions_at_k[length]["model"].append(_model_precision_at_k)

            if length == "all":
                rocs[length]["bin_prob"].append(
                    roc_auc_score(length_condition_labels,
                                  condition_bin_prob_baselines))
                spearmans[length]["bin_prob"].append(
                    spearmanr(length_condition_baseline_counts,
                              condition_bin_prob_baselines).correlation)
                precisions_at_k[length]["bin_prob"].append(
                    precision_at_k(length_condition_labels,
                                   condition_bin_prob_baselines, k))

    ### Computing and print metrics (averaged over patients)
    with open(f"{metrics_output_path}/results.txt", "w") as f:
        for length in ["all"] + condition_wordpiece_lengths:
            bin_length = (set_to_use_lengths == length if length != "all" else
                          np.full_like(set_to_use_lengths, True, dtype=bool))

            f.write(
                f"Length {length} # Num conditions in Length bin {bin_length.sum()}\n"
            )
            if len(rocs[length]["model"]) == 0:
                continue

            for method, values in rocs[length].items():
                f.write(mean_std_as_string(f"{method} AUC", values))

            for method, values in precisions_at_k[length].items():
                f.write(mean_std_as_string(f"{method} P@K", values))

            for method, values in spearmans[length].items():
                f.write(mean_std_as_string(f"{method} Spearman", values))

            f.write("\n")
Example #7
def train_and_evaluate(
    model: BertForSequenceClassification,
    tokenizer: BertTokenizer,
    condition_type: str,
    sampling_bin: int,
    n: int,
    metrics_output_path: str,
):
    """Train and evaluate the model on N conditions.
    @param model is the model to encode CLS tokens with.
    @param tokenizer is a BERT tokenizer.
    @param condition_type are we using the icd/medcat extracted conditions?
    @param b is which frequency to sample from.
    @param n is the number of conditions to sample the bin from.
    @return all AUCs and precision @ K scores.
    """
    ### Get Relevant Data

    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=500000)

    binned_conditions = get_frequency_bins(condition_code_to_count,
                                           condition_type)

    subject_ids = sorted(list(subject_id_to_patient_info.keys()))
    train_subject_ids, test_subject_ids = train_test_split(subject_ids,
                                                           train_size=0.5,
                                                           random_state=2021,
                                                           shuffle=True)

    ### Filter conditions in each bin so we have at least one positive training example
    ### and one positive test example.
    ### Otherwise, we can't train a classifier or calculate roc_auc_score.

    train_set_conditions = get_non_zero_count_conditions(
        set_to_use, train_subject_ids, subject_id_to_patient_info)
    test_set_conditions = get_non_zero_count_conditions(
        set_to_use, test_subject_ids, subject_id_to_patient_info)
    binned_conditions = [
        set(bin_) & train_set_conditions & test_set_conditions
        for bin_ in binned_conditions
    ]
    binned_conditions = [sorted(list(bin_)) for bin_ in binned_conditions]

    ### Sample condition in selected bin

    condition_bin = binned_conditions[sampling_bin]
    np.random.seed(2021)
    sampled_conditions = np.random.choice(condition_bin, size=n, replace=False)

    ## Train a Classifier for Each Condition

    auc_score_list, precision_at_10_list = [], []
    for condition in tqdm(sampled_conditions):

        desc = condition_code_to_description[condition]
        train_templates = []
        train_labels = []
        for subject_id in train_subject_ids:
            patient_info = subject_id_to_patient_info[subject_id]
            template = generate_name_condition_template(
                patient_info.FIRST_NAME, patient_info.LAST_NAME,
                patient_info.GENDER, desc)
            label = condition in patient_info.CONDITIONS

            train_templates.append(template)
            train_labels.append(label)

        ## Resample to Upsample positive examples

        negative_indices = [i for i, x in enumerate(train_labels) if x == 0]
        positive_indices = [i for i, x in enumerate(train_labels) if x == 1]

        positive_indices = resample(positive_indices,
                                    replace=True,
                                    n_samples=len(negative_indices),
                                    random_state=2021)
        total_indices = negative_indices + positive_indices

        ### Divide Train Set into Train and Validation Set

        training_indices, validation_indices = train_test_split(
            total_indices, train_size=0.85, random_state=2021, shuffle=True)

        # Note: this split does not guarantee that the validation set contains a positive example,
        # or, if there is only one positive example, that it stays out of the validation set.
        validation_templates = [train_templates[i] for i in validation_indices]
        validation_labels = [train_labels[i] for i in validation_indices]

        np.random.seed(2021)
        np.random.shuffle(training_indices)
        train_templates = [train_templates[i] for i in training_indices]
        train_labels = [train_labels[i] for i in training_indices]

        ### Train the BERT Model

        train_dataset = get_as_dataset(tokenizer, train_templates,
                                       train_labels)
        validation_dataset = get_as_dataset(tokenizer, validation_templates,
                                            validation_labels)

        clf = train_model(model, train_dataset, validation_dataset)

        ### Get Test Templates

        test_templates = []
        test_labels = []
        for subject_id in test_subject_ids:
            patient_info = subject_id_to_patient_info[subject_id]
            template = generate_name_condition_template(
                patient_info.FIRST_NAME, patient_info.LAST_NAME,
                patient_info.GENDER, desc)
            label = condition in patient_info.CONDITIONS

            test_templates.append(template)
            test_labels.append(label)

        ### Get Test Predictions
        test_dataset = get_as_dataset(tokenizer, test_templates, test_labels)
        test_predictions = clf.predict(test_dataset)

        test_predictions = test_predictions.predictions[:, 1]

        ### Calculate Metrics

        auc_score = roc_auc_score(test_labels, test_predictions)
        precision_at_10 = precision_at_k(test_labels, test_predictions, k=10)

        auc_score_list.append(auc_score)
        precision_at_10_list.append(precision_at_10)

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Model AUC", auc_score_list))
        f.write(mean_std_as_string("Model P@K", precision_at_10_list))
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--condition-type", required=True)
parser.add_argument("--output-file", required=True)

args = parser.parse_args()


def generate_name_condition_template(first_name: str, last_name: str,
                                     gender: str, condition_description: str):
    title = "Mr" if gender == "M" else "Mrs"  # I guess just assume married w/e idk ?
    return f"{title} {first_name} {last_name} is a yo patient with {condition_description}"


data = get_subject_id_to_patient_info(args.condition_type)
desc = get_condition_code_to_descriptions(args.condition_type)

templates = [
    "\n".join([
        generate_name_condition_template(d.FIRST_NAME, d.LAST_NAME, d.GENDER,
                                         desc[c]) for c in d.CONDITIONS
    ]) for d in data.values()
]
subject_ids = list(data.keys())

import pandas as pd

df = pd.DataFrame({"SUBJECT_ID": subject_ids, "TEXT": templates})

df.to_csv(args.output_file, index=False)
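# Example invocation of this script (the filename is hypothetical):
#   python generate_templates.py --condition-type icd9 --output-file templates.csv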