Beispiel #1
0
def test_predict_tags_mesh_cnn(mesh_cnn_path, mesh_label_binarizer_path):
    """Exercise ``predict_tags`` with the ``mesh-cnn`` approach.

    Four scenarios are checked in order: the default call, the call with
    ``probabilities=True``, and the two extreme thresholds (0 and 1).
    """

    def run(**options):
        # Model and label binarizer paths are passed positionally; only
        # the keyword options differ between the scenarios below.
        return predict_tags(X,
                            mesh_cnn_path,
                            mesh_label_binarizer_path,
                            approach="mesh-cnn",
                            **options)

    # Default call: five results expected (presumably one per text in the
    # module-level X -- X is defined outside this function).
    assert len(run()) == 5

    # With probabilities requested, every reported value must be in [0, 1].
    for text_tags in run(probabilities=True):
        for prob in text_tags.values():
            assert 0 <= prob <= 1.0

    # threshold=0: each text is expected to carry 5000 tags (presumably
    # the full label set of the fixture model -- confirm against fixture).
    for text_tags in run(threshold=0):
        assert len(text_tags) == 5000

    # threshold=1: no tag is expected for any text.
    for text_tags in run(threshold=1):
        assert len(text_tags) == 0
Beispiel #2
0
def test_predict_tags_science_ensemble(science_ensemble_path,
                                       label_binarizer_path):
    """Exercise ``predict_tags`` with the ``science-ensemble`` approach.

    Four scenarios are checked in order: the default call, the call with
    ``probabilities=True``, and the two extreme thresholds (0 and 1).
    """

    def run(**options):
        # Paths are passed by keyword; only the extra options differ
        # between the scenarios below.
        return predict_tags(X,
                            model_path=science_ensemble_path,
                            label_binarizer_path=label_binarizer_path,
                            approach="science-ensemble",
                            **options)

    # Default call: five results expected (presumably one per text in the
    # module-level X -- X is defined outside this function).
    assert len(run()) == 5

    # With probabilities requested, every reported value must be in [0, 1].
    for text_tags in run(probabilities=True):
        for prob in text_tags.values():
            assert 0 <= prob <= 1.0

    # threshold=0: each text is expected to carry 24 tags (presumably the
    # full label set of the fixture binarizer -- confirm against fixture).
    for text_tags in run(threshold=0):
        assert len(text_tags) == 24

    # threshold=1: no tag is expected for any text.
    for text_tags in run(threshold=1):
        assert len(text_tags) == 0
Beispiel #3
0
def test_predict_tags_tfidf_svm(tfidf_svm_path, label_binarizer_path):
    """Exercise ``predict_tags`` with the ``tfidf-svm`` approach.

    Four scenarios are checked in order: the default call, the call with
    ``probabilities=True``, and the two extreme thresholds (0 and 1).
    """

    def run(**options):
        # Paths are passed by keyword; only the extra options differ
        # between the scenarios below.
        return predict_tags(X,
                            model_path=tfidf_svm_path,
                            label_binarizer_path=label_binarizer_path,
                            approach="tfidf-svm",
                            **options)

    # Default call: five results expected (presumably one per text in the
    # module-level X -- X is defined outside this function).
    assert len(run()) == 5

    # With probabilities requested, every reported value must be in [0, 1].
    for text_tags in run(probabilities=True):
        for prob in text_tags.values():
            assert 0 <= prob <= 1.0

    # threshold=0: each text is expected to carry 24 tags (presumably the
    # full label set of the fixture binarizer -- confirm against fixture).
    for text_tags in run(threshold=0):
        assert len(text_tags) == 24

    # threshold=1: no tag is expected for any text.
    for text_tags in run(threshold=1):
        assert len(text_tags) == 0
Beispiel #4
0
def test_predict_tags_mesh_xlinear(mesh_xlinear_path,
                                   mesh_label_binarizer_path):
    """Exercise ``predict_tags`` with the ``mesh-xlinear`` approach.

    Four scenarios are checked in order: the default call, the call with
    ``probabilities=True``, and the two extreme thresholds (0 and 1).
    """
    # We need to pass parameters because the load function is different
    # depending on the vectorizer library (pecos or sklearn)
    parameters = str({'vectorizer_library': 'sklearn'})

    def run(**options):
        # Every scenario forwards the same stringified parameters; only
        # the extra options differ between the scenarios below.
        return predict_tags(X,
                            mesh_xlinear_path,
                            mesh_label_binarizer_path,
                            approach="mesh-xlinear",
                            parameters=parameters,
                            **options)

    # Default call: five results expected (presumably one per text in the
    # module-level X -- X is defined outside this function).
    assert len(run()) == 5

    # With probabilities requested, every reported value must be in [0, 1].
    for text_tags in run(probabilities=True):
        for prob in text_tags.values():
            assert 0 <= prob <= 1.0

    # threshold=0: each text is expected to carry 5000 tags (presumably
    # the full label set of the fixture model -- confirm against fixture).
    for text_tags in run(threshold=0):
        assert len(text_tags) == 5000

    # threshold=1: no tag is expected for any text.
    for text_tags in run(threshold=1):
        assert len(text_tags) == 0
def tag_grants(grants_path,
               tagged_grants_path,
               model_path,
               label_binarizer_path,
               approach,
               threshold=0.5,
               grant_id_field="grant_id",
               grant_text_fields=("title", "synopsis"),
               text_null_value="No Data Entered"):
    """Tag grants in batches and write one CSV row per (grant, tag) pair.

    Grants are read in batches of 128 via ``yield_batched_grants``. For
    each grant the configured text fields are concatenated, the tags are
    predicted with ``predict_tags`` and the result is written to
    ``tagged_grants_path`` with the columns ``Grant id``, ``Tag`` and
    ``Prob``.

    Args:
        grants_path: Location of the grants, forwarded unchanged to
            ``yield_batched_grants``.
        tagged_grants_path: Path of the CSV file to create (overwritten
            if it already exists).
        model_path: Forwarded unchanged to ``predict_tags``.
        label_binarizer_path: Forwarded unchanged to ``predict_tags``.
        approach: Forwarded unchanged to ``predict_tags``.
        threshold: Forwarded to ``predict_tags`` as ``threshold``.
        grant_id_field: Key of each grant whose value is written to the
            ``Grant id`` column.
        grant_text_fields: Keys of each grant whose string values are
            joined (with a space) to form the text that gets tagged.
        text_null_value: Placeholder substring removed from every text
            field before joining.

    Grants whose text is empty after cleaning are skipped and produce no
    rows. A batch in which every grant is skipped is not sent to
    ``predict_tags`` at all.
    """
    # The csv module requires files handed to a writer to be opened with
    # newline="", otherwise platforms that translate "\n" emit "\r\r\n".
    with open(tagged_grants_path, "w", newline="") as tagged_grants_tf:
        fieldnames = ["Grant id", "Tag", "Prob"]
        csv_writer = csv.DictWriter(tagged_grants_tf,
                                    fieldnames=fieldnames,
                                    delimiter=",",
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
        csv_writer.writeheader()

        for grants in yield_batched_grants(grants_path, 128):
            # Build the grant list and the text list together so they
            # stay index-aligned. Dropping empty texts from the text list
            # alone would shift every later prediction onto the wrong
            # grant id when the two are zipped below.
            taggable_grants = []
            grants_text = []
            for grant in grants:
                raw_text = " ".join(
                    grant[field].replace(text_null_value, "")
                    for field in grant_text_fields)
                # Removes consecutive white spaces which are uninformative
                # and may cause error #30
                text = " ".join(raw_text.split())
                if text:  # Removes empty text
                    taggable_grants.append(grant)
                    grants_text.append(text)

            if not grants_text:
                # Nothing to predict for this batch.
                continue

            grants_tags = predict_tags(grants_text,
                                       model_path,
                                       label_binarizer_path,
                                       approach,
                                       probabilities=True,
                                       threshold=threshold)

            for grant, tags in zip(taggable_grants, grants_tags):
                for tag, prob in tags.items():
                    csv_writer.writerow({
                        'Grant id': grant[grant_id_field],
                        'Tag': tag,
                        'Prob': prob
                    })

            # Flush after every batch so finished batches reach the file
            # while later batches are still being processed.
            tagged_grants_tf.flush()
    },
}

# Sidebar: choose one of the configured models by its key in `models`.
model_option = st.sidebar.selectbox("Model", options=list(models.keys()))
# Each entry is indexed below by "model_path", "label_binarizer_path"
# and "approach".
model = models[model_option]

# Sidebar: whether to show a tag/probability table instead of buttons.
probabilities = st.sidebar.checkbox("Display probabilities")

# Halt this script run while the input still equals DEFAULT_TEXT
# (presumably the untouched placeholder text -- `text` and DEFAULT_TEXT
# are defined outside this chunk).
if text == DEFAULT_TEXT:
    st.stop()

with st.spinner("Calculating tags..."):
    # predict_tags is given a one-element list of texts, so only the
    # first (and only) result is kept.
    # NOTE(review): `threshold` is defined outside this chunk.
    tags = predict_tags(
        [text],
        model["model_path"],
        model["label_binarizer_path"],
        model["approach"],
        probabilities=probabilities,
        threshold=threshold,
    )
    tags = tags[0]
st.success("Done!")

if probabilities:
    # Here `tags` is used as a mapping of tag -> probability; only the
    # entries strictly above the threshold are shown in the table.
    tag_probs = [
        {"Tag": tag, "Prob": prob} for tag, prob in tags.items() if prob > threshold
    ]
    st.table(pd.DataFrame(tag_probs))
    # Rebind `tags` to just the names of the tags shown in the table.
    tags = [tag_prob["Tag"] for tag_prob in tag_probs]
else:
    # Without probabilities, render one button per predicted tag.
    for tag in tags:
        st.button(tag)