Example #1
0
def example_doc(request):
    """Build the per-parameter payload for document tests.

    Returns a new dict holding every entry of ``request.param`` plus two
    extra keys:

    * ``document`` -- a ``Document`` built from the ``"file"`` value itself
      (presumably interpreted as a path by ``Document`` -- confirm).
    * ``document_string`` -- a ``Document`` built from the text read out of
      that same file.
    """
    source = request.param["file"]
    with open(source) as handle:
        # Keep the original construction order: the path-based document is
        # created first, while the file handle is still open.
        from_path = Document(source)
        from_text = Document(handle.read())
    return dict(
        request.param,
        document=from_path,
        document_string=from_text,
    )
Example #2
0
def identify_publications(doc: Document) -> Dict[str, float]:
    """
    Determine entities with high probability of being a publication name.

    The only heuristic currently applied is quoting: every double-quoted
    span found in the document text earns 0.25 per occurrence. The score is
    capped at 1.0 so that it always remains a valid probability, no matter
    how often the same span is repeated.

    Arguments:
        doc (Document): The document to search. Only ``doc.text()`` is used.

    Returns:
        Dict[str, float]: A mapping from candidate publication name to the
            probability (between 0.0 and 1.0) that we believe that we've
            made a correct assessment. Keys are the matched text exactly as
            it appears, including the surrounding double quotes. For
            example, {'"My Paper Title"': 1.0} means that we are 100% sure
            that this is a reference to a publication. The mapping is empty
            when the text contains no quoted spans.
    """
    # NOTE: document-ID markers ("arxiv:", "doi:") and an authorship marker
    # ("et al") used to be declared here as locals but were never consulted;
    # they are recorded in this comment as candidate future heuristics.

    # Anything in quotes gets a low probability. The pattern has no groups,
    # so findall() yields the whole match, quotes included.
    quoted_span = re.compile(r'"[^"]+"')

    potential_publications: Dict[str, float] = {}
    for match in quoted_span.findall(doc.text()):
        score = potential_publications.get(match, 0.0) + 0.25
        # Clamp: a probability above 1.0 would contradict the contract.
        potential_publications[match] = min(score, 1.0)

    return potential_publications
Example #3
0
def test_good_documents_pass_detector():
    """Every good document's rendered report must split into two lines.

    A fresh ``PersonalLifeDetector`` is created for each entry of
    ``GOOD_DOCUMENTS``; its report is converted with ``str()`` and split on
    newlines.
    """
    for source in GOOD_DOCUMENTS:
        report = PersonalLifeDetector().get_report(Document(source))
        rendered_lines = str(report).split("\n")
        assert len(rendered_lines) == 2
Example #4
0
def test_bad_documents_trip_detector():
    """Every bad document's rendered report must exceed two lines.

    A fresh ``PersonalLifeDetector`` is created for each entry of
    ``BAD_DOCUMENTS``; its report is converted with ``str()`` and split on
    newlines.
    """
    for source in BAD_DOCUMENTS:
        report = PersonalLifeDetector().get_report(Document(source))
        rendered_lines = str(report).split("\n")
        assert len(rendered_lines) > 2
def test_good_documents_pass_detector():
    """Good documents yield a two-line ``EffortDetector`` report.

    For each entry of ``GOOD_DOCUMENTS`` the report is given the summary
    ``"MY_SUMMARY"`` before being rendered with ``str()`` and split on
    newlines.
    """
    for source in GOOD_DOCUMENTS:
        detector = EffortDetector()
        effort_report = detector.get_report(Document(source))
        effort_report.set_summary("MY_SUMMARY")
        rendered_lines = str(effort_report).split("\n")
        assert len(rendered_lines) == 2
def test_bad_documents_trip_detector():
    """Each bad document must raise at least one ``EffortDetector`` flag."""
    for source in BAD_DOCUMENTS:
        effort_report = EffortDetector().get_report(Document(source))
        raised_flags = effort_report.get_flags()
        assert len(raised_flags) > 0
Example #7
0
def test_good_documents_pass_detector():
    """Good documents must produce no ``PersonalLifeDetector`` flags."""
    for source in GOOD_DOCUMENTS:
        detector = PersonalLifeDetector()
        raised_flags = detector.get_report(Document(source)).get_flags()
        assert len(raised_flags) == 0
Example #8
0
def test_bad_documents_trip_detector():
    """Bad documents' reports must contain the phrase "tends to relate".

    The report text from ``to_string()`` is printed before the assertion,
    for each entry of ``BAD_DOCUMENTS``.
    """
    for source in BAD_DOCUMENTS:
        report = PersonalLifeDetector().get_report(Document(source))
        rendered = report.to_string()
        print(rendered)
        assert "tends to relate" in rendered