Example #1
0
def test_string_content():
    """Text read from a stored PDF is available as a whole and per page.

    Saves the ``matmod_exam_des_2017.pdf`` fixture on a ``Pdf`` object, calls
    ``read_text()``, and after reloading from the database checks both the
    combined ``text`` and the related ``pages`` objects.
    """
    fixture = Path(__file__).parent / 'data' / 'matmod_exam_des_2017.pdf'
    payload = ContentFile(fixture.read_bytes())
    digest = '0000000000000000000000000000000000000000'
    file_name = digest + '.pdf'

    stored_pdf = Pdf(sha1_hash=digest)
    stored_pdf.file.save(name=file_name, content=payload)
    stored_pdf.read_text()
    stored_pdf.save()
    stored_pdf.refresh_from_db()

    # The combined text must be a unicode string
    full_text = stored_pdf.text
    assert isinstance(full_text, str)

    # Form feeds delimit the pages in the combined text
    chunks = full_text.split('\f')
    assert len(chunks) == 6
    for expected_phrase in ('Rottman', 'population model'):
        assert expected_phrase in full_text
    assert 'this is not in the exam' not in full_text

    # One PdfPage object is expected for each page
    page_set = stored_pdf.pages.all()
    assert page_set.count() == 6

    # Iteration order should follow the page number
    for expected_number, pdf_page in enumerate(page_set):
        assert pdf_page.number == expected_number
        assert pdf_page.confidence is None

    # Each page should contain only its own content
    cover_text = page_set[0].text
    third_text = page_set[2].text
    assert 'Rottman' in cover_text
    assert 'Rottman' not in third_text

    assert 'population model' in third_text
    assert 'population model' not in cover_text
Example #2
0
    def test_classify_pdf(self):
        """Exam type should be determinable from pdf content.

        Walks ``Pdf.classify()`` through four situations: no pages saved yet,
        a cover page present, a stale (mutated) classification result, and
        the presence of user-verified document info.
        """
        # The PDF contains the following content
        sha1_hash = '0000000000000000000000000000000000000000'
        pdf = Pdf(sha1_hash=sha1_hash)
        text = """
            NTNU TMA4115 Matematikk 3
            Institutt for matematiske f*g
            eksamen 11.08.05
            Eksamenssettet har 12 punkter.
        """
        content = ContentFile(text)
        pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)

        # No errors should be raised when no pages have been saved yet, but
        # False should be returned to indicate a lack of success.
        # The stored file is plain text, i.e. a malformed PDF.
        assert pdf.classify(allow_ocr=True) is False
        assert pdf.classify(allow_ocr=False) is False
        assert pdf.exams.count() == 0

        # But now we add a cover page and classify its content
        PdfPage.objects.create(text=text, pdf=pdf, number=0)
        pdf.refresh_from_db()
        assert pdf.classify() is True

        # And all metadata should be saved
        pdf = Pdf.objects.get(id=pdf.id)
        assert pdf.exams.count() == 1
        exam = pdf.exams.first()
        assert exam.language == Language.BOKMAL
        assert exam.course_code == 'TMA4115'
        assert exam.solutions is False
        assert exam.year == 2005
        assert exam.season == Season.CONTINUATION
        assert exam.content_type == DocumentInfo.EXAM

        # When the classification method changes its result, old results
        # should be removed. This is simulated here by mutating the exam.
        # NOTE: this must be an assignment; a bare ``==`` comparison would be
        # discarded and the assertions below would pass trivially.
        exam.year = 1999
        exam.save()
        pdf.classify()
        pdf = Pdf.objects.get(id=pdf.id)
        assert pdf.exams.count() == 1
        assert pdf.exams.first().year == 2005

        # But verified exams should NOT be deleted
        verified_exam = DocumentInfo.objects.create(
            year=1999,
            course_code=exam.course_code,
            language=exam.language,
            solutions=exam.solutions,
        )
        user = UserFactory.create(username='******')
        verified_exam_pdf = DocumentInfoSource.objects.create(
            document_info=verified_exam,
            pdf=pdf,
        )
        verified_exam_pdf.verified_by.add(user)
        pdf.classify()
        pdf = Pdf.objects.get(id=pdf.id)
        assert pdf.exams.count() == 2

        # If a PDF already has verified document info, classify should be a
        # no-op.
        DocumentInfoSource.objects.all().delete()
        verified_exam_pdf = DocumentInfoSource.objects.create(
            document_info=verified_exam,
            pdf=pdf,
        )
        verified_exam_pdf.verified_by.add(user)
        pdf.classify(save=True)
        assert pdf.exams.count() == 1
        assert pdf.exams.first() == verified_exam