Python Pdf.classify Examples, examiner.models.Pdf.classify Python Examples

Example #1

0

Show file

File: test_models.py Project: JakobGM/WikiLinks

    def test_using_courses_from_url_in_classification(self):
        """Course codes from the PDF text and from its URLs are all kept.

        The page text mentions one course code and the three URLs each
        mention a different one; after classification the PDF is expected
        to be linked to one exam per course code (four in total).
        """
        sha1_hash = '0000000000000000000000000000000000000000'
        pdf = Pdf(sha1_hash=sha1_hash)
        pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))

        # The page content contributes a single course code.
        page_text = "Exsamen i TMA4000"
        PdfPage.objects.create(pdf=pdf, number=0, text=page_text)

        # Every URL below contributes one further, distinct course code.
        course_urls = (
            'http://wiki.math.ntnu.no/TMA4100/exams/problems.pdf',
            'http://wiki.math.ntnu.no/TMA4200/exams/problems.pdf',
            'http://wiki.math.ntnu.no/TMA4300/exams/problems.pdf',
        )
        for course_url in course_urls:
            PdfUrl.objects.create(scraped_pdf=pdf, url=course_url)

        pdf.classify()

        # One exam per course code: 1 from the text + 3 from the URLs.
        expected_codes = {'TMA4000', 'TMA4100', 'TMA4200', 'TMA4300'}
        assert pdf.exams.count() == 4
        classified_codes = set(
            pdf.exams.values_list('course_code', flat=True),
        )
        assert classified_codes == expected_codes

Example #2

0

Show file

File: test_models.py Project: JakobGM/WikiLinks

    def test_combining_urls_and_content_for_classification(self):
        """Metadata from the PDF text and from its URLs is merged.

        The page text supplies the course code and the solutions marker but
        no usable date; the URLs are the only source of year and season.
        Classification is expected to produce a single exam that combines
        both sources.
        """
        sha1_hash = '0000000000000000000000000000000000000000'
        pdf = Pdf(sha1_hash=sha1_hash)
        pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))
        text = """
            Exsamen i TMA4000
            Dato: USPESIFISERT
            Løsningsforslag
        """
        PdfPage.objects.create(pdf=pdf, number=0, text=text)

        # Per the final assertions, the first URL is expected to lose out to
        # the other two, which agree with each other on year and season.
        scraped_urls = (
            'http://wiki.math.ntnu.no/TMA4000/exams/2017_kont.pdf',
            'http://wiki.math.ntnu.no/TMA4000/exams/h2018.pdf',
            'http://wiki.math.ntnu.no/TMA4000/exams/2018h.pdf',
        )
        for scraped_url in scraped_urls:
            pdf_url = PdfUrl.objects.create(url=scraped_url, scraped_pdf=pdf)
            # Sanity check: every URL must yield both a year and a season.
            assert pdf_url.exam.year and pdf_url.exam.season

        pdf.classify()
        assert pdf.exams.count() == 1

        exam = pdf.exams.first()
        assert exam.course_code == 'TMA4000'
        assert exam.solutions is True
        observed = (exam.language, exam.year, exam.season)
        assert observed == (Language.BOKMAL, 2018, Season.AUTUMN)

Example #3

0

Show file

File: test_models.py Project: JakobGM/WikiLinks

    def test_classify_pdf_with_several_course_codes(self):
        """Several course codes should be supported for exam PDFs.

        The cover page mentions ``TMA4000/10`` and ``TIØ4000``; classification
        is expected to produce three exams (TMA4000, TMA4010, TIØ4000) that
        all share the year, season, language and solutions flag parsed from
        the same page.
        """
        sha1_hash = '0000000000000000000000000000000000000000'
        pdf = Pdf(sha1_hash=sha1_hash)
        text = """
            Exsamen i TMA4000/10 og TIØ4000
            Dato: 11.08.99
            Løsningsforslag
        """
        # Only the stored file needs raw bytes. The page text is kept as
        # ``str`` so that the non-ASCII course code "TIØ4000" reaches the
        # text column unchanged instead of depending on how the ORM coerces
        # a ``bytes`` value.
        content = ContentFile(text.encode('utf-8'))
        pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)
        PdfPage.objects.create(text=text, pdf=pdf, number=0)
        pdf.classify()
        assert (set(pdf.exams.values_list(
            'course_code', flat=True)) == {'TMA4000', 'TMA4010', 'TIØ4000'})

        # Every exam derived from the page shares the same metadata.
        for exam in pdf.exams.all():
            assert exam.year == 1999
            assert exam.season == Season.CONTINUATION
            assert exam.language == Language.BOKMAL
            assert exam.solutions is True

Example #4

0

Show file

File: test_models.py Project: JakobGM/WikiLinks

    def test_determining_solutions_of_exam_without_content(self):
        """The solutions flag is set if any single source indicates it.

        The page text carries no usable information, so everything comes
        from the URLs. Only one of the three URLs mentions solutions, yet
        the resulting exam is expected to be flagged as having solutions.
        """
        sha1_hash = '0000000000000000000000000000000000000000'
        pdf = Pdf(sha1_hash=sha1_hash)
        pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))

        unreadable_text = "Bad OCR handwritten content"
        PdfPage.objects.create(pdf=pdf, number=0, text=unreadable_text)

        # A single "solutions" URL is enough, even against two that do not
        # mention solutions.
        scraped_urls = (
            'http://wiki.math.ntnu.no/TMA4000/exams/problems1.pdf',
            'http://wiki.math.ntnu.no/TMA4000/exams/problems2.pdf',
            'http://wiki.math.ntnu.no/TMA4000/exams/problems3_solutions.pdf',
        )
        for scraped_url in scraped_urls:
            PdfUrl.objects.create(scraped_pdf=pdf, url=scraped_url)

        # All URLs point at the same course, so exactly one exam is expected.
        pdf.classify()
        assert pdf.exams.count() == 1
        only_exam = pdf.exams.first()
        assert only_exam.solutions is True

Example #5

0

Show file

File: test_models.py Project: JakobGM/WikiLinks

    def test_classifiying_bad_content(self):
        """Classification should cope when every parsed field is None.

        Neither the page text nor (later) the URLs contain anything
        recognisable. In both situations a single exam is still expected,
        with ``solutions`` False and all other metadata fields None.
        """
        sha1_hash = '0000000000000000000000000000000000000000'
        text = 'Bad OCR!'
        pdf = Pdf(sha1_hash=sha1_hash)
        pdf.file.save(sha1_hash + '.pdf', ContentFile(text))
        PdfPage.objects.create(pdf=pdf, number=0, text=text)

        def classify_and_expect_blank_exam():
            # Run classification and check the single, empty result.
            pdf.classify()
            assert pdf.exams.count() == 1

            blank = pdf.exams.first()
            assert blank.solutions is False
            assert blank.course_code is None
            assert blank.language is None
            assert blank.year is None
            assert blank.season is None

        # Stage 1: unreadable page text and no URLs at all.
        classify_and_expect_blank_exam()

        # Stage 2: unreadable page text plus URLs that carry no information.
        for useless_url in (
            'http://bad.url/1.pdf',
            'http://bad.url/2.pdf',
            'http://bad.url/3.pdf',
        ):
            PdfUrl.objects.create(scraped_pdf=pdf, url=useless_url)

        classify_and_expect_blank_exam()

Example #6

0

Show file

File: test_models.py Project: JakobGM/WikiLinks

    def test_classify_pdf(self):
        """Exam type should be determinable from pdf content.

        Walks through the life cycle of ``Pdf.classify``:

        1. Without any saved pages it returns False and creates no exams.
        2. With a cover page it returns True and stores the parsed metadata.
        3. Re-classifying replaces stale, unverified results.
        4. Verified exams survive re-classification.
        5. A PDF that only has verified document info is left untouched.
        """
        # The PDF contains the following content
        sha1_hash = '0000000000000000000000000000000000000000'
        pdf = Pdf(sha1_hash=sha1_hash)
        text = """
            NTNU TMA4115 Matematikk 3
            Institutt for matematiske f*g
            eksamen 11.08.05
            Eksamenssettet har 12 punkter.
        """
        content = ContentFile(text)
        pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)

        # No errors should be raised when no pages have been saved yet, but
        # False should be returned to indicate a lack of success.
        assert pdf.classify(
            allow_ocr=True) is False  # Malformed plain text PDF
        assert pdf.classify(allow_ocr=False) is False
        assert pdf.exams.count() == 0

        # But now we add a cover page and classify its content
        PdfPage.objects.create(text=text, pdf=pdf, number=0)
        pdf.refresh_from_db()
        assert pdf.classify() is True

        # And all metadata should be saved
        pdf = Pdf.objects.get(id=pdf.id)
        assert pdf.exams.count() == 1
        exam = pdf.exams.first()
        assert exam.language == Language.BOKMAL
        assert exam.course_code == 'TMA4115'
        assert exam.solutions is False
        assert exam.year == 2005
        assert exam.season == Season.CONTINUATION
        assert exam.content_type == DocumentInfo.EXAM

        # When the classification method changes its result, old results
        # should be removed. This is simulated here by mutating the exam.
        # The year must actually be assigned (not merely compared) so that
        # the stored exam really differs from what classify() produces.
        exam.year = 1999
        exam.save()
        pdf.classify()
        pdf = Pdf.objects.get(id=pdf.id)
        assert pdf.exams.count() == 1
        assert pdf.exams.first().year == 2005

        # But verified exams should NOT be deleted
        verified_exam = DocumentInfo.objects.create(
            year=1999,
            course_code=exam.course_code,
            language=exam.language,
            solutions=exam.solutions,
        )
        user = UserFactory.create(username='******')
        verified_exam_pdf = DocumentInfoSource.objects.create(
            document_info=verified_exam,
            pdf=pdf,
        )
        verified_exam_pdf.verified_by.add(user)
        pdf.classify()
        pdf = Pdf.objects.get(id=pdf.id)
        # The classified exam plus the manually verified one.
        assert pdf.exams.count() == 2

        # If a PDF already has verified document info, classify should be a
        # no-op.
        DocumentInfoSource.objects.all().delete()
        verified_exam_pdf = DocumentInfoSource.objects.create(
            document_info=verified_exam,
            pdf=pdf,
        )
        verified_exam_pdf.verified_by.add(user)
        pdf.classify(save=True)
        assert pdf.exams.count() == 1
        assert pdf.exams.first() == verified_exam