def test_string_content():
    """Text read from a stored PDF is available on the Pdf and its pages."""
    fixture_path = (
        Path(__file__).parent / 'data' / 'matmod_exam_des_2017.pdf'
    )
    raw_pdf = ContentFile(fixture_path.read_bytes())
    dummy_hash = '0000000000000000000000000000000000000000'

    # Store the fixture, extract its text, and reload the row from the DB
    stored_pdf = Pdf(sha1_hash=dummy_hash)
    stored_pdf.file.save(name=dummy_hash + '.pdf', content=raw_pdf)
    stored_pdf.read_text()
    stored_pdf.save()
    stored_pdf.refresh_from_db()

    # The text attribute must be a unicode string
    extracted = stored_pdf.text
    assert isinstance(extracted, str)

    # Form feeds separate the pages in the combined text
    page_texts = extracted.split('\f')
    assert len(page_texts) == 6
    for expected_phrase in ('Rottman', 'population model'):
        assert expected_phrase in extracted
    assert 'this is not in the exam' not in extracted

    # One related PdfPage object is expected per page
    page_set = stored_pdf.pages.all()
    assert page_set.count() == 6

    # Pages come back ordered by page number, with no confidence set
    for expected_number, pdf_page in enumerate(page_set):
        assert pdf_page.number == expected_number
        assert pdf_page.confidence is None

    # Each page holds its own content only
    first_page_text = page_set[0].text
    third_page_text = page_set[2].text
    assert 'Rottman' in first_page_text
    assert 'Rottman' not in third_page_text
    assert 'population model' in third_page_text
    assert 'population model' not in first_page_text
def test_classify_pdf(self):
    """Exam type should be determinable from pdf content.

    Exercises ``Pdf.classify`` in four situations:

    1. No pages saved yet: ``classify`` returns ``False`` without raising
       and no exams are attached.
    2. A cover page exists: ``classify`` returns ``True`` and the parsed
       metadata is stored.
    3. A stored, unverified result has been mutated: re-classifying
       replaces it instead of adding a second exam.
    4. Verified document info exists: it is kept, and when it is the only
       document info source, ``classify`` leaves it as the sole exam.

    ``self`` is not used by the body.
    """
    # The PDF contains the following content
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    text = """ NTNU TMA4115 Matematikk 3 Institutt for matematiske f*g eksamen 11.08.05 Eksamenssettet har 12 punkter. """
    content = ContentFile(text)
    pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)

    # No errors should be raised when no pages have been saved yet, but
    # False should be returned to indicate a lack of success.
    assert pdf.classify(
        allow_ocr=True) is False  # Malformed plain text PDF
    assert pdf.classify(allow_ocr=False) is False
    assert pdf.exams.count() == 0

    # But now we add a cover page and classify its content
    PdfPage.objects.create(text=text, pdf=pdf, number=0)
    pdf.refresh_from_db()
    assert pdf.classify() is True

    # And all metadata should be saved
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    exam = pdf.exams.first()
    assert exam.language == Language.BOKMAL
    assert exam.course_code == 'TMA4115'
    assert exam.solutions is False
    assert exam.year == 2005
    assert exam.season == Season.CONTINUATION
    assert exam.content_type == DocumentInfo.EXAM

    # When the classification method changes its result, old results
    # should be removed. This is simulated here by mutating the exam.
    # NOTE: this must be an assignment; a bare ``==`` comparison would
    # leave the exam untouched and make the checks below pass trivially.
    exam.year = 1999
    exam.save()
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    assert pdf.exams.first().year == 2005

    # But verified exams should NOT be deleted
    verified_exam = DocumentInfo.objects.create(
        year=1999,
        course_code=exam.course_code,
        language=exam.language,
        solutions=exam.solutions,
    )
    user = UserFactory.create(username='******')
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 2

    # If a PDF has already verified document info, classify should be a
    # no-op.
    DocumentInfoSource.objects.all().delete()
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify(save=True)
    assert pdf.exams.count() == 1
    assert pdf.exams.first() == verified_exam