def test_using_courses_from_url_in_classification(self):
    """Exam course classification should AND combine PDF and URL parsing."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))

    # The PDF text itself yields exactly one course code
    PdfPage.objects.create(text="Exsamen i TMA4000", pdf=pdf, number=0)

    # While URL parsing contributes three additional courses
    scraped_urls = (
        'http://wiki.math.ntnu.no/TMA4100/exams/problems.pdf',
        'http://wiki.math.ntnu.no/TMA4200/exams/problems.pdf',
        'http://wiki.math.ntnu.no/TMA4300/exams/problems.pdf',
    )
    for scraped_url in scraped_urls:
        PdfUrl.objects.create(url=scraped_url, scraped_pdf=pdf)

    # Classification should union both sources: 4 courses all together
    pdf.classify()
    assert pdf.exams.count() == 4
    course_codes = set(pdf.exams.values_list('course_code', flat=True))
    assert course_codes == {'TMA4000', 'TMA4100', 'TMA4200', 'TMA4300'}
def test_combining_urls_and_content_for_classification(self):
    """Exam classification should OR combine PDF and URL parsing."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))

    # The PDF text determines course, solutions, and language, but no date
    text = """
    Exsamen i TMA4000
    Dato: USPESIFISERT
    Løsningsforslag
    """
    PdfPage.objects.create(text=text, pdf=pdf, number=0)

    # The first URL is disregarded as the other two are more popular
    urls = [
        'http://wiki.math.ntnu.no/TMA4000/exams/2017_kont.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/h2018.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/2018h.pdf',
    ]
    for url in urls:
        pdf_url = PdfUrl.objects.create(url=url, scraped_pdf=pdf)
        assert pdf_url.exam.year and pdf_url.exam.season

    pdf.classify()

    # The two sources combine into exactly one fully-determined exam
    assert pdf.exams.count() == 1
    exam = pdf.exams.first()
    assert exam.course_code == 'TMA4000'
    assert exam.solutions is True
    assert exam.language == Language.BOKMAL
    assert exam.year == 2018
    assert exam.season == Season.AUTUMN
def test_deletion_of_file_on_delete(tmpdir, settings):
    """FileField file should be cleaned up on Pdf deletion."""
    # Create a Pdf object with an associated downloaded file
    sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(content=ContentFile('Exam text'), name=sha1_hash + '.pdf')

    # The file should now exist on disk...
    stored_file = Path(settings.MEDIA_ROOT, pdf.file.name)
    assert stored_file.is_file()

    # ...but deleting the model should remove the file as well
    pdf.delete()
    assert not stored_file.is_file()
def test_classify_pdf_with_several_course_codes(self):
    """Several course codes should be supported for exam PDFs."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    # Cover page mentioning three courses, using the "TMA4000/10" shorthand
    text = """
    Exsamen i TMA4000/10 og TIØ4000
    Dato: 11.08.99
    Løsningsforslag
    """.encode('utf-8')
    pdf.file.save(content=ContentFile(text), name=sha1_hash + '.pdf', save=True)
    PdfPage.objects.create(text=text, pdf=pdf, number=0)

    pdf.classify()

    # All three course codes should have been extracted
    extracted_codes = set(pdf.exams.values_list('course_code', flat=True))
    assert extracted_codes == {'TMA4000', 'TMA4010', 'TIØ4000'}

    # And the remaining metadata is shared between all of the exams
    for exam in pdf.exams.all():
        assert exam.year == 1999
        assert exam.season == Season.CONTINUATION
        assert exam.language == Language.BOKMAL
        assert exam.solutions is True
def test_raising_validation_errors_of_wrong_sha1_formatting():
    """SHA1 hash format should be enforced, also for filenames."""
    # A malformed hash is rejected on model creation
    with pytest.raises(ValidationError):
        Pdf.objects.create(sha1_hash='WRONG HASH FORMAT')

    correct_sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
    pdf = Pdf(sha1_hash=correct_sha1_hash)

    # A filename whose hash does not match the model's hash is rejected too
    with pytest.raises(ValidationError):
        pdf.file.save(
            content=ContentFile(''),
            name=correct_sha1_hash.replace('4', '3') + '.pdf',
        )
def test_determining_solutions_of_exam_without_content(self):
    """Solutions should be OR determined."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))
    PdfPage.objects.create(
        text="Bad OCR handwritten content",
        pdf=pdf,
        number=0,
    )

    # Only one URL indicates solutions, but that signal is fully trusted
    solution_hint_urls = [
        'http://wiki.math.ntnu.no/TMA4000/exams/problems1.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/problems2.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/problems3_solutions.pdf',
    ]
    for hint_url in solution_hint_urls:
        PdfUrl.objects.create(url=hint_url, scraped_pdf=pdf)

    # One exam results, and it is classified as containing solutions
    pdf.classify()
    assert pdf.exams.count() == 1
    assert pdf.exams.first().solutions is True
def test_string_content():
    """FileBackup PDFs should be parsable."""
    pdf_path = Path(__file__).parent / 'data' / 'matmod_exam_des_2017.pdf'
    sha1 = '0000000000000000000000000000000000000000'
    pdf_backup = Pdf(sha1_hash=sha1)
    pdf_backup.file.save(
        name=sha1 + '.pdf',
        content=ContentFile(pdf_path.read_bytes()),
    )

    pdf_backup.read_text()
    pdf_backup.save()
    pdf_backup.refresh_from_db()

    # The extracted text must be a unicode string
    assert isinstance(pdf_backup.text, str)

    # Check content through the text property; pages are '\f'-separated
    assert len(pdf_backup.text.split('\f')) == 6
    assert 'Rottman' in pdf_backup.text
    assert 'population model' in pdf_backup.text
    assert 'this is not in the exam' not in pdf_backup.text

    # Check the associated PdfPage model objects
    pages = pdf_backup.pages.all()
    assert pages.count() == 6

    # The ordering should be based on page number
    for page_num, page in enumerate(pages):
        assert page.number == page_num
        assert page.confidence is None

    # And each page should contain its own page-local content
    assert 'Rottman' in pages[0].text
    assert 'Rottman' not in pages[2].text
    assert 'population model' in pages[2].text
    assert 'population model' not in pages[0].text
def test_classifiying_bad_content(self):
    """Classification should handle only Nones."""
    sha1_hash = '0000000000000000000000000000000000000000'
    text = 'Bad OCR!'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile(text))
    PdfPage.objects.create(text=text, pdf=pdf, number=0)

    def assert_single_unclassified_exam():
        # One exam object, with every classifiable field left undetermined
        assert pdf.exams.count() == 1
        exam = pdf.exams.first()
        assert exam.solutions is False
        assert exam.course_code is None
        assert exam.language is None
        assert exam.year is None
        assert exam.season is None

    # First handle bad OCR without any URLs
    pdf.classify()
    assert_single_unclassified_exam()

    # Then handle bad OCR combined with bad URLs
    bad_urls = [
        'http://bad.url/1.pdf',
        'http://bad.url/2.pdf',
        'http://bad.url/3.pdf',
    ]
    for bad_url in bad_urls:
        PdfUrl.objects.create(url=bad_url, scraped_pdf=pdf)
    pdf.classify()
    assert_single_unclassified_exam()
def test_verify_pdf_view(admin_client):
    """Test PDF verification of specific model objects."""
    # We have one PDF...
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(name=sha1_hash + '.pdf', content=ContentFile('exam text'))

    # ...and another one
    sha1_hash2 = '1111111111111111111111111111111111111111'
    pdf2 = Pdf(sha1_hash=sha1_hash2)
    pdf2.file.save(name=sha1_hash2 + '.pdf', content=ContentFile('exam text'))

    # Both PDFs are connected to the same exam
    exam = DocumentInfo.objects.create()
    DocumentInfoSource.objects.create(pdf=pdf, document_info=exam)
    DocumentInfoSource.objects.create(pdf=pdf2, document_info=exam)

    # Yet each resolves to a view with its own PDF as context
    response = admin_client.get(pdf.get_absolute_url())
    assert response.context['pdf'] == pdf
    response2 = admin_client.get(pdf2.get_absolute_url())
    assert response2.context['pdf'] == pdf2
def test_classify_pdf(self):
    """Exam type should be determinable from pdf content."""
    # The PDF contains the following cover page content
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    text = """
    NTNU TMA4115 Matematikk 3
    Institutt for matematiske f*g
    eksamen 11.08.05
    Eksamenssettet har 12 punkter.
    """
    content = ContentFile(text)
    pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)

    # No errors should be raised when no pages has been saved yet, but
    # False should be returned to indicate a lack of success.
    assert pdf.classify(allow_ocr=True) is False  # Malformed plain text PDF
    assert pdf.classify(allow_ocr=False) is False
    assert pdf.exams.count() == 0

    # But now we add a cover page and classify its content
    PdfPage.objects.create(text=text, pdf=pdf, number=0)
    pdf.refresh_from_db()
    assert pdf.classify() is True

    # And all metadata should be saved
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    exam = pdf.exams.first()
    assert exam.language == Language.BOKMAL
    assert exam.course_code == 'TMA4115'
    assert exam.solutions is False
    assert exam.year == 2005
    assert exam.season == Season.CONTINUATION
    assert exam.content_type == DocumentInfo.EXAM

    # When the classification method changes its result, old results
    # should be removed. This is simulated here by mutating the exam.
    # FIX: this was previously `exam.year == 1999`, a no-op comparison,
    # so the stale-result cleanup path was never actually exercised.
    exam.year = 1999
    exam.save()
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    assert pdf.exams.first().year == 2005

    # But verified exams should NOT be deleted
    verified_exam = DocumentInfo.objects.create(
        year=1999,
        course_code=exam.course_code,
        language=exam.language,
        solutions=exam.solutions,
    )
    user = UserFactory.create(username='******')
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 2

    # If a PDF has already verified document info, classify should be a
    # no-op.
    DocumentInfoSource.objects.all().delete()
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify(save=True)
    assert pdf.exams.count() == 1
    assert pdf.exams.first() == verified_exam
def test_get_absolute_url_for_pdf():
    """PDF absolute URL should point to verify form view for that pdf."""
    sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
    pdf = Pdf(sha1_hash=sha1_hash)
    expected_url = '/exams/verify/' + sha1_hash
    assert pdf.get_absolute_url() == expected_url
def test_queryset_organize_method():
    """ExamURLs should be organizable in hierarchy."""
    # All links are related to this course
    CourseFactory(
        full_name='Mathematics 1',
        display_name='Maths 1',
        course_code='TMA4000',
    )

    # A 2016 spring exam in English, with its scraped URL
    exam1 = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2016,
        season=Season.SPRING,
        language=Language.ENGLISH,
    )
    exam_url1 = PdfUrl.objects.create(
        url='http://exams.com/exam',
        exam=exam1,
    )

    # Matching solutions, in English...
    exam1_solutions = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2016,
        season=Season.SPRING,
        solutions=True,
        language=Language.ENGLISH,
    )
    exam_url_solutions = PdfUrl.objects.create(
        url='http://exams.com/solution',
        exam=exam1_solutions,
    )

    # ...and in Bokmål
    eksamen_losning = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2016,
        season=Season.SPRING,
        solutions=True,
        language=Language.BOKMAL,
    )
    eksamen_url_losning = PdfUrl.objects.create(
        url='http://exams.com/losning',
        exam=eksamen_losning,
    )

    # The URL classifier could not determine the language
    url_exam_2015 = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2015,
        season=Season.SPRING,
        solutions=False,
        language=Language.UNKNOWN,
    )
    # But the PDF classifier managed to determine it
    pdf_exam_2015 = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2015,
        season=Season.SPRING,
        solutions=False,
        language=Language.ENGLISH,
    )
    sha1_hash = '0000000000000000000000000000000000000000'
    exam_2015_pdf = Pdf(sha1_hash=sha1_hash)
    exam_2015_pdf.file.save(sha1_hash + '.pdf', ContentFile('exam text'))
    DocumentInfoSource.objects.create(
        document_info=pdf_exam_2015,
        pdf=exam_2015_pdf,
    )

    # The pdf is scraped
    exam_2015_url = PdfUrl.objects.create(
        url='http://exams.com/exam_2015',
        exam=url_exam_2015,
        scraped_pdf=exam_2015_pdf,
    )

    # Organizing yields a course -> year -> season -> type -> language tree
    organization = DocumentInfo.objects.all().organize()
    assert organization == {
        'TMA4000': {
            'full_name': 'Mathematics 1',
            'nick_name': 'Maths 1',
            'years': {
                2016: {
                    'Vår': {
                        'exams': {
                            'Engelsk': [exam_url1],
                        },
                        'solutions': {
                            'Bokmål': [eksamen_url_losning],
                            'Engelsk': [exam_url_solutions],
                        },
                    },
                },
                2015: {
                    'Vår': {
                        'exams': {
                            'Engelsk': [exam_2015_url],
                        },
                        'solutions': {},
                    },
                },
            },
        },
    }
def test_verify_random_pdf_view(client, django_user_model):
    """Test PDF verification view."""
    # We have one PDF
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(name=sha1_hash + '.pdf', content=ContentFile('exam text'))

    # And three courses
    course1 = CourseFactory(course_code='TMA1000')
    course2 = CourseFactory(course_code='TMA2000')
    course3 = CourseFactory(course_code='TMA3000')

    # The PDF has been inferred to contain the two first of these courses
    common_docinfo_attrs = {
        'language': 'Bokmål',
        'year': 2010,
        'solutions': False,
        'content_type': 'Exam',
    }
    exam1 = DocumentInfo.objects.create(
        course=course1,
        **common_docinfo_attrs,
    )
    DocumentInfoSource.objects.create(pdf=pdf, document_info=exam1)
    exam2 = DocumentInfo.objects.create(
        course=course2,
        **common_docinfo_attrs,
    )
    DocumentInfoSource.objects.create(pdf=pdf, document_info=exam2)

    # We verify a random PDF, in this case our PDF, since there is only one
    user = django_user_model.objects.create_user(
        username='******',
        password='******',
    )
    client.login(username='******', password='******')
    url = reverse('examiner:verify_random')
    response = client.get(url)
    assert response.status_code == 200

    # The form instance is populated with the first exam
    form = response.context['form']
    data = form.initial
    assert form.instance == exam1
    assert data['language'] == 'Bokmål'
    assert data['pdf'] == pdf
    assert data['season'] is None
    assert data['verifier'] == user

    # But both courses appear in the courses field
    assert data['courses'].count() == 2
    assert set(data['courses']) == {course1.id, course2.id}

    # The user now changes the 2 courses
    form = VerifyExamForm({
        'courses': [course2.id, course3.id],
        'pdf': pdf.id,
        'verifier': user.id,
        **common_docinfo_attrs,
    })
    assert form.is_valid()
    response = client.post(url, form.data)
    assert response.status_code == 302

    # We have two new verified exams
    verified_exams = DocumentInfoSource.objects.filter(verified_by__in=[user])
    assert verified_exams.count() == 2

    # Both are connected to our pdf and verified by our user
    exam_pdf1 = verified_exams.first()
    exam_pdf2 = verified_exams.last()
    assert exam_pdf1.pdf == pdf
    assert exam_pdf2.pdf == pdf
    assert exam_pdf1.verified_by.first() == user
    assert exam_pdf2.verified_by.first() == user

    # With two different courses
    docinfo1 = exam_pdf1.document_info
    docinfo2 = exam_pdf2.document_info
    assert docinfo1.course == course2
    assert docinfo2.course == course3

    # But all other attributes being equal
    for key, value in common_docinfo_attrs.items():
        assert getattr(docinfo1, key) == value
        assert getattr(docinfo2, key) == value

    # The two other unverified infos have now been removed
    assert not DocumentInfoSource.objects.filter(
        verified_by__isnull=True,
    ).exists()

    # And we have altogether 3 DocumentInfo objects
    assert DocumentInfo.objects.count() == 3

    # And only two through relations
    assert DocumentInfoSource.objects.count() == 2