def test_using_courses_from_url_in_classification(self):
    """Course codes from the page text and from the URLs should all be kept.

    One course code appears in the page text and three other codes appear
    in the URLs; after classification the PDF must be linked to all four.
    """
    digest = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=digest)
    pdf.file.save(digest + '.pdf', ContentFile('Exam text'))

    # The page text mentions exactly one course code
    PdfPage.objects.create(text="Exsamen i TMA4000", pdf=pdf, number=0)

    # Every URL mentions a course code that is absent from the page text
    source_urls = (
        'http://wiki.math.ntnu.no/TMA4100/exams/problems.pdf',
        'http://wiki.math.ntnu.no/TMA4200/exams/problems.pdf',
        'http://wiki.math.ntnu.no/TMA4300/exams/problems.pdf',
    )
    for source_url in source_urls:
        PdfUrl.objects.create(url=source_url, scraped_pdf=pdf)

    pdf.classify()

    # 1 course from the text + 3 courses from the URLs
    assert pdf.exams.count() == 4
    classified_codes = set(pdf.exams.values_list('course_code', flat=True))
    expected_codes = {'TMA4000', 'TMA4100', 'TMA4200', 'TMA4300'}
    assert classified_codes == expected_codes
def test_combining_urls_and_content_for_classification(self):
    """Classification should merge page text metadata with URL metadata.

    The page text names the course and is marked as a solution set, but
    its date field is not a real date. Every URL yields a year and a
    season, and the classified exam is expected to carry both kinds of
    information.
    """
    digest = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=digest)
    pdf.file.save(digest + '.pdf', ContentFile('Exam text'))

    page_text = """ Exsamen i TMA4000 Dato: USPESIFISERT Løsningsforslag """
    PdfPage.objects.create(text=page_text, pdf=pdf, number=0)

    # The first URL is outvoted: the two remaining URLs agree with each
    # other and are therefore the more popular interpretation.
    source_urls = (
        'http://wiki.math.ntnu.no/TMA4000/exams/2017_kont.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/h2018.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/2018h.pdf',
    )
    for source_url in source_urls:
        pdf_url = PdfUrl.objects.create(url=source_url, scraped_pdf=pdf)
        # Each URL on its own must yield both a year and a season
        assert pdf_url.exam.year
        assert pdf_url.exam.season

    pdf.classify()
    assert pdf.exams.count() == 1

    classified = pdf.exams.first()
    assert classified.course_code == 'TMA4000'
    assert classified.solutions is True
    assert classified.language == Language.BOKMAL
    assert classified.year == 2018
    assert classified.season == Season.AUTUMN
def test_classify_pdf_with_several_course_codes(self):
    """A single exam PDF may belong to several course codes at once.

    The page text lists ``TMA4000/10`` and ``TIØ4000``; classification is
    expected to produce one exam per course code (TMA4000, TMA4010 and
    TIØ4000), all sharing the same year, season, language and solutions
    flag.
    """
    digest = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=digest)

    # NOTE(review): the fixture is encoded to UTF-8 bytes and that same
    # bytes object is used both as file content and as the page text --
    # confirm that passing bytes as PdfPage.text is intended.
    raw_text = """ Exsamen i TMA4000/10 og TIØ4000 Dato: 11.08.99 Løsningsforslag """.encode('utf-8')
    pdf.file.save(
        content=ContentFile(raw_text),
        name=digest + '.pdf',
        save=True,
    )
    PdfPage.objects.create(text=raw_text, pdf=pdf, number=0)

    pdf.classify()

    classified_codes = set(pdf.exams.values_list('course_code', flat=True))
    assert classified_codes == {'TMA4000', 'TMA4010', 'TIØ4000'}

    # Everything except the course code is shared between the exams
    for classified in pdf.exams.all():
        assert classified.year == 1999
        assert classified.season == Season.CONTINUATION
        assert classified.language == Language.BOKMAL
        assert classified.solutions is True
def test_determining_solutions_of_exam_without_content(self):
    """One URL that indicates solutions should be enough to flag solutions.

    The page text carries no usable information, so the URLs are the only
    source of metadata. Only the last of the three URLs mentions
    solutions, yet the resulting exam must be marked as a solution set.
    """
    digest = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=digest)
    pdf.file.save(digest + '.pdf', ContentFile('Exam text'))

    PdfPage.objects.create(
        text="Bad OCR handwritten content",
        pdf=pdf,
        number=0,
    )

    # Only one URL contains solutions, but it is completely trusted
    source_urls = (
        'http://wiki.math.ntnu.no/TMA4000/exams/problems1.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/problems2.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/problems3_solutions.pdf',
    )
    for source_url in source_urls:
        PdfUrl.objects.create(url=source_url, scraped_pdf=pdf)

    # All URLs point to the same course, so a single exam is expected
    pdf.classify()
    assert pdf.exams.count() == 1
    assert pdf.exams.first().solutions is True
def test_classifiying_bad_content(self):
    """Classification should cope with nothing but ``None`` results.

    Neither the page text nor the URLs contain any recognizable
    metadata. In both rounds the PDF must end up with exactly one exam
    whose fields are all ``None``, apart from ``solutions`` being False.
    """
    digest = '0000000000000000000000000000000000000000'
    garbage = 'Bad OCR!'
    pdf = Pdf(sha1_hash=digest)
    pdf.file.save(digest + '.pdf', ContentFile(garbage))
    PdfPage.objects.create(text=garbage, pdf=pdf, number=0)

    def assert_single_blank_exam():
        # Exactly one exam, and it carries no metadata at all
        assert pdf.exams.count() == 1
        blank = pdf.exams.first()
        assert blank.solutions is False
        assert blank.course_code is None
        assert blank.language is None
        assert blank.year is None
        assert blank.season is None

    # Round 1: bad OCR and no URLs whatsoever
    pdf.classify()
    assert_single_blank_exam()

    # Round 2: bad OCR combined with equally uninformative URLs
    useless_urls = (
        'http://bad.url/1.pdf',
        'http://bad.url/2.pdf',
        'http://bad.url/3.pdf',
    )
    for useless_url in useless_urls:
        PdfUrl.objects.create(url=useless_url, scraped_pdf=pdf)

    pdf.classify()
    assert_single_blank_exam()
def test_classify_pdf(self):
    """Exam metadata should be determinable from PDF page content.

    The test walks through the following steps:

    1. With no pages saved, ``classify`` returns False and creates no
       exams, regardless of ``allow_ocr``.
    2. Once a page exists, ``classify`` returns True and stores a single
       exam with the expected metadata.
    3. A stale, unverified result is replaced when the PDF is classified
       again.
    4. A user-verified exam relation survives reclassification.
    5. When the only relation is a verified one, ``classify`` leaves it
       untouched.
    """
    # The PDF contains the following content
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    text = """ NTNU TMA4115 Matematikk 3 Institutt for matematiske f*g eksamen 11.08.05 Eksamenssettet har 12 punkter. """
    content = ContentFile(text)
    pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)

    # No errors should be raised when no pages have been saved yet, but
    # False should be returned to indicate a lack of success.
    assert pdf.classify(
        allow_ocr=True) is False  # Malformed plain text PDF
    assert pdf.classify(allow_ocr=False) is False
    assert pdf.exams.count() == 0

    # But now we add a cover page and classify its content
    PdfPage.objects.create(text=text, pdf=pdf, number=0)
    pdf.refresh_from_db()
    assert pdf.classify() is True

    # And all metadata should be saved
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    exam = pdf.exams.first()
    assert exam.language == Language.BOKMAL
    assert exam.course_code == 'TMA4115'
    assert exam.solutions is False
    assert exam.year == 2005
    assert exam.season == Season.CONTINUATION
    assert exam.content_type == DocumentInfo.EXAM

    # When the classification method changes its result, old results
    # should be removed. This is simulated here by mutating the exam.
    # This must be an assignment: a bare ``==`` comparison would leave
    # the stored exam unchanged and the scenario untested.
    exam.year = 1999
    exam.save()
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    assert pdf.exams.first().year == 2005

    # But verified exams should NOT be deleted
    verified_exam = DocumentInfo.objects.create(
        year=1999,
        course_code=exam.course_code,
        language=exam.language,
        solutions=exam.solutions,
    )
    user = UserFactory.create(username='******')
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    # The freshly classified exam plus the verified one
    assert pdf.exams.count() == 2

    # If a PDF already has verified document info, classify should be a
    # no-op.
    DocumentInfoSource.objects.all().delete()
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify(save=True)
    assert pdf.exams.count() == 1
    assert pdf.exams.first() == verified_exam