def test_success(self):
    x = PdfDownloader(path, self.grobid_url)
    x.parse_references(self.file_path, self.source)
    self.assertEqual(SingleReference.objects.count(), 5)
    ref1 = SingleReference.objects.get(id=1).test()
    ref2 = SingleReference.objects.get(id=2).test()
    ref3 = SingleReference.objects.get(id=3).test()
    ref4 = SingleReference.objects.get(id=4).test()
    ref5 = SingleReference.objects.get(id=5).test()
    self.assertEqual(ref1, [
        1,
        'Retrieving metadata for your local scholarly papers',
        datetime.date(2009, 1, 1),
        'D. Aumueller',
        'OP'
    ])
    self.assertEqual(ref2, [
        1,
        'Introducing Mr. DLib, a Machine-readable Digital Library',
        datetime.date(2011, 1, 1),
        'J. Beel;B. Gipp;S. Langer;M. Genzmehr;E. Wilde;A. Nürnberger;J. Pitman',
        'OP'
    ])
    self.assertEqual(ref3, [
        1,
        'SciPlore Xtract: Extracting Titles from Scientific PDF Documents by '
        'Analyzing Style Information (Font Size)',
        datetime.date(2010, 1, 1),
        'J. Beel;B. Gipp;A. Shaker;N. Friedrich',
        'OP'
    ])
def create_clusters():
    gurl = global_url.objects.get(id=1)
    open_ref_list = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                title = line[line.index(":") + 1:]
            except (IndexError, ValueError):
                # line contains neither a SOURCE nor a REF entry
                continue

            # normalize title and transform 8-byte hex number to int
            normal_title = normalize_title(title)
            normal_id = int(id_match, 16)

            # insert into cluster
            # cluster.objects.get_or_create(id=normal_id, defaults={"name": normal_title})

            # create local urls for matching titles
            if id_match in match_list:
                index = match_list.index(id_match)
                lurl, tmp = local_url.objects.get_or_create(
                    id=normal_id, global_url=gurl, url=id_match)
                # create open reference
                opref, tmp = OpenReferences.objects.get_or_create(
                    id=normal_id, ingester_key=lurl,
                    source_table=666, source_key=id_match)
                open_ref_list[id_match] = opref

    # create single references directly from the PDFs
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)

    # create the logging file handler
    log_file = os.path.join(file_path, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)

    # add handler to logger object
    logger.addHandler(fh)

    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    limit = 20
    if limit is None:  # unreachable with the hard-coded limit above; kept as the config fallback
        limit = int(get_config("WEAVER")["pdf_limit"])
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
    for element in match_list:
        pdf_path = os.path.join(
            "C:\\Users\\anhtu\\Google Drive\\Informatik 2016\\Related Work\\evaluation",
            "{}.pdf".format(element))
        obj.parse_references(pdf_path, open_ref_list[element])
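# normalize_title is not defined in this file; below is a minimal sketch of a
# plausible implementation, assuming the usual fold-accents/lower-case/strip-
# punctuation normalization used for title matching. The real helper is
# imported from elsewhere and may differ.
import re
import unicodedata


def normalize_title(title):
    # Hypothetical stand-in: fold accents to ASCII, lower-case, and collapse
    # everything that is not a letter or digit so near-identical titles
    # compare equal.
    ascii_title = unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z0-9]+", " ", ascii_title.lower()).strip()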
def setUp(self):
    self.grobid_url = "http://localhost:8080/processReferences"
    self.output_folder = "C:\\Users\\anhtu\\Desktop\\pdf"
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.INFO)

    # create the logging stream handler
    fh = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    fh.setFormatter(formatter)

    # add handler to logger object
    self.logger.addHandler(fh)

    self.obj = PdfDownloader(self.output_folder, self.grobid_url, logger=self.logger)
    self.obj.parse_references = MagicMock(return_value=None)

    # delete all files in test folder
    for the_file in os.listdir(self.output_folder):
        file_path = os.path.join(self.output_folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(e)

    self.source = OpenReferences.objects.create(source_table=0, source_key="AAAAA")
    PDFDownloadQueue.objects.bulk_create([
        PDFDownloadQueue(url="https://arxiv.org/pdf/1704.03738.pdf", tries=0, source=self.source),
        PDFDownloadQueue(url="https://arxiv.org/pdf/1704.03732.pdf", tries=0, source=self.source),
        PDFDownloadQueue(url="https://arxiv.org/pdf/1704.03723.pdf", tries=0, source=self.source),
    ])
def test_get_reference_no_analytics_no_date(self):
    tei_doc = {
        "monogr": {
            "title": "title",
            "authors": ["A. And; B. And"],
            "pubyear": None
        }
    }
    x = PdfDownloader.get_reference(tei_doc)
    self.assertEqual(x, {
        "title": "title",
        "authors": ["A. And; B. And"],
        "pubyear": None
    })
def test_get_reference_no_analytics(self):
    tei_doc = {
        "monogr": {
            "title": "title",
            "authors": ["A. And; B. And"],
            "pubyear": "2001"
        }
    }
    x = PdfDownloader.get_reference(tei_doc)
    self.assertEqual(x, {
        "title": "title",
        "authors": ["A. And; B. And"],
        "pubyear": datetime.date(2001, 1, 1)
    })
def test_no_authors(self):
    tei_doc = {
        "monogr": {
            "title": "title",
            "authors": None,
            "pubyear": None
        },
        "analytic": {
            "title": "better title",
            "authors": [],
            "pubyear": "1999"
        }
    }
    x = PdfDownloader.get_reference(tei_doc)
    self.assertIsNone(x)
def test_analytics(self):
    tei_doc = {
        "monogr": {
            "title": "title",
            "authors": None,
            "pubyear": None
        },
        "analytic": {
            "title": "better title",
            "authors": ["A. And; B. And"],
            "pubyear": "1999"
        }
    }
    x = PdfDownloader.get_reference(tei_doc)
    self.assertEqual(x, {
        "title": "better title",
        "authors": ["A. And; B. And"],
        "pubyear": datetime.date(1999, 1, 1)
    })
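# Taken together, the four get_reference tests above pin down its contract:
# prefer the "analytic" entry when it carries authors, fall back to "monogr"
# otherwise, convert a pubyear string into a datetime.date anchored at
# January 1st, and return None when neither entry names any authors. Below is
# a minimal sketch consistent with those assertions; the real method lives on
# PdfDownloader and may differ in details.
import datetime


def get_reference_sketch(tei_doc):
    # Sketch only: reconstructs the behavior the tests above assert.
    entry = tei_doc.get("analytic") or tei_doc["monogr"]
    if not entry.get("authors"):
        entry = tei_doc["monogr"]  # analytic lacks authors: fall back to monogr
    if not entry.get("authors"):
        return None  # no authors anywhere: reject the reference
    pubyear = entry.get("pubyear")
    if pubyear is not None:
        pubyear = datetime.date(int(pubyear), 1, 1)
    return {
        "title": entry["title"],
        "authors": entry["authors"],
        "pubyear": pubyear,
    }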
def test_invalid_file(self):
    x = PdfDownloader(path, self.grobid_url)
    file = os.path.join(path, "invalid.txt")
    self.assertRaises(GrobidException, x.parse_references, file, self.source)
def test_invalid_path(self):
    x = PdfDownloader(path, self.grobid_url)
    self.assertRaises(GrobidException, x.parse_references, "nkjk", self.source)
def test_regression10(self):
    file_path = os.path.join(path, "reg10.pdf")
    x = PdfDownloader(path, self.grobid_url)
    x.parse_references(file_path, self.source)
class TestPDFDownloader(TransactionTestCase):

    def setUp(self):
        self.grobid_url = "http://*****:*****"  # URL redacted in source

    @patch("weaver.PDFDownloader.requests.get")
    def test_403(self, req):
        req.return_value.status_code = 403
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="https://arxiv.org/pdf/1704dasdsad738.pdf", tries=0)
        result = self.obj.process_pdf()
        self.assertEqual(result["skipped"], 1)
        obj = PDFDownloadQueue.objects.first()
        self.assertEqual(obj.tries, 1)
        self.assertIsNotNone(obj.last_try)

    @patch("weaver.PDFDownloader.requests.get")
    def test_404(self, req):
        req.return_value.status_code = 404
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="https://arxiv.org/pdf/1704dasdsad738.pdf", tries=0)
        result = self.obj.process_pdf()
        self.assertEqual(result["invalid"], 1)
        self.assertEqual(PDFDownloadQueue.objects.count(), 0)

    @patch("weaver.PDFDownloader.requests.get")
    def test_false(self, req):
        req.return_value.ok = False
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="https://arxiv.org/pdf/1704dasdsad738.pdf", tries=0)
        result = self.obj.process_pdf()
        self.assertEqual(result["skipped"], 1)
        obj = PDFDownloadQueue.objects.first()
        self.assertEqual(obj.tries, 1)
        self.assertIsNotNone(obj.last_try)

    def test_connection_error(self):
        self.assertEqual(PDFDownloadQueue.objects.count(), 3)
        self.obj.parse_references.side_effect = ConnectionError()
        result = self.obj.process_pdf()
        self.assertEqual(PDFDownloadQueue.objects.count(), 3)
        self.assertEqual(result["skipped"], 1)

    def test_error(self):
        self.obj.limit = 1
        self.obj.parse_references.side_effect = Exception()
        result = self.obj.process_pdf()
        self.assertEqual(PDFDownloadQueue.objects.count(), 3)
        self.assertEqual(result["skipped"], 1)

    def test_regression(self):
        file_path = os.path.join(path, "0704.0251.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression1(self):
        file_path = os.path.join(path, "reg1.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression2(self):
        file_path = os.path.join(path, "reg2.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression3(self):
        file_path = os.path.join(path, "reg3.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression4(self):
        file_path = os.path.join(path, "reg4.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression5(self):
        file_path = os.path.join(path, "reg5.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression6(self):
        file_path = os.path.join(path, "reg6.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression7(self):
        file_path = os.path.join(path, "reg7.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression8(self):
        file_path = os.path.join(path, "reg8.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression9(self):
        file_path = os.path.join(path, "reg9.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression10(self):
        file_path = os.path.join(path, "reg10.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    @patch("weaver.PDFDownloader.requests.get")
    def test_pdf_downloader_regression1(self, req):
        req.return_value.status_code = 403
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC4656104&blobtype=pdf",
            tries=0)
        result = self.obj.process_pdf(user_agent='Mozilla/5.0')
        self.assertEqual(result["skipped"], 0)
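# The HTTP-level tests above all share one pattern: @patch("weaver.PDFDownloader.requests.get")
# swaps out requests.get in the module under test, and attributes set on
# req.return_value shape the fake response process_pdf sees, so no network
# access ever happens. A self-contained sketch of that pattern; fetch_status,
# demo_mocked_requests_get, and the URL are illustrative, not part of the
# real code.
from unittest.mock import patch

import requests


def fetch_status(url):
    # Stand-in for the code under test; it calls requests.get the same way
    # process_pdf does.
    return requests.get(url).status_code


def demo_mocked_requests_get():
    with patch("requests.get") as req:
        req.return_value.status_code = 403  # every call now sees a fake 403 response
        assert fetch_status("https://example.org/some.pdf") == 403
        req.assert_called_once()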