Example #1
    def test_success(self):
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(self.file_path, self.source)
        self.assertEqual(SingleReference.objects.count(), 5)
        ref1 = SingleReference.objects.get(id=1).test()
        ref2 = SingleReference.objects.get(id=2).test()
        ref3 = SingleReference.objects.get(id=3).test()
        ref4 = SingleReference.objects.get(id=4).test()
        ref5 = SingleReference.objects.get(id=5).test()
        self.assertEqual(ref1, [
            1, 'Retrieving metadata for your local scholarly papers',
            datetime.date(2009, 1, 1), 'D. Aumueller', 'OP'
        ])

        self.assertEqual(ref2, [
            1, 'Introducing Mr. DLib, a Machine-readable Digital Library',
            datetime.date(2011, 1, 1),
            'J. Beel;B. Gipp;S. Langer;M. Genzmehr;E. Wilde;A. Nürnberger;J. Pitman',
            'OP'
        ])
        self.assertEqual(ref3, [
            1,
            'SciPlore Xtract: Extracting Titles from Scientific PDF Documents by Analyzing Style Information (Font Size)',
            datetime.date(2010, 1, 1),
            'J. Beel;B. Gipp;A. Shaker;N. Friedrich', 'OP'
        ])
Example #2
def create_clusters():
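    # Walk ref.log: each SOURCE/REF line carries a hex id and a title; ids that also
    # appear in match_list get a local_url plus an OpenReferences row, and their PDFs
    # are parsed afterwards.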
    gurl = global_url.objects.get(id=1)
    open_ref_list = {}
    with open(os.path.join(file_path, 'ref.log'), encoding="utf8") as f:
        for line in f:
            if 'SOURCE' in line:
                regex = r'SOURCE (.*?):'
            else:
                regex = r'REF (.*?):'
            try:
                id_match = re.findall(regex, line)[0]
                title = line[line.index(":") + 1:]
            except (IndexError, ValueError):
                # skip lines without an id match or a colon-separated title
                continue

            # normalize the title and convert the 8-byte hex id to an int
            normal_title = normalize_title(title)
            normal_id = int(id_match, 16)
            # insert into cluster
            #cluster.objects.get_or_create(id= normal_id,defaults={"name":normal_title})
            # create local urls for matching titles

            if id_match in match_list:
                index = match_list.index(id_match)
                lurl, tmp = local_url.objects.get_or_create(
                    id=normal_id, global_url=gurl, url=id_match)
                # create open reference
                opref, tmp = OpenReferences.objects.get_or_create(
                    id=normal_id, ingester_key=lurl,
                    source_table=666, source_key=id_match)
                open_ref_list[id_match] = opref

    # creates single references directly from pdf:
    logger = logging.getLogger("PDFDownloader")
    logger.setLevel(logging.INFO)
    # create the logging file handler
    log_file = os.path.join(file_path, "pdf_downloader.log")
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    # add handler to logger object
    logger.addHandler(fh)
    # run actual task
    tmp = get_config("FOLDERS")["tmp"]
    grobid = get_config("WEAVER")["grobid"]
    limit = 20  # hard-coded for this run; set to None to fall back to the configured pdf_limit
    if limit is None:
        limit = int(get_config("WEAVER")["pdf_limit"])
    obj = PdfDownloader(tmp, grobid, logger=logger, limit=limit)
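    # parse the references of every matched evaluation PDF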
    for element in match_list:
        pdf_path = os.path.join("C:\\Users\\anhtu\\Google Drive\\Informatik 2016\\Related Work\\evaluation",
                                "{}.pdf".format(element))
        obj.parse_references(pdf_path, open_ref_list[element])
Example #3
    def setUp(self):
        self.grobid_url = "http://localhost:8080/processReferences"
        self.source = OpenReferences.objects.create(source_table="0",
                                                    source_key="AAAAA")
        self.output_folder = "C:\\Users\\anhtu\\Desktop\\pdf"

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # create a stream handler that logs to stdout
        fh = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter('%(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        # add handler to logger object
        self.logger.addHandler(fh)

        self.obj = PdfDownloader(self.output_folder,
                                 self.grobid_url,
                                 logger=self.logger)
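        # stub out parse_references so these tests exercise only the download and queue logic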
        self.obj.parse_references = MagicMock(return_value=None)
        # delete all files in test folder
        for the_file in os.listdir(self.output_folder):
            file_path = os.path.join(self.output_folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)

        self.source = OpenReferences.objects.create(source_table=0,
                                                    source_key="AAAAA")

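        # preload the download queue with three arXiv PDFs for the tests to process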
        PDFDownloadQueue.objects.bulk_create([
            PDFDownloadQueue(url="https://arxiv.org/pdf/1704.03738.pdf",
                             tries=0,
                             source=self.source),
            PDFDownloadQueue(url="https://arxiv.org/pdf/1704.03732.pdf",
                             tries=0,
                             source=self.source),
            PDFDownloadQueue(url="https://arxiv.org/pdf/1704.03723.pdf",
                             tries=0,
                             source=self.source),
        ])
Example #4
    def test_get_reference_no_analytics_no_date(self):
        tei_doc = {
            "monogr": {
                "title": "title",
                "authors": ["A. And; B. And"],
                "pubyear": None
            }
        }
        x = PdfDownloader.get_reference(tei_doc)
        self.assertEqual(x, {
            "title": "title",
            "authors": ["A. And; B. And"],
            "pubyear": None
        })
Example #5
    def test_get_reference_no_analytics(self):
        tei_doc = {
            "monogr": {
                "title": "title",
                "authors": ["A. And; B. And"],
                "pubyear": "2001"
            }
        }
        x = PdfDownloader.get_reference(tei_doc)
        self.assertEqual(x, {
            "title": "title",
            "authors": ["A. And; B. And"],
            "pubyear": datetime.date(2001, 1, 1)
        })
Example #6
    def test_no_authors(self):
        tei_doc = {
            "monogr": {
                "title": "title",
                "authors": None,
                "pubyear": None
            },
            "analytic": {
                "title": "better title",
                "authors": [],
                "pubyear": "1999"
            }
        }
        x = PdfDownloader.get_reference(tei_doc)
        self.assertEqual(x, None)
Example #7
    def test_analytics(self):
        tei_doc = {
            "monogr": {
                "title": "title",
                "authors": None,
                "pubyear": None
            },
            "analytic": {
                "title": "better title",
                "authors": ["A. And; B. And"],
                "pubyear": "1999"
            }
        }
        x = PdfDownloader.get_reference(tei_doc)
        self.assertEqual(x, {
            "title": "better title",
            "authors": ["A. And; B. And"],
            "pubyear": datetime.date(1999, 1, 1)
        })
Example #8
    def test_invalid_file(self):
        x = PdfDownloader(path, self.grobid_url)
        file = os.path.join(path, "invalid.txt")
        self.assertRaises(GrobidException, x.parse_references, file,
                          self.source)
Example #9
    def test_invalid_path(self):
        x = PdfDownloader(path, self.grobid_url)
        self.assertRaises(GrobidException, x.parse_references, "nkjk",
                          self.source)
Example #10
    def test_regression10(self):
        file_path = os.path.join(path, "reg10.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)
Example #11
class TestPDFDownloader(TransactionTestCase):
    def setUp(self):
        self.grobid_url = "http://*****:*****@patch("weaver.PDFDownloader.requests.get")
    def test_403(self, req):
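        # a 403 response should leave the queue entry in place and bump its retry counter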
        req.return_value.status_code = 403
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="https://arxiv.org/pdf/1704dasdsad738.pdf", tries=0)
        result = self.obj.process_pdf()
        self.assertEqual(result["skipped"], 1)
        obj = PDFDownloadQueue.objects.first()
        self.assertEqual(obj.tries, 1)
        self.assertIsNotNone(obj.last_try)

    @patch("weaver.PDFDownloader.requests.get")
    def test_404(self, req):
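        # a 404 response should be counted as invalid and drop the entry from the queue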
        req.return_value.status_code = 404
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="https://arxiv.org/pdf/1704dasdsad738.pdf", tries=0)
        result = self.obj.process_pdf()
        self.assertEqual(result["invalid"], 1)
        self.assertEqual(PDFDownloadQueue.objects.count(), 0)

    @patch("weaver.PDFDownloader.requests.get")
    def test_false(self, req):
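        # any other non-OK response should be skipped and scheduled for a retry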
        req.return_value.ok = False
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url="https://arxiv.org/pdf/1704dasdsad738.pdf", tries=0)
        result = self.obj.process_pdf()
        self.assertEqual(result["skipped"], 1)
        obj = PDFDownloadQueue.objects.first()
        self.assertEqual(obj.tries, 1)
        self.assertIsNotNone(obj.last_try)

    def test_connection_error(self):
        self.assertEqual(PDFDownloadQueue.objects.count(), 3)
        self.obj.parse_references.side_effect = ConnectionError()
        result = self.obj.process_pdf()
        self.assertEqual(PDFDownloadQueue.objects.count(), 3)
        self.assertEqual(result["skipped"], 1)

    def test_error(self):
        self.obj.limit = 1
        self.obj.parse_references.side_effect = Exception()
        result = self.obj.process_pdf()
        self.assertEqual(PDFDownloadQueue.objects.count(), 3)
        self.assertEqual(result["skipped"], 1)

    def test_regression(self):
        file_path = os.path.join(path, "0704.0251.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression1(self):
        file_path = os.path.join(path, "reg1.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression2(self):
        file_path = os.path.join(path, "reg2.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression3(self):
        file_path = os.path.join(path, "reg3.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression4(self):
        file_path = os.path.join(path, "reg4.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression5(self):
        file_path = os.path.join(path, "reg5.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression6(self):
        file_path = os.path.join(path, "reg6.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression7(self):
        file_path = os.path.join(path, "reg7.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression8(self):
        file_path = os.path.join(path, "reg8.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression9(self):
        file_path = os.path.join(path, "reg9.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    def test_regression10(self):
        file_path = os.path.join(path, "reg10.pdf")
        x = PdfDownloader(path, self.grobid_url)
        x.parse_references(file_path, self.source)

    @patch("weaver.PDFDownloader.requests.get")
    def test_pdf_downloader_regression1(self, req):
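        # regression: process_pdf with a custom User-Agent should report no skipped entries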
        req.return_value.status_code = 403
        PDFDownloadQueue.objects.all().delete()
        PDFDownloadQueue.objects.create(
            url=
            "http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC4656104&blobtype=pdf",
            tries=0)
        result = self.obj.process_pdf(user_agent='Mozilla/5.0')
        self.assertEqual(result["skipped"], 0)