def setUp(self):
    """Build a PosPackage and attach the parsed sample PoS record to it."""
    self.pos = PosPackage()
    # The sample record ships as package data of 'harvestingkit.tests'.
    relative_path = os.path.join('data', 'sample_pos_record.xml')
    sample_filepath = pkg_resources.resource_filename(
        'harvestingkit.tests', relative_path)
    self.pos.document = parse(sample_filepath)
class POSPackageTests(unittest.TestCase):

    """Unit tests for the field extractors of PosPackage."""

    def setUp(self):
        """Build a PosPackage and attach the parsed sample PoS record."""
        self.pos = PosPackage()
        relative_path = os.path.join('data', 'sample_pos_record.xml')
        sample_filepath = pkg_resources.resource_filename(
            'harvestingkit.tests', relative_path)
        self.pos.document = parse(sample_filepath)

    def test_authors(self):
        """Check the extracted (author, affiliations) pairs."""
        authors = self.pos._get_authors()
        self.assertEqual(
            authors,
            [('El-Khadra, Aida', ['INFN and Università di Firenze'])])

    def test_language(self):
        """Check the extracted language."""
        language = self.pos._get_language()
        self.assertEqual(language, 'en')

    def test_publisher(self):
        """Check the extracted publisher."""
        publisher = self.pos._get_publisher()
        self.assertEqual(publisher, 'SISSA')

    def test_date(self):
        """Check the extracted date."""
        date = self.pos._get_date()
        self.assertEqual(date, '2014-03-19')

    def test_title(self):
        """Check the extracted title."""
        title = self.pos._get_title()
        self.assertEqual(title, 'Heavy Flavour Physics Review')

    def test_copyright(self):
        """Check the extracted copyright statement."""
        copyright_statement = self.pos._get_copyright()
        self.assertEqual(copyright_statement, 'CC-BY-NC-SA')

    def test_subject(self):
        """Check the extracted subject."""
        subject = self.pos._get_subject()
        self.assertEqual(subject, 'Lattice Field Theory')

    def test_identifier(self):
        """Check the extracted OAI identifier."""
        identifier = self.pos.get_identifier()
        self.assertEqual(identifier, 'oai:pos.sissa.it:LATTICE 2013/001')

    def test_record(self):
        """Check that the generated record has non-empty 100 and 980 fields."""
        record = self.pos.get_record(self.pos.document)
        for tag in ("100", "980"):
            self.assertTrue(record[tag])
class POSPackageTests(unittest.TestCase):

    """Unit tests for the field extractors of PosPackage."""

    def setUp(self):
        """Build a PosPackage and attach the parsed PoS test record."""
        self.pos = PosPackage()
        record_path = join(dirname(folder), pos_test_record)
        self.pos.document = parse(record_path)

    def test_authors(self):
        """Check the extracted author names."""
        authors = self.pos._get_authors()
        self.assertEqual(authors, ['El-Khadra, Aida', 'Johnson, A.T.'])

    def test_language(self):
        """Check the extracted language."""
        language = self.pos._get_language()
        self.assertEqual(language, 'en')

    def test_publisher(self):
        """Check the extracted publisher."""
        publisher = self.pos._get_publisher()
        self.assertEqual(publisher, 'SISSA')

    def test_date(self):
        """Check the extracted date."""
        date = self.pos._get_date()
        self.assertEqual(date, '2014-03-19')

    def test_copyright(self):
        """Check the extracted copyright statement."""
        copyright_statement = self.pos._get_copyright()
        self.assertEqual(copyright_statement, 'CC-BY-NC-SA')

    def test_subject(self):
        """Check the extracted subject."""
        subject = self.pos._get_subject()
        self.assertEqual(subject, 'Lattice Field Theory')

    def test_identifier(self):
        """Check the extracted OAI identifier."""
        identifier = self.pos.get_identifier()
        self.assertEqual(identifier, 'oai:pos.sissa.it:LATTICE 2013/001')
def main(args):
    """Split a harvested PoS file into insert/append/error record sets.

    For every ``record`` element of the input XML the function queries the
    system for an existing match, looks for a PDF link on the PoS page of
    the contribution, downloads it, uploads the record XML via FTP and
    finally writes the collected records to files and mails a summary.

    :param args: list holding exactly one item, the input filename.
    :raises Exception: when ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        # The third ':'-separated part of the identifier is
        # "<conference>/<contribution>".
        series = identifier.split(':')[2].split('/')
        conference = series[0]
        contribution = series[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if not url.endswith('.pdf'):
                continue
            found = True
            if results:
                # A match already exists: start from an empty record so
                # that only the fulltext fields (and 001) get appended.
                rec = create_record()
            filename = join(out_folder, identifier + ".pdf")
            record_add_field(rec, '856', ind1='4',
                             subfields=[('u', url), ('y', 'PoS server')])
            record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                    ('t', 'PoS'),
                                                    ('d', 'Fulltext')])
            try:
                print('Downloading ' + url)
                download_url(url, "pdf", filename, 5, 60.0)
                if results:
                    recid = results[0]
                    record_add_field(rec, '001', controlfield_value=recid)
                    append_records.append(rec)
                else:
                    insert_records.append(rec)
            except InvenioFileDownloadError:
                # NOTE(review): a failed download leaves the record in none
                # of the three lists -- confirm this is intended.
                print("Download of %s failed" % (url, ))
            # Only the first PDF link is considered.
            break
        if not found:
            error_records.append(rec)

        # upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution, )
        with open(tempfile_path, 'w') as temp_file:
            temp_file.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server"
                          % tempfile_path)
        except Exception:
            # Best effort: a failed upload must not abort the whole run.
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        finally:
            remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        # Copy into out_folder like the other two files; the path reported
        # below points there.
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    # NOTE(review): the line breaks of this template were reconstructed from
    # whitespace-collapsed text -- confirm the exact layout.
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
def main(args):
    """Split a harvested PoS file into insert/append/error record sets.

    Each ``record`` element of the input XML is matched against the system,
    its PoS page is scanned for a PDF link, the PDF is downloaded and the
    record XML is uploaded via FTP. The collected records are then written
    to files and a summary is mailed.

    :param args: list holding exactly one item, the input filename.
    :raises Exception: when ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        # The third ':'-separated part of the identifier is
        # "<conference>/<contribution>".
        series = identifier.split(':')[2].split('/')
        conference = series[0]
        contribution = series[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if not url.endswith('.pdf'):
                continue
            found = True
            if results:
                # A match already exists: start from an empty record so
                # that only the fulltext fields (and 001) get appended.
                rec = create_record()
            filename = join(out_folder, identifier + ".pdf")
            record_add_field(rec, '856', ind1='4', subfields=[
                ('u', url),
                ('y', 'PoS server')
            ])
            record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                    ('t', 'PoS'),
                                                    ('d', 'Fulltext')])
            try:
                print('Downloading ' + url)
                download_url(url, "pdf", filename, 5, 60.0)
                if results:
                    recid = results[0]
                    record_add_field(rec, '001', controlfield_value=recid)
                    append_records.append(rec)
                else:
                    insert_records.append(rec)
            except InvenioFileDownloadError:
                # NOTE(review): a failed download leaves the record in none
                # of the three lists -- confirm this is intended.
                print("Download of %s failed" % (url,))
            # Only the first PDF link is considered.
            break
        if not found:
            error_records.append(rec)

        # upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as temp_file:
            temp_file.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server"
                          % tempfile_path)
        except Exception:
            # Best effort: a failed upload must not abort the whole run.
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        finally:
            remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        # Copy into out_folder like the other two files; the path reported
        # below points there.
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = (len(append_records) + len(insert_records)
                     + len(error_records))
    subject = ("PoS Harvest results: "
               + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # NOTE(review): the line breaks of this template were reconstructed from
    # whitespace-collapsed text -- confirm the exact layout.
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
        (total_records,
         len(insert_records),
         len(append_records),
         len(error_records),
         "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def main(args):
    """Split a harvested PoS file into insert/append/error record sets.

    Each ``record`` element of the input XML is matched against the system,
    its PoS page is scanned for a PDF link and the PDF is downloaded. The
    collected records are then written to files and a summary is printed
    and mailed.

    :param args: list holding exactly one item, the input filename.
    :raises Exception: when ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName("record"):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        # The third ':'-separated part of the identifier is
        # "<conference>/<contribution>".
        series = identifier.split(":")[2].split("/")
        conference = series[0]
        contribution = series[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % (
            conference.replace(" ", ""), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll("a")
        found = False

        for link in links:
            url = urllib.quote(link["href"], safe=":/")
            if not url.endswith(".pdf"):
                continue
            found = True
            if results:
                # A match already exists: start from an empty record so
                # that only the fulltext fields (and 001) get appended.
                rec = {}
            filename = join(out_folder, identifier + ".pdf")
            record_add_field(rec, "856", ind1="4",
                             subfields=[("u", url), ("y", "Fulltext")])
            record_add_field(rec, "FFT", subfields=[("a", filename),
                                                    ("t", "PoS"),
                                                    ("d", "Fulltext")])
            try:
                print("Downloading " + url)
                download_url(url, "pdf", filename, 5, 60.0)
                if results:
                    recid = results[0]
                    record_add_field(rec, "001", controlfield_value=recid)
                    append_records.append(rec)
                else:
                    insert_records.append(rec)
            except InvenioFileDownloadError:
                # NOTE(review): a failed download leaves the record in none
                # of the three lists -- confirm this is intended.
                print("Download of %s failed" % (url,))
            # Only the first PDF link is considered.
            break
        if not found:
            error_records.append(rec)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        # Copy into out_folder like the other two files; the path reported
        # below points there.
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = (len(append_records) + len(insert_records)
                     + len(error_records))
    subject = ("PoS Harvest results: "
               + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # NOTE(review): the line breaks of this template were reconstructed from
    # whitespace-collapsed text -- confirm the exact layout.
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % (
        total_records,
        len(insert_records),
        len(append_records),
        len(error_records),
        "\n".join(created_files),
    )
    print(subject)
    print(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def setUp(self):
    """Build a PosPackage and attach the parsed PoS test record to it."""
    self.pos = PosPackage()
    record_path = join(dirname(folder), pos_test_record)
    self.pos.document = parse(record_path)