# Check if published date is after threshold: if is_beyond_threshold_date(threshold_date, fulltext_file): # The published date is beyond the threshold, we continue msg = "Warning: Article published beyond threshold: %s" % \ (record.doi,) write_message(msg) yield record, msg continue else: write_message("OK. Record is below the threshold.", verbose=3) if add_metadata: from harvestingkit.aps_package import (ApsPackage, ApsPackageXMLError) # Generate Metadata,FFT and yield it aps = ApsPackage(journal_mappings) try: xml = aps.get_record(fulltext_file) record.add_metadata_by_string(xml) except ApsPackageXMLError, e: # This must be old-format XML write_message("Warning: old-style metadata detected for %s" % (fulltext_file)) # Remove any DTD info in the file before converting cleaned_fulltext_file = remove_dtd_information(fulltext_file) try: convert_xml_using_saxon(cleaned_fulltext_file, CFG_APSHARVEST_XSLT) # Conversion is a success. Let's derive location of converted file source_directory = os.path.dirname(cleaned_fulltext_file)
# Check if published date is after threshold: if is_beyond_threshold_date(parameters.get("threshold_date"), fulltext_file): # The published date is beyond the threshold, we continue msg = "Warning: Article published beyond threshold: %s" % \ (record.doi,) write_message(msg) yield record, msg continue else: write_message("OK. Record is below the threshold.", verbose=3) if parameters.get("metadata"): from harvestingkit.aps_package import (ApsPackage, ApsPackageXMLError) # Generate Metadata,FFT and yield it aps = ApsPackage(self.journal_mappings) try: xml = aps.get_record(fulltext_file) record.add_metadata_by_string(xml) except ApsPackageXMLError, e: # This must be old-format XML write_message("Warning: old-style metadata detected for %s" % (fulltext_file)) # Remove any DTD info in the file before converting cleaned_fulltext_file = remove_dtd_information(fulltext_file) try: convert_xml_using_saxon(cleaned_fulltext_file, CFG_APSHARVEST_XSLT) # Conversion is a success. Let's derive location of converted file source_directory = os.path.dirname(cleaned_fulltext_file)
def setUp(self):
    """Build the ``ApsPackage`` fixture and attach the parsed sample record.

    Creates an ``ApsPackage`` from the module-level ``journal_mappings``,
    stores it on ``self.aps``, then parses the file named by
    ``aps_test_record`` (resolved against the parent directory of
    ``folder``) and stores the result on the package's ``document``
    attribute.
    """
    package = ApsPackage(journal_mappings)
    # Expose the package on the test case before parsing, so the attribute
    # is already set even if loading the sample file fails.
    self.aps = package

    sample_path = join(dirname(folder), aps_test_record)
    # NOTE(review): `parse` is imported elsewhere in the file; presumably an
    # XML DOM parser such as xml.dom.minidom.parse -- confirm.
    package.document = parse(sample_path)