def test_eif(self):
    """each XML file in the JATS dir with a matching *complete* output in
    the EIF directory are equal"""

    def xml_fname_to_eif(xml_fname, xml_path):
        # expected EIF fixture shares the XML file's stem, with a .json extension
        return join(self.source_eif_dir, os.path.splitext(xml_fname)[0] + ".json")

    ddiffs = {}
    for xml_file, xml_path in self.xml_path_list.items():
        eif_file = xml_fname_to_eif(xml_file, xml_path)
        if not os.path.exists(eif_file):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_file)
            continue
        generated_eif = json.loads(feeds.scrape(xml_path, lambda x: x[0]['article'][0]))
        # use a context manager so the fixture file handle is closed promptly
        # (was: json.load(open(eif_file)), which leaks the handle)
        with open(eif_file) as fh:
            expected_eif = json.load(fh)
        LOG.info("testing %s", xml_path)
        ddiff = DeepDiff(self.byteify(expected_eif), self.byteify(generated_eif))
        if ddiff:
            ddiffs[eif_file] = ddiff
    if ddiffs:
        # report every differing fixture before failing, so one run shows all problems
        for attr, value in ddiffs.items():
            print(attr)
            pprint(value)
            print("\n")
        self.assertTrue(False)
def test_eif_partials(self):
    """each XML file in the JATS dir with a matching *partial* output in
    the EIF/partial directory are present and equal"""

    def xml_fname_to_eif_partial(xml_fname, xml_path):
        # partial fixtures share the XML file's stem, suffixed "-match.json"
        return join(self.source_partial_dir, os.path.splitext(xml_fname)[0] + "-match.json")

    ddiffs = {}
    for xml_file, xml_path in self.xml_path_list.items():
        eif_path = xml_fname_to_eif_partial(xml_file, xml_path)
        if not os.path.exists(eif_path):
            LOG.info('skipping %s, path `%s` not found', xml_file, eif_path)
            continue
        generated_eif = json.loads(feeds.scrape(xml_path, lambda x: x[0]['article'][0]))
        # a list of maps with keys 'description' and 'data'
        # (was: json.load(open(eif_path)), which leaks the file handle)
        with open(eif_path) as fh:
            eif_partial_tests = json.load(fh)
        for test in eif_partial_tests:
            # `in` replaces dict.has_key, which was removed in Python 3
            if 'description' not in test or 'data' not in test:
                LOG.debug('description or data elements not found in file %r, skipping', eif_path)
                continue
            desc, expected_eif = test['description'], test['data']
            for element, expected_partial_eif in expected_eif.items():
                if element not in generated_eif:
                    ddiff = "EIF generated from %r doesn't contain expected element %r (in partial file %r)"
                    ddiff = ddiff % (xml_path, element, eif_path)
                else:
                    ddiff = DeepDiff(self.byteify(expected_partial_eif), self.byteify(generated_eif[element]))
                if ddiff:
                    # group differences per fixture file, keyed by test description
                    ddiffs.setdefault(eif_path, {})[desc] = ddiff
    if ddiffs:
        for attr, values in ddiffs.items():
            print(attr)
            for desc, value in values.items():
                # print the str directly; .encode('utf-8') would emit bytes on Py3
                print(desc)
                pprint(value)
                print("\n")
        self.assertTrue(False)
def setUp(self):
    """Scrape every XML fixture in JATS/ and load the matching JSON
    reference from JSON/, filling self.results and self.references keyed
    by the reference filename."""
    self.results = {}
    self.references = {}
    self.mod = __import__("feeds")
    source_directory = 'JATS/'
    reference_directory = 'JSON/'
    for f in listdir(source_directory):
        # guard clause keeps the loop body flat; skip subdirectories etc.
        if not isfile(join(source_directory, f)):
            continue
        reference_file_name = f.replace('.xml', '.json')
        # join() instead of string concatenation for the path
        with open(join(source_directory, f), "r") as source_file:
            source_string = source_file.read()
        # a bit odd this but seems worthwhile round tripping to match actual results expected
        res = feeds.scrape(source_string, lambda x: x[0]['article'][0])
        self.results[reference_file_name] = json.loads(res)
        with open(join(reference_directory, reference_file_name), "r") as reference_file:
            # json.load reads the file object directly; no intermediate string
            self.references[reference_file_name] = json.load(reference_file)
def setUp(self):
    """Scrape every XML fixture in JATS/ and load the matching JSON
    reference from JSON/, filling self.results and self.references keyed
    by the reference filename."""
    self.results = {}
    self.references = {}
    self.mod = __import__("feeds")
    source_directory = 'JATS/'
    reference_directory = 'JSON/'
    for f in listdir(source_directory):
        # skip anything that is not a regular file (guard clause keeps body flat)
        if not isfile(join(source_directory, f)):
            continue
        reference_file_name = f.replace('.xml', '.json')
        # build paths with join() rather than string concatenation
        with open(join(source_directory, f), "r") as source_file:
            source_string = source_file.read()
        # a bit odd this but seems worthwhile round tripping to match actual results expected
        res = feeds.scrape(source_string, lambda x: x[0]['article'][0])
        self.results[reference_file_name] = json.loads(res)
        with open(join(reference_directory, reference_file_name), "r") as reference_file:
            # json.load consumes the file object directly instead of loads(read())
            self.references[reference_file_name] = json.load(reference_file)
def scrape(xml):
    """Scrape *xml* with feeds.scrape, extracting the first article
    element, and return the result."""
    return feeds.scrape(xml, lambda x: x[0]['article'][0])
def scrape(xml, article_version=None):
    """Scrape *xml* with feeds.scrape, extracting the first article
    element; *article_version* is passed through unchanged."""
    def first_article(struct):
        return struct[0]['article'][0]
    return feeds.scrape(xml, first_article, article_version)
import pystache


def render(fname, dat):
    """Read the mustache template at *fname*, render it with *dat*, and
    print the result to stdout."""
    # context manager closes the template file; the original leaked the handle
    with open(fname) as fh:
        template = fh.read()
    print(pystache.render(template, dat))


if __name__ == '__main__':
    import feeds
    entries = feeds.scrape(feeds.feeds)
    render(
        '/home/ygreif/learndjango/blur/crawler/index.tmpl', {
            # rows of three entries each, starting after the two featured slots
            'rows': [{'entries': entries[i:i + 3]}
                     for i in range(2, len(entries) - 6, 3)],
            'lead': entries[0],
            'second': entries[1],
            'title': 'The Discerning Whig'
        })