# digests = f.readlines()
# responses = ['testdata/docs/response_%s.json' % d.strip() for d in digests]

# Start the pipe-delimited identification report with its header row.
# NOTE(review): the original statements were collapsed onto a single line, so
# the nesting is reconstructed; the loop below is assumed to follow this
# `with` block rather than sit inside it -- confirm against the full script.
with open('testdata/second_harvest/priority_identification_all.csv', 'w') as f:
    f.write('digest|url|protocol|subtype|service|has dataset|has metadata|version|is error\n')

# Run identification over every stored response document. Each file is JSON
# from which the 'digest', 'raw_content' and 'url' keys are read.
for response in responses:
    # Use a distinct handle name so the report handle `f` is not rebound.
    with open(response, 'r') as response_file:
        data = json.load(response_file)

    digest = data['digest']
    raw_content = data['raw_content']
    url = data['url']

    # RawResponse receives the upper-cased URL, while Identify below gets the
    # URL exactly as stored.
    rr = RawResponse(url.upper(), raw_content, digest)
    cleaned_text = rr.clean_raw_content().strip()

    try:
        parser = Parser(cleaned_text)
    except Exception:
        # Best effort: a document that cannot be parsed is logged (with
        # traceback) and skipped rather than aborting the whole run.
        logger.debug('xml parsing error: %s', digest, exc_info=1)
        continue

    # Progress output; the call form prints the same on Python 2 and 3.
    print(digest)

    identifier = Identify(
        YAML_FILE,
        cleaned_text,
        url,
        parser=parser,
        ignore_case=True
    )
    identifier.identify()

    protocol = identifier.protocol
    subtype = identifier.subtype
    service = identifier.service