def test_strip_unicode_replace(self):
    """Check that escaped U+FFFD sequences are stripped from the content.

    Each input contains pairs of the escaped replacement-character
    sequence (a literal backslash followed by ``ufffd``).  After running
    ``_extract_from_cdata`` and ``_strip_unicode_replace`` the content is
    expected to hold a single space where each pair was; in the first
    case a pair directly followed by a space is also expected to end up
    as a single space.
    """
    # (raw input, expected cleaned content)
    cases = [
        (
            'Test the Web Forward\\ufffd\\ufffdParis,\\ufffd\\ufffdBeijing'
            '\\ufffd\\ufffdand\\ufffd\\ufffd San Francisco',
            'Test the Web Forward Paris, Beijing and San Francisco',
        ),
        (
            'Con\\ufffd\\ufffdfu\\ufffd\\ufffdcius Insti\\ufffd\\ufffdtute',
            'Con fu cius Insti tute',
        ),
    ]

    for raw, expected in cases:
        rr = RawResponse('', raw, '')
        rr._extract_from_cdata()
        rr._strip_unicode_replace()
        # assertEqual (rather than assertTrue(a == b)) so a failure shows
        # both strings instead of just "False is not true".
        self.assertEqual(rr.content, expected)
def test_strip_cdata(self):
    """Check that a CDATA wrapper is removed, leaving only the inner XML.

    The input is a unicode string consisting of a single CDATA section;
    after ``_extract_from_cdata`` the response content is expected to be
    the markup that was inside the section.
    """
    wrapped = u'<![CDATA[<xml><node>Hi</node></xml>]]>'
    expected = '<xml><node>Hi</node></xml>'

    rr = RawResponse('', wrapped, '')
    rr._extract_from_cdata()

    # assertEqual (rather than assertTrue(a == b)) so a failure shows
    # both strings instead of just "False is not true".
    self.assertEqual(expected, rr.content)
# with open('testdata/probable_ogc.txt', 'r') as f: # digests = f.readlines() # responses = ['testdata/docs/response_%s.json' % d.strip() for d in digests] with open('testdata/second_harvest/priority_identification_all.csv', 'w') as f: f.write('digest|url|protocol|subtype|service|has dataset|has metadata|version|is error\n') for response in responses: with open(response, 'r') as f: data = json.loads(f.read()) digest = data['digest'] raw_content = data['raw_content'] url = data['url'] rr = RawResponse(url.upper(), raw_content, digest, **{}) cleaned_text = rr.clean_raw_content() cleaned_text = cleaned_text.strip() try: parser = Parser(cleaned_text) except Exception as ex: logger.debug('xml parsing error: %s' % digest, exc_info=1) continue print digest identifier = Identify(YAML_FILE, cleaned_text, url, **{'parser': parser, 'ignore_case': True}) identifier.identify() protocol = identifier.protocol subtype = identifier.subtype