def test_strip_cdata(self):
        """Verify that ``_extract_from_cdata`` unwraps a CDATA section.

        The raw content is an XML fragment wrapped in ``<![CDATA[...]]>``;
        after extraction ``rr.content`` is expected to hold only the inner
        XML.
        """
        test_string = u'<![CDATA[<xml><node>Hi</node></xml>]]>'
        expected_string = '<xml><node>Hi</node></xml>'

        # Only the raw content (second argument) matters for this test; the
        # other two positional arguments are left empty.
        rr = RawResponse('', test_string, '', **{})
        rr._extract_from_cdata()

        # assertEqual reports both operands on failure, whereas
        # assertTrue(a == b) would only say "False is not true".
        self.assertEqual(expected_string, rr.content)
Beispiel #2
0
# NOTE(review): `responses` is iterated below, but its only visible
# definition is the commented-out code that follows -- confirm where the list
# of response file paths is actually built before running this script.
# with open('testdata/probable_ogc.txt', 'r') as f:
#     digests = f.readlines()
# responses = ['testdata/docs/response_%s.json' % d.strip() for d in digests]

# Start a fresh pipe-delimited output file that contains only the header row.
with open('testdata/second_harvest/priority_identification_all.csv', 'w') as f:
    f.write('digest|url|protocol|subtype|service|has dataset|has metadata|version|is error\n')

for response in responses:
    # Each response file is a JSON document; the loop reads its 'digest',
    # 'raw_content' and 'url' keys.
    with open(response, 'r') as f:
        data = json.load(f)

    digest = data['digest']
    raw_content = data['raw_content']
    url = data['url']

    # NOTE(review): the URL is upper-cased for RawResponse but passed
    # unchanged to Identify below -- confirm this asymmetry is intentional.
    rr = RawResponse(url.upper(), raw_content, digest, **{})
    cleaned_text = rr.clean_raw_content()
    cleaned_text = cleaned_text.strip()

    # Documents that cannot be parsed are logged (with traceback) and skipped
    # so that one bad response does not abort the whole run.
    try:
        parser = Parser(cleaned_text)
    except Exception:
        # Pass the digest as a logging argument so the message is only
        # formatted when debug logging is enabled.
        logger.debug('xml parsing error: %s', digest, exc_info=1)
        continue

    # Progress output; the parenthesised form prints the same text on
    # Python 2 and is also valid Python 3 syntax.
    print(digest)

    # Reuse the parser built above rather than letting Identify re-parse.
    identifier = Identify(YAML_FILE, cleaned_text, url, **{'parser': parser, 'ignore_case': True})
    identifier.identify()
    protocol = identifier.protocol
    subtype = identifier.subtype
    def test_strip_unicode_replace(self):
        """Verify that escaped replacement-character text becomes spaces.

        Each input contains the literal six-character sequence ``\\ufffd``
        (a backslash escape written out as text, not the character itself).
        After ``_extract_from_cdata`` and ``_strip_unicode_replace`` have run,
        every such sequence is expected to appear as a single space in
        ``rr.content``.
        """
        # (raw content, expected cleaned content) pairs.
        cases = (
            (
                '''Test the Web Forward\\ufffd\\ufffdParis,\\ufffd\\ufffdBeijing\\ufffd\\ufffdand\\ufffd\\ufffd San Francisco''',
                'Test the Web Forward  Paris,  Beijing  and   San Francisco',
            ),
            (
                'Con\\ufffd\\ufffdfu\\ufffd\\ufffdcius Insti\\ufffd\\ufffdtute',
                'Con  fu  cius Insti  tute',
            ),
        )

        for test_string, expected_string in cases:
            rr = RawResponse('', test_string, '', **{})
            rr._extract_from_cdata()
            rr._strip_unicode_replace()

            # assertEqual shows both strings on failure, which matters here
            # because the differences are runs of whitespace.
            self.assertEqual(expected_string, rr.content)