Example #1
0
    def test_maybe_convert_dedupe_urls(self):
        """A URL is converted once; later responses for it yield None."""
        first = warc_record(warc_response('foo', 'http://foo'))
        first_row = maybe_convert(first, 'foo')
        self.assertEqual(BIGQUERY_JSON, first_row)

        # Same URL with a different body: must be dropped as a duplicate.
        duplicate = warc_record(warc_response('bar', 'http://foo'))
        duplicate_row = maybe_convert(duplicate, 'foo')
        self.assertIsNone(duplicate_row)
Example #2
0
    def test_other_domains(self):
        """URLs on the domain or a subdomain convert; other domains do not."""
        kept_urls = (
            'http://foo.com/1',
            'http://sub.foo.com/2',
            'http://foo.com:80/3',
        )
        for url in kept_urls:
            with self.subTest(url=url):
                record = warc_record(warc_response('X', url))
                self.assertIsNotNone(maybe_convert(record, 'foo.com'))

        # A response from an unrelated domain is discarded.
        foreign = warc_record(warc_response('3', 'http://bar.com/3'))
        self.assertIsNone(maybe_convert(foreign, 'foo.com'))
Example #3
0
 def test_mf2py_crash_lxml_and_html5lib(self, _):
     """Conversion still returns a row, with empty mf2 fields.

     See https://github.com/tommorris/mf2py/issues/78 .

     NOTE(review): the unused second argument is presumably a mock injected
     by a patch decorator that sits outside this view -- confirm.
     """
     record = warc_record(warc_response('X', 'http://foo'))
     row = maybe_convert(record, 'foo')

     self.assertEqual('{}', row['mf2'])
     for field in ('mf2_classes', 'u_urls', 'rels'):
         self.assertEqual([], row[field])
Example #4
0
    def test_composite_u_url(self):
        """u-url and h-entry on the same tag results in a composite url property.

    Make sure we handle that and extract out the string url inside.

    Parsed mf2:

    {"items": [{
      "type": ["h-feed"],
      "properties": {
        "url": [{
            "type": ["h-entry"],
            "properties": {
              "summary": ["Two fresh data sets"],
              "name": ["Two fresh data sets"],
              "url": ["../X1Ya.html"]
            },
            "value": "../X1Ya.html"
          }
        ],
        ...
    }}]}
    """
        record = warc_record(
            warc_response(
                """\
<div class="h-feed">
  <a class="h-entry u-url" href="../X1Ya.html">
    <p class="p-summary">Two fresh data sets</p>
  </a>
</div>
""", 'http://foo'))
        self.assertEqual(['http://foo/X1Ya.html'],
                         maybe_convert(record, 'foo')['u_urls'])
Example #5
0
 def test_rels(self):
     """The 'rels' field lists each rel value with the URLs that carry it.

     Each case is an (HTML body, expected 'rels' value) pair. A multi-valued
     rel attribute contributes one entry per value, and URLs from several
     tags sharing a rel value are merged into that value's 'urls' list.
     """
     cases = (
         ('', []),
         ('<link rel="foo" href="http://x">', [{
             'value': 'foo',
             'urls': ['http://x']
         }]),
         ('<link rel="foo bar" href="http://x">', [{
             'value': 'foo',
             'urls': ['http://x']
         }, {
             'value': 'bar',
             'urls': ['http://x']
         }]),
         ('<link rel="foo" href="http://x"> <link rel="bar foo" href="http://y">',
          [{
              'value': 'foo',
              'urls': ['http://x', 'http://y']
          }, {
              'value': 'bar',
              'urls': ['http://y']
          }]),
     )
     for i, (content, expected) in enumerate(cases):
         # Convert inside the subTest so that a crash or a None result is
         # reported against this case and the remaining cases still run.
         with self.subTest(content=content):
             # Each case gets its own URL: maybe_convert only converts the
             # first response it sees for a given URL.
             record = warc_record(warc_response(content, 'http://foo/%s' % i))
             actual = maybe_convert(record, 'foo')['rels']
             self.assertEqual(expected, actual)
Example #6
0
 def test_max_row_size(self):
     """Oversized rows carry MAX_ROW_MESSAGE instead of their html and mf2.

     Discovered by http://www.downes.ca/research_authors.htm , a ~45MB single
     HTML file with a bunch of big embedded data: URIs.

     NOTE(review): the body used here is tiny, so the size limit is
     presumably lowered by a patch decorator outside this view -- confirm.
     """
     record = warc_record(warc_response('X', 'http://foo'))
     row = maybe_convert(record, 'foo')

     placeholder = warc_to_bigquery.MAX_ROW_MESSAGE
     self.assertEqual(placeholder, row['html'])
     self.assertEqual({placeholder: None}, json.loads(row['mf2']))
Example #7
0
    def test_utf8_url_and_html(self):
        """Non-ASCII text in the URL and the body survives conversion."""
        url = 'http://foo/☕/post'
        body = 'Charles ☕ Foo'

        # First pass: call maybe_convert directly on a parsed record.
        direct_record = warc_record(warc_response(body, url) + '\r\n\r\n')
        direct_row = maybe_convert(direct_record, 'foo')
        self.assertIn(body, direct_row['html'])

        # Second pass: go through _run_main, using a different URL.
        main_url = url + '/1'
        responses = (warc_response(body, main_url) + '\r\n\r\n', )
        main_row = self._run_main(responses)
        self.assertEqual(main_url, main_row['url'])
        self.assertIn(body, main_row['html'])
Example #8
0
    def test_links(self):
        record = warc_record(
            warc_response(
                """\
foo <a href="#frag"></a>
bar <a class="x" rel="a b" href="/local">bar</a>
baz <a class="y u-in-reply-to" href="http://ext/ernal">baz</a>
baj <a class="u-repost-of z" href="http://ext/ernal"><img src="/baj"></a>
baj <link rel="c" class="w" href="http://link/tag" />
biff <a rel="c" class="w" />  <!-- no hrefs, these should be ignored -->
biff <a rel="c" class="w" href="" />
""",
                'http://foo',
                html_head='<link rel="d e" href="https://head/link">'))

        self.assertEqual([{
            'url': 'https://head/link',
            'inner_html': '',
            'tag': 'link',
            'rels': ['d', 'e'],
            'classes': [],
        }, {
            'url': 'http://link/tag',
            'inner_html': '',
            'tag': 'link',
            'rels': ['c'],
            'classes': ['w'],
        }, {
            'url': '#frag',
            'inner_html': '',
            'tag': 'a',
            'rels': [],
            'classes': [],
        }, {
            'url': '/local',
            'inner_html': 'bar',
            'tag': 'a',
            'rels': ['a', 'b'],
            'classes': ['x'],
        }, {
            'url': 'http://ext/ernal',
            'inner_html': 'baz',
            'tag': 'a',
            'rels': [],
            'classes': ['y', 'u-in-reply-to'],
        }, {
            'url': 'http://ext/ernal',
            'inner_html': '<img src="/baj"/>',
            'tag': 'a',
            'rels': [],
            'classes': ['u-repost-of', 'z'],
        }],
                         maybe_convert(record, 'foo')['links'])
Example #9
0
    def test_maybe_convert(self):
        """A response record converts to the expected BigQuery row."""
        foo_row = maybe_convert(
            warc_record(warc_response('foo', 'http://foo')), 'foo')
        self.assertEqual(BIGQUERY_JSON, foo_row)

        # Second response: different domain, URL and body, plus one extra
        # header that is expected to be appended to the baseline headers.
        bar_response = warc_response(
            'bar', 'http://bar', extra_headers={'XYZ': 'Baz'})
        bar_row = maybe_convert(warc_record(bar_response), 'bar')

        # Work on a deep copy so the shared BIGQUERY_JSON fixture is untouched.
        expected = copy.deepcopy(BIGQUERY_JSON)
        extra_header = {'name': 'XYZ', 'value': 'Baz'}
        expected['headers'] = expected['headers'] + [extra_header]
        expected['domain'] = 'bar'
        expected['url'] = 'http://bar'
        expected['html'] = HTML % ('', 'bar')

        self.assertEqual(expected, bar_row)
Example #10
0
    def test_max_links(self):
        """Discovered by pages on werd.io with lots of spam, e.g.
    http://werd.io/2014/why-cant-you-comment-on-this-post-indieweb ,
    before Ben cleaned them up."""
        got = maybe_convert(
            warc_record(
                warc_response(
                    """\
<a href="http://one"></a>
<a href="http://two"></a>
<a href="http://three"></a>
""", 'http://foo')), 'foo')
        self.assertEqual(2, len(got['links']))
Example #11
0
 def test_url_blacklist(self):
     """Responses for blacklisted URLs are not converted.

     maybe_convert must return None for each of the paths below: share,
     like and reply-to-comment query strings, and a wp-login.php redirect.
     """
     blacklisted_paths = (
         '/foo/bar?shared=email&x',
         '/?share=facebook',
         '/?x&share=tumblr',
         '/?like_comment=123',
         '/?x&replytocom=456',
         '/wp-login.php?redirect_to=qwert',
     )
     for path in blacklisted_paths:
         # One subTest per path so a failure names the offending URL and the
         # remaining paths are still checked.
         with self.subTest(path=path):
             record = warc_record(warc_response('', 'http://foo%s' % path))
             self.assertIsNone(maybe_convert(record, 'foo'))
Example #12
0
 def test_microformats(self):
     """The 'mf2_classes' field lists the microformats2 types on the page.

     Each case is an (HTML body, expected 'mf2_classes' value) pair. The
     expected lists are de-duplicated and in alphabetical order, and they
     include types found on nested items.
     """
     cases = (
         ('', []),
         ('foo', []),
         ('<div class="h-entry"></div>', ['h-entry']),
         ('<div class="h-entry">1</div> <div class="h-entry">2</div>',
          ['h-entry']),
         ('<div class="h-entry h-card"></div>', ['h-card', 'h-entry']),
         ('<div class="h-feed"><div class="h-entry"><div class="h-card">'
          '</div></div></div> <div class="h-adr"></div>',
          ['h-adr', 'h-card', 'h-entry', 'h-feed']),
         # microformats1 backward compatibility
         ('<div class="hentry"></div>', ['h-entry']),
     )
     for i, (content, expected) in enumerate(cases):
         # Convert inside the subTest so that a crash or a None result is
         # reported against this case and the remaining cases still run.
         with self.subTest(content=content):
             # Each case gets its own URL: maybe_convert only converts the
             # first response it sees for a given URL.
             record = warc_record(warc_response(content, 'http://foo/%s' % i))
             actual = maybe_convert(record, 'foo')['mf2_classes']
             self.assertEqual(expected, actual)
Example #13
0
 def test_u_urls(self):
     """The 'u_urls' field lists the u-url values of top-level items.

     Each case is an (HTML body, expected 'u_urls' value) pair. A u-url on
     an h-entry nested inside an h-feed is expected to be left out, and a
     microformats1 rel="bookmark" link inside an hentry counts as a u-url.
     """
     cases = (
         ('', []),
         ('foo', []),
         ('<div class="h-entry"></div>', []),
         ('<div class="h-entry"><a class="u-url" href="http://foo" /></div>',
          ['http://foo']),
         ('<div class="h-entry"><a class="u-url" href="http://foo" /></div>'
          '<div class="h-entry"><a class="u-url" href="http://bar" /></div>',
          ['http://foo', 'http://bar']),
         ('<div class="h-feed"><div class="h-entry">'
          '<a class="u-url" href="http://foo" /></div></div>', []),
         # microformats1 backward compatibility
         # http://microformats.org/wiki/rel-bookmark#rel.3D.22bookmark.22
         ('<div class="hentry"><a rel="bookmark" href="http://baz" /></div>',
          ['http://baz']),
     )
     for i, (content, expected) in enumerate(cases):
         # Convert inside the subTest so that a crash or a None result is
         # reported against this case and the remaining cases still run.
         with self.subTest(content=content):
             # Each case gets its own URL: maybe_convert only converts the
             # first response it sees for a given URL.
             record = warc_record(warc_response(content, 'http://foo/%s' % i))
             actual = maybe_convert(record, 'foo')['u_urls']
             self.assertEqual(expected, actual)
Example #14
0
 def test_mf2py_crash_lxml(self, _):
     """Conversion still produces the mf2 JSON '{"x": "y"}'.

     See https://github.com/tommorris/mf2py/issues/78 .

     NOTE(review): the unused second argument is presumably a mock injected
     by a patch decorator that sits outside this view -- confirm.
     """
     record = warc_record(warc_response('X', 'http://foo'))
     row = maybe_convert(record, 'foo')
     self.assertEqual('{"x": "y"}', row['mf2'])
Example #15
0
 def test_maybe_convert_not_response(self):
     """The header, metadata and request fixtures all convert to None."""
     header_row = maybe_convert(WARC_HEADER_RECORD, 'foo')
     self.assertIsNone(header_row)

     metadata_row = maybe_convert(WARC_METADATA_RECORD, 'foo')
     self.assertIsNone(metadata_row)

     request_row = maybe_convert(WARC_REQUEST_RECORD, 'foo')
     self.assertIsNone(request_row)