def record_page_links(page):
    """Store or refresh a Link row for every internal link in ``page``.

    The page content is scanned with ``extract_internal_links``, which
    yields ``(pagename, count)`` pairs.  For each pair:

    * if a Link from this page in this region already matches the slug
      (either through its own ``destination_slug`` or through the slug of
      its destination), its ``count`` is updated when it has changed;
    * otherwise a new Link is created, pointing at the page with that slug
      in the same region when one exists, or at no destination when not.
    """
    region = page.region
    for pagename, count in extract_internal_links(page.content).iteritems():
        slug = slugify(pagename)
        from_this_page = Link.objects.filter(source=page, region=region)
        matching = (from_this_page.filter(destination_slug=slug) |
                    from_this_page.filter(destination__slug=slug))

        if matching:
            link = matching[0]
            if link.count == count:
                # Same number of links to this name as before: nothing to
                # write.
                continue
            link.count = count
        else:
            candidates = Page.objects.filter(slug=slug, region=region)
            destination = candidates[0] if candidates else None

            # A link to the resolved page can already exist even though the
            # slug lookup above missed it (probably a script moving pages
            # between regions); don't create a second one.
            if destination and Link.objects.filter(
                    source=page, destination=destination).exists():
                continue

            link = Link(
                source=page,
                region=region,
                destination=destination,
                destination_name=pagename,
                destination_slug=slug,
                count=count,
            )
        link.save()
def test_simple_extraction(self):
    # One relative href must show up once, keyed by its target name.
    markup = """ <p>I love <a href="Parks">awesome parks</a>.</p> """
    found = extract_internal_links(markup)
    self.assertTrue('Parks' in found)
    self.assertEqual(found['Parks'], 1)
def forwards(self, orm):
    """Populate Link rows from the internal links of every existing page.

    Each page's content is scanned with ``extract_internal_links``.  For
    every linked page name found:

    * when the name resolves to a page in the same region and a link from
      this page to that destination already exists, nothing is written;
    * when a link from this page with the same name (case-insensitive)
      already exists, its destination is filled in if the target page is
      known;
    * otherwise a new link is created (with no destination if the target
      page does not exist).

    :param orm: model registry supplied by the migration runner; models
        are looked up as ``orm['pages.Page']`` and ``orm.Link``.
    """
    from pages.models import slugify
    from links import extract_internal_links

    page_model = orm['pages.Page']
    link_model = orm.Link

    for page in page_model.objects.all().iterator():
        region = page.region
        links = extract_internal_links(page.content)
        # Parenthesised so the line reads the same as a print statement and
        # as a print() call.
        print("..recording page links on %s" % smart_str(page.name))
        for pagename, count in links.iteritems():
            candidates = page_model.objects.filter(
                slug=slugify(pagename), region=region)
            destination = candidates[0] if candidates else None

            # Only de-duplicate by destination when the target page exists.
            # Filtering on destination=None matches *any* unresolved link
            # from this page, which used to make every unresolved link name
            # after the first one get skipped.
            if destination and link_model.objects.filter(
                    source=page, destination=destination).exists():
                continue

            same_name = link_model.objects.filter(
                source=page, destination_name__iexact=pagename)
            if same_name.exists():
                if destination:
                    # The link was recorded by name only; attach the page
                    # now that it can be resolved.
                    link = same_name[0]
                    link.destination = destination
                    link.save()
            else:
                link = link_model(
                    source=page,
                    region=region,
                    destination=destination,
                    destination_name=pagename,
                    count=count,
                )
                link.save()
def test_ignore_external_links(self):
    # Of the two anchors, only the relative one ("Parks") may be reported;
    # the absolute http:// URL must be left out.
    markup = """ <p>I love <a href="Parks">outside</a>.</p> <p>I love <a href="http://example.org/Night">test</a>.</p> """
    found = extract_internal_links(markup)
    self.assertTrue('Parks' in found)
    self.assertEqual(len(found.keys()), 1)
def test_link_unquoting(self):
    # A percent-encoded href and its plain-text twin must both be reported
    # under the decoded name, never under the encoded one.
    markup = """ <p>I love <a href="Cats%20and%20dogs">animals</a>.</p> <p>I love <a href="Cats and dogs">animals</a>.</p> """
    found = extract_internal_links(markup)
    self.assertTrue('Cats and dogs' in found)
    self.assertFalse('Cats%20and%20dogs' in found)
def test_ignore_anchors(self):
    # Fragment-only hrefs ("#gohere") and <a> tags without an href must not
    # be counted; only "Parks" is expected in the result.
    markup = """ <p>I love <a href="Parks">outside</a>.</p> <p>I love <a href="#gohere">test</a>.</p> <p>I love <a>test now</a>.</p> """
    found = extract_internal_links(markup)
    self.assertTrue('Parks' in found)
    self.assertEqual(len(found.keys()), 1)
def test_count_links(self):
    # Repeated links to the same target are tallied per target: three for
    # "Parks", two for the percent-encoded "Cats and dogs".
    markup = """ <p>I love <a href="Parks">awesome parks</a>.</p> <p>I hate <a href="Cats%20and%20dogs">animals</a>.</p> <p>I love <a href="Parks">awesome parks</a>.</p> <p>I love <a href="Parks">awesome parks</a>.</p> <p>I love <a href="Cats%20and%20dogs">awesome parks</a>.</p> """
    found = extract_internal_links(markup)
    self.assertTrue('Parks' in found)
    self.assertTrue('Cats and dogs' in found)
    self.assertEqual(found['Parks'], 3)
    self.assertEqual(found['Cats and dogs'], 2)
def record_page_links(page):
    """Synchronise the Link rows of ``page`` with the links in its content.

    ``extract_internal_links`` supplies a mapping of linked page name to
    the number of times it is linked.  An existing Link for a name (matched
    by slug within the page's region) only has its ``count`` refreshed; a
    missing one is created, with its destination set to the page of that
    slug in the same region if there is one.
    """
    region = page.region
    link_counts = extract_internal_links(page.content)

    for pagename, count in link_counts.iteritems():
        target_slug = slugify(pagename)
        own_links = Link.objects.filter(source=page, region=region)
        by_stored_slug = own_links.filter(destination_slug=target_slug)
        by_page_slug = own_links.filter(destination__slug=target_slug)
        known = by_stored_slug | by_page_slug

        if known:
            link = known[0]
            if link.count == count:
                # Link count for this name is unchanged, so skip the write.
                continue
            link.count = count
        else:
            pages = Page.objects.filter(slug=target_slug, region=region)
            if pages:
                destination = pages[0]
            else:
                destination = None

            if destination:
                # Exists for some reason already (probably a script that is
                # moving pages between regions), so leave it alone.
                already_linked = Link.objects.filter(
                    source=page, destination=destination).exists()
                if already_linked:
                    continue

            link = Link(
                source=page,
                region=region,
                destination=destination,
                destination_name=pagename,
                destination_slug=target_slug,
                count=count,
            )
        link.save()
def forwards(self, orm):
    """Record a Link row for each internal link found in existing pages.

    Every page is scanned with ``extract_internal_links``; for each linked
    name the target page is looked up by slug within the page's region.

    * A link to an already-linked, resolved destination is left alone.
    * A link already recorded under the same name (case-insensitive) gets
      its destination set when the target page is known.
    * Anything else is stored as a new link, with ``destination`` left
      empty if no page with that slug exists in the region.

    :param orm: model registry supplied by the migration runner; models
        are looked up as ``orm['pages.Page']`` and ``orm.Link``.
    """
    from pages.models import slugify
    from links import extract_internal_links

    pages = orm['pages.Page'].objects
    stored_links = orm.Link.objects

    for page in pages.all().iterator():
        region = page.region
        links = extract_internal_links(page.content)
        # Parenthesised form prints identically as a statement or a call.
        print("..recording page links on %s" % smart_str(page.name))

        for pagename, count in links.iteritems():
            found = pages.filter(slug=slugify(pagename), region=region)
            if found:
                destination = found[0]
            else:
                destination = None

            # The destination check is meaningful only for a resolved page:
            # with destination=None the filter matches every unresolved
            # link from this page, so all unresolved names but the first
            # would be dropped.
            if destination and stored_links.filter(
                    source=page, destination=destination).exists():
                continue

            named = stored_links.filter(
                source=page, destination_name__iexact=pagename)
            if not named.exists():
                link = orm.Link(
                    source=page,
                    region=region,
                    destination=destination,
                    destination_name=pagename,
                    count=count,
                )
                link.save()
            elif destination:
                # Recorded earlier by name only; attach the resolved page.
                link = named[0]
                link.destination = destination
                link.save()
def test_ignore_plugins(self):
    # An anchor carrying the "plugin includepage" classes must not be
    # reported, so the result is expected to be empty.
    markup = """<a class="plugin includepage" href="seed">Include page seed</a></p>"""
    found = extract_internal_links(markup)
    self.assertEqual(len(found.keys()), 0)