def test_find_urls_in_java(self):
    test_file = self.get_test_loc('finder/url/IMarkerActionFilter.java')
    expected = [
        u'http://www.eclipse.org/legal/epl-v10.html',
    ]
    result = list(finder.find_urls(test_file))
    assert expected == result

def test_find_urls_in_pom(self):
    lines = [
        'https://svn.codehaus.org/plexus/tags</tagBase>',
        'http://svn.codehaus.org/plexus-interactivity-api</developerConnection>',
        'https://svn.codehaus.org//plexus-container-default</developerConnection>',
        'https://svn.codehaus.org/plexus/trunk/plexus-utils</developerConnection>',
        'https://svn.codehaus.org/plexus/trunk</developerConnection>',
        'https://svn.codehaus.org/plexus/trunk</developerConnection>',
        'https://svn.codehaus.org/plexus/trunk</developerConnection>',
        'https://svn.codehaus.org/qdox/tags/qdox-1.9</connection>',
        'https://svn.codehaus.org/qdox/tags/qdox-1.9</developerConnection>',
        'https://svn.codehaus.org/qdox/tags</tagBase>',
        'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2</connection>',
        'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2</developerConnection>',
        'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2</url>',
        'https://svn.codehaus.org/xstream/tags</tagBase>',
        'https://svn.sourceforge.net/svnroot/jtidy/trunk/jtidy/</connection>',
        'https://svn.sourceforge.net/svn/jtidy/trunk/jtidy/</developerConnection>',
    ]
    expected = [
        u'https://svn.codehaus.org/plexus/tags',
        u'http://svn.codehaus.org/plexus-interactivity-api',
        u'https://svn.codehaus.org/plexus-container-default',
        u'https://svn.codehaus.org/plexus/trunk/plexus-utils',
        u'https://svn.codehaus.org/plexus/trunk',
        u'https://svn.codehaus.org/qdox/tags/qdox-1.9',
        u'https://svn.codehaus.org/qdox/tags',
        u'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2',
        u'https://svn.codehaus.org/xstream/tags',
        u'https://svn.sourceforge.net/svnroot/jtidy/trunk/jtidy/',
        u'https://svn.sourceforge.net/svn/jtidy/trunk/jtidy/',
    ]
    result = list(finder.find_urls(lines))
    assert expected == result

def test_misc_invalid_urls(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://
        http://.
        http://..
        http://../
        http://?
        http://??
        http://??/
        http://#
        http://##
        http://##/
        //
        //a
        ///a
        ///
        http:///a
        foo.com
        rdar://1234
        h://test
        http:// shouldfail.com
        :// should fail
        http://-a.b.co
        http://0.0.0.0
        http://10.1.1.0
        http://10.1.1.255
        http://224.1.1.1
        http://3628126748
        http://10.1.1.1
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u and u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert not result, test

def test_find_urls(self):
    lines = [
        r"http://alaphalinu.org').",
        r'http://alaphalinu.org/bridge.',
        r'http://alaphalinu.org.',
        r"http://alaphalinu.org''.",
        r"http://alaphalinu.org',",
        r'http://alaphalinu.org>',
        r'http://alaphalinu.org>;',
        r'http://alaphalinu.org>.',
        r'http://alaphalinu.org/.',
        r'http://alaphalinu.org/>',
        r'http://alaphalinu.org/)',
        r'http://alaphalinu.org/>)',
        r'http://alaphalinu.org/">)',
        r'http://alaphalinu.org/>),',
        r'http://alaphalinu.org/tst.htm]',
        r'http://alaphalinu.org/tst.html.',
        r'http://alaphalinu.org/isc\\fR.',
        r'http://alaphalinu.org/isc.txt,',
        r'http://alaphalinu.org/isc.html\\n',
        r'http://alaphalinu.org/somedir/\\n',
        r'http://kernelnewbies.org/</ulink>).',
        r'http://kernelnewbies.org/(ulink).',
        r'http://alaphalinu.org/isc\fR.',
        r'http://alaphalinu.org/isc.html\n',
        r'http://alaphalinu.org/somedir/\n',
    ]
    expected = [
        u'http://alaphalinu.org/',
        u'http://alaphalinu.org/bridge',
        u'http://alaphalinu.org/tst.htm',
        u'http://alaphalinu.org/tst.html',
        u'http://alaphalinu.org/isc',
        u'http://alaphalinu.org/isc.txt',
        u'http://alaphalinu.org/isc.html',
        u'http://alaphalinu.org/somedir',
        u'http://kernelnewbies.org/',
        u'http://alaphalinu.org/somedir/',
    ]
    result = list(finder.find_urls(lines))
    assert expected == result

def find_urls_tester(lines_or_location, with_lineno=False, unique=True):
    """
    Test helper: return a list of URLs found in `lines_or_location`, as
    (url, line_number) tuples if `with_lineno` is True, or as plain URL
    strings otherwise.
    """
    result = list(finder.find_urls(lines_or_location, unique))
    if not with_lineno:
        result = [val for val, _ln in result]
    return result

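# Illustrative usage sketch (not part of the original suite) for the
# find_urls_tester helper above. The input line and expected values are
# assumptions based on behavior exercised elsewhere in this module:
# nexb.com URLs are reported (see the nodupe.htm test) and find_urls
# yields (url, line_number) pairs; 1-based line numbering is assumed.
def test_find_urls_tester_usage_sketch(self):
    lines = ['See http://nexb.com/ for details']
    # without line numbers: plain URL strings
    assert [u'http://nexb.com/'] == find_urls_tester(lines)
    # with line numbers: (url, line_number) tuples
    assert [(u'http://nexb.com/', 1)] == find_urls_tester(lines, with_lineno=True)
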
def test_misc_invalid_urls_that_should_not_be_detected_2(self):
    # At least per this set of non URLs from https://mathiasbynens.be/demo/url-regex
    # Note: the finder still detects these; this test documents the current behavior.
    urls = u'''
        ftps://foo.bar/
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u and u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert result, test

def test_misc_invalid_urls_that_are_still_detected_and_normalized(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://www.foo.bar./
    '''
    for test in urls.split():
        result = [val for val, _ln in finder.find_urls([test])]
        assert result == [test]

def test_misc_invalid_urls_that_crash(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://.www.foo.bar/
        http://.www.foo.bar./
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert [test] == result or [test + u'/'] == result

def test_invalid_urls_are_not_detected(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://1.1.1.1.1
        http://-error-.invalid/
    '''
    for test in urls.split():
        result = [val for val, _ln in finder.find_urls([test])]
        assert result == []

def test_find_urls_does_not_return_duplicate_urls(self):
    testfile = self.get_test_loc('finder/url/nodupe.htm')
    expected = [
        u'http://nexb.com/',
        u'http://trac.edgewall.org/',
        u'http://www.edgewall.org/',
    ]
    result = list(finder.find_urls(testfile))
    assert expected == result

def test_misc_invalid_urls_that_are_still_detected_and_may_not_be_really_invalid_py2(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://www.foo.bar./
        ftps://foo.bar/
    '''
    for test in urls.split():
        result = [val for val, _ln in finder.find_urls([test])]
        assert result in ([test], [test + u'/'])

def test_find_urls_without_scheme_in_python(self):
    testfile = self.get_test_loc('finder/url/no-scheme.py')
    expected = [
        u'http://rvl4.ecn.purdue.edu/~kak/dist/BitVector-1.5.1.html',
        u'http://docs.python.org/dist/dist.html',
        u'http://www.programming-with-objects.com/',
    ]
    result = list(finder.find_urls(testfile))
    assert expected == result

def test_misc_invalid_urls_that_should_not_crash(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://.www.foo.bar/
        http://.www.foo.bar./
    '''
    for test in urls.split():
        result = [val for val, _ln in finder.find_urls([test])]
        assert [] == result

def test_find_urls_detects_urls_correctly_in_html(self):
    test_file = self.get_test_loc('finder/url/some_html.htm')
    expected = [
        u'https://somesite.com/trac/instance/search',
        u'https://somesite.com/trac/instance/ticket/815',
        u'https://somesite.com/trac/instance/ticket/1679',
        u'https://somesite.com/trac/instance/wiki/TracGuide',
        u'https://somesite.com/trac/instance/ticket/816?format=csv',
        u'https://somesite.com/trac/instance/ticket/816?format=tab',
        u'https://somesite.com/trac/instance/ticket/816?format=rss',
        u'https://somesite.com/trac/instance/ticket/817',
        u'https://somesite.com/trac/instance/wiki',
        u'https://somesite.com/trac/instance/ticket/1',
        u'https://somesite.com/trac/instance/chrome/common/favicon.ico',
        u'https://somesite.com/trac/instance/search/opensearch',
        u'http://company.com/',
        u'https://somesite.com/trac/instance/logout',
        u'https://somesite.com/trac/instance/about',
        u'https://somesite.com/trac/instance/prefs',
        u'https://somesite.com/trac/instance/timeline',
        u'https://somesite.com/trac/instance/roadmap',
        u'https://somesite.com/trac/instance/browser',
        u'https://somesite.com/trac/instance/report',
        u'https://somesite.com/trac/instance/newticket',
        u'https://somesite.com/trac/instance/admin',
        u'https://somesite.com/trac/instance/importer',
        u'https://somesite.com/trac/instance/build',
        u'https://somesite.com/trac/instance/timeline?from=2009-04-28T18:46:08Z-0700&precision=second',
        u'https://somesite.com/trac/instance/timeline?from=2009-09-21T16:26:19Z-0700&precision=second',
        u'http://alaphalinu.org/',
        u'http://alaphalinu.org/bridge',
        u'http://alaphalinu.org/tst.htm',
        u'http://alaphalinu.org/tst.html',
        u'http://alaphalinu.org/isc',
        u'http://alaphalinu.org/isc.txt',
        u'http://alaphalinu.org/isc.html',
        u'http://alaphalinu.org/somedir/',
        u'http://kernelnewbies.org/',
        u'https://somesite.com/trac/instance/timeline?from=2009-04-28T18:46:50Z-0700&precision=second',
        u'https://somesite.com/trac/instance/timeline?from=2009-08-02T23:48:27Z-0700&precision=second',
        u'https://somesite.com/trac/instance/timeline?from=2009-08-03T21:13:02Z-0700&precision=second',
        u'http://alaphalinu.org/somedir',
        u'https://somesite.com/trac/instance/timeline?from=2009-09-15T14:58:31Z-0700&precision=second',
        u'https://somesite.com/trac/instance/timeline?from=2009-09-15T14:58:44Z-0700&precision=second',
        u'https://somesite.com/trac/instance/timeline?from=2009-09-18T15:22:56Z-0700&precision=second',
        u'https://somesite.com/trac/instance/changeset/3115',
        u'https://somesite.com/trac/instance/timeline?from=2009-09-21T16:26:08Z-0700&precision=second',
        u'https://somesite.com/trac/instance/changeset/3119',
        u'https://somesite.com/trac/instance/wiki/WikiFormatting',
        u'http://www.somesite.com/',
        u'https://somesite.com/trac/instance/wiki/TracTickets',
        u'http://trac.edgewall.org/',
        u'http://www.edgewall.org/',
    ]
    result = list(finder.find_urls(test_file))
    assert expected == result

def test_misc_invalid_urls_that_are_still_detected_and_normalized(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://www.foo.bar./
        http://1.1.1.1.1
        http://-error-.invalid/
    '''
    for test in urls.split():
        result = [val.replace('.', '') for val, _ln in finder.find_urls([test])]
        assert result in ([test.replace('.', '')], [test.replace('.', '') + u'/'])

def test_misc_invalid_urls_that_should_not_be_detected(self):
    # At least per this set of non URLs from https://mathiasbynens.be/demo/url-regex
    # Note: the finder still detects these; this test documents the current behavior.
    urls = u'''
        http://foo.bar?q=Spaces should be encoded
        http://foo.bar/foo(bar)baz quux
        http://a.b--c.de/
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u and u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert result, test

def test_find_urls_in_python(self):
    test_file = self.get_test_loc('finder/url/BeautifulSoup.py')
    expected = [
        u'http://www.crummy.com/software/BeautifulSoup/',
        u'http://chardet.feedparser.org/',
        u'http://cjkpython.i18n.org/',
        u'http://www.crummy.com/software/BeautifulSoup/documentation.html',
    ]
    result = list(finder.find_urls(test_file))
    assert expected == result

def test_find_urls_with_fragments(self):
    test_file = self.get_test_loc('finder/url/ABOUT')
    expected = [
        u'http://pygments.org/',
        u'http://pypi.python.org/packages/2.5/P/Pygments/Pygments-0.11.1-py2.5.egg#md5=fde2a28ca83e5fca16f5ee72a67af719',
        u'http://pypi.python.org/packages/source/P/Pygments/Pygments-0.11.1.tar.gz#md5=a7dc555f316437ba5241855ac306209a',
        u'http://pypi.python.org/packages/2.4/P/Pygments/Pygments-0.11.1-py2.4.egg#md5=52d7a46a91a4a426f8fbc681c5c6f1f5',
    ]
    result = list(finder.find_urls(test_file))
    assert expected == result

def test_find_urls_in_file_in_markup_in_code(self):
    testfile = self.get_test_loc('finder/url/markup_in_code.c')
    expected = [
        u'http://xml.libexpat.org/dummy.ent',
        u'http://xml.libexpat.org/e',
        u'http://xml.libexpat.org/n',
        u'http://expat.sf.net/',
        u'http://xml.libexpat.org/',
        u'http://xml.libexpat.org/doc.dtd',
        u'http://xml.libexpat.org/entity.ent',
        u'http://xml.libexpat.org/ns1',
    ]
    result = list(finder.find_urls(testfile))
    assert expected == result

def test_misc_valid_urls_that_should_pass(self):
    # set of good URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://foo.com/blah_blah_(wikipedia)
        http://foo.com/blah_blah_(wikipedia)_(again)
        http://foo.com/blah_(wikipedia)#cite-1
        http://foo.com/blah_(wikipedia)_blah#cite-1
        http://foo.com/(something)?after=parens
    '''
    for test in urls.split():
        result = [val for val, _ln in finder.find_urls([test])]
        assert [test] == result

def test_misc_invalid_urls_that_are_still_detected_and_may_not_be_really_invalid(self):
    # set of non URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        ftps://foo.bar/
        http://a.b--c.de/
        http://a.b-.co
        http://123.123.123
        http://www.foo.bar./
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert [test] == result or [test + u'/'] == result

def test_misc_valid_urls_with_trailing_slash(self):
    # set of good URLs from https://mathiasbynens.be/demo/url-regex
    # for these, we detect but report a canonical form with a trailing slash
    urls = u'''
        http://a.b-c.de
        http://j.mp
        http://1337.net
        http://223.255.255.254
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert [test + u'/'] == result

def get_urls(location):
    """
    Yield mappings of URLs detected in the file at `location`.
    """
    from collections import OrderedDict
    from cluecode.finder import find_urls
    for url, line_num in find_urls(location):
        if not url:
            continue
        misc = OrderedDict()
        misc['url'] = url
        misc['start_line'] = line_num
        misc['end_line'] = line_num
        yield misc

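# Illustrative usage sketch for get_urls() above; the location below is a
# hypothetical file path. Each yielded mapping carries the detected URL
# and the line numbers where it starts and ends:
#
#   for entry in get_urls('finder/url/nodupe.htm'):
#       print(entry['url'], entry['start_line'], entry['end_line'])
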
def test_find_urls_without_scheme_in_lines(self):
    lines = [
        "'http://RVL4.ecn.purdue.edu/~kak/dist/BitVector-1.5.1.html'",
        'http://docs.python.org/dist/dist.html',
        'www.programming-with-objects.com',
    ]
    expected = [
        u'http://rvl4.ecn.purdue.edu/~kak/dist/BitVector-1.5.1.html',
        u'http://docs.python.org/dist/dist.html',
        u'http://www.programming-with-objects.com/',
    ]
    result = list(finder.find_urls(lines))
    assert expected == result

def test_find_urls_does_not_return_local_ip(self):
    lines = [
        'http://localhost',
        'http://localhost/',
        'http://localhost:8080',
        'http://localhost/dir/page.html',
        'http://127.0.0.1',
        'http://127.0.0.1:4029',
        'http://127.0.0.1/dir',
        'http://127.0.0.1/',
        'http://127.0.0.1/page.htm',
        'http://192.168.0.1',
        'http://10.0.0.1',
        'http://10.255.255.124',
        'http://169.254.0.0',
        'http://172.16.0.0',
        'http://172.31.255.255',
        'http://172.32.120.155',
    ]
    expected = [u'http://172.32.120.155/']
    result = list(finder.find_urls(lines))
    assert expected == result

def test_misc_valid_urls(self):
    # set of good URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://foo.com/blah_blah
        http://foo.com/blah_blah/
        http://142.42.1.1/
        http://142.42.1.1:8080/
        http://code.google.com/events/#&product=browser
        ftp://foo.bar/baz
        http://foo.bar/?q=Test%20URL-encoded%20stuff
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert [test] == result

def test_find_urls2(self):
    lines = [
        'http://localhost',
        'http://localhost/',
        'http://localhost:8080',
        'http://localhost/dir/page.html',
        'http://127.0.0.1',
        'http://127.0.0.1:4029',
        'http://127.0.0.1/dir',
        'http://127.0.0.1/',
        'http://127.0.0.1/page.htm',
    ]
    expected = []
    result = list(finder.find_urls(lines))
    assert expected == result

def test_misc_valid_unicode_or_punycode_urls_that_should_pass(self):
    # set of good URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        http://foo.com/unicode_(✪)_in_parens
        http://✪df.ws/123
        http://➡.ws/䨹
        http://⌘.ws
        http://⌘.ws/
        http://☺.damowmow.com/
        http://مثال.إختبار
        http://例子.测试
        http://उदाहरण.परीक्षा
    '''
    for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
        result = [val for val, _ln in finder.find_urls([test])]
        assert [test] == result

def test_example_dot_com_valid_urls_return_nothing(self):
    # set of good URLs from https://mathiasbynens.be/demo/url-regex
    urls = u'''
        https://www.example.com/foo/?bar=baz&inga=42&quux
        http://www.example.com/wpstyle/?p=364
        http://userid:password@example.com:8080
        http://userid:password@example.com:8080/
        http://userid@example.com
        http://userid@example.com/
        http://userid@example.com:8080
        http://userid@example.com:8080/
        http://userid:password@example.com
        http://userid:password@example.com/
        http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com
    '''
    for test in urls.split():
        result = [val for val, _ln in finder.find_urls([test])]
        assert [] == result

def update_ignorables(licensish, verbose=False, dump=True):
    """
    Collect, update and save the ignorable_* attributes of a `licensish`
    Rule or License object.
    """
    from os.path import exists

    location = licensish.text_file

    if verbose:
        print('Processing:', 'file://' + location)

    if not exists(location):
        return licensish

    # collect and set ignorable copyrights, holders and authors
    from cluecode.copyrights import detect_copyrights

    copyrights = set()
    holders = set()
    authors = set()

    for dtype, value, _start, _end in detect_copyrights(location):
        if dtype == 'copyrights':
            copyrights.add(value)
        elif dtype == 'holders':
            holders.add(value)
        elif dtype == 'authors':
            authors.add(value)

    licensish.ignorable_copyrights = sorted(copyrights)
    licensish.ignorable_holders = sorted(holders)
    licensish.ignorable_authors = sorted(authors)

    # collect and set ignorable emails and urls
    from cluecode.finder import find_urls
    from cluecode.finder import find_emails

    urls = set(u for (u, _ln) in find_urls(location) if u)
    licensish.ignorable_urls = sorted(urls)

    emails = set(u for (u, _ln) in find_emails(location) if u)
    licensish.ignorable_emails = sorted(emails)

    if dump:
        licensish.dump()
    return licensish

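# Illustrative usage sketch for update_ignorables() above; assumes a
# `licensish` object (such as a licensedcode Rule or License) with a
# `text_file` attribute and a `dump()` method, per the docstring.
# `load_rule` is a hypothetical accessor, not a real API:
#
#   rule = load_rule('some-rule.yml')
#   rule = update_ignorables(rule, verbose=True, dump=False)
#   print(rule.ignorable_urls, rule.ignorable_emails)
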
def test_find_urls_filters_bogus_url(self):
    url_text = [u'http://__________________']
    expected = []
    result = list(finder.find_urls(url_text))
    assert expected == result

def test_find_urls_filters_invalid_urls(self):
    testfile = self.get_test_loc('finder/url/truncated_url')
    result = list(finder.find_urls(testfile))
    expected = []
    assert expected == result

def test_find_urls_no_junk_urls(self):
    testfile = self.get_test_loc('finder/url/junk_urls.c')
    expected = []
    result = list(finder.find_urls(testfile))
    assert expected == result