 def test_find_urls_in_java(self):
     test_file = self.get_test_loc('finder/url/IMarkerActionFilter.java')
     expected = [
         u'http://www.eclipse.org/legal/epl-v10.html',
     ]
     result = list(finder.find_urls(test_file))
     assert expected == result
 def test_find_urls_in_pom(self):
     lines = [
         'https://svn.codehaus.org/plexus/tags</tagBase>',
         'http://svn.codehaus.org/plexus-interactivity-api</developerConnection>',
         'https://svn.codehaus.org//plexus-container-default</developerConnection>',
         'https://svn.codehaus.org/plexus/trunk/plexus-utils</developerConnection>',
         'https://svn.codehaus.org/plexus/trunk</developerConnection>',
         'https://svn.codehaus.org/plexus/trunk</developerConnection>',
         'https://svn.codehaus.org/plexus/trunk</developerConnection>',
         'https://svn.codehaus.org/qdox/tags/qdox-1.9</connection>',
         'https://svn.codehaus.org/qdox/tags/qdox-1.9</developerConnection>',
         'https://svn.codehaus.org/qdox/tags</tagBase>',
         'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2</connection>',
         'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2</developerConnection>',
         'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2</url>',
         'https://svn.codehaus.org/xstream/tags</tagBase>',
         'https://svn.sourceforge.net/svnroot/jtidy/trunk/jtidy/</connection>',
         'https://svn.sourceforge.net/svn/jtidy/trunk/jtidy/</developerConnection>'
     ]
     expected = [
         u'https://svn.codehaus.org/plexus/tags',
         u'http://svn.codehaus.org/plexus-interactivity-api',
         u'https://svn.codehaus.org/plexus-container-default',
         u'https://svn.codehaus.org/plexus/trunk/plexus-utils',
         u'https://svn.codehaus.org/plexus/trunk',
         u'https://svn.codehaus.org/qdox/tags/qdox-1.9',
         u'https://svn.codehaus.org/qdox/tags',
         u'https://svn.codehaus.org/xstream/tags/XSTREAM_1_2_2',
         u'https://svn.codehaus.org/xstream/tags',
         u'https://svn.sourceforge.net/svnroot/jtidy/trunk/jtidy/',
         u'https://svn.sourceforge.net/svn/jtidy/trunk/jtidy/',
     ]
     result = list(finder.find_urls(lines))
     assert expected == result
 def test_misc_invalid_urls(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://
         http://.
         http://..
         http://../
         http://?
         http://??
         http://??/
         http://#
         http://##
         http://##/
         //
         //a
         ///a
         ///
         http:///a
         foo.com
         rdar://1234
         h://test
         http:// shouldfail.com
         :// should fail
         http://-a.b.co
         http://0.0.0.0
         http://10.1.1.0
         http://10.1.1.255
         http://224.1.1.1
         http://3628126748
         http://10.1.1.1
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u and u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert not result, test
 def test_find_urls(self):
     lines = [
         r"http://alaphalinu.org').", r'http://alaphalinu.org/bridge.',
         r'http://alaphalinu.org.', r"http://alaphalinu.org''.",
         r"http://alaphalinu.org',", r'http://alaphalinu.org>',
         r'http://alaphalinu.org>;', r'http://alaphalinu.org>.',
         r'http://alaphalinu.org/.', r'http://alaphalinu.org/>',
         r'http://alaphalinu.org/)', r'http://alaphalinu.org/>)',
         r'http://alaphalinu.org/">)', r'http://alaphalinu.org/>),',
         r'http://alaphalinu.org/tst.htm]',
         r'http://alaphalinu.org/tst.html.',
         r'http://alaphalinu.org/isc\\fR.',
         r'http://alaphalinu.org/isc.txt,',
         r'http://alaphalinu.org/isc.html\\n',
         r'http://alaphalinu.org/somedir/\\n',
         r'http://kernelnewbies.org/</ulink>).',
         r'http://kernelnewbies.org/(ulink).',
         r'http://alaphalinu.org/isc\fR.',
         r'http://alaphalinu.org/isc.html\n',
         r'http://alaphalinu.org/somedir/\n'
     ]
     expected = [
         u'http://alaphalinu.org/',
         u'http://alaphalinu.org/bridge',
         u'http://alaphalinu.org/tst.htm',
         u'http://alaphalinu.org/tst.html',
         u'http://alaphalinu.org/isc',
         u'http://alaphalinu.org/isc.txt',
         u'http://alaphalinu.org/isc.html',
         u'http://alaphalinu.org/somedir',
         u'http://kernelnewbies.org/',
         u'http://alaphalinu.org/somedir/',
     ]
     result = list(finder.find_urls(lines))
     assert expected == result
def find_urls_tester(lines_or_location, with_lineno=False, unique=True):
    """
    Helper function for testing URLs with or without line numbers.
    """
    result = list(finder.find_urls(lines_or_location, unique))
    if not with_lineno:
        result = [val for val, _ln in result]
    return result
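# A minimal usage sketch for the helper above. The sample line, the canonical
# URL and the 1-based line number are illustrative assumptions, not fixtures
# from this suite.
def test_find_urls_tester_usage_sketch():
    lines = ['See http://nexb.com/ for details.']
    # by default only the URL values are returned
    assert find_urls_tester(lines) == [u'http://nexb.com/']
    # with_lineno=True keeps the (url, line_number) tuples that
    # finder.find_urls yields; line numbers are assumed 1-based here
    assert find_urls_tester(lines, with_lineno=True) == [(u'http://nexb.com/', 1)]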
 def test_misc_invalid_urls_that_should_not_be_detected_2(self):
     # At least per this set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         ftps://foo.bar/
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u and u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert result, test
 def test_misc_invalid_urls_that_are_still_detected_and_normalized(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://www.foo.bar./
     '''
     for test in urls.split():
         result = [val for val, _ln in finder.find_urls([test])]
         assert result == [test]
 def test_misc_invalid_urls_that_crash(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://.www.foo.bar/
         http://.www.foo.bar./
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert ([test] == result or [test + u'/'] == result)
 def test_invalid_urls_are_not_detected(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://1.1.1.1.1
         http://-error-.invalid/
     '''
     for test in urls.split():
         result = [val for val, _ln in finder.find_urls([test])]
         assert result == []
 def test_find_urls_does_not_return_duplicate_urls(self):
     testfile = self.get_test_loc('finder/url/nodupe.htm')
     expected = [
         u'http://nexb.com/',
         u'http://trac.edgewall.org/',
         u'http://www.edgewall.org/',
     ]
     result = list(finder.find_urls(testfile))
     assert expected == result
 def test_misc_invalid_urls_that_are_still_detected_and_may_not_be_really_invalid_py2(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://www.foo.bar./
         ftps://foo.bar/
     '''
     for test in urls.split():
         result = [val for val, _ln in finder.find_urls([test])]
     assert result in ([test], [test + u'/'])
 def test_find_urls_without_scheme_in_python(self):
     testfile = self.get_test_loc('finder/url/no-scheme.py')
     expected = [
         u'http://rvl4.ecn.purdue.edu/~kak/dist/BitVector-1.5.1.html',
         u'http://docs.python.org/dist/dist.html',
         u'http://www.programming-with-objects.com/',
     ]
     result = list(finder.find_urls(testfile))
     assert expected == result
 def test_misc_invalid_urls_that_should_not_crash(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://.www.foo.bar/
         http://.www.foo.bar./
     '''
     for test in urls.split():
         result = [val for val, _ln in finder.find_urls([test])]
         assert [] == result
 def test_find_urls_detects_urls_correctly_in_html(self):
     test_file = self.get_test_loc('finder/url/some_html.htm')
     expected = [
         u'https://somesite.com/trac/instance/search',
         u'https://somesite.com/trac/instance/ticket/815',
         u'https://somesite.com/trac/instance/ticket/1679',
         u'https://somesite.com/trac/instance/wiki/TracGuide',
         u'https://somesite.com/trac/instance/ticket/816?format=csv',
         u'https://somesite.com/trac/instance/ticket/816?format=tab',
         u'https://somesite.com/trac/instance/ticket/816?format=rss',
         u'https://somesite.com/trac/instance/ticket/817',
         u'https://somesite.com/trac/instance/wiki',
         u'https://somesite.com/trac/instance/ticket/1',
         u'https://somesite.com/trac/instance/chrome/common/favicon.ico',
         u'https://somesite.com/trac/instance/search/opensearch',
         u'http://company.com/',
         u'https://somesite.com/trac/instance/logout',
         u'https://somesite.com/trac/instance/about',
         u'https://somesite.com/trac/instance/prefs',
         u'https://somesite.com/trac/instance/timeline',
         u'https://somesite.com/trac/instance/roadmap',
         u'https://somesite.com/trac/instance/browser',
         u'https://somesite.com/trac/instance/report',
         u'https://somesite.com/trac/instance/newticket',
         u'https://somesite.com/trac/instance/admin',
         u'https://somesite.com/trac/instance/importer',
         u'https://somesite.com/trac/instance/build',
         u'https://somesite.com/trac/instance/timeline?from=2009-04-28T18:46:08Z-0700&precision=second',
         u'https://somesite.com/trac/instance/timeline?from=2009-09-21T16:26:19Z-0700&precision=second',
         u'http://alaphalinu.org/',
         u'http://alaphalinu.org/bridge',
         u'http://alaphalinu.org/tst.htm',
         u'http://alaphalinu.org/tst.html',
         u'http://alaphalinu.org/isc',
         u'http://alaphalinu.org/isc.txt',
         u'http://alaphalinu.org/isc.html',
         u'http://alaphalinu.org/somedir/',
         u'http://kernelnewbies.org/',
         u'https://somesite.com/trac/instance/timeline?from=2009-04-28T18:46:50Z-0700&precision=second',
         u'https://somesite.com/trac/instance/timeline?from=2009-08-02T23:48:27Z-0700&precision=second',
         u'https://somesite.com/trac/instance/timeline?from=2009-08-03T21:13:02Z-0700&precision=second',
         u'http://alaphalinu.org/somedir',
         u'https://somesite.com/trac/instance/timeline?from=2009-09-15T14:58:31Z-0700&precision=second',
         u'https://somesite.com/trac/instance/timeline?from=2009-09-15T14:58:44Z-0700&precision=second',
         u'https://somesite.com/trac/instance/timeline?from=2009-09-18T15:22:56Z-0700&precision=second',
         u'https://somesite.com/trac/instance/changeset/3115',
         u'https://somesite.com/trac/instance/timeline?from=2009-09-21T16:26:08Z-0700&precision=second',
         u'https://somesite.com/trac/instance/changeset/3119',
         u'https://somesite.com/trac/instance/wiki/WikiFormatting',
         u'http://www.somesite.com/',
         u'https://somesite.com/trac/instance/wiki/TracTickets',
         u'http://trac.edgewall.org/',
         u'http://www.edgewall.org/'
     ]
     result = list(finder.find_urls(test_file))
     assert expected == result
 def test_misc_invalid_urls_that_are_still_detected_and_normalized(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://www.foo.bar./
         http://1.1.1.1.1
         http://-error-.invalid/
     '''
     for test in urls.split():
         result = [val.replace('.', '') for val, _ln in finder.find_urls([test])]
     assert result in ([test.replace('.', '')], [test.replace('.', '') + u'/'])
 def test_misc_invalid_urls_that_should_not_be_detected(self):
     # At least per this set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://foo.bar?q=Spaces should be encoded
         http://foo.bar/foo(bar)baz quux
         http://a.b--c.de/
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u and u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert result, test
 def test_find_urls_in_python(self):
     test_file = self.get_test_loc('finder/url/BeautifulSoup.py')
     expected = [
         u'http://www.crummy.com/software/BeautifulSoup/',
         u'http://chardet.feedparser.org/',
         u'http://cjkpython.i18n.org/',
         u'http://www.crummy.com/software/BeautifulSoup/documentation.html',
     ]
     result = list(finder.find_urls(test_file))
     assert expected == result
 def test_find_urls_with_fragments(self):
     test_file = self.get_test_loc('finder/url/ABOUT')
     expected = [
         u'http://pygments.org/',
         u'http://pypi.python.org/packages/2.5/P/Pygments/Pygments-0.11.1-py2.5.egg#md5=fde2a28ca83e5fca16f5ee72a67af719',
         u'http://pypi.python.org/packages/source/P/Pygments/Pygments-0.11.1.tar.gz#md5=a7dc555f316437ba5241855ac306209a',
         u'http://pypi.python.org/packages/2.4/P/Pygments/Pygments-0.11.1-py2.4.egg#md5=52d7a46a91a4a426f8fbc681c5c6f1f5',
     ]
     result = list(finder.find_urls(test_file))
     assert expected == result
 def test_find_urls_in_file_in_markup_in_code(self):
     testfile = self.get_test_loc('finder/url/markup_in_code.c')
     expected = [
         u'http://xml.libexpat.org/dummy.ent',
         u'http://xml.libexpat.org/e',
         u'http://xml.libexpat.org/n',
         u'http://expat.sf.net/',
         u'http://xml.libexpat.org/',
         u'http://xml.libexpat.org/doc.dtd',
         u'http://xml.libexpat.org/entity.ent',
         u'http://xml.libexpat.org/ns1'
     ]
     result = list(finder.find_urls(testfile))
     assert expected == result
 def test_misc_valid_urls_that_should_pass(self):
     # At least per this set of good URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://foo.com/blah_blah_(wikipedia)
         http://foo.com/blah_blah_(wikipedia)_(again)
         http://foo.com/blah_(wikipedia)#cite-1
         http://foo.com/blah_(wikipedia)_blah#cite-1
         http://foo.com/(something)?after=parens
     '''
     for test in urls.split():
         result = [val for val, _ln in finder.find_urls([test])]
         assert [test] == result
 def test_misc_invalid_urls_that_are_still_detected_and_may_not_be_really_invalid(self):
     # set of non URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         ftps://foo.bar/
         http://a.b--c.de/
         http://a.b-.co
         http://123.123.123
         http://www.foo.bar./
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert ([test] == result or [test + u'/'] == result)
 def test_misc_valid_urls_with_trailing_slash(self):
     # set of good URLs from https://mathiasbynens.be/demo/url-regex
     # for these, we detect but report a canonical form with a trailing slash
     urls = u'''
         http://a.b-c.de
         http://j.mp
         http://1337.net
         http://223.255.255.254
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert [test + u'/'] == result
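# The trailing-slash canonicalization the test above expects could be
# approximated with the standard library alone. This is a sketch of the idea,
# not the actual normalization code in cluecode.finder.
try:
    from urllib.parse import urlsplit, urlunsplit  # Python 3
except ImportError:
    from urlparse import urlsplit, urlunsplit  # Python 2

def canonical_trailing_slash(url):
    # give URLs with an empty path a '/' path; leave everything else alone
    parts = urlsplit(url)
    return urlunsplit(
        (parts.scheme, parts.netloc, parts.path or u'/', parts.query, parts.fragment))

assert canonical_trailing_slash(u'http://j.mp') == u'http://j.mp/'
assert canonical_trailing_slash(u'http://223.255.255.254') == u'http://223.255.255.254/'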
def get_urls(location):
    """
    Yield mappings of urls detected in the file at `location`.
    """
    from collections import OrderedDict
    from cluecode.finder import find_urls
    for urls, line_num in find_urls(location):
        if not urls:
            continue
        misc = OrderedDict()
        misc['url'] = urls
        misc['start_line'] = line_num
        misc['end_line'] = line_num
        yield misc
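# A small usage sketch for get_urls above. The location argument is a
# placeholder path; the printed mapping shape is read off the function body.
def print_urls(location):
    import json
    for url_info in get_urls(location):
        # e.g. {"url": "http://nexb.com/", "start_line": 3, "end_line": 3}
        print(json.dumps(url_info))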
 def test_find_urls_without_scheme_in_lines(self):
     lines = [
         "'http://RVL4.ecn.purdue.edu/~kak/dist/BitVector-1.5.1.html'",
         'http://docs.python.org/dist/dist.html',
         'www.programming-with-objects.com',
     ]
     expected = [
         u'http://rvl4.ecn.purdue.edu/~kak/dist/BitVector-1.5.1.html',
         u'http://docs.python.org/dist/dist.html',
         u'http://www.programming-with-objects.com/',
     ]
     result = list(finder.find_urls(lines))
     assert expected == result
 def test_find_urls_does_not_return_local_ip(self):
     lines = [
         'http://localhost',
         'http://localhost/',
         'http://localhost:8080',
         'http://localhost/dir/page.html',
         'http://127.0.0.1',
         'http://127.0.0.1:4029',
         'http://127.0.0.1/dir',
         'http://127.0.0.1/',
         'http://127.0.0.1/page.htm',
         'http://192.168.0.1',
         'http://10.0.0.1',
         'http://10.255.255.124',
         'http://169.254.0.0',
         'http://172.16.0.0',
         'http://172.31.255.255',
         'http://172.32.120.155'
     ]
     expected = [u'http://172.32.120.155/']
     result = list(finder.find_urls(lines))
     assert expected == result
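# The filtering exercised above (localhost, loopback, RFC 1918 private and
# link-local addresses are dropped while the public 172.32.120.155 survives)
# can be reproduced with the stdlib ipaddress module. A sketch of the idea
# only, not the finder's actual implementation.
import ipaddress

def is_local_or_private(host):
    if host == u'localhost':
        return True
    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        return False  # not an IP literal, e.g. a regular domain name
    # is_private covers 10/8, 172.16/12, 192.168/16, loopback and link-local
    return ip.is_private or ip.is_loopback or ip.is_link_local

assert is_local_or_private(u'127.0.0.1')
assert is_local_or_private(u'172.31.255.255')
assert not is_local_or_private(u'172.32.120.155')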
 def test_misc_valid_urls(self):
     # set of good URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://foo.com/blah_blah
         http://foo.com/blah_blah/
         http://142.42.1.1/
         http://142.42.1.1:8080/
         http://code.google.com/events/#&product=browser
         ftp://foo.bar/baz
         http://foo.bar/?q=Test%20URL-encoded%20stuff
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert [test] == result
 def test_find_urls2(self):
     lines = [
         'http://localhost',
         'http://localhost/',
         'http://localhost:8080',
         'http://localhost/dir/page.html',
         'http://127.0.0.1',
         'http://127.0.0.1:4029',
         'http://127.0.0.1/dir',
         'http://127.0.0.1/',
         'http://127.0.0.1/page.htm',
     ]
     expected = []
     result = list(finder.find_urls(lines))
     assert expected == result
 def test_misc_valid_unicode_or_punycode_urls_that_should_pass(self):
     # At least per this set of good URLs from https://mathiasbynens.be/demo/url-regex
     urls = u'''
         http://foo.com/unicode_(✪)_in_parens
         http://✪df.ws/123
         http://➡.ws/䨹
         http://⌘.ws
         http://⌘.ws/
         http://☺.damowmow.com/
         http://مثال.إختبار
         http://例子.测试
         http://उदाहरण.परीक्षा
     '''
     for test in (u.strip() for u in urls.splitlines(False) if u.strip()):
         result = [val for val, _ln in finder.find_urls([test])]
         assert [test] == result
 def test_example_dot_com_valid_urls_return_nothing(self):
     urls = u'''
         https://www.example.com/foo/?bar=baz&inga=42&quux
         http://www.example.com/wpstyle/?p=364
         http://userid:password@example.com:8080
         http://userid:password@example.com:8080/
         http://userid@example.com
         http://userid@example.com/
         http://userid@example.com:8080
         http://userid@example.com:8080/
         http://userid:password@example.com
         http://userid:password@example.com/
         http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com
     '''
     for test in urls.split():
         result = [val for val, _ln in finder.find_urls([test])]
         assert [] == result
def update_ignorables(licensish, verbose=False, dump=True):
    """
    Collect, update and save the ignorable_* attributes of a `licensish` Rule or
    License object.
    """
    from os.path import exists

    location = licensish.text_file

    if verbose:
        print('Processing:', 'file://' + location)

    if not exists(location):
        return licensish

    # collect and set ignorable copyrights, holders and authors
    from cluecode.copyrights import detect_copyrights
    copyrights = set()
    holders = set()
    authors = set()

    for dtype, value, _start, _end in detect_copyrights(location):
        if dtype == 'copyrights':
            copyrights.add(value)
        elif dtype == 'holders':
            holders.add(value)
        elif dtype == 'authors':
            authors.add(value)

    licensish.ignorable_copyrights = sorted(copyrights)
    licensish.ignorable_holders = sorted(holders)
    licensish.ignorable_authors = sorted(authors)

    # collect and set ignorable emails and urls
    from cluecode.finder import find_urls
    from cluecode.finder import find_emails

    urls = set(u for (u, _ln) in find_urls(location) if u)
    licensish.ignorable_urls = sorted(urls)

    emails = set(u for (u, _ln) in find_emails(location) if u)
    licensish.ignorable_emails = sorted(emails)
    if dump:
        licensish.dump()
    return licensish
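# A hypothetical invocation of update_ignorables. The stand-in class below
# only mimics the text_file/dump() surface of a real licensedcode Rule or
# License object; the path is a placeholder.
class FakeRule(object):
    ignorable_urls = []  # default so the early-return path still prints

    def __init__(self, text_file):
        self.text_file = text_file

    def dump(self):
        pass  # a real Rule or License would persist its data file here

rule = update_ignorables(FakeRule('some/rule.RULE'), verbose=True, dump=False)
print(rule.ignorable_urls)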
 def test_find_urls_filters_bogus_url(self):
     url_text = [u'http://__________________']
     expected = []
     result = list(finder.find_urls(url_text))
     assert expected == result
 def test_find_urls_filters_invalid_urls(self):
     testfile = self.get_test_loc('finder/url/truncated_url')
     result = list(finder.find_urls(testfile))
     expected = []
     assert expected == result
 def test_find_urls_no_junk_urls(self):
     testfile = self.get_test_loc('finder/url/junk_urls.c')
     expected = []
     result = list(finder.find_urls(testfile))
     assert expected == result