Example #1
0
def test_extract__idn():
    """Check ``extract`` against URLs with internationalized domain names."""
    # Each case pairs an input URL with the full tuple ``extract`` must yield.
    cases = [
        (u'http://пример.рф',
         ('http', '', '', '', u'пример', u'рф', '', '', '', '',
          u'http://пример.рф')),
        (u'http://إختبار.مصر/',
         ('http', '', '', '', u'إختبار', u'مصر', '', '/', '', '',
          u'http://إختبار.مصر/')),
    ]
    for url, expected in cases:
        assert extract(url) == expected
Example #2
0
def test_extract():
    """Check ``extract`` on URLs that carry an explicit scheme.

    Every expected value is an 11-element tuple whose last element is the
    input URL itself.  The credential and ``mailto`` cases use literal
    ``foo:bar@`` / ``foo@bar.com`` inputs so that they agree with the
    ``'foo'`` / ``'bar'`` components asserted in the expected tuples.
    """
    assert extract('http://example.com') == ('http', '', '', '', 'example',
                                             'com', '', '', '', '',
                                             'http://example.com')
    assert extract('http://example.com:8080') == ('http', '', '', '',
                                                  'example', 'com', '8080', '',
                                                  '', '',
                                                  'http://example.com:8080')
    assert extract('http://example.com:8080/abc?x=1&y=2#qwe') == (
        'http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe',
        'http://example.com:8080/abc?x=1&y=2#qwe')
    assert extract('http://example.ac.at') == ('http', '', '', '', 'example',
                                               'ac.at', '', '', '', '',
                                               'http://example.ac.at')
    assert extract('http://example.co.uk/') == ('http', '', '', '', 'example',
                                                'co.uk', '', '/', '', '',
                                                'http://example.co.uk/')
    assert extract('http://foo.bar.example.co.uk') == (
        'http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '',
        'http://foo.bar.example.co.uk')
    # Username "foo" and password "bar" must appear in the input URL.
    assert extract('http://foo:bar@www.example.com:1234/foo/?x=1#bla') == (
        'http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1',
        'bla', 'http://foo:bar@www.example.com:1234/foo/?x=1#bla')
    assert extract('http://example.com?x=y:z') == ('http', '', '', '',
                                                   'example', 'com', '', '',
                                                   'x=y:z', '',
                                                   'http://example.com?x=y:z')
    assert extract('http://example.com?x=y:z/') == (
        'http', '', '', '', 'example', 'com', '', '', 'x=y:z/', '',
        'http://example.com?x=y:z/')

    assert extract('mailto:foo@bar.com') == ('mailto', 'foo', '', '', 'bar',
                                             'com', '', '', '', '',
                                             'mailto:foo@bar.com')
def test_extract():
    """Check ``extract`` on URLs that carry an explicit scheme.

    Every expected value is an 11-element tuple whose last element is the
    input URL itself.  The credential and ``mailto`` cases use literal
    ``foo:bar@`` / ``foo@bar.com`` inputs so that they agree with the
    ``'foo'`` / ``'bar'`` components asserted in the expected tuples.
    """
    assert extract('http://example.com') == ('http', '', '', '', 'example', 'com', '', '', '', '', 'http://example.com')
    assert extract('http://example.com:8080') == ('http', '', '', '', 'example', 'com', '8080', '', '', '', 'http://example.com:8080')
    assert extract('http://example.com:8080/abc?x=1&y=2#qwe') == ('http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe', 'http://example.com:8080/abc?x=1&y=2#qwe')
    assert extract('http://example.ac.at') == ('http', '', '', '', 'example', 'ac.at', '', '', '', '', 'http://example.ac.at')
    assert extract('http://example.co.uk/') == ('http', '', '', '', 'example', 'co.uk', '', '/', '', '', 'http://example.co.uk/')
    assert extract('http://foo.bar.example.co.uk') == ('http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '', 'http://foo.bar.example.co.uk')
    # Username "foo" and password "bar" must appear in the input URL.
    assert extract('http://foo:bar@www.example.com:1234/foo/?x=1#bla') == ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla', 'http://foo:bar@www.example.com:1234/foo/?x=1#bla')
    assert extract('http://example.com?x=y:z') == ('http', '', '', '', 'example', 'com', '', '', 'x=y:z', '', 'http://example.com?x=y:z')
    assert extract('http://example.com?x=y:z/') == ('http', '', '', '', 'example', 'com', '', '', 'x=y:z/', '', 'http://example.com?x=y:z/')

    assert extract('mailto:foo@bar.com') == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '', 'mailto:foo@bar.com')
Example #4
0
File: url.py Project: svven/summary
def canonicalize_url(url, keep_params=False, keep_fragments=False):
    """Return a canonical form of ``url``.

    Steps performed by this function:

    * normalize the URL and split it into components with ``extract``;
    * unless ``keep_params`` is true, first pass the normalized URL through
      ``url_query_cleaner`` with ``config.USEFUL_QUERY_KEYS`` as its
      parameter list;
    * sort the query arguments and re-encode them, dropping arguments with
      blank values unless ``keep_params`` is true;
    * strip the orphaned ``=`` left by arguments that have no value;
    * keep or drop the fragment as decided by
      ``getFragment(url, keep_fragments)``;
    * rebuild the URL with empty username and password, percent-quoting the
      path and the fragment (a fixed set of characters is left unquoted);
    * strip trailing ``/`` characters from the result.

    NOTE(review): port handling and percent-encoding case presumably happen
    inside the normalizers; that is not visible here -- confirm.
    """
    if keep_params:
        # Preserve all query params
        normalized = norm(url)
    else:
        # Remove unwanted params
        normalized = url_query_cleaner(
            normalize(url), parameterlist=config.USEFUL_QUERY_KEYS)
    parsed = extract(normalized)

    # Sort params, remove blank if not wanted
    pairs = urlparse.parse_qsl(parsed.query, keep_blank_values=keep_params)
    query = urllib.urlencode(sorted(pairs))
    fragment = getFragment(url, keep_fragments)

    # Drop the '=' that urlencode emits for params without a value, both in
    # the middle of the query ("a=&b=1") and at its very end ("a=").
    query = query.replace("=&", "&")
    query = re.sub(r"=$", "", query)

    # Reconstruct URL, escaping apart from safe chars
    # See http://stackoverflow.com/questions/2849756/list-of-valid-characters-for-the-fragment-identifier-in-an-url
    # http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
    safe = "/.-_~!$&'()*+,;=:@"
    rebuilt = URL(
        parsed.scheme,
        '',  # username slot intentionally blank
        '',  # password slot intentionally blank
        parsed.subdomain,
        parsed.domain,
        parsed.tld,
        parsed.port,
        quote(parsed.path, safe=safe),
        query,
        quote(fragment, safe=safe),
        '',
    )
    return construct(rebuilt).rstrip('/')
Example #5
0
async def check(line):
    """Probe the host named in ``line`` and print every attempted URI.

    The domain (plus ``.tld`` when one is present) is combined with each
    prefix in turn and fetched with a GET request.  When the response status
    is 200 and its text can be decoded, ``parse(resp.url, data)`` is run in
    the default executor and its result becomes the colour used for printing.
    The URI is printed after every attempt, whatever the outcome.

    ``OSError``/``ValueError`` and any unexpected exception stop the probing;
    the listed timeout/connection errors only skip to the next prefix.
    A half-second sleep follows once all prefixes have been tried.
    """
    global loop

    parts = urltools.extract(line)
    suffix = '.' + parts.tld if parts.tld else ''
    host = parts.domain + suffix
    colour = colorama.Fore.RED

    if parts.tld:
        prefixes = pfxs
    else:
        # No TLD: fall back to bare "<scheme>://" prefixes.
        prefixes = (sch + "://" for sch in schemes)

    for prefix in prefixes:
        target = prefix + host
        body = None
        try:
            async with aiohttp.ClientSession(connector=connector()) as session:
                async with session.get(target,
                                       headers={"User-Agent": USERAGENT},
                                       timeout=timeout) as resp:
                    if resp.status == 200:
                        # Undecodable bodies leave ``body`` as None.
                        with contextlib.suppress(LookupError,
                                                 UnicodeDecodeError):
                            body = await resp.text()

            if body is None:
                continue

            colour = await loop.run_in_executor(
                None, functools.partial(parse, resp.url, body))

        except (OSError, ValueError):
            return
        except (RuntimeError, asyncio.TimeoutError,
                aiohttp.http_exceptions.BadHttpMessage,
                aiohttp.ClientResponseError, aiohttp.ServerDisconnectedError,
                ConnectionResetError):
            continue
        except Exception as exc:
            print("Unhandled Exception: ", exc)
            traceback.print_exc()
            return
        finally:
            # Runs on every path out of the try, including continue/return.
            print(colorama.Style.BRIGHT + colour + target
                  + colorama.Style.RESET_ALL)
    await asyncio.sleep(0.5)
def get_content():
    """Return the list of names to process.

    With no command-line argument the result of ``get_list()`` is returned.
    Otherwise ``argv[1]`` is read as a text file with one URL per line; for
    every non-empty line whose extracted element 4 equals ``"twitter"``,
    element 7 with all ``/`` characters removed is collected.

    Prints a message and exits with status 1 when the file does not exist.
    """
    if len(argv) < 2:
        return get_list()

    path = argv[1]
    try:
        # The context manager closes the handle; the original leaked it.
        with open(path, "r") as handle:
            links = [line.rstrip('\n') for line in handle]
    except FileNotFoundError:
        print("File {0} not found!".format(path))
        exit(1)

    content = []
    for item in links:
        if not item:
            continue
        url_info = urltools.extract(item)
        # NOTE(review): indices 4 and 7 are presumably the domain and path
        # fields of the urltools result -- confirm for the installed version.
        if url_info[4] == "twitter":
            content.append(url_info[7].replace("/", ""))
    return content
Example #7
0
File: url.py Project: svven/summary
def getFragment(url, keep_fragments):
    """Return the fragment of ``url`` that should be kept, or ``''``.

    The fragment is kept when it starts with ``!`` or when
    ``keep_fragments`` is true; otherwise an empty string is returned.
    """
    parsed = extract(norm(url))
    fragment = parsed.fragment
    if not (fragment.startswith('!') or keep_fragments):
        return ''
    return fragment
Example #8
0
def test_extract():
    """Check ``extract`` against 8-element expected tuples.

    Covers URLs with a scheme, URLs without one, and internationalized
    domain names.
    """
    cases = [
        # With scheme
        ("http://example.com", ('http', '', 'example', 'com', '', '/', '', '')),
        ("http://example.com:8080", ('http', '', 'example', 'com', '8080', '/', '', '')),
        ("http://example.com:8080/abc?x=1&y=2#qwe", ('http', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')),
        ("http://example.ac.at", ('http', '', 'example', 'ac.at', '', '/', '', '')),
        ("http://example.co.uk", ('http', '', 'example', 'co.uk', '', '/', '', '')),
        ("http://foo.bar.example.co.uk", ('http', 'foo.bar', 'example', 'co.uk', '', '/', '', '')),
        # Without scheme
        ("example.com.", ('', '', 'example', 'com', '', '', '', '')),
        ("example.com/abc", ('', '', 'example', 'com', '', '/abc', '', '')),
        ("www.example.com", ('', 'www', 'example', 'com', '', '', '', '')),
        ("example.com/", ('', '', 'example', 'com', '', '/', '', '')),
        ("example.com:8080", ('', '', 'example', 'com', '8080', '', '', '')),
        ("example.com:8080/", ('', '', 'example', 'com', '8080', '/', '', '')),
        ("example.com:8080/abc", ('', '', 'example', 'com', '8080', '/abc', '', '')),
        # Internationalized domain names
        ("http://пример.рф", ('http', '', 'пример', 'рф', '', '/', '', '')),
        ("http://إختبار.مصر/", ('http', '', 'إختبار', 'مصر', '', '/', '', '')),
    ]
    for url, expected in cases:
        assert extract(url) == expected
Example #9
0
def test_extract():
    """Check ``extract`` against 10-element expected tuples.

    Covers URLs with and without a scheme, credentials, internationalized
    domain names, ``mailto:`` addresses and IPv6 literals.  The credential
    and ``mailto`` cases use literal ``foo:bar@`` / ``foo@bar.com`` inputs so
    that they agree with the ``'foo'`` / ``'bar'`` components asserted in the
    expected tuples.
    """
    assert extract("http://example.com") == ('http', '', '', '', 'example', 'com', '', '', '', '')
    assert extract("http://example.com:8080") == ('http', '', '', '', 'example', 'com', '8080', '', '', '')
    assert extract("http://example.com:8080/abc?x=1&y=2#qwe") == ('http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')
    assert extract("http://example.ac.at") == ('http', '', '', '', 'example', 'ac.at', '', '', '', '')
    assert extract("http://example.co.uk/") == ('http', '', '', '', 'example', 'co.uk', '', '/', '', '')
    assert extract("http://foo.bar.example.co.uk") == ('http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '')
    # Username "foo" and password "bar" must appear in the input URL.
    assert extract("http://foo:bar@www.example.com:1234/foo/?x=1#bla") == ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla')
    assert extract("http://example.com?foo=bar:blub") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '')
    assert extract("http://example.com?foo=bar:blub/") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '')

    assert extract("example.com.") == ('', '', '', '', 'example', 'com', '', '', '', '')
    assert extract("example.com/abc") == ('', '', '', '', 'example', 'com', '', '/abc', '', '')
    assert extract("www.example.com") == ('', '', '', 'www', 'example', 'com', '', '', '', '')
    assert extract("example.com/") == ('', '', '', '', 'example', 'com', '', '/', '', '')
    assert extract("example.com:8080") == ('', '', '', '', 'example', 'com', '8080', '', '', '')
    assert extract("example.com:8080/") == ('', '', '', '', 'example', 'com', '8080', '/', '', '')
    assert extract("example.com:8080/abc") == ('', '', '', '', 'example', 'com', '8080', '/abc', '', '')
    assert extract("www.example.com/?x=1") == ('', '', '', 'www', 'example', 'com', '', '/', 'x=1', '')
    assert extract("www.example.com?x=1") == ('', '', '', 'www', 'example', 'com', '', '', 'x=1', '')
    assert extract("www.example.com/#foo") == ('', '', '', 'www', 'example', 'com', '', '/', '', 'foo')
    assert extract("www.example.com#foo") == ('', '', '', 'www', 'example', 'com', '', '', '', 'foo')

    assert extract("http://пример.рф") == ('http', '', '', '', 'пример', 'рф', '', '', '', '')
    assert extract("http://إختبار.مصر/") == ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '')

    assert extract("mailto:foo@bar.com") == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '')

    assert extract("http://[::1]/foo/bar") == ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '')
    assert extract("[::1]/foo/bar") == ('', '', '', '', '[::1]', '', '', '/foo/bar', '', '')
def test_extract__ip():
    """Check ``extract`` on IPv6 literal hosts, with and without a scheme."""
    cases = [
        ('http://[::1]/foo',
         ('http', '', '', '', '[::1]', '', '', '/foo', '', '', 'http://[::1]/foo')),
        ('[::1]/foo',
         ('', '', '', '', '[::1]', '', '', '/foo', '', '', '[::1]/foo')),
    ]
    for url, expected in cases:
        assert extract(url) == expected
def test_extract__no_scheme():
    """Check ``extract`` on inputs that do not start with a scheme."""
    # Each case pairs an input with the full 11-element tuple expected back.
    cases = [
        ('example.com.', ('', '', '', '', 'example', 'com', '', '', '', '', 'example.com.')),
        ('example.com/abc', ('', '', '', '', 'example', 'com', '', '/abc', '', '', 'example.com/abc')),
        ('www.example.com', ('', '', '', 'www', 'example', 'com', '', '', '', '', 'www.example.com')),
        ('example.com/', ('', '', '', '', 'example', 'com', '', '/', '', '', 'example.com/')),
        ('example.com:8080', ('', '', '', '', 'example', 'com', '8080', '', '', '', 'example.com:8080')),
        ('example.com:8080/', ('', '', '', '', 'example', 'com', '8080', '/', '', '', 'example.com:8080/')),
        ('example.com:8080/abc', ('', '', '', '', 'example', 'com', '8080', '/abc', '', '', 'example.com:8080/abc')),
        ('www.example.com/?x=1', ('', '', '', 'www', 'example', 'com', '', '/', 'x=1', '', 'www.example.com/?x=1')),
        ('www.example.com?x=1', ('', '', '', 'www', 'example', 'com', '', '', 'x=1', '', 'www.example.com?x=1')),
        ('www.example.com/#foo', ('', '', '', 'www', 'example', 'com', '', '/', '', 'foo', 'www.example.com/#foo')),
        ('www.example.com#foo', ('', '', '', 'www', 'example', 'com', '', '', '', 'foo', 'www.example.com#foo')),
    ]
    for url, expected in cases:
        assert extract(url) == expected
def test_extract__idn():
    """Check ``extract`` against URLs with internationalized domain names."""
    # The last element of each expected tuple is the input URL itself, so
    # only the ten leading fields are listed and the URL is appended.
    cases = [
        (u'http://пример.рф',
         ('http', '', '', '', u'пример', u'рф', '', '', '', '')),
        (u'http://إختبار.مصر/',
         ('http', '', '', '', u'إختبار', u'مصر', '', '/', '', '')),
    ]
    for url, fields in cases:
        assert extract(url) == fields + (url,)
Example #13
0
def test_extract__ip():
    """Check ``extract`` on IPv6 literal hosts, with and without a scheme."""
    # The last element of each expected tuple is the input URL itself, so
    # only the ten leading fields are listed and the URL is appended.
    cases = [
        ('http://[::1]/foo', ('http', '', '', '', '[::1]', '', '', '/foo', '', '')),
        ('[::1]/foo', ('', '', '', '', '[::1]', '', '', '/foo', '', '')),
    ]
    for url, fields in cases:
        assert extract(url) == fields + (url,)
Example #14
0
def test_extract__no_scheme():
    """Check ``extract`` on inputs that do not start with a scheme."""
    # The last element of each expected tuple is the input URL itself, so
    # only the ten leading fields are listed and the URL is appended.
    cases = [
        ('example.com.', ('', '', '', '', 'example', 'com', '', '', '', '')),
        ('example.com/abc', ('', '', '', '', 'example', 'com', '', '/abc', '', '')),
        ('www.example.com', ('', '', '', 'www', 'example', 'com', '', '', '', '')),
        ('example.com/', ('', '', '', '', 'example', 'com', '', '/', '', '')),
        ('example.com:8080', ('', '', '', '', 'example', 'com', '8080', '', '', '')),
        ('example.com:8080/', ('', '', '', '', 'example', 'com', '8080', '/', '', '')),
        ('example.com:8080/abc', ('', '', '', '', 'example', 'com', '8080', '/abc', '', '')),
        ('www.example.com/?x=1', ('', '', '', 'www', 'example', 'com', '', '/', 'x=1', '')),
        ('www.example.com?x=1', ('', '', '', 'www', 'example', 'com', '', '', 'x=1', '')),
        ('www.example.com/#foo', ('', '', '', 'www', 'example', 'com', '', '/', '', 'foo')),
        ('www.example.com#foo', ('', '', '', 'www', 'example', 'com', '', '', '', 'foo')),
    ]
    for url, fields in cases:
        assert extract(url) == fields + (url,)
Example #15
0
def test_extract():
    """Check ``extract`` against 10-element expected tuples.

    Covers URLs with and without a scheme, credentials, internationalized
    domain names, ``mailto:`` addresses and IPv6 literals.  The credential
    and ``mailto`` cases use literal ``foo:bar@`` / ``foo@bar.com`` inputs so
    that they agree with the ``'foo'`` / ``'bar'`` components asserted in the
    expected tuples.
    """
    assert extract("http://example.com") == ('http', '', '', '', 'example',
                                             'com', '', '', '', '')
    assert extract("http://example.com:8080") == ('http', '', '', '',
                                                  'example', 'com', '8080', '',
                                                  '', '')
    assert extract("http://example.com:8080/abc?x=1&y=2#qwe") == (
        'http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')
    assert extract("http://example.ac.at") == ('http', '', '', '', 'example',
                                               'ac.at', '', '', '', '')
    assert extract("http://example.co.uk/") == ('http', '', '', '', 'example',
                                                'co.uk', '', '/', '', '')
    assert extract("http://foo.bar.example.co.uk") == ('http', '', '',
                                                       'foo.bar', 'example',
                                                       'co.uk', '', '', '', '')
    # Username "foo" and password "bar" must appear in the input URL.
    assert extract("http://foo:bar@www.example.com:1234/foo/?x=1#bla") == (
        'http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1',
        'bla')
    assert extract("http://example.com?foo=bar:blub") == ('http', '', '', '',
                                                          'example', 'com', '',
                                                          '', 'foo=bar:blub',
                                                          '')
    assert extract("http://example.com?foo=bar:blub/") == ('http', '', '', '',
                                                           'example', 'com',
                                                           '', '',
                                                           'foo=bar:blub/', '')

    assert extract("example.com.") == ('', '', '', '', 'example', 'com', '',
                                       '', '', '')
    assert extract("example.com/abc") == ('', '', '', '', 'example', 'com', '',
                                          '/abc', '', '')
    assert extract("www.example.com") == ('', '', '', 'www', 'example', 'com',
                                          '', '', '', '')
    assert extract("example.com/") == ('', '', '', '', 'example', 'com', '',
                                       '/', '', '')
    assert extract("example.com:8080") == ('', '', '', '', 'example', 'com',
                                           '8080', '', '', '')
    assert extract("example.com:8080/") == ('', '', '', '', 'example', 'com',
                                            '8080', '/', '', '')
    assert extract("example.com:8080/abc") == ('', '', '', '', 'example',
                                               'com', '8080', '/abc', '', '')
    assert extract("www.example.com/?x=1") == ('', '', '', 'www', 'example',
                                               'com', '', '/', 'x=1', '')
    assert extract("www.example.com?x=1") == ('', '', '', 'www', 'example',
                                              'com', '', '', 'x=1', '')
    assert extract("www.example.com/#foo") == ('', '', '', 'www', 'example',
                                               'com', '', '/', '', 'foo')
    assert extract("www.example.com#foo") == ('', '', '', 'www', 'example',
                                              'com', '', '', '', 'foo')

    assert extract("http://пример.рф") == ('http', '', '', '', 'пример', 'рф',
                                           '', '', '', '')
    assert extract("http://إختبار.مصر/") == ('http', '', '', '', 'إختبار',
                                             'مصر', '', '/', '', '')

    assert extract("mailto:foo@bar.com") == ('mailto', 'foo', '', '', 'bar',
                                             'com', '', '', '', '')

    assert extract("http://[::1]/foo/bar") == ('http', '', '', '', '[::1]', '',
                                               '', '/foo/bar', '', '')
    assert extract("[::1]/foo/bar") == ('', '', '', '', '[::1]', '', '',
                                        '/foo/bar', '', '')