def issamedomain(index, url):
    parsed_index = urltools.parse(index)
    parsed_url = urltools.parse(url)
    return ispath(url) or \
        '{}.{}'.format(parsed_index.domain, parsed_index.tld) == \
        '{}.{}'.format(parsed_url.domain, parsed_url.tld)
def normalize_url(self, web, sub_path=False):
    if web.startswith("http://") or web.startswith("https://"):
        parse = urltools.parse(web)
    else:
        web = "http://" + web
        parse = urltools.parse(web)
    url = parse.domain + "." + parse.tld
    return (url, parse.path) if sub_path else url
def normalize_url(company):
    web = company.replace("_", "/")
    if web.startswith("http://") or web.startswith("https://"):
        parse = urltools.parse(web)
    else:
        web = "http://" + web
        parse = urltools.parse(web)
    url = parse.domain + "." + parse.tld
    return url
def is_valid_url(self, response, url, text, custom_url_pattern):
    word_count = len(re.split('\\s+', text.strip())) if text else 0
    if custom_url_pattern:
        return word_count >= 5 and custom_url_pattern.search(url)
    else:
        site_domain = urltools.parse(response.url).domain
        url_domain = urltools.parse(url).domain
        return word_count >= 5 and url_domain == site_domain and urls.valid_url(url)
def test_parse():
    assert parse('http://example.com') == ('http', '', '', '', 'example', 'com', '', '', '', '', 'http://example.com')
    assert parse('http://example.com:8080') == ('http', '', '', '', 'example', 'com', '8080', '', '', '', 'http://example.com:8080')
    assert parse('http://example.co.uk') == ('http', '', '', '', 'example', 'co.uk', '', '', '', '', 'http://example.co.uk')
    assert parse('http://example.com/foo/') == ('http', '', '', '', 'example', 'com', '', '/foo/', '', '', 'http://example.com/foo/')
    assert parse('http://*****:*****@www.example.com:1234/foo/?x=1#bla') == ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla', 'http://*****:*****@www.example.com:1234/foo/?x=1#bla')
    assert parse('http://example.com?foo=bar:blub') == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '', 'http://example.com?foo=bar:blub')
    assert parse('http://example.com?foo=bar:blub/') == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '', 'http://example.com?foo=bar:blub/')
    assert parse('mailto:[email protected]') == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '', 'mailto:[email protected]')
def test_parse(): assert parse("http://example.com") == ('http', '', 'example', 'com', '', '/', '', '') assert parse("http://example.com:8080") == ('http', '', 'example', 'com', '8080', '/', '', '') assert parse("http://example.ac.at") == ('http', '', 'example', 'ac.at', '', '/', '', '') assert parse("http://example.co.uk") == ('http', '', 'example', 'co.uk', '', '/', '', '') assert parse("example.com.") == ('', '', '', '', '', 'example.com.', '', '') assert parse("example.com/abc") == ('', '', '', '', '', 'example.com/abc', '', '') assert parse("www.example.com") == ('', '', '', '', '', 'www.example.com', '', '') assert parse("http://пример.рф") == ('http', '', 'пример', 'рф', '', '/', '', '') assert parse("http://إختبار.مصر/") == ('http', '', 'إختبار', 'مصر', '', '/', '', '')
class imdb_spider(scrapy.Spider):
    name = 'scifi_movies'
    url = ['https://www.imdb.com/search/title/?genres=sci-fi&start=1&explore=title_type,genres']
    for it in range(51, 5000, 50):
        url.append('https://www.imdb.com/search/title/?genres=sci-fi&start={}&explore=title_type,genres'.format(it))
    parse = urltools.parse(url[0])
    domain = parse.domain
    URLS_CRAWLED = []
    start_time = time.time()
    custom_settings = {
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter'
    }

    def start_requests(self):
        for url in self.url:
            yield scrapy.Request(url, callback=self.imdb_parser)

    def imdb_parser(self, response):
        end_time = time.time()
        u = urltools.parse(response.url)
        if response.status == 200:
            # title page (depth 1): extract the movie fields and emit an item
            if response.meta['depth'] == 1:
                soup = BeautifulSoup(response.text, 'html.parser')
                title = soup.find('title').get_text()
                r_date, budget, g_usa, runtime = info_extractor(soup)
                _id = unique_id_generator(response.url)
                yield {
                    "id": _id,
                    "url": response.url,
                    "timestamp_crawl": time.time(),
                    "title": title,
                    "release_date": r_date,
                    "budget": budget,
                    "gross_usa": g_usa,
                    "runtime": runtime
                }
                return
            # listing page: follow /title links that stay on the same domain
            for url in response.css("a::attr(href)"):
                u = url.get()
                if re.search('^/title', u):
                    next_page = response.urljoin(u)
                    if next_page not in self.URLS_CRAWLED:
                        next_page_parse = urltools.parse(next_page)
                        if next_page_parse.domain == self.domain:
                            self.URLS_CRAWLED.append(next_page)
                            yield Request(next_page, callback=self.imdb_parser)
def test_parse__no_scheme():
    assert parse('example.com.') == ('', '', '', '', '', '', '', 'example.com.', '', '', 'example.com.')
    assert parse('example.com/abc') == ('', '', '', '', '', '', '', 'example.com/abc', '', '', 'example.com/abc')
    assert parse('www.example.com') == ('', '', '', '', '', '', '', 'www.example.com', '', '', 'www.example.com')
    assert parse('www.example.com/?x=1') == ('', '', '', '', '', '', '', 'www.example.com/', 'x=1', '', 'www.example.com/?x=1')
    assert parse('www.example.com?x=1') == ('', '', '', '', '', '', '', 'www.example.com', 'x=1', '', 'www.example.com?x=1')
    assert parse('www.example.com/#foo') == ('', '', '', '', '', '', '', 'www.example.com/', '', 'foo', 'www.example.com/#foo')
    assert parse('www.example.com#foo') == ('', '', '', '', '', '', '', 'www.example.com', '', 'foo', 'www.example.com#foo')
def generate_bag_of_words(company):
    max_level = 3
    start_year, end_year = current_year - 4, current_year
    dnf = working_dir + "directory_not_found"
    directory = properties.get("data_directory")
    web = company.replace("_", "/")
    if web.startswith("http://") or web.startswith("https://"):
        parse = urltools.parse(web)
    else:
        web = "http://" + web
        parse = urltools.parse(web)
    url = parse.domain + "." + parse.tld
    dir = directory + url + "/"
    if not os.path.isdir(dir):
        f = open(dnf, "a")
        f.write(company + "\n")
        f.close()
        return
    years = os.listdir(dir)
    years = [int(x) for x in years]
    years.sort()
    words = ""
    file_count = 0
    for year in years:
        if start_year <= year <= end_year:
            y = dir + str(year) + "/"
            files = os.listdir(y)
            levels = []
            for filename in files:
                if "txt" == filename[-3:]:
                    levels.append(int(filename[:-4]))
            levels.sort()
            for level in levels:
                if level <= max_level:
                    path = y + str(level) + ".txt"
                    f = open(path, "r")
                    content = f.read().strip().lower().replace("__info__", " ")
                    words += content
                    f.close()
                    file_count += 1
    return words
def __init__(self, website, site_ctx=None, debug=False):
    self.uri = urltools.normalize(website)
    self.parsed = urltools.parse(website)
    self.domain = ".".join(self.parsed[4:6]).lstrip("www.")
    self.robots = None
    self.sitemap = None  # list of documents
    self.error = {}
    self.debug = debug
    self.__session = None
    self.load_domain_state()
class imdb_spider(scrapy.Spider):
    name = 'cast_scraper'
    url = ['https://www.imdb.com/search/name/?gender=male,female&ref_=rlm']
    for it in range(51, 5000, 50):
        url.append('https://www.imdb.com/search/name/?gender=male,female&start={}&ref_=rlm'.format(it))
    parse = urltools.parse(url[0])
    domain = parse.domain
    URLS_CRAWLED = []
    start_time = time.time()
    custom_settings = {
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
    }

    def start_requests(self):
        for url in self.url:
            yield scrapy.Request(url, callback=self.imdb_parser)

    def imdb_parser(self, response):
        end_time = time.time()
        u = urltools.parse(response.url)
        if response.status == 200:
            # name page (depth 1): extract the biography fields and emit an item
            if response.meta['depth'] == 1:
                soup = BeautifulSoup(response.text, 'html.parser')
                title = soup.find('title').get_text()
                b_date, b_place, d_date, d_place = info_extractor(soup)
                _id = unique_id_generator(response.url)
                timestamp_crawl = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield {
                    "id": _id,
                    "url": response.url,
                    "timestamp_crawl": timestamp_crawl,
                    "name": title,
                    "date_of_birth": b_date,
                    "place_of_birth": b_place,
                    "date_of_death": d_date,
                    "place_of_death": d_place
                }
                return
            # listing page: follow /name links that stay on the same domain
            for url in response.css("a::attr(href)"):
                u = url.get()
                if re.search('^/name', u):
                    next_page = response.urljoin(u)
                    if next_page not in self.URLS_CRAWLED:
                        next_page_parse = urltools.parse(next_page)
                        if next_page_parse.domain == self.domain:
                            self.URLS_CRAWLED.append(next_page)
                            yield Request(next_page, callback=self.imdb_parser)
def test_parse__ip():
    assert parse('http://[::1]/foo') == ('http', '', '', '', '[::1]', '', '', '/foo', '', '', 'http://[::1]/foo')
    assert parse('[::1]/foo') == ('', '', '', '', '', '', '', '[::1]/foo', '', '', '[::1]/foo')
def test_parse(): assert parse("http://example.com") == ('http', '', '', '', 'example', 'com', '', '', '', '') assert parse("http://example.com:8080") == ('http', '', '', '', 'example', 'com', '8080', '', '', '') assert parse("http://example.ac.at") == ('http', '', '', '', 'example', 'ac.at', '', '', '', '') assert parse("http://example.co.uk") == ('http', '', '', '', 'example', 'co.uk', '', '', '', '') assert parse("http://example.com/foo/") == ('http', '', '', '', 'example', 'com', '', '/foo/', '', '') assert parse("http://*****:*****@www.example.com:1234/foo/?x=1#bla") == ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla') assert parse("http://example.com?foo=bar:blub") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '') assert parse("http://example.com?foo=bar:blub/") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '') assert parse("example.com.") == ('', '', '', '', '', '', '', 'example.com.', '', '') assert parse("example.com/abc") == ('', '', '', '', '', '', '', 'example.com/abc', '', '') assert parse("www.example.com") == ('', '', '', '', '', '', '', 'www.example.com', '', '') assert parse("www.example.com/?x=1") == ('', '', '', '', '', '', '', 'www.example.com/', 'x=1', '') assert parse("www.example.com?x=1") == ('', '', '', '', '', '', '', 'www.example.com', 'x=1', '') assert parse("www.example.com/#foo") == ('', '', '', '', '', '', '', 'www.example.com/', '', 'foo') assert parse("www.example.com#foo") == ('', '', '', '', '', '', '', 'www.example.com', '', 'foo') assert parse("http://пример.рф") == ('http', '', '', '', 'пример', 'рф', '', '', '', '') assert parse("http://إختبار.مصر/") == ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '') assert parse("mailto:[email protected]") == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '') assert parse("http://[::1]/foo/bar") == ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '') assert parse("[::1]/foo/bar") == ('', '', '', '', '', '', '', '[::1]/foo/bar', '', '')
def test_parse__idn():
    assert parse(u'http://пример.рф') == ('http', '', '', '', u'пример', u'рф', '', '', '', '', u'http://пример.рф')
    assert parse(u'http://إختبار.مصر/') == ('http', '', '', '', u'إختبار', u'مصر', '', '/', '', '', u'http://إختبار.مصر/')
# save to json file
x_axis = model[:, 0]
y_axis = model[:, 1]
x_norm = (x_axis - np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
y_norm = (y_axis - np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
print(len(main))
data = []
meta = []
for i, studio in enumerate(main):
    temp = {}
    name = studio['name']
    size = studio['size']
    url = studio['url']
    location = studio['location']
    fname = urltools.parse(url).domain + ".png"
    x = x_norm[i]
    y = y_norm[i]
    tags = tags_dict[name]
    temp['name'] = name
    temp['location'] = location
    temp['size'] = size
    temp['url'] = url
    temp['fname'] = fname
    temp['x'] = x
    temp['y'] = y
    temp['tags'] = tags
    data.append(temp)
def parse(url_str):
    "Parse URL into component parts"
    return _urltools.parse(url_str)

def find_fragment(url_str):
    "Extract fragment component from URL"
    return _urltools.parse(url_str).fragment

def find_query(url_str):
    "Extract query component from URL"
    return _urltools.parse(url_str).query

def find_path(url_str):
    "Extract path component from URL"
    return _urltools.parse(url_str).path

def find_domain(url_str):
    "Extract domain from URL"
    return _urltools.parse(url_str).domain
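# A small usage sketch for the wrapper helpers above; the URL is illustrative
# and the expected values follow the parse() behaviour shown in the tests.
url = "http://www.example.co.uk/foo/bar?x=1#frag"
print(find_domain(url))    # 'example'
print(find_path(url))      # '/foo/bar'
print(find_query(url))     # 'x=1'
print(find_fragment(url))  # 'frag'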
def process_links(self, url):
    """Process links in a website, organize links as same domain or 3rd party

    Args:
        url
    Returns:
        modifies class objects
    """
    links, images = self.parse_html(url=url)
    # parse the input url for a base domain
    my_domain_parsed = urltools.parse(url)
    my_domain_string = my_domain_parsed.domain + "." + my_domain_parsed.tld
    # {"link": url,
    #  "sublinks": links,
    #  "static-elements": images,
    #  "third-party": url}
    link_object = {}
    link_object["link"] = url
    link_object["sublinks"] = []
    link_object["static-elements"] = images
    link_object["third-party"] = []
    for link in links:
        parsed = urltools.parse(str(link))
        domain_string = parsed.domain + "." + parsed.tld
        if not parsed.path.startswith("#"):  # skip all anchor links
            if domain_string == my_domain_string:
                # compare domains, assuming subdomain does not matter for 'uniqueness'
                subdomain = parsed.subdomain + "." if parsed.subdomain else ''
                urlstring = parsed.scheme + "://" + subdomain + parsed.domain \
                    + "." + parsed.tld + parsed.path
                if urlstring not in link_object["sublinks"]:
                    # sublinks is a relative unique list of links
                    link_object["sublinks"].append(urlstring)
                if urlstring not in self.domain:
                    # self.domain is a global unique list of links
                    self.domain.append(urlstring)
            elif parsed.domain == '':
                # handle relative path URLs, assume they belong to the base domain of the input url
                path = parsed.path if parsed.path.startswith("/") else "/" + parsed.path
                urlstring = my_domain_parsed.scheme + "://" + my_domain_parsed.subdomain + "." + \
                    my_domain_parsed.domain + "." + my_domain_parsed.tld + path
                if urlstring not in link_object["sublinks"]:
                    # sublinks is a relative unique list of links
                    link_object["sublinks"].append(urlstring)
                if urlstring not in self.domain:
                    # self.domain is a global unique list of links
                    self.domain.append(urlstring)
            else:
                # handle non-matching domains, assume all third-party
                subdomain = parsed.subdomain + "." if parsed.subdomain else ''
                urlstring = parsed.scheme + "://" + subdomain + parsed.domain \
                    + "." + parsed.tld + parsed.path
                if urlstring not in link_object["third-party"]:
                    # third-party is a relative unique list of links
                    link_object["third-party"].append(urlstring)
                if urlstring not in self.third_party:
                    # self.third_party is a global unique list of links
                    self.third_party.append(urlstring)
    self.web_crawl_index.append(link_object)
def match_domains(account):
    parsed = urltools.parse(account.url)
    # use _asdict() on the namedtuple result (namedtuples expose no __dict__ on Python 3.8+)
    rootdom = "{domain}.{tld}".format(**parsed._asdict())
    if rootdom in cf_domains:
        return rootdom
def normalize(url, strip=False):
    "RFC 3986-normalize a URL, optionally removing the query/fragment string"
    if strip:
        p = _urltools.parse(url)
        # rebuild the URL without query and fragment, restoring the dots between host parts
        host = (p.subdomain + "." if p.subdomain else "") + p.domain + ("." + p.tld if p.tld else "")
        url = p.scheme + '://' + host + p.path
    return _urltools.normalize(url)
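# Hedged usage sketch for normalize() above: with strip=True the query and
# fragment are dropped before RFC 3986 normalization (host lower-casing,
# dot-segment resolution). The input and expected output are illustrative.
print(normalize("http://www.example.com/a/../b?x=1#frag", strip=True))
# expected roughly: 'http://www.example.com/b'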
def test_parse(): assert parse("http://example.com") == ('http', '', '', '', 'example', 'com', '', '', '', '') assert parse("http://example.com:8080") == ('http', '', '', '', 'example', 'com', '8080', '', '', '') assert parse("http://example.ac.at") == ('http', '', '', '', 'example', 'ac.at', '', '', '', '') assert parse("http://example.co.uk") == ('http', '', '', '', 'example', 'co.uk', '', '', '', '') assert parse("http://example.com/foo/") == ('http', '', '', '', 'example', 'com', '', '/foo/', '', '') assert parse("http://*****:*****@www.example.com:1234/foo/?x=1#bla") == ( 'http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla') assert parse("http://example.com?foo=bar:blub") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '') assert parse("http://example.com?foo=bar:blub/") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '') assert parse("example.com.") == ('', '', '', '', '', '', '', 'example.com.', '', '') assert parse("example.com/abc") == ('', '', '', '', '', '', '', 'example.com/abc', '', '') assert parse("www.example.com") == ('', '', '', '', '', '', '', 'www.example.com', '', '') assert parse("www.example.com/?x=1") == ('', '', '', '', '', '', '', 'www.example.com/', 'x=1', '') assert parse("www.example.com?x=1") == ('', '', '', '', '', '', '', 'www.example.com', 'x=1', '') assert parse("www.example.com/#foo") == ('', '', '', '', '', '', '', 'www.example.com/', '', 'foo') assert parse("www.example.com#foo") == ('', '', '', '', '', '', '', 'www.example.com', '', 'foo') assert parse("http://пример.рф") == ('http', '', '', '', 'пример', 'рф', '', '', '', '') assert parse("http://إختبار.مصر/") == ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '') assert parse("mailto:[email protected]") == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '') assert parse("http://[::1]/foo/bar") == ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '') assert parse("[::1]/foo/bar") == ('', '', '', '', '', '', '', '[::1]/foo/bar', '', '')