def issamedomain(index, url):
    parsed_index = urltools.parse(index)
    parsed_url = urltools.parse(url)
    return ispath(url) or \
        '{}.{}'.format(parsed_index.domain, parsed_index.tld) == \
        '{}.{}'.format(parsed_url.domain, parsed_url.tld)
def normalize_url(self, web, sub_path=False):
    if web.startswith("http://") or web.startswith("https://"):
        parse = urltools.parse(web)
    else:
        web = "http://" + web
        parse = urltools.parse(web)
    url = parse.domain + "." + parse.tld
    return (url, parse.path) if sub_path else url
def normalize_url(company):
    web = company.replace("_", "/")
    if web.startswith("http://") or web.startswith("https://"):
        parse = urltools.parse(web)
    else:
        web = "http://" + web
        parse = urltools.parse(web)
    url = parse.domain + "." + parse.tld
    return url
def is_valid_url(self, response, url, text, custom_url_pattern):
    word_count = len(re.split('\\s+', text.strip())) if text else 0
    if custom_url_pattern:
        return word_count >= 5 and custom_url_pattern.search(url)
    else:
        site_domain = urltools.parse(response.url).domain
        url_domain = urltools.parse(url).domain
        return word_count >= 5 and url_domain == site_domain and urls.valid_url(url)
def test_parse():
    assert parse('http://example.com') == ('http', '', '', '', 'example', 'com', '', '', '', '', 'http://example.com')
    assert parse('http://example.com:8080') == ('http', '', '', '', 'example', 'com', '8080', '', '', '', 'http://example.com:8080')
    assert parse('http://example.co.uk') == ('http', '', '', '', 'example', 'co.uk', '', '', '', '', 'http://example.co.uk')
    assert parse('http://example.com/foo/') == ('http', '', '', '', 'example', 'com', '', '/foo/', '', '', 'http://example.com/foo/')
    assert parse('http://*****:*****@www.example.com:1234/foo/?x=1#bla') == ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla', 'http://*****:*****@www.example.com:1234/foo/?x=1#bla')
    assert parse('http://example.com?foo=bar:blub') == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '', 'http://example.com?foo=bar:blub')
    assert parse('http://example.com?foo=bar:blub/') == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '', 'http://example.com?foo=bar:blub/')
    assert parse('mailto:[email protected]') == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '', 'mailto:[email protected]')
def test_parse(): assert parse("http://example.com") == ('http', '', 'example', 'com', '', '/', '', '') assert parse("http://example.com:8080") == ('http', '', 'example', 'com', '8080', '/', '', '') assert parse("http://example.ac.at") == ('http', '', 'example', 'ac.at', '', '/', '', '') assert parse("http://example.co.uk") == ('http', '', 'example', 'co.uk', '', '/', '', '') assert parse("example.com.") == ('', '', '', '', '', 'example.com.', '', '') assert parse("example.com/abc") == ('', '', '', '', '', 'example.com/abc', '', '') assert parse("www.example.com") == ('', '', '', '', '', 'www.example.com', '', '') assert parse("http://пример.рф") == ('http', '', 'пример', 'рф', '', '/', '', '') assert parse("http://إختبار.مصر/") == ('http', '', 'إختبار', 'مصر', '', '/', '', '')
class imdb_spider(scrapy.Spider):
    name = 'scifi_movies'
    url = ['https://www.imdb.com/search/title/?genres=sci-fi&start=1&explore=title_type,genres']
    for it in range(51, 5000, 50):
        url.append('https://www.imdb.com/search/title/?genres=sci-fi&start={}&explore=title_type,genres'.format(it))
    parse = urltools.parse(url[0])
    domain = parse.domain
    URLS_CRAWLED = []
    start_time = time.time()
    custom_settings = {
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter'
    }

    def start_requests(self):
        for url in self.url:
            yield scrapy.Request(url, callback=self.imdb_parser)

    def imdb_parser(self, response):
        end_time = time.time()
        u = urltools.parse(response.url)
        if response.status == 200:
            # title page (depth 1): extract the movie fields and emit an item
            if response.meta['depth'] == 1:
                soup = BeautifulSoup(response.text, 'html.parser')
                title = soup.find('title').get_text()
                r_date, budget, g_usa, runtime = info_extractor(soup)
                _id = unique_id_generator(response.url)
                yield {
                    "id": _id,
                    "url": response.url,
                    "timestamp_crawl": time.time(),
                    "title": title,
                    "release_date": r_date,
                    "budget": budget,
                    "gross_usa": g_usa,
                    "runtime": runtime
                }
                return
            # listing page: follow /title links that stay on the same domain
            for url in response.css("a::attr(href)"):
                u = url.get()
                if re.search('^/title', u):
                    next_page = response.urljoin(u)
                    if next_page not in self.URLS_CRAWLED:
                        next_page_parse = urltools.parse(next_page)
                        if next_page_parse.domain == self.domain:
                            self.URLS_CRAWLED.append(next_page)
                            yield Request(next_page, callback=self.imdb_parser)
def test_parse__no_scheme():
    assert parse('example.com.') == ('', '', '', '', '', '', '', 'example.com.', '', '', 'example.com.')
    assert parse('example.com/abc') == ('', '', '', '', '', '', '', 'example.com/abc', '', '', 'example.com/abc')
    assert parse('www.example.com') == ('', '', '', '', '', '', '', 'www.example.com', '', '', 'www.example.com')
    assert parse('www.example.com/?x=1') == ('', '', '', '', '', '', '', 'www.example.com/', 'x=1', '', 'www.example.com/?x=1')
    assert parse('www.example.com?x=1') == ('', '', '', '', '', '', '', 'www.example.com', 'x=1', '', 'www.example.com?x=1')
    assert parse('www.example.com/#foo') == ('', '', '', '', '', '', '', 'www.example.com/', '', 'foo', 'www.example.com/#foo')
    assert parse('www.example.com#foo') == ('', '', '', '', '', '', '', 'www.example.com', '', 'foo', 'www.example.com#foo')
def generate_bag_of_words(company):
    max_level = 3
    start_year, end_year = current_year - 4, current_year
    dnf = working_dir + "directory_not_found"
    directory = properties.get("data_directory")
    web = company.replace("_", "/")
    if web.startswith("http://") or web.startswith("https://"):
        parse = urltools.parse(web)
    else:
        web = "http://" + web
        parse = urltools.parse(web)
    url = parse.domain + "." + parse.tld
    dir = directory + url + "/"
    if not os.path.isdir(dir):
        f = open(dnf, "a")
        f.write(company + "\n")
        f.close()
        return
    years = os.listdir(dir)
    years = [int(x) for x in years]
    years.sort()
    words = ""
    file_count = 0
    for year in years:
        if start_year <= year <= end_year:
            y = dir + str(year) + "/"
            files = os.listdir(y)
            levels = []
            for filename in files:
                if "txt" == filename[-3:]:
                    levels.append(int(filename[:-4]))
            levels.sort()
            for level in levels:
                if level <= max_level:
                    path = y + str(level) + ".txt"
                    f = open(path, "r")
                    content = f.read().strip().lower().replace("__info__", " ")
                    words += content
                    f.close()
                    file_count += 1
    return words
def __init__(self, website, site_ctx=None, debug=False):
    self.uri = urltools.normalize(website)
    self.parsed = urltools.parse(website)
    self.domain = ".".join(self.parsed[4:6]).lstrip("www.")
    self.robots = None
    self.sitemap = None  # list of documents
    self.error = {}
    self.debug = debug
    self.__session = None
    self.load_domain_state()
class imdb_spider(scrapy.Spider):
    name = 'cast_scraper'
    url = ['https://www.imdb.com/search/name/?gender=male,female&ref_=rlm']
    for it in range(51, 5000, 50):
        url.append('https://www.imdb.com/search/name/?gender=male,female&start={}&ref_=rlm'.format(it))
    parse = urltools.parse(url[0])
    domain = parse.domain
    URLS_CRAWLED = []
    start_time = time.time()
    custom_settings = {
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
    }

    def start_requests(self):
        for url in self.url:
            yield scrapy.Request(url, callback=self.imdb_parser)

    def imdb_parser(self, response):
        end_time = time.time()
        u = urltools.parse(response.url)
        if response.status == 200:
            # name page (depth 1): extract the biography fields and emit an item
            if response.meta['depth'] == 1:
                soup = BeautifulSoup(response.text, 'html.parser')
                title = soup.find('title').get_text()
                b_date, b_place, d_date, d_place = info_extractor(soup)
                _id = unique_id_generator(response.url)
                timestamp_crawl = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield {
                    "id": _id,
                    "url": response.url,
                    "timestamp_crawl": timestamp_crawl,
                    "name": title,
                    "date_of_birth": b_date,
                    "place_of_birth": b_place,
                    "date_of_death": d_date,
                    "place_of_death": d_place
                }
                return
            # listing page: follow /name links that stay on the same domain
            for url in response.css("a::attr(href)"):
                u = url.get()
                if re.search('^/name', u):
                    next_page = response.urljoin(u)
                    if next_page not in self.URLS_CRAWLED:
                        next_page_parse = urltools.parse(next_page)
                        if next_page_parse.domain == self.domain:
                            self.URLS_CRAWLED.append(next_page)
                            yield Request(next_page, callback=self.imdb_parser)
def test_parse__ip():
    assert parse('http://[::1]/foo') == ('http', '', '', '', '[::1]', '', '', '/foo', '', '', 'http://[::1]/foo')
    assert parse('[::1]/foo') == ('', '', '', '', '', '', '', '[::1]/foo', '', '', '[::1]/foo')
def test_parse(): assert parse("http://example.com") == ('http', '', '', '', 'example', 'com', '', '', '', '') assert parse("http://example.com:8080") == ('http', '', '', '', 'example', 'com', '8080', '', '', '') assert parse("http://example.ac.at") == ('http', '', '', '', 'example', 'ac.at', '', '', '', '') assert parse("http://example.co.uk") == ('http', '', '', '', 'example', 'co.uk', '', '', '', '') assert parse("http://example.com/foo/") == ('http', '', '', '', 'example', 'com', '', '/foo/', '', '') assert parse("http://*****:*****@www.example.com:1234/foo/?x=1#bla") == ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla') assert parse("http://example.com?foo=bar:blub") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '') assert parse("http://example.com?foo=bar:blub/") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '') assert parse("example.com.") == ('', '', '', '', '', '', '', 'example.com.', '', '') assert parse("example.com/abc") == ('', '', '', '', '', '', '', 'example.com/abc', '', '') assert parse("www.example.com") == ('', '', '', '', '', '', '', 'www.example.com', '', '') assert parse("www.example.com/?x=1") == ('', '', '', '', '', '', '', 'www.example.com/', 'x=1', '') assert parse("www.example.com?x=1") == ('', '', '', '', '', '', '', 'www.example.com', 'x=1', '') assert parse("www.example.com/#foo") == ('', '', '', '', '', '', '', 'www.example.com/', '', 'foo') assert parse("www.example.com#foo") == ('', '', '', '', '', '', '', 'www.example.com', '', 'foo') assert parse("http://пример.рф") == ('http', '', '', '', 'пример', 'рф', '', '', '', '') assert parse("http://إختبار.مصر/") == ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '') assert parse("mailto:[email protected]") == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '') assert parse("http://[::1]/foo/bar") == ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '') assert parse("[::1]/foo/bar") == ('', '', '', '', '', '', '', '[::1]/foo/bar', '', '')
def test_parse__idn():
    assert parse(u'http://пример.рф') == ('http', '', '', '', u'пример', u'рф', '', '', '', '', u'http://пример.рф')
    assert parse(u'http://إختبار.مصر/') == ('http', '', '', '', u'إختبار', u'مصر', '', '/', '', '', u'http://إختبار.مصر/')
# save to json file
x_axis = model[:, 0]
y_axis = model[:, 1]
x_norm = (x_axis - np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
y_norm = (y_axis - np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
print(len(main))
data = []
meta = []
for i, studio in enumerate(main):
    temp = {}
    name = studio['name']
    size = studio['size']
    url = studio['url']
    location = studio['location']
    fname = urltools.parse(url).domain + ".png"
    x = x_norm[i]
    y = y_norm[i]
    tags = tags_dict[name]
    temp['name'] = name
    temp['location'] = location
    temp['size'] = size
    temp['url'] = url
    temp['fname'] = fname
    temp['x'] = x
    temp['y'] = y
    temp['tags'] = tags
    data.append(temp)
def parse(url_str):
    "Parse URL into component parts"
    return _urltools.parse(url_str)

def find_fragment(url_str):
    "Extract fragment component from URL"
    return _urltools.parse(url_str).fragment

def find_query(url_str):
    "Extract query component from URL"
    return _urltools.parse(url_str).query

def find_path(url_str):
    "Extract path component from URL"
    return _urltools.parse(url_str).path

def find_domain(url_str):
    "Extract domain from URL"
    return _urltools.parse(url_str).domain
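# A small usage sketch for the wrapper helpers above; the URL is illustrative
# and the expected values follow the parse() behaviour shown in the tests.
url = "http://www.example.co.uk/foo/bar?x=1#frag"
print(find_domain(url))    # 'example'
print(find_path(url))      # '/foo/bar'
print(find_query(url))     # 'x=1'
print(find_fragment(url))  # 'frag'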
def process_links(self, url):
    """Process links in a website, organize links as same domain or 3rd party

    Args:
        url
    Returns:
        modifies class objects
    """
    links, images = self.parse_html(url=url)
    # parse the input url for a base domain
    my_domain_parsed = urltools.parse(url)
    my_domain_string = my_domain_parsed.domain + "." + my_domain_parsed.tld
    # {"link": url,
    #  "sublinks": links,
    #  "static-elements": images,
    #  "third-party": url}
    link_object = {}
    link_object["link"] = url
    link_object["sublinks"] = []
    link_object["static-elements"] = images
    link_object["third-party"] = []
    for link in links:
        parsed = urltools.parse(str(link))
        domain_string = parsed.domain + "." + parsed.tld
        if not parsed.path.startswith("#"):  # skip all anchor links
            if domain_string == my_domain_string:
                # compare domains, assuming subdomain does not matter for 'uniqueness'
                subdomain = parsed.subdomain + "." if parsed.subdomain else ''
                urlstring = parsed.scheme + "://" + subdomain + parsed.domain \
                    + "." + parsed.tld + parsed.path
                if urlstring not in link_object["sublinks"]:
                    # sublinks is a relative unique list of links
                    link_object["sublinks"].append(urlstring)
                if urlstring not in self.domain:
                    # self.domain is a global unique list of links
                    self.domain.append(urlstring)
            elif parsed.domain == '':
                # handle relative path URLs, assume they belong to the base domain of the input url
                path = parsed.path if parsed.path.startswith("/") else "/" + parsed.path
                urlstring = my_domain_parsed.scheme + "://" + my_domain_parsed.subdomain + "." + \
                    my_domain_parsed.domain + "." + my_domain_parsed.tld + path
                if urlstring not in link_object["sublinks"]:
                    # sublinks is a relative unique list of links
                    link_object["sublinks"].append(urlstring)
                if urlstring not in self.domain:
                    # self.domain is a global unique list of links
                    self.domain.append(urlstring)
            else:
                # handle non-matching domains, assume all third-party
                subdomain = parsed.subdomain + "." if parsed.subdomain else ''
                urlstring = parsed.scheme + "://" + subdomain + parsed.domain \
                    + "." + parsed.tld + parsed.path
                if urlstring not in link_object["third-party"]:
                    # third-party is a relative unique list of links
                    link_object["third-party"].append(urlstring)
                if urlstring not in self.third_party:
                    # self.third_party is a global unique list of links
                    self.third_party.append(urlstring)
    self.web_crawl_index.append(link_object)
def match_domains(account):
    parsed = urltools.parse(account.url)
    # use _asdict() on the namedtuple result (namedtuples expose no __dict__ on Python 3.8+)
    rootdom = "{domain}.{tld}".format(**parsed._asdict())
    if rootdom in cf_domains:
        return rootdom
def normalize(url, strip=False):
    "RFC 3986-normalize a URL, optionally removing the query/fragment string"
    if strip:
        p = _urltools.parse(url)
        # rebuild the URL without query and fragment, restoring the dots between host parts
        host = (p.subdomain + "." if p.subdomain else "") + p.domain + ("." + p.tld if p.tld else "")
        url = p.scheme + '://' + host + p.path
    return _urltools.normalize(url)
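# Hedged usage sketch for normalize() above: with strip=True the query and
# fragment are dropped before RFC 3986 normalization (host lower-casing,
# dot-segment resolution). The input and expected output are illustrative.
print(normalize("http://www.example.com/a/../b?x=1#frag", strip=True))
# expected roughly: 'http://www.example.com/b'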
def test_parse(): assert parse("http://example.com") == ('http', '', '', '', 'example', 'com', '', '', '', '') assert parse("http://example.com:8080") == ('http', '', '', '', 'example', 'com', '8080', '', '', '') assert parse("http://example.ac.at") == ('http', '', '', '', 'example', 'ac.at', '', '', '', '') assert parse("http://example.co.uk") == ('http', '', '', '', 'example', 'co.uk', '', '', '', '') assert parse("http://example.com/foo/") == ('http', '', '', '', 'example', 'com', '', '/foo/', '', '') assert parse("http://*****:*****@www.example.com:1234/foo/?x=1#bla") == ( 'http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla') assert parse("http://example.com?foo=bar:blub") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '') assert parse("http://example.com?foo=bar:blub/") == ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '') assert parse("example.com.") == ('', '', '', '', '', '', '', 'example.com.', '', '') assert parse("example.com/abc") == ('', '', '', '', '', '', '', 'example.com/abc', '', '') assert parse("www.example.com") == ('', '', '', '', '', '', '', 'www.example.com', '', '') assert parse("www.example.com/?x=1") == ('', '', '', '', '', '', '', 'www.example.com/', 'x=1', '') assert parse("www.example.com?x=1") == ('', '', '', '', '', '', '', 'www.example.com', 'x=1', '') assert parse("www.example.com/#foo") == ('', '', '', '', '', '', '', 'www.example.com/', '', 'foo') assert parse("www.example.com#foo") == ('', '', '', '', '', '', '', 'www.example.com', '', 'foo') assert parse("http://пример.рф") == ('http', '', '', '', 'пример', 'рф', '', '', '', '') assert parse("http://إختبار.مصر/") == ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '') assert parse("mailto:[email protected]") == ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '') assert parse("http://[::1]/foo/bar") == ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '') assert parse("[::1]/foo/bar") == ('', '', '', '', '', '', '', '[::1]/foo/bar', '', '')