def extract_domain_of_url(url=None):
    """
    Extract the registered domain from a URL.
    :param url: the URL
    :return: the domain on success, None on failure
    """
    if url is None:
        return None
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    result = no_fetch_extract(url)
    if result.domain == "" or result.suffix == "":
        return None
    else:
        return result.domain + '.' + result.suffix
def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ScraperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ScraperHandler")
        # plugin is essential to functionality
        sys.exit(1)
def read_domains():
    """
    Read the domains to probe from the database, and parse out each
    registered domain and top-level domain (first level).
    Note: domains that do not conform to the naming rules are discarded.
    """
    domains = []
    main_domains = []
    tlds = []
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    try:
        db = MySQL(SOURCE_CONFIG)
    except Exception as e:
        logger.logger.error(e)
        return False
def get_site_type_from_url(url):
    """
    Gets a site type (as defined in profiles.models) from the given URL

    Args:
        url (str): A URL

    Returns:
        str: A string indicating the site type
    """
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=False)
    extract_result = no_fetch_extract(url)
    domain = extract_result.domain.lower()
    if domain in SITE_TYPE_OPTIONS:
        return domain
    return PERSONAL_SITE_TYPE
def __init__(self, config=None):
    if not config:
        # If there is no config specified, we load a non-interactive configuration.
        self.config = helper_config.non_interactive_config_resolver()
    elif not isinstance(config, helper_config.ConfigResolver):
        # If config is not a ConfigResolver, we are in a legacy situation.
        # We protect this part of the Client API.
        self.config = helper_config.legacy_config_resolver(config)
    else:
        self.config = config

    # Validate configuration
    self._validate_config()

    runtime_config = {}

    # Process domain, strip subdomain
    domain_extractor = tldextract.TLDExtract(
        cache_dir=_get_tldextract_cache_path(), include_psl_private_domains=True)
    domain_parts = domain_extractor(self.config.resolve("lexicon:domain"))
    runtime_config["domain"] = f"{domain_parts.domain}.{domain_parts.suffix}"

    if self.config.resolve("lexicon:delegated"):
        # handle delegated domain
        delegated = self.config.resolve("lexicon:delegated").rstrip(".")
        if delegated != runtime_config.get("domain"):
            # convert to relative name
            if delegated.endswith(runtime_config.get("domain")):
                delegated = delegated[:-len(runtime_config.get("domain"))]
                delegated = delegated.rstrip(".")
            # update domain
            runtime_config["domain"] = f"{delegated}.{runtime_config.get('domain')}"

    self.action = self.config.resolve("lexicon:action")
    self.provider_name = self.config.resolve(
        "lexicon:provider_name") or self.config.resolve("lexicon:provider")

    self.config.add_config_source(
        helper_config.DictConfigSource(runtime_config), 0)

    provider_module = importlib.import_module("lexicon.providers." + self.provider_name)
    provider_class = getattr(provider_module, "Provider")
    self.provider = provider_class(self.config)
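# A minimal standalone sketch (not part of the original lexicon source) of how the
# delegated-domain branch above rewrites the runtime domain. The values used here
# ("example.co.uk" and delegated "zone.example.co.uk") are hypothetical.
import tldextract

_extractor = tldextract.TLDExtract(include_psl_private_domains=True)
_parts = _extractor("www.zone.example.co.uk")
_domain = f"{_parts.domain}.{_parts.suffix}"              # "example.co.uk"

_delegated = "zone.example.co.uk.".rstrip(".")            # trailing dot stripped
if _delegated != _domain and _delegated.endswith(_domain):
    _delegated = _delegated[:-len(_domain)].rstrip(".")   # relative name: "zone"
    _domain = f"{_delegated}.{_domain}"                    # "zone.example.co.uk"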
def split_domain_into_subdomains(domain, split_tld=False):
    """
    Walks up a domain by subdomain.

    >>> split_domain_into_subdomains('this.is.a.test.skywww.net')
    ['this.is.a.test.skywww.net', 'is.a.test.skywww.net', 'a.test.skywww.net', 'test.skywww.net', 'skywww.net']
    """
    import tldextract

    # Requires unicode
    domain = ensure_decoded_text(domain)

    # Do not request the latest TLD list on init == suffix_list_urls=False
    global _tldex
    if _tldex is None:
        _tldex = tldextract.TLDExtract(suffix_list_urls=False)

    tx = _tldex(domain)

    domains = []
    if tx.subdomain:
        domains.extend(tx.subdomain.split('.'))

    # tx.registered_domain returns only if domain AND suffix are not none
    # There are cases where we have domain and not suffix; ie short hostnames
    registered_domain = [tx.domain]
    if tx.suffix:
        registered_domain.append(tx.suffix)

    if split_tld:
        domains.extend(registered_domain)
    else:
        domains.append('.'.join(registered_domain))

    # Musical chairs. Change places!
    domains.reverse()

    def join_dom(a, b):
        return '.'.join([b, a])

    # Take each part and add it to the previous part, returning all results
    domains = list(accumulate(domains, func=join_dom))

    # Change places!
    domains.reverse()
    return domains
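# Usage sketch (not from the original source): with split_tld=True the walk also
# descends into the public suffix, so the bare TLD appears as the final element.
# Assuming the same suffix data as the doctest above, this would give:
#
#   >>> split_domain_into_subdomains('this.is.a.test.skywww.net', split_tld=True)
#   ['this.is.a.test.skywww.net', 'is.a.test.skywww.net', 'a.test.skywww.net',
#    'test.skywww.net', 'skywww.net', 'net']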
def __init__(self, cookiefile):
    with open(cookiefile, 'rb') as db, \
            tempfile.NamedTemporaryFile(delete=False, prefix='fxcookie-') as tmp:
        shutil.copyfileobj(db, tmp)
        tmp.flush()
        self._file_to_delete = tmp.name
        self._db = sqlite3.connect(tmp.name)
    # self._db = sqlite3.connect(cookiefile)
    self._extract = tldextract.TLDExtract(include_psl_private_domains=True)
    try:
        self._db.execute(
            "select 1 from moz_cookies where originAttributes = '' limit 1"
        ).fetchall()
        self._origin = True
    except sqlite3.OperationalError:
        self._origin = False
def __init__(self, config=None):
    if not config:
        # If there is no config specified, we load a non-interactive configuration.
        self.config = non_interactive_config_resolver()
    elif not isinstance(config, ConfigResolver):
        # If config is not a ConfigResolver, we are in a legacy situation.
        # We protect this part of the Client API.
        self.config = legacy_config_resolver(config)
    else:
        self.config = config

    # Validate configuration
    self._validate_config()

    runtime_config = {}

    # Process domain, strip subdomain
    domain_extractor = tldextract.TLDExtract(cache_file=TLDEXTRACT_CACHE_FILE,
                                             include_psl_private_domains=True)
    domain_parts = domain_extractor(self.config.resolve('lexicon:domain'))
    runtime_config['domain'] = '{0}.{1}'.format(
        domain_parts.domain, domain_parts.suffix)

    if self.config.resolve('lexicon:delegated'):
        # handle delegated domain
        delegated = self.config.resolve('lexicon:delegated').rstrip('.')
        if delegated != runtime_config.get('domain'):
            # convert to relative name
            if delegated.endswith(runtime_config.get('domain')):
                delegated = delegated[:-len(runtime_config.get('domain'))]
                delegated = delegated.rstrip('.')
            # update domain
            runtime_config['domain'] = '{0}.{1}'.format(
                delegated, runtime_config.get('domain'))

    self.action = self.config.resolve('lexicon:action')
    self.provider_name = (self.config.resolve('lexicon:provider_name')
                          or self.config.resolve('lexicon:provider'))

    self.config.add_config_source(DictConfigSource(runtime_config), 0)

    provider_module = importlib.import_module(
        'lexicon.providers.' + self.provider_name)
    provider_class = getattr(provider_module, 'Provider')
    self.provider = provider_class(self.config)
def is_valid_redirect(next_param):
    # add local domain suffix because it is non-standard
    extract_with_extra_suffix = tldextract.TLDExtract(extra_suffixes=["great"])
    extracted_domain = extract_with_extra_suffix(next_param)

    # Allow internal redirects
    is_domain = bool(extracted_domain.domain) and bool(extracted_domain.suffix)
    # NOTE: The extra is_domain check is necessary because otherwise
    # for example ?next=//satan.com would redirect even if
    # satan.com is not an allowed redirect domain
    if next_param.startswith('/') and not is_domain:
        return True

    # Otherwise check we allow that domain/suffix
    domain = '.'.join([extracted_domain.domain, extracted_domain.suffix])
    return (domain in settings.ALLOWED_REDIRECT_DOMAINS) or (
        extracted_domain.suffix in settings.ALLOWED_REDIRECT_DOMAINS)
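# A minimal sketch (not from the original source) of what extra_suffixes buys us here:
# without it, a host on the non-standard ".great" suffix has an empty suffix, so the
# is_domain check above fails; with it, the host parses as a regular domain.
# The hostname "backoffice.great" is hypothetical.
import tldextract

_plain = tldextract.TLDExtract()
_custom = tldextract.TLDExtract(extra_suffixes=["great"])

print(_plain("https://backoffice.great").suffix)    # ''      -> not a known suffix
print(_custom("https://backoffice.great").suffix)   # 'great' -> treated as a domain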
def initial_seeds(self):
    """Initialize the scheduler."""
    while True:
        initial_len = self.server.llen('seeds')
        if initial_len:
            break
        time.sleep(180)

    self.logger.debug('Fetching the initial seed list.........')
    while True:
        tasks = self.server.lrange('seeds', 0, -1)
        self.server.ltrim('seeds', -1, 0)
        self.tasks.extend(tasks)
        if self.tasks:
            break

    self.logger.debug('Fetching the initial number of spider processes.........')
    self.spiders = self.server.keys('stats:spider:*:*')  # list of spiders
    self.spider_count = len(self.spiders)
    if self.spider_count:
        self.logger.debug('Placing spider nodes with the consistent hashing algorithm.......')
        job_ids = []
        for spider in self.spiders:
            job_ids.append(spider.split(':')[3])
        self.chose = ketama.Continuum(job_ids)

    self.logger.debug('Distributing the initial seed URL queues........')
    for task_json in self.tasks:
        task = pickle.loads(task_json)
        if 'url' in task and 'spider_type' in task:
            extract = tldextract.TLDExtract()
            url = task['url']
            spider_type = task['spider_type']
            domain = extract(url).domain
            job_id = self.chose[url.encode('utf-8')]
            queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                spider_type=spider_type, job_id=job_id, domain=domain)
            priority = task['priority']
            self.server.zadd(queue_key, pickle.dumps(task), priority)
        else:
            self.logger.error("please provide the url and spider_type that you want to crawl!")
def extract(self):
    """
    extract domain

    >>> d = Domain('www.example.com')
    <domain.Domain object>
    >>> d.extract()
    ExtractResult(subdomain='www', domain='example', suffix='com')

    :return: extracted domain results
    """
    data_storage_dir = settings.data_storage_dir
    extract_cache_file = data_storage_dir.joinpath('public_suffix_list.dat')
    ext = tldextract.TLDExtract(extract_cache_file, None)
    result = self.match()
    if result:
        return ext(result)
    return None
def makeData(black="./data/dga.txt", white="./data/top-1m.csv"):
    X = []
    Y = []
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    with open(black, 'r') as f:
        data = f.readlines()
        for i in data:
            X.append(domain2list(no_fetch_extract(i.strip()).domain))
            Y.append([0])
    with open(white, 'r') as f:
        data = f.readlines()
        for i in data:
            X.append(
                domain2list(no_fetch_extract(i.strip().split(',')[1]).domain))
            Y.append([1])
    X = np.mat(X)
    Y = np.mat(Y)
    return X, Y
def __init__(self):
    self.ad_objects = []
    self.tld_extract = tldextract.TLDExtract(suffix_list_urls=None)
    # Stores the set of unique domain.hash to reduce load on the clustering algo
    self.domain_hashes = set()
    # domain_ids... maps a domain to an integer ID
    self.domain_ids = {}
    # ad_objects indices of samples being considered for clustering
    self.input_indices = []
    self.feature_matrix_hash = []
    self.feature_matrix_domain = []
    # label --> [index in ad_objects]
    self.clusters = {}
    self.fetch_data()
    self.cluster_images()
def test_list_source():
    extractor = tldextract.TLDExtract(cache_dir=None)
    extractor._extractor = _SuffixListTLDExtractor({  # pylint: disable=locally-disabled, protected-access
        SOURCE_PUBLICSUFFIX_ICANN: set(['com', 'net']),
        SOURCE_PUBLICSUFFIX_PRIVATE: set(['blogspot.com']),
    })

    result = extractor('hi.blogspot.com', include_psl_private_domains=False)
    assert result.subdomain == 'hi'
    assert result.domain == 'blogspot'
    assert result.suffix == 'com'
    assert result.source == SOURCE_PUBLICSUFFIX_ICANN

    result = extractor('hi.blogspot.com', include_psl_private_domains=True)
    assert result.subdomain == ''
    assert result.domain == 'hi'
    assert result.suffix == 'blogspot.com'
    assert result.source == SOURCE_PUBLICSUFFIX_PRIVATE
def parse_url(url, isupdate=False):
    """
    :param url: the URL to parse
    :return: [scheme, subdomain, suffix, registered domain, netloc, path, url]
    """
    o = urlparse(url)
    scheme = o.scheme
    netloc = o.netloc
    path = o.path
    extract = tldextract.TLDExtract()
    if isupdate:
        extract.update()
    ext = extract(url)
    return [
        scheme, ext.subdomain, ext.suffix, ext.domain + "." + ext.suffix,
        netloc, path, url
    ]
def detect_type(indicator):
    """Infer the type of the indicator.

    Args:
        indicator(str): The indicator whose type we want to check.

    Returns:
        str. The type of the indicator.
    """
    if re.match(sha256Regex, indicator) or re.match(
            md5Regex, indicator) or re.match(sha1Regex, indicator):
        return FeedIndicatorType.File

    if re.match(ipv4cidrRegex, indicator):
        return FeedIndicatorType.CIDR

    if re.match(ipv6cidrRegex, indicator):
        return FeedIndicatorType.IPv6CIDR

    if re.match(ipv4Regex, indicator):
        return FeedIndicatorType.IP

    if re.match(ipv6Regex, indicator):
        return FeedIndicatorType.IPv6

    if re.match(urlRegex, indicator):
        return FeedIndicatorType.URL

    if re.match(emailRegex, indicator):
        return FeedIndicatorType.Email

    try:
        # we use the TLDExtract class to fetch all existing domain suffixes from the file mentioned below:
        # https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat
        # suffix_list_urls=None keeps the extraction from making HTTP calls - avoiding SSL errors
        if tldextract.TLDExtract(
                cache_file='https://raw.githubusercontent.com/publicsuffix'
                           '/list/master/public_suffix_list.dat',
                suffix_list_urls=None).__call__(indicator).suffix:
            if '*' in indicator:
                return FeedIndicatorType.DomainGlob
            return FeedIndicatorType.Domain
    except Exception:
        pass

    return None
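# A small self-contained sketch (not from the original source) of the offline domain
# check used above: with suffix_list_urls=None the extractor never makes an HTTP
# request and falls back to its bundled snapshot of the public suffix list.
import tldextract

_offline = tldextract.TLDExtract(suffix_list_urls=None)

print(bool(_offline("example.com").suffix))       # True  -> looks like a domain
print(bool(_offline("not-an-indicator").suffix))  # False -> no known suffix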
def __init__(self, log_id, start_url=None, agent_name="edge_win"):
    self.start_time = time.time()
    self.agent_name = agent_name
    self.log_id = log_id
    self.logger = logging.getLogger(log_id)

    self.screenshots_dir_path = os.path.join(config.MAIN_LOG_PATH,
                                             config.SCREENSHOTS_DIR, self.log_id)
    os.mkdir(self.screenshots_dir_path)
    self.html_logs_dir_path = os.path.join(config.MAIN_LOG_PATH,
                                           config.HTML_LOGS_DIR, self.log_id)
    os.mkdir(self.html_logs_dir_path)

    # Every time the browser is restarted the browser_counter increases;
    # this counter is used to name the JSGraph log file uniquely.
    self.browser_counter = 1

    tab = self.open_browser(start_url)
    self.start_url = start_url
    self._save_html(tab)  # Useful when milking
    sshot_path = self._take_screenshot(tab)
    self.logger.info("The screenshot of loaded home page: %s", sshot_path)

    # Sometimes, websites might redirect to a different site. We would like to use the
    # name of this redirected site instead of the original one
    self.url = self.devtools_client.get_tab_url(tab)
    self.tabs_opened = 0
    self.overall_tabs_opened = 0
    self.logger.info("Home URL: %s", self.url)

    # current state.
    self.state = []
    self.restart = False

    # VERY STRANGELY, some ad networks fail to display ads when there is a live TLD
    # lookup from Python code! I have no idea why this request is interfering with
    # that at all. However, we are disabling the live lookup of suffixes. Only the
    # stored list is used.
    self.tld_extract = tldextract.TLDExtract(suffix_list_urls=None)
    self.home_domain = self.tld_extract(self.url).registered_domain
def extract(self):
    """
    Domain extraction

    >>> d = Domain('www.example.com')
    <domain.Domain object>
    >>> d.extract()
    ExtractResult(subdomain='www', domain='example', suffix='com')

    :return: extraction result
    """
    extract_cache_file = config.data_storage_dir.joinpath(
        'public_suffix_list.dat')
    tldext = tldextract.TLDExtract(extract_cache_file)
    result = self.match()
    if result:
        return tldext(result)
    else:
        return None
def handle_noargs(self, **options):
    self.setup_logging(verbosity=options.get('verbosity', 1))
    filename = getattr(
        settings, 'MULTISITE_PUBLIC_SUFFIX_LIST_CACHE',
        os.path.join(tempfile.gettempdir(), 'multisite_tld.dat'))
    self.log("Updating {filename}".format(filename=filename))
    with tempfile.NamedTemporaryFile(dir=os.path.dirname(filename)) as f:
        tmpname = f.name
        extract = tldextract.TLDExtract(fetch=True, cache_file=tmpname)
        extract._get_tld_extractor()
        self.log("Downloaded new data to {filename}".format(filename=tmpname))
        os.rename(tmpname, filename)
        f.delete = False  # No need to delete f any more.
    self.log("Done.")
def read_domains(file_name):
    """
    Read the domain storage file to get the domains to probe, and extract
    each registered domain.
    Note: domains that do not conform to the naming rules are discarded.
    """
    domains = []
    main_domains = []
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    file_path = './unverified_domain_data/'
    with open(file_path + file_name, 'r') as fp:
        for d in fp.readlines():
            domain_tld = no_fetch_extract(d.strip())
            # extract the top-level domain and the registered domain part
            tld, reg_domain = domain_tld.suffix, domain_tld.domain
            if tld and reg_domain:
                main_domains.append(reg_domain + '.' + tld)
                domains.append(d.strip())
            else:
                logger.logger.warning('Domain %s is malformed; skipping probe' % d.strip())
    return domains, main_domains
def __init__(self, server, persist, update_int, timeout, retries, logger, hits,
             window, mod, add_type, backlog_blacklist, queue_timeout):
    self.redis_conn = server
    self.persist = persist
    self.queue_dict = {}
    self.update_interval = update_int
    self.hits = hits
    self.window = window
    self.moderated = mod
    self.rfp_timeout = timeout
    self.add_type = add_type
    self.item_retires = retries
    self.logger = logger
    self.backlog_blacklist = backlog_blacklist
    self.queue_timeout = queue_timeout
    self.extract = tldextract.TLDExtract()
    self.job_id = None  # identifies the crawler process
    self.task = None
    self.paused = False  # indicates whether the crawler is paused
def extract(url, *, include_psl_private_domains=False):
    cache_dir = save_cache_path('tldextract')
    last_updated = os.path.join(cache_dir, 'last_updated')
    extractor = tldextract.TLDExtract(
        cache_dir=cache_dir,
        include_psl_private_domains=include_psl_private_domains,
    )

    update = False
    try:
        t = os.path.getmtime(last_updated)
        if time.time() - t > 86400 * 7:
            update = True
    except FileNotFoundError:
        update = True

    if update:
        extractor.update()
        with open(last_updated, 'w'):
            pass

    return extractor(url)
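# A self-contained sketch (not from the original source) of what callers of the
# wrapper above get back: a tldextract ExtractResult named tuple with
# subdomain/domain/suffix fields.
import tldextract

_res = tldextract.TLDExtract()('https://forums.news.cnn.com/')
print(_res.subdomain)   # 'forums.news'
print(_res.domain)      # 'cnn'
print(_res.suffix)      # 'com'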
def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(
        host=settings['REDIS_HOST'],
        port=settings['REDIS_PORT'],
        db=settings.get('REDIS_DB'),
        password=settings['REDIS_PASSWORD'],
        decode_responses=True,
        socket_timeout=settings.get('REDIS_SOCKET_TIMEOUT'),
        socket_connect_timeout=settings.get('REDIS_SOCKET_TIMEOUT'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ScraperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ScraperHandler")
        # plugin is essential to functionality
        sys.exit(1)
def split_domain_into_subdomains(domain, split_tld=False):
    if not hasattr(split_domain_into_subdomains, '_tldex'):
        import tldextract

        # Do not request the latest TLD list on init == suffix_list_url=False
        split_domain_into_subdomains._tldex = tldextract.TLDExtract(
            suffix_list_url=False)
    _tldex = split_domain_into_subdomains._tldex

    # Requires unicode
    domain = ensure_decoded_text(domain)

    tx = _tldex(domain)

    domains = []
    if tx.subdomain:
        domains.extend(tx.subdomain.split('.'))

    # tx.registered_domain returns only if domain AND suffix are not none
    # There are cases where we have domain and not suffix; ie short hostnames
    registered_domain = [tx.domain]
    if tx.suffix:
        registered_domain.append(tx.suffix)

    if split_tld:
        domains.extend(registered_domain)
    else:
        domains.append('.'.join(registered_domain))

    # Musical chairs. Change places!
    domains.reverse()

    def join_dom(a, b):
        return '.'.join([b, a])

    # Take each part and add it to the previous part, returning all results
    domains = list(accumulate(domains, func=join_dom))

    # Change places!
    domains.reverse()
    return domains
def auto(page_url, **kwargs):
    default_options = {"index": 0, "album": False, "username": False}
    for k, v in default_options.items():
        if k not in kwargs:
            kwargs[k] = v
    domains = {
        "artstation": artstation,
        "pixiv": pixiv,
        "hentai-foundry": hentai_foundry,
        "deviantart": deviantart,
        "furaffinity": furaffinity,
    }
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    domain = no_fetch_extract(page_url).domain
    if domain in domains:
        return domains[domain](page_url, kwargs)
    raise exc.UnsupportedSite(page_url)
def test_cache_permission(mocker, monkeypatch, tmpdir):
    """Emit a warning once that this can't cache the latest PSL."""

    warning = mocker.patch.object(logging.getLogger("tldextract.cache"), "warning")

    def no_permission_makedirs(*args, **kwargs):
        raise PermissionError(
            """[Errno 13] Permission denied:
            '/usr/local/lib/python3.7/site-packages/tldextract/.suffix_cache"""
        )

    monkeypatch.setattr(os, "makedirs", no_permission_makedirs)

    for _ in range(0, 2):
        my_extract = tldextract.TLDExtract(cache_dir=tmpdir)
        assert_extract(
            "http://www.google.com",
            ("www.google.com", "www", "google", "com"),
            funs=(my_extract,),
        )

    assert warning.call_count == 1
    assert warning.call_args[0][0].startswith("unable to cache")
def __init__(self):
    self.ad_objects = []
    self.tld_extract = tldextract.TLDExtract(suffix_list_urls=None)
    # Stores the set of unique domain.hash to reduce load on the clustering algo
    self.domain_hashes = set()
    # domain_ids... maps a domain to an integer ID
    self.domain_ids = {}
    # ad_objects indices of samples being considered for clustering
    self.input_indices = []
    self.feature_matrix_hash = []
    self.feature_matrix_domain = []
    # label --> [index in ad_objects]
    self.clusters = {}
    # self.fetch_data()
    self.fetch_clustered_data()
    print(len(self.clusters))
    self.filter_clusters()
    print("Number of clusters after filtering:", len(self.clusters))
    print("# of SE clusters:", sum([len(x) for x in se_categories.values()]))
    se_clusters_1 = set([int(x) for x in self.clusters.keys()])
    se_clusters_2 = set()
    for cluster_list in se_categories.values():
        se_clusters_2 = se_clusters_2.union(set(cluster_list))
    print(se_clusters_1)
    print("***" * 10)
    print(se_clusters_2)
    print("***" * 10)
    print("Missing clusters:", se_clusters_1.difference(se_clusters_2))
    print("Different clusters:")
    self.fetch_mal_ad_hashes()
    self.fetch_ad_network_counts()
def main():
    tld_extractor = tldextract.TLDExtract(suffix_list_urls=None)
    ad_objs = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
    ad_domains = set()
    image_hashes = set()
    milking_url_domains = set()
    upstream_domains = set()
    milking_urls = []

    for ad in ad_objs:
        ad_domain, milking_url, milking_domain, curr_upstream_domains = get_milking_url(
            ad, tld_extractor)
        if milking_domain and milking_domain not in milking_url_domains:
            # print "Milking url: ", milking_url
            # import ipdb; ipdb.set_trace()
            milking_url_domains.add(milking_domain)
            milking_urls.append(milking_url)
        image_hashes.add(ad.screenshot_hash)
        ad_domains.add(ad_domain)
        upstream_domains = upstream_domains.union(curr_upstream_domains)
        # home_domain = self.tld_extract(ad).registered_domain

    print(len(ad_objs))
    print("# Ad domains: ", len(ad_domains))
    print("# Image hashes: ", len(image_hashes))
    print("# Milking URLS: ", len(milking_urls))
    print("# Upstream domains: ", len(upstream_domains))
    print("# All domains:", len(
        ad_domains.union(milking_url_domains).union(upstream_domains)))

    dump_object = {
        "ad_domains": list(ad_domains),
        "image_hashes": list(image_hashes),
        "milking_urls": milking_urls,
        "upstream_domains": list(upstream_domains)
    }
    with open(MILKING_URLS_PATH, "w") as f:
        dump_str = json.dumps(dump_object)
        f.write(dump_str)
def __init__(self, server, persist, update_int, timeout, retries, logger, hits,
             window, mod, ip_refresh, add_type, add_ip, ip_regex,
             backlog_blacklist, queue_timeout, chose):
    self.redis_conn = server
    self.persist = persist
    self.queue_dict = {}
    self.update_interval = update_int
    self.hits = hits
    self.window = window
    self.moderated = mod
    self.rfp_timeout = timeout
    self.ip_update_interval = ip_refresh
    self.add_type = add_type
    self.add_ip = add_ip
    self.item_retires = retries
    self.logger = logger
    self.ip_regex = re.compile(ip_regex)
    self.backlog_blacklist = backlog_blacklist
    self.queue_timeout = queue_timeout
    self.chose = chose
    self.extract = tldextract.TLDExtract()
    self.job_id = None  # identifies the crawler process
    self.paused = False  # indicates whether the crawler is paused
def link_is_blocked(blocked_links, url):
    # TLDExtract extracts the top-level domain from the
    # registered domain and subdomains of a URL. For example,
    # to get just the 'google' part of 'http://www.google.com'
    # or 'http://google.com.sg'.
    #
    # By default, when the module is first run, it updates
    # its TLD list with a live HTTP request. This updated TLD
    # list is cached indefinitely in /path/to/tldextract/.tld_set
    #
    # This is to set the call to not use cache.
    no_cache_extract = tldextract.TLDExtract(cache_file=False)

    # How to use TLDExtract
    # Usage:  no_cache_extract('http://forums.news.cnn.com/')
    # Result: ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
    extracted = no_cache_extract(url)
    url_domain = extracted.domain + "." + extracted.suffix

    for site in blocked_links:
        # if url matches with blocked domain
        if re.match(site, url_domain):
            return True