def here(modal, string):
    """Flag "click here"-style anchors whose link leaves the modal domain.

    Every ``<a ... href=...>...</a>`` element in *string* that contains one of
    the trigger words is inspected, and the host of the first http(s) URL
    inside it is compared with *modal*.

    Args:
        modal: The reference (most common) domain of the message.
        string: HTML text to scan.

    Returns:
        A ``(flag, host)`` tuple of strings. ``flag`` is ``'1'`` when at least
        one flagged anchor points to a host that is not contained in *modal*,
        otherwise ``'0'``. ``host`` is the last such foreign host; failing
        that, the host of the last flagged anchor; failing that, *modal*.
    """
    ban_words = ('here', 'click', 'Here', 'Click', 'CLICK', 'HERE')
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    here_num = 0
    stand_host = modal
    foreign_host = ''
    for anchor in re.findall(r'<[Aa].*?href=.*?</[Aa]>', string, re.S):
        # Decide per anchor whether it holds a click/here trigger word, so
        # that one flagged anchor does not mark every later anchor as well.
        if not any(word in anchor for word in ban_words):
            continue
        http_url = re.findall(url_pattern, anchor)
        if not http_url:
            continue
        proto, rest = splittype(http_url[0])
        host, rest = splithost(rest)
        host, port = splitport(host)
        stand_host = host
        if host not in modal:
            # Different domain -> 1 (a matching domain leaves the flag at 0).
            foreign_host = host
            here_num = 1
    if foreign_host:
        stand_host = foreign_host
    return str(here_num), str(stand_host)
def find_modal(list):
    """Return the host that occurs most often among the given URLs.

    Each item is converted to ``str`` and searched for http(s) URLs; only the
    first URL found in an item contributes its host.

    Args:
        list: Iterable of values that may contain URLs.

    Returns:
        The most frequent host, or ``'-'`` when no item contains a URL.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    hosts = []
    for candidate in list:
        found = url_pattern.findall(str(candidate))
        if not found:
            continue
        scheme, remainder = splittype(found[0])
        netloc, remainder = splithost(remainder)
        hostname, _port = splitport(netloc)
        hosts.append(hostname)
    # Only the single most frequent host is requested.
    ranking = collections.Counter(hosts).most_common(1)
    return ranking[0][0] if ranking else '-'
def single_request(self, host, handler, request_body, verbose=0):
    """Send one request framed as SCGI and return the parsed response.

    Args:
        host: ``host:port`` of a TCP SCGI server, or a falsy value to use a
            UNIX domain socket instead.
        handler: Path of the UNIX socket, used only when *host* is falsy.
        request_body: Request payload, as text or bytes.
        verbose: Stored on ``self.verbose`` before the request is sent.

    Returns:
        Whatever ``self.parse_response`` returns for the socket's file object.
    """
    # SCGI counts the body in bytes, so encode before measuring it.
    if not isinstance(request_body, bytes):
        request_body = request_body.encode('utf-8')
    # Add SCGI headers to the request.
    headers = [('CONTENT_LENGTH', str(len(request_body))), ('SCGI', '1')]
    header = '\x00'.join(['%s\x00%s' % (key, value) for key, value in headers]) + '\x00'
    header = '%d:%s' % (len(header), header)
    payload = header.encode('ascii') + b',' + request_body
    sock = None
    try:
        if host:
            host, port = urlparser.splitport(host)
            addrinfo = socket.getaddrinfo(host, int(port), socket.AF_INET,
                                          socket.SOCK_STREAM)
            sock = socket.socket(*addrinfo[0][:3])
            sock.connect(addrinfo[0][4])
        else:
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(handler)
        self.verbose = verbose
        # sendall() keeps writing until the whole request is out; a single
        # send() may transmit only part of it.
        sock.sendall(payload)
        return self.parse_response(sock.makefile())
    finally:
        if sock:
            sock.close()
def work(self, task):
    """Open a non-blocking connection for *task* and register it for writing.

    ``task[0]`` is parsed as the request URL; the remainder after the host and
    the host itself are appended to ``task``. A timer is stored in
    ``self._write_timer`` under the socket's file descriptor, and the
    descriptor is registered with KBEngine together with ``self.onSend``.

    If anything raises, the task is appended to ``self._tasks``, the error is
    reported through ``self.logsError()`` and the socket is closed.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        pro_, rest = splittype(task[0])
        host, rest = splithost(rest)
        host, port = splitport(host)
        # NOTE(review): these appends also happen before a failure below, so a
        # task that is re-queued in the except branch already carries them.
        task.append(rest)
        task.append(host)
        sock.setblocking(0)
        # connect_ex() returns an error code instead of raising; port 80 is
        # used when the URL has no explicit port.
        sock.connect_ex((host, int(port) if port else 80))

        def timeout_cb():
            # Runs when the request timer fires: tear down a socket that is
            # still open and call task[2] with None.
            # NOTE(review): the collapsed source does not show whether the
            # task[2] call sits inside this check - confirm against history.
            if not sock._closed:
                KBEngine.deregisterWriteFileDescriptor(sock.fileno())
                sock.close()
                if task and task[2]:
                    task[2](None)

        self._write_timer[sock.fileno()] = self.add_timer(
            REQUEST_TIMEOUT, timeout_cb)
        KBEngine.registerWriteFileDescriptor(
            sock.fileno(), Functor(self.onSend, task, sock))
    except:
        self._tasks.append(task)
        self.logsError()
        if not sock._closed:
            sock.close()
def revise_urls(roots, strict=True):
    """Repair the given root URLs and return those that remain usable.

    A root that does not start with ``http`` is rewritten to
    ``http://`` + the text from its first ``www`` onwards, lower-cased; roots
    without ``www`` are dropped. Roots with no host are dropped as well.

    :param roots: a collection that contains urls
    :param strict: when true, also drop roots whose host (ignoring a leading
        ``www.``) matches ``_DIGIT_HOST_REGEX``
    :return: list of the accepted, possibly rewritten, urls
    """
    result = []
    for root in roots:
        if not root.startswith('http'):
            i = root.find('www')
            if i == -1:
                continue
            root = ('http://' + root[i:]).lower()
        parts = parse_url(root)
        host, port = splitport(parts.netloc)
        if not host:
            continue
        if strict:
            # Ignore the "www." prefix, but only when it is really there;
            # slicing unconditionally cut four characters off other hosts.
            if host.lower().startswith('www.'):
                host = host[4:]
            if re.match(_DIGIT_HOST_REGEX, host):
                continue
        result.append(root)
    return result
def single_request(self, host, handler, request_body, verbose=0):
    """Send one request framed as SCGI and return the parsed response.

    Args:
        host: ``host:port`` of a TCP SCGI server, or a falsy value to use a
            UNIX domain socket instead.
        handler: Path of the UNIX socket, used only when *host* is falsy.
        request_body: Request payload, as text or bytes.
        verbose: Stored on ``self.verbose`` before the request is sent.

    Returns:
        Whatever ``self.parse_response`` returns for the socket's file object.
    """
    # SCGI counts the body in bytes, so encode before measuring it.
    if not isinstance(request_body, bytes):
        request_body = request_body.encode()
    # Add SCGI headers to the request; CONTENT_LENGTH stays first.
    headers = (('CONTENT_LENGTH', str(len(request_body))), ('SCGI', '1'))
    header = '\x00'.join('%s\x00%s' % item for item in headers) + '\x00'
    header = '%d:%s' % (len(header), header)
    payload = header.encode('ascii') + b',' + request_body
    sock = None
    try:
        if host:
            host, port = splitport(host)
            addrinfo = socket.getaddrinfo(host, int(port), socket.AF_INET,
                                          socket.SOCK_STREAM)
            sock = socket.socket(*addrinfo[0][:3])
            sock.connect(addrinfo[0][4])
        else:
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(handler)
        self.verbose = verbose
        # sendall() keeps writing until the whole request is out; a single
        # send() may transmit only part of it.
        sock.sendall(payload)
        return self.parse_response(sock.makefile())
    finally:
        if sock:
            sock.close()
def proxy_bypass(self, host: str, spider: Spider = None,
                 proxies: Dict[str, Union[str, List[Tuple[bytes, str]]]] = None):
    """Tell whether requests to *host* should skip the proxy.

    The ``'no'`` entry of the proxy mapping is consulted. It is either the
    string ``'*'`` (always bypass) or an iterable of objects with a
    ``match`` method, tried against the host both without and with its port.

    Args:
        host: Host name, optionally followed by ``:port``.
        spider: Not used by this method.
        proxies: Proxy mapping; defaults to ``self.storage.proxies``.

    Returns:
        True when the proxy should be bypassed, False otherwise.
    """
    if proxies is None:
        proxies = self.storage.proxies

    # Without a 'no' entry there is nothing to bypass.
    try:
        bypass_rules: List = proxies['no']
    except KeyError:
        return False

    # '*' is the wildcard meaning "bypass for every host".
    if isinstance(bypass_rules, str) and bypass_rules == '*':
        return True

    # Try the bare host first, then the original value including the port.
    host_only, port = splitport(host)
    candidates = (host_only, host)
    return any(
        rule.match(candidate)
        for rule in bypass_rules
        for candidate in candidates
    )
def open_local_file(self, url):
    """Open a local file and return it wrapped with synthetic headers.

    Args:
        url: The part of a ``file:`` URL after the scheme, i.e. an optional
            ``//host`` followed by the path.

    Returns:
        An ``addinfourl`` around the file opened in binary mode, carrying
        Content-Type, Content-Length and Last-modified headers.

    Raises:
        IOError: if the file cannot be stat'ed, or if the URL names a host
            that has a port or does not resolve to this machine.
    """
    # mimetools and email.Utils are gone in Python 3; the email package
    # provides the equivalent header object and date formatter.
    import email
    import email.utils
    import mimetypes

    host, file = splithost(url)
    localname = url2pathname(file)
    try:
        stats = os.stat(localname)
    except OSError as e:
        raise IOError(e.errno, e.strerror, e.filename)
    size = stats.st_size
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = email.message_from_string(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n'
        % (mtype or 'text/plain', size, modified))

    urlfile = file
    if file[:1] == '/':
        urlfile = 'file://' + file

    if not host:
        return addinfourl(open(localname, 'rb'), headers, urlfile)

    host, port = splitport(host)
    if not port:
        # thishost() may give one address or a tuple of addresses depending
        # on the Python version, so accept both shapes.
        local_addresses = [localhost()]
        own = thishost()
        if isinstance(own, str):
            local_addresses.append(own)
        else:
            local_addresses.extend(own)
        if socket.gethostbyname(host) in local_addresses:
            return addinfourl(open(localname, 'rb'), headers, urlfile)
    raise IOError('local file error', 'not on local host')
def single_request(self, host, handler, request_body, verbose=0):
    """Send one request framed as SCGI and return the parsed response.

    Args:
        host: ``host:port`` of a TCP SCGI server, or a falsy value to use a
            UNIX domain socket instead.
        handler: Path of the UNIX socket, used only when *host* is falsy.
        request_body: Request payload, as text or bytes.
        verbose: Stored on ``self.verbose`` before the request is sent.

    Returns:
        Whatever ``self.parse_response`` returns for the socket's file object.
    """
    # SCGI counts the body in bytes, so encode before measuring it.
    if not isinstance(request_body, bytes):
        request_body = request_body.encode('utf-8')
    # Add SCGI headers to the request.
    headers = [('CONTENT_LENGTH', str(len(request_body))), ('SCGI', '1')]
    header = '\x00'.join(
        ['%s\x00%s' % (key, value) for key, value in headers]) + '\x00'
    header = '%d:%s' % (len(header), header)
    payload = header.encode('ascii') + b',' + request_body
    sock = None
    try:
        if host:
            host, port = urlparser.splitport(host)
            addrinfo = socket.getaddrinfo(host, int(port), socket.AF_INET,
                                          socket.SOCK_STREAM)
            sock = socket.socket(*addrinfo[0][:3])
            sock.connect(addrinfo[0][4])
        else:
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(handler)
        self.verbose = verbose
        # sendall() keeps writing until the whole request is out; a single
        # send() may transmit only part of it.
        sock.sendall(payload)
        return self.parse_response(sock.makefile())
    finally:
        if sock:
            sock.close()
def send_request(url, method, data=None, json=None):
    """Send a fire-and-forget HTTP/1.0 request; the response is not read.

    Args:
        url: Target URL. Port 80 is used when the URL has no explicit port.
        method: HTTP method name, e.g. ``"POST"``.
        data: Request body as text or bytes; ignored when *json* is given.
        json: Object serialised with ``json.dumps`` and sent as the body.

    Returns:
        None.
    """
    #socket.setdefaulttimeout(2.0)
    uri = urlparse(url)
    host, port = splitport(uri.netloc)
    port = port or 80
    # The request target must not be empty and has to keep the query string.
    target = uri.path or '/'
    if uri.query:
        target += '?' + uri.query

    if json is not None:
        import json as json_module
        data = json_module.dumps(json)
    if data is None:
        # Neither data nor json was supplied: send an empty body.
        data = ''
    body = data if isinstance(data, bytes) else data.encode()

    head = (
        "%s %s HTTP/1.0\r\n"
        "Host: %s\r\n"
        "Content-Type: application/json\r\n"
        "Content-Length: %d\r\n"
        "Connection: close\r\n\r\n"
    ) % (method, target, host, len(body))

    address = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)[0]
    sock = socket.socket(address[0], address[1], address[2])
    try:
        sock.connect(address[-1])
        # One sendall() delivers the whole request; send() may stop early.
        sock.sendall(head.encode() + body)
    finally:
        # Always release the descriptor, even when connecting fails.
        sock.close()
async def http_default(self, sock: TCPSocket) -> None:
    """Relay a plain HTTP proxy request to its destination.

    The request line is read from *sock*, the absolute URL in it is split
    into scheme, authority and path, a connection to the authority is opened
    and the request line, rewritten with a path-only target, is pushed back
    to the front of the reader buffer before both ends are bridged.

    Nothing is relayed when the client sent no data or when connecting to
    the destination times out or fails with an ``OSError``.
    """
    firstline = await sock.r.readline()
    if firstline == b"":
        return

    method, url, version = firstline.decode("ascii").strip().split(" ")
    url_pieces = url.split("://")
    scheme = url_pieces[0]
    # Everything up to the first "/" is the authority, the rest is the path.
    netloc, _, urlpath = url_pieces[1].partition("/")
    urlpath = "/" + urlpath

    dsthost, dstport = splitport(netloc)
    if dstport is None:
        dstport = {"http": 80, "https": 443}[scheme]

    logger.debug(f"Request HTTP_{capwords(method)} ('{dsthost}', {dstport})")
    try:
        remote = await self.connect_remote(dsthost, int(dstport))
    except asyncio.TimeoutError:
        logger.info(f"HTTP_{capwords(method)} ('{dsthost}', {dstport}) × (timeout)")
        return
    except OSError:
        logger.info(f"HTTP_{capwords(method)} ('{dsthost}', {dstport}) × (general)")
        return

    logger.info(f"HTTP_{capwords(method)} ('{dsthost}', {dstport}) √")
    # Put the rewritten request line back in front of the unread bytes.
    request_line = (" ".join([method, urlpath, version]) + "\r\n").encode("ascii")
    for index, data in enumerate(request_line):
        sock.r._buffer.insert(index, data)
    await self.bridge(remote, sock)
    await remote.close()
def __init__(self, username=None, password=None, serverurl=None):
    """Set up the transport and choose a connection factory for *serverurl*.

    ``http://`` URLs get a factory that builds an ``HTTPConnection`` to the
    URL's host and port (80 when omitted); ``unix://`` URLs get a factory
    that builds a ``UnixStreamHTTPConnection`` bound to the socket path.

    Raises:
        ValueError: if *serverurl* uses any other scheme.
    """
    xmlrpclib.Transport.__init__(self)
    self.username = username
    self.password = password
    self.verbose = False
    self.serverurl = serverurl

    if serverurl.startswith('http://'):
        scheme, remainder = urllib.splittype(serverurl)
        netloc, path = urllib.splithost(remainder)
        hostname, port_text = urllib.splitport(netloc)
        port_number = 80 if port_text is None else int(port_text)

        def get_connection(host=hostname, port=port_number):
            return httplib.HTTPConnection(host, port)
    elif serverurl.startswith('unix://'):
        def get_connection(serverurl=serverurl):
            # we use 'localhost' here because domain names must be
            # < 64 chars (or we'd use the serverurl filename)
            conn = UnixStreamHTTPConnection('localhost')
            conn.socketfile = serverurl[7:]
            return conn
    else:
        raise ValueError('Unknown protocol for serverurl %s' % serverurl)

    self._get_connection = get_connection
async def default(self, sock: TCPSocket) -> None:
    """Relay a plain HTTP proxy request to its destination.

    The request line is read from *sock*, its absolute URL is split into
    scheme, authority and path, and a connection to the authority is opened.
    The request line, rewritten with a path-only target, is then pushed back
    to the front of the reader buffer and both ends are bridged.

    Returns silently when the client sent no data or when connecting fails.
    """
    firstline = await sock.r.readline()
    if firstline == b"":
        return

    method, url, version = firstline.decode("ascii").strip().split(" ")
    url_pieces = url.split("://")
    scheme = url_pieces[0]
    # Everything up to the first "/" is the authority, the rest is the path.
    netloc, _, urlpath = url_pieces[1].partition("/")
    urlpath = "/" + urlpath

    host, port = splitport(netloc)
    if port is None:
        port = {"http": 80, "https": 443}[scheme]
    logger.info(f"{capwords(method)} request to ('{host}', {port})")

    try:
        remote = await connect_remote(host, int(port))
    except Exception:
        return

    # Put the rewritten request line back in front of the unread bytes.
    request_line = (" ".join([method, urlpath, version]) + "\r\n").encode("ascii")
    for index, data in enumerate(request_line):
        sock.r._buffer.insert(index, data)
    await bridge(remote, sock)
    await remote.close()
def get_server_url(self):
    """Build the server's base URL from the CGI environment.

    Functionality that medusa's http request doesn't have: the URL is derived
    from the ``Host:`` header when present, otherwise from ``SERVER_NAME``
    and ``SERVER_PORT``. The port is left out when it is the default one for
    the protocol, and a trailing slash is removed.
    """
    default_port = {'http': '80', 'https': '443'}
    environ = self.cgi_environment()

    secure = (environ.get('HTTPS') in ('on', 'ON')
              or environ.get('SERVER_PORT_SECURE') == "1")
    # XXX this will currently never be true
    protocol = 'https' if secure else 'http'

    if 'HTTP_HOST' in environ:
        hostname, port = urllib.splitport(environ['HTTP_HOST'].strip())
    else:
        hostname = environ['SERVER_NAME'].strip()
        port = environ['SERVER_PORT']

    omit_port = port is None or default_port[protocol] == port
    host = hostname if omit_port else hostname + ':' + port

    server_url = '%s://%s' % (protocol, host)
    if server_url.endswith('/'):
        server_url = server_url[:-1]
    return server_url
def get_server_url(self):
    """Return ``protocol://host[:port]`` for the current request.

    Functionality that medusa's http request doesn't have. The host comes
    from the ``Host:`` header if the CGI environment has one, and from
    ``SERVER_NAME`` / ``SERVER_PORT`` otherwise. A port equal to the
    protocol's default is dropped, as is a trailing slash.
    """
    environ = self.cgi_environment()

    # XXX this will currently never be true
    if (environ.get('HTTPS') in ('on', 'ON') or
            environ.get('SERVER_PORT_SECURE') == "1"):
        protocol, standard_port = 'https', '443'
    else:
        protocol, standard_port = 'http', '80'

    if 'HTTP_HOST' in environ:
        header_value = environ['HTTP_HOST'].strip()
        hostname, port = urllib.splitport(header_value)
    else:
        hostname = environ['SERVER_NAME'].strip()
        port = environ['SERVER_PORT']

    if port is None or port == standard_port:
        authority = hostname
    else:
        authority = hostname + ':' + port

    server_url = '%s://%s' % (protocol, authority)
    return server_url[:-1] if server_url[-1:] == '/' else server_url
def get_info_by_url(url):
    """Split *url* into protocol, host, path and port.

    Args:
        url: Absolute URL such as ``http://host:8080/path``.

    Returns:
        A ``(protocol, host, path, port)`` tuple. ``port`` is a string; when
        the URL has no explicit port it is ``'443'`` for https and ``'80'``
        for everything else.
    """
    protocol, rest = parse.splittype(url)
    host, path = parse.splithost(rest)
    host, port = parse.splitport(host)
    if port is None:
        # https must not fall back to the plain-http port.
        port = '443' if protocol == 'https' else '80'
    return protocol, host, path, port
def handles_request(self, request: MITMRequest) -> bool:
    """Report whether the request targets one of this service's hosts.

    The host is taken from the ``Host`` header with any port removed. If the
    header is missing, or its lookup raises ``TypeError``, the hostname of
    ``request.url`` is used instead.
    """
    try:
        target, _port = splitport(request.headers['Host'])
    except (TypeError, KeyError):
        target = urlparse(request.url).hostname
    return target in self.SERVICE_HOSTS
def __init__(self, host=None, username=None, password=None):
    """Create the qBittorrent API client.

    The host passed in (or ``settings.TORRENT_HOST`` when none was given) is
    split into host and port; port 8080 is used when no port is present.
    Username and password fall back to the matching ``settings`` values.
    """
    super().__init__('qBittorrent', host, username, password)
    endpoint = self.host or settings.TORRENT_HOST
    self.host, self.port = splitport(endpoint)
    client_options = dict(
        host=self.host,
        port=self.port or 8080,
        username=self.username or settings.TORRENT_USERNAME,
        password=self.password or settings.TORRENT_PASSWORD,
    )
    self.api = qbittorrentapi.Client(**client_options)
def connect(self):
    """Open the connection described by ``self.usbmux_socket_url``.

    The unquoted network location of the URL is read as ``udid[:port]``. The
    device with that udid is looked up through ``_usbmux`` and an inner
    connection to the port is created; its socket becomes ``self.sock`` and
    receives ``self.timeout``.
    """
    location = unquote(urlparse(self.usbmux_socket_url).netloc)
    udid, port_text = splitport(location)
    # WDA Default port
    port_number = int(port_text) if port_text else 8100
    device = _usbmux.device(udid)
    inner = device.create_inner_connection(port_number)
    self.sock = inner._sock
    self.sock.settimeout(self.timeout)
def domain_name(list):
    """Count the distinct hosts among the given URLs.

    Args:
        list: Iterable of absolute URL strings.

    Returns:
        Number of different hosts, with any port ignored.
    """
    # A set gives constant-time membership instead of rescanning a list for
    # every URL.
    unique_hosts = set()
    for url in list:
        proto, rest = splittype(url)
        host, rest = splithost(rest)
        host, port = splitport(host)
        unique_hosts.add(host)
    return len(unique_hosts)
def url_allowed(self, url):
    """Decide whether *url* may be crawled.

    A URL is rejected when it matches ``self.exclude``, when its scheme is
    not http or https, or when ``self.host_okay`` refuses its host.
    """
    excluded = self.exclude and re.search(self.exclude, url)
    if excluded:
        return False

    parts = urlparse(url)
    if parts.scheme not in ('http', 'https'):
        LOGGER.debug('skipping non-http scheme in %r', url)
        return False

    hostname, _port = splitport(parts.netloc)
    if self.host_okay(hostname):
        return True
    LOGGER.debug('skipping non-root host in %r', url)
    return False
def load(self, url):
    """Parse *url* and store its components on the instance.

    Sets ``self.url``, ``self.protocol``, ``self.host``, ``self.path`` and
    ``self.port``. The port is always an ``int`` (or ``None`` when it is
    absent and the protocol is neither http nor https): an explicit port is
    converted, otherwise 80 is used for http and 443 for https.
    """
    self.url = url
    self.protocol, s1 = urllib_parse.splittype(self.url)
    s2, self.path = urllib_parse.splithost(s1)
    self.host, port = urllib_parse.splitport(s2)
    # Convert explicit ports so they have the same type as the defaults.
    self.port = int(port) if port is not None else None

    if not self.port:
        if self.protocol == 'http':
            self.port = 80
        elif self.protocol == 'https':
            self.port = 443
def _normalize_check_url(self, check_url): """ Normalizes check_url by: * Adding the `http` scheme if missing * Adding or replacing port with `self.port` """ # TODO: Write tests for this method split_url = urlsplit(check_url) host = splitport(split_url.path or split_url.netloc)[0] return '{0}://{1}:{2}'.format(self.scheme, host, self.port)
def load(self, url):
    """Parse *url* and store its components on the instance.

    Sets ``self.url``, ``self.protocol``, ``self.host``, ``self.path`` and
    ``self.port``. An explicit port is converted to ``int``; a missing (or
    zero) port becomes 80 for http and 443 for https, and stays unchanged
    for other protocols.
    """
    self.url = url
    self.protocol, remainder = splittype(self.url)
    netloc, self.path = splithost(remainder)
    self.host, port_text = splitport(netloc)
    self.port = None if port_text is None else int(port_text)
    if self.port:
        return

    fallback = {'http': 80, 'https': 443}.get(self.protocol)
    if fallback is not None:
        self.port = fallback
def __init__(self, host: str = None, username: str = None, password: str = None):
    """Create the qBittorrent API client.

    The host passed in (or ``settings.TORRENT_HOST`` when none was given) is
    split into host and port; a missing port is passed on as ``None``.
    Username and password fall back to the matching ``settings`` values, and
    certificate verification follows ``settings.TORRENT_VERIFY_CERT``.
    """
    super().__init__("qBittorrent", host, username, password)
    endpoint = self.host or settings.TORRENT_HOST
    self.host, self.port = splitport(endpoint)
    client_options = dict(
        host=self.host,
        port=self.port or None,
        username=self.username or settings.TORRENT_USERNAME,
        password=self.password or settings.TORRENT_PASSWORD,
        VERIFY_WEBUI_CERTIFICATE=settings.TORRENT_VERIFY_CERT,
    )
    self.api = qbittorrentapi.Client(**client_options)
def __setattr__(self, key, value): object.__setattr__(self, key, value) if key == 'url': self.protocol, s1 = urllib_parse.splittype(self.url) if s1: s2, self.path = urllib_parse.splithost(s1) if s2: self.host, self.port = urllib_parse.splitport(s2) if not getattr(self, 'port', None): if self.protocol == 'http': self.port = 80 elif self.protocol == 'https': self.port = 443
def all_port(list):
    """Return 1 if any of the URLs carries an explicit port, else 0.

    Args:
        list: Iterable of absolute URL strings; every one of them is parsed.
    """
    def explicit_port(url):
        # Port text of the URL, or None when no numeric port is given.
        scheme, remainder = splittype(url)
        netloc, remainder = splithost(remainder)
        return splitport(netloc)[1]

    with_port = sum(1 for url in list if explicit_port(url) is not None)
    return int(with_port > 0)
def parse_url(self,url):
    """Parse *url* and record scheme, host, port and path on the instance.

    ``self.valid_url`` is False when the scheme is neither http nor https.
    ``self.port`` is the scheme's default (80 or 443) unless the URL names a
    numeric port, which takes precedence.
    """
    self.valid_url = True
    parsed = urlparse(url)
    self.scheme = parsed.scheme

    well_known_ports = {"https": 443, "http": 80}
    scheme_key = self.scheme.lower()
    if scheme_key in well_known_ports:
        self.port = well_known_ports[scheme_key]
    else:
        self.valid_url = False

    self.hostname, custom_port = splitport(parsed.netloc)
    if str(custom_port).isdigit():
        self.port = int(custom_port)
    self.path = parsed.path
def parse_url(self, url):
    """Break *url* into scheme, hostname, port and path attributes.

    Only http and https are accepted; any other scheme clears
    ``self.valid_url`` and assigns no default port. A numeric port in the
    URL overrides the scheme's default of 80 or 443.
    """
    pieces = urlparse(url)
    self.valid_url = True
    self.scheme = pieces.scheme

    lowered = self.scheme.lower()
    if lowered == "https":
        self.port = 443
    elif lowered == "http":
        self.port = 80
    else:
        self.valid_url = False

    hostname, custom_port = splitport(pieces.netloc)
    self.hostname = hostname
    if str(custom_port).isdigit():
        self.port = int(custom_port)
    self.path = pieces.path
def __setattr__(self, key, value): object.__setattr__(self, key, value) if key == 'url': self.protocol, s1 = splittype(self.url) if s1: s2, self.path = splithost(s1) if s2: self.host, port = splitport(s2) self.port = int(port) if port is not None else None if not getattr(self, 'port', None): if self.protocol == 'http': self.port = 80 elif self.protocol == 'https': self.port = 443
def split_host_port(cls, server):
    """
    Return (host, port) from server.

    Port defaults to 11211. Anything after a colon still left in the host
    (for example a non-numeric port) is discarded.

    >>> split_host_port('127.0.0.1:11211')
    ('127.0.0.1', 11211)
    >>> split_host_port('127.0.0.1')
    ('127.0.0.1', 11211)
    """
    host, port_text = splitport(server)
    port = int(11211 if port_text is None else port_text)
    # NOTE(review): this also truncates bracketed IPv6 literals such as
    # "[::1]" at their first colon - confirm whether those can reach here.
    host = re.sub(':.*$', '', host)
    return host, port
def open_local_file(self, req):
    """Serve a ``file:`` request from the local filesystem.

    The request's selector is mapped to a local path. The file is opened in
    binary mode and returned wrapped in ``addinfourl`` with Content-Type,
    Content-Length and Last-modified headers.

    Raises:
        URLError: if the request names a host that has a port or does not
            resolve to one of ``self.get_names()``.
    """
    host = req.get_host()
    file = req.get_selector()
    localfile = url2pathname(file)
    # One stat call is enough; size and mtime both come from this result.
    stats = os.stat(localfile)
    size = stats[stat.ST_SIZE]
    modified = rfc822.formatdate(stats[stat.ST_MTIME])
    mtype = mimetypes.guess_type(file)[0]
    headers = mimetools.Message(StringIO(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified)))
    if host:
        host, port = splitport(host)
    # `port` is only bound when a host was given; the `not host` test
    # short-circuits before it would be read otherwise.
    if not host or \
       (not port and socket.gethostbyname(host) in self.get_names()):
        return addinfourl(open(localfile, 'rb'),
                          headers, 'file:'+file)
    raise URLError('file not on local host')
def parse_url(url, scheme="http", path='/'):
    """Split *url* into its components, filling in a scheme and a path.

    Args:
        url: URL to parse; may lack a scheme.
        scheme: Scheme prefixed to *url* when the first parse finds no
            network location.
        path: Path used when the URL has none.

    Returns:
        ``(scheme, (host, port), path, params, query, fragment)``.

    Raises:
        Exception: if no network location can be found even after the
            scheme has been added.
    """
    components = tupleify_urlparse(urlparse(url))
    if not components[1]:
        # No scheme - trying to patch it up ourselves?
        url = scheme + "://" + url
        components = tupleify_urlparse(urlparse(url))
    _scheme, netloc, _path, params, query, fragment = components

    if not netloc:
        # XXX
        raise Exception()

    address = splitport(netloc)
    return (_scheme, address, _path or path, params, query, fragment)
def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0):
    """Record the real destination of *url*, then issue the request.

    putrequest is called before connect, so the URL can be interpreted here
    to learn the host and port needed for the CONNECT request to the proxy.
    They are stored in ``self._real_host`` and ``self._real_port``.

    Raises:
        ValueError: if *url* has no scheme, or has no port and its scheme is
            not listed in ``self._ports``.
    """
    scheme, remainder = splittype(url)
    if scheme is None:
        raise ValueError("unknown URL type: %s" % url)

    netloc, remainder = splithost(remainder)
    hostname, port = splitport(netloc)
    if port is None:
        # No explicit port: fall back to the scheme's registered port.
        try:
            port = self._ports[scheme]
        except KeyError:
            raise ValueError("unknown protocol for: %s" % url)

    self._real_host = hostname
    self._real_port = int(port)
    M2Crypto.httpslib.HTTPSConnection.putrequest(
        self, method, url, skip_host, skip_accept_encoding)
def reduce_uri(self, uri, default_port=True):
    """Accept authority or URI and extract only the authority and path.

    When *default_port* is true and *uri* is an http or https URI without a
    port, the scheme's default port (80 or 443) is appended to the authority.

    Returns:
        An ``(authority, path)`` tuple; the path defaults to ``'/'``.
    """
    # note HTTP URLs do not have a userinfo component
    pieces = urllib.parse.urlsplit(uri)
    if pieces[1]:
        # URI
        scheme, authority, path = pieces[0], pieces[1], pieces[2] or '/'
    else:
        # host or host:port
        scheme, authority, path = None, uri, '/'

    host, port = splitport(authority)
    wants_default = default_port and port is None and scheme is not None
    if wants_default:
        dport = {"http": 80, "https": 443}.get(scheme)
        if dport is not None:
            authority = "%s:%d" % (host, dport)
    return authority, path
def ftp_open(self, req):
    """Fetch the resource of an ``ftp:`` request.

    The host (with optional port) comes from the request and the selector
    supplies the directory path, file name and ``;type=`` attributes.

    Returns:
        An ``addinfourl`` around the retrieved data, with Content-Type and
        Content-Length headers when they are known.

    Raises:
        IOError: if the request has no host or an FTP error occurs.
        URLError: if the host name cannot be resolved.
    """
    # mimetools is gone in Python 3; the email package builds the headers.
    import email

    host = req.get_host()
    if not host:
        raise IOError('ftp error', 'no host given')
    # Separate the port before resolving: "host:port" is not resolvable.
    host, port = splitport(host)
    if port is None:
        port = ftplib.FTP_PORT
    else:
        port = int(port)
    # XXX handle custom username & password
    try:
        host = socket.gethostbyname(host)
    except socket.error as msg:
        raise URLError(msg)

    path, attrs = splitattr(req.get_selector())
    path = unquote(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    user = passwd = ''  # XXX
    try:
        fw = self.connect_ftp(user, passwd, host, port, dirs)
        # Binary transfer for a file, directory listing otherwise, unless a
        # ";type=" attribute overrides the choice.
        transfer_type = 'I' if file else 'D'
        for attr in attrs:
            attr, value = splitattr(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
                transfer_type = value.upper()
        fp, retrlen = fw.retrfile(file, transfer_type)
        headers = ""
        mtype = mimetypes.guess_type(req.get_full_url())[0]
        if mtype:
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = email.message_from_string(headers)
        return addinfourl(fp, headers, req.get_full_url())
    except ftplib.all_errors as msg:
        raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2])
def host_no_default_port(scheme, netloc):
    """Drop the port from *netloc* when it is the default for *scheme*.

    Returns the bare host if *netloc* carries a port equal to
    ``default_port(scheme)``; otherwise *netloc* is returned unchanged.
    """
    host, port = splitport(netloc)
    is_default = bool(port) and port == default_port(scheme)
    return host if is_default else netloc
def host_and_port_default(scheme, host):
    """Split *host* into ``(host, port)``, defaulting the port by scheme.

    An explicit port is returned as found in *host*; when there is none the
    value of ``default_port(scheme)`` is used.
    """
    hostname, port = splitport(host)
    return hostname, port if port else default_port(scheme)
def host_and_port(host):
    """Split *host* into ``(host, port)`` with the port as an ``int``.

    The port is ``None`` when *host* does not include one.
    """
    hostname, port_text = splitport(host)
    if port_text:
        return hostname, int(port_text)
    return hostname, None
def smb_open(self, req):
    """Handle an ``smb:`` request by storing or retrieving a file.

    The request host may carry ``user:password@`` credentials and a port
    (139 when omitted). The first path segment of the selector is the share
    name and the remainder is the path inside the share. When the request
    has data it is stored on the share; otherwise the file is retrieved into
    a temporary file.

    Returns:
        An ``addinfourl`` around the response body with the headers that
        could be determined.

    Raises:
        urllib.error.URLError: if no host is given, the host does not answer
            the NetBIOS name query, or the SMB operation fails.
    """
    global USE_NTLM, MACHINE_NAME

    netloc = req.get_host()
    if not netloc:
        raise urllib.error.URLError('SMB error: no host given')

    netloc, port_text = splitport(netloc)
    port = 139 if port_text is None else int(port_text)

    # username/password handling
    user, host = splituser(netloc)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = unquote(host)
    user = user or ''
    passwd = passwd or ''
    myname = MACHINE_NAME or self.generateClientMachineName()

    names = NetBIOS().queryIPForName(host)
    if not names:
        raise urllib.error.URLError('SMB error: Hostname does not reply back with its machine name')
    server_name = names[0]

    path, attrs = splitattr(req.get_selector())
    if path.startswith('/'):
        path = path[1:]
    segments = [unquote(segment) for segment in path.split('/')]
    service, path = segments[0], '/'.join(segments[1:])

    try:
        conn = SMBConnection(user, passwd, myname, server_name, use_ntlm_v2 = USE_NTLM)
        conn.connect(host, port)

        headers = email.message.Message()
        if req.has_data():
            # Upload: the response body is empty.
            conn.storeFile(service, path, req.get_data())
            headers.add_header('Content-length', '0')
            fp = BytesIO(b"")
        else:
            # Download into a temporary file and rewind it for the caller.
            fp = self.createTempFile()
            file_attrs, retrlen = conn.retrieveFile(service, path, fp)
            fp.seek(0)

            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers.add_header('Content-type', mtype)
            if retrlen is not None and retrlen >= 0:
                headers.add_header('Content-length', '%d' % retrlen)

        return addinfourl(fp, headers, req.get_full_url())
    except Exception as ex:
        raise urllib.error.URLError('smb error: %s' % ex).with_traceback(sys.exc_info()[2])