def run(self):
    QgsMessageLog.logMessage('Started task "{}"'.format(self.description()),
                             MESSAGE_CATEGORY, Qgis.Info)
    if self.proxyHost is not None and self.proxyPort is not None:
        QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost),
                                 MESSAGE_CATEGORY, Qgis.Info)
        # route all subsequent urllib requests through the configured proxy
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    sparql = SPARQLWrapper(
        self.triplestoreurl,
        agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
    sparql.setQuery(self.query)
    print("now sending query")
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        self.viewlist.append(str(result[self.queryvar]["value"]))
    print(self.viewlist)
    #self.layercount.setText("["+str(len(viewlist))+"]")
    if self.getlabels and "classlabelquery" in self.triplestoreconf and self.triplestoreconf["classlabelquery"] != "":
        labels = self.getLabelsForClasses(self.viewlist,
                                          self.triplestoreconf["classlabelquery"])
        print(labels)
        self.amountoflabels = len(labels)
        i = 0
        sorted_labels = sorted(labels.items(), key=lambda x: x[1])
        for lab in sorted_labels:
            self.resultlist.append(labels[lab[0]] + " (" + lab[0] + ")")
            i = i + 1
    return True
def run(self):
    if self.proxyHost is not None and self.proxyPort is not None:
        QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    QgsMessageLog.logMessage('Started task "{}"'.format(self.description()), MESSAGE_CATEGORY, Qgis.Info)
    self.graph = Graph()
    try:
        if self.filename.startswith("http"):
            self.graph.load(self.filename)
        else:
            # guess the RDF serialization from the file extension
            filepath = self.filename.split(".")
            result = self.graph.parse(self.filename, format=filepath[-1])
    except Exception as e:
        QgsMessageLog.logMessage('Failed "{}"'.format(self.description()), MESSAGE_CATEGORY, Qgis.Info)
        self.exception = str(e)
        return False
    self.geoconcepts = []
    if self.graph is not None:
        print("WE HAVE A GRAPH")
        results = self.graph.query(self.query)
        for row in results:
            self.geoconcepts.append(str(row[0]))
    return True
def detectNamespaces(self, subpredobj):
    if subpredobj is None or subpredobj < 0:
        query = "select distinct ?ns where { ?s ?p ?o . bind( replace( str(?s), \"(#|/)[^#/]*$\", \"$1\" ) as ?ns )} limit 10"
    elif subpredobj == 0:
        query = "select distinct ?ns where { ?s ?p ?o . bind( replace( str(?p), \"(#|/)[^#/]*$\", \"$1\" ) as ?ns )} limit 10"
    else:
        query = "select distinct ?ns where { ?s ?p ?o . bind( replace( str(?o), \"(#|/)[^#/]*$\", \"$1\" ) as ?ns )} limit 10"
    if self.proxyHost is not None and self.proxyPort is not None:
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    sparql = SPARQLWrapper(
        self.triplestoreurl,
        agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    print("now sending query")
    try:
        results = sparql.query().convert()
        reslist = []
        for nss in results["results"]["bindings"]:
            if "ns" in nss:
                reslist.append(nss["ns"]["value"])
        return reslist
    except Exception:
        return []
def download(url, user_agent='Mozilla/5.0', proxy=None, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    url = url.encode('utf-8')
    url = urllib.parse.quote(url, "://?=&")  # percent-encode everything except URL delimiters
    #print(url)
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        data = opener.open(request, timeout=5).read()
        try:
            decompressed_data = gzip.decompress(data)  # zlib.decompress(data, 16+zlib.MAX_WBITS)
            html = decompressed_data.decode('utf8')
        except:
            html = data.decode('utf8')  # response was not gzip-compressed
        #print(html)
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = ""
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, user_agent, proxy, num_retries - 1)
    except socket.timeout as e:
        html = ""
        print("Download error:", e)
    except UnicodeDecodeError as e:
        print('Download error:', e)
        html = ""
    return html
def run(self):
    QgsMessageLog.logMessage('Started task "{}"'.format(self.description()), MESSAGE_CATEGORY, Qgis.Info)
    if self.proxyHost is not None and self.proxyPort is not None:
        QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
    sparql.setQuery(self.query)
    sparql.setMethod(POST)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
    except Exception:
        # some endpoints reject POST; fall back to GET
        try:
            sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
            sparql.setQuery(self.query)
            sparql.setMethod(GET)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
        except Exception as e:
            self.exception = e
            return False
    #print(results)
    # geojson stuff
    self.geojson = self.processResults(
        results,
        (self.triplestoreconf["crs"] if "crs" in self.triplestoreconf else ""),
        self.triplestoreconf["mandatoryvariables"][1:],
        self.allownongeo)
    return True
def testTripleStoreConnection(self, query="SELECT ?a ?b ?c WHERE { ?a ?b ?c .} LIMIT 1"):
    if self.proxyHost is not None and self.proxyPort is not None:
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    sparql = SPARQLWrapper(
        self.triplestoreurl,
        agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    print("now sending query")
    try:
        results = sparql.query().convert()
        if self.testURL and not self.testConfiguration:
            self.message = "URL depicts a valid SPARQL Endpoint!"
        if "ASK" in query:
            return results["boolean"]
        self.feasibleConfiguration = True
        return True
    except Exception:
        self.message = "URL does not depict a valid SPARQL Endpoint!"
        self.feasibleConfiguration = False
        return False
def download(self, url, headers, proxy, num_retries, data=None):
    print('Downloading:', url)
    request = urllib.request.Request(url, data, headers or {})
    opener = self.opener or urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except Exception as e:
        print('Download error:', str(e))
        html = b''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return self.download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return {'html': html.decode(encoding="utf-8"), 'code': code}
def download(url, headers, proxy, num_retries, data=None):
    global url_crawled_num
    print('Downloading: %s' % url)
    request = urllib.request.Request(url, data, headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
        if url.find("full_record.do?product=WOS") != -1:
            url_crawled_num += 1
            html_emt = etree.HTML(html)
            title = html_emt.xpath("//div[@class='title']/value/text()")
            print("Title %d: %s" % (url_crawled_num, str(title[0])))
    except urllib.error.URLError as e:
        print('Download error: %s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    except urllib.error.HTTPError as e:
        # unreachable in practice: HTTPError is a subclass of URLError
        print("Download error: %s" % e.reason)
    return html
def download(url, headers, proxy, num_retries, data=None):
    global download_url_total
    download_url_total += 1
    print('Downloading: %d %s' % (download_url_total, url))
    request = urllib.request.Request(url=url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        response.encoding = "utf-8"
        html = response.read()
        #print(html)
        code = response.code
    except urllib.error.URLError as e:
        print('Download error: %s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    except urllib.error.HTTPError as e:
        # unreachable in practice: HTTPError is a subclass of URLError
        print("Download error: %s" % e.reason)
    return html
def crawler():
    while not q.empty():  # loop until the queue is drained
        path = q.get()  # take the next path from queue q
        url = "%s%s" % (domain_name, path)  # build the full URL for the request
        random_proxy = random.choice(proxy_list)  # pick a random proxy server
        proxy_support = urllib.request.ProxyHandler(random_proxy)
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)
        headers = {}
        headers['User-Agent'] = Baidu_spider  # spider User-Agent header
        # www.iplaypython.com
        request = urllib.request.Request(url, headers=headers)
        try:
            response = urllib.request.urlopen(request)
            content = response.read()
            if len(content):
                # if the body is not empty, report the status code and path
                print("Status [%s] - path: %s" % (response.code, path))
            response.close()
            time.sleep(1)  # pause briefly so the crawl rate does not get the IP banned
        except urllib.error.HTTPError as e:
            # print e.code, path
            pass  # exception handling: ignore errors for now
def getRemoteFile(urlOrPath, destPath, proxy={}):
    '''
    Fetches URL to local path or just returns absolute path.
    :param urlOrPath: resource locator, generally URL or path
    :param destPath: path to store the resource, usually a path on file system
    :return: tuple having (path, 'local'/'remote')
    '''
    urlp = urlparse(urlOrPath)
    if urlp.scheme == '':
        return (os.path.abspath(urlOrPath), 'local')
    elif urlp.scheme not in ('http', 'https'):
        return (urlOrPath, 'local')
    else:
        filename = toFilename(urlOrPath)
        destPath = destPath + '/' + filename
        log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
        try:
            proxy = urllibreq.ProxyHandler(proxy)
            opener = urllibreq.build_opener(proxy)
            urllibreq.install_opener(opener)
            urlretrieve(urlOrPath, destPath)
        except IOError:
            # monkey patch fix for SSL/Windows per Tika-Python #54
            # https://github.com/chrismattmann/tika-python/issues/54
            import ssl
            if hasattr(ssl, '_create_unverified_context'):
                ssl._create_default_https_context = ssl._create_unverified_context
            # delete whatever we had there
            if os.path.exists(destPath) and os.path.isfile(destPath):
                os.remove(destPath)
            urlretrieve(urlOrPath, destPath)
        return (destPath, 'remote')
def __init__(self, column, row, triplestoreconf, prefixes, interlinkOrEnrich, table,
             propOrClass=False, bothOptions=False, currentprefixes=None, addVocab=None):
    super(QDialog, self).__init__()
    self.setupUi(self)
    self.currentcol = column
    self.currentrow = row
    self.table = table
    self.prefixes = prefixes
    self.currentprefixes = currentprefixes
    self.bothOptions = bothOptions
    self.triplestoreconf = triplestoreconf
    self.interlinkOrEnrich = interlinkOrEnrich
    self.addVocab = addVocab
    if column != 4:
        self.findConcept.setChecked(True)
    if column == 4 or (not interlinkOrEnrich and column != 4) or (not interlinkOrEnrich and propOrClass):
        self.findProperty.setChecked(True)
    if not bothOptions:
        self.findProperty.setEnabled(False)
        self.findConcept.setEnabled(False)
    self.tripleStoreEdit.setEnabled(False)
    for triplestore in self.triplestoreconf:
        if not "File" == triplestore["name"]:
            self.tripleStoreEdit.addItem(triplestore["name"])
    if addVocab is not None:
        for cov in addVocab:
            self.tripleStoreEdit.addItem(addVocab[cov]["label"])
    self.searchButton.clicked.connect(self.getClassesFromLabel)
    urlregex = QRegExp("http[s]?://(?:[a-zA-Z#]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    urlvalidator = QRegExpValidator(urlregex, self)
    self.costumproperty.setValidator(urlvalidator)
    self.costumproperty.textChanged.connect(self.check_state3)
    self.costumproperty.textChanged.emit(self.costumproperty.text())
    self.costumpropertyButton.clicked.connect(self.applyConceptToColumn2)
    self.applyButton.clicked.connect(self.applyConceptToColumn)
    # getting proxy from the QGIS options settings
    s = QSettings()
    self.proxyEnabled = s.value("proxy/proxyEnabled")
    self.proxyType = s.value("proxy/proxyType")
    self.proxyHost = s.value("proxy/proxyHost")
    self.proxyPort = s.value("proxy/proxyPort")
    self.proxyUser = s.value("proxy/proxyUser")
    self.proxyPassword = s.value("proxy/proxyPassword")
    if self.proxyHost is not None and self.proxyPort is not None:
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
def run(self):
    try:
        # an empty mapping disables any environment-configured proxies
        proxy = urllib.request.ProxyHandler({})
        opener = urllib.request.build_opener(proxy)
        response = opener.open(self.url, self.data, self.timeout)
        self.callback(response.read())
    except Exception:
        self.callback(None)
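Most of the snippets in this listing share the same three-step pattern: build a ProxyHandler, wrap it in an opener, and install that opener globally so every later urlopen() call goes through it. The following is a minimal, self-contained sketch of that pattern, not taken from any of the projects above; the proxy address is a placeholder.

import urllib.request

def install_http_proxy(proxy_url=None):
    # An empty mapping bypasses all proxies; a populated one routes 'http'
    # traffic through the given server. install_opener() makes this opener
    # the process-wide default used by urllib.request.urlopen().
    handler = urllib.request.ProxyHandler({'http': proxy_url} if proxy_url else {})
    opener = urllib.request.build_opener(handler)
    urllib.request.install_opener(opener)

install_http_proxy('http://127.0.0.1:8080')  # hypothetical local proxy
page = urllib.request.urlopen('http://example.org').read()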
def getHtml(url):
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
    # proxy IP addresses
    proxies = ['114.215.95.188:3128', '218.14.115.211:3128']
    req = urllib.request.Request(url)
    # set a random User-Agent header
    req.add_header('User-Agent', random.choice(USER_AGENTS))
    # set the proxy address
    proxy_support = urllib.request.ProxyHandler({"http": random.choice(proxies)})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    # send the request and read the server response
    times = 0
    try:
        res = urllib.request.urlopen(req)
        html = res.read()
        return html
    except Exception:
        times += 1
        return getHtml(url)
def getRemoteFileSize(url, proxy=None):
    """
    Get the size of a remote file from its Content-Length header.
    url - target file URL
    proxy - proxy to use, if any
    """
    opener = urllib.request.build_opener()
    if proxy:
        if url.lower().startswith('https://'):
            opener.add_handler(urllib.request.ProxyHandler({'https': proxy}))
        else:
            opener.add_handler(urllib.request.ProxyHandler({'http': proxy}))
    try:
        request = urllib.request.Request(url)
        request.get_method = lambda: 'HEAD'  # issue a HEAD request instead of GET
        response = opener.open(request)
        response.read()
    except Exception:
        # remote file does not exist
        return 0
    else:
        fileSize = response.headers.get('content-length', 0)
        return int(fileSize)
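A possible call of the helper above, assuming it lives in the current module; the URL and proxy address are illustrative placeholders only.

size = getRemoteFileSize('http://example.org/archive.zip',
                         proxy='http://127.0.0.1:8080')  # hypothetical proxy; omit to connect directly
print('remote size in bytes:', size)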
def setupProxy():
    proxies = {}
    if _proxyhttp is not None:
        proxies['http'] = 'http://' + _proxyhttp
        os.environ['http'] = _proxyhttp
    if _proxyhttps is not None:
        proxies['https'] = _proxyhttps
        os.environ['https'] = 'http://' + _proxyhttps
    if proxies != {}:
        proxy = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
def get_url(self, url, proxy_dict):
    proxyIP = proxy_dict['ip']
    proxyPort = proxy_dict['port']
    proxyProtocol = proxy_dict['protocol']
    proxy_handler = urllib.request.ProxyHandler({proxyProtocol: "{0}:{1}".format(proxyIP, proxyPort)})
    opener_proxy = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener_proxy)
    request = urllib.request.Request(url=url, headers=HEADERS)
    response = urllib.request.urlopen(request)
    html = response.read()
    return html
def performDownload(self, updateDirectory):
    """Download zip with new version"""
    log.debug('Downloading new version')
    self.setProgress('Preparing download...', 0, 1)
    # setup proxies
    proxies = {}
    if gdata.config.proxy.http is not None:
        proxies['http'] = gdata.config.proxy.http
    log.debug('Using proxies', proxies)
    # get file
    try:
        # open URL
        opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
        # it unfortunately is not completely reliable
        basename = None
        for i in range(1, 5):
            try:
                ifh = opener.open(self.url)
                log.debug("Retrieving URL", ifh.geturl())
                # download file
                total = int(ifh.info()["content-length"])
                basename = re.search('(?<=filename=).*', ifh.info()["content-disposition"]).group(0)
                break
            except KeyError:
                pygame.time.wait(1)
        if not basename:
            log.message("URL is not a file")
            self.reportFailure(_("Error: URL does not point to a file."))
            return
        filename = os.path.join(updateDirectory, basename)
        log.debug("Downloading file %s of size %d" % (filename, total))
        ofh = open(filename, "wb")
        # download and report progress
        downloaded = 0
        while True:
            data = ifh.read(100000)
            if not data:
                break
            ofh.write(data)
            downloaded += len(data)
            log.debug("Download progress", downloaded, total)
            self.setProgress("Downloading update...", downloaded, total)
        ifh.close()
        ofh.close()
        return filename
    except urllib.error.URLError as e:
        log.warning("Cannot download file")
        self.reportFailure(_("Cannot finish download: %s") % str(e.reason))
        return None
def SetProxiesIfNecessary():
    global HTTP_PROXY
    global HTTPS_PROXY

    dProxies = {}
    if HTTP_PROXY != '':
        dProxies['http'] = HTTP_PROXY
    if HTTPS_PROXY != '':
        dProxies['https'] = HTTPS_PROXY
    # environment variables override the module-level settings
    if os.getenv('http_proxy') is not None:
        dProxies['http'] = os.getenv('http_proxy')
    if os.getenv('https_proxy') is not None:
        dProxies['https'] = os.getenv('https_proxy')
    if dProxies != {}:
        urllib.request.install_opener(
            urllib.request.build_opener(urllib.request.ProxyHandler(dProxies)))
def useProxy(self, proxy):
    '''Access self.url through the given proxy and look for the keyword.'''
    protocol = proxy.split('://')[0]
    ip = proxy.split('//')[1]
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({protocol: ip}))
    urllib.request.install_opener(opener)
    try:
        response = urllib.request.urlopen(self.url, timeout=self.timeout)
    except Exception:
        print(u'Connection error, exiting')
        exit()
    content = response.read().decode('utf-8', 'ignore')
    if re.search(self.flagword, content):
        print(u"Keyword found, this proxy works")
    else:
        print("This proxy is not usable")
def httpConnection(url, proxy):
    # TODO: enable NTLM authentication
    if (proxy.auth == "ntlm"):
        passman = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, proxy.url, proxy.user, proxy.password)
        auth = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
    else:
        passman = urllib.request.HTTPPasswordMgr()
        passman.add_password(None, proxy.url, proxy.user, proxy.password)
        auth = urllib.request.HTTPBasicAuthHandler(passman)
    if (proxy.url):
        proxy_handler = urllib.request.ProxyHandler({'http': proxy.url})
        opener = urllib.request.build_opener(proxy_handler, auth, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
    return urllib.request.urlopen(url)
def run(self):
    url = self._url
    outputfile = self._target

    def reporthook(blocknum, blocksize, totalsize):
        # progress callback passed to urlretrieve
        readsofar = blocknum * blocksize
        percent = 0
        if totalsize > 0:
            percent = readsofar * 1e2 / totalsize
            s = "\r%5.1f%% %*d / %d" % (percent, len(str(totalsize)), readsofar, totalsize)
            sys.stderr.write(s)
            if readsofar >= totalsize:
                sys.stderr.write("\n")
        else:
            sys.stderr.write("read %d\n" % (readsofar,))
        self.signal.emit(int(percent))

    proxy = urllib.request.ProxyHandler({'http': "myproxy"})
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(url, outputfile, reporthook)
def CBDownload(env, target, url):
    try:
        import urllib.request as urllib  # Python 3+
    except ImportError:
        import urllib2 as urllib  # Python 2

    sys.stdout.write('Downloading ' + url + '.')
    sys.stdout.flush()

    # honor proxy environment variables
    ftp_proxy = os.getenv('ftp_proxy', None)
    http_proxy = os.getenv('http_proxy', None)
    if ftp_proxy or http_proxy:
        handlers = {}
        if ftp_proxy:
            handlers['ftp'] = ftp_proxy
        if http_proxy:
            handlers['http'] = http_proxy
        opener = urllib.build_opener(urllib.ProxyHandler(handlers))
        urllib.install_opener(opener)

    f = None
    stream = None
    try:
        stream = urllib.urlopen(url)
        f = open(target, 'wb', 0)  # unbuffered
        while stream and f:
            data = stream.read(1024 * 1024)
            if not data:
                break
            f.write(data)
            sys.stdout.write('.')
            sys.stdout.flush()
        sys.stdout.write('ok\n')
        sys.stdout.flush()
    finally:
        if f is not None:
            f.close()
        if stream is not None:
            stream.close()
def download_from_url(url):
    proxy = env_server.get_proxy()
    if proxy['enabled']:
        server = proxy['server'].replace('http://', '')
        proxy_dict = {
            'http': 'http://{login}:{pass}@{0}'.format(server, **proxy)
        }
        proxy_handler = urllib2.ProxyHandler(proxy_dict)
        auth = urllib2.HTTPBasicAuthHandler()
        opener = urllib2.build_opener(proxy_handler, auth, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

    def url_open_agent(url=url, timeout=1):
        return urllib2.urlopen(url=url, timeout=timeout)

    query_worker = gf.get_thread_worker(url_open_agent, error_func=gf.error_handle)
    query_worker.try_start()
    thread_pool = query_worker.get_thread_pool()
    thread_pool.waitForDone()
    if query_worker.is_failed():
        return False
def curl_get(url, timeout=5, proxy=False, headers=None, gzip=False):
    """
    wowtoken.py dd373.py crawler_515fa.py crawler_amac.py crawler_for_some_site.py
    """
    if headers is None:
        headers = {}
    opener = urllib.request.build_opener()
    if proxy:
        proxy_info = {'host': '127.0.0.1', 'port': 7890}
        proxy_support = urllib.request.ProxyHandler(
            {"http": "http://%(host)s:%(port)d" % proxy_info})
        opener = urllib.request.build_opener(proxy_support)
    request = urllib.request.Request(url, headers=headers)
    resp = opener.open(request, timeout=timeout)
    resp_html = resp.read()
    if gzip:
        resp_html = zlib.decompress(resp_html, 16 + zlib.MAX_WBITS)
    return resp_html
def download(url, headers, proxy, num_retries, data=None):
    print('Downloading:', url)
    request = urllib.request.Request(url=url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    if html:
        return html.decode(encoding="utf-8")
    return html
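A hedged usage sketch for the download() helper directly above; the header value, proxy address, and retry count are illustrative assumptions, not taken from the original project.

headers = {'User-agent': 'Mozilla/5.0'}
html = download('http://example.org/index.html', headers,
                proxy='http://10.0.0.1:3128',  # hypothetical proxy; pass None to connect directly
                num_retries=2)
if html:
    print(html[:200])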
def run(self):
    QgsMessageLog.logMessage('Started task "{}"'.format(self.description()), MESSAGE_CATEGORY, Qgis.Info)
    if self.proxyHost is not None and self.proxyPort is not None:
        QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    # collect the attribute values of the layer that should be enriched
    attlist = {}
    attlist[self.item] = []
    attlist[self.idfield] = {}
    for f in self.layer.getFeatures():
        if self.item in f:
            attlist[self.item].append(f[self.item])
        attlist[self.idfield][f[self.idfield]] = True
    # build the SPARQL query from the collected values
    query = ""
    if self.content == "Enrich URI":
        query += "SELECT ?item WHERE {\n"
    elif self.content == "Enrich Value" or self.strategy == "Enrich Both":
        query += "SELECT ?item ?val ?valLabel ?vals WHERE {\n"
    query += "VALUES ?vals { "
    print(attlist)
    for it in attlist[self.idfield]:
        if str(it).startswith("http"):
            query += "<" + str(it) + "> "
        elif self.idprop == "http://www.w3.org/2000/01/rdf-schema#label" and self.language is not None and self.language != "":
            query += "\"" + str(it) + "\"@" + self.language + " "
        else:
            query += "\"" + str(it) + "\" "
    query += " } . \n"
    proppp = self.propertyy.data(1)
    if self.propertyy.data(1).startswith("//"):
        proppp = "http:" + proppp
    if self.table.item(self.row, 7).text() != "" and "wikidata" in self.triplestoreurl:
        query += "?item wdt:P31 <" + self.table.item(self.row, 7).text() + "> .\n"
    else:
        query += "?item rdf:type <" + self.table.item(self.row, 7).text() + "> .\n"
    query += "?item <" + self.idprop + "> ?vals .\n"
    query += "?item <" + proppp + "> ?val . \n"
    if (self.content == "Enrich Value" or self.content == "Enrich Both") and "wikidata" not in self.triplestoreurl:
        query += "OPTIONAL{ ?val rdfs:label ?valLabel }"
    elif (self.content == "Enrich Value" or self.content == "Enrich Both") and "wikidata" in self.triplestoreurl:
        query += "SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE]," + self.language + "\". }\n"
    query += "} "
    QgsMessageLog.logMessage("proppp: " + str(proppp), MESSAGE_CATEGORY, Qgis.Info)
    QgsMessageLog.logMessage("idprop: " + self.idprop, MESSAGE_CATEGORY, Qgis.Info)
    QgsMessageLog.logMessage(query, MESSAGE_CATEGORY, Qgis.Info)
    QgsMessageLog.logMessage(self.triplestoreurl, MESSAGE_CATEGORY, Qgis.Info)
    print(self.triplestoreurl)
    try:
        sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
        sparql.setQuery(query)
        sparql.setMethod(POST)
        print("now sending query")
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
    except Exception as e:
        QgsMessageLog.logMessage("Trying GET query", MESSAGE_CATEGORY, Qgis.Info)
        try:
            sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
            sparql.setQuery(query)
            sparql.setMethod(GET)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
        except Exception as e:
            #msgBox=QMessageBox()
            #msgBox.setText("The following exception occurred: "+str(e))
            #msgBox.exec()
            return False
    print(str(results))
    #resultcounter=0
    for resultcounter in results["results"]["bindings"]:
        if self.content == "Enrich Value":
            self.resultmap[resultcounter["vals"]["value"]] = resultcounter["valLabel"]["value"]
        elif self.content == "Enrich URI":
            self.resultmap[resultcounter["vals"]["value"]] = resultcounter["val"]["value"]
        else:
            self.resultmap[resultcounter["vals"]["value"]] = resultcounter["valLabel"]["value"] + ";" + resultcounter["val"]["value"]
    self.columntype = self.detectColumnType(self.resultmap, self.table)
    QgsMessageLog.logMessage(str(self.columntype), MESSAGE_CATEGORY, Qgis.Info)
    QgsMessageLog.logMessage(str(self.resultmap), MESSAGE_CATEGORY, Qgis.Info)
    return True
def getimage(cls, imagename):
    """
    Downloads the requested image
    @return: path of the image, locally
    """
    logger.info("UtilsTest.getimage('%s')" % imagename)
    fullimagename = os.path.join(cls.image_home, imagename)
    if not os.path.isfile(fullimagename):
        logger.info("Trying to download image %s, timeout set to %ss" % (imagename, cls.timeout))
        if "http_proxy" in os.environ:
            dictProxies = {'http': os.environ["http_proxy"]}
            proxy_handler = urllib2.ProxyHandler(dictProxies)
            opener = urllib2.build_opener(proxy_handler).open
        else:
            opener = urllib2.urlopen
        # Nota: since python2.6 there is a timeout in the urllib2
        timer = threading.Timer(cls.timeout + 1, cls.timeoutDuringDownload)
        timer.start()
        logger.info("wget %s/%s" % (cls.url_base, imagename))
        if sys.version_info > (2, 6):
            data = opener("%s/%s" % (cls.url_base, imagename), data=None, timeout=cls.timeout).read()
        else:
            data = opener("%s/%s" % (cls.url_base, imagename), data=None).read()
        timer.cancel()
        logger.info("Image %s successfully downloaded." % imagename)
        try:
            open(fullimagename, "wb").write(data)
        except IOError:
            raise IOError("unable to write downloaded data to disk at %s" % cls.image_home)
        if not os.path.isfile(fullimagename):
            raise RuntimeError("Could not automatically download test images %s!\n"
                               "If you are behind a firewall, please set the environment variable http_proxy.\n"
                               "Otherwise please try to download the images manually from\n"
                               "%s" % (cls.url_base, imagename))
        # write the plain, gzipped and bzipped variants next to the download
        if imagename.endswith(".bz2"):
            decompressed = bz2.decompress(data)
            basename = fullimagename[:-4]
        elif imagename.endswith(".gz"):
            decompressed = gzip.open(fullimagename).read()
            basename = fullimagename[:-3]
        else:
            decompressed = data
            basename = fullimagename
        gzipname = basename + ".gz"
        bzip2name = basename + ".bz2"
        if basename != fullimagename:
            try:
                open(basename, "wb").write(decompressed)
            except IOError:
                raise IOError("unable to write decompressed data to disk at %s" % cls.image_home)
        if gzipname != fullimagename:
            try:
                gzip.open(gzipname, "wb").write(decompressed)
            except IOError:
                raise IOError("unable to write gzipped data to disk at %s" % cls.image_home)
        if bzip2name != fullimagename:
            try:
                bz2.BZ2File(bzip2name, "wb").write(decompressed)
            except IOError:
                raise IOError("unable to write bzipped2 data to disk at %s" % cls.image_home)
    return fullimagename
def downloadurl(url, useragent="wswp", proxy=None, retries=2):
    """
    Pass a URL to download it and return the HTML.
    You can also run this function with a specific useragent, proxy, and
    different number of retries.
    """
    print("Downloading:", url)
    headers = {"User-agent": useragent}  # Use the agent name as a header.
    request = ul.request.Request(url, headers=headers)  # Form the request.
    opener = ul.request.build_opener()
    if proxy:  # if we are using a proxy
        proxyparams = {ulp.urlparse(url).scheme: proxy}  # Use the proxy.
        opener.add_handler(ul.request.ProxyHandler(proxyparams))
    try:
        html = opener.open(request).read()
    except ul.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # Check the error codes to make sure you can try again.
                html = downloadurl(url, useragent, proxy, retries - 1)  # Try again.
    return html


def crawlsitemap(url):
    """ Sitemap crawler """
    sitemap = downloadurl(url)  # Download the sitemap file.
    links = re.findall("<loc>(.*?)</loc>", sitemap)  # Extract the sitemap links.
    for link in links:
        html = downloadurl(link)


def crawllink(seedurl, linkregex=None, delay=5, maxdepth=-1, maxurls=-1,
              useragent="wswp", proxies=None, retries=1, scallback=None, cache=None):
    """
    Crawl from the given seed URL seedurl following links matched by linkregex
    for an agentname of the crawler and initialized robot parser.
    You can add a maxdepth to determine how many pages you will crawl.
    You can also add a scrape callback scallback to search multiple websites.
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seedurl]
    # the URL's that have been seen and at what depth
    seen = {seedurl: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = getrobots(seedurl)
    D = Downloader(delay=delay, useragent=useragent, proxies=proxies,
                   retries=retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(useragent, url):
            html = D(url)
            links = []
            if scallback:
                links.extend(scallback(url, html) or [])
            if depth != maxdepth:
                # can still crawl further
                if linkregex:
                    # filter for links matching our regular expression
                    links.extend(link for link in getlinks(html) if re.match(linkregex, link))
                for link in links:
                    link = normalize(seedurl, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if samedomain(seedurl, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == maxurls:
                break
        else:
            print("Blocked by robots.txt:", url)


def getlinks(html):
    """ Return a list of links from html. """
    # Regex to extract all links from a webpage.
    webpageregex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpageregex.findall(html)
def run(self):
    QgsMessageLog.logMessage('Started task "{}"'.format(self.description()), MESSAGE_CATEGORY, Qgis.Info)
    if self.proxyHost is not None and self.proxyHost != "" and self.proxyPort is not None and self.proxyPort != "":
        QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
        proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
    #msgBox=QMessageBox()
    #msgBox.setText(self.query+" - "+self.triplestoreconf[self.tripleStoreEdit.currentIndex()+1]["endpoint"])
    #msgBox.exec()
    if self.findProperty.isChecked():
        if "propertyfromlabelquery" in self.triplestoreconf[self.tripleStoreEdit.currentIndex() + 1]:
            self.query = self.triplestoreconf[self.tripleStoreEdit.currentIndex() + 1]["propertyfromlabelquery"].replace("%%label%%", self.label)
    else:
        if "classfromlabelquery" in self.triplestoreconf[self.tripleStoreEdit.currentIndex() + 1]:
            self.query = self.triplestoreconf[self.tripleStoreEdit.currentIndex() + 1]["classfromlabelquery"].replace("%%label%%", self.label)
    if self.query == "":
        return
    if "SELECT" in self.query:
        self.query = self.query.replace("%%label%%", self.label).replace("%%language%%", self.language)
        sparql = SPARQLWrapper(
            self.triplestoreconf[self.tripleStoreEdit.currentIndex() + 1]["endpoint"],
            agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
        sparql.setQuery(self.prefixes[self.tripleStoreEdit.currentIndex() + 1] + self.query)
        sparql.setReturnFormat(JSON)
        self.results = sparql.query().convert()
        # msgBox=QMessageBox()
        # msgBox.setText(str(results))
        # msgBox.exec()
        for res in self.results["results"]["bindings"]:
            item = QListWidgetItem()
            item.setData(1, str(res["class"]["value"]))
            if "label" in res:
                item.setText(str(res["label"]["value"] + " (" + res["class"]["value"] + ")"))
            else:
                item.setText(str(res["class"]["value"]))
            self.searchResult.addItem(item)
    else:
        # the query is not SPARQL; treat it as a REST search URL and parse the JSON response
        myResponse = json.loads(requests.get(self.query).text)
        self.qids = []
        for ent in myResponse["search"]:
            qid = ent["concepturi"]
            if "http://www.wikidata.org/entity/" in qid and self.findProperty.isChecked():
                qid = "http://www.wikidata.org/prop/direct/" + ent["id"]
            elif "http://www.wikidata.org/wiki/" in qid and self.findConcept.isChecked():
                qid = "http://www.wikidata.org/entity/" + ent["id"]
            self.qids.append(qid)
            label = ent["label"] + " (" + ent["id"] + ") "
            if "description" in ent:
                label += "[" + ent["description"] + "]"
            self.results[qid] = label