Code example #1
 def run(self):
     QgsMessageLog.logMessage(
         'Started task "{}"'.format(self.description()), MESSAGE_CATEGORY,
         Qgis.Info)
     if self.proxyHost is not None and self.proxyPort is not None:
         QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost),
                                  MESSAGE_CATEGORY, Qgis.Info)
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     sparql = SPARQLWrapper(
         self.triplestoreurl,
         agent=
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
     )
     sparql.setQuery(self.query)
     print("now sending query")
     sparql.setReturnFormat(JSON)
     results = sparql.query().convert()
     for result in results["results"]["bindings"]:
         self.viewlist.append(str(result[self.queryvar]["value"]))
     print(self.viewlist)
     #self.layercount.setText("["+str(len(viewlist))+"]")
     if self.getlabels and "classlabelquery" in self.triplestoreconf and self.triplestoreconf[
             "classlabelquery"] != "":
         labels = self.getLabelsForClasses(
             self.viewlist, self.triplestoreconf["classlabelquery"])
         print(labels)
         self.amountoflabels = len(labels)
         i = 0
         sorted_labels = sorted(labels.items(), key=lambda x: x[1])
         for lab in sorted_labels:
             self.resultlist.append(labels[lab[0]] + " (" + lab[0] + ")")
             i = i + 1
     return True
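Most of the snippets in this collection install a proxy opener through urllib before issuing requests. A minimal, standalone sketch of that pattern in Python 3's urllib.request follows; the proxy address is a placeholder and not taken from any of the listed projects:

# Standalone sketch of the proxy-install pattern used throughout these examples.
# The proxy address is a placeholder (assumption), not from any listed project.
import urllib.request

proxy_host = "127.0.0.1:3128"  # hypothetical HTTP proxy host:port
proxy = urllib.request.ProxyHandler({'http': proxy_host, 'https': proxy_host})
opener = urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)  # subsequent urlopen() calls go through the proxy

with urllib.request.urlopen("http://example.com") as response:
    print(response.status)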
Code example #2
 def run(self):
     if self.proxyHost is not None and self.proxyPort is not None:
         QgsMessageLog.logMessage('Proxy? '+str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     QgsMessageLog.logMessage('Started task "{}"'.format(self.description()),MESSAGE_CATEGORY, Qgis.Info)
     self.graph = Graph()
     try:
         if self.filename.startswith("http"):
             self.graph.load(self.filename)
         else:
             filepath=self.filename.split(".")
             result = self.graph.parse(self.filename, format=filepath[len(filepath)-1])
     except Exception as e:
         QgsMessageLog.logMessage('Failed "{}"'.format(self.description()),MESSAGE_CATEGORY, Qgis.Info)
         self.exception=str(e)
         return False
     self.geoconcepts=[]
     if self.graph!=None:
         print("WE HAVE A GRAPH")
         results = self.graph.query(self.query)
         for row in results:
             self.geoconcepts.append(str(row[0]))
     return True
Code example #3
 def detectNamespaces(self, subpredobj):
     if subpredobj is None or subpredobj < 0:
         query = "select distinct ?ns where { ?s ?p ?o . bind( replace( str(?s), \"(#|/)[^#/]*$\", \"$1\" ) as ?ns )} limit 10"
     elif subpredobj == 0:
         query = "select distinct ?ns where { ?s ?p ?o . bind( replace( str(?p), \"(#|/)[^#/]*$\", \"$1\" ) as ?ns )} limit 10"
     else:
         query = "select distinct ?ns where { ?s ?p ?o . bind( replace( str(?o), \"(#|/)[^#/]*$\", \"$1\" ) as ?ns )} limit 10"
     if self.proxyHost is not None and self.proxyPort is not None:
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     sparql = SPARQLWrapper(
         self.triplestoreurl,
         agent=
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
     )
     sparql.setQuery(query)
     sparql.setReturnFormat(JSON)
     print("now sending query")
     try:
         results = sparql.query().convert()
         reslist = []
         for nss in results["results"]["bindings"]:
             if "ns" in nss:
                 reslist.append(nss["ns"]["value"])
         return reslist
     except:
         return []
Code example #4
File: spyder.py  Project: easezyc/Python_Spider
def download(url, user_agent='Mozilla/5.0', proxy=None, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    url = url.encode('utf-8')
    url = urllib.parse.quote(url, "://?=&")
    #print(url)
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        data = opener.open(request, timeout=5).read()
        try:
            decompressed_data = gzip.decompress(
                data)  #zlib.decompress(data ,16+zlib.MAX_WBITS)
            html = decompressed_data.decode('utf8')
        except:
            html = data.decode('utf8')
            #print(html)
    except urllib.request.URLError as e:
        print('Download error:', e.reason)
        html = ""
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                #recursively retry 5xx HTTP errors
                return download(url, num_retries=num_retries - 1)
    except socket.timeout as e:
        html = ""
        print("Download error:", e)
    except UnicodeDecodeError as e:
        print('Download error:', e)
        html = ""
    return html
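A hypothetical call to the download() function above might look like this; the URL and proxy values are placeholders:

# Hypothetical usage of download() from code example #4; URL and proxy are placeholders.
html = download('http://example.com/page?q=1',
                user_agent='Mozilla/5.0',
                proxy='http://127.0.0.1:3128',
                num_retries=2)
if html:
    print(html[:200])  # first 200 characters of the decoded page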
Code example #5
 def run(self):
     QgsMessageLog.logMessage('Started task "{}"'.format(
                                  self.description()),
                              MESSAGE_CATEGORY, Qgis.Info)
     if self.proxyHost is not None and self.proxyPort is not None:
         QgsMessageLog.logMessage('Proxy? '+str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
     sparql.setQuery(self.query)
     sparql.setMethod(POST)
     sparql.setReturnFormat(JSON)
     try:
         results = sparql.query().convert()
     except Exception as e: 
         try:
             sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
             sparql.setQuery(self.query)
             sparql.setMethod(GET)
             sparql.setReturnFormat(JSON)
             results = sparql.query().convert()
         except Exception as e:
             self.exception=e
             return False
     #print(results)
     # geojson stuff
     self.geojson=self.processResults(results,(self.triplestoreconf["crs"] if "crs" in self.triplestoreconf else ""),self.triplestoreconf["mandatoryvariables"][1:],self.allownongeo)
     return True
Code example #6
 def testTripleStoreConnection(
         self, query="SELECT ?a ?b ?c WHERE { ?a ?b ?c .} LIMIT 1"):
     if self.proxyHost is not None and self.proxyPort is not None:
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     sparql = SPARQLWrapper(
         self.triplestoreurl,
         agent=
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
     )
     sparql.setQuery(query)
     sparql.setReturnFormat(JSON)
     print("now sending query")
     try:
         results = sparql.query().convert()
         if self.testURL and not self.testConfiguration:
             self.message = "URL depicts a valid SPARQL Endpoint!"
         if "ASK" in query:
             return results["boolean"]
         self.feasibleConfiguration = True
         return True
     except:
         self.message = "URL does not depict a valid SPARQL Endpoint!"
         self.feasibleConfiguration = False
         return False
Code example #7
 def download(self, url, headers, proxy, num_retries, data=None):
     print('Downloading:', url)
     request = urllib.request.Request(url, data, headers or {})
     opener = self.opener or urllib.request.build_opener()
     if proxy:
         proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
         opener.add_handler(urllib.request.ProxyHandler(proxy_params))
     try:
         response = opener.open(request)
         html = response.read()
         code = response.code
     except Exception as e:
         print('Download error:', str(e))
         html = b''
         if hasattr(e, 'code'):
             code = e.code
             if num_retries > 0 and 500 <= code < 600:
                 # retry 5XX HTTP errors
                 return self.download(url, headers, proxy, num_retries - 1,
                                      data)
         else:
             code = None
     return {'html': html.decode(encoding="utf-8"), 'code': code}
Code example #8
File: link_crawler3.py  Project: keepwise/sci_spider
def download(url, headers, proxy, num_retries, data=None):

    global url_crawled_num

    print('Downloading: %s' % url)
    request = urllib.request.Request(url, data, headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
        if url.find("full_record.do?product=WOS") != -1:
            url_crawled_num += 1
            html_emt = etree.HTML(html)
            title = html_emt.xpath("//div[@class='title']/value/text()")
            print("Title %d:  %s" % (url_crawled_num, str(title[0])))

    except urllib.error.URLError as e:
        # urllib.error.HTTPError is a subclass of URLError, so HTTP error responses land here too
        print('Download error: %s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None

    return html
Code example #9
File: common.py  Project: keepwise/sci_spider
def download(url, headers, proxy, num_retries, data=None):
    global download_url_total

    download_url_total += 1
    print('Downloading: %d  %s' % (download_url_total, url))
    request = urllib.request.Request(url=url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        response.encoding = "utf-8"
        html = response.read()
        #print(html)

        code = response.code

    except urllib.error.URLError as e:
        # urllib.error.HTTPError is a subclass of URLError, so HTTP error responses land here too
        print('Download error: %s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None

    return html
Code example #10
def crawler():
    while not q.empty():  # loop until the queue is empty
        path = q.get()  # take the next path from queue q

        url = "%s%s" % (domain_name, path)  # build the URL for the next request

        random_proxy = random.choice(proxy_list)  # pick a random proxy server
        proxy_support = urllib.request.ProxyHandler(random_proxy)
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)

        headers = {}
        headers['User-Agent'] = Baidu_spider  # spider user-agent header
        # www.iplaypython.com

        request = urllib.request.Request(url, headers=headers)

        try:
            response = urllib.request.urlopen(request)
            content = response.read()

            if len(content):  # report status code and path when the body is not empty
                print("Status [%s]  - path: %s" % (response.code, path))

            response.close()
            time.sleep(1)  # sleep a little so the crawl rate does not get the IP banned
        except urllib.error.HTTPError as e:
            # print(e.code, path)
            pass  # swallow errors for now
Code example #11
def getRemoteFile(urlOrPath, destPath, proxy={}):
    '''
    Fetches URL to local path or just returns absolute path.
    :param urlOrPath: resource locator, generally URL or path
    :param destPath: path to store the resource, usually a path on file system
    :return: tuple having (path, 'local'/'remote')
    '''
    urlp = urlparse(urlOrPath)
    if urlp.scheme == '':
        return (os.path.abspath(urlOrPath), 'local')
    elif urlp.scheme not in ('http', 'https'):
        return (urlOrPath, 'local')
    else:
        filename = toFilename(urlOrPath)
        destPath = destPath + '/' + filename
        log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
        try:
            proxy = urllibreq.ProxyHandler(proxy)
            opener = urllibreq.build_opener(proxy)
            urllibreq.install_opener(opener)
            urlretrieve(urlOrPath, destPath)
        except IOError:
            # monkey patch fix for SSL/Windows per Tika-Python #54
            # https://github.com/chrismattmann/tika-python/issues/54
            import ssl
            if hasattr(ssl, '_create_unverified_context'):
                ssl._create_default_https_context = ssl._create_unverified_context
            # delete whatever we had there
            if os.path.exists(destPath) and os.path.isfile(destPath):
                os.remove(destPath)
            urlretrieve(urlOrPath, destPath)
        return (destPath, 'remote')
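getRemoteFile() returns the local path unchanged for file-system inputs and downloads HTTP(S) URLs otherwise. A usage sketch with placeholder paths and proxy settings:

# Hypothetical usage of getRemoteFile() from code example #11.
path, kind = getRemoteFile('https://example.com/data.pdf', '/tmp',
                           proxy={'https': 'http://127.0.0.1:3128'})
print(kind, path)   # 'remote' and the downloaded file path

path, kind = getRemoteFile('/home/user/local.pdf', '/tmp')
print(kind, path)   # 'local' and the absolute path; nothing is downloaded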
Code example #12
 def __init__(self,
              column,
              row,
              triplestoreconf,
              prefixes,
              interlinkOrEnrich,
              table,
              propOrClass=False,
              bothOptions=False,
              currentprefixes=None,
              addVocab=None):
     super(QDialog, self).__init__()
     self.setupUi(self)
     self.currentcol = column
     self.currentrow = row
     self.table = table
     self.prefixes = prefixes
     self.currentprefixes = currentprefixes
     self.bothOptions = bothOptions
     self.triplestoreconf = triplestoreconf
     self.interlinkOrEnrich = interlinkOrEnrich
     self.addVocab = addVocab
     if column != 4:
         self.findConcept.setChecked(True)
     if column == 4 or (not interlinkOrEnrich
                        and column != 4) or (not interlinkOrEnrich
                                             and propOrClass):
         self.findProperty.setChecked(True)
     if not bothOptions:
         self.findProperty.setEnabled(False)
         self.findConcept.setEnabled(False)
     self.tripleStoreEdit.setEnabled(False)
     for triplestore in self.triplestoreconf:
         if not "File" == triplestore["name"]:
             self.tripleStoreEdit.addItem(triplestore["name"])
     if addVocab != None:
         for cov in addVocab:
             self.tripleStoreEdit.addItem(addVocab[cov]["label"])
     self.searchButton.clicked.connect(self.getClassesFromLabel)
     urlregex = QRegExp(
         "http[s]?://(?:[a-zA-Z#]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
     )
     urlvalidator = QRegExpValidator(urlregex, self)
     self.costumproperty.setValidator(urlvalidator)
     self.costumproperty.textChanged.connect(self.check_state3)
     self.costumproperty.textChanged.emit(self.costumproperty.text())
     self.costumpropertyButton.clicked.connect(self.applyConceptToColumn2)
     self.applyButton.clicked.connect(self.applyConceptToColumn)
     s = QSettings()  #getting proxy from qgis options settings
     self.proxyEnabled = s.value("proxy/proxyEnabled")
     self.proxyType = s.value("proxy/proxyType")
     self.proxyHost = s.value("proxy/proxyHost")
     self.proxyPort = s.value("proxy/proxyPort")
     self.proxyUser = s.value("proxy/proxyUser")
     self.proxyPassword = s.value("proxy/proxyPassword")
     if self.proxyHost is not None and self.proxyPort is not None:
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
Code example #13
 def run(self):
     try:
         proxy = urllib.request.ProxyHandler({})
         opener = urllib.request.build_opener(proxy)
         response = opener.open(self.url, self.data, self.timeout)
         self.callback(response.read())
     except:
         self.callback(None)
Code example #14
def getHtml(url):
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
    # proxy IP addresses
    proxies = ['114.215.95.188:3128', '218.14.115.211:3128']
    req = urllib.request.Request(url)
    # set the request headers
    req.add_header('User-Agent', random.choice(USER_AGENTS))
    # set the proxy IP address
    proxy_support = urllib.request.ProxyHandler({"http": random.choice(proxies)})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    # send the request and read the server response
    times = 0
    try:
        res = urllib.request.urlopen(req)
        html = res.read()
        return html
    except:
        times += 1
        return getHtml(url)
Code example #15
def getRemoteFileSize(url, proxy=None):
    """ Get the size of a remote file from its content-length header
        url - target file URL
        proxy - proxy  """
    opener = urllib.request.build_opener()
    if proxy:
        if url.lower().startswith('https://'):
            opener.add_handler(urllib.request.ProxyHandler({'https': proxy}))
        else:
            opener.add_handler(urllib.request.ProxyHandler({'http': proxy}))
    try:
        request = urllib.request.Request(url)
        request.get_method = lambda: 'HEAD'
        response = opener.open(request)
        response.read()
    except Exception:  # the remote file does not exist
        return 0
    else:
        fileSize = response.headers.get('Content-Length', 0)
        return int(fileSize)
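getRemoteFileSize() issues a HEAD request and reads the Content-Length header. A usage sketch with placeholder values:

# Hypothetical usage of getRemoteFileSize() from code example #15.
size = getRemoteFileSize('https://example.com/archive.zip',
                         proxy='http://127.0.0.1:3128')
if size:
    print('remote file is %d bytes' % size)
else:
    print('remote file missing or size unknown')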
Code example #16
File: dla.py  Project: chris-fox/data-assistant
def setupProxy():
    proxies = {}
    if _proxyhttp != None:
        proxies['http'] = 'http://' + _proxyhttp
        os.environ['http'] = _proxyhttp
    if _proxyhttps != None:
        proxies['https'] = _proxyhttps
        os.environ['https'] = 'http://' + _proxyhttps
    if proxies != {}:
        proxy = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
Code example #17
    def get_url(self, url, proxy_dict):
        proxyIP = proxy_dict['ip']
        proxyPort = proxy_dict['port']
        proxyProtocol = proxy_dict['protocol']
        proxy_handler = urllib.request.ProxyHandler({proxyProtocol: "{0}:{1}".format(proxyIP, proxyPort)})

        opener_proxy = urllib.request.build_opener(proxy_handler)
        urllib.request.install_opener(opener_proxy)
        request = urllib.request.Request(url=url, headers=HEADERS)
        response = urllib.request.urlopen(request)
        html = response.read()

        return html
Code example #18
File: UpdateDlg.py  Project: taislin/outerspace
 def performDownload(self, updateDirectory):
     """Download zip with new version"""
     log.debug('Downloading new version')
     self.setProgress('Preparing download...', 0, 1)
     # setup proxies
     proxies = {}
     if gdata.config.proxy.http != None:
         proxies['http'] = gdata.config.proxy.http
     log.debug('Using proxies', proxies)
     # get file
     try:
         # open URL
         opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
         # it unfortunately is not completely reliable
         for i in range(1, 5):
             try:
                 ifh = opener.open(self.url)
                 log.debug("Retrieving URL", ifh.geturl())
                 # download file
                 total = int(ifh.info()["content-length"])
                 basename = re.search(
                     '(?<=filename=).*',
                     ifh.info()["content-disposition"]).group(0)
                 break
             except KeyError:
                 pygame.time.wait(1)
         if not basename:
             log.message("URL is not a file")
             self.reportFailure(_("Error: URL does not point to a file."))
             return
         filename = os.path.join(updateDirectory, basename)
         log.debug("Downloading file %s of size %d" % (filename, total))
         ofh = open(filename, "wb")
         # download and report progress
         downloaded = 0
         while True:
             data = ifh.read(100000)
             if not data:
                 break
             ofh.write(data)
             downloaded += len(data)
             log.debug("Download progress", downloaded, total)
             self.setProgress("Downloading update...", downloaded, total)
         ifh.close()
         ofh.close()
         return filename
     except urllib.error.URLError as e:
         log.warning("Cannot download file")
         self.reportFailure(
             _("Cannot finish download: %s") % str(e.reason))
         return None
Code example #19
def SetProxiesIfNecessary():
    global HTTP_PROXY
    global HTTPS_PROXY

    dProxies = {}
    if HTTP_PROXY != '':
        dProxies['http'] = HTTP_PROXY
    if HTTPS_PROXY != '':
        dProxies['https'] = HTTPS_PROXY
    if os.getenv('http_proxy') != None:
        dProxies['http'] = os.getenv('http_proxy')
    if os.getenv('https_proxy') != None:
        dProxies['https'] = os.getenv('https_proxy')
    if dProxies != {}:
        urllib.request.install_opener(
            urllib.request.build_opener(urllib.request.ProxyHandler(dProxies)))
Code example #20
 def useProxy(self, proxy):
     '''Access Baidu through the given proxy and search for the keyword'''
     protocol = proxy.split('://')[0]
     ip = proxy.split('//')[1]
     opener = urllib.request.build_opener(urllib.request.ProxyHandler({protocol: ip}))
     urllib.request.install_opener(opener)
     try:
         response = urllib.request.urlopen(self.url, timeout=self.timeout)
     except:
         print('Connection error, exiting')
         exit()
     content = response.read().decode('utf-8', 'ignore')
     if re.search(self.flagword, content):
         print("Keyword found, this proxy is usable")
     else:
         print("This proxy is not usable")
Code example #21
def httpConnection(url, proxy):
    #TODO: enable NTLM authentication
    if (proxy.auth == "ntlm"):
        passman = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, proxy.url, proxy.user, proxy.password)
        auth = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
    else:
        passman = urllib.request.HTTPPasswordMgr()
        passman.add_password(None, proxy.url, proxy.user, proxy.password)
        auth = urllib.request.HTTPBasicAuthHandler(passman)

    if (proxy.url):
        proxy_handler = urllib.request.ProxyHandler({'http': proxy.url})
        opener = urllib.request.build_opener(proxy_handler, auth, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)

    return urllib.request.urlopen(url)
Code example #22
    def run(self):
        url = self._url
        outputfile = self._target

        def reporthook(blocknum, blocksize, totalsize):
            readsofar = blocknum * blocksize
            percent = 0
            if totalsize > 0:
                percent = readsofar * 1e2 / totalsize
                s = "\r%5.1f%% %*d / %d" % (percent, len(
                    str(totalsize)), readsofar, totalsize)
                sys.stderr.write(s)
                if readsofar >= totalsize:
                    sys.stderr.write("\n")
            else:
                sys.stderr.write("read %d\n" % (readsofar, ))
            self.signal.emit(int(percent))

        proxy = urllib.request.ProxyHandler({'http': "myproxy"})
        opener = urllib.request.build_opener(proxy)
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(url, outputfile, reporthook)
Code example #23
def CBDownload(env, target, url):
    try:
        import urllib.request as urllib # Python 3+
    except ImportError:
        import urllib2 as urllib # Python 2 fallback

    sys.stdout.write('Downloading ' + url + '.')
    sys.stdout.flush()

    ftp_proxy = os.getenv('ftp_proxy', None)
    http_proxy = os.getenv('http_proxy', None)

    if ftp_proxy or http_proxy:
        handlers = {}
        if ftp_proxy: handlers['ftp'] = ftp_proxy
        if http_proxy: handlers['http'] = http_proxy

        opener = urllib.build_opener(urllib.ProxyHandler(handlers))
        urllib.install_opener(opener)

    f = None
    stream = None
    try:
        stream = urllib.urlopen(url)
        f = open(target, 'wb', 0) # Unbuffered
        while stream and f:
            data = stream.read(1024 * 1024)
            if not data: break
            f.write(data)
            sys.stdout.write('.')
            sys.stdout.flush()

        sys.stdout.write('ok\n')
        sys.stdout.flush()

    finally:
        if f is not None: f.close()
        if stream is not None: stream.close()
Code example #24
def download_from_url(url):
    proxy = env_server.get_proxy()
    if proxy['enabled']:
        server = proxy['server'].replace('http://', '')
        proxy_dict = {
            'http': 'http://{login}:{pass}@{0}'.format(server, **proxy)
        }
        proxy_handler = urllib2.ProxyHandler(proxy_dict)
        auth = urllib2.HTTPBasicAuthHandler()
        opener = urllib2.build_opener(proxy_handler, auth, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

    def url_open_agent(url=url, timeout=1):
        return urllib2.urlopen(url=url, timeout=timeout)

    query_worker = gf.get_thread_worker(url_open_agent,
                                        error_func=gf.error_handle)
    query_worker.try_start()
    thread_pool = query_worker.get_thread_pool()
    thread_pool.waitForDone()

    if query_worker.is_failed():
        return False
Code example #25
File: utils1.py  Project: hjshiwbd/notes
def curl_get(url, timeout=5, proxy=False, headers=None, gzip=False):
    """
    wowtoken.py
    dd373.py
    crawler_515fa.py
    crawler_amac.py
    crawler_for_some_site.py
    """
    if headers is None:
        headers = {}
    opener = urllib.request.build_opener()
    if proxy:
        proxy_info = {'host': '127.0.0.1', 'port': 7890}
        proxy_support = urllib.request.ProxyHandler(
            {"http": "http://%(host)s:%(port)d" % proxy_info})
        opener = urllib.request.build_opener(proxy_support)

    request = urllib.request.Request(url, headers=headers)

    resp = opener.open(request, timeout=timeout)
    resp_html = resp.read()
    if gzip:
        resp_html = zlib.decompress(resp_html, 16 + zlib.MAX_WBITS)
    return resp_html
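curl_get() builds its own opener, optionally routes it through a local proxy, and gunzips the body on request. A usage sketch with placeholder values:

# Hypothetical usage of curl_get() from code example #25.
body = curl_get('http://example.com/api', timeout=10, proxy=False,
                headers={'User-Agent': 'Mozilla/5.0'}, gzip=False)
print(body[:100])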
Code example #26
def download(url, headers, proxy, num_retries, data=None):
    print('Downloading:', url)
    request = urllib.request.Request(url=url,headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    if(html):
        return html.decode(encoding="utf-8")
    return html
Code example #27
 def run(self):
     QgsMessageLog.logMessage('Started task "{}"'.format(
                                  self.description()),
                              MESSAGE_CATEGORY, Qgis.Info)
     if self.proxyHost is not None and self.proxyPort is not None:
         QgsMessageLog.logMessage('Proxy? '+str(self.proxyHost), MESSAGE_CATEGORY, Qgis.Info)
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     attlist={}
     attlist[self.item]=[]
     attlist[self.idfield]={}
     for f in self.layer.getFeatures():
         if self.item in f:
             attlist[self.item].append(f[self.item])
         attlist[self.idfield][f[self.idfield]]=True
         query=""
         if self.content=="Enrich URI": 
             query+="SELECT ?item WHERE {\n"
         elif self.content=="Enrich Value" or self.content=="Enrich Both":
             query+="SELECT ?item ?val ?valLabel ?vals WHERE {\n"
         query+="VALUES ?vals { "
         print(attlist)
     for it in attlist[self.idfield]:
         if str(it).startswith("http"):
             query+="<"+str(it)+"> "
         elif self.idprop=="http://www.w3.org/2000/01/rdf-schema#label" and self.language!=None and self.language!="":
             query+="\""+str(it)+"\"@"+self.language+" "
         else:
             query+="\""+str(it)+"\" "
     query+=" } . \n"
     proppp=self.propertyy.data(1)
     if self.propertyy.data(1).startswith("//"):
         proppp="http:"+proppp
     if self.table.item(self.row, 7).text()!="" and "wikidata" in self.triplestoreurl:
         query+="?item wdt:P31 <"+self.table.item(self.row, 7).text()+"> .\n"
     else:
         query+="?item rdf:type <"+self.table.item(self.row, 7).text()+"> .\n"
     query+="?item <"+self.idprop+"> ?vals .\n"
     query+="?item <"+proppp+"> ?val . \n"
     if (self.content=="Enrich Value" or self.content=="Enrich Both") and not "wikidata" in self.triplestoreurl:
         query+="OPTIONAL{ ?val rdfs:label ?valLabel }"
     elif (self.content=="Enrich Value" or self.content=="Enrich Both") and "wikidata" in self.triplestoreurl:
         query+="SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],"+self.language+"\". }\n"
     query+="} "
     QgsMessageLog.logMessage("proppp: "+str(proppp),
                          MESSAGE_CATEGORY, Qgis.Info)
     QgsMessageLog.logMessage("idprop: "+self.idprop,
                          MESSAGE_CATEGORY, Qgis.Info)
     QgsMessageLog.logMessage(query,
                          MESSAGE_CATEGORY, Qgis.Info)
     QgsMessageLog.logMessage(self.triplestoreurl,
                          MESSAGE_CATEGORY, Qgis.Info)
     print(self.triplestoreurl)
     try:
         sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
         sparql.setQuery(query)
         sparql.setMethod(POST)
         print("now sending query")
         sparql.setReturnFormat(JSON)
         results = sparql.query().convert()
     except Exception as e: 
         QgsMessageLog.logMessage("Trying GET query",
                              MESSAGE_CATEGORY, Qgis.Info)
         try:
             sparql = SPARQLWrapper(self.triplestoreurl, agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")
             sparql.setQuery(query)
             sparql.setMethod(GET)
             sparql.setReturnFormat(JSON)
             results = sparql.query().convert()
         except Exception as e:
             #msgBox=QMessageBox()
             #msgBox.setText("The following exception occurred: "+str(e))
             #msgBox.exec()
             return False
     print(str(results))
     #resultcounter=0
     for resultcounter in results["results"]["bindings"]:
         if self.content=="Enrich Value":
             self.resultmap[resultcounter["vals"]["value"]]=resultcounter["valLabel"]["value"]
         elif self.content=="Enrich URI":
             self.resultmap[resultcounter["vals"]["value"]]=resultcounter["val"]["value"]
         else:
             self.resultmap[resultcounter["vals"]["value"]]=resultcounter["valLabel"]["value"]+";"+resultcounter["val"]["value"]
     self.columntype=self.detectColumnType(self.resultmap,self.table)
     QgsMessageLog.logMessage(str(self.columntype),
                          MESSAGE_CATEGORY, Qgis.Info)
     QgsMessageLog.logMessage(str(self.resultmap),
                          MESSAGE_CATEGORY, Qgis.Info)
     return True
Code example #28
    def getimage(cls, imagename):
        """
        Downloads the requested image
        @return:  path of the image, locally
        """
        logger.info("UtilsTest.getimage('%s')" % imagename)
        fullimagename = os.path.join(cls.image_home, imagename)
        if not os.path.isfile(fullimagename):
            logger.info("Trying to download image %s, timeout set to %ss" %
                        (imagename, cls.timeout))
            if "http_proxy" in os.environ:
                dictProxies = {'http': os.environ["http_proxy"]}
                proxy_handler = urllib2.ProxyHandler(dictProxies)
                opener = urllib2.build_opener(proxy_handler).open
            else:
                opener = urllib2.urlopen

#           Note: since python 2.6 there is a timeout in urllib2
            timer = threading.Timer(cls.timeout + 1, cls.timeoutDuringDownload)
            timer.start()
            logger.info("wget %s/%s" % (cls.url_base, imagename))
            if sys.version_info > (2, 6):
                data = opener("%s/%s" % (cls.url_base, imagename),
                              data=None,
                              timeout=cls.timeout).read()
            else:
                data = opener("%s/%s" % (cls.url_base, imagename),
                              data=None).read()
            timer.cancel()
            logger.info("Image %s successfully downloaded." % imagename)

            try:
                open(fullimagename, "wb").write(data)
            except IOError:
                raise IOError("unable to write downloaded \
                    data to disk at %s" % cls.image_home)

            if not os.path.isfile(fullimagename):
                raise RuntimeError("Could not automatically \
                download test images %s!\n \ If you are behind a firewall, \
                please set the environment variable http_proxy.\n \
                Otherwise please try to download the images manually from \n \
                %s" % (cls.url_base, imagename))

            if imagename.endswith(".bz2"):
                decompressed = bz2.decompress(data)
                basename = fullimagename[:-4]
            elif imagename.endswith(".gz"):
                decompressed = gzip.open(fullimagename).read()
                basename = fullimagename[:-3]
            else:
                decompressed = data
                basename = fullimagename

            gzipname = basename + ".gz"
            bzip2name = basename + ".bz2"

            if basename != fullimagename:
                try:
                    open(basename, "wb").write(decompressed)
                except IOError:
                    raise IOError("unable to write decompressed \
                    data to disk at %s" % cls.image_home)
            if gzipname != fullimagename:
                try:
                    gzip.open(gzipname, "wb").write(decompressed)
                except IOError:
                    raise IOError("unable to write gzipped \
                    data to disk at %s" % cls.image_home)
            if bzip2name != fullimagename:
                try:
                    bz2.BZ2File(bzip2name, "wb").write(decompressed)
                except IOError:
                    raise IOError("unable to write bzipped2 \
                    data to disk at %s" % cls.image_home)
        return fullimagename
Code example #29
File: download.py  Project: HussainAther/scrape
def downloadurl(url, useragent="wswp", proxy=None, retries=2):
    """
    Pass a URL to download it and return the HTML.
    You can also run this function with a specific useragent, proxy, and
    different number of retries.
    """
    print("Downloading:", url)
    headers = {"User-agent": useragent} # Use the agent name as a header.
    request = ul.request.Request(url, headers=headers) # Form the request.
    opener = ul.request.build_opener()
    if proxy: # if we are using a proxy
        proxyparams = {ulp.urlparse(url).scheme: proxy} # Use the proxy.
        opener.add_handler(ul.request.ProxyHandler(proxyparams))
    try:
        html = opener.open(request).read()
    except ul.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600: # Check the error codes
                                                           # to make sure you can try again.
                html = downloadurl(url, useragent, proxy, retries-1) # Try again.
    return html

def crawlsitemap(url):
    """
    Sitemap crawler
    """
    sitemap = downloadurl(url) # Download the sitemap file.
    links = re.findall("<loc>(.*?)</loc>", sitemap) # Extract the sitemap links.
    for link in links:
        html = downloadurl(link)
    
def crawllink(seedurl, linkregex=None, delay=5, maxdepth=-1, maxurls=-1, useragent="wswp", proxies=None, retries=1, scallback=None, cache=None):
    """
    Crawl from the given seed URL seedurl following links
    matched by linkregex for an agentname of the crawler and
    initialized robot parser. You can add a maxdepth to determine
    how many pages you will crawl. You can also add a scrape
    callback scallback to search multiple websites.
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seedurl]
    # the URL's that have been seen and at what depth
    seen = {seedurl: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = getrobots(seedurl)
    D = Downloader(delay=delay, useragent=useragent, proxies=proxies, retries=retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(useragent, url):
            html = D(url)
            links = []
            if scallback:
                links.extend(scallback(url, html) or [])

            if depth != maxdepth:
                # can still crawl further
                if linkregex:
                    # filter for links matching our regular expression
                    links.extend(link for link in getlinks(html) if re.match(linkregex, link))

                for link in links:
                    link = normalize(seedurl, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if samedomain(seedurl, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == maxurls:
                break
        else:
            print("Blocked by robots.txt:", url)

def getlinks(html):
    """
    Return a list of links from html.
    """
    # Regex to extract all links from a webpage.
    webpageregex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpageregex.findall(html)
Code example #30
 def run(self):
     QgsMessageLog.logMessage(
         'Started task "{}"'.format(self.description()), MESSAGE_CATEGORY,
         Qgis.Info)
     if self.proxyHost != None and self.proxyHost != "" and self.proxyPort != None and self.proxyPort != "":
         QgsMessageLog.logMessage('Proxy? ' + str(self.proxyHost),
                                  MESSAGE_CATEGORY, Qgis.Info)
         proxy = urllib.request.ProxyHandler({'http': self.proxyHost})
         opener = urllib.request.build_opener(proxy)
         urllib.request.install_opener(opener)
     #msgBox=QMessageBox()
     #msgBox.setText(self.query+" - "+self.triplestoreconf[self.tripleStoreEdit.currentIndex()+1]["endpoint"])
     #msgBox.exec()
     if self.findProperty.isChecked():
         if "propertyfromlabelquery" in self.triplestoreconf[
                 self.tripleStoreEdit.currentIndex() + 1]:
             self.query = self.triplestoreconf[
                 self.tripleStoreEdit.currentIndex() +
                 1]["propertyfromlabelquery"].replace(
                     "%%label%%", self.label)
     else:
         if "classfromlabelquery" in self.triplestoreconf[
                 self.tripleStoreEdit.currentIndex() + 1]:
             self.query = self.triplestoreconf[
                 self.tripleStoreEdit.currentIndex() +
                 1]["classfromlabelquery"].replace("%%label%%", self.label)
     if self.query == "":
         return
     if "SELECT" in self.query:
         self.query = self.query.replace("%%label%%", self.label).replace(
             "%%language%%", self.language)
         sparql = SPARQLWrapper(
             self.triplestoreconf[self.tripleStoreEdit.currentIndex() +
                                  1]["endpoint"],
             agent=
             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
         )
         sparql.setQuery(self.prefixes[self.tripleStoreEdit.currentIndex() +
                                       1] + self.query)
         sparql.setReturnFormat(JSON)
         self.results = sparql.query().convert()
         # msgBox=QMessageBox()
         # msgBox.setText(str(results))
         # msgBox.exec()
         for res in self.results["results"]["bindings"]:
             item = QListWidgetItem()
             item.setData(1, str(res["class"]["value"]))
             if "label" in res:
                 item.setText(
                     str(res["label"]["value"] + " (" +
                         res["class"]["value"] + ")"))
             else:
                 item.setText(str(res["class"]["value"]))
             self.searchResult.addItem(item)
     else:
         myResponse = json.loads(requests.get(self.query).text)
         self.qids = []
         for ent in myResponse["search"]:
             qid = ent["concepturi"]
             if "http://www.wikidata.org/entity/" in qid and self.findProperty.isChecked(
             ):
                 qid = "http://www.wikidata.org/prop/direct/" + ent["id"]
             elif "http://www.wikidata.org/wiki/" in qid and self.findConcept.isChecked(
             ):
                 qid = "http://www.wikidata.org/entity/" + ent["id"]
             self.qids.append(qid)
             label = ent["label"] + " (" + ent["id"] + ") "
             if "description" in ent:
                 label += "[" + ent["description"] + "]"
             self.results[qid] = label
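Several of the QGIS snippets (#1, #5, #6, #27, #30) wrap the same SPARQLWrapper pattern: set the query, request JSON, convert, then iterate over the bindings. A stripped-down standalone sketch of that pattern, using the public Wikidata endpoint as a stand-in for self.triplestoreurl:

# Minimal sketch of the SPARQLWrapper pattern used in the QGIS examples.
# Endpoint and query are stand-ins, not taken from the plugin configuration.
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://query.wikidata.org/sparql",
                       agent="Mozilla/5.0 (example agent)")
sparql.setQuery("SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 5")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
for binding in results["results"]["bindings"]:
    print(binding["s"]["value"], binding["o"]["value"])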