Code Example #1
def getHttpStatusCode2(url, result):
    '''
    Get the HTTP status code for a URL, following redirects.
    '''
    Logger.debug(u'---- getHttpStatusCode2 ---- BEGIN ----')
    Logger.debug(u'---- params1:url:%s ----' % url)
    status = ''
    try:
        r = http.urlopen(method='GET',
                         url=url,
                         timeout=10,
                         retries=False,
                         redirect=False)
        status = r.status
        result.append([url, status])
        Logger.info("%s : %s" % (url, status))
        host = urllib3.get_host(url)
        if status in (301, 302):
            redirect = r.get_redirect_location()
            if urllib3.get_host(redirect)[1] is None:
                # Relative redirect: rebuild an absolute URL from the original host
                redirect = host[0] + '://' + host[1] + '/' + redirect
            Logger.debug(u'redirect url: %s' % redirect)
            if redirect == url:
                # The URL redirects to itself; stop here
                pass
            else:
                return getHttpStatusCode2(redirect, result)
    except urllib3.exceptions.MaxRetryError:
        Logger.debug(u'---- return ----')
        result.append([url, 'invalid link'])
        return 'invalid link'
    except urllib3.exceptions.ConnectTimeoutError:
        # Connection timed out
        Logger.debug(u'---- return ----')
        result.append([url, 'connection timeout'])
        return 'connection timeout'
    except urllib3.exceptions.SSLError:
        # SSL error
        Logger.debug(u'---- return ----')
        result.append([url, 'SSLError'])
        return 'SSLError'
    except (ConnectionResetError, urllib3.exceptions.ProtocolError):
        raise
    else:
        Logger.debug(u'---- return1:%s ----' % status)
        Logger.debug(u'---- getHttpStatusCode2 ---- END ----')
        return status
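A minimal usage sketch, assuming (as in the code above) that http is a shared urllib3.PoolManager() and Logger is an already-configured logger:

import urllib3

http = urllib3.PoolManager()
result = []
final_status = getHttpStatusCode2('http://example.com', result)
# result accumulates a [url, status] pair for every hop in the redirect chain
print(final_status, result)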
Code Example #2
File: http.py  Project: flow90/python-billomat
    def __init__(
        self,
        billomat_id,
        billomat_api_key,
        billomat_app_id=None,
        billomat_app_secret=None,
    ):

        # Base URL
        self.url = "https://{billomat_id}.billomat.net/".format(
            billomat_id=billomat_id
        )

        # Headers
        self.headers = {
            "X-BillomatApiKey": billomat_api_key,
            "Content-Type": "application/xml",
        }
        if billomat_app_id:
            self.headers["X-AppId"] = billomat_app_id
        if billomat_app_secret:
            self.headers["X-AppSecret"] = billomat_app_secret

        # Bind Urlfetch for Google App Engine
        self.urlfetch = urlfetch

        # Initialize Urllib3-ConnectionPool
        if urllib3:
            scheme, host, port = urllib3.get_host(self.url)
            self.conn = urllib3.HTTPSConnectionPool(
                host=host, port=port
            )
        else:
            self.conn = None
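For reference, urllib3.get_host splits the base URL into a (scheme, host, port) tuple, which is what keys the connection pool above to the account's subdomain. A quick check, with "mycompany" standing in for a real Billomat account id:

import urllib3

print(urllib3.get_host("https://mycompany.billomat.net/"))
# ('https', 'mycompany.billomat.net', None); HTTPSConnectionPool then defaults the port to 443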
Code Example #3
def fetch_all_album_in_page() -> list:
    url = "http://www.gqxzt.com/gaoqingr/qingzxies/xinyanxiaogongzhu/1.html"
    resp = requests.get(url)
    resp.encoding = "gb2312"
    bs = BeautifulSoup(resp.text, "html.parser")

    album_list = bs.select("#list")[0].find_all("li")

    scheme, hostname, _port = urllib3.get_host(url)
    host = scheme + "://" + hostname
    print(f"host: {host}")

    album_info_list = []
    for album in album_list:
        title = album.a.get("title")
        link = host + album.a.get("href")
        album_info_list.append((title, link))

    print(album_info_list)
    return album_info_list
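For the album page above, urllib3.get_host(url) parses to ('http', 'www.gqxzt.com', None), so host becomes 'http://www.gqxzt.com' and each relative href is joined onto it.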
Code Example #4
    def school(self, response):
        current = response.meta['current']
        for schoolInfo in response.xpath(
                '//div[contains(@class, "mar_t_30 overhidden")]'):
            self.schoolId += 1

            # Collect the school's basic information
            schoolItem = SchoolItem()
            schoolItem['id'] = self.schoolId
            schoolItem['type'] = 'school'
            schoolItem['name'] = schoolInfo.xpath(
                './following-sibling::div[@class="p_bor"]/a/text()'
            ).extract_first()
            schoolItem['address'] = schoolInfo.xpath(
                './descendant::tr[1]/td[1]/text()').extract_first()
            schoolItem['url'] = schoolInfo.xpath(
                './descendant::tr[1]/td[2]/a/@href').extract_first()
            if schoolInfo.xpath('./descendant::tr[2]/td[1]/text()'
                                ).extract_first().split(':')[1]:
                schoolItem['schoolType'] = schoolInfo.xpath(
                    './descendant::tr[2]/td[1]/text()').extract_first().split(
                        ':')[1].strip()
            schoolItem['phone'] = schoolInfo.xpath(
                './descendant::tr[2]/td[2]/text()').extract_first().split(
                    ':')[1].strip()
            schoolItem['top'] = schoolInfo.xpath(
                './descendant::tr[3]/td[1]/text()').extract_first().split(
                    ':')[1].strip()
            yield schoolItem

            # The school's images
            imageItem = ImageItem()
            imageItem['id'] = self.schoolId
            imageItem['type'] = 'image'
            imageItem['image_urls'] = schoolInfo.xpath(
                './descendant::img/@src').extract()
            imageItem['image_paths'] = ''
            yield imageItem

            middleItem = CityToSchoolItem()
            middleItem['type'] = 'middle'
            middleItem['provinceId'] = current['ppid']
            middleItem['cityId'] = current['pid']
            middleItem['townId'] = current['id']
            middleItem['schoolId'] = self.schoolId
            yield middleItem

        # Crawl the pagination
        host = urllib3.get_host(response.url)
        nextPage = response.xpath("//div[@id='pagenav']/ul/li[3]/a/@href").extract_first()
        if nextPage:
            nextUrl = host[0] + '://' + host[1] + nextPage
            request = scrapy.Request(nextUrl, callback=self.school)
            request.meta['current'] = current
            yield request
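A more robust sketch for building the next-page URL is Scrapy's own response.urljoin, which resolves both relative and absolute hrefs against response.url:

            nextUrl = response.urljoin(nextPage)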
Code Example #5
File: SSLMiddleware.py  Project: Dogechi/Me2U
    def _redirect(self, request, secure):
        protocol = "https" if secure else "http"

        newurl = "%s://%s%s" % (protocol, get_host(request),
                                request.get_full_path())

        if settings.DEBUG and request.method == 'POST':
            raise RuntimeError(
                """Django can't perform a SSL redirect while maintaining POST data.            
            Please structure your views so that redirects only occur during GETs."""
            )
        return HttpResponsePermanentRedirect(newurl)
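In current Django versions the standalone get_host helper used here is exposed as a method on the request object, so the same line can be written as:

        newurl = "%s://%s%s" % (protocol, request.get_host(), request.get_full_path())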
Code Example #6
File: bing.py  Project: Mark-Shine/station
def ViewResult(Keyword, curIP, nType, bSave, _AccountKey=None):
    conn = MySQLdb.connect(**ConfigFile)
    #conn.text_factory=str
    ResData = BingSearch(Keyword, _AccountKey)
    JsonData = {}
    try:
        JsonData = json.loads(ResData)
    except ValueError:
        pass
    url = []
    for key, value in JsonData.items():
        for key1, value1 in value.items():
            for lv in value1:
                try:
                    if nType == 0:
                        uu = lv['Url']
                        i = uu.find('/', 8)
                        url.append(uu[0:i])
                    elif nType == 1:
                        print(lv['Url'])
                    elif nType == 2:
                        print('%s -> %s' % (lv['Url'], lv['Title']))
                    if bSave:
                        host = urllib3.get_host(lv['Url'])[1]
                        newip = getIp(host)
                        if newip == curIP:
                            d_query_set = Data.objects.filter(ip=curIP).filter(
                                uri=host)
                            if d_query_set.exists():
                                print("update 2 ip-host : %s->%s" %
                                      (newip, host))
                                # Same IP: if the domain changed, update the record in place
                                d_query = d_query_set[0]
                                d_query.uri = host
                                d_query.title = lv['Title']
                                d_query.descript = lv['Description']
                                d_query.state = '6'  # domain updated
                                d_query.save()
                            else:
                                print("save to database %s" % host)
                                # Parameterized query avoids SQL injection from page content
                                szSQL = ("INSERT INTO Data (IP, URI, Title, Descript) "
                                         "VALUES (%s, %s, %s, %s)")
                                cur = conn.cursor()
                                cur.execute(szSQL, (curIP, host, lv['Title'],
                                                    lv['Description']))
                                conn.commit()
                        else:
                            print("fake curip")
                except Exception as e:
                    print(e)
                finally:
                    pass
Code Example #7
File: new_bing.py  Project: Mark-Shine/station
def ViewResult(data):
    JsonData = {}

    for r in data:
        ResData = r.get("ResData", "")
        curIP = r.get("curIP", "")
        updated = r.get("updated", "")
        data_query = Data.objects.filter(ip=curIP)

        if not updated:
            # Skip this record when the cached snapshot shows no changes
            print("nothing changed compared with cached page")
            logger.info("nothing changed compared with cached page")
            continue
        try:
            JsonData = json.loads(ResData)
        except ValueError:
            pass
        print("[%s]" % curIP)
        logger.info("[%s]" % curIP)
        url = []
        for key, value in JsonData.items():
            for key1, value1 in value.items():
                for lv in value1:
                    try:
                        print(lv['Url'])
                        logger.info(lv['Url'])
                        host = urllib3.get_host(lv['Url'])[1]
                        d_query_set = data_query.filter(uri=host)
                        if d_query_set.exists():
                            print("update 2 ip-host : %s->%s" % (curIP, host))
                            # Same IP: if the domain changed, update the record in place
                            d_query = d_query_set[0]
                            d_query.uri = host
                            d_query.title = lv['Title']
                            d_query.descript = lv['Description']
                            d_query.state = '6'  # domain updated
                            d_query.save()
                        else:
                            print "save to database %s" % host
                            now = datetime.datetime.now()
                            Data.objects.create(ip=curIP,
                                                uri=host,
                                                title=lv['Title'],
                                                descript=lv['Description'],
                                                time=now)
                    except Exception as e:
                        print(e)

                    finally:
                        pass
Code Example #8
File: url_convertor.py  Project: NMelis/get_text
def url_query_convert_to_path(url, file_type):
    """Конвертация урла на path.
    Пример: http://lenta.ru/news/2013/03/dtp/index.html =>
            [CUR_DIR]/lenta.ru/news/2013/03/dtp/index.txt
    """
    host = get_host(url)[1]
    query = urlparse(url).path
    query = query.strip('/')
    query = '/'.join(query.split('/')[0:-1])
    file_name = list(filter(None, urlparse(url).path.split('/')))[-1]

    file_name = os.path.splitext(file_name)[0]
    return "{host}/{query}".format(host=host, query=query), \
           "{file_name}.{file_type}".format(
               file_name=file_name, file_type=file_type)
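Tracing the docstring's example: get_host(url)[1] yields 'lenta.ru', the path reduces to 'news/2013/03/dtp', and the file name 'index.html' gets the new suffix, so the call returns:

# url_query_convert_to_path("http://lenta.ru/news/2013/03/dtp/index.html", "txt")
# -> ("lenta.ru/news/2013/03/dtp", "index.txt")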
Code Example #9
File: urlcanon.py  Project: Dogechi/Me2U
    def process_view(self, request, view_func, view_args, view_kwargs):
        # Only perform the redirect when not in debug mode
        if not settings.DEBUG:
            protocol = 'https://' if request.is_secure() else 'http://'
            host = get_host(request)
            print('host:', host)
            new_url = ''
            try:
                if host in settings.CANON_URLS_TO_REWRITE:
                    new_url = protocol + settings.CANON_URL_HOST + request.get_full_path()
            except AttributeError:
                if host != settings.CANON_URL_HOST:
                    new_url = protocol + settings.CANON_URL_HOST + request.get_full_path()

            if new_url:
                return HttpResponsePermanentRedirect(new_url)
Code Example #10
File: http.py  Project: alex1989/python-billomat
    def __init__(
            self,
            billomat_id,
            billomat_api_key,
            billomat_app_id=None,
            billomat_app_secret=None,
            timeout_seconds=600  # 10 Minutes
    ):

        self.timeout_seconds = timeout_seconds

        # Base URL
        self.url = "https://{billomat_id}.billomat.net/".format(
            billomat_id=billomat_id)

        # Headers
        self.headers = {
            "X-BillomatApiKey": billomat_api_key,
            "Content-Type": "application/xml",
        }
        if billomat_app_id:
            self.headers["X-AppId"] = billomat_app_id
        if billomat_app_secret:
            self.headers["X-AppSecret"] = billomat_app_secret

        # Bind Urlfetch for Google App Engine
        self.urlfetch = urlfetch

        # Initialize Urllib3-ConnectionPool
        if urllib3:
            scheme, host, port = urllib3.get_host(self.url)
            import certifi

            self.conn = urllib3.HTTPSConnectionPool(
                host=host,
                port=port,
                timeout=self.timeout_seconds,
                cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where())

        else:
            self.conn = None
Code Example #11
def parse_uri(url, path):
    logging.getLogger("urllib3").setLevel(logging.INFO)
    host = urllib3.get_host(url)
    logger.info(host)
    if host[0] == 'http':
        pool = urllib3.HTTPConnectionPool(host=host[1], port=host[2], maxsize=10)
    elif host[0] == 'https':
        pool = urllib3.HTTPSConnectionPool(host=host[1], port=host[2], maxsize=10)
    else:
        raise ValueError("Unsupported scheme: %s" % host[0])

    req = pool.request("GET", url, timeout=2.5)
    if req.status == 200:
        logger.info("Make directory %s" % path)
        try:
            os.mkdir(os.path.join(os.path.curdir, path))
        except Exception as e:
            logger.warning(str(e))
        parse_body(req.data, url, path)
    else:
        logger.error("Error while loading page: %s" % str(req.status))
Code Example #12
    def get_and_test(self, url: str = 'https://baidu.com', num: int = 3, timeout=1):
        # Only test whether the domain itself is reachable
        scheme, hostname = urllib3.get_host(url)[:2]
        url = f'{scheme}://{hostname}'
        proxy_list, proxy_type = [], scheme
        while num > 0:
            proxy_ip = self.get(proxy_type)
            if not proxy_ip:
                print('proxy pool is still warming up', proxy_ip)
                time.sleep(3)
                continue
            try:
                self.log('got proxy ip:', proxy_ip, proxy_type)
                self.log(f'testing the proxy against {url}')
                get(url, proxies=f'{proxy_type}://{proxy_ip}', timeout=timeout, verify=False)
                num -= 1
                proxy_list.append(f'{proxy_type}://{proxy_ip}')
            except Exception as e:
                self.log(f'proxy {proxy_ip} failed: {e}')
                self.delete(proxy_ip)
        self.log('finished fetching proxies:', proxy_list)
        return proxy_list
Code Example #13
File: find_whois.py  Project: guokai27/Tools
        '.net.bz', '.cc', '.com.co', '.net.co', '.nom.co', '.de', '.es',
        '.com.es', '.nom.es', '.org.es', '.eu', '.fm', '.fr', '.gs', '.in',
        '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in', '.org.in',
        '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl', '.nu', '.co.nz',
        '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw',
        '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg'
    ]
    # Make sure domains with two-part suffixes are matched first
    shuffix_lst = sorted(shuffix_lst, key=len, reverse=True)
    # Use the current timestamp to name the output directory
    time_stamp = str(time.time())[:10]
    os.mkdir('domains_{}'.format(time_stamp))
    for line in d:
        name, url = line.split()
        # Parse the hostname out of the URL
        url = urllib3.get_host(url)[1]
        # Reduce to the main (registrable) domain
        for shuffix in shuffix_lst:
            if shuffix in url:
                url = url.split(shuffix)[0].split('.')[-1] + shuffix
                break
        print(url)

        try:
            res = pythonwhois.get_whois(url)
        except:
            res = '{}: whois lookup failed'.format(url)
        with open('domains_{}/{}-{}.txt'.format(time_stamp, name, url),
                  'w') as f:
            json.dump(res, f, ensure_ascii=False, cls=DateEncoder, indent=4)
Code Example #14
File: get_article.py  Project: NMelis/get_text
 def __init__(self, html_text, url, settings=None):
     self._url = url
     self.config = Config(get_host(url)[1], settings)
     self.soup = BeautifulSoup(html_text, features='html.parser')
Code Example #15
import urllib3

host = urllib3.get_host("https://baidu.com")
print(host)
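# Output: ('https', 'baidu.com', None); the port is None when the URL does not specify one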
Code Example #16
File: dummy_server.py  Project: berg/urllib3
        time.sleep(0.1)

    @classmethod
    def tearDownClass(cls):
        import urllib # Yup, that's right.
        try:
            urllib.urlopen(cls.scheme + '://' + cls.host + ':' + str(cls.port) + '/shutdown')
        except IOError:
            pass
        cls.server_thread.join()


class HTTPSDummyServerTestCase(HTTPDummyServerTestCase):
    scheme = 'https'
    host = 'localhost'
    port = 18082


if __name__ == '__main__':
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stderr))

    from urllib3 import get_host

    url = "http://localhost:8081"
    if len(sys.argv) > 1:
        url = sys.argv[1]

    scheme, host, port = get_host(url)
    make_server(scheme=scheme, host=host, port=port)
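For the default URL above, get_host returns exactly the tuple that make_server unpacks:

# get_host("http://localhost:8081") -> ('http', 'localhost', 8081)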
Code Example #17
File: new_bing.py  Project: Mark-Shine/station
     # Skip when the cached snapshot shows no changes
     print("nothing changed compared with cached page")
     return
 try:
     JsonData = json.loads(ResData)
 except Exception as e:
     print(e)
 print("[%s]" % curIP)
 url = []
 for key, value in JsonData.items():
     for key1, value1 in value.items():
         for lv in value1:
             try:
                 print(lv['Url'])
                 host = urllib3.get_host(lv['Url'])[1]
                 # newip = getIp(host)
                 # if newip == curIP:
                 d_query_set = data_query.filter(uri=host)
                 if d_query_set.exists():
                     print("update 2 ip-host : %s->%s" % (curIP, host))
                     # Same IP: if the domain changed, update the record in place
                     d_query = d_query_set[0]
                     d_query.uri = host
                     d_query.title = lv['Title']
                     d_query.descript = lv['Description']
                     d_query.state = '6'  # domain updated
                     d_query.save()
                 else:
                     print "save to database %s" % host
                     Data.objects.create(ip=curIP,
Code Example #18
#coding:utf-8
from tkinter import *
from tkinter.scrolledtext import ScrolledText
import threading
import time
import queue
import urllib3
import requests

URL = "http://localhost:8099/MeetingMag/web/MeetMag/menu.htm"

li = list()
host = urllib3.get_host(URL)
li.append("host:" + str(host))
# Fetch the page once and reuse the same response for every field
resp = requests.get(URL)
li.append("status:" + str(resp.status_code))
li.append("charset:" + str(resp.encoding))
li.append("cookies:" + str(resp.cookies))
li.append("headers:" + str(resp.headers))
li.append("url:" + str(resp.url))
li.append("codes:" + str(requests.codes))
li.append("history:" + str(resp.history))
Code Example #19
import urllib3
from bs4 import BeautifulSoup

quote_page = 'https://www.bloomberg.com/quote/SPX:IND'

# get_host() only parses the URL; fetch the page body with a PoolManager
http = urllib3.PoolManager()
page = http.request('GET', quote_page)

soup = BeautifulSoup(page.data, 'html.parser')
name_box = soup.find('h1', attrs={'class': 'companyName__99a4824b'})
name = name_box.text.strip()  # strip() removes leading and trailing whitespace
print(name)
Code Example #20
File: rest1.py  Project: kiran009/MySampleProject
import urllib3

url = 'http://www.acme.com/products/3322'
# get_host() parses the URL into (scheme, host, port); it does not fetch it
host_parts = urllib3.get_host(url)
print("The parsed host is:", host_parts)
Code Example #21

def make_server(**kw):
    try:
        return eventlet_server(**kw)
    except ImportError:
        return simple_server(**kw)


def make_server_thread(target, **kw):
    import threading
    t = threading.Thread(target=target, kwargs=kw)
    t.start()
    return t


if __name__ == '__main__':
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stderr))

    from urllib3 import get_host

    url = "http://localhost:8081"
    if len(sys.argv) > 1:
        url = sys.argv[1]

    print "Starting server at: %s" % url

    scheme, host, port = get_host(url)
    make_server(scheme=scheme, host=host, port=port)
Code Example #22
#! /usr/bin/env python
# -*- coding: utf-8 -*-

#    File Name:       webpage
#    Description :
#    Author :          SanYapeng
#    date:            2019/2/2
#    Change Activity:  2019/2/2:

import urllib3

url = "http://www.baidu.com"
# get_host() returns the parsed (scheme, host, port) tuple, e.g. ('http', 'www.baidu.com', None)
host_parts = urllib3.get_host(url)
print(host_parts)
Code Example #23
def download_mplce_url(urldest_tuple):
    import requests, re, urllib, urllib2, urllib3, OpenSSL, subprocess
    from os import path

    import urllib3.contrib.pyopenssl
    urllib3.contrib.pyopenssl.inject_into_urllib3()
    urllib3.disable_warnings()

    countimage = 0
    countstyle = 0
    image_url, destpath = urldest_tuple
    destdir = path.dirname(destpath)
    colorstyle = destpath.split('/')[-1][:9]
    alt_number = destpath.split('_')[-1][0]
    try:
        image_url = 'https://www.drop'.join(image_url.split('https://wwwop'))
        image_url = urllib.unquote_plus(image_url)
    except:
        pass
    ########################################################
    ##################  REGEX Filters Defined ##############
    ## Image URL Cleanup and Replace Extraneous/Bad Chars ##
    ########################################################
    ####### Dropbox Fix for View vs DL value ###############
    regex_dbx = re.compile(r'^https://www.dropbox.com/.+?\.[jpngJPNG]{3}$')
    regex_dbxprev = re.compile(r'^https://www.dropbox.com/.+?preview.*\.[jpngJPNG]{3}$')
    for bad_flag in ('dl=0', 'dl=2', 'dl=3', 'dl=4', 'dl=5', 'dl=6', 'dl=7', 'dl=8', 'dl=9'):
        image_url = image_url.replace(bad_flag, 'dl=1')
    regex_dl = re.compile(r'^.+\?dl=1.*?$')
    if regex_dbx.findall(image_url):
        if regex_dbxprev.findall(image_url):
            print 'REGEX DBXPRE'
            # import http_tools.auth.Dropbox.dropboxapi_service as dropboxapi_service
            final_path = image_url  # dropboxapi_service.download_auth_file(image_url=image_url, destpath=destpath)

            if final_path:
                print final_path, 'Final DBX Path'
                image_url = final_path
            else:
                pass
        else:
            image_url = image_url.replace('.JPG', '.jpg')
            image_url = image_url.replace('.PNG', '.png')
            print 'REGEX DBX dl=1'
        if not regex_dl.findall(image_url):
            image_url = image_url + '&dl=1'

    regex_validurl = re.compile(r'^http[s]?://.+?$', re.U)
    regex_ftpurl = re.compile(r'^ftp[s]?://.+?$', re.U)
    regex_drive2 = re.compile(r'^(https://d(.+?)\.google\.com/).*\?id\=(?P<fileId>.+?)\&?.*?$', re.U)
    regex_drive3 = re.compile(r'^(https://d(.+?)\.google\.com/file/d/)(?P<fileId>.+?)/(edit|view)\?usp\=.*?$', re.U)

    # regex_dropbox = re.compile(r'^https?://www.dropbox.com/.+?\.[jpngJPNG]{3}$')
    ######################
    #### BOX API AUTH #### removed
    # regex_boxapi  = re.compile(r'^(https?)?(?:\://)?(?P<VENDER_ROOT>.*)?(.*?)\.box\.com/(s/)?(?P<SHARED_LINK_ID>.+)?/?(\.?[jpngJPNG]{3,4})?(.*?)?\??(.*?)?$', re.U)
    ########################
    #### DRIVE API AUTH ####
    # if regex_drive2.findall(image_url):
    ########################################################
    ########################################################
    ########################################################
    import urllib3
    print ' 404 - 1 - Trying Urllib3 ', image_url
    hostname = urllib3.get_host(image_url)[1]
    ####################################################################################################
    ####################################################################################################
    ####
    #################
    ########### TEMPORARY RESTRICT URL from Merchantry Placeholder ###########
    old_blue_sweater_placeholder_url = 'https://pim-image-4b2a111d-0df4-447e-bc31-d288a9-s3bucket-1in4kl2m21ire.s3.amazonaws.com/7274a8bcbb3fa06076da6e1c2950e617b24baf4f5d467def6c88f276a0e15f12.jpg'
    placeholder_url = 'https://pim-image-4b2a111d-0df4-447e-bc31-d288a9-s3bucket-1in4kl2m21ire.s3.amazonaws.com/5d167c72a8100a911f06940395589769857cbd395e452bf21416503d1c43861c.jpg'
    if image_url == placeholder_url or image_url == old_blue_sweater_placeholder_url:
        print 'RESTRICTING DOWNLOAD of Merchantry Placeholder image.--> {}'.format(image_url)
        return
    ######################### END ###### Adjust below elif to initial if once removed above #############
    #################
    ####
    ####################################################################################################
    ####################################################################################################
    elif regex_drive3.findall(image_url):
        image_url = drive_match_fileid(image_url)
        print image_url, ' DRIVE3 --ID--> '
        import http_tools.auth.Google.google_drive_auth_downloader as google_drive_auth_downloader
        try:
            final_path = google_drive_auth_downloader.download_google_drive_file(image_url=image_url, destpath=destpath)
            if final_path:
                return final_path
            else:
                print 'Final DRIVE Failure ', destpath, '\n', image_url
        except IndexError:
            print 'Final DRIVE Exception ', destpath, '\n', image_url
        ## Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass

    elif regex_drive2.findall(image_url):
        print image_url, ' DRIVE'
        # import jbmodules
        # from jbmodules
        import http_tools.auth.Google.google_drive_auth_downloader as google_drive_auth_downloader
        try:
            final_path = google_drive_auth_downloader.download_google_drive_file(image_url=image_url, destpath=destpath)
            if final_path:
                return final_path
            else:
                print 'Final DRIVE Failure ', destpath, '\n', image_url
        except IndexError:
            print 'Final DRIVE Exception ', destpath, '\n', image_url
            # return
            ## Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass

    elif regex_ftpurl.findall(image_url):
        print image_url, ' FTP--FTP\n...probably Jaipur...'
        from http_tools.ftp_functions import pycurl_ftp_download
        import pycurl
        # from jbmodules
        try:
            res = pycurl_ftp_download(imageurl=image_url, destpath=destpath)
            print 'FTEPPEE --> {}\n{}\t\n'.format(res, image_url, destpath)
            return destpath
        except pycurl.error, error:
            print 'Pycurl error in FTP Download --> ', error
        ## Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass
Code Example #24
import os
import argparse
import time
import urllib3
from logger import Logger
from pathlib import Path
from http_helper import HttpHelper
from html_parser import DictHtmlParser

BASE_URL = 'https://www.spanishdict.com'
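# get_host() returns (scheme, host, port); [1] keeps only the hostname, 'www.spanishdict.com'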
HOST = urllib3.get_host(BASE_URL)[1]
SEARCH_URL = '{0}/translate/'.format(BASE_URL)
CONJUGATE_URL = '{0}/conjugate'.format(BASE_URL)
PROJECT_FILEPATH = os.getcwd()
DOWNLOAD_FILEPATH = '{0}/downloads'.format(PROJECT_FILEPATH)
JSON_DOWNLOAD_FILEPATH = '{0}/json'.format(DOWNLOAD_FILEPATH)
HTML_TRANSLATION_DOWNLOAD_FILEPATH = '{0}/translate'.format(DOWNLOAD_FILEPATH)
HTML_CONJUGATION_DOWNLOAD_FILEPATH = '{0}/conjugate'.format(DOWNLOAD_FILEPATH)
LOG_FILEPATH = "{0}/logs".format(PROJECT_FILEPATH)
JSON_HISTORY_LOG = '{0}/.json_history'.format(LOG_FILEPATH)
HTML_HISTORY_LOG = '{0}/.history'.format(LOG_FILEPATH)
ERROR_LOG = '{0}/.error_log'.format(LOG_FILEPATH)


def create_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--scrape', help='Scrape words')
    parser.add_argument(
        '-p', '--parse', help='Parse dictionary entries from downloaded html pages')
    parser.add_argument('-d', '--define', help='Define word')
    parser.add_argument('-i', '--input', help='Input file containing list of words to define/conjugate.')