Example #1
 def __init__(self, url):
     self._url = url
     self._opener = urllib2.build_opener(
         CustomHTTPErrorHandler(),
         urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
     self._xsrf_token = None
Example #2
#!/usr/bin/env python
import urllib
import urllib2
import cookielib

filename = 'login.txt'
cookie = cookielib.MozillaCookieJar()
cookie.load(filename,ignore_discard=True,ignore_expires=True)


opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))

postdata = urllib.urlencode({
   'user':'******',
   'pwd':'123456'
})

loginUrl = "http://192.168.63.241:8000/cookie.php"
result = opener.open(loginUrl,postdata)
print result.read()
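
# The script above assumes 'login.txt' already exists in Mozilla/Netscape cookie
# format. A minimal sketch (not part of the original) of producing that file with
# the same cookielib API, reusing the URL from this example:
save_jar = cookielib.MozillaCookieJar('login.txt')
save_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(save_jar))
save_opener.open("http://192.168.63.241:8000/cookie.php")  # any response that sets cookies
save_jar.save(ignore_discard=True, ignore_expires=True)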

Example #3
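# The listing below begins mid-class; a plausible opening for the debug handler it
# belongs to (an assumption, not part of the source) could look like this:
import sys
import urllib2
import cookielib


class HTTPMyDebugProcessor(urllib2.BaseHandler):
    # stream that request/response details are written to
    httpout = sys.stdout

    def http_request(self, request):
        if __debug__:
            self.httpout.write("%s %s\n" % (request.get_method(),
                                            request.get_full_url()))
            for header, value in request.header_items():
                self.httpout.write("%s: %s\n" % (header, value))
            self.httpout.write("\n")
        return request
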
    def http_response(self, request, response):
        if __debug__:
            code, msg, hdrs = response.code, response.msg, response.info()
            self.httpout.write("HTTP/1.x %s %s\n" % (code, msg))
            self.httpout.write(str(hdrs))

        return response

    https_request = http_request
    https_response = http_response


# Example
cjar = cookielib.LWPCookieJar()
opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cjar),
    HTTPMyDebugProcessor(),
)
#opener = urllib2.build_opener(HTTPMyDebugProcessor(),)
urllib2.install_opener(opener)
##response = urllib2.urlopen("http://www.google.com")
#response = urllib2.urlopen("https://www.idcourts.us/repository/start.do")
#response = urllib2.urlopen("https://www.idcourts.us/repository/searchParty.do")
req = urllib2.Request(
    'http://www.microsoft.com/windows/windows-7/default.aspx')
#req = urllib2.Request('https://www.idcourts.us/repository/start.do')
res = opener.open(req)

print cjar
for c in cjar:
    cookie_str = "%s=%s" % (c.name, c.value)
Example #4
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False):  #0,1,2 = URL, regexOnly, CookieJarOnly

    cachedPages = {}
    #print 'url',url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    print 'doRegexs', doRegexs, regexs

    for rege in doRegexs:
        k = regexs.find("regex", {"name": rege})
        if not k == None:
            cookieJarParam = False
            if k.cookiejar:
                cookieJarParam = k.cookiejar.text
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, cookieJarParam,
                                               cookieJar, True, True)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            if cookieJarParam:
                if cookieJar == None:
                    #print 'create cookie jar'
                    import cookielib
                    cookieJar = cookielib.LWPCookieJar()
                    #print 'cookieJar new',cookieJar
            page = k.page.text
            if '$doregex' in page:
                page = getRegexParsed(regexs,
                                      page,
                                      cookieJar,
                                      recursiveCall=True)

            postInput = None
            if k.post:
                postInput = k.post.text
                if '$doregex' in postInput:
                    postInput = getRegexParsed(regexs,
                                               postInput,
                                               cookieJar,
                                               recursiveCall=True)
                print 'post is now', postInput

            if page in cachedPages:
                link = cachedPages[page]
            else:
                #print 'Ignoring Cache',m['page']
                req = urllib2.Request(page)
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                )
                if k.refer:
                    req.add_header('Referer', k.refer.text)
                if k.agent:
                    req.add_header('User-agent', k.agent.text)

                if not cookieJar == None:
                    #print 'cookieJarVal',cookieJar
                    cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                    opener = urllib2.build_opener(
                        cookie_handler, urllib2.HTTPBasicAuthHandler(),
                        urllib2.HTTPHandler())
                    opener = urllib2.install_opener(opener)
                #print 'after cookie jar'

                post = None
                if postInput:
                    postData = postInput
                    splitpost = postData.split(',')
                    post = {}
                    for p in splitpost:
                        n = p.split(':')[0]
                        v = p.split(':')[1]
                        post[n] = v
                    post = urllib.urlencode(post)

                if post:
                    response = urllib2.urlopen(req, post)
                else:
                    response = urllib2.urlopen(req)

                link = response.read()

                response.close()
                cachedPages[page] = link
                if forCookieJarOnly:
                    return cookieJar  # do nothing
            print 'link', link
            print k.expres.text
            reg = re.compile(k.expres.text).search(link)

            url = url.replace("$doregex[" + rege + "]", reg.group(1).strip())
            if recursiveCall: return url
    print 'final url', url
    return url
Example #5
# -*- coding=utf-8 -*-

import urllib
import urllib2
import cookielib

# Build a CookieJar() object with the CookieJar() class; it is used to store cookie values.
cookie = cookielib.CookieJar()

# Build a handler object with the HTTPCookieProcessor() handler class to handle cookies;
# its argument is the CookieJar() object built above
cookie_handler = urllib2.HTTPCookieProcessor(cookie)

# Build a custom opener
opener = urllib2.build_opener(cookie_handler)

# Set the custom opener's addheaders parameter to add HTTP header fields
opener.addheaders = [(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
)]

# renren.com login endpoint
url = "http://www.renren.com/PLogin.do"

# the account and password used to log in
data = {"email": "*****@*****.**", "password": "******"}

# encode the data with urlencode()
data = urllib.urlencode(data)
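
# The snippet ends here in the original; a hedged sketch of the step it builds
# towards (sending the login POST through the cookie-aware opener):
request = urllib2.Request(url, data=data)
response = opener.open(request)
print response.read()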
Example #6
                batchfd = sys.stdin
            else:
                batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            batchurls = [
                x for x in batchurls
                if len(x) > 0 and not re.search(r'^[#/;]', x)
            ]
        except IOError:
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args
    all_urls = map(lambda url: url.strip(), all_urls)

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    proxy_handler = urllib2.ProxyHandler()
    opener = urllib2.build_opener(proxy_handler, cookie_processor,
                                  YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(
        300)  # 5 minutes should be enough (famous last words)

    extractors = gen_extractors()

    if opts.list_extractors:
        for ie in extractors:
            print(ie.IE_NAME)
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:
Example #7
def opener():
    return urllib2.build_opener(urllib2.HTTPCookieProcessor())
Example #8
 def __init__(self, host, port, io_loop_executor=None):
     self.host = host
     self.port = port
     self.io_loop_executor = io_loop_executor
     self.cookie_jar = cookielib.CookieJar()
     self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookie_jar), SilentHTTPErrorProcessor())
Example #9
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30'):
    try:
        if not url:
            return

        handlers = []

        if not proxy == None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close == True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if (2, 7, 8) < sys.version_info < (2, 7, 12):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                opener = urllib2.install_opener(opener)
            except:
                pass

        if url.startswith('//'): url = 'http:' + url

        _headers = {}
        try:
            _headers.update(headers)
        except:
            pass
        if 'User-Agent' in _headers:
            pass
        elif not mobile == True:
            #headers['User-Agent'] = agent()
            _headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            _headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in _headers:
            pass
        elif referer is not None:
            _headers['Referer'] = referer
        if not 'Accept-Language' in _headers:
            _headers['Accept-Language'] = 'en-US'
        if 'X-Requested-With' in _headers:
            pass
        elif XHR == True:
            _headers['X-Requested-With'] = 'XMLHttpRequest'
        if 'Cookie' in _headers:
            pass
        elif not cookie == None:
            _headers['Cookie'] = cookie
        if 'Accept-Encoding' in _headers:
            pass
        elif compression and limit is None:
            _headers['Accept-Encoding'] = 'gzip'

        if redirect == False:

            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try:
                del _headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            post = utils.byteify(post)
            post = urllib.urlencode(post)

        url = utils.byteify(url)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, _headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:

            if response.code == 503:
                cf_result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    cf_result = gzip.GzipFile(
                        fileobj=StringIO.StringIO(cf_result)).read()

                if 'cf-browser-verification' in cf_result:

                    netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                          urlparse.urlparse(url).netloc)

                    if not netloc.endswith('/'): netloc += '/'

                    ua = _headers['User-Agent']

                    cf = cache.get(cfcookie().get, 168, netloc, ua, timeout)

                    _headers['Cookie'] = cf

                    request = urllib2.Request(url, data=post)
                    _add_request_header(request, _headers)

                    response = urllib2.urlopen(request, timeout=int(timeout))
                else:
                    log_utils.log(
                        'Request-Error (%s): %s' % (str(response.code), url),
                        log_utils.LOGDEBUG)
                    if error == False: return
            else:
                log_utils.log(
                    'Request-Error (%s): %s' % (str(response.code), url),
                    log_utils.LOGDEBUG)
                if error == False: return

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close == True: response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close == True: response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close == True: response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
            if close == True: response.close()
            return result

        if limit == '0':
            result = response.read(224 * 1024)
        elif not limit == None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            _headers['Cookie'] = su

            request = urllib2.Request(url, data=post)
            _add_request_header(request, _headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif not limit == None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(
                    fileobj=StringIO.StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                  urlparse.urlparse(url).netloc)
            ua = _headers['User-Agent']
            _headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                           timeout)

            result = _basic_request(url,
                                    headers=_headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers
            response_code = str(response.code)
            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close == True: response.close()
            return (result, response_code, response_headers, _headers, cookie)
        else:
            if close == True: response.close()
            return result
    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      log_utils.LOGDEBUG)
        return
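
# Hedged usage sketches for request(); the URL is illustrative, and the module
# helpers this function relies on (cache, utils, log_utils and so on) are
# assumed to be importable:
html = request('http://example.com/')                     # plain GET, returns the body
hdrs = request('http://example.com/', output='headers')   # response headers only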
Example #10
 def set_cookiejar(self,cj):
     self.cookiejar = cj
     saveheaders = self.opener.addheaders
     self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
     self.opener.addheaders = saveheaders
Example #11
def requestIPs(host):
    try:
        #cookie
        cookieJar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cookieJar)
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        # server addresses
        urlServer = 'http://tools.fastweb.com.cn/index.php/Index/Mdig'
        urlServerData = 'http://tools.fastweb.com.cn/index.php/Index/sendMdig'
        urlServerResult = 'http://tools.fastweb.com.cn/index.php/Index/getMdigResultOne'
        #header
        headers = {'User-Agent': 'Mozilla/4.0', 'Referer': '******'}

        # open the main page
        urllib2.urlopen(urlServer)

        ## send the PING request

        # POST data
        postData = {
            'query_type': 'A',
            'domain_name': host,
            'city':
            '6,7,8,1,2,3,4,5,15,27,28,29,30,31,22,23,24,25,26,16,17,18,9,10,11,12,13,14,19,20,21,32,33,34,36,37',
            'isp': '1,2,3,5,8,12',
            'rand': '5244'
        }
        postData = urllib.urlencode(postData)

        # POST
        if debug: print 'Submitting request.'
        request = urllib2.Request(urlServerData, postData, headers)
        response = urllib2.urlopen(request)
        responseData = json.loads(response.read())

        if responseData['status'] == 1:
            if debug: print 'Request submitted.'
            task_id = responseData['data']['task_id']
            view_ids = responseData['data']['view_ids']
            _from = responseData['data']['from']
            result_id = 0
            ipList = []
            print 'Waiting for the server to return data.'
            # check the results every 1.5 seconds
            while True:
                time.sleep(1.5)
                # POST data
                postData = {
                    'task_id': task_id,
                    'view_ids': view_ids,
                    'from': _from,
                    'query_type': 'A',
                    'result_id': result_id
                }
                postData = urllib.urlencode(postData)

                # POST
                request = urllib2.Request(urlServerResult, postData, headers)
                response = urllib2.urlopen(request)
                responseData = json.loads(response.read())

                if responseData['status'] == 1 and responseData['info'] == '0':
                    result_id = responseData['data']['result_id']
                    if type(responseData['data']) != dict: continue
                    for x in responseData['data'].values():
                        for y in x:
                            if type(y) != dict: continue
                            if y['type'] == 'a':
                                i = y['result'].index('(')
                                ip = y['result'][:i]
                                if not ip in ipList:
                                    ipList.append(ip)
                    if debug: print 'Partial data received, still waiting. (%d IP entries)' % len(ipList)
                elif responseData['status'] == 0 and responseData[
                        'info'] == '1':
                    print 'IP list fetched successfully.'
                    return ipList
                elif responseData['status'] == 1 and responseData[
                        'info'] == '1':
                    raise Exception('An error occurred while fetching the IP list.')
                else:
                    if debug: print 'Still waiting for the server to return data.'
        else:
            raise Exception(responseData['info'])
    except Exception, e:
        print 'Unable to fetch the IP list'
        raise e
Example #12
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re

# the site's main login page
hosturl = 'https://www.douban.com/'
# the page that receives and processes the POST data (we send our constructed POST data to this page)
posturl = 'https://book.douban.com/'

# set up a cookie handler; it saves cookies received from the server locally and attaches them to later requests
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# open the login host page first (this picks up cookies from the page so the later POST carries a cookie; otherwise the POST fails)
h = urllib2.urlopen(hosturl)

# build the headers; they generally need at least the two fields below, both taken from a captured request.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
    'Referer': '******'
}
# build the POST data, also derived from a captured request.
postData = {
    'op': 'dmlogin',
Example #13
def logintopronto(username, password, debug):
    params = urllib.urlencode({
        'loginUserId': username,
        'authType': 'Pronto',
        'loginPassword': password,
        'submit': 'Login'
    })

    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    puts(colored.white("Contacting ProntoNetworks..."))

    if debug:
        if not os.path.exists('debug/'):
            os.makedirs('debug/')

    with indent(5, quote=">"):
        puts(colored.yellow("Fetching site"))
    mainreq = urllib2.Request(
        BASE_URL + '/registration/Main.jsp?wispId=1&nasId=00:15:17:c8:09:b1')
    mainres = urllib2.urlopen(mainreq)

    if debug:
        with open('debug/main.txt', 'wb') as f:
            f.write(mainres.read())
            f.close()
            with indent(5, quote=colored.white("DEBUG:")):
                puts(colored.red("logged /registration/Main.jsp response"))

    with indent(5, quote=">"):
        puts(colored.yellow("Sending credentials"))
    loginReq = urllib2.Request(BASE_URL + '/registration/chooseAuth.do',
                               params)
    loginRes = urllib2.urlopen(loginReq)

    if debug:
        with open('debug/login.txt', 'wb') as f:
            f.write(loginRes.read())
            f.close()
            with indent(5, quote=colored.white("DEBUG:")):
                puts(
                    colored.red("logged /registration/chooseAuth.do response"))

    with indent(5, quote=">"):
        puts(colored.yellow("Checking plan"))
    planreq = urllib2.Request(
        BASE_URL + '/registration/main.do?content_key=%2FSelectedPlan.jsp')
    planres = urllib2.urlopen(planreq)

    planSoup = BeautifulSoup(planres.read())
    data = planSoup.findAll('td',
                            attrs={
                                'class': 'formFieldRight',
                                'colspan': '2'
                            })
    planDetails = []
    for i in range(0, len(data) - 1):
        kids = data[i].parent.findAll('td')
        planDetails.append(str(kids[1].text))

    if debug:
        with open('debug/plan.txt', 'wb') as f:
            f.write(loginRes.read())
            f.close()
            with indent(5, quote=colored.white("DEBUG:")):
                puts(
                    colored.red(
                        "logged /registration/main.do?content_key=%2FSelectedPlan.jsp response"
                    ))

    sedate = datetime.strptime(planDetails[2], "%m/%d/%Y %H:%M:%S")
    # enddate = datetime.strptime(planDetails[3], "%m/%d/%Y %H:%M:%S")

    cycleStart = getcyclestartdate(sedate)

    historyparams = urllib.urlencode({
        "location": "allLocations",
        "parameter": "custom",
        "customStartMonth": cycleStart['mm'],
        "customStartDay": cycleStart['dd'],
        "customStartYear": cycleStart['yy'],
        "customEndMonth": 04,
        "customEndDay": 01,
        "customEndYear": 2016,  # Lazy, so hardcoding end year.
        "button": "View"
    })
Example #14
def get_cookies():
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
Example #15
class YoujiaoSpider(Spider):
    # name='ry'
    name = 'youjiao'
    download_delay = 5
    allowed_domains = ['youjiao.com']
    # start_urls = ['http://www.youjiao.com/sejy/qnkf/']
    #  start_urls = ['http://www.youjiao.com/etly/kpzs/gxkj/']

    start_urls = [
        'http://www.youjiao.com/ysx/ysx/',
        'http://www.youjiao.com/ysx/zhengce/',
        'http://www.youjiao.com/ysx/zsjz/',
        'http://www.youjiao.com/ysx/zexiao/',
        'http://www.youjiao.com/ysx/zhinan/',
        'http://www.youjiao.com/ysx/xuequfang/',
        'http://www.youjiao.com/ysx/rxcs/',
        'http://www.youjiao.com/ysx/shiti/',
        'http://www.youjiao.com/ysx/xianchang/',
        'http://www.youjiao.com/ysx/yxxj/',
        'http://www.youjiao.com/ysx/jingyan/',
        'http://www.youjiao.com/ysx/mxdt/', 'http://www.youjiao.com/ysx/zdxx/',
        'http://www.youjiao.com/sejy/czrj/',
        'http://www.youjiao.com/sejy/wenti/',
        'http://www.youjiao.com/sejy/jtjy/',
        'http://www.youjiao.com/sejy/qnkf/',
        'http://www.youjiao.com/sejy/yspy/',
        'http://www.youjiao.com/sejy/etxl/',
        'http://www.youjiao.com/sejy/tsdx/',
        'http://www.youjiao.com/sejy/yyxx/',
        'http://www.youjiao.com/sejy/pyxx/',
        'http://www.youjiao.com/sejy/jyzj/',
        'http://www.youjiao.com/sejy/gwjy/',
        'http://www.youjiao.com/sejy/jyxd/',
        'http://www.youjiao.com/sejy/ryzd/',
        'http://www.youjiao.com/sejy/zaojiao/quming/',
        'http://www.youjiao.com/sejy/zaojiao/ceping/',
        'http://www.youjiao.com/sejy/zaojiao/yyxw/',
        'http://www.youjiao.com/sejy/zaojiao/czzb/',
        'http://www.youjiao.com/sejy/zaojiao/zqjy/',
        'http://www.youjiao.com/sejy/zaojiao/xgpy/',
        'http://www.youjiao.com/sejy/zaojiao/qinzijiaoliu/',
        'http://www.youjiao.com/sejy/zaojiao/zjyx/',
        'http://www.youjiao.com/sejy/zaojiao/wanju/',
        'http://www.youjiao.com/sejy/zaojiao/zlkf/',
        'http://www.youjiao.com/sejy/taijiao/tjff/',
        'http://www.youjiao.com/sejy/taijiao/tjxd/',
        'http://www.youjiao.com/sejy/taijiao/yuyan/',
        'http://www.youjiao.com/sejy/taijiao/ydtj/',
        'http://www.youjiao.com/sejy/taijiao/fmtj/',
        'http://www.youjiao.com/sejy/taijiao/yinyue/',
        'http://www.youjiao.com/sejy/taijiao/tjgs/',
        'http://www.youjiao.com/sejy/taijiao/tjyy/',
        'http://www.youjiao.com/sejy/taijiao/mrtj/',
        'http://www.youjiao.com/sejy/taijiao/yytj/',
        'http://www.youjiao.com/etly/zlyx/',
        'http://www.youjiao.com/etly/etgq/',
        'http://www.youjiao.com/etly/etgs/',
        'http://www.youjiao.com/etly/seyy/',
        'http://www.youjiao.com/etly/gushi/',
        'http://www.youjiao.com/etly/lianxi/',
        'http://www.youjiao.com/etly/erge/',
        'http://www.youjiao.com/etly/youxi/',
        'http://www.youjiao.com/etly/etdw/',
        'http://www.youjiao.com/etly/mhsj/',
        'http://www.youjiao.com/etly/sqgs/',
        'http://www.youjiao.com/etly/qzgs/',
        'http://www.youjiao.com/etly/kpzs/gxkj/',
        'http://www.youjiao.com/etly/kpzs/zwls/',
        'http://www.youjiao.com/etly/kpzs/twdl/',
        'http://www.youjiao.com/etly/kpzs/kpcs/',
        'http://www.youjiao.com/etly/kpzs/smkx/',
        'http://www.youjiao.com/etly/kpzs/rtkx/',
        'http://www.youjiao.com/etly/kpzs/jckx/',
        'http://www.youjiao.com/etly/kpzs/yyys/',
        'http://www.youjiao.com/etly/kpzs/hjkx/',
        'http://www.youjiao.com/etly/kpzs/jskx/',
        'http://www.youjiao.com/etly/kpzs/rcsh/',
        'http://www.youjiao.com/etly/kpzs/msqw/',
        'http://www.youjiao.com/etly/etdw/',
        'http://www.youjiao.com/etly/etwj/',
        'http://www.youjiao.com/jkbb/huli/',
        'http://www.youjiao.com/jkbb/yfjj/',
        'http://www.youjiao.com/jkbb/mianyi/',
        'http://www.youjiao.com/jkbb/jibing/',
        'http://www.youjiao.com/jkbb/zjdy/',
        'http://www.youjiao.com/jkbb/jkys/',
        'http://www.youjiao.com/jkbb/yewy/',
        'http://www.youjiao.com/jkbb/fushi/',
        'http://www.youjiao.com/jkbb/mmyy/',
        'http://www.youjiao.com/jkbb/etyy/',
        'http://www.youjiao.com/jkbb/bbyp/',
        'http://www.youjiao.com/shipu/yqsp/',
        'http://www.youjiao.com/shipu/fushi/',
        'http://www.youjiao.com/shipu/etsp/',
        'http://www.youjiao.com/shipu/jtsp/',
        'http://www.youjiao.com/shipu/chsp/',
        'http://www.youjiao.com/shipu/pengren/',
        'http://www.youjiao.com/shipu/meishi/',
        'http://www.youjiao.com/ssmm/mswh/',
        'http://www.youjiao.com/shipu/meirong/',
        'http://www.youjiao.com/shipu/jianfei/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/huaiyunzhishi/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/yunqianzhunbei/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/huaiyunjinji/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/yichuanyousheng/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/biyunliuchan/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/buyunbuyu/',
        'http://www.youjiao.com/yyzn/zhunbeihuaiyun/shengnanshengnv/',
        'http://www.youjiao.com/yyzn/gongju/',
        'http://www.youjiao.com/ssmm/xmdr/',
        'http://www.youjiao.com/ssmm/jtlc/',
        'http://www.youjiao.com/ssmm/fscl/',
        'http://www.youjiao.com/ssmm/mswh/',
        'http://www.youjiao.com/ssmm/pxgx/',
        'http://www.youjiao.com/ssmm/meirong/',
        'http://www.youjiao.com/ssmm/sushen/',
        'http://www.youjiao.com/ssmm/yuedu/',
        'http://www.youjiao.com/ssmm/zhichang/',
        'http://www.youjiao.com/ssmm/bbsj/',
        'http://www.youjiao.com/ssmm/lydj/',
        'http://www.youjiao.com/ssmm/jiaju/',
        'http://www.youjiao.com/ssmm/fqgx/',
        'http://www.youjiao.com/ssmm/mmxl/'
    ]

    # url_pattern=[r'.*rank=sale&type=hot.*']
    url_pattern = [r'id=.*']
    url_extractor = LxmlLinkExtractor(allow=url_pattern)
    item_dict = {}

    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    def start_requests(self):
        for url in self.start_urls:
            """爬取网站分页"""
            # for i in range(2, 3):
            #     next_url = url + 'index_' + str(i) + '.shtml'
            #     yield SplashRequest(next_url, callback=self.parse, args={
            #         'wait': 0.5, 'html': 1,
            #     })
            yield SplashRequest(url,
                                callback=self.parse,
                                args={
                                    'wait': 0.5,
                                    'html': 1,
                                })

    def parse(self, response):
        # print response.url
        lis = response.xpath("//*[@id='content']/li/div/a/@href").extract()
        # print lis
        for li in lis:
            news_href = li
            """判断网页是否已经被爬取过"""
            news_id = news_href.split('/')[-1].split('.')[0]
            ret = Sql.select_name(news_id)
            link_hdfs = InsecureClient("http://192.168.10.117:50070",
                                       user='******')  # Base HDFS web client
            dir_ls = link_hdfs.list('/testdir')
            today = datetime.date.today()
            today_dir = '/testdir' + '/' + str(today)
            if str(today) not in dir_ls:
                link_hdfs.makedirs(today_dir)
            txt_name = news_id + '.txt'
            yesterday = today - datetime.timedelta(days=1)
            yesterday_dir = '/testdir' + '/' + str(yesterday)
            if str(yesterday) not in dir_ls:
                if ret[0] == 1 and txt_name in link_hdfs.list(today_dir):
                    print "<+_+>-----已经存在HDFS与Mysql中-----<+_+>"
                    pass
                else:
                    yield Request(news_href, callback=self.get_content)
            else:
                if ret[0] == 1 and txt_name in link_hdfs.list(yesterday_dir):
                    print "<+_+>-----已经存在HDFS与Mysql中-----<+_+>"
                    pass
                elif ret[0] == 1 and txt_name in link_hdfs.list(today_dir):
                    print "<+_+>-----已经存在HDFS与Mysql中-----<+_+>"
                else:
                    yield Request(news_href, callback=self.get_content)

    def get_content(self, response):
        item = YoujiaoItem()
        """提取info_id"""
        info_id = response.url.split('/')[-1].split('.')[0]
        item['info_id'] = info_id
        """提取新闻标题"""
        title = response.xpath(
            "//div[@class='wrapper']/div/div[@class='wrapper']/div[@class='container']"
            "/div[@class='content']/h1/text()").extract()
        item['title'] = title[0]
        """提取新闻地址"""
        url = response.url
        item['link'] = url
        # url_id = url.split('/')[-1].split('.')[0]
        # print url_id
        """提取新闻发布时间"""
        o_time = response.xpath("//p[@class='data']/text()").extract()[-1]
        # print time
        online_time = o_time.replace('(', '').replace(')', '').replace(
            ' ', '').replace(':', '').replace('-', '')
        item['online_time'] = online_time
        """提取新闻来源"""
        news_from = response.xpath("//p[@class='data']/em/text()").extract()
        try:
            item['news_from'] = news_from[0]
        except IndexError:
            item['news_from'] = news_from
        """提取新闻来源地址"""
        item['source'] = None
        """提取新闻关键字"""
        tags = response.xpath("//*[@id='xg_tag']/span/a/text()").extract()
        tag = ','.join(tags)
        item['tag'] = tag
        """提取新闻描述"""
        desc = response.xpath("//meta[@name='description']/@content").extract()
        item['news_description'] = desc[0]
        """提取新闻作者"""
        try:
            author = response.xpath(
                "//p[@class='data']/text()").extract()[1].replace(' ', '')
            author = author.split(':')[-1]
            item['author'] = author
        except:
            author = None
            item['author'] = author
        """提取版块"""
        section = response.xpath(
            "//div[@class='logoArea']/span/a/text()").extract()
        section = '-'.join(section)
        item['section'] = section
        """提取评论数"""
        try:
            cmt_url = response.xpath(
                "//*[@id='bbs']/iframe[@id='rating']/@src").extract()[0]
            cmt_req = urllib2.Request(cmt_url)
            cmt_html = urllib2.urlopen(cmt_req).read()
            # print cmt_html
            cmt_re = re.findall(r'<span class="talk">(.*?)</span>', cmt_html)
            item['comment_count'] = cmt_re[0]
        except:
            item['comment_count'] = None
        """提取内容"""
        soup = BeautifulSoup(response.text, 'lxml')
        [script.extract() for script in soup.find_all('script')]  # 去除script标签
        [style.extract() for style in soup.find_all('style')]  # 去除style标签
        text_soup = soup.find(class_='content_txt').get_text()
        if '上一页' in text_soup:
            text = text_soup.split('上一页')[0]
        else:
            text = text_soup.split('幼教网微信')[0]
        text = text.split('\n')
        # text = response.xpath("//div[@class='content_txt']/p/text()").extract()
        body = []
        for t_body in text:
            bdy = t_body.replace('\r', '').replace('\n', '').replace(
                '\t', '').replace(u'\xa0', '')
            body.append(bdy)
        """提取网页内容分页"""
        try:
            next_urls = response.xpath(
                "//div[@class='pages']/a/@href").extract()[:-1]
            header = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/55.0.2883.87 Safari/537.36'
            }
            for i in range(len(next_urls)):
                if item['info_id'] in next_urls[i]:
                    next_url = next_urls[i]
                    next_url_req = urllib2.Request(next_url, headers=header)
                    next_url_html = urllib2.urlopen(next_url_req).read()
                    soup_body = BeautifulSoup(
                        next_url_html,
                        'html.parser').find(class_='content_txt').get_text()
                    n_body = soup_body.split('上一页')[0]
                    n_body = n_body.split('\n')
                    for bdy in n_body:
                        n_by = bdy.replace('\n', '').replace('\r', '').replace(
                            '\t', '').replace(u'\xa0', '')
                        # print n_by
                        body.append(n_by)
        except Exception, e:
            print Exception, "没有下一页内容", e
            pass
        while '' in body:
            body.remove('')
        item['news_body'] = body

        return item
Example #16
    def get_cookie(self, netloc, ua, timeout):
        try:
            headers = {'User-Agent': ua}

            request = urllib2.Request(netloc)
            _add_request_header(request, headers)

            try:
                response = urllib2.urlopen(request, timeout=int(timeout))
            except urllib2.HTTPError as response:
                result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    result = gzip.GzipFile(
                        fileobj=StringIO.StringIO(result)).read()

            jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]

            init = re.findall('setTimeout\(function\(\){\s*.*?.*:(.*?)};',
                              result)[-1]

            builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]

            decryptVal = self.parseJSString(init)

            lines = builder.split(';')

            for line in lines:

                if len(line) > 0 and '=' in line:

                    sections = line.split('=')
                    line_val = self.parseJSString(sections[1])
                    decryptVal = int(
                        eval(
                            str(decryptVal) + sections[0][-1] + str(line_val)))

            answer = decryptVal + len(urlparse.urlparse(netloc).netloc)

            query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (
                netloc, jschl, answer)

            if 'type="hidden" name="pass"' in result:
                passval = re.findall('name="pass" value="(.*?)"', result)[0]
                query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                    netloc, urllib.quote_plus(passval), jschl, answer)
                time.sleep(6)

            cookies = cookielib.LWPCookieJar()
            handlers = [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

            try:
                request = urllib2.Request(query)
                _add_request_header(request, headers)
                response = urllib2.urlopen(request, timeout=int(timeout))
            except:
                pass

            cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])

            if 'cf_clearance' in cookie: self.cookie = cookie
        except:
            pass
Example #17
import urllib2
import urllib
import cookielib

values = {"name": "wowo", "age": "20"}
data = urllib.urlencode(values)

headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

cookie = cookielib.MozillaCookieJar()
headler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(headler)
request = urllib2.Request("http://www.wangzhi.com", data, headers)
response = opener.open(request)
print response.read()
print cookie
for item in cookie:
    print "-" * 10
    print item.name
    print item.value

cookie.save("d:\\app.txt", True, True)

response.close()
Example #18
def VIDEO(url):
    urlogin = '******'
    cookiejar = cookielib.LWPCookieJar()
    cookiejar = urllib2.HTTPCookieProcessor(cookiejar)
    opener = urllib2.build_opener(cookiejar)
    urllib2.install_opener(opener)
    values = {
        'ref': 'http://veehd.com/',
        'uname': uname,
        'pword': pwd,
        'submit': 'Login',
        'terms': 'on'
    }
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    data = urllib.urlencode(values)
    req = urllib2.Request(urlogin, data, headers)
    response = urllib2.urlopen(req)
    if url.find('flv') > 0:
        req = urllib2.Request(url)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
        )
        response = urllib2.urlopen(req)
        link = response.read()
        vpi = re.compile('"/vpi.+?h=(.+?)"').findall(link)[0]
        req = urllib2.Request('http://veehd.com/vpi?h=' + vpi)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
        )
        response = urllib2.urlopen(req)
        link = response.read()
        swap = re.compile('"url":"(.+?)"').findall(link)[0]
        finalurl = swap.replace('%2F', '/').replace('%3F', '?').replace(
            '%3D',
            '=').replace('%25',
                         '%').replace('%2F',
                                      '/').replace('%26',
                                                   '&').replace('%3A', ':')
        if (vhd.getSetting('download') == '0'):
            dia = xbmcgui.Dialog()
            ret = dia.select('Streaming Options', ['Play', 'Download'])
            if (ret == 0):
                item = xbmcgui.ListItem(path=finalurl)
                xbmcplugin.setResolvedUrl(pluginhandle, True, item)
            elif (ret == 1):
                path = xbmc.translatePath(
                    os.path.join(vhd.getSetting('download_path'), name))
                Download(finalurl, path + name + '.avi')
            else:
                return
        elif (vhd.getSetting('download') == '1'):
            item = xbmcgui.ListItem(path=finalurl)
            xbmcplugin.setResolvedUrl(pluginhandle, True, item)
        elif (vhd.getSetting('download') == '2'):
            path = xbmc.translatePath(
                os.path.join(vhd.getSetting('download_path'), name))
            Download(finalurl, path + name + '.avi')
        else:
            return

    if url.find('flv') < 0:
        req = urllib2.Request(url)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
        )
        response = urllib2.urlopen(req)
        link = response.read()
        vpi = re.compile('"/vpi.+?h=(.+?)"').findall(link)[0]
        req = urllib2.Request('http://veehd.com/vpi?h=' + vpi)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
        )
        response = urllib2.urlopen(req)
        link = response.read()
        finalurl = re.compile('param name="src" value="(.+?)"').findall(
            link)[0]
        item = xbmcgui.ListItem(path=finalurl)
        xbmcplugin.setResolvedUrl(pluginhandle, True, item)

        if (vhd.getSetting('download') == '0'):
            dia = xbmcgui.Dialog()
            ret = dia.select('Streaming Options', ['Play', 'Download'])
            if (ret == 0):
                item = xbmcgui.ListItem(path=finalurl)
                xbmcplugin.setResolvedUrl(pluginhandle, True, item)
            elif (ret == 1):
                path = xbmc.translatePath(
                    os.path.join(vhd.getSetting('download_path'), name))
                Download(finalurl, path + name + '.avi')
            else:
                return
        elif (vhd.getSetting('download') == '1'):
            item = xbmcgui.ListItem(path=finalurl)
            xbmcplugin.setResolvedUrl(pluginhandle, True, item)
        elif (vhd.getSetting('download') == '2'):
            path = xbmc.translatePath(
                os.path.join(vhd.getSetting('download_path'), name))
            Download(finalurl, path + name + '.avi')
        else:
            return
Example #19
import re
import pdb

from lxml import etree

headers = {
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language":
    "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
}
parser = etree.HTMLParser()
url_opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookielib.CookieJar()))


def go_to_next_page(parameters, response, form_element, form_to_take=1):
    response_url = response.geturl()
    request = urllib2.Request(urlparse.urljoin(response_url,
                                               form_element.get('action')),
                              headers=headers)
    try:
        response = url_opener.open(request,
                                   urllib.urlencode(parameters, doseq=True))
    except urllib2.HTTPError as response:
        print "Erreur", response.code
        print response.read()
        raise
    html_page = response.read()
Example #20
    def send(self, original_payload, additional_handlers=[]):

        # Generate session id and referrers
        session_id, referrers_data = self._prepare(original_payload)

        cj = cookielib.CookieJar()
        additional_handlers.append(urllib2.HTTPCookieProcessor(cj))
        opener = urllib2.build_opener(*additional_handlers)

        # When core.conf contains additional cookies, carefully merge
        # the new headers killing the needed ones
        additional_headers = []
        additional_ua = ''
        additional_cookie = ''
        for h in self.additional_headers:
            if h[0].lower() == 'user-agent' and h[1]:
                additional_ua = h[1]
            if h[0].lower() == 'cookie' and h[1]:
                cookies = h[1].rstrip(';').split('; ')
                for cookie in cookies:
                    name, value = cookie.split('=')
                    cj.set_cookie(
                        cookielib.Cookie(version=0,
                                         name=name,
                                         value=value,
                                         port=None,
                                         port_specified=False,
                                         domain='',
                                         domain_specified=True,
                                         domain_initial_dot=True,
                                         path='/',
                                         path_specified=True,
                                         secure=False,
                                         expires=None,
                                         discard=True,
                                         comment=None,
                                         comment_url=None,
                                         rest={'HttpOnly': None}))
            elif h[0].lower() in ('accept', 'accept-language', 'referer'):
                # Skip sensible headers
                pass
            else:
                additional_headers.append(h)

        for referrer_index, referrer_data in enumerate(referrers_data):

            accept_language_header = self._generate_header_accept_language(
                referrer_data[1], session_id)
            accept_header = self._generate_header_accept()
            opener.addheaders = [
                ('Referer', referrer_data[0]),
                ('Accept-Language', accept_language_header),
                ('Accept', accept_header),
                ('User-Agent', (additional_ua if additional_ua else
                                random.choice(self.agents)))
            ] + additional_headers

            dlog.debug('[H %i/%i]\n%s\n[C] %s' %
                       (referrer_index, len(referrers_data) - 1, '\n'.join(
                           '> %s: %s' % (h[0], h[1])
                           for h in opener.addheaders), cj))

            url = (self.url if not config.add_random_param_nocache else
                   utils.http.add_random_url_param(self.url))

            try:
                response = opener.open(url).read()
            except httplib.BadStatusLine as e:
                # TODO: add this check to the other channels
                log.warn('Connection closed unexpectedly, aborting command.')
                return

            if not response:
                continue

            # Multiple debug string may have been printed, using findall
            matched_debug = self.re_debug.findall(response)
            if matched_debug:
                dlog.debug('\n'.join(matched_debug))

            matched = self.re_response.search(response)
            if matched and matched.group(1):
                return zlib.decompress(
                    utils.strings.sxor(base64.b64decode(matched.group(1)),
                                       self.shared_key))
Example #21
def get_data(host,
             query,
             idx,
             limit,
             debug,
             threshold=300,
             ckey=None,
             cert=None,
             das_headers=True):
    """Contact DAS server and retrieve data for given DAS query"""
    params = {'input': query, 'idx': idx, 'limit': limit}
    path = '/das/cache'
    pat = re.compile('http[s]{0,1}://')
    if not pat.match(host):
        msg = 'Invalid hostname: %s' % host
        raise Exception(msg)
    url = host + path
    client = '%s (%s)' % (DAS_CLIENT, os.environ.get('USER', ''))
    headers = {"Accept": "application/json", "User-Agent": client}
    encoded_data = urllib.urlencode(params, doseq=True)
    url += '?%s' % encoded_data
    req = urllib2.Request(url=url, headers=headers)
    if ckey and cert:
        ckey = fullpath(ckey)
        cert = fullpath(cert)
        http_hdlr = HTTPSClientAuthHandler(ckey, cert, debug)
    else:
        http_hdlr = urllib2.HTTPHandler(debuglevel=debug)
    proxy_handler = urllib2.ProxyHandler({})
    cookie_jar = cookielib.CookieJar()
    cookie_handler = urllib2.HTTPCookieProcessor(cookie_jar)
    opener = urllib2.build_opener(http_hdlr, proxy_handler, cookie_handler)
    fdesc = opener.open(req)
    data = fdesc.read()
    fdesc.close()

    pat = re.compile(r'^[a-z0-9]{32}')
    if data and isinstance(data, str) and pat.match(data) and len(data) == 32:
        pid = data
    else:
        pid = None
    iwtime = 2  # initial waiting time in seconds
    wtime = 20  # final waiting time in seconds
    sleep = iwtime
    time0 = time.time()
    while pid:
        params.update({'pid': data})
        encoded_data = urllib.urlencode(params, doseq=True)
        url = host + path + '?%s' % encoded_data
        req = urllib2.Request(url=url, headers=headers)
        try:
            fdesc = opener.open(req)
            data = fdesc.read()
            fdesc.close()
        except urllib2.HTTPError as err:
            return {"status": "fail", "reason": str(err)}
        if data and isinstance(data,
                               str) and pat.match(data) and len(data) == 32:
            pid = data
        else:
            pid = None
        time.sleep(sleep)
        if sleep < wtime:
            sleep *= 2
        elif sleep == wtime:
            sleep = iwtime  # start new cycle
        else:
            sleep = wtime
        if (time.time() - time0) > threshold:
            reason = "client timeout after %s sec" % int(time.time() - time0)
            return {"status": "fail", "reason": reason}
    jsondict = json.loads(data)
    return jsondict
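
# A hedged usage sketch for get_data; the host and the DAS query string are
# illustrative, not taken from the original:
result = get_data('https://cmsweb.cern.ch', 'dataset=/ZMM*/*/*', 0, 10, 0)
print result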
Example #22
def uploader(USER_NAME, PASSWORD, FILE_LOCATION, DEBUG="NO"):

    SERVER_NAME = 'w3-connections.ibm.com'

    print "\nAttempting to log in to (%s)..." % (SERVER_NAME)

    # Create authenticated server opener

    cookieProcessor = urllib2.HTTPCookieProcessor(LWPCookieJar())

    opener = urllib2.build_opener(cookieProcessor)

    # encoded parameters sent in a POST method (over secure connection)

    encodedForm = urlencode({"j_username": USER_NAME, "j_password": PASSWORD})

    # in this test case we used the port numbers, depending on the server, you can ignore these

    urlform = "https://" + SERVER_NAME + "/wikis/j_security_check"

    request = urllib2.Request(urlform, encodedForm)

    opener.addheaders = [("User-agent", "Mozilla/5.0")]

    # Read the response from the server

    loggedIn = opener.open(request).read()

    # Check if the response contains a redirection to the login page

    # Or check if the response is the login page

    if string.find(loggedIn, 'window.location.replace') == -1 and string.find(
            loggedIn, 'X-LConn-Login') == -1:

        print 'Logged in successfully!'

    else:

        print 'Failed to log in.'

        exit()

    excelFile = open(FILE_LOCATION, 'r')

    if DEBUG.upper() == "YES":
        url = 'https://w3-connections.ibm.com/files/basic/api/userlibrary/'
        url += 'b8e5e0c0-a38e-1033-9149-ac5876bd6d0c/document/a0c2ab4d-b530-458c-a1f1-6ee6d8c3acfb/media'
        print "RUNNING IN DEBUG MODE"
    elif DEBUG.upper() == "NO":
        exit()
        url = "https://w3-connections.ibm.com/files/basic/api/"
        url += "communitylibrary/214fe5d2-2471-4a42-ba99-f53de8dbe081/document/fdc2f92b-03c9-4f77-b83c-d8f96f3c491b/media"
    else:
        print "\nInvalid argument, either set the DEBUG argument as \"YES\" or leave it blank."
        exit()
    #convert raw file to string
    mmapped_file_as_string = mmap.mmap(excelFile.fileno(),
                                       0,
                                       access=mmap.ACCESS_READ)

    request2 = urllib2.Request(url, mmapped_file_as_string)

    #this is a hack I found online to make the downloader in python work as an uploader
    contenttype = mimetypes.guess_type(FILE_LOCATION)[0]
    request2.add_header('Content-Type', contenttype)
    request2.get_method = lambda: 'PUT'
    opener.open(request2)

    print "File appears to have been uploaded successfully!"

    excelFile.close()
Example #23
def openAnything(source,
                 etag=None,
                 lastmodified=None,
                 agent=USER_AGENT,
                 post_data=None,
                 files=None):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of
    a previous request).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    """

    if hasattr(source, 'read'):
        return source

    if source == '-':
        return sys.stdin

    if isinstance(post_data, dict):
        post_data_dict = post_data
        post_data = []
        for key in post_data_dict.keys():
            post_data.append((key, post_data_dict[key]))

    protocol = urlparse.urlparse(source)[0]
    if protocol == 'http' or protocol == 'https':
        # open URL with urllib2
        request = urllib2.Request(source)
        request.add_header('User-Agent', agent)
        if lastmodified:
            request.add_header('If-Modified-Since', lastmodified)
        if etag:
            request.add_header('If-None-Match', etag)
        if post_data and files:
            content_type, body = encode_multipart_formdata(post_data, files)
            request.add_header('Content-Type', content_type)
            request.add_data(body)
        elif post_data:
            request.add_data(encode_post_data(post_data))
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener(SmartRedirectHandler(),
                                      DefaultErrorHandler(),
                                      urllib2.HTTPCookieProcessor(cj))
        return opener.open(request)

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError):
        pass

    # treat source as string
    return StringIO(str(source))
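
A short usage sketch for openAnything, assuming it and its module-level helpers are importable; the URL, ETag, and timestamp are placeholders:

# Conditional GET: pass the ETag / Last-Modified values captured from a
# previous response so the server can reply 304 Not Modified when unchanged.
stream = openAnything('http://example.com/feed.xml',
                      etag='"abc123"',
                      lastmodified='Mon, 01 Jan 2024 00:00:00 GMT')
data = stream.read()
stream.close()

# The same call accepts a local filename, '-' for stdin, or a raw string.
local_stream = openAnything('/tmp/cached.xml')
string_stream = openAnything('<rss version="2.0"></rss>')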
Example #24
0
def downloadpage(url,
                 post=None,
                 headers=None,
                 timeout=None,
                 follow_redirects=True,
                 cookies=True,
                 replace_headers=False,
                 add_referer=False,
                 only_headers=False,
                 bypass_cloudflare=True,
                 count_retries=0,
                 random_headers=False,
                 ignore_response_code=False):
    """
    Abre una url y retorna los datos obtenidos

    @param url: url que abrir.
    @type url: str
    @param post: Si contiene algun valor este es enviado mediante POST.
    @type post: str
    @param headers: Headers para la petición, si no contiene nada se usara los headers por defecto.
    @type headers: dict, list
    @param timeout: Timeout para la petición.
    @type timeout: int
    @param follow_redirects: Indica si se han de seguir las redirecciones.
    @type follow_redirects: bool
    @param cookies: Indica si se han de usar las cookies.
    @type cookies: bool
    @param replace_headers: Si True, los headers pasados por el parametro "headers" sustituiran por completo los headers por defecto.
                            Si False, los headers pasados por el parametro "headers" modificaran los headers por defecto.
    @type replace_headers: bool
    @param add_referer: Indica si se ha de añadir el header "Referer" usando el dominio de la url como valor.
    @type add_referer: bool
    @param only_headers: Si True, solo se descargarán los headers, omitiendo el contenido de la url.
    @type only_headers: bool
    @param random_headers: Si True, utiliza el método de seleccionar headers aleatorios.
    @type random_headers: bool
    @param ignore_response_code: Si es True, ignora el método para WebErrorException para error como el error 404 en veseriesonline, pero es un data funcional
    @type ignore_response_code: bool
    @return: Resultado de la petición
    @rtype: HTTPResponse

            Parametro               Tipo    Descripción
            ----------------------------------------------------------------------------------------------------------------
            HTTPResponse.sucess:    bool   True: Peticion realizada correctamente | False: Error al realizar la petición
            HTTPResponse.code:      int    Código de respuesta del servidor o código de error en caso de producirse un error
            HTTPResponse.error:     str    Descripción del error en caso de producirse un error
            HTTPResponse.headers:   dict   Diccionario con los headers de respuesta del servidor
            HTTPResponse.data:      str    Respuesta obtenida del servidor
            HTTPResponse.time:      float  Tiempo empleado para realizar la petición

    """

    response = {}

    # Default headers, used when nothing else is specified
    request_headers = default_headers.copy()

    # Headers passed in as parameters
    if headers is not None:
        if not replace_headers:
            request_headers.update(dict(headers))
        else:
            request_headers = dict(headers)

    if add_referer:
        request_headers["Referer"] = "/".join(url.split("/")[:3])

    if random_headers or HTTPTOOLS_DEFAULT_RANDOM_HEADERS:
        request_headers['User-Agent'] = random_useragent()

    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    # Limit the download time if no timeout was passed and the global variable defines one
    if timeout is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None:
        timeout = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT
    if timeout == 0: timeout = None

    logger.info("----------------------------------------------")
    logger.info("downloadpage Alfa: %s" % __version)
    logger.info("----------------------------------------------")
    logger.info("Timeout: %s" % timeout)
    logger.info("URL: " + url)
    logger.info("Dominio: " + urlparse.urlparse(url)[1])
    if post:
        logger.info("Peticion: POST")
    else:
        logger.info("Peticion: GET")
    logger.info("Usar Cookies: %s" % cookies)
    logger.info("Descargar Pagina: %s" % (not only_headers))
    logger.info("Fichero de Cookies: " + ficherocookies)
    logger.info("Headers:")
    for header in request_headers:
        logger.info("- %s: %s" % (header, request_headers[header]))

    # Handlers
    handlers = [urllib2.HTTPHandler(debuglevel=False)]

    if not follow_redirects:
        handlers.append(NoRedirectHandler())

    if cookies:
        handlers.append(urllib2.HTTPCookieProcessor(cj))

    opener = urllib2.build_opener(*handlers)

    logger.info("Realizando Peticion")

    # Start the timer
    inicio = time.time()

    req = urllib2.Request(url, post, request_headers)

    try:
        if urllib2.__version__ == "2.4":
            import socket
            deftimeout = socket.getdefaulttimeout()
            if timeout is not None:
                socket.setdefaulttimeout(timeout)
            handle = opener.open(req)
            socket.setdefaulttimeout(deftimeout)
        else:
            handle = opener.open(req, timeout=timeout)

    except urllib2.HTTPError, handle:
        response["sucess"] = False
        response["code"] = handle.code
        response["error"] = handle.__dict__.get("reason", str(handle))
        response["headers"] = handle.headers.dict
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = ""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()
def get_opener():
    """Return a url opener that handles cookies."""
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    return opener
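
A short usage sketch for get_opener: requests issued through the same opener share one CookieJar, so a session cookie set by the first response is re-sent automatically on the next request (URLs and credentials are placeholders):

import urllib

opener = get_opener()
# The login response stores its session cookie in the shared CookieJar...
opener.open('http://example.com/login',
            urllib.urlencode({'user': 'demo', 'pwd': 'demo'}))
# ...and the follow-up request sends that cookie automatically.
page = opener.open('http://example.com/private').read()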
Example #26
0
def give_me_cookie():
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie), urllib2.HTTPHandler())
    return opener
def read_body_and_headers(url, post=None, headers=[], follow_redirects=False, timeout=30):
    _log("read_body_and_headers "+url)

    if post is not None:
        _log("read_body_and_headers post="+post)

    if len(headers)==0:
        headers.append(["User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/20100101 Firefox/18.0"])

    # Start cookie lib
    ficherocookies = os.path.join( get_data_path(), 'cookies.dat' )
    _log("read_body_and_headers cookies_file="+ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        _log("read_body_and_headers importing cookielib")
        import cookielib
    except ImportError:
        _log("read_body_and_headers cookielib no disponible")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            _log("read_body_and_headers importing ClientCookie")
            import ClientCookie
        except ImportError:
            _log("read_body_and_headers ClientCookie not available")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            _log("read_body_and_headers ClientCookie available")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()

    else:
        _log("read_body_and_headers cookielib available")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        _log("read_body_and_headers Cookies enabled")

        if os.path.isfile(ficherocookies):
            _log("read_body_and_headers Reading cookie file")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies, ignore_discard=True)
            except:
                _log("read_body_and_headers Wrong cookie file, deleting...")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            _log("read_body_and_headers opener using urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),urllib2.HTTPCookieProcessor(cj),NoRedirectHandler())
            else:
                opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)

        else:
            _log("read_body_and_headers opener using ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, now issue the request
    # -------------------------------------------------

    # Start the timer
    inicio = time.clock()

    # Dictionary for the request headers
    txheaders = {}

    # Build the request
    if post is None:
        _log("read_body_and_headers GET request")
    else:
        _log("read_body_and_headers POST request")
    
    # Add the headers
    _log("read_body_and_headers ---------------------------")
    for header in headers:
        _log("read_body_and_headers header %s=%s" % (str(header[0]),str(header[1])) )
        txheaders[header[0]]=header[1]
    _log("read_body_and_headers ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available from Python 2.6 onwards --> handle = urlopen(req, timeout=timeout)
        # For all versions:
        try:
            import socket
            deftimeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
            socket.setdefaulttimeout(deftimeout)
        except:
            import sys
            for line in sys.exc_info():
                _log("%s" % line)
            raise  # re-raise so the caller sees the original error instead of a NameError further down
    
    # Update the cookie store
    cj.save(ficherocookies, ignore_discard=True)

    # Read the data and close
    if handle.info().get('Content-Encoding') == 'gzip':
        buf = StringIO( handle.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    else:
        data=handle.read()

    info = handle.info()
    _log("read_body_and_headers Response")

    returnheaders=[]
    _log("read_body_and_headers ---------------------------")
    for header in info:
        _log("read_body_and_headers "+header+"="+info[header])
        returnheaders.append([header,info[header]])
    handle.close()
    _log("read_body_and_headers ---------------------------")

    '''
    # Issue the request
    try:
        response = urllib2.urlopen(req)
    # If it fails, retry with special characters replaced (spaces escaped as %20)
    except:
        req = urllib2.Request(url.replace(" ","%20"))

        # Add the headers
        for header in headers:
            req.add_header(header[0],header[1])

        response = urllib2.urlopen(req)
    '''
    
    # Elapsed time
    fin = time.clock()
    _log("read_body_and_headers Downloaded in %d seconds " % (fin-inicio+1))
    _log("read_body_and_headers body="+data)

    return data,returnheaders
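
A hedged usage sketch for read_body_and_headers, assuming the module-level helpers it relies on (_log, get_data_path, NoRedirectHandler, and the gzip/StringIO imports) are present; the URL is a placeholder:

# Hypothetical call; substitute a real URL.
body, response_headers = read_body_and_headers('http://example.com/index.html',
                                               headers=[['User-Agent', 'Mozilla/5.0']],
                                               timeout=20)
print "Got %d bytes" % len(body)
for name, value in response_headers:
    print "%s: %s" % (name, value)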
Example #28
0
#!/usr/bin/env python
"""

Copyright (c) 2004  Dustin Sallings <*****@*****.**>
"""

import sys
import urllib2
import traceback
try:
    import cookielib
    cookieJar = cookielib.CookieJar()
    cookieProcessor = urllib2.HTTPCookieProcessor(cookieJar)
    openerFactory = urllib2.build_opener
except ImportError:
    import ClientCookie
    cookieJar = ClientCookie.MozillaCookieJar()
    cookieProcessor = ClientCookie.HTTPCookieProcessor(cookieJar)
    openerFactory = ClientCookie.build_opener


class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        print "*** Got an error %d ***" % (code, )
        # print self, req, fp, code, msg, hdrs
        return fp


if __name__ == '__main__':

    headers = {'SOAPAction': 'Inform', 'Content-type': 'text/xml'}
Example #29
0
#!/usr/bin/python
import urllib
import urllib2
import cookielib
import requests
url = 'http://10.1.16.65:8000/login/'
login_data = {'username': '******', 'password': '******'}
data = urllib.urlencode(login_data)
#'http://10.1.16.65:8000/upload/'

cookie = cookielib.CookieJar()
cookieProc = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookieProc)
urllib2.install_opener(opener)
urllib2.urlopen(url, data)
tt = {'server': 'test', 'submit': 'select', 'time': '2015-08-18 14:57:06'}
data1 = urllib.urlencode(tt)
print urllib2.urlopen('http://10.1.16.65:8000/altertime/', data1).read()
Example #30
0
def prepare_compare_data():
    # Fetch categories 1-4 with identical parameters and store each response
    # into the compare data dir.
    for category in (1, 2, 3, 4):
        param = {
            'utype': 2,
            'uid': 'imei',
            'category': category,
            'offset': 102,
            'count': 4
        }
        req_url = default_server + '/info/'
        print 'the req url is %s' % (req_url)
        client = urllib2.Request(req_url)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(client, json.JSONEncoder().encode(param))
        js_data = json.loads(response.read())
        store_text(category, js_data['post'], 1)  # 1 means store into a compare data dir
    return 0