Example #1
import http.cookiejar
import re
import urllib.request


def crawl(parent_url):
    print('do crawl')
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)

    response = opener.open(parent_url)
    html = response.read().decode('gbk', 'ignore')

    # un-escape JSON-style slashes so the link regex can match
    html = html.replace('\\/', '/')

    new_url_list = []
    sku_information = []
    links = re.findall(r'href="(//[a-zA-Z0-9./\-]+)"', html)
    for link in links:
        # chained and's: keep only jd.com links outside the excluded subdomains
        if ('jd.com' in link and 'club.jd.com' not in link
                and 'item.m.jd.com' not in link
                and 'help.jd.com' not in link
                and 'yp.jd.com' not in link):
            link = 'http:' + link
            new_url_list.append(link)

    if parent_url.find('item.jd.com') >= 0:
        # the first run of digits in an item.jd.com URL is the SKU id
        all_found = re.findall(r'\d+', parent_url)
        sku_information.append(all_found[0])
        # retrieve_sku_info is defined elsewhere in the original module
        for item in retrieve_sku_info(html):
            sku_information.append(item)

    print('sku', sku_information)

    return new_url_list, sku_information
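A minimal driver for crawl() might look like the following sketch; the seed URL, page limit, and breadth-first queue are illustrative assumptions, and retrieve_sku_info must be supplied by the surrounding module:

# sketch of a breadth-first driver for crawl(); seed and limit are assumptions
seed = 'http://www.jd.com'
queue, visited = [seed], set()
while queue and len(visited) < 100:
    url = queue.pop(0)
    if url in visited:
        continue
    visited.add(url)
    child_urls, sku_info = crawl(url)
    queue.extend(child_urls)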
Example #2
import http.cookiejar
import re
import urllib.request


def download_url_list():
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    url = 'http://www.jd.com'

    response = opener.open(url)
    html = response.read().decode('utf-8', 'ignore')

    start_pattern = 'class="cate_menu_lk"'
    end_pattern = '</a>'

    start_position = html.find(start_pattern)

    # collect the text of every category-menu anchor into one string
    link_str = ''
    while start_position >= 0:
        html = html[start_position + len(start_pattern):]
        end_position = html.find(end_pattern)
        if end_position > 0:
            link_str += html[0:end_position]
        start_position = html.find(start_pattern)

    # pull the protocol-relative URLs out and normalize them to http
    url_re = r'//[a-z./\d\-]+'
    url_list = []
    for url in re.findall(url_re, link_str):
        url = 'http:' + url
        url_list.append(url)

    return url_list
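One plausible way to wire this into the crawler from Example #1 (an assumption; the surrounding module is not shown) is to use the category links as seeds:

# hypothetical wiring: feed the Example #1 crawler with the category links
for seed in download_url_list():
    child_urls, sku_info = crawl(seed)
    if sku_info:
        print(sku_info)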
Example #3
def doHttpWithCookie(self, url, data={}, save_cookie=False):
    # assumes `import http.cookiejar` and `import urllib.request` at module level;
    # a MozillaCookieJar can both load from and save to self.cookie_filename
    cookie = http.cookiejar.MozillaCookieJar(self.cookie_filename)
    try:
        cookie.load(ignore_discard=True, ignore_expires=True)
    except OSError:
        pass  # no cookie file yet; start with an empty jar
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    for item in cookie:
        print('Name = ' + item.name)
        print('Value = ' + item.value)
    if save_cookie:
        cookie.save(ignore_discard=True, ignore_expires=True)
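A sketch of the round trip this method supports; the Client class below is a hypothetical stand-in for the enclosing class, which is not shown in the example, and is assumed to carry the method above:

# hypothetical enclosing class; the method above is assumed to live inside it
class Client:
    def __init__(self, cookie_filename):
        self.cookie_filename = cookie_filename

client = Client('cookies.txt')
client.doHttpWithCookie('http://www.example.com/', save_cookie=True)  # persist the jar
client.doHttpWithCookie('http://www.example.com/')  # later calls reuse the saved cookies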
Example #4
import http.cookiejar
import urllib.request


def Brower(url):
    login_page = "https://10.64.70.225/cgi-bin/logon.cgi"
    # form body captured with Fiddler
    login_data = "usrname=admin&passwd=admin&isCookieEnable=1&action=on&wrong_passwd=%3C%21--invalid_passwd_flag--%3E"
    try:
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj))
        opener.addheaders = [
            ('User-agent',
             'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; '
             '.NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; '
             'MS-RTC LM 8; InfoPath.2; CIBA; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322)')
        ]

        # POST the login form (the body must be bytes in Python 3), then fetch url
        opener.open(login_page, login_data.encode('ascii'))
        op = opener.open(url)
        data = op.read()
        return data
    except Exception as e:
        print(str(e))
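Usage is a single call; the status page path below is a made-up example (only logon.cgi appears in the original), and the device's self-signed certificate may additionally require an ssl context in Python 3:

# hypothetical target page on the same device
page = Brower('https://10.64.70.225/cgi-bin/status.cgi')
if page:
    print(page[:200])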
Example #5
import http.cookiejar
import urllib.request

print('start')
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
# print every cookie the server set during the request
for item in cookie:
    print(item.name + ':' + item.value)
print('over')
Example #6
import http.cookiejar
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

# set up a proxy IP
proxy_support = urllib.request.ProxyHandler({'http': '120.197.234.164:80'})
# set up cookie handling
cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.LWPCookieJar())
opener = urllib.request.build_opener(proxy_support, cookie_support,
                                     urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
# starting URL
#hosturl = "http://www.renren.com"
hosturl = "http://mail.163.com/"
# URL that receives the form data
#posturl = "http://www.renren.com/ajaxLogin/login"
posturl = "https://mail.163.com/entry/cgi/ntesdoor?df=mail163_letter&from=web&funcid=loginone&iframe=1&language=-1&passtype=1&product=mail163&net=e&style=-1&race=118_35_39_bj&[email protected]"
# form data to send (urlopen expects bytes in Python 3)
postdata = urllib.parse.urlencode({
    "username": "******",
    "password": "******"
}).encode('utf-8')
# request headers
headers = {
    #'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0/',
    #'Referer':'http://www.renren.com/'
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
    'Referer': 'http://mail.163.com/'
}
# build the HTTP request from the pieces above
req = urllib.request.Request(posturl, postdata, headers)
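The snippet breaks off once the request is built. A plausible continuation (an illustration, not part of the original) would send it through the installed opener and parse the reply with the imported BeautifulSoup:

# hypothetical continuation: send the login request and inspect the reply;
# none of this appears in the original snippet
response = urllib.request.urlopen(req)
page = response.read().decode('utf-8', 'ignore')
soup = BeautifulSoup(page, 'html.parser')
print(soup.title)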
Example #7
import json
import sys
import urllib.parse
import urllib.request


def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy):
    url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"

    # assemble the search query from whatever criteria were provided
    urlGetData = ''

    if hasattr(tweetCriteria, 'username'):
        urlGetData += ' from:' + tweetCriteria.username

    if hasattr(tweetCriteria, 'querySearch'):
        urlGetData += ' ' + tweetCriteria.querySearch

    if hasattr(tweetCriteria, 'near'):
        urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within

    if hasattr(tweetCriteria, 'since'):
        urlGetData += ' since:' + tweetCriteria.since

    if hasattr(tweetCriteria, 'until'):
        urlGetData += ' until:' + tweetCriteria.until

    if hasattr(tweetCriteria, 'topTweets'):
        if tweetCriteria.topTweets:
            url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"

    url = url % (urllib.parse.quote(urlGetData), urllib.parse.quote(refreshCursor))

    headers = [
        ('Host', "twitter.com"),
        ('User-Agent',
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', url),
        ('Connection', "keep-alive")
    ]

    if proxy:
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({
                'http': proxy,
                'https': proxy
            }), urllib.request.HTTPCookieProcessor(cookieJar))
    else:
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookieJar))
    opener.addheaders = headers

    try:
        response = opener.open(url)
        jsonResponse = response.read()
    except Exception:
        print("Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd"
              % urllib.parse.quote(urlGetData))
        sys.exit()

    dataJson = json.loads(jsonResponse)

    return dataJson
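A sketch of a call, for completeness; TweetCriteria below is a hypothetical stand-in for the real criteria class (which is not part of the example), and this legacy Twitter endpoint no longer responds the way it did when the code was written:

import http.cookiejar

# hypothetical stand-in: getJsonReponse only probes attributes via hasattr(),
# so a bare object carrying the desired fields is enough for illustration
class TweetCriteria:
    pass

criteria = TweetCriteria()
criteria.querySearch = 'python'
criteria.since = '2018-01-01'

jar = http.cookiejar.CookieJar()
data = getJsonReponse(criteria, '', jar, None)  # empty cursor, no proxy
print(list(data.keys()))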