def get_movie_data(location, timeout=30):
    """Fetch the BookMyShow QUICKBOOK movie data.

    The cookies returned by ``get_cookies(location)`` are sent with a GET
    request to the QUICKBOOK endpoint and the JSON body is decoded.

    Args:
        location: Value forwarded to ``get_cookies``.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://in.bookmyshow.com/serv/getData?cmd=QUICKBOOK&type=MT"
    cookies = get_cookies(location)
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, cookies=cookies, timeout=timeout)
    response.raise_for_status()
    # Response.json() detects the JSON byte encoding itself instead of
    # relying on the guessed text encoding behind response.text.
    return response.json()
def run(self):
    """Process every URL known to this spider.

    ``select_url`` is called with ``self.url_list`` first. Then each entry
    of ``self.new_yupoo_list`` is passed to ``yupoo_spider`` and the result
    is handed to ``get_ypimg_page``. Finally, for each entry of
    ``self.other_url_list``, the cookies and base URL returned by
    ``get_cookies.get_cookies`` are forwarded to ``get_other_msg``.
    """
    # NOTE(review): presumably select_url fills new_yupoo_list and
    # other_url_list -- confirm against its definition.
    self.select_url(self.url_list)

    for yupoo_url in self.new_yupoo_list:
        href_list = self.yupoo_spider(yupoo_url, [])
        self.get_ypimg_page(href_list)

    for other_url in self.other_url_list:
        cookies, base_url = get_cookies.get_cookies(other_url)
        self.get_other_msg(cookies, base_url)
def setUpClass(cls) -> None:
    """Store the result of ``get_cookies.get_cookies()`` on the class.

    Option 1: obtain cookies.
    Option 2: log in successfully and keep the session state.
    """
    # Option 1 (currently disabled):
    # cls.my_cookies = get_cookies.get_cookies()
    header = get_cookies.get_cookies()
    cls.my_header = header
    print(cls.my_header)
def get_movie_data(location, timeout=30):
    """Fetch the BookMyShow QUICKBOOK movie data.

    The cookies returned by ``get_cookies(location)`` are sent with a GET
    request to the QUICKBOOK endpoint and the JSON body is decoded.

    Args:
        location: Value forwarded to ``get_cookies``.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://in.bookmyshow.com/serv/getData?cmd=QUICKBOOK&type=MT"
    cookies = get_cookies(location)
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, cookies=cookies, timeout=timeout)
    response.raise_for_status()
    # Response.json() detects the JSON byte encoding itself instead of
    # relying on the guessed text encoding behind response.text.
    return response.json()
def main():
    """Call ``emotion_analysis`` for one Douban book page."""
    # Each book has its own URL; this is the page to analyse.
    book_url = 'https://book.douban.com/subject/1826007/'
    session_cookies = get_cookies()

    # Run the analysis with the URL and cookies obtained above.
    emotion_analysis(book_url, session_cookies)
    # Word-cloud generation is currently disabled:
    # book_wordcloud(url, cookies)
def attach_movie_url(location, timeout=30):
    """Attach a detail-page URL to the movies listed for a location.

    The location's ``/movies`` page is downloaded and every
    ``<div class="__name">`` on it is inspected. The stripped text of the
    div's first anchor is compared with the ``'Title'`` of each entry
    returned by ``get_movie_list(location)``; on a match the entry gets a
    ``'url'`` key holding the site root followed by the anchor's ``href``.

    Args:
        location: Location segment used in the page URL and forwarded to
            ``get_cookies`` and ``get_movie_list``.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The list returned by ``get_movie_list(location)``; matched entries
        are updated in place.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    website = "https://in.bookmyshow.com"
    website_with_location = website + "/" + location + "/movies"

    # NOTE(review): the cookies are fetched but were never sent with the
    # request below. The call is kept in case get_cookies has side effects
    # that later code relies on -- confirm, then either pass them via
    # ``cookies=`` or drop the call.
    get_cookies(location)

    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(website_with_location, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    mlist = get_movie_list(location)
    for div in soup.findAll('div', {'class': '__name'}):
        # Look the anchor up once and skip malformed entries instead of
        # raising IndexError/KeyError on a div without a usable link.
        anchor = div.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is None:
            continue

        name = anchor.text.strip()
        for movie in mlist:
            if name == movie['Title']:
                movie['url'] = website + href
    return mlist
def attach_movie_url(location, timeout=30):
    """Attach a detail-page URL to the movies listed for a location.

    The location's ``/movies`` page is downloaded and every
    ``<div class="__name">`` on it is inspected. The stripped text of the
    div's first anchor is compared with the ``'Title'`` of each entry
    returned by ``get_movie_list(location)``; on a match the entry gets a
    ``'url'`` key holding the site root followed by the anchor's ``href``.

    Args:
        location: Location segment used in the page URL and forwarded to
            ``get_cookies`` and ``get_movie_list``.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The list returned by ``get_movie_list(location)``; matched entries
        are updated in place.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    website = "https://in.bookmyshow.com"
    website_with_location = website + "/" + location + "/movies"

    # NOTE(review): the cookies are fetched but were never sent with the
    # request below. The call is kept in case get_cookies has side effects
    # that later code relies on -- confirm, then either pass them via
    # ``cookies=`` or drop the call.
    get_cookies(location)

    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(website_with_location, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    mlist = get_movie_list(location)
    for div in soup.findAll('div', {'class': '__name'}):
        # Look the anchor up once and skip malformed entries instead of
        # raising IndexError/KeyError on a div without a usable link.
        anchor = div.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is None:
            continue

        name = anchor.text.strip()
        for movie in mlist:
            if name == movie['Title']:
                movie['url'] = website + href
    return mlist
def other_url_handl(self, url, store_id):
    """Resolve the shop id for ``url`` and hand the request data on.

    The page is requested through ``request_url``; the shop id is taken
    from the ``/shop_detail/<id>`` part of the response URL. The target URL
    is built with ``parse_url`` from the base URL returned by
    ``get_cookies.get_cookies``, and ``get_other_msg`` is called with
    headers carrying a cookie from that call.

    :param url: address to request
    :param store_id: store id, forwarded unchanged to ``get_other_msg``
    :return: None
    """
    response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
    if response is None:
        # Check before touching response.url: the original dereferenced the
        # response first, so a failed request crashed with AttributeError.
        return

    # re.search returns None when the URL has no shop id; test for that
    # explicitly instead of hiding every possible error behind a bare except.
    match = re.search(r"/shop_detail/(\w\d+)", response.url)
    if match is None:
        print(response.url)
        return
    shop_id = match.group(1)

    # NOTE(review): ``ip_pool`` is a bare name here while the request above
    # uses ``self.ip_pool`` -- confirm a module-level ``ip_pool`` is intended.
    cookies, base_url = get_cookies.get_cookies(url, ip_pool)
    url = parse_url(base_url.lower(), self.server_url, "")

    # Every iteration overwrites the headers, so only the last cookie is
    # sent; the headers stay None when there are no cookies at all.
    other_headers = None
    for item in cookies:
        other_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
            "cookie": "%s=%s" % (item.name, item.value)
        }
    self.get_other_msg(other_headers, url, store_id, shop_id)
def other_url_handl(self, url, store_id):
    """
    Get the shop_id, build the address of the JSON data and do the initial
    processing of the request headers.

    :param url: home page address
    :param store_id: store ID
    :return: None
    """
    response = request_url(url, headers=self.headers, proxies_list=self.ip_pool)
    if response is None:
        # Check before touching response.url: the original dereferenced the
        # response first, so a failed request crashed with AttributeError.
        return

    # A response URL on this host is treated as a failed request and is
    # requested again until it is correct. Return afterwards so the bad
    # response is not processed as well once the retry has finished.
    # NOTE(review): the retry depth is unbounded -- consider capping it.
    if "b.oijgvrq.cn" in response.url:
        self.other_url_handl(url, store_id)
        return

    # Take the shop id from the URL the home page redirected to; it is
    # needed later to build the data URL. re.search returns None when the
    # id is missing, so test for that instead of using a bare except.
    match = re.search(r"/shop_detail/(\w\d+)", response.url)
    if match is None:
        print(response.url)
        return
    shop_id = match.group(1)

    cookies = get_cookies.get_cookies(url)
    url = parse_url(response.url.lower(), self.server_url, "")

    # Build the headers from the cookie the server set when the home page
    # was requested (the token field). Every iteration overwrites the
    # headers, so only the last cookie is sent.
    other_headers = None
    for item in cookies:
        other_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
            "cookie": "%s=%s" % (item.name, item.value)
        }

    # Some shops do not get a token cookie from the server, so one is
    # supplied by hand. The None test also covers an empty cookie
    # collection, which previously raised TypeError on the subscript.
    # NOTE(review): hard-coded token value -- consider moving it to config.
    if other_headers is None or "token" not in other_headers["cookie"]:
        other_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
            "cookie": "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD;"
        }
    self.get_other_msg(other_headers, url, store_id, shop_id)
#!/bin/python2.7
import urllib2
import json
from get_cookies import get_cookies
from StringIO import StringIO
from ProgressBar.progress_bar import ProgressBar

# get_cookies() yields two values; get_page() sends one of them as the
# Cookie header.
COOKIE, COOKIE_2 = get_cookies()
CLASS_SLUG = "startup-001"
CALLBACK = "some"
OUT_FILE = "user_data.json"


def get_page(url, use_cookie_2 = False):
    """Open ``url`` with a Cookie header and return the opened response.

    ``COOKIE_2`` is sent when ``use_cookie_2`` is true, ``COOKIE`` otherwise.
    """
    cookie_value = COOKIE_2 if use_cookie_2 else COOKIE
    opener = urllib2.build_opener()
    opener.addheaders.append(('Cookie', cookie_value))
    return opener.open(url)


# URL templates; each takes one integer via %d.
forum_thread_template = "https://class.coursera.org/" + CLASS_SLUG + "/api/forum/threads/%d"
user_template = "https://www.coursera.org/maestro/api/user/profiles?user-ids=%d&callback=" + CALLBACK

uids = set()
thread_count = 1
number_of_threads = 0

# figure out total number of threads
] headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36" } with open("./ip_pool", "r") as f: content = f.read() ip_pool = json.loads(content) server_url = "service/album/get_album_themes_list.jsp" for url in url_list: response = requests.get(url, proxies=random.choices(ip_pool)[0], headers=headers) shop_id = re.search(r"/shop_detail/(.*)", response.url).group(1) if response is not None: cookies, base_url = get_cookies(url, ip_pool) url = parse_url(base_url.lower(), server_url, "") for item in cookies: other_headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36", # "cookie": "%s=%s" % (item.name, item.value) "cookie": "UM_distinctid=172fef76482421-09a957e7ea6b66-4353761-144000-172fef764838c7; " "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22A202007011648174290197177%22%2C%22first_id%22%3A%22172fef75cf638b-0c1df4c5ab5928-4353761-1327104-172fef75cf78d9%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22172fef75cf638b-0c1df4c5ab5928-4353761-1327104-172fef75cf78d9%22%7D; " "token=Mzk4MDk3Q0E5RTZCN0I1MkYwMTYwNDlCQUNFNkQ5QzVFOEZCOTI1OEEwOTA2MDc0QzUzRTVCNDVDMTg1RTgzRTZBNTY1MTZDQTNFNDFCRkI2ODZGRTgxRjQxRDU3MEZD; " "CNZZDATA1275056938=120497081-1594357916-%7C1594363316; " "JSESSIONID=B74EC8A017C3DDD861F3E6E17F3D6C3A" } params = { "page_index": 1,
import requests import time import threading import pandas as pd import get_cookies ''' cookies = {'_vis_opt_exp_67_combi': '2', '_vis_opt_exp_81_exclude': '1', '_ga': 'GA1.2.1502718080.1561463733', '_gid': 'GA1.2.1646304843.1561463733', '_vis_opt_s': '1%7C', 'SNLB2': '12-001', '_vwo_sn': '0%3A1', '_vis_opt_test_cookie': '1', '_vwo_ds': '3%3Aa_0%2Ct_0%3A0%241561463723%3A23.3810494%3A%3A%3A%3A1', 'D_ZID': 'B7582915-2342-3B54-9343-BC1DCE799E8E', 'satisfaction-survey-chance': '0.0403899275885848', 'fonts-loaded': 'true', 'INLB': '01-004', 'sr': '0%7cfalse', 'D_SID': '213.46.252.136:48rZgOaLq07BxrV0Qr6LtG+9ZX6f1oCFngMEVSQzTPk', 'D_UID': 'D644992F-2B1E-3E41-9DE9-F217B4362A49', '.ASPXANONYMOUS': '4V5eW9xpxKl3EkqZq8zIBv4lcHtHlKJzUXRaBTg5iCG5uOS5DfKvk0gLrD0PqmD1sknqjSbDyPTzkKIcVGWAfn4hfTnoLb8FBqUB1iW71ervGlotG8otKQ6aALP4MaNLDjjJUCLn1dz5YorxzHNosKif--g1', '_vwo_uuid_v2': 'D15EFEB276818AA2126C9C28B803EC581|b916da0e2f06e5601dfa81595da1a0aa', 'html-classes': 'js supports-placeholder', 'D_HID': 'A4A9E92C-2A48-3C2C-8A88-C4964FA76B91', 'D_IID': 'D383A0CE-44F4-3CD9-A86F-F857C5DAE6B6', 'D_ZUID': '03044E65-40B9-39DC-B123-50B3846BD9DA', '_vwo_uuid': 'D15EFEB276818AA2126C9C28B803EC581', '__RequestVerificationToken': 'tDON4sgpOIM-o_KwfC0sjX3NodUMYfxV7ZcnZLGxAMNlLJ8D9WrJLi0-aQAInnJeaDput6sP7jq1PanBIRBgzBJDo9c1'} ''' cookies = get_cookies.get_cookies() cookie_string = "; ".join([str(x) + "=" + str(y) for x, y in cookies.items()]) headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,nl;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Cookie': cookie_string, #'..ASPXANONYMOUS=d0oK0LM1NdCj5Jj81GX6xZCQ2RoA8LXNtZgyqdpEUbZA6HeGTGn_8Fg6rS8qtSGhEMPhHCzkAQ9WNwsQPJOz2ZcJru6sMaSMPmq5gjexZt8jXJfP17NXvS1uQaI92DoKwyAxOlbmfhmx8roCgeP2Yly7A3o1; sr=0%7cfalse; INLB=01-002; html-classes=js 
supports-placeholder; _vwo_uuid_v2=D78F6C8B5734A4EB42802737D85D9997B|bb168ce01705a1c8e305a7b53c0daf84; fonts-loaded=true; D_IID=791293A9-C17B-3312-B7D2-ACEC88EBCF6F; D_UID=6D9DEFD3-9936-33CF-BAD9-B8A336E733D5; D_ZID=FA92A681-5C62-3D00-A6E0-FFCFA2DFE000; D_ZUID=4AAFAD98-FC9A-397B-BFD0-CC4CAD64ABF8; D_HID=FE2E611F-6D24-3E8E-82DC-A8B080B475A7; D_SID=85.145.109.88:KlfxLH44zOxwMWXky4Fdeo0TMw982Lv2FwcHBwJhkZY; _vis_opt_s=1%7C; _vis_opt_test_cookie=1; _vwo_uuid=D78F6C8B5734A4EB42802737D85D9997B; _vwo_ds=3%3Aa_0%2Ct_0%3A0%241560642413%3A15.2871134%3A%3A%3A%3A0; _ga=GA1.2.2014485093.1560642416; _gid=GA1.2.1849743777.1560642416; cookiePolicy_16=allowPersonalisatie=True&allowAdvertenties=True; oil_data={%22opt_in%22:true%2C%22version%22:%221.3.0-RELEASE%22%2C%22localeVariantName%22:%22enEN_00%22%2C%22localeVariantVersion%22:0%2C%22customPurposes%22:[]%2C%22consentString%22:%22BOiNyJoOiNyJoBQABBENCX-AAAAoR6_-faqaRo25-P7J9kRFAL6lgBrPSFAQKQAIQAeCJWBiKgUkyDUoCUEIAoBAAARASCJARBgQEAESgAuAAJAgAgCCAAAIBAAAAAAAAAAAAAAAAA%22%2C%22configVersion%22:0}; rtb-platform=improve; satisfaction-survey-chance=0.0438349624368525; __RequestVerificationToken=iYFvdjp0cFTjaPyPxg2Qz0kjxnX4rdaVIPbkDn1YlLtIOXJ7bPEM3A7tM9DjF95SRidYCg9uIsQDtLrRV9iX-FRA01I1; _vis_opt_exp_80_exclude=1; _vis_opt_exp_75_combi=1; __utma=72423812.2014485093.1560642416.1560685213.1560685213.1; __utmc=72423812; __utmz=72423812.1560685213.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmb=72423812.1.10.1560685213; lzo=koop=%2fkoop%2fheel-nederland%2f&huur=%2fhuur%2fheel-nederland%2f&europe=%2feurope%2fheel-europa%2fhuur%2f&nieuwbouw=%2fnieuwbouw%2fheel-nederland%2f; _vwo_sn=40055%3A7; SNLB2=12-001; utag_main=v_id:016b5d8981a1001abec4ed9a3ddd03068001806000bd0$_sn:4$_ss:0$_st:1560687325603$vapi_domain:funda.nl$dc_visit:4$ses_id:1560682473031%3Bexp-session$_pn:7%3Bexp-session$dc_event:7%3Bexp-session$dc_region:eu-central-1%3Bexp-session', 'Host': 'www.funda.nl', 'Referer': 'https://www.funda.nl/en/koop/', 
'Upgrade-Insecure-Requests':