def parse(self, filename):
    """
    Parses Netscape HTTP Cookie files.

    The format of a cookie file is described here:
    http://www.cookiecentral.com/faq/#3.5

    Content sample:
    # Netscape HTTP Cookie File
    # http://www.netscape.com/newsref/std/cookie_spec.html
    # This is a generated file! Do not edit.

    #HttpOnly_login.corp.google.com	FALSE	/	TRUE	1443186342	name	value

    :param filename: The path where to find the cookie file.
    :return: None
    """
    with open(filename, "r") as file:
        for line in file:
            line = line.strip()
            # Skip empty lines.
            if not line:
                continue
            # Skip comments ("# ..."), but keep "#HttpOnly_"-prefixed cookie lines.
            if line.startswith("# "):
                continue
            cookie = Cookie(line)
            self.cookies.append(cookie)
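# A minimal sketch of how a single Netscape cookie line could be split into its
# seven tab-separated fields (domain, include-subdomains flag, path, secure flag,
# expiry, name, value). The helper name split_netscape_line and the handling of
# the "#HttpOnly_" prefix are illustrative assumptions, not part of the code above.
def split_netscape_line(line):
    # Cookie lines may carry an "#HttpOnly_" prefix in front of the domain.
    http_only = line.startswith("#HttpOnly_")
    if http_only:
        line = line[len("#HttpOnly_"):]
    domain, subdomains, path, secure, expiry, name, value = line.split("\t", 6)
    return {
        "domain": domain,
        "include_subdomains": subdomains == "TRUE",
        "path": path,
        "secure": secure == "TRUE",
        "expiry": int(expiry),
        "name": name,
        "value": value,
        "http_only": http_only,
    }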
def post(self):
    mobile = self.request.get('mobile')
    browser = self.request.get('browser')
    logging.info("Mobile: %s, User Agent: %s, Browser: %s" % (mobile, self.request.user_agent, browser))
    self.response.headers['Content-Type'] = 'text/html'
    if bool(RE_MOBILE_NUMBER.search(mobile)) and len(mobile) == 10 and mobile != '9876543210':
        path = os.path.join(os.path.dirname(__file__), "../registration_done.html")
        learner = Learner.retrieve(Learner, mobile)
        if not learner:
            learner = Learner()
            learner.MobileNumber = db.PhoneNumber(mobile)
            learner.UserAgent = self.request.user_agent
            learner.Channel = 'WEB' if (browser == 'true') else 'APP'
            learner.MotherTongue = 1  # Hindi
            learner.Status = simplejson.dumps({'id': str(mobile)})
            learner.put()
            self.response.headers['Set-Cookie'] = Cookie.get_maza_cookie_str(learner.Status)
            if browser == 'false':
                self.redirect('/' + mobile)
        else:
            # Learner already exists, take them to lesson 1.
            self.redirect('/' + mobile)
    else:
        path = os.path.join(os.path.dirname(__file__), "../registration_fail.html")
    self.response.out.write(template.render(path, {'mobile': mobile}))
def enter():
    global stage, ground, hp, cookie, jelly, potion, jtrap, djtrap, strap
    global scoreLabel, Label
    global jelly_sound, item_sound, collide_sound

    stage = Stage()    # stage background
    ground = Ground()  # floor
    hp = HP()          # health
    cookie = Cookie()  # player character

    game_world.add_object(stage, game_world.layer_bg)
    game_world.add_object(ground, game_world.layer_bg)
    game_world.add_object(hp, game_world.layer_bg)
    game_world.add_object(cookie, game_world.layer_player)

    # Score label
    label = score.Label("Score: ", 50, get_canvas_height() - 50, 45, 0)
    label.color = (255, 255, 255)
    score.labels.append(label)
    scoreLabel = label

    # Sounds
    jelly_sound = load_wav('jelly.wav')
    jelly_sound.set_volume(32)
    item_sound = load_wav('item.wav')
    item_sound.set_volume(32)
    collide_sound = load_wav('collide.wav')
    collide_sound.set_volume(50)
def login(self):
    """Login process. Returns a cookie as raw HTTP header text."""
    # Checks
    if not self.username or not self.password:
        print("No credentials to use for login!")
        return False
    if not self.userfile:
        print("No user file DB was set!")
        return False
    # Loop through the file with users
    with open(self.userfile, "r") as fd:
        for user in fd.readlines():
            # Strip the trailing newline before splitting the "username:password" pair.
            username, password = user.strip().split(":")
            # If the user is found in the list and the corresponding password is correct
            if username == self.username and password == self.password:
                usercookie = Cookie().create_login()
                return usercookie
    return False
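# A minimal, self-contained sketch of the credential check used above: read a
# "username:password"-per-line user file and verify a pair against it. The
# check_credentials name and the throwaway users.txt file are illustrative only;
# the real login() additionally returns a cookie via Cookie().create_login().
def check_credentials(userfile, username, password):
    with open(userfile, "r") as fd:
        for line in fd:
            line = line.strip()
            if not line:
                continue
            file_user, file_pass = line.split(":", 1)
            if file_user == username and file_pass == password:
                return True
    return False

if __name__ == "__main__":
    # Example run against a throwaway user file.
    with open("users.txt", "w") as fd:
        fd.write("alice:s3cret\nbob:hunter2\n")
    print(check_credentials("users.txt", "alice", "s3cret"))   # True
    print(check_credentials("users.txt", "alice", "wrong"))    # False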
def __init__(self, response, encoding='utf-8', is_json=False):
    self.is_json = is_json
    self.encoding = encoding
    self._headers = {a.lower(): b for (a, b) in response.info().items()}
    self.cookies = Cookie.parse_cookies(self._headers.get('set-cookie', ''))
    self.raw_body = response.read()
    self.code = response.code
    response.close()
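# A minimal sketch of what parsing a raw Set-Cookie header value could look like,
# using only the standard library. The parse_set_cookie name is an assumption for
# illustration; it is not necessarily how Cookie.parse_cookies is implemented.
from http.cookies import SimpleCookie

def parse_set_cookie(header_value):
    jar = SimpleCookie()
    jar.load(header_value)
    # Map each cookie name to its value, ignoring attributes such as Path or Expires.
    return {name: morsel.value for name, morsel in jar.items()}

# Example: parse_set_cookie("sessionid=abc123; Path=/; HttpOnly") -> {'sessionid': 'abc123'}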
def logout(self):
    """
    Logout process.
    Returns a login cookie with a negative expire value, so it gets deleted on the client side.
    """
    logoutcookie = Cookie().create_custom("logged", "no", -10)
    return logoutcookie
def enter():
    global Board, scoreLabel, cookie, scoretemp

    Board = ScoreBoard()
    cookie = Cookie()

    # Score label
    label = score.Label("Score: ", 60, 300, 80, 0)
    label.color = (0, 0, 0)
    score.labels.append(label)
    scoreLabel = label

    # Avoid shadowing the built-in str when formatting the final score text.
    score_text = "Score: {:0.0f}".format(scoretemp)
    scoreLabel.text = score_text
def enter():
    global cookie, stage, pet, game_timer, gameinfo, hp_time

    game_timer = get_time()
    hp_time = get_time()

    cookie = Cookie()
    gameinfo = GameInfo()
    stage = Stage()
    pet = Pet()

    game_world.objects = [[], [], [], []]
    game_world.add_object(stage, 0)
    game_world.add_object(cookie, 1)
    game_world.add_object(pet, 2)
    game_world.add_object(gameinfo, 3)
def get(self):
    self.response.headers['Content-Type'] = 'text/cache-manifest'
    self.response.headers['Cache-Control'] = 'max-age=10'
    # Retrieve tracking cookie to find stats for this user
    cookie, mobile, learner = Cookie.parse_maza(self.request.cookies)
    if learner:
        learner.Status = cookie
        learner.put()
    filename = os.path.join(os.path.dirname(__file__), "../html/lesson1.mf")
    with open(filename, "r") as f:
        text = f.read()
    self.response.out.write(text.replace('#', '#'))
def get(self):
    cookie, mobile, learner = Cookie.parse_maza(self.request.cookies)
    if learner:
        self.redirect('/' + mobile)
        return
    user_agent = self.request.user_agent
    if self.is_desktop(user_agent):
        logging.info("Desktop User Agent: %s", user_agent)
    else:
        logging.info("Mobile User Agent: %s", user_agent)
    self.response.headers['Content-Type'] = 'text/html'
    path = os.path.join(os.path.dirname(__file__), "../registration_form.html")
    self.response.out.write(template.render(path, {}))
def get(self, mobile_number):
    learner = Learner.retrieve(Learner, mobile_number)
    if not learner:
        self.redirect('/')  # Redirect to registration.
        return
    jsonData = {
        'learner': {'MobileNumber': learner.MobileNumber},
        'lesson': Lesson1Data.get_data()
    }
    jsonDataStr = simplejson.dumps(jsonData)
    self.response.headers['Content-Type'] = 'text/html'
    self.response.headers['Cache-Control'] = 'max-age=3600'
    self.response.headers['Set-Cookie'] = Cookie.get_maza_cookie_str(learner.Status)
    filename = os.path.join(os.path.dirname(__file__), "../html/lesson1.html")
    with open(filename, "r") as f:
        text = f.read()
    self.response.out.write(text.replace('{{JSONDATA}}', jsonDataStr))
def load(self, filename):
    with open(filename, "r") as fp:
        for line in fp:
            item = line.rstrip()
            self.map.append(list(item))
    self.row = len(self.map)
    self.col = len(self.map[0])
    self.width = self.col * self.GS
    self.height = self.row * self.GS
    for i in range(self.row):
        for j in range(self.col):
            if self.map[i][j] == 'B':
                self.blocks.add(Block((j * self.GS, i * self.GS)))
            elif self.map[i][j] == 'C':
                self.cookies.add(
                    Cookie((j * self.GS + 10, i * self.GS + 10), "./img/cookie.png"))
            elif self.map[i][j] == 'P':
                self.powercookies.add(
                    PowerCookie((j * self.GS + 5, i * self.GS + 5), "./img/powercookie.png"))
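# A minimal sketch of the kind of grid file load() expects: one character per
# tile, 'B' for a block, 'C' for a cookie, 'P' for a power cookie, anything else
# for empty space. The sample layout and the 32-pixel grid size are assumptions
# for illustration, not values taken from the class above.
SAMPLE_MAP = (
    "BBBBBB\n"
    "B C PB\n"
    "BC   B\n"
    "BBBBBB\n"
)

GS = 32  # assumed grid size in pixels
rows = SAMPLE_MAP.strip("\n").split("\n")
# A 'C' at row 1, column 2 would place a cookie sprite at (2 * GS + 10, 1 * GS + 10).
print(len(rows), "rows x", len(rows[0]), "cols ->",
      len(rows[0]) * GS, "x", len(rows) * GS, "pixels")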
from cookie import Cookie
import configparser

if __name__ == '__main__':
    config = configparser.ConfigParser()
    config.read('login.ini')
    username = config['DEFAULT']['username']
    password = config['DEFAULT']['password']

    browser = Cookie()
    browser.login(username, password)
    cookies = browser.get_cookies()

    with open('cookie.txt', 'w') as f:
        f.write(cookies)
class WeSpider(Spider):
    """
    The WeSpider class uses weixin.sogou.com to search for official accounts
    and fetch the first ten articles' information for each official account.
    """
    article_infos = {}
    cookie_pool = Cookie()
    name = 'wespider'

    def start_requests(self):
        """
        Actually, it's better to use __init__ to pass the attributes.
        But I've tried and failed, so I use scrapy settings as a workaround.
        """
        start_point = {
            config.type_acc: [
                "http://weixin.sogou.com/weixin?type=1&ie=utf8&_sug_=n&_sug_type_=&query=",
                "http://weixin.sogou.com/weixin?query="
            ],
            config.type_all: ["http://weixin.sogou.com/weixin?type=2&query="],
            config.type_day: [
                "http://weixin.sogou.com/weixin?type=2&sourceid=inttime_day&tsn=1&query="
            ],
            config.type_week: [
                "http://weixin.sogou.com/weixin?type=2&sourceid=inttime_week&tsn=2&query="
            ],
            config.type_mon: [
                "http://weixin.sogou.com/weixin?type=2&sourceid=inttime_month&tsn=3&query="
            ],
            config.type_year: [
                "http://weixin.sogou.com/weixin?type=2&sourceid=inttime_year&tsn=4&query="
            ]
        }
        account_list = self.settings.get("ACCOUNT_LIST", [])
        search_type = self.settings.get("SEARCH_TYPE", config.type_acc)
        random_urls = start_point[search_type]
        self.start_urls = map(
            lambda x: random_urls[int(random() * len(random_urls))] + x,
            account_list)
        for i, url in enumerate(self.start_urls):
            cookie = self.cookie_pool.fetch_one()
            if search_type == config.type_acc:
                yield Request(url,
                              cookies=cookie,
                              callback=self.parse,
                              meta={'cookiejar': i, 'current_cookie': cookie})
            else:
                yield Request(url,
                              cookies=cookie,
                              callback=self.parse_keyword,
                              meta={'cookiejar': i, 'current_cookie': cookie})

    def parse(self, response):
        """
        Parse the result from the main search page and crawl into each result.
        """
        current_cookie = response.meta['current_cookie']
        logger = logging.getLogger(response.url[-10:])
        logger.debug(str("Current cookie: " + str(current_cookie)))
        if "/antispider/" in response.url:
            cookie = self.cookie_pool.get_banned(current_cookie)
            if cookie:
                logger.debug(str("Got banned. Using new cookie: " + str(cookie)))
                yield Request(response.request.meta['redirect_urls'][0],
                              cookies=cookie,
                              callback=self.parse,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'current_cookie': cookie})
            else:
                yield self.error(
                    "Seems our IP was banned. Caught by WeChat Antispider: {}".format(
                        response.url))
        else:
            if self.no_results(response):
                if config.always_return_in_format:
                    yield self.error_in_format("No article found")
                else:
                    yield self.error(u"No article found")
            else:
                self.cookie_pool.set_return_header(
                    response.headers.getlist('Set-Cookie'), current_cookie)
                yield Request(response.xpath(
                    '//div[@class="results mt7"]/div[contains(@class, "wx-rb")]/@href'
                ).extract_first(), callback=self.parse_account)

    def parse_keyword(self, response):
        current_cookie = response.meta['current_cookie']
        logger = logging.getLogger(response.url[-10:])
        logger.debug(str("Current cookie: " + str(current_cookie)))
        if "/antispider/" in response.url:
            cookie = self.cookie_pool.get_banned(current_cookie)
            if cookie:
                logger.debug(str("Got banned. Using new cookie: " + str(cookie)))
                yield Request(response.request.meta['redirect_urls'][0],
                              cookies=cookie,
                              callback=self.parse,
                              meta={'cookiejar': response.meta['cookiejar'],
                                    'current_cookie': cookie})
            else:
                yield self.error(
                    "Seems our IP was banned. Caught by WeChat Antispider: {}".format(
                        response.url))
        else:
            self.cookie_pool.set_return_header(
                response.headers.getlist('Set-Cookie'), current_cookie)
            articles = response.xpath(
                '//div[@class="results"]/div[contains(@class, "wx-rb")]')
            if self.no_results(response) or not len(articles):
                if config.always_return_in_format:
                    yield self.error_in_format("No article found")
                else:
                    yield self.error("No article found")
            else:
                for i in range(0, len(articles)):
                    url = response.urljoin(
                        articles.xpath('//div/h4/a/@href')[i].extract())
                    cover = hp().unescape(hp().unescape(
                        articles.xpath('//div/a/img/@src')[i].extract())).replace('\\/', '/')
                    date = datetime.fromtimestamp(
                        int(articles.xpath('//div/div/span/script/text()')[i].extract()[22:-2])
                    ).strftime(config.date_format)
                    digest = articles.xpath('//div[@class="txt-box"]/p')[i].extract()
                    self.article_infos[url] = {
                        'cover': cover,
                        'date': date,
                        'digest': digest
                    }
                    yield Request(url, callback=self.parse_article)

    def parse_account(self, response):
        """
        Parse the account page and crawl into each article.

        Note: this account page does not render HTML code from the very beginning.
        It uses JavaScript and a JSON string to render the page dynamically,
        so we use the json module to parse the JSON string.
        """
        m = re.search(r'var msgList = \'(.*)\'', response.body)
        if not m:
            yield self.error("Invalid response {}".format(response.url))
        else:
            articles = json.loads(m.group(1).replace('&quot;', '"'))['list']
            for article in articles:
                appinfo = article['app_msg_ext_info']
                allinfo = [appinfo] + (appinfo[u'multi_app_msg_item_list']
                                       if u'multi_app_msg_item_list' in appinfo else [])
                cominfo = article['comm_msg_info']
                for info in allinfo:
                    # Unescape the HTML tags twice
                    url = "http://mp.weixin.qq.com/s?" + hp().unescape(
                        hp().unescape(info['content_url'][4:]))
                    self.article_infos[url] = {
                        'cover': hp().unescape(hp().unescape(info['cover'])).replace('\\/', '/'),
                        'date': datetime.fromtimestamp(
                            int(cominfo['datetime'])).strftime(config.date_format),
                        'digest': info['digest']
                    }
                    yield Request(url, callback=self.parse_article)

    def parse_article(self, response):
        """
        Finally we've got into the article page.
        Since response.url is generated dynamically, we need to get the
        permanent URL of the article.
        """
        title = response.xpath(
            '//div[@id="page-content"]/div/h2/text()').extract_first(
                default=config.not_found_hint).strip()
        user = response.xpath('//*[@id="post-user"]/text()').extract_first(
            default=config.not_found_hint).strip()
        m = re.search('var msg_link = .*"([^"]*)";', response.body)
        if not m:
            yield self.error("Something wrong with article {}".format(title))
        else:
            params = ['__biz', 'sn', 'mid', 'idx']
            url = hp().unescape(m.group(1))
            html = str.join(
                "\n", response.xpath('//*[@id="js_content"]').extract()).strip()
            info = self.article_infos[response.url]
            yield {
                u'title': unicode(title),
                u'account': unicode(user),
                u'url': unicode(url),
                u'date': unicode(info['date']),
                u'cover': unicode(info['cover']),
                u'digest': unicode(info['digest']),
                u'content': unicode(html)
            }

    def error(self, msg):
        return {
            u"error": unicode(msg),
            u"date": unicode(datetime.now().strftime(config.date_format))
        }

    def no_results(self, response):
        if len(response.xpath("///div[@id='smart_hint_container']")):
            smart_hint = response.xpath(
                "///div[@id='smart_hint_container']/text()").extract_first()
            # The hint text "（不含引号）的搜索结果：" ("search results (quotes excluded):")
            # means the exact query produced no direct match.
            if u'\uff08\u4e0d\u542b\u5f15\u53f7\uff09\u7684\u641c\u7d22\u7ed3\u679c\uff1a' == smart_hint:
                return True
            else:
                return False
        elif len(response.xpath("///div[@class='no-sosuo']")):
            return True
        else:
            return False

    def error_in_format(self, msg):
        date = str(datetime.now().strftime(config.date_format))
        yesterday = str(
            (datetime.now() - timedelta(days=1)).strftime(config.date_format))
        return {
            u'title': unicode("{} at {}".format(msg, date)),
            u'account': unicode(""),
            u'url': unicode("http://localhost/{}".format(date)),
            u'date': unicode(date),
            u'cover': unicode(""),
            u'digest': unicode(""),
            u'content': unicode("{} at {}".format(msg, date))
        }
def make_cookie(self):
    cookie = Cookie()
    cookie.size = 6
    cookie.hasChocolateChips = True
    if self.delegate:
        self.delegate.on_cookie_baked(cookie)
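# A minimal sketch of the delegate side of this callback: any object that
# implements on_cookie_baked(cookie) can be assigned as the delegate. The
# CookieJarDelegate name is an assumption for illustration, not from the code above.
class CookieJarDelegate(object):
    def __init__(self):
        self.baked = []

    def on_cookie_baked(self, cookie):
        # Collect every cookie the baker reports as finished.
        self.baked.append(cookie)
        print("Baked a size-{} cookie".format(cookie.size))

# Usage: baker.delegate = CookieJarDelegate(); baker.make_cookie()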
def create_cookie(self):
    new_cookie = Cookie()
    self.cookie_batch.add(new_cookie)
from queue import PriorityQueue
from queue import Queue
from threading import Lock
from threading import Thread
from urllib.parse import urljoin

from requests import Timeout
import requests
from lxml import etree

from cookie import Cookie
from gallery import *
from picture import Picture

# Initialize the picture gallery
# Gallery()
Cookie()

# Create queue instances used to store tasks
class Spider:
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    # Picture website
    HOST = 'https://anime-pictures.net'
    # Request timeout in seconds
    TIMEOUT = 30
    # Number of concurrent crawl request threads
    REQUEST_THREAD_NUMBER = 5
    # Crawl interval
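# A minimal sketch of the producer/consumer pattern the imports above suggest:
# page URLs go into a Queue and a fixed number of worker threads pull and fetch
# them with the shared headers and timeout. The crawl_worker/run_crawl names are
# illustrative assumptions, not part of the Spider class above.
def crawl_worker(task_queue, results, lock):
    while True:
        url = task_queue.get()
        if url is None:          # Sentinel: no more work for this thread.
            task_queue.task_done()
            break
        try:
            resp = requests.get(url, headers=Spider.HEADERS, timeout=Spider.TIMEOUT)
            with lock:
                results.append((url, resp.status_code))
        except Timeout:
            with lock:
                results.append((url, 'timeout'))
        finally:
            task_queue.task_done()

def run_crawl(urls):
    task_queue, results, lock = Queue(), [], Lock()
    for url in urls:
        task_queue.put(url)
    workers = [Thread(target=crawl_worker, args=(task_queue, results, lock))
               for _ in range(Spider.REQUEST_THREAD_NUMBER)]
    for w in workers:
        w.start()
    for _ in workers:
        task_queue.put(None)     # One sentinel per worker thread.
    task_queue.join()
    return results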