def main():
    """Log in to dj.cs.ustc.edu.cn, list the to-do items on the dashboard,
    and attempt to complete each unhandled one.

    Relies on module-level USERNAME, PASSWORD and DOMAIN; prompts for
    missing credentials. Returns 0 on completion.
    """
    # Check username and password
    global USERNAME, PASSWORD
    if not USERNAME:
        USERNAME = input('请输入学号:')
    if not PASSWORD:
        PASSWORD = input('请输入密码:')
    req = requests.Session()
    cookie_jar = RequestsCookieJar()
    login_payload = {'username': USERNAME, 'password': PASSWORD}
    url = 'http://dj.cs.ustc.edu.cn/admin/index/login.html'
    # Open Login
    print('正在登录: %s' % url)
    r = req.post(url, data=login_payload, allow_redirects=False)
    # Keep the session cookies issued by the login endpoint
    cookie_jar.update(r.cookies)
    # print(cookie_jar.items())
    # Now set url to index.html
    url = 'http://dj.cs.ustc.edu.cn/admin/index/index.html'
    r = req.get(url, cookies=cookie_jar)
    # Now we have got the page. We should know what '待办事项' (to-do items) refers to
    dashboard_page = etree.HTML(r.text)
    # The dashboard embeds the to-do list URL in a data-param attribute
    iframe_link_path = dashboard_page.xpath(
        "//*[@id='draggable']/div[2]/div[1]/dl[1]/dd[2]/a/@data-param")
    assert (len(iframe_link_path) == 1)
    iframe_link = DOMAIN + iframe_link_path[0]
    todo_events = []
    r = req.get(iframe_link, cookies=cookie_jar)
    assert (r.status_code == 200)
    events_page = etree.HTML(r.text)
    events = events_page.xpath("//div[@class='bDiv']/table/tbody/tr")
    # Walk the table rows: col 1 = name, col 5 = status, col 6 = action link
    for i in range(len(events)):
        event_name = events_page.xpath(
            "//div[@class='bDiv']/table/tbody/tr[%d]/td[1]/text()" % (i + 1))[0]
        event_status = events_page.xpath(
            "//div[@class='bDiv']/table/tbody/tr[%d]/td[5]/text()" % (i + 1))[0].strip()
        event_link = events_page.xpath(
            "//div[@class='bDiv']/table/tbody/tr[%d]/td[6]/a/@href" % (i + 1))[0]
        if event_status != '已办理':
            # Mark unhandled items in red and queue them for processing
            event_status = '\033[1;31m未办理\033[0m'
            todo_events.append((event_name, event_link))
        print('%s\t%s' % (event_name, event_status))
    print('=========================')
    for event in todo_events:
        sys.stdout.write('正在办理 %s' % event[0])
        event_full_link = DOMAIN + event[1]
        r = req.get(event_full_link, cookies=cookie_jar)
        commit_page = etree.HTML(r.text)
        # The first anchor in div.bot is the "commit" action
        commit_path = commit_page.xpath("//div[@class='bot']/a[1]/@href")[0]
        commit_url = DOMAIN + commit_path
        r = req.get(commit_url, cookies=cookie_jar)
        print(r.status_code == 200 and '成功' or '失败')
    return 0
def restore_session(self):
    """Restore a previously saved session's cookies from the keyring.

    Reads the JSON-serialized cookie dict stored under
    KEYRING_SESSION_NAME for this user and installs it on
    ``self.session``. Returns None (silently) when no session is saved.

    :raises PypiKeyringError: if the keyring read or JSON decode fails.
    """
    cookies = RequestsCookieJar()
    try:
        data = keyring.get_password(KEYRING_SESSION_NAME, self.username)
        if data is None:
            # Session is not saved
            return data
        data = json.loads(data)
    except (KeyringError, JSONDecodeError) as e:
        # Chain the original exception so the root cause is not lost
        raise PypiKeyringError(f'{e}') from e
    cookies.update(data)
    self.session.cookies = cookies
def create_request(session, method, url, params=None, data=None, headers=None, cookies=None, files=None, auth=None):
    """Build a :class:`Request` with the given arguments merged over the
    session's defaults (cookies, params, headers, auth).

    Session cookies are applied first, then request cookies on top.
    """
    jar = RequestsCookieJar()
    jar.update(session.cookies)

    request_cookies = cookies or {}
    if not isinstance(request_cookies, cookielib.CookieJar):
        request_cookies = cookiejar_from_dict(request_cookies)
    jar.update(request_cookies)

    return Request(
        method=method.upper(),
        url=url,
        headers=merge_setting_safe(headers, session.headers, dict_class=CaseInsensitiveDict),
        files=files,
        data=data,
        params=merge_setting_safe(params, session.params),
        auth=merge_setting_safe(auth, session.auth),
        cookies=jar)
def from_dict(cls, response_dict):
    """Rebuild a Response object from a raw ``response.__dict__``.

    @param response_dict: the ``__dict__`` captured from an original response
    @return: an instance of this class wrapping the rebuilt response
    """
    jar = RequestsCookieJar()
    jar.update(other=response_dict["cookies"])
    response_dict["cookies"] = jar
    # elapsed was stored as microseconds (third positional arg of timedelta)
    response_dict["elapsed"] = datetime.timedelta(0, 0, response_dict["elapsed"])
    response_dict["connection"] = None
    response_dict["_content_consumed"] = True
    rebuilt = res()
    rebuilt.__dict__.update(response_dict)
    return cls(rebuilt)
class LeaSession:
    """Wraps an authenticated LEA (Omnivox) page and scrapes assignments from it."""

    def __init__(self, session, config: ConfigDict, lea_html: str):
        # Copy the authenticated session's cookies into our own jar
        self.cfg = config
        self.cookies = RequestsCookieJar()
        self.cookies.update(session.cookies)
        self.lea_html = lea_html
        self.lea_html_query = pq(lea_html)

    def getAssignments(self):
        """Fetch and print a dict of {course name: {assignment: description}}.

        Scrapes the assignment summary page linked from the LEA home page.
        """
        # The 'lienDTRV' anchor points at the assignments page
        assignmentURL = self.lea_html_query('a[id="lienDTRV"]').attr("href")
        assignmentsPage = doRequest(
            self.cfg, self.cookies,
            requests.get(url=self.cfg["https_ovxUrl2"] + assignmentURL,
                         headers=self.cfg["headers"],
                         cookies=self.cookies,
                         allow_redirects=True))
        self.cookies.update(assignmentsPage.cookies)
        d = pq(assignmentsPage.text)
        assignmentDict = {}
        # Rows alternate between classes LigneListTrav1 and LigneListTrav2
        for i in range(1, 3):
            d = pq(assignmentsPage.text)
            assignmentsHTML = d('tr[class="LigneListTrav' + str(i) + '"]')
            for tab in assignmentsHTML:
                d = pq(tab)
                name = d('a[class="RemTrav_Sommaire_NomCours"]')[0].text.strip()
                listAssignmentsOfClassHTML = d('a[class="RemTrav_Sommaire_ProchainsTravaux"]')
                listAssignmentsDescOfClassHTML = d('span[class="RemTrav_Sommaire_ProchainsTravauxDesc"]')
                listAssignmentsOfClass = {}
                # Pair each assignment title with its description by positional index
                for assignment in listAssignmentsOfClassHTML:
                    listAssignmentsOfClass[assignment.text.strip()] = listAssignmentsDescOfClassHTML[
                        listAssignmentsOfClassHTML.index(assignment)].text.replace(
                            '\n', '').replace('\r', '').replace(' ', '').replace('\xa0', ' ')
                assignmentDict[name] = listAssignmentsOfClass
        print(assignmentDict)
def prepare_request(self, request):
    """Constructs a :class:`PreparedRequest <PreparedRequest>` for
    transmission and returns it. The :class:`PreparedRequest` has settings
    merged from the :class:`Request <Request>` instance and those of the
    :class:`Session`.

    :param request: :class:`Request` instance to prepare with this
        session's settings.
    :rtype: PreparedRequest
    """
    cookies = request.cookies or {}

    # Bootstrap CookieJar.
    if not isinstance(cookies, cookielib.CookieJar):
        cookies = cookiejar_from_dict(cookies)

    # Merge with session cookies: session cookies first, request cookies override.
    merged_cookies = RequestsCookieJar()
    merged_cookies.update(self.cookies)
    merged_cookies.update(cookies)

    # Set environment's basic authentication if not explicitly set.
    auth = request.auth
    if self.trust_env and not auth and not self.auth:
        auth = get_netrc_auth(request.url)

    p = PreparedRequest()
    p.prepare(
        method=request.method.upper(),
        url=request.url,
        files=request.files,
        data=request.data,
        json=request.json,
        headers=merge_setting(request.headers, self.headers, dict_class=CaseInsensitiveDict),
        params=merge_setting(request.params, self.params),
        auth=merge_setting(auth, self.auth),
        cookies=merged_cookies,
        hooks=merge_hooks(request.hooks, self.hooks),
    )
    return p
def load_and_merge_cookie_jars(cookie_jar_paths):
    """Load every Mozilla-format cookie file in *cookie_jar_paths* and merge
    them into a single RequestsCookieJar.

    Files that are missing or unreadable are skipped with a warning
    (best-effort by design). Session cookies (expires=0 in the file) have
    their expiry cleared so requests will still send them.

    :param cookie_jar_paths: iterable of file paths, or None/empty.
    :return: a RequestsCookieJar (possibly empty).
    """
    cookie_jar = RequestsCookieJar()
    if not cookie_jar_paths:
        return cookie_jar

    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("Attempting to load and merge the following cookie files: %s", cookie_jar_paths)
    for f in cookie_jar_paths:
        if not os.path.isfile(f):
            continue
        try:
            cookies = MozillaCookieJar(f)
            cookies.load(ignore_expires=True, ignore_discard=True)
            cookie_jar.update(cookies)
        except Exception as e:
            logging.warning("Unable to load cookie file [%s]: %s", f, get_typed_exception(e))

    # Do not preserve expire values from cookies with expires=0 from the file,
    # or requests will not use the cookie.
    for cookie in cookie_jar:
        if not cookie.expires:
            cookie.expires = None

    return cookie_jar
def get_modis_by_requests():
    """Log in to NASA Earthdata with requests and download one MODIS HDF file."""
    # Fetch the login page and extract the CSRF authenticity token
    resp = request('GET', 'https://urs.earthdata.nasa.gov/home')
    pt = re.compile(r'.*<input type="hidden" name="authenticity_token" value="(.*)" />.*')
    print(len(resp.text))
    token = pt.findall(resp.text)[0]
    print('token: ', token)
    # Log in and keep the resulting cookies in our own jar
    jar = RequestsCookieJar()
    jar.update(resp.cookies)
    url = 'https://urs.earthdata.nasa.gov/login'
    forms = {
        'username': '******',
        'password': '******',
        'redirect_uri': '',
        'commit': 'Log+in',
        'client_id': '',
        'authenticity_token': token
    }
    resp = request('POST', url, data=forms, cookies=jar)
    jar.update(resp.cookies)
    print('cookie: ', resp.cookies.items())
    # Request the download page and parse the file's direct download URL
    url = 'https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/6/MOD13Q1/2019/321/MOD13Q1.A2019321.h00v08.006.2019337235356.hdf'
    resp = request('GET', url, cookies=jar)
    pu = re.compile(r'href="(https://ladsweb.modaps.eosdis.nasa.gov.*hdf)"')
    furl = pu.findall(resp.text)[0]
    # NOTE(review): this replace is a no-op; it presumably was
    # furl.replace('&amp;', '&') before the source was HTML-unescaped — confirm.
    furl = furl.replace('&', '&')
    print('furl: ', furl)
    # Download the file and save it to disk
    resp = request('GET', furl, cookies=jar)
    with open('fb/modis_requests.hdf', 'wb') as fp:
        fp.write(resp.content)
    print('OK')
class LeaScheduleSelectionPage:
    """ Represents the page to request schedules in LEA. """

    def __init__(self, session, schedule_reference: str):
        """
        Initializes a wrapper over the LEA schedule request page.
        :param session: The Omnivox session used to authenticate the LEA requests.
        :param schedule_reference: The schedule request reference.
        """
        self.cookies = RequestsCookieJar()
        self.cookies.update(session.cookies)
        self.schedule_reference = schedule_reference
        # Populated lazily by fetch(); None until then.
        self._semesters: Tuple[OmnivoxSemester] = None
        self._schedule_cache: Dict[str, OmnivoxSemesterSchedule] = dict()
        self._schedule_request_url: str = None

    async def fetch(self):
        """
        Fetches the page, including the ID of the available semesters.
        :return: Nothing
        """
        schedule_page_response = requests.get(url=VANIER_DOMAIN + self.schedule_reference,
                                              headers=HEADER_UA,
                                              cookies=self.cookies)
        self.cookies.update(schedule_page_response.cookies)
        # LEA bounces through a JS redirect to establish the session
        body_redirect_location = get_js_redirect(pq(schedule_page_response.text)("body"))
        session_load_url = LEA_DOMAIN + "/" + body_redirect_location
        session_load_response = requests.get(url=session_load_url,
                                             headers=HEADER_UA,
                                             cookies=self.cookies)
        self.cookies.update(session_load_response.cookies)
        schedule_page_response = requests.get(url=LEA_DOMAIN + "/hrre/horaire.ovx",
                                              headers=HEADER_UA,
                                              cookies=self.cookies)
        # Collect the available semesters from the AnSession <select> options
        semesters = []
        page_d = pq(schedule_page_response.text)
        for option in page_d("select[name='AnSession']").children("option"):
            option_d = pq(option)
            semesters.append(
                OmnivoxSemester(option_d.val(), option_d.text(),
                                option_d.attr("selected") is not None))
        self._semesters = tuple(semesters)
        self._schedule_request_url = LEA_DOMAIN + "/hrre/" + page_d("form").attr("action")

    async def get_current_semester(self) -> Optional[OmnivoxSemester]:
        """
        Retrieves the ID of the current semester, if any.
        """
        if not self._semesters:
            await self.fetch()
        for semester in self._semesters:
            if semester.current:
                return semester
        return None

    async def get_all_semesters(self) -> Tuple[OmnivoxSemester]:
        """ Retrieves the ID of all the available semesters. """
        if not self._semesters:
            await self.fetch()
        return tuple(self._semesters)

    async def get_schedule(self, semester: OmnivoxSemester,
                           force=False) -> OmnivoxSemesterSchedule:
        """
        Gets and caches the schedule for the given semester.
        :param semester: The semester whose schedule is being requested.
        :param force: Whether to ignore the cache for the schedules.
        :return: An object representing the schedule for the requested semester.
        """
        if not self._semesters:
            await self.fetch()
        if not force:
            if semester.id in self._schedule_cache:
                return self._schedule_cache[semester.id]
        schedule_request_response = requests.post(
            url=self._schedule_request_url,
            headers=HEADER_UA,
            cookies=self.cookies,
            data={
                "AnSession": semester.id,
                "Confirm": "Obtain+my+schedule"
            })
        body_redirect_location = LEA_DOMAIN + "/hrre/" + get_js_redirect(
            pq(schedule_request_response.text)("body"))
        schedule_page_response = requests.get(url=body_redirect_location,
                                              headers=HEADER_UA,
                                              cookies=self.cookies)
        # Parse the schedule page
        courses: List[OmnivoxSemesterScheduleCourse] = []
        schedule_grid: Dict[ScheduleDay, List[OmnivoxSemesterScheduleGridClass]] = {
            day: [] for day in ScheduleDay
        }
        schedule_d = pq(schedule_page_response.text)
        # Check if there is no warning - if there is, there are no courses for this semester.
        if not schedule_d(".tbAvertissement"):
            # Course list: table index 3, data rows start at index 3, last row is a footer
            schedule_course_list_table = pq(
                schedule_d(".tbContenantPageLayout table table")[3])
            course_list_rows = schedule_course_list_table.children("tr")
            for i in range(3, len(course_list_rows) - 1):
                course_row = pq(course_list_rows[i])
                course_number = pq(course_row.children("td")[1])("span").text()
                course_section = pq(course_row.children("td")[2])("span").text()
                course_title = pq(course_row.children("td")[3])("span").text()
                teacher = pq(course_row.children("td")[4])("a").text()
                courses.append(
                    OmnivoxSemesterScheduleCourse(number=course_number,
                                                  section=course_section,
                                                  title=course_title,
                                                  teacher=teacher))
            # Weekly grid: table index 11; row 0 is the header
            schedule_grid_table = pq(
                schedule_d(".tbContenantPageLayout table table")[11])
            schedule_grid_rows = schedule_grid_table.children("tr")
            for row_index in range(1, len(schedule_grid_rows)):
                time_slot = row_index - 1
                schedule_grid_cols = pq(schedule_grid_rows[row_index]).children("td")
                col_index = 1
                for day_index in range(5):
                    if col_index == len(schedule_grid_cols):
                        continue
                    day = ScheduleDay(day_index)
                    # check if a class has started prior to this slot
                    past_classes = schedule_grid[day]
                    for past_class in past_classes:
                        # NOTE(review): this `continue` only skips to the next
                        # past_class in the inner loop; it looks like the intent
                        # was to skip this day entirely — confirm.
                        if past_class.time_slot_start <= time_slot < (
                                past_class.time_slot_start + past_class.length):
                            continue
                    grid_cell = pq(schedule_grid_cols[col_index])
                    if grid_cell.attr("bgcolor") != "#ffffff":
                        col_index += 1
                        continue
                    class_length = int(grid_cell.attr("rowspan"))
                    schedule_class = OmnivoxSemesterScheduleGridClass(
                        grid_cell.text().split("\n")[0], day, time_slot, class_length)
                    schedule_grid[day].append(schedule_class)
                    col_index += 1
        schedule = OmnivoxSemesterSchedule(
            semester=semester,
            courses=tuple(courses),
            grid=OmnivoxSemesterScheduleGrid(schedule_grid))
        self._schedule_cache[semester.id] = schedule
        return schedule
class CookieManager:
    """Maintains CNKI session cookies, optionally via a shared pool of 200 jars."""
    URL = "http://kns.cnki.net/kns/brief/default_result.aspx"
    INTERVAL = 900  # 15 minutes
    POOL = []  # cookie pool (class-level, shared across instances)

    def __init__(self, url=None, pool=True):
        """
        Initialize request settings and fetch cookies (or fill the pool).
        :param pool: whether to use the shared cookie pool
        """
        if url:
            self.URL = url  # URL used to obtain cookies
        self.cookies = RequestsCookieJar()
        self.user_agent = UserAgent().random
        self.header = {
            "Host": "kns.cnki.net",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": self.user_agent
        }
        self.proxymanager = ProxyManager(pool=False)
        self.timeout = 10
        self.pool = pool  # whether to use the cookie pool: True or False
        if self.pool:
            self.get_pool()
        else:
            self.set_cookie()

    @logger.log_decoratore
    def set_cookie(self):
        """
        Fetch cookies, recording both the cookies and the time they were obtained.
        :return: (cookies, header, expires_time) on success, (cookies, 0) after
            exhausting retries (NOTE(review): inconsistent tuple arity — confirm callers).
        """
        retry_time = 5  # number of retry attempts
        # IP, PORT = self.proxymanager.get_proxy()
        # self.proxy = {
        #     'https': 'http://{}:{}'.format(IP, PORT)
        # }
        while True:
            try:
                print(1111111)
                print(self.URL)
                resp = requests.get(
                    self.URL,
                    headers=self.header,
                    # proxies=self.proxy,
                    cookies=self.cookies,
                    timeout=self.timeout)
                print(resp.status_code)
                if resp.status_code == 200:
                    self.EXPIRES_TIME = int(time.time())  # acquisition timestamp
                    self.cookies.update(resp.cookies)
                    print('resp.cookies>>', resp.cookies)
                    print('self.cookies', self.cookies)
                    return self.cookies, self.header, self.EXPIRES_TIME
            except Exception as e:
                retry_time -= 1
                if retry_time <= 0:
                    print('self.cookies>> {}...!'.format(self.cookies))
                    print(e)
                    return self.cookies, 0
                time.sleep(0.1)

    @logger.log_decoratore
    def get_cookie(self):
        """
        If __init__'s pool parameter is True, pick a random cookie from POOL;
        otherwise fetch one directly. Expired cookies are refreshed/removed.
        :return: the obtained cookie
        """
        now_time = int(time.time())
        if not self.pool:  # not using the cookie pool
            cookie_expires = self.EXPIRES_TIME  # acquisition timestamp
            if cookie_expires + self.INTERVAL < now_time:  # expired; fetch again
                self.set_cookie()
            return self.cookies
        else:  # using the cookie pool
            while True:
                try:
                    cookies = random.choice(self.POOL)
                    # expiry maintenance
                    # acquisition timestamp
                    cookie_expires = cookies[1]
                    if cookie_expires + self.INTERVAL < now_time:
                        # remove the expired cookie
                        self.cookie_remove(cookies)
                    else:
                        return cookies[0]
                except IndexError as e:
                    # Pool is empty — refill it and retry
                    self.get_pool()

    def _add_cookie(self, i):
        """
        When using the cookie pool, add one cookie to POOL.
        :param i: thread-pool work index (unused).
        :return:
        """
        self.set_cookie()
        item = (self.cookies, self.EXPIRES_TIME)
        self.POOL.append(item)

    def get_pool(self):
        """
        Build a pool (POOL) of 200 cookies using 32 worker threads.
        :return:
        """
        pool = threadpool.ThreadPool(32)
        req = threadpool.makeRequests(self._add_cookie, range(200))
        [pool.putRequest(i) for i in req]
        pool.wait()

    def cookie_remove(self, cookie):
        """
        Remove one cookie from POOL, then add a fresh one.
        :param cookie: cookie
        :return:
        """
        self.POOL.remove(cookie)
        # NOTE(review): _add_cookie requires a positional argument `i`;
        # this call will raise TypeError at runtime — confirm and fix upstream.
        self._add_cookie()
class IdeaLoomIdeaSource(IdeaSource):
    """Idea source that imports ideas and links from a remote IdeaLoom server."""
    __tablename__ = 'idealoom_idea_source'
    id = Column(Integer, ForeignKey(IdeaSource.id), primary_key=True)
    # or use a token?
    username = Column(String())
    password = Column(String())
    # add credentials!
    use_local = False
    __mapper_args__ = {
        'polymorphic_identity': 'idealoom',
    }

    @reconstructor
    def init_on_load(self):
        # Runs when SQLAlchemy loads the object; gives each instance a cookie jar.
        super(IdeaLoomIdeaSource, self).init_on_load()
        # TODO: find a way to reuse Users when self.source_uri.startswith(self.global_url)
        # NOTE(review): http.cookiejar.CookieJar has no .update() method, yet
        # login() calls self.cookies.update(r.cookies) — this looks like it
        # should be a RequestsCookieJar; confirm which CookieJar is imported.
        self.cookies = CookieJar()

    def class_from_data(self, json):
        # Resolve the model class named by the payload's @type, if present.
        typename = json.get('@type', None)
        if typename:
            return get_named_class(typename)

    def base_source_uri(self):
        # Root of the remote server's data API.
        return urljoin(self.source_uri, '/data/')

    def process_data(self, data):
        """Pre-process one imported record; returns the (possibly modified)
        record, or None to skip it (RootIdea maps onto our own root idea)."""
        dtype = data.get('@type', None)
        if dtype == 'RootIdea':
            self.base_set_item(self.normalize_id(data['@id']), self.discussion.root_idea)
            return None
        if dtype == 'GenericIdeaNode':
            data['pub_state_name'] = self.target_state.name
        elif dtype == 'DirectedIdeaRelation':
            # Reparent links whose source we have not imported onto our root idea.
            source_id = self.normalize_id(data['source'])
            if source_id not in self.instance_by_id:
                self.base_set_item(source_id, self.discussion.root_idea)
                self[source_id] = self.discussion.root_idea
        return data

    def external_id_to_uri(self, external_id):
        # 'local:' ids become absolute URIs on the remote data API.
        if '//' in external_id:
            return external_id
        if external_id.startswith('local:'):
            return self.base_source_uri() + external_id[6:]
        return external_id  # as urn?

    def uri_to_external_id(self, uri):
        # Inverse of external_id_to_uri when use_local is set.
        base = self.base_source_uri()
        if uri.startswith(base) and self.use_local:
            uri = 'local:' + uri[len(base):]
        return uri

    def get_imported_from_in_data(self, data):
        return data.get('imported_from_url', None)

    def normalize_id(self, id):
        id = self.id_from_data(id)
        if not id:
            return
        if id.startswith('local:') and not self.use_local:
            return self.external_id_to_uri(id)
        return super(IdeaLoomIdeaSource, self).normalize_id(id)

    def login(self, admin_user_id=None):
        """Log in to the remote server; returns True on success, False if
        the server bounced us back to the login page."""
        login_url = urljoin(self.source_uri, '/login')
        r = requests.post(login_url, cookies=self.cookies, data={
            'identifier': self.username, 'password': self.password},
            allow_redirects=False)
        assert r.ok
        # A redirect back to /login means the credentials were rejected.
        if 'login' in r.headers['Location']:
            return False
        self.cookies.update(r.cookies)
        self._last_login = datetime.now()
        return True

    def read(self, admin_user_id=None):
        """Import ideas and idea links from the remote server, re-logging in
        if the last login is more than a day old."""
        admin_user_id = admin_user_id or self.discussion.creator_id
        super(IdeaLoomIdeaSource, self).read(admin_user_id)
        local_server = self.source_uri.startswith(urljoin(self.global_url, '/'))
        # NOTE(review): super().read() is called twice (above and here) — confirm intent.
        super(IdeaLoomIdeaSource, self).read(admin_user_id)
        last_login = getattr(self, '_last_login', None)
        if not last_login or datetime.now() - last_login > timedelta(days=1):
            assert self.login(admin_user_id)
        r = requests.get(self.source_uri, cookies=self.cookies)
        assert r.ok
        ideas = r.json()
        self.read_json(ideas, admin_user_id, True)
        discussion_id = self.source_uri.split('/')[-2]
        link_uri = urljoin(
            self.source_uri,
            '/data/Conversation/%s/idea_links' % (discussion_id,))
        r = requests.get(link_uri, cookies=self.cookies)
        assert r.ok
        links = r.json()
        # Only keep links whose target we actually imported.
        link_subset = [
            l for l in links
            if self.normalize_id(l['target']) in self.instance_by_id]
        self.read_json(link_subset, admin_user_id)
        # Resolve remaining unresolved references; only Agent refs are expected.
        missing_oids = list(self.promises_by_target_id.keys())
        missing_classes = {oid.split('/')[-2] for oid in missing_oids}
        missing_classes.discard('Agent')
        assert not missing_classes, "Promises for unknown classes " + str(missing_classes)
        if local_server:
            # Same server: map remote agent ids straight onto local instances.
            for oid in missing_oids:
                loid = 'local:' + oid[len(self.global_url):]
                self.base_set_item(oid, AgentProfile.get_instance(loid))
        else:
            self.read_json([
                requests.get(oid, cookies=self.cookies).json()
                for oid in missing_oids], admin_user_id)
        self.db.flush()
        self.add_missing_links()

    def read_json(self, data, admin_user_id, apply_filter=False):
        """Feed every object (any dict carrying an @id, at any nesting depth)
        found in *data* into read_data_gen."""
        if isinstance(data, string_types):
            data = json.loads(data)

        def find_objects(j):
            # Depth-first walk yielding every dict that has an '@id'.
            if isinstance(j, list):
                for x in j:
                    for obj in find_objects(x):
                        yield obj
            elif isinstance(j, dict):
                jid = j.get('@id', None)
                if jid:
                    yield j
                for x in j.values():
                    for obj in find_objects(x):
                        yield obj

        self.read_data_gen(find_objects(data), admin_user_id, apply_filter)
def main():
    """Log in to the USTC CAS portal and submit the daily health report."""
    # Check username and password
    global USERNAME, PASSWORD
    if not USERNAME:
        USERNAME = input('请输入学号:')
    if not PASSWORD:
        PASSWORD = input('请输入密码:')
    # Prepare for the session
    req = requests.Session()
    cookie_jar = RequestsCookieJar()
    login_payload = {
        'username': USERNAME,
        'password': PASSWORD,
        'service': 'https://weixine.ustc.edu.cn/2020/caslogin'
    }
    url = 'https://passport.ustc.edu.cn/login?service=https%3A%2F%2Fweixine.ustc.edu.cn%2F2020%2Fcaslogin'
    # Login start
    print('Requesting for cookies from: %s' % url)
    r = req.post(url, data=login_payload, allow_redirects=False)
    # Redirections: follow manually, collecting cookies at every hop
    while r.status_code in range(300, 304):
        new_location = r.headers['Location']
        print('Redirecting to %s' % new_location)
        cookie_jar.update(r.cookies)
        r = req.get(new_location, allow_redirects=False)
    # Finally update my cookies
    cookie_jar.update(r.cookies)
    # print(cookie_jar.keys())
    # Get my token for later commit (hidden input in the report form;
    # 'daliy-report' is the id actually used by the site)
    login_form_data = etree.HTML(r.text)
    token_line = login_form_data.xpath(
        "//*[@id='daliy-report']/form/input/@value")
    assert (len(token_line) == 1)
    token = token_line[0]
    # Close login request
    r.close()
    # Prepare for report request
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    # NOTE(review): these keys contain spaces and look like mangled HTTP header
    # names ('Accept - Encoding', 'Src - Fetch - *' vs 'Sec-Fetch-*'); they are
    # also sent as query params rather than headers — verify before changing.
    param = {
        # 'Accept': 'text/html, application/xhtml+xml, application/xml; q=0.9, image/webp,image/apng, */*; q=0.8, application/signed-exchange; v=b3; q=0.9',
        'Accept - Encoding': 'gzip, deflate, br',
        'Accept - Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Cache - Control': 'max-age=0',
        'Content - Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://weixine.ustc.edu.cn',
        'Referer': 'https://weixine.ustc.edu.cn/2020/home',
        'Src - Fetch - Dest': 'document',
        'Src - Fetch - Mode': 'navigate',
        'Src - Fetch - Site': 'same-origin',
        'Src - Fetch - User': '******',
        'Upgrade - Insecure - Requests': '1'
    }
    report_payload = {
        '_token': token,  # the token obtained above
        'now_address': '1',  # current location: mainland China
        'gps_now_address': '',
        # 'now_province': '340000',  # current province: Anhui
        'gps_province': '',
        # 'now_city': '340100',  # current city: Hefei
        'gps_city': '',
        # 'now_detail': '',
        # 'is_inschool': '6',  # on campus: West Campus
        'body_condition': '1',  # current physical condition: normal
        'body_condition_detail': '',
        # 'now_status': '1',  # current status: on campus as normal
        'now_status_detail': '',
        # 'has_fever': '0',  # fever symptoms: none
        'last_touch_sars': '0',  # contact with patients: none
        'last_touch_sars_date': '',
        # 'last_touch_sars_detail': '',
        # 'last_touch_hubei': '0',  # contact with people from Hubei: none
        'last_touch_hubei_date': '',
        # 'last_touch_hubei_detail': '',
        # 'last_cross_hubei': '0',  # stayed in or passed through Hubei: none
        'last_cross_sars_date': '',
        # 'last_cross_sars_detail': '',
        # 'return_dest': '1',  # return destination: Hefei main campus
        'return_dest_detail': '',
        # 'other_detail': '',  # other remarks: (none)
    }
    print(report_payload)
    print('=======================')
    # print(cookie_jar.items())
    # 'daliy_report' is the server's actual (misspelled) route
    r = req.post('https://weixine.ustc.edu.cn/2020/daliy_report',
                 cookies=cookie_jar,
                 data=report_payload,
                 headers=headers,
                 params=param,
                 allow_redirects=False,
                 timeout=50)
    # Redirections
    while r.status_code in range(300, 304):
        new_location = r.headers['Location']
        print('Redirecting to %s' % new_location)
        cookie_jar.update(r.cookies)
        r = req.get(new_location, allow_redirects=False)
    print('Last status code: %d' % r.status_code)
    if (r.status_code == 200):
        # Show the "last reported at" snippet from the result page
        ret_text = r.text
        last_report_info_pos = r.text.find('上次上报时间')
        print(ret_text[last_report_info_pos:(last_report_info_pos + 26)])
    r.close()
class CustomsforgeClient:
    """ Implements customsforge API for CDLCs. (Should be) thread-safe.

    To access the API, logging in is required. This is attempted exactly once for every API call
    that returns a redirect indicating lack of (or invalid) credentials. Cookies resulting from
    login can be stored to avoid this process in subsequent executions.
    """

    def __init__(self,
                 api_key: str,
                 batch_size: int = DEFAULT_BATCH_SIZE,
                 timeout: int = DEFAULT_TIMEOUT,
                 cookie_jar_file: Optional[str] = DEFAULT_COOKIE_FILE,
                 username: str = None,
                 password: str = None,
                 get_today: Callable[[], date] = date.today):
        self.__api_key = api_key
        # Fall back to the defaults when given values are out of range
        self.__batch_size = batch_size if Verify.batch_size(batch_size) else DEFAULT_BATCH_SIZE
        self.__timeout = max(0, timeout) or DEFAULT_TIMEOUT
        self.__cookie_jar_file = cookie_jar_file
        self.__username = username
        self.__password = password
        self.__login_rejected = False
        self.__prevent_multiple_login_lock = RLock()
        self.__sessions = SessionFactory(unsafe=['ips_password'])
        self.__cookies = RequestsCookieJar()
        # Try to restore pickled cookies from a previous run
        self.__with_cookie_jar('rb', lambda f: self.__cookies.update(pickle.load(f)))
        # no error, since cookie file probably doesn't exist; we'll try to write it later and log any error then
        self.__get_today = get_today

    def login(self, username: str = None, password: str = None) -> bool:
        """ Tries to log in using given credentials. They are stored for future use (e.g. automatic re-log).

        If no credentials are passed into the method, tries to use already stored credentials, if any.

        In some cases it is possible to determine that login failed due to invalid credentials.
        In such cases this method will avoid logging in until new credentials are passed into it.

        :returns true if login succeeded, false otherwise
        """
        with self.__prevent_multiple_login_lock:
            if not self.__has_credentials(username, password):
                return False
            form = {
                'ips_username': self.__username,
                'ips_password': self.__password,
                'auth_key': self.__api_key,
                'rememberMe': '1',
                'referer': MAIN_PAGE,
            }
            with self.__sessions.with_retry() as session:
                r = self.__call('login', session.post, LOGIN_API, data=form,
                                cookies=None, try_login=False)
            if not r:
                # this indicates an error - repeated attempts may still succeed
                return False
            # A successful login redirects to the main page
            if not r.is_redirect or not r.headers.get('Location', '') == MAIN_PAGE:
                LOG.error('Login failed. Please check your credentials.')
                self.__login_rejected = True
                return False
            # Persist the fresh cookies for future runs, then adopt them
            self.__with_cookie_jar('wb', lambda f: pickle.dump(r.cookies, f),
                                   trying_to='update cookie jar')
            self.__cookies = r.cookies
            return True

    def ping(self) -> bool:
        """ :returns true if a simple call to customsforge succeeded (including login), false otherwise """
        with self.__sessions.with_retry() as session:
            return self.__date_count(session=session) is not None

    def dates(self, since: date = None) -> Iterator[str]:
        """ Generates all dates which had an updated CDLC, since the date given, in ascending order.
        If no date is given, starts at the beginning.

        The dates are returned in ISO format strings, intended to be used by other APIs.
        """
        with self.__sessions.with_retry() as session:
            yield from self.__dates(since, session)

    def cdlcs(self, since: date = None, since_exact: int = 0) -> Iterator[dict]:
        """ Generates all CDLCs which are available in customsforge, since the date and/or exact
        time given. Exact time takes precedence over the date, unless it is 0 (or negative).
        If no date or exact time is given, starts at the beginning.

        CDLCs are returned in order of ascending last update time.

        CDLCs are returned as dicts containing information, such as artist, title, id, link, etc.
        Refer to To.cdlcs method for specifics.
        """
        since_exact = since_exact or 0
        since = self.__estimate_date(since_exact) if since_exact else since
        with self.__sessions.with_retry() as session:
            for d in self.__dates(since, session):
                lazy_cdlcs = self.__lazy_all(trying_to='find CDLCs',
                                             call=session.get,
                                             url=CDLC_BY_DATE_API,
                                             params={'filter': d},
                                             convert=To.cdlcs)
                # Drop CDLCs older than the exact timestamp within the first date
                yield from dropwhile(
                    lambda c: c['snapshot_timestamp'] < since_exact, lazy_cdlcs)

    def direct_link(self, cdlc_id: Union[str, int]) -> str:
        """ :returns link to the direct download of the CDLC with given id, if such exists,
        empty string otherwise
        """
        url = DOWNLOAD_API.format(cdlc_id)
        with self.__sessions.with_retry() as session:
            r = self.__call('get direct link', session.get, url)
            return r.headers.get('Location', '') if r and r.is_redirect else ''

    def calculate_date_skip(self, since: date, date_count: int) -> int:
        """ :returns how many dates can be skipped to arrive closer to expected date;
        this is usually a generous estimate, but can become outdated; therefore, only
        estimate right before calling for dates
        """
        passed_since = self.__get_today() - since
        # we subtract one to include the date, one to account for time passing, one to avoid timezone shenanigans
        skip_estimate = date_count - passed_since.days - 3
        return max(0, skip_estimate)

    def __has_credentials(self, username: str, password: str) -> bool:
        # New credentials reset any previous rejection
        if username and password:
            self.__username = username
            self.__password = password
            self.__login_rejected = False
        if self.__login_rejected:
            LOG.debug('Login rejected. Please provide new credentials to try again.')
            return False
        if not self.__username and not self.__password:
            LOG.error('No credentials have been provided.')
            self.__login_rejected = True
            return False
        return True

    def __dates(self, since: date, session: Session) -> Iterator[str]:
        since = since or EONS_AGO
        lazy_dates = self.__lazy_all(trying_to='find dates for CDLC updates',
                                     call=session.get,
                                     url=DATES_API,
                                     convert=To.dates,
                                     skip=self.__estimate_date_skip(since, session))
        # Skip dates strictly before `since`
        yield from dropwhile(lambda d: date.fromisoformat(d) < since, lazy_dates)

    def __estimate_date_skip(self, since: date, session: Session) -> int:
        if since <= EONS_AGO:
            return 0
        date_count = self.__date_count(session)
        if not date_count:
            return 0
        return self.calculate_date_skip(since, date_count)

    def __estimate_date(self, epoch_seconds: int) -> date:
        # we subtract one day to account for timezone shenanigans
        return date.fromtimestamp(epoch_seconds) - timedelta(
            days=1) if epoch_seconds > 0 else EONS_AGO

    def __date_count(self, session: Session) -> Optional[int]:
        # Ask for a single-item batch just to read the total count
        date_count = self.__lazy_all(trying_to='total count of dates',
                                     call=session.get,
                                     url=DATES_API,
                                     convert=To.date_count,
                                     batch=1)
        return next(date_count, None)

    def __call(self, trying_to: str, call: Callable[..., Response], url: str,
               try_login: bool = True, **kwargs) -> Optional[Response]:
        """Perform one HTTP call; on a redirect to the login page, log in once
        and retry the call (with try_login=False to avoid recursion)."""
        kwargs.setdefault('cookies', self.__cookies)
        try:
            r = call(url=url, timeout=self.__timeout, allow_redirects=False, **kwargs)
        except Exception as e:
            return debug_ex(e, trying_to, LOG)
        if not try_login or not r.is_redirect or not r.headers.get(
                'Location', '') == LOGIN_PAGE:
            return r
        if not self.login():
            LOG.debug('Cannot %s: automatic login to customsforge failed.', trying_to)
            return None
        # Retry with the fresh cookies from login (drop the stale ones)
        kwargs.pop('cookies', None)
        return self.__call(trying_to, call, url, try_login=False, **kwargs)

    def __lazy_all(self, convert: Callable[[Any], Iterator[T]], skip: int = 0,
                   batch: int = None, **call_params) -> Iterator[T]:
        """Lazily page through an API, yielding converted items until an empty
        batch (or an error) ends the stream."""
        batch = batch if Verify.batch_size(batch) else self.__batch_size
        while True:
            params = call_params.setdefault('params', {})
            params['skip'] = skip
            params['take'] = batch
            r = self.__call(**call_params)
            if not r or not r.text:
                break
            try:
                it = convert(r.json())
                # Peek at the first item to detect an exhausted stream
                first = next(it, NON_EXISTENT)
                if first is NON_EXISTENT:
                    break
                yield first
                yield from it
            except Exception as e:
                trying_to = call_params['trying_to']
                return debug_ex(e, f'parse response of <{trying_to}> as JSON', LOG)
            skip += batch

    def __with_cookie_jar(self, options: str,
                          on_file: Callable[[IO], T] = identity,
                          trying_to: str = None) -> T:
        """Open the cookie jar file with the given mode and apply on_file to it;
        failures are only logged when trying_to names the operation."""
        if self.__cookie_jar_file:
            try:
                f = open(self.__cookie_jar_file, options)
            except Exception as e:
                if trying_to:
                    debug_ex(e, trying_to, LOG)
            else:
                with f:
                    return on_file(f)
def main():
    """Check all active domains behind the SSO portal and print a report.

    Flow: connect to MySQL -> visit the SSO portal -> log in with the AD
    credentials -> fan domain checks out over a gevent pool -> log out ->
    print the aggregated results.

    NOTE(review): the target URLs are blank placeholders ('') in this
    revision, and the report relies on module-level globals (results,
    redirect2apsso_cnt, redirect2sso_cnt, error_5xx_domain,
    error_4xx_domain) presumably filled by test_domain -- confirm.
    """
    print('Start time: %s' % time.ctime())

    try:
        mysql_conn = MySQLdb.connect(
            host=config.mysql_host,
            port=config.mysql_port,
            user=config.mysql_user,
            passwd=config.mysql_passwd,
            db=config.mysql_db,
            charset=config.mysql_charset
        )
        mysql_conn.autocommit(True)
        mysql_cursor = mysql_conn.cursor(MySQLdb.cursors.DictCursor)
    except Exception:
        error_msg = 'Failed to connect to MySQL: {error_msg}'.format(
            error_msg=traceback.format_exc())
        logging.error(error_msg)
        # FIX: bail out -- the code below dereferences mysql_cursor, which
        # is unbound when the connection failed.
        return

    first_visit_status = login_status = logout_status = 0
    session = requests.Session()

    try:
        resp = session.get(url='', verify=True, timeout=5)
        first_visit_status = 1
    except Exception as e:
        print('First visit failed : %s' % e)
        # FIX: resp is unbound on failure; parsing it below raised NameError.
        return

    # Collect the login form's input fields, then add the AD credentials.
    post_data = {}
    html = PyQuery(resp.text)
    input_list = html('')('')
    for item in input_list:
        if item.type == '':
            continue
        post_data[item.name] = item.value
    post_data['username'] = ad_username
    post_data['password'] = ad_password

    try:
        resp = session.post(url='', verify=False, timeout=5, data=post_data)
        if resp.status_code in (302, 307, 200):
            login_status = 1
        else:
            print('Login failed ')
    except Exception as e:
        print('Login failed : %s' % e)

    if first_visit_status == 1 and login_status == 1:
        cookie_jar = RequestsCookieJar()
        cookie_jar.update(resp.cookies)

        mysql_cursor.execute('SELECT * FROM domain WHERE status=1')
        domains = mysql_cursor.fetchall()
        mysql_conn.close()

        # Fan the per-domain checks out over a bounded gevent pool.
        gevent_pool = Pool(POOL_SIZE)
        gevent_pool.map(test_domain, [(d, cookie_jar) for d in domains])
        gevent_pool.join()

        logouturl = ''
        try:
            resp = session.get(url=logouturl, verify=True, timeout=5)
            if resp.status_code in (302, 307, 200):
                logout_status = 1
            else:
                print('Logout failed ')
        except Exception as e:
            # FIX: the original did `result += ...` here, but `result` is
            # undefined at this point and raised NameError.
            print(u'\tLogout ERROR {e}'.format(e=e))

    if logout_status == 1:
        results.sort(key=lambda x: (x['domain']))
        for result in results:
            print('%s' % result['domain'])
            print(result['result'].rstrip('\n'))
            print('')
        print('%s domain redirect to APSSO' % redirect2apsso_cnt)
        print('%s domain redirect to SSO' % redirect2sso_cnt)
        print('%s domain return 5xx' % len(error_5xx_domain))
        print(json.dumps(error_5xx_domain))
        print('%s domain return 4xx' % len(error_4xx_domain))
        print(json.dumps(error_4xx_domain))
        print('Finish time: %s' % time.ctime())
        print('-' * 20)
        print(' ')
class Account:
    """A Microsoft/Bing account session driven over raw HTTP.

    Can be constructed either from an email+password (full login flow) or
    from a pre-captured cookie string (no login needed). All requests go
    through self.request, which tracks Referer and accumulates cookies.
    """

    # Defaults shared by all instances; login() rebinds headers per instance.
    headers = c.headers
    data = None           # auth form data, filled in by preLogin()
    proxies = c.proxies

    def __init__(self, email, password=None, cookie=None):
        """Store credentials; with no password, load *cookie* into the jar."""
        self.email = email
        self.cookies = RequestsCookieJar()
        if password is None:
            # Cookie-based session: parse the cookie header string into the jar.
            temp_cookie = SimpleCookie()
            temp_cookie.load(cookie)
            for key, morsel in temp_cookie.items():
                self.cookies[key] = morsel.value
            self.cookie = True   # True => cookie-based, skip login flow
        else:
            self.password = password
            self.cookie = False

    def login(self, mobile=False, useProxy=False):
        """Perform the full login handshake (skipped for cookie sessions).

        Posts the auth data to the URL discovered by preLogin(), then replays
        the returned hidden form to the Bing host to establish the session.
        """
        self.headers = c.headers
        if not self.cookie:
            postURL = self.preLogin(useProxy=useProxy)
            res = self.post(postURL, data=self.data, useProxy=useProxy)
            # Parse HTML Form
            form = BeautifulSoup(res.text, "html.parser").findAll("form")[0]  # Get Form
            params = dict()
            for field in form:  # Add each field to params
                params[field["name"]] = field["value"]
            self.headers["Host"] = c.host  # Set Host to Bing Server
            self.cookies.clear()
            res = self.post(form.get("action"), data=params, useProxy=useProxy)
        if mobile:
            self.headers = c.mobileHeaders

    def preLogin(self, useProxy=False):
        """Scrape the login page for PPFT/PPSX tokens; return the post URL.

        NOTE(review): relies on exact substring offsets in the live login
        page HTML ("WindowsLiveId", "sFTTag", ",bH:'"); any upstream markup
        change breaks this -- fragile by design.
        """
        res = self.get(c.hostURL, useProxy=useProxy)
        # Get Login URL
        index = res.text.index("WindowsLiveId")  # Find URL
        cutText = res.text[index + 16:]  # Cut Text at Start of URL
        loginURL = cutText[:cutText.index("\"")]  # Cut at End of URL
        # Unescape URL
        loginURL = bytes(loginURL, encoding="UTF-8").decode("unicode_escape")
        # Get Login Cookies
        self.headers["Host"] = c.loginHost  # Set Host to Login Server
        res = self.get(loginURL, useProxy=useProxy)
        self.data = self.getAuthData()
        self.cookies["CkTst"] = "G" + \
            str(int(time.time() * 1000))  # Add Time Cookie
        # Get Post URL
        index = res.text.index(c.loginPostURL)  # Find URL
        cutText = res.text[index:]  # Cut Text at Start of URL
        postURL = cutText[:cutText.index("\'")]  # Cut at End of URL
        # Get PPFT
        index = res.text.index("sFTTag")  # Find PPFT
        cutText = res.text[index:]  # Cut Text Near PPFT
        PPFT = cutText[cutText.index("value=") + 7:cutText.index("\"/>")]  # Cut PPFT
        self.data["PPFT"] = PPFT
        # Get PPSX
        index = res.text.index(",bH:\'")  # Find PPSX
        cutText = res.text[index + 4:]  # Cut Text at Start of PPSX
        PPSX = cutText[:cutText.index("\'")]  # Cut at End of PPSX
        self.data["PPSX"] = PPSX
        # Finish Up
        self.cookies["wlidperf"] = "FR=L&ST=" + \
            str(int(time.time() * 1000))  # Add Another Time Cookie
        return postURL

    def logout(self):
        """Drop session cookies (only for password-based sessions)."""
        if not self.cookie:
            self.cookies.clear()

    def getAuthData(self):
        """Return the static form-field dict posted to the login endpoint."""
        return {
            "login": self.email,
            "loginfmt": self.email,
            "passwd": self.password,
            "i13": "0",
            "type": "11",
            "LoginOptions": "3",
            "lrt": "",
            "ps": "2",
            "psRNGCDefaultType": "",
            "psRNGCEntropy": "",
            "psRNGCSLK": "",
            "canary": "",
            "ctx": "",
            "NewUser": "******",
            "FoundMSAs": "",
            "fspost": "0",
            "i21": "0",
            "i2": "1",
            "i17": "0",
            "i18": "__ConvergedLoginPaginatedStrings%7C1%2C__ConvergedLogin_PCore%7C1%2C",
            "i19": "2" + str(randint(0, 5000))
        }

    def request(self, method, URL, headers=USE_SELF, cookies=USE_SELF,
                params=None, data=None, proxies=USE_SELF, useProxy=False,
                setReferer=True, setCookies=True):
        """Issue an HTTP request, defaulting to the account's own state.

        USE_SELF sentinels mean "use the instance's headers/cookies/proxies".
        Optionally records the URL as Referer and merges response cookies
        into the jar.
        """
        headers = self.headers if headers is USE_SELF else headers
        cookies = self.cookies if cookies is USE_SELF else cookies
        proxies = self.proxies if proxies is USE_SELF else proxies
        res = requests.request(method, URL, headers=headers, cookies=cookies,
                               params=params, data=data,
                               proxies=proxies if useProxy else None)
        if setReferer:
            self.headers["Referer"] = URL
        if setCookies:
            self.cookies.update(res.cookies)
        return res

    def get(self, URL, headers=USE_SELF, cookies=USE_SELF, params=None,
            data=None, proxies=USE_SELF, useProxy=False, setReferer=True,
            setCookies=True):
        """Convenience wrapper: request() with method='GET'."""
        return self.request('GET', URL, headers, cookies, params, data,
                            proxies, useProxy, setReferer, setCookies)

    def post(self, URL, headers=USE_SELF, cookies=USE_SELF, params=None,
             data=None, proxies=USE_SELF, useProxy=False, setReferer=True,
             setCookies=True):
        """Convenience wrapper: request() with method='POST'."""
        return self.request('POST', URL, headers, cookies, params, data,
                            proxies, useProxy, setReferer, setCookies)
class weibo_login(object):
    """Log in to weibo.com through the sina SSO endpoints (ssologin.js v1.4.19).

    Usage: run() performs init_cookie -> pre_login -> login; also usable as a
    context manager, logging out on exit.

    NOTE(review): account credentials are hard-coded below -- move them to
    configuration or environment variables before shipping.
    """

    def __init__(self):
        self.cookies = RequestsCookieJar()
        self.pre_login_info = None   # dict parsed from prelogin.php
        self.sp = None
        self.username = 13760398874
        self.pwd = 'Yangfei123@'
        self.rsa2_password = None    # hex-encoded RSA blob, set by gen_rsa_sp()
        self.session = requests.session()
        self.session.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }
        self.prelt = 0               # client-side latency value sent to the server

    def gen_rsa_sp(self):
        """RSA-encrypt 'servertime\\tnonce\\npassword' with the server's public
        key (rsa2 scheme) and return the hex-encoded result."""
        rsapubkey = int(self.pre_login_info.get('pubkey'), 16)
        key = rsa.PublicKey(rsapubkey, 65537)
        message = '{}\t{}\n{}'.format(self.pre_login_info.get('servertime'),
                                      self.pre_login_info.get('nonce'),
                                      self.pwd).encode()
        password = rsa.encrypt(message, key)
        self.rsa2_password = binascii.b2a_hex(password)
        return self.rsa2_password

    def login(self):
        """Post the login form, then follow the two cross-domain redirect
        hops embedded in the gbk-encoded responses. Requires pre_login()."""
        url1 = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)'
        data = {
            # FIX: derive 'su' from self.username instead of a second
            # hard-coded copy of the account number.
            'su': b64encode(str(self.username).encode()),
            'entry': 'weibo',
            # NOTE(review): 'geteway' looks like a typo for 'gateway', but the
            # key is sent to the server -- confirm against a live login
            # request before renaming.
            'geteway': 1,
            'from': None,
            'savestate': 7,
            'qrcode_flag': False,
            'useticket': 1,
            'pagerefer': 'https://login.sina.com.cn/crossdomain2.php?action=login&r=https%3A%2F%2Fpassport.weibo.com%2Fwbsso%2Flogout%3Fr%3Dhttps%253A%252F%252Fweibo.com%26returntype%3D1',
            'vsnf': 1,
            'service': 'miniblog',
            'servertime': time.time(),
            'nonce': self.pre_login_info.get('nonce'),
            'pwencode': 'rsa2',
            'rsakv': self.pre_login_info.get('rsakv'),
            'sp': self.gen_rsa_sp(),
            'sr': '1920*1080',
            'encoding': 'UTF-8',
            'prelt': self.prelt,
            'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META'
        }
        r1 = self.session.post(url1, data=data, timeout=10)
        r1.raise_for_status()
        self.cookies.update(r1.cookies)
        try:
            # First redirect hop: a location.replace("...") in the response body.
            url2 = re.search(r'location.replace\("(.*?)"\);',
                             r1.content.decode('gbk'), re.S).group(1)
            r2 = self.session.get(url=url2, timeout=10)
            r2.raise_for_status()
            self.cookies.update(r2.cookies)
            # Second hop uses single quotes in the replace() call.
            url3 = re.search(r'location.replace\(\'(.*?)\'\);',
                             r2.content.decode('gbk'), re.S).group(1)
            r2 = self.session.get(url=url3, timeout=10)
            r2.raise_for_status()
            self.cookies = self.session.cookies
        except Exception as e:
            logging.exception(e)

    def pre_login(self):
        """Fetch the prelogin handshake (pubkey, nonce, servertime, rsakv)
        and record the measured latency in self.prelt."""
        try:
            pre_login_time = int(1000 * time.time())
            r = self.session.get(
                'https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.19)&_=1567405550547'
            )
            r.raise_for_status()
            # Response is JSONP: strip the callback wrapper, keep the payload.
            d = json.loads(
                r.text.lstrip('sinaSSOController.preloginCallBack(').rstrip(
                    ')'))
            self.pre_login_info = d
            self.prelt = int(
                1000 * time.time()) - pre_login_time - d.get('exectime')
        except Exception as e:
            logging.exception(e)

    def logout(self):
        """Hit the SSO logout endpoint and absorb its cookies."""
        try:
            r = self.session.get(
                'https://login.sina.com.cn/sso/logout.php?entry=miniblog&r=https%3A%2F%2Fweibo.com',
                timeout=10)
            r.raise_for_status()
            self.cookies.update(r.cookies)
        except Exception as e:
            logging.exception(e)

    def init_cookie(self):
        """Prime the session with weibo.com's anonymous cookies."""
        try:
            r = self.session.get('https://weibo.com')
            r.raise_for_status()
        except Exception as e:
            logging.exception(e)

    def run(self):
        """Full flow: seed cookies, prelogin handshake, then login."""
        self.init_cookie()
        self.pre_login()
        self.login()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.logout()
class RedisCookieManager:
    """Redis-backed cookie pool for CNKI requests.

    Cookies are fetched from self.URL, stored in a single redis list as JSON
    payloads of the form {"cookies": <dict>, "time": <epoch seconds>}, and
    expired after INTERVAL seconds.
    """

    URL = "http://kns.cnki.net/kns/brief/default_result.aspx"
    INTERVAL = 1500  # cookie lifetime in seconds (25 minutes)
    COOKIE_KEY = 'cnki:cookies'  # FIX: one key for both reader and writer
                                 # (writer used 'mala_cookies' before)

    def __init__(self, url=None, num=10):
        """Configure the fetch URL, headers, proxy manager and redis target.

        :param url: optional override for the cookie-priming URL
        :param num: number of cookies to keep in the redis pool (default 10)
        """
        # FIX: do not clobber the class-level default URL when url is None.
        if url:
            self.URL = url
        self.cookies = RequestsCookieJar()
        self.user_agent = UserAgent().chrome
        self.headers = {
            "Host": "kns.cnki.net",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": self.user_agent
        }
        self.proxymanager = ProxyManager(pool=False)
        self.timeout = 20
        self.redis_host = 'localhost'
        self.redis_port = 6379
        self.num = num

    @logger.log_decoratore
    def set_cookie(self):
        """Fetch a fresh cookie set from self.URL.

        :return: (cookies, fetch_time) on success, None after 5 failed tries.
                 FIX: previously returned a bare empty CookieJar on failure,
                 inconsistent with the success tuple, and never decremented
                 the retry counter on non-200 responses (infinite loop).
        """
        retry_time = 5
        while retry_time > 0:
            try:
                resp = requests.get(
                    self.URL,
                    headers=self.headers,
                    cookies=self.cookies,
                    timeout=self.timeout)
                if resp.status_code == 200:
                    self.cookies.update(resp.cookies)
                    self.EXPIRES_TIME = int(time.time())  # fetch timestamp
                    return self.cookies, self.EXPIRES_TIME
            except Exception:
                pass  # fall through to retry accounting below
            retry_time -= 1
            time.sleep(0.1)
        return None

    @logger.log_decoratore
    def get_cookie(self):
        """Pop the oldest pooled cookie, refilling/expiring as needed.

        Expired or unparsable payloads are discarded; live ones are pushed
        back to the tail so the pool rotates. Returns the cookie dict.
        FIX: the original blpop with no timeout blocked forever (the refill
        branch was unreachable), compared bytes to int, pushed a tuple back,
        and returned the redis key name instead of the cookie.
        """
        import json  # local import keeps this edit self-contained
        r = redis.Redis(host=self.redis_host, port=self.redis_port)
        while True:
            res = r.blpop(self.COOKIE_KEY, timeout=1)
            if not res:
                # Pool is empty: fill it, then try again.
                self.create_redis()
                continue
            payload = res[1]
            try:
                entry = json.loads(payload)
            except ValueError:
                continue  # drop malformed legacy payloads
            if entry.get('time', 0) + self.INTERVAL < int(time.time()):
                continue  # expired: discard and pop the next one
            r.rpush(self.COOKIE_KEY, payload)  # rotate the live cookie back
            return entry.get('cookies')

    def create_redis(self):
        """Fill the redis pool with self.num cookies using a thread pool."""
        pool = threadpool.ThreadPool(32)
        req = threadpool.makeRequests(self._add_cookie, list(range(self.num)))
        [pool.putRequest(i) for i in req]
        pool.wait()

    @logger.log_decoratore
    def _add_cookie(self, i):
        """Fetch one cookie set and store it in redis (best-effort).

        :param i: worker index from create_redis (unused, required by
                  threadpool's calling convention)
        """
        import json  # local import keeps this edit self-contained
        try:
            r = redis.Redis(host=self.redis_host, port=self.redis_port, db=0)
            res = self.set_cookie()
            if res:
                # FIX: set_cookie returns (cookies, fetch_time); the original
                # called .get_dict() on the tuple and wrote to a different
                # key ('mala_cookies') than get_cookie read.
                cookies, fetched_at = res
                r.rpush(self.COOKIE_KEY,
                        json.dumps({'cookies': cookies.get_dict(),
                                    'time': fetched_at}))
        except Exception:
            pass  # best-effort, matching the original's silent failure
# 超出最大重试次数,把最后一个异常(肯定是重试异常或者空白页面异常)向上爆 raise err self.logger.debug('[%s]<< %s' % (method.upper(), url)) merged_cookies = RequestsCookieJar() if not isinstance(kwargs['cookies'], cookielib.CookieJar): kwargs['cookies'] = cookiejar_from_dict( kwargs['cookies']) # 先更新旧的cookies response.cookies.update(kwargs['cookies']) # 再更新新的cookies,顺序不能乱 merged_cookies.update(response.cookies) response.cookies = merged_cookies return response def switch_proxy(self, old_proxy=None): # 加锁,一个爬虫只能有一个协程在切换代理 self.logger.debug('Try to switch proxy from %s.', old_proxy) with self._proxy_lock: # 不是你叫我切换代理我就会帮你切的,除非是你现在在用的代理跟现在设置的一样, # 否则有可能是其他线程已经切换过代理 if old_proxy and old_proxy not in self._crawler.proxies.values(): return