def login(): retry_times = 0 while True: try: idx = random.randint(0, len(login_users) - 1) login_user = login_users[idx] logger.info(login_user) flag = -1 while flag != 0: s = my_request.get_https_session(new=True, agent=True) (flag, r) = my_request.get(logger, "https://www.itjuzi.com/user/login") logger.info(r.status_code) if flag == 0 and r.status_code != 200: flag = -1 logger.info(r.headers["Set-Cookie"]) r = s.post("https://www.itjuzi.com/user/login", data={ "identity": login_user["name"], "password": login_user["pwd"] }, timeout=10) logger.info(r.headers["Refresh"]) if "0;url=https://www.itjuzi.com/" == r.headers["Refresh"]: return True except Exception, ex: logger.exception(ex) time.sleep(10) '''
def login(): while True: try: global s s = my_request.get_https_session(new=True, agent=True) login_url = 'https://passport.jd.com/uc/login' randomCode = random.uniform(0.1, 1.0) r = s.get(login_url) d = pq(r.text) uuid = d('input:eq(0)').attr('value') param = d('input:eq(4)').attr('name') param_value = d('input:eq(4)').attr('value') loginname = login_user['name'] loginpwd = login_user['pwd'] verify_url = 'https://passport.jd.com/uc/showAuthCode?r='+str(randomCode)+'&version=2015' r = s.post(verify_url, data={'loginName': loginname}, timeout=10) result = r.text.replace('(', '').replace(')', '') verify = json.loads(result) logger.info(verify) if not verify['verifycode']: data = {'uuid': uuid, param: param_value, 'loginname': loginname, 'nloginpwd': loginpwd, 'loginpwd': loginpwd, 'chkRememberMe': 'on', 'authcode': None} url = 'https://passport.jd.com/uc/loginService?uuid='+uuid+'&version=2015' r = s.post(url, data, timeout=10) logger.info(r.text) if "success" in r.text: return True except Exception,ex: logger.exception(ex) time.sleep(60)
def fetch_company(url): (company_key, ) = util.re_get_result("http://www.itjuzi.com/company/(\d+)", url) logger.info("company_key=%s" % company_key) company_content = None member_contents = [] news_contents = [] investor_contents = [] (flag, r) = my_request.get(logger, url) logger.info("flag=%d", flag) if flag == -1: return -1 logger.info("status=%d", r.status_code) if r.status_code == 404: logger.info("Page Not Found!!!") return r.status_code if r.status_code != 200: #logger.info(r.status_code) return r.status_code #logger.info(r.text) d = pq(r.text) product_name = d('div.line-title> span> b').clone().children().remove( ).end().text().strip() if product_name == "": return 404 company_content = { "date": datetime.datetime.now(), "source": source, "url": url, "company_key": company_key, "company_key_int": int(company_key), "content": r.text } #members lis = d('h4.person-name> a.title') for li in lis: try: l = pq(li) href = l.attr("href").strip() logger.info(href) member_name = l('b> span.c').text().strip() (member_key, ) = util.re_get_result( r'http://www.itjuzi.com/person/(\d*?)$', href) logger.info("member_key=%s, member_name=%s" % (member_key, member_name)) href = href.replace("http://", "https://") #if member_collection.find_one({"source":source, "member_key":member_key}) == None: if 1 == 1: flag = -1 while flag != 0: (flag, r) = my_request.get(logger, href) if flag == 0 and r.status_code != 200: flag = -1 if flag != 0: my_request.get_https_session(new=True, agent=True) #logger.info(r.text) member_contents.append({ "date": datetime.datetime.now(), "source": source, "url": url, "member_key": member_key, "member_name": member_name, "content": r.text }) except Exception, ex: logger.exception(ex)
"news_date": "%s/%s/%s" % (news_year, news_month, news_day), "news_url": news_url, "news_source_domain": news_source_domain, "content": r.text }) except Exception, ex: logger.exception(ex) #investors lis = d('table.list-round-v2 >tr > td> a') if len(lis) > 0: my_request.get_https_session(new=True, agent=True) for li in lis: try: l = pq(li) investor_url = l.attr('href').strip() investor_name = l.text().strip() (investor_key, ) = util.re_get_result( r"http://www.itjuzi.com/investfirm/(\d*)$", investor_url) logger.info(investor_url) logger.info(investor_name) logger.info(investor_key) #if investor_collection.find_one({"source":source, "investor_key":investor_key}) == None: investor_url = investor_url.replace("http://", "https://") if 1 == 1: flag = -1 while flag != 0:
latest_album = collection.find({}).sort("key", pymongo.DESCENDING).limit(1) if latest_album.count() == 0: i = 0 else: i = int(latest_album[0]["key"]) latest = i logger.info("From: %d" % i) while True: i += 1 url = "https://itjuzi.com/album/%d" % (i) if cnt <= 0: my_request.get_https_session() cnt = 100 status = -1 retry_times = 0 while status != 200 and status != 404 and status != 302: try: status = fetch(url) except Exception, ex: logger.exception(ex) if status == -1: my_request.get_http_session(new=True, agent=False) cnt = 100 retry_times += 1