import ConfigParser
import json

import requests
import pyCookieCheat


def create_session():
    """Build a requests session preloaded with Chrome's zhihu.com cookies."""
    global session
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    email = cf.get("info", "email")
    password = cf.get("info", "password")

    s = requests.session()
    # Reuse the cookies Chrome already holds for zhihu.com.
    dict_cookie = pyCookieCheat.chrome_cookies("http://zhihu.com")
    requests.utils.add_dict_to_cookiejar(s.cookies, dict_cookie)

    login_data = {"email": email, "password": password}
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': "http://www.zhihu.com/",
        'X-Requested-With': "XMLHttpRequest"
    }
    # Explicit login kept for reference; the Chrome cookies above already authenticate us.
    # c = s.get('http://www.zhihu.com', headers=header)
    # print c.text.encode("utf8")
    # r = s.post('http://www.zhihu.com/login', data=login_data, headers=header)
    # print r.text.decode("utf-8")
    # print r
    # if r.json()["r"] == 1:
    #     raise Exception("login failed.")
    session = s
def get_json(url):
    """GET a JSON endpoint with Chrome's cookies, dropping the anti-hijacking prefix."""
    cookies = pyCookieCheat.chrome_cookies(url)
    r = session.get(url, cookies=cookies)
    text = r.text
    # Some endpoints prefix their JSON with "while(1);" to block JSON hijacking.
    if text.startswith('while(1);'):
        text = r.text[9:]
    return json.loads(text)
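# Usage sketch (illustrative, not part of the original source): create_session()
# fills the module-level `session`, after which get_json() can call authenticated
# endpoints. The URL below is a placeholder, not a documented Zhihu API path.
if __name__ == "__main__":
    create_session()
    data = get_json("http://www.zhihu.com/example-endpoint")  # hypothetical URL
    print data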
def loadSite(end='quoteElementPiece11', end_num=50):
    # Old urllib2-based version, kept for reference:
    # cj = cookielib.CookieJar()
    # # add cookies
    # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # opener.addheaders = [('User-agent', 'Mozilla/5.0 \
    #                       (compatible; MSIE 6.0; Windows NT 5.1)')]
    # data = urllib.urlencode(acc_pwd)
    # r = opener.open(address, data, 10)
    s = requests.Session()
    cookies = pyCookieCheat.chrome_cookies(address)
    r = s.get(address, cookies=cookies, auth=('rballestiero', 'OEOM8032nielsypoo'))
    site = r.text
    for stock in stocks:
        print(site[site.find(stock.ini):site.find(end) + end_num])
        # The unescaped dot also lets the pattern match times such as "17:30";
        # entries containing ":" are filtered out of the float list below.
        valores = re.findall(r'-?\d+.\d+\b',
                             site[site.find(stock.ini):site.find(end) + end_num])
        (stock.change, stock.percent_change, stock.current,
         stock.high, stock.low, stock.open, stock.close) = [
            float(n) for n in valores if n.find(":") < 0]
        stock.time = valores[-1]
        stock.values.append(stock.current)
        # tm_hour is an int, so compare the parsed hour rather than the raw string.
        if int(stock.time.split(":")[0]) != time.localtime().tm_hour:
            stock.running = True
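# Illustration of the parsing step above (the sample quote text is made up, not
# actual site output): the regex pulls the numeric fields plus the trailing time.
import re

sample = ("Change -0.35 (-1.2%) Last 28.70 High 29.10 "
          "Low 28.40 Open 29.00 Close 29.05 17:30")
valores = re.findall(r'-?\d+.\d+\b', sample)
# -> ['-0.35', '-1.2', '28.70', '29.10', '28.40', '29.00', '29.05', '17:30']
floats = [float(n) for n in valores if n.find(":") < 0]  # the seven numeric fields
quote_time = valores[-1]                                 # "17:30"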
import requests
import pyCookieCheat

# jar = requests.cookies.RequestsCookieJar([
#     {
#         "domain": ".stackoverflow.com",
#         "expirationDate": "1427212131.77312",
#         "hostOnly": "false",
#         "httpOnly": "true",
#         "name": "usr",
#         "path": "/",
#         "secure": "false",
#         "session": "false",
#         "storeId": "0",
#         "value": "SOMEVALUE",
#         "id": "5"
#     }])
# requests.get(url, headers=headers, cookies=jar)

url = 'https://www.itnetwork.cz/'
s = requests.Session()
cookies = pyCookieCheat.chrome_cookies(url)
s.get(url, cookies=cookies)
print(f'{s}')
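# Variation (illustrative sketch, not from the original source): install the
# Chrome cookies into the Session's jar once, so later requests reuse them
# automatically instead of passing `cookies=` on every call.
import requests
import pyCookieCheat

url = 'https://www.itnetwork.cz/'
s2 = requests.Session()
requests.utils.add_dict_to_cookiejar(s2.cookies, pyCookieCheat.chrome_cookies(url))
resp = s2.get(url)  # cookies are sent from the session jar
print(resp.status_code)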
def __init__(self, args):
    self.args = args
    self.display = Display("info_%s.log" % escape(args.bookid))
    self.display.intro()

    self.session = requests.Session()
    self.session.headers.update(self.HEADERS)

    self.jwt = {}

    if args.chrome_cookies:
        cookies = pyCookieCheat.chrome_cookies(ORLY_BASE_URL)
        json.dump(cookies, open(COOKIES_FILE, 'w'))

    if not args.cred:
        if not os.path.isfile(COOKIES_FILE):
            self.display.exit(
                "Login: unable to find cookies file.\n"
                "    Please use the `--cred` or `--login` options to perform the login."
            )
        self.session.cookies.update(json.load(open(COOKIES_FILE)))
    else:
        self.display.info("Logging into Safari Books Online...", state=True)
        self.do_login(*args.cred)
        if not args.no_cookies:
            json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, 'w'))

    self.check_login()

    self.book_id = args.bookid
    self.api_url = self.API_TEMPLATE.format(self.book_id)

    self.display.info("Retrieving book info...")
    self.book_info = self.get_book_info()
    self.display.book_info(self.book_info)

    self.display.info("Retrieving book chapters...")
    self.book_chapters = self.get_book_chapters()

    self.chapters_queue = self.book_chapters[:]

    if len(self.book_chapters) > sys.getrecursionlimit():
        sys.setrecursionlimit(len(self.book_chapters))

    self.book_title = self.book_info["title"]
    self.base_url = self.book_info["web_url"]

    self.clean_book_title = "".join(self.escape_dirname(self.book_title).split(",")[:2]) \
                            + " ({0})".format(self.book_id)

    books_dir = os.path.join(PATH, "Books")
    if not os.path.isdir(books_dir):
        os.mkdir(books_dir)

    self.BOOK_PATH = os.path.join(books_dir, self.clean_book_title)
    self.display.set_output_dir(self.BOOK_PATH)
    self.css_path = ""
    self.images_path = ""
    self.create_dirs()

    self.chapter_title = ""
    self.filename = ""
    self.css = []
    self.images = []

    self.display.info("Downloading book contents... (%s chapters)" % len(self.book_chapters), state=True)
    self.BASE_HTML = self.BASE_01_HTML + (self.KINDLE_HTML if not args.no_kindle else "") + self.BASE_02_HTML

    self.cover = False
    self.get()
    if not self.cover:
        self.cover = self.get_default_cover()
        cover_html = self.parse_html(
            html.fromstring(
                "<div id=\"sbo-rt-content\"><img src=\"Images/{0}\"></div>".format(self.cover)),
            True)

        self.book_chapters = [{
            "filename": "default_cover.xhtml",
            "title": "Cover"
        }] + self.book_chapters

        self.filename = self.book_chapters[0]["filename"]
        self.save_page_html(cover_html)

    self.css_done_queue = Queue(0) if "win" not in sys.platform else WinQueue()
    self.display.info("Downloading book CSSs... (%s files)" % len(self.css), state=True)
    self.collect_css()

    self.images_done_queue = Queue(0) if "win" not in sys.platform else WinQueue()
    self.display.info("Downloading book images... (%s files)" % len(self.images), state=True)
    self.collect_images()

    self.display.info("Creating EPUB file...", state=True)
    self.create_epub()

    if not args.no_cookies:
        json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, "w"))

    self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub"))
    self.display.unregister()

    if not self.display.in_error and not args.log:
        os.remove(self.display.log_file)
def output2txt():
    writerList = open("output.txt", "w")
    urls = getArticleList()
    articles = []
    for i, url in enumerate(urls):
        # Set up conn and cookies
        s = requests.Session()
        cookies = pyCookieCheat.chrome_cookies(url)
        res = s.get(url, cookies=cookies)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')
        res2 = s.get(soup.body.iframe["src"], cookies=cookies)
        res2.encoding = "utf-8"
        soup2 = BeautifulSoup(res2.text, "html.parser")

        # Web scraping
        title = soup.title.string[1:-8]
        time_re = re.search(
            r'\d{4}-\d{2}-\d{2}',
            soup2.body.find('div', attrs={'class': 'time-cang'}).string)
        time = time_re.group(0) if time_re else '0000-00-00'
        content = ""
        content_div = soup2.body.find(
            'div', id='detailArticleContent_ptkaiapt4bxy_baiduscarticle')
        tags = content_div.find_all('p')
        if tags:
            # case 1: newer articles (>2011) use <p> or <p><span> to make new paragraphs
            for tag in tags:
                content += tag.text.replace('\n', '') + '\n'
        else:
            # case 2: older baidu articles use <br> to make new paragraphs
            for br in soup2.find_all('br'):
                br.replace_with('\n')
            content = content_div.text + '\n'
        content = content.replace(" ", " ")  # str.replace returns a new string, so keep the result

        # Appending content images to the end
        for img in content_div.find_all('img'):
            content += img['src'] + '\n'

        # Debugging
        # print i
        print title
        print time + '\n'
        print content
        print '+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n\n'

        articles.append((title, content, time))

        # Write to text
        writerList.write(title.encode("utf-8") + '\n')
        writerList.write(time + '\n\n')
        writerList.write(content.encode("utf-8") + '\n')
        writerList.write('+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n\n')

    # Write to obj
    cPickle.dump(articles, open("articles.obj", "wb"))
    writerList.close()
def output2md():
    urls = getArticleList()
    articles = []
    for i, url in enumerate(urls):
        # Set up conn and cookies
        s = requests.Session()
        cookies = pyCookieCheat.chrome_cookies(url)
        res = s.get(url, cookies=cookies)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')
        res2 = s.get(soup.body.iframe["src"], cookies=cookies)
        res2.encoding = "utf-8"
        soup2 = BeautifulSoup(res2.text, "html.parser")

        # Web scraping
        title = soup.title.string[1:-8]
        time_re = re.search(
            r'\d{4}-\d{2}-\d{2}',
            soup2.body.find('div', attrs={'class': 'time-cang'}).string)
        time = time_re.group(0) if time_re else '0000-00-00'
        content = ""
        content_div = soup2.body.find(
            'div', id='detailArticleContent_ptkaiapt4bxy_baiduscarticle')
        tags = content_div.find_all('img')
        now = content_div.prettify()
        print title
        if tags:
            for img in tags:
                tmp = 0
                print
                print "Downloading >>>" + img['src']
                j = hashlib.md5(img['src']).hexdigest()
                path = "images/" + j + ".jpg"
                try:
                    data = urllib.urlretrieve(img['src'], path)
                except urllib2.HTTPError:
                    print 'Bad URL'
                except IOError:
                    print 'Bad URL'
                else:
                    # tmp is reset to 0 for every image, so this branch simply
                    # retries the download once.
                    if tmp < 100:
                        print 'Network conditions are not good. Reloading.'
                        data = urllib.urlretrieve(img['src'], path)
                    else:
                        print 'Failed to get ' + img['src']
                    tmp = tmp + 1
                print img['src'] + ">>>" + path
                now = now.replace(img['src'], path)
        content = now

        # Debugging
        # print i
        print time + '\n'
        print content
        print '+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n\n'

        articles.append((title, content, time))

        # Write to markdown, one file per article
        writerList = open(time.encode("utf-8") + " " + title.encode("utf-8") + ".md", "w")
        writerList.write('# ' + title.encode("utf-8") + '\n\n')
        writerList.write('> ' + time + '\n\n')
        writerList.write(content.encode("utf-8") + '\n\n')
        writerList.close()

    # Write to obj
    cPickle.dump(articles, open("articles.obj", "wb"))
# Leftover from the earlier urllib2-based version; `opener` is not defined here.
# gradeUrl = 'http://wenzhang.baidu.com/'
# result = opener.open(gradeUrl)
# print result.read()

if __name__ == "__main__":
    output2txt = True
    # url = "https://wenzhang.baidu.com/page/view?key=168a2f0785435838-1426607065"
    writerList = open("output.txt", "w")
    # urls = ["https://wenzhang.baidu.com/page/view?key=168a2f0785435838-1426607065"]
    urls = getArticleList()
    articles = []
    for i, url in enumerate(urls):
        # Set up conn and cookies
        s = requests.Session()
        cookies = pyCookieCheat.chrome_cookies(url)
        res = s.get(url, cookies=cookies)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')
        res2 = s.get(soup.body.iframe["src"], cookies=cookies)
        res2.encoding = "utf-8"
        soup2 = BeautifulSoup(res2.text, "html.parser")

        # Web scraping
        title = soup.title.string[1:-8]
        time_re = re.search(
            r'\d{4}-\d{2}-\d{2}',
            soup2.body.find('div', attrs={'class': 'time-cang'}).string)
        time = time_re.group(0) if time_re else '0000-00-00'
        content = ""
        content_div = soup2.body.find(
            'div', id='detailArticleContent_ptkaiapt4bxy_baiduscarticle')