Example #1
def create_session():
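    # Read the zhihu credentials from config.ini, seed a requests session with
    # Chrome's cookies for zhihu.com, and publish it via the module-level
    # `session` global (the actual login POST below is left commented out).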

    global session

    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    email = cf.get("info", "email")
    password = cf.get("info", "password")
    s = requests.session()

    dict_cookie = pyCookieCheat.chrome_cookies("http://zhihu.com")
    requests.utils.add_dict_to_cookiejar(s.cookies, dict_cookie)

    login_data = {"email": email, "password": password}
    header = {
        'User-Agent':
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': "http://www.zhihu.com/",
        'X-Requested-With': "XMLHttpRequest"
    }
    #c = s.get('http://www.zhihu.com', headers = header)
    #print c.text.encode("utf8")
    #r = s.post('http://www.zhihu.com/login', data = login_data, headers = header)
    #print r.text.decode("utf-8")
    #print r
    #if r.json()["r"] == 1:
    #    raise Exception("login failed.")
    session = s
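Note that create_session() does not return anything: it only seeds a requests session with Chrome's cookies for zhihu.com and publishes the result through the module-level session global. A minimal, hypothetical usage sketch (the URL below is illustrative, not part of the original example):

create_session()
resp = session.get("http://www.zhihu.com/settings/profile")   # illustrative URL
print(resp.status_code)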
Example #2
def get_json(url):
    cookies = pyCookieCheat.chrome_cookies(url)
    r = session.get(url, cookies=cookies)
    text = r.text

    if text.startswith('while(1);'):
        text = r.text[9:]
    return json.loads(text)
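get_json() relies on the module-level session prepared in Example #1 and strips the while(1); prefix that some JSON endpoints prepend as a JSON-hijacking guard before parsing. A tiny illustration of that stripping step (the payload is made up):

import json

text = 'while(1);{"count": 3}'     # made-up payload with the anti-hijacking prefix
if text.startswith('while(1);'):
    text = text[9:]                # len('while(1);') == 9
print(json.loads(text))            # {'count': 3}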
Example #3
def loadSite(end='quoteElementPiece11', end_num=50):
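    # Fetch the page at `address` with Chrome's cookies plus HTTP basic auth,
    # then parse each stock's quote numbers out of the raw HTML below.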
    # cj = cookielib.CookieJar() ## add cookies
    # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # opener.addheaders = [('User-agent','Mozilla/5.0 \
    #                     (compatible; MSIE 6.0; Windows NT 5.1)')]
    # data = urllib.urlencode(acc_pwd)
    # r = opener.open(address,data,10) 

    s = requests.Session()
    cookies = pyCookieCheat.chrome_cookies(address)
    r = s.get(address, cookies=cookies, auth=('rballestiero', 'OEOM8032nielsypoo'))
    site = r.text

    for stock in stocks:
        print(site[site.find(stock.ini):site.find(end) + end_num])
        valores = re.findall(r'-?\d+.\d+\b', site[site.find(stock.ini):site.find(end) + end_num])
        stock.change, stock.percent_change, stock.current, stock.high, stock.low, stock.open, stock.close = \
            [float(n) for n in valores if n.find(":") < 0]
        stock.time = valores[-1]
        stock.values.append(stock.current)
        if int(stock.time.split(":")[0]) != time.localtime().tm_hour:
            stock.running = True
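loadSite() extracts all of a stock's numbers with a single regular expression. The unescaped dot in r'-?\d+.\d+\b' means it also matches HH:MM times, which the n.find(":") < 0 filter then drops, while the last match is kept as the quote's timestamp. A self-contained illustration on a made-up fragment (the text below is not real site output):

import re

fragment = "Change -1.25 Pct -0.84 Last 147.30 High 149.10 Low 146.90 Open 148.55 Close 148.55 17:59"
valores = re.findall(r'-?\d+.\d+\b', fragment)
numbers = [float(n) for n in valores if n.find(":") < 0]   # drop the time stamp
print(numbers)        # [-1.25, -0.84, 147.3, 149.1, 146.9, 148.55, 148.55]
print(valores[-1])    # 17:59 -- the trailing match is used as the quote time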
Example #4
import requests
import pyCookieCheat

# jar = requests.cookies.RequestsCookieJar([
# {
#     "domain": ".stackoverflow.com",
#     "expirationDate": "1427212131.77312",
#     "hostOnly": "false",
#     "httpOnly": "true",
#     "name": "usr",
#     "path": "/",
#     "secure": "false",
#     "session": "false",
#     "storeId": "0",
#     "value": "SOMEVALUE",
#     "id": "5"
# }])
# requests.get(url, headers=headers, cookies=jar)

url = 'https://www.itnetwork.cz/'

s = requests.Session()
cookies = pyCookieCheat.chrome_cookies(url)
s.get(url, cookies=cookies)
print(f'{s}')
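Example #4 only prints the Session object, which shows nothing about whether the borrowed Chrome cookies did anything. A sketch of a slightly more informative check, using the same module name as the examples above (whether the site actually treats the request as logged in is site-specific and not verified here):

import requests
import pyCookieCheat

url = 'https://www.itnetwork.cz/'
cookies = pyCookieCheat.chrome_cookies(url)   # Chrome's decrypted cookies for this domain

s = requests.Session()
r = s.get(url, cookies=cookies)
# Inspect the response rather than the Session object.
print(r.status_code, len(r.text))
print(sorted(cookies.keys()))                 # which cookie names were found in Chrome's store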
Example #5
    def __init__(self, args):
        self.args = args
        self.display = Display("info_%s.log" % escape(args.bookid))
        self.display.intro()

        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

        self.jwt = {}

        if args.chrome_cookies:
            cookies = pyCookieCheat.chrome_cookies(ORLY_BASE_URL)
            json.dump(cookies, open(COOKIES_FILE, 'w'))

        if not args.cred:
            if not os.path.isfile(COOKIES_FILE):
                self.display.exit(
                    "Login: unable to find cookies file.\n"
                    "    Please use the `--cred` or `--login` options to perform the login."
                )

            self.session.cookies.update(json.load(open(COOKIES_FILE)))

        else:
            self.display.info("Logging into Safari Books Online...",
                              state=True)
            self.do_login(*args.cred)
            if not args.no_cookies:
                json.dump(self.session.cookies.get_dict(),
                          open(COOKIES_FILE, 'w'))

        self.check_login()

        self.book_id = args.bookid
        self.api_url = self.API_TEMPLATE.format(self.book_id)

        self.display.info("Retrieving book info...")
        self.book_info = self.get_book_info()
        self.display.book_info(self.book_info)

        self.display.info("Retrieving book chapters...")
        self.book_chapters = self.get_book_chapters()

        self.chapters_queue = self.book_chapters[:]

        if len(self.book_chapters) > sys.getrecursionlimit():
            sys.setrecursionlimit(len(self.book_chapters))

        self.book_title = self.book_info["title"]
        self.base_url = self.book_info["web_url"]

        self.clean_book_title = "".join(self.escape_dirname(self.book_title).split(",")[:2]) \
                                + " ({0})".format(self.book_id)

        books_dir = os.path.join(PATH, "Books")
        if not os.path.isdir(books_dir):
            os.mkdir(books_dir)

        self.BOOK_PATH = os.path.join(books_dir, self.clean_book_title)
        self.display.set_output_dir(self.BOOK_PATH)
        self.css_path = ""
        self.images_path = ""
        self.create_dirs()

        self.chapter_title = ""
        self.filename = ""
        self.css = []
        self.images = []

        self.display.info("Downloading book contents... (%s chapters)" %
                          len(self.book_chapters),
                          state=True)
        self.BASE_HTML = self.BASE_01_HTML + (
            self.KINDLE_HTML if not args.no_kindle else "") + self.BASE_02_HTML

        self.cover = False
        self.get()
        if not self.cover:
            self.cover = self.get_default_cover()
            cover_html = self.parse_html(
                html.fromstring(
                    "<div id=\"sbo-rt-content\"><img src=\"Images/{0}\"></div>"
                    .format(self.cover)), True)

            self.book_chapters = [{
                "filename": "default_cover.xhtml",
                "title": "Cover"
            }] + self.book_chapters

            self.filename = self.book_chapters[0]["filename"]
            self.save_page_html(cover_html)

        self.css_done_queue = Queue(
            0) if "win" not in sys.platform else WinQueue()
        self.display.info("Downloading book CSSs... (%s files)" %
                          len(self.css),
                          state=True)
        self.collect_css()
        self.images_done_queue = Queue(
            0) if "win" not in sys.platform else WinQueue()
        self.display.info("Downloading book images... (%s files)" %
                          len(self.images),
                          state=True)
        self.collect_images()

        self.display.info("Creating EPUB file...", state=True)
        self.create_epub()

        if not args.no_cookies:
            json.dump(self.session.cookies.get_dict(), open(COOKIES_FILE, "w"))

        self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub"))
        self.display.unregister()

        if not self.display.in_error and not args.log:
            os.remove(self.display.log_file)
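In Example #5 the args.chrome_cookies branch only dumps Chrome's cookies for ORLY_BASE_URL into COOKIES_FILE; the no-credentials branch then loads that file into the requests session. Stripped of the surrounding bookkeeping, the round trip looks roughly like this (the two constants are placeholders standing in for the project's real values):

import json
import requests
import pyCookieCheat

ORLY_BASE_URL = "https://learning.oreilly.com"   # assumed base URL, for illustration only
COOKIES_FILE = "cookies.json"                    # assumed cookie-file path, for illustration only

# Persist Chrome's cookies for the site (the args.chrome_cookies path above).
cookies = pyCookieCheat.chrome_cookies(ORLY_BASE_URL)
with open(COOKIES_FILE, 'w') as f:
    json.dump(cookies, f)

# Later, without credentials: reuse the stored cookies for the session.
session = requests.Session()
with open(COOKIES_FILE) as f:
    session.cookies.update(json.load(f))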
Example #6
def output2txt():
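    # Scrape every article returned by getArticleList(): title, date and body text
    # go to output.txt, and all (title, content, time) tuples are pickled to articles.obj.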
    writerList = open("output.txt", "w")
    urls = getArticleList()
    articles = []
    for i, url in enumerate(urls):
        # Set up conn and cookies
        s = requests.Session()
        cookies = pyCookieCheat.chrome_cookies(url)
        res = s.get(url, cookies=cookies)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')

        res2 = s.get(soup.body.iframe["src"], cookies=cookies)
        res2.encoding = "utf-8"
        soup2 = BeautifulSoup(res2.text, "html.parser")

        # Web scraping
        title = soup.title.string[1:-8]
        time_re = re.search(
            r'\d{4}-\d{2}-\d{2}',
            soup2.body.find('div', attrs={
                'class': 'time-cang'
            }).string)
        time = time_re.group(0) if time_re else '0000-00-00'

        content = ""
        content_div = soup2.body.find(
            'div', id='detailArticleContent_ptkaiapt4bxy_baiduscarticle')
        tags = content_div.find_all('p')
        if tags:
            # case 1: newer articles (>2011) use <p> or <p><span> to make new paragraphs
            for tag in tags:
                content += tag.text.replace('\n', '') + '\n'
        else:
            # case 2: older baidu articles use <br> to make new paragraphs
            for br in soup2.find_all('br'):
                br.replace_with('\n')
            content = content_div.text + '\n'

        content = content.replace("&nbsp;", " ")
        # Appending content images to the end
        for img in content_div.find_all('img'):
            content += img['src'] + '\n'

        # Debugging
        #print i
        print title
        print time + '\n'
        print content
        print '+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n\n'

        articles.append((title, content, time))

        # Write to text
        writerList.write(title.encode("utf-8") + '\n')
        writerList.write(time + '\n\n')
        writerList.write(content.encode("utf-8") + '\n')
        writerList.write(
            '+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n\n')

    # Write to obj
    cPickle.dump(articles, open("articles.obj", "wb"))
    writerList.close()
Example #7
def output2md():
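    # Same scraping as output2txt(), but inline images are downloaded into images/
    # and each article is written to its own Markdown file.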
    urls = getArticleList()
    articles = []
    for i, url in enumerate(urls):
        # Set up conn and cookies
        s = requests.Session()
        cookies = pyCookieCheat.chrome_cookies(url)
        res = s.get(url, cookies=cookies)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')

        res2 = s.get(soup.body.iframe["src"], cookies=cookies)
        res2.encoding = "utf-8"
        soup2 = BeautifulSoup(res2.text, "html.parser")

        # Web scraping
        title = soup.title.string[1:-8]
        time_re = re.search(
            r'\d{4}-\d{2}-\d{2}',
            soup2.body.find('div', attrs={
                'class': 'time-cang'
            }).string)
        time = time_re.group(0) if time_re else '0000-00-00'

        content = ""
        content_div = soup2.body.find(
            'div', id='detailArticleContent_ptkaiapt4bxy_baiduscarticle')
        tags = content_div.find_all('img')
        now = content_div.prettify()
        print title
        if tags:
            for img in tags:
                print
                print "Downloading >>>" + img['src']
                j = hashlib.md5(img['src']).hexdigest()
                path = "images/" + j + ".jpg"
                # Retry the download a few times before giving up on this image.
                for _ in range(3):
                    try:
                        urllib.urlretrieve(img['src'], path)
                    except (urllib2.HTTPError, IOError):
                        print 'Bad URL, retrying: ' + img['src']
                    else:
                        break
                else:
                    print 'Failed to download ' + img['src']

                print img['src'] + ">>>" + path
                now = now.replace(img['src'], path)
        content = now
        # Debugging
        #print i
        print time + '\n'
        print content
        print '+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n\n'

        articles.append((title, content, time))

        # Write to Markdown, one file per article
        writerList = open(
            time.encode("utf-8") + " " + title.encode("utf-8") + ".md", "w")
        writerList.write('# ' + title.encode("utf-8") + '\n\n')
        writerList.write('> ' + time + '\n\n')
        writerList.write(content.encode("utf-8") + '\n\n')
        writerList.close()
    # Write to obj
    cPickle.dump(articles, open("articles.obj", "wb"))
Example #8
    gradeUrl = 'http://wenzhang.baidu.com/'
    result = opener.open(gradeUrl)
    print result.read()

if __name__ == "__main__":
    output2txt = True

    #url="https://wenzhang.baidu.com/page/view?key=168a2f0785435838-1426607065"
    writerList = open("output.txt", "w")
    #urls = ["https://wenzhang.baidu.com/page/view?key=168a2f0785435838-1426607065"]
    urls = getArticleList()
    articles = []
    for i, url in enumerate(urls):
        # Set up conn and cookies
        s = requests.Session()
        cookies = pyCookieCheat.chrome_cookies(url)
        res = s.get(url, cookies=cookies)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, 'html.parser')

        res2 = s.get(soup.body.iframe["src"], cookies=cookies)
        res2.encoding = "utf-8"
        soup2 = BeautifulSoup(res2.text, "html.parser")

        # Web scraping
        title = soup.title.string[1:-8]
        time_re = re.search(r'\d{4}-\d{2}-\d{2}', soup2.body.find('div', attrs={'class':'time-cang'}).string)
        time = time_re.group(0) if time_re else '0000-00-00'

        content = ""
        content_div = soup2.body.find('div', id='detailArticleContent_ptkaiapt4bxy_baiduscarticle')