Beispiel #1
0
def grab_douban():
    """从豆瓣抓取数据"""

    SLEEP_TIME = 11

    # 计数器文件
    counter_file = open(join(_basedir, 'counter.txt'), 'r+')
    counter = int(counter_file.readline()) or 1
    # 索引文件
    index_file = open(join(_basedir, 'records.txt'), 'r')

    douban_api_url_base = 'https://api.douban.com/v2/book/isbn/'

    line = 1
    while (True):
        if line == counter: break
        index_file.readline()
        line += 1

    for line in iter(index_file.readline, b''):
        # 从Amazon获取isbn13
        re_isbn13 = '.*(978\d{10}).*'
        req = requests.get(line)
        if req.status_code != requests.codes.ok:
            counter += 1
            continue

        html_amazon = req.text
        m = re.search(re_isbn13, html_amazon)
        if m is None:
            counter += 1
            continue
        else:
            isbn13 = m.group(1)

        book = Book.query.filter_by(isbn13=isbn13).first()
        if book is not None:
            counter += 1
            continue

        douban_api_url = douban_api_url_base + isbn13
        print douban_api_url

        req = requests.get(douban_api_url)
        if req.status_code != requests.codes.ok:
            counter += 1
            continue

        data = req.json()
        if 'code' in data and data['code'] == 6000:
            counter += 1
            continue

        isbn = data['isbn13']
        title = data['title']
        subtitle = data['subtitle']
        orititle = data['origin_title']

        author = data['author']
        translator = data['translator']
        publisher = data['publisher']
        pubdate = data['pubdate']
        price = data['price']
        binding = data['binding']

        pages = data['pages']

        author_intro = data['author_intro']
        summary = data['summary']

        book = Book(isbn, title, author, publisher,
                    price, subtitle, orititle,
                    translator, pubdate=pubdate,
                    pages=pages, binding=binding)
        book.extra = BookExtra(summary, author_intro)

        try:
            book.save()
        except ValueError:
            counter += 1
            continue

        # 抓取封面
        img_src = data['images']['large'].replace('\\', '')
        img = requests.get(img_src)

        if img.status_code == requests.codes.ok:
            book.save_cover(Image.open(StringIO(img.content)))

        counter += 1
        counter_file.seek(0)
        counter_file.write(str(counter) + '\n')

        time.sleep(SLEEP_TIME)

    counter_file.close()
    index_file.close()
    return 'Done!'
Beispiel #2
0
def grab():

    # 计数器文件
    counter_file = open(join(_basedir, 'counter.txt'), 'r+')
    counter = int(counter_file.readline()) or 1
    # 索引文件
    index_file = open(join(_basedir, 'records.txt'), 'r')

    line = 1
    while (True):
        if line == counter: break
        index_file.readline()
        line += 1

    for line in iter(index_file.readline, b''):
        # retrieve the isbn13 number
        re_isbn13 =  '.*(978\d{10}).*'
        try:
            amazon_doc = urlopen(line).read()
            m = re.search(re_isbn13, amazon_doc)
            
            if m is None:
                continue
            else:
                isbn13 = m.group(1)
        except URLError:
            print URLError
            continue

        # get html doc
        url = 'http://www.openisbn.com/isbn/%s' % isbn13
        print url
        try:
            html_doc = urlopen(url).read()
        except URLError:
            print URLError
            continue

        soup = BeautifulSoup(html_doc, 'html5lib')
        title = soup.find('div', class_='PostHead').string.strip()

        content = soup.find('div', class_='PostContent').prettify()

        author = str2list(_get_attr('author', content))
        publisher = _get_attr('publisher', content)
        pages = _get_attr('pages', content)
        language = _get_attr('language', content, re.M)
        binding = _get_attr('binding', content)
        price, currency = _get_attr('price', content).split(' ')

        book = Book(isbn13, title, author, publisher,
                    price, pages=pages)
        book.set_language(language)
        book.set_binding(binding)
        book.set_currency(currency)

        intro = soup.find('div', class_='div')
        if intro.font is not None:
            intro.font.extract()
        intro = safe_html(intro.prettify())
        if intro[-3:] == u'海报:':
            intro = intro[:-3].strip()

        book.extra = BookExtra(intro)
        book.save()
        break

        # update counter and the counter file
        counter += 1
        counter_file.seek(0)
        counter_file.write(str(counter) + '\n')
    # end for

    counter_file.close()
    index_file.close()