Esempio n. 1
0
 def __init__(self, to_user, parent_id, title, content, from_user=None):
     """Populate the instance from its recipient, parent, title and body.

     ``content`` is stored after a pass through ``safe_html`` with an empty
     ``safe_tags`` list (presumably this strips all markup -- confirm
     against ``safe_html``). Every other argument is stored unchanged.
     ``from_user`` is optional and defaults to ``None``.
     """
     sanitized_content = safe_html(content, safe_tags=[])

     self.from_user = from_user
     self.to_user = to_user
     self.parent_id = parent_id
     self.title = title
     self.content = sanitized_content
Esempio n. 2
0
 def __init__(self, summary=None, author_intro=None,
              catalog=None, book=None):
     """Store the free-text fields of a book together with the book itself.

     ``summary``, ``author_intro`` and ``catalog`` are each passed through
     ``safe_html`` (called with its default arguments) before being stored;
     ``book`` is stored unchanged.
     """
     cleaned_summary, cleaned_intro, cleaned_catalog = [
         safe_html(text) for text in (summary, author_intro, catalog)
     ]

     self.book = book
     self.summary = cleaned_summary
     self.author_intro = cleaned_intro
     self.catalog = cleaned_catalog
Esempio n. 3
0
 def __init__(self, content, book, user=None):
     """Attach a piece of text to ``book``, optionally on behalf of ``user``.

     ``content`` is stored after a pass through ``safe_html`` with an empty
     ``safe_tags`` list (presumably this strips all markup -- confirm
     against ``safe_html``). ``book`` and ``user`` are stored unchanged.
     """
     sanitized_content = safe_html(content, safe_tags=[])

     self.user = user
     self.book = book
     self.content = sanitized_content
Esempio n. 4
0
def _read_url(url):
    """Return the body served at *url*, or None when the request fails.

    A URLError is reported on stdout and swallowed so that the caller can
    skip the current record instead of aborting the whole run.
    """
    try:
        return urlopen(url).read()
    except URLError as err:
        # Print the exception instance; the class alone carries no detail.
        print(err)
        return None


def grab():
    """Import the next book listed in ``records.txt`` and remember the position.

    Two files under ``_basedir`` drive the import:

    * ``counter.txt`` -- its first line holds the 1-based number of the next
      line of ``records.txt`` to process. An empty file, ``0`` or a negative
      number all mean "start at line 1".
    * ``records.txt`` -- the index file, one URL per line.

    For each index line from the stored position onwards, the page behind
    the URL is fetched and searched for an ISBN-13 beginning with ``978``.
    The matching openisbn.com page is then fetched and parsed, and a
    ``Book`` with a ``BookExtra`` introduction is saved. Lines that are
    blank, cannot be fetched, or contain no ISBN are skipped.

    After a book is saved, the number of the following index line is
    written back to ``counter.txt`` and the function returns, so each call
    imports at most one book and the next call resumes after it. Skipped
    lines that precede a saved one are therefore not revisited.

    Both files are closed even if parsing raises.
    """
    # Compiled once; the raw string keeps ``\d`` from being read as a string
    # escape. The pattern itself is unchanged.
    isbn13_pattern = re.compile(r'.*(978\d{10}).*')

    with open(join(_basedir, 'counter.txt'), 'r+') as counter_file, \
            open(join(_basedir, 'records.txt'), 'r') as index_file:
        stored = counter_file.readline().strip()
        # Clamp to 1 so that 0 or a negative value cannot skip forever.
        counter = max(int(stored), 1) if stored else 1

        for line_no, line in enumerate(index_file, 1):
            # Lines before the stored position were handled by earlier runs.
            if line_no < counter:
                continue

            # Drop the trailing newline before using the line as a URL.
            record_url = line.strip()
            if not record_url:
                continue

            # Retrieve the ISBN-13 number from the page the index points at.
            amazon_doc = _read_url(record_url)
            if amazon_doc is None:
                continue
            match = isbn13_pattern.search(amazon_doc)
            if match is None:
                continue
            isbn13 = match.group(1)

            # Get the HTML document describing the book.
            url = 'http://www.openisbn.com/isbn/%s' % isbn13
            print(url)
            html_doc = _read_url(url)
            if html_doc is None:
                continue

            soup = BeautifulSoup(html_doc, 'html5lib')
            title = soup.find('div', class_='PostHead').string.strip()
            content = soup.find('div', class_='PostContent').prettify()

            author = str2list(_get_attr('author', content))
            publisher = _get_attr('publisher', content)
            pages = _get_attr('pages', content)
            language = _get_attr('language', content, re.M)
            binding = _get_attr('binding', content)
            # The price attribute is expected to look like "<amount> <currency>".
            price, currency = _get_attr('price', content).split(' ')

            book = Book(isbn13, title, author, publisher,
                        price, pages=pages)
            book.set_language(language)
            book.set_binding(binding)
            book.set_currency(currency)

            intro = soup.find('div', class_='div')
            if intro.font is not None:
                intro.font.extract()
            intro = safe_html(intro.prettify())
            # Remove the trailing three-character label when it is present.
            if intro[-3:] == u'海报:':
                intro = intro[:-3].strip()

            book.extra = BookExtra(intro)
            book.save()

            # Persist the position BEFORE leaving the loop; otherwise every
            # call would start from the same line again.
            counter = line_no + 1
            counter_file.seek(0)
            counter_file.write(str(counter) + '\n')

            # NOTE(review): only one book is imported per call. Drop this
            # ``break`` to crawl every remaining record in a single run.
            break