Example #1
0
File: utils.py Project: mitnk/mc
def get_book_pages(book_id):
    """Look up the page count of a Douban book.

    Fetches the Douban subject page for ``book_id`` and searches the
    markup of its ``<div id="info">`` element for the page-count field.

    Returns the matched digits as a string, or the integer ``0`` when the
    field cannot be found.
    """
    page = get_soup_by_url("http://book.douban.com/subject/%s/" % book_id)
    # The pattern is matched against the serialized markup, not the text.
    info_markup = str(page.find("div", {"id": "info"}))
    match = re.search(r">页数:</span> (\d+)<br", info_markup)
    return match.group(1) if match else 0
Example #2
0
File: views.py Project: mitnk/mc
def _joined_text(tag):
    """Return all text nodes under ``tag`` concatenated, or "" if ``tag`` is None."""
    if tag is None:
        return ""
    return "".join(tag.findAll(text=True))


def save_note(url, date=None):
    """Scrape a highlight page and store it as a ``Word`` or a ``Note``.

    The page at ``url`` is parsed for the highlighted text, an optional
    remark, and the book title/author shown in the cover block.

    * A highlight with no spaces, at most 64 characters long, whose first
      character is in ``ASCII_CHARS`` is stored as a ``Word`` (only if no
      ``Word`` with the same text exists yet).
    * Anything else is stored as a ``Note`` together with whichever of
      remark, book and author were found.

    Args:
        url: Address of the highlight page; also saved on the record.
        date: Value for ``Note.added``; the current local time is used
            when it is omitted or falsy.

    Returns:
        None. Nothing is saved when the page has no highlight text.
    """
    soup = get_soup_by_url(url)

    text = _joined_text(soup.find("div", {"class": "highlightText"})).strip()
    if not text:
        # Missing or empty highlight: nothing to record, and ``text[0]``
        # below would otherwise raise IndexError.
        return

    remark = (
        _joined_text(soup.find("div", {"class": "note"}))
        .replace("Note:", "")
        .replace("@zzrt", "")
        .strip()
    )

    # The cover block may be absent; then book and author stay empty.
    cover_tag = soup.find("div", {"class": "cover"})
    title_tag = author_tag = None
    if cover_tag is not None:
        title_tag = cover_tag.find("span", {"class": "title"})
        author_tag = cover_tag.find("span", {"class": "author"})

    book = _joined_text(title_tag).strip()
    if "Personal Document" in book:
        book = ""

    # " by " must be removed before stripping, otherwise a leading
    # " by " would lose its first space and no longer match.
    author = _joined_text(author_tag).replace(" by ", "").strip()

    if " " not in text and text[0] in ASCII_CHARS and len(text) <= 64:
        if Word.objects.filter(word=text).count() == 0:
            Word.objects.create(url=url, word=text)
        return

    note = Note()
    note.url = url
    note.text = text
    note.added = date or datetime.datetime.now()
    if remark:
        note.remark = remark
    if book:
        note.book = book
    if author:
        note.author = author
    note.save()
Example #3
0
    def fetch(self):
        """Scrape the Hacker News listing at ``self.url`` into ``self.articles``.

        ``self.articles`` is replaced with a list of dicts holding ``url``,
        ``title`` and ``points`` for every story that has at least
        ``self.POINTS_MIN_LIMIT`` points and does not link to a bare home
        page. If the page cannot be fetched, a message is logged and
        ``self.articles`` is left untouched.
        """
        def is_home_page(url):
            # After dropping the scheme's "//" and any leading/trailing
            # slashes, a remaining "/" means the URL has a path.
            return '/' not in url.replace('//', '').strip('/')

        try:
            soup = get_soup_by_url(self.url)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
            logger.info("Time out when fetching HackerNews.")
            return

        # Reset articles before fetching
        self.articles = []

        table = soup.find("table")
        if table is None:
            logger.info("No table found on HackerNews page.")
            return

        for tag in table.find_all("td", {"class": "title"}):
            tag_a = tag.find('a')
            if (not tag_a) or ('href' not in tag_a.attrs) or (len(tag_a.contents) > 1):
                continue

            title = tag_a.string
            if title is None:
                # No single text child (e.g. an empty or nested link).
                continue

            href = tag_a['href']
            if title.lower() == "more" and '/' not in href:
                # Pagination link, not a story.
                continue

            try:
                points = int(tag.parent.nextSibling.find('span').string.split(' ')[0])
            except (AttributeError, ValueError):
                # Missing score row/span, or a score that is not a number.
                points = 0

            if 'http' not in href:
                href = "http://news.ycombinator.com/" + href

            if points >= self.POINTS_MIN_LIMIT and not is_home_page(href):
                self.articles.append({
                    'url': href,
                    'title': title,
                    'points': points,
                })