Python ParseExceptionの例、custom_exception.ParseException Pythonの例

コード例 #1

0

ファイルを表示

def is_login_page(html):
    '''Tell whether the given page is the library login page.

    A failed login redirects back to the login page, so this check can also
    be used to find out whether a login attempt succeeded.

    The login page is recognised by a caption tag whose markup is exactly
    "<caption>登录我的图书馆</caption>" ("Log in to my library").

    Args:
        html: The page html string.

    Returns:
        True if such a caption is present, otherwise False.

    Raises:
        custom_exception.ParseException: If parsing the page fails.
    '''
    login_caption = "<caption>登录我的图书馆</caption>"
    try:
        captions = BeautifulSoup.BeautifulSoup(html).findAll('caption')
        # The page counts as the login page as soon as one caption matches.
        return any(str(caption) == login_caption for caption in captions)
    except Exception:
        raise custom_exception.ParseException("login page")

コード例 #2

0

ファイルを表示

def get_appointed_books(html):
    '''Extract the appointed-book entries from the appointed-book page.

    The books are read from the first table matching a fixed set of layout
    attributes. The first row of that table is skipped; every other row is
    turned into a book.AppointedBookItem built from its first eight cells.
    The ninth cell carries an ``onclick`` handler whose call arguments
    supply the item's "marc_no", "call_no" and "loca" data.

    Args:
        html: The page html string.

    Returns:
        list of book.AppointedBookItem or [] when the table is not found.

    Raises:
        custom_exception.ParseException: If the page cannot be parsed.
    '''
    # Local import so the block stays self-contained.
    import ast

    try:
        soup = BeautifulSoup.BeautifulSoup(html)
        book_table = soup.find(
            'table', {
                'width': "100%",
                'border': "0",
                'cellpadding': "5",
                'cellspacing': "1",
                'bgcolor': "#CCCCCC"
            })
        if not book_table:
            return []

        book_trs = book_table.findAll('tr')
        # Drop the first row (presumably the table header).
        del book_trs[0]

        # Matches the parenthesised, four-item quoted argument list of the
        # onclick handler. Compiled once instead of once per row.
        pattern = re.compile(r"\((\'.*\'?\,){3}(\'.*\'?)\)")

        book_list = []
        for row in book_trs:
            book_tds = row.findAll('td')

            fields = []
            for index in range(8):
                text = extract_string(book_tds[index])
                # Only the second and third columns go through
                # decode_strange_str.
                if index in (1, 2):
                    text = decode_strange_str(text)
                fields.append(text.encode('utf-8').strip())
            book_item = book.AppointedBookItem(*fields)

            contain_data = book_tds[8].div.input['onclick']
            match = pattern.search(contain_data)
            # The matched text comes from a remote page, so it is parsed as
            # a literal tuple rather than executed with eval().
            data_list = ast.literal_eval(match.group())
            data = {
                "marc_no": data_list[0],
                "call_no": data_list[1],
                "loca": data_list[3]
            }
            book_item.set_data(data)
            print(book_item)
            book_list.append(book_item)
        return book_list
    except Exception:
        raise custom_exception.ParseException("appointed-book page")

コード例 #3

0

ファイルを表示

def get_render_book_list(html):
    '''Extract book info from the rendered book page.

    Args:
        html: The page-string of render book.

    Returns:
        A list of book.BookRenderItem objects, one for each row of the
        first matching table after its first row, or [] when no matching
        table is found.

    Raises:
        custom_exception.ParseException: If the page cannot be parsed.
    '''
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
        book_tables = soup.findAll(
            'table', {
                'width': "100%",
                'border': "0",
                'cellpadding': "5",
                'cellspacing': "1",
                'bgcolor': "#CCCCCC"
            })
        if not book_tables:
            return []

        # Keep only the Tag children of the first table, then drop the
        # first row (presumably the table header).
        book_tags = book_tables[0].contents
        __remove_navi_string(book_tags)
        del book_tags[0]

        book_list = []
        for book_tag in book_tags:
            cells = book_tag.contents
            __remove_navi_string(cells)
            book_intro = book.BookRenderItem(
                cells[0].string.strip().encode('utf-8'),
                decode_strange_str(
                    cells[1].a.string).strip().encode('utf-8'),
                # The text following the link has its first byte dropped
                # before being stripped again.
                decode_strange_str(
                    cells[1].contents[1].string).strip().encode(
                        'utf-8')[1:].strip(),
                cells[2].string.strip().encode('utf-8'),
                cells[3].font.string.strip().encode('utf-8'),
                cells[4].string.strip().encode('utf-8'),
                cells[5].string.strip().encode('utf-8'),
                cells[6].string.strip().encode('utf-8'))
            print(book_intro)
            book_list.append(book_intro)
        return book_list
    except Exception:
        # Exception, not a bare except, so KeyboardInterrupt and SystemExit
        # are not disguised as parse errors.
        raise custom_exception.ParseException("render-book page")

コード例 #4

0

ファイルを表示

ファイル: page_parser.py プロジェクト: wgyg535/webservice-py

def get_paocao_number(html):
    '''Return the text of the eighth "Content_Form" cell of the page.

    Args:
        html: The page html string.

    Returns:
        The text of the eighth ``td`` element whose class is
        "Content_Form" (presumably the running-exercise count, judging by
        the function name and error message).

    Raises:
        custom_exception.ParseException: If the page cannot be parsed or
            has fewer than eight such cells.
    '''
    try:
        soup = BeautifulSoup(html)
        cells = soup.findAll("td", {"class": "Content_Form"})
        return cells[7].text
    except Exception:
        # Exception, not a bare except, so KeyboardInterrupt and SystemExit
        # are not disguised as parse errors.
        raise custom_exception.ParseException("跑操次数页面")

コード例 #5

0

ファイルを表示

def get_appoint_list(html):
    '''Get appoint books.

    Every row of the matching table except the first and the last is
    turned into a book.AppointInfoItem.

    Args:
        html: appoint-books page html.

    Returns:
        Appoint books list (list of book.AppointInfoItem).

    Raises:
        custom_exception.ParseException: If the page cannot be parsed,
            including when the expected table is missing.
    '''
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
        renew_table = soup.find(
            'table', {
                'width': "98%",
                'border': "0",
                'align': "center",
                'cellpadding': "2",
                'cellspacing': "1",
                'bgcolor': "#CCCCCC",
                'class': "table-line"
            })
        rows = renew_table.findAll('tr')
        appoint_item_list = []
        # Skip the first and last rows; only the rows in between are read.
        for row in rows[1:-1]:
            td_items = row.findAll('td')
            # NOTE(review): these prints look like leftover debug output.
            print(row)
            print('-----------------')
            places = [
                option['value'].encode('utf-8').strip()
                for option in td_items[6].findAll('option')
            ]
            # The eighth cell holds the inputs read below; look them up once.
            inputs = td_items[7].findAll('input')
            appoint_item = book.AppointInfoItem(
                td_items[0].string.encode('utf-8').strip(),
                td_items[1].string.encode('utf-8').strip(),
                inputs[1]['value'].encode('utf-8').strip(),
                td_items[2].string.encode('utf-8').strip(),
                td_items[3].string.encode('utf-8').strip(),
                td_items[4].string.encode('utf-8').strip(),
                extract_string(td_items[5]).encode('utf-8').strip(), places,
                inputs[3]['disabled'].encode('utf-8').strip())
            print(appoint_item)
            appoint_item_list.append(appoint_item)
        return appoint_item_list
    except Exception:
        # Exception, not a bare except, so KeyboardInterrupt and SystemExit
        # are not disguised as parse errors.
        raise custom_exception.ParseException("appoint-books page")

コード例 #6

0

ファイルを表示

def get_search_book_list(html):
    '''Extract info from search result page.

    Each ``div`` with class and id "list_books" is turned into one
    book.BookSearchItem.

    Args:
        html: the page resource string.

    Returns:
        The list of book.BookSearchItem objects; [] when the page contains
        no matching div.

    Raises:
        custom_exception.ParseException: If the page cannot be parsed.
    '''
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
        book_divs = soup.findAll('div', {
            'class': 'list_books',
            'id': 'list_books'
        })
        book_list = []
        for book_div in book_divs:
            heading = book_div.h3
            paragraph = book_div.p
            book_item = book.BookSearchItem(
                # The first 17 bytes of the link target are dropped.
                heading.a['href'].encode('utf-8').strip()[17:],
                # Read the title from the current div; reading it from
                # book_divs[0] gave every item the first result's title.
                decode_strange_str(
                    heading.a.string[2:]).encode('utf-8').strip(),
                book_div.find('span', {
                    'class': 'doc_type_class'
                }).string.encode('utf-8').strip(),
                decode_strange_str(
                    heading.contents[2].string).encode('utf-8').strip(),
                decode_strange_str(
                    paragraph.contents[2].string).encode('utf-8').strip(),
                decode_strange_str(
                    paragraph.contents[4].string).encode('utf-8').strip(),
                paragraph.span.contents[1].string.encode('utf-8').strip(),
                paragraph.span.contents[5].string.encode('utf-8').strip())
            book_list.append(book_item)
            print(book_item)
        return book_list
    except Exception:
        raise custom_exception.ParseException("search-result page")

コード例 #7

0

ファイルを表示

            years = items[2].string.encode('utf-8').strip()
            campus = items[3].string.encode('utf-8').strip()
            room = items[4].string.encode('utf-8').strip()
            lend = extract_string(items[5]).encode('utf-8').strip()
            try:
                store = book.BookStore(search_num, barcode, years, campus,
                                       room, lend)
                detail.add_store(store)
            except Exception, e:
                print e

        # print store_items
        # print detail
        return detail
    except:
        raise custom_exception.ParseException("book-detail page")


def __remove_navi_string(tag_list):
    '''Remove NavigableString in tag.contents especially u'\n'.

    The list is filtered in place, so callers that ignore the return value
    still see the change.

    Args:
        tag_list: the list of tags and nav_string gotten from tag.contents.

    Returns:
        The same list object, now holding only BeautifulSoup.Tag instances.
    '''
    # Rebuild through slice assignment instead of calling remove() while
    # iterating: removing during iteration skips the element that follows
    # each removal, leaving consecutive non-tag entries behind.
    tag_list[:] = [
        node for node in tag_list if isinstance(node, BeautifulSoup.Tag)
    ]
    return tag_list