Example #1
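All of the handlers below share a few dependencies that the excerpts do not show: json, Flask's request/session, lxml's etree, and a ScrapyPage fetch helper. A minimal sketch of those shared pieces, assuming ScrapyPage is a plain HTTP GET (the app object and the helper's body are reconstructions, not code from the source):

import json

from flask import Flask, request, session
from lxml import etree
import requests

app = Flask(__name__)


def ScrapyPage(url):
    # Hypothetical helper: the examples only ever call ScrapyPage(url) and
    # treat the result as a text body, so a plain GET with a guessed
    # encoding is assumed here.
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp.text
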
def get_news_content():
    # Parse the JSON request body; "mode" selects which campus site to scrape.
    payload = json.loads(request.data.decode())
    host = ('http://news.ahut.edu.cn' if payload["mode"] == "0"
            else 'http://jwc.ahut.edu.cn')
    html = ScrapyPage(host + payload["contentUrl"])
    selector = etree.HTML(html)

    # The article body lives in one of two container divs depending on the page.
    paragraphs = selector.xpath("//div[@id='vsb_content']/p")
    if not paragraphs:
        paragraphs = selector.xpath("//div[@id='vsb_content_501']/p")

    content = []
    for p in paragraphs:
        # Collapse each paragraph to plain text; type 0 marks a text block.
        text = p.xpath("string(.)").strip().replace("\r\n", "").replace(" ", "")
        if text:
            content.append({"text": text, "type": 0})
        # Inline images become absolute URLs; type 1 marks an image block.
        for img in p.xpath(".//img"):
            content.append({
                "url": host + "/" + img.xpath("./@src")[0],
                "type": 1
            })
    return json.dumps({'content': content})
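
The handler implies a simple JSON contract. A hypothetical invocation through Flask's test client, assuming the function is registered under a /news/content route (the route name and the contentUrl value are made up for illustration):

with app.test_client() as client:
    resp = client.post('/news/content', data=json.dumps({
        "mode": "0",                  # "0" selects news.ahut.edu.cn
        "contentUrl": "/example.htm"  # hypothetical article path
    }))
    # resp.data decodes to {"content": [{"text": ..., "type": 0},
    #                                   {"url": ..., "type": 1}, ...]}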
Example #2
def get_news_list():
    payload = json.loads(request.data.decode())
    mode = payload["mode"]
    static_url = ('http://news.ahut.edu.cn/list.jsp' if mode == "0"
                  else 'http://jwc.ahut.edu.cn/list.jsp')

    html = ScrapyPage(static_url + payload['nextPageUrl'])
    selector = etree.HTML(html)
    # The two sites use different CSS classes for their news links.
    news = selector.xpath("//a[@class='" +
                          ("c1022" if mode == "0" else "c44456") + "']")
    try:
        next_page = selector.xpath("//a[@class='Next']/@href")[0]
    except IndexError:
        # No "Next" link means this was the last page.
        next_page = '-1'

    res = []
    for item in news:
        # The date sits in the sibling cell; each site decorates it
        # differently, so the slicing that trims it differs by mode.
        raw_time = item.xpath("../following-sibling::td[1]/text()")[0]
        res.append({
            "mode": mode,
            "href": item.xpath("./@href")[0],
            "title": item.xpath("./text()")[0].strip(),
            "time": raw_time[:-1] if mode == "0" else raw_time[1:-2]
        })
    return json.dumps({"news": res, 'nextPageUrl': next_page})
Example #3
def search_books():
    # Base query against the library OPAC: fixed filters, 20 results per page,
    # sorted by catalogue date descending.
    static_url = ('http://10.100.101.10:8080/opac/openlink.php?historyCount=0'
                  '&doctype=ALL&match_flag=forward&displaypg=20&sort=CATA_DATE'
                  '&orderby=desc&showmode=list&dept=ALL')
    search_string = request.args.get('strText')
    search_type = request.args.get('strSearchType')
    static_url += "&strText=" + search_string + "&strSearchType=" + search_type
    selector = etree.HTML(ScrapyPage(static_url))
    return json.dumps(get_books(selector))
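
get_books is called here and again in Example #8 but is not part of the excerpt. A hypothetical sketch of its shape, inferred purely from how it is used (it receives a parsed result page and must return something JSON-serializable; the result-list markup below is a guess):

def get_books(selector):
    # Hypothetical reconstruction: pull title/link pairs off one result page.
    books = []
    for a in selector.xpath("//ol[@id='search_book_list']//h3/a"):
        books.append({
            'title': a.xpath("string(.)").strip(),
            'href': a.xpath("./@href")[0]
        })
    return {'books': books}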
Example #4
def get_school_calendar():
    base_url = "http://jwc.ahut.edu.cn"
    static_url = base_url + '/list.jsp?urltype=tree.TreeTempUrl&wbtreeid=1109'
    html = ScrapyPage(static_url)
    selector = etree.HTML(html)
    calendars = selector.xpath("//a[@class='c44456']")
    data = []
    for calendar in calendars:
        # Follow each calendar link and collect the images embedded in its body.
        selector_cal = etree.HTML(
            ScrapyPage(base_url + calendar.xpath("./@href")[0].strip()))
        images = []
        for cal in selector_cal.xpath("//div[@id='vsb_content']/p"):
            srcs = cal.xpath("./img/@src")
            if srcs:  # skip paragraphs that carry no image
                images.append(base_url + "/" + srcs[0])
        data.append({
            "name": calendar.xpath("./@title")[0].strip(),
            "images": images
        })
    return json.dumps({"data": data})
Example #5
def get_week():
    try:
        # The current teaching week sits in the second table row of this page.
        html = ScrapyPage('http://211.70.149.139:84/jxz.aspx')
        selector = etree.HTML(html)
        tr = selector.xpath("//table/tr")[1]
        week = tr.xpath("./td/font/text()")[0].strip()
        return json.dumps({'code': 1, 'week': week})
    except Exception:
        # Any fetch or parse failure is reported as code -1.
        return json.dumps({'code': -1})
Example #6
def get_book_detail():
    static_url = 'http://10.100.101.10:8080/opac/item.php'
    book_id = request.args.get("book_id")
    static_url += "?marc_no=" + book_id
    selector = etree.HTML(ScrapyPage(static_url))

    # The last two <dl> blocks are not bibliographic fields, so drop them.
    dls = selector.xpath("//div[@id='item_detail']/dl")[:-2]
    res = []
    isbn = ''  # stays empty when the record carries no ISBN field
    for dl in dls:
        title = dl.xpath("./dt/text()")[0]
        content = dl.xpath("./dd")[0].xpath("string(.)")
        if 'ISBN' in title:
            isbn = content.split('/')[0].replace('-', '')
        res.append({'title': title, 'content': content})

    # Holdings table: one row per physical copy, header row skipped.
    borrow_info = []
    for row in selector.xpath("//table[@id='item']/tr")[1:]:
        td = row.xpath("./td")
        try:
            borrow_info.append({
                'index': td[0].xpath("./text()")[0],
                'tiao_ma': td[1].xpath("./text()")[0],          # barcode
                'xiao_qu': td[3].xpath("./text()")[0].strip(),  # campus
                'status': td[4].xpath("string(.)")
            })
        except IndexError:
            continue

    # The cover image comes from the OPAC's Douban proxy, keyed by ISBN;
    # skip the lookup when no ISBN was found.
    book_img = ''
    if isbn:
        book_img = json.loads(ScrapyPage(
            'http://10.100.101.10:8080/opac/ajax_douban.php?isbn=' + isbn))['image']
    return json.dumps({
        'info': res,
        'book_img': book_img,
        'borrow_info': borrow_info
    })
Example #7
def check_user():
    login_code = json.loads(request.data.decode())
    # Exchange the WeChat login code for an openid via jscode2session.
    url = ('https://api.weixin.qq.com/sns/jscode2session'
           '?appid=' + wx_config['appid'] +
           '&secret=' + wx_config['secret'] +
           '&grant_type=authorization_code'
           '&js_code=' + login_code['code'])
    try:
        open_id = json.loads(ScrapyPage(url))['openid']
        bind = UserBind.query.filter(UserBind.openid == open_id).first()
        if bind:
            # Known user: restore the session and return the bound identity.
            session['user_number'] = bind.number
            session['user_type'] = bind.identity_type
            return json.dumps({
                'code': '1',
                'msg': '',
                'user_type': bind.identity_type,
                'user_number': bind.number,
                'notification': init_notification()
            })
        # Unknown user: stash the openid so a later bind step can use it.
        session['open_id'] = open_id
        return json.dumps({'code': '0', 'msg': ''})
    except Exception:
        return json.dumps({'code': '-1', 'msg': '用户验证失败...'})  # "User verification failed..."
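
UserBind, wx_config, and init_notification are used above but not shown. A minimal sketch of what the handler implies about them, assuming Flask-SQLAlchemy for the model (the column types, the db object, and the stub body are assumptions):

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy(app)  # assumes the app object sketched before Example #1

wx_config = {'appid': '<wechat-appid>', 'secret': '<wechat-secret>'}


class UserBind(db.Model):
    # Hypothetical model inferred from the queries and attribute accesses above.
    id = db.Column(db.Integer, primary_key=True)
    openid = db.Column(db.String(64), unique=True)
    number = db.Column(db.String(32))        # student / staff number
    identity_type = db.Column(db.String(8))  # user type echoed back to the client


def init_notification():
    # Hypothetical stub: the handler only forwards its return value.
    return []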
Example #8
def search_next_page():
    # The OPAC supplies the query string for the next result page verbatim
    # (see nextPageUrl handling in Example #2); just append it and re-parse.
    static_url = 'http://10.100.101.10:8080/opac/openlink.php'
    static_url += json.loads(request.data.decode())["next_page"]
    selector = etree.HTML(ScrapyPage(static_url))
    return json.dumps(get_books(selector))