Code example #1
def retrieve_pages():
    url = "https://future-students.uq.edu.au/study/find-a-program/listing/undergraduate"
    page = jquery(url=url)
    retrieve_page(page)
    url = "https://future-students.uq.edu.au/study/find-a-program/listing/postgraduate"
    page = jquery(url=url)
    retrieve_page(page)
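Note: none of these examples show their imports. Given calls like jquery(url=...) and the .find()/.attr()/.items() usage throughout, `jquery` is almost certainly pyquery's PyQuery class imported under that alias. A minimal sketch of the assumed shared setup (the URL is illustrative):

# Assumed setup: pyquery's PyQuery imported under the alias `jquery`.
from pyquery import PyQuery as jquery

# PyQuery can fetch a URL directly and parse the response into a queryable DOM.
page = jquery(url="https://example.com")
print(page.find("title").text())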
Code example #2
File: bxd.py Project: tomtam/SpiderDemo
def parseComment(questionNum, commentItem):
    if commentItem is None:
        print(u'Comment element does not exist')
        return
    commentJq = jquery(commentItem)
    # organization / agency
    organization = commentJq.find(".div1 .p1 span a").html()
    # agent
    proxy = commentJq.find(".div1 .p1 a span").html()
    # reply content (find() returns an empty PyQuery, never None)
    replys = commentJq.find(".div2 p")
    if not replys:
        print(u'Comment replies are empty')
        return
    # parse the reply content
    proxyReply = replys[0]
    if proxyReply is None:
        print(u'No agent reply')
        return
    proxyReplyJq = jquery(proxyReply)
    proxyReplyComment = proxyReplyJq.html()
    # normalize separators before extracting the phone number
    proxyReplyComment = cnToNum(proxyReplyComment)
    proxyReplyComment = proxyReplyComment.replace(u"-", "")
    proxyReplyComment = proxyReplyComment.replace(u"_", "")
    proxyReplyComment = proxyReplyComment.replace(u"—", "")
    phone = parsePhone(proxyReplyComment)
    saveHandle(questionNum, {
        "proxy": proxy,
        "organization": organization,
        "phone": phone
    })
    return
Code example #3
def GetBrandData(self):
    html = rq.get(self._url)
    doc = jquery(html)
    brandJqs = doc.find('.super-mod')
    allNum = brandJqs.length  # total count
    print('Total brand records parsed: %s' % allNum)
    scNum = 0  # success count
    for brandItem in brandJqs:
        brandJq = jquery(brandItem)
        if len(brandJq.find('.mod-intro')) == 0:
            print('No .mod-intro element found')
            continue
        scNum += 1
        print(brandJq.find('.mod-intro').html())
    print('Brand records fetched successfully: %s, failed to parse: %s' % (scNum, allNum - scNum))
Code example #4
def GetBrandData(self):
    rs = urllib2.urlopen(self._url, timeout=10)
    html = rs.read().decode('utf-8')
    doc = jquery(html)
    brandJqs = doc.find('.super-mod')
    allNum = brandJqs.length  # total count
    print('Total brand records parsed: %s' % allNum)
    scNum = 0  # success count
    for brandItem in brandJqs:
        brandJq = jquery(brandItem)
        if len(brandJq.find('.mod-intro')) == 0:
            print('No .mod-intro element found')
            continue
        scNum += 1
        print(brandJq.find('.mod-intro').html())
    print('Brand records fetched successfully: %s, failed to parse: %s' % (scNum, allNum - scNum))
Code example #5
def retrieve_program_page(program_page):
    # find() returns an empty PyQuery (never None), so test for emptiness
    if program_page.find("a.green"):
        try:
            courselist_url = host + program_page.find("a.green").attr("href")
            print(courselist_url)
            courselist_page = jquery(url=courselist_url)
            retrieve_course_list_page(courselist_page)
        except Exception as err:
            print(err)
Code example #6
def retrieve_course_list_page(courselist_page):
    courses = courselist_page.find('tr>td:first>a')
    index = 0
    while index < courses.size():
        course_url = host + courses.eq(index).attr("href")
        course_page = jquery(course_url)
        print(courses.eq(index).text())
        retrieve_course_page(course_page, courses.eq(index).text())
        index += 1
Code example #7
def retrieve_page(qaPage):
    baseurl = 'https://uqfuture.custhelp.com'
    listHolder = qaPage.find('.rn_Content')
    all_link = listHolder.find('a')
    for li in all_link:
        if li.text:
            url = baseurl + li.attrib['href']
            answerPage = jquery(url=url)
            retrieve_answer_page(answerPage=answerPage)
Code example #8
def retrieve_pages():
    max_page = 43  # renamed from `max` to avoid shadowing the builtin
    baseurl = "https://uqfuture.custhelp.com/app/answers/list/st/4/page/"
    index = 1
    while index <= max_page:
        print(index)
        url = baseurl + str(index)
        page = jquery(url=url)
        retrieve_page(page)
        index += 1
Code example #9
def getArticleContent(url):
    '''
        Fetch the body content of a blog article.
    '''
    # the content lives under the .show-content element
    if not url:
        print('Invalid URL')
        return
    htmlStr = rq.get(url)
    jq_dom = jquery(htmlStr)
    jq_content = jq_dom.find('.show-content')
    content_html = jq_content.html()
    # print(content_html)
    return content_html
Code example #10
def fetch_course(url):
    courselist_page = jquery(url=url)
    courses = courselist_page.find('tbody>tr>td:nth-child(3)')
    result = []
    maxItem = 50
    index = 0
    for course in courses.items():
        index += 1
        print(course.text())
        if 'Already' not in course.text():
            result.append(course.text())
        if index >= maxItem:
            return ','.join(result) + ". If you want to know more courses, please visit the UQ website for details."
    return ','.join(result)
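Example #10 iterates with .items(), while examples such as #2 and #22 iterate the selection directly and re-wrap each element with jquery(item). Both patterns are valid pyquery usage: plain iteration yields bare lxml elements that must be re-wrapped before querying, whereas .items() yields ready-to-use PyQuery objects. A small sketch of the difference (the HTML is illustrative):

from pyquery import PyQuery as jquery

doc = jquery("<ul><li>a</li><li>b</li></ul>")

# Plain iteration yields lxml elements; re-wrap each one to query it.
for el in doc.find("li"):
    print(jquery(el).text())

# .items() yields PyQuery objects directly, so no re-wrapping is needed.
for item in doc.find("li").items():
    print(item.text())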
Code example #11
def retrieve_page(page):
    program_lists = page.find(".plan")
    print(program_lists.size())
    # start from the second .plan element
    index = 1
    while index < program_lists.size():
        program = program_lists.eq(index)
        index += 1
        if program.text() != "":
            print(program.text())
            program_url = host + program.find('a').attr("href")
            print(program_url)
            try:
                program_page = jquery(url=program_url)
                retrieve_program_page(program_page)
            except Exception as err:
                print("error", err)
Code example #12
File: bxd.py Project: tomtam/SpiderDemo
def getComment(questionNum):
    url = u'http://www.bxd365.com/qa/%s.html' % questionNum
    print(u"Start parsing: %s" % url)
    try:
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    replys = doc.find(".reply li")
    # find() returns an empty PyQuery rather than None when nothing matches
    if len(replys) == 0:
        print(u'No comment data')
        return
    for item in replys:
        parseComment(questionNum, item)
Code example #13
def retrieve_program_page(url):
    driver.get(url=url)
    driver.save_screenshot('screenshot.png')
    try:
        if driver.find_element_by_xpath(
                "//a[text()=\"I'm an international student\"]"):
            driver.find_element_by_xpath(
                "//a[text()=\"I'm an international student\"]").click()
    except Exception:
        print("Already clicked this button")
    try:
        head = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//h1')))
        page = jquery(
            driver.find_element_by_xpath("//html").get_attribute(
                'innerHTML').replace('&gt;', '>'))
        print(head.text)
        head = clean_text(head.text)
        print(head)
        location = clean_text(
            driver.find_element_by_xpath(
                "//span[@data-sinet='LOCATION']").text)
        duration = clean_text(
            driver.find_element_by_xpath(
                "//div[@class='program__duration-value']").text)
        commencing = clean_text(
            driver.find_element_by_xpath(
                "//div[@class='program__commencement-value']").text)
        print(location)
        print(duration)
        print(commencing)

        # fee of the program
        driver.find_element_by_xpath(
            "//a[text()='Fees and scholarships']").click()
        fee = page.find(
            'span[data-sinet="StudentInfo > Domestic > IndicativeFee > CSP"]'
        ).text()
        fee = clean_text(fee)
        print(fee)

        # majors of the program
        majors = []
        majorsElements = page.find('h3[data-sinet="[Plan] TITLE"]')
        for majorsElement in majorsElements.items():
            print(majorsElement.text())
            majors.append(clean_text(majorsElement.text()))
        majors = ','.join(majors)

        # summary of the program
        program_code = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='CODE']").text())
        program_unit = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='UNITS']").text())
        program_level = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='LEVEL_VALUE']").
            text())
        program_faculty = clean_text(
            page.find(
                "ul[class='program__table'] div[data-sinet='Faculty > FACULTY_KEY']"
            ).text())
        print(program_code)
        print(program_unit)
        print(program_level)
        print(program_faculty)

        course_url = page.find('#program-structure > a:nth-child(2)')
        course_url = course_url.attr('href')
        print(course_url)
        courses = "The course list is still not available"
        if course_url is not None:
            courses = fetch_course(course_url)

        entry_requirements = page.find('#entry-requirements')
        entry_requirements = clean_text(entry_requirements.text().replace(
            'Entry requirements ', ''))
        print(entry_requirements)

        connection.cursor().execute(
            '''INSERT into program_international (title, location, duration, commencing, fee, majors, program_code, program_unit, program_level, program_faculty, courses, entry_requirements)
                        values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
            (head, location, duration, commencing, fee, majors, program_code,
             program_unit, program_level, program_faculty, courses,
             entry_requirements))
        connection.commit()
    except TimeoutException:
        print('No such program for international student')
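Example #13 drives the page with Selenium (clicks and explicit waits) but hands the rendered HTML to pyquery for bulk extraction, which avoids issuing one WebDriver round-trip per field. A minimal sketch of that handoff, using the same Selenium API vintage as the example (driver setup and URL are assumptions):

from selenium import webdriver
from pyquery import PyQuery as jquery

driver = webdriver.Chrome()
driver.get("https://example.com")
# Grab the rendered DOM once, then query it in-process with pyquery.
page = jquery(driver.find_element_by_xpath("//html").get_attribute("innerHTML"))
print(page.find("h1").text())
driver.quit()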
Code example #14
def retrieve_pages():
    baseurl = "http://www.uq.edu.au/departments/unit_types.html?type=5"
    page = jquery(url=baseurl)
    retrieve_page(page)
Code example #15
def retrieve_pages():
    url = "http://www.uq.edu.au/events/calendar_view.php?category_id=16"
    page = jquery(url=url)
    retrieve_page(page)
Code example #16
File: fetchMap.py Project: DouerZYY/uq_receptionist
def retrieve_pages():
    baseurl = "http://www.uq.edu.au/maps/mapindex.html?menu=1"
    page = jquery(url=baseurl)
    retrieve_page(page)
Code example #17
import os
import sys

dir_path = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(dir_path + "/..")
import common.request as rq

proxyDatas = []


def getComment(questionNum):
    url = u'http://www.bxd365.com/qa/%s.html' % questionNum
    print(u"Start parsing: %s" % url)
    try:
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    replys = doc.find(".reply li")
    # find() returns an empty PyQuery rather than None when nothing matches
    if len(replys) == 0:
        print(u'No comment data')
        return
    for item in replys:
        parseComment(questionNum, item)


def parseComment(questionNum, commentItem):
    if commentItem is None:
        print(u'Comment element does not exist')
        return
Code example #18
def get_agencys(city, page_index=1):
    '''
        Fetch insurance-agent listings from the 58.com full-time job search.
        * 'city' city info dict; its 'id' is used as the city type
        * 'page_index' current page number
    '''
    city_type = city['id']
    hp.print_partition(u'Parsing city: %s-%s-%s, insurance agent jobs' %
                       (city['province'], city['city'], city['id']))
    # build the listing url
    url = __agencys_url.format(city_type, page_index)
    print(u'Job list url: %s' % url)
    try:
        # html = rq.get_cookie(
        #     url,
        #     headers={
        #         "User-agent":
        #         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        #         "Referer":
        #         url
        #     })
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    if doc is None:
        print(u"Failed to parse html")
        return
    # total number of pages
    page_nums_str = doc.find(".num_operate .total_page").html()
    if page_nums_str is None:
        page_nums_str = '0'
    page_nums = int(page_nums_str)
    print(u'Total pages: %s, current page: %s' % (page_nums, page_index))
    # job list
    list_jobs = doc.find("#list_con .job_item")
    if not list_jobs:
        print(u"No job list found")
        return
    print(u"Total jobs: %s" % len(list_jobs))
    # iterate over the jobs
    today_nums = 0
    for job_item in list_jobs:
        job_item_jq = jquery(job_item)
        job_sign = job_item_jq.find(".sign").html()
        if not check_job_istoday(job_sign):
            print(u'Status: %s, not posted today, skipping' % job_sign)
            continue
        today_nums += 1
        job_name = job_item_jq.find(".name").html()
        if job_name.find(u'保险') < 0:  # u'保险' means "insurance"
            print(u'Job: %s is not insurance-related, skipping~' % job_name)
            continue
        job_address = job_item_jq.find(".address").html()
        job_url = job_item_jq.find("a").attr("href")
        job_company = job_item_jq.find(".job_comp .comp_name .fl").attr(
            "title")
        job_company = analysis_job_company(job_company)
        print(u'%s|%s|%s' % (job_address, job_name, job_sign))
        # throttle
        hp.sleep(0.3, 0.6, content=u'Fetching job detail =>')
        # parse the job_data payload
        job_data = analysis_job_data(job_url)

        if job_data is None or job_data['pagenum'] is None:
            print(u'Cannot get pagenum, skipping!')
            continue
        __jobs.append({
            "name": job_name,
            "address": job_address,
            "url": job_url,
            "pagenum": job_data['pagenum'],
            "contactPerson": job_data['contactPerson'],
            "sign": job_sign,
            "company": job_company
        })
    print(u'Current page: %s, total pages: %s' % (page_index, page_nums))
    # check whether we need to keep paging
    if today_nums <= 0:
        print(u'Current page: %s, no jobs posted today, no need to page further' % page_index)
        return
    page_index += 1
    # recurse to the next page
    if page_index <= page_nums:
        print(' ')
        # throttle
        hp.sleep(0, 1, content=u'Paging =>')
        get_agencys(city, page_index)
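Example #18 advances through result pages by calling itself recursively, which adds one stack frame per page. A loop-based sketch of the same pagination pattern (parse_listing_page is a hypothetical helper standing in for the body of get_agencys, returning the total page count and the number of today's jobs found on that page):

def get_agencys_iter(city):
    # Iterative equivalent of the recursive paging in get_agencys:
    # stop when a page has no jobs posted today or the last page is reached.
    page_index = 1
    while True:
        page_nums, today_nums = parse_listing_page(city, page_index)  # hypothetical helper
        if today_nums <= 0 or page_index >= page_nums:
            return
        page_index += 1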
Code example #19
def retrieve_page(qaPage):
    base_url = 'http://www.uq.edu.au/departments/'
    all_links = qaPage.find('#content-primary>a')
    for link in all_links.items():
        url = base_url + link.attr('href')
        retrieve_school_page(jquery(url=url))
Code example #20
    url = __agencys_url.format(city_type, page_index)
    print(u'Job list url: %s' % url)
    try:
        # html = rq.get_cookie(
        #     url,
        #     headers={
        #         "User-agent":
        #         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        #         "Referer":
        #         url
        #     })
        html = rq.get(url)
    except Exception as e:
        print(e)
        return
    doc = jquery(html)
    if doc is None:
        print(u"Failed to parse html")
        return
    # total number of pages
    page_nums_str = doc.find(".num_operate .total_page").html()
    if page_nums_str is None:
        page_nums_str = '0'
    page_nums = int(page_nums_str)
    print(u'Total pages: %s, current page: %s' % (page_nums, page_index))
    # job list
    list_jobs = doc.find("#list_con .job_item")
    if not list_jobs:
        print(u"No job list found")
        return
    print(u"Total jobs: %s" % len(list_jobs))
Code example #21
def retrieve_pages():
    url = "https://www.uq.edu.au/study/browse.html?level=ugpg"
    page = jquery(url=url)
    retrieve_page(page)
Code example #22
def specialArticles(key, source, page=1):
    '''
        Fetch the article list for a topic.
        * 'key' topic key
        * 'page' article page number
    '''
    url = __special_newlike_url.format(key)
    htmlStr = rq.get(url)
    if not htmlStr:
        print(u'Failed to fetch html')
        return
    jq_dom = jquery(htmlStr)
    if not jq_dom:
        print(u'Cannot parse the page dom')
        return
    dom_contents = jq_dom.find('.content')
    if not dom_contents:
        print(u'Cannot parse the article content')
        return
    articles = []
    for item in dom_contents:
        jq_content_item = jquery(item)
        dom_title = jq_content_item.find('.title')
        dom_time = jq_content_item.find('.time')
        dom_read = jq_content_item.find('.ic-list-read')
        dom_comments = jq_content_item.find('.ic-list-comments')
        dom_like = jq_content_item.find('.ic-list-like')
        if not dom_title:
            print(u'Cannot parse title')
            continue
        if not dom_time:
            print(u'Cannot parse time')
            continue
        # parse the article metadata
        article_read = int(dom_read.parent().text())
        article_comments = int(dom_comments.parent().text())
        article_like = int(dom_like.parent().text())
        article_title = dom_title.html()
        article_href = dom_title.attr('href')
        article_time = dom_time.attr('data-shared-at').replace(
            '-', ' ').replace('+08:00', '').replace('T', ' ')
        # article_time = time.strptime(article_time, '%Y %m %d %H:%M:%S')
        article_url = '{host}{href}'.format(
            host=__jianshu_host, href=article_href)
        print(u'Fetched article:', hp.remove_emoji(), article_title, article_url,
              article_time)
        if article_read < 100:
            print(u'Article has fewer than 100 reads, skipping')
            continue
        if article_like < 1:
            print(u'Article has fewer than 1 like, skipping')
            continue
        if article_comments < 1:
            print(u'Article has fewer than 1 comment, skipping')
            continue
        # fetch the article body
        content_html = getArticleContent(article_url)
        if not content_html:
            print(u'Cannot fetch the article body')
            continue
        # clean up the content html
        content_html = content_html.replace('data-original-', '')
        content_markdown = getCotentMarkDown(content_html)
        # markdown string post-processing
        # content_markdown = content_markdown.replace("|", "-")
        articles.append({
            'title': article_title,
            'url': article_url,
            'time': article_time,
            'source': source,
            'content': content_markdown
        })
    return articles