Ejemplos de get_item en Python

Lenguaje de programación: Python

Namespace/Package Name: scrapySchool_Australian_yan.getItem

Método / Función: get_item

Ejemplos en hotexamples.com: 17

Python get_item - 17 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de scrapySchool_Australian_yan.getItem.get_item extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: VictoriaUniversity_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Scrape one Victoria University postgraduate course page.

        Fills a ``ScrapyschoolAustralianYanItem`` from ``response`` and yields
        it. A page is only scraped when its title contains "Master" fewer
        than two times and does not contain "Graduate"; other pages yield
        nothing.

        Any exception raised while scraping is appended to an error file
        named after the university and degree type, then printed; the page
        then produces no item.

        :param response: the downloaded course page (needs ``.url`` and
            ``.xpath``).
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Victoria University"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        pro = [
            "Graduate Certificate in Enterprise and Resource Planning Systems",
            "Graduate Certificate in International Business",
            "Graduate Diploma in Education",
            "Graduate Diploma in Project Management",
            "Master of Business (Accounting)/Master of Finance",
            "Master of Business (Enterprise Resource Planning Systems)/ Master of Supply Chain Management",
            "Master of Business (Enterprise Resource Planning Systems)/Master of Business Analytics",
            "Master of Business (Finance)",
            "Master of Business (International Business)",
            "Master of Counselling",
            "Master of Engineering (Building Fire Safety and Risk Engineering)",
            "Master of Finance",
            "Master of Industrial Relations and Human Resource Management",
            "Master of International Business",
            "Master of Management",
            "Master of Marketing",
            "Master of Supply Chain Management",
            "Master of Teaching (Secondary Education)",
            "Master of Tourism and Destination Management",
        ]
        uu = [
            "https://www.vu.edu.au/courses/international/BTEN",
            "https://www.vu.edu.au/courses/international/BTIB",
            "https://www.vu.edu.au/courses/international/EGED",
            "https://www.vu.edu.au/courses/international/NGPM",
            "https://www.vu.edu.au/courses/international/BMDD",
            "https://www.vu.edu.au/courses/international/BMDB",
            "https://www.vu.edu.au/courses/international/BMDA",
            "https://www.vu.edu.au/courses/international/BMFN",
            "https://www.vu.edu.au/courses/international/BMIA",
            "https://www.vu.edu.au/courses/international/AMPE",
            "https://www.vu.edu.au/courses/international/EMQB",
            "https://www.vu.edu.au/courses/international/BMFF",
            "https://www.vu.edu.au/courses/international/BMIH",
            "https://www.vu.edu.au/courses/international/BMIB",
            "https://www.vu.edu.au/courses/international/BMMM",
            "https://www.vu.edu.au/courses/international/BMKM",
            "https://www.vu.edu.au/courses/international/BMSP",
            "https://www.vu.edu.au/courses/international/EMES",
            "https://www.vu.edu.au/courses/international/BMTD",
        ]
        # The two lists are parallel: uu[i] is the course page of pro[i].
        programme_dict = dict(zip(uu, pro))
        # None when the page URL is not one of the known course URLs.
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath("//h1[@class='page-header']//text()").extract()
            # clear_space is called for its effect on the list; its return
            # value is not used here (presumably it cleans in place).
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Master", item['degree_name'])
            if len(pro_re) < 2 and "Graduate" not in item['degree_name']:
                # Prefer the bracketed part of the title as the programme
                # name; otherwise use the title without "Master of".
                programme_re = re.findall(r"\(.+\)", item['degree_name'])
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace("(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace("Master of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                department = response.xpath("//div[@class='field field-name-field-college field-type-link-field field-label-inline clearfix']//div[@class='field-items']//text()").extract()
                clear_space(department)
                item['department'] = ''.join(department).strip()
                print("item['department']: ", item['department'])

                start_date = response.xpath(
                    "//div[@class='field field-essentials-intake']//div[@class='field-item']//text()|"
                    "//strong[contains(text(),'Intakes:')]/../div//text()").extract()
                clear_space(start_date)
                # Intake months are stored as comma-separated month numbers
                # without zero padding (e.g. "2,7"). The search pattern and
                # the numbers both come from this one tuple so they cannot
                # drift apart; the number is the 1-based position, which
                # keeps "10" intact instead of stripping its zero.
                month_names = ("january", "february", "march", "april", "may", "june",
                               "july", "august", "september", "october", "november",
                               "december")
                intake_months = re.findall("|".join(month_names), ''.join(start_date), re.I)
                item['start_date'] = ",".join(
                    str(month_names.index(month.lower()) + 1) for month in intake_months)
                print("item['start_date']: ", item['start_date'])

                duration = response.xpath(
                    "//div[@class='field field-essentials-duration']//div[@class='field-item']//text()|"
                    "//strong[contains(text(),'Duration:')]/../div//text()").extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                print("item['duration']: ", item['duration'])

                location = response.xpath(
                    "//div[@class='field field-essentials-locations']//div[@class='field-items']//text()|"
                    "//strong[contains(text(),'Location:')]/../div//text()").extract()
                clear_space(location)
                item['location'] = ''.join(location).strip()
                print("item['location']: ", item['location'])

                tuition_fee = response.xpath(
                    "//div[@class='field field-essentials-short-fees']//div[@class='field-item']//text()|"
                    "//strong[contains(text(),'Fees:')]/../div//text()").extract()
                print("tuition_fee: ", tuition_fee)
                clear_space(tuition_fee)
                tuition_fee_str = ''.join(tuition_fee).strip()
                # Only amounts that follow the literal "2019" are picked up;
                # the result is their digits with the thousands comma removed.
                tuition_fee_re1 = re.findall(r"2019[\w\W]*?\d+,\d+", tuition_fee_str)
                tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee_re1))
                item['tuition_fee'] = ''.join(tuition_fee_re).replace(",", "").strip()
                print("item['tuition_fee']: ", item['tuition_fee'])

                overview = response.xpath(
                    "//div[@id='overview']").extract()
                item['degree_overview_en'] = remove_class(clear_lianxu_space(overview))

                career = response.xpath(
                    "//div[@id='careers']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))

                modules = response.xpath(
                    "//div[@id='course-structure']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))

                # Positional selector: the fourth <div> directly under
                # <article> is taken as the entry-requirements section.
                entry_requirements = response.xpath(
                    "//html//article/div[4]").extract()
                item['rntry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements))

                how_to_apply = response.xpath(
                    "//div[@id='apply-now']").extract()
                item['apply_desc_en'] = remove_class(clear_lianxu_space(how_to_apply))

                # IELTS description: up to 120 characters following each
                # "IELTS" occurrence in the entry requirements.
                ielts_desc_re = re.findall(r"IELTS.{1,120}", item['rntry_requirements_en'])
                item['ielts_desc'] = ''.join(ielts_desc_re).strip()
                print("item['ielts_desc']: ", item['ielts_desc'])

                # The count of numbers found decides how they map to fields:
                # 1 -> same score everywhere, 2 -> overall + one band score,
                # 5 -> overall, listening, reading, writing, speaking.
                # Any other count leaves the IELTS fields as get_item set them.
                ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ieltlsrw) == 1:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[0]
                    item["ielts_s"] = ieltlsrw[0]
                    item["ielts_r"] = ieltlsrw[0]
                    item["ielts_w"] = ieltlsrw[0]
                elif len(ieltlsrw) == 2:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[1]
                    item["ielts_s"] = ieltlsrw[1]
                    item["ielts_r"] = ieltlsrw[1]
                    item["ielts_w"] = ieltlsrw[1]
                elif len(ieltlsrw) == 5:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[1]
                    item["ielts_s"] = ieltlsrw[4]
                    item["ielts_r"] = ieltlsrw[2]
                    item["ielts_w"] = ieltlsrw[3]
                print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                        item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
                # TOEFL scores are fixed values, not read from the page.
                item['toefl'] = "79"
                item['toefl_l'] = "19"
                item['toefl_s'] = "19"
                item['toefl_r'] = "18"
                item['toefl_w'] = "22"
                yield item
        except Exception as e:
            # Best-effort scraping: record the failure and move on.
            with open("scrapySchool_Australian_yan/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 2

Mostrar archivo

Archivo: UniversityofCanberra_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Build a course item from a University of Canberra course page.

        Yields one ``ScrapyschoolAustralianYanItem`` when the cleaned course
        title contains "Master" fewer than two times and does not contain
        "online"; otherwise yields nothing.

        Any exception raised during extraction is appended to an error file
        named after the university and degree type, then printed.

        :param response: the downloaded course page (needs ``.url`` and
            ``.xpath``).
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "University of Canberra"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)

        try:
            title_parts = response.xpath("//h1[@class='course_title']//text()").extract()
            clear_space(title_parts)
            raw_title = ''.join(title_parts).strip()
            # Cut the "-..." tail(s) matched by the regex out of the title.
            dash_tail = ''.join(re.findall(r"-.*", raw_title))
            degree_name = raw_title.replace(dash_tail, '').strip()
            item['degree_name'] = degree_name
            print("item['degree_name']: ", degree_name)

            # Guard: titles with two or more "Master" and online courses
            # are not scraped.
            if degree_name.count("Master") >= 2 or "online" in degree_name:
                return

            # Programme name: bracketed part of the title when present,
            # otherwise the title without "Master of".
            bracketed = re.findall(r"\(.+\)", degree_name)
            if bracketed:
                programme_en = ''.join(bracketed).replace("(", "").replace(")", "").strip()
            else:
                programme_en = degree_name.replace("Master of", "").strip()
            item['programme_en'] = programme_en
            print("item['programme_en']: ", programme_en)

            location_parts = response.xpath(
                "//th[contains(text(),'Location:')]/following-sibling::td//text()"
            ).extract()
            clear_space(location_parts)
            item['location'] = ''.join(location_parts).strip()

            faculty_parts = response.xpath(
                "//th[contains(text(),'Faculty:')]/following-sibling::td//text()"
            ).extract()
            clear_space(faculty_parts)
            item['department'] = ''.join(faculty_parts).strip()

            english_parts = response.xpath(
                "//th[contains(text(),'English Language Requirements:')]/following-sibling::td//text()"
            ).extract()
            ielts_desc = ''.join(english_parts).strip()
            item['ielts_desc'] = ielts_desc

            # How many numbers were found selects which position feeds each
            # field; counts other than 1, 2 or 5 assign nothing.
            scores = re.findall(r"\d[\d\.]{0,2}", ielts_desc)
            ielts_fields = ("ielts", "ielts_l", "ielts_s", "ielts_r", "ielts_w")
            index_layouts = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                5: (0, 1, 4, 2, 3),
            }
            for field, position in zip(ielts_fields, index_layouts.get(len(scores), ())):
                item[field] = scores[position]

            fee_parts = response.xpath(
                "//div[@id='fees']//tr[2]/td[3]//text()"
            ).extract()
            clear_space(fee_parts)
            fee_amounts = re.findall(r"\d+,\d+", ''.join(fee_parts).strip())
            item['tuition_fee'] = ''.join(fee_amounts).replace(",", "").strip()

            # Overview: everything before the careers heading, falling back
            # to everything before the collapsible section.
            overview_html = response.xpath(
                "//h2[contains(text(),'Career opportunities')]/preceding-sibling::*"
            ).extract() or response.xpath(
                "//div[@class='collapsible-section']/preceding-sibling::*"
            ).extract()
            item['degree_overview_en'] = remove_class(clear_lianxu_space(overview_html))

            career_html = response.xpath(
                "//h2[contains(text(),'Career opportunities')]|//h2[contains(text(),'Career opportunities')]/following-sibling::*[1]|"
                "//strong[contains(text(),'Career opportunities')]/..|//strong[contains(text(),'Career opportunities')]/../following-sibling::*[position()<3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_html))

            modules_html = response.xpath(
                "//h2[contains(text(),'Course Requirements')]|//div[@id='toggle-view']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_html))

            admission_html = response.xpath("//div[@id='admission']").extract()
            item['rntry_requirements_en'] = remove_class(clear_lianxu_space(admission_html))

            experience_html = response.xpath(
                "//*[contains(text(), 'work experience')]"
            ).extract()
            item['work_experience_desc_en'] = remove_class(clear_lianxu_space(experience_html))
            print("item['work_experience_desc_en']: ", item['work_experience_desc_en'])

            yield item
        except Exception as e:
            # Best-effort scraping: log the failure to file and console.
            error_path = "scrapySchool_Australian_yan/error/" + item['university'] + str(item['degree_type']) + ".txt"
            with open(error_path, 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 3

Mostrar archivo

Archivo: SouthernCrossUniversity_P.py Proyecto: histudent/python_spider

    def content(self, response):
        """Scrape one Southern Cross University course page.

        A page is scraped only when its title contains "Master" fewer than
        two times and its international duration text contains "full"
        (case-insensitive). Pages whose location is exactly "SCU Online"
        yield nothing.

        When the page lists specialisations, one item is yielded per
        specialisation (with that specialisation's name and module block),
        provided the number of names and module blocks agree; if they
        disagree nothing is yielded. Without specialisations a single item
        is yielded.

        Any exception raised during extraction is appended to an error file
        named after the university and degree type, then printed.

        :param response: the downloaded course page (needs ``.url`` and
            ``.xpath``).
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Southern Cross University"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@class='pageTitleFixSource']//text()").extract()
            # clear_space is called for its effect on the list; its return
            # value is not used here (presumably it cleans in place).
            clear_space(programme)
            item['degree_name'] = ''.join(programme)
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Master", item['degree_name'])
            if len(pro_re) < 2:
                duration = response.xpath(
                    "//div[@id='international']//td[contains(text(),'Duration')]/following-sibling::td//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                print("item['duration']: ", item['duration'])

                if "full" in item['duration'].lower():
                    # Programme name: bracketed part of the title when
                    # present, otherwise the title without "Master of".
                    programme_re = re.findall(r"\(.+\)", item['degree_name'])
                    if len(programme_re) > 0:
                        item['programme_en'] = ''.join(programme_re).replace(
                            "(", "").replace(")", "").strip()
                    else:
                        item['programme_en'] = item['degree_name'].replace(
                            "Master of", "").strip()
                    print("item['programme_en']: ", item['programme_en'])

                    overview = response.xpath(
                        "//div[@class='summary']").extract()
                    item['degree_overview_en'] = remove_class(
                        clear_lianxu_space(overview))

                    career = response.xpath(
                        "//h3[contains(text(), 'Career opportunities')]/.."
                    ).extract()
                    item['career_en'] = remove_class(
                        clear_lianxu_space(career))

                    tuition_fee = response.xpath(
                        "//html//div[@class='table-grid table-responsive no-overflow']//div[@class='table-grid table-responsive no-overflow']//tbody/tr/td[3]//text()"
                    ).extract()
                    clear_space(tuition_fee)
                    item['tuition_fee'] = '; '.join(tuition_fee).strip()
                    print("item['tuition_fee']: ", item['tuition_fee'])

                    # Overall IELTS score: the row class varies between
                    # pages, so several selectors are unioned.
                    IELTS = response.xpath(
                        "//tr[@class='data-label-Overall']/td[2]//text()|//tr[@class='data-label-Overall Score']/td[2]//text()|"
                        "//td[contains(text(),'Overall Score')]/following-sibling::td//text()"
                    ).extract()
                    clear_space(IELTS)
                    item['ielts'] = ','.join(IELTS).strip()
                    print("item['ielts']: ", item['ielts'])

                    # The four band scores share one row layout, so they are
                    # extracted by a single loop; this guarantees each list
                    # is cleaned before it is joined (the Writing list was
                    # previously skipped by mistake).
                    ielts_bands = (
                        ("ielts_l", "Listening"),
                        ("ielts_s", "Speaking"),
                        ("ielts_r", "Reading"),
                        ("ielts_w", "Writing"),
                    )
                    for field, band in ielts_bands:
                        band_scores = response.xpath(
                            "//tr[@class='data-label-%s']/td[2]//text()" % band
                        ).extract()
                        clear_space(band_scores)
                        item[field] = ','.join(band_scores).strip()
                        print("item['%s']: " % field, item[field])

                    modules = response.xpath(
                        "//div[@id='structure']").extract()
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(modules))

                    rntry_requirements_en = response.xpath(
                        "//h2[contains(text(),'Admission requirements')]|//h2[contains(text(),'Admission requirements')]/following-sibling::div[1]"
                    ).extract()
                    item['rntry_requirements_en'] = remove_class(
                        clear_lianxu_space(rntry_requirements_en))
                    print("item['rntry_requirements_en']: ",
                          item['rntry_requirements_en'])

                    how_to_apply = response.xpath(
                        "//div[@id='apply']").extract()
                    item['apply_desc_en'] = remove_class(
                        clear_lianxu_space(how_to_apply))
                    print("item['apply_desc_en']: ", item['apply_desc_en'])

                    # Campus names: first cell of every availability row
                    # except the last one.
                    location = response.xpath(
                        "//div[@id='international']//td[contains(text(),'Availability details')]/following-sibling::td//tbody/tr[position()<last()]/td[1]//text()"
                    ).extract()
                    clear_space(location)
                    item['location'] = ', '.join(location).strip()
                    print("item['location']: ", item['location'])

                    if item['location'] != "SCU Online":
                        major_list = response.xpath(
                            "//h3[contains(text(),'Specialisations')]/../../following-sibling::tr[@class='header-row text group-hdr']//h4//text()"
                        ).extract()
                        clear_space(major_list)
                        print("major_list: ", major_list)
                        print(len(major_list))

                        if len(major_list) == 0:
                            yield item
                        else:
                            modules_list = response.xpath(
                                "//h3[contains(text(),'Specialisations')]/../../following-sibling::tr[@class='header-row text group-hdr']//h4/following-sibling::div"
                            ).extract()
                            print("===", modules_list)
                            print(len(modules_list))
                            # Names and module blocks are paired by position,
                            # so they are only used when the counts agree.
                            if len(modules_list) == len(major_list):
                                # NOTE(review): the same item object is
                                # mutated and re-yielded for every
                                # specialisation — confirm downstream
                                # consumers handle each yield before the
                                # next mutation.
                                for major, module_html in zip(major_list, modules_list):
                                    item['programme_en'] = major
                                    item['modules_en'] = remove_class(
                                        clear_lianxu_space([module_html]))
                                    print("item['programme_en']: ",
                                          item['programme_en'])
                                    yield item
        except Exception as e:
            # Best-effort scraping: record the failure and move on.
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 4

Mostrar archivo

Archivo: LaTrobeUniversity_P1.py Proyecto: histudent/python_spider

    def parses(self, response):
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = 'La Trobe University'
        item['url'] = response.url
        # item['location']='Melbourne'
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("================================================")
        print(response.url)

        try:
            # 学位名称
            degree_name = response.xpath(
                '//h1[contains(text(),"Master of")]/text()').extract()
            clear_space(degree_name)
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Master", item['degree_name'])
            # print("pre_re: ", pro_re)
            if len(pro_re) < 2:
                programme_re = re.findall(
                    r"\(.+\)", item['degree_name'].replace("(Advanced)",
                                                           "").strip())
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Master of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                start_date = response.xpath(
                    '//div[contains(text(),"tart")]/following-sibling::div//text()'
                ).extract()
                # print('start_date: ',start_date)
                item['start_date'] = getStartDateMonth(''.join(start_date))
                if item['start_date'] == "":
                    item['start_date'] = ''.join(start_date).strip()
                # print("item['start_date']: ", item['start_date'])

                duration = response.xpath(
                    '//div[contains(text(),"uration")]/following-sibling::div//text()'
                ).extract()
                # print('duration: ',duration)
                item['duration'] = ''.join(duration).strip()
                # print("item['duration']: ", item['duration'])

                fee = response.xpath(
                    '//h3[contains(text(),"tuition fee")]/following-sibling::p[1]/text()'
                ).extract()
                # print('fee: ',fee)
                fee = ''.join(fee).strip()
                tuition = fee.replace(' ', '')
                item['tuition_fee'] = tuition[0:99]
                # print("item['tuition_fee']: ", item['tuition_fee'])

                overview = response.xpath(
                    '//section[@id="overview"]/div[@class="block"]').extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['degree_overview_en']: ", item['degree_overview_en'])

                rntry = response.xpath(
                    '//section[@id="entry-requirements"]').extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry))
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                career = response.xpath(
                    '//section[@id="career-outcomes"]').extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                htp = response.xpath('//section[@id="how-to-apply"]').extract()
                item['apply_desc_en'] = remove_class(clear_lianxu_space(htp))
                # print("item['apply_desc_en']: ", item['apply_desc_en'])

                # //ul[@class='list-arrows']//li
                location_dict = {
                    'BU': 'Melbourne',
                    'BE': 'Bendigo',
                    'CI': 'City',
                    'MI': 'Mildura',
                    'OT': 'Other',
                    'FS': 'Franklin Street',
                    'SH': 'Shepparton',
                    'SY': 'Sydney',
                    'ON': 'Online',
                    'WO': 'Albury-Wodonga',
                }
                location = response.xpath(
                    "//ul[@class='list-arrows']//li//text()").extract()
                # print("location: ", location)
                item['location'] = ''.join(location).replace("(Bundoora)",
                                                             "").strip()
                if item['location'] == "":
                    location_key = response.url.replace(
                        "https://www.latrobe.edu.au/courses/data/2019/international/",
                        "").strip()
                    # print("location_key1: ", location_key)
                    location_key = location_key.split("/")[0]
                    # print("location_key: ", location_key)
                    item['location'] = location_dict.get(
                        ''.join(location_key).upper())
                # print("item['location']: ", item['location'])

                ielts = response.xpath(
                    '//p[contains(text(),"IELTS")]/text()').extract()
                item['ielts_desc'] = ''.join(ielts).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts.get('IELTS')
                item['ielts_l'] = ielts.get('IELTS_L')
                item['ielts_s'] = ielts.get('IELTS_S')
                item['ielts_r'] = ielts.get('IELTS_R')
                item['ielts_w'] = ielts.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #        item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                modules_url = response.xpath(
                    '//ul[@class="list-arrows"]/li[1]/a/@href').extract()
                clear_space(modules_url)
                if modules_url != []:
                    try:
                        item['modules_en'] = self.parse_modules(modules_url[0])
                    except:
                        item['modules_en'] = ""
                # print("item['modules_en']: ", item['modules_en'])

                work_experience_desc_en = response.xpath(
                    "//section[@id='entry-requirements']//p[contains(text(), 'work experience')]"
                ).extract()
                item['work_experience_desc_en'] = remove_class(
                    clear_lianxu_space(work_experience_desc_en))
                if item['work_experience_desc_en'] == "":
                    work_experience_desc_en = re.findall(
                        r"<.{1,200}work\sexperience.{1,200}>", response.text)
                    item['work_experience_desc_en'] = "<p>" + remove_tags(
                        clear_lianxu_space(work_experience_desc_en)) + "</p>"
                    item['work_experience_desc_en'] = item[
                        'work_experience_desc_en'].replace("<p></p>", "")
                # print("item['work_experience_desc_en']: ", item['work_experience_desc_en'])

                item[
                    'apply_proces_en'] = "https://www.latrobe.edu.au/international/how-to-apply/undergraduate-and-postgraduate"

                item['overview_en'] = item['degree_overview_en']
                programme_major = response.xpath(
                    '//section[@id="overview"]/div[@class="block"]//ul/li'
                ).extract()
                print(len(programme_major))
                if len(programme_major) == 0:
                    yield item
                else:
                    for i in range(len(programme_major)):
                        print("***************************" + str(i + 1) +
                              "****************************")
                        programme_major1 = response.xpath(
                            '//section[@id="overview"]/div[@class="block"]//ul/li['
                            + str(i + 1) + ']//text()').extract()
                        item['programme_en'] = ''.join(
                            programme_major1).strip()
                        print("item['programme_en']_major: ",
                              item['programme_en'])
                        yield item
                degree_name_list = response.xpath(
                    '//p[contains(text(),"Our Majors are:")]/following-sibling::ul/li//text()|'
                    '//p[contains(text(),"Choose from five majors")]/following-sibling::ul[1]/li/text()|'
                    '//h3[contains(text(),"Specialisations, majors and minors")]/following-sibling::table/tbody/tr/td[1]//text()'
                ).extract()

        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 5

Mostrar archivo

    def parse_data(self, response):
        """Parse a degree detail page into an item, or follow its specialisations.

        If the page lists links under a "Specialisations" heading, one
        ``scrapy.Request`` per link is yielded with this same method as the
        callback, and no item is built for the page itself. Otherwise the
        page is scraped into a ``ScrapyschoolAustralianYanItem``, which is
        yielded unless the degree name contains "research".

        Any exception raised while scraping is appended, together with the
        page URL, to a text file under ``scrapySchool_Australian_yan/error/``
        and printed; in that case the item is not yielded.

        Args:
            response: Response for the degree page; ``url`` and ``xpath``
                are the only attributes used here.

        Yields:
            ``scrapy.Request`` objects for specialisation pages, or the
            populated item.
        """
        # Check whether the degree has specialisation pages underneath it.
        specialisations = response.xpath("//h2[contains(text(),'Specialisations')]/following-sibling::*//a/@href").extract()
        print("specialisations: ", specialisations, response.url)
        if specialisations:
            for link in specialisations:
                # Links without a scheme are resolved against the study site root.
                url = link if "http" in link else "http://study.unisa.edu.au" + link
                yield scrapy.Request(url, callback=self.parse_data)
            return

        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "University of South Australia"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        try:
            programme = response.xpath(
                "//div[@class='title-row']/h1/text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).replace("(International)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # Names mentioning "Master" twice or more are left without a
            # programme name assigned here.
            if len(re.findall(r"Master", item['degree_name'])) < 2:
                programme_re = re.findall(r"\(.+\)", item['degree_name'])
                print("programme_re: ", programme_re)
                if programme_re and ''.join(programme_re).strip() != "(Graduate Entry)":
                    # A bracketed qualifier is taken as the programme name.
                    item['programme_en'] = ''.join(programme_re).replace("(", "").replace(")", "").strip()
                else:
                    # No qualifier (or only "(Graduate Entry)"): use the
                    # degree name minus its "Master of" prefix.
                    name = item['degree_name'].replace("Master of", "").replace("(Graduate Entry)", "").strip()
                    # Remove only a standalone leading/trailing word "in".
                    # str.strip("in") strips the *characters* i and n, which
                    # mangled names such as "Design" into "Desig".
                    item['programme_en'] = re.sub(r"^in\s+|\s+in$", "", name).strip()
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//span[contains(text(), 'Start')]/../text()").extract()
            clear_space(start_date)
            item['start_date'] = getStartDateMonth(', '.join(start_date))
            print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//span[contains(text(),'Campus')]/../a//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            duration = response.xpath(
                "//span[contains(text(),'Duration')]/../text()").extract()
            clear_space(duration)
            item['duration'] = ''.join(duration).strip()

            tuition_fee = response.xpath("//span[contains(text(),'2019: AUD$')]//text()|"
                                         "//p[contains(text(),'(2019 annual)')]//text()|"
                                         "//span[contains(text(),'Fees')]/../text()").extract()
            clear_space(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            item['tuition_fee'] = str(tuition_fee)
            # A parsed fee of "0" is stored as None rather than as a value.
            if item['tuition_fee'] == "0":
                item['tuition_fee'] = None
            print("item['tuition_fee']: ", item['tuition_fee'])

            ielts = response.xpath("//span[contains(text(),'English Language Requirements')]/../ul//text()").extract()
            clear_space(ielts)
            item['ielts_desc'] = ' '.join(ielts).strip()

            # The first number in the description is taken as the overall
            # score. Guard the lookup so that a page without any IELTS
            # figures does not raise IndexError and lose the whole item.
            ielts_scores = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if ielts_scores:
                item["ielts"] = ielts_scores[0]

            # Sub-scores appear in the description as "<skill> [x.x]".
            for field, skill in (("ielts_l", "listening"),
                                 ("ielts_s", "speaking"),
                                 ("ielts_r", "reading"),
                                 ("ielts_w", "writing")):
                matches = re.findall(skill + r"\s\[.*?\]", item['ielts_desc'])
                item[field] = ''.join(matches).replace(skill, "").replace("[", "").replace("]", "").strip()

            entry_requirements = response.xpath(
                "//div[@class='page-info-block-inner']//ul[@id='entry-requirements']").extract()
            item['rntry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements))

            degree_overview_en = response.xpath(
                "//h2[contains(text(),'Degree overview')]/../../..").extract()
            item['degree_overview_en'] = remove_class(clear_lianxu_space(degree_overview_en))

            overview_en = response.xpath(
                "//h2[contains(text(),'Snapshot')]/..|"
                "//h3[contains(text(),'Snapshot')]/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            modules_en = response.xpath(
                "//h2[@class='theme-white'][contains(text(), 'Degree structure')]/../..|"
                "//h3[contains(text(),'Degree structure')]/../..").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            print("item['modules_en']: ", item['modules_en'])

            career_en = response.xpath(
                "//h2[contains(text(),'Your career')]/../../..|"
                "//h3[contains(text(),'Your career')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en']: ", item['career_en'])

            apply_desc_en = response.xpath(
                "//h2[contains(text(),'How to apply')]/../../..").extract()
            item['apply_desc_en'] = remove_class(clear_lianxu_space(apply_desc_en))

            # Degrees whose name mentions "research" are not emitted.
            if "research" not in item['degree_name'].lower():
                yield item
        except Exception as e:
            # Record the failure next to the URL so the page can be re-checked.
            with open("scrapySchool_Australian_yan/error/" + item['university'] + str(item['degree_type']) + ".txt",
                      'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 6

Mostrar archivo

Archivo: BondUniversity_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Bond University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://bond.edu.au'
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            type = response.xpath(
                "//strong[contains(text(),'Program type')]/../following-sibling::td//text()"
            ).extract()
            print("type: ", type)
            if ''.join(type) != "Master Research":
                degree_type = response.xpath(
                    "//h1[@class='page-title']//text()").extract()
                clear_space(degree_type)
                degree_type = ''.join(degree_type)
                item['degree_name'] = degree_type
                print("item['degree_name']: ", item['degree_name'])
                programme = degree_type
                if "Master of " in degree_type:
                    programme = degree_type.replace("Master of ", "").replace(
                        "Bachelor of", "").strip()
                    programme = ''.join(programme)
                item['programme_en'] = programme
                print("item['programme_en']: ", item['programme_en'])

                other = response.xpath(
                    "//html//article/blockquote[1]//text()").extract()
                item['other'] = clear_lianxu_space(other)
                # print("item['other']: ", item['other'])

                overview = response.xpath(
                    "//html//article/section[@class='section'][1]").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # if item['overview_en'] == "":
                #     print("***overview_en为空")
                # print("item['overview_en']: ", item['overview_en'])

                degree_description = response.xpath(
                    "//div[@id='show-less-0']|//section[@id='accordion-program']/p"
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(degree_description))
                # if item['degree_overview_en'] == "":
                #     print("***degree_overview_en为空")
                # print("item['degree_overview_en']: ", item['degree_overview_en'])

                # //html//section[@id='accordion-program']/div[@class='table-responsive']//tr[2]/td[2]
                duration = response.xpath(
                    "//strong[contains(text(),'Duration')]/../following-sibling::td[1]//text()"
                ).extract()
                clear_space(duration)
                # print("duration: ", duration)
                duration = ', '.join(duration)
                duration_re = re.findall(r"\d\ssemesters|\d\ssemester",
                                         duration)
                if len(duration_re) > 0:
                    for d in duration_re:
                        item['duration'] = duration.replace(d, "").replace(
                            "(", "").replace(")", "").strip()
                else:
                    item['duration'] = duration.replace("(", "").replace(
                        ")", "").strip()
                # print("item['duration']: ", item['duration'])

                start_date = response.xpath(
                    "//strong[contains(text(),'Starting semesters')]/../following-sibling::td[1]//text()"
                ).extract()
                clear_space(start_date)
                # print("start_date: ", start_date)
                monthDict = {
                    "january": "01",
                    "february": "02",
                    "march": "03",
                    "april": "04",
                    "may": "05",
                    "june": "06",
                    "july": "07",
                    "august": "08",
                    "september": "09",
                    "october": "10",
                    "november": "11",
                    "december": "12",
                    "jan": "01",
                    "feb": "02",
                    "mar": "03",
                    "apr": "04",
                    "may": "05",
                    "jun": "06",
                    "jul": "07",
                    "aug": "08",
                    "sep": "09",
                    "oct": "10",
                    "nov": "11",
                    "dec": "12",
                    "sept": "09",
                }
                std = []
                start_date_re = re.findall(
                    r"january|february|march|april|may|june|july|august|september|october|november|december",
                    ','.join(start_date), re.I)
                # print(start_date_re)
                if len(start_date_re) > 0:
                    for s in start_date_re:
                        std_tmp = monthDict.get(s.lower())
                        if std_tmp is not None:
                            std.append(std_tmp)
                std = list(set(std))
                item['start_date'] = ','.join(std).replace(
                    "0", "").strip().strip(",").strip()
                # print("item['start_date']: ", item['start_date'])

                career = response.xpath(
                    "//div[@id='collapse-field_pgm_prof_out']|//a[@class='collapsed'][contains(text(),'Professional outcomes')]/../../.."
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # if item['career_en'] == "":
                #     print("***career_en为空")
                # print("item['career_en']: ", item['career_en'])

                modules = response.xpath(
                    "//div[@id='collapse-field_pgm_str_sub']|//a[@class='collapsed'][contains(text(),'Structure and subjects')]/../../.."
                ).extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # if item['modules_en'] == "":
                #     print("***modules_en为空")
                # print("item['modules_en']: ", item['modules_en'])

                tuition_fee = response.xpath(
                    "//span[contains(@data-prefix,'Program fees 2019:')]//text()|//strong[contains(text(),'Program fees 2019')]/../text()|"
                    "//strong[contains(text(),'2019 fees:')]/../text()"
                ).extract()  # 2019.03.18 星期一
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = tuition_fee_re[0].replace(
                        ",", "").strip()
                print("item['tuition_fee']: ", item['tuition_fee'])

                entry_requirements = response.xpath(
                    "//div[@id='collapse-field_pgm_ent_req']|//a[@data-toggle='collapse'][contains(text(),'Entry requirements')]/../../..|"
                    "//h4[contains(text(),'English language proficiency requirements')]/preceding-sibling::*"
                ).extract()  # 2019.03.18 星期一
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(entry_requirements))
                if item['rntry_requirements_en'] == "":
                    print("***rntry_requirements_en为空")
                print("item['rntry_requirements_en']: ",
                      item['rntry_requirements_en'])

                # "https://bond.edu.au/intl/future-students/bond-international/information-international-students/international-english-language-testing-requirements"
                ielt_desc_dict = {
                    "Doctor of Physiotherapy":
                    "IELTS score 7.0 No sub score less than 7.0",
                    "Master of Occupational Therapy":
                    "IELTS score 7.0 No sub score less than 7.0",
                    "Master of Psychology (Clinical)":
                    "IELTS score 7.0 No sub score less than 7.0",
                    "Doctor of Philosophy":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Master by Research":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Master of Science by Research (Health Sciences)":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Master of Philosophy":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Graduate Certificate in Nutrition":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Graduate Diploma in Nutrition":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Graduate Diploma of Psychological Science":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Master of Architecture":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Master of Nutrition and Dietetic Practice":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Master of Sports Science":
                    "IELTS score 7.0 No sub score less than 6.5",
                    "Graduate Certificate in TESOL":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Graduate Certificate in TESOL (Online)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Arts (Coursework)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Arts (TESOL)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Arts (TESOL) (online)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Communication":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Communication (Professional)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Criminology":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Criminology (Professional)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of International Relations":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of International Relations (Professional)":
                    "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                    "Master of Building Surveying":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Construction Practice":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Construction Practice (Professional)":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Project Management":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Project Management (Professional)":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Sport Management":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Sport Management (Professional)":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Sustainable Environments and Planning":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Sustainable Environments and Planning (Professional)":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Valuation and Property Development":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Valuation and Property Development (Professional)":
                    "IELTS score 6.5 No sub score less than 6.0",
                    "Master of Actuarial Practice":
                    "IELTS score 6.0 No sub score less than 6.0",
                    "Master of Actuarial Science":
                    "IELTS score 6.0 No sub score less than 6.0",
                    "Master of Actuarial Science (Specialisation)":
                    "IELTS score 6.0 No sub score less than 6.0",
                    "Master of Finance":
                    "IELTS score 6.0 No sub score less than 6.0",
                    "Master of Finance (Professional)":
                    "IELTS score 6.0 No sub score less than 6.0",
                    "Master of Financial Management":
                    "IELTS score 6.0 No sub score less than 6.0",
                    "Master of Financial Management (Professional)":
                    "IELTS score 6.0 No sub score less than 6.0",
                }
                item['ielts_desc'] = ielt_desc_dict.get(item['degree_name'])
                # print("item['ielts_desc']: ", item['ielts_desc'])

                if item['ielts_desc'] is not None:
                    ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                    if len(ieltlsrw) == 2:
                        item["ielts"] = ieltlsrw[0]
                        item["ielts_l"] = ieltlsrw[1]
                        item["ielts_s"] = ieltlsrw[1]
                        item["ielts_r"] = ieltlsrw[1]
                        item["ielts_w"] = ieltlsrw[1]
                    elif len(ieltlsrw) == 1:
                        item["ielts"] = ieltlsrw[0]
                        item["ielts_l"] = ieltlsrw[0]
                        item["ielts_s"] = ieltlsrw[0]
                        item["ielts_r"] = ieltlsrw[0]
                        item["ielts_w"] = ieltlsrw[0]
                    elif len(ieltlsrw) == 5:
                        item["ielts"] = ieltlsrw[0]
                        item["ielts_l"] = ieltlsrw[2]
                        item["ielts_s"] = ieltlsrw[2]
                        item["ielts_r"] = ieltlsrw[2]
                        item["ielts_w"] = ieltlsrw[1]
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                item['apply_desc_en'] = remove_class(
                    clear_lianxu_space([
                        """<section class="section" id="section-8551"> <a id="application" name="application" class="anchor" ></a><h2 class="field field-name-field-title field-type-text field-label-hidden"> Application essentials</h2><div class="panel-group" id="accordion-8552" role="tablist" aria-multiselectable="true"><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8553"> Application process </a></h4></div><div id="collapse-8553" class="panel-collapse collapse"><div class="panel-body"><h3>Australian students</h3><p>Applications for most Bond programs can be lodged at any time directly to the University. With the exception of the <a href="https://bond.edu.au/intl/future-students/study-bond/search-program/medicine-bond">Medical Program</a>, you do not need to go through QTAC and your Bond application will not affect your QTAC application for other university programs. 
Apply direct to Bond University via our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>International students - full degree</h3><p>Apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a> or through a representative in your country.</p><h3>English language students</h3><p>Apply to study English at Bond University College, located on the Bond University campus, by selecting your desired program below:</p><ul><li><a href="https://apply.bond.edu.au/">English for Academic Purposes</a></li><li><a href="https://college.bond.edu.au/apply-english">General English</a></li></ul><h3>Diploma, university preparation or foundation program students</h3><p>You can apply for your academic pathway through our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>Outbound exchange - Bond students</h3><p>Undergraduate Bond students must have completed two semesters prior to application while postgraduate Bondies can apply from their first semesters. Applications received in first and second semesters will be pending GPA requirements of 65% and above. Find out how to <a href="https://bond.edu.au/intl/future-students/bond-international/semester-abroad-exchange/outbound-bond">apply for exchange</a>.</p><h3>Inbound exchange students</h3><p>If your home institution has a formal exchange agreement with Bond, you will need to apply through them via their outbound exchange student application process. Your university may set certain academic performance standards for you to qualify for the program. Providing you meet their criteria, your home institution will contact Bond to nominate you as an exchange student and we will contact you to advise that your nomination has been successful. 
You will then be able to apply to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>, which must be accompanied by the required documentation. Exchange students pay their regular tuition fees to their home institution – not to Bond University.</p><h3>Study abroad students</h3><p>Firstly obtain approval from your home institution, then apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>; through a study abroad representative in your country; or through your home university if applicable.</p><ul></ul></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8554"> When can you start? </a></h4></div><div id="collapse-8554" class="panel-collapse collapse"><div class="panel-body"><p>Bond University runs three full semesters each year with intakes in January (Semester 1), May (Semester 2) and September (Semester 3). Our semesters are scheduled to coordinate with the Northern Hemisphere school/university timetables. (You’ll find that most other Australian universities offer only two semesters a year, meaning that you may have to wait until February or July before you can start your international studies.)</p></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8555"> Admissions criteria </a></h4></div><div id="collapse-8555" class="panel-collapse collapse"><div class="panel-body"><p>Bond University is committed to open and transparent admission processes, and to providing detailed information about the options and entry criteria that are relevant for you. 
</p><p>Learn more about our <a href="https://bond.edu.au/intl/future-students/study-bond/how-apply/undergraduate-admissions-criteria">undergraduate admissions criteria</a>. If you have further questions or wish to speak to one of our advisors, contact the <a href="https://bond.edu.au/intl/contact#ofs">Office of Future Students</a>.</p><p>For postgraduate study, the entry requirements are unique to each individual program. <a href="https://bond.edu.au/intl/future-students/study-bond/search-program#postgraduate">Search for your program</a> of interest to find out the specific entry requirements. </p></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8567"> Academic and English language entry requirements </a></h4></div><div id="collapse-8567" class="panel-collapse collapse"><div class="panel-body"><p>In addition to any performance standards stipulated by your home institution, you will also need to meet Bond’s academic and <a href="https://bond.edu.au/intl/future-students/bond-international/information-international-students/english-language-requirements">English language</a> requirements for the study program you have chosen.</p><p>If you need extra instruction, Bond offers <a href="https://college.bond.edu.au/english-at-bond">English classes</a> on campus through Bond University College, as well as a <a href="https://bond.edu.au/intl/program/bond-university-college-foundation-program">Foundation Program</a> to prepare you for university studies in Australia.</p></div></div></div></div></section>"""
                    ]))
                item['apply_proces_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="panel-group" id="accordion-8552" role="tablist" aria-multiselectable="true"><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8553"> Application process </a></h4></div><div id="collapse-8553" class="panel-collapse collapse"><div class="panel-body"><h3>Australian students</h3><p>Applications for most Bond programs can be lodged at any time directly to the University. With the exception of the <a href="https://bond.edu.au/intl/future-students/study-bond/search-program/medicine-bond">Medical Program</a>, you do not need to go through QTAC and your Bond application will not affect your QTAC application for other university programs. Apply direct to Bond University via our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>International students - full degree</h3><p>Apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a> or through a representative in your country.</p><h3>English language students</h3><p>Apply to study English at Bond University College, located on the Bond University campus, by selecting your desired program below:</p><ul><li><a href="https://apply.bond.edu.au/">English for Academic Purposes</a></li><li><a href="https://college.bond.edu.au/apply-english">General English</a></li></ul><h3>Diploma, university preparation or foundation program students</h3><p>You can apply for your academic pathway through our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>Outbound exchange - Bond students</h3><p>Undergraduate Bond students must have completed two semesters prior to application while postgraduate Bondies 
can apply from their first semesters. Applications received in first and second semesters will be pending GPA requirements of 65% and above. Find out how to <a href="https://bond.edu.au/intl/future-students/bond-international/semester-abroad-exchange/outbound-bond">apply for exchange</a>.</p><h3>Inbound exchange students</h3><p>If your home institution has a formal exchange agreement with Bond, you will need to apply through them via their outbound exchange student application process. Your university may set certain academic performance standards for you to qualify for the program. Providing you meet their criteria, your home institution will contact Bond to nominate you as an exchange student and we will contact you to advise that your nomination has been successful. You will then be able to apply to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>, which must be accompanied by the required documentation. Exchange students pay their regular tuition fees to their home institution – not to Bond University.</p><h3>Study abroad students</h3><p>Firstly obtain approval from your home institution, then apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>; through a study abroad representative in your country; or through your home university if applicable.</p><ul></ul></div></div></div></div>"""
                    ]))
                # print(item)
                # print("+++", "Graduate" not in item['degree_name'])
                if "/" not in item['degree_name'] and "online" not in item[
                        'degree_name'].lower(
                        ) and "research" not in item['degree_name'].lower():
                    print("++++++++++++")
                    major_list = response.xpath(
                        "//h2[contains(text(),'Specialist majors')]/following-sibling::div[@role='tablist']/div/div/h4/a//text()|"
                        "//h2[contains(text(),'Specialisations')]/following-sibling::div[@role='tablist']/div/div/h4/a//text()"
                    ).extract()
                    clear_space(major_list)
                    print("major_list: ", major_list)
                    print(len(major_list))

                    if len(major_list) == 0:
                        yield item
                    else:
                        modules_list = response.xpath(
                            "//h2[contains(text(),'Specialist majors')]/following-sibling::div[@role='tablist']/div/div[2]|"
                            "//h2[contains(text(),'Specialisations')]/following-sibling::div[@role='tablist']/div/div[2]"
                        ).extract()
                        print("===", modules_list)
                        print(len(modules_list))
                        if len(modules_list) == len(major_list):
                            for m in range(len(major_list)):
                                item['programme_en'] = major_list[m]
                                item['modules_en'] = remove_class(
                                    clear_lianxu_space([modules_list[m]]))
                                print("item['programme_en']: ",
                                      item['programme_en'])
                                yield item

        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 7

Mostrar archivo

    def parses(self, response):
        """Parse a La Trobe University course page and yield course items.

        Fields are scraped from ``response`` into a single item created by
        ``get_item``.  When the page lists majors/specialisations, each list
        entry is fetched through ``self.getResponse`` and one item is yielded
        per entry; otherwise a single item is yielded whose ``programme_en``
        is derived from the degree name.

        Degree names containing ``/`` never yield anything.

        Args:
            response: the Scrapy response for the course page.

        Yields:
            The populated item (the same object is re-yielded on every
            iteration of the majors loop).
        """
        print(response.url)
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = 'La Trobe University'
        item['url'] = response.url
        item['location'] = 'Melbourne'
        degree_name = response.xpath('//h1/text()').extract()
        print(degree_name)
        degree_name = ''.join(degree_name)
        item['degree_name'] = degree_name
        degree_name_list = response.xpath(
            '//p[contains(text(),"Our Majors are:")]/following-sibling::ul/li//text()|'
            '//p[contains(text(),"Choose from five majors")]/following-sibling::ul[1]/li/text()|'
            '//h3[contains(text(),"Specialisations, majors and minors")]/following-sibling::table/tbody/tr/td[1]//text()'
        ).extract()
        print(degree_name_list)
        modules_url = response.xpath(
            '//ul[@class="list-arrows"]/li[1]/a/@href').extract()
        if modules_url:
            # Best effort: a failed fetch/parse of the structure page must not
            # abort the whole course.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            try:
                modules = self.getResponse(modules_url[0]).xpath(
                    '//h3[contains(text(),"Course structure")]/following-sibling::div/div/table//tr/td/text()'
                )
                item['modules_en'] = clear_long_text(modules)
            except Exception:
                item['modules_en'] = None

        overview = response.xpath(
            '//section[@id="overview"]/div[@class="block"]').extract()
        item['degree_overview_en'] = remove_class(overview)
        rntry = response.xpath('//section[@id="entry-requirements"]').extract()
        item['rntry_requirements_en'] = remove_class(rntry)
        career = response.xpath('//section[@id="career-outcomes"]').extract()
        item['career_en'] = remove_class(career)
        htp = response.xpath('//section[@id="how-to-apply"]').extract()
        item['apply_proces_en'] = remove_class(htp)

        fee = response.xpath(
            '//h3[contains(text(),"tuition fee")]/following-sibling::p[1]/text()'
        ).extract()
        tuition = ''.join(fee).strip().replace(' ', '')
        # Only the first 99 characters of the fee text are stored.
        item['tuition_fee'] = tuition[0:99]
        item['tuition_fee_pre'] = 'AUD'

        duration = response.xpath(
            '//div[contains(text(),"uration")]/following-sibling::div//text()'
        ).extract()
        dura = [float(d) for d in re.findall(r'\d\.?\d?', ''.join(duration))]
        # min() raises ValueError on an empty sequence; pages without a
        # numeric duration keep whatever get_item() initialised.
        if dura:
            item['duration'] = min(dura)
        item['duration_per'] = 1

        start_date = response.xpath(
            '//div[contains(text(),"tart")]/following-sibling::div//text()'
        ).extract()
        start_date = tracslateDate(start_date)
        item['start_date'] = ','.join(start_date)

        ielts = response.xpath(
            '//p[contains(text(),"IELTS")]/text()').extract()
        ielts = get_ielts(ielts)
        if ielts:
            item['ielts'] = ielts.get('IELTS')
            item['ielts_l'] = ielts.get('IELTS_L')
            item['ielts_s'] = ielts.get('IELTS_S')
            item['ielts_r'] = ielts.get('IELTS_R')
            item['ielts_w'] = ielts.get('IELTS_W')

        # Names containing "/" are never yielded, so stop before fetching any
        # per-major page whose result would be discarded.
        if '/' in degree_name:
            return

        if degree_name_list:
            # NOTE(review): the XPath above extracts text nodes, yet each
            # entry is passed to getResponse as if it were a URL - confirm.
            for url in degree_name_list:
                deg_res = self.getResponse(''.join(url))
                deg_overview = deg_res.xpath('//div[@id="overview"]//text()')
                item['overview_en'] = clear_long_text(deg_overview)
                item['programme_en'] = ''.join(deg_res.xpath('//h1/text()'))
                yield item
        else:
            # Prefer the parenthesised part of the degree name; fall back to
            # the name with the "Master of"/"Bachelor of" prefix removed.
            programme = re.findall(r'\(.*\)', degree_name)
            programme = ''.join(programme).replace('(', '').replace(
                ')', '').strip()
            if programme != '':
                item['programme_en'] = programme
            else:
                item['programme_en'] = degree_name.replace(
                    'Master of', '').replace('Bachelor of', '').strip()
            yield item

Ejemplo n.º 8

Mostrar archivo

Archivo: DeakinUniversity_P.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Parse a Deakin University postgraduate course page.

        Course-level fields are scraped from ``response``.  If the page links
        to specialisation pages (hrefs containing ``"specialisation"``), each
        one is downloaded with ``requests`` and an item is yielded per
        specialisation; otherwise one item is yielded for the course itself.
        Degree names in which "Master" occurs two or more times yield
        nothing.

        Any exception raised while scraping is appended to a text file under
        ``scrapySchool_Australian_yan/error/`` and printed; it is not
        re-raised.

        Args:
            response: the Scrapy response for the course page.

        Yields:
            The populated item.
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Deakin University"
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        # Course URLs and their programme names, matched by position.
        links = [
            "http://www.deakin.edu.au/course/master-accounting-and-international-finance-international",
            "http://www.deakin.edu.au/course/master-accounting-and-law-international",
            "http://www.deakin.edu.au/course/master-architecture-international",
            "http://www.deakin.edu.au/course/master-architecture-design-management-international",
            "http://www.deakin.edu.au/course/master-arts-international-relations-international",
            "http://www.deakin.edu.au/course/master-arts-writing-and-literature-international",
            "http://www.deakin.edu.au/course/master-biotechnology-bioinformatics-international",
            "http://www.deakin.edu.au/course/master-business-sport-management-international",
            "http://www.deakin.edu.au/course/master-business-administration-international",
            "http://www.deakin.edu.au/course/master-business-administration-healthcare-management-international",
            "http://www.deakin.edu.au/course/international-master-business-administration-international",
            "http://www.deakin.edu.au/course/master-business-analytics-international",
            "http://www.deakin.edu.au/course/master-clinical-exercise-physiology-international",
            "http://www.deakin.edu.au/course/master-commerce-international",
            "http://www.deakin.edu.au/course/master-communication-international",
            "http://www.deakin.edu.au/course/master-construction-management-international",
            "http://www.deakin.edu.au/course/master-construction-management-professional-international",
            "http://www.deakin.edu.au/course/master-creative-arts-international",
            "http://www.deakin.edu.au/course/master-cultural-heritage-international",
            "http://www.deakin.edu.au/course/master-data-analytics-international",
            "http://www.deakin.edu.au/course/master-dietetics-international",
            "http://www.deakin.edu.au/course/master-education-international",
            "http://www.deakin.edu.au/course/master-financial-planning-international",
            "http://www.deakin.edu.au/course/master-health-economics-international",
            "http://www.deakin.edu.au/course/master-health-promotion-international",
            "http://www.deakin.edu.au/course/master-health-and-human-services-management-international",
            "http://www.deakin.edu.au/course/master-humanitarian-assistance-international",
            "http://www.deakin.edu.au/course/master-information-systems-international",
            "http://www.deakin.edu.au/course/master-information-technology-international",
            "http://www.deakin.edu.au/course/master-information-technology-professional-international",
            "http://www.deakin.edu.au/course/master-international-accounting-international",
            "http://www.deakin.edu.au/course/master-international-finance-international",
            "http://www.deakin.edu.au/course/master-landscape-architecture-international",
            "http://www.deakin.edu.au/course/master-laws-international",
            "http://www.deakin.edu.au/course/master-marketing-international",
            "http://www.deakin.edu.au/course/master-nutrition-and-population-health-international",
            "http://www.deakin.edu.au/course/master-professional-accounting-international",
            "http://www.deakin.edu.au/course/master-professional-accounting-and-finance-international",
            "http://www.deakin.edu.au/course/master-psychology-clinical-international",
            "http://www.deakin.edu.au/course/master-psychology-organisational-international",
            "http://www.deakin.edu.au/course/master-public-health-international",
            "http://www.deakin.edu.au/course/master-science-research-international",
            "http://www.deakin.edu.au/course/master-sustainability-international",
            "http://www.deakin.edu.au/course/master-teaching-early-childhood-international",
            "http://www.deakin.edu.au/course/master-teaching-primary-and-early-childhood-international",
            "http://www.deakin.edu.au/course/master-teaching-primary-and-secondary-international",
            "http://www.deakin.edu.au/course/master-teaching-primary-international",
            "http://www.deakin.edu.au/course/master-teaching-secondary-international",
            "http://www.deakin.edu.au/course/master-teaching-english-to-speakers-other-languages-international",
        ]
        programme_list = [
            "Master of Accounting and International Finance",
            "Master of Accounting and Law",
            "Master of Architecture",
            "Master of Architecture (Design Management)",
            "Master of Arts (International Relations)",
            "Master of Arts (Writing and Literature)",
            "Master of Biotechnology and Bioinformatics",
            "Master of Business (Sport Management)",
            "Master of Business Administration",
            "Master of Business Administration (Healthcare Management)",
            "Master of Business Administration (International)",
            "Master of Business Analytics",
            "Master of Clinical Exercise Physiology",
            "Master of Commerce",
            "Master of Communication",
            "Master of Construction Management",
            "Master of Construction Management (Professional)",
            "Master of Creative Arts",
            "Master of Cultural Heritage",
            "Master of Data Analytics",
            "Master of Dietetics",
            "Master of Education",
            "Master of Financial Planning",
            "Master of Health Economics",
            "Master of Health Promotion",
            "Master of Health and Human Services Management",
            "Master of Humanitarian Assistance",
            "Master of Information Systems",
            "Master of Information Technology",
            "Master of Information Technology (Professional)",
            "Master of International Accounting",
            "Master of International Finance",
            "Master of Landscape Architecture",
            "Master of Laws",
            "Master of Marketing",
            "Master of Nutrition and Population Health",
            "Master of Professional Accounting",
            "Master of Professional Accounting and Finance",
            "Master of Psychology (Clinical)",
            "Master of Psychology (Organisational)",
            "Master of Public Health",
            "Master of Science (Research)",
            "Master of Sustainability",
            "Master of Teaching (Early Childhood)",
            "Master of Teaching (Primary and Early Childhood)",
            "Master of Teaching (Primary and Secondary)",
            "Master of Teaching (Primary)",
            "Master of Teaching (Secondary)",
            "Master of Teaching English to Speakers of Other Languages",
        ]
        # Build the URL -> programme-name lookup (both lists are the same
        # length, so pairing by position loses nothing).
        programme_dict = dict(zip(links, programme_list))
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//div[@class='module__banner-title']/h1//text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Names mentioning "Master" twice or more are skipped entirely.
            if item['degree_name'].count("Master") < 2:
                # Prefer the parenthesised part of the degree name as the
                # programme; otherwise strip the "Master of" prefix.
                programme_re = re.findall(r"\(.+\)", item['degree_name'])
                if programme_re:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Master of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                ielts = response.xpath(
                    "//h3[contains(text(),'English language requirements')]/../following-sibling::*[1]//text()"
                ).extract()
                clear_space(ielts)
                item['ielts_desc'] = ''.join(ielts).strip()

                ielts_d = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_d.get('IELTS')
                item["ielts_l"] = ielts_d.get('IELTS_L')
                item["ielts_s"] = ielts_d.get('IELTS_S')
                item["ielts_r"] = ielts_d.get('IELTS_R')
                item["ielts_w"] = ielts_d.get('IELTS_W')

                duration = response.xpath(
                    "//h3[contains(text(),'Duration')]/../following-sibling::div//text()"
                ).extract()
                clear_space(duration)
                # Keep only the text up to and including "full-time".
                duration_re = re.findall(r".*full[\s\-]time",
                                         ''.join(duration).strip())
                item['duration'] = ''.join(duration_re).strip()

                location = response.xpath(
                    "//div[@class='module__summary--icon-wrapper']//h3[@class='course__subheading'][contains(text(),'Campuses')]/../following-sibling::div//text()"
                ).extract()
                clear_space(location)
                item['location'] = ' '.join(location).strip()
                # Remembered as the fallback for specialisation pages that do
                # not list their own campuses.
                location_tmp = item['location']

                overview = response.xpath(
                    "//h2[contains(text(),'Course information')]/../.."
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))

                modules = response.xpath(
                    "//div[@id='module__course-structure']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))

                start_date = response.xpath(
                    "//li[contains(text(),'Start date:')]//text()").extract()
                clear_space(start_date)
                item['start_date'] = getStartDateMonth(
                    ' '.join(start_date).strip())

                entry_requirements = response.xpath(
                    "//div[@data-section='entry requirements']").extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(entry_requirements))

                tuition_fee = response.xpath(
                    "//div[@class='module__content-panel']//div[@class='module__key-information--item-content']/text()"
                ).extract()
                clear_space(tuition_fee)
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                # A parsed fee of 0 is stored as None.
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None

                career = response.xpath(
                    "//div[@data-section='graduate outcomes']|//div[@data-section='graduate outcomes']/following-sibling::div[1]|"
                    "//h3[contains(text(),'Career outcomes')]/..|//h3[contains(text(),'Career outcomes')]/../following-sibling::div[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))

                how_to_apply = response.xpath(
                    "//h3[contains(text(),'How to apply')]/../..").extract()
                item['apply_desc_en'] = remove_class(
                    clear_lianxu_space(how_to_apply))

                # Raw HTML fragments mentioning "work experience", reduced to
                # plain text and wrapped in a single <p>.
                work_experience_desc_en = re.findall(
                    r"<.{1,100}work\sexperience.{1,100}>", response.text)
                item['work_experience_desc_en'] = "<p>" + remove_tags(
                    remove_class(
                        clear_lianxu_space(work_experience_desc_en))) + "</p>"
                item['work_experience_desc_en'] = item[
                    'work_experience_desc_en'].replace("<p></p>", "").strip()
                if item['work_experience_desc_en'] != "":
                    print("item['work_experience_desc_en']: ",
                          item['work_experience_desc_en'])

                major_list_url = response.xpath(
                    "//h3[contains(text(), 'Specialisations')]/..//a/@href"
                ).extract()
                clear_space(major_list_url)
                print("major_list_url: ", major_list_url)
                print(len(major_list_url))

                major_url_l = [
                    major_url for major_url in major_list_url
                    if "specialisation" in major_url
                ]
                print("major_url_l: ", major_url_l)
                print(len(major_url_l))
                if not major_url_l:
                    item['url'] = response.url
                    print("item['url']2: ", item['url'])
                    yield item
                else:
                    headers_base = {
                        'User-Agent':
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
                    }
                    # NOTE(review): the same item object is mutated and
                    # re-yielded for every specialisation - confirm that
                    # downstream consumers handle that.
                    for major_url in major_url_l:
                        # requests has no default timeout; without one a
                        # stalled server would block this callback forever.
                        # A timeout error is handled by the except below.
                        data = requests.get(major_url,
                                            headers=headers_base,
                                            timeout=30)
                        response_major = etree.HTML(data.text)
                        item['url'] = major_url
                        print("item['url']_major: ", item['url'])

                        programme_major = response_major.xpath(
                            "//div[@class='module__banner-title']/h1//text()")
                        item['programme_en'] = ''.join(programme_major).strip()
                        print("item['programme_en']_major: ",
                              item['programme_en'])

                        location_major = response_major.xpath(
                            "//*[contains(text(),'Campuses')]/../following-sibling::div[1]//text()"
                        )
                        item['location'] = ''.join(location_major).strip()
                        if item['location'] == "":
                            item['location'] = location_tmp

                        overview_en = response_major.xpath(
                            "//h2[contains(text(),'Overview')]/../..")
                        overview_en_str = ''.join(
                            etree.tostring(o, encoding='unicode', method='html')
                            for o in overview_en)
                        item['overview_en'] = remove_class(
                            clear_lianxu_space([overview_en_str]))

                        modules_en = response_major.xpath(
                            "//h2[contains(text(),'Explore units')]/../..")
                        modules_en_str = ''.join(
                            etree.tostring(o, encoding='unicode', method='html')
                            for o in modules_en)
                        item['modules_en'] = remove_class(
                            clear_lianxu_space([modules_en_str]))
                        yield item
        except Exception as e:
            # Record the failure in a per-university error file and carry on
            # instead of letting the exception propagate.
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 9

Mostrar archivo

    def parse(self, response):
        """Scrape a UNSW page and yield one item per non-"Graduate" degree.

        Degree names, minimum durations, entry dates and first-year tuition
        fees are extracted as parallel lists and combined by position, so
        entry ``i`` of each list is assumed to describe the same degree
        (NOTE(review): confirm against the page layout).

        Args:
            response: the downloaded page; ``response.url`` and
                ``response.xpath`` are used.

        Yields:
            A copy of the populated item for every degree whose name does not
            contain "Graduate".

        Any exception raised during extraction (including an IndexError when
        the parallel lists differ in length) is appended to a per-university
        error file and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "The University of New South Wales"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        try:
            department = response.xpath(
                "//div[@class='inlinevideo-inner']//div[@class='contentarea-title']/h3//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            # List of degree names, one per degree shown on the page.
            degree_type = response.xpath(
                "//section//div[@class='degree js-degree']//h5//text()"
            ).extract()
            clear_space(degree_type)
            print(len(degree_type))
            print("degree_type: ", degree_type)

            duration = response.xpath(
                "//section//dt[contains(text(), 'Minimum years')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            print(len(duration))
            print("duration: ", duration)

            start_date = response.xpath(
                "//section//dt[contains(text(), 'Entry')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(start_date)
            print(len(start_date))
            print("start_date: ", start_date)

            tuition_fee = response.xpath(
                "//section//dt[contains(text(), 'Estimated first year tuition')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(tuition_fee)
            print(len(tuition_fee))
            print("tuition_fee: ", tuition_fee)

            careerEle = response.xpath("//section//dl[last()]")
            print(len(careerEle))
            print("careerEle: ", careerEle)

            for i, degree_name in enumerate(degree_type):
                print("-------------------" + str(i) + "-----------------")
                item['degree_name'] = degree_name
                print("item['degree_name']: ", item['degree_name'])

                # Course duration
                item['duration'] = duration[i]
                print("item['duration']: ", item['duration'])

                # Start date(s): "X and Y" is split into separate entries.
                # str.split returns the whole string as a single element when
                # the separator is absent, so no membership check is needed.
                # NOTE(review): replace("0", "") removes every zero, not only
                # leading ones -- confirm against getStartDate's output format.
                start_date_str = "".join(
                    getStartDate(part).replace("0", "") + ","
                    for part in start_date[i].split("and"))
                item['start_date'] = start_date_str.strip().strip(',').strip()
                print("item['start_date']: ", item['start_date'])

                # Tuition fee
                item['tuition_fee'] = tuition_fee[i].replace(
                    "AUD $", "").replace(",", "").strip()
                print("item['tuition_fee']: ", item['tuition_fee'])

                careerRe = careerEle[i].xpath(
                    ".//dt[contains(text(), 'Career Opportunities')]|.//dt[contains(text(), 'Career Opportunities')]/following-sibling::dd[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(careerRe))
                print("item['career_en']: ", item['career_en'])

                if "Graduate" not in item['degree_name']:
                    # Yield a copy: the same item object is mutated on the
                    # next iteration, which would otherwise alter an item a
                    # downstream consumer may still be holding.
                    yield item.copy()
        except Exception as e:
            # Append rather than truncate so earlier failures are not lost.
            with open(".//scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 10

Mostrar archivo

Archivo: CharlesSturtUniversity_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        """Scrape one Charles Sturt University course page.

        Some fields are read from the Scrapy ``response``; location, duration,
        sessions, subjects and entry requirements are read through a Chrome
        WebDriver instance that loads the same URL.

        Args:
            response: the downloaded course page. ``response.meta`` is looked
                up by ``response.url`` to obtain ``major_type1``.

        Yields:
            Nothing when the "Study mode" text is exactly "Online". Otherwise
            the item itself when no specialisations are found, or one copy of
            the item per specialisation with ``programme_en`` set to it.

        Exceptions raised during extraction are appended to a per-university
        error file and printed, not re-raised. The browser is closed on every
        path, including errors and online-only courses.
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Charles Sturt University"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        # NOTE(review): machine-specific chromedriver location.
        driver = webdriver.Chrome(
            r"C:\Users\admin\AppData\Local\Programs\Python\Python36\Lib\site-packages\selenium\chromedriver.exe"
        )
        try:
            driver.implicitly_wait(30)  # implicit wait
            driver.get(response.url)

            try:
                location = driver.find_element(
                    By.XPATH, r"//div[@id='fYear-campus']").text
                print("location: ", location)
                item['location'] = location
                print("item['location']: ", item['location'])

                programme = response.xpath(
                    "//h1[@class='logo-font csu-slogan course course-name']//text()"
                ).extract()
                clear_space(programme)
                programme = ''.join(programme).replace(
                    "(with specialisations)", "").replace("  ", "").strip()
                item['degree_name'] = programme
                print("item['degree_name']: ", item['degree_name'])

                item['programme_en'] = programme.replace("Master of",
                                                         "").strip()
                print("item['programme_en']: ", item['programme_en'])

                degree_overview_en = response.xpath(
                    "//div[@class='col s12 m12 push-l1 l9 overview-text']"
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(degree_overview_en))

                duration = driver.find_element(
                    By.XPATH, r"//div[@id='fYear-duration']").text
                item['duration'] = duration
                print("item['duration']: ", item['duration'])

                start_date = driver.find_element(
                    By.XPATH, r"//div[@id='fYear-sessions']").text
                item['start_date'] = start_date
                print("item['start_date']: ", item['start_date'])

                career = response.xpath(
                    "//div[@class='hasCareerOpps']|//div[@class='section isPostGrad isHDR']"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))

                # Explicit wait: block until the element with id
                # 'subject-div' is present.
                locator = (By.ID, 'subject-div')
                WebDriverWait(driver, 20, 0.5).until(
                    EC.presence_of_element_located(locator))
                modules = driver.find_element(
                    By.XPATH,
                    r"//div[@id='subject-div']").get_attribute('innerHTML')
                item['modules_en'] = remove_class(modules)
                print("item['modules_en']: ", item['modules_en'])

                rntry_requirements_en = driver.find_element(
                    By.XPATH,
                    r"//div[@id='detailCardTeam1']").get_attribute('innerHTML')
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space([rntry_requirements_en]))
                print("item['rntry_requirements_en']: ",
                      item['rntry_requirements_en'])

                # Fixed IELTS scores applied to every course.
                item["ielts"] = '6.5'
                item["ielts_l"] = '6.0'
                item["ielts_s"] = '6.0'
                item["ielts_r"] = '6.0'
                item["ielts_w"] = '6.0'

                department = response.xpath(
                    "//html//nav[@class='breadcrumb-wrapper']//a[3]//text()"
                ).extract()
                clear_space(department)
                item['department'] = ' '.join(department).strip()

                apply_desc_en = response.xpath(
                    "//div[@id='international-app']").extract()
                item['apply_desc_en'] = remove_class(
                    clear_lianxu_space(apply_desc_en))

                deadline = response.xpath(
                    "//div[@class='card']//div[@class='card very-small-international-lower']//text()"
                ).extract()
                clear_space(deadline)
                # The deadline is the text node that follows the
                # "Important dates" heading, when that heading is present.
                deadline_str = ""
                if "Important dates" in deadline:
                    d = deadline.index("Important dates")
                    deadline_str += deadline[d + 1] + " "
                item['deadline'] = getStartDate(deadline_str.strip())

                # Hard-coded fees keyed by degree name with any
                # "(12 subjects)" / "(16 subjects)" suffix removed.
                # NOTE(review): the "Master of Professional Accounting" value
                # has no ':' before the amount, unlike the "Master of
                # Information Technology" entry -- probably a typo; left
                # unchanged.
                feeDict = {
                    "Master of Animal Science": "28800",
                    "Master of Business Administration": "28560",
                    "Master of Commerce": "28560",
                    "Master of Customs Administration": "19200",
                    "Master of Information Technology":
                    "Port Macquarie:20100,CSU Study Centre Melbourne, CSU Study Centre Sydney:29680",
                    "Master of Medical Radiation Science": "28800",
                    "Master of Ministry": "18400",
                    "Master of Philosophy (Lab Based)": "32000",
                    "Master of Philosophy (Non Lab Based)": "29600",
                    "Master of Professional Accounting":
                    "CSU Study Centre Melbourne, CSU Study Centre Sydney28560",
                    "Master of Sustainable Agriculture": "28800",
                    "Master of Terrorism and Security Studies": "19200",
                    "Master of Theology": "18400",
                }
                item['tuition_fee'] = feeDict.get(
                    item['degree_name'].replace("(12 subjects)", "").replace(
                        "(16 subjects)", "").strip())

                # Fallback table keyed by the full degree name, suffix
                # included; only consulted when the first lookup misses.
                fd = {
                    "Master of Business Administration(12 subjects)": "29,712",
                    "Master of Business Administration(16 subjects)": "29,712",
                    "Master of Commerce(12 subjects)": "22,284",
                    "Master of Commerce(16 subjects)": "29,712",
                    "Master of Information Technology(12 subjects)": "23,160",
                    "Master of Information Technology(16 subjects)": "30,880",
                    "Master of Professional Accounting(12 subjects)": "22,284",
                    "Master of Professional Accounting(16 subjects)": "29,712",
                }
                if item['tuition_fee'] is None:
                    item['tuition_fee'] = fd.get(item['degree_name'])

                print("item['tuition_fee']: ", item['tuition_fee'])

                online = response.xpath(
                    "//h2[contains(text(),'Study mode')]/following-sibling::*//text()"
                ).extract()
                clear_space(online)
                print("online: ", online)
                # Courses whose study mode is exactly "Online" are skipped.
                if ''.join(online).strip() != "Online":
                    # Specialisation names (xpath in use since 2019-03-18).
                    major_list = response.xpath(
                        "//div[@id='fYear-specialisation']/ul[1]/li//text()|"
                        "//div[@id='fYear-specialisation']/h3//text()|"
                        "//div[@id='fYear-specialisation']/h6//text()"
                    ).extract()
                    clear_space(major_list)
                    print("major_list: ", major_list)
                    print(len(major_list))

                    if not major_list:
                        yield item
                    else:
                        for major in major_list:
                            item['programme_en'] = major
                            # Yield a copy: the same item object is mutated
                            # again on the next iteration.
                            yield item.copy()
            except Exception as e:
                with open("scrapySchool_Australian_yan/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt",
                          'a',
                          encoding="utf-8") as f:
                    f.write(
                        str(e) + "\n" + response.url +
                        "\n========================\n")
                print("异常：", str(e))
                print("报错url：", response.url)
        finally:
            # Always release the browser; previously it was only closed on the
            # non-"Online" success path, leaking a Chrome process otherwise.
            driver.quit()

Ejemplo n.º 11

Mostrar archivo

Archivo: UniversityofTechnologySydney_P.py Proyecto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "University of Technology Sydney"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.uts.edu.au'
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                '//div[@class="field-item"]/div[contains(@class,"page-title")]/h1//text()'
            ).extract()
            clear_space(programme)
            programme = ''.join(programme).strip()
            item['degree_name'] = programme
            print("item['degree_name']: ", item['degree_name'])

            de_p = re.findall(r"\(.+\)", item['degree_name'])
            de_p = ''.join(de_p).strip()
            item['programme_en'] = item['degree_name'].replace(
                de_p, "").replace("Master of", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//dt[contains(text(),'UAC')]/following-sibling::dd/span//text()"
            ).extract()
            clear_space(start_date)
            print(start_date)
            if len(start_date) > 0:
                start_date_re = re.findall(r"\w+\ssession",
                                           ' '.join(start_date))
                start_date_re = list(set(start_date_re))
                print("start_date_re: ", start_date_re)

                item['start_date'] = ','.join(start_date_re).replace(
                    "(", "").replace(")", "").replace(" session", "").strip()
            print("item['start_date']: ", item['start_date'])

            overview = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__overview field-type-ds field-label-hidden"]'
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))
            item['overview_en'] = item['degree_overview_en']
            print("item['degree_overview_en']: ", item['degree_overview_en'])

            career = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__careers field-type-ds field-label-hidden"]'
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@class='course__structure']").extract()
            if len(modules) == 0:
                print(" 8888")
            #     modules = response.xpath("//div[@class='course__structure']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            location = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__location field-type-ds field-label-hidden"]//p//text()'
            ).extract()
            clear_space(location)
            location = ''.join(location).strip()
            item['location'] = location
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__duration field-type-ds field-label-hidden"]//p//text()'
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            if len(duration) > 0:
                duration = duration[0]
            # print(duration)
            if "or" in duration:
                duration = duration.strip("or").strip()
            mode = re.findall("\w+\stime$", duration)
            # print(mode)
            mode = ''.join(mode)
            # item['mode'] = mode
            # print("item['mode']: ", item['mode'])
            item['duration'] = ''.join(duration.replace(mode, "").strip())
            # print("item['duration']: ", item['duration'])

            # http://cis.uts.edu.au/fees/course-fees.cfm
            # feeDict = {'C04006v6': '15000', 'C04007v7': '15000', 'C04008v6': '15000', 'C04018v6': '19015', 'C04037v6': '17570', 'C04038v8': '18650', 'C04048v7': '18650', 'C04052v4': '19770', 'C04055v4': '15585', 'C04067v7': '18650', 'C04090v5': '17930', 'C04094v5': '17570', 'C04097v2': '17930', 'C04098v3': '17570', 'C04106v5': '16005', 'C04108v3': '14790', 'C04109v7': '14790', 'C04140v11': '16005', 'C04143v8': '20575', 'C04145v4': '20575', 'C04147v5': '22280', 'C04149v4': '21415', 'C04157v8': '19770', 'C04158v4': '19015', 'C04160v7': '20985', 'C04203v4': '14790', 'C04210v1': '16895', 'C04218v5': '19770', 'C04222v1': '19770', 'C04224v4': '20985', 'C04226v4': '17570', 'C04227v3': '17570', 'C04228v2': '16005', 'C04229v3': '17570', 'C04231v2': '15145', 'C04232v3': '15145', 'C04234v1': '19770', 'C04235v2': '17570', 'C04236v3': '22280', 'C04237v3': '18650', 'C04238v3': '18650', 'C04239v2': '14150', 'C04241v2': '18280', 'C04242v1': '20575', 'C04243v3': '17270', 'C04244v1': '13520', 'C04245v1': '14790', 'C04246v2': '16005', 'C04248v1': '16280', 'C04250v2': '22280', 'C04251v1': '20575', 'C04252v2': '19015', 'C04253v2': '19015', 'C04254v1': '14790', 'C04255v1': '12300', 'C04257v1': '11145', 'C04258v3': '18650', 'C04259v2': '18650', 'C04260v2': '18650', 'C04261v2': '18650', 'C04262v1': '14790', 'C04264v1': '22280', 'C04265v2': '18280', 'C04266v1': '17270', 'C04267v1': '18280', 'C04268v1': '13340', 'C04269v2': '13340', 'C04270v1': '17570', 'C04271v2': '17930', 'C04272v2': '17570', 'C04273v2': '17930', 'C04274v1': '17570', 'C04275v1': '17570', 'C04277v2': '17930', 'C04278v2': '17570', 'C04279v2': '16005', 'C04281v2': '18650', 'C04284v2': '14150', 'C04285v1': '15000', 'C04286v1': '18650', 'C04287v1': '18650', 'C04288v1': '15000', 'C04289v1': '18650', 'C04290v1': '15000', 'C04291v1': '14150', 'C04292v1': '16005', 'C04293v2': '17930', 'C04294v1': '15585', 'C04295v2': '19770', 'C04296v2': '19015', 'C04297v2': '19770', 'C04298v1': '14790', 'C04299v1': '18280', 'C04300v1': '18650', 
'C04301v1': '15000', 'C04302v1': '16005', 'C04303v1': '16005', 'C04304v3': '19015', 'C04305v1': '14415', 'C04306v1': '25070', 'C04307v1': '14415', 'C04309v2': '17930', 'C04314v1': '18650', 'C04315v1': '15585', 'C04316v2': '15000', 'C04317v1': '15000', 'C04319v1': '15000', 'C04320v1': '22280', 'C04321v1': '16565', 'C04322v1': '16005', 'C04323v1': '15585', 'C04324v2': '18650', 'C04325v2': '18650', 'C04368v1': '16895', 'C04369v1': '16895', 'C04371v1': '16895', 'C04372v1': '17930', 'C04373v1': '18650', 'C06006v5': '15000', 'C06009v8': '19015', 'C06017v7': '15000', 'C06033v4': '11145', 'C06037v4': '16005', 'C06041v6': '14790', 'C06058v7': '19770', 'C06096v3': '14415', 'C06097v1': '16895', 'C06099v1': '20575', 'C06100v2': '19015', 'C06101v1': '14790', 'C06102v1': '14790', 'C06103v1': '14790', 'C06104v1': '16565', 'C06105v1': '14790', 'C06106v1': '14790', 'C06107v1': '13340', 'C06108v1': '17930', 'C06109v1': '17570', 'C06110v1': '17570', 'C06113v1': '19770', 'C06114v2': '17930', 'C06115v2': '15000', 'C06116v1': '14415', 'C06118v2': '25070', 'C06119v1': '15585', 'C06121v1': '15585', 'C06122v1': '19015', 'C06123v1': '19770', 'C06124v1': '17930', 'C07002v7': '15000', 'C07004v4': '15000', 'C07012v7': '18650', 'C07018v5': '18650', 'C07019v6': '15000', 'C07021v8': '18650', 'C07027v8': '14150', 'C07028v9': '14150', 'C07029v7': '15000', 'C07044v4': '16005', 'C07048v7': '16005', 'C07073v5': '22280', 'C07074v5': '22280', 'C07075v4': '18280', 'C07078v3': '19015', 'C07080v7': '20985', 'C07107v3': '13520', 'C07112v4': '18650', 'C07113v3': '18650', 'C07118v1': '14790', 'C07119v1': '17270', 'C07120v2': '16895', 'C07122v1': '22280', 'C07124v1': '16005', 'C07125v1': '14790', 'C07126v1': '16005', 'C07128v1': '18650', 'C07129v1': '18650', 'C07132v1': '18650', 'C11001v5': '15000', 'C11005v5': '15000', 'C11008v7': '19015', 'C11015v8': '18650', 'C11017v5': '17570', 'C11021v5': '18650', 'C11027v5': '18650', 'C11039v4': '18650', 'C11048v3': '17930', 'C11051v3': '17570', 'C11054v2': '17570', 
'C11125v4': '20575', 'C11128v3': '16005', 'C11130v4': '20575', 'C11142v7': '19770', 'C11145v7': '20985', 'C11198v3': '18650', 'C11199v4': '18650', 'C11206v3': '18650', 'C11210v2': '17270', 'C11211v2': '22280', 'C11215v4': '11145', 'C11216v1': '18280', 'C11217v1': '20575', 'C11223v1': '14790', 'C11225v1': '17270', 'C11227v1': '16895', 'C11229v1': '20575', 'C11230v2': '19015', 'C11232v1': '18280', 'C11234v1': '17270', 'C11235v1': '13340', 'C11236v1': '17930', 'C11237v1': '17570', 'C11238v1': '17930', 'C11239v1': '17570', 'C11242v1': '16005', 'C11245v1': '15000', 'C11247v1': '19770', 'C11248v1': '17930', 'C11249v2': '15000', 'C11254v1': '14415', 'C11257v1': '15000', 'C11260v2': '25070', 'C11262v1': '16005', 'C11264v1': '22280', 'C11265v1': '20575', 'C11270v1': '15000', 'C11271v1': '15000', 'C11274v1': '17930', 'C01001v2': '12810', 'C01002v2': '12810', 'C01003v2': '12810', 'C01004v2': '12810', 'C01005v2': '12810', 'C02001v2': '13850', 'C02018v5': '17570', 'C02019v3': '12810', 'C02020v2': '12810', 'C02024v4': '16005', 'C02025v5': '13520', 'C02026v4': '13520', 'C02028v6': '15000', 'C02029v4': '16280', 'C02030v3': '18280', 'C02031v3': '18280', 'C02037v4': '12810', 'C02039v3': '13340', 'C02041v4': '12810', 'C02047v1': '16280', 'C02048v4': '16005', 'C02050v1': '12810', 'C02056v1': '15000', 'C02057v1': '16005', 'C02058v2': '16005', 'C02059v1': '15000', 'C02060v1': '15000', 'C02061v1': '16005', 'C02062v1': '16005', 'C02063v1': '15000', 'C03001v4': '13850', 'C03002v5': '13850', 'C03012v4': '13850', 'C03017v5': '17570', 'C03018v3': '12810', 'C03024v7': '15000', 'C03025v4': '16280', 'C03026v6': '18280', 'C03029v4': '18280', 'C03032v4': '12810', 'C03034v3': '13340', 'C03044v2': '12810', 'C03046v3': '16005', 'C03047v2': '12810', 'C03048v3': '16005', 'C03049v3': '16005', 'C03050v3': '16005', 'C03051v1': '16280', 'C03053v1': '15000', 'C03054v1': '15000', 'C03055v1': '16005', 'C03056v1': '15000', 'C03057v1': '15000', 'C03058v1': '16005', 'C03059v1': '15000'}
            feeDict = {}
            cod = [
                "C04006v7",
                "C04007v7",
                "C04008v6",
                "C04018v6",
                "C04037v6",
                "C04038v8",
                "C04048v7",
                "C04052v4",
                "C04055v4",
                "C04067v7",
                "C04090v5",
                "C04094v5",
                "C04097v2",
                "C04106v5",
                "C04109v7",
                "C04140v11",
                "C04143v8",
                "C04145v4",
                "C04147v5",
                "C04157v8",
                "C04158v4",
                "C04160v7",
                "C04203v4",
                "C04210v1",
                "C04218v5",
                "C04222v1",
                "C04224v4",
                "C04226v4",
                "C04227v3",
                "C04228v2",
                "C04229v3",
                "C04231v2",
                "C04232v3",
                "C04234v1",
                "C04235v2",
                "C04236v3",
                "C04237v3",
                "C04238v3",
                "C04239v2",
                "C04241v2",
                "C04242v1",
                "C04243v3",
                "C04244v1",
                "C04245v1",
                "C04246v2",
                "C04248v1",
                "C04250v2",
                "C04251v1",
                "C04252v2 A ",
                "C04253v2 B ",
                "C04254v1",
                "C04255v2",
                "C04257v1",
                "C04258v3",
                "C04259v2",
                "C04260v2",
                "C04261v2",
                "C04262v1",
                "C04264v1",
                "C04265v2",
                "C04266v1",
                "C04267v1",
                "C04268v2",
                "C04269v3",
                "C04270v1",
                "C04271v3",
                "C04272v2",
                "C04273v3",
                "C04274v1",
                "C04275v1",
                "C04277v3",
                "C04278v3",
                "C04279v2",
                "C04281v2",
                "C04284v2",
                "C04285v1",
                "C04286v1",
                "C04287v1",
                "C04288v1",
                "C04289v1",
                "C04290v1",
                "C04291v1",
                "C04292v1",
                "C04293v2",
                "C04294v1",
                "C04295v2",
                "C04296v2",
                "C04297v2",
                "C04298v1",
                "C04299v1",
                "C04300v1",
                "C04301v1",
                "C04302v1",
                "C04303v1",
                "C04304v4",
                "C04305v1",
                "C04306v1",
                "C04307v1",
                "C04309v3",
                "C04314v1",
                "C04315v1",
                "C04316v2",
                "C04317v1",
                "C04319v1",
                "C04320v1",
                "C04321v1",
                "C04322v1",
                "C04323v1",
                "C04324v2",
                "C04325v2",
                "C04367v1",
                "C04368v1",
                "C04369v1",
                "C04371v1",
                "C04372v1",
                "C04373v1",
                "C04374v1",
                "C04382v1",
                "C04383v1",
                "C04384v1",
                "C04385v1",
                "C04386v1",
                "C04388v1",
                "C04389v1",
                "C04390v1",
                "C04391v1",
                "C04392v1",
                "C04393v1",
                "C04394v1",
                "C04395v1",
                "C04396v1",
                "C04397v1",
                "C06006v5",
                "C06009v8",
                "C06017v7",
                "C06033v4",
                "C06037v4",
                "C06041v6",
                "C06096v3",
                "C06097v1",
                "C06099v1",
                "C06100v2",
                "C06101v1",
                "C06102v1",
                "C06103v1",
                "C06104v1",
                "C06105v1",
                "C06106v1",
                "C06107v1",
                "C06108v1",
                "C06109v1",
                "C06110v1",
                "C06113v1",
                "C06114v2",
                "C06115v2",
                "C06116v1",
                "C06118v2",
                "C06119v2",
                "C06121v1",
                "C06122v1",
                "C06123v1",
                "C06124v1",
                "C06125v1",
                "C06126v1",
                "C06127v1",
                "C06129v1",
                "C06130v1",
                "C07002v7",
                "C07004v5",
                "C07012v7",
                "C07018v5",
                "C07019v6",
                "C07021v8",
                "C07028v9",
                "C07029v7",
                "C07044v4",
                "C07048v7",
                "C07073v5",
                "C07074v5",
                "C07078v3",
                "C07080v7",
                "C07107v3",
                "C07112v4",
                "C07113v3",
                "C07118v1",
                "C07119v1",
                "C07120v2",
                "C07122v1",
                "C07124v1",
                "C07125v1",
                "C07126v1",
                "C07128v1",
                "C07129v1",
                "C07132v1",
                "C07135v1",
                "C07136v1",
                "C07137v1",
                "C07140v1",
                "C07141v1",
                "C11001v5",
                "C11005v6",
                "C11008v7",
                "C11015v8",
                "C11017v5",
                "C11021v5",
                "C11027v5",
                "C11039v4",
                "C11048v3",
                "C11125v4",
                "C11128v3",
                "C11130v4",
                "C11142v7",
                "C11145v7",
                "C11198v3",
                "C11199v4",
                "C11206v3",
                "C11210v2",
                "C11211v2",
                "C11215v4",
                "C11216v1",
                "C11217v1",
                "C11223v1",
                "C11225v1",
                "C11227v1",
                "C11229v1",
                "C11230v2",
                "C11232v1 C ",
                "C11234v1",
                "C11235v1",
                "C11236v1",
                "C11237v1",
                "C11238v1",
                "C11239v1",
                "C11242v1",
                "C11245v1",
                "C11247v1",
                "C11249v3",
                "C11254v1",
                "C11257v1",
                "C11260v2",
                "C11262v1",
                "C11264v1",
                "C11265v1",
                "C11269v1",
                "C11270v1",
                "C11271v1",
                "C11274v1",
                "C11275v1",
                "C11276v1",
                "C11277v1",
                "C11282v1",
                "C11283v1",
                "C11285v1",
                "C11287v1",
                "C11289v1",
                "C11292v1 ",
            ]
            fee = [
                "15750",
                "15750",
                "15750",
                "19960",
                "18445",
                "19580",
                "19580",
                "20755",
                "16360",
                "19580",
                "18825",
                "18445",
                "18825",
                "16800",
                "15525",
                "16800",
                "21600",
                "21600",
                "23390",
                "20755",
                "19960",
                "22030",
                "15525",
                "17735",
                "20755",
                "20755",
                "22030",
                "18445",
                "18445",
                "16800",
                "18445",
                "15900",
                "15900",
                "20755",
                "18445",
                "23390",
                "19580",
                "19580",
                "14855",
                "19190",
                "21600",
                "18130",
                "14195",
                "15525",
                "16800",
                "17090",
                "23390",
                "21600",
                "19960",
                "19960",
                "15525",
                "12915",
                "11700",
                "19580",
                "19580",
                "19580",
                "19580",
                "15525",
                "23390",
                "19190",
                "18130",
                "19190",
                "14005",
                "14005",
                "18445",
                "18825",
                "18445",
                "18825",
                "18445",
                "18445",
                "18825",
                "18445",
                "16800",
                "19580",
                "14855",
                "15750",
                "19580",
                "19580",
                "15750",
                "19580",
                "15750",
                "14855",
                "16800",
                "18825",
                "16360",
                "20755",
                "19960",
                "20755",
                "15525",
                "19190",
                "19580",
                "15750",
                "16800",
                "16800",
                "19960",
                "15135",
                "26320",
                "15135",
                "18825",
                "19580",
                "16360",
                "15750",
                "15750",
                "15750",
                "23390",
                "16800",
                "16800",
                "16360",
                "19580",
                "19580",
                "19960",
                "15750",
                "15750",
                "15750",
                "18825",
                "19580",
                "23850",
                "19580",
                "19580",
                "15525",
                "15525",
                "25300",
                "19190",
                "19190",
                "19190",
                "19190",
                "19190",
                "19190",
                "17735",
                "19960",
                "16800",
                "16800",
                "15750",
                "19960",
                "15750",
                "11700",
                "16800",
                "15525",
                "15135",
                "17735",
                "21600",
                "19960",
                "15525",
                "15525",
                "15525",
                "16800",
                "15525",
                "15525",
                "14005",
                "18825",
                "18445",
                "18445",
                "20755",
                "18825",
                "15750",
                "15135",
                "26320",
                "15900",
                "16360",
                "19960",
                "20755",
                "18825",
                "18445",
                "15750",
                "23850",
                "15525",
                "25300",
                "15750",
                "15750",
                "19580",
                "19580",
                "15750",
                "19580",
                "14855",
                "15750",
                "16800",
                "16800",
                "23390",
                "23390",
                "19960",
                "22030",
                "14195",
                "19580",
                "19580",
                "15525",
                "18130",
                "17735",
                "23390",
                "16800",
                "15525",
                "16800",
                "19580",
                "19580",
                "19580",
                "19580",
                "19190",
                "19190",
                "17735",
                "19190",
                "15750",
                "15750",
                "19960",
                "19580",
                "18445",
                "19580",
                "19580",
                "19580",
                "18825",
                "21600",
                "16800",
                "21600",
                "20755",
                "22030",
                "19580",
                "19580",
                "19580",
                "18130",
                "23390",
                "11700",
                "19190",
                "21600",
                "15525",
                "18130",
                "17735",
                "21600",
                "19960",
                "19190",
                "18130",
                "14005",
                "18825",
                "18445",
                "18825",
                "18445",
                "16800",
                "15750",
                "20755",
                "15750",
                "15135",
                "15750",
                "26320",
                "16800",
                "23390",
                "21600",
                "19960",
                "15750",
                "15750",
                "18825",
                "18445",
                "15750",
                "23850",
                "19580",
                "25300",
                "19190",
                "19190",
                "17735",
                "19190",
            ]
            for i in range(len(cod)):
                feeDict[cod[i]] = fee[i]
            # //div[@class='sidebar__info sidebar--info-codes']//dl/dd[1]
            feeIndex = response.xpath(
                "//div[@class='sidebar__info sidebar--info-codes']//dl/dd[1]//text()"
            ).extract()
            clear_space(feeIndex)
            print("---", feeIndex)
            v_re = re.findall(r"version\s\d", ''.join(feeIndex))
            print(v_re, "***")
            if feeIndex:
                feeIndexe = feeIndex[1] + ''.join(v_re).replace(
                    "version ", "v").strip()
                print('===', feeIndexe)
                item['tuition_fee'] = feeDict.get(feeIndexe)
            print("item['tuition_fee']: ", item['tuition_fee'])

            # //h4[@class='collapsible__title'][contains(text(),'Admission requirements')]/following-sibling::div[1]
            entry_requirements = response.xpath(
                "//h4[@class='collapsible__title'][contains(text(),'Admission requirements')]/following-sibling::div[1]"
            ).extract()
            entry_requirements_str = ''.join(entry_requirements).strip()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            ieltsRe = re.findall(r"IELTS[0-9a-zA-Z:\.,\s]*;",
                                 entry_requirements_str)
            # print("ieltsRe: ", ieltsRe)
            toeflRe = re.findall(r"internet\sbased[0-9a-zA-Z:\.,\s-]*;",
                                 entry_requirements_str)
            # print("toeflRe: ", toeflRe)
            item['ielts_desc'] = ''.join(ieltsRe).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            item['toefl_desc'] = ''.join(toeflRe).strip()
            # print("item['toefl_desc']: ", item['toefl_desc'])

            ieltsDict = {
                "Master of Teaching in Secondary Education":
                "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
                "Master of Advanced Journalism":
                "7.0 overall,with a writing score of 6.5",
                "Master of Pharmacy": "7.0 overall,7.0 in each subtest",
                "Master of Pharmacy (International)":
                "7.0 overall,7.0 in each subtest",
                "Master of Clinical Psychology": "7.0 overall,writing 7.0",
                "Master of Physiotherapy": "7.0 overall,writing 7.0",
            }
            if item['ielts_desc'] == "":
                item['ielts_desc'] = ieltsDict.get(item['degree_name'])
                if item['ielts_desc'] is None:
                    item['ielts_desc'] = "6.5 overall, writing 6.0"
            # print("item['ielts_desc']: ", item['ielts_desc'])

            toeflDict = {
                "Master of Teaching in Secondary Education":
                "102-109 overall,speaking 23-27,listening 23-27,reading 23-27,writing 24",
                "Master of Advanced Journalism":
                "94-101 overall,with a writing score of 24 ",
                "Master of Pharmacy":
                "94 overall,reading 24,listening 24,speaking 23,writing 27 ",
                "Master of Pharmacy (International)":
                "94 overall,reading 24,listening 24,speaking 23,writing 27  ",
                "Master of Clinical Psychology": "94-101 overall,writing 23 ",
                "Master of Physiotherapy": "94-101 overall,writing 23 ",
            }
            if item['toefl_desc'] == "":
                item['toefl_desc'] = ieltsDict.get(item['degree_name'])
                if item['toefl_desc'] is None:
                    item['toefl_desc'] = "79-93 overall, writing 21"
            # print("item['toefl_desc']: ", item['toefl_desc'])

            ielts_d = get_ielts(item['ielts_desc'])
            item["ielts"] = ielts_d.get('IELTS')
            item["ielts_l"] = ielts_d.get('IELTS_L')
            item["ielts_s"] = ielts_d.get('IELTS_S')
            item["ielts_r"] = ielts_d.get('IELTS_R')
            item["ielts_w"] = ielts_d.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            department = response.xpath(
                "//div[@class='field field-dddd-view-modeluts-course-course__part-of field-type-ds field-label-hidden']//div[@class='field-item']//p/a/text()"
            ).extract()
            clear_space(department)
            department = ''.join(department).replace("UTS:", "").strip()
            item['department'] = department
            # print("item['department']: ", item['department'])

            apply_procces = response.xpath(
                "//h4[contains(text(),'International students')]/..").extract(
                )
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_procces))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 12

Mostrar archivo

Archivo: EdithCowanUniversity_P.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Scrape an Edith Cowan University master's course page into items.

        Fills a ``ScrapyschoolAustralianYanItem`` with the degree name,
        programme, overview, entry requirements, modules, career text,
        location, duration and start date found on ``response``.

        Nothing is yielded when the page heading contains "Master" two or
        more times, when the course location is "online", or when the page
        states that the course is not offered on-campus to international
        students on a student visa.

        Otherwise, if the page links to no major pages, the course item is
        yielded once.  If it does, every linked major page is downloaded with
        ``requests`` and parsed with ``lxml``, and one item is yielded per
        major with ``url``, ``programme_en``, ``location``, ``overview_en``
        and ``modules_en`` overwritten from that page.

        Any exception raised while scraping is appended to a text file under
        ``scrapySchool_Australian_yan/error/`` and printed; it is not
        re-raised.

        :param response: the Scrapy response for the course page.
        :return: a generator of populated items.
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Edith Cowan University"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        # Lookup table: course URL -> programme title.  The two lists are
        # parallel, entry i of one belongs to entry i of the other.
        links = [
            "http://www.ecu.edu.au/degrees/courses/master-of-business-administration",
            "http://www.ecu.edu.au/degrees/courses/master-of-business-administration-international",
            "http://www.ecu.edu.au/degrees/courses/master-of-clinical-nursing",
            "http://www.ecu.edu.au/degrees/courses/master-of-communication",
            "http://www.ecu.edu.au/degrees/courses/master-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/master-of-counselling-and-psychotherapy",
            "http://www.ecu.edu.au/degrees/courses/master-of-critical-care-paramedicine",
            "http://www.ecu.edu.au/degrees/courses/master-of-cyber-security",
            "http://www.ecu.edu.au/degrees/courses/master-of-design",
            "http://www.ecu.edu.au/degrees/courses/master-of-disaster-and-emergency-response",
            "http://www.ecu.edu.au/degrees/courses/master-of-education",
            "http://www.ecu.edu.au/degrees/courses/master-of-education-advanced",
            "http://www.ecu.edu.au/degrees/courses/master-of-engineering",
            "http://www.ecu.edu.au/degrees/courses/master-of-environmental-management",
            "http://www.ecu.edu.au/degrees/courses/master-of-environmental-science",
            "http://www.ecu.edu.au/degrees/courses/master-of-exercise-science-strength-and-conditioning",
            "http://www.ecu.edu.au/degrees/courses/master-of-finance-and-banking",
            "http://www.ecu.edu.au/degrees/courses/master-of-human-resource-management",
            "http://www.ecu.edu.au/degrees/courses/master-of-international-hospitality-management",
            "http://www.ecu.edu.au/degrees/courses/master-of-management-information-systems",
            "http://www.ecu.edu.au/degrees/courses/master-of-marketing-and-innovation-management",
            "http://www.ecu.edu.au/degrees/courses/master-of-midwifery-practice",
            "http://www.ecu.edu.au/degrees/courses/master-of-neurological-rehabilitation",
            "http://www.ecu.edu.au/degrees/courses/master-of-nurse-education",
            "http://www.ecu.edu.au/degrees/courses/master-of-nursing",
            "http://www.ecu.edu.au/degrees/courses/master-of-nursing-graduate-entry",
            "http://www.ecu.edu.au/degrees/courses/master-of-nursing-nurse-practitioner",
            "http://www.ecu.edu.au/degrees/courses/master-of-nutrition-and-dietetics",
            "http://www.ecu.edu.au/degrees/courses/master-of-occupational-health-and-safety",
            "http://www.ecu.edu.au/degrees/courses/master-of-occupational-hygiene-and-toxicology",
            "http://www.ecu.edu.au/degrees/courses/master-of-paramedic-practitioner",
            "http://www.ecu.edu.au/degrees/courses/master-of-professional-accounting",
            "http://www.ecu.edu.au/degrees/courses/master-of-professional-communication",
            "http://www.ecu.edu.au/degrees/courses/master-of-professional-design",
            "http://www.ecu.edu.au/degrees/courses/master-of-project-management",
            "http://www.ecu.edu.au/degrees/courses/master-of-psychology",
            "http://www.ecu.edu.au/degrees/courses/master-of-public-health",
            "http://www.ecu.edu.au/degrees/courses/master-of-science-assisted-reproductive-technology",
            "http://www.ecu.edu.au/degrees/courses/master-of-screen-studies",
            "http://www.ecu.edu.au/degrees/courses/master-of-teaching-early-childhood",
            "http://www.ecu.edu.au/degrees/courses/master-of-teaching-primary",
            "http://www.ecu.edu.au/degrees/courses/master-of-teaching-secondary",
            "http://www.ecu.edu.au/degrees/courses/master-of-technology-petroleum-engineering",
        ]
        programme_list = [
            "Master of Business Administration",
            "Master of Business Administration International",
            "Master of Clinical Nursing",
            "Master of Communication",
            "Master of Computer Science",
            "Master of Counselling and Psychotherapy",
            "Master of Critical Care Paramedicine",
            "Master of Cyber Security",
            "Master of Design",
            "Master of Disaster and Emergency Response",
            "Master of Education",
            "Master of Education (Advanced)",
            "Master of Engineering",
            "Master of Environmental Management",
            "Master of Environmental Science",
            "Master of Exercise Science (Strength and Conditioning)",
            "Master of Finance and Banking",
            "Master of Human Resource Management",
            "Master of International Hospitality Management",
            "Master of Management Information Systems",
            "Master of Marketing and Innovation Management",
            "Master of Midwifery Practice",
            "Master of Neurological Rehabilitation",
            "Master of Nurse Education",
            "Master of Nursing",
            "Master of Nursing (Graduate Entry)",
            "Master of Nursing (Nurse Practitioner)",
            "Master of Nutrition and Dietetics",
            "Master of Occupational Health and Safety",
            "Master of Occupational Hygiene and Toxicology",
            "Master of Paramedic Practitioner",
            "Master of Professional Accounting",
            "Master of Professional Communication",
            "Master of Professional Design",
            "Master of Project Management",
            "Master of Psychology",
            "Master of Public Health",
            "Master of Science (Assisted Reproductive Technology)",
            "Master of Screen Studies",
            "Master of Teaching (Early Childhood)",
            "Master of Teaching (Primary)",
            "Master of Teaching (Secondary)",
            "Master of Technology (Petroleum Engineering)",
        ]
        programme_dict = dict(zip(links, programme_list))
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])

        # The international course-overview box appears with two different
        # class attributes; every overview XPath below is the union over both.
        info_classes = (
            "courseOverview__info courseOverview__info--international courseOverview__info--noOnline",
            "courseOverview__info courseOverview__info--international",
        )
        campuses = ("joondalup", "mtLawley", "bunbury")
        location_xpath = "|".join(
            "//div[@class='%s']//div[@class='studyCampus__location "
            "studyCampus__location--%s studyCampus__location--active']"
            "/h4//text()" % (info_class, campus)
            for info_class in info_classes
            for campus in campuses)
        duration_xpath = "|".join(
            "//div[@class='%s']//p[contains(text(),'year')]//text()" %
            info_class for info_class in info_classes)

        # Loop-invariant settings for the blocking major-page downloads.
        headers_base = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        }
        # Without a timeout requests.get() can wait forever on a stalled
        # server and freeze the whole crawl; a timeout error is handled by
        # the except clause below like any other scraping failure.
        request_timeout = 30

        try:
            programme = response.xpath(
                "//h2[contains(text(), 'Master of')]//text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Pages whose heading mentions "Master" two or more times are
            # not scraped (presumably combined degrees - confirm).
            if len(re.findall(r"Master", item['degree_name'])) >= 2:
                return

            # A parenthesised suffix names the programme, except for the
            # "(Graduate Entry)" / "(Advanced)" qualifiers, which are ignored.
            programme_re = re.findall(
                r"\(.+\)",
                item['degree_name'].replace("(Graduate Entry)",
                                            "").replace("(Advanced)", ""))
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Master of", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            overview = response.xpath("//span[@id='overview']/..").extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))

            entry_requirements = response.xpath(
                "//div[@id='before-you-start']").extract()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))

            modules = response.xpath(
                "//h4[contains(text(),'Course structure')]|//div[@class='structure-heading']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            career = response.xpath(
                "//h4[contains(text(),'Employment opportunities')]|//h4[contains(text(),'Employment opportunities')]/following-sibling::*[1]|"
                "//h4[contains(text(),'Possible future job titles')]|//h4[contains(text(),'Possible future job titles')]/following-sibling::*[1]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            if item['career_en'] == "":
                print("***career_en 为空")
            print("item['career_en']: ", item['career_en'])

            location = response.xpath(location_xpath).extract()
            clear_space(location)
            item['location'] = ', '.join(location).strip().strip(',').strip()
            # Kept as the fallback for major pages that list no campus.
            location_tmp = item['location']
            print("item['location']: ", item['location'])

            duration = response.xpath(duration_xpath).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_text = ''.join(duration)
            duration_re = re.findall(r"Start\sSemester.*",
                                     duration_text.strip())
            print(duration_re, "===")
            item['start_date'] = ','.join(duration_re)
            # Whatever remains after removing the start-semester text is
            # stored as the duration.
            item['duration'] = duration_text.replace(''.join(duration_re),
                                                     "").strip()
            print("item['duration']: ", item['duration'])

            other = response.xpath(
                "//span[@class='courseOverview__subHeader alert-warning alert']//text()"
            ).extract()
            item['other'] = ''.join(other)
            print("item['other']: ", item['other'])

            # English-language requirements are not extracted here; the
            # original author noted this page for them:
            # https://www.ecu.edu.au/future-students/course-entry/english-competency#toggle-2

            if item['location'].lower() == "online":
                return
            if "This course is not offered for study on-campus to international students with a student visa" in item[
                    'other']:
                return

            major_list_url = response.xpath(
                "//div[@class='section']//ul[@class='core-units']//a/@href"
            ).extract()
            clear_space(major_list_url)
            print("major_list_url: ", major_list_url)
            print(len(major_list_url))

            if not major_list_url:
                item['url'] = response.url
                print("item['url']2: ", item['url'])
                yield item
                return

            # NOTE(review): the same item object is mutated and re-yielded
            # for every major; if a pipeline keeps a reference to it across
            # iterations a per-major copy may be needed - confirm.
            for major_url in major_list_url:
                data = requests.get(major_url,
                                    headers=headers_base,
                                    timeout=request_timeout)
                response_major = etree.HTML(data.text)
                item['url'] = major_url
                print("item['url']_major: ", item['url'])

                programme_major = response_major.xpath(
                    "//span[@id='overview']/following-sibling::h2//text()")
                item['programme_en'] = ''.join(programme_major).strip()
                print("item['programme_en']_major: ", item['programme_en'])

                location_major = response_major.xpath(
                    "//div[@class='studyCampus__location studyCampus__location--active']/h4//text()"
                )
                item['location'] = ','.join(
                    location_major).strip().strip(',').strip()
                if item['location'] == "":
                    item['location'] = location_tmp
                print("item['location']_major: ", item['location'])

                # lxml returns elements, so serialise them back to HTML
                # before handing them to the text-cleaning helpers.
                overview_en = response_major.xpath(
                    "//span[@id='overview']/..")
                overview_en_str = ''.join(
                    etree.tostring(node, encoding='unicode', method='html')
                    for node in overview_en)
                item['overview_en'] = remove_class(
                    clear_lianxu_space([overview_en_str]))
                print("item['overview_en']_major: ", item['overview_en'])

                modules_en = response_major.xpath(
                    "//h4[contains(text(),'Structure')]|//h4[contains(text(),'Course structure')]|//div[@class='structure-heading']"
                )
                modules_en_str = ''.join(
                    etree.tostring(node, encoding='unicode', method='html')
                    for node in modules_en)
                item['modules_en'] = remove_class(
                    clear_lianxu_space([modules_en_str]))
                print("item['modules_en']_major: ", item['modules_en'])
                yield item
        except Exception as e:
            # Best-effort scraping: record the failure with its URL and move
            # on instead of aborting the crawl.
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 13

Mostrar archivo

    def parse(self, response):
        """Build postgraduate coursework items from a UNSW handbook page.

        Reads department, campus, duration, overview, entry requirements and
        course structure from ``response`` and yields one item per
        specialisation listed on the page. When no specialisation is listed,
        a single item named after the page's module title is yielded. Pages
        on which no "Master of" / "Juris Doctor" award text is found yield
        nothing.

        Any exception raised while parsing is appended to the per-university
        error file and printed; it is not re-raised.
        """
        # NOTE: the earlier link-discovery callbacks (faculty page -> course
        # links on www.handbook.unsw.edu.au) are disabled, so this callback
        # treats the response as a course page directly.
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "The University of New South Wales"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        try:
            department = response.xpath("//li[3]//a//text()").extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            location = response.xpath("//div[@role='complementary']//strong[@tabindex='0'][contains(text(),'Campus')]/../p//text()").extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath(
                "//div[contains(@role,'complementary')]//strong[contains(@tabindex,'0')][contains(text(),'Typical duration')]/../p//text()").extract()
            # Called for its effect on the list; the return value is unused.
            clear_space(duration)
            print("duration: ", duration)
            duration_text = ''.join(duration)
            # Only durations expressed in "Years" are recorded.
            if "Years" in duration_text:
                item['duration'] = duration_text.replace("Years", "").strip()
                item['duration_per'] = 1
            print("item['duration']: ", item['duration'])

            overview_en = response.xpath("//div[@id='readMoreToggle1']/div[1]").extract()
            # The same cleaned overview is stored under both keys.
            item['degree_overview_en'] = item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            item["rntry_requirements_en"] = None
            rntry_requirements_en = response.xpath("//div[@class='m-accordion-group m-accordion-with-header']//div[@class='m-accordion-body']").extract()
            if rntry_requirements_en:
                item['rntry_requirements_en'] = remove_class(clear_lianxu_space(rntry_requirements_en))
            print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            modules_en = response.xpath("//div[@id='structure']/div[position()<last()]").extract()
            if modules_en:
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            print("item['modules_en']: ", item['modules_en'])

            # Start date, tuition fee and career fields are not scraped here;
            # earlier XPath attempts for them were disabled.

            # List of degree types
            degree_name = response.xpath("//div[@role='complementary']//p[contains(text(),'Master of')]/text()|//div[@role='complementary']//p[contains(text(),'Juris Doctor')]/text()").extract()
            clear_space(degree_name)
            if len(degree_name) > 0:
                item['degree_name'] = ', '.join(degree_name).replace("-", "").strip()
            else:
                item['degree_name'] = None
            print("item['degree_name']: ", item['degree_name'])

            programme_list = response.xpath('//div[@data-hbui-filter-item="specialisation"]/a/div/p//text()').extract()
            print("programme_list: ", programme_list)

            # Pages without a recognised award name produce no items.
            if item['degree_name'] is not None:
                if len(programme_list) == 0:
                    programme_en = response.xpath("//span[@data-hbui='module-title']//text()").extract_first(None)
                    print("programmen: ", programme_en)
                    item['programme_en'] = programme_en
                    yield item
                else:
                    # NOTE(review): the same item object is re-yielded with
                    # 'programme_en' overwritten each time; confirm that
                    # downstream consumers do not hold on to earlier yields.
                    for prog in programme_list:
                        item['programme_en'] = prog
                        yield item
        except Exception as e:
            # Append rather than truncate so earlier failures are not lost
            # when a later page also fails.
            with open(".//scrapySchool_Australian_yan/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 14

Mostrar archivo

Archivo: FederationUniversityAustralia_P.py Proyecto: histudent/python_spider

    def parse(self, response):
        """Turn one Federation University program JSON response into an item.

        The response body is decoded as JSON. Programs whose ``type_desc``
        contains "research", and programs whose title contains "Master" two
        or more times, yield nothing. Otherwise a single item is filled from
        the JSON fields and yielded.

        Any exception raised while parsing is appended to the per-university
        error file and printed; it is not re-raised.
        """

        def strip_entities(text):
            # Replace every "&name;" style entity with a single space.
            return re.sub(r"&\w+;", " ", text)

        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Federation University Australia"
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        # Store the human-facing course URL instead of the API endpoint.
        informationUrl = response.url.replace(
            "https://study.federation.edu.au/api/programs_plan_code",
            "https://study.federation.edu.au/#/course")
        print("------------", informationUrl)
        item['url'] = informationUrl
        try:
            informationDict = json.loads(response.body)
            print(informationDict)

            international_details = informationDict.get(
                "international_details")

            type_desc = informationDict.get("type_desc")
            print("type_desc: ", type_desc)
            if "research" in type_desc.lower():
                return

            item['degree_name'] = informationDict.get("title")
            print("item['degree_name']: ", item['degree_name'])

            # Titles mentioning "Master" more than once are skipped
            # (presumably combined degrees -- confirm).
            if len(re.findall(r"Master", item['degree_name'])) >= 2:
                return

            # Prefer the parenthesised part of the title as the programme
            # name; otherwise drop the "Master of" prefix.
            programme_re = re.findall(r"\(.+\)", item['degree_name'])
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Master of", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            item['location'] = international_details.get("teaching_location")
            item['department'] = informationDict.get("school_dept")

            overviewHtml = strip_entities(informationDict.get("outline"))
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space([overviewHtml]))

            item['duration'] = international_details.get("duration")
            item['start_date'] = informationDict.get("commences")

            # Career text comes from two fields; a missing or null field is
            # treated as empty instead of aborting the whole program.
            careers = informationDict.get("careers") or []
            career1Str = "".join(
                "<p>" + entry.get("name") + "</p>" for entry in careers)
            career2 = informationDict.get("career_opportunities") or ""
            if "<p>" in career2:
                career2 = strip_entities(career2).replace("<br>", " ")
                career2 = remove_class(clear_lianxu_space([career2]))
            item['career_en'] = career1Str + career2

            item['tuition_fee'] = international_details.get("annual_fee_int")

            entry_requirements = strip_entities(
                international_details.get("academic_entry_requirements"))
            entry_requirements1 = international_details.get(
                "extra_requirements")
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space([entry_requirements])) + remove_class(
                    clear_lianxu_space([entry_requirements1]))

            IELTS = strip_entities(
                international_details.get("english_language_requirement"))
            # Wrap the fragment in a document so the <p> text can be
            # selected with XPath.
            IELTSHtml = '<!DOCTYPE html><html><body>' + IELTS + '</body></html>'
            html = etree.fromstring(IELTSHtml)
            item['ielts_desc'] = ''.join(html.xpath("//p//text()"))

            ielts_dict = get_ielts(item['ielts_desc'])
            for field, key in (("ielts", 'IELTS'),
                               ("ielts_l", 'IELTS_L'),
                               ("ielts_s", 'IELTS_S'),
                               ("ielts_r", 'IELTS_R'),
                               ("ielts_w", 'IELTS_W')):
                item[field] = ielts_dict.get(key)

            modules = informationDict.get("domestic_details").get(
                "program_structures").get("majors")
            # Only the first major's year levels are used.
            if modules:
                modules = modules[0].get("year_levels")
            modulesStr = "".join(str(m) for m in modules)
            # Strip list/dict punctuation left over from str() of the
            # JSON structures.
            item['modules_en'] = "<div>" + modulesStr.translate(
                str.maketrans("", "", "[]{}")) + "</div>"

            item['apply_proces_en'] = informationDict.get("apply_link")

            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 15

Mostrar archivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "The University of New South Wales"
        # item['country'] = 'Australia'
        # item['website'] = 'http://www.unsw.edu.au/'
        item['url'] = response.url
        item['degree_type'] = 3
        item['teach_time'] = 'research'
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//div[@class='internalContentWrapper']/h1[1]//text()"
            ).extract()
            programme = ''.join(programme)
            programme = programme.split("-")
            item['programme_en'] = programme[0].strip()
            print("item['programme_en']: ", item['programme_en'])

            department = response.xpath(
                "//div[@class='summary']/p[1]/a[1]//text()").extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            location = response.xpath(
                "//div[@class='summary']/p[3]/text()").extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                "//div[@class='summary']/p[5]/text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']: ", item['duration'])
            print("item['duration_per']: ", item['duration_per'])

            degree_type = response.xpath(
                "//strong[contains(text(),'Award(s):')]/../following-sibling::p//text()"
            ).extract()
            if "View program information for " in degree_type:
                degree_type.remove("View program information for ")
            if "previous years" in degree_type:
                degree_type.remove("previous years")
            # print(degree_type)
            item['degree_name'] = '/'.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            allcontent = response.xpath(
                "//div[@class='internalContentWrapper']//text()").extract()
            # clear_space(allcontent)
            # print("allcontent: ", allcontent)

            overview = response.xpath(
                "//a[@name='programobjectives']/preceding-sibling::*[position()<last()-4]"
            ).extract()
            if len(overview) == 0:
                overview = response.xpath(
                    "//a[@name='description']/preceding-sibling::*[1]/following-sibling::*[position()<3]"
                ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            if item['overview_en'] == "":
                print("***over***")
                # overview
                if "Program Description" in allcontent:
                    overviewIndex = allcontent.index("Program Description")
                    if "Program Objectives and Graduate Attributes" in allcontent:
                        overviewIndexEnd = allcontent.index(
                            "Program Objectives and Graduate Attributes")
                        overview = allcontent[overviewIndex:overviewIndexEnd]
                        item['overview_en'] = "<div>" + clear_lianxu_space(
                            overview) + "</div>"
            # print("item['overview_en']: ", item['overview_en'])

            # modules -- 有些过多，需要删除
            if "Program Structure" in allcontent:
                modulesIndex = allcontent.index("Program Structure")
                if "General Education Requirements" in allcontent:
                    modulesIndexEnd = allcontent.index(
                        "General Education Requirements")
                elif "Academic Rules" in allcontent:
                    modulesIndexEnd = allcontent.index("Academic Rules")
                else:
                    modulesIndexEnd = -1
                modules = allcontent[modulesIndex:modulesIndexEnd]
                item['modules_en'] = "<div>" + clear_lianxu_space(
                    modules) + "</div>"
            # print("item['modules_en']: ", item['modules_en'])

            # degree_description      可能不准
            degree_description = response.xpath(
                "//a[@name='academicrules']/preceding-sibling::div[1]"
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(degree_description))
            print("item['degree_overview_en']: ", item['degree_overview_en'])

            # IELTS、TOEFL
            if item['department'] == "Faculty of Law":
                item["ielts"] = "7"
                item["ielts_l"] = "6.5"
                item["ielts_s"] = "6.5"
                item["ielts_r"] = "6.5"
                item["ielts_w"] = "7"
                item["toefl"] = "96"
                item["toefl_l"] = "23"
                item["toefl_s"] = "23"
                item["toefl_r"] = "23"
                item["toefl_w"] = "27"
            elif item['department'] == "UNSW Business School":
                item["ielts"] = "7"
                item["ielts_l"] = "6.5"
                item["ielts_s"] = "6.5"
                item["ielts_r"] = "6.5"
                item["ielts_w"] = "7"
                item["toefl"] = "96"
                item["toefl_l"] = "23"
                item["toefl_s"] = "23"
                item["toefl_r"] = "23"
                item["toefl_w"] = "27"
            elif "Master of Teaching" in item['degree_name']:
                item["ielts"] = "7.5"
                item["ielts_l"] = "8"
                item["ielts_s"] = "8"
                item["ielts_r"] = "7"
                item["ielts_w"] = "7"
                item["toefl"] = "102"
                item["toefl_l"] = "28"
                item["toefl_s"] = "26"
                item["toefl_r"] = "24"
                item["toefl_w"] = "27"
            elif "Master of Psychology" in item['degree_name']:
                item["ielts"] = "7"
                item["ielts_l"] = "7"
                item["ielts_s"] = "7"
                item["ielts_r"] = "7"
                item["ielts_w"] = "7"
                item["toefl"] = "94"
                item["toefl_l"] = "24"
                item["toefl_s"] = "23"
                item["toefl_r"] = "24"
                item["toefl_w"] = "27"
            else:
                item["ielts"] = "6.5"
                item["ielts_l"] = "6"
                item["ielts_s"] = "6"
                item["ielts_r"] = "6"
                item["ielts_w"] = "6"
                item["toefl"] = "90"
                item["toefl_l"] = "22"
                item["toefl_s"] = "22"
                item["toefl_r"] = "22"
                item["toefl_w"] = "23"

            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))
            # item['application_open_date'] = "30 November 2017    31 May 2018"
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>Find a research area</h5> <p>You need to satisfy minimum eligibility requirements for your proposed postgraduate research program. You also need to match your area of interest to one of our faculties. Visit the <a href="https://research.unsw.edu.au/">Graduate Research School website</a> for more.</p> </div> </div> </div> </div> <div class="checklistitem "> <div class="checklistitem-number "> <div class="checklistitem-number-inner" data-list-item-number="554">2</div> </div> <div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>Find a supervisor and develop a research description</h5> <p>Most faculties require researchers to secure an agreement for supervision prior to the submission of a formal application. Descriptions of your research interest should be around one page in length.</p> </div> </div> </div> </div> <div class="checklistitem "> <div class="checklistitem-number "> <div class="checklistitem-number-inner" data-list-item-number="555">3</div> </div> <div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>Prepare your documentation</h5> <p>All documents must be in English or include a certified English translation. Scanned copies are sufficient.</p> </div> </div> </div> </div> <div class="checklistitem "> <div class="checklistitem-number "> <div class="checklistitem-number-inner" data-list-item-number="556">4</div> </div> <div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>Submit your application</h5> <p>Submit your application at <a href="https://apply.unsw.edu.au/">UNSW Apply Online</a>. Click 'Register now' and fill out your details. You will receive an ID and password to log in. 
After logging in, go to 'My Applications' and follow the directions. Please submit only one application. You can apply for scholarships within your application.</p> </div> </div> </div> </div> <div class="checklistitem "> <div class="checklistitem-number "> <div class="checklistitem-number-inner" data-list-item-number="557">5</div> </div> <div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>Track your application</h5> <p>After submitting your application, you will be emailed a confirmation receipt with your UNSW Student ID and a list of any outstanding documentation. You can track the progress of your application through <a href="https://my.unsw.edu.au/" target="_blank">myUNSW</a>. Applications take around 6-8 weeks to process. </p> </div> </div> </div> </div> <div class="checklistitem "> <div class="checklistitem-number "> <div class="checklistitem-number-inner" data-list-item-number="558">6</div> </div> <div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>We will send you an offer</h5> <p>We will send you a full offer if everything is fine or a conditional offer if more steps are required.</p> </div> </div> </div> </div> <div class="checklistitem "> <div class="checklistitem-number "> <div class="checklistitem-number-inner" data-list-item-number="559">7</div> </div> <div class="checklistitem-inner"> <div class="checklistitem-content"> <div class="checklistitem-content-copy rte"> <h5>Accept your offer and enrol online</h5> <p>If you receive a full offer, accept it through <a href="https://my.unsw.edu.au/">myUNSW</a>. You then need to enrol for the start of the semester. Please note, you are not able to do this online. You will be emailed information about the enrolment procedure.</p> </div> </div> </div> """
                ]))
            item[
                'rntry_requirements_en'] = "For postgraduate study, you are generally required to have completed undergraduate studies at a university-type institution. You can find out the exact academic entry requirements for your program in the ‘Coursework program’ section of the international student guide for postgraduates."
            if "Entry Requirements" in allcontent:
                entryIndex = allcontent.index("Entry Requirements")
                if "How to Apply" in allcontent:
                    entryIndexEnd = allcontent.index("How to Apply")
                else:
                    entryIndexEnd = -1
                entry_requirements = allcontent[entryIndex:entryIndexEnd]
                item['rntry_requirements_en'] = item[
                    'rntry_requirements_en'] + clear_lianxu_space(
                        entry_requirements)

            # 中国要求高考分数
            avgDict = {
                'Art Theory': '80',
                'Design (Honours)': '80',
                'Fine Arts (Honours)': '80',
                'Media Arts (Honours)': '80',
                'Arts': '80',
                'Arts and Business': '83',
                'Arts / Education (Secondary)': '80',
                'Commerce / Education (Secondary)': '88',
                'Design (Honours) / Education (Secondary)': '80',
                'Economics / Education (Secondary)': '85',
                'Fine Arts / Education (Secondary)': '80',
                'Media Arts (Honours) / Education (Secondary)': '80',
                'Science / Education (Secondary)': '80',
                'Criminology & Criminal Justice': '80',
                'International Studies': '83',
                'Media (Communication & Journalism)': '80',
                'Media (Public Relations & Advertising)': '80',
                'Media (Screen & Sound Production)': '80',
                'Music': '80',
                'Music / Education (Secondary)': '80',
                'Social Research & Policy': '80',
                'Social Work (Honours)': '80',
                'Architectural Studies': '85',
                'City Planning (Honours)': '80',
                'Computational Design': '80',
                'Construction Management & Property': '80',
                'Industrial Design (Honours)': '80',
                'Interior Architecture (Honours)': '80',
                'Landscape Architecture (Honours)': '80',
                'Actuarial Studies': '88',
                'Commerce': '88',
                'Commerce (International)': '88',
                'Economics': '85',
                'Information Systems': '83',
                'Engineering (Honours)': '85',
                'Engineering (H) / M Biomed Engineering': '85',
                'Engineering (H) (Elec) / M Engineering (Elec)': '88',
                'Engineering (Honours) (Civil with Architecture)': '85',
                'Engineering (H) / Engineering Science': '85',
                'Science (Food Science & Technology)': '85',
                'Science (Comp Science)': '85',
                'Laws (Dual Degree)': 'NA',
                'Med MD': 'NA',
                'Exercise Physiology': '83',
                'Aviation (Flying)': '80',
                'Aviation (Management)': '80',
                'Engineering (Materials Science & Engineering)': '80',
                'Environmental Management': '80',
                'Life Sciences': '80',
                'Medical Science': '85',
                'Medicinal Chemistry (Honours)': '83',
                'Nanoscience (Honours)': '80',
                'Optometry / Science': 'NA',
                'Psychological Science': '80',
                'Psychology': '88',
                'Science': '80',
                'Advanced Science (Honours)': '85',
                'Science and Business': '83',
                'Science (Advanced Mathematics) (Honours)': '85',
                'Biotechnology (Honours)': '80',
                'Science (International)': '80'
            }
            item['average_score'] = avgDict.get(item['programme_en'])
            print("item['average_score']: ", item['average_score'])

            tuition_feeDict = {
                'Master of Business':
                '30,720',
                'Master of Logistics Management':
                '30,720',
                'Doctor of Philosophy (Anatomy)':
                '43,440',
                'Master of Sustainment Management':
                '30,720',
                'Doctor of Philosophy (Kirby Institute)':
                '43,440',
                'Master of Curating and Cultural Leadership':
                '32,400',
                'Master of Laws - Corporate & Commercial Law':
                '41,520',
                'Doctor of Philosophy (Creative Practice)':
                '30,000',
                'Doctor of Philosophy (Accounting)':
                '33,120',
                'Doctor of Philosophy (Aviation)':
                '40,080',
                'Graduate Diploma in Biomedical Engineering':
                '41,280',
                'Master of Capability Management':
                '30,720',
                'Doctor of Philosophy (Medical Education)':
                '41,280',
                'Doctor of Philosophy (Art, Design and Media)':
                '31,680',
                'Master of Criminal Justice & Criminology':
                '39,000',
                'Doctor of Philosophy (Humanities)':
                '30,000',
                'Master of Accounting and Business Information Technology':
                '42,480',
                'Graduate Diploma of Aviation Management':
                '29,340',
                'Master of Biomedical Engineering':
                '41,280',
                'Master of Cyber Security':
                '30,720',
                'Doctor of Philosophy (Pathology)':
                '43,440',
                'Master of Design':
                '32,400',
                'Master of Laws - Criminal Justice & Criminology':
                '41,520',
                'Master of Arts (Research)':
                '30,000',
                'Master of Commerce':
                '42,480',
                'Master of Aviation Management':
                '39,120',
                'Master of Engineering Science (Biomedical Engineering) ':
                '41,280',
                'Master of Cyber Security Operations':
                '30,720',
                'Doctor of Philosophy (Physiology and Pharmacology)':
                '43,440',
                'Master of Art':
                '32,400',
                'Master of Dispute Resolution':
                '41,520',
                'Master of Arts and Social Sciences (Combined)':
                '31,440',
                'Master of Commerce (Extension)':
                '42,480',
                'Master of Science (Aviation) (Research)':
                '40,080',
                'Graduate Diploma in Engineering Science (Chemical Process Engineering) ':
                '41,280',
                'Master of Cyber Security, Strategy and Diplomacy':
                '30,720',
                'Graduate Certificate in Pharmaceutical Medicine':
                '20,760',
                'Master of Fine Arts (Research)':
                '31,680',
                'Master of Laws - Dispute Resolution':
                '41,520',
                'Master of Journalism and Communication':
                '31,440',
                'Master of Financial Analysis':
                '42,480',
                'Doctor of Philosophy (Biochemistry & Molecular Genetics)':
                '40,080',
                'Graduate Diploma in Engineering Science (Food Process Engineering) ':
                '41,280',
                'Master of Information Technology (Specialisation)':
                '30,720',
                'Master of Pharmaceutical Medicine':
                '20,760',
                'Master of Environmental Law & Policy':
                '39,240',
                'Master of Music (Research)':
                '30,000',
                'Master of Philosophy (Commerce and Economics)':
                '33,120',
                'Doctor of Philosophy (Microbiology and Immunology)':
                '40,080',
                'Graduate Diploma in Food Science':
                '41,280',
                'Master of Project Management':
                '30,720',
                'Master of Science (Anatomy) (Research) ':
                '43,440',
                'Master of Laws - Environmental Law':
                '41,520',
                'Master of Music Education (Research)':
                '30,000',
                'Master of Professional Accounting':
                '42,480',
                'Master of Science (Biochemistry & Molecular Genetics) (Research)':
                '40,080',
                'Master of Engineering Science (Chemical Process Engineering) ':
                '41,280',
                'Master of Strategic People Management':
                '30,720',
                'Master of Science (Pathology) (Research)':
                '43,440',
                'Master of Human Rights Law & Policy':
                '41,520',
                'Master of Public Relations and Advertising':
                '31,440',
                'Master of Professional Accounting (Extension)':
                '42,480',
                'Master of Science (Microbiology and Immunology) (Research)':
                '40,080',
                'Master of Engineering Science (Food Process Engineering)':
                '41,280',
                'Master of Science (Physiology and Pharmacology) (Research)':
                '43,440',
                'Master of Laws - Human Rights & Social Justice':
                '41,520',
                'Doctor of Philosophy (Education)':
                '30,000',
                'Doctor of Philosophy (Actuarial Studies)':
                '33,120',
                'Doctor of Philosophy (Applied Geology)':
                '40,080',
                'Master of Food Science':
                '41,280',
                'Doctor of Philosophy in Medicine (Prince of Wales Clinical School)':
                '43,440',
                'Master of Laws - Innovation Law':
                '41,520',
                'Master of Actuarial Studies':
                '42,480',
                'Doctor of Philosophy (Biological Science)':
                '40,080',
                'Graduate Certificate in Engineering Science (Civil Engineering or Geospatial Engineering)':
                '20,640',
                'Doctor of Philosophy in Medicine (South Western Sydney Clinical School)':
                '43,440',
                'Master of Laws - International Business & Economic Law':
                '41,520',
                'Master of Actuarial Studies (Extension)':
                '42,480',
                'Doctor of Philosophy (Climate Science)':
                '40,080',
                'Master of Engineering (Civil Engineering) ':
                '41,280',
                'Doctor of Philosophy in Medicine (St George and Sutherland Clinical School)':
                '43,440',
                'Master of International Law & International Relations':
                '36,480',
                'Master of Education (Assessment and Evaluation)':
                '31,440',
                'Doctor of Philosophy (Environmental Management)':
                '40,080',
                'Master of Engineering (Environmental Engineering) ':
                '41,280',
                'Doctor of Philosophy in Medicine (St Vincents Clinical School)':
                '43,440',
                'Master of Laws - International Law':
                '41,520',
                'Master of Education (Educational Psychology)':
                '31,440',
                'Doctor of Philosophy (Taxation and Business Law)':
                '33,120',
                'Doctor of Philosophy (Geography)':
                '40,080',
                'Master of Engineering Science (Civil Engineering)':
                '41,280',
                'Master of Medicine (Research)':
                '43,440',
                'Doctor of Juridical Science (Research)':
                '38,400',
                'Master of Education (Educational Studies)':
                '31,440',
                'Master of Business Law':
                '42,000',
                'Graduate Certificate in Environmental Management':
                '18,480',
                'Master of Engineering Science (Environmental Engineering)':
                '41,280',
                'Master of Science in Medicine (Kirby Institute) (Research)':
                '41,280',
                'Doctor of Philosophy (Research)':
                '38,400',
                'Master of Education (Gifted Education)':
                '31,440',
                'Graduate Diploma in Environmental Management':
                '36,960',
                'Master of Engineering Science (Geospatial Engineering)':
                '41,280',
                'Master of Science in Medicine (Prince of Wales Clinical School) (Research)':
                '43,440',
                'JD (Juris Doctor)':
                '43,680',
                'Master of Education (Higher Education)':
                '31,440',
                'AGSM MBA':
                '40,800',
                'Master of Marine Science and Management':
                '39,120',
                'Master of Engineering Science (Geotechnical Engineering and Engineering Geology)':
                '41,280',
                'Master of Science in Medicine (South Western Sydney School) (Research)':
                '43,440',
                'Master of Laws':
                '41,520',
                'Master of Education (Research)':
                '30,000',
                'Master of Science (Applied Geology) (Research)':
                '40,080',
                'Master of Engineering Science (Project Management)':
                '41,280',
                'Master of Science in Medicine (St George and Sutherland Clinical School) (Research)':
                '43,440',
                'Master of Laws (Research)':
                '38,400',
                'Master of Education (Special Education)':
                '31,440',
                'Master of Science (Biological Science) (Research)':
                '40,080',
                'Master of Engineering Science (Structural Engineering)':
                '41,280',
                'Master of Science in Medicine (St Vincents Clinical School)':
                '43,440',
                'Master of Law, Media & Journalism':
                '36,480',
                'Master of Education (Teacher Professional Learning)':
                '31,440',
                'Master of Science (Climate Sciences) (Research)':
                '40,080',
                'Master of Engineering Science (Sustainable Systems)':
                '41,280',
                'Doctor of Philosophy (Psychiatry)':
                '43,440',
                'Master of Laws - Media & Technology Law':
                '41,520',
                'Master of Education (TESOL)':
                '31,440',
                'Doctor of Philosophy (Economics)':
                '33,120',
                'Master of Science (Geography) (Research)':
                '40,080',
                'Master of Engineering Science (Transportation Engineering)':
                '41,280',
                'Graduate Certificate in Forensic Mental Health':
                '20,760',
                'Master of Education (Visual Arts Education)':
                '31,440',
                'Doctor of Philosophy (Biotechnology)':
                '40,080',
                'Master of Engineering Science (Water Engineering: Catchments to Coasts)':
                '41,280',
                'Graduate Diploma in Forensic Mental Health':
                '31,140',
                'Master of Educational Leadership':
                '31,440',
                'Graduate Diploma (Research)':
                '39,120',
                'Master of Engineering Science (Water, Wastewater and Waste Engineering )':
                '41,280',
                'Master of Forensic Mental Health':
                '41,520',
                'Master of Educational Leadership (Research)':
                '30,000',
                'Master of Economics':
                '42,480',
                'Master of Science (Biotechnology) (Research)':
                '40,080',
                'Graduate Certificate in Computing':
                '20,640',
                'Master of Philosophy in Forensic Mental Health':
                '43,440',
                'Master of Teaching (Secondary)':
                '47,160',
                'Doctor of Philosophy (Chemistry)':
                '40,080',
                'Graduate Diploma in Information Technology':
                '41,280',
                'Master of Science (Psychiatry) (Research)':
                '43,440',
                'Doctor of Philosophy (Banking and Finance)':
                '33,120',
                'Master of Science (Chemistry) (Research)':
                '40,080',
                'Master of Information Technology':
                '41,280',
                'Master of Applied Linguistics':
                '31,440',
                'Doctor of Philosophy (Materials Science and Engineering)':
                '40,080',
                'Graduate Diploma in Engineering Science (Energy Systems) ':
                '41,280',
                'Doctor of Philosophy (Public Health & Community Medicine)':
                '43,440',
                'Master of Engineering (Materials Science and Engineering) (Research)':
                '40,080',
                'Graduate Diploma in Engineering Science (Electrical Engineering) ':
                '41,280',
                'Doctor of Public Health (Public Health & Community Medicine)':
                '41,280',
                'Master of Finance':
                '42,480',
                'Master of Materials Technology':
                '39,120',
                'Graduate Diploma of Engineering Science (Telecommunications) ':
                '41,280',
                'Graduate Certificate in Health Management':
                '20,760',
                'Master of Interpreting':
                '31,440',
                'Master of Science (Materials Science and Engineering) (Research)':
                '40,080',
                'Master of Engineering (Telecommunications) ':
                '41,280',
                'Graduate Certificate in Infectious Diseases Intelligence':
                '20,760',
                'Master of Translation':
                '31,440',
                'Master of Financial Planning':
                '42,480',
                'Doctor of Philosophy (Mathematics)':
                '40,080',
                'Master of Engineering (Electrical Engineering)':
                '41,280',
                'Graduate Certificate in International Public Health':
                '20,760',
                'Master of Translation and Interpreting':
                '31,440',
                'Graduate Certificate in Mathematics and Statistics':
                '19,560',
                'Master of Engineering Science (Electrical Engineering)  ':
                '41,280',
                'Graduate Certificate in Public Health':
                '20,760',
                'Doctor of Philosophy (Social Sciences)':
                '30,000',
                'Doctor of Philosophy (Organisation and Management)':
                '33,120',
                'Graduate Diploma in Mathematics and Statistics':
                '39,120',
                'Master of Engineering Science (Energy Systems) ':
                '41,280',
                'Graduate Diploma in Health Management':
                '31,140',
                'Doctor of Public Policy and Governance (Research)':
                '30,000',
                'Master of Financial Mathematics':
                '39,120',
                'Master of Engineering Science (Nuclear Engineering) ':
                '41,280',
                'Graduate Diploma in Infectious Diseases Intelligence':
                '31,140',
                'Doctor of Social Work (Research)':
                '30,000',
                'Master of Mathematics':
                '39,120',
                'Master of Engineering Science (Systems and Control) ':
                '41,280',
                'Graduate Diploma in International Public Health':
                '31,140',
                'Master of Science (Mathematics)(Research)':
                '40,080',
                'Master of Engineering Science (Telecommunications) ':
                '41,280',
                'Graduate Diploma in Public Health':
                '31,140',
                'Doctor of Philosophy (Information Systems and Technology Management)':
                '33,120',
                'Master of Statistics':
                '39,120',
                'Master of Engineering Science (Satellite Systems Engineering)':
                '41,280',
                'Master of Health Administration (Research)':
                '34,320',
                'Master of Development Studies':
                '31,440',
                'Doctor of Philosophy (Optometry)':
                '40,080',
                'Master of Engineering Science':
                '30,720',
                'Master of Health Management':
                '41,520',
                'Master of International Relations':
                '31,440',
                'Doctor of Philosophy (Vision Science)':
                '40,080',
                'Master of Space Engineering':
                '30,720',
                'Master of Health Management (Extension)':
                '41,520',
                'Master of Public Policy and Governance':
                '31,440',
                'Master of Optometry and Vision Science':
                '39,120',
                'Master of Space Operations':
                '30,720',
                'Master of Health Management (Extension)/International Public Health ':
                '41,520',
                'Master of Social Sciences (Research)':
                '30,000',
                'Master of Information Systems Management':
                '42,480',
                'Master of Science (Optometry)(Research)':
                '40,080',
                'Master of Systems Engineering':
                '30,720',
                'Master of Health Professions Education (Research)':
                '34,320',
                'Master of Social Work (Research)':
                '30,000',
                'Master of Science (Vision Science) (Research)':
                '40,080',
                'Graduate Diploma in Engineering Science (Manufacturing Engineering & Management)':
                '41,280',
                'Master of Infectious Diseases Intelligence':
                '41,520',
                'Doctor of Philosophy (Organisation and Management) ':
                '33,120',
                'Doctor of Philosophy (Physics)':
                '40,080',
                'Graduate Diploma in Engineering Science (Mechanical Engineering)':
                '41,280',
                'Master of International Public Health':
                '41,520',
                'Master of Engineering (Mechanical Engineering) ':
                '41,280',
                'Master of International Public Health (Extension) ':
                '41,520',
                'Master of Science (Physics) (Research)':
                '40,080',
                'Master of Engineering Science (Manufacturing Engineering & Management)':
                '41,280',
                'Master of International Public Health (Extension)':
                '41,520',
                'Master of International Business':
                '42,480',
                'Doctor of Philosophy (Psychology)':
                '40,080',
                'Master of Engineering Science (Mechanical Engineering)':
                '41,280',
                'Master of International Public Health (Extension) / Health Management':
                '41,520',
                'Doctor of Philosophy/Master of Psychology (Clinical)':
                '40,080',
                'Graduate Diploma in Mining Engineering':
                '41,280',
                'Master of International Public Health / Health Management':
                '41,520',
                'Doctor of Philosophy/Master of Psychology (Forensic)':
                '40,080',
                'Master of Mining Engineering':
                '41,280',
                'Master of International Public Health / Public Health':
                '41,520',
                'Dual PhD / Master of Psychology (Clinical)':
                '40,080',
                'Graduate Certificate in Petroleum Engineering':
                '30,960',
                'Master of Philosophy in Public Health':
                '34,320',
                'Dual PhD / Master of Psychology (Forensic)':
                '40,080',
                'Graduate Diploma in Engineering Science (Petroleum Engineering)  ':
                '41,280',
                'Master of Public Health':
                '41,520',
                'Doctor of Philosophy (Marketing)':
                '33,120',
                'Master of Psychology (Clinical)':
                '39,120',
                'Master of Engineering Science (Petroleum Engineering)  ':
                '41,280',
                'Master of Public Health (Extension)':
                '41,520',
                'Master of Psychology (Forensic)':
                '38,100',
                'Graduate Diploma of Engineering Science (Photovoltaics and Solar Energy)':
                '41,280',
                'Master of Public Health (Extension) / Health Management ':
                '41,520',
                'Master of Science (Psychology) (Research)':
                '40,080',
                'Graduate Diploma of Engineering Science (Renewable Energy Engineering)':
                '41,280',
                'Master of Public Health (Research)':
                '34,320',
                'Master of Marketing':
                '42,480',
                'Master of Engineering Science (Photovoltaics and Solar Energy)  ':
                '41,280',
                'Master of Public Health / Health Management':
                '41,520',
                'Master of Engineering Science (Renewable Energy Engineering)  ':
                '41,280',
                'Master of Public Health /Master of Health Management (Extension) ':
                '41,520',
                'Master of Public Health/International Public Health (Extension)':
                '41,520',
                'Master of Science (Community Medicine) (Research) ':
                '34,320',
                'Doctor of Philosophy (Rural Health)':
                '43,440',
                'Master of Laws - Corporate, Commercial Law & Taxation':
                '41,520',
                'Master of Science (Rural Health) (Research)':
                '43,440',
                'Master of Laws - Taxation':
                '41,520',
                'Doctor of Philosophy in Surgery (Prince of Wales Clinical School)':
                '43,440',
                'Doctor of Philosophy in Surgery (South Western Sydney Clinical School)':
                '43,440',
                'Master of Taxation':
                '42,480',
                'Doctor of Philosophy in Surgery (St George and Sutherland Clinical School':
                '43,440',
                'Doctor of Philosophy in Surgery (St Vincents Clinical School)':
                '43,440',
                'Master of Surgery (Research) (Prince of Wales Clinical School)':
                '43,440',
                'Master of Surgery (Research) (South Western Sydney Clinical School)':
                '43,440',
                'Master of Surgery (Research) (St George and Sutherland Clinical School) ':
                '43,440',
                'Master of Surgery (Research) (St Vincents Clinical School)':
                '43,440',
                'Doctor of Philosophy (Childhood Cancer)':
                '41,280',
                'Doctor of Philosophy (Obstetrics and Gynaecology)':
                '43,440',
                'Doctor of Philosophy (Paediatrics)':
                '43,440',
                'Graduate Certificate in Reproductive Medicine':
                '20,760',
                "Graduate Certificate in Women's Health Medicine":
                '20,760',
                'Graduate Diploma in Reproductive Medicine':
                '31,140',
                'Master of Reproductive Medicine':
                '41,520',
                'Master of Science (Obstetrics and Gynaecology) (Research)':
                '43,440',
                'Master of Science (Paediatrics) (Research)':
                '43,440',
                "Master of Women's Health Medicine":
                '41,520'
            }
            tuition_fee = tuition_feeDict.get(item['degree_name'])
            if tuition_fee != None:
                tuition_fee = tuition_fee.replace(",", "")
                item['tuition_fee_pre'] = "$"
            item['tuition_fee'] = tuition_fee
            print("item['tuition_fee']: ", item['tuition_fee'])

            # careerDict = {}
            career = response.xpath(
                "//p[contains(text(),'Career Opportunities')]/../../preceding-sibling::*[1]/following-sibling::*[position()<4]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])

            item['apply_pre'] = "$"
            item['apply_fee'] = 100
            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'w',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 16

Mostrar archivo

    def parse_data(self, response):
        """Scrape one Central Queensland University course page into an item.

        Creates an item via ``get_item(ScrapyschoolAustralianYanItem)``, sets
        the fixed school metadata, then fills the remaining fields from XPath
        queries against ``response`` (programme, department, duration, intake,
        career, overviews, modules, English requirements, tuition fee,
        application info, location and entry requirements).

        Args:
            response: The downloaded course page. It must expose ``url``,
                ``xpath()`` and ``meta``; ``meta`` is looked up with the page
                URL as key to obtain ``major_type1``.

        Yields:
            The populated item, when extraction finishes without an exception.

        Any exception raised during extraction is appended, together with the
        page URL, to a text file under ``scrapySchool_Australian_yan/error/``
        and printed; no item is yielded in that case.
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "Central Queensland University"
        item['url'] = response.url
        item['degree_type'] = 2
        item['teach_time'] = 'coursework'
        print("===========================")
        print(response.url)
        # The caller is expected to have stored the major type in meta under
        # the page URL; .get() leaves it as None when it is absent.
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # --- Programme / degree name -------------------------------------
            programme = response.xpath(
                "//h1[@class='program-title']/text()|"
                "//h1[@itemprop='name']//text()").extract()
            # clear_space() is called for its effect on the list; its return
            # value is not used (presumably cleans entries in place - confirm).
            clear_space(programme)
            # Keep everything before the last "-"; the hyphens themselves are
            # dropped by the join below.
            # NOTE(review): a heading without any "-" yields an empty name.
            programme = ''.join(programme).split("-")
            item['programme_en'] = ''.join(programme[:-1]).replace(
                "Master of", "").strip()
            print("item['programme_en']: ", item['programme_en'])
            item['degree_name'] = ''.join(programme[:-1])
            print("item['degree_name']: ", item['degree_name'])

            # --- Department (4th breadcrumb entry) ---------------------------
            department = response.xpath(
                "//ol[@id='breadcrumbs']/li[4]/a//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department)
            print("item['department']: ", item['department'])

            # --- Duration ----------------------------------------------------
            duration = response.xpath(
                "//th[contains(text(),'Duration')]/following-sibling::td[1]//text()|"
                "//span[contains(text(),'DURATION')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(duration)
            item['duration'] = ''.join(duration).strip()
            print("item['duration']: ", item['duration'])

            # --- Intake / start date -----------------------------------------
            start_date = response.xpath(
                "//th[contains(text(),'Intake dates')]/following-sibling::td[1]//text()|"
                "//p[contains(text(),'Term')]//text()").extract()
            clear_space(start_date)
            # The stored value is the concatenated text with commas removed
            # (same result as splitting on "," and re-joining). The former
            # month-name -> number lookup was removed: its result was never
            # assigned to the item.
            item['start_date'] = ''.join(start_date).replace(",", "").strip()

            # --- Career, degree overview, course overview --------------------
            career = response.xpath(
                "//div[@class='careers']|//span[@class='ct-accordion__title'][contains(text(),'Career Opportunities and Outcomes')]/../.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            degree_overview_en = response.xpath(
                "//p[@itemprop='description']/following-sibling::p").extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(degree_overview_en))

            overview1 = response.xpath(
                "//div[@class='tab-content active']/p|//div[@class='tab details-tab']|//span[@class='ct-accordion__title'][contains(text(),'Course Details')]/../.."
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview1))

            # --- Modules: taken from the first linked structure/handbook page.
            # 'modules_en' is left untouched when no such link is present.
            modules_url = response.xpath(
                "//div[@class='tab structure-tab']//a[contains(text(),'click here')]/@href|"
                "//a[contains(text(),'Handbook')]/@href").extract()
            if modules_url:
                item['modules_en'] = self.parse_modules(modules_url[0])

            # --- English language requirements -------------------------------
            # Reference page:
            # https://www.cqu.edu.au/international-students/entry-requirements/english-requirements
            IELTS = response.xpath(
                "//td[contains(text(),'IELTS Academic')]/following-sibling::td[1]//text()|"
                "//li[contains(text(),'IELTS')]//text()").extract()
            clear_space(IELTS)
            item['ielts_desc'] = ''.join(IELTS)
            print("item['ielts_desc']: ", item['ielts_desc'])

            TOEFL = response.xpath(
                "//td[contains(text(),'TOEFL Internet-based')]/following-sibling::td[1]//text()"
            ).extract()
            clear_space(TOEFL)
            item['toefl_desc'] = ''.join(TOEFL)

            # get_ielts()/get_toefl() return mappings keyed by overall and
            # per-skill score names; missing keys become None via .get().
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")

            # --- Tuition fee -------------------------------------------------
            tuition_fee = response.xpath(
                "//div[@class='tab fees-tab']//div[@class='tab-content']//h4//text()"
            ).extract()
            clear_space(tuition_fee)
            # Isolate the text from "Estimated first year fee" onwards, then
            # keep only its digit/whitespace runs and drop the spaces, so a
            # figure written with separators collapses to plain digits.
            tuition_fee_re = re.findall(r"Estimated\sfirst\syear\sfee.*",
                                        ','.join(tuition_fee))
            tuition_fee_re1 = re.findall(r"[\d\s]+", ' '.join(tuition_fee_re))
            item['tuition_fee'] = ''.join(tuition_fee_re1).replace(" ",
                                                                   "").strip()

            # --- Application description and supporting documents ------------
            apply_desc_en = response.xpath(
                "//div[@class='tab apply-tab']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(apply_desc_en))
            print("item['apply_desc_en']: ", item['apply_desc_en'])

            apply_documents_en = response.xpath(
                "//div[contains(text(),'What type of supporting documents do I have to pro')]/.."
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(apply_documents_en))
            print("item['apply_documents_en']: ", item['apply_documents_en'])

            # --- Location ----------------------------------------------------
            location = response.xpath(
                "//th[contains(text(),'Availability')]/following-sibling::td[1]//text()|//span[contains(text(),'AVAILABILITY')]/../p//text()"
            ).extract()
            clear_space(location)
            print("location: ", location)
            item['location'] = ''.join(location).strip().strip(",").strip()
            print("item['location']: ", item['location'])

            # --- Entry requirements ------------------------------------------
            # The item key is spelled 'rntry_requirements_en' in this project;
            # it is kept as-is because it must match the item definition.
            entry_requirements = response.xpath(
                "//div[@class='tab entry-reqs-tab']|//span[@class='ct-accordion__title'][contains(text(),'Entry Requirements')]/../.."
            ).extract()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])
            yield item
        except Exception as e:
            # Best-effort boundary for this callback: record the failure and
            # the offending URL (append mode keeps earlier entries), then go on
            # without yielding an item for this page.
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Ejemplo n.º 17

Mostrar archivo

    def parse_data(self, response):
        """Build one course item from a University of Melbourne course page.

        Values already collected by the caller are copied from
        ``response.meta`` (``degree_type``, ``teach_time``,
        ``degree_description``, ``duration``, ``location``, ``degree_name``).
        The remaining fields are scraped from the page with XPath, and the
        IELTS/TOEFL scores are filled from a hard-coded table keyed on the
        scraped department and the degree name.  When the URL ends in
        ``/overview`` the sibling sub-page URLs are handed to
        ``self.parse_modules``, ``self.parse_entry``, ``self.parse_apply`` and
        ``self.parse_fee`` together with the item.

        Yields:
            The populated item.  If anything inside the scraping section
            raises, the error and URL are appended to a per-university error
            file and printed, and nothing is yielded.
        """
        item = get_item(ScrapyschoolAustralianYanItem)
        item['university'] = "The University of Melbourne"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['degree_type'] = response.meta['degree_type']
        print("item['degree_type']: ", item['degree_type'])

        item['teach_time'] = response.meta['teach_time']
        print("item['teach_time']: ", item['teach_time'])

        item['degree_overview_en'] = response.meta['degree_description']

        duration = response.meta['duration']
        if not duration:
            # Nothing supplied by the caller: fall back to text on the page.
            duration = response.xpath("//h1[@class='page-title']/following-sibling::*[1]//text()|"
                                      "//p[contains(text(), 'full-time')]//text()|//p[contains(text(), 'full time')]//text()").extract()
        if isinstance(duration, (list, tuple)):
            # extract() returns a list of strings, but re.findall() needs a
            # single string; passing the list raised TypeError before.
            duration = ''.join(duration)
        durationList = re.findall(r".{1,15}full.{1}time", duration)
        print(durationList)
        dur_re = re.findall(r".{1,15}year", ''.join(durationList))
        if len(dur_re) != 0:
            item['duration'] = ','.join(dur_re).strip().strip(',').strip()
        print("item['duration']: ", item['duration'])

        item['location'] = response.meta['location']
        print("item['location']: ", item['location'])

        item['degree_name'] = response.meta['degree_name']
        print("item['degree_name']: ", item['degree_name'])

        # The programme is the trailing "(...)" of the degree name; for
        # "Teaching (...)" names the word and the parentheses are kept.
        programme = re.findall(r"Teaching\s\(.+\)$|\(.+\)$", item['degree_name'])
        clear_space(programme)  # called for its effect on the list; return value unused
        if "Teaching" in ''.join(programme):
            item['programme_en'] = ''.join(programme).strip()
        else:
            item['programme_en'] = ''.join(programme).replace("(", "").replace(")", "").strip()
        if item['programme_en'] == "":
            item['programme_en'] = item['degree_name'].replace("Master of", "").replace("Doctor of", "").strip()
        print("item['programme_en']: ", item['programme_en'])

        try:
            department = response.xpath(
                "//li[@class='root']/a/span/text()|"
                "//img[@src='https://education.unimelb.edu.au/__data/assets/image/0006/1675995/Horizontal-180mm-new.png']/@alt|"
                "//a[@class='page-header-home']//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department).replace("Courses -", "").replace(", shaping minds shaping the world", "").strip()
            if item['department'] == "":
                if "mbs.edu" in response.url:
                    item['department'] = "Melbourne Business School"
            print("item['department']: ", item['department'])

            overview = response.xpath("//div[@id='overview']//p[@class='course-ctas']/preceding-sibling::*|"
                                      "//div[@class='article']//p[@class='course-ctas']/preceding-sibling::*|//div[@class='lead']|"
                                      "//div[@class='lead']/following-sibling::div[1]|//section[@class='lead']/..|"
                                      "//section[@class='lead']/../following-sibling::div[1]|"
                                      "//a[contains(text(),'Calendar')]/../preceding-sibling::*[position()<last()-1]").extract()
            clear_space(overview)
            if len(overview) == 0:
                overview = response.xpath("//html//tr[5]|//div[@id='overview']").extract()
                clear_space(overview)
            # Text from "Next" up to the following "<" is cut out of the result.
            overviewRe = re.findall(r"Next.*?<", ''.join(overview))
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            if len(overviewRe) > 0:
                item['overview_en'] = item['overview_en'].replace(''.join(overviewRe), '<').strip()
            print("item['overview_en']: ", item['overview_en'])

            career = response.xpath("//h2[contains(text(),'Career outcomes')]/preceding-sibling::*[1]/following-sibling::*|"
                                    "//h2[contains(text(),'Career outcomes')]/preceding-sibling::*[1]/following-sibling::*[position()<6]|"
                                    "//strong[contains(text(),'Career opportunities')]/../preceding-sibling::*[1]/following-sibling::*[position()<13]").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])

            modules = response.xpath("//div[@id='course-structure']|//div[@id='subjects']|"
                                     "//a[contains(text(),'Subjects and structure')]/../preceding-sibling::*[1]/following-sibling::*[position()<3]|"
                                     "//h2[contains(text(),'Course structure')]/preceding-sibling::*[1]/following-sibling::*[position()<8]").extract()
            clear_space(modules)
            if len(modules) == 0:
                modules = response.xpath("//html//tr[9]").extract()
                clear_space(modules)
            modulesRe = re.findall(r"Next.*<", ''.join(modules))
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            if len(modulesRe) > 0:
                item['modules_en'] = item['modules_en'].replace(''.join(modulesRe), '<').strip()
            print("item['modules_en']: ", item['modules_en'])

            entry_requirements = response.xpath("//div[@id='entry-requirements']|"
                                                "//h2[contains(text(),'Entry requirements')]/preceding-sibling::*[1]/following-sibling::*[position()<5]").extract()
            if len(entry_requirements) == 0:
                entry_requirements = response.xpath("//html//tr[13]").extract()
            entry_requirementsRe = re.findall(r"Next.*", ''.join(entry_requirements))
            item['rntry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements))
            if len(entry_requirementsRe) > 0:
                item['rntry_requirements_en'] = item['rntry_requirements_en'].replace(''.join(entry_requirementsRe), '<').strip()
            print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            item['ielts_desc'] = "https://futurestudents.unimelb.edu.au/admissions/entry-requirements/language-requirements/graduate-toefl-ielts"

            # ---- English-language scores --------------------------------
            # Every score tuple is ordered to match the item key suffixes
            # "", "_l", "_s", "_r", "_w" (the overall score comes first).
            score_suffixes = ("", "_l", "_s", "_r", "_w")
            ielts_65 = ('6.5', '6', '6', '6', '6')
            ielts_7 = ('7', '6', '6', '6', '6')
            ielts_7_w7 = ('7', '6', '6', '6', '7')
            ielts_7_b65 = ('7', '6.5', '6.5', '6.5', '6.5')
            ielts_7_b65_w7 = ('7', '6.5', '6.5', '6.5', '7')
            ielts_all7 = ('7', '7', '7', '7', '7')
            toefl_79 = ('79', '13', '18', '13', '21')
            toefl_79_w27 = ('79', '13', '18', '13', '27')
            toefl_94 = ('94', '13', '18', '13', '27')
            toefl_94_w21 = ('94', '13', '18', '13', '21')
            toefl_94_b20 = ('94', '20', '20', '20', '27')
            toefl_94_b24 = ('94', '24', '24', '24', '27')
            toefl_102 = ('102', '21', '21', '21', '24')
            standard = (ielts_65, toefl_79)
            higher = (ielts_7_w7, toefl_94)

            dept = item['department']
            degree = item['degree_name']

            def set_scores(ielts, toefl=None):
                # Write the IELTS fields; TOEFL fields are left untouched
                # when no TOEFL tuple is given.
                for suffix, value in zip(score_suffixes, ielts):
                    item["ielts" + suffix] = value
                if toefl is not None:
                    for suffix, value in zip(score_suffixes, toefl):
                        item["toefl" + suffix] = value

            def apply_scores(rules, default=standard):
                # Rules are (degree-name fragments, ielts, toefl) and are
                # tried in order; the first rule with a fragment contained
                # in the degree name wins, otherwise the default applies.
                for names, ielts, toefl in rules:
                    if any(name in degree for name in names):
                        set_scores(ielts, toefl)
                        return
                set_scores(*default)

            if dept == "Melbourne School of Design":
                apply_scores((
                    (("Master of Philosophy", "Doctor of Philosophy"),) + higher,
                ))
            elif dept == "Faculty of Arts" or dept == "Graduate School of Humanities and Social Sciences":
                apply_scores((
                    (("Master of Publishing and Communications",
                      "Master of Creative Writing, Publishing and Editing",
                      "Master of Journalism"),) + higher,
                    (("Master by Research", "Doctor of Philosophy"),) + higher,
                ))
            elif dept == "Melbourne Business School":
                apply_scores((
                    (("Master of Business Analytics",), ielts_all7, toefl_102),
                    (("Master of Business Administration", "Master of Marketing"), ielts_7_b65, toefl_102),
                    (("Master of Entrepreneurship",), ielts_7, toefl_94),
                    (("Master of Philosophy", "Master of Commerce"),) + standard,
                    (("Doctor of Philosophy",),) + higher,
                ))
            elif dept == "Melbourne Graduate School of Education":
                apply_scores((
                    (("Master of Teaching", "Master of Educational Psychology"), ielts_all7, toefl_94_b24),
                    (("Master of English in a Global Context",),) + standard,
                ), default=(ielts_7_w7, toefl_94_w21))
            elif dept == "Melbourne School of Engineering" or dept == "Melbourne School of Information":
                set_scores(*standard)
            elif "Melbourne Law School" in dept or dept == "Melbourne School of Government":
                apply_scores((
                    (("Master of Philosophy", "Doctor of Philosophy", "JD"),) + higher,
                ))
            elif dept == "Faculty of Medicine, Dentistry and Health Sciences":
                apply_scores((
                    # No TOEFL values are set for this degree.
                    (("Doctor of Clinical Dentistry",), ielts_all7, None),
                    (("Doctor of Dental Surgery", "Doctor of Medicine", "Doctor of Physiotherapy",
                      "Master of Genetic Counselling"),) + higher,
                    (("Doctor of Optometry",), ielts_65, toefl_79_w27),
                    (("Master of Clinical Audiology",), ielts_all7, toefl_94_b24),
                    (("Master of Clinical Education", "Master of Clinical Ultrasound",
                      "Master in Surgical Education"), ielts_all7, toefl_94),
                    (("Master of Medicine",), ielts_all7, toefl_94),
                    (("Master of Nursing Science",), ielts_7_b65_w7, toefl_94_b20),
                    (("Master of Psychiatry",), ielts_all7, toefl_94_b24),
                    # No TOEFL values are set for these degrees.
                    (("Master of Psychology", "Doctor of Philosophy"), ielts_all7, None),
                    (("Master of Social Work",), ielts_all7, toefl_94_b24),
                    (("Master of Speech Pathology",), ielts_all7, toefl_94_b24),
                    (("Master of Sports Medicine",), ielts_all7, toefl_94_b24),
                    (("Master of Rehabilitation Science",), ielts_all7, toefl_94_b24),
                    (("Masters by Research", "Research Doctorates"),) + higher,
                ))
            elif "Faculty of Science" in dept or dept == "Melbourne Graduate School of Science" \
                    or "Faculty of Fine Arts and Music" in dept or dept == "Victorian College of the Arts" \
                    or dept == "Melbourne Conservatorium of Music":
                set_scores(*standard)
            elif dept == "Faculty of Veterinary and Agricultural Sciences":
                apply_scores((
                    (("Doctor of Veterinary Medicine",),) + higher,
                ))
            # Any other department leaves the score fields unchanged.

            how_to_apply = response.xpath("//div[@id='apply-now']|"
                                          "//a[contains(text(),'Applying')]/../preceding-sibling::*[1]/following-sibling::*[position()<3]|"
                                          "//div[@id='how-to-apply']").extract()
            item['apply_desc_en'] = remove_class(clear_lianxu_space(how_to_apply))
            print("item['apply_desc_en']: ", item['apply_desc_en'])

            deadline = response.xpath(
                "//h3[contains(text(),'Application closing dates')]/following-sibling::*[1]//text()|"
                "//h2[contains(text(),'Application deadlines')]/following-sibling::ul[1]//text()").extract()
            if len(deadline) == 0:
                deadline = response.xpath(
                    "//div[@id='how-to-apply']//h5[contains(text(),'International')]/following-sibling::p[1]//text()").extract()
                if len(deadline) > 0:
                    deadline = [getStartDate(deadline[0])]
            item['deadline'] = clear_lianxu_space(deadline)
            print("item['deadline']: ", item['deadline'])

            apply_documents_en = response.xpath(
                "//strong[contains(text(),'To apply, you will need to provide:')]/../preceding-sibling::*[1]/following-sibling::*[position()<3]|"
                "//h3[contains(text(),'To apply, you will need to provide:')]/preceding-sibling::*[1]/following-sibling::*[position()<3]|"
                "//div[@class='ct-checklist']|//h3[contains(text(),'Supporting documentation')]/preceding-sibling::*[1]/following-sibling::*").extract()
            item['apply_documents_en'] = remove_class(clear_lianxu_space(apply_documents_en))
            print("item['apply_documents_en']: ", item['apply_documents_en'])

            apply_proces_en = response.xpath(
                "//h3[@id='application-process']/preceding-sibling::*[1]/following-sibling::*[position()<9]").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(apply_proces_en))
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            tuition_fee = response.xpath("//div[@id='entry-requirements']/following-sibling::div[1]//table/tbody/tr/td[1]//text()|"
                                         "//div[@id='fees-and-scholarships']//td[contains(text(),'International')]/following-sibling::td[last()]//text()|"
                                         "//h2[contains(text(),'Fees')]/..//th[@id='course']/following-sibling::*[last()]//text()|"
                                         "//td[@data-label='Cost']//text()|"
                                         "//strong[contains(text(),'TOTALS')]/../../following-sibling::*[last()]//text()").extract()
            clear_space(tuition_fee)
            if len(tuition_fee) == 0:
                tuition_fee = response.xpath("//html//tr[14]//text()").extract()
                clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)

            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "AUD$"
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            print("item['tuition_fee']: ", item['tuition_fee'])

            if item['department'] == "Melbourne Law School" and item['modules_en'] == "":
                law_url = response.xpath("//a[@href='https://law.unimelb.edu.au/study/masters']/@href").extract()
                if len(law_url) > 0:
                    self.parse_law_mes(law_url[0], item)

            # Only URLs ending in "/overview" have the sub-pages visited
            # below; the base URL is the page URL with "/overview" removed.
            urlRes = ""
            if re.search("/overview$", response.url):
                urlRes = response.url.replace("/overview", "")

            if urlRes:
                self.parse_modules(urlRes + "/degree-structure", item)
                self.parse_entry(urlRes + "/entry-requirements", item)
                self.parse_apply(urlRes + "/apply-now", item)
                self.parse_fee(urlRes + "/fees-scholarships", item)

            # NOTE(review): the check is "> 150" but 151 characters are kept;
            # confirm the intended limit before changing it.
            if len(item['deadline']) > 150:
                item['deadline'] = item['deadline'][:151]
            yield item
        except Exception as e:
            # Best effort: record the failure and URL, then carry on with the crawl.
            with open("scrapySchool_Australian_yan/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)