def download_catalog_year(college_pages, auth):
    """Record the catalog year information for every major of every college.

    For each college in ``COLLEGES`` and each major listed for it in
    ``college_pages``, the page at ``SOURCE_URL`` is requested with that
    college and major in the query string.  The major's dict is then
    updated in place:

    * If the page contains ``<option>`` elements, ``'year'`` is set to the
      result of mapping ``_.strip_tags`` over those elements.
    * Otherwise ``'MajorFile'`` is set to the text following
      ``name=MajorFile value=`` (up to the next ``>``), and ``'year'`` is
      set to the result of the nested ``_.js_match`` calls that look for a
      four-digit number.

    Args:
        college_pages: Mapping from college to a sequence of major dicts;
            every dict must provide a ``'name'`` key.
        auth: Object whose ``get(url)`` returns a response exposing the
            body as bytes via ``.content`` (presumably an authenticated
            HTTP session -- confirm against the caller).

    Returns:
        The same ``college_pages`` object that was passed in.

    Raises:
        IndexError: If a page has no ``<option>`` elements and does not
            contain the ``name=MajorFile value=`` marker either.
    """
    # Work on a copy so the module-level defaults are not modified.
    parameters = copy.copy(URL_PARAMS)
    parameters['changeMajor'] = 'Next'
    parameters['call'] = '5'

    for college in COLLEGES:
        parameters['college'] = college
        print('Downloading College', college)
        for major in college_pages[college]:
            print(' Getting Degree', major['name'])
            parameters['major'] = major['name']
            url = SOURCE_URL + '?' + urllib.parse.urlencode(parameters)
            export_page = auth.get(url).content
            # NOTE(review): no parser is named here, so bs4 chooses one
            # itself; consider pinning it once the intended parser is known.
            soup = bs4.BeautifulSoup(export_page)
            options = soup.select('option')
            if options:
                major['year'] = _.map_(options, lambda x: _.strip_tags(x))
                continue

            # No <option> list on the page: pull the values out of the
            # markup text instead.
            decoded = export_page.decode()
            major['MajorFile'] = decoded.split(
                'name=MajorFile value=')[1].split('>')[0]
            # Match against the decoded text rather than the bytes object,
            # and use raw strings so ``\d`` reaches the regex engine intact.
            year = _.js_match(
                _.js_match(
                    decoded,
                    r'/[<input type=hidden name=year year="](\d{4})/'),
                r'/\d{4}/')
            major['year'] = year
    return college_pages
def download_catalog_year(college_pages, auth):
    """Record the catalog year information for every major of every college.

    For each college in ``COLLEGES`` and each major listed for it in
    ``college_pages``, the page at ``SOURCE_URL`` is requested with that
    college and major in the query string.  The major's dict is then
    updated in place:

    * If the page contains ``<option>`` elements, ``'year'`` is set to the
      result of mapping ``_.strip_tags`` over those elements.
    * Otherwise ``'MajorFile'`` is set to the text following
      ``name=MajorFile value=`` (up to the next ``>``), and ``'year'`` is
      set to the result of the nested ``_.js_match`` calls that look for a
      four-digit number.

    Args:
        college_pages: Mapping from college to a sequence of major dicts;
            every dict must provide a ``'name'`` key.
        auth: Object whose ``get(url)`` returns a response exposing the
            body as bytes via ``.content`` (presumably an authenticated
            HTTP session -- confirm against the caller).

    Returns:
        The same ``college_pages`` object that was passed in.

    Raises:
        IndexError: If a page has no ``<option>`` elements and does not
            contain the ``name=MajorFile value=`` marker either.
    """
    # Work on a copy so the module-level defaults are not modified.
    parameters = copy.copy(URL_PARAMS)
    parameters['changeMajor'] = 'Next'
    parameters['call'] = '5'

    for college in COLLEGES:
        parameters['college'] = college
        print('Downloading College', college)
        for major in college_pages[college]:
            print(' Getting Degree', major['name'])
            parameters['major'] = major['name']
            url = SOURCE_URL + '?' + urllib.parse.urlencode(parameters)
            export_page = auth.get(url).content
            # NOTE(review): no parser is named here, so bs4 chooses one
            # itself; consider pinning it once the intended parser is known.
            soup = bs4.BeautifulSoup(export_page)
            options = soup.select('option')
            if options:
                major['year'] = _.map_(options, lambda x: _.strip_tags(x))
                continue

            # No <option> list on the page: pull the values out of the
            # markup text instead.
            decoded = export_page.decode()
            major['MajorFile'] = decoded.split(
                'name=MajorFile value=')[1].split('>')[0]
            # Match against the decoded text rather than the bytes object,
            # and use raw strings so ``\d`` reaches the regex engine intact.
            year = _.js_match(
                _.js_match(
                    decoded,
                    r'/[<input type=hidden name=year year="](\d{4})/'),
                r'/\d{4}/')
            major['year'] = year
    return college_pages
def test_js_match(case, expected):
    """Assert that ``_.js_match`` called with the unpacked ``case`` equals ``expected``.

    Args:
        case: Sequence of positional arguments forwarded to ``_.js_match``.
        expected: Value the call is required to return.
    """
    actual = _.js_match(*case)
    assert actual == expected