def scrape(self):
    """Yield map URLs for rooms that have a room code.

    For every room from ``self.queryset()`` with a non-null ``code`` the
    room page under ``studieinformasjon/rom`` is fetched, and each link on
    it that points below ``http://www.ntnu.no/kart/`` is followed in turn.
    A followed map page produces a result in one of two ways:

    * the best-sorted link in its facility list leads to a room whose code
      (as returned by ``fetch_room``) is a suffix of ``room.code``; or
    * its ``h1.ntnucrumb`` heading equals ``room.name``, in which case the
      page's canonical link is used (this takes precedence when both
      apply).

    Yields:
        dict: ``{'code': ..., 'name': ..., 'url': ...}``; at most one per
        room, taken from the first map page that produced any data.
    """
    map_prefix = 'http://www.ntnu.no/kart/'

    for room in self.queryset().filter(code__isnull=False):
        room_page = fetch.html('http://www.ntnu.no/studieinformasjon/rom/',
                               query={'romnr': room.code}, verbose=True)
        if room_page is None:
            continue

        for map_link in room_page.cssselect('.hovedramme .hoyrebord a'):
            # Anchors without an href are skipped instead of raising
            # KeyError.
            map_url = map_link.attrib.get('href', '')
            if not map_url.startswith(map_prefix):
                continue

            map_page = fetch.html(map_url)
            if map_page is None:
                continue

            data = {}

            # Sort so that link with the right room name bubbles to the top.
            candidates = map_page.cssselect('.facilitylist .horizontallist a')
            candidates.sort(key=lambda a: (a.text != room.name, a.text))

            # Only the first candidate is checked: it is the one expected
            # to match the room name. Checking every candidate until a
            # matching code turns up would also work, but takes a long
            # time since each check fetches another page.
            if candidates:
                candidate_url = candidates[0].attrib['href']
                code, name = fetch_room(candidate_url)
                if code and room.code.endswith(code):
                    data = {
                        'code': room.code,
                        'name': name,
                        'url': candidate_url,
                    }

            # A page without the breadcrumb heading simply does not match;
            # indexing the empty selection used to raise IndexError.
            crumb = map_page.cssselect('h1.ntnucrumb')
            if crumb and crumb[0].text_content() == room.name:
                for canonical in map_page.cssselect('link[rel="canonical"]'):
                    if canonical.attrib['href'] != map_prefix:
                        data = {
                            'code': room.code,
                            'name': room.name,
                            'url': canonical.attrib['href'],
                        }

            if data:
                yield data
                # One result per room is enough; move on to the next room.
                break
def scrape(self):
    """Yield lecture data parsed from the timetable page of each course.

    The timetable page for every course in ``self.course_queryset()`` is
    fetched using ``<code>-<version>`` as the ``emnekode`` query value. The
    page is only used when one of its ``.hovedramme h1`` headings contains
    the course code and it has at least two ``.hovedramme table`` elements;
    the second table is the one that is parsed.

    All rows except the first and the last are passed to ``parse_row``
    together with a mapping of room name to list of room codes. A row
    whose result has a truthy ``'lecture_type'`` sets the type for the
    rows that follow it; any other non-empty result is yielded.

    Yields:
        dict: the ``parse_row`` result extended with ``'course'`` and
        ``'type'`` keys.
    """
    prefix = ntnu.prefix(self.semester)
    url = 'http://www.ntnu.no/studieinformasjon/timeplan/%s/' % prefix

    # Room name -> every room code registered under that name.
    room_codes = {}
    for room_code, room_name in fetch_rooms():
        room_codes.setdefault(room_name, []).append(room_code)

    for course in self.course_queryset():
        course_key = '%s-%s' % (course.code, course.version)
        root = fetch.html(url, query={'emnekode': course_key.encode('latin1')})
        if root is None:
            continue

        table = None
        headings = root.cssselect(u'.hovedramme h1')
        if any(course.code in h1.text_content() for h1 in headings):
            tables = root.cssselect('.hovedramme table')
            # A page with fewer than two tables is handled like a page
            # without a matching heading instead of raising IndexError.
            if len(tables) > 1:
                table = tables[1]

        if table is None:
            logging.debug("Couldn't load any info for %s", course.code)
            continue

        lecture_type = None
        for tr in table.cssselect('tr')[1:-1]:
            data = parse_row(tr, room_codes)
            if data.get('lecture_type'):
                lecture_type = data['lecture_type']
            elif data:
                data.update({'course': course, 'type': lecture_type})
                yield data
def scrape(self):
    """Yield name and map URL for rooms that have a code but no URL yet.

    Rooms are taken from ``self.queryset()``, restricted to those with
    lectures in courses of ``self.semester``. For each room with a code
    and an empty ``url`` the JSON endpoint ``/fdv/rooms/lydiacode:<code>``
    is queried and the first returned room is used to build a
    ``http://www.ntnu.no/kart/<building>/<nr>`` URL. If that page can be
    fetched, its canonical link replaces the constructed URL.

    Yields:
        dict: ``{'code': ..., 'name': ..., 'url': ...}``.
    """
    buildings = fetch_buildings()

    qs = self.queryset()
    qs = qs.filter(lecture__course__semester=self.semester)
    qs = qs.distinct()

    for code, name, url in qs.values_list('code', 'name', 'url'):
        # Nothing to look up without a code, nothing to do with a URL.
        if not code or url:
            continue

        data = fetch.json(BASE + '/fdv/rooms/lydiacode:%s' % code)
        if not data:
            continue

        # A response without any rooms is treated like an empty response
        # instead of raising KeyError/IndexError.
        rooms = data.get('rooms')
        if not rooms:
            continue
        room = rooms[0]

        url = 'http://www.ntnu.no/kart/%s/%s' % (
            buildings[room['buildingId']], room['nr'])
        name = (room['name'] or '').strip() or 'Rom %s' % room['nr']

        root = fetch.html(url)
        # Compare against None: the truth value of an lxml element depends
        # on whether it has children, not on whether the fetch succeeded.
        if root is not None:
            for link in root.cssselect('link[rel="canonical"]'):
                if link.attrib['href'] != 'http://www.ntnu.no/kart':
                    url = link.attrib['href']

        yield {'code': code, 'name': name, 'url': url}
def fetch_university(name_re):
    """Return the option value of the first university matching ``name_re``.

    The ``select_university`` options on ``http://www.akademika.no/pensum``
    (fetched with ``cache=False``) are searched in document order.

    Args:
        name_re: pattern handed to ``re.search`` and applied to the text
            of each option.

    Returns:
        The ``value`` attribute of the first matching option, or ``None``
        when the page could not be fetched or no option matches.
    """
    root = fetch.html('http://www.akademika.no/pensum', cache=False)
    if root is None:
        return None

    for option in root.cssselect('select[name="select_university"] option'):
        # ``text`` is None for an option without text; search an empty
        # string in that case rather than letting re.search raise.
        if re.search(name_re, option.text or ''):
            return option.attrib['value']
    return None
def fetch_rooms():
    """Yield ``(code, name)`` for the rooms in the ``romnr`` selector.

    The options of the ``romnr`` select on
    ``http://www.ntnu.no/studieinformasjon/rom/`` are read; the option
    value becomes the code and the option text the name, both passed
    through ``utils.clean_string``. Options are skipped when either part
    is empty after cleaning or when the name contains ``'ikkerom'``.

    Nothing is yielded when the page could not be fetched.
    """
    page = fetch.html('http://www.ntnu.no/studieinformasjon/rom/')
    if page is None:
        return

    for option in page.cssselect('.hovedramme select[name="romnr"] option'):
        raw_code = option.attrib.get('value')
        if raw_code is None:
            # An option without a value attribute has no code to offer;
            # skip it rather than raising KeyError.
            continue

        code = utils.clean_string(raw_code)
        name = utils.clean_string(option.text_content())
        if code and name and 'ikkerom' not in name:
            yield code, name
def fetch_room(url):
    """Return ``(code, name)`` read from the room page at ``url``.

    The name is the text of the first ``.ntnukart h2`` element. The code
    is the parenthesised part of the first building image caption whose
    text starts with ``<text> (<code>)``.

    Args:
        url: address of the page to fetch.

    Returns:
        tuple: ``(code, name)``, or ``(None, None)`` when the page could
        not be fetched, has no heading, or has no caption with a
        parenthesised code.
    """
    root = fetch.html(url)
    if root is None:
        return None, None

    headings = root.cssselect('.ntnukart h2')
    if not headings:
        # Without a heading there is no name to report; answer like every
        # other failure instead of raising IndexError.
        return None, None
    name = headings[0].text_content()

    for caption in root.cssselect('.ntnukart .buildingimage .caption'):
        match = re.match(r'[^(]+\(([^)]+)\)', caption.text_content())
        if match:
            return match.group(1), name
    return None, None
def fetch_node(pack):
    """Return the ``http://www.akademika.no/node/<id>`` URL for ``pack``.

    The ``load_products2/<pack>`` page is fetched and the first element
    whose ``id`` attribute contains ``node-`` supplies the node id.

    Args:
        pack: value interpolated into the ``load_products2`` URL.

    Returns:
        The node URL, or ``None`` when the page could not be fetched, no
        such element exists, or the id part is empty.
    """
    root = fetch.html(
        'http://www.akademika.no/pensumlister/load_products2/%s' % pack)
    if root is None:
        return None

    matches = root.cssselect('[id*="node-"]')
    if not matches:
        return None

    # The selector matches "node-" anywhere in the id, so take what comes
    # right after that marker (up to the next dash). Splitting the whole id
    # on "-" and taking the second field is only right when the id starts
    # with "node-"; for e.g. "block-node-12" it would give "node".
    element_id = matches[0].attrib['id']
    node_id = element_id.partition('node-')[2].split('-')[0]
    if not node_id:
        return None
    return 'http://www.akademika.no/node/%s' % node_id
def fetch_packs(university, study, semester):
    """Yield ``(course, rel)`` for every ``.packlink`` element.

    The ``load_products`` page is fetched with the three arguments as
    query parameters. The course is the link text up to the first space,
    with a trailing ``'NTNU'`` removed; ``rel`` is the link's ``rel``
    attribute.

    Args:
        university: sent as the ``university`` query parameter.
        study: sent as the ``study`` query parameter.
        semester: sent as the ``semester`` query parameter.

    Nothing is yielded when the page could not be fetched. Links without
    leading text or without a ``rel`` attribute are skipped.
    """
    root = fetch.html(
        'http://www.akademika.no/pensumlister/load_products',
        query={
            'university': university,
            'study': study,
            'semester': semester,
        })
    if root is None:
        return

    suffix = 'NTNU'
    for link in root.cssselect('.packlink'):
        text = link.text
        rel = link.attrib.get('rel')
        if text is None or rel is None:
            # ``text`` is None when the link has no text before its first
            # child; such links (and links lacking ``rel``) cannot yield a
            # usable pair and used to raise instead.
            continue

        course = text.split(' ')[0]
        if course.endswith(suffix):
            course = course[:-len(suffix)]
        yield course, rel
def scrape(self):
    """Yield course data scraped from the course search and course pages.

    The course list is requested for ``self.semester.year`` when the
    semester is a fall semester and for the year before otherwise. Every
    link containing ``/studier/emner/`` is a candidate; its code must pass
    ``ntnu.valid_course_code`` and ``self.should_proccess_course``. The
    ``.infoBox`` elements of the course page are then turned into a nested
    mapping ``{box title: {key: [values]}}`` built from their
    ``key: value`` lines, and the course is only yielded when its
    ``Undervisning``/``Undervises`` entry lists the wanted semester.

    Yields:
        dict: with the keys ``code``, ``name``, ``version`` (int),
        ``points`` (float) and ``url``.

    Raises:
        KeyError, ValueError: if a course page that passed the semester
            check lacks ``Versjon``/``Studiepoeng`` or they do not parse.
    """
    if self.semester.type == Semester.FALL:
        year = self.semester.year
    else:
        year = self.semester.year - 1

    code_re = re.compile('/studier/emner/([^/]+)/', re.I | re.L)

    url = 'http://www.ntnu.no/web/studier/emnesok'
    query = {
        'p_p_lifecycle': '2',
        'p_p_id': 'courselistportlet_WAR_courselistportlet_INSTANCE_m8nT',
        '_courselistportlet_WAR_courselistportlet_INSTANCE_m8nT_year': year,
    }

    courses_root = fetch.html(url, query=query, verbose=True)
    if courses_root is None:
        # Same convention as the other fetchers: a failed fetch produces
        # no results rather than an AttributeError.
        return

    for a in courses_root.cssselect('a[href*="/studier/emner/"]'):
        course_url = a.attrib['href']

        # The CSS selector does not require a slash after the code, the
        # regex does; links that only satisfy the selector are skipped.
        code_match = code_re.search(course_url)
        if code_match is None:
            continue
        code = code_match.group(1)
        name = a.text_content()

        if not ntnu.valid_course_code(code):
            continue
        elif not self.should_proccess_course(code):
            continue

        quoted_code = urllib.quote(code.encode('utf-8'))
        root = fetch.html('http://www.ntnu.no/studier/emner/%s/%s' %
                          (quoted_code, year))
        if root is None:
            continue

        # Construct dict out of info boxes.
        title = None
        data = {}
        for box in root.cssselect('.infoBox'):
            for child in box.getchildren():
                if child.tag == 'h3':
                    # Entries that follow are filed under this heading.
                    title = child.text_content()
                    continue

                # The text before the first nested element plus the text
                # following each nested element form the candidate lines.
                parts = [child.text or '']
                for br in child.getchildren():
                    parts.append(br.tail or '')

                for part in parts:
                    if ':' not in part:
                        continue
                    key, value = part.split(':', 1)
                    key = key.strip(u' \n\xa0')
                    value = value.strip(u' \n\xa0')
                    data.setdefault(title, {}).setdefault(key, []).append(value)

        try:
            semesters = data['Undervisning']['Undervises']
        except KeyError:
            continue

        # NOTE(review): for non-fall semesters ``year`` is
        # ``self.semester.year - 1``, so the spring label is built with
        # the previous calendar year - confirm that this is what the
        # course page lists.
        if self.semester.type == Semester.FALL and u'HØST %s' % year not in semesters:
            continue
        elif self.semester.type == Semester.SPRING and u'VÅR %s' % year not in semesters:
            continue

        yield {
            'code': code,
            'name': name,
            'version': int(data['Fakta om emnet']['Versjon'][0]),
            'points': float(data['Fakta om emnet']['Studiepoeng'][0]),
            'url': course_url,
        }