Python htmlの例、plan.scrape.fetch.html Pythonの例

コード例 #1

0

ファイルを表示

    def scrape(self):
        """Yield a ``code``/``name``/``url`` dict for rooms found on the map.

        For every room in ``self.queryset()`` that has a code, the NTNU room
        information page is fetched and each of its links that point below
        ``http://www.ntnu.no/kart/`` is followed until one produces data.
        Data for a followed page comes from either:

        * the first facility-list link (after sorting the link whose text
          equals the room name to the front), when the code returned by
          ``fetch_room`` is a suffix of the room's code, or
        * the page's ``h1.ntnucrumb`` heading being equal to the room name,
          in which case the page's canonical URL is used.  This source wins
          when both apply.

        At most one dict is yielded per room.  Pages that cannot be fetched
        are skipped.
        """
        for room in self.queryset().filter(code__isnull=False):
            info_root = fetch.html('http://www.ntnu.no/studieinformasjon/rom/',
                                   query={'romnr': room.code},
                                   verbose=True)
            if info_root is None:
                continue

            for link in info_root.cssselect('.hovedramme .hoyrebord a'):
                # Anchors without an href are simply not map links.
                href = link.attrib.get('href', '')
                if not href.startswith('http://www.ntnu.no/kart/'):
                    continue

                map_root = fetch.html(href)
                if map_root is None:
                    continue

                data = {}

                # Sort so that link with the right room name bubbles to the top.
                # ``a.text`` is None for an anchor without text, hence the
                # ``or ''`` to keep the sort key comparable.
                facility_links = map_root.cssselect(
                    '.facilitylist .horizontallist a')
                facility_links.sort(
                    key=lambda a: (a.text != room.name, a.text or ''))
                for a in facility_links:
                    code, name = fetch_room(a.attrib['href'])
                    if code and room.code.endswith(code):
                        data = {
                            'code': room.code,
                            'name': name,
                            'url': a.attrib['href']
                        }

                    # Give up after first element that should be equal to room
                    # name. Make this conditional on data having been found (i.e.
                    # if data: break) and we will check all rooms to see if we
                    # can find one with a matching code, but this takes a long
                    # time.
                    break

                # A page without the breadcrumb heading cannot match by name.
                crumb = map_root.cssselect('h1.ntnucrumb')
                if crumb and crumb[0].text_content() == room.name:
                    for canonical in map_root.cssselect('link[rel="canonical"]'):
                        canonical_url = canonical.attrib['href']
                        if canonical_url != 'http://www.ntnu.no/kart/':
                            data = {
                                'code': room.code,
                                'name': room.name,
                                'url': canonical_url
                            }

                if data:
                    yield data
                    # One result per room: stop following further map links.
                    break

コード例 #2

0

ファイルを表示

    def scrape(self):
        """Yield one dict per timetable row for every course.

        The timetable page of each course in ``self.course_queryset()`` is
        fetched using ``<course code>-<version>`` as the ``emnekode`` query
        value.  The page is only used when one of its ``.hovedramme h1``
        headings contains the course code and it has a second
        ``.hovedramme table``; otherwise a debug message is logged and the
        course is skipped.

        Rows are parsed with ``parse_row``.  A row carrying a
        ``lecture_type`` sets the type for the rows that follow it; every
        other non-empty row is yielded with ``course`` and ``type`` added.
        """
        prefix = ntnu.prefix(self.semester)
        url = 'http://www.ntnu.no/studieinformasjon/timeplan/%s/' % prefix

        # Room name -> list of room codes carrying that name.
        room_codes = {}
        for room_code, room_name in fetch_rooms():
            room_codes.setdefault(room_name, []).append(room_code)

        for course in self.course_queryset():
            query_code = '%s-%s' % (course.code, course.version)
            root = fetch.html(
                url, query={'emnekode': query_code.encode('latin1')})
            if root is None:
                continue

            table = None
            for h1 in root.cssselect(u'.hovedramme h1'):
                if course.code in h1.text_content():
                    # The timetable is the second table; a page with fewer
                    # tables is treated like a page without any info instead
                    # of aborting the whole scrape with an IndexError.
                    tables = root.cssselect('.hovedramme table')
                    if len(tables) > 1:
                        table = tables[1]
                    break

            if table is None:
                logging.debug("Couldn't load any info for %s", course.code)
                continue

            lecture_type = None
            # The first and last rows are skipped (presumably header and
            # footer rows -- TODO confirm against the page markup).
            for tr in table.cssselect('tr')[1:-1]:
                data = parse_row(tr, room_codes)
                if not data:
                    continue

                if data.get('lecture_type'):
                    lecture_type = data['lecture_type']
                else:
                    data.update({'course': course, 'type': lecture_type})
                    yield data

コード例 #3

0

ファイルを表示

    def scrape(self):
        """Yield ``code``/``name``/``url`` dicts for rooms that lack a URL.

        Only rooms used by lectures in ``self.semester`` are considered, and
        of those only rooms that have a code but no URL yet.  Each room is
        looked up by code in the JSON room service; the first returned room
        supplies the building id, room number and name used to build a map
        URL.  When the map page can be fetched, its canonical link replaces
        the constructed URL.

        Rooms for which the service returns nothing, or an empty room list,
        are skipped.
        """
        buildings = fetch_buildings()

        qs = self.queryset()
        qs = qs.filter(lecture__course__semester=self.semester)
        qs = qs.distinct()

        for code, name, url in qs.values_list('code', 'name', 'url'):
            # Parsed as ``(not code) or url``: skip rooms without a code and
            # rooms that already have a URL.
            if not code or url:
                continue

            data = fetch.json(BASE + '/fdv/rooms/lydiacode:%s' % code)
            # An empty or missing room list would otherwise raise on ``[0]``.
            if not data or not data.get('rooms'):
                continue

            room = data['rooms'][0]
            url = 'http://www.ntnu.no/kart/%s/%s' % (
                buildings[room['buildingId']], room['nr'])
            name = (room['name'] or '').strip() or 'Rom %s' % room['nr']

            root = fetch.html(url)
            # Compare against None explicitly: truth-testing an lxml element
            # reports whether it has children, not whether it exists.
            if root is not None:
                for link in root.cssselect('link[rel="canonical"]'):
                    if link.attrib['href'] != 'http://www.ntnu.no/kart':
                        url = link.attrib['href']

            yield {'code': code, 'name': name, 'url': url}

コード例 #4

0

ファイルを表示

def fetch_university(name_re):
    """Return the form value of the first university matching ``name_re``.

    The akademika.no curriculum page is fetched with ``cache=False`` and the
    options of its ``select_university`` drop-down are searched in document
    order.

    :param name_re: pattern (string or compiled) passed to ``re.search``.
    :returns: the ``value`` attribute of the first option whose text
        matches, or ``None`` when the page could not be fetched or no
        option matches.
    """
    root = fetch.html('http://www.akademika.no/pensum', cache=False)
    if root is None:
        return None

    for option in root.cssselect('select[name="select_university"] option'):
        # ``option.text`` is None for an option without text; handing that
        # to re.search would raise a TypeError.
        if option.text is not None and re.search(name_re, option.text):
            return option.attrib['value']
    return None

コード例 #5

0

ファイルを表示

def fetch_rooms():
    """Yield ``(code, name)`` pairs from the NTNU room selector.

    The options of the ``romnr`` drop-down on the room information page are
    read; the code is the option's ``value`` attribute and the name is its
    text, both passed through ``utils.clean_string``.  Options whose cleaned
    code or name is empty, or whose name contains ``'ikkerom'``, are
    skipped.

    Nothing is yielded when the page could not be fetched.
    """
    result = fetch.html('http://www.ntnu.no/studieinformasjon/rom/')
    if result is None:
        return

    for option in result.cssselect('.hovedramme select[name="romnr"] option'):
        code = utils.clean_string(option.attrib['value'])
        name = utils.clean_string(option.text_content())

        if code and name and 'ikkerom' not in name:
            yield code, name

コード例 #6

0

ファイルを表示

def fetch_room(url):
    """Return ``(code, name)`` for the room shown on a map page.

    The name is the text of the first ``.ntnukart h2`` heading.  The code
    is the text inside the first pair of parentheses of the first building
    image caption that has the form ``<text>(<code>)``.

    :param url: address of the map page to fetch.
    :returns: ``(code, name)``, or ``(None, None)`` when the page could not
        be fetched, has no heading, or has no caption with a parenthesised
        code.
    """
    root = fetch.html(url)
    if root is None:
        return None, None

    # A page without the heading has no room to report; use the same
    # "not found" result as the other failure cases instead of raising.
    headings = root.cssselect('.ntnukart h2')
    if not headings:
        return None, None
    name = headings[0].text_content()

    for div in root.cssselect('.ntnukart .buildingimage .caption'):
        match = re.match(r'[^(]+\(([^)]+)\)', div.text_content())
        if match:
            return match.group(1), name

    return None, None

コード例 #7

0

ファイルを表示

def fetch_node(pack):
    """Return the akademika.no node URL for ``pack``, or ``None``.

    The product listing for the pack is fetched and the first element whose
    ``id`` contains ``node-`` is used; the node identifier is the second
    dash-separated piece of that ``id``.  ``None`` is returned when the page
    could not be fetched, no such element exists, or the identifier is
    empty.
    """
    listing_url = (
        'http://www.akademika.no/pensumlister/load_products2/%s' % pack)
    root = fetch.html(listing_url)
    if root is None:
        return None

    matches = root.cssselect('[id*="node-"]')
    if not matches:
        return None

    element_id = matches[0].attrib['id']
    node_id = element_id.split('-')[1]
    if not node_id:
        return None
    return 'http://www.akademika.no/node/%s' % node_id

コード例 #8

0

ファイルを表示

def fetch_packs(university, study, semester):
    """Yield ``(course, rel)`` pairs for the packs of a study programme.

    The akademika.no product listing is requested with the given
    ``university``, ``study`` and ``semester`` as query values.  For every
    ``.packlink`` element the course is the link text up to the first
    space, with a trailing ``NTNU`` removed, and ``rel`` is the link's
    ``rel`` attribute.

    Nothing is yielded when the listing could not be fetched.
    """
    suffix = 'NTNU'
    params = {
        'university': university,
        'study': study,
        'semester': semester
    }

    root = fetch.html('http://www.akademika.no/pensumlister/load_products',
                      query=params)
    if root is None:
        return

    for anchor in root.cssselect('.packlink'):
        # Everything before the first space is the course identifier.
        course = anchor.text.partition(' ')[0]
        if course.endswith(suffix):
            course = course[:-len(suffix)]
        yield course, anchor.attrib['rel']

コード例 #9

0

ファイルを表示

    def scrape(self):
        """Yield a dict for every course taught in ``self.semester``.

        The NTNU course search is queried for a year (``self.semester.year``
        for a fall semester, the year before otherwise).  Each listed course
        link is followed for codes accepted by ``ntnu.valid_course_code`` and
        ``self.should_proccess_course``, and the ``.infoBox`` sections of the
        course page are turned into ``{box title: {key: [values]}}``.

        A course is yielded only when its ``Undervisning``/``Undervises``
        values contain the label for the requested semester.  Course pages
        and listings that cannot be fetched, and links the course-code
        pattern does not match, are skipped.

        Yields dicts with ``code``, ``name``, ``version``, ``points`` and
        ``url``.

        Raises:
            KeyError, IndexError, ValueError: when a course page lacks a
                usable ``Versjon`` or ``Studiepoeng`` entry under
                ``Fakta om emnet``.
        """
        if self.semester.type == Semester.FALL:
            year = self.semester.year
        else:
            year = self.semester.year - 1

        # The label that must be listed for the course to count as taught
        # this semester; computed once since it does not vary per course.
        # NOTE(review): the spring label uses the decremented year as well
        # -- confirm this matches what the course pages actually list.
        if self.semester.type == Semester.FALL:
            wanted = u'HØST %s' % year
        elif self.semester.type == Semester.SPRING:
            wanted = u'VÅR %s' % year
        else:
            wanted = None

        code_re = re.compile('/studier/emner/([^/]+)/', re.I | re.L)

        url = 'http://www.ntnu.no/web/studier/emnesok'
        query = {
            'p_p_lifecycle': '2',
            'p_p_id': 'courselistportlet_WAR_courselistportlet_INSTANCE_m8nT',
            '_courselistportlet_WAR_courselistportlet_INSTANCE_m8nT_year': year
        }

        courses_root = fetch.html(url, query=query, verbose=True)
        if courses_root is None:
            # Without the listing there is nothing to scrape.
            return

        for a in courses_root.cssselect('a[href*="/studier/emner/"]'):
            course_url = a.attrib['href']

            # The selector only guarantees a substring; the pattern also
            # needs a code segment followed by a slash.
            match = code_re.search(course_url)
            if match is None:
                continue
            code = match.group(1)
            name = a.text_content()

            if not ntnu.valid_course_code(code):
                continue
            elif not self.should_proccess_course(code):
                continue

            quoted_code = urllib.quote(code.encode('utf-8'))
            root = fetch.html('http://www.ntnu.no/studier/emner/%s/%s' %
                              (quoted_code, year))
            if root is None:
                continue

            # Construct dict out of info boxes.
            title = None
            data = {}
            for box in root.cssselect('.infoBox'):
                for child in box:
                    if child.tag == 'h3':
                        # Headings name the section for the entries after it.
                        title = child.text_content()
                        continue

                    # The element's own text plus the text following each of
                    # its children (line breaks, judging by the original
                    # variable name) form the candidate "key: value" parts.
                    parts = [child.text or '']
                    parts.extend(br.tail or '' for br in child)
                    for part in parts:
                        if ':' not in part:
                            continue
                        key, value = part.split(':', 1)
                        key = key.strip(u' \n\xa0')
                        value = value.strip(u' \n\xa0')
                        section = data.setdefault(title, {})
                        section.setdefault(key, []).append(value)

            try:
                semesters = data['Undervisning']['Undervises']
            except KeyError:
                continue

            if wanted is not None and wanted not in semesters:
                continue

            yield {
                'code': code,
                'name': name,
                'version': int(data['Fakta om emnet']['Versjon'][0]),
                'points': float(data['Fakta om emnet']['Studiepoeng'][0]),
                'url': course_url
            }