def parseCampus(self, response: scrapy.http.TextResponse):
        semesterUrls = response.xpath('''
            //div[@class="thirteen wide column"]
            /div[@class="ui"]
            /a
            /@href
        ''').getall()
        # print(semesterUrls)

        yield from response.follow_all(semesterUrls,
                                       callback=self.parseSemester)
Example #2
    def parse(self, response: scrapy.http.TextResponse):
        departmentUrls = response.xpath('''
            //div[@class="twelve wide column"]
            /div[@class="ui list"]
            /a[@class="item"]
            /@href
        ''').getall()
        # print(departmentUrls)

        yield from response.follow_all(departmentUrls,
                                       callback=self.parseDepartment)
    def parse(self, response: scrapy.http.TextResponse):
        # campusUrls will contain 'https://policy.fit.edu/Schedule-of-Classes'.
        # That off-site URL is automatically filtered out by self.allowed_domains.
        campusUrls = response.xpath('''
            //div[@class="three wide column"]
            /div[@id="sub-nav"]
            /a
            /@href
        ''').getall()
        # print(campusUrls)

        yield from response.follow_all(campusUrls, callback=self.parseCampus)
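For context, these directory/schedule callbacks assume a spider class along these lines. This is a minimal sketch; the name, domain, and entry URL are assumptions, since none of them appear in the excerpts:

import scrapy

class SchedulesSpider(scrapy.Spider):
    # Minimal skeleton (assumed, not from the original source) showing how
    # the callbacks above chain together. allowed_domains is what silently
    # drops the off-site policy.fit.edu link mentioned in the comment above.
    name = 'schedules'                                  # assumption
    allowed_domains = ['apps.fit.edu']                  # assumption: any host other than policy.fit.edu
    start_urls = ['https://apps.fit.edu/schedule-of-classes/']  # assumed entry point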
Example #4
    def parse_actors(self, response: scrapy.http.TextResponse, film_data: Dict,
                     **kwargs):
        film_data['actors'] = []
        actors_block = response.xpath('//div[@class="block_left"]/*')
        actor_type = 'Not specified'
        buf_type: str
        actor_fio: str
        for i in actors_block:
            if i.root.tag == 'a':
                buf_type = i.xpath('@name').get()
                if buf_type is None:
                    self.logger.warning(
                        f"Failed to add the actor's role: {str(i)}")
                else:
                    actor_type = buf_type
            if len(i.css('div.dub')) == 1:
                actor_fio = i.css('div.name a::text').get()
                if actor_fio is None:
                    self.logger.warning(f'Failed to add actor: {str(i)}')
                else:
                    film_data['actors'].append({
                        'type': actor_type,
                        'fio': actor_fio
                    })
        self.logger.debug(f'Film {film_data["title"]} loaded.')
        yield film_data
    def parse(self, response: scrapy.http.TextResponse):
        """
        Parse the search page. Looks for item specific page and the next page in the search results
        """
        if response.status in [301, 302, 503] and 'Location' in response.headers:
            yield scrapy.Request(response.headers['location'], callback=self.parse)
            return

        for item in response.xpath('//p[@class="productimage"]//a/@href').extract():
            item_url = response.urljoin(item)
            yield scrapy.Request(item_url, callback=self.parse_item)

        next_page = response.xpath('//div[@class="pagination"]//a[@class="pagenext"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
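Note that Scrapy's RedirectMiddleware normally follows 301/302 (and RetryMiddleware retries 503) before the callback ever runs, so the branch above only fires if those statuses are surfaced to the spider. One way to do that is the standard handle_httpstatus_list spider attribute, sketched here:

    # Let 301/302/503 responses reach parse() instead of being consumed by
    # the redirect/retry/httperror middlewares (real Scrapy spider attribute).
    handle_httpstatus_list = [301, 302, 503]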
    def parseDepartment(self, response: scrapy.http.TextResponse):
        departmentCode = response.url[
            len('https://directory.fit.edu/department/'):]

        employeeRows = response.xpath('''
            //div[@class="twelve wide column"]
            /table[@class="ui celled table" and position()=2]
            /tr
        ''')

        for employeeRow in employeeRows:
            employee = {'departmentCode': departmentCode}

            for attribute in self.employeeAttributes:
                key: str = attribute['key']
                xpath: str = attribute['xpath']

                value: Optional[str] = employeeRow.xpath(xpath).get()
                employee[key] = value

            # If room starts with 'Room: ', trim it
            room: Optional[str] = employee['room']
            if room is not None and room.startswith('Room: '):
                employee['room'] = room[len('Room: '):]

            # print(employee)
            yield employee
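self.employeeAttributes is not shown in this excerpt; judging from how it is consumed ('key' plus 'xpath' per entry, with a 'room' field post-processed above), it plausibly looks like the sketch below. All XPaths, and every field name except 'room', are assumptions:

    # Assumed shape of self.employeeAttributes: one output field per entry,
    # extracted from each employee <tr> by a relative XPath.
    employeeAttributes = [
        {'key': 'name', 'xpath': 'td[1]//text()'},   # assumption
        {'key': 'phone', 'xpath': 'td[2]//text()'},  # assumption
        {'key': 'room', 'xpath': 'td[3]//text()'},   # 'room' is used above
    ]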
Example #7
    def parseDepartment(self, response: scrapy.http.TextResponse):
        # print(response.url)

        # It's not guaranteed all fields are present on the page
        headers = response.xpath('''
            //div[@class="twelve wide column"]
            /table[@class="ui celled table" and position()=1]
            //th
            /text()
        ''').getall()
        # print(headers)

        tdTags = response.xpath('''
            //div[@class="twelve wide column"]
            /table[@class="ui celled table" and position()=1]
            //td
        ''')
        # print(tdTags)

        name: str = response.xpath('''
            //div[@class="twelve wide column"]
            /h2
            /text()
        ''').get()

        department = {
            'name': name,
            'code': response.url[len('https://directory.fit.edu/department/'):]
        }

        for attribute in self.departmentAttributes:
            header: str = attribute['header']
            xpath: str = attribute['xpath']
            key: str = attribute['key']

            # If we don't see the header, set field to default value
            if header not in headers:
                department[key] = None
                continue

            # Otherwise extract the string
            index = headers.index(header)
            value = tdTags[index].xpath(xpath).get()
            department[key] = value

        # print(department)
        yield department
Example #8
    def parse(self, response: scrapy.http.TextResponse):
        """
        Parse the search page. Looks for item specific page and the next page in the search results
        """
        if response.status in [301, 302] and 'Location' in response.headers:
            yield scrapy.Request(response.headers['location'],
                                 callback=self.parse)
            return

        for item in response.xpath(
                '//div[@id="resultsCol"]//li//div[@class="a-row a-spacing-small"]//a/@href'
        ).extract():
            item_url = response.urljoin(item)
            yield scrapy.Request(item_url, callback=self.parse_item)

        next_page = response.xpath(
            '//a[@id="pagnNextLink"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
Example #9
    def parse_film(self, response: scrapy.http.TextResponse, **kwargs):
        title = None
        try:
            title = response.xpath(
                "//span[@class='styles_title__2l0HH']/text()").get()
            release_date = response.xpath(
                '//div[text()="Год производства"]/../div[2]/a/text()').get()
            country = response.xpath(
                '//div[text()="Страна"]/../div[2]/a/text()').get()
            box_office = response.xpath(
                '//div[text()="Сборы в мире"]/../div[2]/a/text()').get()

            if title is None:
                raise Exception('Failed to get the title')
            if country is None:
                country = 'Not specified'
                self.logger.warning('Failed to get the country')
            if release_date is None:
                release_date = '0000'
                self.logger.warning('Failed to get the production year')
            if box_office is None:
                box_office = '0'
                self.logger.warning('Failed to get the worldwide box office')
            else:
                buf = box_office.rfind('$')
                box_office = ''.join(box_office[buf + 1:].split())
            film_data = {
                'title': title,
                'release_date': release_date,
                'country': country,
                'box_office': box_office
            }
            yield scrapy.Request(response.url + 'cast/',
                                 callback=self.parse_actors,
                                 cb_kwargs={'film_data': film_data})

        except Exception as e:
            self.logger.warning(
                f'Cannot parse film_data: {title if title else "title not determined"}'
            )
            self.logger.warning(str(e))
Example #10
    def parse(self, response: scrapy.http.TextResponse):
        self.log("Parse")
        self.log(response.url)
        title = response.xpath("//title/text()")[0].get()
        name = response.xpath(
            '//*[@id="quote-header-info"]/div[2]/div[1]/div[1]/h1/text()'
        )[0].get()
        price = response.xpath(
            '//*[@id="quote-header-info"]/div[3]/div/div/span[1]/text()'
        )[0].get()
        ticker_from_url = response.url.split("?")[0].split("/")[-1]

        try:
            company_name, ticker_symbol = parse_name(name)
        except ValueError:
            item = StockScraperItem(name=name,
                                    price=price,
                                    ticker=ticker_from_url)
        else:
            item = StockScraperItem(name=company_name,
                                    price=price,
                                    ticker=ticker_symbol)
        yield item
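parse_name is defined elsewhere; from the ValueError fallback it evidently splits a Yahoo-Finance-style heading such as 'Apple Inc. (AAPL)' into company name and ticker symbol. A hypothetical sketch:

import re
from typing import Tuple

def parse_name(name: str) -> Tuple[str, str]:
    # Hypothetical helper: 'Apple Inc. (AAPL)' -> ('Apple Inc.', 'AAPL').
    # Raises ValueError when the heading carries no '(TICKER)' suffix.
    match = re.fullmatch(r'(.+?)\s*\((\S+)\)', name.strip())
    if match is None:
        raise ValueError(f'No ticker symbol found in {name!r}')
    return match.group(1), match.group(2)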
    def parsePawsCourse(self, response: scrapy.http.TextResponse,
                        course: dict):
        # print(response.url)

        # Get all lines of texts on the page
        lines: List[str] = response.xpath('''
            //table[@class="datadisplaytable" and @summary="This table lists the course detail for the selected term."]
            //td[@class="ntdefault"]
            //text()
        ''').getall()

        # Strip all lines
        lines = [line.strip() for line in lines]

        # Using the 'Reverse & Pop' mechanism to process data
        lines.reverse()

        # Fill course with default data
        course.update({
            attribute['key']: attribute['default']
            for attribute in self.courseAttributes
        })

        # Process lines
        while lines:
            line: str = lines.pop()

            # Skip empty lines
            if line == '':
                continue

            # Call parse functions on the line
            for attribute in self.courseAttributes:
                key: str = attribute['key']
                parseFn: Callable[[str, List[str], dict],
                                  None] = attribute['parseFn']
                default = attribute['default']

                # If the value is still default, call parse function to update the course
                # Parse function will change the course if the line is for the function
                # Otherwise it does nothing
                if course[key] == default:
                    parseFn(line, lines, course)

        # print(course)
        yield course
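self.courseAttributes is also defined outside this excerpt; from its use above, each entry carries a 'key', a 'default', and a 'parseFn' that mutates the course dict when it recognizes a line. A sketch under those assumptions (the credit-hours line format is hypothetical):

from typing import List

def parseCredits(line: str, lines: List[str], course: dict) -> None:
    # Hypothetical parse function for lines like '3.000 Credit hours';
    # a parseFn may also pop lookahead lines from `lines` when a value
    # spans multiple lines.
    if line.endswith('Credit hours'):
        course['credits'] = float(line.split()[0])

courseAttributes = [
    {'key': 'credits', 'default': None, 'parseFn': parseCredits},  # assumption
]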
Example #12
    def parse(self, response: scrapy.http.TextResponse):
        """
        1、获取文章列表页中的文章url并交给scrapy,下载后并进行解析
        2、获取下一页的url并交给scrapy进行下载,下载完成后交给parse

        """
        # Extract the next page and hand it to Scrapy for download
        next_page = response.meta.get("next_page", 0)+1
        if next_page <= 10:
            next_url = f"http://blog.jobbole.com/kaifadou/snews-getajax.php?next={next_page}"
            yield Request(url=next_url, meta={"next_page": next_page}, callback=self.parse)

        # Parse all article URLs on the list page and hand them to Scrapy to download and parse
        post_nodes = response.css(".zhicheng_news_list a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # new_url = response.url + post_url
            # parse.urljoin(response.url, post_url) joins the base and relative URLs
            yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": parse.urljoin(response.url, image_url)}, callback=self.parse_detail)
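This example calls Request and parse.urljoin without qualification; the imports it assumes are:

from urllib import parse  # parse.urljoin joins a base URL and a relative URL
from scrapy import Request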
Example #13
    def parse(self, response: scrapy.http.TextResponse, **kwargs):
        all_a = response.xpath('//a')
        # If an href matches film/<int_id>/, it is a film link
        films = [
            a.attrib['href'] for a in all_a
            if self.re_film.match(a.attrib.get('href', '')) is not None
        ]
        for film in films:
            f = response.urljoin(film)
            yield scrapy.Request(f, callback=self.parse_film)
        # 'Вперёд' ("Next") is the pagination link text on the source site
        next_page = response.xpath(
            "//a[contains(text(),'Вперёд')]/@href").get()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
        if settings.DEEP_SCAN:
            all_pages = response.xpath('//a/@href').getall()
            for page in all_pages:
                next_page = response.urljoin(page)
                yield scrapy.Request(next_page, callback=self.parse)
    def parseSectionTable(self, response: scrapy.http.TextResponse):
        nextPageUrls = response.xpath('''
            //div[@class="thirteen wide column"]
            /div[@class="ui pagination menu"]
            /a
            /@href
        ''').getall()
        # print(nextPageUrls)

        yield from response.follow_all(nextPageUrls,
                                       callback=self.parseSectionTable)

        h2Text: str = response.xpath('''
            //div[@class="thirteen wide column"]
            /h2
            /text()
        ''').get()

        # This happens when the page says 'There are currently no available classes for this term.'
        # For example, 'Fort Lee, VA Class Schedule: Summer' in June 2020
        try:
            triplet = re.match(  # AttributeError
                r'(.+) Class Schedule: (spring|summer|fall) (\d{4})',
                h2Text).groups()
        except AttributeError:  # 'NoneType' object has no attribute 'groups'
            return

        location: str = triplet[0]
        semester: str = triplet[1]
        year = int(triplet[2])
        # print(location, semester, year)

        # Semesters and campuses can have different headers:
        # Summer has 'session'
        # Non-main campuses have 'syllabus'
        headers = response.xpath('''
            //table[@class="ui small compact celled table"]
            //th
            /text()
        ''').getall()
        # print(headers)

        # Take out every cell in a flattened array
        sectionData = response.xpath('''
            //table[@class="ui small compact celled table"]
            //td
        ''')

        # Make sure the numbers match; otherwise the operations that come next will break
        assert len(sectionData) % len(headers) == 0

        # Reverse the list so that it pops each item in the correct order
        sectionData.reverse()

        # It will be empty when all rows have been parsed
        while sectionData:
            section = {
                'location': location,
                'semester': semester,
                'year': year
            }

            for attribute in self.sectionAttributes:
                header: str = attribute['header']
                key: str = attribute['key']
                default = attribute['default']
                xpath: str = attribute['xpath']
                parseFn = attribute['parseFn']

                # Skip parsing if table does not have the attribute
                if header not in headers:
                    section[key] = default
                    continue

                # Take out a cell
                tableData: scrapy.Selector = sectionData.pop()

                # Do parsing
                if xpath is not None:
                    value: str = tableData.xpath(xpath).get(default=default)
                    if isinstance(value, str):
                        value = value.strip()
                        if value == '':
                            value = None
                else:
                    value = parseFn(tableData)

                section[key] = value

            # Postprocessing
            section['crn'] = int(section['crn'])

            # print(section)
            yield section
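self.sectionAttributes is not shown either; from its use above, each entry names the table header it binds to, the output key, a default, and either an XPath or a parseFn for cells that need custom handling. A sketch under those assumptions (headers, paths, and the parseFn are illustrative; only 'crn' is grounded in the code above):

    # Assumed shape of self.sectionAttributes: exactly one of 'xpath' or
    # 'parseFn' is non-None per entry.
    sectionAttributes = [
        {'header': 'CRN', 'key': 'crn', 'default': None,
         'xpath': './/text()', 'parseFn': None},  # 'crn' is post-processed above
        {'header': 'Instructor', 'key': 'instructor', 'default': None,
         'xpath': None,
         'parseFn': lambda td: td.xpath('.//a/text()').get()},  # assumption
    ]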
    def parseSemester(self, response: scrapy.http.TextResponse):
        sectionTableUrls = [f'{response.url}?page=1']
        # print(sectionTableUrls)

        yield from response.follow_all(sectionTableUrls,
                                       callback=self.parseSectionTable)
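Finally, note that parseSectionTable follows every link in the pagination menu from every page, so already-visited pages get requested again; Scrapy's default request dupefilter is what keeps that from looping. Spelling out the default (a real setting, shown only for clarity):

    # RFPDupeFilter is Scrapy's default: duplicate pagination requests are
    # dropped by URL fingerprint, so the mutual follows terminate.
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',  # default value
    }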