Code Example #1
    def parse(self, response: scrapy.http.response.Response):

        base_css = "[data-test=qsp-financial] tbody "

        datetimes = response.css(
            base_css +
            'tr:first-child td:not(:first-child) span::text').extract()
        labels = response.css(
            base_css +
            'tr:not(:first-child) td:first-child:not([colspan]) span::text'
        ).extract()
        values = response.css(
            base_css +
            'tr:not(:first-child) td:not(:first-child) ::text').extract()

        datetimes = [dt.replace('/', '-') for dt in datetimes]

        symbol = response.request.url.split('=')[1]
        target_file = os.path.join(YahooFinanceSpider.target_dir,
                                   symbol + '.csv')

        current_label_index = -1
        current_datetime_index = -1
        datetimes_len = len(datetimes)

        # One row per label, one column per reporting date; `values` is laid
        # out row by row in the same order.
        df = pd.DataFrame(index=labels, columns=datetimes)

        for i in range(len(values)):

            current_datetime_index += 1
            if i % datetimes_len == 0:
                current_label_index += 1
                current_datetime_index = 0

            val = str(values[i]).replace('-', '').replace(',', '')
            if val != '':
                # TODO: check if all numbers are in thousands
                val = int(float(val) * 1000)

            # Single .loc assignment instead of chained indexing, so pandas'
            # SettingWithCopy warning no longer needs to be silenced.
            df.loc[labels[current_label_index],
                   datetimes[current_datetime_index]] = val

        mode = 'w'
        header = True
        if os.path.isfile(target_file):
            mode = 'a'
            header = False

        if df.shape[0] != 0 and df.shape[1] != 0:
            with open(target_file, mode) as f:
                df.to_csv(f, header=header)
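A note on this example: the cell-by-cell loop assumes `values` holds exactly `len(labels) * len(datetimes)` entries in row-major order. Under that same assumption the frame can be built in one step; a minimal sketch (not part of the original spider):

import numpy as np
import pandas as pd


def build_frame(labels, datetimes, values):
    """Row-major reshape of the scraped cells into a labels x dates frame."""
    def clean(v):
        # Mirror the per-cell cleaning done in the loop above.
        v = str(v).replace('-', '').replace(',', '')
        return int(float(v) * 1000) if v != '' else v

    cells = np.array([clean(v) for v in values], dtype=object)
    return pd.DataFrame(cells.reshape(len(labels), len(datetimes)),
                        index=labels, columns=datetimes)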
Code Example #2
File: biznesradar.py  Project: banasmat/gpw-ml
    def parse(self, response: scrapy.http.response.Response):
        print(response.url)

        print('gathering links', response.url)
        self.company_links[response.url] = response.css(
            '.qTableFull tr td:first-child a::attr(href)').extract()

        # Continue only when all company_links are gathered
        can_continue = True
        for start_url in self.start_urls:
            if start_url not in self.company_links:
                print('Not all company links yet gathered', response.url)
                can_continue = False
                break

        if can_continue:

            print('All links gathered. Proceeding.')

            company_links = []
            # Organize links in correct order (same as start_urls)
            for start_url in self.start_urls:
                company_links += self.company_links[start_url]

            links_len = len(company_links)
            for i, link in enumerate(company_links):
                # print(self.url_base + link + self.suffix)
                yield scrapy.Request(self.url_base + link + self.suffix,
                                     self.parse_company_page,
                                     priority=links_len - i)
            print('Scheduled all requests. Total', links_len)
Code Example #3
    def parse(self, response: scrapy.http.response.Response):

        key_words = ['望京', '望馨花园', '望馨园', '东湖渠']

        send = SendEmail()

        history = []

        # Load previously sent links; strip newlines so the membership check
        # below compares bare URLs.
        try:
            with open('history.txt') as f:
                history = [line.strip() for line in f if line.strip()]
        except FileNotFoundError:
            pass
        if not history:
            self.log('History is empty', level=logging.WARNING)

        page = response.css('td.title')
        for i in page:
            title = i.css('a::text').extract_first().strip()
            link = i.css('a::attr(href)').extract_first()
            self.log('Listing title: {0}'.format(title), level=logging.WARNING)
            self.log('Listing link: {0}'.format(link), level=logging.WARNING)
            email_message = 'Listing title: {0}\nListing link: {1}'.format(
                title, link)
            for j in key_words:
                if j in title and link not in history:
                    # QQ Mail throttles outgoing messages, so until a better
                    # approach is found, just sleep between sends.
                    time.sleep(10)
                    send.send_email('', email_message)
                    history.append(link)
                    with open('history.txt', 'w') as f:
                        f.write('\n'.join(history) + '\n')
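The `SendEmail` helper used above is not shown in the source. Assuming it only needs a `send_email(subject, message)` method, a minimal sketch over QQ Mail's SMTP endpoint might look like this (account, authorization code and recipient are placeholders, not values from the original project):

import smtplib
from email.mime.text import MIMEText


class SendEmail:
    """Minimal SMTP helper; credentials and recipient are placeholders."""

    def __init__(self, account='user@qq.com', password='auth-code',
                 to='user@qq.com'):
        self.account = account
        self.password = password
        self.to = to

    def send_email(self, subject, message):
        msg = MIMEText(message, 'plain', 'utf-8')
        msg['Subject'] = subject
        msg['From'] = self.account
        msg['To'] = self.to
        # QQ Mail exposes SMTP over SSL on port 465.
        with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
            server.login(self.account, self.password)
            server.sendmail(self.account, [self.to], msg.as_string())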
Code Example #4
    def parse_list(self, response: scrapy.http.response.Response):
        # Article links on the listing page; extracted here but not yet
        # yielded or followed anywhere in this snippet.
        art_link_selectors = response.xpath(
            '//*[@id="J_main-container"]//h2[@class="post-title"]/a')
        for art_link_selector in art_link_selectors:
            link = art_link_selector.xpath('@href').get()
            title = art_link_selector.xpath('text()').get()

        # The "second page" button on the home page
        second_page = response.xpath('//*[@id="J_main-container"]'
                                     '//a[contains(@class, "home-browser-more-btn")]/@href').get()
        if second_page:
            yield response.follow(second_page, callback=self.parse_list)

        # Regular pagination: the "Next" button
        next_page = response.xpath('//*[@id="J_main-container"]//ul[@class="pagination"]'
                       '/li[not(contains(@class, "disabled"))]/a[@aria-label="Next"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)
Code Example #5
    def parse_ticker_page(self, response: scrapy.http.response.Response):

        self.parse_price_page(response)
        next_page_href = response.css('.pages_right::attr(href)').extract_first()

        if next_page_href:
            # NOTE: time.sleep blocks Scrapy's reactor; DOWNLOAD_DELAY is the
            # usual way to throttle requests.
            time.sleep(1)
            return scrapy.Request(self.url_base + next_page_href,
                                  self.parse_ticker_page)
Code Example #6
 def GetMovieCountry(response: scrapy.http.response.Response):
     """
     解析制片国
     :param response: scrapy返回的response
     :return: 制片国家
     """
     return response.xpath(
         "//div[@id='info']/span[text()='制片国家/地区:'][1]/following-sibling::text()[1]"
     ).extract_first(default="").strip()
Code Example #7
    def GetRateInfo(response: scrapy.http.response.Response):
        """
        解析评分信息
        :param response: scrapy返回的response
        :return: 评分人数,评分信息
        """
        rateNumber = int(
            response.xpath("//a[@class='rating_people']/span/text()").
            extract_first(default=0))

        rateDetails_dict = dict()
        # stars1..stars5 correspond to the 1-5 star rating buckets.
        for star_num in range(1, 6):
            rate = response.xpath(
                "//span[@class='stars{} starstop']/../span[@class='rating_per']/text()"
                .format(star_num)).extract_first(default="0")
            rateDetails_dict[star_num] = float(rate.strip('%')) / 100.0
        return rateNumber, rateDetails_dict
Code Example #8
    def parse_subpage(self, response: scrapy.http.response.Response):
        links = response.css("#main-content a::attr(href)").extract()

        # Financial report sub-pages are named R<number>.htm
        pattern = re.compile(r"^R\d+\.htm$")

        for link in links:
            filename = link.rsplit('/', 1)[-1]
            if pattern.match(filename):
                yield scrapy.Request(self.url_base + link, self.get_data)
Code Example #9
    def parse(self, response: scrapy.http.response.Response):
        next_page = response.xpath(
            '//div[@class="navigation-wrapper"]/div/a[@class="next"]/@href'
        ).get()
        if next_page:
            print(next_page)
            self.count += 1
            if self.count < 20:
                yield response.follow(next_page, callback=self.parse)

        desc = response.xpath('//meta[@name="description"]/@content').get()
        tags = response.xpath('//span[@class="tag-links"]/a/text()').getall()
        res = self.extractor.extract(response.text)
        yield MeituanArticleSpiderItem(url=response.url,
                                       title=res['title'],
                                       content=res['content'],
                                       tags=tags,
                                       author=res['author'],
                                       publish_time=res['publish_time'])
Code Example #10
    def _parse_role(response: scrapy.http.response.Response) -> Dict[str, str]:
        """
        Extract Sphinx role from a crawled page.

        Valid roles:
            - function
            - class
            - module

        Args:
            response: Response of the crawled documentation page.

        Returns:
            Dict containing the object's name, url, and role.

        """
        url = response.url

        name_query = "//h1/text()"
        name = response.xpath(name_query).get()

        if url in (
                "https://www.tensorflow.org/api_docs/python/tf",
                "https://www.tensorflow.org/probability/api_docs/python/tfp",
        ):
            return {"name": name, "url": url, "role": "package"}

        section_query = "//h2/text()"
        sections = response.xpath(section_query).getall()

        if "Module" in name.split(": "):
            role = "module"
            name = name.split(": ")[-1]
        elif "Attributes" in sections or "Methods" in sections:
            role = "class"
        else:
            # If the object is not a Module or a Class then it is a function.
            role = "function"

        return {"name": name, "url": url, "role": role}
Code Example #11
    def parse(self, response: scrapy.http.response.Response) -> scrapy.Request:
        """
        Main scrapy parser

        :param response: scrapy response object
        :return: new scrapy request
        """
        for url in response.xpath(
                '//ul[@class="fl_titlelist"]/li/div[@class="fl_name"]/a/@href'
        ):
            url_val = url.extract()
            if url_val and url_val.strip('/') in self.already_harvested:
                continue
            else:
                yield scrapy.Request(
                    url=url_val,
                    callback=self.parse_item,
                    cb_kwargs={'on_netflix': '/netflix/' in response.url})
        next_url = response.xpath(
            '//li[@class="page-item"]/a[text() = "Next"]/@href').extract_first(
            )
        if next_url:
            yield scrapy.Request(url=next_url, callback=self.parse)
Code Example #12
 def GetActorsInfo(response: scrapy.http.response.Response):
     """
     解析演员信息
     :param response: scrapy返回的response
     :return: 演员信息字典
     """
     try:
         actor_info_list = response.xpath("//span[@class='actor']//a")
         return {
             actor_info.xpath("text()").extract_first():
             actor_info.xpath("@href").extract_first()
             for actor_info in actor_info_list
         }
     except Exception:
         return dict()
Code Example #13
    def _parse_role(response: scrapy.http.response.Response) -> Dict[str, str]:
        """
        Extract Sphinx role from a crawled page.

        Valid roles:
            - function
            - class
            - module

        Args:
            response: Response of the crawled documentation page.

        Returns:
            Dict containing the object's name, url, and role.

        """
        url = response.url

        name_query = "//h1/text()"
        name = response.xpath(name_query).get()

        if url == "https://www.tensorflow.org/api_docs/python/tf":
            # Return the same dict shape as the other branches.
            return {"name": name, "url": url, "role": "package"}

        class_selector = response.xpath("//h2/text()").get()

        if "Module" in name.split(": "):
            role = "module"
            name = name.split(": ")[-1]
        elif class_selector == "Class ":
            role = "class"
        else:
            # If the object is not a Module or a Class then it is a function.
            role = "function"

        return {"name": name, "url": url, "role": role}
Code Example #14
 def GetDirectorOrAuthorInfo(key, response: scrapy.http.response.Response):
     """
     解析导演或编剧信息
     :param key: '导演'或'编剧'
     :param response: scrapy返回的response
     :return: 导演或编剧信息字典
     """
     try:
         info_list = response.xpath(
             "//div[@id='info']//span[text()='{key}']/following-sibling::span[1]/a"
             .format(key=key))
         return {
             info.xpath("text()").extract_first():
             info.xpath("@href").extract_first()
             for info in info_list
         }
     except Exception:
         return dict()
Code Example #15
 def parse(self, response: scrapy.http.response.Response, **kwargs):
     data_list = response.xpath('//*[@id="main-container"]/div[2]/ol/li')
     for data in data_list:
         item = CcspiderItem()
         item['title'] = data.xpath('.//p[1]/text()')[2].get().strip()
         item['authors'] = data.xpath('.//p[2]/a/text()').extract()
         date = utils.merge_text(data.xpath('.//p[4]/text()[last()]').get())
         date = date.split(' ')
         item['month'] = utils.month_to_int(date[0])
         item['year'] = int(date[1][:4])
         item['subjects'] = utils.deduplicate(
             data.xpath('.//div/div/span/@data-tooltip').extract())
         item['abstract'] = utils.merge_text(
             data.xpath('.//p[3]/span[3]/text()').get())
         item['citation'] = 0
         yield item
     print('Finished crawling {} pages ({} items in total)'.format(
         self.page, self.page * self.size))
     self.page += 1
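This snippet relies on three project-specific helpers from `utils` that are not shown here. Plausible implementations, stated as assumptions rather than the project's actual code:

import calendar


def merge_text(text):
    # Collapse runs of whitespace and strip the ends.
    return ' '.join(text.split()) if text else ''


def month_to_int(month_name):
    # Map an English month name or abbreviation ('Jan', 'January') to 1-12.
    return list(calendar.month_abbr).index(month_name[:3].title())


def deduplicate(items):
    # Drop duplicates while preserving the original order.
    return list(dict.fromkeys(items))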
Code Example #16
    def get_data(self, response: scrapy.http.response.Response):

        document_type = response.css('th.tl strong').extract_first()
        period_label = response.css('th.th::text').extract_first()
        dt = response.css('th.th div::text').extract_first()

        if period_label is None or document_type is None or dt is None:
            # print(period_label)
            # print(document_type)
            # print(dt)
            return

        document_type = document_type.lower()
        period_label = period_label.lower()

        period_labels = ['12 months ended']
        document_types = {
            'income_statement': 'consolidated statements of income',
            'balance_sheet': 'consolidated balance sheets',
            'cash_flow': 'consolidated statements of cash flows'
        }

        is_period_important = False
        is_document_important = False

        for p_label in period_labels:
            if p_label in period_label:
                is_period_important = True
                break

        for slug, d_type in document_types.items():
            if d_type in document_type:
                is_document_important = True
                break

        if is_period_important and is_document_important:
            if "thousand" in document_type:
                multiplier = 1000
            elif "million" in document_type:
                multiplier = 1000000
            elif "billion" in document_type:
                multiplier = 1000000000
            else:
                raise RuntimeError('No multiplier defined in ' + response.url +
                                   '. Document heading: ' + document_type)

            year = dt[-4:]
            cik = response.url.rsplit('/')[-3]

            fin_dict = {'cik': cik}

            records = response.css('tr')
            for record in records:
                record_title = record.css('td.pl a::text').extract_first()
                if record_title:
                    record_title = record_title.replace(',', '')
                    value = record.css('td.nump::text').extract_first()
                    # print(record_title, value)
                    if value:
                        matches = re.findall(r'[\d,]+', value)
                        if matches:
                            # Commas are thousands separators in SEC filings,
                            # so strip them before converting.
                            digit_val = float(
                                matches[0].replace(',', '')) * multiplier
                            fin_dict[record_title] = str(digit_val)

            file_path = os.path.join(self.output_dir, year + '.csv')
            mode = 'w'
            if os.path.isfile(file_path):
                mode = 'a'
            with open(file_path, mode) as f:
                print('Saving output to ' + file_path)
                #FIXME sort before saving
                w = csv.DictWriter(f, fin_dict.keys())
                # if mode == 'w':
                w.writeheader()
                w.writerow(fin_dict)
        else:
            pass
Code Example #17
    def parse(self, response: scrapy.http.response.Response):

        links = response.css("#main-content a::attr(href)").extract()
        for link in links:
            yield scrapy.Request(self.url_base + link, self.parse_subpage)
Code Example #18
    def parse_item(self, response: scrapy.http.response.Response,
                   on_netflix) -> CritickerMoviesItem:
        """
        Extract data from given item url

        :param response: scrapy response object
        :return: Criticker Movies item object
        """
        movie_data = CritickerMoviesItem()
        movie_data['on_netflix'] = int(on_netflix)
        movie_data['url'] = response.url.strip('/')
        movie_data['uid'] = self.extract_uid_from_url(movie_data['url'])
        movie_data['type'] = response.xpath(
            '//*[@id="fi_info_type"]/text()').extract_first()
        movie_data['name'] = response.xpath(
            '//h1/span[@itemprop="name"]/text()').extract_first()
        movie_data['date_published'] = response.xpath(
            '//h1/span[@itemprop="datePublished"]/text()').extract_first()
        movie_data['start_date'] = response.xpath(
            '//h1/span[@itemprop="startDate"]/text()').extract_first()
        movie_data['end_date'] = response.xpath(
            '//h1/span[@itemprop="endDate"]/text()').extract_first()
        movie_data['poster_url'] = response.xpath(
            '//div[@id="poster"]/img/@src').extract_first()
        movie_data['description'] = ' '.join([
            _.extract().strip()
            for _ in response.xpath('//span[@itemprop="description"]//text()')
        ]).strip()

        if not movie_data['description']:
            movie_data['description'] = None

        more_info_elem = response.xpath('//div[@id="fi_moreinfo"]')

        h = more_info_elem.xpath('./p')

        for i, hi in enumerate(h):
            hi_ = hi.attrib['id']
            label = self.extract_label_from_id(hi_)
            if 'aka' in label:
                movie_data[label] = response.xpath(
                    '//p[@id="{}"]/text()'.format(hi_)).extract_first()
            else:
                movie_data[label] = self.extract_more_info(hi)
        movie_data['trailer_url'] = response.xpath(
            '//div[@id="fi_trailer"]/iframe/@src').extract_first()
        if movie_data['trailer_url'] == 'http://www.youtube.com/watch?v=':
            movie_data['trailer_url'] = None
        movie_data['rss_feed_url'] = response.xpath(
            '//*[@id="fi_titlerss"]/a/@href').extract_first()
        movie_data['avg_percentile'] = response.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract_first()
        movie_data['n_ratings'] = response.xpath(
            '//span[@itemprop="reviewCount"]/text()').extract_first()

        return movie_data
Code Example #19
 def parse(self, response: scrapy.http.response.Response):
     links = response.css(
         '.list a[href^="/files/dera/data/financial-statement-data-sets/"]::attr(href)'
     ).extract()
     for link in links:
         yield scrapy.Request(self.url_base + link, self.get_data)