def parse_news_page(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        stop_scrape_flag = False
        news_list = self.exchange.get_news_list(response)
        if not news_list:
            raise Exception('Error: Website Structure Has Been Changed!' +
                            ' Maintenance Needed!')
        for i, news_row in enumerate(news_list):
            # a new dict must be created on every iteration,
            # otherwise MongoDB raises a duplicate key (_id) error
            item = {
                'mkt': self.exchange.uptick_name,
                'mkt_id': self.mkt_id,
                'tzinfo': self.exchange.tzinfo,
                'error': True
            }
            try:  # a valid news row parses without raising
                date_time, url, title, misc_fields_dict = self.exchange.get_news_fields(
                    news_row)

                # the database already holds news and this row is older than the latest stored item
                if self.latest_date and date_time < self.latest_date:
                    # todo: detect disordered news list
                    # shenzhen news list has disordered news
                    # stop_scrape_flag = True
                    continue

                # generate file name by date and number of events on that date
                filename = du.get_filename(date_time,
                                           self.exchange.uptick_name)

                # insert record to mongodb
                item['date_time'] = date_time
                item['title'] = title
                item['url'] = url
                item['unique_id'] = filename
                item['error'] = False
                item.update(misc_fields_dict)
                yield item

                utils.save_pdf_url_or_chrome(url, self.pdfs_dir + filename)

            except Exception as e:  # not a news row; record the error and skip
                item['error'] = {
                    'news_row_html': news_row.extract(),
                    'error_message': '%s: %s' % (e.__class__, str(e)),
                    'row_no': i,
                    'traceback': traceback.format_exc(),
                    'url': response.url
                }
                yield item
                continue

        if self.exchange.keep_follow_pagination and not stop_scrape_flag:
            for url, meta in self.exchange.get_pagination_urls(response):
                yield scrapy.Request(url,
                                     callback=self.parse_news_page,
                                     meta=meta)
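The spider above only touches the exchange-specific parser through a handful of attributes and methods. A minimal sketch of that assumed interface, inferred from the calls in parse_news_page and __init__ (names only; not the original source):

class ExchangeParser:
    # Hypothetical interface inferred from the spider code above; the concrete
    # values here are placeholders, not taken from the original project.
    uptick_name = 'example_exchange'   # short exchange identifier used for ids/paths
    tzinfo = 'UTC'                     # timezone label passed to the date helpers
    keep_follow_pagination = True      # whether the spider should follow next pages

    def get_news_list(self, response):
        # Return the list of row selectors for one listing page,
        # e.g. response.xpath('//table//tr').
        raise NotImplementedError

    def get_news_fields(self, news_row):
        # Return (date_time, url, title, misc_fields_dict) for a single row.
        raise NotImplementedError

    def get_pagination_urls(self, response):
        # Yield (url, meta) pairs for the next listing pages to request.
        raise NotImplementedError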
Example #2
    def __init__(self):
        super().__init__()
        self.exchange = ExchangeParser()

        # parameters
        self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
        self.pdfs_dir = utils.PDF_DIR + self.exchange.uptick_name + '/'
        utils.create_pdf_dir(self.pdfs_dir)
        self.latest_date = du.get_latest_date_time(self.exchange.uptick_name,
                                                   self.exchange.tzinfo)
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string(.//div[contains(@class, "article-date")])').extract_first().strip()
     if date_str:
         date_str = utils.filter_spaces(date_str)[0]
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
         return date_time
     else:
         raise Exception('Error: Date parsing error')
 def get_date_time(self, news_row):
     date_str = news_row.xpath('string(./a/span)').extract_first()
     date_str_list = utils.filter_spaces(date_str)
     date_str = ''
     for s in date_str_list:
         s = s.strip()
         date_str += s + ' '
     if date_str:
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
         return date_time
     else:
         raise Exception('Error: Date parsing error')
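utils.filter_spaces itself is not shown in these examples; a minimal stand-in consistent with how it is used above (first element taken as the cleaned date or title) might look like this — an assumption, not the original helper:

def filter_spaces(text):
    # Hypothetical stand-in for utils.filter_spaces: split the scraped text on
    # line breaks and drop blank fragments, so callers can take the first
    # non-empty piece or re-join the pieces themselves.
    return [part.strip() for part in text.splitlines() if part.strip()]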
    def __init__(self):
        super().__init__()
        self.exchange = ExchangeParser()

        # parameters
        self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
        # todo: change uptick_name to col_name
        self.pdfs_dir = utils.PDF_DIR + self.exchange.col_name + '/'
        utils.create_pdf_dir(self.pdfs_dir)
        # private
        # if self.exchange.is_multi_source_exchange:
        self.latest_date = utils.create_date_time_tzinfo(
            '30 DEC 2017', self.exchange.tzinfo)
Example #6
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string(./div[contains(@class, "date")])').extract_first()
     date_str_list = utils.filter_spaces(date_str)
     date_str = ''
     for s in date_str_list:
         s = s.strip()
         if s.lower() != 'date':
             date_str += s
     if date_str:
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
         return date_time
     else:
         raise Exception('Error: Date parsing error')
 def get_pagination_urls(self, response):
     meta = dict()
     url = self.pagination_template % self.page_no
     self.page_no += 1
     # todo: every url yielded should be validated & quoted
     if utils.validate_url(url):
         yield url, meta
Example #8
 def get_pagination_urls(self, response):
     meta = dict()
     rel_url = response.xpath(
         './/li[contains(@class,"pager-next")]//a/@href').extract_first()
     url = response.urljoin(rel_url)
     if utils.validate_url(url):
         yield url, meta
Example #9
 def get_pagination_urls(self, response):
     meta = dict()
     url = self.root_url + response.xpath(
         './/div[contains(@class,"pagenav")]//li[contains(@class,"news-next")]/a/@href'
     ).extract_first().strip()
     if utils.validate_url(url):
         yield url, meta
 def get_pagination_urls(self, response):
     meta = dict()
     url = self.pagination_template % self.page_no
     self.page_no += 1
     meta['page_no'] = self.page_no
     if utils.validate_url(url):
         yield url, meta
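The %-template pagination snippets above assume the parser carries a printf-style URL template and a page counter; a sketch of those assumed attributes (the URL is a placeholder, not from the source):

class TemplatePaginationParser:
    # Hypothetical attributes assumed by the %-style pagination snippets above.
    pagination_template = 'https://www.example-exchange.com/news?page=%d'  # placeholder
    page_no = 2  # assumption: the first listing page is fetched directly, so start at 2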
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string((.//table//table//tr)[1]/td)').extract_first().strip()
     date_time = utils.create_date_time_tzinfo(date_str,
                                               self.tzinfo,
                                               date_formats=['%d/%m/%Y'])
     return date_time
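The date parsers all defer to utils.create_date_time_tzinfo, sometimes with explicit date_formats. A minimal sketch of such a helper, assuming tzinfo is an IANA timezone name and that dateutil/pytz are acceptable dependencies (an assumption, not the project's actual implementation):

from datetime import datetime

import pytz
from dateutil import parser as dateutil_parser


def create_date_time_tzinfo(date_str, tz_name, date_formats=None):
    # Hypothetical stand-in: try the explicit formats first, otherwise fall back
    # to dateutil's general parser, then attach the exchange's timezone.
    dt = None
    if date_formats:
        for fmt in date_formats:
            try:
                dt = datetime.strptime(date_str, fmt)
                break
            except ValueError:
                continue
    if dt is None:
        dt = dateutil_parser.parse(date_str)
    return pytz.timezone(tz_name).localize(dt)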
 def get_date_time(self, news_row):
     news_col_list = news_row.xpath('./td')
     date_str = news_col_list[0].xpath('string()').extract_first().strip()
     date_time = utils.create_date_time_tzinfo(date_str,
                                               self.tzinfo,
                                               date_formats=['%d/%m/%y'])
     return date_time
 def get_date_time(self, news_row):
     date_str = news_row.xpath('string((./td)[2]/span)').extract_first(
     ).strip().strip('[').strip(']')
     if date_str:
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
         return date_time
     else:
         raise Exception('Error: Date parsing error')
Example #14
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string(.//span[contains(@class, "date-display-single")])'
     ).extract_first().strip()
     if date_str:
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
         return date_time
     else:
         raise Exception('Error: Date parsing error')
Example #15
 def get_title(self, news_row):
     title = news_row.xpath('string(./div[contains(@class, "title")]/a)'
                            ).extract_first().strip()
     title_str_list = utils.filter_spaces(title)
     title_str = ''
     for t in title_str_list:
         # str.strip() returns a new string, so the result must be used
         title_str += t.strip()
     return title_str
 def get_pagination_urls(self, response):
     meta = dict()
     rel_url = response.xpath(
         '//li[contains(@class, "news-next")]/a/@href').extract_first()
     url = response.urljoin(rel_url)
     if utils.validate_url(url):
         yield url, meta
 def get_url(self, news_row):
     url_str = news_row.xpath('(./td)[2]/a/@href').extract_first().strip()
     url = url_str
     if 'javascript' in url.lower():
         import re
         reg = re.compile(r"\('(.*?)'\)")
         url = self.root_url + reg.findall(url_str)[0]
     # todo: every url yielded should be validated & quoted
     return utils.quote_url(url)
Example #18
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string(.//span[contains(@class, "meta-item published-date")])'
     ).extract_first().strip()
     if date_str:
         import re
         date_str = re.compile('[0-9/]+').findall(date_str)[0]
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo,
                                                   ['%d/%m/%Y'])
         return date_time
     else:
         raise Exception('Error: Date parsing error')
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string((./td)[1]/a)').extract_first().strip()
     if date_str:
         date_str_list = date_str.split('-')
         if len(date_str_list) >= 4:
             date_str = date_str_list[2].strip()
         else:
             date_str = date_str_list[-1].strip()
         date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo,
                                                   ['%d%m%Y'])
         return date_time
     else:
         raise Exception('Error: Date parsing error')
 def get_url(self, news_row):
     url = news_row.xpath('./a/@href').extract_first().strip()
     url = self.root_url + url
     return utils.quote_url(url)
Example #21
 def get_url(self, news_row):
     url = news_row.xpath('./div[contains(@class, "title")]/a/@href'
                          ).extract_first().strip()
     url = self.root_url + url
     return utils.quote_url(url)
 def get_url(self, news_row):
     url = news_row.xpath(
         '(.//table)[1]//td/a/@href').extract_first().strip()
     # todo: asx hkex quote url
     return utils.quote_url(url)
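Likewise, utils.quote_url and utils.validate_url are only referenced here; plausible stand-ins built on urllib.parse could look like this (assumptions about behaviour, not the original helpers):

from urllib.parse import quote, urlsplit


def quote_url(url):
    # Hypothetical: percent-encode unsafe characters while leaving the URL's
    # structural characters (scheme separators, slashes, query markers) intact.
    return quote(url, safe=':/?&=#%')


def validate_url(url):
    # Hypothetical: accept only absolute URLs with both a scheme and a host.
    parts = urlsplit(url)
    return bool(parts.scheme and parts.netloc)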
Example #23
 def parse(self, response):
     self.logger.info(
         utils.filter_spaces(
             response.xpath('string(//div[@class="list-group"]/div[1])').
             extract_first()))
 def get_pagination_urls(self, response):
     meta = dict()
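     # assumption: the empty URL fails validate_url, so this source yields no pagination requests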
     url = ''
     if utils.validate_url(url):
         yield url, meta
Example #25
 def get_url(self, news_row):
     url = news_row.xpath('.//h4/a/@href').extract_first().strip()
     return utils.quote_url(url)
Example #26
 def get_url(self, news_row):
     url = news_row.xpath(
         './/td[contains(@class, "views-field-field-productnews-display-title-value")]/a/@href'
     ).extract_first().strip()
     url = self.root_url + url
     return utils.quote_url(url)
Example #27
 def get_title(self, news_row):
     title = news_row.xpath('string(.//h4/a)').extract_first().strip()
     title = utils.filter_spaces(title)[0]
     return title
Example #28
 def get_date_time(self, news_row):
     date_str = news_row.xpath(
         'string(div[contains(@class,"news-releases__section--date")])'
     ).extract()[0]
     date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
     return date_time