def parse_news_page(self, response):
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    stop_scrape_flag = False
    news_list = self.exchange.get_news_list(response)
    if not news_list:
        raise Exception('Error: Website Structure Has Been Changed!' +
                        ' Maintenance Needed!')
    for i, news_row in enumerate(news_list):
        # has to assign a new dict every loop,
        # otherwise mongodb raises a dup key (Id) error
        item = {
            'mkt': self.exchange.uptick_name,
            'mkt_id': self.mkt_id,
            'tzinfo': self.exchange.tzinfo,
            'error': True
        }
        try:
            # news row won't have error
            date_time, url, title, misc_fields_dict = \
                self.exchange.get_news_fields(news_row)
            # database has previous news and scraped news is older than database
            if self.latest_date and date_time < self.latest_date:
                # todo: detect disordered news list
                # shenzhen news list has disordered news
                # stop_scrape_flag = True
                continue
            # generate file name by date and number of events on that date
            filename = du.get_filename(date_time, self.exchange.uptick_name)
            # insert record to mongodb
            item['date_time'] = date_time
            item['title'] = title
            item['url'] = url
            item['unique_id'] = filename
            item['error'] = False
            item.update(misc_fields_dict)
            yield item
            utils.save_pdf_url_or_chrome(url, self.pdfs_dir + filename)
        except Exception as e:
            # not a news row; record the failure and skip it
            item['error'] = {
                'news_row_html': news_row.extract(),
                'error_message': '%s: %s' % (e.__class__, str(e)),
                'row_no': i,
                'traceback': traceback.format_exc(),
                'url': response.url
            }
            yield item
            continue
    if self.exchange.keep_follow_pagination and not stop_scrape_flag:
        for url, meta in self.exchange.get_pagination_urls(response):
            yield scrapy.Request(url, callback=self.parse_news_page, meta=meta)
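# For reference, a minimal sketch of the parser interface that parse_news_page()
# above relies on. The attribute and method names are inferred from their usage
# in this file; the concrete values and bodies are assumptions for illustration,
# not the actual implementation.
class ExchangeParserSketch:
    uptick_name = 'example_exchange'   # hypothetical market / collection name
    tzinfo = 'Asia/Shanghai'           # hypothetical timezone passed to date helpers
    keep_follow_pagination = True      # whether the spider should follow pagination

    def get_news_list(self, response):
        """Return the list of row selectors, one per news item."""
        return response.xpath('//table//tr')

    def get_news_fields(self, news_row):
        """Return (date_time, url, title, misc_fields_dict) for one row."""
        return (self.get_date_time(news_row), self.get_url(news_row),
                self.get_title(news_row), {})

    def get_pagination_urls(self, response):
        """Yield (url, meta) pairs for follow-up pages."""
        yield from ()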
def __init__(self):
    super().__init__()
    self.exchange = ExchangeParser()
    # parameters
    self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
    self.pdfs_dir = utils.PDF_DIR + self.exchange.uptick_name + '/'
    utils.create_pdf_dir(self.pdfs_dir)
    self.latest_date = du.get_latest_date_time(self.exchange.uptick_name,
                                               self.exchange.tzinfo)
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string(.//div[contains(@class, "article-date")])'
    ).extract_first().strip()
    if date_str:
        date_str = utils.filter_spaces(date_str)[0]
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def get_date_time(self, news_row):
    date_str = news_row.xpath('string(./a/span)').extract_first()
    date_str_list = utils.filter_spaces(date_str)
    date_str = ''
    for s in date_str_list:
        s = s.strip()
        date_str += s + ' '
    if date_str:
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def __init__(self):
    super().__init__()
    self.exchange = ExchangeParser()
    # parameters
    self.mkt_id = du.get_mkt_id(self.exchange.uptick_name)
    # todo: change uptick_name to col_name
    self.pdfs_dir = utils.PDF_DIR + self.exchange.col_name + '/'
    utils.create_pdf_dir(self.pdfs_dir)
    # private
    # if self.exchange.is_multi_source_exchange:
    self.latest_date = utils.create_date_time_tzinfo('30 DEC 2017',
                                                     self.exchange.tzinfo)
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string(./div[contains(@class, "date")])').extract_first()
    date_str_list = utils.filter_spaces(date_str)
    date_str = ''
    for s in date_str_list:
        s = s.strip()
        if s.lower() != 'date':
            date_str += s
    if date_str:
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def get_pagination_urls(self, response):
    meta = dict()
    url = self.pagination_template % self.page_no
    self.page_no += 1
    # todo: every url yielded should be validated & quoted
    if utils.validate_url(url):
        yield url, meta
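# A minimal sketch of the state the template-based get_pagination_urls above
# (and the page_no variant further down) depends on. The template URL below is
# a made-up placeholder, not a real endpoint.
def _init_pagination_sketch(self):
    self.pagination_template = 'https://www.example-exchange.com/news?page=%d'  # hypothetical
    self.page_no = 1  # first page to request; incremented on every call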
def get_pagination_urls(self, response):
    meta = dict()
    rel_url = response.xpath(
        './/li[contains(@class,"pager-next")]//a/@href').extract_first()
    url = response.urljoin(rel_url)
    if utils.validate_url(url):
        yield url, meta
def get_pagination_urls(self, response):
    meta = dict()
    url = self.root_url + response.xpath(
        './/div[contains(@class,"pagenav")]//li[contains(@class,"news-next")]/a/@href'
    ).extract_first().strip()
    if utils.validate_url(url):
        yield url, meta
def get_pagination_urls(self, response):
    meta = dict()
    url = self.pagination_template % self.page_no
    self.page_no += 1
    meta['page_no'] = self.page_no
    if utils.validate_url(url):
        yield url, meta
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string((.//table//table//tr)[1]/td)').extract_first().strip()
    date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo,
                                              date_formats=['%d/%m/%Y'])
    return date_time
def get_date_time(self, news_row):
    news_col_list = news_row.xpath('./td')
    date_str = news_col_list[0].xpath('string()').extract_first().strip()
    date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo,
                                              date_formats=['%d/%m/%y'])
    return date_time
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string((./td)[2]/span)').extract_first().strip().strip('[').strip(']')
    if date_str:
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string(.//span[contains(@class, "date-display-single")])'
    ).extract_first().strip()
    if date_str:
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def get_title(self, news_row):
    title = news_row.xpath(
        'string(./div[contains(@class, "title")]/a)').extract_first().strip()
    title_str_list = utils.filter_spaces(title)
    title_str = ''
    for t in title_str_list:
        # str.strip() returns a new string, so the result must be reassigned
        t = t.strip()
        title_str += t
    return title_str
def get_pagination_urls(self, response):
    meta = dict()
    # extract_first() is needed before strip(); xpath() alone returns a SelectorList
    rel_url = response.xpath(
        '//li[contains(@class, "news-next")]/a/@href').extract_first().strip()
    url = response.urljoin(rel_url)
    if utils.validate_url(url):
        yield url, meta
def get_url(self, news_row):
    url_str = news_row.xpath('(./td)[2]/a/@href').extract_first().strip()
    url = url_str
    if 'javascript' in url.lower():
        import re
        reg = re.compile(r"\('(.*?)'\)")
        url = self.root_url + reg.findall(url_str)[0]
    # todo: every url yielded should be validated & quoted
    return utils.quote_url(url)
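# A small, self-contained demo of the regex used in get_url above. The href
# value is a made-up example of the "javascript:" links that pattern targets.
import re

def _demo_extract_js_url():
    url_str = "javascript:openFile('/docs/announcement_123.pdf')"  # hypothetical href
    reg = re.compile(r"\('(.*?)'\)")
    # captures the quoted path inside the parentheses, which is then prefixed
    # with self.root_url in get_url above
    return reg.findall(url_str)[0]  # -> '/docs/announcement_123.pdf'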
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string(.//span[contains(@class, "meta-item published-date")])'
    ).extract_first().strip()
    if date_str:
        import re
        date_str = re.compile('[0-9/]+').findall(date_str)[0]
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo,
                                                  ['%d/%m/%Y'])
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string((./td)[1]/a)').extract_first().strip()
    if date_str:
        date_str_list = date_str.split('-')
        if len(date_str_list) >= 4:
            date_str = date_str_list[2].strip()
        else:
            date_str = date_str_list[-1].strip()
        date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo,
                                                  ['%d%m%Y'])
        return date_time
    else:
        raise Exception('Error: Date parsing error')
def get_url(self, news_row):
    url = news_row.xpath('./a/@href').extract_first().strip()
    url = self.root_url + url
    return utils.quote_url(url)
def get_url(self, news_row):
    url = news_row.xpath(
        './div[contains(@class, "title")]/a/@href').extract_first().strip()
    url = self.root_url + url
    return utils.quote_url(url)
def get_url(self, news_row):
    url = news_row.xpath(
        '(.//table)[1]//td/a/@href').extract_first().strip()
    # todo: asx hkex quote url
    return utils.quote_url(url)
def parse(self, response):
    self.logger.info(
        util.filter_spaces(
            response.xpath('string(//div[@class="list-group"]/div[1])'
                           ).extract_first()))
def get_pagination_urls(self, response):
    meta = dict()
    # placeholder: the empty url never passes validation, so no pages are followed
    url = ''
    if utils.validate_url(url):
        yield url, meta
def get_url(self, news_row):
    url = news_row.xpath('.//h4/a/@href').extract_first().strip()
    return utils.quote_url(url)
def get_url(self, news_row):
    url = news_row.xpath(
        './/td[contains(@class, "views-field-field-productnews-display-title-value")]/a/@href'
    ).extract_first().strip()
    url = self.root_url + url
    return utils.quote_url(url)
def get_title(self, news_row):
    title = news_row.xpath('string(.//h4/a)').extract_first().strip()
    title = utils.filter_spaces(title)[0]
    return title
def get_date_time(self, news_row):
    date_str = news_row.xpath(
        'string(div[contains(@class,"news-releases__section--date")])'
    ).extract()[0]
    date_time = utils.create_date_time_tzinfo(date_str, self.tzinfo)
    return date_time
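# All of the date helpers above funnel through utils.create_date_time_tzinfo().
# Its real implementation is not shown in this file; the sketch below only
# reflects what the call sites imply (a date string, a timezone name, optional
# explicit formats) and assumes the dateparser library, which may not be what
# the project actually uses.
import dateparser

def create_date_time_tzinfo_sketch(date_str, tzinfo, date_formats=None):
    # Parse a free-form or fixed-format date string into a timezone-aware datetime.
    return dateparser.parse(
        date_str,
        date_formats=date_formats,
        settings={'TIMEZONE': tzinfo, 'RETURN_AS_TIMEZONE_AWARE': True})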