class YahooArticleGetter(object):
    """Download articles from Yahoo Finance (legacy headlines-page version)."""

    def __init__(self):
        # Base URL of a company's headlines page; the ticker is appended.
        self.headlines_url = 'http://finance.yahoo.com/q/h?s='
        self.db_model = YahooDbModel()
        self.article_parser = ArticleParser()
        self.exec_error = False       # flips to True when any step fails
        self.parse_datetime = False
        # Precompiled patterns. Raw strings so '\d'/'\w' are regex escapes,
        # not (invalid) string escapes.
        self.native_p = re.compile(r'^http://finance.yahoo.com/news/.+')
        self.h_time_p = re.compile(r'.+ (\d+:\d+\w\w) .*')

    #### METHOD 1: get new articles
    def get_new_articles(self):
        """Fetch and save new articles for every company, then log the run."""
        # Browse through all companies.
        for company in self.db_model.get_companies():
            print("====%d: %s====" % (company['id'], company['ticker']))
            # Get headlines and process so far unsaved articles.
            try:
                self.get_headlines(company['ticker'], company['id'],
                                   company['article_newest_saved'])
            except Exception as e:  # 'as' form works on both Py2.6+ and Py3
                self.exec_error = True
                print("serious error: " + repr(e))
                self.__send_serious_error(e)
                break  # end script: abort the remaining companies
        # Log execution (4 = article-download job).
        self.db_model.add_log_exec(4, self.exec_error)
def __init__(self):
    """Set up URLs, DB access, article parser, flags and regexes."""
    # Base URL of a company's headlines page; the ticker is appended.
    self.headlines_url = 'http://finance.yahoo.com/q/h?s='
    self.db_model = YahooDbModel()
    self.article_parser = ArticleParser()
    self.exec_error = False       # flips to True when any step fails
    self.parse_datetime = False
    # Precompiled patterns. Raw strings so '\d'/'\w' are regex escapes,
    # not (invalid) string escapes.
    self.native_p = re.compile(r'^http://finance.yahoo.com/news/.+')
    self.h_time_p = re.compile(r'.+ (\d+:\d+\w\w) .*')
def __init__(self, fb_config, tw_config):
    """Set up DB access, article parser and social-media API clients.

    fb_config -- dict with key 'access_token' (Facebook Graph API)
    tw_config -- dict with keys 'app_key' and 'access_token' (Twitter)
    """
    self.headlines_url = 'http://finance.yahoo.com/quote/'
    self.db_model = YahooDbModel()
    self.article_parser = ArticleParser()
    self.exec_error = False       # flips to True when any step fails
    # Share count: Facebook Graph and Twitter clients.
    self.fb_api = facebook.GraphAPI(fb_config['access_token'], version='2.7')
    self.tw_api = twython.Twython(app_key=tw_config['app_key'],
                                  access_token=tw_config['access_token'])
    # Yahoo comments endpoint template.
    # BUGFIX: restored '&region=US' -- the text had '(R)ion=US', an
    # HTML-entity mangling of '&reg' + 'ion'.
    self.com_url_template = (
        'http://finance.yahoo.com/_finance_doubledown/api/resource/CommentsService.comments;count={com_count};'
        'publisher=finance-en-US;sortBy=highestRated;uuid={yahoo_uuid}?'
        'bkt=fintest008&device=desktop&feature=&intl=us&lang=en-US&partner=none&region=US&site=finance&'
        'tz=Europe%2FPrague&ver=0.101.427&returnMeta=true')
class YahooArticleGetter(object):
    """ Download articles from Yahoo Finance. """

    def __init__(self, fb_config, tw_config):
        """fb_config: dict with 'access_token'; tw_config: dict with
        'app_key' and 'access_token'."""
        self.headlines_url = 'http://finance.yahoo.com/quote/'
        self.db_model = YahooDbModel()
        self.article_parser = ArticleParser()
        self.exec_error = False       # flips to True when any step fails
        # Share count: Facebook Graph and Twitter clients.
        self.fb_api = facebook.GraphAPI(fb_config['access_token'], version='2.7')
        self.tw_api = twython.Twython(app_key=tw_config['app_key'],
                                      access_token=tw_config['access_token'])
        # Yahoo comments endpoint template.
        # BUGFIX: restored '&region=US' -- the text had '(R)ion=US', an
        # HTML-entity mangling of '&reg' + 'ion'.
        self.com_url_template = (
            'http://finance.yahoo.com/_finance_doubledown/api/resource/CommentsService.comments;count={com_count};'
            'publisher=finance-en-US;sortBy=highestRated;uuid={yahoo_uuid}?'
            'bkt=fintest008&device=desktop&feature=&intl=us&lang=en-US&partner=none&region=US&site=finance&'
            'tz=Europe%2FPrague&ver=0.101.427&returnMeta=true')

    #### METHOD 1: get new articles
    def get_new_articles(self, company_sleep=(10, 15)):
        """Get and save new articles for all companies.

        company_sleep -- (min, max) seconds to pause between companies,
        so Yahoo is not hammered with back-to-back requests.
        """
        for company in self.db_model.get_companies():
            print("====%d: %s====" % (company['id'], company['ticker']))
            try:
                self.get_headlines(company['ticker'], company['id'],
                                   company['article_newest_saved'])
                time.sleep(random.uniform(company_sleep[0], company_sleep[1]))
            except Exception:
                # Keep going with the next company; report the traceback.
                self.exec_error = True
                print("serious error: {0}".format(traceback.format_exc()))
                self.__send_serious_error(traceback.format_exc())

    def get_headlines(self, ticker, company_id, last_date_in_db):
        """Get headlines and save new articles for the given company.

        Returns False on any failure; on success falls through after the
        newest-saved-article timestamp has been committed.
        """
        # Get ticker page (up to 5 attempts).
        ticker_url = self.headlines_url + ticker
        page_html = self._get_content_from_url(ticker_url, True, 5)
        #page = open('../test_data/ticker_not_found.htm').readlines()
        # Check if ticker page exists.
        if not page_html:
            # BUGFIX: '%' was applied outside print()'s parentheses, i.e. to
            # its return value -- a TypeError under Py3 print semantics.
            print('Headlines for %s could not be loaded.' % ticker)
            return False
        header_line = page_html[0]
        if '<title></title>' in header_line:
            # An empty <title> is how Yahoo renders an unknown ticker.
            self.exec_error = True
            print('Ticker %s does not exist.' % ticker)
            self.__send_ticker_error(ticker)
            return False
        # Find the embedded App Data JSON.
        app_data = self._try_to_get_appdata(ticker_url, page_html)
        if not app_data:
            self.exec_error = True
            msg = 'JSON data was not found (ticker %s).'
            print(msg % ticker)
            self.__send_serious_error(msg % ticker)
            return False
        # Get the articles stream for this page category.
        stream_store = app_data['context']['dispatcher']['stores']['StreamStore']
        page_name = stream_store['pageCategory']
        page_field = '%s.mega' % page_name
        try:
            articles = stream_store['streams'][page_field]['data']['stream_items']
        except KeyError as e:  # 'as' form works on both Py2.6+ and Py3
            print("Page key error:" + str(e))
            return False
        # Process all articles (from oldest to newest).
        for art in reversed(articles):
            self.__process_article_from_list(art, company_id, last_date_in_db)
        # Commit inserts and update newest saved article datetime.
        self.db_model.update_last_download(company_id)