def update_stocker_stats(self, num_urls, source, num_nodes):
    """updates the stats file with the url and node counts gathered per source"""
    if self.verbose:
        utility.sysprint('updating stocker stats at {}'.format(self.stats_file))
    logger.debug('updating stocker stats at {}'.format(self.stats_file))
    with open(self.stats_file, 'r') as f:
        data = json.load(f)
    # add urls to json
    if source in data.keys():
        original = data[source]
        updated = {
            'num_urls': original['num_urls'] + num_urls,
            'num_nodes': original['num_nodes'] + num_nodes
        }
        data.update({source: updated})
    else:
        data.update({source: {'num_urls': num_urls, 'num_nodes': num_nodes}})
    # write the updated json
    with open(self.stats_file, 'w') as f:
        json.dump(data, f, indent=4)
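# A minimal sketch (not part of the original module): the json.load() calls in
# update_stocker_stats, update_parsed_urls, and update_data_file assume their
# target files already exist and contain valid JSON. A helper like this could
# seed a missing file with an empty object; the name and its use are hypothetical.
def _ensure_json_file(self, path):
    """creates the JSON file with an empty object if it does not already exist"""
    import os
    if not os.path.exists(path):
        with open(path, 'w') as f:
            json.dump({}, f)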
def update_parsed_urls(self, urls, query):
    """writes parsed links to JSON file to avoid reparsing"""
    if self.verbose:
        utility.sysprint('writing {} link(s) to {}'.format(len(urls), self.url_file))
    logger.debug('writing {} link(s) to {}'.format(len(urls), self.url_file))
    ticker, source = query.ticker.upper(), query.source.lower()
    with open(self.url_file, 'r') as f:
        data = json.load(f)
    # add urls to object
    if ticker in data.keys():
        if source in data[ticker].keys():
            original = data[ticker][source]
            updated = original + urls
            data[ticker].update({source: updated})
        else:
            data[ticker].update({source: urls})
    else:
        sub_obj = {ticker: {source: urls}}
        data.update(sub_obj)
    with open(self.url_file, 'w') as f:
        json.dump(data, f, indent=4)
def get_nasdaq_top_100(self):
    """returns a list of the NASDAQ top 100 stock tickers"""
    if self.verbose:
        utility.sysprint('Loading NASDAQ100')
    url = 'https://www.cnbc.com/nasdaq-100/'
    resp, err = self.requestHandler.get(url)
    if err:
        logger.error('Error retrieving top nasdaq 100, request returned: {}'.format(err))
        return []
    soup = BeautifulSoup(resp.content, 'html.parser')
    table = soup.find('table', {'class': 'quoteTable'})
    tds = table.find_all('td')
    a_tags = [td.find('a') for td in tds if td.find('a')]
    tickers = [re.sub(r'\s+', '', a.get_text()) for a in a_tags]
    if self.verbose:
        utility.sysprint('Finished loading NASDAQ100')
    return tickers
def get_nyse(self):
    """returns a list of NYSE stock tickers scraped from the NASDAQ company screener"""
    if self.verbose:
        utility.sysprint('Loading NYSE')
    page = 1
    tickers = []
    while page < 64:
        url = 'https://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NYSE&page={}'.format(page)
        if self.verbose:
            utility.sysprint('getting url: {}'.format(url))
        resp, err = self.requestHandler.get(url)
        if err:
            logger.error('Error retrieving page {} when getting NYSE tickers, request returned: {}'.format(page, err))
            return tickers
        page += 1
        soup = BeautifulSoup(resp.content, 'html.parser')
        table = soup.find('table', {'id': 'CompanylistResults'})
        # every other row of the results table holds a company entry
        trs = [tr for idx, tr in enumerate(table.find_all('tr')) if idx % 2]
        for t in trs:
            td = t.find_all('td')[1]
            ticker = td.find('a').get_text()
            tickers.append(re.sub(r'\s+', '', ticker))
    if self.verbose:
        utility.sysprint('Finished loading NYSE')
    return tickers
def build_nodes(self, query, urls, flags):
    """uses the urls to build WebNodes to be written to the csv output"""
    nodes = []
    if len(urls) == 0:
        logger.debug('Build nodes found no (new) urls to parse (query: {})'.format(query.string))
        return nodes, None
    j = '.'
    company_info, err = self.financeHelper.get_company_info(query.ticker)
    if err:
        company_info = {}
        logger.error('Error getting company information for {}'.format(query.ticker))
    for i, url in enumerate(urls):
        if self.verbose:
            utility.sysprint('parsing urls for query: {}'.format(query.string) + j * (i % 3))
        if self.is_homepage(url, query.source):
            logger.debug('Hit a homepage ({}) ... continuing to next iteration.'.format(url))
            continue
        articleParser = ArticleParser(url, query, company_info, **flags)
        node, err = articleParser.generate_web_node()
        if err:
            logger.error('unable to generate node for {} ... continuing to next iteration'.format(url))
            continue
        nodes.append(node)
    if self.verbose:
        utility.sysprint('built {} Web Nodes'.format(len(nodes)))
    logger.debug('built {} Web Nodes'.format(len(nodes)))
    return nodes, None
def get_stock_movers(self):
    """Includes common stocks, ADRs and REITs listed on NYSE, Nasdaq or NYSE American
    with a prior day close of $2 a share or higher and volume of at least 50,000"""
    if self.verbose:
        utility.sysprint('Loading stock movers')
    url = 'https://www.wsj.com/market-data/stocks/us/movers'
    resp, err = self.requestHandler.get(url)
    if err:
        logger.error('Error retrieving stock movers, request returned: {}'.format(err))
        return []
    tickers = []
    soup = BeautifulSoup(resp.content, 'html.parser')
    rows = soup.find_all('td')
    for row in rows:
        a = row.find('a')
        if a:
            # link text looks like 'Company Name (TICKER)'; pull out the symbol
            match = re.search(r'\((.*?)\)', a.get_text())
            if match:
                tickers.append(match.group(1))
    if self.verbose:
        utility.sysprint('Finished loading stock movers')
    return tickers
def get_snp_500(self):
    """returns a list of the S&P 500 stock tickers"""
    if self.verbose:
        utility.sysprint('Loading SNP500')
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    resp, err = self.requestHandler.get(url)
    if err:
        logger.error('Error retrieving snp 500s, request returned: {}'.format(err))
        return []
    soup = BeautifulSoup(resp.content, 'lxml')
    table = soup.find('table', {'id': 'constituents'})
    tickers = []
    for row in table.findAll('tr'):
        col = row.findAll('td')
        if len(col):
            ticker = str(col[0].get_text().strip())
            tickers.append(ticker)
    if self.verbose:
        utility.sysprint('Finished loading SNP500')
    return tickers
def update_data_file(self, nodes, query):
    """writes the data gathered to a json file"""
    if self.verbose:
        utility.sysprint('writing {} node(s) to {}'.format(len(nodes), self.data_file))
    logger.debug('writing {} node(s) to {} for query {}'.format(len(nodes), self.data_file, query))
    with open(self.data_file, 'r') as f:
        data = json.load(f)
    for node in nodes:
        self.qualtricsHandler.submit_node(node)
        key = '+'.join([query.ticker, query.source])
        if key in data.keys():
            sub_nodes = data[key]
            sub_nodes.append(dict(node))
            data[key] = sub_nodes
        else:
            data[key] = [dict(node)]
    with open(self.data_file, 'w') as f:
        json.dump(data, f, indent=4)
def build_queries(self, depth=1):
    """creates google queries based on the provided stocks and news sources"""
    i = 0
    tot = len(self.tickers) * len(self.sources) * depth
    for t in self.tickers:
        for s in self.sources:
            if self.verbose:
                utility.sysprint('Building {} out of {} queries | {:.1f}% done'.format(i, tot, 100.0 * i / tot))
            i += 1
            string1 = t + '+' + s + '+' + 'stock+articles'
            if depth > 1:
                company_name = self.financeHelper.get_name_from_ticker(t)
                if company_name:
                    garbage = ['Inc.']
                    string2 = '+'.join([word for word in company_name.split(' ') if word not in garbage]) + '+' + s + '+stock+news'
                    self.queries.append(utility.Query(t, s, string2))
                    i += 1
            self.queries.append(utility.Query(t, s, string1))
    logger.debug('built {} queries'.format(len(self.queries)))
def gather_data():
    """call financial web scraping API with user defined parameters"""
    # rh = RequestHandler()
    # print(rh.generate_proxies())
    financeHelper = FinanceHelper()
    # stock_movers = financeHelper.get_stock_movers()
    # print(stock_movers[:10])
    # print(financeHelper.get_name_from_ticker('AAPL'))
    # print('sector industry of SAP is: {}'.format(financeHelper.get_company_info('AAPL')))
    nasdaq100 = financeHelper.get_nasdaq_top_100()
    snp500 = financeHelper.get_snp_500()
    nyse = financeHelper.get_nyse()
    stocks = [
        'SAP', 'AAPL', 'GOOG', 'GPRO', 'TSLA', 'APRN', 'FB', 'NVDA', 'SNAP',
        'SPY', 'NFLX', 'AMZN', 'AMD'
    ]
    markets = nyse + snp500 + nasdaq100
    random.shuffle(markets)
    random.shuffle(stocks)
    tickers = stocks + markets
    sources = utility.get_valid_sources()
    worker = Stocker(tickers, sources, configpath='credentials.json')
    flags = {
        'date_checker': False,
        'depth': 1,
        'validate_url': False,
        'length_check': True,
        'min_length': 100,
    }
    worker.stock(flags=flags)
    print('\n\nFinished Process Successfully.')
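# A minimal sketch of a module entry point, assuming this file is meant to be run
# directly as a script; the original entry point is not shown here, so the guard
# below is illustrative rather than the project's actual invocation.
if __name__ == '__main__':
    gather_data()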
def stock(self,
          gui=True,
          json_output=True,
          csv_output=True,
          depth=1,
          query=True,
          shuffle=False,
          flags=None):
    """main function for the class. Begins the worker to get the information based on the queries given"""
    flags = flags or {}
    if query:
        self.build_queries(depth=depth)
    total = len(self.queries)
    if total == 0:
        return None
    trange_args = {
        'total': total,
        'unit': 'query',
        'desc': self.queries[0].ticker,
        'postfix': {'source': self.queries[0].source},
        'dynamic_ncols': True,
        'leave': True,
        'miniters': 1
    }
    if gui:
        t = trange(total, **trange_args)
    else:
        t = range(total)
    nodes = []
    for i in t:
        query = self.queries[i]
        if self.verbose:
            utility.sysprint('Processing query: {}'.format(query.string))
        logger.debug('Processing query: {}'.format(query.string))
        if gui:
            t.set_description(query.ticker.upper())
            t.set_postfix(source=query.source)
            t.update()
        urls = self.get_urls(query)
        if not urls:
            logger.debug('No URLs found.')
            continue
        urls_found = len(urls)
        logger.debug('urls are {}'.format(json.dumps(urls, indent=4)))
        nodes, err = self.build_nodes(query, urls, flags)
        if err:
            logger.error('Error raised in node building phase: {}'.format(err))
        if len(nodes):
            self.update_stocker_stats(urls_found, query.source, len(nodes))
            self.update_data_file(nodes, query)
            self.update_parsed_urls(urls, query)
        else:
            logger.debug('Node Dictionary is None or has a length of 0, continuing to next iteration.')
        utility.sysprint('Finished gathering data for query: {}'.format(query.string))
    if gui:
        t.close()
    return nodes