def get_peers(ticker_symbol, page=None):
    """
    Gets the list of Top Peers for a stock as listed on the
    "Premium Research: Industry Analysis" section

    :param ticker_symbol: The ticker symbol of the interested stock (e.g., "AAPL", "GOOG", "MSFT")
    :param page: html tree structure based on the html markup of the scraped website
    :return: a list of the Top Peers as listed on a stock's "Premium Research:
        Industry Analysis" section on its respective Zacks page
    """
    if page is None:
        page = scrape_page(BASE_URL + ticker_symbol)
    peers = page.xpath(PEERS_XPATH)
    if not peers:
        return None
    # The scraped peer list may include the target ticker itself; drop it.
    try:
        peers.remove(ticker_symbol.upper())
    except ValueError:
        pass
    return peers if peers else None
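# Usage sketch (not in the original module); assumes the module-level
# BASE_URL and PEERS_XPATH constants point at Zacks as described above.
peers = get_peers("AAPL")
if peers:
    print("Top peers for AAPL:", ", ".join(peers))
else:
    print("No peer data found for AAPL.")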
import json
import os

import pandas as pd
from requests_html import HTMLSession
from tqdm import tqdm

# `selectors`, `checkpoint_steps`, and `scrape_page` are defined elsewhere
# at module level in this repo.


def scrape_site(site):
    session = HTMLSession()
    with open(f'links/{site}_urls.txt', 'r') as f:
        urls = f.readlines()
    urls = {url.strip() for url in urls}
    save_dir = f'data/{site}/'
    os.makedirs(save_dir, exist_ok=True)
    scraped = []
    if os.path.isfile(save_dir + 'scraped_urls.txt'):
        with open(save_dir + 'scraped_urls.txt', 'r') as f:
            scraped = f.readlines()
    if scraped:
        # Resume: reload the data saved at the last checkpoint.
        with open(save_dir + 'data.json', 'r') as f:
            data = json.load(f)
    else:
        data = {'title': [], 'text': [], 'site': []}
    scraped = {url.strip() for url in scraped}
    to_scrape = urls - scraped
    if not to_scrape:
        print(f'{site} scraping completed.')
        return
    print(f'{site} scraping initiated!')
    for i, url in enumerate(tqdm(to_scrape)):
        try:
            title, text = scrape_page(url, selectors[site], session, site)
            data['title'].append(title)
            data['text'].append(text)
            data['site'].append(site)
            scraped.add(url)
        except IndexError:
            # Page lacked the expected elements; log it as done and move on.
            scraped.add(url)
        except ValueError:
            pass
        except Exception as e:
            print(e)
            print(site)
        if i % checkpoint_steps == 0 and i > 0:
            with open(save_dir + 'data.json', 'w') as f:
                json.dump(data, f)
            with open(save_dir + 'scraped_urls.txt', 'w') as f:
                # The URLs were stripped of newlines above, so add them back;
                # writelines() does not insert separators on its own.
                f.writelines(url + '\n' for url in scraped)
            # print(f'{site}: {i} of {len(to_scrape)} done.')
    # Final checkpoint so the resume log matches the exported data.
    with open(save_dir + 'data.json', 'w') as f:
        json.dump(data, f)
    with open(save_dir + 'scraped_urls.txt', 'w') as f:
        f.writelines(url + '\n' for url in scraped)
    data_df = pd.DataFrame(data)
    data_df.to_csv(save_dir + 'data_all.tsv', sep='\t', index=False)
    print(f'{site} scraping completed.')
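# Illustrative driver (not in the original): assumes `selectors` maps each
# site name to the selectors scrape_page expects, and that a matching
# links/<site>_urls.txt file exists for every key.
if __name__ == '__main__':
    for site in selectors:
        scrape_site(site)  # resumes from data/<site>/scraped_urls.txt if present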
def get_bullish_sentiment(ticker_symbol, page=None):
    """
    Gets the bullish sentiment of the target ticker symbol

    :param ticker_symbol: The ticker symbol of the interested stock (e.g., "AAPL", "GOOG", "MSFT")
    :param page: html tree structure based on the html markup of the scraped website
    :return: a string of the percentage of bullish sentiment as listed on a
        stock's StockTwits page
    """
    if page is None:
        page = scrape_page(BASE_URL + ticker_symbol)
    sentiment = page.xpath(BULLISH_SENTIMENT_XPATH)
    if not sentiment:
        return None
    return sentiment[0].replace("\n", "") + " Bullish"
def get_rating(ticker_symbol, page=None):
    """
    Gets the Zacks Rank rating of the target ticker symbol

    :param ticker_symbol: The ticker symbol of the interested stock (e.g., "AAPL", "GOOG", "MSFT")
    :param page: html tree structure based on the html markup of the scraped website
    :return: String of the Zacks Rank rating as listed on a stock's Zacks page
    """
    if page is None:
        page = scrape_page(BASE_URL + ticker_symbol)
    rating = page.xpath(RATING_XPATH)
    if not rating:
        return None
    return rating[0]
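# Usage sketch (not in the original module); BASE_URL and RATING_XPATH are
# the Zacks constants referenced above.
rating = get_rating("MSFT")
print(rating if rating else "No Zacks Rank found for MSFT.")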
def get_sentiment(ticker_symbol, page=None):
    """
    Gets both the bullish and bearish sentiment of the target ticker symbol

    :param ticker_symbol: The ticker symbol of the interested stock (e.g., "AAPL", "GOOG", "MSFT")
    :param page: html tree structure based on the html markup of the scraped website
    :return: a tuple of strings containing both the bullish and bearish
        sentiment as listed on a stock's StockTwits page
    """
    if page is None:
        page = scrape_page(BASE_URL + ticker_symbol)
    bullish_sentiment = get_bullish_sentiment(ticker_symbol, page)
    if bullish_sentiment:
        return bullish_sentiment, get_bearish_sentiment(ticker_symbol, page)
    return None
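# Usage sketch (not in the original module). Because the helpers accept a
# pre-scraped page, fetching the StockTwits page once avoids a second request.
page = scrape_page(BASE_URL + "AAPL")
sentiment = get_sentiment("AAPL", page)
if sentiment:
    bullish, bearish = sentiment
    print(bullish, bearish)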
def get_all_statistics(ticker_symbol, page=None):
    """
    This function will get all the associated financial statistics from the
    corresponding finviz page given the ticker symbol

    :param ticker_symbol: The ticker symbol of the interested stock (e.g., "AAPL", "GOOG", "MSFT")
    :param page: HTML tree structure based on the html markup of the scraped page.
        If one is not passed in, the function will scrape the page
    :return: a dictionary of all the financial statistics listed on a stock's
        finviz page, otherwise None
    """
    if page is None:
        page = scrape_page(BASE_URL + ticker_symbol)
    table = get_statistics_table(page)
    return table if table else None
def get_statistic(ticker_symbol, stat_name, page=None):
    """
    This function will get the associated financial statistic from the
    corresponding finviz page given the statistic's name and the ticker symbol

    :param ticker_symbol: The ticker symbol of the interested stock (e.g., "AAPL", "GOOG", "MSFT")
    :param stat_name: The name of the interested financial statistic (e.g., "P/E", "Price", "Volume").
        An exhaustive list of available financial statistics can be found on a stock's finviz page
    :param page: HTML tree structure based on the html markup of the scraped web page.
        If one is not passed in, the function will scrape the page
    :return: the value of the interested financial statistic if it exists, otherwise None
    """
    if page is None:
        page = scrape_page(BASE_URL + ticker_symbol)
    table = get_statistics_table(page)
    # Guard against a missing table: `stat_name in table` would raise a
    # TypeError if get_statistics_table() returned None.
    if table and stat_name in table and table[stat_name]:
        return table[stat_name]
    return None
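# Usage sketch (not in the original module); reusing one scraped finviz page
# for both lookups avoids re-fetching it for every statistic.
page = scrape_page(BASE_URL + "AAPL")
all_stats = get_all_statistics("AAPL", page)
pe = get_statistic("AAPL", "P/E", page)
print(pe, "of", len(all_stats) if all_stats else 0, "statistics")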
def search():
    # request.form values are already strings, so no str() cast is needed.
    query = request.form['query']
    data = scrape_page(query)
    return render_template('result.html', data=data)
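# Hypothetical wiring (not in the original snippet): request.form implies the
# view handles a POST, so the route path and method below are assumptions.
# The flask import also covers the request and render_template names that
# search() relies on.
from flask import Flask, render_template, request

app = Flask(__name__)
app.add_url_rule('/search', view_func=search, methods=['POST'])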
"http://www.pollingreport.com/d.htm", "http://www.pollingreport.com/e-f.htm", "http://www.pollingreport.com/g.htm", "http://www.pollingreport.com/h-j.htm", "http://www.pollingreport.com/k.htm", "http://www.pollingreport.com/l.htm", "http://www.pollingreport.com/o.htm", "http://www.pollingreport.com/p.htm", "http://www.pollingreport.com/r.htm", "http://www.pollingreport.com/S-Z.htm", # Congressional Job Ratings "http://www.pollingreport.com/CongJob1.htm", "http://www.pollingreport.com/cong_dem.htm", "http://www.pollingreport.com/cong_rep.htm", # Party Approval "http://www.pollingreport.com/dem.htm", "http://www.pollingreport.com/rep.htm", ] if __name__ == '__main__': os.makedirs("parsed", exist_ok=True) os.makedirs("raw", exist_ok=True) for url in urls: filename = url.split("/")[-1] + ".csv" with open("parsed/" + filename, "w", encoding="latin-1") as f: scrape_page(url, f)