def crawl_ticker_symbols(driver: WebDriver = None,
                         url: str = None,
                         soup=None) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that the article involves.

    Returns a list of (exchange, symbol) tuples.
    """
    logger.info("Extracting ticker symbols in article scope")

    if soup is None:
        html = api.get(driver, url, headers=make_headers('wsj'), wait_for=2)
        soup = BeautifulSoup(html)

    stocks = soup.find_all(
        'a',
        class_=lambda v: v and v.startswith('media-object-chiclet'),
        recursive=True)
    logger.debug(f"'{len(stocks)}' stocks")

    urls = list(set(urljoin(base, s['href']) for s in stocks))
    pages = [
        api.get(driver, quote_url, headers=make_headers(source='wsj'))
        for quote_url in urls
    ]
    sections = [
        BeautifulSoup(page_html).find('div', class_='cr_quotesHeader')
        for page_html in pages
    ]
    symbols = [(
        re.search(r'.+:\s?([A-Z0-9a-z]+)\)?',
                  section.find('span', class_='exchangeName',
                               recursive=True).get_text()).group(1),
        section.find('span', class_='tickerName',
                     recursive=True).get_text().strip(),
    ) for section in sections]

    logger.info(f"Symbols found: '{['::'.join(s) for s in symbols]}'")
    return symbols

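# A minimal sketch of the exchange-regex step above. The sample span text
# "(U.S.: NYSE)" is an assumption about how WSJ renders the 'exchangeName'
# span, not something verified against a live page:
#
#     >>> import re
#     >>> re.search(r'.+:\s?([A-Z0-9a-z]+)\)?', '(U.S.: NYSE)').group(1)
#     'NYSE'
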
def crawl_ticker_symbols(soup) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that the article involves.

    Returns a list of (exchange, symbol) tuples.
    """
    logger.info("Extracting ticker symbols in article scope")

    section = soup.find('span', id='about_stocks')
    stocks = section.find_all('a')
    logger.debug(f"'{len(stocks)}' stocks")

    urls = [urljoin(base, s['href']) for s in stocks]
    pages = [
        requests.get(url, headers=make_headers(source='seekingalpha'))
        for url in urls
    ]
    sections = [
        BeautifulSoup(result.content).find('div', class_='symbol_title')
        for result in pages
    ]
    symbols = [(section.find('meta', itemprop='exchange')['content'],
                section.find('meta', itemprop='tickerSymbol')['content'])
               for section in sections]

    logger.info(f"Symbols found: '{['::'.join(s) for s in symbols]}'")
    return symbols

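# A minimal sketch of the meta-tag extraction above, against a hypothetical
# fragment of the 'symbol_title' markup (an assumption about Seeking Alpha's
# quote pages at the time of writing, not verified against a live one):
#
#     >>> from bs4 import BeautifulSoup
#     >>> snippet = ('<div class="symbol_title">'
#     ...            '<meta itemprop="exchange" content="NASDAQ">'
#     ...            '<meta itemprop="tickerSymbol" content="LK"></div>')
#     >>> section = BeautifulSoup(snippet, 'html.parser').find('div', class_='symbol_title')
#     >>> (section.find('meta', itemprop='exchange')['content'],
#     ...  section.find('meta', itemprop='tickerSymbol')['content'])
#     ('NASDAQ', 'LK')
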
def crawl_ticker_symbols(driver: WebDriver = None,
                         url: str = None,
                         soup=None) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that the article involves.

    Returns a list of (exchange, symbol) tuples.
    """
    logger.info("Extracting ticker symbols in article scope")

    if soup is None:
        html = api.get(driver, url, headers=make_headers('bloomberg'), wait_for=2)
        soup = BeautifulSoup(html)

    section = soup.find('section',
                        class_=lambda v: v and 'main-column' in v,
                        recursive=True)
    references = section.find_all('a', recursive=True)
    stocks = [ref for ref in references if '/quote' in ref.get('href', '')]
    # Deduplicate, and skip quote links whose last path segment starts
    # with a digit.
    stocks = list(
        set(
            urljoin(base, s['href']) for s in stocks
            if not s['href'].split('/')[-1][0].isdigit()))
    logger.debug(f"'{len(stocks)}' stocks")

    pages = [api.get(driver, quote_url) for quote_url in stocks]
    sections = [BeautifulSoup(result) for result in pages]
    sections = [
        section.find('meta', property='og:title')['content']
        for section in sections
    ]
    matches = [
        re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', s) for s in sections
    ]
    symbols = [(match.group(2), match.group(1)) for match in matches]

    # Normalize Bloomberg's exchange names to the acronyms used elsewhere;
    # fall back to the raw name for exchanges not in the map.
    translate = {'New York': 'NYSE', 'NASDAQ GS': 'NASDAQ'}
    symbols = [(translate.get(exchange, exchange), ticker)
               for exchange, ticker in symbols]

    logger.info(f"Symbols found: '{['::'.join(s) for s in symbols]}'")
    return symbols

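# A minimal sketch of the og:title parsing above. The sample title follows
# the "TICKER:Exchange Stock Quote ..." shape the regex expects; the exact
# string is an assumption, not copied from a live Bloomberg page:
#
#     >>> import re
#     >>> title = 'GS:New York Stock Quote - Goldman Sachs Group Inc - Bloomberg'
#     >>> m = re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', title)
#     >>> (m.group(2), m.group(1))
#     ('New York', 'GS')
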
def parse_article(driver: WebDriver, url: str):
    """
    Given an article, parses its content into a representation that
    preserves the structure:

    * Grab the HTML page
    * Crawl its contents
    * Tag the text with the different hierarchical components
    * Parse the resulting output into a graph
    * Enrich with metadata:
        - author information: name, url, etc.
        - document title
        - publishing timestamp
        - other metadata
    """
    logger.info(f"Parsing article '{url}'")

    html = api.get(driver, url, make_headers(source='seekingalpha'), wait_for=2)
    soup = BeautifulSoup(html)
    logger.debug(f"Soup length '{len(soup)}'")

    hierarchy = ['Article', 'Section', 'Paragraph']
    descriptor = {'components': hierarchy, 'patterns': hierarchy}

    text = crawl_article(soup)
    logger.info(f"Text crawled. Number of lines '{len(text)}'")

    logger.info("Creating a graph")
    doc = parse_iterable(text, descriptor)
    doc = doc.to_dict()
    doc['url'] = url
    doc['title'] = crawl_title(soup)
    doc['author'] = crawl_author(driver, soup)
    doc['timestamp'] = crawl_timestamp(soup)
    doc['symbols'] = crawl_ticker_symbols(soup)
    # TODO: meta (e.g. likes and comments) is still problematic
    doc['meta'] = crawl_metadata(soup)
    return doc

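# Usage sketch, assuming an initialized Chrome driver such as the
# init_chrome_driver used in the __main__ blocks of this repo; network access
# and a live article are required, so this is illustrative rather than a test:
#
#     >>> from src.webdriver import init_chrome_driver
#     >>> driver = init_chrome_driver()
#     >>> doc = parse_article(driver, 'https://seekingalpha.com/article/...')
#     >>> doc.keys()  # graph dict enriched with url, title, author, timestamp, symbols, meta
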
def crawl_author(driver: WebDriver, soup):
    logger.debug("Extracting author information")

    author_tag = soup.find('div', class_='media hidden-print').find(
        'div', class_='info').find('div', class_='top')
    author_url = author_tag.find('a')['href']
    logger.debug(f"Author URL: '{author_url}'")
    author_name = author_tag.find('span', class_='name').get_text()
    logger.debug(f"Author Name: '{author_name}'")

    logger.debug(f"Getting author specific page '{author_url}'")
    html = api.get(driver, author_url, make_headers('seekingalpha'), wait_for=2)
    soup = BeautifulSoup(html)

    followers = soup.find(
        'li', class_=['followers', 'followers tab ']).find('i').get_text()
    logger.debug(f"Number of followers '{followers}'")
    articles = soup.find('li', class_='articles').find(
        'i', class_='profile-top-nav-count').get_text()
    logger.debug(f"Number of articles '{articles}'")

    result = {
        'name': author_name,
        'url': author_url,
        'followers': followers,
        'articles': articles,
    }
    logger.debug(f"Author Information: '{result}'")
    return result

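# Shape of the returned dict (values illustrative; counts come back as the
# raw text of the profile counters, so they may contain thousands separators):
#
#     {'name': '...', 'url': '...', 'followers': '1,234', 'articles': '56'}
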
def crawl_article(x):
    # `x` may be either an article URL or an already-parsed soup.
    if isinstance(x, str):
        result = requests.get(x)
        soup = BeautifulSoup(result.content)
    else:
        soup = x

    title = crawl_title(soup)
    summary = _crawl_summary(soup)
    body = _crawl_body(soup)
    article = ["[[Article]]{}.".format(title)] + summary + body
    return article


def crawl_metadata(soup) -> Dict:
    """
    Returns article metadata, e.g. number of likes, comments, etc.
    """
    logger.debug("Crawling metadata")
    # TODO: no metadata?
    return {}


if __name__ == '__main__':
    url = 'https://www.wsj.com/articles/goldman-sachs-lifts-the-veil-to-woo-skeptical-shareholders-11578394803'
    result = requests.get(url, headers=make_headers(source='wsj'))
    soup = BeautifulSoup(result.content)
    # Pass url and soup by keyword: the first positional parameter is the
    # (optional) driver.
    crawl_ticker_symbols(url=url, soup=soup)

def get(driver: WebDriver, url: str, headers=None, wait_for=None) -> str:
    driver.header_overrides = headers
    driver.get(url)
    if wait_for is not None:
        logger.debug(
            f"Sleeping for '{wait_for}' seconds waiting for dynamic content rendering"
        )
        time.sleep(wait_for)
    html = driver.find_element_by_tag_name('html').get_attribute('innerHTML')
    return html


if __name__ == '__main__':
    from src.webdriver import init_chrome_driver
    from src.crawling.http import make_headers

    driver = init_chrome_driver()

    # Test URLs; each assignment overrides the previous one.
    url = 'https://www.wsj.com/articles/goldman-sachs-lifts-the-veil-to-woo-skeptical-shareholders-11578394803'
    url = 'https://www.bloomberg.com/news/articles/2019-12-05/boeing-tries-to-win-over-pilots-attendants-with-737-max-pitch'
    url = 'https://seekingalpha.com/article/4337293-tale-of-2-stocks-luckin-coffee-and-iqiyi'
    url = 'https://www.fool.com/investing/2019/10/08/why-sailpoint-technologies-stock-dropped-17-in-sep.aspx'

    html = get(driver, url, make_headers(source='fool'))
    # html = get(driver, url)
    print(html)

logger.debug(f"Crawling metadata") header = soup.find('div', class_='a-info clearfix') comments_tag = header.find('span', id='a-comments') if comments_tag: comments = comments_tag.find('a').get_text() else: comments = 0 likes_tag = header.find('div', class_='likers show-likers inited') if likes_tag: likes = likes_tag['data-count'] else: likes = 0 metadata = {'comments': comments, 'likes': likes} logger.debug(f"Article metadata: '{metadata}'") return metadata if __name__ == '__main__': url = 'https://seekingalpha.com/article/4294051-week-review-henlius-licenses-southeast-asia-rights-pdminus-1-candidate-692-million-deal' result = requests.get(url, headers=make_headers(source='seekingalpha')) soup = BeautifulSoup(result.content) crawl_ticker_symbols(soup)