Example #1
def crawl_ticker_symbols(driver: WebDriver = None, url: str = None, soup=None) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that are involved
    Returns a list of strings [(exchange, symbol)]
    """
    logger.info(f"Extracting ticker symbols in article scope")

    if soup is None:
        html = api.get(driver, url, headers=make_headers('wsj'), wait_for=2)
        soup = BeautifulSoup(html)

    stocks = soup.find_all('a', class_=lambda v: v and v.startswith("media-object-chiclet"), recursive=True)
    logger.debug(f"'{len(stocks)}' stocks")

    urls = list(set([urljoin(base, s['href']) for s in stocks]))
    pages = [api.get(driver, url, headers=make_headers(source='wsj')) for url in urls]

    sections = [BeautifulSoup(html).find('div', class_='cr_quotesHeader') for html in pages]

    symbols = [
        (
            re.search(r'.+:\s?([A-Z0-9a-z]+)\)?', section.find('span', class_='exchangeName', recursive=True).get_text()).group(1),
            section.find('span', class_='tickerName', recursive=True).get_text().strip()
        )
        for section in sections
    ]

    logger.info(f"Symbols found: '{[(s[0] + str('::') + s[1]) for s in symbols]}'")

    return symbols
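
This WSJ variant leans on a Selenium-backed api.get helper and a module-level base URL defined elsewhere in the module; the exchange code is then pulled out of each quote page's exchangeName span with a regular expression. The snippet below is a minimal, self-contained sketch of that extraction, run over hypothetical span texts since the actual WSJ markup is not shown here.

import re

# Hypothetical 'exchangeName' texts; the real WSJ quote header may differ.
samples = ['U.S.: NYSE', 'U.S.: Nasdaq']
for text in samples:
    match = re.search(r'.+:\s?([A-Z0-9a-z]+)\)?', text)
    print(match.group(1) if match else None)
# -> NYSE
# -> Nasdaq
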
Example #2
def crawl_ticker_symbols(soup) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that are involved
    Returns a list of strings [(exchange, symbol)]
    """
    logger.info(f"Extracting ticker symbols in article scope")

    section = soup.find('span', id='about_stocks')
    stocks = section.find_all('a')
    logger.debug(f"'{len(stocks)}' stocks")

    urls = [urljoin(base, s['href']) for s in stocks]
    pages = [
        requests.get(url, headers=make_headers(source='seekingalpha'))
        for url in urls
    ]
    sections = [
        BeautifulSoup(result.content).find('div', class_='symbol_title')
        for result in pages
    ]

    symbols = [(section.find('meta', itemprop='exchange')['content'],
                section.find('meta', itemprop='tickerSymbol')['content'])
               for section in sections]

    logger.info(
        f"Symbols found: '{[(s[0] + str('::') + s[1]) for s in symbols]}'")

    return symbols
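
The Seeking Alpha variant reads the exchange and ticker straight from meta tags inside each quote page's symbol_title div. Below is a minimal sketch of those lookups against hypothetical markup; the real Seeking Alpha pages may be structured differently.

from bs4 import BeautifulSoup

# Hypothetical markup mirroring the structure the lookups above expect;
# real Seeking Alpha quote pages may differ.
html = """
<div class="symbol_title">
  <meta itemprop="exchange" content="NASDAQ">
  <meta itemprop="tickerSymbol" content="LK">
</div>
"""
section = BeautifulSoup(html, 'html.parser').find('div', class_='symbol_title')
print((section.find('meta', itemprop='exchange')['content'],
       section.find('meta', itemprop='tickerSymbol')['content']))
# -> ('NASDAQ', 'LK')
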
Example #3
def crawl_ticker_symbols(driver: WebDriver = None,
                         url: str = None,
                         soup=None) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that are involved
    Returns a list of strings [(exchange, symbol)]
    """
    logger.info(f"Extracting ticker symbols in article scope")

    if soup is None:
        html = api.get(driver,
                       url,
                       headers=make_headers('bloomberg'),
                       wait_for=2)
        soup = BeautifulSoup(html)

    section = soup.find('section',
                        class_=lambda v: v and 'main-column' in v,
                        recursive=True)

    references = section.find_all('a', recursive=True)
    stocks = [ref for ref in references if '/quote' in ref.get('href', '')]
    stocks = list(
        set([
            urljoin(base, s['href']) for s in stocks
            if not s['href'].split('/')[-1][0].isdigit()
        ]))

    logger.debug(f"'{len(stocks)}' stocks")

    pages = [api.get(driver, url) for url in stocks]

    logger.debug(f"'{len(pages)}' quote pages fetched")

    sections = [BeautifulSoup(result) for result in pages]
    sections = [
        section.find('meta', property='og:title')['content']
        for section in sections
    ]

    matches = [
        re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', s) for s in sections
    ]

    symbols = [(match.group(2), match.group(1)) for match in matches]

    translate = {'New York': 'NYSE', 'NASDAQ GS': 'NASDAQ'}

    symbols = [(translate[exchange], ticker) for exchange, ticker in symbols]

    logger.info(
        f"Symbols found: '{[(s[0] + str('::') + s[1]) for s in symbols]}'")

    return symbols
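
The Bloomberg variant recovers both values from the og:title meta tag and then normalizes the exchange name through the translate mapping. The sketch below runs the same regex and mapping over hypothetical title strings; the exact Bloomberg title format is an assumption.

import re

# Hypothetical og:title values; real Bloomberg quote titles may be formatted
# differently.
titles = ['BA:New York Stock Quote - Boeing Co',
          'AAPL:NASDAQ GS Stock Quote - Apple Inc']
translate = {'New York': 'NYSE', 'NASDAQ GS': 'NASDAQ'}

for title in titles:
    match = re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', title)
    exchange, ticker = match.group(2), match.group(1)
    print((translate[exchange], ticker))
# -> ('NYSE', 'BA')
# -> ('NASDAQ', 'AAPL')
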
Example #4
def parse_article(driver: WebDriver, url: str):
    """
    Given an article, parse its content into a
    representation that preserves the structure

    * Grab the HTML page
    * Crawl its contents
    * Tag the text with the different hierarchical components
    * Parse the resulting output into a graph
    * Enrich with metadata:
        - author information: name, url, etc;
        - document title;
        - publishing timestamp;
        - other metadata;
    """
    logger.info(f"Parsing article '{url}'")

    html = api.get(driver, url, make_headers(source='seekingalpha'), wait_for=2)
    soup = BeautifulSoup(html)
    logger.debug(f"Soup length '{len(soup)}'")

    hierarchy = ['Article', 'Section', 'Paragraph']

    descriptor = {
        'components': hierarchy,
        'patterns': hierarchy
    }

    text = crawl_article(soup)
    logger.info(f"Text crawled. Number of lines '{len(text)}'")

    logger.info(f"Creating a graph")
    doc = parse_iterable(text, descriptor)
    doc = doc.to_dict()

    doc['url'] = url
    doc['title'] = crawl_title(soup)
    doc['author'] = crawl_author(driver, soup)
    doc['timestamp'] = crawl_timestamp(soup)
    doc['symbols'] = crawl_ticker_symbols(soup)

    # TODO: metadata (e.g. likes and comments) still has issues
    doc['meta'] = crawl_metadata(soup)

    return doc
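
parse_iterable and the descriptor drive the graph construction but are defined elsewhere; the sketch below only illustrates the tagged-line input they appear to consume. The [[Article]] marker comes from crawl_article (Example #6), while the [[Section]] and [[Paragraph]] markers are assumptions based on the hierarchy list.

# Tagged lines as crawl_article appears to produce them; the Section and
# Paragraph markers are assumptions, only [[Article]] appears in the excerpts.
text = [
    '[[Article]]Goldman Sachs Lifts the Veil to Woo Skeptical Shareholders.',
    '[[Paragraph]]The bank laid out multi-year financial targets.',
]
descriptor = {
    'components': ['Article', 'Section', 'Paragraph'],
    'patterns': ['Article', 'Section', 'Paragraph'],
}
# parse_iterable(text, descriptor) would turn these tagged lines into the
# Article -> Section -> Paragraph graph described in the docstring above.
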
Example #5
def crawl_author(driver: WebDriver, soup):
    logger.debug(f"Extracting author information")

    author_tag = soup.find('div', class_='media hidden-print').find(
        'div', class_='info').find('div', class_='top')

    author_url = author_tag.find('a')['href']
    logger.debug(f"Author URL: '{author_url}'")

    author_name = author_tag.find('span', class_='name').get_text()
    logger.debug(f"Author Name: '{author_name}'")

    logger.debug(f"Getting author specific page '{author_url}'")

    html = api.get(driver,
                   author_url,
                   make_headers('seekingalpha'),
                   wait_for=2)
    soup = BeautifulSoup(html)

    followers = soup.find('li',
                          class_=['followers',
                                  'followers tab ']).find('i').get_text()
    logger.debug(f"Number of followers '{followers}'")

    articles = soup.find('li', class_='articles').find(
        'i', class_='profile-top-nav-count').get_text()
    logger.debug(f"Number of articles '{articles}'")

    result = {
        'name': author_name,
        'url': author_url,
        'followers': followers,
        'articles': articles
    }

    logger.debug(f"Author Information: '{result}'")
    return result
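
The chained find calls assume an author header nested as media hidden-print > info > top, with the link and name inside. The snippet below exercises the same lookups against hypothetical markup; the real Seeking Alpha author block may differ.

from bs4 import BeautifulSoup

# Hypothetical author header; real Seeking Alpha markup may differ.
html = """
<div class="media hidden-print">
  <div class="info">
    <div class="top">
      <a href="/author/example-author"><span class="name">Example Author</span></a>
    </div>
  </div>
</div>
"""
author_tag = BeautifulSoup(html, 'html.parser').find(
    'div', class_='media hidden-print').find(
        'div', class_='info').find('div', class_='top')
print(author_tag.find('a')['href'],
      author_tag.find('span', class_='name').get_text())
# -> /author/example-author Example Author
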
Example #6
def crawl_article(x):
    # x may be a URL string or an already parsed BeautifulSoup object
    if isinstance(x, str):
        result = requests.get(x)
        soup = BeautifulSoup(result.content)
    else:
        soup = x

    title = crawl_title(soup)
    summary = _crawl_summary(soup)
    body = _crawl_body(soup)

    article = ["[[Article]]{}.".format(title)] + summary + body
    return article


def crawl_metadata(soup) -> Dict:
    """
    Returns article metadata,
    e.g. number of likes, comments, etc
    """
    logger.debug(f"Crawling metadata")
    # TODO: no metadata?
    return {}


if __name__ == '__main__':
    url = 'https://www.wsj.com/articles/goldman-sachs-lifts-the-veil-to-woo-skeptical-shareholders-11578394803'

    result = requests.get(url, headers=make_headers(source='wsj'))
    soup = BeautifulSoup(result.content)

    crawl_ticker_symbols(soup, url)
Example #7
def get(driver: WebDriver, url: str, headers: Dict = None, wait_for: int = None) -> str:
    if headers is not None:
        driver.header_overrides = headers

    driver.get(url)

    if wait_for is not None:
        logger.debug(
            f"Sleeping for '{wait_for}' seconds waiting for dyanmic content rendering"
        )
        time.sleep(wait_for)

    html = driver.find_element_by_tag_name('html').get_attribute('innerHTML')

    return html


if __name__ == '__main__':

    from src.webdriver import init_chrome_driver
    from src.crawling.http import make_headers

    driver = init_chrome_driver()

    url = 'https://www.wsj.com/articles/goldman-sachs-lifts-the-veil-to-woo-skeptical-shareholders-11578394803'
    url = 'https://www.bloomberg.com/news/articles/2019-12-05/boeing-tries-to-win-over-pilots-attendants-with-737-max-pitch'
    url = 'https://seekingalpha.com/article/4337293-tale-of-2-stocks-luckin-coffee-and-iqiyi'
    url = 'https://www.fool.com/investing/2019/10/08/why-sailpoint-technologies-stock-dropped-17-in-sep.aspx'

    html = get(driver, url, make_headers(source='fool'))
    # html = get(driver, url)
    print(html)
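
init_chrome_driver is imported from src.webdriver but not shown. Because the function above sets driver.header_overrides, the driver is presumably created with Selenium Wire, which exposed that attribute in its earlier releases; the sketch below is one plausible, minimal version of the helper under that assumption.

from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver  # assumption: Selenium Wire provides header_overrides


def init_chrome_driver():
    options = Options()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)
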
Example #8
    logger.debug(f"Crawling metadata")

    header = soup.find('div', class_='a-info clearfix')

    comments_tag = header.find('span', id='a-comments')
    if comments_tag:
        comments = comments_tag.find('a').get_text()
    else:
        comments = 0

    likes_tag = header.find('div', class_='likers show-likers inited')
    if likes_tag:
        likes = likes_tag['data-count']
    else:
        likes = 0

    metadata = {'comments': comments, 'likes': likes}

    logger.debug(f"Article metadata: '{metadata}'")
    return metadata


if __name__ == '__main__':

    url = 'https://seekingalpha.com/article/4294051-week-review-henlius-licenses-southeast-asia-rights-pdminus-1-candidate-692-million-deal'

    result = requests.get(url, headers=make_headers(source='seekingalpha'))
    soup = BeautifulSoup(result.content)

    crawl_ticker_symbols(soup)
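
The comment and like counts come from the a-info clearfix header: an a-comments span holds the comment link and the likers div carries a data-count attribute. Below is a minimal sketch of those lookups against hypothetical markup; the real article header may differ.

from bs4 import BeautifulSoup

# Hypothetical article header; real Seeking Alpha markup may differ.
html = """
<div class="a-info clearfix">
  <span id="a-comments"><a>25 Comments</a></span>
  <div class="likers show-likers inited" data-count="12"></div>
</div>
"""
header = BeautifulSoup(html, 'html.parser').find('div', class_='a-info clearfix')
comments = header.find('span', id='a-comments').find('a').get_text()
likes = header.find('div', class_='likers show-likers inited')['data-count']
print({'comments': comments, 'likes': likes})
# -> {'comments': '25 Comments', 'likes': '12'}
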