def scrape_descriptions_sync():
    """Scrape companies descriptions synchronously.

    Downloads each symbol's Yahoo Finance profile page with ``urllib``
    and stores it as ``<symbol>.html`` under ``YAHOO_HTMLS``, advancing
    a progress bar as pages arrive.

    The previous version mixed sync and async idioms and never actually
    ran: ``run()`` was defined but never awaited, ``session.request.urlopen``
    is not a ClientSession API, and ``aiofiles.open`` was used without
    ``async with``.
    """
    import urllib.request  # local import: the file's import header is outside this view

    symbols = read_symbols()
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)

    # Present a desktop-browser User-Agent; the default urllib UA is
    # typically rejected by the site.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    }

    # Context manager guarantees the bar is closed even if a request raises.
    with tqdm(total=len(symbols), file=sys.stdout, disable=False) as progress:
        for symbol in symbols:
            url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request) as response:
                (YAHOO_HTMLS / f'{symbol}.html').write_bytes(response.read())
            progress.update(1)
def scrape_descriptions_sync():
    """Download every symbol's Yahoo Finance profile page, one at a time.

    Each page is saved verbatim as ``<symbol>.html`` inside ``YAHOO_HTMLS``;
    the directory is created on first use.
    """

    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)

    symbols = read_symbols()
    for symbol in tqdm(symbols):
        url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
        with urlopen(Request(url, headers=HEADERS)) as response:
            page = response.read()
        (YAHOO_HTMLS / f'{symbol}.html').write_bytes(page)
# Beispiel #3 (Example 3)
# 0  (vote count — scraper artifact)
def scrape_descriptions_sync():
    """Fetch and store the description page for every symbol.

    Delegates the HTTP request to ``send_request`` and the persistence to
    ``write_result``, reporting progress on a tqdm bar.
    """
    symbols = read_symbols()
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)

    progress = tqdm(total=len(symbols), file=sys.stdout, disable=False)
    for symbol in symbols:
        write_result(symbol, send_request(symbol))
        progress.update(1)
# Beispiel #4 (Example 4)
# 0  (vote count — scraper artifact)
def scrape_descriptions_sync(limit=100):
    """Scrape companies descriptions synchronously.

    Downloads each symbol's Yahoo Finance profile page and saves it as
    ``<symbol>.html`` under ``YAHOO_HTMLS``. The run is capped because a
    full pass over every symbol is slow.

    Args:
        limit: maximum number of symbols to scrape (default 100; the old
            hand-rolled counter had an off-by-one and processed 101).
    """
    # Slice up front so the progress-bar total matches what we actually fetch.
    symbols = read_symbols()[:limit]
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)

    # Context manager closes the bar cleanly; the old version returned
    # mid-loop and left it open.
    with tqdm(total=len(symbols)) as progress_bar:
        for symbol in symbols:
            url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
            page = requests.get(url).content
            (YAHOO_HTMLS / f'{symbol}.html').write_bytes(page)
            progress_bar.update(1)
# Beispiel #5 (Example 5)
# 0  (vote count — scraper artifact)
def scrape_data(dst=DATA_FILE, compression='BROTLI'):
    """Scrape company profile data from Yahoo Finance into a Parquet file.

    Fetches every symbol's profile page concurrently (aiohttp + asyncio),
    parses sector/industry/employees/description from the HTML, and streams
    the rows to ``dst`` in batches through a ParquetWriter.

    Args:
        dst: output Parquet file path (defaults to DATA_FILE).
        compression: Parquet compression codec name passed to ParquetWriter.
    """

    # Desktop-browser User-Agent; presumably Yahoo rejects default client
    # UAs — confirm if requests start failing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
        }
    symbols = read_symbols()
    # All columns are stored as strings; 'employees' stays digits-only text.
    columns = ('symbol', 'sector', 'industry', 'employees', 'description')
    schema = pa.schema([(col, pa.string()) for col in columns])

    def parse(text):
        # Extract profile fields from one page's HTML. Keys are omitted
        # (later defaulted to '') when the page lacks the profile block.
        tree = lxml.html.fromstring(text)
        row = {}
        # Description paragraphs live under a <section> headed "Description".
        row['description'] = '\n'.join(tree.xpath('//section[h2//*[text()="Description"]]/p/text()'))
        # The sector/industry/employees container; None when not present.
        info = (tree.xpath('//div[@class="asset-profile-container"]//p[span[text()="Sector"]]') or [None])[0]
        if info is not None:
            row['sector'] = (info.xpath('./span[text()="Sector"]/following-sibling::span[1]/text()') or [''])[0]
            row['industry'] = (info.xpath('./span[text()="Industry"]/following-sibling::span[1]/text()') or [''])[0]
            # Strip thousands separators, e.g. "10,000" -> "10000".
            row['employees'] = (info.xpath('./span[text()="Full Time Employees"]/following-sibling::span[1]/span/text()') or [''])[0].replace(',', '')
        return row

    async def fetch(symbol, session, progress):
        # Download and parse one symbol's profile page; returns a row dict.
        async with session.get(f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}') as response:
            text = await response.read()
            row = {'symbol': symbol}
            row.update(parse(text))
            progress.update()
            return row

    async def run(symbols, writer, progress, batch_size=1000):
        # Fetch symbols in windows of batch_size; each completed window is
        # written out as one Parquet table so memory stays bounded.
        async with aiohttp.ClientSession(headers=headers) as session:
            start, stop = 0, batch_size
            while start < len(symbols):
                # Launch the whole window concurrently, then await all rows.
                tasks = (asyncio.ensure_future(fetch(symbol, session, progress)) for symbol in symbols[start:stop])
                rows = await asyncio.gather(*tasks)
                # Missing fields default to '' so every column has equal length.
                table = pa.Table.from_arrays([pa.array(row.get(col, '') for row in rows) for col in columns], schema=schema)
                writer.write_table(table)
                start, stop = stop, stop + batch_size

    with tqdm(total=len(symbols)) as progress:
        loop = asyncio.get_event_loop()
        loop.set_exception_handler(lambda x, y: None)  # suppress exceptions because of bug in Python 3.7.3 + aiohttp + asyncio
        with closing(pq.ParquetWriter(dst, schema, use_dictionary=False, compression=compression, flavor={'spark'})) as writer:
            loop.run_until_complete(asyncio.ensure_future(run(symbols, writer, progress)))
# Beispiel #6 (Example 6)
# 0  (vote count — scraper artifact)
def scrape_descriptions_sync():
    """Scrape companies descriptions synchronously.

    Saves every symbol's Yahoo Finance profile page into
    ``SYNC_YAHOO_HTMLS`` as ``<symbol>.html`` using ``urlretrieve``.
    """

    def fetch_via_urlretrieve(symbol):
        # Download straight to disk; the (filename, headers) result is unused.
        url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
        urlretrieve(url, SYNC_YAHOO_HTMLS / f'{symbol}.html')

    def fetch_via_urlopen(symbol):
        # Alternative strategy (currently unused): read the response body
        # and write the bytes ourselves.
        url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
        with urlopen(url) as response:
            with open(SYNC_YAHOO_HTMLS / f'{symbol}.html', 'wb') as file:
                file.write(response.read())

    SYNC_YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)
    for symbol in tqdm(read_symbols()):
        fetch_via_urlretrieve(symbol)