def scrape_descriptions_sync():
    """Scrape companies descriptions synchronously.

    Downloads the Yahoo Finance profile page for every symbol returned by
    ``read_symbols`` and stores the raw HTML under
    ``YAHOO_HTMLS / '<symbol>.html'``.

    The previous version mixed urllib, aiofiles and aiohttp APIs that could
    never run (`session.request.urlopen` does not exist, `aiofiles.open` was
    used without ``await``, and the event-loop call was commented out), so the
    function was effectively a no-op.  This implementation fulfils the TODO
    using only ``urllib.request``.
    """
    from urllib.request import Request, urlopen  # stdlib, per the TODO

    symbols = read_symbols()
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)
    # Yahoo rejects requests without a browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    }
    # Context manager guarantees the bar is closed even if a request fails.
    with tqdm(total=len(symbols), file=sys.stdout, disable=False) as progress:
        for symbol in symbols:
            url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
            with urlopen(Request(url, headers=headers)) as response:
                (YAHOO_HTMLS / f'{symbol}.html').write_bytes(response.read())
            progress.update(1)
def scrape_descriptions_sync():
    """Scrape companies descriptions synchronously.

    Fetches each symbol's Yahoo Finance profile page one at a time and
    writes the raw HTML to ``YAHOO_HTMLS / '<symbol>.html'``.
    """
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)
    symbols = read_symbols()
    for symbol in tqdm(symbols):
        url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
        request = Request(url, headers=HEADERS)
        destination = YAHOO_HTMLS / f'{symbol}.html'
        with urlopen(request) as response, destination.open('wb') as out:
            out.write(response.read())
def scrape_descriptions_sync():
    """Fetch and persist the profile page for every symbol, one by one."""
    symbols = read_symbols()
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)
    progress = tqdm(total=len(symbols), file=sys.stdout, disable=False)
    for symbol in symbols:
        # Delegate the actual HTTP request and file write to the helpers.
        write_result(symbol, send_request(symbol))
        progress.update(1)
def scrape_descriptions_sync():
    """Scrape companies descriptions synchronously.

    Downloads the Yahoo Finance profile page for (at most) the first 101
    symbols returned by ``read_symbols`` and saves the raw HTML to
    ``YAHOO_HTMLS / '<symbol>.html'``.
    """
    # Preserves the original `counter > 100` early return: 101 pages max.
    max_pages = 101
    symbols = read_symbols()
    YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)
    # Context manager closes the bar even if a request raises (the original
    # leaked an open tqdm instance).  Total intentionally shows all symbols,
    # matching the original display.
    with tqdm(total=len(symbols)) as progress_bar:
        for symbol in symbols[:max_pages]:
            yahoo_url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
            page = requests.get(yahoo_url).content
            (YAHOO_HTMLS / f'{symbol}.html').write_bytes(page)
            progress_bar.update(1)
def scrape_data(dst=DATA_FILE, compression='BROTLI'):
    """Scrape custom data.

    Concurrently downloads every symbol's Yahoo Finance profile page, parses
    sector/industry/employees/description out of the HTML, and streams the
    rows into a Parquet file at *dst* in batches.

    Parameters
    ----------
    dst : path-like
        Destination Parquet file (defaults to ``DATA_FILE``).
    compression : str
        Parquet compression codec passed through to ``pq.ParquetWriter``.
    """
    # Yahoo rejects requests without a browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    }
    symbols = read_symbols()
    columns = ('symbol', 'sector', 'industry', 'employees', 'description')
    # All columns are stored as strings; missing fields become ''.
    schema = pa.schema([(col, pa.string()) for col in columns])

    def parse(text):
        # Extract the profile fields from the raw HTML via XPath.  Each
        # `(... or [''])[0]` guards against the element being absent.
        tree = lxml.html.fromstring(text)
        row = {}
        row['description'] = '\n'.join(tree.xpath('//section[h2//*[text()="Description"]]/p/text()'))
        info = (tree.xpath('//div[@class="asset-profile-container"]//p[span[text()="Sector"]]') or [None])[0]
        if info is not None:
            row['sector'] = (info.xpath('./span[text()="Sector"]/following-sibling::span[1]/text()') or [''])[0]
            row['industry'] = (info.xpath('./span[text()="Industry"]/following-sibling::span[1]/text()') or [''])[0]
            row['employees'] = (info.xpath('./span[text()="Full Time Employees"]/following-sibling::span[1]/span/text()') or [''])[0].replace(',', '')
        return row

    async def fetch(symbol, session, progress):
        # One GET per symbol; returns a dict keyed by the schema columns.
        async with session.get(f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}') as response:
            text = await response.read()
        row = {'symbol': symbol}
        row.update(parse(text))
        progress.update()
        return row

    async def run(symbols, writer, progress, batch_size=1000):
        # Fetch in batches of `batch_size` so memory stays bounded and rows
        # are flushed to the Parquet writer incrementally.
        async with aiohttp.ClientSession(headers=headers) as session:
            start, stop = 0, batch_size
            while start < len(symbols):
                tasks = (asyncio.ensure_future(fetch(symbol, session, progress))
                         for symbol in symbols[start:stop])
                rows = await asyncio.gather(*tasks)
                table = pa.Table.from_arrays(
                    [pa.array(row.get(col, '') for row in rows) for col in columns],
                    schema=schema)
                writer.write_table(table)
                start, stop = stop, stop + batch_size

    with tqdm(total=len(symbols)) as progress:
        loop = asyncio.get_event_loop()
        loop.set_exception_handler(lambda x, y: None)  # suppress exceptions because of bug in Python 3.7.3 + aiohttp + asyncio
        # Fix: flavor expects the string 'spark' per pyarrow docs, not a set.
        with closing(pq.ParquetWriter(dst, schema, use_dictionary=False,
                                      compression=compression, flavor='spark')) as writer:
            loop.run_until_complete(asyncio.ensure_future(run(symbols, writer, progress)))
def scrape_descriptions_sync():
    """Scrape companies descriptions synchronously.

    Downloads the Yahoo Finance profile page for each symbol from
    ``read_symbols`` and saves it as ``SYNC_YAHOO_HTMLS / '<symbol>.html'``.

    Cleanup over the previous version: removed commented-out debug prints,
    the unused ``filename, message`` unpacking, the never-called alternative
    helper ``scrape_file_urlopen``, and the needless one-shot ``load_files``
    wrapper.
    """
    SYNC_YAHOO_HTMLS.mkdir(parents=True, exist_ok=True)
    for symbol in tqdm(read_symbols()):
        url = f'https://finance.yahoo.com/quote/{symbol}/profile?p={symbol}'
        # NOTE(review): urlretrieve is documented as a legacy interface that
        # may be deprecated; urlopen + a file write is the modern equivalent.
        urlretrieve(url, SYNC_YAHOO_HTMLS / f'{symbol}.html')