def get_ISIN_download_pdf(funds, csv_file='ISINs.csv', headless=False):
    '''
    Locate and download the most recent report for a fund, also save ISIN numbers.

    Parameters
    ----------
    funds : list
        The funds to be found.
    csv_file : str
        The filename to which ISINs are written.
    headless : boolean
        If True, the browser is run headlessly.
    '''
    scraper = WebScraper(
        'C:/Users/Ollie/Downloads/chromedriver_win32/chromedriver',
        headless=headless)
    ISINs = {}
    n_funds = len(funds)
    for i, fund in enumerate(funds):
        print(fund)
        nospace_fund = fund.replace(' ', '_')

        # Rename any downloaded pdfs as you go
        downloaded_files = scraper.rename_downloads_if_done()

        # Add any renamed files to the ISIN doc
        if len(downloaded_files) > 0:
            for fund_name in downloaded_files:
                temp_fund_name = fund_name.replace(' ', '_')
                download_pdf, ISIN = ISINs.pop(temp_fund_name)
                write_ISIN(csv_file, fund_name, download_pdf, ISIN)

        print('\n\n')
        completion = round(i / n_funds * 100, 1)
        print(f'Fund {i} of {n_funds} - {completion}% complete')

        # Get the fund id
        ISIN, download_pdf = scraper.find_fund(
            fund, './pdf_downloads/' + nospace_fund + '.pdf')

        if ISIN is None:
            # If the fund cannot be found, write 'Not Found' in the ISIN doc
            write_ISIN(csv_file, fund, 'Not Found', 'Not Found')
        elif download_pdf is None:
            # Write the fund ISIN into the csv straight away if there is no pdf.
            # Otherwise, wait until the pdf is found.
            write_ISIN(csv_file, fund, 'Not Found', ISIN)
        else:
            # If you are waiting for the pdf to download, store the ISIN temporarily
            ISINs.update({nospace_fund: [download_pdf, ISIN]})

    scraper.rename_downloads()
    scraper.kill()
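# write_ISIN is not defined in this excerpt; a minimal sketch is given here,
# assuming it simply appends one comma-separated row (fund, pdf path, ISIN) to
# the csv file. The three-column layout and lack of quoting are assumptions,
# not the original implementation.
def write_ISIN(csv_file, fund, download_pdf, ISIN):
    with open(csv_file, 'a') as file:
        file.write(f'{fund},{download_pdf},{ISIN}\n')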
def search_and_add_station(self):
    scraper = WebScraper(self)
    redo = True
    while redo:
        redo = False
        name, url = scraper.get_stream_url()
        self.show('Confirm: Name: ' + name + ' URL: ' + url
                  + '\nEnter \'Ok\' or \'redo\'.\n')
        command = input()
        if command.lower() == 'ok':
            self.settings['stations'].append(name)
            self.settings['urls'].append(url)
            self.show('Added.')
        else:
            redo = True
def test_Date(self):
    scraper = WebScraper()
    self.assertEqual(scraper._parse_date("1982"), "1982",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("1983-08-14"), "1983",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("1986-08-14 00:00:00"), "1986",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("19/08/1980"), "1980",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("1980-08"), "1980",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date(""), "1970",
                     "Unexpected date response for invalid date format")
    self.assertEqual(scraper._parse_date("no date"), "1970",
                     "Unexpected date response for invalid date format")
    self.assertEqual(scraper._parse_date(None), "1970",
                     "Unexpected date response for invalid date format")
    self.assertEqual(scraper._parse_date("test"), "1970",
                     "Unexpected date response for invalid date format")
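# The assertions above only pin down _parse_date's behaviour; one hypothetical
# implementation that satisfies them is sketched below. The regex approach and
# the "1970" fallback are assumptions, not the scraper's actual code.
import re

def _parse_date(self, date):
    # Return the first four-digit year found in the input, else "1970".
    if not date:
        return "1970"
    match = re.search(r'\b\d{4}\b', str(date))
    return match.group(0) if match else "1970"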
def choose_random_quote(k, url):
    w = WebScraper()
    html = w.get_html(url)
    # html = urllib.urlopen(w.get_html(url)).read().decode()
    quotes = set()
    for li in html.select(k):
        for quote in li.text.split('\n'):
            # Skip zero-length lines and strings that are just author names
            # (i.e. FirstName LastName); genuine quotes should always have
            # more than a few words.
            if (len(quote) > 0 and len(quote.split()) > 3
                    and quote != "Ahh!!! Still looking for more? We have something for you."):
                quotes.add(quote.strip())
    q = random.choice(list(quotes))
    return q
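# Example usage; the selector and URL are placeholders rather than values from
# the original project, and get_html is assumed to return a BeautifulSoup-style
# object since the result is queried with .select().
if __name__ == '__main__':
    print(choose_random_quote('li.quote', 'https://example.com/quotes'))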
import json
import os
from pathlib import Path

from tqdm import tqdm

BASE_DIR = Path(__file__).resolve().parent.parent
output_file = os.path.join(BASE_DIR, 'data/test.csv')

with open('dates.json') as f:
    dates = json.load(f)
with open('urls.json') as f:
    url = json.load(f)

checkins = dates['checkin']
checkouts = dates['checkout']

for checkin, checkout in zip(checkins[10:], checkouts[10:]):
    web1_url = url['web1'].format(checkin=checkin)
    print(f'\n\nAppending website1 data for {checkin}')
    # First page
    web1 = WebScraper(web1_url, 'website1', checkin)
    web1.scrape(output_file)
    for i in tqdm(range(2, 101)):
        web1 = WebScraper(web1_url, 'website1', checkin, page=i)
        web1.scrape(output_file)
        if not web1.MorePages:
            break

    web2_url = url['web2'].format(checkin=checkin, checkout=checkout)
    print(f'\n\nAppending website2 data for {checkin}')
    # First page
    web2 = WebScraper(web2_url, 'website2', checkin, checkout)
    web2.scrape(output_file)
    # Offset starts at 20 and increases by 20
    for i in tqdm(range(20, 2001, 20)):
        web2 = WebScraper(web2_url, 'website2', checkin, checkout, i)
        web2.scrape(output_file)
        if not web2.MorePages:
            break
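# The two JSON files are not included in this excerpt; the loop above implies
# the layout sketched here. The dates and URL templates are placeholders; only
# the keys and the {checkin}/{checkout} format fields come from the code.
example_dates = {'checkin': ['2021-06-01', '2021-06-08'],
                 'checkout': ['2021-06-02', '2021-06-09']}
example_urls = {'web1': 'https://example.com/web1?checkin={checkin}',
                'web2': 'https://example.com/web2?checkin={checkin}&checkout={checkout}'}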
def main():
    app = WebScraper(AnchorScraper())
    # url = 'https://www.tercalivre.com.br'
    url = 'https://github.com'
    app.load(url)
    sys.exit(app.exec_())
import os

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

from data_manager import DataManager  # assumed module path, following the naming of the other imports
from notification_manager import NotificationManager
from web_scraper import WebScraper

keyvault_url = os.environ.get('KEYVAULT_URL')
credential = DefaultAzureCredential()
kv_client = SecretClient(keyvault_url, credential)

MY_EMAIL = kv_client.get_secret('fake-email').value
PASSWORD = kv_client.get_secret('fake-email-password').value
TARGET_EMAIL = kv_client.get_secret('target-email').value

target_product = input("Please enter the full url of the product you wish to search: ")
preferred_price = float(input("Please enter your target price: ").strip('$'))

scraper = WebScraper()
product_dict = scraper.retrieve_price_from_site(target_product, preferred_price)

data_manager = DataManager(product_dict)
data_manager.check_if_item_in_data_file()
products_to_buy = data_manager.check_if_price_below_target()

notification_manager = NotificationManager(MY_EMAIL, PASSWORD)
notification_manager.send_email(products_to_buy, TARGET_EMAIL)
from web_scraper import WebScraper
from indeed_scraper import IndeedScraper
from csv_saver import CsvSaver

scraper = WebScraper(
    IndeedScraper,
    CsvSaver('data_scientist.csv'),
    {
        'job_title': '',
        'location': '',
        'max_count': 50,
        'save_links': True,
        'advance_request': 'q="data+scientist"&limit=50'
    })
scraper.scrape()
import os
import pickle

from numpy import genfromtxt

headless = True
csv_file = 'ISINs.csv'

# Remove any uncompleted downloads
for file in os.listdir():
    if file.endswith('.crdownload') or file.endswith('.pdf'):
        os.remove(file)

# Create download folder
if not os.path.exists('./pdf_downloads'):
    os.mkdir('./pdf_downloads')

# If this is the first time running, you need to get the fund names.
# If not, you can load a save file.
if os.path.exists('funds.p'):
    funds = pickle.load(open('funds.p', 'rb'))
else:
    # Put your path to the chromedriver binaries here!!!
    scraper = WebScraper('C:/Users/Ollie/Downloads/chromedriver_win32/chromedriver',
                         headless=headless)
    funds = scraper.get_fund_list()
    scraper.kill()
    pickle.dump(funds, open('funds.p', 'wb'))

# If the csv file does not exist, fill in headers
if not os.path.exists(csv_file):
    with open(csv_file, 'w') as file:
        file.write('Funds,ISINs\n')

uncompleted_funds = funds
entries = genfromtxt(csv_file, delimiter=',', dtype=str, skip_header=1)
completed_funds = list(entries[:, 0]) if entries.shape[0] > 0 else []
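# The excerpt stops here; presumably the next step skips funds already written
# to the csv. A hedged sketch of that filtering step (not part of the original):
uncompleted_funds = [fund for fund in funds if fund not in completed_funds]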
def _get_stat(player, stat, year):
    url_retriever = HockeyReferenceUrlRetriever(player)
    scraper = WebScraper(url_retriever.get_url())
    return scraper.get_player_stats_for_year(f'{stat}', year)
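# Example call; the player name, stat key, and year below are placeholders, and
# the accepted stat names depend on WebScraper.get_player_stats_for_year:
#
#     goals_1985 = _get_stat('Wayne Gretzky', 'goals', 1985)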