def get_ISIN_download_pdf(funds, csv_file='ISINs.csv', headless=False):
    '''
    Locate and download the most recent report for a fund, also save ISIN numbers.

    Parameters
    ----------
    funds : list
        The funds to be found.
    csv_file : str
        The filename to which ISINs are written.
    headless : boolean
        If True, the browser is run headlessly.
    '''
    scraper = WebScraper(
        'C:/Users/Ollie/Downloads/chromedriver_win32/chromedriver',
        headless=headless)
    ISINs = {}
    n_funds = len(funds)
    for i, fund in enumerate(funds):
        print(fund)
        nospace_fund = fund.replace(' ', '_')

        # Rename any downloaded pdfs as you go
        downloaded_files = scraper.rename_downloads_if_done()

        # Add any renamed files to the ISIN doc
        if len(downloaded_files) > 0:
            for fund_name in downloaded_files:
                temp_fund_name = fund_name.replace(' ', '_')
                download_pdf, ISIN = ISINs.pop(temp_fund_name)
                write_ISIN(csv_file, fund_name, download_pdf, ISIN)

        print('\n\n')
        completion = round(i / n_funds * 100, 1)
        print(f'Fund {i} of {n_funds} - {completion}% complete')

        # Get the fund id
        ISIN, download_pdf = scraper.find_fund(
            fund, './pdf_downloads/' + nospace_fund + '.pdf')

        if ISIN is None:
            # If the fund cannot be found, write 'Not Found' in the ISIN doc
            write_ISIN(csv_file, fund, 'Not Found', 'Not Found')
        elif download_pdf is None:
            # Write the fund ISIN into the csv straight away if there is no pdf.
            # Otherwise, wait until the pdf is found.
            write_ISIN(csv_file, fund, 'Not Found', ISIN)
        else:
            # If you are waiting for the pdf to download, store the ISIN temporarily
            ISINs.update({nospace_fund: [download_pdf, ISIN]})

    scraper.rename_downloads()
    scraper.kill()
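# write_ISIN is not defined in this excerpt; a minimal sketch is given here,
# assuming it simply appends one comma-separated row (fund, pdf path, ISIN) to
# the csv file. The three-column layout and lack of quoting are assumptions,
# not the original implementation.
def write_ISIN(csv_file, fund, download_pdf, ISIN):
    with open(csv_file, 'a') as file:
        file.write(f'{fund},{download_pdf},{ISIN}\n')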
def search_and_add_station(self):
    scraper = WebScraper(self)
    redo = True
    while redo:
        redo = False
        name, url = scraper.get_stream_url()
        self.show('Confirm: Name: ' + name + ' URL: ' + url
                  + '\nEnter \'Ok\' or \'redo\'.\n')
        command = input()
        if command.lower() == 'ok':
            self.settings['stations'].append(name)
            self.settings['urls'].append(url)
            self.show('Added.')
        else:
            redo = True
def test_Date(self):
    scraper = WebScraper()
    self.assertEqual(scraper._parse_date("1982"), "1982",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("1983-08-14"), "1983",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("1986-08-14 00:00:00"), "1986",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("19/08/1980"), "1980",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date("1980-08"), "1980",
                     "Unexpected date response for valid date format")
    self.assertEqual(scraper._parse_date(""), "1970",
                     "Unexpected date response for invalid date format")
    self.assertEqual(scraper._parse_date("no date"), "1970",
                     "Unexpected date response for invalid date format")
    self.assertEqual(scraper._parse_date(None), "1970",
                     "Unexpected date response for invalid date format")
    self.assertEqual(scraper._parse_date("test"), "1970",
                     "Unexpected date response for invalid date format")
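# The assertions above only pin down _parse_date's behaviour; one hypothetical
# implementation that satisfies them is sketched below. The regex approach and
# the "1970" fallback are assumptions, not the scraper's actual code.
import re

def _parse_date(self, date):
    # Return the first four-digit year found in the input, else "1970".
    if not date:
        return "1970"
    match = re.search(r'\b\d{4}\b', str(date))
    return match.group(0) if match else "1970"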
def choose_random_quote(k, url):
    w = WebScraper()
    html = w.get_html(url)
    # html = urllib.urlopen(w.get_html(url)).read().decode()
    quotes = set()
    for li in html.select(k):
        for quote in li.text.split('\n'):
            # Skip zero-length lines and strings that are just author names
            # (i.e. FirstName LastName); genuine quotes should always have
            # more than a few words.
            if (len(quote) > 0 and len(quote.split()) > 3
                    and quote != "Ahh!!! Still looking for more? We have something for you."):
                quotes.add(quote.strip())
    q = random.choice(list(quotes))
    return q
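# Example usage; the selector and URL are placeholders rather than values from
# the original project, and get_html is assumed to return a BeautifulSoup-style
# object since the result is queried with .select().
if __name__ == '__main__':
    print(choose_random_quote('li.quote', 'https://example.com/quotes'))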
import json
import os
from pathlib import Path

from tqdm import tqdm

BASE_DIR = Path(__file__).resolve().parent.parent
output_file = os.path.join(BASE_DIR, 'data/test.csv')

with open('dates.json') as f:
    dates = json.load(f)
with open('urls.json') as f:
    url = json.load(f)

checkins = dates['checkin']
checkouts = dates['checkout']

for checkin, checkout in zip(checkins[10:], checkouts[10:]):
    web1_url = url['web1'].format(checkin=checkin)
    print(f'\n\nAppending website1 data for {checkin}')
    # First page
    web1 = WebScraper(web1_url, 'website1', checkin)
    web1.scrape(output_file)
    for i in tqdm(range(2, 101)):
        web1 = WebScraper(web1_url, 'website1', checkin, page=i)
        web1.scrape(output_file)
        if not web1.MorePages:
            break

    web2_url = url['web2'].format(checkin=checkin, checkout=checkout)
    print(f'\n\nAppending website2 data for {checkin}')
    # First page
    web2 = WebScraper(web2_url, 'website2', checkin, checkout)
    web2.scrape(output_file)
    # Offset starts at 20 and increases by 20
    for i in tqdm(range(20, 2001, 20)):
        web2 = WebScraper(web2_url, 'website2', checkin, checkout, i)
        web2.scrape(output_file)
        if not web2.MorePages:
            break
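# The two JSON files are not included in this excerpt; the loop above implies
# the layout sketched here. The dates and URL templates are placeholders; only
# the keys and the {checkin}/{checkout} format fields come from the code.
example_dates = {'checkin': ['2021-06-01', '2021-06-08'],
                 'checkout': ['2021-06-02', '2021-06-09']}
example_urls = {'web1': 'https://example.com/web1?checkin={checkin}',
                'web2': 'https://example.com/web2?checkin={checkin}&checkout={checkout}'}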
def main():
    app = WebScraper(AnchorScraper())
    # url = 'https://www.tercalivre.com.br'
    url = 'https://github.com'
    app.load(url)
    sys.exit(app.exec_())
import os

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

from data_manager import DataManager  # assumed module path, following the naming of the other imports
from notification_manager import NotificationManager
from web_scraper import WebScraper

keyvault_url = os.environ.get('KEYVAULT_URL')
credential = DefaultAzureCredential()
kv_client = SecretClient(keyvault_url, credential)

MY_EMAIL = kv_client.get_secret('fake-email').value
PASSWORD = kv_client.get_secret('fake-email-password').value
TARGET_EMAIL = kv_client.get_secret('target-email').value

target_product = input("Please enter the full url of the product you wish to search: ")
preferred_price = float(input("Please enter your target price: ").strip('$'))

scraper = WebScraper()
product_dict = scraper.retrieve_price_from_site(target_product, preferred_price)

data_manager = DataManager(product_dict)
data_manager.check_if_item_in_data_file()
products_to_buy = data_manager.check_if_price_below_target()

notification_manager = NotificationManager(MY_EMAIL, PASSWORD)
notification_manager.send_email(products_to_buy, TARGET_EMAIL)
from web_scraper import WebScraper
from indeed_scraper import IndeedScraper
from csv_saver import CsvSaver

scraper = WebScraper(
    IndeedScraper,
    CsvSaver('data_scientist.csv'),
    {
        'job_title': '',
        'location': '',
        'max_count': 50,
        'save_links': True,
        'advance_request': 'q="data+scientist"&limit=50'
    })
scraper.scrape()
import os
import pickle

from numpy import genfromtxt

headless = True
csv_file = 'ISINs.csv'

# Remove any uncompleted downloads
for file in os.listdir():
    if file.endswith('.crdownload') or file.endswith('.pdf'):
        os.remove(file)

# Create download folder
if not os.path.exists('./pdf_downloads'):
    os.mkdir('./pdf_downloads')

# If this is the first time running, you need to get the fund names.
# If not, you can load a save file.
if os.path.exists('funds.p'):
    funds = pickle.load(open('funds.p', 'rb'))
else:
    # Put your path to the chromedriver binaries here!!!
    scraper = WebScraper('C:/Users/Ollie/Downloads/chromedriver_win32/chromedriver',
                         headless=headless)
    funds = scraper.get_fund_list()
    scraper.kill()
    pickle.dump(funds, open('funds.p', 'wb'))

# If the csv file does not exist, fill in headers
if not os.path.exists(csv_file):
    with open(csv_file, 'w') as file:
        file.write('Funds,ISINs\n')

uncompleted_funds = funds
entries = genfromtxt(csv_file, delimiter=',', dtype=str, skip_header=1)
completed_funds = list(entries[:, 0]) if entries.shape[0] > 0 else []
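# The excerpt stops here; presumably the next step skips funds already written
# to the csv. A hedged sketch of that filtering step (not part of the original):
uncompleted_funds = [fund for fund in funds if fund not in completed_funds]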
def _get_stat(player, stat, year):
    url_retriever = HockeyReferenceUrlRetriever(player)
    scraper = WebScraper(url_retriever.get_url())
    return scraper.get_player_stats_for_year(f'{stat}', year)
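# Example call; the player name, stat key, and year below are placeholders, and
# the accepted stat names depend on WebScraper.get_player_stats_for_year:
#
#     goals_1985 = _get_stat('Wayne Gretzky', 'goals', 1985)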