def collect_earnings_over_time(self, playerNum, file_obj, default_url, str_year, searchedPlayers):
    """Walk the ranking table and record each player's marital status and earnings for str_year."""
    for i in tqdm(range(1, playerNum + 1)):
        row_xpath = ("//*[@id='rankingDetailAjaxContainer']/table/tbody/tr["
                     + str(i) + "]/td[4]/a")
        player = WebDriverWait(self.browser, 20).until(
            expected_conditions.visibility_of_element_located((By.XPATH, row_xpath)))
        file_obj.write(player.text + ",")
        # Re-locate the link before clicking, in case the first reference went stale.
        WebDriverWait(self.browser, 20).until(
            expected_conditions.visibility_of_element_located((By.XPATH, row_xpath))).click()
        url = self.browser.current_url
        bio_url = from_overview_to_bio(url)
        playerActivity_url = from_overview_to_playerActivity(url, str_year)
        bio_scraper = Scraper(bio_url)
        playerActivity_scraper = Scraper(playerActivity_url)
        marital_status = bio_scraper.check_player_marital_status()
        file_obj.write(marital_status + ",")
        tournament_earnings_str = playerActivity_scraper.tournament_earnings()
        file_obj.write(tournament_earnings_str + ",")
        file_obj.write(str(i) + ",")
        file_obj.write(str_year + ",")
        file_obj.write("\n")
        self.browser.get(default_url)
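# The URL helpers called above are defined elsewhere in the project; a minimal
# hypothetical sketch, assuming ATP-style player pages ending in ".../overview"
# (the real site layout and query parameters may differ):
def from_overview_to_bio(url):
    # ".../players/<name>/<id>/overview" -> ".../players/<name>/<id>/bio" (assumed)
    return url.replace("/overview", "/bio")

def from_overview_to_playerActivity(url, str_year):
    # Assumed activity path and year parameter; adjust to the site's actual scheme.
    return url.replace("/overview", "/player-activity?year=" + str_year)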
def test_get_product_price(self):
    # Test scraping an Amazon product's price.
    url = 'https://www.amazon.com/AOC-U2790VQ-3840x2160-Frameless-DisplayPort/dp/B07LBM2DCC/ref=pd_rhf_se_p_img_10?_encoding=UTF8&psc=1&refRID=Q4E347Q2WW3WKJJSF5RZ'
    priceSelector = ('span#priceblock_saleprice, span#priceblock_ourprice, '
                     'span#priceblock_dealprice')
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/dp/B074PK4R2H/ref=psdc_1292115011_t2_B07LBM2DCC'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '263.00')

    # Has 'See price in cart' instead of an actual price.
    url = 'https://www.amazon.com/Xbox-All-Digital-Console-Disc-Free-Gaming/dp/B07XQXZXJC/ref=sr_1_1?keywords=xbox&qid=1584756939&s=electronics&sr=1-1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '174.95')

    url = 'https://www.amazon.com/YI-Waterproof-Surveillance-Detection-Deterrent/dp/B01CW49AGG/ref=zg_bs_photo_home_2?_encoding=UTF8&psc=1&refRID=5P9471P94RVGPXPFEBDT'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '45.88')

    url = 'https://www.amazon.com/LORGDFDF-Microphone-Creative-Integrated-Bluetooth/dp/B0851WN4RG/ref=sr_1_1?keywords=howl+conference&qid=1584773845&s=electronics&sr=1-1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '113.75')

    url = 'https://www.amazon.com/Generation-Dell-Corei7-9750H-GeForce-InfinityEdge/dp/B07T3FWD22/ref=sr_1_1?keywords=xps&qid=1584773944&s=electronics&sr=1-1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/PlayStation-Portable-Core-PSP-1000-sony/dp/B000F2DE8S/ref=sr_1_10?keywords=psp&qid=1584774683&s=electronics&sr=1-10'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/AmazonBasics-Pound-Neoprene-Dumbbells-Weights/dp/B01LR5RO5U?ref_=ast_sto_dp&th=1&psc=1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/Primitives-Kathy-Sign-3-Inch-Love/dp/B00HU7WRZC/ref=bbp_bb_5e8416_st_8174_w_0?psc=1&smid=ATVPDKIKX0DER'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '6.43')

    url = 'https://www.amazon.com/dp/B07PWCWQ4Z/ref=cm_gf_aagc_iaaa_d_p0_qd0____________________cYfsB7BVGRjKYIoPUijy'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '35.99')
def bfs(self):
    # this to be commented later
    # searchScraper = SearchScraper()
    scraper = Scraper()
    root = self.root
    dq = collections.deque()
    dq.appendleft(root)
    visited = set()
    i = 1
    while dq:
        # this will be removed in future
        if i == 30:
            return
        if i % 15 == 0:
            print("Hey!! We are gonna pull a little sneaky on Google. "
                  "I am starting a new session")
            del scraper
            scraper = Scraper()
        curr = dq.popleft()
        if curr in curr.children:
            curr.children.remove(curr)  # Remove the node from its own child set
        print(curr.name)
        if (curr is not self.root) and (not curr._visited):
            i += 1
            links = curr.get_links(scraper)
            with open("search_links.txt", "a+") as f:
                f.write(curr.name + "\n")
                f.write(str(links) + "\n")
            sleep(randint(2, 10))
        else:
            print("already-visited")
        currCh = curr.children
        filteredCh = set()
        for child in currCh:
            # Only enqueue children that actually list this node as their parent.
            if child.parent == curr.name:
                dq.appendleft(child)
                filteredCh.add(child)
        curr.children = filteredCh
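# bfs() above assumes a tree node exposing name, parent (the parent's name as a
# string), children, _visited, and get_links(); a minimal hypothetical sketch of
# that interface, with an assumed Scraper.search() call:
class Node:
    def __init__(self, name, parent_name=None):
        self.name = name
        self.parent = parent_name   # compared against curr.name in bfs()
        self.children = set()
        self._visited = False

    def get_links(self, scraper):
        # Fetch outgoing links via the scraper and mark this node as visited.
        self._visited = True
        return scraper.search(self.name)  # hypothetical method name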
def __init__(self):
    if not os.path.exists("logs/"):
        os.makedirs("logs/")
    logging.config.dictConfig(logging_config)
    self.logger = logging.getLogger(__name__)
    FORMAT_ERROR = "scraper_settings format is invalid, please refer to the project doc"
    try:
        self.interval = float(scraper_settings["scraping_interval"])
        self.do_email_alert = scraper_settings["do_email_alert"]
        start_date_parts = scraper_settings["start_date"].split('-')
        end_date_parts = scraper_settings["end_date"].split('-')
        self.scraper = Scraper(
            date(int(start_date_parts[0]), int(start_date_parts[1]), int(start_date_parts[2])),
            date(int(end_date_parts[0]), int(end_date_parts[1]), int(end_date_parts[2])),
            scraper_settings["do_email_alert"],
            scraper_settings["city"])
    except (KeyError, ValueError, IndexError) as exc:
        # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt/SystemExit.
        raise ValueError(FORMAT_ERROR) from exc
    self.alert = SeatAlertEmailer()
def crawl(timeout, crawl_delay, threadID):
    """
    Do the crawling on a URL.

    In an (almost) never-ending loop, try to get a URL out of the frontline
    queue, crawl it, and add new URLs to the frontline queue.

    Parameters
    ----------
    timeout: float
        The timeout time (epoch seconds), past which crawling will be stopped.
    crawl_delay: int
        Delay between two crawling attempts.
    threadID: int or str
        The ID of the thread that is using this method; for logging purposes only.
    """
    while True:
        if time.time() > timeout:
            break
        if frontlineQ.empty():
            time.sleep(crawl_delay)
            continue
        url = frontlineQ.get()
        try:
            print('Thread', threadID, 'scraping', url)
            sc = Scraper(url)
            sc.writeJSON(jsons_dir)
            outgoings = sc.getBookLinks()
            crawledQ.put(url)
            for u in outgoings:
                if not crawledQ.contains(u):
                    frontlineQ.put(u)
        except Exception:
            # Mark the URL as crawled even on failure so it is not retried forever.
            crawledQ.put(url)
        frontlineQ.task_done()
        time.sleep(crawl_delay / 2)
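# crawledQ above needs both put() and contains(); the standard queue.Queue has
# no contains() method, so a thread-safe set wrapper along these lines is assumed:
import threading

class CrawledCollection:
    def __init__(self):
        self._seen = set()
        self._lock = threading.Lock()

    def put(self, url):
        with self._lock:
            self._seen.add(url)

    def contains(self, url):
        with self._lock:
            return url in self._seen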
def main():
    """Main entry point: parse CLI arguments and run a flight search."""
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument("departure_IATA", type=validation_iata,
                            help="Specify the IATA code of your departure airport")
        parser.add_argument("destination_IATA", type=validation_iata,
                            help="Specify the IATA code of your destination airport")
        parser.add_argument("outbound_date", type=validation_date_str,
                            help="Specify the outbound date")
        parser.add_argument("return_date", nargs="?", default=0,
                            type=validation_date_str,
                            help="If you don't want to fly one-way, specify the return date")
        args = parser.parse_args()
        if check_dates(args.outbound_date, args.return_date):
            srap = Scraper(args.departure_IATA, args.destination_IATA,
                           args.outbound_date, args.return_date)
            srap.make_search()
            return 0
    except (ScraperError, ValidationError) as err:
        sys.stderr.write(err.value)
        return 1
def create_job():
    worker = Scraper(FREQUENCY, TAB, UNIT, FINAL_YEAR)
    while True:
        item = q.get()
        worker.do_work(item)
        print(item + ' is downloaded | ' + str(q.qsize()) + ' item(s) left')
        q.task_done()
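# A sketch of how create_job() might be driven, assuming `q` is a module-level
# queue.Queue that has already been filled with work items (the worker count is
# illustrative only):
#
#     import threading
#     for _ in range(4):
#         threading.Thread(target=create_job, daemon=True).start()
#     q.join()  # block until every queued item has been marked done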
def __init__(self):
    '''Initializes bot class.'''
    self.scraper = Scraper()
    self.validator = Validator()
    self.courier = Courier()
    self.loopInterval = 300  # In seconds
    # Set properly by self.modeSelect() in the run method; defaults to
    # Environment.TEST in case someone removes the modeSelect() call.
    self.currentMode = Environment.TEST
def setUpClass(start):
    city = "wroclaw"
    rooms = "two"
    price_start = "1400"
    price_end = "2000"
    page = "1"
    start.scraper_main = Scraper(city, rooms, price_start, price_end, page)
def handler(event, context):
    """Lambda-style entry point: scrape event['url'] for each query in event['queries']."""
    base_url = event['url']
    queries = event['queries']
    scraper = Scraper(base_url)
    response = scraper.start(queries)
    return response

# Example invocation:
# event = \
#     {
#         "url": "https://www.monster.com",
#         "queries": [{"keyword": "Data Science", "location": "New York"}]
#     }
# handler(event, 2)

# Stub version kept for local testing:
# import json
# def handler(event, context):
#     response = {
#         "statusCode": 200,
#         "body": json.dumps({'message': 'Successfully scraped.'})
#     }
#     return response
def test_get_details_wrong_url():
    """Test case: when given a wrong url."""
    url = 'https://nbgdhckkdoo.com'
    with pytest.raises(URLLoadError):
        Scraper(url, BasicExtractor()).get_details([NameAttribute])
def test_get_sentence_translation_and_audio():
    sentence = '他会说很多种语言,比如汉语,日语,德语。'
    s = Scraper()
    retval = s.get_sentence_translation_and_audio(sentence)
    assert retval[0] == \
        'He can speak many languages, such as Chinese, Japanese and German.'
    assert retval[1]  # the audio component is non-empty
def __init__(self, start_url, goal_url):
    """Save the two URLs to link."""
    super(Problem, self).__init__()
    self.start_url = start_url
    self.goal_url = goal_url
    self.scraper = Scraper()
    self.base_url = 'https://fr.wikipedia.org'
def update_stock_data(ticker):
    """
    Finds the stock data for a given ticker, and updates/inserts it into the database.

    :param ticker: str
    """
    scraper = Scraper()
    stock = get_stock(ticker)
    if not stock:
        name = scraper.get_stock_name(ticker)
        add_stock(name, ticker)
        stock = get_stock(ticker)
    # NOTE: skip the first entry of eps_growth and average the remaining four.
    avg_eps_growth = sum(scraper.get_eps_growth(ticker)[1:]) / 4
    qoq_eps_growth = scraper.get_qoq_growth(ticker, 'eps')
    # NOTE: skip the first entry of sales_growth and average the remaining four.
    avg_sales_growth = sum(scraper.get_sales_growth(ticker)[1:]) / 4
    qoq_sales_growth = scraper.get_qoq_growth(ticker, 'revenue')
    stock.avg_eps_growth = avg_eps_growth
    stock.qoq_eps_growth = qoq_eps_growth
    stock.avg_sales_growth = avg_sales_growth
    stock.qoq_sales_growth = qoq_sales_growth
    update_stock(stock)
    update_mos(ticker)
    update_eps_data(ticker)
    update_revenue_data(ticker)
def gather():
    logger.info("gather")
    storage = Persistor(SCRAPPED_FILE)
    scrapper = Scraper(storage)
    # Scrape every year from 1903 up to (but not including) the current year.
    for year in range(1903, datetime.datetime.now().year):
        scrapper.scrape(year)
def calculate(body):
    url = json.loads(body)['url']
    scraper = Scraper()
    result = scraper.scrape(url.strip())
    time.sleep(10)
    j = json.dumps(result.__dict__)
    publish_result(j)
def test_Al2O3_scraper(self):
    """Tests whether all 4 records for "Al2O3" are scraped correctly."""
    # Initialize a `scraper.Scraper` instance.
    scraper = Scraper()
    scraper.get_landing_page()
    # Is the landing page correct?
    self.assertIn('NIST-JANAF Thermochemical Tables', scraper.browser.title)
    # Enter "Al2O3" in the form, submit it.
    scraper.send_query('Al2O3')
    scraper.select_state()
    scraper.submit_query()
    # Get all records resulting from the above query.
    query_records = scraper.all_query_records
    # Verify the number of records scraped.
    self.assertEqual(len(query_records), 4)
    # Check if the scraped data is OK.
    self.assertIn('aluminum_oxide__kappa', query_records)
    self.assertEqual(query_records['aluminum_oxide__alpha']['CAS'], '1344-28-1')
    self.assertEqual(query_records['aluminum_oxide__delta']['formula'], 'Al2O3')
    self.assertEqual(query_records['aluminum_oxide__gamma']['link'],
                     'http://kinetics.nist.gov/janaf/html/Al-098.txt')
    # Terminate the session cleanly.
    scraper.terminate_session()
def execute_scraper(self, container):
    try:
        result = Scraper(self, container).execute()
    except Exception:
        self.logger.exception("Unknown exception occurred in scraper")
        return False
    print("Container {} extracted in {} seconds.".format(container['code'], result[1]))
    # In case an unknown exception occurred, finish execution.
    if result[0] is None:
        return False
    # In case there was an error scraping a container, restart the driver.
    if result[0] is False:
        # Add to the failure count.
        print("Scraper for container {} was unsuccessful".format(container['code']))
        self.fail_counter += 1
        # Create a new driver.
        self.create_driver(True)
        if self.fail_backoff <= self.MAX_BACKOFF:
            self.fail_backoff *= 2
        # Continue execution.
        return True
    # No error was found: add to the scrape counters and reset the failure backoff.
    self.fail_backoff = 1
    self.total_counter += 1
    self.round_counter += 1
    if self.round_counter >= ScraperConfig.ROUNDS_RESTART:
        self.create_driver(False)
        self.round_counter = 0
    return True
def init_weather_ui(self):
    # Get the address from the text entry box.
    self.address = self.address_entry_box.get(
        '1.0', 'end-1c') if self.address is None else self.address
    # Clear the canvas.
    self.clear_canvas()
    # Set up the scraper.
    self.web_scraper = Scraper(self.address)
    # Create text objects to display the location and weather.
    self.location_text = self.canvas.create_text(
        500, 100, text=self.web_scraper.get_location())
    if self.web_scraper.get_location() != 'Error: Invalid Address':
        self.forecast_text = self.canvas.create_text(
            500, 115, text=self.web_scraper.get_forecast())
        # Celsius readout, then Fahrenheit readout.
        self.temp_c_text = self.canvas.create_text(
            500, 130, text=self.web_scraper.get_temp_c())
        self.temp_f_text = self.canvas.create_text(
            500, 145, text=self.web_scraper.get_temp_f())
    # Initialize the reset button.
    self.reset_button = tk.Button(self.canvas, text='Reset Slot', bg='white',
                                  command=self.init_entry_ui)
    self.reset_button.place(width=300, height=100, x=175, y=300)
    # Initialize the refresh button.
    self.refresh_button = tk.Button(self.canvas, text='Refresh Forecast', bg='white',
                                    command=self.refresh_weather)
    self.refresh_button.place(width=300, height=100, x=525, y=300)
def collect_basic_info(self, playerNum, file_obj, default_url, str_year, searchedPlayers):
    """Record turned-pro year, weight, and height for each player not already searched."""
    for i in tqdm(range(1, playerNum + 1)):
        row_xpath = ("//*[@id='rankingDetailAjaxContainer']/table/tbody/tr["
                     + str(i) + "]/td[4]/a")
        player = WebDriverWait(self.browser, 20).until(
            expected_conditions.visibility_of_element_located((By.XPATH, row_xpath)))
        player_name = player.text
        if player_name in searchedPlayers:
            self.browser.get(default_url)
        else:
            file_obj.write(player_name + ",")
            # Re-locate the link before clicking, in case the first reference went stale.
            WebDriverWait(self.browser, 20).until(
                expected_conditions.visibility_of_element_located((By.XPATH, row_xpath))).click()
            url = self.browser.current_url
            info_scraper = Scraper(url)
            basic_info = info_scraper.find_basic_info()
            file_obj.write(basic_info["turned_pro"] + ",")
            file_obj.write(basic_info["weight"] + ",")
            file_obj.write(basic_info["height"] + ",")
            file_obj.write("\n")
            searchedPlayers.append(str(player_name))
            self.browser.get(default_url)
def test_elements_found():
    obj = Scraper("https://www.google.com/", "bar.csv",
                  ["foo1", "foo2", "foo3"], True)
    obj.go_to(obj.base_url)
    elements = WebDriverWait(obj.webdriver, 3).until(
        ElementHasCssSelector("input[value*='Feeling Lucky']"))
    assert elements
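# ElementHasCssSelector is not a built-in Selenium condition; a minimal custom
# wait condition with the behavior the test relies on could look like this:
from selenium.webdriver.common.by import By

class ElementHasCssSelector:
    """Wait condition: truthy once elements matching `selector` are present."""

    def __init__(self, selector):
        self.selector = selector

    def __call__(self, driver):
        elements = driver.find_elements(By.CSS_SELECTOR, self.selector)
        return elements if elements else False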
def main():
    questions = [
        inquirer.List('user_option',
                      message="in order to proceed please choose an option",
                      choices=["scraper", "immediate data"])
    ]
    question_scraper = [
        inquirer.Text('symbol',
                      message='To scrape a specific symbol, enter the symbol; '
                              'enter "ALL" for all symbols'),
        inquirer.Text('saving flag',
                      message='To save the data locally, enter True (default - False)'),
    ]
    answers = inquirer.prompt(questions)
    if answers.get("user_option") == "scraper":
        # TODO fix scraper - cannot scrape data while choosing "scraper"
        answer_scraper = inquirer.prompt(question_scraper)
        symbol = answer_scraper.get("symbol")
        # inquirer.Text returns a string, so compare against "True" to get a bool
        # (any non-empty answer would otherwise be truthy).
        saving_flag = answer_scraper.get("saving flag") == "True"
        scraper = Scraper(save=saving_flag)
        scraper.scrape_all(symbol_choice=symbol)
    elif answers.get("user_option") == "immediate data":
        get_data_from_api()
def scrape(event, context):
    driver = Scraper()
    page = driver.scrape_page('https://waitbutwhy.com/')
    # Business logic for specific scrape job
    post = page.find("div", {"class": "mainPost"})
    header = post.find("h1")
    link = header.find('a', href=True)
    if link:
        data = {
            "success": "true",
            "result": {
                "message": ("Congrats!! Your Headless Chrome initialized and we "
                            "found the top story on Wait But Why"),
                "topStoryLink": link['href']
            }
        }
    else:
        data = {
            "success": "false",
            "result": {
                "message": "Oops, something went wrong"
            }
        }
    driver.close()
    driver.quit()
    response = {"statusCode": 200, "body": json.dumps(data)}
    return response
def __init__(self, look_ahead=4, skip_to_page=0, feats=10, max_q_sz=100,
             base_url="http://www.xvideos.com/c/{0}/anal-12"):
    # Let's set this up so gathering new videos can happen in the background.
    self.scraped_videos = {}
    gather_args = (look_ahead, skip_to_page, feats, max_q_sz)
    self.gather_process = Thread(target=self.gather, args=gather_args, daemon=True)
    self.scr = Scraper(base_url=base_url, pg_n=skip_to_page)
    self.db = Database()
    self.ai = Brain(self)
    self.win = Window(self)
    self.currently_loaded_video_data = {}
    self.feats = feats
    self.q = PriorityQueue(maxsize=max_q_sz)
    self.lock = RLock()
    # If a pickled brain exists on disk, run training before fetching the first item.
    if "brain.pkl" in os.listdir():
        self.train()
    self.get_next()
def run_once(self):
    with open("config/campusdual.json") as json_file:
        data = json.load(json_file)
    username = data["username"]
    password = data["password"]
    worker = Scraper(username)
    login = worker.login(password)
    print("login result", login)
    if login != 0:
        worker.exit()
        exit(login)
    if self.killer.kill_now:
        print("login successful, but program is being terminated. "
              "not downloading schedule. NOT pushing to calendar. exiting")
        worker.exit()
        exit()
    worker.download_full_schedule()
    schedule_fixer.repair("data/" + username + "/schedule.json",
                          "data/" + username + "/schedule-fixed.json")
    if self.killer.kill_now:
        print("download completed, but program is being terminated. "
              "NOT pushing to calendar. exiting")
        worker.exit()
        exit()
    worker.exit()
    calendar = calendar_api.CalendarApi()
    with open("data/" + username + "/schedule-fixed.json", "r") as f:
        sch = json.load(f)
    calendar.sync_schedule([s for s in sch if s["date"] not in FORBIDDEN_DATES])
class MainApp:
    """Entry-point shell; the scraping loop below runs when executed as a script."""

if __name__ == '__main__':
    startUrl = "http://scn.sap.com/community/abap/content?filterID=contentstatus[published]~objecttype~objecttype[thread]&start="
    storing = DataStoring()
    # Read the input param (page index to resume from).
    i = storing.read_index_from_file()
    completeUrl = ""
    print("\n\n-------- SCRAPER STARTED ---\n")
    while i < 5000:
        # String concatenation to get the complete URL (20 threads per page).
        completeUrl = startUrl + str(20 * i)
        # Threads scraped from the URL.
        threads = []
        print("------ SCRAPING NEW WEB PAGE (PAGE " + str(i) + ") ---\n")
        SCNScraper = Scraper(completeUrl)
        # Get the threads.
        threads = SCNScraper.scraping()
        # Save the content into a JSON file.
        storing.insert_items_into_file(threads)
        # Save the content into the DB.
        storing.insert_items_into_db(threads)
        i = i + 1
        # Update the index file.
        storing.write_index_into_file(i)
def main():
    style = "=+" * 20
    if url_checker(args.url) is False:  # URL format check
        raise argparse.ArgumentTypeError(
            'Value has to be in full url format http:// or https://')
    print(style)
    print("Box.com PDF Downloader by @lfasmpao")
    box_object = Scraper(args.url, args.driver_location,
                         args.use_x11, args.wait_time)
    print("Please wait for about {} seconds...".format(args.wait_time))
    box_object.load_url()
    dl_name = box_object.get_download_title()
    print(style)
    print("DATA TO BE DOWNLOADED\nTitle: {}\nBox.com URL: {}".format(dl_name, args.url))
    print(style)
    dl_url = box_object.get_download_url()
    print("Download URL:", dl_url)
    print(style)
    box_object.clean()  # Clean up the driver session.
    # Make the output directory if it does not exist.
    directory = os.path.dirname(args.output_location)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print("Downloading..\nFile will be saved as:",
          str(args.output_location + dl_name + ".pdf"))
    download_file(url=dl_url, path=str(args.output_location + dl_name + ".pdf"))
def process(self, json_path, initial_url=INITIAL_URL, location="local"):
    """Process the data.

    :param json_path: String
    :param initial_url: String
    :param location: String
    """
    rules = self.__get_local_json_data(json_path)
    item_parser = ItemParser(rules[str(self._initial_action)])
    scraper = Scraper(auth="auth", user="******", passw="Scraper",
                      initial_url=initial_url)
    response = scraper.start_request()
    current_page = self._initial_action
    while True:
        scraped_item = scraper.parse_item(response, item_parser)
        if scraped_item is None:
            print("ALERT - Can't move to page {prev_page}: page {current_page} "
                  "link has been malevolently tampered with!!".format(
                      prev_page=item_parser.get_next_parser(),
                      current_page=current_page))
            break
        print("Move to page {current_page}".format(current_page=current_page))
        next_parser = scraped_item['next_parser']
        next_url = scraped_item['next_url']
        item_parser = ItemParser(rules[next_parser])
        response = scraper.start_request(url=next_url)
        current_page = next_parser
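# The rules file loaded by process() is assumed to map parser names to per-page
# parsing rules; a purely illustrative shape (keys invented for the example):
#
#     {
#         "page_1": {"...parsing rules...": "...", "next": "page_2"},
#         "page_2": {"...parsing rules...": "..."}
#     }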
def main():
    logger = VerboseScraperLogger()
    scraper = Scraper(database=DATABASE, logger=logger,
                      num_threads=THREADS, force=FORCE)
    scraper.scrape(url=SEARCH_URL)
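# main() above reads module-level configuration; the names come from the call
# site, while the values here are purely illustrative:
#
#     DATABASE = "scraped.db"
#     THREADS = 4
#     FORCE = False
#     SEARCH_URL = "https://example.com/search?page=1"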
def main():
    categories = []
    main_url = 'https://www.allrecipes.com/recipes/'
    # Scrape the list of categories to scrape recipes for.
    source = requests.get(main_url).text
    soup = BeautifulSoup(source, 'lxml')
    category_containers = soup.find_all('div', class_='all-categories-col')
    for container in category_containers:
        for section in container.find_all('section'):
            title = section.h3.text
            title = '-'.join(title.lower().split())
            for li in section.ul.find_all('li'):
                cat_name = li.a.text
                cat_name = '-'.join(cat_name.lower().split())
                cat_url = li.a['href']
                categories.append({'title': title, 'category': cat_name, 'url': cat_url})
    # Run a scraper per category, collecting up to max_recipe_num recipes each.
    max_recipe_num = 30
    for cat in categories:
        scraper = Scraper(cat['title'] + '__' + cat['category'],
                          max_recipe_num=max_recipe_num)
        scraper.get_list_of_categories(cat['url'])
        scraper.parse_category_list_for_recipes()
        scraper.save_to_csv('./data/')