import json

from flask import request

from autoscraper import AutoScraper


def autoscraper(Link=None, Metodo=None):
    # Link and Metodo are read from the query string rather than the arguments.
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]

    scraper = AutoScraper()
    scraper.build(link, wanted_list)

    # Keep only the first inferred rule and alias it as 'regla'.
    results = scraper.get_result_exact(link, unique=False, grouped=True)
    regla = list(results.keys())[0]
    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])

    # Strip scheme and punctuation so the URL can be used as a file name.
    url = (url.replace("http:", "").replace("//", "").replace(".", "")
              .replace("www", "").replace("https:", "").replace("/", "")
              .replace("\n", "").replace("-", ""))
    scraper.save(url + '-search')

    # get_pagina_result is defined elsewhere in the project.
    data = get_pagina_result(url, link)
    json_format = json.dumps(data, indent=4, separators=(',', ': '),
                             sort_keys=True, ensure_ascii=False)
    return json_format
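# --- Usage sketch (assumption): the view above reads request.args, which implies
# a Flask request context. A minimal, hypothetical wiring could look like this;
# the route path and port are illustrative, not from the original project.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/autoscraper', view_func=autoscraper)

if __name__ == '__main__':
    # Example call: GET /autoscraper?Link=<page-url>&Metodo=<sample-text>
    app.run(port=5000)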
from autoscraper import AutoScraper


def build(self, wanted_dict=None, model_name='1'):
    """url2autospider: train an AutoScraper model from this page's HTML
    and persist it under model_name."""
    html_ = self.html
    url = self.url  # kept for the optional check below
    scraper = AutoScraper()
    scraper.build(html=html_, wanted_dict=wanted_dict)
    # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    scraper.save(model_name)
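# --- Usage sketch (assumption): build() above reads self.html and self.url, so it
# presumably belongs to a wrapper class holding a fetched page. The class below is
# a hypothetical stand-in for that wrapper, not the original project's class.
class PageModel:
    def __init__(self, url, html):
        self.url = url
        self.html = html

PageModel.build = build  # attach the method defined above

page = PageModel('https://example.com', '<html><h1>Example Domain</h1></html>')
page.build(wanted_dict={'title': ['Example Domain']}, model_name='example-model')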
from autoscraper import AutoScraper


def autopan_bot():
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'

    # We can add one or multiple candidates here.
    # You can also put urls here to retrieve urls.
    wanted_list = ['High Grade', 'High-Grade']

    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    print(highgrade_results)

    if highgrade_results:
        print('BriefHub bot has found results! 🚀')
        for result in highgrade_results:
            print(result)
    else:
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)

    botscraper.save(highgrade_scraper)
    print(f"💿 > Save the model {highgrade_scraper}")
from autoscraper import AutoScraper

url = 'https://www.rosario3.com/'
wanted_list = [
    "/especiales/Club-de-Lectura-Brandon-Sanderson-es-mejor-que-J.-R.-R.-Tolkien-20200909-0043.html"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

# Keep only the first inferred rule and alias it as 'regla'.
results = scraper.get_result_exact(url, unique=False, grouped=True)
regla = list(results.keys())[0]
scraper.set_rule_aliases({regla: 'regla'})
scraper.keep_rules([regla])
scraper.save('rosario3-search')
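# --- Usage sketch: reload the saved model and run it against the same site.
# group_by_alias=True returns results keyed by the 'regla' alias set above.
loaded = AutoScraper()
loaded.load('rosario3-search')
links = loaded.get_result_similar('https://www.rosario3.com/', group_by_alias=True)
print(links.get('regla', []))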
from autoscraper import AutoScraper

# Parameters
url = "https://www.quora.com/search?q=deep%20learning&time=year"
model_name = "model_quora"
wanted_list = ["When will deep learning finally die out?"]

# We instantiate the AutoScraper
scraper = AutoScraper()

# We train the scraper.
# Here we can also pass html content via the html parameter instead of the url (html=html_content)
result = scraper.build(url, wanted_list)

# We display the results, if any
if result:
    print("🚀 Great, a query has been inferred!! Great job.")
    print(result)
# If there is no result, we exit with an error code
else:
    print("Sorry, no query can be inferred ... 😿")
    exit(-1)

# We save the model for future use
print(f"💿 > Save the model {model_name}")
scraper.save(model_name)
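# --- Usage sketch (assumption): the saved model should generalize to other
# Quora search pages; the query string below is illustrative.
scraper = AutoScraper()
scraper.load(model_name)
other_url = "https://www.quora.com/search?q=machine%20learning&time=year"
print(scraper.get_result_similar(other_url))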
from autoscraper import AutoScraper

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
wanted_list = [urlex]

# This is the simplest search type: just a one-page input.
MainLink = [
    ('https://www.etsy.com/search?q=cosplay%20fire', [
        'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
    ]),
]

scraperMain = AutoScraper()  # define a new scraper object
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
    # scraperMain.build(urlMain, wanted_list)  # build the contents of that scraper

# Saves this particular build of the scraper! (Note: this is a local file,
# so you can load it without having to regenerate it every time!)
scraperMain.save('etsyMain')

# Build a new batch of features to collect. It is best to collect them
# separately from each other so they don't cross wires!
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]

scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl, wanted_list=wanted_list)  # , update=True)  # grouped=True)
scraperitemFavorites.save('scraperitemFavorites')

shopSales = [
from autoscraper import AutoScraper

# Create the model
url = 'https://medium.com/@inzaniak'
wanted_list = [
    "Build a Web Scraping Python Project from Start to Finish",
    "5 things you need to learn as a Python beginner"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)
print(result)

# Save the model
scraper.save('scrapers/medium.json')

# Load the model
del scraper
scraper = AutoScraper()
scraper.load('scrapers/medium.json')
print(scraper.get_result_similar(url))
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""

from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"
req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

# Alias the last rule as 'Title' and the first as 'Price', then drop the rest.
dyk = list(res_amzn.keys())
print(dyk)
Scrap.set_rule_aliases({dyk[-1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[-1], dyk[0]])

Scrap.save('amazon-search3')
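# --- Usage sketch: reload the saved model and fetch results keyed by the
# aliases set above ('Title' and 'Price').
amz = AutoScraper()
amz.load('amazon-search3')
res = amz.get_result_similar(amzn_url, group_by_alias=True)
for title, price in zip(res.get('Title', []), res.get('Price', [])):
    print(title, '->', price)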
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""

from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))

# Rule ids such as 'rule_io1c' are generated per build; inspect the grouped
# output printed above to find the ids produced by your run.
scraper.set_rule_aliases({'rule_io1c': 'Title', 'rule_hm52': 'Price'})
scraper.keep_rules(['rule_io1c', 'rule_hm52'])
scraper.save('AmazonIn-search')
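# --- Usage sketch (assumption): the saved model can be pointed at any other
# amazon.in search URL; the 'samsung' query below is illustrative.
loaded = AutoScraper()
loaded.load('AmazonIn-search')
print(loaded.get_result_similar('https://www.amazon.in/s?k=samsung',
                                group_by_alias=True))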
#!/usr/bin/env python
# coding: utf-8

from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?i=aps&k=iphone"
wanted_list = ["New Apple iPhone 12 Pro Max (128GB) - Pacific Blue", "₹1,25,900"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))

# Rule ids are generated per build; replace these with the ids printed above.
scraper.set_rule_aliases({"rule_1943": "Title", "rule_1gc6": "MRP"})
scraper.keep_rules(["rule_1943", "rule_1gc6"])
scraper.save("amazon_search")
import pandas as pd

from autoscraper import AutoScraper

# Assumed setup (not in the original snippet): amazon_url and wanted_list were
# defined earlier in the notebook; the values below are illustrative.
amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["New Apple iPhone 11 (128GB) - Black", "₹58,400"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

"""# Finding similar data"""

data = scraper.get_result_similar(amazon_url, grouped=True)
print(data)

keys = list(data.keys())
print(keys)

"""# Defining alias"""

# The key indices below depend on the rule groups printed above; adjust them
# to match your run.
scraper.set_rule_aliases({str(keys[0]): 'ImageUrl', str(keys[2]): 'Title',
                          str(keys[-2]): 'Price', str(keys[-1]): 'Reviews'})
scraper.save("amazon_in.json")

"""# Testing for other search word"""

amazon_scraper = AutoScraper()
amazon_scraper.load('amazon_in.json')

search = "samsung phones"
amazon_url = "https://www.amazon.in/s?k={}&s=price-desc-rank".format(search)
data = amazon_scraper.get_result_similar(amazon_url, group_by_alias=True)

search_data = tuple(zip(data['Title'], data['ImageUrl'], data['Price'], data['Reviews']))
df = pd.DataFrame(columns=['Query', 'Title', 'Price', 'Reviews', 'ImageUrl'])
for i in range(len(search_data)):
    df.loc[len(df)] = [search, search_data[i][0], search_data[i][2],
                       search_data[i][3], search_data[i][1]]
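# --- Usage sketch: inspect the collected rows and persist them; the CSV
# filename is an illustrative choice, not from the original notebook.
print(df.head())
df.to_csv('amazon_in_results.csv', index=False)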
from autoscraper import AutoScraper

url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
highgrade_scraper = 'high grade scraper'

# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
wanted_list = ['High Grade', 'High-Grade']

botscraper = AutoScraper()
highgrade_results = botscraper.build(url, wanted_list)

if highgrade_results:
    print('BriefHub bot has found results! 🚀')
    for result in highgrade_results:
        print(result)
else:
    print("Hmmm, it doesn't look like we found anything")
    exit(-1)

botscraper.save(highgrade_scraper)
print(f"💿 > Save the model {highgrade_scraper}")
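# --- Usage sketch: reload the saved model later and re-run it, e.g. on a
# scheduled check for fresh drill-result headlines.
loaded = AutoScraper()
loaded.load(highgrade_scraper)
print(loaded.get_result_similar(url))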