def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search.json')
    result = PAGINA_scraper.get_result_similar(link, unique=True, group_by_alias=True)
    return _aggregate_result(result)
def getPrice():
    url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
    wanted_list = ["What are metaclasses in Python?"]
    scraper = AutoScraper()
    result = scraper.build(url, wanted_list)
    print(result)
def auto(self, url, model_name='1'):
    scraper = AutoScraper()
    scraper.load(model_name)
    html_ = requests.get(url)
    html_.encoding = html_.apparent_encoding
    html_ = html_.text
    data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    return data
async def scrape(ctx, url: str, wanted_list: str):
    # url = ''
    # wanted_list = ['']
    botscraper = AutoScraper()
    print(type(url))
    print(type(wanted_list))
    # wanted_list arrives as a string; build() expects a list of wanted values
    scrape_result = botscraper.build(url, [wanted_list])
    print(scrape_result)
    results_message = '\r\n'.join([
        'BriefHub bot has found results! 🚀',
        f"Here is what I found for * {wanted_list} * on * {url} * : {str(scrape_result)}",
        ':-)'
    ])
    await ctx.send(results_message)
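The coroutine above reads like a discord.py command handler. A minimal sketch of the wiring it assumes — the bot setup and token are illustrative, not from the original source:

# Hypothetical wiring for the command above (assumes discord.py's commands framework).
# discord.py 2.x additionally requires an intents argument:
#   commands.Bot(command_prefix='!', intents=discord.Intents.default())
from discord.ext import commands

bot = commands.Bot(command_prefix='!')
bot.command(name='scrape')(scrape)  # register the coroutine above as !scrape
# bot.run('YOUR_DISCORD_BOT_TOKEN')  # placeholder token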
def autopan_bot():
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'
    # We can add one or multiple candidates here.
    # You can also put urls here to retrieve urls.
    wanted_list = ['High Grade', 'High-Grade']
    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    print(highgrade_results)
    if highgrade_results:
        print('BriefHub bot has found results! 🚀')
        for result in highgrade_results:
            print(result)
    elif highgrade_results is None:
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)
    botscraper.save(highgrade_scraper)
    print(f"💿 > Saved the model {highgrade_scraper}")
def autoscraper():
    link = request.json["Link"]
    global url
    url = request.json["Link"]
    wanted_list = request.json["Metodo"]
    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)
    results = scraper.get_result_similar(link, grouped=True)
    regla = list(results.keys())
    # data = get_pagina_result(url, link)
    # json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return regla
def build(self, wanted_dict=None, model_name='1'):
    """url2autospider"""
    html_ = self.html
    url = self.url
    scraper = AutoScraper()
    scraper.build(html=html_, wanted_dict=wanted_dict)
    # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    scraper.save(model_name)
from autoscraper import AutoScraper

# Create the model
url = 'https://medium.com/@inzaniak'
wanted_list = [
    "Build a Web Scraping Python Project from Start to Finish",
    "5 things you need to learn as a Python beginner"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)
print(result)

# Save the model
scraper.save('scrapers/medium.json')

# Load the model
del scraper
scraper = AutoScraper()
scraper.load('scrapers/medium.json')
scraper.get_result_similar(url)
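The point of saving and loading is reuse: the learned rules can be applied to other, similarly structured pages. A minimal sketch of that, assuming a hypothetical Medium author URL whose page layout matches the one the model was built on:

# Reuse the saved model on a similar page (hypothetical URL; any Medium
# author page with the same layout should work).
scraper = AutoScraper()
scraper.load('scrapers/medium.json')
other_results = scraper.get_result_similar('https://medium.com/@some-other-author')
print(other_results)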
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""
from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"
req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

dyk = list(res_amzn.keys())
print(dyk)

Scrap.set_rule_aliases({dyk[-1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[-1], dyk[0]])
Scrap.save('amazon-search3')
import os
import sys
import requests
from time import time as timer
from urllib.parse import urljoin, urlparse
from multiprocessing.pool import ThreadPool

from autoscraper import AutoScraper
from expression.core import pipe
from expression.collections import Seq, seq

standard_ebooks_url = "https://standardebooks.org/ebooks"

# Navigation Scraper
navigation_scraper = AutoScraper()
scraped_pages_urls = navigation_scraper.build(
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])

pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)),
)

# Page Scraper
page_scraper = AutoScraper()
books_urls = page_scraper.build(standard_ebooks_url, [
    "/ebooks/ford-madox-ford/some-do-not",
    "/ebooks/booth-tarkington/the-turmoil",
    "/ebooks/anatole-france/penguin-island/a-w-evans",
    "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
], update=True)

for page in pages_urls:
    print(page)
import os
import time
import base64

import streamlit as st
import pandas as pd
import numpy as np
import evalml
from evalml.automl import AutoMLSearch
from autoscraper import AutoScraper

# NOTE: about() and objectives() are app-local helpers defined elsewhere in this project.


def url_data():
    about()
    st.info("This feature has limited functionality")
    url = st.text_input("Webpage URL", help="Enter a url where your data is placed")
    if url == "":
        st.info("Please enter a valid input to get started")
        st.stop()

    # getting data column names as user input
    column_name = st.text_input("Enter candidate column name", key="value")
    value_list = column_name.split(",")

    # getting example data for reference
    candidate = st.text_input("Candidate example value", key="candidates",
                              help="use ; as separator to enter another value")
    items_list = candidate.split(";")

    # create object
    scraper = AutoScraper()
    # feeding for scraping
    final_result = scraper.build(url, items_list)
    # collect and deduplicate results
    results = scraper.get_result_similar(url, grouped=True, keep_order=True)
    result = {}
    for key, value in results.items():
        if value not in result.values():
            result[key] = value

    # display result
    orient_df = pd.DataFrame.from_dict(result, orient="index")
    df = orient_df.transpose()
    df.columns = value_list
    df.fillna(value=np.nan, inplace=True)  # pd.np is deprecated; use numpy directly
    st.write(df)

    cols = df.columns.tolist()
    col1, col2 = st.beta_columns(2)
    target = col1.selectbox("Select Target", cols, key="target")
    typelist = ['binary', 'multiclass', 'regression',
                'time series regression', 'time series multiclass', 'time series binary']
    p_type = col2.selectbox("Select problem type", typelist, key="p_type")

    x = df.drop(columns=target)
    y = df[target]
    x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(x, y, problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()
    rank = automl.rankings

    # checking best pipeline
    best_pipeline = automl.best_pipeline
    description = automl.describe_pipeline(automl.rankings.iloc[0]["id"])

    # TODO: optimize this code
    # Evaluate on hold-out data
    problem_list = ['binary', 'time series binary']
    problem_list2 = ['multiclass', 'time series multiclass']
    cola, col_b, colc = st.beta_columns(3)

    # for binary type problems
    if p_type in problem_list:
        objective = col_b.selectbox("select objective", objectives().binary_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["auc", "f1", "Precision", "Recall"])
        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['f1', 'precision'],
                                    max_batches=1, optimize_thresholds=True)
        automl_tuned.search()
        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"], return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline
        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tuned_pipeline.predict_proba(x_test).to_dataframe()

    # for multiclass type problems
    elif p_type in problem_list2:
        objective = col_b.selectbox("select objective", objectives().multiclass_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass", "MCC multiclass", "accuracy multiclass"])
        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                    max_batches=1, optimize_thresholds=True)
        automl_tuned.search()
        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"], return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline
        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tuned_pipeline.predict(x_test).to_series()

    # for regression type problems
    else:
        objective = col_b.selectbox("select objective", objectives().regression_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])
        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                    max_batches=1, optimize_thresholds=True)
        automl_tuned.search()
        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"], return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline
        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        tuned_pipeline.fit(x_train, y_train)
        pred = tuned_pipeline.predict(x_test).to_series()

    with open("model_details.txt", "w") as file:
        file.write(repr(tuned_description))

    def get_binary_file_downloader_html(bin_file, file_label='File'):
        with open(bin_file, 'rb') as f:
            data = f.read()
        bin_str = base64.b64encode(data).decode()
        href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
        return href

    col1, col2, col3 = st.beta_columns([1, 1, 1])
    if col2.button("Predict Results", key="output", help="shows results"):
        st.spinner()
        with st.spinner(text='In progress'):
            st.info("Wait while we select the best algorithm for your problem... hold your breath.")
            time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)

    col11, col12 = st.beta_columns([3, 1])
    with col11:
        with st.beta_expander("Compare Models"):
            st.write(tuned_rankings)
    with col12:
        with st.beta_expander("Best Pipeline"):
            st.success(tuned_pipeline)
            st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'),
                        unsafe_allow_html=True)
#!/usr/bin/env python
# coding: utf-8

from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?i=aps&k=iphone"
wanted_list = ["New Apple iPhone 12 Pro Max (128GB) - Pacific Blue", "₹1,25,900"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))

scraper.set_rule_aliases({"rule_1943": "Title", "rule_1gc6": "MRP"})
scraper.keep_rules(["rule_1943", "rule_1gc6"])
scraper.save("amazon_search")
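The hard-coded rule ids above ("rule_1943", "rule_1gc6") are generated per build and will differ from run to run. A more robust sketch reads them from the grouped output first; it assumes Title and MRP come out as the first and last groups, which should be verified against the printed result:

# Read the generated rule ids instead of hard-coding them (order assumption
# must be checked against the printed grouped result).
grouped = scraper.get_result_similar(amazon_url, grouped=True)
rule_ids = list(grouped.keys())
scraper.set_rule_aliases({rule_ids[0]: "Title", rule_ids[-1]: "MRP"})
scraper.keep_rules([rule_ids[0], rule_ids[-1]])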
from autoscraper import AutoScraper
import pandas as pd

pd.set_option('display.max_rows', None)

tickers = ['SCHB', 'AMZN', 'AAPL', 'MSFT', 'TSLA', 'AMD', 'NFLX']

scraper = AutoScraper()
scraper.load('../finviz_table')

for ticker in tickers:
    url = f'https://finviz.com/quote.ashx?t={ticker}'
    result = scraper.get_result(url)[0]
    index = result.index('Index')
    df = pd.DataFrame(zip(result[index:], result[:index]), columns=['Attributes', 'Values'])
    print(f'\n{ticker} Data: ')
    print(df.set_index('Attributes'))
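This snippet assumes a previously saved model at ../finviz_table. A hedged sketch of how such a model might be built — the wanted values are illustrative and must match text actually present on the quote page at build time:

# Hypothetical build step for the '../finviz_table' model (illustrative values).
from autoscraper import AutoScraper

builder = AutoScraper()
build_url = 'https://finviz.com/quote.ashx?t=AAPL'
wanted_list = ['Index', 'Market Cap', 'P/E']  # cell labels copied from the live table
builder.build(build_url, wanted_list)
builder.save('../finviz_table')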
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:29:21 2021

@author: win10
"""
from autoscraper import AutoScraper
from flask import Flask, request

amazon_scraper = AutoScraper()
amazon_scraper.load('amazon-search')

app = Flask(__name__)


def get_amazon_result(search_query):
    url = 'https://www.amazon.in/s?k=%s' % search_query
    result = amazon_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    print(list(result.values())[0])
    for i in range(len(list(result.values())[0])):
        try:
            final_result.append({alias: result[alias][i] for alias in result})
        except IndexError:  # some alias has fewer values for this row
            pass
    return final_result
from autoscraper import AutoScraper
from flask import Flask, request

ebay_scraper = AutoScraper()
etsy_scraper = AutoScraper()
ebay_scraper.load('ebay-search')
etsy_scraper.load('etsy-search')

app = Flask(__name__)


def get_ebay_result(search_query):
    url = 'https://www.ebay.com/sch/i.html?_nkw=%s' % search_query
    result = ebay_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def get_etsy_result(search_query):
    url = 'https://www.etsy.com/search?q=%s' % search_query
    result = etsy_scraper.get_result_similar(url, group_by_alias=True)
    result['url'] = [f'https://www.etsy.com/listing/{i}' for i in result['url']]
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result
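_aggregate_result pivots the alias-keyed columns returned by get_result_similar(..., group_by_alias=True) into a list of row dicts. A small self-contained illustration with made-up data:

# Made-up grouped result, shaped like get_result_similar(..., group_by_alias=True) output.
sample = {
    'title': ['Widget A', 'Widget B'],
    'price': ['$10', '$12'],
}
# _aggregate_result(sample) yields one dict per row:
# [{'title': 'Widget A', 'price': '$10'}, {'title': 'Widget B', 'price': '$12'}]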
from flask import Flask, render_template, request
from autoscraper import AutoScraper
import pandas as pd
import time

app = Flask(__name__)

# creating object and loading
amazon_scraper = AutoScraper()
amazon_scraper.load('amazon_in.json')


@app.route("/", methods=['GET'])
def home():
    # when the user searches
    if request.args.get('search'):
        # inputs
        search = request.args.get('search')
        sortby = request.args.get('sortby', 'relevanceblender')
        # call function to retrieve data
        search_data, original_url = searchquery(search, sortby)
        data_length = len(search_data)
        # show to user
        return render_template("index.html", data={
            'original_url': original_url,
            'query': search,
            'sortby': sortby,
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""
from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))
import json
import http.cookiejar as cookielib
import requests
from requests.utils import add_dict_to_cookiejar
from pdb import set_trace

from autoscraper import AutoScraper

url = "https://app.hellobonsai.com/time_entries"

jar = cookielib.CookieJar()
with open("cookies.json") as f:
    cookies_dict = json.load(f)
for cookie_dict in cookies_dict:
    add_dict_to_cookiejar(jar, cookie_dict)

s = requests.Session()
s.cookies = jar
cookies_dict = requests.utils.dict_from_cookiejar(s.cookies)

scraper = AutoScraper()
wanted_list = ["Unbilled"]
result = scraper.build(url, wanted_list, request_args={"cookies": s.cookies})
print(result)
set_trace()

# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
# wanted_list = ["What are metaclasses in Python?"]
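request_args is forwarded by autoscraper to the underlying requests call, so the same hook that passes cookies above also works for headers or proxies. A minimal sketch (the header value is illustrative):

# request_args is passed through to requests, so custom headers work the same way.
headers = {"User-Agent": "Mozilla/5.0"}  # illustrative value
result = scraper.get_result_similar(url, request_args={"cookies": s.cookies, "headers": headers})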
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""
from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))

scraper.set_rule_aliases({'rule_io1c': 'Title', 'rule_hm52': 'Price'})
scraper.keep_rules(['rule_io1c', 'rule_hm52'])
scraper.save('AmazonIn-search')
# pip install autoscraper
from autoscraper import AutoScraper

url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
wanted_list = ['How to check version of python modules?']

scraper = AutoScraper()
result = scraper.build(url, wanted_list)
for res in result:
    print(res)

"""
How to execute a program or call a system command from Python
What are metaclasses in Python?
Does Python have a ternary conditional operator?
Convert bytes to a string
Does Python have a string 'contains' substring method?
How to check version of python modules?
"""
print('started scraping...')

data_dir = '/home/pi/Desktop/scraper/'
# scrapeRecord = os.path.join(data_dir, 'scrapeRecord.csv')  # location to save records of file send attempts for troubleshooting

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
wanted_list = [urlex]

# This is the simplest search type: just a one-page input.
MainLink = [
    ('https://www.etsy.com/search?q=cosplay%20fire', [
        'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
    ]),
]

scraperMain = AutoScraper()  # define a new scraper object
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
# scraperMain.build(urlMain, wanted_list)  # build the contents of that scraper
# Saves this particular build of the scraper (a local file, so you can load it
# without having to regenerate it every time).
scraperMain.save('etsyMain')

# Build a new batch of features to collect. Best to collect them separately
# from each other so they don't cross wires.
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]

scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl, wanted_list=wanted_list)
import json

from autoscraper import AutoScraper
from flask import Flask, request, render_template

PAGINA_scraper = AutoScraper()
PAGINA_scraper.load('./rosario3-search')

app = Flask(__name__)


def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search')
    result = PAGINA_scraper.get_result_similar(link, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result


@app.route('/AutoScraper', methods=['GET'])
def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]
    scraper = AutoScraper()
from autoscraper import AutoScraper
# AutoScraper must be installed with
# pip install git+https://github.com/alirezamika/autoscraper.git

question = "france"
time = "year"
url = f"https://www.quora.com/search?q={question}&time={time}"
model_name = "model_quora"

scraper = AutoScraper()
scraper.load(f"./{model_name}")

results = scraper.get_result_similar(url)

if results:
    for r in results:
        print(r)
else:
    # no results
    print("No result found")
def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]

    scraper = AutoScraper()
    scraper.build(link, wanted_list)
    results = scraper.get_result_exact(link, unique=False, grouped=True)
    regla = list(results.keys())[0]

    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])

    url = (url.replace("http:", "").replace("//", "").replace(".", "")
              .replace("www", "").replace("https:", "").replace("/", "")
              .replace("\n", "").replace("-", ""))
    scraper.save(url + '-search')

    data = get_pagina_result(url, link)
    json_format = json.dumps(data, indent=4, separators=(',', ': '),
                             sort_keys=True, ensure_ascii=False)
    return json_format
from autoscraper import AutoScraper

# Parameters
url = "https://www.quora.com/search?q=deep%20learning&time=year"
model_name = "model_quora"
wanted_list = ["When will deep learning finally die out?"]

# Instantiate the AutoScraper
scraper = AutoScraper()

# Train the scraper.
# Here we can also pass HTML content via the html parameter instead of the url (html=html_content).
result = scraper.build(url, wanted_list)

# Display the results, if any
if result:
    print("🚀 Great, a query has been inferred! Good job.")
    print(result)
else:
    # If there is no result, exit with an error code
    print("Sorry, no query could be inferred... 😿")
    exit(-1)

# Save the model for future use
print(f"💿 > Saving the model {model_name}")
scraper.save(model_name)
#!/usr/bin/env python3
import os

from autoscraper import AutoScraper  # python3 -m pip install autoscraper

# sudo chmod a+rwxX /home/pi/Desktop/scraper/
# sudo python3 /home/pi/Desktop/scraper/AutoScrapeRun.py

data_dir = '/home/pi/Desktop/scraper/'
scrapeRecord = os.path.join(data_dir, 'Results.txt')  # location to save records of file send attempts for troubleshooting

scraperitemFavorites = AutoScraper()
scrapershopSales = AutoScraper()
scraperlastSale = AutoScraper()
scraperbestSeller = AutoScraper()
scraperstars = AutoScraper()
scraperitemReviews = AutoScraper()
scrapershopReviews = AutoScraper()
scrapershopAge = AutoScraper()
# scrapertotalItems = AutoScraper()
scraperprice = AutoScraper()

scrapershopSales.load('scrapershopSales')
scraperlastSale.load('scraperlastSale')
scraperbestSeller.load('scraperbestSeller')
scraperstars.load('scraperstars')
scraperitemReviews.load('scraperitemReviews')
scrapershopReviews.load('scrapershopReviews')
scrapershopAge.load('scrapershopAge')
# scrapertotalItems.load('scrapertotalItems')
scraperprice.load('scraperprice')
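With the models loaded, each scraper can be pointed at a listing page and the output appended to the record file. A hedged sketch — the listing URL is illustrative, and the model files must have been saved earlier for the loads above to succeed:

# Illustrative usage of two of the loaded scrapers (hypothetical listing URL).
listing_url = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable'
price = scraperprice.get_result_similar(listing_url)
stars = scraperstars.get_result_similar(listing_url)
with open(scrapeRecord, 'a') as record:
    record.write(f'{listing_url},{price},{stars}\n')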
from autoscraper import AutoScraper

url = 'https://www.rosario3.com/'
wanted_list = [
    "/especiales/Club-de-Lectura-Brandon-Sanderson-es-mejor-que-J.-R.-R.-Tolkien-20200909-0043.html"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

results = scraper.get_result_exact(url, unique=False, grouped=True)
regla = list(results.keys())[0]

scraper.set_rule_aliases({regla: 'regla'})
scraper.keep_rules([regla])
scraper.save('rosario3-search')
from autoscraper import AutoScraper
from flask import Flask, request, escape

flipkart_scraper = AutoScraper()
flipkart_scraper.load('flipkart-search')

app = Flask(__name__)


def get_flipkart_result(search_query):
    url = ('https://www.flipkart.com/search?q=%s&otracker=search&otracker1=search'
           '&marketplace=FLIPKART&as-show=on&as=off' % search_query)
    result = flipkart_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    print(list(result.values())[0])
    print(list(result.values())[1])
    for alias in result:
        print(alias)
    for i in range(len(list(result.values())[0])):
        try:
            final_result.append({alias: result[alias][i] for alias in result})
        except IndexError:  # some alias has fewer values for this row
            pass
    return final_result


@app.route('/', methods=['GET'])
# NOTE: url and wanted_list are assumed to be defined at module level; they are not part of this snippet.
async def test_async_autoscraper():
    scraper = AutoScraper()
    scraper.use_async = True
    result = scraper.build(url, wanted_list)
    print(result)
from autoscraper import AutoScraper
import pandas as pd

"""# Defining url and search word"""

search = "iphone+11+silver"
amazon_url = "https://www.amazon.in/s?k={}&s=price-desc-rank".format(search)
print(amazon_url)

"""# Defining what data I want"""

wanted_list = [
    "https://m.media-amazon.com/images/I/71umuN8XVeL._AC_UY218_.jpg",
    "New Apple iPhone 12 Pro Max (128GB) - Silver",
    "1,25,900",
    "501"
]

"""# Creating scraper object"""

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

"""# Finding similar data"""

data = scraper.get_result_similar(amazon_url, grouped=True)
print(data)
keys = list(data.keys())
print(keys)

"""# Defining alias"""

scraper.set_rule_aliases({str(keys[0]): 'ImageUrl', str(keys[2]): 'Title',
                          str(keys[-2]): 'Price', str(keys[-1]): 'Reviews'})
scraper.save("amazon_in.json")
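The saved amazon_in.json model can then be reloaded elsewhere (the Flask app above loads this same file) and queried by the aliases set here. A minimal sketch:

# Reload the saved model and query it, keyed by the aliases set above.
loaded = AutoScraper()
loaded.load('amazon_in.json')
by_alias = loaded.get_result_similar(amazon_url, group_by_alias=True)
print(by_alias.get('Title'))
print(by_alias.get('Price'))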