def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]

    scraper = AutoScraper()
    scraper.build(link, wanted_list)

    # Group the exact results by rule id and keep only the first matching rule.
    results = scraper.get_result_exact(link, unique=False, grouped=True)
    rules = []
    for k, v in results.items():
        rules.extend([k, v])
    regla = rules[0]
    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])

    # Strip the URL down to a bare name and use it as the model file name.
    url = url.replace("http:", "").replace("//", "").replace(".", "").replace(
        "www", "").replace("https:", "").replace("/", "").replace("\n", "").replace("-", "")
    scraper.save(url + '-search')

    data = get_pagina_result(url, link)
    json_format = json.dumps(data, indent=4, separators=(',', ': '),
                             sort_keys=True, ensure_ascii=False)
    return json_format
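# The route above calls a get_pagina_result helper that is not shown. A minimal
# sketch of what it might do, assuming it simply reloads the model saved under
# the sanitized URL and returns the exact results keyed by the 'regla' alias
# (the helper name comes from the original, but this body is an assumption):
def get_pagina_result(url, link):
    loaded = AutoScraper()
    loaded.load(url + '-search')
    return loaded.get_result_exact(link, group_by_alias=True)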
def build(self, wanted_dict=None, model_name='1'):
    """ url2autospider """
    html_ = self.html
    url = self.url

    scraper = AutoScraper()
    scraper.build(html=html_, wanted_dict=wanted_dict)
    # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    scraper.save(model_name)
def getPrice():
    url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
    wanted_list = ["What are metaclasses in Python?"]

    scraper = AutoScraper()
    result = scraper.build(url, wanted_list)
    print(result)
async def scrape(ctx, url: str, wanted_list: str):
    # url = ''
    # wanted_list = ['']
    botscraper = AutoScraper()
    print(type(url))
    print(type(wanted_list))
    # build() expects a list of wanted items, so wrap the single string argument.
    scrape_result = botscraper.build(url, [wanted_list])
    print(scrape_result)
    results_message = '\r\n'.join([
        'BriefHub bot has found results! 🚀',
        f"Here is what I found for * {wanted_list} * on * {url} * : {str(scrape_result)}",
        ':-)'
    ])
    await ctx.send(results_message)
def autoscraper():
    link = request.json["Link"]
    global url
    url = request.json["Link"]
    wanted_list = request.json["Metodo"]

    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)

    # Collect the ids of the rules that produced similar results.
    results = scraper.get_result_similar(link, grouped=True)
    regla = list(results.keys())

    # data = get_pagina_result(url, link)
    # json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return regla
def autopan_bot():
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'

    # We can add one or multiple candidates here.
    # You can also put urls here to retrieve urls.
    wanted_list = ['High Grade', 'High-Grade']

    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    print(highgrade_results)

    if highgrade_results:
        for result in highgrade_results:
            print('BriefHub bot has found results! 🚀')
            print(highgrade_results)
    elif highgrade_results is None:
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)

    botscraper.save(highgrade_scraper)
    print(f"💿 > Save the model {highgrade_scraper}")
async def test_async_autoscraper():
    scraper = AutoScraper()
    scraper.use_async = True
    # url and wanted_list are assumed to be defined at module level.
    result = scraper.build(url, wanted_list)
    print(result)
from autoscraper import AutoScraper

url = 'https://www.rosario3.com/'
wanted_list = [
    "/especiales/Club-de-Lectura-Brandon-Sanderson-es-mejor-que-J.-R.-R.-Tolkien-20200909-0043.html"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

# Keep only the first matching rule, alias it as 'regla', and save the model.
results = scraper.get_result_exact(url, unique=False, grouped=True)
rules = []
for k, v in results.items():
    rules.extend([k, v])
regla = rules[0]
scraper.set_rule_aliases({regla: 'regla'})
scraper.keep_rules([regla])
scraper.save('rosario3-search')
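# A minimal sketch of how the saved 'rosario3-search' model could be reused
# later, assuming the model file from the snippet above is available on disk:
from autoscraper import AutoScraper

loaded = AutoScraper()
loaded.load('rosario3-search')
# Results come back keyed by the 'regla' alias that was set before saving.
links = loaded.get_result_exact('https://www.rosario3.com/', group_by_alias=True)
print(links.get('regla', []))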
data_dir = '/home/pi/Desktop/scraper/'
# scrapeRecord = os.path.join(data_dir, 'scrapeRecord.csv')  # location to save records of file send attempts for troubleshooting

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
wanted_list = [urlex]  # This is the most simple search type, just a one-page input

MainLink = [
    ('https://www.etsy.com/search?q=cosplay%20fire', [
        'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
    ]),
]

scraperMain = AutoScraper()  # define a new scraper object
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
# scraperMain.build(urlMain, wanted_list)  # build the contents of that scraper
scraperMain.save('etsyMain')  # Saves this particular build of the scraper! (Note: this is a local file, you can load it without having to regenerate it every time!)

# Build a new batch of features to collect. Best to collect them separately from each other so they don't cross wires!
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]
scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl, wanted_list=wanted_list)  # , update=True)  # grouped=True)
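# A minimal sketch of how the saved 'etsyMain' model could be reloaded to pull
# listing links from a different search page (the query URL is an assumption):
from autoscraper import AutoScraper

etsy_search = AutoScraper()
etsy_search.load('etsyMain')
listing_links = etsy_search.get_result_similar('https://www.etsy.com/search?q=cosplay%20ice')
print(listing_links)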
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""
from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)
print(scraper.get_result_similar(amazon_url, grouped=True))
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""
from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"
req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

# Alias the last rule as 'Title' and the first as 'Price', then keep only those two.
dyk = list(res_amzn.keys())
print(dyk)
Scrap.set_rule_aliases({dyk[-1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[-1], dyk[0]])
Scrap.save('amazon-search3')
import os
import sys
import requests
from time import time as timer
from urllib.parse import urljoin, urlparse
from multiprocessing.pool import ThreadPool

from autoscraper import AutoScraper
from expression.core import pipe
from expression.collections import Seq, seq

standard_ebooks_url = "https://standardebooks.org/ebooks"

# Navigation Scraper
navigation_scraper = AutoScraper()
scraped_pages_urls = navigation_scraper.build(
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])
pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)),
)

# Page Scraper
page_scraper = AutoScraper()
books_urls = page_scraper.build(standard_ebooks_url, [
    "/ebooks/ford-madox-ford/some-do-not",
    "/ebooks/booth-tarkington/the-turmoil",
    "/ebooks/anatole-france/penguin-island/a-w-evans",
    "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
], update=True)

for page in pages_urls:
    print(page)
    urls = page_scraper.get_result_similar(page)
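# A minimal sketch, assuming the loop above is meant to aggregate every book
# link across the scraped pages into one flat list of absolute URLs:
all_books = []
for page in pages_urls:
    relative_links = page_scraper.get_result_similar(page)
    all_books.extend(urljoin(standard_ebooks_url, link) for link in relative_links)
print(len(all_books), "book urls collected")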
def url_data():
    about()
    st.info("This feature has limited functionality")

    url = st.text_input("Webpage URL", help="Enter a url where your data is placed")
    if url == "":
        st.info("Please enter a valid input to get started")
        st.stop()

    # getting data column names as user input
    column_name = st.text_input("Enter candidate column name", key="value")
    value_list = column_name.split(",")

    # getting data example values for reference
    candidate = st.text_input("Candidate example value", key="candidates",
                              help="use ; as separator to enter another value")
    items_list = candidate.split(";")
    # st.write(items)

    # create object
    scraper = AutoScraper()
    # feeding for scraping
    final_result = scraper.build(url, items_list)
    # display result
    results = scraper.get_result_similar(url, grouped=True, keep_order=True)

    result = {}
    for key, value in results.items():
        if value not in result.values():
            result[key] = value

    orient_df = pd.DataFrame.from_dict(result, orient="index")
    df = orient_df.transpose()
    df.columns = value_list
    df.fillna(value=pd.np.nan, inplace=True)
    st.write(df)

    cols = df.columns.tolist()
    col1, col2 = st.beta_columns(2)
    target = col1.selectbox("Select Target", cols, key="target")
    typelist = ['binary', 'multiclass', 'regression', 'time series regression',
                'time series multiclass', 'time series binary']
    p_type = col2.selectbox("Select problem type", typelist, key="p_type")
    st.write("hey")

    x = df.drop(columns=target)
    y = df[target]
    x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(x, y, problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()
    rank = automl.rankings

    # checking best pipeline
    best_pipeline = automl.best_pipeline
    description = automl.describe_pipeline(automl.rankings.iloc[0]["id"])

    ### Optimize the code
    ### Evaluate on hold-out data
    problem_list = ['binary', 'time series binary']
    problem_list2 = ['multiclass', 'time series multiclass']

    cola, col_b, colc = st.beta_columns(3)

    # for binary type problems
    if p_type in problem_list:
        objective = col_b.selectbox("select objective", objectives().binary_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["auc", "f1", "Precision", "Recall"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['f1', 'precision'],
                                     max_batches=1, optimize_thresholds=True)
        automl_tunned.search()
        tunned_rankings = automl_tunned.rankings
        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)
        tunned_pipeline = automl_tunned.best_pipeline
        tunned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tunned_pipeline.predict_proba(x_test).to_dataframe()

    # for multiclass type problems
    elif p_type in problem_list2:
        objective = col_b.selectbox("select objective", objectives().multiclass_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass", "MCC multiclass", "accuracy multiclass"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                     max_batches=1, optimize_thresholds=True)
        automl_tunned.search()
        tunned_rankings = automl_tunned.rankings
        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)
        tunned_pipeline = automl_tunned.best_pipeline
        tunned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tunned_pipeline.predict(x_test).to_series()

    # for regression type problems
    else:
        objective = col_b.selectbox("select objective", objectives().regression_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                     max_batches=1, optimize_thresholds=True)
        automl_tunned.search()
        tunned_rankings = automl_tunned.rankings
        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)
        tunned_pipeline = automl_tunned.best_pipeline
        tunned_pipeline.score(x_test, y_test, objectives=[objective])
        tunned_pipeline.fit(x_train, y_train)
        pred = tunned_pipeline.predict(x_test).to_series()

    # persist the tuned pipeline description for download
    file = open("model_details.txt", "w")
    str_dict = repr(tunned_description)
    file.write(str_dict)
    file.close()

    def get_binary_file_downloader_html(bin_file, file_label='File'):
        with open(bin_file, 'rb') as f:
            data = f.read()
        bin_str = base64.b64encode(data).decode()
        href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
        return href

    col1, col2, col3 = st.beta_columns([1, 1, 1])
    if col2.button("Predict Results", key="output", help="shows results"):
        with st.spinner(text='In progress'):
            st.info("Wait while we select the best algorithm for your problem... Hold your breath.")
            time.sleep(20)
        st.info("Done. Here you go.")
        st.write(pred)

        col11, col12 = st.beta_columns([3, 1])
        with col11:
            with st.beta_expander("Compare Models"):
                st.write(tunned_rankings)
        with col12:
            with st.beta_expander("Best Pipeline"):
                st.success(tunned_pipeline)

        st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
import json
from pdb import set_trace

import requests
from requests.cookies import RequestsCookieJar
from requests.utils import add_dict_to_cookiejar

from autoscraper import AutoScraper

# url and the open cookies file handle `f` are assumed to be defined earlier in
# the original script; the jar initialization below is added so the snippet runs.
jar = RequestsCookieJar()
cookies_dict = json.load(f)
for cookie_dict in cookies_dict:
    add_dict_to_cookiejar(jar, cookie_dict)

s = requests.Session()
s.cookies = jar
cookies_dict = requests.utils.dict_from_cookiejar(s.cookies)

scraper = AutoScraper()
wanted_list = ["Unbilled"]
# Pass the session cookies through to the underlying requests call.
result = scraper.build(url, wanted_list, request_args={"cookies": s.cookies})
print(result)
set_trace()

# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
# wanted_list = ["What are metaclasses in Python?"]
# scraper = AutoScraper()
# result = scraper.build(url, wanted_list)
# print(result)
from autoscraper import AutoScraper

url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
highgrade_scraper = 'high grade scraper'

# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
wanted_list = ['High Grade', 'High-Grade']

botscraper = AutoScraper()
highgrade_results = botscraper.build(url, wanted_list)

if highgrade_results:
    for result in highgrade_results:
        print('BriefHub bot has found results! 🚀')
        print(highgrade_results)
elif highgrade_results is None:
    print("Hmmm, it doesn't look like we found anything")
    exit(-1)

botscraper.save(highgrade_scraper)
print(f"💿 > Save the model {highgrade_scraper}")
Original file is located at
    https://colab.research.google.com/drive/1RI_PvSSKJl-t3dGJNxeTIXiRXPbrbzyM
"""

!pip install git+https://github.com/alirezamika/autoscraper.git

from autoscraper import AutoScraper

url = 'http://wikicfp.com/cfp/call?conference=medical%20imaging&page=2'
category = ['Event']

scrape = AutoScraper()
final = scrape.build(url, category)
print(final)

for i in range(6, len(final), 5):
    print(final[i] + '\n')

"""# **Main Code Begins Here --**"""

import pandas as pd
import numpy as np

single_topics = ['5G', 'aerospace', 'automation', 'blockchain', 'bussiness', 'cancer', 'economics']
double_topics = [['medical', 'imaging'], ['fuzzy', 'systems']]

events = []
category = ['Event']
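# A minimal sketch of how the main loop might continue, assuming each topic is
# substituted into the same wikicfp query URL and scraped with the rules built
# above (the URL pattern and page number are assumptions, not from the original):
for topic in single_topics:
    topic_url = f'http://wikicfp.com/cfp/call?conference={topic}&page=1'
    events.extend(scrape.get_result_similar(topic_url))
print(len(events), "events collected")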