# Example #1
def autoscraper(Link=None, Metodo=None):
    """Flask view: train an AutoScraper from request args and return JSON.

    Reads "Link" (page URL) and "Metodo" (one example wanted value) from
    ``request.args``; the function parameters are ignored at runtime and are
    kept only for backward compatibility with existing routes/callers.

    Returns:
        str: the scraped result serialized as pretty-printed JSON.
    """
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]
    scraper = AutoScraper()
    scraper.build(link, wanted_list)

    # grouped=True yields {rule_id: [values]}; keep only the first learned
    # rule and alias it as 'regla'.  (Previously this shadowed the builtin
    # `dict` and abused a list comprehension for its side effects.)
    results = scraper.get_result_exact(link, unique=False, grouped=True)
    regla = next(iter(results))
    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])

    # Strip URL punctuation to derive a filesystem-safe model name.
    # Token order preserved from the original chained .replace() calls.
    for token in ("http:", "//", ".", "www", "https:", "/", "\n", "-"):
        url = url.replace(token, "")
    scraper.save(url + '-search')

    data = get_pagina_result(url, link)
    return json.dumps(data,
                      indent=4,
                      separators=(',', ': '),
                      sort_keys=True,
                      ensure_ascii=False)
# Example #2
 def build(self, wanted_dict=None, model_name='1'):
     """Build an AutoScraper model from this page's HTML and save it to disk.

     Args:
         wanted_dict: alias -> wanted-values mapping forwarded to
             ``AutoScraper.build``.
         model_name: file name under which the trained model is saved.
     """
     # Fix: the original also bound `url = self.url` but never used it.
     scraper = AutoScraper()
     scraper.build(html=self.html, wanted_dict=wanted_dict)
     # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
     scraper.save(model_name)
# Example #3
def getPrice():
    """Demo: scrape a StackOverflow page for titles similar to the sample."""
    target = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
    samples = ["What are metaclasses in Python?"]

    auto = AutoScraper()
    found = auto.build(target, samples)
    print(found)
# Example #4
async def scrape(ctx, url: str, wanted_list: str):
    """Discord command: scrape `url` for values similar to `wanted_list`
    and post the findings back to the invoking channel."""
    # url = ''
    # wanted_list = ['']
    botscraper = AutoScraper()
    print(type(url))
    print(type(wanted_list))
    scrape_result = botscraper.build(url, wanted_list)
    print(scrape_result)
    lines = [
        'BriefHub bot has found results! 🚀',
        f"Here is what I found for * {wanted_list} * on * {url} * : {str(scrape_result)}",
        ':-)',
    ]
    results_message = '\r\n'.join(lines)
    await ctx.send(results_message)
# Example #5
def autoscraper():
    """Flask view: train an AutoScraper on the posted link, return rule ids.

    Reads "Link" and "Metodo" from the JSON body.  The wanted_dict below is
    hard-coded to example article URLs; the scraper and url are stored in
    module globals for use by other endpoints.

    Returns:
        list: identifiers of the rules learned for the page.
    """
    link = request.json["Link"]
    global url
    url = request.json["Link"]
    wanted_list = request.json["Metodo"]
    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)
    # grouped=True returns {rule_id: [values]}; the response is the rule ids.
    # (Previously shadowed the builtin `dict` and used a list comprehension
    # purely for its .extend side effect.)
    results = scraper.get_result_similar(link, grouped=True)

    #data = get_pagina_result(url, link)
    #json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return list(results)
def autopan_bot():
    """Scrape junior-mining drill-result headlines for 'high grade' mentions
    and save the trained model under `highgrade_scraper`.

    Exits with status -1 when the build finds nothing.
    """
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'
    # We can add one or multiple candidates here.
    # You can also put urls here to retrieve urls.
    wanted_list = ['High Grade', 'High-Grade']
    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    print(highgrade_results)
    if highgrade_results:
        print('BriefHub bot has found results! 🚀')
        for result in highgrade_results:
            # Fix: print each individual result; the original re-printed the
            # whole list once per element.
            print(result)
    else:
        # Fix: covers both None and an empty result list; the original
        # compared with `== None`, so an empty list fell through silently.
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)
    botscraper.save(highgrade_scraper)
    print(f"💿 > Save the model {highgrade_scraper}")
async def test_async_autoscraper():
    """Smoke-test AutoScraper with async fetching enabled.

    NOTE(review): relies on module-level `url` and `wanted_list` being
    defined elsewhere in this file — confirm before running standalone.
    """
    scraper = AutoScraper()
    scraper.use_async = True  # switch build() to asynchronous page fetching
    result = scraper.build(url, wanted_list)
    print(result)
# Example #8
from autoscraper import AutoScraper

# Train an AutoScraper on rosario3.com from one example article URL, keep
# only the first learned rule (aliased 'regla'), and persist the model.
url = 'https://www.rosario3.com/'

wanted_list = [
    "/especiales/Club-de-Lectura-Brandon-Sanderson-es-mejor-que-J.-R.-R.-Tolkien-20200909-0043.html"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

# grouped=True yields {rule_id: [values]}; take the first rule id.
# (Fix: the original shadowed the builtins `dict`/`l` and used a list
# comprehension purely for its .extend side effect.)
results = scraper.get_result_exact(url, unique=False, grouped=True)

regla = next(iter(results))
scraper.set_rule_aliases({regla: 'regla'})

scraper.keep_rules([regla])

scraper.save('rosario3-search')
# Example #9
# Build and save an Etsy search-page scraper, then a separate scraper for
# per-listing favorite counts. Features are trained separately so their
# learned rules do not interfere with each other.
data_dir = '/home/pi/Desktop/scraper/'
#scrapeRecord = os.path.join(data_dir,'scrapeRecord.csv') # location to save records of file send attempts for troubleshooting

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
wanted_list = [urlex]  # simplest search type: a single-page input

# (search URL, example listing URLs found on that page)
MainLink = [
    (urlMain, [urlex]),
]

scraperMain = AutoScraper()  # fresh scraper object for the search page
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
# Saved as a local file so it can be re-loaded without retraining.
scraperMain.save('etsyMain')

# Second scraper: favorite counts on an individual listing page.
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]
scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl,
                               wanted_list=wanted_list)  #, update=True)# grouped =True)
# Example #10
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""

from autoscraper import AutoScraper
amazon_url = "https://www.amazon.in/s?k=iphones"

# One example price and one example title from the live results page
# seed the scraper's rule learning.
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

print(scraper.get_result_similar(amazon_url, grouped=True))



# Example #11
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""

from autoscraper import AutoScraper

# Train a scraper for Amazon search results, alias the learned rules as
# Title (last rule) / Price (first rule), and save the model.
Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"

req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]
Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

dyk = list(res_amzn.keys())
print(dyk)
# Fix: idiomatic negative indexing instead of dyk[len(dyk) - 1].
Scrap.set_rule_aliases({dyk[-1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[-1], dyk[0]])
Scrap.save('amazon-search3')
# Example #12
import os
import sys
import requests
from time import time as timer
from urllib.parse import urljoin, urlparse
from multiprocessing.pool import ThreadPool

from autoscraper import AutoScraper
from expression.core import pipe
from expression.collections import Seq, seq

# Scrape Standard Ebooks: one scraper learns pagination links, a second
# learns per-book links; the second is then applied to every listing page.
standard_ebooks_url = "https://standardebooks.org/ebooks"

# Navigation Scraper: learns "?page=N" style pagination links.
navigation_scraper = AutoScraper()
scraped_pages_urls = navigation_scraper.build(
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])
# Seq/seq come from the third-party `expression` package: lazily map the
# relative page paths to absolute URLs.
pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)), )

# Page Scraper: learns individual book links from example book paths.
page_scraper = AutoScraper()
books_urls = page_scraper.build(standard_ebooks_url, [
    "/ebooks/ford-madox-ford/some-do-not",
    "/ebooks/booth-tarkington/the-turmoil",
    "/ebooks/anatole-france/penguin-island/a-w-evans",
    "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
],
                                update=True)
# NOTE(review): `urls` is overwritten on every iteration, so only the last
# page's results survive the loop — confirm whether accumulation was intended.
for page in pages_urls:
    print(page)
    urls = page_scraper.get_result_similar(page)
# Example #13
def url_data():
    """Streamlit page: scrape a web page into a DataFrame, then run EvalML.

    Flow: collect URL / column names / example values from the user, build an
    AutoScraper model, assemble the grouped results into a DataFrame, run an
    AutoMLSearch for the chosen problem type, re-run it tuned on a
    user-selected objective, and offer the pipeline details for download.

    NOTE(review): depends on module-level imports/helpers not shown in this
    chunk (st, pd, evalml, AutoMLSearch, objectives, about, base64, time,
    json, os) — verify against the rest of the file.
    """
    about()
    st.info("This feature has limited functionality")
    url=st.text_input("Webpage URL",help="Enter a url where your data is placed")
    if url=="":
        st.info("Please enter a valid input to get started")
        st.stop()
    
    #getting data Column names as user input
    column_name=st.text_input("enter candidadte column Name",key="value")
    value_list=column_name.split(",")
    
    #getting data example for refferances
    candidate=st.text_input("Candidate example value",key="candidates",help="use ; as seperator to enter another value")
    items_list=candidate.split(";")
    #st.write(items)
    
# create object
    scraper = AutoScraper()
# feeding for scraping
    final_result = scraper.build(url,items_list)
# display result
    
    
    results=scraper.get_result_similar(url,grouped=True,keep_order=True)
    # Drop rules whose value-lists duplicate an earlier rule's, so each
    # scraped column appears only once.
    result={}
    for key,value in results.items():
        if value not in result.values():
            result[key]=value
            
    # One scraped group per row, then transpose so groups become columns.
    orient_df=pd.DataFrame.from_dict(result,orient="index")
    df=orient_df.transpose()
    
    # Assumes the user entered exactly one column name per surviving group —
    # TODO confirm; a length mismatch raises on assignment.
    df.columns=value_list
    df.fillna(value=pd.np.nan,inplace=True)  # NOTE(review): pd.np is deprecated in newer pandas
    st.write(df)
    
    cols=df.columns.tolist()
    col1,col2=st.beta_columns(2)
 
    target=col1.selectbox("Select Target", cols,key="target")


    
    typelist=['binary','multiclass','regression','time series regression','time series multiclass','time series binary']
    p_type=col2.selectbox("Select problem type",typelist,key="p_type")     
    st.write("hey")
    x=df.drop(columns=target)
    y=df[target]
    x_train,x_test,y_train,y_test=evalml.preprocessing.split_data(x,y,problem_type=p_type)

    # Baseline AutoML search over candidate pipelines.
    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()


    rank=automl.rankings

#checking best pipeline     ###############################################################

    best_pipeline=automl.best_pipeline
    description=automl.describe_pipeline(automl.rankings.iloc[0]["id"])

### OPtimize the code 


### Evaluate on hold out data
    problem_list=['binary','time series binary']
    problem_list2=['multiclass','time series multiclass']

    cola,col_b,colc=st.beta_columns(3)
    
    # Binary problems: re-run the search tuned on the chosen binary objective.
    if p_type in problem_list:
        objective=col_b.selectbox("select objective",objectives().binary_obj,key="objective selector")  
        best_pipeline.score(x_test, y_test, objectives=["auc","f1","Precision","Recall"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['f1', 'precision'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict_proba(x_test).to_dataframe()


# for multiclass type problem
    elif p_type in problem_list2:
        objective=col_b.selectbox("select objective",objectives().multiclass_obj,key="objective selector") 
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass","MCC multiclass","accuracy multiclass"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict(x_test).to_series()

    
# for regression type problems
    else:
                objective=col_b.selectbox("select objective",objectives().regression_obj,key="objective selector") 
                best_pipeline.score(x_test, y_test, objectives=["r2","MSE","MAE","Root Mean Squared Error"])
                automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['Root Mean Squared Error', 'MSE','MAE'],
                                         max_batches=1,
                                         optimize_thresholds=True)

                automl_tunned.search()

                tunned_rankings=automl_tunned.rankings

                tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

                tunned_pipeline= automl_tunned.best_pipeline

                tunned_pipeline.score(x_test, y_test,  objectives=[objective])

                tunned_pipeline.fit(x_train,y_train)
                    
                pred=tunned_pipeline.predict(x_test).to_series()
                
    # Persist the tuned pipeline description so it can be downloaded below.
    file=open("model_details.txt","w")
    str_dict=repr(tunned_description)
    file.write(str_dict)
    file.close()
    def get_binary_file_downloader_html(bin_file, file_label='File'):
            # Build a base64 data-URI anchor tag for downloading a local file.
            with open(bin_file, 'rb') as f:
                data = f.read()
                bin_str = base64.b64encode(data).decode()
                href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
                return href                
    col1,col2,col3=st.beta_columns([1,1,1])        
    if col2.button("Predict Results",key="output",help="shows results"):
            st.spinner()
            with st.spinner(text='In progress'):
                 st.info("Wait while we are selecting a best algoritham for your problem..Hold your breath.")
                 time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)

    col11,col12=st.beta_columns([3,1])
    with col11:
        with st.beta_expander("Compare Models"):
                st.write(tunned_rankings)
        
    with col12:
        with st.beta_expander("Best Pipeline"):
                st.success(tunned_pipeline)
                st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
    # NOTE(review): `f` is not defined in this function — this line appears to
    # belong to a different pasted snippet and raises NameError if reached.
    cookies_dict = json.load(f)

# Install each loaded cookie dict into the jar, attach the jar to a session,
# then scrape a page that requires those cookies.
for cookie in cookies_dict:
    add_dict_to_cookiejar(jar, cookie)

s = requests.Session()
s.cookies = jar

# Round-trip the jar back to a plain dict (for inspection/debugging).
cookies_dict = requests.utils.dict_from_cookiejar(s.cookies)

scraper = AutoScraper()

wanted_list = ["Unbilled"]

# Forward the authenticated cookies with every request the scraper makes.
result = scraper.build(url, wanted_list, request_args={"cookies": s.cookies})
print(result)

set_trace()



# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
# wanted_list = ["What are metaclasses in Python?"]

# scraper = AutoScraper()
# result = scraper.build(url, wanted_list)
# print(result)
from autoscraper import AutoScraper

# Scrape junior-mining drill-result headlines for 'high grade' mentions and
# save the trained model; exits with -1 when nothing is found.
url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
highgrade_scraper = 'high grade scraper'
# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
wanted_list = ['High Grade', 'High-Grade']

botscraper = AutoScraper()

highgrade_results = botscraper.build(url, wanted_list)

if highgrade_results:
    print('BriefHub bot has found results! 🚀')
    for result in highgrade_results:
        # Fix: print each individual result; the original re-printed the
        # whole list once per element.
        print(result)
else:
    # Fix: covers both None and an empty list; the original compared with
    # `== None`, so an empty result list fell through silently.
    print("Hmmm, it doesn't look like we found anything")
    exit(-1)

botscraper.save(highgrade_scraper)
print(f"💿 > Save the model {highgrade_scraper}")
# Example #16
# Original file is located at
#     https://colab.research.google.com/drive/1RI_PvSSKJl-t3dGJNxeTIXiRXPbrbzyM

# !pip install git+https://github.com/alirezamika/autoscraper.git  (IPython/Colab magic — run in a notebook cell or shell, not plain Python)



from autoscraper import AutoScraper

# Scrape WikiCFP medical-imaging calls-for-papers for event names.
url = 'http://wikicfp.com/cfp/call?conference=medical%20imaging&page=2'
category = ['Event']

scrape = AutoScraper()
final = scrape.build(url, category)
# Fix: `print(final[])` was a SyntaxError; print the whole result list.
print(final)

# Every 5th scraped entry starting at index 6 holds an event name.
for i in range(6, len(final), 5):
  print(final[i] + '\n')

"""# **Main Code Begins Here --**"""

import pandas as pd
import numpy as np

# Seed topic lists for CFP scraping: single-word topics, two-word topics,
# the event accumulator, and the wanted-value category.
single_topics = [
    '5G', 'aerospace', 'automation', 'blockchain',
    'bussiness', 'cancer', 'economics',
]
double_topics = [['medical', 'imaging'], ['fuzzy', 'systems']]

events = []
category = ['Event']