Example #1
def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search.json')
    result = PAGINA_scraper.get_result_similar(link,
                                               unique=True,
                                               group_by_alias=True)
    return _aggregate_result(result)
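The _aggregate_result helper is not part of this snippet; the same function appears in full in Examples #15 and #22 further down. For reference, a minimal version:

def _aggregate_result(result):
    # turn {alias: [values...]} into a list of {alias: value} rows
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result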
Example #2
def getPrice():
    url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
    wanted_list = ["What are metaclasses in Python?"]

    scraper = AutoScraper()
    result = scraper.build(url, wanted_list)
    print(result)
Example #3
 def auto(self, url, model_name='1'):
     scraper = AutoScraper()
     scraper.load(model_name)
     html_ = requests.get(url)
     html_.encoding = html_.apparent_encoding
     html_ = html_.text
     data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
     return data
Example #4
async def scrape(ctx, url: str, wanted_list: str):
    # url = ''
    # wanted_list = ['']
    botscraper = AutoScraper()
    print(type(url))
    print(type(wanted_list))
    scrape_result = botscraper.build(url, wanted_list)
    print(scrape_result)
    results_message = '\r\n'.join(['BriefHub bot has found results! 🚀',
                        f"Here is what I found for * {wanted_list} * on * {url} * : {str(scrape_result)}",
                        ':-)'])
     
    await ctx.send(results_message)
Example #5
def autopan_bot():
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'
    # We can add one or multiple candidates here.
    # You can also put urls here to retrieve urls.
    wanted_list = ['High Grade', 'High-Grade']
    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    print(highgrade_results)
    if highgrade_results:
        print('BriefHub bot has found results! 🚀')
        for result in highgrade_results:
            print(result)
    elif highgrade_results is None:
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)
    botscraper.save(highgrade_scraper)
    print(f"💿 > Save the model {highgrade_scraper}")
Example #6
def autoscraper():
    link = request.json["Link"]
    global url
    url = request.json["Link"]
    wanted_list = request.json["Metodo"]
    global scraper
    scraper = AutoScraper()
    wanted_dict = {
        'url': [
            'https://www.rosario3.com/policiales/Robaron-dos-autos-de-alta-gama-de-una-concesionaria-y-los-encontraron-en-un-galpon-20201014-0080.html',
            'https://www.rosario3.com/-economia-negocios-agro-/La-inflacion-de-septiembre-fue-del-28-segun-el-Indec-20201014-0087.html',
            'https://www.rosario3.com/informaciongeneral/Coronavirus-confirmaron-el-primer-caso-de-reinfeccion-en-Rosario-20201014-0030.html'
        ]
    }
    scraper.build(url=link, wanted_dict=wanted_dict)
    result_dict = scraper.get_result_similar(link, grouped=True)

    regla = list(result_dict.keys())
    #data = get_pagina_result(url, link)
    #json_format = json.dumps(data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False)
    return regla
Example #7
 def build(self, wanted_dict=None, model_name='1'):
     """
     url2autospider
     """
     html_ = self.html
     url = self.url
     scraper = AutoScraper()
     scraper.build(html=html_, wanted_dict=wanted_dict)
     # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
     scraper.save(model_name)
Example #8
from autoscraper import AutoScraper

# Create the model
url = 'https://medium.com/@inzaniak'
wanted_list = [
    "Build a Web Scraping Python Project from Start to Finish",
    "5 things you need to learn as a Python beginner"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)
print(result)

# Save the model
scraper.save('scrapers/medium.json')

# Load the model
del scraper
scraper = AutoScraper()
scraper.load('scrapers/medium.json')
print(scraper.get_result_similar(url))
Example #9
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""

from autoscraper import AutoScraper

Scrap = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"

req_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]
Scrap_amzn = Scrap.build(amzn_url, req_list_amzn)
res_amzn = Scrap.get_result_similar(amzn_url, grouped=True)

dyk = list(res_amzn.keys())
print(dyk)
Scrap.set_rule_aliases({dyk[len(dyk) - 1]: 'Title', dyk[0]: 'Price'})
Scrap.keep_rules([dyk[len(dyk) - 1], dyk[0]])
Scrap.save('amazon-search3')
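After saving, the aliased rules can be reloaded and queried by alias, much as Example #14 does with its own saved model; a short sketch:

# reload the saved model and fetch results grouped by the aliases set above
reloaded = AutoScraper()
reloaded.load('amazon-search3')
by_alias = reloaded.get_result_similar(amzn_url, group_by_alias=True)
print(by_alias.get('Title'))
print(by_alias.get('Price'))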
Example #10
import os
import sys
import requests
from time import time as timer
from urllib.parse import urljoin, urlparse
from multiprocessing.pool import ThreadPool

from autoscraper import AutoScraper
from expression.core import pipe
from expression.collections import Seq, seq

standard_ebooks_url = "https://standardebooks.org/ebooks"

# Navigation Scraper
navigation_scraper = AutoScraper()
scraped_pages_urls = navigation_scraper.build(
    standard_ebooks_url, ["/ebooks/?page=2", "/ebooks/?page=3"])
pages_urls = Seq(scraped_pages_urls).pipe(
    seq.map(lambda page: urljoin(standard_ebooks_url, page)), )

# Page Scraper
page_scraper = AutoScraper()
books_urls = page_scraper.build(standard_ebooks_url, [
    "/ebooks/ford-madox-ford/some-do-not",
    "/ebooks/booth-tarkington/the-turmoil",
    "/ebooks/anatole-france/penguin-island/a-w-evans",
    "/ebooks/edgar-allan-poe/the-narrative-of-arthur-gordon-pym-of-nantucket"
],
                                update=True)
for page in pages_urls:
    print(page)
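The unused ThreadPool, requests, and timer imports suggest the original script went on to scrape every results page in parallel; a minimal sketch of that step, assuming page_scraper's rules generalize to the other pages:

# hypothetical continuation: apply the page scraper to each results page in parallel
page_urls_list = list(pages_urls)

def scrape_page(page_url):
    return page_scraper.get_result_similar(page_url)

start = timer()
with ThreadPool(8) as pool:
    books_per_page = pool.map(scrape_page, page_urls_list)

for page_url, books in zip(page_urls_list, books_per_page):
    print(page_url, '->', len(books), 'book links')
print(f'done in {timer() - start:.1f}s')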
Example #11
def url_data():
    about()
    st.info("This feature has limited functionality")
    url=st.text_input("Webpage URL",help="Enter a url where your data is placed")
    if url=="":
        st.info("Please enter a valid input to get started")
        st.stop()
    
    #getting data column names as user input
    column_name = st.text_input("Enter candidate column names", key="value")
    value_list = column_name.split(",")

    #getting example data values for reference
    candidate = st.text_input("Candidate example value", key="candidates", help="use ; as separator to enter another value")
    items_list = candidate.split(";")
    #st.write(items)
    
    # create object
    scraper = AutoScraper()
    # feeding for scraping
    final_result = scraper.build(url, items_list)
    # display result
    
    
    results=scraper.get_result_similar(url,grouped=True,keep_order=True)
    result={}
    for key,value in results.items():
        if value not in result.values():
            result[key]=value
            
    orient_df=pd.DataFrame.from_dict(result,orient="index")
    df=orient_df.transpose()
    
    df.columns=value_list
    df.fillna(value=float("nan"), inplace=True)  # pd.np was removed in newer pandas; plain NaN keeps the original behavior
    st.write(df)
    
    cols=df.columns.tolist()
    col1,col2=st.beta_columns(2)
 
    target=col1.selectbox("Select Target", cols,key="target")


    
    typelist=['binary','multiclass','regression','time series regression','time series multiclass','time series binary']
    p_type=col2.selectbox("Select problem type",typelist,key="p_type")     
    st.write("hey")
    x=df.drop(columns=target)
    y=df[target]
    x_train,x_test,y_train,y_test=evalml.preprocessing.split_data(x,y,problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()


    rank=automl.rankings

#checking best pipeline     ###############################################################

    best_pipeline=automl.best_pipeline
    description=automl.describe_pipeline(automl.rankings.iloc[0]["id"])

### Optimize the code


### Evaluate on hold out data
    problem_list=['binary','time series binary']
    problem_list2=['multiclass','time series multiclass']

    cola,col_b,colc=st.beta_columns(3)
    
    if p_type in problem_list:
        objective=col_b.selectbox("select objective",objectives().binary_obj,key="objective selector")  
        best_pipeline.score(x_test, y_test, objectives=["auc","f1","Precision","Recall"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['f1', 'precision'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict_proba(x_test).to_dataframe()


# for multiclass type problem
    elif p_type in problem_list2:
        objective=col_b.selectbox("select objective",objectives().multiclass_obj,key="objective selector") 
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass","MCC multiclass","accuracy multiclass"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict(x_test).to_series()

    
# for regression type problems
    else:
        objective=col_b.selectbox("select objective",objectives().regression_obj,key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2","MSE","MAE","Root Mean Squared Error"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['Root Mean Squared Error', 'MSE','MAE'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        tunned_pipeline.fit(x_train,y_train)

        pred=tunned_pipeline.predict(x_test).to_series()
                
    file=open("model_details.txt","w")
    str_dict=repr(tunned_description)
    file.write(str_dict)
    file.close()
    def get_binary_file_downloader_html(bin_file, file_label='File'):
            with open(bin_file, 'rb') as f:
                data = f.read()
                bin_str = base64.b64encode(data).decode()
                href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
                return href                
    col1,col2,col3=st.beta_columns([1,1,1])        
    if col2.button("Predict Results",key="output",help="shows results"):
            st.spinner()
            with st.spinner(text='In progress'):
                 st.info("Wait while we are selecting a best algoritham for your problem..Hold your breath.")
                 time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)

    col11,col12=st.beta_columns([3,1])
    with col11:
        with st.beta_expander("Compare Models"):
                st.write(tunned_rankings)
        
    with col12:
        with st.beta_expander("Best Pipeline"):
                st.success(tunned_pipeline)
                st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
Example #12
#!/usr/bin/env python
# coding: utf-8


from autoscraper import AutoScraper
amazon_url = "https://www.amazon.in/s?i=aps&k=iphone"

wanted_list = ["New Apple iPhone 12 Pro Max (128GB) - Pacific Blue","₹1,25,900"]

scraper = AutoScraper()

result = scraper.build(amazon_url,wanted_list)

print(scraper.get_result_similar(amazon_url,grouped=True))




scraper.set_rule_aliases({"rule_1943":"Title",
                          "rule_1gc6":"MRP"})

scraper.keep_rules(["rule_1943","rule_1gc6"])
scraper.save("amazon_search")






Example #13
from autoscraper import AutoScraper
import pandas as pd

pd.set_option('display.max_rows', None)

tickers = ['SCHB', 'AMZN', 'AAPL', 'MSFT', 'TSLA', 'AMD', 'NFLX']

scraper = AutoScraper()
scraper.load('../finviz_table')

for ticker in tickers:
    url = f'https://finviz.com/quote.ashx?t={ticker}'
    result = scraper.get_result(url)[0]

    index = result.index('Index')
    df = pd.DataFrame(zip(result[index:], result[:index]),
                      columns=['Attributes', 'Values'])

    print(f'\n{ticker} Data: ')
    print(df.set_index('Attributes'))
Example #14
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:29:21 2021

@author: win10
"""
from autoscraper import AutoScraper
from flask import Flask, request

amazon_scraper = AutoScraper()
amazon_scraper.load('amazon-search')
app = Flask(__name__)


def get_amazon_result(search_query):
    url = 'https://www.amazon.in/s?k=%s' % search_query
    result = amazon_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    print(list(result.values())[0])
    for i in range(len(list(result.values())[0])):
        try:
            final_result.append({alias: result[alias][i] for alias in result})
        except Exception:
            # some aliases may have fewer results than others; skip incomplete rows
            pass
    return final_result
Example #15
from autoscraper import AutoScraper
from flask import Flask, request

ebay_scraper = AutoScraper()
etsy_scraper = AutoScraper()
ebay_scraper.load('ebay-search')
etsy_scraper.load('etsy-search')
app = Flask(__name__)


def get_ebay_result(search_query):
    url = 'https://www.ebay.com/sch/i.html?_nkw=%s' % search_query
    result = ebay_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def get_etsy_result(search_query):
    url = 'https://www.etsy.com/search?q=%s' % search_query
    result = etsy_scraper.get_result_similar(url, group_by_alias=True)
    result['url'] = [
        f'https://www.etsy.com/listing/{i}' for i in result['url']
    ]
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result
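The snippet ends before any route is defined; a minimal sketch of how these helpers might be exposed through the Flask app (the route path and query parameters are assumptions, not part of the original):

@app.route('/', methods=['GET'])
def search():
    # hypothetical endpoint: /?q=<query>&site=ebay|etsy
    query = request.args.get('q', '')
    site = request.args.get('site', 'ebay')
    if site == 'etsy':
        return {'results': get_etsy_result(query)}
    return {'results': get_ebay_result(query)}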
Example #16
from flask import Flask, render_template, request
from autoscraper import AutoScraper
import pandas as pd
import time

app = Flask(__name__)

#creating object and loading
amazon_scraper = AutoScraper()
amazon_scraper.load('amazon_in.json')


@app.route("/", methods=['GET'])
def home():

    #when user search it
    if request.args.get('search'):
        #inputs
        search = request.args.get('search')
        sortby = request.args.get('sortby', 'relevanceblender')

        #call function to retrieve data
        search_data, original_url = searchquery(search, sortby)
        data_length = len(search_data)

        #show to user
        return render_template("index.html",
                               data={
                                   'original_url': original_url,
                                   'query': search,
                                   'sortby': sortby,
Example #17
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""

from autoscraper import AutoScraper
amazon_url="https://www.amazon.in/s?k=iphones"

wanted_list=["₹58,400","New Apple iPhone 11 (128GB) - Black"]

scraper=AutoScraper()
result=scraper.build(amazon_url,wanted_list)

print(scraper.get_result_similar(amazon_url,grouped=True))



url = "https://app.hellobonsai.com/time_entries"

jar = cookielib.CookieJar()

with open("cookies.json") as f:
    cookies_dict = json.load(f)

for cookie_dict in cookies_dict:
    add_dict_to_cookiejar(jar, cookie_dict)

s = requests.Session()
s.cookies = jar

cookies_dict = requests.utils.dict_from_cookiejar(s.cookies)

scraper = AutoScraper()

wanted_list = ["Unbilled"]


result = scraper.build(url, wanted_list, request_args={"cookies": s.cookies})

print(result)

set_trace()



# We can add one or multiple candidates here.
# You can also put urls here to retrieve urls.
# wanted_list = ["What are metaclasses in Python?"]
Example #19
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""

from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"

wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

print(scraper.get_result_similar(amazon_url, grouped=True))

scraper.set_rule_aliases({'rule_io1c': 'Title', 'rule_hm52': 'Price'})
scraper.keep_rules(['rule_io1c', 'rule_hm52'])
scraper.save('AmazonIn-search')
Example #20
# pip install autoscraper

from autoscraper import AutoScraper

url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'

wanted_list = ['How to check version of python modules?']

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

for res in result:
    print(res)
""" 
How to execute a program or call a system command from Python
What are metaclasses in Python?
Does Python have a ternary conditional operator?
Convert bytes to a string
Does Python have a string 'contains' substring method?
How to check version of python modules?
"""
Example #21
from autoscraper import AutoScraper

print('started scraping...')

data_dir = '/home/pi/Desktop/scraper/'
#scrapeRecord = os.path.join(data_dir,'scrapeRecord.csv') # location to save records of file send attempts for troubleshooting

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
wanted_list = [urlex]  # this is the simplest search type: just a one-page input

MainLink = [
    ('https://www.etsy.com/search?q=cosplay%20fire', [
        'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
    ]),
]
scraperMain = AutoScraper()  #define a new scraper object
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
    #scraperMain.build(urlMain, wanted_list) #build the contents of that scraper
scraperMain.save('etsyMain')  # saves this particular build of the scraper locally, so you can load it later without regenerating every time

#Build a new batch of features to collect. Best to collect them separately so they don't cross wires!
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]
scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl, wanted_list=wanted_list)
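Presumably each of these scrapers is then saved the same way scraperMain was, since Example #26 later creates scrapers with matching names and loads them from disk; a short, hedged sketch:

# assumed continuation: persist the favorites scraper for later reuse
scraperitemFavorites.save('scraperitemFavorites')

# quick sanity check against the example listing (same URL as above)
print(scraperitemFavorites.get_result_similar(urlex))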
Example #22
import json

from autoscraper import AutoScraper
from flask import Flask, request, render_template

PAGINA_scraper = AutoScraper()
PAGINA_scraper.load('./rosario3-search')
app = Flask(__name__)


def get_pagina_result(url, link):
    PAGINA_scraper = AutoScraper()
    PAGINA_scraper.load('./' + url + '-search')
    result = PAGINA_scraper.get_result_similar(link, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    for i in range(len(list(result.values())[0])):
        final_result.append({alias: result[alias][i] for alias in result})
    return final_result


@app.route('/AutoScraper', methods=['GET'])
def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]
    scraper = AutoScraper()
Example #23
from autoscraper import AutoScraper

# AutoScraper must be installed with
#  pip install git+https://github.com/alirezamika/autoscraper.git

question = "france"
time = "year"
url = f"https://www.quora.com/search?q={question}&time={time}"
model_name = "model_quora"

scraper = AutoScraper()
scraper.load(f"./{model_name}")
results = scraper.get_result_similar(url)

# print the results if any were found
if results:
    for r in results:
        print(r)
else:
    print("No result found")
Example #24
def autoscraper(Link=None, Metodo=None):
    url = request.args["Link"]
    link = request.args["Link"]
    Metodo = request.args["Metodo"]
    wanted_list = [Metodo]
    scraper = AutoScraper()
    scraper.build(link, wanted_list)
    result_dict = scraper.get_result_exact(link, unique=False, grouped=True)
    regla = list(result_dict.keys())[0]
    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])
    url = (url.replace("http:", "").replace("//", "").replace(".", "")
              .replace("www", "").replace("https:", "").replace("/", "")
              .replace("\n", "").replace("-", ""))
    scraper.save(url + '-search')
    data = get_pagina_result(url, link)
    json_format = json.dumps(data,
                             indent=4,
                             separators=(',', ': '),
                             sort_keys=True,
                             ensure_ascii=False)
    return json_format
Example #25

from autoscraper import AutoScraper

# Parameters
url = "https://www.quora.com/search?q=deep%20learning&time=year"
model_name = "model_quora"

wanted_list = ["When will deep learning finally die out?"]

# We instantiate the AutoScraper
scraper = AutoScraper()

# We train the Scraper
# Here we can also pass html content via the html parameter instead of the url (html=html_content)
result = scraper.build(url, wanted_list)

# We display the results, if any
if result:
    print("🚀 Great, a query has been inferred!! Good job.")
    print(result)

# If there is no result, we exit with an error code
if result is None:
    print("Sorry, no query can be inferred ... 😿")
    exit(-1)

# We save the model for future use
print(f"💿 > Save the model {model_name}")
scraper.save(model_name)
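As the comment above notes, build() can also be trained from raw HTML rather than a URL; a minimal sketch of that variant (fetching the page with requests is an assumption, not part of the original):

import requests

# same training examples, but passed as pre-fetched HTML
html_content = requests.get(url).text
scraper_from_html = AutoScraper()
print(scraper_from_html.build(html=html_content, wanted_list=wanted_list))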
Example #26
#!/usr/bin/env python3
import os

from autoscraper import AutoScraper
# python3 -m pip install autoscraper
# sudo chmod a+rwxX /home/pi/Desktop/scraper/
# sudo python3 /home/pi/Desktop/scraper/AutoScrapeRun.py

data_dir = '/home/pi/Desktop/scraper/'
scrapeRecord = os.path.join(data_dir,'Results.txt') # location to save records of file send attempts for troubleshooting

scraperitemFavorites = AutoScraper()
scrapershopSales = AutoScraper()
scraperlastSale = AutoScraper()
scraperbestSeller = AutoScraper()
scraperstars = AutoScraper()
scraperitemReviews = AutoScraper()
scrapershopReviews = AutoScraper()
scrapershopAge = AutoScraper()
#scrapertotalItems = AutoScraper()
scraperprice = AutoScraper()

scrapershopSales.load('scrapershopSales')
scraperlastSale.load('scraperlastSale')
scraperbestSeller.load('scraperbestSeller')
scraperstars.load('scraperstars')
scraperitemReviews.load('scraperitemReviews')
scrapershopReviews.load('scrapershopReviews')
scrapershopAge.load('scrapershopAge')
#scrapertotalItems.load('scrapertotalItems')
scraperprice.load('scraperprice')
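The snippet ends after loading the saved models; a hedged sketch of how they might then be applied to a listing and logged to the Results.txt path defined above (the listing URL is borrowed from Example #21 and is only an assumption):

listing_url = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable'

# apply two of the loaded scrapers to the listing page
price = scraperprice.get_result_similar(listing_url)
stars = scraperstars.get_result_similar(listing_url)

# append one tab-separated record per run for troubleshooting
with open(scrapeRecord, 'a') as f:
    f.write(f'{listing_url}\t{price}\t{stars}\n')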
Example #27
from autoscraper import AutoScraper

url = 'https://www.rosario3.com/'

wanted_list = [
    "/especiales/Club-de-Lectura-Brandon-Sanderson-es-mejor-que-J.-R.-R.-Tolkien-20200909-0043.html"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

result_dict = scraper.get_result_exact(url, unique=False, grouped=True)

regla = list(result_dict.keys())[0]
scraper.set_rule_aliases({regla: 'regla'})

scraper.keep_rules([regla])

scraper.save('rosario3-search')
Example #28
from autoscraper import AutoScraper
from flask import Flask, request, escape

flipkart_scraper = AutoScraper()
flipkart_scraper.load('flipkart-search')
app = Flask(__name__)


def get_flipkart_result(search_query):
    url = 'https://www.flipkart.com/search?q=%s&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off' % search_query

    result = flipkart_scraper.get_result_similar(url, group_by_alias=True)
    return _aggregate_result(result)


def _aggregate_result(result):
    final_result = []
    print(list(result.values())[0])
    print(list(result.values())[1])
    for alias in result:
        print(alias)
    for i in range(len(list(result.values())[0])):
        try:
            final_result.append({alias: result[alias][i] for alias in result})
        except Exception:
            # some aliases may have fewer results than others; skip incomplete rows
            pass
    return final_result


@app.route('/', methods=['GET'])
Example #29
async def test_async_autoscraper():
    scraper = AutoScraper()
    scraper.use_async = True
    result = scraper.build(url, wanted_list)
    print(result)
Example #30
from autoscraper import AutoScraper
import pandas as pd

"""# Defining url and search word"""

search = "iphone+11+silver"
amazon_url="https://www.amazon.in/s?k={}&s=price-desc-rank".format(search)
print(amazon_url)

"""# Defining what data I want """

wanted_list=["https://m.media-amazon.com/images/I/71umuN8XVeL._AC_UY218_.jpg","New Apple iPhone 12 Pro Max (128GB) - Silver","1,25,900","501"]

"""# Creating scraper object"""

scraper=AutoScraper()
result=scraper.build(amazon_url,wanted_list)

"""# Finding similar data"""

data = scraper.get_result_similar(amazon_url,grouped=True)
print(data)

keys = list(data.keys())
print(keys)

"""# Defining alias"""

scraper.set_rule_aliases({str(keys[0]):'ImageUrl',str(keys[2]):'Title',str(keys[-2]):'Price',str(keys[-1]):'Reviews'})

scraper.save("amazon_in.json")