Code Example #1
File: api_server.py  Project: ezeellena/AutoScraper
import json
import re

from autoscraper import AutoScraper
from flask import request


def autoscraper():
    link = request.args["Link"]
    metodo = request.args["Metodo"]
    wanted_list = [metodo]

    scraper = AutoScraper()
    scraper.build(link, wanted_list)

    # Keep only the first inferred rule and alias it as 'regla'.
    results = scraper.get_result_exact(link, unique=False, grouped=True)
    regla = list(results.keys())[0]
    scraper.set_rule_aliases({regla: 'regla'})
    scraper.keep_rules([regla])

    # Reduce the URL to a file-name-safe slug for the saved model.
    slug = re.sub(r'https?:|www|[./\n-]', '', link)
    scraper.save(slug + '-search')

    # get_pagina_result is defined elsewhere in the project.
    data = get_pagina_result(slug, link)
    return json.dumps(data,
                      indent=4,
                      separators=(',', ': '),
                      sort_keys=True,
                      ensure_ascii=False)
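
get_pagina_result is not shown in this file. A minimal sketch of what such a helper might look like, assuming it simply reloads the model saved above and queries it by its alias (this body is a guess, not the project's actual code):

def get_pagina_result(slug, link):
    # Hypothetical helper: reload the model saved above and query it.
    scraper = AutoScraper()
    scraper.load(slug + '-search')
    return scraper.get_result_exact(link, group_by_alias=True)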
Code Example #2
def build(self, wanted_dict=None, model_name='1'):
    """Build an AutoScraper model from this page's HTML and save it.

    Method excerpt: self.html and self.url are set by the enclosing class.
    """
    html_ = self.html
    url = self.url
    scraper = AutoScraper()
    scraper.build(html=html_, wanted_dict=wanted_dict)
    # data = scraper.get_result_similar(url, html=html_, group_by_alias=True)
    scraper.save(model_name)
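
The method builds and saves the spider without reading results back; the commented-out line hints at how that would look. A short illustrative sketch of consuming the saved model (page_url is a placeholder, and '1' is the default model_name above):

# Illustrative usage, not part of the original class:
scraper = AutoScraper()
scraper.load('1')  # the default model_name used above
data = scraper.get_result_similar(page_url, group_by_alias=True)  # page_url: the page to scrape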
Code Example #3
from autoscraper import AutoScraper


def autopan_bot():
    url = 'https://www.juniorminingnetwork.com/mining-topics/topic/drill-results.html'
    highgrade_scraper = 'high grade scraper'
    # One or more candidate strings can go here.
    # URLs can also be used as candidates to retrieve URLs.
    wanted_list = ['High Grade', 'High-Grade']
    botscraper = AutoScraper()
    highgrade_results = botscraper.build(url, wanted_list)
    if highgrade_results:
        print('BriefHub bot has found results! 🚀')
        for result in highgrade_results:
            print(result)
    else:
        print("Hmmm, it doesn't look like we found anything")
        exit(-1)
    botscraper.save(highgrade_scraper)
    print(f"💿 > Saved the model {highgrade_scraper}")
Code Example #4
from autoscraper import AutoScraper

url = 'https://www.rosario3.com/'

wanted_list = [
    "/especiales/Club-de-Lectura-Brandon-Sanderson-es-mejor-que-J.-R.-R.-Tolkien-20200909-0043.html"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)

results = scraper.get_result_exact(url, unique=False, grouped=True)

# The first inferred rule id becomes the canonical rule, aliased as 'regla'.
regla = list(results.keys())[0]
scraper.set_rule_aliases({regla: 'regla'})

scraper.keep_rules([regla])

scraper.save('rosario3-search')
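
Once saved, the model can be reloaded and queried under the 'regla' alias. A minimal sketch of that round trip (the load path matches the save call above):

from autoscraper import AutoScraper

scraper = AutoScraper()
scraper.load('rosario3-search')
# Results come back keyed by the 'regla' alias set above.
print(scraper.get_result_exact('https://www.rosario3.com/', group_by_alias=True))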
Code Example #5
from autoscraper import AutoScraper

# Parameters
url = "https://www.quora.com/search?q=deep%20learning&time=year"
model_name = "model_quora"

wanted_list = ["When will deep learning finally die out?"]

# Instantiate the AutoScraper
scraper = AutoScraper()

# Train the scraper
# HTML content can also be passed via the html parameter instead of the url (html=html_content)
result = scraper.build(url, wanted_list)

# Display the results, if any
if result:
    print("🚀 Great, a query has been inferred! Good job.")
    print(result)
else:
    # No result: exit with an error code
    print("Sorry, no query could be inferred ... 😿")
    exit(-1)

# Save the model for future use
print(f"💿 > Saving the model {model_name}")
scraper.save(model_name)
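
Since the learned rules often generalize to other query URLs on the same site, the saved model can be pointed at a different Quora search; a short sketch (the second URL is illustrative):

# Sketch: reuse the saved model on another search query.
scraper = AutoScraper()
scraper.load(model_name)
print(scraper.get_result_similar("https://www.quora.com/search?q=machine%20learning&time=year"))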

Code Example #6
from autoscraper import AutoScraper

urlMain = 'https://www.etsy.com/search?q=cosplay%20fire'
urlex = 'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
# This is the simplest search type: just a one-page input.
wanted_list = [urlex]

MainLink = [
    ('https://www.etsy.com/search?q=cosplay%20fire', [
        'https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-2&organic_search_click=1&frs=1&col=1'
    ]),
]
scraperMain = AutoScraper()  # define a new scraper object
for targetUrl, wanted_list in MainLink:
    scraperMain.build(url=targetUrl, wanted_list=wanted_list)
    # scraperMain.build(urlMain, wanted_list)  # build the contents of that scraper
# Saves this particular build of the scraper. (Note: this is a local file,
# so you can load it without having to regenerate it every time!)
scraperMain.save('etsyMain')

# Build a new batch of features to collect. Best to collect them separately
# from each other so they don't cross wires!
itemFavorites = [
    ('https://www.etsy.com/listing/674681682/fire-ice-cosplay-light-up-led-wearable?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=cosplay+fire&ref=sr_gallery-1-3&organic_search_click=1&frs=1',
     ['1525 favorites']),
]
scraperitemFavorites = AutoScraper()
for targetUrl, wanted_list in itemFavorites:
    scraperitemFavorites.build(url=targetUrl,
                               wanted_list=wanted_list)  # , update=True)  # grouped=True)
scraperitemFavorites.save('scraperitemFavorites')

shopSales = [
Code Example #7
from autoscraper import AutoScraper

# Create the model
url = 'https://medium.com/@inzaniak'
wanted_list = [
    "Build a Web Scraping Python Project from Start to Finish",
    "5 things you need to learn as a Python beginner"
]

scraper = AutoScraper()
result = scraper.build(url, wanted_list)
print(result)

# Save the model
scraper.save('scrapers/medium.json')

# Load the model
del scraper
scraper = AutoScraper()
scraper.load('scrapers/medium.json')
print(scraper.get_result_similar(url))
Code Example #8
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 25 22:30:42 2021

@author: Nikhil Reddy
"""

from autoscraper import AutoScraper

scraper = AutoScraper()

amzn_url = "https://www.amazon.in/s?k=iphones"

wanted_list_amzn = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]
scraper.build(amzn_url, wanted_list_amzn)
res_amzn = scraper.get_result_similar(amzn_url, grouped=True)

# Inspect the inferred rule ids; here the last one holds the titles and
# the first one the prices.
rule_ids = list(res_amzn.keys())
print(rule_ids)
scraper.set_rule_aliases({rule_ids[-1]: 'Title', rule_ids[0]: 'Price'})
scraper.keep_rules([rule_ids[-1], rule_ids[0]])
scraper.save('amazon-search3')
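
With the Title and Price aliases in place, grouped results can be read back by alias; a minimal sketch:

results = scraper.get_result_similar(amzn_url, group_by_alias=True)
print(results['Title'])  # product titles
print(results['Price'])  # matching prices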
Code Example #9
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 24 12:25:41 2021

@author: win10
"""

from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?k=iphones"

wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

# Inspect the inferred rules; the rule ids below were read off this printed
# output and will differ between builds.
print(scraper.get_result_similar(amazon_url, grouped=True))

scraper.set_rule_aliases({'rule_io1c': 'Title', 'rule_hm52': 'Price'})
scraper.keep_rules(['rule_io1c', 'rule_hm52'])
scraper.save('AmazonIn-search')
Code Example #10
#!/usr/bin/env python
# coding: utf-8


from autoscraper import AutoScraper

amazon_url = "https://www.amazon.in/s?i=aps&k=iphone"

wanted_list = ["New Apple iPhone 12 Pro Max (128GB) - Pacific Blue", "₹1,25,900"]

scraper = AutoScraper()

result = scraper.build(amazon_url, wanted_list)

print(scraper.get_result_similar(amazon_url, grouped=True))

scraper.set_rule_aliases({"rule_1943": "Title",
                          "rule_1gc6": "MRP"})

scraper.keep_rules(["rule_1943", "rule_1gc6"])
scraper.save("amazon_search")
Code Example #11
import pandas as pd
from autoscraper import AutoScraper

# Assumed setup: this snippet is a notebook excerpt, and amazon_url and
# wanted_list are defined earlier, along the lines of the examples above.
amazon_url = "https://www.amazon.in/s?k=iphones"
wanted_list = ["₹58,400", "New Apple iPhone 11 (128GB) - Black"]

scraper = AutoScraper()
result = scraper.build(amazon_url, wanted_list)

# Finding similar data
data = scraper.get_result_similar(amazon_url, grouped=True)
print(data)

keys = list(data.keys())
print(keys)

# Defining aliases for the inferred rules
scraper.set_rule_aliases({keys[0]: 'ImageUrl', keys[2]: 'Title',
                          keys[-2]: 'Price', keys[-1]: 'Reviews'})

scraper.save("amazon_in.json")

# Testing with another search word
amazon_scraper = AutoScraper()
amazon_scraper.load('amazon_in.json')

search = "samsung phones"
amazon_url = "https://www.amazon.in/s?k={}&s=price-desc-rank".format(search)

data = amazon_scraper.get_result_similar(amazon_url, group_by_alias=True)
search_data = tuple(zip(data['Title'], data['ImageUrl'], data['Price'], data['Reviews']))

df = pd.DataFrame(columns=['Query', 'Title', 'Price', 'Reviews', 'ImageUrl'])
for i in range(len(search_data)):
    df.loc[len(df)] = [search, search_data[i][0], search_data[i][2],
                       search_data[i][3], search_data[i][1]]
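
For reference, the same frame can be built in one step instead of row by row; a small sketch under the same column layout (the CSV path is illustrative):

# Equivalent one-shot construction of the same data.
df = pd.DataFrame(search_data, columns=['Title', 'ImageUrl', 'Price', 'Reviews'])
df.insert(0, 'Query', search)
df.to_csv('amazon_search_results.csv', index=False)  # illustrative output path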