Example #1
def scrapper(urlToRequest, firstPage, lastPage, fileName, openMode,
             newlineDelimiter):
    global pagesScrapped
    #Opening a csv file to write data in it
    with open(fileName, openMode, newline=newlineDelimiter) as csv_file:
        writer = csv.writer(csv_file)

        # For every page from firstPage up to (but not including) lastPage
        for page in range(firstPage, lastPage):
            uClient = urlReq(urlToRequest + "page/" + str(page))
            page_html = uClient.read()
            uClient.close()

            #Initializing the page soup to scrape data
            page_soup = soup(page_html, "html.parser")
            containers = page_soup.findAll("h2", {"class": "title"})

            # Show progress and open each game's own page to get its RAM requirement and setup size
            for container in containers:
                pagesScrapped += 1
                print("\rScrapped pages: " + str(pagesScrapped), end='')
                uClientGamePage = urlReq(
                    urlToRequest +
                    "-".join(container.a.text.split(" ")).lower().replace(
                        "'", "").replace('"', '').replace("’", ""))
                game_page_html = uClientGamePage.read()
                uClientGamePage.close()

                # All data that is to be scraped
                try:
                    game_page_soup = soup(game_page_html, "html.parser")
                    ramReq = game_page_soup.find(text=re.compile("^RAM"))
                    setupSize = game_page_soup.find(
                        text=re.compile("^Setup Size"))
                    cpuReq = game_page_soup.find(text=re.compile("^CPU"))
                    OSReq = game_page_soup.find(
                        text=re.compile("^Operating System"))
                    genres = game_page_soup.findAll("li",
                                                    {"class": "active-parent"})

                    genre_string = ""
                    for genre in genres:
                        genre_string += genre.a.text + " "

                    if setupSize is None:
                        setupSize = "Setup Size: NA"
                    if ramReq is None:
                        ramReq = "RAM: NA"

                    writer.writerow([
                        " ".join(container.a.text.split(" ")[:-2]),
                        ramReq.split(":")[1],
                        setupSize.split(":")[1],
                        cpuReq.split(":")[1],
                        OSReq.split(":")[1], genre_string
                    ])
                except AttributeError:
                    print("\nValue error in game: " +
                          " ".join(container.a.text.split(" ")[:-2]))
Example #2
def tellWeather():
    try:
        speak(weatherDict[random.randint(1, len(weatherDict))])
        uClient = urlReq(weather_url)
        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")
        tempValue = page_soup.find("div", {"class": "today_nowcard-temp"})
        currentTemperature = tempValue.span.text
        sideTable = page_soup.find(
            "div", {
                "class": "today_nowcard-sidecar component panel"
            }).table.tbody.tr.td.span
        wind = sideTable.text
        windDirection = wind.split()[0]
        windSpeed = wind.split()[1] + wind.split()[2]

        speak("It's " + currentTemperature +
              "C currently with a wind speed of " + windSpeed +
              " in the direction " + windDirectionTable[windDirection])
    except urllib.error.HTTPError:
        speak(
            "It seems the weather server is ignoring me. Can you try this later?"
        )
Example #3
def allrecipe_search(ingredients):
    recipeDict = {}
    # Search for a list of recipes on allrecipes.com
    searchUrl = "https://www.allrecipes.com/search/results/?wt=" + ingredients + "&sort=re"
    recipeUrlList = urlReq(searchUrl)
    htmlRaw = recipeUrlList.read()
    recipeUrlList.close()
    soup = BeautifulSoup(htmlRaw, "html.parser")
    # Get a list of recipes from main search result page
    recipeBook = []
    recipeUrlList = []
    recipeBlockContainer = soup.findAll("article",
                                        {"class": "fixed-recipe-card"})
    for recipeBlock in recipeBlockContainer:
        recipeUrl = recipeBlock.div.a['href']
        recipeUrlList.append(recipeUrl)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = [
            executor.submit(get_recipe_list, recipeUrl)
            for recipeUrl in recipeUrlList
        ]
        for f in concurrent.futures.as_completed(results):
            recipeBook.append(f.result())
    recipeDict['allrecipes'] = recipeBook

    return recipeDict
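The thread-pool pattern used above (submit one download job per URL, then collect results with as_completed) shown in isolation; fetch() below is a stand-in for get_recipe_list() and the URL list is a placeholder.

import concurrent.futures
from urllib.request import urlopen as urlReq

def fetch(url):
    # Stand-in for get_recipe_list(): download the page and return its size in bytes
    client = urlReq(url)
    try:
        return len(client.read())
    finally:
        client.close()

urls = ["https://www.allrecipes.com/"]  # placeholder list of recipe URLs
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(fetch, url) for url in urls]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())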
Example #4
def Wpvuldb_Api(data):
    # Matches when data is a Wordpress version number
    regexVer = re.compile(r"^(\d{1,3}\.\d{0,3}\.{0,1}\d{0,3})")

    if regexVer.match(data):
        # The query is about Wordpress vulnerabilities
        base = "https://wpvulndb.com/api/v3/wordpresses/"
        url = base + data.replace(".", "")
    else:
        # The query is about plugin vulnerabilities (a matching regex could be
        # written, but if "data" isn't a WP version it is treated as a plugin name (slug))
        base = "https://wpvulndb.com/api/v3/plugins/"
        url = base + data

    token = ""
    authHeader = {"Authorization": "Token token=" + token}
    apiReq = Request(url, headers=authHeader)
    siteResponse = urlReq(apiReq).read()
    jsonResponse = json.loads(siteResponse.decode("ISO-8859-1"))
    listVuln = jsonResponse[data]["vulnerabilities"]

    if not listVuln:
        print("\n\tThere are no known vulnerabilities yet")

    for vuln in listVuln:
        vulName = vuln.get("title")
        vulType = vuln.get("vuln_type")
        vulDate = vuln.get("created_at")
        vulRef = vuln["references"].get("url")
        vulCVE = vuln["references"].get("cve")

        print("\n\n\tVulnerability name: " + vulName + "\n\tType: " + vulType +
              "\n\tDate: \
" + vulDate + "\n\tReferences: " + str(vulRef) + "\n\tCVE: " + str(vulCVE))
def getRecipeList(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    req = Request(recipeUrl, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlReq(req)
    htmlRaw = uClient.read()
    uClient.close()

    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")

    recipeCard = {}

    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl

    # Find title and store into recipe card
    recipeTitle = soup.find("h1", {"class": "entry-title"}).text
    recipeCard['title'] = recipeTitle

    # Find image URL and store it in the recipe card.
    imageContainer = soup.find("div", {"class": "featured-image"})
    imageUrl = imageContainer.img['src']
    recipeCard['image'] = imageUrl

    # Return single recipe
    return recipeCard
def getRecipeList(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()

    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")
    recipeCard = {}

    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl

    # Find title and store into recipe card
    recipeTitle = soup.find("span", {
        "class": "o-AssetTitle__a-HeadlineText"
    }).text
    recipeCard['title'] = recipeTitle

    # Find star rating and store into recipe card
    recipe_stars = soup.find("span", {"class": "gig-rating-stars"})["title"]
    recipeCard['stars'] = recipe_stars

    # Find image URL and store it in the recipe card.
    recipeHeader = soup.find("div", {"class": "recipe-lead"})
    imageContainer = recipeHeader.find(
        "img", {"class": "m-MediaBlock__a-Image a-Image"})
    if imageContainer:
        imageUrl = imageContainer['src']
        recipeCard['image'] = "http:" + imageUrl
    else:
        recipeCard['image'] = "https://i.imgur.com/bvzLAyR.jpg"

    # Return single recipe
    return recipeCard
Example #7
def searchInNewegg(searchString, blockedWord, searchPageDepth, sortPreference,
                   currency):
    searchString = searchString.replace(' ', '+')
    results = []
    currentPage = 1
    datetime = date.datetime.now()
    while currentPage <= searchPageDepth:
        if currentPage != 0:
            if currentPage <= (searchPageDepth + 1):
                urlSite = "https://www.newegg.com/p/pl?d=" + searchString + "&Page=" + str(
                    currentPage)
                webSite = urlReq(urlSite)
                html = webSite.read()
                webSite.close()
                page_soup = soup(html, 'html.parser')
        itemsWholeGrid = page_soup.find('div', {'class': 'items-view is-grid'})
        itemsWhole = itemsWholeGrid.findAll('div', {'class': 'item-container'})
        for item in itemsWhole:

            def itemAnalysis():
                #print('--------------------------------')
                text = item.find('div', {'class': 'item-info'})
                name = str(text.find('a', {'class': 'item-title'}).text)
                price = str(text.find('li',
                                      {'class': 'price-current'
                                       }))[78:85].strip('</strong>').replace(
                                           ',', '')
                try:
                    discount = str(
                        text.find('span', {
                            'class': 'price-save-percent'
                        }).text).strip('%')
                except Exception:
                    #print('discount not found')
                    discount = 0
                itemNumber = str(len(results) + 1)
                link = str(text.find(
                    'a', {'class': 'item-title'
                          })['href']).partition('?')[0].strip('https://')

                results.append((str(itemNumber), str(price), name, link,
                                str(discount), str(datetime), neweggDBPK))

                #print("item #"+ itemNumber +": "+ name +" $"+ price + ' OFF: '+ discount )

            bWordFound = 0
            for bWord in blockedWord:
                if bWord in str(item):
                    bWordFound += 1
            if bWordFound == 0:
                itemAnalysis()
        currentPage = currentPage + 1
    print('results in NewEgg :' + str(len(results)))
    if sortPreference == 'Increasing':
        return sortResults.sortIncreasing(results)
    if sortPreference == 'Decreasing':
        return sortResults.sortDecreasing(results)
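A hypothetical call to searchInNewegg(), assuming the module-level names it references (soup, urlReq, date, sortResults, neweggDBPK) are defined; every argument below is a placeholder.

# Search the first two result pages, skip items whose HTML mentions 'refurbished',
# and return the results sorted in increasing order
items = searchInNewegg('rtx 3060', ['refurbished'], 2, 'Increasing', 'USD')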
def get_page_html(self):
    try:
        url_client = urlReq(self.url)
        page_html = url_client.read()
        url_client.close()
        return page_html
    except URLError as e:
        return 'URLError = ' + str(e)
def get_simply_recipe(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    req = Request(recipeUrl, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = urlReq(req)
    htmlRaw = uClient.read()
    uClient.close()

    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")

    recipeCard = {}

    # Store site name into recipe card
    recipeCard['siteName'] = "Simply Recipes"

    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl

    # Find title and store into recipe card
    recipeTitle = soup.find("h1", {"class": "entry-title"}).text
    recipeCard['title'] = recipeTitle

    # Find image URL and store it in the recipe card.
    imageContainer = soup.find("div", {"class": "featured-image"})
    imageUrl = imageContainer.img['src']
    recipeCard['image'] = imageUrl

    # Find metadata of the recipe
    metadataAry = []
    recipePrep = soup.find("li", {"class": "recipe-prep"}).text.strip()
    recipeCook = soup.find("li", {"class": "recipe-cook"}).text.strip()
    recipeYield = soup.find("li", {"class": "recipe-yield"}).text.strip()
    metadataAry.append(recipePrep)
    metadataAry.append(recipeCook)
    metadataAry.append(recipeYield)
    recipeCard['metadata'] = metadataAry

    # Find ingredients
    ingredientListAry = []
    ingredientList = soup.findAll("li", {"class": "ingredient"})
    for ingredientItem in ingredientList:
        ingredient = ingredientItem.text
        ingredientListAry.append(ingredient)
    recipeCard['ingredients'] = ingredientListAry

    # Find Instructions
    instructionsAry = []
    instructionsContainer = soup.find("div", {"id": "sr-recipe-method"})
    instructions = instructionsContainer.findAll("p")
    for instructionItem in instructions:
        instruction = instructionItem.text.strip()
        instructionsAry.append(instruction)
    recipeCard['instructions'] = instructionsAry

    # Return single recipe
    return recipeCard
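A hypothetical usage of get_simply_recipe() (the URL is a placeholder); each recipe card is a plain dict, so its fields can be printed directly.

card = get_simply_recipe("https://www.simplyrecipes.com/recipes/some_recipe/")
print(card['title'])
print(" | ".join(card['metadata']))
for ingredient in card['ingredients']:
    print("-", ingredient)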
Example #10
def getLyricsForSong(songName):
    try:
        uClient = urlReq(lyrics_url + songName + ".html")
        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("p", {"class": "verse"})

        speak("Here are the lyrics for that song\n")
        for container in containers:
            print(container.text)
    except urllib.error.HTTPError:
        speak(noLyricsMessageDict[random.randint(1, len(noLyricsMessageDict))])
Example #11
def getPrice():
    # Setting URL variables
    page_url = 'https://finance.yahoo.com/quote/FB?p=FB'
    urlClient = urlReq(page_url)
    page_html = urlClient.read()
    urlClient.close()

    # Parsing HTML
    page_soup = soup(page_html, "html.parser")

    # Grabbing current price
    curr_price = page_soup.find('div', {
        'class': 'My(6px) Pos(r) smartphone_Mt(6px)'
    }).find('span').text
    return curr_price
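The class string above is tied to Yahoo Finance's markup at the time this was written; a slightly more defensive variant (a sketch, not part of the original) returns None instead of raising when the selector no longer matches.

def getPriceSafe():
    page_url = 'https://finance.yahoo.com/quote/FB?p=FB'
    urlClient = urlReq(page_url)
    page_html = urlClient.read()
    urlClient.close()

    # Same brittle selector as getPrice(), but guard against the markup changing
    page_soup = soup(page_html, "html.parser")
    container = page_soup.find('div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'})
    if container is None or container.find('span') is None:
        return None
    return container.find('span').text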
def getUrls(ingredients, pageNum):
    # Search a few pages from main search result
    searchUrl = "https://www.foodnetwork.com/search/" + ingredients + "-/p/" + str(
        pageNum) + "/rating"
    recipeUrlList = urlReq(searchUrl)
    htmlRaw = recipeUrlList.read()
    recipeUrlList.close()
    soup = BeautifulSoup(htmlRaw, "html.parser")

    # Find all recipe URLs and return URL list
    recipeUrlList = []
    recipeBlockContainer = soup.findAll("h3",
                                        {"class": "m-MediaBlock__a-Headline"})
    for recipeBlock in recipeBlockContainer:
        recipeUrl = "https:" + recipeBlock.a['href']
        if (not ("videos" in recipeUrl)):
            recipeUrlList.append(recipeUrl)

    return recipeUrlList
def getUrls(item, pageNum):
    # Search a few pages from main search result
    searchUrl = "https://www.simplyrecipes.com/recipes/main-ingredient/" + item + "/page/" + str(
        pageNum) + "/"

    req = Request(searchUrl, headers={'User-Agent': 'Mozilla/5.0'})
    recipeUrlList = urlReq(req)
    htmlRaw = recipeUrlList.read()
    recipeUrlList.close()
    soup = BeautifulSoup(htmlRaw, "html.parser")

    # Find all recipe URLs and return URL list
    recipeUrlList = []
    recipeBlockContainer = soup.findAll("h2", {"class": "grd-title-link"})
    for recipeBlock in recipeBlockContainer:
        recipeUrl = recipeBlock.a['href']
        recipeUrlList.append(recipeUrl)

    return recipeUrlList
Example #14
def Plugins_Enum():
    try:
        pwd = "./wp-content/plugins"
        plgFolders = [
            subDir for subDir in Path(pwd).iterdir() if subDir.is_dir()
        ]
        rdmFile = "/README.txt"

        for slug in plgFolders:
            base = "https://api.wordpress.org/plugins/info/1.1/?action=query_plugins&request[search]="
            url = base + slug.name
            siteResponse = urlReq(url).read()
            jsonResponse = json.loads(siteResponse.decode("ISO-8859-1"))

            firstMatch = jsonResponse["plugins"][0]

            plgName = firstMatch.get("name")
            plgVersion = firstMatch.get("version")
            plgAuthor = firstMatch.get("author_profile")
            plgSite = firstMatch.get("homepage")
            plgWPReq = firstMatch.get("requires")

            if Path(str(slug) + rdmFile).is_file():
                instVersion = verParser(rdmFile, slug)
            elif Path(str(slug) + rdmFile.lower()).is_file():
                instVersion = verParser(rdmFile.lower(), slug)
            else:
                # Fallback so instVersion is always defined when no README is found
                instVersion = "NA"

            # Show info from Wordpress Api for each plugin parsed
            print("\n\nPlugin name: " + plgName + "\x1b[1;31m" +
                  "\nLatest version: " + plgVersion + "\nInstalled \
version:" + instVersion + "\x1b[0;m" + "\nAuthor: " + plgAuthor + "\nSite: " +
                  plgSite + "\nWordpress \
version required: " + plgWPReq)

            # Show the list of vulnerabilities stored in wpvulndb for each plugin parsed
            Wpvuldb_Api(slug.name)
    except Exception:
        print(
            "The directory \"wp-content\" doesn't exist, therefore plugins cannot be enumerated"
        )

    Handler()
Example #15
def get_recipe_list(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()

    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")

    recipeCard = {}

    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl

    # Find title and store into recipe card
    recipeTitle = soup.find("h1").text
    recipeCard['title'] = recipeTitle

    # Find the star rating and store into recipe card - grabbing the value in aria-label
    recipe_stars = soup.find('span', {'class': 'review-star-text'})
    if recipe_stars:
        recipeCard['stars'] = recipe_stars.text.strip()
    else:
        stars_span = soup.find('span', {'class': 'stars stars-5'})
        recipeCard['stars'] = stars_span['aria-label']

    # Find image URL and store it in the recipe card.
    imageContainer = soup.find("div", {"class": "image-container"})
    if imageContainer:
        imageUrl = imageContainer.div['data-src']
    else:
        imageContainer = soup.find("img", {"class": "rec-photo"})
        imageUrl = imageContainer['src']
    recipeCard['image'] = imageUrl

    # Return single recipe as dictionary
    return recipeCard
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

# changes on wed
evga_url = 'https://www.evga.com/products/productlist.aspx?type=0'
# get request for the webpage and store raw html
url_client = urlReq(evga_url)
raw_html = url_client.read()

url_client.close()
# use soup to perform parsing
page_soup = soup(raw_html, "html.parser")
all_product_containers = page_soup.find_all("div",
                                            attrs={"class": "list-item"})

filename = "listings.csv"
f = open(filename, "w")

headers = "name, original price, discount, final price, base clock, boost clock, VRAM, bandwidth, link\n"
f.write(headers)

for product in all_product_containers:
    # print(product.prettify())
    details = []

    img = product.div.a.img["src"]
    link = product.find("div", {"class": "pl-list-image"}).contents[1]["href"]

    name = product.find("div", {"class": "pl-list-image"}).contents[1]["title"]
    details_ul = product.find("div", {"class": "pl-list-info"}).ul.contents
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as urlReq

page_url = "https://dmoz-odp.org/"

urlClient = urlReq(page_url)

page_soup = soup(urlClient.read(), "html.parser")
urlClient.close()

links = page_soup.findAll('a')

out_filename = "dmoz_links.csv"

headers = "link\n"

f = open(out_filename, "w")
f.write(headers)
count = 0
for link in links:
    if (count < 49):

        link_store = link.get('href')
        temp_link = str(link_store)

        if (temp_link.find('https') != -1):
            print("Link :" + str(link_store))
            f.write(str(link_store) + "\n")
        else:

            url = "https://dmoz-odp.org"
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

username = input("Enter Username :")
myurl = "https://github.com/" + username + "?tab=repositories"

uClient = urlReq(myurl)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")

containers = page_soup.findAll("div",
                               {"class": "col-10 col-lg-9 d-inline-block"})

repo_count = page_soup.findAll("span", {"class": "Counter"})
print("Total number of commits: ", repo_count[0].text)

filename = "Repo_list.csv"
f = open(filename, "w")

headers = "Repository_name, Language, Last_update\n"

f.write(headers)

for container in containers:
    repo = container.findAll("h3", {"class": "wb-break-all"})
    repo_name = ((repo[0].text).replace(" ", "")).replace("\n", "")

    lang = container.findAll("span", {"itemprop": "programmingLanguage"})
Example #19
def get_all_recipe(recipeUrl):
    # Open the connection, grab the webpage, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()

    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")

    recipeCard = {}

    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl

    # Find title and store into recipe card
    recipeTitle = soup.find("h1").text
    recipeCard['title'] = recipeTitle

    # Find image URL and store it in the recipe card.
    imageContainer = soup.find("div", {"class": "image-container"})
    if imageContainer:
        imageUrl = imageContainer.div['data-src']
    else:
        imageContainer = soup.find("img", {"class": "rec-photo"})
        imageUrl = imageContainer['src']
    recipeCard['image'] = imageUrl

    # Find metadata of the recipe
    metadataAry = []
    recipeMetadata = soup.findAll("div", {"class": "recipe-meta-item"})
    if recipeMetadata:
        for metadata in recipeMetadata:
            metadataHeader = metadata.find(
                "div", {"class": "recipe-meta-item-header"}).text.strip()
            metadataBody = metadata.find(
                "div", {"class": "recipe-meta-item-body"}).text.strip()
            metadataEntry = metadataHeader + ' ' + metadataBody
            metadataAry.append(metadataEntry)
    else:
        recipeMetadata = soup.findAll("li", {"aria-label": True})
        for metadata in recipeMetadata:
            metadataEntry = metadata['aria-label']
            metadataAry.append(metadataEntry)
    recipeCard['metadata'] = metadataAry

    # Find ingredients
    ingredientListAry = []
    ingredientList = soup.findAll("li", {"class": "ingredients-item"})
    if ingredientList:
        for ingredientItem in ingredientList:
            ingredient = ingredientItem.find("span", {
                "class": "ingredients-item-name"
            }).text.strip()
            ingredientListAry.append(ingredient)
    else:
        ingredientList = soup.findAll("label", {"title": True})
        for ingredientItem in ingredientList:
            ingredient = ingredientItem['title']
            ingredientListAry.append(ingredient)
    recipeCard['ingredients'] = ingredientListAry

    # Find Instructions
    instructionsAry = []
    instructionsContainer = soup.findAll(
        "li", {"class": "subcontainer instructions-section-item"})
    if instructionsContainer:
        for instructionItem in instructionsContainer:
            instruction = instructionItem.p.text
            instructionsAry.append(instruction)
    else:
        instructionsContainer = soup.findAll(
            "span", {"class": "recipe-directions__list--item"})
        for instructionItem in instructionsContainer:
            instruction = instructionItem.text.strip()
            instructionsAry.append(instruction)
    recipeCard['instructions'] = instructionsAry

    # Return single recipe as dictionary
    return recipeCard
Example #20
# Importing files for web scraping
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

# Setting URL variables
test_url = 'https://www.newegg.ca/Cell-Phones-Unlocked/SubCategory/ID-2961?Tid=165973'
urlClient = urlReq(test_url)
page_html = urlClient.read()
urlClient.close()

# Parsing HTML
page_soup = soup(page_html, "html.parser")

# Finding desired content
containers = page_soup.findAll("div", {"class": "item-container"})

# Creating files
filename = "products.csv"
f = open(filename, "w")

headers = "brand, title, total_price, old_price, percent_saved, ship_cost\n"
f.write(headers)


# --------------------------- FUNCTIONS ---------------------------
# Find elements with classes function
def findClass(contain, elem, class_name, index):
    return contain.findAll(elem, {"class": class_name})[index]


# Find total price of item
Example #21
#GET JSON OBJECT FOR ONE PAGE
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup
import json
import dateutil.parser as parser

data = {}
contract_url = "https://www.sourcewell-mn.gov/cooperative-purchasing/022217-wex"
urlClient = urlReq(contract_url)
page_html = urlClient.read()
urlClient.close()

p_soup = soup(page_html, "html.parser")

container = p_soup.find("div", {"class": "vendor-contract-header__content"})
data['title'] = container.findAll('p')[0].text
contract_text = container.findAll('p')[1].text.split('\n')
date = contract_text[1].split('Maturity Date:')[1].strip()
data['expiration'] = parser.parse(date).isoformat()
contract_text = contract_text[0].replace('#', '')
data['contract_number'] = contract_text
name = contract_text.split('-')[1]

files = {}
files["contract-forms"] = p_soup.findAll(
    "div", {"class": "field--item"})[2].findAll('span')[3].a["href"]
data["files"] = [files]

vendor = {}
contacts = {}
vendor["name"] = name
def Pull_Site():
    url = "http://www.safa.edu"
    siteResponse = urlReq(url)
    parsedPage = bSoup(siteResponse, "html.parser")
    return (parsedPage)
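Pull_Site() returns a BeautifulSoup object (bSoup accepts the open response directly), so the parsed page can be queried like any other soup; a hypothetical usage:

page = Pull_Site()
print(page.title.text if page.title else "no <title> found")
for link in page.find_all("a", href=True):
    print(link["href"])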
Example #23
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup

############################## For Flipkart ####################################################

my_url = "https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&as-pos=0&as-type=HISTORY&as-backfill=on"

############ Request to a URL for getting all the data
vClient = urlReq(my_url)
############ Read all the content of that page
page_html = vClient.read()
############ Close request object
vClient.close()
########## Parse all the html data with Beautiful Soup
page_soup = soup(page_html,"html.parser")
########## Find all the content of any div by class name
containers = page_soup.find_all("div",{"class":"_1UoZlX"})

#print(len(containers))
########## Will convert the data back to readable (prettified) HTML form
#print(soup.prettify(containers[0]))
#for one product
#container = containers[0]
#print(container)
########## Product Name
#prdname = container.find("div",{"class":"_3BTv9X"})
#print(soup.prettify(prdname))
#prod_name = prdname.img["alt"]
#print(prod_name)

########### Rating#################################
Example #24
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as soup
import csvWriter

searchString = "moto"
blockWords = ["None", 'screen', 'Protector', 'case', 'film']
searchPageDepth = 4
currentPage = 0
urlSite = "https://www.amazon.com/s?k=" + searchString + "&ref=nb_sb_noss_2"
webSite = urlReq(urlSite)
html = webSite.read()
webSite.close()
page_soup = soup(html, 'html.parser')
results = []
while currentPage < searchPageDepth:
    if currentPage != 0:
        if currentPage <= searchPageDepth:
            urlSite = 'https://amazon.com' + str(
                page_soup.find('li', {
                    'class': 'a-last'
                }).a['href']) + '/'
            webSite = urlReq(urlSite)
            html = webSite.read()
            webSite.close()
            page_soup = soup(html, 'html.parser')
    itemsWhole = page_soup.findAll(
        'span', {'cel_widget_id': 'SEARCH_RESULTS-SEARCH_RESULTS'})
    for item in itemsWhole:
        text = str(
            item.find('span',
Example #25
    num_images, folder_name = parse_arguments()

    with open('images_urls.txt', 'w') as file:

        last_country = str()
        driver.get(url)

        button = driver.find_element_by_class_name('intro__explore')
        button.click()
        while len(image_urls
                  ) != num_images:  #get input for number of images at least
            time.sleep(.2)

            url = driver.current_url

            uClient = urlReq(url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            images = page_soup.find_all('img', {'src': re.compile('.jpg')})
            country_name = page_soup.find_all('div',
                                              {'class': 'location__country'})
            try:
                for image in images:
                    curr_url = image['src']
                    if curr_url not in image_urls:  #new image
                        image_urls.add(curr_url)
                        curr_name = str()

                        try:
                            curr_name += country_name[0].previous
Example #26
def searchInAmazon(searchString, blockedWord, searchPageDepth, sortPreference,
                   currency):
    datetime = date.datetime.now()
    searchString = searchString.replace(' ', '+')
    currentPage = 0
    urlSite = "https://www.amazon.com/s?k=" + searchString + "&ref=nb_sb_noss_2"
    webSite = urlReq(urlSite)
    html = webSite.read()
    webSite.close()
    page_soup = soup(html, 'html.parser')
    results = []
    while currentPage < searchPageDepth:
        if currentPage != 0:
            if currentPage <= searchPageDepth:
                urlSite = 'https://amazon.com' + str(
                    page_soup.find('li', {
                        'class': 'a-last'
                    }).a['href']) + '/'
                webSite = urlReq(urlSite)
                html = webSite.read()
                webSite.close()
                page_soup = soup(html, 'html.parser')
        itemsWhole = page_soup.findAll(
            'span', {'cel_widget_id': 'SEARCH_RESULTS-SEARCH_RESULTS'})
        for item in itemsWhole:

            def itemAnalysis():
                # Note: 'App' is always truthy here, so this condition effectively
                # only filters out items whose HTML contains 'Prime Video'
                if 'App' and 'Prime Video' not in str(item):
                    text = str(
                        item.find('span', {
                            'class':
                            'a-size-medium a-color-base a-text-normal'
                        }))
                    name = text.strip(
                        '<span class="a-size-medium a-color-base a-text-normal" dir="auto">'
                    ).strip('</')
                    if (item.find('free') or item.find('FREE')):
                        price = fullPrice = '0'
                    else:
                        try:
                            price = str(
                                item.find('span', {
                                    'class': 'a-price-whole'
                                }).text).replace(',', '').strip('.')
                            fullPriceSpan = item.find(
                                'span', {'data-a-strike': 'true'})
                            try:
                                fullPrice = str(
                                    fullPriceSpan.find('span', {
                                        'class': 'a-offscreen'
                                    }).text).strip('$').partition('.')[0]
                            except Exception:
                                fullPrice = price
                            try:
                                if fullPrice != price:
                                    discount = str(
                                        100 - round(float(price), 2) * 100 /
                                        round(float(fullPrice), 2)).partition(
                                            '.')[0]
                                    #print(discount)
                                else:
                                    discount = '0'
                            except ValueError as err:
                                discount = '0'
                            itemNumber = str(len(results))
                            link = ('amazon.com' + item.find(
                                'a', {'class': 'a-link-normal a-text-normal'
                                      })['href']).partition('ref')[0]
                            img = item.find('img', {'class': 's-image'})['src']
                            print(img)
                            results.append(
                                (itemNumber, price, name, link, discount,
                                 str(datetime), amazonDBPK, img))
                        except AttributeError as err:
                            pass
                            #print('Item Skipped in Amazon due to: ' +str(err))

            bWordFound = 0
            for bWord in blockedWord:
                if bWord in str(item):
                    bWordFound += 1
            if bWordFound == 0:
                itemAnalysis()
        currentPage = currentPage + 1
    print('results in Amazon :' + str(len(results)))
    if sortPreference == 'Increasing':
        return sortResults.sortIncreasing(results)
    if sortPreference == 'Decreasing':
        return sortResults.sortDecreasing(results)
Example #27
from urllib.request import urlopen as urlReq  #grab page
from bs4 import BeautifulSoup as soup  #parse txt
import json
import dateutil.parser as parser
from urllib.request import Request
from socket import timeout
import logging

pageNames = []
numPages = 9
header = {'User-Agent': 'CoProcure Technical Challenge'}

for x in range(0, numPages):
    basePageUrl = 'https://www.sourcewell-mn.gov/contract-search?category=All&keyword=&page=' + str(
        x)
    urlClient = urlReq(basePageUrl)
    page_html = urlClient.read()
    urlClient.close()
    p_soup = soup(page_html, "html.parser")
    container = p_soup.findAll(
        "p", {"class": "component__search-vendors-contracts-number"})

    for n in container:
        pageNames.append(n.text.replace('#', '').strip())

contract_data = []

for i in pageNames:
    data = {}
    contract_url = "https://www.sourcewell-mn.gov/cooperative-purchasing/" + i
    req = Request(contract_url, headers=header)
def get_foodnetwork(recipeUrl):
    # Open the connection, grab the web page, and store the raw HTML
    uClient = urlReq(recipeUrl)
    htmlRaw = uClient.read()
    uClient.close()

    # Parse raw HTML
    soup = BeautifulSoup(htmlRaw, "html.parser")

    recipeCard = {}

    # Store site name into recipe card
    recipeCard['siteName'] = "Food Network"

    # Store URL into recipe card
    recipeCard['URL'] = recipeUrl

    # Find title and store into recipe card
    recipeTitle = soup.find("span", {
        "class": "o-AssetTitle__a-HeadlineText"
    }).text
    recipeCard['title'] = recipeTitle

    # Find image URL and store it in the recipe card.
    recipeHeader = soup.find("div", {"class": "recipe-lead"})
    imageContainer = recipeHeader.find(
        "img", {"class": "m-MediaBlock__a-Image a-Image"})
    if imageContainer:
        imageUrl = imageContainer['src']
        recipeCard['image'] = "http:" + imageUrl
    else:
        recipeCard['image'] = "https://i.imgur.com/bvzLAyR.jpg"

    # Find metadata of the recipe
    metadataAry = []
    levelInfo = soup.find("div", {"class": "o-RecipeInfo"})
    itemsInfo = levelInfo.findAll("li")
    for item in itemsInfo[:5]:
        item = item.text.replace('\n', ' ').strip()
        metadataAry.append(item)
    recipeCard['metadata'] = metadataAry

    # Find ingredients
    ingredientListAry = []
    ingredientList = soup.findAll("p",
                                  {"class": "o-Ingredients__a-Ingredient"})
    for ingredientItem in ingredientList:
        ingredient = ingredientItem.text
        ingredientListAry.append(ingredient)
    recipeCard['ingredients'] = ingredientListAry

    # Find Instructions
    instructionsAry = []
    instructionsContainer = soup.findAll("li", {"class": "o-Method__m-Step"})
    for instructionItem in instructionsContainer:
        instruction = instructionItem.text.strip()
        instructionsAry.append(instruction)
    recipeCard['instructions'] = instructionsAry

    # Return single recipe
    return recipeCard
Example #29
def searchInMercadoLibre(searchString, blockedWord, searchPageDepth, sortPreference, currency):
    searchString = searchString.replace(' ','+')
    currentPage = 0
    datetime = date.datetime.now()
    if currency == 'USD' :
        urlSite = 'http://www.dolarhoy.com/'
        webSite = urlReq(urlSite)
        html = webSite.read()
        webSite.close()
        page_soup = soup(html, 'html.parser')
        usdContainer = page_soup.find('div',{'class':'pill pill-coti'})
        usdCompra = str(usdContainer.findAll('span')[1:2])[23:30].strip('</').replace(',','.')
        print('USD Compra= '+str(usdCompra))
    urlSite = "https://listado.mercadolibre.com.ar/" + searchString +'/'
    webSite = urlReq(urlSite)
    html = webSite.read()
    webSite.close()
    page_soup = soup(html, 'html.parser')
    results = []
    while currentPage < searchPageDepth:
        if currentPage != 0:
            if currentPage <= searchPageDepth:
                urlSite = str(page_soup.find('li', {'class':'andes-pagination__button andes-pagination__button--next'}).a['href'])
                webSite = urlReq(urlSite)
                html = webSite.read()
                webSite.close()
                page_soup = soup(html, 'html.parser')
        itemsWhole = page_soup.findAll('li',{'class':'results-item highlighted article stack'})
        for item in itemsWhole:
            def itemAnalysis():
                #print('--------------------------------')
                text = str(item.find('span', {'class':'main-title'}))
                name=text.strip('<span class="main-title"> ').strip('</span>')
                #price = str(item.find('span',{'class':'price__fraction'}))[29:36].strip('</span>').replace('.','').replace(',','')
                try:
                    price = str(item.find('span',{'class':'price__fraction'}).text).replace('.','')
                except AttributeError as err:
                    try:
                        price = str(item.find('div',{'class':'pdp_options__text pdp_options--no-margin'}).text.strip(' $ ').partition(' ')[0]).replace('.','')
                    except AttributeError as err :
                        price = str(str(item.find('div',{'class':'pdp_options__text pdp_options--no-margin'})).strip(' $ ').partition(' ')[0]).replace('.','')
                if price == 'None':
                    print('found a price of None')
                    price = str(item.find('div',{'class':'pdp_options__text pdp_options--no-margin'})).partition('<span>$')[2].partition('</span>')[0].replace('.','')
                print('price: '+price)
                if currency == 'USD' :
                    price = float(price) / float(usdCompra)
                    price = str(round(price, 2))
                try:
                    discount = str(item.find('div',{'class':'item__discount'}).text).strip('% OFF')
                except AttributeError :
                    discount = '0'
                if discount == 'None' : 
                    discount = '0'
                itemNumber = str(len(results))  
                link = str(item.a['href'])
                if 'JM' in link:
                    link = link.partition('JM')[0]+'JM'
                else :
                    link = link.partition('?')[0]
                img = str(item.find('img',{})).partition('src="')[2].partition('"')[0] 
                print(img)
                results.append((itemNumber, price, name, link.strip('https://'), discount, str(datetime), mercadolibreDBPK, img))
                #print("item #"+ itemNumber +": "+ name +" $"+ price + ' OFF: '+ discount )
            bWordFound = 0
            for bWord in blockedWord:
                if bWord in str(item):
                    bWordFound += 1
            if bWordFound == 0:
                itemAnalysis()
        currentPage = currentPage + 1
    print('results in MercadoLibre :' + str(len(results)))
    if sortPreference == 'Increasing':
        return sortResults.sortIncreasing(results)
    if sortPreference == 'Decreasing':
        return sortResults.sortDecreasing(results)
Example #30
import bs4
from urllib.request import urlopen as urlReq
from bs4 import BeautifulSoup as bSoup

flip_url = 'https://www.flipkart.com/search?q=laptop&sid=6bo%2Cb5g&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_3_7_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_3_7_na_na_na&as-pos=3&as-type=RECENT&suggestionId=laptop%7CLaptops&requestId=3ce158b5-e07c-46b0-b18f-a727831d42c4&as-searchtext=laptop'

uClient = urlReq(flip_url)
html_page = uClient.read()
uClient.close()

soup_page = bSoup(html_page, "html.parser")

containers = soup_page.findAll("div", {"class": "_1-2Iqu row"})

filename = "laptops.csv"
f = open(filename, "w")

headers = "Model, Processor, RAM, ROM, Display, Price\n"

f.write(headers)

for container in containers:
    model_name = container.findAll("div", {"class": "_3wU53n"})
    model = model_name[0].text

    price_container = container.findAll("div", {"class": "_1vC4OE _2rQ-NK"})
    price = price_container[0].text

    content = container.findAll("li", {"class": "tVe95H"})
    processor = content[0].text
    ram = content[1].text