def get_data(result):
    soup = Bsoup(result.text, 'html.parser')

    Day = soup.find(
        "div",
        class_="today_nowcard-main component panel today-card-other-fog")

    Place = Day.header.h1.contents[0]

    timestamp = Day.find("p", class_="today_nowcard-timestamp").span
    Last_Update = timestamp.text + timestamp.next_sibling.text

    temp = Day.find("div", class_="today_nowcard-temp")

    weather_type = Day.find("div", class_="today_nowcard-phrase")

    feels_like = Day.find("div", class_="today_nowcard-feels")

    # The hi/lo block's strings are concatenated, then re-split at fixed
    # offsets (brittle: depends on the card's exact wording and layout).
    UV_index = list(
        Day.find("div", class_="today_nowcard-hilo").stripped_strings)
    UVstring = "".join(UV_index)
    UV_index_text = UVstring[0:4] + " " + UVstring[4:7] + "\n" + UVstring[
        7:15] + " " + UVstring[15:]

    return [Place, Last_Update, temp, weather_type, feels_like, UV_index_text]
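The fixed offsets used to build UV_index_text depend on the card's exact wording; a hedged alternative (inside get_data, using the same Day element as above):

    # Sketch: keep the hi/lo strings separate and join them explicitly,
    # instead of slicing one concatenated string at hard-coded offsets.
    parts = list(Day.find("div", class_="today_nowcard-hilo").stripped_strings)
    UV_index_text = " ".join(parts)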
Example #2
def get_top_exts(web_store_scrape_file, with_num_ratings=False):
    """
    Scrape the file, return the extension IDs.

    :param web_store_scrape_file: Should be a path to an HTML file taken from
        the Chrome Web Store showing the "Popular" category. There's no
        guarantee that the CSS and tag attributes used to locate the desired
        information will work in the future. Check the Web Store's source to be
        sure.
    :type web_store_scrape_file: str
    :param with_num_ratings: Flag indicates if the number of reviews should
        also be returned. If True, the return type will be a dictionary.
    :type with_num_ratings: bool
    :return: The extension IDs, or a mapping from ID to rating count.
    :rtype: tuple|dict
    """
    # The docstring says this is a file path, so open it before parsing.
    with open(web_store_scrape_file) as fp:
        soup = Bsoup(fp, "lxml")
    ext_num_ratings = {}

    for tile in soup.find_all('div', class_='webstore-test-wall-tile'):
        link = tile.a.get('href')
        ext_id = id_from_url(link)

        rating = tile.find('div', attrs={'g:type': "AverageStarRating"})
        num_ratings = int(
            rating.span.string[1:-1])  # strip the parentheses around the number

        ext_num_ratings[ext_id] = num_ratings

    if with_num_ratings:
        return ext_num_ratings
    return tuple(ext_num_ratings.keys())
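A minimal usage sketch for get_top_exts; the file name is an illustrative assumption, and id_from_url is a helper from the surrounding (elided) module:

ext_ids = get_top_exts('popular_wall.html')
print(len(ext_ids), 'extensions')
# With with_num_ratings=True the return type switches to a dict:
ratings = get_top_exts('popular_wall.html', with_num_ratings=True)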
Example #3
def conta_nodi(fileIn, selettore):
    '''Return the number of tree nodes that satisfy the CSS selector.'''
    albero = my_html.fparse(fileIn)
    with open(fileIn) as page:
        soup = Bsoup(page, 'html.parser')
    cont = 0
    if selettore.startswith('#') or selettore.startswith('.'):
        return conto(albero, soup, selettore, cont)
    if selettore.startswith('@'):
        # Attribute selector of the form '@[width=value]': strip the
        # wrapper characters, then count elements with a matching width.
        selettore2 = selettore.strip('@[]')
        attributo = selettore2.split('=')[1]
        for elemento in soup.find_all(width=attributo):
            cont += 1
        return cont

    if len(selettore) == 3:
        # Descendant selector such as 'p a' (assumes exactly three
        # characters: ancestor tag, space, descendant tag).
        listaselettori = selettore.split(' ')
        avo = listaselettori[0]
        discendente = listaselettori[1]
        for elemento in soup.find_all(avo):
            for el in elemento.find_all(discendente):
                cont += 1
        return cont
    for k in soup.find_all(selettore):
        cont += 1
    return cont
Example #4
def scrape_page(input_tag):
    #store the url of the search page holding the quotes.
    #   tag index: https://www.goodreads.com/quotes/tag
    page_URL = "https://www.goodreads.com/quotes/search?utf8=%E2%9C%93&q=" + str(input_tag) + "&commit=Search"

    #open connection and get content.
    try:
        page_client = urlopen(page_URL)
        page_html = page_client.read()
        page_client.close()
    except HTTPError as e:
        page_html = e.read()
    #parse data to html
    soup = Bsoup(page_html, "html.parser")
    #access the data on the page.

    outer_container = soup.findAll("div", {"class":"quoteDetails"})
    quote_list = []

    #loop through all quote containers to get the actual quote.
    for container in outer_container:
        #the first inner div holds the text, which needs formatting.
        quote = container.div.text
        quote = quote.split("//", maxsplit=1)[0].strip().replace("\n", " ")
        if len(quote) <= 264 :
            quote_list.append(quote)
    return quote_list
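The raw concatenation in page_URL breaks for tags containing spaces or other non-URL-safe characters; a hedged variant using the standard library:

from urllib.parse import quote_plus

def build_quote_url(input_tag):
    # Percent-encode the tag before splicing it into the query string.
    return ("https://www.goodreads.com/quotes/search?utf8=%E2%9C%93&q="
            + quote_plus(str(input_tag)) + "&commit=Search")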
Example #5
    def __getImagesUrl(self, name):
        url = self.url + name

        print(url)

        driver = wd.Chrome(self.wd_path)
        driver.get(url)

        scroll_value = self.scroll_count

        for i in range(self.scroll_full_count):
            driver.execute_script("window.scrollTo(0, " + str(scroll_value) +
                                  ")")
            scroll_value += self.scroll_count
            time.sleep(0.1)

        soup = Bsoup(driver.page_source, "html.parser")
        driver.quit()  # release the browser once the DOM has been captured
        # "_2UpQX" is a site-specific image class and may change at any time.
        images = soup.find_all("img", attrs={"class": "_2UpQX"})

        urls_arr = []

        for image in images:
            src = image.get("src")
            if src is not None:
                urls_arr.append(src)

        print(urls_arr)

        return urls_arr
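__getImagesUrl is a method of an elided class; a minimal context sketch, with the attribute names taken from the method body (the default values are illustrative assumptions):

import time
from selenium import webdriver as wd

class ImageScraper:
    def __init__(self, url, wd_path, scroll_count=500, scroll_full_count=10):
        self.url = url                              # base search URL
        self.wd_path = wd_path                      # path to chromedriver
        self.scroll_count = scroll_count            # pixels per scroll step
        self.scroll_full_count = scroll_full_count  # number of scroll steps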
Example #6
def get_soups(urls):
    soups = []
    for url in urls:
        url_o = requests.get(url)
        soup = Bsoup(url_o.text, 'html.parser')
        soups.append(soup)
    return soups
Example #7
def scrape_page():

    page_URL = "https://www.teenvogue.com/gallery/best-harry-potter-quotes"

    #open connection and get content (this scraper is page-specific, not universal).
    page_client = urlopen(page_URL)
    page_html = page_client.read()
    #close
    page_client.close()
    #parse data to html
    soup = Bsoup(page_html, "html.parser")
    #access the data on the page.

    outer_container = soup.findAll("div",
                                   {"class": "gallery-slide-caption__dek"})
    quote_list = []

    #loop through all quote containers to get the actual quote.
    for container in outer_container:
        #the first inner div holds the text, which needs formatting.
        quote = container.div.text
        quote = quote.strip().replace("\n", " ")
        if len(quote) <= 264:
            quote_list.append(quote)
    return quote_list
Example #8
def searchAuthors(request):

    keyword = request.POST.get('keyword', False)
    Authors = []
    Authorlink = []
    url = 'http://scholar.google.com/citations?view_op=search_authors&mauthors=' + str(
        keyword) + '&hl=en&oi=drw'
    response = requests.get(url)

    html_soup = Bsoup(response.text, 'html.parser')

    name_containers = html_soup.find_all('div', class_='gsc_1usr')

    interests = []

    for container in name_containers:
        interest = container.find_all('a', class_='gs_ai_one_int')
        mylist = []
        for i in interest:
            mylist.append(i.text)
        interests.append(mylist)
        Authors.append(container.div.h3.a.text)
        k = str(container.div.h3.a['href'])
        Authorlink.append(
            "https://scholar.google.com/citations?view_op=medium_photo&user=" +
            k[k.index('='):])
    author = zip(Authors, Authorlink, interests)

    return render(request, 'profiles/index.html', context={'author': author})
Example #9
def parse_page(url):

    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')

    return page_parsed
Example #10
def main():

    print("hey")
    names = []
    prices = []
    changes = []
    percentChanges = []
    marketCaps = []
    totalVolumes = []
    circulatingSupplys = []

    # `url` is a module-level constant in the original (elided) script.
    result = requests.get(url)
    soup = Bsoup(result.content, 'html.parser')
    for listing in soup.find_all(
            'tr',
            attrs={
                'class':
                'simpTblRow Bgc($extraLightBlue):h BdB Bdbc($finLightGrayAlt) Bdbc($tableBorderBlue):h H(32px) Bgc(white)'
            }):
        for name in listing.find_all('td', attrs={'aria-label': 'Name'}):
            names.append(name.text)
        for price in listing.find_all('td',
                                      attrs={'aria-label':
                                             'Price (intraday)'}):
            prices.append(price.find('span').text)
        for change in listing.find_all('td', attrs={'aria-label': 'Change'}):
            changes.append(change.text)
        for percentChange in listing.find_all('td',
                                              attrs={'aria-label':
                                                     '% change'}):
            percentChanges.append(percentChange.text)
        for marketCap in listing.find_all('td',
                                          attrs={'aria-label': 'Market cap'}):
            marketCaps.append(marketCap.text)
        for totalVolume in listing.find_all(
                'td', attrs={'aria-label': 'Avg vol (3-month)'}):
            totalVolumes.append(totalVolume.text)
        # Note: despite the variable name, this column's aria-label is 'Volume'.
        for circulatingSupply in listing.find_all(
                'td', attrs={'aria-label': 'Volume'}):
            circulatingSupplys.append(circulatingSupply.text)
    print(len(names))
    #for i in range(0, len(names)):
    #  print(float(changes[i][0:len(percentChanges[i]) - 1]))
    # print(float(percentChanges[i][0:len(percentChanges[i])-1]))

    print(si.get_day_gainers().columns)  # get_day_gainers() returns a pandas DataFrame
Example #11
def check_file(fname):
    global total_img_cnt
    with open(fname, 'r') as fp:
        content = fp.read()
    soup = Bsoup(content, features="html.parser")
    img_cnt = 0
    for imgtag in soup.find_all('img'):
        img_cnt += 1
        total_img_cnt += 1
        img_path = imgtag['src']
        print('%d) tag path: %s' % (img_cnt, img_path))
        fullpath = join(OUTPUT_DIR, img_path)
        if not isfile(fullpath):
            print('broken path: %s' % fullpath)
            sys.exit(1)  # exit nonzero so callers can detect the failure
    print('looks good! images checked: %s' % img_cnt)
Example #12
def find_links(name):
    name = name.replace(" ", "+")

    url_str = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q={}' + \
              '&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}' + \
              '&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg' + \
              '.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s'

    headers = {
        "User-Agent": "Chrome/65.0.3325.162 Safari/537.36",
        "Content-Type": "application/json"
    }
    url_str = url_str.format(name, 0)
    print(url_str)
    request = ulib.Request(url_str, None, headers)
    json_str = ulib.urlopen(request).read()
    json_str = json.loads(json_str)
    soup = Bsoup(json_str[1][1], 'lxml')
    soup_imgs = soup.find_all("img")
    img_links = [img["src"] for img in soup_imgs]
    return img_links
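A usage sketch for find_links; the query is illustrative, and Google's async markup changes frequently, so the parsed structure is fragile:

links = find_links("golden retriever")  # spaces become '+' inside the helper
print(len(links), "thumbnail URLs found")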
Example #13
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    instance = Quotes.objects.all()
    instance.delete()
    for i in range(1, 4):
        url = 'https://www.goodreads.com/quotes?page=' + str(i)
        # Use the session so the Googlebot User-Agent header is sent.
        source = session.get(url).content
        soup = Bsoup(source, 'lxml')
        articles = soup.find_all('div', class_='quoteText')
        for article in articles:
            # main = article.find('a')
            # link = main['href']
            # image_src = main.find('img')['src']
            text = article.text
            new_quote = Quotes()
            new_quote.quote = text
            # new_quote.image = image_src
            if len(new_quote.quote) <= 255:
                new_quote.save()
    return redirect('../')
Example #14
import requests
from bs4 import BeautifulSoup as Bsoup
from pymongo import MongoClient as Mongo

client = Mongo('localhost', 27017)
db = client.m_list

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
data = requests.get('https://www.genie.co.kr/chart/top200?ditc=D&rtm=Y',
                    headers=headers)
soup = Bsoup(data.text, 'html.parser')

music_lists = soup.select(
    '#body-content > div.newest-list > div > table > tbody > tr')

for music in music_lists:
    # Drop the rank-movement <span> so only the rank digits remain.
    music.select_one('td.number > span').decompose()
    rank = music.select_one('td.number').text.strip()
    title = music.select_one('a.title.ellipsis').text.strip()
    artist_name = music.select_one('a.artist.ellipsis').text
    doc = {'rank': rank, 'title': title, 'artist': artist_name}
    # Upsert so a document is created on the first run and updated afterwards.
    db.m_list.update_one({'rank': rank}, {'$set': doc}, upsert=True)
Example #15
def get_result(url):
    res = requests.get(url)
    default = 'Not Available'
    soup = Bsoup(res.content, 'html.parser')
    items = soup.find_all('li', attrs={'class': 'ais-InfiniteHits-item'})
    courses = list()
    for item in items:
        course = {}
        course_title_src = item.find('h2')
        if course_title_src:
            course_title = course_title_src.get_text()
        else:
            course_title = default
        partner_src = item.find('span', attrs={'class': 'partner-name'})
        if partner_src:
            partner = partner_src.get_text()
        else:
            partner = default
        rating_value_src = item.find('span', attrs={'class': 'ratings-text'})
        if rating_value_src:
            rating_value = rating_value_src.get_text()
        else:
            rating_value = default
        rating_count_src = item.find('span', attrs={'class': 'ratings-count'})
        if rating_count_src:
            rating_count = rating_count_src.get_text()[1:-1]
        else:
            rating_count = default
        enrollment_number_src = item.find('span',
                                          attrs={'class': 'enrollment-number'})

        if enrollment_number_src:
            enrollment_number = enrollment_number_src.get_text()
        else:
            enrollment_number = default

        difficulty_level_src = item.find('span', attrs={'class': 'difficulty'})
        if difficulty_level_src:
            difficulty_level = difficulty_level_src.get_text()
        else:
            difficulty_level = default

        item_type_src = item.find('div', attrs={'class': 'product-type-row'})
        if item_type_src:
            item_type = item_type_src.get_text()
        else:
            item_type = default
        img = item.find('img')
        if img:
            imgurl = img.get('src')
        else:
            imgurl = 'https://upload.wikimedia.org/wikipedia/commons/thumb/9/97/Coursera-Logo_600x600.svg/1200px-Coursera-Logo_600x600.svg.png'

        course['course_title'] = course_title
        course['partner'] = partner
        course['rating_value'] = rating_value
        course['rating_count'] = rating_count
        course['enrollment_numbers'] = enrollment_number
        course['course_difficulty'] = difficulty_level
        course['type'] = item_type
        course['imgurl'] = imgurl
        courses.append(course)
    return courses
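The repeated find/if/else pattern in get_result can be factored into a small helper; a sketch (equivalent for the plain-text fields):

def text_or_default(parent, tag, cls, default='Not Available'):
    # Return the element's text when found, else the default marker.
    el = parent.find(tag, attrs={'class': cls})
    return el.get_text() if el else default

# e.g. partner = text_or_default(item, 'span', 'partner-name')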
Example #16
def get_soup(url):
    url_o = requests.get(url)
    soup = Bsoup(url_o.text, 'html.parser')
    return soup
Example #17
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as Bsoup

#BeautifulSoup parses the HTML text of a page, while urllib grabs the page itself.

#Web scraping phone listings (Samsung S8 Plus) from newegg.com (an Amazon-like store for hardware and electronics)
target_url = 'https://www.newegg.com/global/ie/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=samsung+s8+plus&ignorear=0&N=-1&isNodeId=1'
uClient = uReq(
    target_url)  #open the connection, grab the webpage and download it
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = Bsoup(
    page_html, "html.parser"
)  #first argument is the page HTML; the second names the parser (html.parser, lxml, xml, ...)

#converting all the phone (Samsung S8) models' data to a csv file.

#Next step: inspect the page to find the html element that holds the entire container for one phone model.
#Once one model is handled, loop through all the remaining ones the same way (see the sketch after this example).

#grabs each product
containers = page_soup.find_all(
    "div",
    {"class": "item-container"})  #the second parameter is a dict of attributes to match

filename = "mobile_product.csv"
f = open(filename, "w")
headers = "brand, Operating System, Price\n"
f.write(headers)
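The example stops before the loop its comments describe; a hedged sketch of that loop (the class names are assumptions about Newegg's markup at the time and may have changed):

for container in containers:
    # The brand name is stored in the title attribute of the brand logo image.
    brand = container.div.div.a.img["title"]
    title_el = container.find("a", {"class": "item-title"})
    product = title_el.text.replace(",", "|") if title_el else ""
    price_el = container.find("li", {"class": "price-current"})
    price = price_el.strong.text if price_el and price_el.strong else "N/A"
    f.write(brand + ", " + product + ", " + price + "\n")
f.close()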
Example #18
# Truncated snippet: openFile and get (requests.get) are defined/imported
# earlier in the original script.
try:
    file = openFile('r')

except IOError:

    file = openFile('w')

    file = openFile('r')

dwded = file.readlines()

board = get('https://www.billboard.com/charts/hot-100').text

boardSoup = Bsoup(board, 'lxml')  # html parser

song_artist = {}

url = []

fileTitle = []

stringTitles = []

count = 1

repeat = False

amount = raw_input('Song range:\n').split(': ')
Example #19
import csv
import re

import requests as req
from bs4 import BeautifulSoup as Bsoup

url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=163834&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page="
url2 = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=134963&type=after&isActualPointWriteExecute=false&isMileageSubscriptionAlready=false&isMileageSubscriptionReject=false&page="
url3 = "https://movie.naver.com/movie/point/af/list.nhn?&page="

with open("sentences.csv", "a", encoding="utf-8", newline="") as fp:
    wr = csv.writer(fp)

    count = 13501

    for i in range(1, 351):
        res = req.get(url + str(i))

        if res.status_code == 200:
            soup = Bsoup(res.text, "html.parser")
            span_texts = soup.find_all(
                attrs={"id": re.compile("_filtered_ment_[0-9]")})
            div_scores = soup.find_all(attrs={"class": "star_score"})

            text_score_list = list(zip(span_texts, div_scores))

            for span_text, div_score in text_score_list:
                final_text = span_text.text.strip()
                final_score = int(div_score.text)

                wr.writerow([count, final_text, final_score])
                count += 1

            print(i)
Example #20
def get_target(url, word, html_tag, class_name):
    document = requests.get(url + word).content
    soup = Bsoup(document, 'html.parser')
    result = soup.find_all(html_tag, class_=class_name)
    return result
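A usage sketch for get_target; the URL, word, tag, and class name are illustrative assumptions:

results = get_target('https://en.wiktionary.org/wiki/', 'python',
                     'span', 'mw-headline')
for r in results:
    print(r.get_text())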
Example #21
# Truncated snippet: os, get (requests.get), creatUrl, YouTube (pytube) and
# `path` are imported/defined earlier in the original script.
try:
    os.mkdir('mp3')

    os.mkdir('mp4')

except WindowsError:
    pass

song = raw_input('Song name: ')

url = creatUrl(song)

author = creatUrl(raw_input('Author name (optional): '))

url = 'https://www.youtube.com/results?search_query=' + url + '+by+' + author + '+lyrics'

youtube = Bsoup(get(url).text, 'lxml')  # html parser

for link in youtube.findAll('a'):

    if '/watch?v=' in link.get('href', ''):  # skip anchors without an href

        href = 'https://www.youtube.com' + link.get('href')

        break

mp4 = YouTube(href)

stream = mp4.streams.first()

stream.download(path + '\\mp4')
Example #22
from bs4 import BeautifulSoup as Bsoup
from urllib.request import urlopen

my_url = "https://morvanzhou.github.io/static/scraping/list.html"
page_html = urlopen(my_url).read().decode('utf-8')
#print(page_html)

## html parsing
page_soup = Bsoup(page_html, "html.parser")
#print(page_soup)

## use class to narrow search
## function find_all, setting two searching constraints
parse_month = page_soup.find_all('li', {"class": "month"})
## the second search overwrites the first, narrowing to the 'jan' list
parse_month = page_soup.find_all('ul', {"class": "jan"})
print(parse_month)
for m in parse_month:
    print(m.get_text())
Example #23
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as Bsoup

my_url = 'https://www.redditmetrics.com/top'

#List of the subreddits
subreddits = []

#Opening the connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#Connection Closed

#Parsing the HTML Reddit/Subreddit
soup = Bsoup(page_html, 'html.parser')
paras = soup.find('table')
count = 0

print paras
paras2 = str(paras)

file_obj = open("Params", 'w')
file_obj.write(paras2)
file_obj.close()

fileObj = open("Params", 'r')
params = fileObj.read()
i = 0
listo = []
Example #24
def scrape_books(links):
    """Parses book information from Goodreads page"""
    books_read = 0
    while books_read < 200 and len(links) > 0:
        try:
            page = requests.get(links.pop(0))
        except requests.exceptions.RequestException as error:
            print(error)
            return False

        soup = Bsoup(page.content, "html.parser")

        book_title = soup.findAll("h1")[0].get_text().strip()
        book_url = soup.head.link["href"]
        book_id = (book_url.split('/')[-1]).split('.')[0]
        isbn13 = soup.find("span", {"itemprop": "isbn"})
        if isbn13 is not None:
            isbn13 = isbn13.get_text().strip()
        author_info = soup.find(class_="authorName")
        author_url = author_info["href"]
        if author_url is not None and author_url not in author_links:
            author_links.append(author_url)
        author_name = author_info.find("span").get_text()
        details_container = soup.find("div", {"id": "details"})
        # isbn = details_container.findAll(class_="infoBoxRowItem")

        page_meta = soup.find(id="bookMeta")
        rating = page_meta.find("span", {
            "itemprop": "ratingValue"
        }).get_text().strip()
        rating_count = page_meta.find("meta",
                                      {"itemprop": "ratingCount"})["content"]
        review_count = page_meta.find("meta",
                                      {"itemprop": "reviewCount"})["content"]
        image_url = soup.find(id="coverImage")["src"]
        similar_list = soup.find("div", {"class": "carouselRow"})
        similar_books = similar_list.findAll("a")
        books = []

        for b in similar_books:
            books.append(b.find("img")["alt"].strip().replace(",", " |"))
            similar_link = b["href"]
            # Queue unseen similar-book links for later scraping.
            if similar_link is not None and similar_link not in book_links:
                book_links.append(similar_link)
                links.append(similar_link)

        book_object = {
            "title": book_title,
            "book_url": book_url,
            "book_id": book_id,
            "ISBN": isbn13,
            "author_url": author_url,
            "author": author_name,
            "rating": rating,
            "rating_count": rating_count,
            "review_count": review_count,
            "image_url": image_url,
            "similar_books": books
        }

        books_read += 1
        book_info['books'].append(book_object)
        print(books_read)
    add_to_json()
    scrape_author(author_links)
    return True
Example #25
def scrape_author(links):
    """Scrapes a Goodreads page to get information"""
    authors_read = 0

    while authors_read < 50 and len(links) > 0:
        try:
            page = requests.get(links.pop(0))
        except requests.exceptions.RequestException as error:
            print(error)
            return False

        soup = Bsoup(page.content, "html.parser")

        author_name = soup.find("h1").get_text().strip()
        author_url = soup.head.link["href"]
        author_id = (author_url.split('/')[-1]).split('.')[0]
        rating = soup.find("span", {"class": "average"}).get_text().strip()
        rating_count = soup.find("span", {
            "itemprop": "ratingCount"
        }).get_text().strip()
        review_count = soup.find("span", {
            "itemprop": "reviewCount"
        }).get_text().strip()
        image_url = soup.find("img", {"alt": author_name})["src"]
        similar_container = soup.find(class_="hreview-aggregate")
        similar_lists = similar_container.findAll("a")
        # author_books = similar_lists[0]["href"]
        similar_authors = similar_lists[1]["href"]

        try:
            authors_page = requests.get(base_url + similar_authors)
        except requests.exceptions.RequestException as error:
            print(error)
            sys.exit(1)
        authors_soup = Bsoup(authors_page.content, "html.parser")

        list_container = authors_soup.findAll(
            "div", {"class": "listWithDividers__item"})
        authors_list = []
        authors_links = []
        count = 0
        for aut in list_container:
            authors = aut.find("span", {"itemprop": "name"}).get_text().strip()
            author_link = aut.find("a", {"itemprop": "url"})["href"]
            if count != 0:
                authors_list.append(authors)
            if author_link not in links and author_link is not None:
                links.append(author_link)
            count += 1

        books_container = soup.findAll("a", {"class": "bookTitle"})
        books_list = []
        # print(authors_list, "\n", links)
        count = 0
        for bk in books_container:
            sim_book_name = bk.find("span", {"itemprop": "name"})
            if sim_book_name is None:
                continue
            if count != 0:
                books_list.append(sim_book_name.get_text().strip())
            count += 1
        author_object = {
            "name": author_name,
            "author_url": author_url,
            "author_id": author_id,
            "rating": rating,
            "rating_count": rating_count,
            "review_count": review_count,
            "image_url": image_url,
            "related_authors": authors_list,
            "author_books": books_list
        }

        author_info['authors'].append(author_object)
        authors_read += 1
    add_to_json()
    return True