Code example #1
File: domain-check.py Project: nat330/Domain-search
def check_domain(name):
    words = [
        'not found', 'No match', 'is free', 'AVAILABLE', 'nothing found',
        'No entries found', 'NOT FOUND'
    ]
    fulltext = ""
    firstpart = 'https://www.who.is/whois/'
    myurl = firstpart + name
    uClient = uReq(myurl)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'html.parser')
    check = page_soup.findAll("pre", {"style": "border:0px;"})
    for texts in check:
        fulltext = texts.text.strip()
    if not check:
        print "Sorry mate, domain %s is already registered" % (name)
    else:
        # the domain is available only if one of the "not found" phrases
        # appears in the WHOIS output
        if any(word in fulltext for word in words):
            print "This domain %s is available" % (name)
        else:
            print "Sorry mate, domain %s is already registered" % (name)
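Most of the snippets on this page call uReq and soup without showing their imports. Judging from the examples further down that do spell them out (code examples #21, #24, #26 and #28), the assumed setup is roughly the sketch below; on Python 3 the urllib2 import would become urllib.request instead. The call at the end is only a hypothetical usage of check_domain above, with an illustrative domain name.

from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

# hypothetical usage; "example.com" is purely illustrative
check_domain("example.com")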
Code example #2
File: main.py Project: micduan/CheckClassOpen
def checkOpen(coursename, coursenum, lecture):

    url = 'http://www.adm.uwaterloo.ca/cgi-bin/cgiwrap/infocour/salook.pl?level=under&sess=1185&subject=' + coursename + '&cournum=' + coursenum
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    all_cells = page_soup.findAll('td')
    index = 0
    capacity = 0
    current = 0

    # The page we're scraping doesn't use any class names or IDs, so we have to
    # find the desired cells by checking their text

    for cell in all_cells:
        text = cell.get_text()

        # The 5th cell after 'LEC 00X' is the current enrollment
        if index == 5:
            current = text
        # The 6th cell after 'LEC 00X' is the enrollment capacity
        if index == 6:
            capacity = text
            break

        # After finding the desired cell, start keeping track of how many cells we've been to
        if (text == ('LEC 00' + lecture + ' ')):
            index += 1
            continue

        if index > 0:
            index += 1

    # True if there are still open seats, False otherwise
    return int(current) < int(capacity)
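A hypothetical call of checkOpen above; the subject code, catalog number and lecture section strings are purely illustrative:

# True when LEC 001 of the illustrative course still has seats left
if checkOpen("CS", "135", "1"):
    print("The lecture section still has open seats")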
Code example #3
File: getKijiji.py Project: micduan/FindASublet
def generateListings(bedrooms, furnished):

    my_other_url = 'https://www.kijiji.ca/b-short-term-rental/gta-greater-toronto-area/sublet/k0c42l1700272'

    index = my_other_url.find('/sublet/') + len('/sublet/')
    url_end = my_other_url[index:]

    if (bedrooms == '1'):
        url_end = '1+bedroom/' + url_end
    elif (bedrooms == '2'):
        url_end = '2+bedrooms/' + url_end
    elif (bedrooms == '3'):
        url_end = '3+bedrooms/' + url_end
    elif (bedrooms == '4'):
        url_end = '4+bedrooms__5+bedrooms__6+more+bedrooms/' + url_end

    if (furnished == 'Y'):
        url_end = url_end + '?furnished=1'
    elif (furnished == 'N'):
        url_end = url_end + '?furnished=0'

    my_url = my_other_url[:index] + url_end

    uClient = uReq(my_url)

    page_html = uClient.read()

    uClient.close()
    page_soup = soup(page_html, "html.parser")

    containers = page_soup.findAll(True,
                                   {"class": ['search-item', 'regular-ad']})

    return containers
Code example #4
def main():
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    initial_result = len(href_tags)

    time.sleep(10)

    # Re-fetch the page after waiting; re-parsing the old HTML would never
    # show a change.
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    new_result = len(href_tags)

    print(new_result -
          initial_result)  # tells you how many links were added or removed
    if new_result == initial_result:
        print("no change")
    else:
        # implementation of twilio: text a notification when the link count changes
        client = twilio.rest.Client('AC05a19e314e2e0a36da9d8966556c359c',
                                    '8cf175a0d0d3587e9a8ceece40bfa2c6')

        client.messages.create(
            body="Google just changed something on their homepage",
            to=my_phone_number,
            from_=twilio_phone_number)
Code example #5
def my_function2():
    #list of URLs to scrape from
    my_url = [
        'https://magicseaweed.com/Narragansett-Beach-Surf-Report/1103/',
        'https://magicseaweed.com/2nd-Beach-Sachuest-Beach-Surf-Report/846/',
        'https://magicseaweed.com/Nahant-Surf-Report/1091/',
        'https://magicseaweed.com/Nantasket-Beach-Surf-Report/371/',
        'https://magicseaweed.com/Scituate-Surf-Report/372/',
        'https://magicseaweed.com/Cape-Cod-Surf-Report/373/',
        'https://magicseaweed.com/The-Wall-Surf-Report/369/',
        'https://magicseaweed.com/Green-Harbor-Surf-Report/864/',
        'https://magicseaweed.com/Cape-Ann-Surf-Report/370/',
        'https://magicseaweed.com/27th-Ave-North-Myrtle-Surf-Report/2152/',
        'https://magicseaweed.com/Cocoa-Beach-Surf-Report/350/'
    ]
    # opening up the connection, grabbing each page

    conn = sqlite3.connect('SurfSend.db')
    cursor = conn.cursor()
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS WindInfo(ID INTEGER PRIMARY KEY, WindMPH TEXT)'
    )

    #iterate over list of URLS
    for url in my_url:
        #initiating python's ability to parse URL
        uClient = uReq(url)
        # this will offload our content into a variable
        page_html = uClient.read()
        # closes our client
        uClient.close()

        # html parsing
        #beautifulsoup magic
        page_soup = soup(page_html, "html.parser")
        #variable for soon to be parsed page
        wind = page_soup.findAll(
            'td',
            class_=re.compile("text-center table-forecast-wind td-nowrap"))

        # iterates over parsed HTML
        for w in wind:
            # wind speed text
            wi = w.find('span', class_='stacked-text text-right')
            winb = wi.text.strip()

            conn = sqlite3.connect('SurfSend.db')
            cursor = conn.cursor()
            # cursor.execute("INSERT INTO WindInfo VALUES (?)", (winb,))
            cursor.execute("INSERT INTO WindInfo (WindMPH) VALUES (?)",
                           (winb, ))
            conn.commit()
            cursor.close()
            conn.close()
Code example #6
File: getCraigslist.py Project: micduan/FindASublet
def generateListings(bedrooms, furnished):

    my_other_url = 'https://toronto.craigslist.ca/search/sub'

    uClient = uReq(my_other_url)

    page_html = uClient.read()

    uClient.close()
    page_soup = soup(page_html, "html.parser")

    containers = page_soup.findAll(True, {"class": ['result-row']})
    return containers
Code example #7
def get_src_of_rand_video():
    _url = base_url + random.choice(show_titles)
    uClient = uReq(_url)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()
    episodes = page_soup.select("div.cat-eps a.sonra")
    link = random.choice(episodes).get('href')
    #get video source
    driver.get(link)
    iframes = driver.find_elements_by_tag_name('iframe')
    driver.switch_to.frame(1)
    src = driver.find_element_by_id('video-js').find_element_by_tag_name(
        'source').get_attribute('src')
    return src
Code example #8
def scrapeLoop():

    # open connection to twitter
    uClient = uReq(twit_url)

    # offload content from page into a variable
    # check to be sure that the page will only read what is immediately loaded and will not refresh at bottom of page (initially) [x]
    page_html = uClient.read()

    # close the client
    uClient.close()

    # use beautiful soup html parser --> allows us to parse through the elements of a page
    page_soup = soup(page_html, "html.parser")

    # now that we have the page information, we can find all the parts that contain text from tweets
    containers = page_soup.findAll("div", {"class": "js-tweet-text-container"})

    # iterate through each container and pull information
    for container in containers:
        # returns the text within the tweet
        tweet_text = container.p.text
        # count each character in the extracted text

        # remove the pic.twitter text
        # start by finding the index of pic.twitter
        pic_sub = "pic.twitter"
        index = tweet_text.find(pic_sub)

        if index > -1:
            count = index
            f.write(str(count) + "\n")

        else:
            # no picture link, so count every character of the tweet text
            count = len(tweet_text)

            f.write(str(count) + "\n")

    global loops
    loops = loops - 1
    # recursively call the timer.
    # That way we can collect data for a certain number of intervals.
    if loops > 0:
        timer = Timer(600.0, scrapeLoop)
        timer.start()
    else:
        # close the output file so the collected counts are flushed to disk
        f.close()
Code example #9
def ScrapePrice(data):
    url = data
    print(url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    print("start flipkart")
    FinalResult = []
    p = page_soup.find("span", {"id": "best_price"})
    FinalResult.append(p.text)
    seller = page_soup.find("span", {"class": "btn-span"})
    FinalResult.append(seller.text)
    print("found result")
    return FinalResult
Code example #10
def main():
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    initial_result = len(href_tags)

    time.sleep(10)

    # Re-fetch the page after waiting so a change can actually be detected
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    new_result = len(href_tags)

    if new_result == initial_result:
        print("no change")
    else:
        print("change")
Code example #11
def school_fun(new_url):
    filename = "schools_details.csv"
    f = open(filename, "a")
    # make sure driver exists even if the page fetch below fails
    driver = None

    try:
        my_url = new_url

        # opening up the connection, grabbing the page
        newClient = uReq(my_url)
        new_page_html = newClient.read()
        newClient.close()

        # html parsing
        new_page_soup = soup(new_page_html, "html.parser")

        # also load the page in a Firefox browser via Selenium
        driver = webdriver.Firefox()
        driver.get(new_url)

        # doc = driver.page_source
        new_container = new_page_soup.findAll('font', {"face": "times new roman, serif"})

        # getting school_name and school_email_id
        school_name = new_container[1].text.strip()
        email_id = new_container[9].text.strip()

        school_email_id = re.findall(r'[\w\.-]+@[\w\.-]+', email_id)

        # you can comment off the following four print commands if not needed
        print "--------------------------------------------------------------------------------------------------------"
        print "School_Name:     " + school_name
        print "School-Email_id: " + school_email_id[0].strip()
        print "--------------------------------------------------------------------------------------------------------"

        f.write(school_name + "," + school_email_id[0].strip() + "\n")
        # f.close()

    except:
        print "--------------------------------------------------------------------------------------------------------"
        print "DATA CANNOT BE RETRIEVED"
        print "--------------------------------------------------------------------------------------------------------"

    f.close()
    if driver is not None:
        driver.quit()
Code example #12
File: parser.py Project: vikas456/uteats
def main():
    time = str(datetime.now().time().hour)
    day = datetime.today().weekday()

    dining_url = 'http://housing.utexas.edu/dining/hours'
    uClient = uReq(dining_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table",{"class": "tablesaw tablesaw-stack"})
    openPlaces = []
    times = []
    places = []
    data = []

    for container in containers:
        day_values = container.tbody.findAll("tr")
        place = ""
        for val in day_values:
            if val.th is not None: # Ex. J2 Dining
                place = val.th
                places.append(place.text.strip())
            day_info = val.findAll("td")
            days = []
            isTime = 0
            timeLeft = 0
            timesRange = ""
            dayRange = ""
            for temp in day_info:
                text = temp.text.strip()
                if (len(text) != 0): # avoid spaces under days
                    if (text[0].isdigit() or text == "Closed" or text[0] == "N"): # time ranges
                        timesRange = text
                        isTime = checkTime(text, time)
                    else:
                        dayRange = text
                        days = checkDay(text)
            if (len(days) > 0 and -1 not in days):
                if (day in days and isTime == 1):
                    data.append({"name": place.text.strip()})
    sac(time, data)
    union(time, data)
    print data
    return render_template('index.html', data=data)
Code example #13
def PriceNameSuggestion(name):
    print("entered Price name suggestion fn")
    try:
        my_url = 'http://scandid.in/search?q=' + name + '&type=products'
        print(my_url)
        uClient = uReq(my_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        data = page_soup.findAll("a", {"class": "ellipsis multiline"})[0:8]
        name = []
        link = []
        for i in data:
            name.append(i.text)
            link.append('http://scandid.in/' + i['href'])
        print("name is ", name)
        print("link is ", link)
        return (name, link)
    except:
        print("Error opening the URL")
Code example #14
def scrapedata(data):
    print("entered scrapedata fn")
    try:
        my_url = 'https://www.goodguide.com/products?filter=' + data
        print(my_url)
        uClient = uReq(my_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        data = page_soup.findAll("a", {"class": "entity-link"})[0:8]

        #print(data)
        name = []
        link = []
        for i in data:
            name.append(i.get('title'))
            link.append(i.get('href'))
        return (name, link)
    except:
        print("Error opening the URL")
Code example #15
File: scrape.py Project: mayank-ji/Choose-Smart
def flipkart(d):
    print(d)
    url = 'http://scandid.in/search?q=' + d + '&type=products'
    print(url)

    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    print("start flipkart")
    containers = page_soup.findAll("a", {"class": "ellipsis multiline"})[0:10]
    print("found result")
    l = []
    for i in containers:
        # collect the product links so the function returns them
        # rather than an empty list
        l.append(i['href'])
        print(i['href'])
    print("END")
    return l
Code example #16
File: parser.py Project: vikas456/uteats
def sac(currTime, data):
    sacRestaurants = ["Chick-fil-A", "P.O.D.", "Starbucks", "Taco Cabana", "Zen"]
    dayIndex = datetime.today().weekday()
    # dayIndex = getDayIndex(day)
    dining_url = 'https://universityunions.utexas.edu/sac-hours/fall-2019'
    uClient = uReq(dining_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table",{"class": "tablesaw tablesaw-stack"})
    locations = containers[2].tbody.findAll("tr")
    for location in locations:
        times = location.findAll("td")
        name = times[0].text.strip()
        if (name[:6] == "P.O.D."):
            name = "P.O.D."
        if (name in sacRestaurants):
            if (checkSacTime(times[dayIndex].text.strip(), currTime) == 1):
                data.append({"name": name})
Code example #17
def ScrapeResult(data):
    print("entered ScrapeResult fn")
    try:
        my_url = 'https://www.goodguide.com' + data
        print(my_url)
        uClient = uReq(my_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()

        title = page_soup.find("h1")
        print(title.text)

        imgParent = page_soup.find(
            "p", {"class": "text-center product-highlight-image"})
        img = imgParent.find("img")
        i = img['src']

        scoreParent = page_soup.find("p", {"class": "ring-value number"})
        score = scoreParent.find("a")
        print(score.text)

        contentParent = page_soup.find(
            "p", {"class": "rating-explained-ingredient-count number high"})
        HighHazardConcern = contentParent.find("a")
        print(HighHazardConcern.text)

        contentParent2 = page_soup.find(
            "p", {"class": "rating-explained-ingredient-count number medium"})
        MediumHazardConcern = contentParent2.find("a")
        print(MediumHazardConcern.text)

        contentParent3 = page_soup.find(
            "p", {"class": "rating-explained-ingredient-count number low"})
        LowHazardConcern = contentParent3.find("a")
        print(LowHazardConcern.text)

        print("END")
        return (title.text, i, score.text, HighHazardConcern.text,
                MediumHazardConcern.text, LowHazardConcern.text)
    except:
        print("Error opening the URL: Error in scrape result")
Code example #18
File: parser.py Project: vikas456/uteats
def union(currTime, data):
    unionRestaurants = ["Starbucks", "Chick-Fil-A", "P.O.D.", "Quiznos", "MoZZo", "Panda Express", "Field of Greens Market Place", "Wendy's @ Jester", "Java City @ PCL"]
    dayIndex = datetime.today().weekday()
    # print day
    # dayIndex = getDayIndex(day)
    dining_url = 'https://universityunions.utexas.edu/union-hours/fall-2019'
    uClient = uReq(dining_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table",{"class": "tablesaw tablesaw-stack"})
    locations = containers[0].tbody.findAll("tr")
    # print dayIndex
    for location in locations:
        times = location.findAll("td")
        name = times[0].text.strip()
        if (name[:3] == "Prov"):
            name = "P.O.D."
        if (name in unionRestaurants):
            # print name
            if (checkUnionTime(times[dayIndex].text.strip(), currTime) == 0):
                data.append({"name": name})
Code example #19
    def get_park_names(self):
        wikipedia_url = "https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States"
        wikipedia_client = uReq(wikipedia_url)
        wikipedia_html = wikipedia_client.read()
        wikipedia_client.close()
        wikipedia_soup = soup(wikipedia_html, "html.parser")

        park_rows = wikipedia_soup.table.find_all('tr')
        park_rows.pop(0)
        num_parks = len(park_rows)
        parks = []
        for row in park_rows:
            parks.append(row.contents[1].contents[0].contents[0].encode(
                'ascii', 'ignore') + " National Park")
        fo = open("national_parks", "wb")
        for park in parks:
            fo.write(park)
            fo.write("\n")

        fo.close()

        return parks
Code example #20
def get_hh_info(hh_url):
    all_hh_info = []
    uClient = uReq(hh_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'html.parser')
    pageinfo = page_soup.findAll("ul", {"class": "listInline mbn pdpFeatureList"})
    houseinfo = pageinfo[0].text.split('\n\n\n')
    text_content = [info.strip() for info in houseinfo if info != '']
    items = [item.split('\n') for item in text_content]
    strings = [string.split(',') for caption in items for string in caption]
    strings_ = [''.join(eles) if len(eles) > 1 else eles for eles in strings]
    # flatten() is assumed to be a helper available in the original module
    house_details = flatten(strings_)
    price = page_soup.select('span.h2.typeEmphasize')[0].text
    price1 = price.strip()
    address = page_soup.select('span.h2.typeEmphasize.pan.man.defaultLineHeight')[0].text
    address1 = address.strip()
    neighbourhood = page_soup.select('span.h6.typeWeightNormal.pts.typeLowlight.miniHidden.xxsHidden')[0].text
    neighbourhood1 = neighbourhood.strip()
    last_sold_date1 = page_soup.select('td.noWrap')[0].text
    his_price = page_soup.select('td.noWrap')[2].text
    his_price1 = his_price.strip()
    others = page_soup.select('ul.listInlineBulleted.man.pts')[0].text
    others = others.split('\n')
    app_info1 = [other for other in others if other != '']
    all_hh_info.append(price1)
    all_hh_info.append(address1)
    all_hh_info.append(neighbourhood1)
    all_hh_info.append(last_sold_date1)
    all_hh_info.append(his_price1)
    all_hh_info.append(app_info1)
    all_hh_info.append(house_details)
    columns = ['Current_Price', 'Address', 'Neighbourhood', 'Last_sold_date',
               'Last_Price', 'Other_info', 'House_details']
    # build a one-row DataFrame holding everything scraped for this listing
    hh_df = pd.DataFrame([all_hh_info], columns=columns)
    return hh_df
Code example #21
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

#opening up connection, grabbing the webpage
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#html parsing
page_soup = soup(page_html, "html.parser")

#grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

#creating a csv file and setting it to write mode
filename = "products.csv"
f = open(filename, "w")

headers = "brand,product_name,shipping\n"
f.write(headers)

for container in containers:
    brand = container.div.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text

    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()

    # write one CSV row per product; commas inside the product name would
    # break the simple CSV format, so replace them
    f.write(brand + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()
Code example #22
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = []

print "Enter wikia URL to get information:"
while True:
    urls = raw_input()
    if urls == "exit":
        break
    else:
        my_url.append(urls)

#my_url[0] = 'http://vampirediaries.wikia.com/wiki/Hybrid'
#my_url[1] = 'http://vampirediaries.wikia.com/wiki/Vampire'

#opening a connection and geting the html contents
for url in my_url:
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "pi-item"})
    figures = page_soup.findAll("h2", {"class": "pi-item"})

    #for container,figure in zip(containers,figures):
    for container in containers:
        #print figure.text
        print container.text
        #print container
        print "\n"
        # print container.a
        # print "\n"
        # print container.li
Code example #23
from lxml import html
import requests
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

#Webscraper for xe.com

#These strings will be used to help build the URL
from_currency = "USD"  #This program will calculate the exchange rate of ONE currency, which you specify, with respect to a list of OTHER currencies, which you can add to
to_currency_list = ["CAD", "EUR", "GBP"]

#These strings will be used to help build the URL
view_list = ["12h", "1D", "1W", "1M", "1Y", "2Y", "5Y",
             "10Y"]  #List of the various time frames that XE provides

for toCurr in to_currency_list:
    for view in view_list:
        my_url = "https://www.xe.com/currencycharts/?from=" + from_currency + "&to=" + toCurr + "&view=" + view
        #page = requests.get(my_url)        #Open webpage
        #tree = html.fromstring(page.content)
        #low = tree.xpath('//*[@id="rates_detail_desc"]/strong[3]')
        #high = tree.xpath('//*[@id="rates_detail_desc"]/strong[4]')
        #print(low)

        uClient = uReq(my_url)  #Open webpage
        page_html = uClient.read()  #Read webpage
        uClient.close()  #Close webpage
        page_soup = soup(page_html, "html.parser")  #html parsing
        rates_detail = page_soup.find("div", {"id": "rates_detail_desc"})
        inner_text = rates_detail.text
        #rates_detail = page_soup.find("table", {"id": "crLive"})
        #rate = rates_detail.tbody
        print(inner_text)
Code example #24
#!/usr/bin/env python
# Developed by John Melody Mel
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = "https://www.ubuntu.com"
# grabbing the page:
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# grab each download container
containers = page_soup.findAll("div", {"class": "Download Ubuntu"})
# len(containers)
# containers[0]
container = containers[0]
# for containers in container
# note: as written, this looks up an attribute literally named "Ubuntu 18.04.1 LTS" on the anchor tag
download = container.div.div.a["Ubuntu 18.04.1 LTS"]
Code example #25
"""
To extract relevant urls
"""
from bs4 import BeautifulSoup as soup
from urllib2 import Request
from urllib2 import urlopen as uReq

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'
}
#Change URL here
req = Request(
    url='https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Bangalore',
    headers=headers)
##
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
table_containers = page_soup.find('body').find('div', {
    'class': 'mw-parser-output'
}).findAll('table')
d = {}
for tcont in table_containers:
    d.update({tcont: tcont.findAll('tr')})
locale = ''
container = ''
for table in d:
    for container in d[table]:
        if container.td == None:
            continue
Code example #26
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

myUrl = "https://www.webopedia.com/Top_Category.asp"
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()

parsedPage = soup(page_html, "html.parser")
categories = parsedPage.findAll("div", {"class": "bullet_list"})

file = "terms.csv"
f = open(file, "w")

headers = "main_category_id, main_category_name, subCategory_id, subCategory_name, term_id, term_name, term_difinition\n"
f.write(headers)

for index, category in enumerate(categories):
    main_category_id = index
    main_category_name = str(category.div.span.a["href"])
    subCategories1 = category.findAll("li", {"class": "listing-item"})
    subCategories2 = category.findAll("li", {"class": "listing-item-hidden"})
    subCategories = subCategories1 + subCategories2
    for indx, subCategory in enumerate(subCategories):
        subCategory_id = indx
        subCategory_name = str(subCategory.a.text)
        sublink = str(subCategory.a["href"])
        link = "https://www.webopedia.com" + sublink
        termsClient = uReq(link)
        terms_html = termsClient.read()
        termsClient.close()
        parsedTermsPage = soup(terms_html, "html.parser")
Code example #27
File: app.py Project: nafnafnaf/facebook-weather
def scrape():
    url = 'http://www.meteokav.gr/weather/'
    client = uReq(url)
    page = client.read()
    client.close()
    page_soup = soup(page, "html.parser")
    values_list = [
        ["Ενημέρωση απο το www.meteokav.gr:"],
        [
            "Θερμοκρασία:",
            page_soup.find("span", {
                "id": "ajaxtemp"
            }).text.strip()[0:6]
        ],
        [
            page_soup.find_all("strong")[19].text.strip(),
            page_soup.find("span", {
                "id": "ajaxhumidity"
            }).text.strip() + "%"
        ],
        [
            "Αίσθηση σαν: ",
            page_soup.find("span", {
                "id": "ajaxfeelslike"
            }).text.strip()
        ],
        ["Διαφορά 24ώρου: ",
         page_soup.find_all("strong")[0].text.strip()],
        ["Διαφορά ώρας: ",
         page_soup.find_all("strong")[1].text.strip()],
        [
            "Ανεμος: " + page_soup.find("span", {
                "id": "ajaxwinddir"
            }).text.strip() + "@" + page_soup.find("span", {
                "id": "ajaxbeaufortnum"
            }).text.strip() + " Bft"
        ],
        [
            page_soup.find_all("strong")[21].text.strip() + " " +
            page_soup.find("span", {
                "id": "ajaxbaro"
            }).text.strip() + " " + page_soup.find("span", {
                "id": "ajaxbarotrendtext"
            }).text.strip()
        ],
        [
            "Βροχή Σήμερα: " + page_soup.find("span", {
                "id": "ajaxrain"
            }).text.strip()
        ],
        #[page_soup.find("td", {"colspan":"2"}).find_all("tr")[1].find_all("td")[0].text.strip() +
        [
            "Μέγιστη Σήμερα: " + page_soup.find("table", {
                "class": "data1"
            }).find_all("tr")[1].find_all("td")[1].text.strip()[0:6] + "@" +
            page_soup.find("table", {
                "class": "data1"
            }).find_all("tr")[1].find_all("td")[1].text.strip()[-6:]
        ],
        #    [page_soup.find("td", {"colspan":"2"}).find_all("tr")[1].find_all("td")[0].text.strip() +
        [
            "Μέγιστη Χθες: " + page_soup.find("table", {
                "class": "data1"
            }).find_all("tr")[1].find_all("td")[2].text.strip()[0:6] + "@" +
            page_soup.find("table", {
                "class": "data1"
            }).find_all("tr")[1].find_all("td")[2].text.strip()[-6:]
        ],
        [
            "Ελάχιστη Σήμερα: " + page_soup.find("table", {
                "class": "data1"
            }).find_all("tr")[2].find_all("td")[1].text.strip()[0:4] + "@" +
            page_soup.find("table", {
                "class": "data1"
            }).find_all("tr")[2].find_all("td")[1].text.strip()[-5:]
        ],
        [
            "Ελάχιστη Χθες: " + page_soup.find("table", {
                "class": "data1"
            }).find_all("td")[5].text.strip()[0:4] + "@" +
            page_soup.find("table", {
                "class": "data1"
            }).find_all("td")[5].text.strip()[-5:]
        ],
        [
            page_soup.find_all("strong")[20].text.strip() + " " +
            page_soup.find("span", {
                "id": "ajaxdew"
            }).text.strip()
        ],
        [
            "Μέγιστη " + page_soup.find_all("strong")[19].text.strip() + " " +
            page_soup.find("td", {
                "rowspan": "3"
            }).find_all("tr")[1].find_all("td")[1].text.strip()[0:3] + "@" +
            page_soup.find("td", {
                "rowspan": "3"
            }).find_all("tr")[1].find_all("td")[1].text.strip()[-5:]
        ],
        [
            "Μέγιστη πιεση: " + page_soup.find("td", {
                "rowspan": "3"
            }).find_all("tr")[6].find_all("td")[1].text.strip()[0:10] + "@" +
            page_soup.find("td", {
                "rowspan": "3"
            }).find_all("tr")[6].find_all("td")[1].text.strip()[-5:]
        ],
        [
            "Ελάχιστη πιεση: " + page_soup.find("td", {
                "rowspan": "3"
            }).find_all("tr")[7].find_all("td")[1].text.strip()[0:10] + "@" +
            page_soup.find("td", {
                "rowspan": "3"
            }).find_all("tr")[7].find_all("td")[1].text.strip()[-5:]
        ]
    ]
    # y = values_list
    #uni_values = unicodedata.normalize('NFKD', y).encode('ascii', 'ignore')
    return tabulate(values_list)
Code example #28
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

print("web scrap")

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=video%20cards'

uClient = uReq(my_url)  #opening connection
page_html = uClient.read()  #grabbing page
uClient.close()  #close connection

page_soup = soup(page_html,
                 "html.parser")  #grabs entire html file into memory location

# we need to grab the list of video cards
# this was found how to do by going to the page
# on chrome and then using the "inspect" ability,
# figure out the individual search results is in a div
# with the class name item-container
# use class below, we can also use id if we wanted to

containers = page_soup.findAll("div", {"class", "item-container"})

#putting it into a csv file
filename = "graphics_card.csv"  #name of file
f = open(filename, "w")  #open file and declare access type
headers = "brand, product_name, shippping \n"  # the headers for each column

f.write(headers)
Code example #29
for i in xrange(1,len(data)):
    imdb_score.append(float(data["Column 27"][i]))
mean = np.mean(imdb_score)
std_Dev=np.std(imdb_score)
print ("Population mean:"+str(mean))
print ("Population Standard deviation:"+str(std_Dev))
print ("Population length:"+str(len(data)))

######################### Script to get data for 2016 movies using web scraping #################

import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

url="http://www.imdb.com/list/ls070941500/"
page =uReq(url)
page_html=page.read()
page.close()

page_soup=soup(page_html,"lxml")
# print page_soup
movie=page_soup.findAll("div",{"class":"info"})
file = open("imdb_2016.txt","w")
imdb_score2016=[]
for x in xrange(0,len(movie)):
	movie_name=movie[x].b.a.get_text()
	imdb_score=movie[x].div.findAll("span",{"class":"value"})[0].get_text()	
	imdb_score2016.append(float(imdb_score))
	file.write(str(x)+". \t"+movie_name+"\t\t\t\t\t\t\t\t\t"+ imdb_score+"\n")
file.close()
Code example #30
File: scrape.py Project: mittalayushi/Scraper
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup
import urllib
my_url = 'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major'

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")

containers = page_soup.findAll("ul", {"id": "ullist"})
listpoints = containers[0].findAll("li")
for points in listpoints:
    links = points.a["href"]
    bas = "https://www.scholarships.com"
    finlink = bas + links
    u1Client = uReq(finlink)
    page1_html = u1Client.read()
    u1Client.close()
    page1_soup = soup(page1_html, "html.parser")
    cont = page1_soup.findAll("td")
    for pts in cont:
        lks = pts.a["href"]
        finlks = bas + lks
        '''u2Client=uReq(finlks)
		page2_html=u2Client.read()
		u2Client.close()
		page2_soup = soup(page2_html,"html.parser")'''
        u2client = urllib.urlopen(finlks.encode('utf-8')).read()
        page2_soup = soup(u2client, "html.parser")