Code example #1
File: scraper.py Project: aeggermont/cs171
def process_page():

    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):

        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
    
        try:

            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            #print match.group(1)
            # titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))

        except Exception, e:
            pass

        try:
            titleCatalog[domIndex].addRank( str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass
Code example #3
File: scraper.py Project: netprofm/Project
    def extract_percentages(dom):
        file_url = URL(TARGET_URL)
        file_dom = DOM(file_url.download())

        percentage_list = []
        if file_dom.by_class('percentage'):
            for item in file_dom.by_class('percentage'):
                percentage_list.append(item.content.encode('utf-8'))
            return percentage_list[0]
        else:
            return "nodata"
Code example #4
def scrape_api(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/Acnt2012/2011Base_Co.aspx?cYear=&cSelect=02'
	url = 'http://dq.cde.ca.gov/dataquest/Acnt2012/2011Base_Co.aspx?cYear=&cSelect=' + county_num
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	#grab the value for each district and sum them up to obtain the county total value
	districts = dom.by_class('medium\+_left')

	num_students_county_total = 0
	api_county_total = 0
	for n in districts:
		#grab and sum number of students
		district_num_students = n.parent.by_class("medium_center")[0].content
		
		if not "&nbsp" in district_num_students:
			#cast to int
			district_num_students = int(district_num_students.replace(',',''))
			num_students_county_total += district_num_students
		
			#grab and sum API for each district
			district_api = n.parent.by_class("medium_center")[1].content
			#remove any asterisks
			district_api = district_api.replace('*','')		
			#cast to int
			district_api = int(district_api.replace(',',''))
		
			#add the API weighted by the number of students in the current district
			api_county_total += district_api*district_num_students
		
	#divide the weighted sum of APIs by the total number of students in the county
	average_api = api_county_total/num_students_county_total

		
		
	API_num_students = dom.by_class('medium\+_left')[0].parent.by_class("medium_center")[0].content

	#use county number as a placeholder for the county name for now, as the county name is not easily scrapable	
	county = county_num

	

	# write all the collected data to a new row of the output file
	writer.writerow([str(county), str(num_students_county_total),str(average_api)])
Code example #5
def get_by_year(year):

    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    
    dictAll = {}
    
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])

        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name),id))
            
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner),winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] =  winnersAndNominees
    return dictAll
Code example #6
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    # Grab the html from the url and turn it into a DOM
    html = url.download()
    dom = DOM(html)
    # Every url starts with this root; the root is needed for the absolute path
    root = 'http://www.imdb.com'
    
    # The url of each movie sits in a td tag with class titleColumn
    for movie in dom.by_class("titleColumn"):
        # Make a DOM of the content between the td tags so we can search inside it
        movieinfo = DOM(movie.content)
        # The relative path of each movie is the value of 'href' of the first 'a' tag
        # Concatenate the root and the relative path into the absolute path and append it to movie_urls
        movie_urls.append(root + movieinfo.by_tag("a")[0].attrs.get("href",""))
        
                    
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
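
The docstring above pins down the contract: a pattern.web.URL pointing at the index page goes in, a list of absolute movie URLs comes out. A minimal usage sketch follows; it assumes pattern.web is installed, and TOP_250_URL is a hypothetical constant that is not part of the excerpt.

from pattern.web import URL

# Hypothetical driver for scrape_top_250; the chart URL below is an assumption.
TOP_250_URL = URL("http://www.imdb.com/chart/top")
top_250_urls = scrape_top_250(TOP_250_URL)
print len(top_250_urls)   # expected: 250
print top_250_urls[0]     # an absolute link, e.g. http://www.imdb.com/title/...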
Code example #7
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    print(url)

    url_html = url.download(cached=True)
    url_dom = DOM(url_html)

    movie_urls = []

    for movie in url_dom.by_class("titleColumn"):
        # looks for the element containing the link.
        movie_url = movie.by_tag("a")[0]

        # Gets a dictionary of the elements' attributes.
        movie_url = movie_url.attrs['href']

        # Splits the string at the '?'.
        movie_url = movie_url.split('?')

        # Forms full url and appends to the list of movie urls
        movie_url = "http://www.imdb.com" + movie_url[0]
        movie_urls.append(movie_url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #8
def scrape_starrtest(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	
	#sciend_num = dom.by_class("rm")[4].content
	scicst_num = dom.by_class("rm")[3].content
	math_num = dom.by_class("rm")[2].content
	hist_num = dom.by_class("rm")[1].content
	ela_num = dom.by_class("rm")[0].content
	
	#sciend_percent = dom.by_class("rs")[4].content[:5]
	scicst_percent = dom.by_class("rs")[3].content[:5]
	math_percent = dom.by_class("rs")[2].content[:5]
	hist_percent = dom.by_class("rs")[1].content[:5]
	ela_percent = dom.by_class("rs")[0].content[:5]
	
	county = dom.by_tag("h2")[0].content
	
	
	# write all the collected data to a new row of the output file
	writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])
Code example #9
File: scraper-dev.py Project: aeggermont/cs171
def process_page():

    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):

        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        #print theTitle
        #titleCatalog.append(Title(title.by_tag("a")[0].content))
        titleCatalog.append(Title(theTitle))
    
        try:
            # print dom.by_class("runtime")[domIndex].content
            titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
        except:
            pass

        try:
            # print dom.by_class("value")[domIndex].content
            titleCatalog[domIndex].addRank( str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except:
            pass

        try:
            for genre in dom.by_class("genre")[domIndex].by_tag("a"):
                # print genre.content
                titleCatalog[domIndex].addGenre( str(genre.content).encode('ascii', 'replace'))
        except:
            pass

        try:
            for credit in dom.by_class("credit")[domIndex].by_tag("a"):
                # print credit.content
                titleCatalog[domIndex].addActors( str(credit.content).encode('ascii', 'replace'))
        except:
            pass

        domIndex += 1
Code example #10
File: scraper.py Project: Lesliedao/DataProcessing
def scrape_page(url):
    html = url.download()
    dom = DOM(html)


    table = DOM(dom.by_class("wikitable")[0].content)
    countrylist = table.by_tag("tr")[1:]

    pointsdict = {}
    for c in countrylist:
        infodict = {}

        infodict["name"] = c.by_tag("a")[-1].content.encode("utf-8")
        infodict["Overall"] = int(c.by_tag("td")[2].content.encode("utf-8"))
        infodict["Female"] = int(c.by_tag("td")[4].content.encode("utf-8"))
        infodict["Male"] = int(c.by_tag("td")[6].content.encode("utf-8"))

        if infodict["Overall"] > 80:
            infodict["fillKey"] = "HIGH"
        elif infodict["Overall"] > 70:
            infodict["fillKey"] = "ABVAVG"
        elif infodict["Overall"] > 60:
            infodict["fillKey"] = "AVG"
        elif infodict["Overall"] > 50:
            infodict["fillKey"] = "BELAVG"
        else:
            infodict["fillKey"] = "LOW"

        code = ""
        for countryCode in countryCodes:
            if infodict["name"] == countryCode[2]:
                code = countryCode[1]
                break

        # If no code was found, skip this country
        if code == "":
            pass
        # Otherwise use the code as key, with that country's infodict as the value
        else:
            pointsdict[code] = infodict

    json.dump(pointsdict, open("lifeexpectancy.json", "wb"))
Code example #11
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download())

    for e in dom.by_class('titleColumn'):
        for href in e('a')[:1]:
            movie_urls.append("http://www.imdb.com" + href.attrs["href"])

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #12
# and stores it into a JSON object

import re, json, io
from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

chindict = {}
radicals = []

url = URL("http://www.mdbg.net/chindict/chindict.php?page=radicals")
dom = DOM(url.download(cached=True))

radElements = dom('a.rad')

for rad in radElements:
    radicals.append(rad.content)
    words = []
    wordUrl = URL("http://www.mdbg.net/chindict/" + rad.attrs["href"])
    wordDom = DOM(wordUrl.download(cached=True))
    wordTable = wordDom.by_class("results")

    for word in wordTable[0].by_tag("span"):
        words.append(word.content)

    chindict[rad.content] = words

with open('chindict.js', 'w') as outfile:
    json.dump(radicals, outfile)
    json.dump(chindict, outfile)

Code example #13
# Create csv and add a header row	
output = open("races_data.csv", "wb")
writer = csv.writer(output)
writer.writerow(["Race","Year", "POS", "Num", "rider ID", "Rider URL", "Rider01", "rider02" , "Machine", "Time", "Speed" ])				

# Set up base URL and main URL. ERA 5 = 1991 - 2012
eras = ["1","2","3","4","5"]
for era in eras:
	print "Era:" + era
	url = URL("http://www.iomtt.com/TT-Database/Events.aspx?meet_code=TT2012&era=" + era)
	text_url = "http://www.iomtt.com"

	# Get a hold of the dom and then grab each year's URL, which is embedded in li tags.
	dom = DOM(url.download(cached=True))
	years = dom.by_class("ttDatabasePipeSeparator floatleft")[0].by_tag("li")

	# Iterate over each year
	for year in years:
		#Print commands are useful to monitor progress.
		print("year:")
		print year.by_tag("a")[0].attributes.get('href','')
		
		#Find the current year's URL and download its DOM.
		new_url = URL(text_url + year.by_tag("a")[0].attributes.get('href',''))
		year_url = URL(new_url)
		year_dom = DOM(year_url.download(cached=True))
		#races = year_dom.by_class("panelinner clearfix")[0].by_tag("ul")[0].by_tag("li")
		races_div = races = year_dom.by_class("ttDatabase")[0].by_class("panelinner")[1].by_tag("ul")
		if len(races_div) > 1:
			races = races_div[0].by_tag("li")
Code example #14
File: 12-dom.py Project: Abhishek-1/temp
######################################## Test Techcrunch - https://techcrunch.com/ ####################################

print("#" * 40, "Test Techcrunch", "#" * 40)
url = URL("https://techcrunch.com/startups/")
dom = DOM(url.download(cached=True))

for e in dom.by_tag("header.post-block__header")[:5]:
    for a in e.by_tag("h2.post-block__title")[:1]:
        print(plaintext(a.content))
        for h in a.by_tag("a.post-block__title__link")[:1]:
            print(h.attrs["href"])
        print("")
print("\n")

header = dom.by_class("river__title")[0]
print(header.content)
print("\n")

title_image = dom.by_attr(name="msapplication-TileImage")[0]
print(title_image.attrs['content'])
print("\n")

url = URL("https://techcrunch.com")
dom = DOM(url.download(cached=True))
for k in dom.by_class("post-block__title__link"):
    print(k.content.strip())
    print("")

print("\n")
Code example #15
File: spiders.py Project: Carlosmr/WhooshSearcher
 def getTitle(self, link):
     html = URL(link).download()
     body = DOM(html).body
     title = body.by_class("title-news")[0].content.strip()
     return title
Code example #16
File: 2014CourseInstructor.py Project: beebha/Thesis
writer.writerow(["CourseID", "InstructorCode", "InstructorName", "InstructorURL", "InstructorEmail"])

date_to_write = []

all_urls = ["http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_1%2C9&SEARCH_TERM=both",
            "http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_2%2C9&SEARCH_TERM=both",
            "http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_5%2C6&SEARCH_TERM=both"]

for ind_url in all_urls:

    # DOM object for each concentration
    url = URL(ind_url)
    dom = DOM(url.download(cached=True))

    # get main content containing all the courses
    main_content = dom.by_class("csearchresults")

    # get all the rows that have the course data
    all_data_rows = main_content[0].by_tag("tr")

    # loop through each row
    for ind_data_row in all_data_rows:

        if ind_data_row.attributes.get("class", "") == "" or ind_data_row.attributes.get("class", "") == "odd":

            all_columns = ind_data_row.by_tag("td")

            # ensure course is not cancelled
            if len(all_columns) > 1 and plaintext(str(all_columns[4])).find("Canceled") == -1:

                course_id = ""
Code example #17
File: scrape.py Project: dicai/datavis
def get_info(baseurl, out_filename, npages=200):

    output = open(out_filename, "w")
    w = writer.UnicodeWriter(output)
    # TODO: fix this header
    w.writerow(
        [
            "Title",
            "Rating",
            "Calories (kcal)",
            "Cholesterol (mg)",
            "Fat (g)",
            "Protein (g)",
            "Fiber (g)",
            "Sodium (mg)",
            "Cook Time",
            "Ingredients",
            "Full Ingredients",
        ]
    )

    for page in range(1, npages):
        try:
            url = URL(baseurl + "?Page=%d" % page)
            dom = DOM(url.download(cached=True))
            links = dom.by_class("rectitlediv")

            # goes through the 20 recipes on a given page
            for index in range(len(links)):
                # print index
                # get the link name
                title = links[index].content.split("/recipe/")[1].split("/detail")[0]
                # download individual recipe
                rpage = URL(os.path.join(base, title, end))
                pdom = DOM(rpage.download(cached=True))

                # average rating value
                rating = pdom.by_attribute(itemprop="ratingValue")[0].source.split('"')[3]

                # list of nutrition elements
                nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                nut_vals = []
                for i in range(len(nut_list)):
                    val = nut_list[i].by_attribute(id="lblNutrientValue")[0].content
                    nut_vals.append(val)
                nuts = "\t".join(nut_vals)

                # time needed to cook
                try:
                    cook_hours = pdom.by_attribute(id="cookHoursSpan")[0].content
                    cook_hours = cook_hours.replace("<em>", " ").replace("</em>", " ")
                except:
                    cook_hours = "0"
                try:
                    cook_mins = pdom.by_attribute(id="cookMinsSpan")[0].content
                    cook_mins = cook_mins.replace("<em>", " ").replace("</em>", " ")
                except:
                    cook_mins = "0"
                mins = str(int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0]))

                # ingredients

                ## gets the block containing both the amount and the ingredient name
                all_ings = pdom.by_attribute(itemprop="ingredients")
                ing_units = []
                ing_vals = []
                for ing_index in range(len(all_ings)):
                    tmp_ing = all_ings[ing_index].by_id("lblIngName").content
                    if "&nbsp;" in all_ings[ing_index].content:
                        continue
                    try:
                        tmp_amount = all_ings[ing_index].by_id("lblIngAmount").content
                    except:
                        tmp_amount = ""  # LET THIS BE THE EMPTY CHAR we decide on
                    ing_units.append(tmp_amount)
                    ing_vals.append(tmp_ing)
                ings = ";".join(ing_vals)

                ing_units = [x + "|" for x in ing_units]
                str_ings = [str(x) for x in zip(ing_units, ing_vals)]
                str_ings = [x.replace(",", " ") for x in str_ings]
                full_ings = ";".join(str_ings)
                full_ings = (
                    full_ings.replace("u'", "")
                    .replace("'", "")
                    .replace(", u", "")
                    .replace("(", "")
                    .replace(")", "")
                    .replace("  ", " ")
                )

                assert len(ing_vals) == len(ing_units)

                w.writerow([title, rating, nuts, mins, ings, full_ings])

        except:
            pass

    output.close()
Code example #18
            self.writerow(row)

# Creating the csv output file for writing into as well as defining the writer
output = open("data_output_WIKI_EA.csv", "wb")
writer = UnicodeWriter(output)

# add header row
writer.writerow(["State", "Rank", "EA", "Degree"])


# get the DOM object to scrape for links
url = URL("http://en.wikipedia.org/wiki/List_of_U.S._states_by_educational_attainment")
dom = DOM(url.download(cached=True))

# get the tables where all info is contained
all_data_tables = dom.by_class("wikitable")

# define the variable to store all the WIKI data
all_wiki_data = []

# loop through each row
for ind_data_table in all_data_tables:

    degree = ""

    for ind_data_header in ind_data_table.by_tag("th"):
        if "H.S. Graduate" in plaintext(ind_data_header.content):
            degree = "High School"
        if "Bachelor's Degree" in plaintext(ind_data_header.content):
            degree = "Undergraduate"
        if "Advanced Degree" in plaintext(ind_data_header.content):
Code example #19
File: complex_HTML.py Project: aeggermont/cs171
def get_title_attributes(title, titleLink):

    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii','replace'))

    print "Movie: ", title

    # Get Directors
    print "-> About to print directors... "

    directors = dom.by_attribute(itemprop="director")[0]
    directorNames =  directors.by_tag("a")


    for director in directorNames:
        print director.content

        dirName  = unicodedata.normalize('NFD', director.content).encode('ascii','replace')
        #str(director.content).encode("utf-8")
        print "Director ===> ", dirName

        titleObj.addDirectors( dirName )

    # Get writers
    print "-> About to print writers... "

    try:
        writers = dom.by_attribute(itemprop="writer")
        for writer in writers:
            # print writer[1][1].content
            titleObj.addWriters( str(writer[1][1].content).encode('ascii', 'replace'))
    except:
        pass



    print "--> About to get actors... "
    try:
        actors = dom.by_attribute(itemprop="actors" )
        for actor in actors:
            # print actor[1][1].content
            titleObj.addActors( str(actor[1][1].content).encode('ascii', 'replace'))
    except:
        pass


    print "--> Aboutb to get rating information... "


    try:
        ratingsInfo = dom.by_class("star-box-giga-star")

        for rating in ratingsInfo:
            # print rating.content
            titleObj.addRating(str(rating.content).encode('ascii', 'replace'))
    except:
        pass


    print "--> About to print other stuff...  "



    for item in dom.by_class("infobar"):

        try:
            objMatch = re.search("(\d+)", item.by_tag("time")[0].content )

            if objMatch:
                # print objMatch.group(1)
                titleObj.addRunTime( str(objMatch.group(1)).encode('ascii', 'replace'))
        except:
            pass



        for genreItem in item.by_tag("a"):

            try:
                objMatch = re.search("genre", genreItem.attributes['href'] )

                if objMatch:
                    titleObj.addGenre(str(genreItem.content).encode('ascii', 'replace'))
                    # print genreItem.attributes['href']
                    # print genreItem.content
            except:
                pass


    return  titleObj
Code example #20
def scrape_movie_page(dom):
    '''
    Scrape the IMDB page for a single movie
    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.
    Returns:
        A list of strings representing the following (in order): title, year,
        duration, genre(s) (semicolon separated if several), director(s) 
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings.
    '''

    # iterate all movies
    for p in movie_urls:
        p_url = URL(p)
        p_dom = DOM(p_url.download(cached=True))

        # get title
        title = clean_unicode(p_dom.by_class('header')[0].content)
        title = plaintext(strip_between('<span', '</span>', title))

        # get duration (runtime) from the infobar
        duration = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)

        # get genres
        genres = []
        for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
            genres.append(clean_unicode(genre.content))

        # make lists for info
        directors = []
        writers = []
        actors = []

        # get directors writers actors
        text_blocks = p_dom.by_class('txt-block')[:3]
        for t in text_blocks:
            spans = t.by_tag('span')
            for s in spans:
                if s.attributes.get('itemprop') == 'director':
                    director = s.by_tag('span')[0].by_tag('a')[0].content
                    directors.append(clean_unicode(director))

                if s.attributes.get('itemprop') == 'writer':
                    p_writer = s.by_tag('span')[0].by_tag('a')[0].content
                    writers.append(clean_unicode(p_writer))

                if s.attributes.get('itemprop') == 'actors':
                    actor = s.by_tag('span')[0].by_tag('a')[0].content
                    actors.append(clean_unicode(actor))

        rating = []
        ratings_count = []

        # get ratings
        spans = p_dom.by_class('star-box-details')[0].by_tag('span')
        for s in spans:
            if s.attributes.get('itemprop') == 'ratingValue':
                rating = clean_unicode(s.content)
            if s.attributes.get('itemprop') == 'ratingCount':
                ratings_count = clean_unicode(s.content)

        # format the strings from lists
        genres = concat_strings(genres)
        directors = concat_strings(directors)
        writers = concat_strings(writers)
        actors = concat_strings(actors)



    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
        return title, duration, genres, directors, writers, actors, rating, \
            ratings_count
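
The excerpt relies on two helpers that are not shown, clean_unicode and concat_strings. Going only by how they are called and by the docstring's "semicolon separated if several" wording, plausible stand-ins might look like the sketch below; these are assumptions for illustration, not the project's original code.

from pattern.web import plaintext

def clean_unicode(text):
    # Hypothetical stand-in: reduce a DOM fragment to a plain ASCII string.
    return plaintext(text).encode('ascii', 'ignore').strip()

def concat_strings(items):
    # Hypothetical stand-in: the docstring asks for semicolon-separated values
    # when a movie has several genres, directors, writers or actors.
    return ';'.join(items)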
Code example #21
File: complex_HTML.py Project: goodspeedj/csci-e64
        movieUrl = URL(movieTitleLinks.group(0))
        movieDom = DOM(movieUrl.download(cached=True))
        
        
        #=======================================================================
        # Get the title
        #=======================================================================
        for movie in movieDom.by_tag("title"):
            title = re.sub(' \(\d+\) - IMDb','', movie.content.encode('ascii','ignore').strip())

            
        
        #=======================================================================
        # Get the runtime
        #=======================================================================
        for movie in movieDom.by_class("infobar"):
            time = re.search('\d+ min', movie.content.encode('ascii', 'ignore').strip())
            runtime = re.sub(' min','', time.group(0))

            
            #===================================================================
            # Get the genres
            #===================================================================
            genre = []
            for g in movie.by_tag('a'):
                
                type = re.sub('\n|\d+.*|\(.*\)','', g.content.encode('ascii', 'ignore').strip('\r\n'))
                
                
                if ((type != ' \n') and not (re.match('^\s+', type))):
                    genre.append(type)
Code example #22
            self.writerow(row)

# Creating the csv output file for writing into as well as defining the writer
output = open("restweek.csv", "wb")
writer = UnicodeWriter(output)

# add header row
writer.writerow(["Name", "Neighborhood", "Cuisine", "Dining Style", "Meals Served", "Dress Code", "Ratings", "Price", "Phone Number", "Address", "Website" ])


# Get the DOM object to scrape for movie links. [Hint: Use absolute URL's.
# Documentation can be found here: http://www.clips.ua.ac.be/pages/pattern-web] 
url = URL("http://www.opentable.com/promo.aspx?m=7&ref=470&pid=90")
dom = DOM(url.download(cached=True))

for restaurant in dom.by_class("ResultRow"):
    name = restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].content.encode( 'ascii', 'ignore' )
    neighborhood_cuisine = restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_class("d")[0].content.encode( 'ascii', 'ignore' )
    neighborhood_cuisine = neighborhood_cuisine.split('|')
    neighborhood = neighborhood_cuisine[0]
    cuisine = neighborhood_cuisine[1]
    meals = restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_class("message")[0].content.encode( 'ascii', 'ignore' )
    meals = meals.split('<')
    # need to clean
    meals = meals[0]
    restURL = URL(abs(restaurant.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].attributes.get('href',''), base=url.redirect or url.string))
    restDOM = DOM(restURL.download(cached=True))
    # need to clean
    address = restDOM.by_id("ProfileOverview_lblAddressText").content
    price = restDOM.by_id("ProfileOverview_lblPriceText").content
    try:
Code example #23
# http://www.mdbg.net/chindict/chindict.php?page=radicals
# and stores it into a JSON object

import re, json, io
from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

chindict = {}
radicals = []

url = URL("http://www.mdbg.net/chindict/chindict.php?page=radicals")
dom = DOM(url.download(cached=True))

radElements = dom('a.rad')

for rad in radElements:
    radicals.append(rad.content)
    words = []
    wordUrl = URL("http://www.mdbg.net/chindict/" + rad.attrs["href"])
    wordDom = DOM(wordUrl.download(cached=True))
    wordTable = wordDom.by_class("results")

    for word in wordTable[0].by_tag("span"):
        words.append(word.content)

    chindict[rad.content] = words

with open('chindict.js', 'w') as outfile:
    json.dump(radicals, outfile)
    json.dump(chindict, outfile)
Code example #24
File: ch1.py Project: acheson/cs171pj2
def get_mov_link(search_url, mov_title, mov_year, proxy):
    mov_url = URL(search_url)
    if mov_title == 'M':
        return "http://www.1channel.ch/watch-48002-M"
    if mov_title == u"8\u00BD":
        return "http://www.1channel.ch/watch-1188-8189"
    if mov_title == u"Nausica\u00E4 of the Valley of the Wind":
        return "http://www.1channel.ch/watch-998-Nausicaa-of-the-Valley-of-the-Winds"
    #try:
    mov_dom = DOM(mov_url.download(cached=False, timeout=25, proxy=proxy)) 
    #print mov_dom
    #    print "Downloaded search dom for:", mov_title, "(" + str(mov_year) + ")"   
    #except Exception, e:
    #    print "Could not download search url: ", mov_url,'for reason:', e
    
    mov_ind = mov_dom.by_class("index_container")
    #print search_url
    #print mov_ind[0].by_class("index_item index_item_ie")[0]
    if mov_ind[0].by_class("info_message"):
        print mov_title, "not found"
        return None
    else:

        for r in mov_ind[0].by_class("index_item index_item_ie"):
            #grab the search results title
            res_title = r.by_tag("a")[0].attributes.get("title")

            #split out the year based on "(\d+)", assign title to res_t and year to res_y
            res_ts = re.search("Watch (.+)\s\((\d+)", res_title)
            res_t = res_ts.group(1)
            res_y = res_ts.group(2)
            if mov_title == 'The Good, the Bad and the Ugly':
                mov_year = 1967
            if mov_title == 'The Dark Knight':
                mov_title = 'Batman: The Dark Knight'
            if mov_title == "One Flew Over the Cuckoo's Nest":
                mov_year = 1976
            if mov_title == 'Star Wars':
                mov_title ='Star Wars: Episode IV - A New Hope'
            if mov_title == 'Seven Samurai':
                mov_year = 1956
            if mov_title == 'Once Upon a Time in the West':
                mov_title = "Once Upon a Time in the West - (C'era una volta il West)"
            if mov_title == 'Casablanca':
                mov_year = 1943
            if mov_title == 'Rear Window':
                mov_year = 1955
            if mov_title == "It's a Wonderful Life":
                mov_year = 1947
            if mov_title == "The Pianist":
                mov_year = 2003
            if mov_title == u'L\xe9on: The Professional':
                mov_title = "Leon The Professional"
            if mov_title == u"Am\xe9lie":
                mov_title = "Amelie from Montmartre"
            if mov_title == "Princess Mononoke":
                mov_title = "Princess Mononoke (Mononoke-hime)"
            if mov_title == "Witness for the Prosecution":
                mov_year = 1958
            if mov_title == 'Grave of the Fireflies':
                mov_title = "Grave of the Fireflies (Hotaru no haka)"
            if mov_title == 'Snatch.':
                mov_title = "Snatch"
                mov_year = 2001
            if mov_title == 'The General':
                mov_year = 1927
            if mov_title == 'Gran Torino':
                mov_year = 2009
            if mov_title == 'Hotel Rwanda':
                mov_year = 2005
            if mov_title == 'V for Vendetta':
                mov_year = 2006
            # Foreign title
            if mov_title == "The Secret in Their Eyes":
                mov_title = "El secreto de sus ojos"
            if mov_title == "There Will Be Blood":
                mov_year = 2008
            if mov_title == "Million Dollar Baby":
                mov_year = 2005
            if mov_title == "Amores Perros":
                mov_title = "Amores perros"
            if mov_title == "Life of Pi":
                mov_title = "Life Of PI"
            if mov_title == "The 400 Blows":
                mov_title = "The 400 Blows (Les quatre cents coups)"
            if mov_title == "Howl's Moving Castle":
                mov_title = "Howl's Moving Castle (Hauru no ugoku shiro)"
            if mov_title == "La strada":
                mov_title = "La Strada"
            if mov_title == "The Wild Bunch":
                mov_title = "The Wild Bunch (1969)"
            if mov_title == "A Fistful of Dollars":
                mov_title = "A Fistful of Dollars - (Per un pugno di dollari)"
            if mov_title == "Slumdog Millionaire":
                mov_year = 2009
            if mov_title == "Stalker":
                mov_year = 1980
            if mov_title == "Harry Potter and the Deathly Hallows: Part 2":
                mov_title = "Harry Potter and the Deathly Hallows 2"
            if mov_title == "The Wrestler":
                mov_year = 2009
            if mov_title == "Spring, Summer, Fall, Winter... and Spring":
                mov_title = "Spring, Summer, Fall, Winter...and Spring (Bom yeoreum gaeul gyeoul geurigo bom)"
            if mov_title == "Castle in the Sky":
                mov_title = "Castle in The Sky"
            print res_t, res_y, mov_title.strip(), mov_year
            if res_t.strip() == mov_title.strip() and int(res_y) == int(mov_year):
                return abs_url(r.by_tag("a")[0].attributes.get("href"),base=url.redirect or url.string)
Code example #25
    for i in range(0, len(tvseries)):
        writer.writerow(tvseries[i])

    f.close()

if __name__ == '__main__':
    # Download the HTML file
    url = URL(TARGET_URL)
    html = url.download()

    # Save a copy to disk in the current directory; this serves as a backup
    # of the original HTML, will be used in grading.
    with open(BACKUP_HTML, 'wb') as f:
        f.write(html)

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Extract the tv series (using the function you implemented)
    tvseries = []
    count = 0
    for i in dom.by_class("lister-item-content"):
        tvseries.append(extract_tvseries(dom,count))
        count = count + 1

    # Write the CSV file to disk (including a header)
    with open(OUTPUT_CSV, 'wb') as output_file:
        save_csv(output_file, tvseries)


    print tvseries 
Code example #26
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'OCaml&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Ocaml')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
#    print "this must be the code in bytes\n"	
#    print req.read()
    read_ip_data = HTMLParser.HTMLParser().unescape(req.read()).encode('ascii', 'ignore')	
    if read_ip_data.split()[0] != '<html>':	
      print "the date\n"
      print date	  
Code example #27
File: races_data.py Project: rayden7/cs171-project02
	if (isinstance(stringToConvert, unicode)):
		stringToConvert = unicodedata.normalize('NFKD', stringToConvert).encode('ascii','ignore')
	return stringToConvert

# Create csv and add a header row	
output = open("races_data.csv", "wb")
writer = csv.writer(output)
writer.writerow(["Race","Year", "POS", "Num", "rider ID", "Rider URL", "Rider01", "rider02" , "Machine", "Time", "Speed" ])				

# Set up base URL and main URL. ERA 5 = 1991 - 2012
url = URL("http://www.iomtt.com/TT-Database/Events.aspx?meet_code=TT2012&era=5")
text_url = "http://www.iomtt.com"

# Get a hold of the dom and then grab each year's URL, which is embedded in li tags.
dom = DOM(url.download(cached=True))
years = dom.by_class("ttDatabasePipeSeparator floatleft")[0].by_tag("li")

# Iterate over each year
for year in years:
	#Print commands are useful to monitor progress.
	print("")
	print year.by_tag("a")[0].attributes.get('href','')
	
	#Find the current year's URL and download its DOM.
	new_url = URL(text_url + year.by_tag("a")[0].attributes.get('href',''))
	year_url = URL(new_url)
	year_dom = DOM(year_url.download(cached=True))
	races = year_dom.by_class("grid_10 alpha hideoverflow")[0].by_tag("li")
	
	# The first 22 URLs belong to the year and this is consistent across the site so those URLs will be skipped.
	for race in races[22:-4]:
Code example #28
# Creating the csv output file for writing into as well as defining the writer
output = open("data_output_WIKI_EA.csv", "wb")
writer = UnicodeWriter(output)

# add header row
writer.writerow(["State", "Rank", "EA", "Degree"])

# get the DOM object to scrape for links
url = URL(
    "http://en.wikipedia.org/wiki/List_of_U.S._states_by_educational_attainment"
)
dom = DOM(url.download(cached=True))

# get the tables where all info is contained
all_data_tables = dom.by_class("wikitable")

# define the variable to store all the WIKI data
all_wiki_data = []

# loop through each row
for ind_data_table in all_data_tables:

    degree = ""

    for ind_data_header in ind_data_table.by_tag("th"):
        if "H.S. Graduate" in plaintext(ind_data_header.content):
            degree = "High School"
        if "Bachelor's Degree" in plaintext(ind_data_header.content):
            degree = "Undergraduate"
        if "Advanced Degree" in plaintext(ind_data_header.content):
Code example #29

urls = [
    "http://www.campusservices.harvard.edu/transit-fleet/evening-nights-monday-friday",
    "http://www.campusservices.harvard.edu/transit-fleet/morning-afternoon-monday-friday",
    "http://www.campusservices.harvard.edu/transit-fleet/weekends",
    "http://www.campusservices.harvard.edu/transit-fleet/overnight-service"
]

count = 1
for urlStr in urls:
    url = URL(urlStr)
    dom = DOM(url.download(cached=False))
    print "Parsing " + urlStr

    body = dom.by_class("field-items")[0]
    tables = body.by_tag("table")
    headers = body.by_tag("h2")
    for i, table in enumerate(tables):
        writer = Writer(str(count) + ".csv")
        header = headers[i].content
        print ""
        print "Parsing route " + str(header)
        writer.writeLine(urlStr.split("/")[-1])
        writer.writeLine(header)

        header = True
        for row in table.by_tag("tr"):
            #lst = []
            for cell in row.by_tag("td"):
                if (header and len(cell.by_tag("strong")) != 0):
Code example #30
def fetchWeatherDataForAirportAndYear(airport, year, month):
	try:
		# print urlForAirportAndYear(airport, year, month)
		url = URL(urlForAirportAndYear(airport, year, month))
		dom = DOM(url.download(cached=True))
		avg_temp = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[3].by_class("b")
		if avg_temp:
			avg_temp = avg_temp[1].content
		else:
			avg_temp = ""
		avg_max_temp = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[2].by_class("b")
		if avg_max_temp:
			avg_max_temp = avg_max_temp[1].content
		else:
			avg_max_temp = ""
		avg_min_temp = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[4].by_class("b")
		if avg_min_temp:
			avg_min_temp = avg_min_temp[1].content
		else:
			avg_min_temp = ""
		avg_dew_point = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[10].by_class("b")
		if avg_dew_point:
			avg_dew_point = avg_dew_point[1].content
		else:
			avg_dew_point = ""
		avg_precipitation = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[12].by_class("b")
		if avg_precipitation:
			avg_precipitation = avg_precipitation[1].content
		else:
			avg_precipitation = ""
		avg_wind = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[15].by_class("b")
		if avg_wind:
			avg_wind = avg_wind[1].content
		else:
			avg_wind = ""
		avg_gust_wind = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[16].by_class("b")
		if avg_gust_wind:
			avg_gust_wind = avg_gust_wind[1].content
		else:
			avg_gust_wind = ""
		avg_sea_level_pressure = dom.by_class("contentData")[0].by_tag("table")[0].by_tag("tr")[18].by_class("b")
		if avg_sea_level_pressure:
			avg_sea_level_pressure = avg_sea_level_pressure[1].content
		else:
			avg_sea_level_pressure = ""

		# print "good"	
		return {"avg_temp": avg_temp, 
			"avg_min_temp": avg_min_temp, 
			"avg_max_temp": avg_max_temp, 
			"avg_dew_point": avg_dew_point, 
			"avg_precipitation": avg_precipitation, 
			"avg_wind": avg_wind,
			"avg_gust_wind": avg_gust_wind, 
			"avg_sea_level_pressure": avg_sea_level_pressure
		}
	except URLTimeout:
		# print "timeout fetching data"
		return {}
	except:
		# print "unknown error fetching data"
		return {}
Code example #31
File: 12-dom.py Project: clips/pattern
######################################## Test Techcrunch - https://techcrunch.com/ ####################################

print("#"*40, "Test Techcrunch", "#"*40)
url = URL("https://techcrunch.com/startups/")
dom = DOM(url.download(cached=True))

for e in dom.by_tag("header.post-block__header")[:5]:
    for a in e.by_tag("h2.post-block__title")[:1]:
        print(plaintext(a.content))
        for h in a.by_tag("a.post-block__title__link")[:1]:
            print(h.attrs["href"])
        print("")
print("\n")

header = dom.by_class("river__title")[0]
print(header.content)
print("\n")


title_image = dom.by_attr(name="msapplication-TileImage")[0]
print(title_image.attrs['content'])
print("\n")


url = URL("https://techcrunch.com")
dom = DOM(url.download(cached=True))
for k in dom.by_class("post-block__title__link"):
    print(k.content.strip())
    print("")
Code example #32
# I use team_labels[0] but it doesn't matter because the labels are the same
# get "th"'s b/c the elements look like this: <th align="left"  class="tooltip sort_default_asc" >Franchise</th>
all_labels = team_labels[0].by_tag("th")

team_label_container=[]
for label in all_labels:
    team_label_container.append(label.content.encode("utf8"))
  
# team_label_container now has the headers
team_label_container.insert(0,'team_acronym') 

team_container=[]
team_container.append(team_label_container)

#Now get the statistics.
teams = dom.by_class("full_table")

for team in teams:
    this_team_container=[]
    for td in team.by_tag("td"):
        #http://stackoverflow.com/questions/2365411/python-convert-unicode-to-ascii-without-errors
        this_team_container.append(td.content.encode("utf8"))

    # original comes out like this: ['<a href="/teams/ANA/">Anaheim Ducks</a>', 'NHL',...]
    #split the first element into two entries: 1. the team acronym (taken from the URL) and 2. the franchise name
    team_acronym = this_team_container[0][this_team_container[0].find("/teams")+7:this_team_container[0].find("/teams")+10]
    franchise = this_team_container[0].split(">")[1][0:len(this_team_container[0].split(">")[1])-3]
    #now remove the <a href> element and replace with the cleaned acronym and franchise
    this_team_container.pop(0)
    this_team_container.insert(0,team_acronym)
    this_team_container.insert(1,franchise)
Code example #33
# Creating the csv output file for writing into as well as defining the writer
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
output = open("output/information_management_systems_requirements_"+timestamp+".csv", "wb")
writer = UnicodeWriter(output)

# Get the DOM object.
url = URL("http://dceweb.harvard.edu/prod/sswcpgm.taf?function=search&wgrp=ALMIT&_UserReference=E11F5775BEB5C7554DDE88C4&concentrationArea=AREA_CONC_2%2C9&SEARCH_TERM=both")
dom = DOM(url.download(cached=True))

# add 1st header row
writer.writerow(["Term", "CourseNumber", "Title", "Instructor", "Day", "Time", "Location", "CourseType", "EnrollLimit", "Attributes"])

date_to_write = []

# get main content containing all the courses
main_content = dom.by_class("csearchresults")

# get all the rows that have the course data
all_data_rows = main_content[0].by_tag("tr")

# loop through each row
for ind_data_row in all_data_rows:

    if ind_data_row.attributes.get("class", "") == "" or ind_data_row.attributes.get("class", "") == "odd":

        all_columns = ind_data_row.by_tag("td")

        # ensure course is not cancelled
        if len(all_columns) > 1 and plaintext(str(all_columns[4])).find("Canceled") == -1:

            term = ""
Code example #34
File: complex_HTML.py Project: acheson/cs171
#        Ratings
#        Number of Ratings


page_urls = []

tableRows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in tableRows[1:]:
	a = tr.by_tag('a')[0]
	page_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))

for p in page_urls:
	p_url = URL(p)
	p_dom = DOM(p_url.download(cached=True))
	
	title = clean_unicode(p_dom.by_class('header')[0].content)
	title = plaintext(strip_between('<span', '</span>', title))
	
	runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)

	genres = []
	for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
		genres.append(clean_unicode(genre.content))
 	
 	directors = []
 	writers = []
 	actors = []

 	text_blocks = p_dom.by_class('txt-block')[:3]
 	for t in text_blocks:
 		spans = t.by_tag('span')
Code example #35
presto_schools = ["Harvard", "Yale", "Brown"]

# download presto sports data
# for each school
for school in range(0,3):
    # for each year
    for presto_year in presto_years:
        print presto_schools[school] + " " + presto_year
        # generate url string, download dom
        base_url = presto_urls[school]
        url_string = base_url + presto_year + "/roster"
        url = URL(url_string)
        dom = DOM(url.download(cached=True))
        print "Downloaded."
        print "-----------------------"
        rows = (dom.by_class("roster-row0") + dom.by_class("roster-row1"))
        # go through rows of swimmers
        for row in rows:
            # adjustment for Brown 2011-12 (has an extra column), 
            # and for Brown and Harvard in general because year columns are in a different place
            adj = 0
            if school == 2 and presto_year == "2011-12":
                adj = 1
            elif school == 0 or school == 2:
                adj = -1
            cells = row.by_tag("td")
            # skip divers
            if cells[1 - adj].content.strip() == "Diving":
                continue
            year_name = cells[2 + adj].content.strip()
            grad_year = 0
Code example #36
File: scraper.py Project: goodspeedj/csci-e64
# To get you started, uncomment the following print line and see the output for the first entry

#print dom.by_class("title")[0].by_tag("a")[0].content

# by_class selects all with class="title" and returns a list. Familiarize yourself with the DOM
# by trying out different combinations. See what each returns.

# NOTE: if you see u' in front of your strings, you can use encode( 'ascii', 'ignore' ) on your string
# to learn why, you can optionally read up on http://docs.python.org/2/howto/unicode.html 

# You could start with this

# add header row
writer.writerow(["Title", "Ranking", "Genre", "Actors", "Runtime"])
allElements = dom.by_class("title")

for i,e in enumerate(allElements):

    # WRITE YOUR CODE HERE
    for title in e.by_tag('a')[:1]:
        title = plaintext(title.content.encode('ascii','ignore'))

    for rank in e.by_class('value')[:1]:
        rank = plaintext(rank.content.encode('ascii','ignore'))

    for genre in e.by_class('genre')[:1]:
        genre = re.sub(" \| ", ", ", plaintext(genre.content.encode('ascii','ignore')))

    for actors in e.by_class('credit')[:1]:
        actors = re.sub("^With: ", "", plaintext(actors.content.encode('ascii','ignore')))
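
    # NOTE: the lines below are an illustrative sketch, not part of the original excerpt.
    # The header row above expects Title, Ranking, Genre, Actors and Runtime, so the loop
    # presumably finishes by pulling the runtime and writing one row per title. Assuming
    # the runtime span sits inside the same "title" block as the other fields, the
    # remainder might look roughly like this:
    runtime = ''
    for rt in e.by_class('runtime')[:1]:
        runtime = plaintext(rt.content.encode('ascii', 'ignore'))

    writer.writerow([title, rank, genre, actors, runtime])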