Code example #1
def getRandomHistoryDOM(language):
    url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
    #Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        #Grab the name of the wikipedia article from the url
        urlComponents = redirectUrl.split('/')
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    #Get the history section of the article
    redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
    print "Current article is: " +str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
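
A minimal usage sketch (an addition, not part of the original project; pattern 2.x on Python 2 provides URL and DOM in pattern.web):

from pattern.web import URL, DOM

historyList, articleName = getRandomHistoryDOM("en")
print "Got %d revisions for %s" % (len(historyList), articleName)
for li in historyList[:3]:
    print li.content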
Code example #2
def scrape_education(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
	# two-digit county code plus a name label, matching the sample URL above
	url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + ',ALAMEDA&cType=T&cGender=&Submit=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))  # download the DOM

	total_table = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal")
	cells = total_table.by_tag("td")

	other = cells[12].content.replace(',', '')
	associates = cells[11].content.replace(',', '')
	bachelors = str(int(cells[9].content.replace(',', '')) + int(cells[10].content.replace(',', '')))
	masters = str(int(cells[4].content.replace(',', '')) + int(cells[5].content.replace(',', '')))
	jurisdoctor = cells[3].content.replace(',', '')
	doctorate = cells[2].content.replace(',', '')

	bachelors_and_less = str(int(bachelors) + int(associates) + int(other))
	post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))

	county = total_table.by_tag("a")[0].content

	# write all the collected data to a new row of the output file
	writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
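
The function writes through a module-level csv writer; a minimal driver sketch (the file name and looping over California's 58 county codes are assumptions):

import csv

output = open('staff_education.csv', 'wb')  # Python 2 csv wants binary mode
writer = csv.writer(output)
writer.writerow(['county', 'bachelors_and_less', 'post_grad', 'associates',
                 'bachelors', 'masters', 'jurisdoctor', 'doctorate'])
for n in range(1, 59):  # county codes 01-58 (assumption)
    scrape_education(n)
output.close()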
Code example #3
File: spiders.py Project: Carlosmr/WhooshSearcher
 def getTitle(self, link):
     html = URL(link).download()
     body = DOM(html).body
     node = body.by_id("main-article-info")
     if node:
         title = node.children[1].content.strip()
     else:
         title = ''
     return title
Code example #4
File: alexa.py Project: acheson/cs171pj2
def getVisByCountry(site):
    countries = {}
    url = URL(base + site)

    aDom = DOM(url.download(cached=True))
    if aDom.by_id("visitors-by-country") is not None:
        vis = aDom.by_id("visitors-by-country")

        countries = {}
        for r in vis.by_class("tr1"):
         if r.by_tag("a")[0].attributes.get("id") == "toggleMoreCountryVisits":
           pass
         else:
           #print r.by_tag("a")[0].content
           country = r.by_tag("a")[0].content.split("&nbsp; ")[1].strip()
           pct = float(r.by_tag("p")[1].content[0:-1])
           #print country, pct
           countries[country] = pct
    sites[site] = countries
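
A minimal driver sketch; the base URL and the sites dict live elsewhere in the original, so both are assumptions here:

base = "http://www.alexa.com/siteinfo/"  # assumption: Alexa site-info pages
sites = {}

getVisByCountry("wikipedia.org")
print sites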
Code example #5
File: spiders.py Project: Carlosmr/WhooshSearcher
 def htmlParser(self,link):
     html = URL(link).download()
     body = DOM(html).body
     content = body.by_id("content")
     if content:
         plaincontent = plaintext(content.content, linebreaks=2, indentation = True)
         pretty = unicode(plaincontent.strip())
     else:
         pretty = ''
     return pretty
Code example #6
def load(year, pagenum, pagerank):
	strnum = str(year)
	url = URL("http://www.imdb.com/search/title?at=0&sort=moviemeter,asc&start="
			   +str(pagenum)+"&title_type=feature&year="+strnum+","+strnum)
	dom = DOM(url.download(timeout=30, cached=True))
	htmlsource = dom.by_id("main").by_class("results")[0].by_class("title")[pagerank].by_tag("a")[0].source
	urlpiece = re.search(r'/title/+[t0-9]+/', htmlsource)
	finalurl = "http://www.imdb.com" + urlpiece.group(0)
	url2 = URL(finalurl)
	return url2
Code example #7
File: scrape.py Project: debovis/python-analysis
	def getTextAboutResturants(self):
		# get text about each restaurant
		i=0
		for rs in self.conn.resturants.find():
			if not rs.get('information'):
				information = {}
				request = DOM(URL(rs['url']).download())
				# Tags
				if request.by_id('LocationMetaData'):
					source = str(request.by_id('LocationMetaData').source.encode('cp1252', 'ignore'))
					tags = Element(source[source.find('<b>Tags: </b>'):]).by_tag('a')
					if tags:
						information['parsedTags'] = [ (tag.attributes['href'], tag.content) for tag in tags]
				# Review 
				if request.by_id('LocationDescription'):
					information["review"] = plaintext(request.by_id('LocationDescription').content)
				# Details
				if request.by_id('LocationRestaurantDetails'):
					information["details"] = request.by_id('LocationRestaurantDetails').by_tag('p')[0].content
				rs['details'] = information
				print information
				self.conn.resturants.save(rs)
			else:
				print i, rs['name']

			i += 1
Code example #8
File: scrape.py Project: debovis/python-analysis
	def getResturants(self):
		# get all restaurants and their URLs
		for page in range(1,48):
			params["page"] = page
			request = DOM(URL(url, query=params).download())
			searchResults = request.by_id('searchResults')
			pageResults = searchResults.by_class('locationListing clearfix')
			for item in pageResults:
				link = item.by_tag('h4')[0].by_tag('a')[-1]
				name = plaintext(link.content)
				address = link.attributes['href']
				resturant = { 'name' : name, 'url' : address}
				conn['resturants'].insert(resturant)
Code example #9
File: ch1.py Project: acheson/cs171pj2
def get_search_string(search, proxy):
    if search == "Schindler's List":
        search = "Schindler"
    if search == "One Flew Over the Cuckoo's Nest":
        search = "one flew over"
    if search == "It's a Wonderful Life":
        search = "wonderful life"
    if search == u"L\xe9on: The Professional":
        search = "the professional"
    if search == "Terminator 2: Judgment Day":
        search = "Terminator 2"
    if search == u"Am\xe9lie":
        search = "Amelie"
    if search == "L.A. Confidential":
        search = "Confidential"
    if search == "Pan's Labyrinth":
        search = "pan"
    if search == "A Few Dollars More":
        search = "dollars"
    if search == "The Secret in Their Eyes":
        search = "El secreto de sus ojos"
    if search == "The King's Speech":
        search = "the king"
    if search == "Howl's Moving Castle":
        search = "howl"
    if search == "Harry Potter and the Deathly Hallows: Part 2":
        search = "harry potter"
    if search == "Who's Afraid of Virginia Woolf?":
        search = "virginia woolf"
    if search == "Rosemary's Baby":
        search = "rosemary"
    url = URL("http://1channel.ch")
    dom = DOM(url.download(cached=False, timeout=20, proxy=proxy))
    a = dom.by_id("searchform")
    s_base = a.attributes.get("action")
    s_text = "_keywords=" + search.replace(" ","+")
    key = a.by_attribute(name="key")[0].attributes.get("value")
    s_section = a.by_attribute(name="search_section")[0].attributes.get("value")
    search_string = s_base + s_text + "&key=" + key + "&search_section=" + s_section
    return search_string
Code example #10
def scrape_truancy(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=01000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt'
	url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=' + county_num + '000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM


	county = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("a")[0].content
	total_enrollment = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[3].content
	suspensions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[4].content
	suspension_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[5].content
	expulsions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[6].content
	expulsion_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[7].content
	truants = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[8].content
	trauncy_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[9].content

	
	#For the first county only, also grab the statewide totals
	if county_num=='01':
		state_total_enrollment = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[3].content
		state_suspensions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[4].content
		state_suspension_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[5].content
		state_expulsions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[6].content
		state_expulsion_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[7].content
		state_truants = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[8].content
		state_trauncy_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[9].content
		
		# write the statewide total data to the top row of the output file
		writer.writerow(["California Total", state_total_enrollment,state_suspensions, state_suspension_rate, state_expulsions, state_expulsion_rate, state_truants, state_trauncy_rate])

	

	# write all the collected data to a new row of the output file
	writer.writerow([county, total_enrollment,suspensions, suspension_rate, expulsions, expulsion_rate, truants, trauncy_rate])
Code example #11
def entrytest(data, text, dom2):
	# (the snippet is truncated above; this signature matches the call below,
	# and the guard condition is a reconstruction, not the original code)
	if data is None:
		return
	dfunv = dfun(dom2)
	if dfunv == "":
		return
	writer.writerow([tfun(data), runfun(data), gfun(data), dfun(data),
				 wfun(data), afun(data), msfun(data), rtfun(data),
				 rtnmfun(data), bfun(text), bousfun(text), 
				 bowfun(text), mpaafun(data), dfun(dom2)])

	
#this handles tags

year = 2000
while year < 2011:
	pagenum = 1
	while pagenum < 101:
		pagerank = 0
		while pagerank < 50:
			url2 = load(year, pagenum, pagerank)
			dom2 = DOM(url2.download(timeout=30, cached=True))
			data = dom2.by_id("overview-top")
			text = loadbus(url2)
			print dfun(data)
			entrytest(data,text,dom2)
			pagerank += 1
		pagenum += 50
	year += 1


output.close()
Code example #12
def loadbus(url):
	url = URL(str(url)+"business?ref_=tt_dt_bus")
	dom = DOM(url.download(timeout=30, cached=True))
	return (dom.by_id("tn15content").content).encode('ascii', 'ignore')
Code example #13
#build the final list of column labels, roster_labels_container
roster_labels_container=[]
for label in roster_labels:
    roster_labels_container.append(label.content.encode("utf8"))

#add in a column for the team acronym to act as a key
roster_labels_container.insert(0,"team_id")
roster_labels_container.insert(1,"Season")

#the roster_container holds all of the players for the specified team/year
roster_container = []
roster_container.append(roster_labels_container)
print roster_labels_container

# the roster table can be located directly by its id
roster_table = dom.by_id("roster")
#roster_trs holds one <tr> row per player
roster_trs = roster_table.by_tag("tr")

#iterate through each player in the roster
for trs in roster_trs:

    #this will hold the final encoded info/stats pulled from the current player
    this_roster_farian = []
    #now add the team id and the season
    this_roster_farian.append(this_team_acronym) 
    this_roster_farian.append(this_season) 
    
    for t in trs.by_tag("td"):
            #the player name has a link to the player, 
            #e.g. '<a href="/players/m/milledr01.html">Drew Miller</a>'
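            # hedged completion (the original snippet is truncated here; this
            # cell handling is an assumption, not the author's code):
            links = t.by_tag("a")
            if links:
                this_roster_farian.append(links[0].content.encode("utf8"))
            else:
                this_roster_farian.append(t.content.encode("utf8"))

    roster_container.append(this_roster_farian)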
Code example #14
            elif year_name[0:2] == "Fr":
                grad_year = int(neu_year) + 4
            # split name into first and last (adjusting for title rows)
            name = cells[0].by_tag("a")
            name = cells[0].content.split(" ", 1) if len(name) == 0 else cells[0].by_tag("a")[0].content.split(" ", 1)
            # reorder name if from Columbia or Princeton
            if school in (2,3):
                name.reverse()
            # add swimmer (last name, first name, graduating year, school) to array
            swimmers.append([name[1].encode('ascii', 'ignore').strip(",").strip(), name[0].encode('ascii', 'ignore').strip(), grad_year, neu_schools[school]])


# Get all Cornell Roster id numbers for the URLs
url = URL("http://www.cornellbigred.com/roster.aspx?roster=847")
dom = DOM(url.download(cached=True))
options = dom.by_id("ctl00_cplhMainContent_ddlPastRosters").by_tag("option")
base_url = "http://www.cornellbigred.com/roster.aspx?roster="
cornell_roster_ids = []
for option in options:
    cornell_roster_ids.append(str(option.attrs["value"]))

# define years array
cornell_years = []
for i in range(YEARS_TO_SCRAPE):
    cornell_years.append(str(year - i))

counter = 0
for cornell_year in cornell_years:
    print counter
    print "Cornell" + " " + cornell_year
    url_string = base_url + cornell_roster_ids[counter]
Code example #15
from pattern.web import URL, DOM
from pattern.db import Datasheet
import glob, re


urls = glob.glob('/Users/tnatoli/Desktop/pages/*.html')
headers = ['player', 'pos', 'team', 'owner']

f = open('player_table.txt', 'w')
f.write('\t'.join(headers) + '\n')

for u in urls:
    url = URL(u)
    dom = DOM(url.download(cached=False))
    tbody = dom.by_id('statTable0').by_tag('tbody')[0]
    for tr in tbody.by_tag('tr'):
        pname = tr.by_class('ysf-player-name')[0].by_tag('a')[0].content
        team_pos = tr.by_class('ysf-player-team-pos')[0].by_tag('span')[0].content
        team = re.sub('\(', '', team_pos.split(' - ')[0])
        pos = re.sub('\)', '', team_pos.split(' - ')[1])
        owner_links = tr.by_class('owner')[0].by_tag('a')
        if owner_links:
            owner = owner_links[0].content
        else:
            owner = 'FA'
        line = '\t'.join([pname, team, pos, owner])
        print line
        for l in line:
            try:
                l.encode('ascii')
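            except UnicodeEncodeError:
                # hedged completion (the original is truncated here): strip
                # non-ASCII characters so the row writes as plain text
                line = line.encode('ascii', 'ignore')
        f.write(line + '\n')

f.close()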
Code example #16
File: complex_HTML.py Project: acheson/cs171
#With the movie links, scrape each entry.
#Produce a delimited text file (using semicolons as separators) with a header row and the fields:
#        Title of movie
#        Runtime
#        Genre (separated by semicolons if multiple)
#        Director(s)
#        Writer(s)
#        Actors (listed on the page directly only or first three, separated by semicolons)
#        Ratings
#        Number of Ratings


page_urls = []

tableRows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in tableRows[1:]:
	a = tr.by_tag('a')[0]
	page_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))

for p in page_urls:
	p_url = URL(p)
	p_dom = DOM(p_url.download(cached=True))
	
	title = clean_unicode(p_dom.by_class('header')[0].content)
	title = plaintext(strip_between('<span', '</span>', title))
	
	runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)

	genres = []
	for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
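		# hedged completion (the original is truncated here): collect each
		# genre link's text, then join with semicolons for the output field
		genres.append(clean_unicode(genre.content))
	genres = ';'.join(genres)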
Code example #17
	# alphabetical letter so we can observe the CSVs being successfully created as the script runs
	csv_format = "csv/rider-data-%s.csv"
	csv_filename = convertUnicodeToAscii(csv_format % competitorLetterUrl[-1])

	# Creating the csv output file for writing into as well as defining the writer
	output = open(csv_filename, "wb")
	writer = UnicodeWriter(output)

	# add header row
	writer.writerow(header_row)

	# load up the current competitors listing URL
	url = URL(competitorLetterUrl)
	dom = DOM(url.download(cached=True))

	riderList = dom.by_id("riderlist")

	#       <div class="ttDatabaseNav">
	#         <h4>A</h4>
	#         <ul id="riderlist">
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=5509&amp;filter=A">A.Domini, AKA</a></li>
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=6016&amp;filter=A">Abbey, Ben</a></li>
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=6876&amp;filter=A">Abbott, Roger</a></li>
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=202&amp;filter=A">Abbott, A R</a></li><li>
	#           ... ... ...
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=9845&amp;filter=A">Aylott, Mike</a></li>
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=3178&amp;filter=A">Ayres, Asa</a></li>
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=9071&amp;filter=A">Ayres, Brian</a></li>
	#           <li><a href="/TT-Database/competitors.aspx?ride_id=7&amp;filter=A">Ayton, R</a></li>
	#         </ul>
	#       </div>
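	# hedged sketch (the original is truncated after the markup above; the
	# domain and the two-column row are assumptions): walk the <li> links
	for li in riderList.by_tag("li"):
		a = li.by_tag("a")[0]
		riderName = a.content
		riderUrl = "http://www.iomtt.com" + a.attributes.get("href", "")
		writer.writerow([riderName, riderUrl])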
Code example #18
            Name = dom[e].content
            BandID = dom[e].attrs["href"].split("/")

            LocationAndGenre = unicode(dom[e + 1]).split(",")
            Genre = LocationAndGenre[1].replace("\"", "").replace(",", "-").strip()
            Location = LocationAndGenre[2].replace("\"", "").replace(",", "-").strip()
            Status = dom[e + 2].content.replace("\"", "").strip()
            YearsActive = ""
            Albums = ""

            # Go to band page
            link = URL(dom[e].attrs["href"])
            if link.exists:
                BandPage = DOM(link.download(cached=True))

                YearsActive = BandPage.by_id("wrapper").by_id("content_wrapper").by_id("band_content").by_id("band_info").by_id("band_stats").by_tag("dl")[2].by_tag("dd")[0].content
                YearsActive = YearsActive.split(",")
                years = ""
                for i in range(0, len(YearsActive)):
                    # strip html tags; str.replace can't take a regex, so use
                    # re.sub (requires `import re` at the top of the script)
                    YearsActive[i] = re.sub(r'<[^>]+>', '', YearsActive[i])
                    YearsActive[i] = YearsActive[i].replace("\"", "").strip()
                    YearsActive[i] = YearsActive[i].replace("\t", "").replace("   ", "")
                    if len(YearsActive) > 1 and i < len(YearsActive) - 1:
                        years += YearsActive[i] + ", "
                    else:
                        years += YearsActive[i]

                YearsActive = years
                # print YearsActive
Code example #19
dom = DOM(url.download(cached=True))

for restaurant in dom.by_class("ResultRow"):
    rinfo = restaurant.by_class("ReCol")[0].by_class("rinfo")[0]
    name = rinfo.by_tag("a")[0].content.encode('ascii', 'ignore')
    neighborhood_cuisine = rinfo.by_class("d")[0].content.encode('ascii', 'ignore').split('|')
    neighborhood = neighborhood_cuisine[0]
    cuisine = neighborhood_cuisine[1]
    meals = rinfo.by_class("message")[0].content.encode('ascii', 'ignore')
    meals = meals.split('<')[0]  # keep only the text before the first tag
    # resolve the relative link against the page URL
    restURL = URL(abs_url(rinfo.by_tag("a")[0].attributes.get('href', ''), base=url.redirect or url.string))
    restDOM = DOM(restURL.download(cached=True))
    # need to clean
    address = restDOM.by_id("ProfileOverview_lblAddressText").content
    price = restDOM.by_id("ProfileOverview_lblPriceText").content
    # the ratings text lives in the label's title attribute, when present
    ratings_node = restDOM.by_id("RestPopLabel_ReviewsFormat")
    if ratings_node is not None and ratings_node.attributes.get('title'):
        ratings = ratings_node.attributes['title']
    else:
        ratings = 'not available'
    style = restDOM.by_id("ProfileOverview_DiningStyle").by_class("value")[0].content
    try:
        website = restDOM.by_id("ProfileOverview_Website").by_tag("a")[0].content
    except AttributeError:
        website = "not available"
    phone = restDOM.by_id("ProfileOverview_Phone").by_class("value")[0].content
    dress = restDOM.by_id("ProfileOverview_DressCode").by_class("value")[0].content
    writer.writerow([name, neighborhood, cuisine, style, meals, dress, ratings, price, phone, address, website])