def getRandomHistoryDOM(language): url = URL("http://"+language+".wikipedia.org/wiki/Special:Random") #Gets the url only of the page this redirects to redirectUrl = url.redirect try: #Grab the name of the wikipedia article from the url urlComponents = string.split(redirectUrl, '/') except AttributeError: #Use some recursion if we encounter a page with no history, or some other error return getRandomHistoryDOM(language) #Get the history section of the article redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history" print "Current article is: " +str(urlComponents[4]) #print redirectUrl url = URL(redirectUrl); dom = DOM(url.download(cached=False)) try: historyList = dom.by_id("pagehistory").by_tag("li") return historyList, urlComponents[4] except AttributeError: #Use some recursion if we encounter a page with no history, or some other error dom = getRandomHistoryDOM(language) return getRandomHistoryDOM(language)
def scrape_education(county_num): if county_num<10: county_num = '0' + str(county_num) else: county_num = str(county_num) print county_num #url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1' url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1' abs_url = URL(string = url) dom = DOM(abs_url.download(cached=True))#download the DOM other = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[12].content.replace(',','') associates = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[11].content.replace(',','') bachelors = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[9].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[10].content.replace(',',''))) masters = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[4].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[5].content.replace(',',''))) jurisdoctor = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[3].content.replace(',','') doctorate = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[2].content.replace(',','') bachelors_and_less = str(int(bachelors) + int(associates) + int(other)) post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate)) county = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("a")[0].content # write all the collected data to a new row of the output file writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
def getTitle(self, link):
    """Download *link* and return the title text found in the
    #main-article-info element, or '' when that element is absent."""
    page = DOM(URL(link).download())
    info = page.body.by_id("main-article-info")
    if not info:
        return ''
    return info.children[1].content.strip()
def getVisByCountry(site):
    """Populate the module-level sites[site] with a {country: pct} mapping
    scraped from the page's 'visitors-by-country' table (empty dict when
    the table is missing)."""
    countries = {}
    dom = DOM(URL(base + site).download(cached=True))
    table = dom.by_id("visitors-by-country")
    if table is not None:
        for row in table.by_class("tr1"):
            anchor = row.by_tag("a")[0]
            # Skip the "show more countries" toggle row.
            if anchor.attributes.get("id") != "toggleMoreCountryVisits":
                name = anchor.content.split(" ")[1].strip()
                # Percentage text ends with '%'; drop it before float().
                share = float(row.by_tag("p")[1].content[0:-1])
                countries[name] = share
    sites[site] = countries
def htmlParser(self, link):
    """Download *link* and return the plain text of its #content element
    as unicode; returns '' when the element is absent."""
    page_body = DOM(URL(link).download()).body
    main = page_body.by_id("content")
    if not main:
        return ''
    stripped = plaintext(main.content, linebreaks=2, indentation=True)
    return unicode(stripped.strip())
def load(year, pagenum, pagerank):
    """Return the IMDb title URL for the pagerank-th entry of the given
    moviemeter-sorted feature-film results page for *year*."""
    y = str(year)
    listing = URL("http://www.imdb.com/search/title?at=0&sort=moviemeter,asc&start="
                  + str(pagenum) + "&title_type=feature&year=" + y + "," + y)
    page = DOM(listing.download(timeout=30, cached=True))
    anchor_source = (page.by_id("main")
                         .by_class("results")[0]
                         .by_class("title")[pagerank]
                         .by_tag("a")[0].source)
    # Pull the /title/tt.../ path out of the anchor's raw HTML.
    match = re.search(r'/title/+[t0-9]+/', anchor_source)
    return URL("http://www.imdb.com" + match.group(0))
# Mongo-backed restaurant scraper method: for every restaurant document
# lacking an 'information' field, downloads its page and extracts
#   - tags: anchors following '<b>Tags: </b>' inside #LocationMetaData
#     (source decoded via cp1252 with errors ignored -- presumably the
#     site's encoding; TODO confirm),
#   - review: plain text of #LocationDescription,
#   - details: first <p> of #LocationRestaurantDetails,
# then stores the dict under rs['details'] and saves the document;
# otherwise prints a counter and the restaurant name.
# NOTE(review): the skip-check tests rs.get('information') but the scrape
# writes rs['details'], so already-scraped documents are re-scraped on
# every run -- looks like a bug; confirm intended field name.
# Left byte-identical: the collapsed line makes the indentation of the
# trailing `i +=1` (inside the else vs. at loop level) unrecoverable.
def getTextAboutResturants(self): # get text about resturants i=0 for rs in self.conn.resturants.find(): if not rs.get('information'): information = {} request = DOM(URL(rs['url']).download()) # Tags if request.by_id('LocationMetaData'): source = str(request.by_id('LocationMetaData').source.encode('cp1252', 'ignore')) tags = Element(source[source.find('<b>Tags: </b>'):]).by_tag('a') if tags: information['parsedTags'] = [ (tag.attributes['href'], tag.content) for tag in tags] # Review if request.by_id('LocationDescription'): information["review"] = plaintext(request.by_id('LocationDescription').content) # Details if request.by_id('LocationRestaurantDetails'): information["details"] = request.by_id('LocationRestaurantDetails').by_tag('p')[0].content rs['details'] = information print information self.conn.resturants.save(rs) else: print i, rs['name'] i +=1
def getResturants(self):
    """Crawl listing pages 1-47 and insert one {name, url} document per
    restaurant into the module-level Mongo collection."""
    # get all resturants and urls
    for page in range(1, 48):
        params["page"] = page
        listing = DOM(URL(url, query=params).download())
        rows = listing.by_id('searchResults').by_class('locationListing clearfix')
        for row in rows:
            # Last anchor in the row's <h4> holds the restaurant link.
            anchor = row.by_tag('h4')[0].by_tag('a')[-1]
            record = {'name': plaintext(anchor.content),
                      'url': anchor.attributes['href']}
            conn['resturants'].insert(record)
def get_search_string(search, proxy):
    """Build the 1channel.ch search URL for a movie title.

    Titles that the site's search handles badly (punctuation, accents,
    subtitles) are first mapped to hand-picked substitute queries, then
    the site's search form is scraped for its action URL and hidden
    key/search_section fields.  Returns the fully assembled query string.
    """
    # Hand-picked substitutes for titles the site's search chokes on.
    # (Idiom fix: replaces a 15-deep chain of if-statements; no alias
    # value matches another alias key, so behavior is identical.)
    aliases = {
        "Schindler's List": "Schindler",
        "One Flew Over the Cuckoo's Nest": "one flew over",
        "It's a Wonderful Life": "wonderful life",
        u"L\xe9on: The Professional": "the professional",
        "Terminator 2: Judgment Day": "Terminator 2",
        u"Am\xe9lie": "Amelie",
        "L.A. Confidential": "Confidential",
        "Pan's Labyrinth": "pan",
        "A Few Dollars More": "dollars",
        "The Secret in Their Eyes": "El secreto de sus ojos",
        "The King's Speech": "the king",
        "Howl's Moving Castle": "howl",
        "Harry Potter and the Deathly Hallows: Part 2": "harry potter",
        "Who's Afraid of Virginia Woolf?": "virginia woolf",
        "Rosemary's Baby": "rosemary",
    }
    search = aliases.get(search, search)
    url = URL("http://1channel.ch")
    dom = DOM(url.download(cached=False, timeout=20, proxy=proxy))
    form = dom.by_id("searchform")
    s_base = form.attributes.get("action")
    s_text = "_keywords=" + search.replace(" ", "+")
    # Hidden anti-bot form fields required by the search endpoint.
    key = form.by_attribute(name="key")[0].attributes.get("value")
    s_section = form.by_attribute(name="search_section")[0].attributes.get("value")
    return s_base + s_text + "&key=" + key + "&search_section=" + s_section
def scrape_truancy(county_num): if county_num<10: county_num = '0' + str(county_num) else: county_num = str(county_num) print county_num #url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=01000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt' url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=' + county_num + '000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt' abs_url = URL(string = url) dom = DOM(abs_url.download(cached=True))#download the DOM county = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("a")[0].content total_enrollment = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[3].content suspensions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[4].content suspension_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[5].content expulsions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[6].content expulsion_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[7].content truants = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[8].content trauncy_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[9].content #For the first county only, also grab the statewide totals if county_num=='01': state_total_enrollment = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[3].content state_suspensions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[4].content state_suspension_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[5].content state_expulsions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[6].content state_expulsion_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[7].content state_truants = 
dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[8].content state_trauncy_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[9].content # write the statewide total data to the top row of the output file writer.writerow(["California Total", state_total_enrollment,state_suspensions, state_suspension_rate, state_expulsions, state_expulsion_rate, state_truants, state_trauncy_rate]) # write all the collected data to a new row of the output file writer.writerow([county, total_enrollment,suspensions, suspension_rate, expulsions, expulsion_rate, truants, trauncy_rate])
# NOTE(review): fragment -- the leading "return dfunv = ..." is the tail of
# an entry-writing helper defined outside this view (an early return, then
# dfunv/writerow logic using *fun() scraper helpers), glued to the top-level
# driver loop: years 2000-2010, page offsets stepping by 50, pageranks 0-49.
# Each iteration loads a title page via load(), scrapes #overview-top, pulls
# business-page text via loadbus(), and calls entrytest() to write a CSV row,
# closing `output` at the end.  Statement boundaries were lost when the file
# was collapsed to one line; left byte-identical rather than guessing the
# original indentation.
return dfunv = dfun(dom2) if dfunv == "": return writer.writerow([tfun(data), runfun(data), gfun(data), dfun(data), wfun(data), afun(data), msfun(data), rtfun(data), rtnmfun(data), bfun(text), bousfun(text), bowfun(text), mpaafun(data), dfun(dom2)]) #this handles tags year = 2000 while year < 2011: pagenum = 1 while pagenum < 101: pagerank = 0 while pagerank < 50: url2 = load(year, pagenum, pagerank) dom2 = DOM(url2.download(timeout=30, cached=True)) data = dom2.by_id("overview-top") text = loadbus(url2) print dfun(data) entrytest(data,text,dom2) pagerank += 1 pagenum += 50 year += 1 output.close()
def loadbus(url):
    """Fetch the IMDb 'business' sub-page for a title URL and return the
    text of its #tn15content element as an ASCII-only byte string."""
    bus_url = URL(str(url) + "business?ref_=tt_dt_bus")
    bus_dom = DOM(bus_url.download(timeout=30, cached=True))
    body_text = bus_dom.by_id("tn15content").content
    return body_text.encode('ascii', 'ignore')
# NOTE(review): fragment of a sports-roster scraping script.  Builds the
# header row (column labels from `roster_labels`, UTF-8 encoded, with
# 'team_id' and 'Season' key columns prepended), seeds roster_container
# with it, grabs the #roster table's <tr> rows from a `dom` built outside
# this view, and begins iterating players -- the per-player loop body
# continues past this fragment, so it is left byte-identical.
#build the final list of variables called season_label_container roster_labels_container=[] for label in roster_labels: roster_labels_container.append(label.content.encode("utf8")) #add in a column for the team acronym to act as a key roster_labels_container.insert(0,"team_id") roster_labels_container.insert(1,"Season") #the roster_container holds all of the players for the specified team/year roster_container = [] roster_container.append(roster_labels_container) print roster_labels_container # this is so powerful - I just needed to look and find the id for the roster table all_divs = dom.by_id("roster") #roster_trs holds a list of players info roster_trs = all_divs.by_tag("tr") #iterate through each player in the roster for trs in roster_trs: #this will hold the final encoded info/stats pulled from the current player this_roster_farian = [] #now add the team id and the season this_roster_farian.append(this_team_acronym) this_roster_farian.append(this_season) for t in trs.by_tag("td"): #the player name has a link to the player, #e.g. '<a href="/players/m/milledr01.html">Drew Miller</a>'
# NOTE(review): fragment of a college-swimming roster scraper, starting
# mid-conditional (the enclosing if/elif over year_name is outside this
# view; "Fr" = freshman -> graduation = neu_year + 4).  Splits swimmer
# names on the first space (reversing name order for school ids 2 and 3,
# presumably Columbia/Princeton per the comment), appends
# [last, first, grad_year, school] rows to `swimmers`, then starts the
# Cornell section: collects past-roster ids from the roster <select>
# options and begins looping over YEARS_TO_SCRAPE seasons.  Depends on
# names defined outside this fragment (cells, school, neu_schools, year,
# YEARS_TO_SCRAPE); left byte-identical.
elif year_name[0:2] == "Fr": grad_year = int(neu_year) + 4 # split name into first and last (adjusting for title rows) name = cells[0].by_tag("a") name = cells[0].content.split(" ", 1) if len(name) == 0 else cells[0].by_tag("a")[0].content.split(" ", 1) # reorder name if from Columbia or Princeton if school in (2,3): name.reverse() # add swimmer (last name, first name, graduating year, school) to array swimmers.append([name[1].encode('ascii', 'ignore').strip(",").strip(), name[0].encode('ascii', 'ignore').strip(), grad_year, neu_schools[school]]) # Get all Cornell Roster id numbers for the URLs url = URL("http://www.cornellbigred.com/roster.aspx?roster=847") dom = DOM(url.download(cached=True)) options = dom.by_id("ctl00_cplhMainContent_ddlPastRosters").by_tag("option") base_url = "http://www.cornellbigred.com/roster.aspx?roster=" cornell_roster_ids = [] for option in options: cornell_roster_ids.append(str(option.attrs["value"])) # define years array cornell_years = [] for i in range(YEARS_TO_SCRAPE): cornell_years.append(str(year-i)) counter = 0 for cornell_year in cornell_years: print counter print "Cornell" + " " + cornell_year url_string = base_url + cornell_roster_ids[counter]
# NOTE(review): fragment of a Yahoo fantasy player-table scraper.  Reads
# saved HTML pages from a local glob, pulls player name, team, position
# (from the "(TEAM - POS)" span) and owner ('FA' when no owner link) out
# of #statTable0 rows, and writes them tab-separated to player_table.txt.
# The trailing `try: l.encode('ascii')` is cut off mid-statement -- its
# except clause lies beyond this fragment -- so the whole span is left
# byte-identical.
from pattern.web import URL, DOM from pattern.db import Datasheet import glob, re urls = glob.glob('/Users/tnatoli/Desktop/pages/*.html') headers = ['player', 'pos', 'team', 'owner'] f = open('player_table.txt', 'w') f.write('\t'.join(headers) + '\n') for u in urls: url = URL(u) dom = DOM(url.download(cached=False)) tbody = dom.by_id('statTable0').by_tag('tbody')[0] for tr in tbody.by_tag('tr'): pname = tr.by_class('ysf-player-name')[0].by_tag('a')[0].content team_pos = tr.by_class('ysf-player-team-pos')[0].by_tag('span')[0].content team = re.sub('\(', '', team_pos.split(' - ')[0]) pos = re.sub('\)', '', team_pos.split(' - ')[1]) owner_links = tr.by_class('owner')[0].by_tag('a') if owner_links: owner = owner_links[0].content else: owner = 'FA' line = '\t'.join([pname, team, pos, owner]) print line for l in line: try: l.encode('ascii')
# NOTE(review): fragment of an IMDb list scraper.  Collects per-movie page
# URLs from the rows of the second <table> under #main (using `dom`, `url`,
# `abs_url` and `clean_unicode` defined outside this view), then for each
# movie page extracts the title (with <span> markup stripped), the runtime
# from the infobar's <time> tag, and begins collecting genre anchors for a
# semicolon-separated CSV.  The final `for genre ...:` loop header is cut
# off before its body, so the span is left byte-identical.
#With the movie links, scrape each entry #You will get the the following items: #Produce a comma-separated text file (use semicolons to separate the entries) with a header row and the fields: # Title of movie # Runtime # Genre (separated by semicolons if multiple) # Director(s) # Writer(s) # Actors (listed on the page directly only or first three, separated by semicolons) # Ratings # Number of Ratings page_urls = [] tableRows = dom.by_id('main').by_tag('table')[1].by_tag('tr') for tr in tableRows[1:]: a = tr.by_tag('a')[0] page_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string))) for p in page_urls: p_url = URL(p) p_dom = DOM(p_url.download(cached=True)) title = clean_unicode(p_dom.by_class('header')[0].content) title = plaintext(strip_between('<span', '</span>', title)) runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content) genres = [] for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
# NOTE(review): fragment of an Isle of Man TT rider-database scraper, from
# inside a loop over competitor-letter listing URLs (competitorLetterUrl,
# header_row, UnicodeWriter and convertUnicodeToAscii are defined outside
# this view).  Opens one CSV per letter (csv/rider-data-<letter>.csv, the
# letter taken from the URL's last character), writes the header row,
# downloads the listing page and grabs the #riderlist element; the long
# trailing comment documents the expected markup.  Left byte-identical.
# alphabetical letter so we can observe the CSVs being successfully created as the script runs csv_format = "csv/rider-data-%s.csv" csv_filename = convertUnicodeToAscii(csv_format % competitorLetterUrl[-1]) # Creating the csv output file for writing into as well as defining the writer output = open(csv_filename, "wb") writer = UnicodeWriter(output) # add header row writer.writerow(header_row) # load up the current competitors listing URL url = URL(competitorLetterUrl) dom = DOM(url.download(cached=True)) riderList = dom.by_id("riderlist") # <div class="ttDatabaseNav"> # <h4>A</h4> # <ul id="riderlist"> # <li><a href="/TT-Database/competitors.aspx?ride_id=5509&filter=A">A.Domini, AKA</a></li> # <li><a href="/TT-Database/competitors.aspx?ride_id=6016&filter=A">Abbey, Ben</a></li> # <li><a href="/TT-Database/competitors.aspx?ride_id=6876&filter=A">Abbott, Roger</a></li> # <li><a href="/TT-Database/competitors.aspx?ride_id=202&filter=A">Abbott, A R</a></li><li> # ... ... ... # <li><a href="/TT-Database/competitors.aspx?ride_id=9845&filter=A">Aylott, Mike</a></li> # <li><a href="/TT-Database/competitors.aspx?ride_id=3178&filter=A">Ayres, Asa</a></li> # <li><a href="/TT-Database/competitors.aspx?ride_id=9071&filter=A">Ayres, Brian</a></li> # <li><a href="/TT-Database/competitors.aspx?ride_id=7&filter=A">Ayton, R</a></li> # </ul> # </div>
# NOTE(review): fragment of a metal-archives-style band scraper, from
# inside a loop whose index `e` and `dom` are defined outside this view.
# Reads band name/id/genre/location/status from three consecutive DOM
# entries, follows the band link, digs the 'years active' <dd> out of the
# band_stats block, then strips quotes/tabs/spaces from each comma-
# separated span and rejoins them.
# NOTE(review): YearsActive[e].replace(r'<([A-Z][A-Z0-9]*)...', '') calls
# str.replace with a regex PATTERN -- plain substring replacement, so it
# never matches and removes no HTML tags.  Presumably re.sub was intended;
# left byte-identical since the surrounding loop is outside this view.
# Also note the inner `for e in range(...)` reuses and clobbers the outer
# loop variable `e`.
Name = dom[e].content BandID = dom[e].attrs["href"].split("/") LocationAndGenre = unicode(dom[e + 1]).split(",") Genre = LocationAndGenre[1].replace("\"", "").replace(",", "-").strip() Location = LocationAndGenre[2].replace("\"", "").replace(",", "-").strip() Status = dom[e + 2].content.replace("\"", "").strip() YearsActive = "" Albums = "" # Go to band page link = URL(dom[e].attrs["href"]) if link.exists: BandPage = DOM(link.download(cached=True)) YearsActive = BandPage.by_id("wrapper").by_id("content_wrapper").by_id("band_content").by_id("band_info").by_id("band_stats").by_tag("dl")[2].by_tag("dd")[0].content YearsActive = YearsActive.split(",") years = "" for e in range(0, len(YearsActive)): # Deletes all html tags in yearsactive YearsActive[e] = YearsActive[e].replace(r'<([A-Z][A-Z0-9]*)\b[^>]*>(.*?)</\1>', '') YearsActive[e] = YearsActive[e].replace("\"", "").strip() YearsActive[e] = YearsActive[e].replace("\t", "").replace(" ", "") if(len(YearsActive) > 1 and e < (len(YearsActive) - 1)): years += YearsActive[e] + ", " else: years += YearsActive[e] YearsActive = "".join(years) # print YearsActive
# NOTE(review): fragment of an OpenTable-style restaurant scraper (`url`,
# `abs` and `writer` are defined outside this view).  For each .ResultRow:
# pulls name and the '|'-separated neighborhood/cuisine string, takes the
# meals text up to the first '<', resolves the restaurant's own page URL,
# and scrapes address, price, ratings (tooltip title; 'not available' on
# TypeError), dining style, website ('not available' on AttributeError),
# phone and dress code, writing one CSV row per restaurant.  Statement
# boundaries are ambiguous in the collapsed line (try/except bodies), so
# it is left byte-identical.
dom = DOM(url.download(cached=True)) for restaraunt in dom.by_class("ResultRow"): name = restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].content.encode( 'ascii', 'ignore' ) neighborhood_cuisine = restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_class("d")[0].content.encode( 'ascii', 'ignore' ) neihgborhood_cuisine = neighborhood_cuisine.split('|') neighborhood = neihgborhood_cuisine[0] cuisine = neihgborhood_cuisine[1] meals = restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_class("message")[0].content.encode( 'ascii', 'ignore' ) meals = meals.split('<') # need to clean meals = meals[0] restURL = URL(abs(restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].attributes.get('href',''), base=url.redirect or url.string)) restDOM = DOM(restURL.download(cached=True)) # need to clean address = restDOM.by_id("ProfileOverview_lblAddressText").content price = restDOM.by_id("ProfileOverview_lblPriceText").content try: ratings = restDOM.by_id("RestPopLabel_ReviewsFormat")[0].attributes ratings = ratings['title'] except TypeError: ratings = 'not available' style = restDOM.by_id("ProfileOverview_DiningStyle").by_class("value")[0].content try: website = restDOM.by_id("ProfileOverview_Website").by_tag("a")[0].content except AttributeError: website = "not available" phone = restDOM.by_id("ProfileOverview_Phone").by_class("value")[0].content dress = restDOM.by_id("ProfileOverview_DressCode").by_class("value")[0].content writer.writerow([name, neighborhood, cuisine, style, meals, dress, ratings, price, phone, address, website])