import datetime
import util

# NOTE: EnsureDirectory() and download() are helpers assumed to be defined
# elsewhere in this project; see the sketch after this script.

def main():
    ## Page structure scraped:
    ##   <ul class="oe_pressReleasesList">
    ##   <table id="ai_official_reports">
    ##     <tr>
    ##       <td><a href="http://web.oie.int/wahis/reports/en_fup_0000010121_20110102_154206.pdf" target="_top" title="Follow-up report No. 2">Follow-up report No. 2</a> <a style="color:#FAF9F8" name="c27054">27054</a></td>
    base = "http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/"
    # Pages cover 2004 - 2011.
    # variation: http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/2011
    ## example pdf: http://web.oie.int/wahis/reports/en_fup_0000010289_20110304_170141.pdf

    # Make a new directory for this run with the date as its name.
    tm = datetime.datetime.now()
    datadir = "../data/" + tm.strftime("%Y-%m-%d_t%H_%M_%S")
    #datadir = "D:\\School\\171\\fp\\data\\" + str(tm)
    EnsureDirectory(datadir)

    # -- Loop on pages, one per year (2004 - 2011).
    for yr in range(4, 12):
        print "Year : ", yr
        if (yr < 10):
            yearPostFix = "200" + str(yr)
        else:
            yearPostFix = "20" + str(yr)
        url = base + yearPostFix
        print "PAGE -----------------------", url

        # Fetch the page and find the table with the pdf links:
        # <table id="ai_official_reports">
        soup = util.mysoupopen(url)
        linkTable = soup.findAll("table", {"id": "ai_official_reports"})
        if (len(linkTable) == 0):
            print "Did not find any links!"
            break

        anchorList = soup.findAll("a", {"target": "_top"})
        # Break out of the page loop when there are no report links.
        if (len(anchorList) == 0):
            print "No pdf links found???"
            break

        # Create a directory for this year.
        yearDir = datadir + "/" + yearPostFix + "/"
        EnsureDirectory(yearDir)

        # -- Loop on the report links and download each pdf.
        for anchor in anchorList:
            href = anchor['href']
            print href
            download(href, yearDir)

if __name__ == "__main__":
    main()
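# The script above calls EnsureDirectory() and download(), which are not
# shown. A minimal sketch of what they might look like, using only the
# standard library (both names and behaviors here are assumptions, not the
# original helpers):
import os
import urllib2

def EnsureDirectory(path):
    # Create the directory (and any missing parents) if it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)

def download(href, destDir):
    # Save the file at href into destDir, using the last path segment as name.
    fileName = href.rstrip("/").split("/")[-1]
    data = urllib2.urlopen(href).read()
    out = open(os.path.join(destDir, fileName), "wb")
    out.write(data)
    out.close()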
# problems - need to learn how to search for things that DON'T have certain patterns
import re, util

# this creates a tab-delimited file
delim = "\t"

# base url
base = "http://www.elyrics.net"

# find all In Flames songs by going to the following url
url = base + "/song/i/in-flames-lyrics.html"

# use cs171-util to get a soup object that represents a webpage
soup = util.mysoupopen(url)

# songs holds the html code that holds the names of the songs
songs = soup.findAll("table", {"class": "songs"})

# If we find some songs, then find the urls to each song's lyrics
if (len(songs) != 0):
    # songs is a soup object; we'd like it to be a string so we can use
    # regular expressions to search through it. songs_str will be that string.
    songs_str = str(songs[0])

    # find links for every song listed
    m = re.findall('<a href="(.*?)">.*?</a>', songs_str)

    # Now go to each url and get the lyrics (see the sketch below)
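    # A possible continuation, a sketch only: the hrefs are assumed to be
    # relative paths on elyrics.net, and the <div id="inlyr"> lyrics container
    # is an assumption about the site's markup, not verified.
    for href in m:
        song_soup = util.mysoupopen(base + href)
        lyrics_div = song_soup.findAll("div", {"id": "inlyr"})
        if (len(lyrics_div) != 0):
            # strip the html tags and print the bare lyrics text
            lyrics_text = re.sub("<.*?>", " ", str(lyrics_div[0]))
            print lyrics_text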
# problems - need to learn how to search for things that DON'T have certain patterns
import re, util

# this creates a tab-delimited file
delim = "\t"

# base url
base = "http://www.census.gov"

# go to the first page
url = base + "/econ/census02/data/al/AL000_71.HTM"

# use cs171-util to get a soup object that represents a webpage
soup = util.mysoupopen(url)

# states holds the html code that has all the state extensions
states = soup.findAll("select", {"name": "Location"})

# If we find some states, then find the urls to each state's page
if (len(states) != 0):
    # states is a soup object; we'd like it to be a string so we can use
    # regular expressions to search through it. states_str will be that string.
    states_str = str(states[0])
    states_str = states_str.replace("\n", "")

    # cut down our states_str to only the state links
    m = re.findall('<select name="Location">(.*?)</select>', states_str)

    # now make a list of urls for each state (see the sketch below)
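    # A sketch of turning the match into per-state urls, assuming each
    # <option value="..."> holds a relative path (the option/value layout is
    # an assumption about the census.gov markup, not verified):
    if (len(m) != 0):
        state_urls = []
        for value in re.findall('<option value="(.*?)"', m[0]):
            state_urls.append(base + value)
        for state_url in state_urls:
            print state_url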
import re, util, string, datetime, time

# configuration
STARTING_YEAR = 1996
DELIMITER = "\t"
URL_TEMPLATE = "http://www.swivel.com/data_sets/spreadsheet/1000052?page=%(page)s"
PAGES = range(1, 3)
DATE_COLUMN_PROPERTIES = {"class": "DateTimeDataFormat "}
RESIDENTIAL_PRICE_COLUMN_PROPERTIES = {'class': 'NumberDataFormat column1001109'}
COMMERCIAL_PRICE_COLUMN_PROPERTIES = {'class': 'NumberDataFormat column1001110'}
INDUSTRIAL_PRICE_COLUMN_PROPERTIES = {'class': 'NumberDataFormat column1001111'}

prices_by_date = {}

print "Month Year" + DELIMITER + "Residential" + DELIMITER + "Commercial" + DELIMITER + "Industrial"

for page in PAGES:
    soup = util.mysoupopen(URL_TEMPLATE % {"page": page})
    table = soup.find("table", {"class": "data"})

    # for each row on the page
    for row in table.findAll("tr"):
        date_column = row.find("td", DATE_COLUMN_PROPERTIES)
        if date_column is None:
            continue

        # scrape data from the row and store it in prices_by_date
        date_struct = time.strptime(date_column.string.strip(), "%b %Y")
        date = datetime.date(date_struct[0], date_struct[1], 1)
        residential_price = row.find("td", RESIDENTIAL_PRICE_COLUMN_PROPERTIES).string.strip()
        commercial_price = row.find("td", COMMERCIAL_PRICE_COLUMN_PROPERTIES).string.strip()
        industrial_price = row.find("td", INDUSTRIAL_PRICE_COLUMN_PROPERTIES).string.strip()
        if date.year >= STARTING_YEAR:
            prices_by_date[date] = {'residential': residential_price,
                                    'commercial': commercial_price,
                                    'industrial': industrial_price}
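# The script prints the header row, but the code that prints the data rows
# appears to be cut off. A sketch of what that output step could look like,
# assuming the rows should come out sorted by date in the same tab-delimited
# format as the header:
for date in sorted(prices_by_date.keys()):
    prices = prices_by_date[date]
    row = [date.strftime("%b %Y"), prices['residential'],
           prices['commercial'], prices['industrial']]
    print DELIMITER.join(row)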
#!/usr/bin/env python
# Courtesy of Sue in Boston
# http://zombievir.us/2010/02/beautiful-soup-for-ugly-html/
import re, sys, util
from BeautifulSoup import BeautifulSoup, Comment

# base url
baseurl = "http://www.plyrics.com/w/weezer.html"

# use cs171-util to get a soup object that represents a webpage
soup = util.mysoupopen(baseurl)

# I started with the dido lyric scraper I saw on the forums. I have never written
# in python before. I played with it for a while and came up with my own way of
# grabbing urls.

# titleCols - grab the HREF values of all links
titleCols = soup.findAll("a", href=True)

# if there are no href values, stop the program!
if (len(titleCols) == 0):
    sys.exit()

# for each entry
for url in titleCols:
    mc = str(url['href'])
    #print mc
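# The loop above stops at collecting hrefs. A sketch of a possible next step,
# filtering for links that look like weezer lyric pages (the "lyrics/weezer"
# path pattern is an assumption about plyrics.com's layout, not verified):
for link in titleCols:
    mc = str(link['href'])
    if "lyrics/weezer" in mc:
        # hrefs are assumed relative like "../lyrics/weezer/song.html"
        song_url = "http://www.plyrics.com" + mc.lstrip(".")
        print song_url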
import util

# this creates a tab-delimited file
delim = "\t"

# base url
base = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&Category=334&N=2003340000&SpeTabStoreType=0"

# counter to keep track of which page we're on
ctr = 0

# keep going until we find a page that doesn't have any products
while (True):
    url = base + "&page=" + str(ctr)
    ctr += 1

    # use cs171-util to get a soup object that represents a webpage
    soup = util.mysoupopen(url)

    # infoCols holds the html code that holds the names of the products
    infoCols = soup.findAll("td", {"class": "midCol"})

    # priceCols holds the html code that holds the prices of the products
    priceCols = soup.findAll("ul", {"class": "priceCol"})

    # if we didn't find any products, break out of the loop and finish the
    # program!
    if (len(infoCols) == 0):
        break

    # for each product...
    for i in range(len(priceCols)):
        # infoCols[i] is a soup object
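        # The original ends here. A sketch of what the loop body might do,
        # assuming the product name sits in an <a> inside infoCols[i] and the
        # price in a <strong> inside priceCols[i] (both selectors are guesses
        # about Newegg's markup, not verified):
        name_link = infoCols[i].find("a")
        price_tag = priceCols[i].find("strong")
        if name_link is not None and price_tag is not None:
            print str(name_link.string).strip() + delim + str(price_tag.string).strip()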
import re, util, string

DELIMITER = "\t"
URL_TEMPLATE = "http://www.usc.edu/schools/college/crcc/engagement/resources/texts/muslim/quran/%(chapter_number)s.qmt.html"
TRANSLATOR = "YUSUFALI"
VERSE_MATCHING_REGEX = "<strong>%(translator)s:</strong>([^<]+?)<br\s?/>" % {'translator': TRANSLATOR}

words = {}
FIRST_CHAPTER = 1
LAST_CHAPTER = 114
STOP_WORDS = map(string.rstrip, open('stop_words.txt').readlines())

# Scrape the 114 chapters in the Qur'an
for chapter_number in range(FIRST_CHAPTER, LAST_CHAPTER + 1):
    url = URL_TEMPLATE % {'chapter_number': str(chapter_number).zfill(3)}
    soup = util.mysoupopen(url)
    verse_matches = re.findall(VERSE_MATCHING_REGEX, str(soup))
    verse_number = 1

    # for each verse in chapter_number
    for verse_match in verse_matches:
        # remove punctuation and leading/trailing spaces
        verse_text = re.sub("'|,|;|:|'|\"|\!|\?|\.|\(|\)|\-", " ", verse_match).strip()
        verse_text = re.sub("\s{2,5}|\n", " ", verse_text)

        # add the verse's words to the dictionary, skipping stop words
        for word in re.split("\s+", verse_text):
            word = word.lower()
            if (word in STOP_WORDS):
                continue
            if (word not in words):
                words[word] = 0
            words[word] += 1
        verse_number += 1
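# The tally presumably gets written out after the loop; the original is cut
# off before that point. A sketch of a tab-delimited dump, sorted by count
# (the sort order and output format are assumptions):
for word, count in sorted(words.items(), key=lambda pair: pair[1], reverse=True):
    print word + DELIMITER + str(count)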