Example #1
import datetime
import util  # provides mysoupopen() used below

# EnsureDirectory() and download() are helper functions assumed to be defined
# elsewhere in this script.
def main():

    ##<ul class="oe_pressReleasesList">
    ##		<table id="ai_official_reports">
    ##            <tr>
    ##                <td><a href="http://web.oie.int/wahis/reports/en_fup_0000010121_20110102_154206.pdf" target="_top" title="Follow-up report No. 2">Follow-up report No. 2</a>&nbsp;&nbsp;<a style="color:#FAF9F8" name="c27054">27054</a></td>

    base = "http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/"  # 2011 - 2004
    # variation http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/2011
    ## http://web.oie.int/wahis/reports/en_fup_0000010289_20110304_170141.pdf

    # Make a new directory for this run with date as name.
    tm = datetime.datetime.now()
    datadir = "../data/" + tm.strftime("%Y-%m-%d_t%H_%M_%S")
    #datadir = "D:\\School\\171\\fp\\data\\" + str(tm)

    EnsureDirectory(datadir)

    # Keep going until we hit a year page that has no report links.
    # -- Loop on pages.
    for yr in range(4, 8):  # yr 4..7 -> report pages for 2004-2007
        print "Year : ", yr

        if (yr < 10):
            yearPostFix = "200" + str(yr)
        else:
            yearPostFix = "20" + str(yr)

        url = base + yearPostFix
        print "PAGE -----------------------", url

        # Find the pdf file links
        soup = util.mysoupopen(url)

        # find table with the pdf links <table id="ai_official_reports">
        linkTable = soup.findAll("table", {"id": "ai_official_reports"})

        if (len(linkTable) == 0):
            print "Did not find any links!"
            break

        anchorList = soup.findAll("a", {"target": "_top"})

        # Break out of the page loop when there are no PDF report links.
        if (len(anchorList) == 0):
            print "No pdf links found???"
            break

        # create a directory for this year.
        yearDir = datadir + "/" + yearPostFix + "/"
        EnsureDirectory(yearDir)
        # -- Loop over the PDF report links and download each one.
        for anchor in anchorList:
            href = anchor['href']
            print href
            download(href, yearDir)

        pass
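
# EnsureDirectory() and download() are called above but not shown. A minimal
# sketch of what they might look like, using only the standard library (an
# assumption, not the original helpers):
import os, urllib

def EnsureDirectory(path):
    # create the directory (and any missing parents) if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)

def download(url, destDir):
    # save the file at url into destDir, keeping its original file name
    fileName = url.split("/")[-1]
    urllib.urlretrieve(url, os.path.join(destDir, fileName))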
Example #3
# Problem: still need to learn how to match things that DON'T contain certain
# patterns (see the note at the end of this example).

import re, util

# this creates a tab-delimited file
delim = "\t"

# base url
base = "http://www.elyrics.net"

# find all in flames songs by going to the following url
url = base + "/song/i/in-flames-lyrics.html"

# use cs171-util to get a soup object that represents a webpage
soup = util.mysoupopen(url)

# songs holds the html code that holds the names of the songs
songs = soup.findAll("table", {"class": "songs"})

# If we find some songs, then find the urls to each song's lyrics
if (len(songs) != 0):

    # songs is a soup object
    # however, we'd like it to be a string, so we can use reg exps to
    # search through it! songs_str will be that string
    songs_str = str(songs[0])

    # find links for every song listed.
    m = re.findall('<a href="(.*?)">.*?</a', songs_str)

# Now go to each url and get the lyrics
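
# The example stops here. A minimal sketch of one way the scrape might
# continue (an assumption, not the original code): treat each captured href
# as a path under base, fetch the song page, and look for a lyrics container.
# The "inlyr" id is a hypothetical placeholder for whatever element actually
# holds the lyric text on elyrics.net.
if len(songs) != 0:
    for href in m:
        song_soup = util.mysoupopen(base + href)
        lyric_div = song_soup.find("div", {"id": "inlyr"})
        if lyric_div is not None:
            lyric_text = " ".join(lyric_div.findAll(text=True))
            print lyric_text.strip()

# The "DON'T have certain patterns" problem noted at the top can usually be
# handled with a negative lookahead, e.g. keep only hrefs that do NOT start
# with "http" (relative links only):
#   m = re.findall('<a href="((?!http).*?)">', songs_str)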
Example #4
# problems - need to learn how to search for things that DON'T have certain patterns

import re, util

# this creates a tab-delimited file
delim = "\t"

# base url
base = "http://www.census.gov"

# go to the first page
url = base + "/econ/census02/data/al/AL000_71.HTM"

# use cs171-util to get a soup object that represents a webpage
soup = util.mysoupopen(url)

# states holds the <select> element that lists every state's page extension
states = soup.findAll("select", {"name": "Location"})

# If we find the states list, then find the urls to each state's data page
if (len(states) != 0):

    # states is a soup object
    # however, we'd like it to be a string, so we can use reg exps to
    # search through it! states_str will be that string
    states_str = str(states[0])
    states_str = states_str.replace("\n", "")
    # cut down our states_str to only the state links
    m = re.findall('<select name="Location">(.*?)</select>', states_str)

    # Now make a list of urls, one per state.
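    # The example stops here. A minimal sketch of how the state urls might be
    # built (an assumption, not the original code): pull the value attribute
    # out of each <option> inside the captured <select> markup and join it
    # onto base. Whether the option value is a full path or just a short code
    # is a guess about the page, not a verified fact.
    state_urls = []
    if len(m) != 0:
        for value in re.findall('<option value="(.*?)"', m[0]):
            state_urls.append(base + value)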
Example #5
import re, util, string, datetime, time

# configuration
STARTING_YEAR = 1996
DELIMITER = "\t"
URL_TEMPLATE = "http://www.swivel.com/data_sets/spreadsheet/1000052?page=%(page)s"
PAGES = range(1,3)
DATE_COLUMN_PROPERTIES = {"class": "DateTimeDataFormat "}
RESIDENTIAL_PRICE_COLUMN_PROPERTIES = {'class': 'NumberDataFormat column1001109'}
COMMERCIAL_PRICE_COLUMN_PROPERTIES = {'class': 'NumberDataFormat column1001110'}
INDUSTRIAL_PRICE_COLUMN_PROPERTIES = {'class': 'NumberDataFormat column1001111'}

prices_by_date = {}
print "Month Year" + DELIMITER + "Residential" + DELIMITER + "Commercial" + DELIMITER + "Industrial"
for page in PAGES:
  soup = util.mysoupopen(URL_TEMPLATE % {"page": page})
  table = soup.find("table", {"class": "data"})

  # for each row on page
  for row in table.findAll("tr"):
    date_column = row.find("td", DATE_COLUMN_PROPERTIES)
    if date_column is None: continue
  
    # scrape data from rows and store in prices_by_date
    date_struct = time.strptime(date_column.string.strip(), "%b %Y")
    date = datetime.date(date_struct[0], date_struct[1], 1)
    residential_price = row.find("td", RESIDENTIAL_PRICE_COLUMN_PROPERTIES).string.strip()
    commercial_price = row.find("td", COMMERCIAL_PRICE_COLUMN_PROPERTIES).string.strip()
    industrial_price = row.find("td", INDUSTRIAL_PRICE_COLUMN_PROPERTIES).string.strip()
    if date.year >= STARTING_YEAR:
      prices_by_date[date] = {'residential': residential_price, 'commercial': commercial_price, 'industrial': industrial_price}
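
# The table header is printed above, but the example stops before emitting any
# rows. A minimal sketch of one way to print prices_by_date in date order,
# matching the tab-delimited header (an assumption about the intended output):
for date in sorted(prices_by_date.keys()):
  row = prices_by_date[date]
  print date.strftime("%b %Y") + DELIMITER + row['residential'] + DELIMITER + row['commercial'] + DELIMITER + row['industrial']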
Example #6
#!/usr/bin/env python
# Courtesy of Sue in Boston
# http://zombievir.us/2010/02/beautiful-soup-for-ugly-html/

import re, util
from BeautifulSoup import BeautifulSoup, Comment
 
# url accumulator (renamed so it does not shadow the built-in list)
results = []
 
# base url
baseurl = "http://www.plyrics.com/w/weezer.html"
 
# use cs171-util to get a soup object that represents a webpage
soup = util.mysoupopen(baseurl)
#I started with the dido lyric scraper I saw on the forums. I have never written
#in python before. I played with it for a while and came up with my own way of
#grabbing urls.
 
# titleCols- grab the HREF values of all links
titleCols = soup.findAll("a", href=True)
 
# if there are no href values, stop the program
if(len(titleCols) == 0):
    raise SystemExit
 
# for each entry
for url in titleCols:
    mc = str(url['href'])
    #print mc
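    # The example stops here. A minimal sketch of how the loop might continue
    # (an assumption, not the original code): keep only hrefs that look like
    # song pages (guessing they contain "lyrics/" and end in ".html") and
    # resolve relative links against plyrics.com before fetching them.
    if "lyrics/" in mc and mc.endswith(".html"):
        if mc.startswith("http"):
            songurl = mc
        else:
            songurl = "http://www.plyrics.com/" + mc.lstrip("./")
        results.append(songurl)
        songsoup = util.mysoupopen(songurl)
        #print songsoup.title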
Example #7
import util  # needed for util.mysoupopen()

# this creates a tab-delimited file
delim = "\t"

# base url
base = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&Category=334&N=2003340000&SpeTabStoreType=0"

# counter to keep track of which page we're on
ctr = 0

# keep going until we find a page that doesn't have any products
while (True):
    url = base + "&page=" + str(ctr)
    ctr += 1

    # use cs171-util to get a soup object that represents a webpage
    soup = util.mysoupopen(url)

    # infoCols holds the html code that holds the names of the products
    infoCols = soup.findAll("td", {"class": "midCol"})

    # priceCols holds the html code that holds the prices of the products
    priceCols = soup.findAll("ul", {"class": "priceCol"})

    # if we didn't find any products, break out of the loop and finish the
    # program!
    if (len(infoCols) == 0):
        break

    # for each product...
    for i in range(len(priceCols)):
        # infoCols[i] is a soup object
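        # The example stops here. A minimal sketch of how the loop body might
        # finish (an assumption, not the original code): pull the product name
        # out of the midCol cell and the price text out of the priceCol list,
        # then print one tab-delimited row. The exact tags are guesses about
        # Newegg's markup at the time.
        name_anchor = infoCols[i].find("a")
        price_strong = priceCols[i].find("strong")
        if name_anchor is None or price_strong is None:
            continue
        name = " ".join(name_anchor.findAll(text=True)).strip()
        price = " ".join(price_strong.findAll(text=True)).strip()
        print name + delim + price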
Example #8
import re, util, string

DELIMITER = "\t"
URL_TEMPLATE = "http://www.usc.edu/schools/college/crcc/engagement/resources/texts/muslim/quran/%(chapter_number)s.qmt.html"
TRANSLATOR = "YUSUFALI"
VERSE_MATCHING_REGEX = "<strong>%(translator)s:</strong>([^<]+?)<br\s?/>" % {'translator': TRANSLATOR}
words = {}
FIRST_CHAPTER = 1
LAST_CHAPTER = 114
STOP_WORDS = map(string.rstrip, open('stop_words.txt').readlines())

# Scrape 114 chapters in the Qur'an
for chapter_number in range(FIRST_CHAPTER,LAST_CHAPTER+1):
    url = URL_TEMPLATE % {'chapter_number': str(chapter_number).zfill(3)}
    soup = util.mysoupopen(url)

    verse_matches = re.findall(VERSE_MATCHING_REGEX, str(soup))
    
    verse_number = 1
    # for each verse in chapter_number
    for verse_match in verse_matches:
        # remove punctuations and leading/trailing spaces
        verse_text = re.sub("'|,|;|:|'|\"|\!|\?|\.|\(|\)|\-", " ", verse_match).strip()
        verse_text = re.sub("\s{2,5}|\n", " ", verse_text)
        
        # add verse's words to dictionary
        for word in re.split("\s+", verse_text):
          word = word.lower()
          if(word in STOP_WORDS):
            continue
          if(word not in words):
            words[word] = 0
          words[word] += 1
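
# The example ends here. A minimal sketch of one way to emit the collected
# counts, most frequent first and tab-delimited (an assumption about the
# intended output; the original output step is not shown):
for word in sorted(words, key=words.get, reverse=True):
    print word + DELIMITER + str(words[word])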