Code example #1
def extract_incidents(dom):

    incident_list = []
    i = 0

    for incident in dom.by_tag('tr'):
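        # the first table row (i == 0) is the header, so skip it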
        if i > 0:
            link = INCIDENT_URL + incident.by_tag('a')[0].href
            print link

            url = URL(link)
            html = url.download(timeout=100)
            dom_incident = DOM(html)
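            # the fixed tag indices and character offsets below match the incident page's fixed layout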

            weapons = [weapon.strip() for weapon in dom_incident.by_tag('p')[16].content[27:].split('<br />')]
            weapons = ", ".join(weapons)[:-2]
            latitude = dom_incident.by_tag('p')[2].content[33:].strip()
            longitude = dom_incident.by_tag('p')[3].content[34:].strip()

            description = incident.by_tag('div')[0].content[1:].strip()
            date = incident.by_tag('td')[2].content[1:].strip()
            location = incident.by_tag('td')[3].content[1:].strip()
            violation = incident.by_tag('td')[4].content[1:].strip()
            incident_list.append([link.encode('utf-8'), location.encode('utf-8'),
                                  latitude.encode('utf-8'), longitude.encode('utf-8'),
                                  date.encode('utf-8'), violation.encode('utf-8'),
                                  weapons.encode('utf-8'), description.encode('utf-8')])

        i += 1

    return incident_list
Code example #2
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    dom = DOM(URL(TARGET_URL).download())
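    # note: this re-downloads the page from TARGET_URL and shadows the dom argument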

    # global list for storing all series information
    series_list = []

    # collect all info of the series, one series at a time
    for l in range(NUMBER_OF_SERIES):

        # temporary variables to make strings
        genre = ''
        credit = ''

        # get rank for each series
        rank = dom.by_tag("tr.detailed")[l].by_tag("span.value")[0].content

        # get runtime for each series
        time = dom.by_tag("span.runtime")[l]
        time = plaintext(time.content)[:-5]
      
        # get all genres for each series
        for m in dom.by_tag("span.genre")[l].by_tag("a"):
            genre += m.content + ", "
        genre = genre[:-2].encode('ascii', 'ignore').decode('ascii')

        # get all actors for each series
        for m in dom.by_tag("span.credit")[l].by_tag("a"):
            credit += m.content + ", "
        credit = credit[:-2].encode('ascii', 'ignore').decode('ascii')

        # get title for each series
        title = dom.by_tag("tr.detailed")[l].by_tag("a")[1].content

        # store info for each series
        series = [title, rank, genre, credit, time]
        series_list.append(series)

    return series_list
Code example #3
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    for e in dom.by_tag("td.titleColumn"):
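        # each td.titleColumn cell holds one movie; prepend the domain to its relative link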
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href", "")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #4
def scrape_starrtest(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	
	#sciend_num = dom.by_class("rm")[4].content
	scicst_num = dom.by_class("rm")[3].content
	math_num = dom.by_class("rm")[2].content
	hist_num = dom.by_class("rm")[1].content
	ela_num = dom.by_class("rm")[0].content
	
	#sciend_percent = dom.by_class("rs")[4].content[:5]
	scicst_percent = dom.by_class("rs")[3].content[:5]
	math_percent = dom.by_class("rs")[2].content[:5]
	hist_percent = dom.by_class("rs")[1].content[:5]
	ela_percent = dom.by_class("rs")[0].content[:5]
	
	county = dom.by_tag("h2")[0].content
	
	
	# write all the collected data to a new row of the output file
	writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])
Code example #5
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #6
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    # Fetch the html from the url and turn it into a DOM
    html = url.download()
    dom = DOM(html)
    # Every url starts with this root; it is needed for the absolute path
    root = 'http://www.imdb.com'

    # The url of each movie sits in a td tag with class titleColumn
    for movie in dom.by_class("titleColumn"):
        # Make a DOM of the content between the td tags so it can be searched
        movieinfo = DOM(movie.content)
        # The relative path of each movie is the 'href' value of the first 'a' tag
        # Concatenate the root and the relative path to get the absolute path and append to movie_urls
        movie_urls.append(root + movieinfo.by_tag("a")[0].attrs.get("href",""))
        
                    
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #7
File: scrape.py  Project: debovis/python-analysis
	def getReviews(self):
		params = {
			'id' : "comments",
			'oid' : 0,
			'showAll' : 'yes'
		}
		reviews = []

		i=0
		for rs in self.conn.resturants.find():
			reviews = []
			if not rs.get('reviews'):
				oid = str(rs['url']).split('=')[1]
				params['oid'] = oid
				req = DOM(URL(self.xmlUrl, query=params).download())
				for item in req.by_tag('item'):
					if item.by_tag('description'):
						content = plaintext(item.by_tag('description')[0].content)
						reviews.append(self.parseReview(content))
				
				# print reviews[0:3]
				rs['reviews'] = reviews
				self.conn.resturants.save(rs)
				print 'saved reviews for', rs['name']	
			else:
				print 'already have reviews for', rs['name']			
Code example #8
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    dom = DOM(URL(TOP_250_URL).download()) #set domain
    for td in dom.by_tag("td.titleColumn")[:250]: #loop over movies
        for a in td.by_tag("a"):
            a = str(a)
            a = a.split('"')
            link = "http://www.imdb.com" + a[1]
            movie_urls.append(link)
    print movie_urls

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #9
def scrape_beer_info_urls(url):
    '''
    Scrape the top 30 beer discounts from Yenom.com
    '''
    # Download the HTML file
    html = url.download()
    # Parse the HTML file into a DOM representation
    dom = DOM(html)
    table = dom.by_tag("table.hikashop_products_table adminlist table table-striped table-hover")[0]
    
    i = 0
    info_urls = []
    # Loop through all beer discounts
    for listItem in table.by_tag("tr")[1:]:
        print 
        print i
        i += 1
        print
        # Get URL
        links = listItem.by_tag("a")
        # Some of the rows in the table are separators between supermarkets so they do not have a link
        if len(links) > 0:
            #print Links[0].content.encode("utf-8")
            print HOME_URL + links[0].attrs["href"]
            info_urls.append(HOME_URL + links[0].attrs["href"])

    # return the list of URLs for each info page
    return info_urls
Code example #11
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    dom = DOM(url.download(cached=True))
    result = "http://imdb.com"

    for films in dom.by_tag("tbody.lister-list"):
        for urls in films.by_tag("td.titleColumn"):
            for url in urls.by_tag("a"):
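                # str() renders the whole <a> element; splitting on '"' isolates the href value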
                content = str(url).split('"')
                #print content[1]
                result += str(content[1])
                movie_urls.append(result)
                result = "http://imdb.com"
    return movie_urls               
Code example #12
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs

    movie_urls = []
    html = url.download(cached=True)
    dom = DOM(html)

    for a in dom.by_tag("tbody.lister-list"):
        for b in a.by_tag("td.titleColumn"):
            for c in b.by_tag("a"):
                link = c.attrs.get("href","")
                link = abs(link, base=url.redirect or url.string)
                movie_urls.append(link)

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #13
def obtain_data(url):
	'''
	Scrape the Wikipedia page.

	Args:
		url: pattern.web.URL instance pointing to the Wikipedia page

	Returns:
		A list of lists, where each sublist represents a data point. Each
		sublist contains two elements: a string with the name of the country,
		and a string with the size of the population of that country. 
	'''

	# Create a DOM of the URL.
	html = url.download(cached=True)
	dom = DOM(html)

	data_points = []

	for countries_table in dom.by_tag("table.wikitable sortable"):
		for table_row in countries_table.by_tag("tr")[1:]:	# The first row is the header, so start at index 1.
			table_row_content = []
			# Obtain the content of the row.
			for table_row_cell in table_row.by_tag("td"):
				table_row_cell_content = unicode(plaintext(table_row_cell.content))
				table_row_content.append(table_row_cell_content)
			# Obtain the country name and the population size.
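			# (the splits strip footnote markers and parenthetical notes from the
			# name; the join drops the thousands separators from the population)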
			country = table_row_content[1].split("[")[0].split(" (")[0]
			population = "".join(table_row_content[2].split(","))
			data_point = [country, population]
			data_points.append(data_point)

	return data_points
Code example #14
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #15
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    dom = DOM(url.download())
    from pattern.web import abs
    url = URL("http://imdb.com")
    for x in dom.by_tag("td.titleColumn"):
        x = x.by_tag("a")[0]
        x = x.attrs.get("href","")
        x = abs(x, base=url.redirect or url.string)
        movie_urls.append(x)
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.



    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #16
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # Grab web page
    movie_html = URL(url).download(cached=True)

    # Extract relevant information for each movie
    movie_dom = DOM(movie_html)

    for a in movie_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            path = str(b).split('"')[1]
            movie_urls.append("http://www.imdb.com" + path)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #17
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = 'http://www.imdb.com/chart/top'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []

    '''
    Searches in the HTML of the top 250 page of IMDB for the urls of the individual pages per film.
    Uses CSS selectors to find the right urls and subsequently places them in a list
    '''

    for e in dom.by_tag("td.titleColumn"): 
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #18
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    movie_urls = []

    index_html = URL(url).download(cached=True)
    index_dom = DOM(index_html)

    # Get all information from IMDB
    for i in index_dom.by_tag("td.titleColumn")[:250]:
        # Get title and append in tvserieslist
        for j in i.by_tag("a")[:1]:
            url = j.attributes["href"]
            #movie_urls.append(str(title[0]))
            movie_urls.append("http://www.imdb.com" + url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #19
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    #dom = DOM(url)
    dom = DOM(URL(TOP_250_URL).download())
    #print plaintext(dom.by_tag("td.titleColumn")[0].content)

    # extract links to the movie pages of the movies in the imdb top 250
    for i in range(250):

        # extract link to movie page for each movie
        for data in dom.by_tag("td.titleColumn")[i].by_tag("a"):
            data = str(data)
            relative_path = data.split('"')[1]
            link = 'http://www.imdb.com' + relative_path
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #20
    def research_on(self, what, where):

        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0")
        dom = DOM(url.download(cached=True))

        number_of_results = 0
        for a in dom.by_tag("div.main-title pj-on-autoload "):
            for e in a.by_tag("span.denombrement"):
                number_of_results = int(
                    self.decode_if_unicode(plaintext(e.content))[:3])

        number_of_page_results = number_of_results / 20
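        # integer division floors in Python 2; the check below adds a page for any remainder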
        if (number_of_results % 20 > 0):
            number_of_page_results += 1

        self.exctract_values(dom, self.myInfo)

        for i in range(2, number_of_page_results + 1):
            url = URL(
                "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
                what + "&ou=" + where + "&proximite=0+"
                "&page=" + str(i))
            dom = DOM(url.download(cached=True))
            self.exctract_values(dom, self.myInfo)

        self.myInfo.sort_and_merge()
Code example #21
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    html = url.download()
    dom = DOM(html)

    # Search for the list of movies
    movie_list = dom.by_tag('tbody.lister-list')[0]

    # Get the title column for each movie for the url
    for movie in movie_list.by_tag('td.titleColumn'):
        movie_urls.append('http://www.imdb.com' + movie('a')[0].attrs['href'])


    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #22
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #23
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Create a DOM of the URL.
    html = url.download(cached=True)
    dom = DOM(html)

    for movie_table in dom.by_tag("table.chart full-width"):
        for movie_table_row in movie_table.by_tag("tr")[1:251]: # The first row is redundant, so start from index 1.
            for movie_table_row_cell in movie_table_row.by_tag("td.titleColumn"):
                for a in movie_table_row_cell.by_tag("a"):
                    # Obtain the path of the URL to the movie's page, create an absolute URL, and append it to the list 'movie_urls'. 
                    movie_url_path = a.attrs["href"]
                    absolute_movie_url = "".join(["http://www.imdb.com", movie_url_path])
                    movie_urls.append(absolute_movie_url)

    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls
Code example #24
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # download the html page of the url
    index_html = URL(url).download(cached=True)

    # create dom structure of index.html
    index_dom = DOM(index_html)

    # loops over every movie
    for td in index_dom.by_tag("td.titleColumn")[:250]:
        # extracts the url of the movie
        for a in td.by_tag("a")[:1]:
            a = str(a)
            path = a.split('"')[1]
            domain = "http://imdb.com"
            movie_url = domain + path
            movie_urls.append(movie_url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #25
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    # initialize movie_html and movie_dom for the imdb top 250 site
    movie_html = URL(url).download(cached=True)
    movie_dom = DOM(movie_html)
    # search the page for td.titleColumn, which holds the link
    for films in movie_dom.by_tag("td.titleColumn"):
        # find the link inside td.titleColumn
        link = films.by_tag('a')[0]
        # build the absolute path and append it to the list movie_urls
        link = "http://www.imdb.com" + link.attrs.get("href","")
        movie_urls.append(link)


    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #27
def make_json(url):
    json_dict = {}
    # Give the data a title
    json_dict["data"] = "percentage renewable energy"

    # Get the DOM of the table of all countries
    html = url.download()
    dom = DOM(DOM(html).by_class("wikitable")[1].content)

    # Make a list with info on the countries
    countrylist = dom.by_tag("tr")[1:]

    # Empty list to append the data to
    pointslist = []
    for countryinfo in countrylist:
        # Empty list to append the country and its percentage renewable energy to
        infopair = []

        # Take the name of the country and append it to infopair
        infopair.append(DOM(countryinfo.content).by_tag("a")[0].attrs.get("title", "").encode("utf-8"))
        # Take the country's percentage renewable energy and append it to infopair
        infopair.append(DOM(countryinfo.content).by_tag("td")[8].content.encode("utf-8"))

        # Append the list to pointslist to build a nested list
        pointslist.append(infopair)

    # Give the dictionary the key 'points' with the nested list pointslist as its value
    json_dict["points"] = pointslist

    # Dump the dictionary as JSON to the text file json.txt
    json.dump(json_dict, open("json.txt", "wb"))
Code example #28
File: read.py  Project: alex-mcleod/py_msg
class fbMessageDump(MessageDump):
	def __init__(self, dump, p1, p2 = None, **kwargs):
		super(fbMessageDump, self).__init__(dump, p1, **kwargs)
		
	def construct_dump(self):
		f = open(self.dump, "r")
		self.dump = DOM(f.read())
		f.close()
		
	def construct_threads(self):
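		# each div.thread element holds one conversation between two people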
		for i in self.dump.by_tag("div.thread"):
			cur_thread = msg_classes.Thread()
			cur_thread.p1 = self.p1
			thread_exists = False
			if plaintext(i.by_tag("span.profile fn")[0].content) == self.p1: 
				cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[1].content)
			else:
				cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[0].content)
			# TODO if p1 and p2 have the same name, error!
			# assert cur_thread.p1 != cur_thread.p2 
			for e in i.by_tag("div.message"):
				cur_thread.add_message(
						plaintext(e.by_tag("div.from")[0].content).encode("utf-8"), 
						e.by_tag("abbr.time published")[0].attributes['title'].encode("utf-8"),
						plaintext(e.by_tag("div.msgbody")[0].content).encode("utf-8")
						)
			cur_thread.construct_conversations() 
			for t in self.threads:
				if t.p2 == cur_thread.p2:
					thread_exists = True 
					t.combine(cur_thread)

			if not thread_exists:
				self.threads.append(cur_thread) 
Code example #29
File: tvscraper.py  Project: jordi1992/DataProcessing
def extract_tvseries(dom):

    url = URL(TARGET_URL)
    dom = DOM(url.download(cached=True))
    #print dom.body.content
    x = 0
    csv_row = []
    for series in dom.by_tag('td.title'):    
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content.encode('ascii', 'ignore') for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content.encode('ascii', 'ignore') for a in actors]
        x = x + 1
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except:
            runtime = "Unknown"
        #print x, title, ranking, genres, actors, runtime

        csv_titles = title
        csv_ranking = ranking
        csv_genres = genres
        csv_actors = actors
        csv_runtime = runtime
        row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime]
        csv_row.append(row)

    return csv_row
Code example #30
def scrape(url):
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
            url = "http://www.top40.nl/top40/2015/week-46"
            week = url.split("/")
            week = week[-1]
            url = URL("http://www.top40.nl/top40/2015/week-46")
            dom = DOM(url.download(cached = True))
            # gives the week
            i = 1
            # select the top 40 list

            for l in dom.by_tag("ol.top40"):
                # select each track
                print "lijst top 40"
                for e in l.by_tag("div.clearfix"):
                    muziekGegevens = ""
                    # position in the top 40
                    muziekGegevens += str(i) + ","
                    print i , 'positie'
                    i += 1 # careful with resetting
                    # select the artist
                    for artiest in e.by_class("credit"):
                        muziekGegevens += artiest.content + ","
                    # position
                    for inner in e.by_tag("strong")[1:2]:
                        print inner.content , "1:2"
                        muziekGegevens += inner.content + ","
                    # highest position reached
                    for inner in e.by_tag("strong")[2:3]:
                        print inner.content , "2:3"
                        muziekGegevens += inner.content + ","
                    # number of points
                    for inner in e.by_tag("strong")[3:4]:
                        print inner.content , "3:4"
                        muziekGegevens += inner.content + ","
                    # year of the track
                    for inner in e.by_tag("strong")[4:5]:
                        print inner.content.strip() , "4:5"
                        muziekGegevens += inner.content.strip()
                    h = HTMLParser.HTMLParser()
                    muziekGegevens = h.unescape(muziekGegevens)

                    if not whatisthis(muziekGegevens):
                        muziekGegevens = unicode(muziekGegevens, "utf-8")
                        print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                        f.write(muziekGegevens + "\n")
                    else:
                        f.write(muziekGegevens + "\n")


    #                     1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    #   File "testhtmlscraper.py", line 58, in <module>
    #     f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
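    # io.open with encoding="utf8" expects unicode strings, so a plain str
    # must be decoded first; that is what the unicode(muziekGegevens, "utf-8")
    # call in the if-branch above does.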
Code example #32
def extract_tvseries(dom):
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached = True))
    # create two arrays to make a list at the end to write off
    infoserie = []
    infoSerieList = []
    a = ''
    for e in dom.by_tag("tr.detailed")[:50]: # Top 50 imdb entries.
        # get title
        for a in e.by_tag("a")[:1]: # First <a class="title"> in entry.
            infoserie = []
            s = a.attrs["title"]
            infoserie += [s.split('(')[0].strip()]
        # get rating
        for rating in e.by_tag("div.rating-list")[:1]:
            rating = rating.attrs["title"]
            infoserie +=[rating[17:20]]
        # get genre
        for genre in e.by_tag("span.genre")[:1]:
            for m in genre.by_tag("a"):
                infoserie += [m.content]
        # get actors
        for actors in e.by_tag("span.credit"):
            for actors_sub in actors.by_tag("a"):

                infoserie +=[actors_sub.content]
        #get time
        for time in e.by_tag("span.runtime")[:1]:

            infoserie += [time.content[:3]]
        infotopserie =[]
        # encode to get rid of unicode error
        for encoding in infoserie:

            infotopserie += [encoding.encode('utf-8')]
        # add row to list
        infoSerieList.append(infotopserie)

    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.

    return infoSerieList  # replace this line as well as appropriate
Code example #33
def scrape_movie_page(dom):
    '''
    Scrape the IMDB page for a single movie

    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.

    Returns:
        A list of strings representing the following (in order): title, year,
        duration, genre(s) (semicolon separated if several), director(s) 
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings.
    '''
    # YOUR SCRAPING CODE GOES HERE:
    #print dom.body.content
    csv_row = []
    for series in dom.by_tag('td.titleColumn'):    
        title = series.by_tag('a')[0]
        ranking = series.by_tag('td.ratingColumn')[0]
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content for a in actors]
        try:
            runtime = series.by_tag('span.runtime')[0]
        except:
            runtime = "Unknown"
       

        csv_titles = title
        csv_ranking = ranking
        csv_genres = genres
        csv_actors = actors
        csv_runtime = runtime
        row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime]
        csv_row.append(row)

    return csv_row

    print title
    '''
    No idea how this works; Python is not for me. I barely got last week's
    assignment finished. To me this looks exactly the same as last week, but
    it does not work. So this is an incomplete assignment. I hope it does not
    mean I can no longer pass the module... Better luck next time; to be
    honest, JS suits me a bit better than Python.
    '''
    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return title, duration, genres, directors, writers, actors, rating, \
        n_ratings
Code example #34
File: tscraper.py  Project: BartQuaink/Dataprocessing
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.
    showlist = {}
    dom = DOM(TARGET_URL.download(cached=True))
    # Get top 50 results
    for e in dom.by_tag("td.title"):
        # get title
        for a in e.by_tag("a")[:1]:
            title = plaintext(a.content)
            print title
            print

        # get ranking
        for td in e.by_tag("span.value")[:1]:
            ranking = plaintext(td.content)
            print ranking
            print

        # get genre
        for span in e.by_tag("span.genre")[:1]:
            genre = plaintext(span.content)
            print genre
            print

        # get actors/actresses
        for span in e.by_tag("span.credit")[:1]:
            actors = plaintext(span.content)
            print actors
            print

        # get runtime (number)
        for span in e.by_tag("span.runtime")[:1]:
            runtime = plaintext(span.content)
            print runtime
            print

        # collect all the retrieved info for this entry (note: a set, not a dictionary)
        showlist[e] = {title, ranking, genre, actors, runtime}
Code example #35
def scrape_gradrate(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/cohortrates/CRByGender.aspx?cds=01000000000000&TheYear=2011-12&Agg=O&Topic=Dropouts&RC=County&SubGroup=Ethnic/Racial'
	url = 'http://dq.cde.ca.gov/dataquest/cohortrates/CRByGender.aspx?cds='+county_num+'000000000000&TheYear=2011-12&Agg=O&Topic=Dropouts&RC=County&SubGroup=Ethnic/Racial'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	
	grad_percent = dom.by_tag("tr")[-1].by_tag("td")[4].content
	total_dropouts = dom.by_tag("tr")[-1].by_tag("td")[5].by_tag("span")[0].content
	total_grads = dom.by_tag("tr")[-1].by_tag("td")[3].by_tag("span")[0].content
	total_num = dom.by_tag("tr")[-1].by_tag("td")[2].by_tag("span")[0].content
	
	county = dom.by_tag("h2")[0].by_tag("span")[0].content[26:]

	

	# write all the collected data to a new row of the output file
	writer.writerow([county, total_num,total_grads, total_dropouts, grad_percent])
Code example #36
def scrape(url, f):

    week = url.split("/")
    week = week[-1]
    url = URL(url)
    dom = DOM(url.download(cached=True))
    # gives the week
    i = 1
    # select the top 40 list

    for l in dom.by_tag("ol.top40"):
        # select each track
        print "lijst top 40"
        for e in l.by_tag("div.clearfix")[0:40]:
            muziekGegevens = ""
            # position in the top 40
            muziekGegevens += str(i) + ","
            print i, 'positie'
            i += 1  # careful with resetting
            # select the artist
            for artiest in e.by_class(
                    "credit"):  # watch out: not too many elements!
                muziekGegevens += artiest.content + ","
            # position
            for inner in e.by_tag("strong")[1:2]:
                print inner.content, "1:2"
                muziekGegevens += inner.content + ","
            # highest position reached
            for inner in e.by_tag("strong")[2:3]:
                print inner.content, "2:3"
                muziekGegevens += inner.content + ","
            # number of points
            for inner in e.by_tag("strong")[3:4]:
                print inner.content, "3:4"
                muziekGegevens += inner.content + ","
            # year of the track
            for inner in e.by_tag("strong")[4:5]:
                print inner.content.strip(), "4:5"
                muziekGegevens += inner.content.strip()
            h = HTMLParser.HTMLParser()
            muziekGegevens = h.unescape(muziekGegevens)

            if not whatisthis(muziekGegevens):
                muziekGegevens = unicode(muziekGegevens, "utf-8")
                print 'lajdsflkejwflejwfoiewjfwjfldskjfoewijf'
                f.write(muziekGegevens + "\n")
            else:
                f.write(muziekGegevens + "\n")
Code example #38
def get_countries(exceptions):
    """ Get the population density and ISO CODE 3166 for every country with manually added exceptions. """

    url = URL("https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_population_density")
    html = url.download()
    dom = DOM(html)
    country_table = dom.by_tag('table.wikitable')
    countries = {}

    # Get every tablerow that got a country in it.
    for country in country_table[0]('tr')[4:-1]:

        # Some come with extra's added, which makes them put it in a <i> tag.
        try:
            link = country('i')[0]('a')[0].attrs['href']
            name = country('i')[0]('a')[0].content.encode('ascii', 'ignore')
        except:

            if len(country('span')) == 0:
                link = country('td')[0]('a')[0].attrs['href']
                name = country('td')[0]('a')[0].content.encode('ascii', 'ignore')
            else:
                link = country('td')[1]('a')[0].attrs['href']
                name = country('td')[1]('a')[0].content.encode('ascii', 'ignore')
        density = country('td')[5].content.replace(',', '')

        # Not every wikipedia page is the same or got a ISO CODE at all.
        try:
            iso_code = retrieve_iso("https://en.wikipedia.org", link)
        except:
            iso_code = 'ERROR'

        # Add in manually added ISO CODES.
        if iso_code == 'ERROR' and name in exceptions:
            iso_code = exceptions[name]

        if iso_code == 'CY':  # SVG doesn't contain northern cyprus unlike wikipedia, so I add it manually.
            density = 125

        print iso_code, [name], density
        if iso_code != 'ERROR':
            countries[iso_code] = float(density)

    with open('data.txt', 'w') as outfile:
        json.dump(countries, outfile, indent=4)

    return countries
Code example #39
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # retrieve TOP 250 DOM
    dom = DOM(url.download(cached=True))

    # add all absolute movie URLs to the list
    for movie in dom.by_tag("td.titleColumn"):
        movie_urls.append('http://www.imdb.com' + movie.by_tag('a')[0].href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #40
File: imdb-crawler.py  Project: Croow/DataProcessing
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    dom = DOM(url.download(cached=True))
    movie_urls = []

    # find the url of the separate movies and put them in a list
    for e in dom.by_tag("td.titleColumn")[:250]:
        for a in e.by_tag("a"):
            site = "http://www.imdb.com"
            site += a.attrs["href"]
            movie_urls.append(URL(site))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #41
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    html = url.download(cached=True)
    dom = DOM(html)
    # Finds the movie URLs by selecting the first "table.chart" and then the elements "td.titleColumn" with their first links.
    e = dom.by_tag("table.chart")[0]
    for c in e.by_tag("td.titleColumn"):
        a = c.by_tag("a")[0]
        movie_urls.append(URL("http://" + url.domain + a.attrs.get("href", "")))
            
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code example #42
from pattern.web import URL, DOM, extension, MIMETYPE_IMAGE
from pattern.web import Element, download
import urllib.request
import datetime

#libraries to check urllib (legacy vs not), pattern, requests
url = URL("http://www.dot.ca.gov/dist1/d1tmc/allcams.php")
dom = DOM(url.download(cached = True))
i = 0
try:
    for e in dom.by_tag('img'):
        if (extension(e.attrs['src']) == '.jpg'):
            print(e.attrs['src'])
            urllib.request.urlretrieve(e.attrs['src'], "data/test/urllib{0}.jpg".format(i))
            #image = download(e.attrs['src'], unicode= False, timeout= 5)
            #f = open("data/test/pattern{0}.jpg".format(i), 'wb')
            #f.write(image)
            i += 1
except:
    print("error")
        
"""
image = "http://www1.dot.ca.gov/cwwp2/data/d1/cctv/image/us101northofcushingcreeklookingsouth/us101northofcushingcreeklookingsouth.jpg"
url = URL(image)
print (url.mimetype in MIMETYPE_IMAGE)
urllib.request.urlretrieve(image, 'data/test2.jpg')
"""

Code example #43
ff = webdriver.Firefox()
ff.implicitly_wait(30)

## The datasets in the portal are spread over several pages, this loop
## instructs the scraper to go through each page.
## The html for the pages just updates the page number, so the URL() code
## keeps track of p and uses it to specify the page number
for p in range(1, 12):
    url = URL(
        'https://data.cityofchicago.org/browse?limitTo=datasets&sortBy=oldest&utf8=%E2%9C%93&page='
        + str(p))
    dom = DOM(url.download(cached=True))

    ## The list of datasets is in a table, so this loop cycles through the row
    ## elements to scrape each dataset.
    for i in dom.by_tag('tbody')[0:]:
        for g in i.by_tag('tr')[0:]:
            for h in g.by_tag('a.name')[0:]:
                name = h.content
                name = plaintext(name)
                name = name.encode('ascii', 'ignore')
            for j in g.by_class('category infoItem')[0:]:
                category = j.content
                category = plaintext(category)
                category = category.encode('ascii', 'ignore')
            if (g.by_class('tags infoItem')):
                tag = g.by_class('tags infoItem')[0].content
                tag = tag.encode('ascii', 'ignore')
            else:
                tag = " "
            for k in g.by_class('visits')[0:]:
Code example #44
File: 12-dom.py  Project: Abhishek-1/temp
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="_1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah">
#     ...
#     <span class="y8HYJ-y_lTUHkQIc1mdCq yj3st6-1 kYJFRo">
#     ...
#         <a class="SQnoC3ObvgnGjWt90zD9Z " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print(dom.body.content)
for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah")[:5]:  # Top 5 reddit entries.
    for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]:
        print(plaintext(a.content))
        print(a.attrs["href"])
        print("")

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs

url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
Code example #45
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print dom.body.content
for e in dom.by_tag("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.by_tag("a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attrs["href"]
        print

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
Code example #46
import csv

from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

#For the 2013 datasheet, use this code:

url = URL(
    'http://www.satp.org/satporgtp/countries/pakistan/database/majorincidents.htm'
)
dom = DOM(url.download(cached=True))

myarray = []
tab = dom.by_tag('table')
for i in dom.by_tag('td')[11:]:
    g = i.content
    h = plaintext(g)
    myarray.append(h)


def chunks(l, n):
    return [l[i:i + n] for i in range(0, len(l), n)]
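# e.g. chunks(range(7), 5) -> [[0, 1, 2, 3, 4], [5, 6]]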


yes = chunks(myarray, 5)

output = open("satpincidents2013.csv", "wb")

writer = csv.writer(output)

for i in yes[0:]:
Code example #47
stop_str = open('api_data_store.xml', 'r').read()
dom = DOM(stop_str)

# Write the beginning
writer = Writer("db.json")
writer.beginObj()

# Write the stops
print "Writing stops..."
writer.writeKey("stops")
writer.beginArray()

namesToIDs = dict()

stops = dom.by_tag("stop")
for stop in stops:
    # Get the stop id, name, title, and the stop's routes
    stop_id = stop.attributes["s_id"]
    name = stop.by_tag("title")[0].content
    title = "i-Lab" if name == "HiLab-HBS" else name
    routes = stop.by_tag("stop_routes")[0].content.split(",")

    # Map names to stop ids
    namesToIDs[name] = stop_id

    # Write out the stop to the db
    writer.beginObj()
    writer.writeKeyVal("id", stop_id, True)
    writer.writeKeyVal("name", name, True)
    writer.writeKeyVal("title", title, True)
Code example #48
players = ["3975/stephen-curry", "9/ray-allen", "552/reggie-miller", "841/jason-terry", "662/paul-pierce", "429/jason-kidd", "136/vince-carter", "165/jamal-crawford", "63/chauncey-billups", "2011/kyle-korver", "469/rashard-lewis", "813/peja-stojakovic", "1007/joe-johnson", "110/kobe-bryant"]

htmllink = "http://espn.go.com/nba/player/stats/_/id/"

output_file = open('new3pointers.json', 'w')

# get data for all players
for player in players:
    TARGET_URL = URL(htmllink + player)
    dom = DOM(TARGET_URL.download(cached=True))
    dataofyear = list()
    tempdata= dict()

    #  loop over the html table
    for e in dom.by_tag("div.mod-container mod-table mod-player-stats"):
        for a in e.by_tag("div.mod-content")[1:2]:
            for tablehead in a.by_class("tablehead"):
                year = -1
                for oddrow in tablehead.by_class("oddrow"):
                    madeshots = oddrow[4].content[:3]
                    madeshots = int(madeshots.replace("-", ""))

                    year += 2

                    percentage = float(oddrow[5].content)

                    tempdata["year"] = year
                    tempdata["tot3fg"] = madeshots
                    tempdata["percentage"] = percentage
Code example #49
        for row in rows:
            self.writerow(row)


# Creating the csv output file for writing into as well as defining the writer
output = open("data_output_Trulia_HP.csv", "wb")
writer = UnicodeWriter(output)
writer.writerow(
    ["State", "County", "Average Listing Price", "Median Sales Price"])

# get the DOM object to scrape for links
url = URL("http://www.trulia.com/home_prices/")
dom = DOM(url.download(cached=True))

# get the rows where all info is contained
all_data_rows = dom.by_tag("tr")

# define the variable to store all the trulia data
all_trulia_data = []

# loop through each row
for ind_data_row in all_data_rows:
    if (ind_data_row.attributes.get("style", "")
            == 'background-color: #FFFFFF;' or ind_data_row.attributes.get(
                "style", "") == 'background-color: #EDEFF2;'):

        all_columns = ind_data_row.by_tag("td")

        state = plaintext(all_columns[0].by_tag("a")[0].content)
        avg_listing_price = plaintext(all_columns[1].content)
        median_sales_price = plaintext(all_columns[2].content)