Code Example #1
File: tvscraper.py Project: jordi1992/DataProcessing
def extract_tvseries(dom):

    # Work on the DOM that was passed in; re-downloading TARGET_URL here
    # would ignore the argument.
    #print dom.body.content
    csv_row = []
    for series in dom.by_tag('td.title'):    
        title = series.by_tag('a')[0].content.encode('ascii', 'ignore')
        ranking = series.by_tag('span.value')[0].content.encode('ascii', 'ignore')
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content.encode('ascii', 'ignore') for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content.encode('ascii', 'ignore') for a in actors]
        try:
            runtime = series.by_tag('span.runtime')[0].content.encode('ascii', 'ignore')
        except IndexError:
            # not every series lists a runtime
            runtime = "Unknown"
        #print title, ranking, genres, actors, runtime

        csv_row.append([title, ranking, genres, actors, runtime])

    return csv_row
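
For reference, a minimal driver for this extractor; the TARGET_URL value and the CSV handling below are assumptions, not part of the example:

import csv
from pattern.web import URL, DOM

TARGET_URL = "http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&title_type=tv_series"  # assumed constant

dom = DOM(URL(TARGET_URL).download(cached=True))
with open('tvseries.csv', 'wb') as f:  # binary mode for the Python 2 csv module
    writer = csv.writer(f)
    writer.writerow(['Title', 'Ranking', 'Genre', 'Actors', 'Runtime'])
    for title, ranking, genres, actors, runtime in extract_tvseries(dom):
        # genres and actors are lists; flatten them to comma-separated strings
        writer.writerow([title, ranking, ', '.join(genres), ', '.join(actors), runtime])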
Code Example #2
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href", "")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
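
A minimal sketch of how this function might be called; the constant is an assumption (the docstring expects a pattern.web.URL instance, but this variant wraps its argument itself, so a plain string also works):

from pattern.web import URL, DOM

TOP_250_URL = 'http://www.imdb.com/chart/top'  # assumed constant
urls = scrape_top_250(TOP_250_URL)
print len(urls)  # should be 250
print urls[0]    # an absolute link such as http://www.imdb.com/title/tt0111161/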
Code Example #3
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #4
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    movie_urls = []

    index_html = URL(url).download(cached=True)
    index_dom = DOM(index_html)

    # Get all information from IMDB
    for i in index_dom.by_tag("td.titleColumn")[:250]:
        # Take the first link in the title column; it points to the movie's page
        for j in i.by_tag("a")[:1]:
            url = j.attributes["href"]
            #movie_urls.append(str(title[0]))
            movie_urls.append("http://www.imdb.com" + url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #5
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    # Download the html of the url and turn it into a DOM
    html = url.download()
    dom = DOM(html)
    # Every url starts with this root; it is needed for the absolute path
    root = 'http://www.imdb.com'

    # The url of each movie sits in a td tag with class titleColumn
    for movie in dom.by_class("titleColumn"):
        # Build a DOM of the content between the td tags so we can search inside it
        movieinfo = DOM(movie.content)
        # The relative path of each movie is the 'href' value of the first 'a' tag;
        # concatenate the root and the relative path to get the absolute path
        movie_urls.append(root + movieinfo.by_tag("a")[0].attrs.get("href",""))
        
                    
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #6
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    for a in top_250_dom.by_tag("td.titleColumn"):  # all 250 title cells, not just the first
        for b in a.by_tag("a"):
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base+link_ext
            movie_urls.append(link)
             

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #7
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    html = url.download()
    dom = DOM(html)

    # Search for the list of movies
    movie_list = dom.by_tag('tbody.lister-list')[0]

    # Get the title column for each movie for the url
    for movie in movie_list.by_tag('td.titleColumn'):
        movie_urls.append('http://www.imdb.com' + movie('a')[0].attrs['href'])


    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #8
def getRandomHistoryDOM(language):
    url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
    #Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        #Grab the name of the wikipedia article from the url
        urlComponents = redirectUrl.split('/')
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    #Get the history section of the article
    redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
    print "Current article is: " +str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        # Recurse if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
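
A hedged usage sketch, assuming the module-level pattern.web imports that this function relies on:

historyList, article = getRandomHistoryDOM("en")
print "Got %d history entries for article %s" % (len(historyList), article)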
Code Example #9
File: scrape.py Project: debovis/python-analysis
	def getReviews(self):
		params = {
			'id' : "comments",
			'oid' : 0,
			'showAll' : 'yes'
		}
		for rs in self.conn.resturants.find():
			reviews = []
			if not rs.get('reviews'):
				oid = str(rs['url']).split('=')[1]
				params['oid'] = oid
				req = DOM(URL(self.xmlUrl, query=params).download())
				for item in req.by_tag('item'):
					if item.by_tag('description'):
						content = plaintext(item.by_tag('description')[0].content)
						reviews.append(self.parseReview(content))
				
				# print reviews[0:3]
				rs['reviews'] = reviews
				self.conn.resturants.save(rs)
				print 'saved reviews for', rs['name']	
			else:
				print 'already have reviews for', rs['name']			
Code Example #10
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    dom = DOM(URL(TOP_250_URL).download())  # build a DOM of the top 250 page
    for td in dom.by_tag("td.titleColumn")[:250]: #loop over movies
        for a in td.by_tag("a"):
            a = str(a)
            a = a.split('"')
            link = "http://www.imdb.com" + a[1]
            movie_urls.append(link)
    print movie_urls

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #11
def scrape_beer_info_urls(url):
    '''
    Scrape the top 30 beer discounts from Yenom.com
    '''
    # Download the HTML file
    html = url.download()
    # Parse the HTML file into a DOM representation
    dom = DOM(html)
    table = dom.by_tag("table.hikashop_products_table adminlist table table-striped table-hover")[0]
    
    i = 0
    info_urls = []
    # Loop through all beer discounts
    for listItem in table.by_tag("tr")[1:]:
        print 
        print i
        i += 1
        print
        # Get URL
        links = listItem.by_tag("a")
        # Some of the rows in the table are separators between supermarkets so they do not have a link
        if len(links) > 0:
            #print Links[0].content.encode("utf-8")
            print HOME_URL + links[0].attrs["href"]
            info_urls.append(HOME_URL + links[0].attrs["href"])

    # return the list of URLs for each info page
    return info_urls

    """
Code Example #12
def get_by_year(year):

    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    
    dictAll = {}
    
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])

        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name),id))
            
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner),winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] =  winnersAndNominees
    return dictAll
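
A hedged driver that collects several years of results; it assumes clean_unicode and the pattern.web imports are present in the module, as the function itself does:

all_years = {}
for year in range(2010, 2015):
    all_years[year] = get_by_year(year)
print all_years[2010].keys()  # the award category names scraped for that year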
Code Example #13
File: scraper.py Project: aeggermont/cs171
def process_page():

    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):

        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
    
        try:

            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            #print match.group(1)
            # titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))

        except Exception, e:
            pass

        try:
            titleCatalog[domIndex].addRank( str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass

        # advance the index, otherwise every title reads the first runtime/rank
        domIndex += 1
Code Example #14
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    print(url)

    url_html = url.download(cached=True)
    url_dom = DOM(url_html)

    movie_urls = []

    for movie in url_dom.by_class("titleColumn"):
        # looks for the element containing the link.
        movie_url = movie.by_tag("a")[0]

        # Gets a dictionary of the elements' attributes.
        movie_url = movie_url.attrs['href']

        # Splits the string at the '?'.
        movie_url = movie_url.split('?')

        # Forms full url and appends to the list of movie urls
        movie_url = "http://www.imdb.com" + movie_url[0]
        movie_urls.append(movie_url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #15
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # Grab web page
    movie_html = URL(url).download(cached=True)

    # Extract relevant information for each movie
    movie_dom = DOM(movie_html)

    for a in movie_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            path = str(b).split('"')[1]
            movie_urls.append("http://www.imdb.com" + path)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #16
def obtain_data(url):
	'''
	Scrape the Wikipedia page.

	Args:
		url: pattern.web.URL instance pointing to the Wikipedia page

	Returns:
		A list of lists, where each sublist represents a data point. Each
		sublist contains two elements: a string with the name of the country,
		and a string with the size of the population of that country. 
	'''

	# Create a DOM of the URL.
	html = url.download(cached=True)
	dom = DOM(html)

	data_points = []

	for countries_table in dom.by_tag("table.wikitable sortable"):
		for table_row in countries_table.by_tag("tr")[1:]:	# The first row is the header, so start at index 1.
			table_row_content = []
			# Obtain the content of the row.
			for table_row_cell in table_row.by_tag("td"):
				table_row_cell_content = unicode(plaintext(table_row_cell.content))
				table_row_content.append(table_row_cell_content)
			# Obtain the country name and the population size.
			country = table_row_content[1].split("[")[0].split(" (")[0]
			population = "".join(table_row_content[2].split(","))
			data_point = [country, population]
			data_points.append(data_point)

	return data_points
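
A minimal sketch of a call; the exact Wikipedia page is an assumption, any page whose second column holds the country and whose third holds the population fits what the code above expects:

from pattern.web import URL

url = URL('http://en.wikipedia.org/wiki/List_of_countries_by_population')  # assumed target
for country, population in obtain_data(url)[:5]:
    print country, population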
Code Example #17
    def research_on(self, what, where):

        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0")
        dom = DOM(url.download(cached=True))

        number_of_results = 0  # guard against pages where the block is missing
        for a in dom.by_tag("div.main-title pj-on-autoload "):
            for e in a.by_tag("span.denombrement"):
                number_of_results = int(
                    self.decode_if_unicode(plaintext(e.content))[:3])

        number_of_page_results = number_of_results / 20
        if (number_of_results % 20 > 0):
            number_of_page_results += 1

        self.exctract_values(dom, self.myInfo)

        for i in range(2, number_of_page_results + 1):
            url = URL(
                "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
                what + "&ou=" + where + "&proximite=0+"
                "&page=" + str(i))
            dom = DOM(url.download(cached=True))
            self.exctract_values(dom, self.myInfo)

        self.myInfo.sort_and_merge()
Code Example #18
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href","")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)
        
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #19
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    
    # initialize movie_html and movie_dom from the imdb top 250 site
    movie_html = URL(url).download(cached=True)
    movie_dom = DOM(movie_html)
    # search the page for td.titleColumn cells, which contain the links
    for films in movie_dom.by_tag("td.titleColumn"):
        # find the link inside td.titleColumn
        link = films.by_tag('a')[0]
        # build the absolute path and add it to the list movie_urls
        link = "http://www.imdb.com" + link.attrs.get("href","")
        movie_urls.append(link)


    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #20
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #21
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    #dom = DOM(url)
    dom = DOM(URL(TOP_250_URL).download())
    #print plaintext(dom.by_tag("td.titleColumn")[0].content)

    # extract links to the movie pages of the movies in the imdb top 250
    for i in range(250):

        # extract link to movie page for each movie
        for data in dom.by_tag("td.titleColumn")[i].by_tag("a"):
            data = str(data)
            relative_path = data.split('"')[1]
            link = 'http://www.imdb.com' + relative_path
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #22
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Create a DOM of the URL.
    html = url.download(cached=True)
    dom = DOM(html)

    for movie_table in dom.by_tag("table.chart full-width"):
        for movie_table_row in movie_table.by_tag("tr")[1:251]: # The first row is redundant, so start from index 1.
            for movie_table_row_cell in movie_table_row.by_tag("td.titleColumn"):
                for a in movie_table_row_cell.by_tag("a"):
                    # Obtain the path of the URL to the movie's page, create an absolute URL, and append it to the list 'movie_urls'. 
                    movie_url_path = a.attrs["href"]
                    # href already starts with '/', so don't add another slash
                    absolute_movie_url = "".join(["http://www.imdb.com", movie_url_path])
                    movie_urls.append(absolute_movie_url)

    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls
Code Example #23
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    dom = DOM(url.download(cached=True))
    result = "http://imdb.com"

    for films in dom.by_tag("tbody.lister-list"):
        for urls in films.by_tag("td.titleColumn"):
            for url in urls.by_tag("a"):
                content = str(url).split('"')
                movie_urls.append(result + content[1])

    return movie_urls
Code Example #24
def extract_incidents(dom):

    incident_list = []
    i = 0

    for incident in dom.by_tag('tr'):
        if i > 0:
            link = INCIDENT_URL + incident.by_tag('a')[0].href
            print link

            url = URL(link)
            html = url.download(timeout=100)
            dom_incident = DOM(html)

            weapons = [weapon.strip() for weapon in dom_incident.by_tag('p')[16].content[27:].split('<br />')]
            weapons = ", ".join(weapons)[:-2]
            latitude = dom_incident.by_tag('p')[2].content[33:].strip()
            longitude = dom_incident.by_tag('p')[3].content[34:].strip()

            description = incident.by_tag('div')[0].content[1:].strip()
            date = incident.by_tag('td')[2].content[1:].strip()
            location = incident.by_tag('td')[3].content[1:].strip()
            violation = incident.by_tag('td')[4].content[1:].strip()
            incident_list.append([link.encode('utf-8'), location.encode('utf-8'), latitude.encode('utf-8'), longitude.encode('utf-8'), date.encode('utf-8'), violation.encode('utf-8'), weapons.encode('utf-8'), description.encode('utf-8')])

        i += 1

    return incident_list
Code Example #25
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    dom = DOM(url.download())
    from pattern.web import abs
    url = URL("http://imdb.com")
    for x in dom.by_tag("td.titleColumn"):
        x = x.by_tag("a")[0]
        x = x.attrs.get("href","")
        x = abs(x, base=url.redirect or url.string)
        movie_urls.append(x)  # without this append the list stayed empty
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.



    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #26
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    """

    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = "http://www.imdb.com/chart/top"
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []

    """
    Searches in the HTML of the top 250 page of IMDB for the urls of the individual pages per film.
    Uses CSS selectors to find the right urls and subsequently places them in a list
    """

    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #27
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # download the html page of the url
    index_html = URL(url).download(cached=True)

    # create dom structure of index.html
    index_dom = DOM(index_html)

    # loops over every movie
    for td in index_dom.by_tag("td.titleColumn")[:250]:
        # extracts the url of the movie
        for a in td.by_tag("a")[:1]:
            a = str(a)
            path = a.split('"')[1]
            domain = "http://imdb.com"
            movie_url = domain + path
            movie_urls.append(movie_url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #28
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs

    movie_urls = []
    html = url.download(cached=True)
    dom = DOM(html)

    for a in dom.by_tag("tbody.lister-list"):
        for b in a.by_tag("td.titleColumn"):
            for c in b.by_tag("a"):
                link = c.attrs.get("href","")
                link = abs(link, base=url.redirect or url.string)
                movie_urls.append(link)

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #29
def make_json(url):
    json_dict = {}
    # Give the data a title
    json_dict["data"] = "percentage renewable energy"

    # Grab the DOM of the table of all countries
    html = url.download()
    dom = DOM(DOM(html).by_class("wikitable")[1].content)

    # Make a list with info about the countries
    countrylist = dom.by_tag("tr")[1:]

    # Empty list to append the data to
    pointslist = []
    for countryinfo in countrylist:
        # Empty list to append the country and its percentage renewable energy to
        infopair = []

        # Take the name of the country and append it to infopair
        infopair.append(DOM(countryinfo.content).by_tag("a")[0].attrs.get("title", "").encode("utf-8"))
        # Take the country's percentage renewable energy and append it to infopair
        infopair.append(DOM(countryinfo.content).by_tag("td")[8].content.encode("utf-8"))

        # Append the pair to pointslist to build a nested list
        pointslist.append(infopair)

    # Give the dictionary the key 'points' with the nested list pointslist as its value
    json_dict["points"] = pointslist

    # Dump the dictionary as JSON to the text file json.txt
    json.dump(json_dict, open("json.txt", "wb"))
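
A hedged call for this function; the article below is an assumption, any page whose second 'wikitable' carries the country name in its first link and the percentage in column index 8 matches what the code expects:

import json
from pattern.web import URL

make_json(URL('http://en.wikipedia.org/wiki/List_of_countries_by_renewable_electricity_production'))  # assumed page
print json.load(open('json.txt'))['data']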
Code Example #30
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    # This piece of code is needed to use the dom structure while it is not given as argument.
    TOP_250_URL = 'http://www.imdb.com/chart/top'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []

    '''
    Searches in the HTML of the top 250 page of IMDB for the urls of the individual pages per film.
    Uses CSS selectors to find the right urls and subsequently places them in a list
    '''

    for e in dom.by_tag("td.titleColumn"): 
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            Locallink = main + a.attrs["href"]
            movie_urls.append(Locallink)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #31
def scrape(url):
    with io.open("allMusicOneWeek.csv", "w",encoding = "utf8") as f:
            url = "http://www.top40.nl/top40/2015/week-46"
            week = url.split("/")
            week = week[-1]
            url = URL("http://www.top40.nl/top40/2015/week-46")
            dom = DOM(url.download(cached = True))
            # gives the week
            i = 1
            # select the top 40 list

            for l in dom.by_tag("ol.top40"):
                # select each individual track
                print "lijst top 40"
                for e in l.by_tag("div.clearfix"):
                    muziekGegevens = ""
                    # position in the top 40
                    muziekGegevens += str(i) + ","
                    print i , 'positie'
                    i += 1 # careful with resetting
                    # select the artist
                    for artiest in e.by_class("credit"):
                        muziekGegevens += artiest.content + ","
                    # position
                    for inner in e.by_tag("strong")[1:2]:
                        print inner.content , "1:2"
                        muziekGegevens += inner.content + ","
                    # highest position reached
                    for inner in e.by_tag("strong")[2:3]:
                        print inner.content , "2:3"
                        muziekGegevens += inner.content + ","
                    # number of points
                    for inner in e.by_tag("strong")[3:4]:
                        print inner.content , "3:4"
                        muziekGegevens += inner.content + ","
                    # year of the track
                    for inner in e.by_tag("strong")[4:5]:
                        print inner.content.strip() , "4:5"
                        muziekGegevens += inner.content.strip()
                    h = HTMLParser.HTMLParser()
                    muziekGegevens = h.unescape(muziekGegevens)

                    if not whatisthis(muziekGegevens):
                        muziekGegevens = unicode(muziekGegevens, "utf-8")
                        f.write(muziekGegevens + "\n")
                    else:
                        f.write(muziekGegevens + "\n")


    #                     1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    #   File "testhtmlscraper.py", line 58, in <module>
    #     f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
    # the with-statement closes the file automatically
Code Example #32
def scrape(url):
    with io.open("allMusicOneWeek.csv", "w", encoding="utf8") as f:
        url = "http://www.top40.nl/top40/2015/week-46"
        week = url.split("/")
        week = week[-1]
        url = URL("http://www.top40.nl/top40/2015/week-46")
        dom = DOM(url.download(cached=True))
        # gives the week
        i = 1
        # select the top 40 list

        for l in dom.by_tag("ol.top40"):
            # select each individual track
            print "lijst top 40"
            for e in l.by_tag("div.clearfix"):
                muziekGegevens = ""
                # position in the top 40
                muziekGegevens += str(i) + ","
                print i, 'positie'
                i += 1  # careful with resetting
                # select the artist
                for artiest in e.by_class("credit"):
                    muziekGegevens += artiest.content + ","
                # position
                for inner in e.by_tag("strong")[1:2]:
                    print inner.content, "1:2"
                    muziekGegevens += inner.content + ","
                # highest position reached
                for inner in e.by_tag("strong")[2:3]:
                    print inner.content, "2:3"
                    muziekGegevens += inner.content + ","
                # number of points
                for inner in e.by_tag("strong")[3:4]:
                    print inner.content, "3:4"
                    muziekGegevens += inner.content + ","
                # year of the track
                for inner in e.by_tag("strong")[4:5]:
                    print inner.content.strip(), "4:5"
                    muziekGegevens += inner.content.strip()
                h = HTMLParser.HTMLParser()
                muziekGegevens = h.unescape(muziekGegevens)

                if not whatisthis(muziekGegevens):
                    muziekGegevens = unicode(muziekGegevens, "utf-8")
                    f.write(muziekGegevens + "\n")
                else:
                    f.write(muziekGegevens + "\n")

    #                     1 positie
    # week-45
    # ,1,
    # Traceback (most recent call last):
    #   File "testhtmlscraper.py", line 58, in <module>
    #     f.write(muziekGegevens + "\n")
    # TypeError: must be unicode, not str ???
    # the with-statement closes the file automatically
Code Example #33
File: spiders.py Project: Carlosmr/WhooshSearcher
 def getTitle(self, link):
     html = URL(link).download()
     body = DOM(html).body
     node = body.by_id("main-article-info")
     if node:
         title = node.children[1].content.strip()
     else:
         title = ''
     return title
Code Example #34
File: spiders.py Project: Carlosmr/WhooshSearcher
 def htmlParser(self,link):
     html = URL(link).download()
     body = DOM(html).body
     content = body.by_id("content")
     if content:
         plaincontent = plaintext(content.content, linebreaks=2, indentation = True)
         pretty = unicode(plaincontent.strip())
     else:
         pretty = ''
     return pretty
Code Example #35
def load(year, pagenum, pagerank):
	strnum = str(year)
	url = URL("http://www.imdb.com/search/title?at=0&sort=moviemeter,asc&start="
			   +str(pagenum)+"&title_type=feature&year="+strnum+","+strnum)
	dom = DOM(url.download(timeout=30, cached=True))
	htmlsource = dom.by_id("main").by_class("results")[0].by_class("title")[pagerank].by_tag("a")[0].source
	urlpiece = re.search(r'/title/+[t0-9]+/', htmlsource)
	finalurl = "http://www.imdb.com" + urlpiece.group(0)
	url2 = URL(finalurl)
	return url2
Code Example #36
def extract_tvseries(dom):
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached = True))
    # create two arrays to make a list at the end to write off
    infoserie = []
    infoSerieList = []
    a = ''
    for e in dom.by_tag("tr.detailed")[:50]: # Top 50 imdb entries.
        # get title
        for a in e.by_tag("a")[:1]: # First <a class="title"> in entry.
            infoserie = []
            s = a.attrs["title"]
            infoserie += [s.split('(')[0].strip()]
        # get rating
        for rating in e.by_tag("div.rating-list")[:1]:
            rating = rating.attrs["title"]
            infoserie +=[rating[17:20]]
        # get genre
        for genre in e.by_tag("span.genre")[:1]:
            for m in genre.by_tag("a"):
                infoserie += [m.content]
        # get actors
        for actors in e.by_tag("span.credit"):
            for actors_sub in actors.by_tag("a"):

                infoserie +=[actors_sub.content]
        #get time
        for time in e.by_tag("span.runtime")[:1]:

            infoserie += [time.content[:3]]
        infotopserie =[]
        # encode to get rid of unicode error
        for encoding in infoserie:

            infotopserie += [encoding.encode('utf-8')]
        # add row to list
        infoSerieList.append(infotopserie)

    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.

    return infoSerieList  # replace this line as well as appropriate
Code Example #37
    def extract_percentages(dom):
        file_url = URL(TARGET_URL)
        file_dom = DOM(file_url.download())

        percentage_list = []
        if file_dom.by_class('percentage'):
            for item in file_dom.by_class('percentage'):
                percentage_list.append(item.content.encode('utf-8'))
            return percentage_list[0]
        else:
            return "nodata"
Code Example #38
File: scraper.py Project: netprofm/Project
    def extract_percentages(dom):
        file_url = URL(TARGET_URL)
        file_dom = DOM(file_url.download())

        percentage_list = []
        if file_dom.by_class('percentage'):
            for item in file_dom.by_class('percentage'):
                percentage_list.append(item.content.encode('utf-8'))
            return percentage_list[0]
        else:
            return "nodata"
Code Example #39
def scrape_movie_page(dom):
    '''
    Scrape the IMDB page for a single movie

    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.

    Returns:
        A list of strings representing the following (in order): title, year,
        duration, genre(s) (semicolon separated if several), director(s) 
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings.
    '''
    # YOUR SCRAPING CODE GOES HERE:
    # use the dom argument that was passed in; no url is defined in this scope
    #print dom.body.content
    csv_row = []
    for series in dom.by_tag('td.titleColumn'):    
        title = series.by_tag('a')[0]
        ranking = series.by_tag('td.ratingColumn')[0]
        genres = series.by_tag('span.genre')[0].by_tag('a')
        genres = [g.content for g in genres]
        actors = series.by_tag('span.credit')[0].by_tag('a')
        actors = [a.content for a in actors]
        try:
            runtime = series.by_tag('span.runtime')[0]
        except IndexError:
            runtime = "Unknown"
       

        csv_titles = title
        csv_ranking = ranking
        csv_genres = genres
        csv_actors = actors
        csv_runtime = runtime
        row = [csv_titles, csv_ranking, csv_genres, csv_actors, csv_runtime]
        csv_row.append(row)

    return csv_row

    '''
    No idea how this works; Python does not suit me. I barely finished last
    week's assignment. To me this looks exactly the same as last week, but it
    does not work. So this is an incomplete submission; I hope it does not
    mean I can no longer pass the module... Better luck next time. JS suits
    me a bit more than Python, to be honest.
    '''
    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return title, duration, genres, directors, writers, actors, rating, \
        n_ratings
Code Example #40
def scrape(url, f):

    week = url.split("/")
    week = week[-1]
    url = URL(url)
    dom = DOM(url.download(cached=True))
    # gives the week
    i = 1
    # select the top 40 list

    for l in dom.by_tag("ol.top40"):
        # select each individual track
        print "lijst top 40"
        for e in l.by_tag("div.clearfix")[0:40]:
            muziekGegevens = ""
            # position in the top 40
            muziekGegevens += str(i) + ","
            print i, 'positie'
            i += 1  # careful with resetting
            # select the artist
            for artiest in e.by_class(
                    "credit"):  # careful: not too many elements!
                muziekGegevens += artiest.content + ","
            # position
            for inner in e.by_tag("strong")[1:2]:
                print inner.content, "1:2"
                muziekGegevens += inner.content + ","
            # highest position reached
            for inner in e.by_tag("strong")[2:3]:
                print inner.content, "2:3"
                muziekGegevens += inner.content + ","
            # number of points
            for inner in e.by_tag("strong")[3:4]:
                print inner.content, "3:4"
                muziekGegevens += inner.content + ","
            # year of the track
            for inner in e.by_tag("strong")[4:5]:
                print inner.content.strip(), "4:5"
                muziekGegevens += inner.content.strip()
            h = HTMLParser.HTMLParser()
            muziekGegevens = h.unescape(muziekGegevens)

            if not whatisthis(muziekGegevens):
                muziekGegevens = unicode(muziekGegevens, "utf-8")
                f.write(muziekGegevens + "\n")
            else:
                f.write(muziekGegevens + "\n")
Code Example #41
def load_dom(url):
    r = requests.get(url)

    if r.status_code == 200:
        return DOM(r.content)

    return None
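
This variant fetches with requests and parses with pattern.web. A hedged usage sketch combining it with the selectors used elsewhere on this page:

import requests          # required by load_dom
from pattern.web import DOM

dom = load_dom('http://www.imdb.com/chart/top')
if dom is not None:
    print len(dom.by_tag('td.titleColumn'))  # 250 on the top 250 chart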
Code Example #42
 def setUp(self):
     with open(BACKUP_HTML, 'r') as f:
         dom = DOM(f.read())
         # Add the header for now as the extract_tvseries function does not
         # add a header itself.
         self.rows = [['Title', 'Ranking', 'Genre', 'Actors', 'Runtime']]
         self.rows.extend(extract_tvseries(dom))
Code Example #43
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))

    #return dom

    from pattern.web import abs  # pattern's url helper, not the builtin abs()
    for e in dom('.titleColumn'):
        for link in e('a'):
            movie_urls.append(abs(link.attributes.get('href'),
                                  base=url.redirect or url.string))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #44
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom('h3 a')[0].content)
    body = plaintext(dom('#contents')[0].content)
    return [title, body]
Code Example #45
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Download the HTML file
    url = URL(url)
    html = url.download()

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # append the href attribute to the string, but also add http://www.imdb.com/ in front of it
        movie_urls.append("http://www.imdb.com/" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Code Example #46
def get_artist_docs(name):

    default_dir = basedir + name
    rap_docs = ""

    # get a list of all the files in default dir
    for f in os.listdir(default_dir):
        # go to that dir
        os.chdir(default_dir)
        # open the file
        fi = open(f, 'r')
        # print "reading " + f
        # slurp
        page = fi.read()
        fi.close()

        # parse the raw HTML into a DOM
        dom = DOM(page)

        # we look at the page and get that the thing we want is in the .lyrics div.
        if dom and dom('.lyrics'):
            lyrics = dom('.lyrics')[0]
        else:
            continue

        p = plaintext(lyrics.content)
        rap_docs += p

    return rap_docs
Code Example #47
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) +
              "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get('href') for a in dom('a.link')]
    return links
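
A hedged sketch chaining this helper with the get_patent function from example #44; the keyword is purely illustrative:

for link in get_patent_urls("solar cell", limit=3):
    title, body = get_patent(link)
    print title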
Code Example #48
def all_lyrics(artist):
    clean = re.sub(r"\s+|'", '', artist)
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())
    titles = [a.content for a in dom('div#listAlbum a')]
    ew_amazon = [
        abs(link.attributes.get('href', ''), base=url.redirect or url.string)
        for link in dom('div#listAlbum a')
    ]
    songlinks = [l for l in ew_amazon if 'amazon' not in l]
    lyrics = []
    for link in songlinks:
        song_url = URL(link)
        song_dom = DOM(song_url.download())
        lyrics.append(plaintext(song_dom('div#main div')[4:5][0].content))
    zippy_lyrics = zip(titles, lyrics)
    return json.dumps(zippy_lyrics, sort_keys=True)
Code Example #49
 def get_dom_object(self, url_target):
     try:
         url = URL(url_target)
         dom_object = DOM(url.download(cached=True))
     except:
         print('Problem retrieving data for this url: ', url_target)
         self.url_query_timeout = 1
         dom_object = None  # otherwise the return below raises NameError
     return dom_object
Code Example #50
    def downloadContent(self):
        if not self.isWebPage():
            raise URLError("Invalid or empty content type")
        try:
            self.content = self.url.download(timeout=1)
        except httplib.InvalidURL:
            raise URLError("Invalid URL")

        self.decodeContent()
        self.dom = DOM(self.content)
Code Example #51
def box_office_titles():
    # download the webpage
    html = URL(BOX_OFFICE_URL).download()
    dom = DOM(html)

    # find the movie titles
    title_elements = dom(MOVIE_TITLE_TAG)
    titles = map(lambda x: x.content, title_elements)

    return titles
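
The two module constants come from the surrounding project; plausible, clearly assumed values would be:

BOX_OFFICE_URL = 'http://www.imdb.com/chart/boxoffice'  # assumption
MOVIE_TITLE_TAG = 'td.titleColumn a'                    # assumption
print box_office_titles()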
Code Example #52
    def extract_pic_url(self):
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')
 
        for tag in tag_list[:self.image_dl_per_search]:
            tar_str = re.search('imgurl=(.*)&imgrefurl', tag.attributes['href'])
            try:
                self.pic_url_list.append(tar_str.group(1))
            except:
                print('error parsing', tag)
Code Example #53
def main():
    '''
    Crawl the IMDB top 250 movies, save CSV with their information.

    Note:
        This function also makes backups of the HTML files in a sub-directory
        called HTML_BACKUPS (those will be used in grading).
    '''

    # Create a directory to store copies of all the relevant HTML files (those
    # will be used in testing).
    print 'Setting up backup dir if needed ...'
    create_dir(BACKUP_DIR)

    # Make backup of the IMDB top 250 movies page
    print 'Access top 250 page, making backup ...'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # extract the top 250 movies
    print 'Scraping top 250 page ...'
    url_strings = scrape_top_250(top_250_url)

    # grab all relevant information from the 250 movie web pages
    rows = []
    for i, url in enumerate(url_strings):  # Enumerate, a great Python trick!
        print 'Scraping movie %d ...' % i
        # Grab web page
        movie_html = URL(url).download(cached=True)
        # Extract relevant information for each movie
        movie_dom = DOM(movie_html)
        rows.append(scrape_movie_page(movie_dom))

        # Save one of the IMDB's movie pages (for testing)
        if i == 83:
            html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
            make_backup(html_file, movie_html)

    # Save a CSV file with the relevant information for the top 250 movies.
    print 'Saving CSV ...'
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
Code Example #54
def inflect(word, language="italian"):

    inflections = {}
    url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_") 
    dom = DOM(URL(url).download(throttle=10, cached=True))

    pos = ""

    # Search the header that marks the start for the given language:
    # <h2><span class="mw-headline" id="Italian">Italian</span></h2>

    e = dom("#" + language)[0].parent

    while e is not None: # e = e.next_sibling

        if e.type == "element":

            if e.tag == "hr": # Horizontal line = next language.
                break

            if e.tag == "h3": # <h3>Adjective [edit]</h3>
                pos = plaintext(e.content.lower())
                pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-"

            # Parse inflections, using regular expressions.

            s = plaintext(e.content)

            # affetto m (f affetta, m plural affetti, f plural affette)

            if s.startswith(word):

                for gender, regexp, i in (
                  ("m" , r"(" + word + r") m", 1),
                  ("f" , r"(" + word + r") f", 1),
                  ("m" , r"(" + word + r") (mf|m and f)", 1),
                  ("f" , r"(" + word + r") (mf|m and f)", 1),
                  ("m" , r"masculine:? (\S*?)(,|\))", 1),
                  ("f" , r"feminine:? (\S*?)(,|\))", 1),
                  ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3),
                  ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3),
                  ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3),
                  ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3),
                  ( "p", r"(\(|, )plural (\S*?)(,|\))", 2),
                  ( "p", r"m and f plural (\S*?)(,|\))", 1)):
                    m = re.search(regexp, s, re.I)
                    if m is not None:
                        # {"adj-m": "affetto", "adj-fp": "affette"}
                        inflections[pos + gender] = m.group(i)

            #print s

        e = e.next_sibling

    return inflections
Code Example #55
def extract_data_ML(i):
    url = 'http://macaulaylibrary.org/audio/%s' % i
    page = URL(url).download()
    dom = DOM(page)
    description = dom('meta')[0].attr['content']
    result = [x.content for x in dom('script') if 'jwplayer(' in x.content][0]
    result = [
        x.strip() for x in result.split('\n') if x.strip().startswith('file')
    ][0]
    path_to_mp3 = result.split('"')[1]
    return {'index': i, 'desc': description, 'mp3': path_to_mp3}
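
A hedged usage sketch; the catalog id below is hypothetical:

info = extract_data_ML(12345)  # hypothetical Macaulay Library id
print info['desc']
print info['mp3']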
Code Example #56
 def get_dom_object(self, url_target):
     try:
         session = HTMLSession()
         # get the html content
         response = session.get(url_target)
         # execute Java-script
         response.html.render(timeout=30, sleep=2)
         dom_object = DOM(response.html.html)
         return dom_object
     except:
         self.ErrorReason = 'Problem retrieving data for this url: ' + url_target + '.\nPlease check your Internet connection.'
         return None
Code Example #57
 def create_dom_object(self):
     """ Create dom object based on element for scraping
         Take into consideration that there might be query problem.
         
     """
     try:
         url = URL(self.full_url_str)
         self.dom_object = DOM(url.download(cached=True))
     except:
         if self.__print_url_finding_error:
             print 'Problem retrieving data for this url: ', self.full_url_str
         self.url_query_timeout = 1
Code Example #58
    def extract_pic_url(self):
        """ extract all the raw pic url in list
 
        """
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')
        print len(tag_list)
        for tag in tag_list[:self.nb_images]:
            tar_str = re.search('imgurl=(.*)&imgrefurl',
                                tag.attributes['href'])
            try:
                self.pic_url_list.append(tar_str.group(1))
            except:
                print 'error parsing', tag
Code Example #59
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []

    # retrieve TOP 250 DOM
    dom = DOM(url.download(cached=True))

    # add all absolute movie URLs to the list
    for movie in dom.by_tag("td.titleColumn"):
        movie_urls.append('http://www.imdb.com' + movie.by_tag('a')[0].href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls