titleElement = element.find_element_by_css_selector('a') link = titleElement.get_attribute('href') title = titleElement.get_attribute('title') movie = Movie(title, link) movies.append(movie) for movie in movies: driver.get(movie.link) print('Processing ', movie.title) movie.synopsis = driver.find_element_by_css_selector('div#sinopsis').text datos = driver.find_element_by_css_selector( 'div#tecnicos > p').get_attribute('innerHTML') for dato in datos.split('<br>'): if 'Género' in dato: movie.genre = dato.split(': ')[1] if 'Director' in dato: movie.director = dato.split(': ')[1] if 'Actores' in dato: movie.cast = dato.split(': ')[1].split(',') if 'Duración' in dato: movie.duration = dato.split(': ')[1] rooms = driver.find_elements_by_css_selector('.accordion > div.card.panel') for r in rooms: room = r.find_element_by_css_selector('h2.panel-title') types = r.find_elements_by_css_selector( '.movie-showtimes-component-combination') for t in types: type_data = list( map( lambda x: x.strip(), t.find_element_by_css_selector( '.movie-showtimes-component-label small').
# Build one Movie per <li class="ui-slide-item"> element found in `soup`,
# copying the element's data-* attributes (and its <img> src) onto the object.
tags = soup.find_all('li', class_="ui-slide-item")
movies = []
for tag in tags:
    # print("=========================")
    # print(tag)
    # soup.select('li[data-actors]')
    # print(tag.li)
    movie = Movie()
    # Lead actors
    movie.actors = tag.attrs['data-actors']
    # Director
    movie.director = tag.attrs['data-director']
    # Running time
    movie.duration = tag.attrs['data-duration']
    # Douban rating
    movie.rate = tag.attrs["data-rate"]
    # Release region
    movie.region = tag.attrs["data-region"]
    # Release date
    movie.release = tag.attrs["data-release"]
    # Movie title
    movie.title = tag.attrs["data-title"]
    # URL of the related video (trailer)
    movie.trailer = tag.attrs["data-trailer"]
    # Movie cover image, taken from the nested <img> tag
    movie.cover = tag.img.attrs["src"]
    # Douban URL
def read_csv_file(self):
    """Load the movie CSV file and populate this reader's datasets.

    Every row's raw column values are appended to the per-column lists
    (``self.movies``, ``self.actors``, ``self.genres``, ...).  A ``Movie``
    is then built for each title and appended to
    ``self.dataset_of_movies`` unless an equal movie is already present,
    and the distinct actors, genres and directors are collected into
    ``self.dataset_of_actors``, ``self.dataset_of_genres`` and
    ``self.dataset_of_directors``.

    The optional numeric columns (runtime, rating, revenue, metascore)
    are left unset on the movie when the CSV holds the placeholder
    ``"N/A"``.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if an expected column is missing from a row.
        ValueError: if ``Year`` is not an integer, or an optional numeric
            column holds something other than a number or ``"N/A"``.
    """
    missing = "N/A"

    def add_unique(raw_values, factory, dataset):
        # Each raw value is a comma-separated list of names.  Collapse the
        # whitespace in every name and keep one object per distinct name.
        for raw in raw_values:
            for name in raw.split(','):
                item = factory(" ".join(name.split()))
                if item not in dataset:
                    dataset.append(item)

    with open(self.__file_name, mode='r', encoding='utf-8-sig') as csvfile:
        for row in csv.DictReader(csvfile):
            self.movies.append(row['Title'])
            self.actors.append(row['Actors'])
            self.genres.append(row['Genre'])
            self.directors.append(row['Director'])
            self.year.append(int(row['Year']))
            self.description.append(row['Description'])
            self.runtime.append(row['Runtime (Minutes)'])
            self.external_ratings.append(row['Rating'])
            self.votes.append(row['Votes'])
            self.metascores.append(row['Metascore'])
            self.revenues.append(row['Revenue (Millions)'])

    for index, raw_title in enumerate(self.movies):
        # Commas inside a title are replaced by single spaces.
        title = " ".join(raw_title.split(','))
        movie = Movie(title, self.year[index])
        if movie in self.dataset_of_movies:
            continue

        self.dataset_of_movies.append(movie)
        movie.description = self.description[index]
        movie.director = Director(self.directors[index])
        for genre in self.genres[index].split(','):
            movie.add_genre(Genre(genre))
        for actor in self.actors[index].split(','):
            movie.add_actor(Actor(actor))

        # Compare this row's value with the placeholder.  Comparing the
        # whole column list to "N/A" is always False and lets the
        # placeholder reach int()/float().
        if self.runtime[index] != missing:
            movie.runtime_minutes = int(self.runtime[index])
        if self.external_ratings[index] != missing:
            movie.external_rating = float(self.external_ratings[index])
        if self.revenues[index] != missing:
            movie.revenue = float(self.revenues[index])
        if self.metascores[index] != missing:
            movie.metascore = int(self.metascores[index])

    add_unique(self.actors, Actor, self.dataset_of_actors)
    add_unique(self.genres, Genre, self.dataset_of_genres)
    add_unique(self.directors, Director, self.dataset_of_directors)
#
#----------------------------------
from movie import Movie
import fresh_tomatoes

# Movies
# Each entry is created from four positional values (title, short summary,
# poster image path, YouTube trailer URL); the remaining details are attached
# as plain attributes afterwards.

# Arrival
arrival = Movie(
    "Arrival",
    "A linguist is recruited by the military to assist in translating "
    "alien communications.",
    "./images/arrival.jpg",
    "https://www.youtube.com/watch?v=ZLO4X6UI8OY",
)
arrival.storyline = (
    "When mysterious spacecraft touch down across the globe, an elite team - "
    "led by expert linguist Louise Banks - is brought together to investigate. "
    "As mankind teeters on the verge of global war, Banks and the team race "
    "against time for answers - and to find them, she will take a chance that "
    "could threaten her life, and quite possibly humanity."
)
arrival.director = "Denis Villeneuve"
arrival.writers = (
    'Eric Heisserer (screenplay), '
    'Ted Chiang (based on the story "Story of Your Life" written by)'
)
arrival.stars = "Amy Adams, Jeremy Renner, Forest Whitaker"
arrival.taglines = "Why are they here?"
arrival.genres = "Drama | Mystery | Sci-Fi | Thriller"
arrival.country = "USA"
arrival.language = "English"
arrival.release_date = "11 November 2016 (Norway)"
arrival.runtime = "116 min"

# Kong: Skull Island
kong_skull_island = Movie(
    "Kong: Skull Island",
    "An action/adventure story centered on King Kong's origins.",
    "./images/KongSkullIsland.jpg",
    "https://www.youtube.com/watch?v=h9y6oPka3us",
)
def get_info():
    """Consume movie IDs from ``mvIDQ`` and publish their features on ``mvINQ``.

    After an initial 5 second delay, the loop keeps taking IDs for as long
    as ``mvIDQ`` is non-empty or the global ``stage`` equals 0.  For every
    ID a populated ``Movie`` is put on ``mvINQ``.  If anything raises while
    the movie is being built, the error is printed, the bare ID is put on
    ``mvINQ`` instead, and the worker pauses for one second.

    ``mvIDQ.task_done()`` is called exactly once for every ID taken,
    whether or not processing succeeded, so ``mvIDQ.join()`` can return.
    """
    global stage
    time.sleep(5)
    print("-CRAWLER- Start to get movie feature...")
    while (not mvIDQ.empty()) or stage == 0:
        # Taken outside the try block so task_done() below is only ever
        # paired with an ID that was really dequeued.
        mvID = mvIDQ.get()
        try:
            mvINQ.put(_build_movie(mvID))
        # TODO cannot handle exception
        except Exception as e:
            print('-CRAWLER- An {} exception occured! {}'.format(e, mvID))
            mvINQ.put(mvID)
            # Pause briefly after a failure before taking the next ID.
            time.sleep(1)
        finally:
            mvIDQ.task_done()


# (source key, Movie attribute, entries are looked up by ['name']) for every
# list-valued field that is flattened into a single '$'-separated string.
_JOINED_FIELDS = (
    ('genres', 'genres', False),
    ('color info', 'color_info', False),
    ('director', 'director', True),
    ('countries', 'countries', False),
    ('languages', 'languages', False),
    ('writer', 'writer', True),
    ('editor', 'editor', True),
    ('cinematographer', 'cinematographer', True),
    ('art direction', 'art_director', True),
    ('costume designer', 'costume_designer', True),
    ('original music', 'original_music', True),
    ('sound mix', 'sound_mix', False),
    ('production companies', 'production_companies', True),
)


def _build_movie(mvID):
    """Fetch movie ``mvID`` through ``imdb_access`` and copy it into a ``Movie``.

    Raises whatever the lookup raises; indexing the cast also raises when
    the fetched record has no usable ``'cast'`` entry.
    """
    mvIN = imdb_access.get_movie(mvID)
    mvOJ = Movie()
    mvOJ.id = mvID
    mvOJ.title = mvIN.get('title')
    mvOJ.cover_url = mvIN.get('cover url')
    mvOJ.giant_cover_url = mvIN.get('full-size cover url')

    # Fields absent from the record are left at whatever Movie() set.
    for key, attr, by_name in _JOINED_FIELDS:
        if mvIN.has_key(key):
            setattr(mvOJ, attr, _join_values(mvIN.get(key), by_name))

    # Up to three leading cast members; the first one is required.
    cast = mvIN.get('cast')
    mvOJ.cast_1st = cast[0]['name']
    if len(cast) >= 2:
        mvOJ.cast_2nd = cast[1]['name']
    if len(cast) >= 3:
        mvOJ.cast_3rd = cast[2]['name']

    mvOJ.year = mvIN.get('year') if mvIN.has_key('year') else 0
    mvOJ.runtimes = _parse_runtime(mvIN)

    # budget int
    # if 'budget' in mvIN:
    #     mvOJ.budget = mvIN.get('budget')

    # get rating for old movies
    if mode == "old":
        mvOJ.number_of_votes = get_rating(mvID)
    return mvOJ


def _join_values(values, by_name):
    """Join ``values`` with '$', using each entry's ``['name']`` when ``by_name``."""
    if by_name:
        return '$'.join(entry['name'] for entry in values)
    return '$'.join(values)


def _parse_runtime(mvIN):
    """Return the first ``'runtimes'`` entry as an int, or 0 if absent or unparsable.

    When the entry's text contains a colon, the part after the first colon
    is the one converted.
    """
    if not mvIN.has_key('runtimes'):
        return 0
    try:
        first = mvIN.get('runtimes')[0]
        if str(first).find(':') != -1:
            return int(str(first).split(':')[1])
        return int(first)
    except Exception:
        return 0