def get_cast_crew(self, url):
    """
    Scrape the cast and crew tables found on the page at ``url``.

    The first two ``<div id="cast">`` sections of the page are read. Each
    section is stored in the result under the text of its own ``<h1>``
    heading, as a JSON string mapping a position to the list of names
    listed for that position (both reduced to plain ASCII).

    The keys u'Cast' and u'Production and Technical Credits' are always
    present in the result; they hold ``np.nan`` unless a section heading
    with exactly that text was parsed successfully. A section that is
    missing or cannot be parsed is recorded as ``np.nan``.

    Returns a dict of heading -> JSON string or ``np.nan``.
    """
    request = get_file(url)
    soup = BeautifulSoup(request.text)
    main_dic = {}
    default_keys = [u'Cast', u'Production and Technical Credits']
    for i, key in enumerate(default_keys):
        ## make sure the default key exists even if the page heading differs
        main_dic[key] = np.nan
        try:
            section = soup.findAll('div', {'id': 'cast'})[i]
            ## from here on the section is filed under its own heading text
            key = section.find('h1').text
            credits = {}
            for row in section.findAll('tr'):
                ## every row must have exactly three cells, otherwise the
                ## unpacking fails and the whole section is recorded as NaN
                position_cell, _, name_cell = row.findAll('td')
                position = unicodedata.normalize(
                    'NFKD', position_cell.text).encode('ascii', 'ignore')
                name = unicodedata.normalize(
                    'NFKD', name_cell.text).encode('ascii', 'ignore')
                credits.setdefault(position, []).append(name)
            value = json.dumps(credits)
        except Exception:
            ## best effort: any scraping/parsing problem yields NaN, but
            ## KeyboardInterrupt and SystemExit are no longer swallowed
            value = np.nan
        main_dic[key] = value
    return main_dic
def movie_list_to_df(self, movie_list):
    """
    Scrape every movie in ``movie_list`` and add its data to ``self._df``.

    movie_list -- iterable of link strings; each one is appended to
                  ``self._base_link`` to form the movie's page URL.

    For each movie the page is fetched and turned into a dict by
    ``scrape_movie_data_to_dic``. The cast and crew are fetched from the
    same URL with 'summary' replaced by 'cast-and-crew', and the resulting
    rows are concatenated onto ``self._df``.

    Returns nothing; ``self._df`` is rebound after each movie, so rows
    gathered before a failure are kept.
    """
    for movie in movie_list:
        summary_url = self._base_link + movie
        request = get_file(summary_url)
        ## build dic of future dataframe columns: dic[column] = value
        dic = self.scrape_movie_data_to_dic(request)
        cast_crew = self.get_cast_crew(
            summary_url.replace('summary', 'cast-and-crew'))
        ## merge together
        dic['Cast'] = cast_crew[u'Cast']
        dic['Crew'] = cast_crew[u'Production and Technical Credits']
        ## build new dataframe and add it to the total; DataFrame.append
        ## was removed in pandas 2.0, and concat([old, new]) is the
        ## operation it performed
        df = pd.DataFrame.from_dict(dic)
        self._df = pd.concat([self._df, df])
def get_year_page_movie_list(self, year):
    """
    Fetch the listing page for ``year`` and collect one link string for
    every row of its first table that has a budget filled in.

    The page is requested from ``self._base_link + '/movies/year/' + year``.
    A row counts as having a budget when its first ``td`` of class "data"
    exists and its text is longer than one character.

    Returns a list of strings, one per qualifying row.
    """
    ##get the webpage with all the movies from that year
    page = get_file(self._base_link + '/movies/year/' + str(year))
    first_table = BeautifulSoup(page.text).findAll('table')[0]

    links = []
    for table_row in first_table.findAll('tr'):
        budget_cell = table_row.find("td", {"class": "data"})
        ##skip rows whose budget cell is missing or effectively empty
        if budget_cell is None or len(budget_cell.text) <= 1:
            continue
        ##take the first double-quoted value in the anchor's markup
        ##(presumably the href -- relies on it being the first attribute)
        anchor_markup = table_row.find('a').decode()
        links.append(anchor_markup.split('"')[1])
    return links