# https://github.com/richardasaurus/imdb-pie
from imdbpie import Imdb

# anonymize=True proxies requests; the plain Imdb() constructor is also
# available but would be immediately discarded here, so build only one client.
imdb = Imdb(anonymize=True)

print(imdb.search_for_title("The Dark Knight"))
print()
print(imdb.search_for_person("Christian Bale"))
print()
print(imdb.get_episodes('tt0096697'))

# Iterate the Top 250 entries directly instead of indexing with range(len(...)).
for entry in imdb.top_250():
    print(entry)
print()

title = imdb.get_title_by_id("tt1210166")
for person in title.credits:
    # check if they are a writer
    if person.token == 'writers':
        print(person.name + ' is a writer')
    else:
        print(person.name + ' is not a writer')
def imdb_content(twitter_id, twitter_name):
    """Scrape IMDb basic info and biography for the person named *twitter_name*.

    Resolves the person's IMDb id via imdbpie (first search hit — assumes that
    hit is the right person; TODO confirm), scrapes the name page and the /bio
    page with lxml XPath, and — only when the scraped name matches
    *twitter_name* case-insensitively — writes two JSON records through
    write_imdb_data(). All scraping steps are best-effort: a failed extraction
    leaves the corresponding field empty rather than aborting.

    Args:
        twitter_id: opaque identifier copied verbatim into both output records.
        twitter_name: display name used for the IMDb search and the exact-match
            gate.
    """
    imdb = Imdb()  # imdb = Imdb(anonymize=True)  # to proxy requests
    # Don't shadow the builtin id().
    imdb_id = imdb.search_for_person(twitter_name)[0]['imdb_id']
    hxs = lxml.html.document_fromstring(
        requests.get("http://www.imdb.com/name/" + str(imdb_id)).content)
    name = ''
    try:
        name = hxs.xpath('//*[@id="overview-top"]/h1/span/text()')[0].strip()
    except Exception:
        # Page layout variant without the expected header; leave name empty so
        # the case-insensitive gate below fails and nothing is written.
        pass
    if name.lower() == twitter_name.lower():
        try:
            occupation = [
                o.strip() for o in hxs.xpath(
                    '//*[@id="name-job-categories"]/a[*]/span/text()')
            ]
        except Exception:
            occupation = ''
        try:
            birthday = hxs.xpath(
                '//*[@id="name-born-info"]/time')[0].attrib['datetime']
        except Exception:
            birthday = ''
        try:
            death = hxs.xpath(
                '//*[@id="name-death-info"]/time')[0].attrib['datetime']
        except Exception:
            death = ""
        hxs_bio = lxml.html.document_fromstring(
            requests.get(
                "http://www.imdb.com/name/" + imdb_id + "/bio").content)
        try:
            content = ''.join(
                hxs_bio.xpath('//*[@id="bio_content"]/div[2]/p[1]/text()'))
        except Exception:
            content = ''
        try:
            spouse = hxs_bio.xpath(
                '//*[@id="tableSpouses"]//tr/td[1]/a/text()')
        except Exception:
            spouse = ''
        try:
            children_content = hxs_bio.xpath(
                '//*[@id="tableSpouses"]//tr/td[2]/text()')
            children = 0
            # Pad "(" so text like "(2 children" splits into the two groups
            # the pattern expects. Compile once, outside the loop.
            child_re = re.compile(r'(.*) (.*) child')
            for line in children_content:
                # NOTE: the original encoded to UTF-8 bytes here, which breaks
                # on Python 3 (bytes.replace with str args); operate on str.
                cleaned = line.replace('(', '( ').strip()
                m = child_re.search(cleaned)
                if m is not None:
                    children += int(m.group(2))
        except Exception:
            children = ''
        basic_info = {
            'id': twitter_id,
            'name': name,
            'occupation': occupation,
            'birthday': birthday,
            'death': death,
            'spouse': spouse,
            'children': str(children)
        }
        content_info = {'id': twitter_id, 'content': content.strip()}
        write_imdb_data(json.dumps(basic_info), json.dumps(content_info))
# Map actor display names (first column of Actor_names.xlsx) to numeric
# IMDb ids via imdbpie search. Best-effort: names whose search fails are
# skipped. (The original initialized actor_names twice; once is enough.)
names = pd.read_excel('Actor_names.xlsx')
actor_names = []
actor_ids = []
for _, row in names.iterrows():
    actor_names.append(row[0])
print(len(actor_names))

for name in actor_names:
    try:
        actor_name = imdb.search_for_person(str(name))
    except Exception:
        # Search failed for this name; skip it rather than aborting the run.
        continue
    # Strip the 'nm' prefix to keep only the numeric portion of the id.
    imdb_num = actor_name[0]['imdb_id'].replace('nm', '')
    print('Converting ' + str(name) + ' to IMDB id: ' + str(imdb_num))
    actor_ids.append(imdb_num)

print('Total actor count: ' + str(len(actor_ids)))

# Accumulators for the downstream age-prediction pass.
no_pic = []
ids = pd.DataFrame(actor_ids)
ac_ids = {}
ac_ages = {}
ac_names = []
ac_pred_age = []
# Map actor display names (first column of Actor_names.xlsx) to numeric
# IMDb ids via imdbpie search. Best-effort: names whose search fails are
# skipped. (The original initialized actor_ids and actor_names twice each;
# a single initialization suffices.)
names = pd.read_excel('Actor_names.xlsx')
actor_names = []
actor_ids = []
for _, row in names.iterrows():
    actor_names.append(row[0])
print(len(actor_names))

for name in actor_names:
    try:
        actor_name = imdb.search_for_person(str(name))
    except Exception:
        # Search failed for this name; skip it rather than aborting the run.
        continue
    # Strip the 'nm' prefix to keep only the numeric portion of the id.
    imdb_num = actor_name[0]['imdb_id'].replace('nm', '')
    print('Converting ' + str(name) + ' to IMDB id: ' + str(imdb_num))
    actor_ids.append(imdb_num)

print('Total actor count: ' + str(len(actor_ids)))

# Accumulators for the downstream age-prediction pass.
no_pic = []
ids = pd.DataFrame(actor_ids)
ac_ids = {}
ac_ages = {}
ac_names = []
ac_pred_age = []