def getMovieCast(dataF, movieID, indexF, keyF, attrIF, attrKF, offsList=None,
                 charNF=None, doCast=0, doWriters=0):
    """Read the specified files and return a list of Person objects,
    one for every person in offsList credited in the given movie.

    dataF, doCast and doWriters are passed to getRawData together with
    each offset.  indexF/keyF are used (through getLabel) to look up the
    name of the person; attrIF/attrKF to look up the text stored as the
    notes of the credit.  When charNF is given, non-empty roles are
    resolved through getCharactersIDs.

    offsList defaults to None, which is treated as "no offsets" (an empty
    list is returned).  Duplicated offsets are processed only once, and
    credits whose person name cannot be resolved are skipped.
    """
    if offsList is None:
        offsList = ()
    resList = []
    # Offsets already handled; assumes offsets are hashable values
    # (presumably integer file positions - TODO confirm).
    seenOffsets = set()
    for offset in offsList:
        # One round for person is enough.
        if offset in seenOffsets:
            continue
        seenOffsets.add(offset)
        personID, movies = getRawData(dataF, offset, doCast, doWriters)
        # Consider only the current movie.
        movielist = [x for x in movies if x.get('movieID') == movieID]
        # XXX: a person can be listed more than one time for a single movie:
        #      think about directors of TV series.
        # XXX: here, 'movie' is a dictionary as returned by the getRawData
        #      function, not a Movie class instance.
        for movie in movielist:
            name = getLabel(personID, indexF, keyF)
            if not name:
                continue
            curRole = movie.get('currentRole', u'')
            roleID = None
            if curRole and charNF:
                curRole, roleID = getCharactersIDs(curRole, charNF)
            p = Person(name=name, personID=personID, currentRole=curRole,
                       roleID=roleID, accessSystem='local')
            if 'attributeID' in movie:
                attr = getLabel(movie['attributeID'], attrIF, attrKF)
                if attr:
                    p.notes = attr
            # Used to sort cast.
            if 'position' in movie:
                p.billingPos = movie['position'] or None
            resList.append(p)
    return resList
def _findRefs(self, o, trefs, nrefs):
    """Find titles or names references in strings.

    o can be a string, a list/tuple or a dictionary; containers are
    walked recursively (for dictionaries, only the values).  Every match
    of re_titleRef / re_nameRef that can be resolved to an ID is stored
    in trefs (as a Movie) or nrefs (as a Person), keyed by the canonical
    form and by the alternative spellings found.

    trefs and nrefs are modified in place; the tuple (trefs, nrefs) is
    also returned.
    """
    if isinstance(o, (UnicodeType, StringType)):
        for title in re_titleRef.findall(o):
            a_title = analyze_title(title, canonical=1)
            rtitle = build_title(a_title, canonical=1, ptdf=1)
            # Already resolved: nothing to do.
            if rtitle in trefs:
                continue
            movieID = self._getTitleID(rtitle)
            if movieID is None:
                # Retry with the title exactly as found in the text.
                movieID = self._getTitleID(title)
            if movieID is None:
                continue
            m = Movie(title=rtitle, movieID=movieID,
                      accessSystem=self.accessSystem)
            trefs[rtitle] = m
            # Register the same object under the other spellings, too.
            rtitle2 = canonicalTitle(a_title.get('title', u''))
            if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                trefs[rtitle2] = m
            if title != rtitle:
                trefs[title] = m
        for name in re_nameRef.findall(o):
            a_name = analyze_name(name, canonical=1)
            rname = build_name(a_name, canonical=1)
            # Already resolved: nothing to do.
            if rname in nrefs:
                continue
            personID = self._getNameID(rname)
            if personID is None:
                # Retry with the name exactly as found in the text.
                personID = self._getNameID(name)
            if personID is None:
                continue
            p = Person(name=rname, personID=personID,
                       accessSystem=self.accessSystem)
            nrefs[rname] = p
            # Register the same object under the other spellings, too.
            rname2 = normalizeName(a_name.get('name', u''))
            if rname2 and rname2 != rname:
                nrefs[rname2] = p
            if name != rname and name != rname2:
                nrefs[name] = p
    elif isinstance(o, (ListType, TupleType)):
        for item in o:
            self._findRefs(item, trefs, nrefs)
    elif isinstance(o, DictType):
        for value in o.values():
            self._findRefs(value, trefs, nrefs)
    return (trefs, nrefs)
def postprocess_data(self, data):
    """Turn the collected references into Person and Movie objects.

    For each of the 'names refs' and 'titles refs' sections, data is
    expected to hold a sequence of (text, link) pairs.  Pairs where
    either element is empty after stripping whitespace are discarded.
    Returns a dictionary with one sub-dictionary per section, mapping
    the stripped text to the object built for it.
    """
    refs_by_section = {}
    for section in ('names refs', 'titles refs'):
        section_refs = refs_by_section[section] = {}
        for raw_text, raw_link in data.get(section, []):
            text = raw_text.strip()
            link = raw_link.strip()
            if not text or not link:
                continue
            imdbID = analyze_imdbid(link)
            if section == 'names refs':
                ref = Person(personID=imdbID, name=text,
                             accessSystem=self._as,
                             modFunct=self._modFunct)
            else:
                # The only other section is 'titles refs'.
                ref = Movie(movieID=imdbID, title=text,
                            accessSystem=self._as,
                            modFunct=self._modFunct)
            section_refs[text] = ref
    return refs_by_section
def postprocess_data(self, data):
    """Turn the collected references into Person, Movie and Character
    objects.

    For each of the 'names refs', 'titles refs' and 'characters refs'
    sections, data is expected to hold a sequence of (text, link) pairs.
    Only links ending with '/' are considered.  Returns a dictionary with
    one sub-dictionary per section, mapping the text to the object built
    for it.
    """
    def make_ref(section, imdbID, text):
        # Build the object matching the kind of reference.
        if section == 'names refs':
            return Person(personID=imdbID, name=text,
                          accessSystem=self._as, modFunct=self._modFunct)
        if section == 'titles refs':
            return Movie(movieID=imdbID, title=text,
                         accessSystem=self._as, modFunct=self._modFunct)
        return Character(characterID=imdbID, name=text,
                         accessSystem=self._as, modFunct=self._modFunct)

    refs_by_section = {}
    for section in ('names refs', 'titles refs', 'characters refs'):
        section_refs = {}
        refs_by_section[section] = section_refs
        for text, link in data.get(section, []):
            if link.endswith('/'):
                # XXX: companies aren't handled: are they ever found in
                #      text, as links to their page?
                section_refs[text] = make_ref(section,
                                              analyze_imdbid(link), text)
    return refs_by_section
def get_movie_main(self, movieID):
    """Collect the main information about a title.

    Starting from the result of self._base_title_info, the data is
    extended with what is found in the title_crew (directors and
    writers), title_episode, title_principals (people grouped by
    category), title_ratings and title_akas tables.

    Returns a dictionary with the 'data' and 'info sets' keys.
    """
    movieID = int(movieID)
    data = self._base_title_info(movieID)
    movies_cache = {movieID: data}
    persons_cache = {}

    def single_row(table_name):
        # First row of the table for this title, with renamed keys
        # (an empty row if nothing is found).
        table = self.T[table_name]
        row = table.select(table.c.tconst == movieID).execute().fetchone()
        return self._rename(table_name, dict(row or {}))

    # Directors and writers.
    crew_info = single_row('title_crew')
    directors = []
    writers = []
    for job, people in (('director', directors), ('writer', writers)):
        for raw_id in split_array(crew_info.get(job) or ''):
            if not raw_id:
                continue
            person_id = int(raw_id)
            info = self._base_person_info(person_id,
                                          movies_cache=movies_cache,
                                          persons_cache=persons_cache)
            people.append(Person(personID=person_id, data=info,
                                 accessSystem=self.accessSystem))
    crew_info['director'] = directors
    crew_info['writer'] = writers
    data.update(crew_info)

    # Information about the series, if this title has a parent.
    episode_info = single_row('title_episode')
    if 'parentTconst' in episode_info:
        episode_info['episodes of'] = self._base_title_info(
            episode_info['parentTconst'])
    self._clean(episode_info, ('parentTconst', ))
    data.update(episode_info)

    # Principals, grouped by category; actors, actresses and 'self'
    # are all collected under 'cast'.
    principals = self.T['title_principals']
    principal_rows = principals.select(
        principals.c.tconst == movieID).execute().fetchall() or []
    credits_by_category = {}
    for principal_row in principal_rows:
        raw_row = dict(principal_row)
        renamed = self._rename('title_principals', dict(raw_row))
        category = renamed.get('category')
        if not category:
            continue
        if category in ('actor', 'actress', 'self'):
            category = 'cast'
        # Note that the row with the original keys is stored.
        credits_by_category.setdefault(category, []).append(raw_row)
    for category, credits in credits_by_category.items():
        credits.sort(key=itemgetter('ordering'))
        persons = []
        for credit in credits:
            person_id = credit.get('nconst')
            if not person_id:
                continue
            info = self._base_person_info(person_id,
                                          movies_cache=movies_cache,
                                          persons_cache=persons_cache)
            persons.append(Person(personID=person_id, data=info,
                                  billingPos=credit.get('ordering'),
                                  currentRole=credit.get('characters'),
                                  notes=credit.get('job'),
                                  accessSystem=self.accessSystem))
        data[category] = persons

    # Ratings.
    data.update(single_row('title_ratings'))

    # Alternative titles.
    akas_table = self.T['title_akas']
    aka_rows = akas_table.select(akas_table.c.titleId == movieID).execute()
    akas_list = []
    for aka_row in aka_rows:
        aka = self._rename('title_akas', dict(aka_row)) or {}
        # Drop the empty values and the keys that are not needed.
        for key in [k for k in aka if not aka[k]]:
            del aka[key]
        for key in ('t_soundex', 'movieID'):
            aka.pop(key, None)
        for key in ('types', 'attributes'):
            if key in aka:
                aka[key] = split_array(aka[key])
        akas_list.append(aka)
    if akas_list:
        data['akas'] = akas_list

    self._clean(data, ('movieID', 't_soundex'))
    return {'data': data, 'info sets': self.get_movie_infoset()}
if p[4] in ('actor', 'actress'): p[4] = 'cast' # Regroup by role/duty (cast, writer, director, ...) castdata[:] = _groupListBy(castdata, 4) for group in castdata: duty = group[0][4] for pdata in group: curRole = pdata[1] curRoleID = None if curRole is not None: robj = CharName.get(curRole) curRole = robj.name curRoleID = robj.id p = Person(personID=pdata[0], name=pdata[5], currentRole=curRole or u'', roleID=curRoleID, notes=pdata[2] or u'', accessSystem='sql') if pdata[6]: p['imdbIndex'] = pdata[6] p.billingPos = pdata[3] res.setdefault(duty, []).append(p) if duty == 'cast': res[duty] = merge_roles(res[duty]) res[duty].sort() # Info about the movie. minfo = [(self._info[m.infoTypeID], m.info, m.note) for m in MovieInfo.select(MovieInfo.q.movieID == movieID)] minfo = _groupListBy(minfo, 0) for group in minfo: sect = group[0][0] for mdata in group:
def build_person(txt, personID=None, billingPos=None,
                 roleID=None, accessSystem='http', modFunct=None):
    """Return a Person instance from the typical <tr>...</tr> strings
    found in the IMDb's web site.

    txt is split into the name and an optional role/notes part, using
    '....' or '...' as separator (or the first open parenthesis, when no
    separator is found).

    roleID selects how the second part is interpreted:
      - None: there is no role; everything is stored as notes.
      - a single value: the text before the first '(' is the role,
        the rest are the notes.
      - a list: the text is split on '/' into multiple roles, each one
        with its own optional notes; the list of IDs is padded with None
        or truncated to match the number of roles.  A list passed by the
        caller is never modified.

    personID and the role IDs are converted to strings, when not None.
    """
    notes = u''
    role = u''
    # Search the (optional) separator between name and role/notes.
    if '....' in txt:
        sep = '....'
    elif '...' in txt:
        sep = '...'
    else:
        sep = '...'
        # Replace the first parenthesis, assuming there are only
        # notes, after.
        # Rationale: no imdbIndex is (ever?) showed on the web site.
        txt = txt.replace('(', '...(', 1)
    txt_split = txt.split(sep, 1)
    name = txt_split[0].strip()
    if len(txt_split) == 2:
        role_comment = txt_split[1].strip()
        # Strip common endings.
        if role_comment[-4:] == ' and':
            role_comment = role_comment[:-4].rstrip()
        elif role_comment[-2:] == ' &':
            role_comment = role_comment[:-2].rstrip()
        elif role_comment[-6:] == '& ....':
            role_comment = role_comment[:-6].rstrip()
        # Get the notes.
        if roleID is not None:
            if not isinstance(roleID, list):
                cmt_idx = role_comment.find('(')
                if cmt_idx != -1:
                    role = role_comment[:cmt_idx].rstrip()
                    notes = role_comment[cmt_idx:]
                else:
                    # Just a role, without notes.
                    role = role_comment
            else:
                # Multiple roles are split (with their notes) below.
                role = role_comment
        else:
            # We're managing something that doesn't have a 'role', so
            # everything are notes.
            notes = role_comment
    if role == '....':
        role = u''
    roleNotes = []
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        rolesplit = role.split('/')
        role = []
        for r in rolesplit:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        # Make the IDs match the roles one to one; new lists are built,
        # so that the list received from the caller is left untouched.
        if lr > lrid:
            roleID = roleID + [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        roleID = [rid if rid is None else str(rid) for rid in roleID]
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
            notes = roleNotes[0] or u''
    elif roleID is not None:
        roleID = str(roleID)
    if personID is not None:
        personID = str(personID)
    if (not name) or (personID is None):
        # Set to 'debug', since build_person is expected to receive some crap.
        _b_p_logger.debug('empty name or personID for "%s"', txt)
    # XXX: return None if something strange is detected?
    person = Person(name=name, personID=personID, currentRole=role,
                    roleID=roleID, notes=notes, billingPos=billingPos,
                    modFunct=modFunct, accessSystem=accessSystem)
    # Per-role notes only make sense when there still are multiple roles
    # (a single role was collapsed to a scalar above, and its notes are
    # already stored in 'notes').
    if isinstance(roleID, list) and len(roleNotes) == len(roleID):
        for idx, roleObj in enumerate(person.currentRole):
            if roleNotes[idx]:
                roleObj.notes = roleNotes[idx]
    return person
# Script: look up a fixed list of people on IMDb by name.
from imdb import IMDb
from imdb.Person import Person
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import time

# Object used for the searches below.
ia = IMDb()
# NOTE(review): 'per' is not used in the visible part of the script.
per = Person()
# Names to search for.
names = [
    'Deepika Padukone', 'Priyanka Chopra', 'Alia Bhatt',
    'Aishwarya Rai Bachchan', 'Anushka Sharma', 'Kareena Kapoor',
    'Kangana Ranaut', 'Katrina Kaif', 'Shraddha Kapoor', 'Karisma Kapoor',
    'Kajol', 'Madhuri Dixit', 'Kriti Sanon', 'Vidya Balan', 'Sonam K Ahuja',
    'Sridevi', 'Rani Mukerji', 'Parineeti Chopra', 'Jacqueline Fernandez',
    'Disha Patani', 'Rekha', 'Kiara Advani', 'Taapsee Pannu',
    'Sharmila Tagore', 'Ileana DCruz', 'Bipasha Basu', 'Babita Shivdasani',
    'Preity Zinta', 'Hema Malini', 'Yami Gautam', 'Sonakshi Sinha',
    'Sunny Leone', 'Tabu', 'Smita Patil', 'Madhubala', 'Juhi Chawla',
    'Nargis'
]
# np.unique drops duplicates and returns the names as a sorted array.
names = np.unique(names)
# Accumulators; none of them is used in the visible part of the script.
# NOTE(review): presumably filled by code further down - confirm.
c = 0
average_movies = np.zeros(100)
years = []
spike_point = []
for name in names:
    name = name.strip()
    # Search the person by name; the result is kept in 'person'.
    person = ia.search_person(name)
def get_movie_main(self, movieID):
    """Collect the main information about a title.

    Starting from the result of self._base_title_info, the data is
    extended with what is found in the title_crew (directors and
    writers), title_episode, title_principals (cast), title_ratings
    and title_akas tables.

    Returns a dictionary with the 'data' and 'info sets' keys.
    """
    movieID = int(movieID)
    data = self._base_title_info(movieID)
    _movies_cache = {movieID: data}
    _persons_cache = {}

    def build_people(raw_ids):
        # Turn an encoded array of person IDs into a list of Person objects.
        people = []
        for personID in split_array(raw_ids or ''):
            if not personID:
                continue
            personID = int(personID)
            person_data = self._base_person_info(
                personID, movies_cache=_movies_cache,
                persons_cache=_persons_cache)
            people.append(Person(personID=personID, data=person_data,
                                 accessSystem=self.accessSystem))
        return people

    # Directors and writers.
    tc = self.T['title_crew']
    movie = tc.select(tc.c.tconst == movieID).execute().fetchone() or {}
    tc_data = self._rename('title_crew', dict(movie))
    directors = build_people(tc_data.get('director'))
    writers = build_people(tc_data.get('writer'))
    tc_data['director'] = directors
    tc_data['writer'] = writers
    data.update(tc_data)

    # Information about the series, if this title has a parent.
    # The query must be built on the title_episode table itself:
    # selecting from title_crew with a condition on title_episode
    # would return rows of the wrong table.
    te = self.T['title_episode']
    movie = te.select(te.c.tconst == movieID).execute().fetchone() or {}
    te_data = self._rename('title_episode', dict(movie))
    if 'parentTconst' in te_data:
        te_data['episodes of'] = self._base_title_info(
            te_data['parentTconst'])
    self._clean(te_data, ('parentTconst', ))
    data.update(te_data)

    # Cast.
    tp = self.T['title_principals']
    movie = tp.select(tp.c.tconst == movieID).execute().fetchone() or {}
    tp_data = self._rename('title_principals', dict(movie))
    tp_data['cast'] = build_people(tp_data.get('cast'))
    data.update(tp_data)

    # Ratings.
    tr = self.T['title_ratings']
    movie = tr.select(tr.c.tconst == movieID).execute().fetchone() or {}
    tr_data = self._rename('title_ratings', dict(movie))
    data.update(tr_data)

    # Alternative titles.
    ta = self.T['title_akas']
    akas = ta.select(ta.c.titleId == movieID).execute()
    akas_list = []
    for aka in akas:
        ta_data = self._rename('title_akas', dict(aka)) or {}
        # Drop the empty values and the keys that are not needed.
        for key in list(ta_data.keys()):
            if not ta_data[key]:
                del ta_data[key]
        for key in 't_soundex', 'movieID':
            if key in ta_data:
                del ta_data[key]
        for key in 'types', 'attributes':
            if key not in ta_data:
                continue
            ta_data[key] = split_array(ta_data[key])
        akas_list.append(ta_data)
    if akas_list:
        data['akas'] = akas_list

    self._clean(data, ('movieID', 't_soundex'))
    return {'data': data, 'info sets': self.get_movie_infoset()}
def build_person(txt, personID=None, billingPos=None,
                 roleID=None, accessSystem='http', modFunct=None):
    """Return a Person instance from the typical <tr>...</tr> strings
    found in the IMDb's web site.

    txt is split into the name and an optional role/notes part, using
    '....' or '...' as separator (or the first open parenthesis, when no
    separator is found); in the second part, the text before the first
    '(' is the role and the rest are the notes.

    If roleID is a list, the role is split on ' / ' and the list of IDs
    is padded with None or truncated to match the number of roles; with
    a single role, both the role and its ID are passed as scalars.
    A list passed by the caller is never modified.
    """
    notes = u''
    role = u''
    # Search the (optional) separator between name and role/notes.
    if '....' in txt:
        sep = '....'
    elif '...' in txt:
        sep = '...'
    else:
        sep = '...'
        # Replace the first parenthesis, assuming there are only
        # notes, after.
        # Rationale: no imdbIndex is (ever?) showed on the web site.
        txt = txt.replace('(', '...(', 1)
    txt_split = txt.split(sep, 1)
    name = txt_split[0].strip()
    if len(txt_split) == 2:
        role_comment = txt_split[1].strip()
        # Strip common endings.
        if role_comment[-4:] == ' and':
            role_comment = role_comment[:-4].rstrip()
        elif role_comment[-2:] == ' &':
            role_comment = role_comment[:-2].rstrip()
        elif role_comment[-6:] == '& ....':
            role_comment = role_comment[:-6].rstrip()
        # Get the notes.
        cmt_idx = role_comment.find('(')
        if cmt_idx != -1:
            role = role_comment[:cmt_idx].rstrip()
            notes = role_comment[cmt_idx:]
        else:
            # Just a role, without notes.
            role = role_comment
    if role == '....':
        role = u''
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        role = role.split(' / ')
        lr = len(role)
        lrid = len(roleID)
        # Make the IDs match the roles one to one; a new list is built,
        # so that the list received from the caller is left untouched.
        if lr > lrid:
            roleID = roleID + [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    # XXX: return None if something strange is detected?
    return Person(name=name, personID=personID, currentRole=role,
                  roleID=roleID, notes=notes, billingPos=billingPos,
                  modFunct=modFunct, accessSystem=accessSystem)