def extract_movie_from_bdfci(pow: PieceOfWork, refresh_delay=31):
    """Search bdfci.info for *pow*'s title and, on an exact match, attach a BDFI link.

    :param pow: the work to enrich; mutated on success (link added, dtLastSearch
                updated, then saved).
    :param refresh_delay: cache refresh delay forwarded to load_page.
    :return: the '+'-joined title that was used as the search query.
    """
    title = pow.title.replace(" ", "+")
    search_url = ("https://www.bdfci.info/?q=" + title +
                  "&pa=f&d=f&page=search&src=bdfci&startFrom=1&offset=1")
    page = load_page(search_url, refresh_delay=refresh_delay)
    articles = page.find_all("article")
    url_ref = None
    if len(articles) == 0:
        # No result list: the search may have landed directly on a film page
        # whose <h1> is the title itself.
        entete = page.find("h1")
        if entete is not None:
            text_entete = entete.text.split("<")[0].lower()
            if text_entete == pow.title.lower():
                # BUGFIX: the original assigned the BeautifulSoup page object to
                # url_ref, which made pow.add_link() and the log() concatenation
                # below raise TypeError. Keep the URL string we fetched instead.
                url_ref = search_url
    else:
        url = articles[0].find("a")
        if url is not None and url.attrs["title"].lower() == str(pow.title).lower():
            url_ref = "https://www.bdfci.info" + url.attrs["href"]
    if url_ref is not None:
        pow.add_link(url_ref, "BDFI")
        log("Ajout du lien BDFCI:" + url_ref + " pour " + pow.title)
        pow.dtLastSearch = datetime.now()
        pow.save()
    return title
def extract_film_from_senscritique(title: str, refresh_delay=31):
    """Look *title* up on senscritique.com and return the film page URL.

    :param title: film title; matched case-insensitively against link texts.
    :param refresh_delay: cache refresh delay used when pre-loading the film page.
    :return: the film URL on an exact title match, otherwise None.
    """
    # BUGFIX: urllib.parse.urlencode() expects a mapping/sequence of pairs and
    # raises TypeError on a bare string; quote the query like the other
    # extractors in this module do.
    url = "https://www.senscritique.com/search?q=" + parse.quote(title.lower())
    log("Recherche sur sens-critique : " + url)
    pages = load_page(url, save=False)
    pages = pages.find_all("div", {"data-qa": "hits"})
    if len(pages) > 0:
        links = pages[0].find_all("a")
        url = ""
        for l in links:
            if "href" in l.attrs and l.attrs["href"].startswith(
                    "https://www.senscritique.com/film/"):
                if l.getText().lower() == title.lower():
                    url = l["href"]
                    log("Extraction de " + url)
                    # Pre-load (and cache) the film page itself before returning.
                    page = load_page(url, refresh_delay)
                    return url
    return None
def extract_profil_from_imdb(lastname: str, firstname: str):
    """Build an IMDB profile (photo, url, filmography links) for a person.

    NOTE(review): this definition is shadowed by a later function of the same
    name (with a refresh_delay parameter) further down in this file — in Python
    the later definition wins, so this version is effectively dead code.

    :param lastname: person's last name.
    :param firstname: person's first name.
    :return: dict with optional "photo"/"url" keys and a "links" list of
             {"url", "text", "job", "nature"} entries.
    """
    peoples = ia.search_person(firstname + " " + lastname)
    infos = dict()
    for p in peoples:
        name = p.data["name"].upper()
        # Keep only candidates whose display name contains both names.
        if firstname.upper() in name and lastname.upper() in name:
            if not "nopicture" in p.data["headshot"]:
                infos["photo"] = p.data["headshot"]
            if not "url" in infos:
                # First acceptable candidate fixes the profile URL.
                infos["url"] = "https://imdb.com/name/nm" + p.personID + "/"
    # NOTE(review): if no candidate matched, infos["url"] is missing and the
    # next line raises KeyError — confirm callers guarantee a match.
    log("Ouverture de " + infos["url"])
    page = load_page(infos["url"])
    film_zone = page.find("div", {"id": "filmography"})
    if film_zone is None:
        # Fall back to scanning the whole page when the filmography div moved.
        film_zone = page
    links = film_zone.findAll(
        'a', attrs={'href': wikipedia.re.compile("^/title/tt")})
    infos["links"] = []
    for l in links:
        # Only keep links that sit four levels under the filmography container
        # and have a meaningful (>3 chars) text.
        if len(
                l.getText()
        ) > 3 and l.parent.parent.parent.parent and l.parent.parent.parent.parent[
                "id"] == "filmography":
            texts = l.parent.parent.text.split("(")
            nature = "long"  # default nature when no parenthesised hint exists
            # The row id is e.g. "director-tt1234567": its prefix is the job.
            job: str = l.parent.parent.get("id").split("-")[0]
            if job == "miscellaneous" or len(job) == 0:
                # Fall back to the last parenthesised fragment of the row text.
                temp = l.parent.parent.text.split("(")
                job = temp[len(temp) - 1].split(")")[0]
                pass
            url = "https://www.imdb.com" + l.get("href")
            url = url.split("?")[0]  # strip tracking query string
            if len(texts) > 1:
                # Try to map the first parenthesised fragment to a known nature.
                nature = ""
                for nat in MOVIE_NATURE:
                    if nat.lower() in texts[1].lower():
                        nature = nat
                        break
                if nature == "":
                    log("Nature inconnue depuis " + texts[1] + " pour " + url)
            if len(texts) > 2 and len(job) == 0:
                job = texts[2].split(")")[0]
            infos["links"].append({
                "url": url,
                "text": l.getText(),
                "job": job,
                "nature": nature
            })
    return infos
def extract_profil_from_lefimlfrancais(firstname, lastname, refresh_delay=31):
    """Search lefilmfrancais.com for a person and collect their film links.

    :param firstname: person's first name.
    :param lastname: person's last name.
    :param refresh_delay: cache refresh delay forwarded to load_page.
    :return: dict with "url" (profile page) and "links" (film link dicts) when
             found; empty dict otherwise.
    """
    rc = dict()
    url = ("http://www.lefilmfrancais.com/index.php?option=com_papyrus&view=recherche&task=json&tmpl=rss&term="
           + firstname + "+" + lastname)
    data = load_json(url)
    # NOTE(review): `> 1` means a single-result response is ignored — confirm
    # whether the feed always carries a trailing/extra entry.
    if len(data) > 1:
        rc["url"] = data[0]["link"]
        page = load_page(rc["url"], refresh_delay=refresh_delay)
        rc["links"] = []
        for l in page.find_all("a"):
            # BUGFIX: anchors without an href raised KeyError on l.attrs["href"].
            if l.get("href", "").startswith("http://www.lefilmfrancais.com/film/"):
                rc["links"].append({
                    "text": l.text,
                    "url": l.attrs["href"],
                    "source": "LeFilmFrancais"
                })
    return rc
def extract_profil_from_imdb(lastname: str, firstname: str, refresh_delay=31):
    """Build an IMDB profile (photo, url, filmography links) for a person.

    Accent/punctuation-insensitive variant that supersedes the earlier
    definition of the same name in this file.

    :param lastname: person's last name.
    :param firstname: person's first name.
    :param refresh_delay: cache refresh delay forwarded to load_page.
    :return: dict with optional "photo"/"url" keys and a "links" list of
             {"url", "text", "job", "nature"} entries.
    """
    peoples = ia.search_person(
        remove_accents(firstname) + " " + remove_accents(lastname))
    infos = dict()
    for p in peoples:
        # Normalise the candidate name before the containment test.
        name = remove_accents(remove_ponctuation(p.data["name"].upper()))
        if firstname.upper() in name and lastname.upper() in name:
            if not "nopicture" in p.data["headshot"]:
                infos["photo"] = p.data["headshot"]
            if not "url" in infos:
                # First acceptable candidate fixes the profile URL.
                infos["url"] = "https://imdb.com/name/nm" + p.personID + "/"
    # NOTE(review): if no candidate matched, infos["url"] is missing and the
    # next line raises KeyError — confirm callers guarantee a match.
    log("Ouverture de " + infos["url"])
    page = load_page(infos["url"], refresh_delay=refresh_delay)
    film_zone = page.find("div", {"id": "filmography"})
    if film_zone is None:
        # Fall back to scanning the whole page when the filmography div moved.
        film_zone = page
    # Collects every link that points to a work (title pages).
    infos["links"] = []
    links = film_zone.findAll(
        'a', attrs={'href': wikipedia.re.compile("^/title/tt")})
    for l in links:
        # Only keep links four levels under the filmography container with a
        # meaningful (>3 chars) text.
        if len(
                l.getText()
        ) > 3 and l.parent.parent.parent.parent and l.parent.parent.parent.parent[
                "id"] == "filmography":
            texts = l.parent.parent.text.split("(")
            nature = "long"  # default nature when no parenthesised hint exists
            # The row id is e.g. "director-tt1234567": its prefix is the job.
            job: str = l.parent.parent.get("id").split("-")[0]
            if job == "miscellaneous" or len(job) == 0:
                # Fall back to the last parenthesised fragment of the row text.
                temp = l.parent.parent.text.split("(")
                job = temp[len(temp) - 1].split(")")[0]
                pass
            else:
                if not in_dict(job, "jobs"):
                    job = ""
            url = "https://www.imdb.com" + l.get("href")
            url = url.split("?")[0]  # strip tracking query string
            # NOTE(review): job and nature are computed above but the entry is
            # appended with empty strings — looks unfinished; behavior kept.
            infos["links"].append({
                "url": url,
                "text": l.getText(),
                "job": "",
                "nature": ""
            })
    return infos
def extract_profil_from_unifrance(name="céline sciamma", refresh_delay=31):
    """Search unifrance.org for a person and collect photo + film links.

    :param name: full name to search for.
    :param refresh_delay: cache refresh delay forwarded to load_page.
    :return: {"links": [...], "photo": str, "url": str} on a confirmed name
             match, otherwise None.
    """
    page = load_page(
        "https://www.unifrance.org/recherche/personne?q=$query&sort=pertinence"
        .replace("$query", parse.quote(name)),
        refresh_delay=refresh_delay)
    links = page.findAll(
        'a',
        attrs={
            'href':
            wikipedia.re.compile(
                "^https://www.unifrance.org/annuaires/personne/")
        })
    rc = list()
    if len(links) > 0:
        # Open the first candidate profile page directly (no cache layer).
        u = links[0].get("href")
        page = wikipedia.BeautifulSoup(
            wikipedia.requests.get(u, headers={
                'User-Agent': 'Mozilla/5.0'
            }).text, "html5lib")
        # Confirm the profile really is the requested person before scraping.
        if equal_str(name, page.title.text.split("-")[0]) or equal_str(
                name, links[0].text.split("Activités : ")[0]):
            photo = ""
            _photo = page.find('div',
                               attrs={'class': "profil-picture pull-right"})
            if _photo is not None:
                photo = _photo.find("a").get("href")
            links_film = page.findAll(
                'a',
                attrs={
                    'href':
                    wikipedia.re.compile(
                        "^https://www.unifrance.org/film/[0-9][0-9]*/")
                })
            for l in links_film:
                # BUGFIX: l.get("text") reads an HTML attribute named "text",
                # which these anchors don't have (always None); use the link's
                # visible text instead.
                rc.append({
                    "url": l.get("href"),
                    "text": l.getText(),
                    "nature": ""
                })
            return {"links": rc, "photo": photo, "url": u}
    return None
def extract_film_from_unifrance(url: str, job_for=None):
    """Scrape a unifrance.org film page into a metadata dict.

    NOTE(review): this definition is shadowed by a later, richer function of
    the same name further down in this file; the later definition wins.

    :param url: film page URL, or a bare title (then resolved via search).
    :param job_for: optional person URL; when given, their job on the film is
                    looked up and stored under "job".
    :return: dict with optional keys title/visual/real/nature/year/category/
             job/synopsis; empty dict when the title cannot be resolved.
    """
    rc = dict()
    if not url.startswith("http"):
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" +
                         parse.quote(url))
        _link = page.find("a",
                          attrs={
                              'href':
                              wikipedia.re.compile(
                                  "^https://www.unifrance.org/film/[0-9][0-9]")
                          })
        if _link is None:
            return rc
        url = _link.get("href")
    page = load_page(url)
    _title = page.find('h1', attrs={'itemprop': "name"})
    if _title is not None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])
    # The poster lives in the section whose <h1> mentions "Affiches".
    for title in page.findAll('h1'):
        if "Affiches" in title.text:
            section = title.parent
            _img = section.find("img", attrs={'itemprop': "image"})
            if _img is not None:
                src: str = _img.get("src")
                if not src.startswith("/ressource"):
                    rc["visual"] = src
                    log("Enregistrement de l'affiche " + src)
    _real = page.find("div", attrs={"itemprop": "director"})
    if _real is not None:
        rc["real"] = _real.find("a", attrs={"itemprop": "name"}).get("href")
    # The first details_bloc without a colon is the nature (e.g. "Long métrage").
    idx_div = 0
    for div in page.findAll("div", attrs={'class': "details_bloc"}):
        if idx_div == 0:
            if not ":" in div.text:
                rc["nature"] = div.text
        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))
        idx_div = idx_div + 1
    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"
    if job_for is not None:
        # BUGFIX: rc["real"] raised KeyError when no director div was found;
        # use .get() so the casting fallback below still runs.
        if rc.get("real") == job_for:
            rc["job"] = "Réalisation"
        else:
            # Look the person up in the detailed casting section.
            section = page.find("section", {"id": "casting"})
            if section is not None:
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                for idx in range(len(paras)):
                    links = paras[idx].findAll("a")
                    for l in links:
                        if "/personne" in l.get("href"):
                            if l.get("href") == job_for:
                                rc["job"] = jobs[idx].text.replace(" : ", "")
                                break
                if not "job" in rc:
                    pass
    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if _synopsis is not None:
        rc["synopsis"] = _synopsis.getText(strip=True)
    return rc
def extract_film_from_imdb(
        url: str,
        title: str,
        name="",
        job="",
):
    """Scrape an IMDB title page (legacy layout) into a metadata dict.

    NOTE(review): this definition is shadowed by a later function of the same
    name (with all_casting/refresh_delay parameters) further down this file.

    :param url: IMDB title URL (trailing slash expected: "fullcredits" is appended).
    :param title: initial title; may be expanded for episodes.
    :param name: person whose job on the film should be extracted.
    :param job: fallback job when none is found in the credits.
    :return: dict with title/nature/category and optional visual/year/synopsis/job.
    """
    page = load_page(url)
    rc = dict({"title": title, "nature": translate("film")})
    zone_info = page.find("div", {"class": "title_block"})
    if title.startswith("Episode") or "Episode" in zone_info.getText():
        # Episode pages: prefix with the series title and append SxEy if found.
        section_title = page.find("div", {"class": "titleParent"})
        if section_title is not None:
            title = section_title.find("a").text + " " + title
        rc["nature"] = MOVIE_NATURE[0]
        zone_info_comp = page.find("div",
                                   {"class": "button_panel navigation_panel"})
        if zone_info_comp is not None and "Season" in zone_info_comp.getText():
            extract_text = "S" + zone_info_comp.getText().split(
                "Season")[1].replace("Episode ", "E").replace(" | ",
                                                              "").replace(" ", "")
            rc["title"] = title + " " + extract_text.split("\n")[0]
    for cat in MOVIE_CATEGORIES:
        if cat.lower() in zone_info.getText().lower():
            rc["category"] = cat
    if not "category" in rc:
        rc["category"] = "Inconnue"
        log("Pas de categorie pour " + url)
    affiche = page.find("div", attrs={"class": "poster"})
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")
    # Year: try the page title first, then the info block.
    try:
        rc["year"] = re.search('[1-2][0-9][0-9][0-9]',
                               page.title.text).group(0)
    except Exception:
        try:
            rc["year"] = re.search('[1-2][0-9][0-9][0-9]',
                                   zone_info.getText()).group(0)
        except Exception:
            pass
    summary_section = page.find("div", attrs={"class": "summary_text"})
    if summary_section is not None and not "Add a Plot" in summary_section.text:
        rc["synopsis"] = summary_section.text.replace("\n", "").strip()
    log("Recherche du role sur le film")
    credits = load_page(url + "fullcredits")
    if credits is not None:
        credits = credits.find("div", {"id": "main"})
        if credits is not None:
            links = credits.find_all("a")
            for l in links:
                if name.upper() in l.text.upper():
                    parent = l.parent.parent.find("td", {"class": "credit"})
                    if parent is not None:
                        rc["job"] = str(parent.getText().replace("\n",
                                                                 "")).strip()
                        rc["job"] = rc["job"].split("(")[0]
                        # BUGFIX: the original whitespace-collapse loop
                        # (while " " in job: replace(" ", " ")) never terminated
                        # once any space was present; collapse runs in one pass.
                        rc["job"] = " ".join(rc["job"].split())
                        break
    if not "job" in rc:
        rc["job"] = job
    return rc
def extract_film_from_imdb(url: str,
                           title: str,
                           name="",
                           job="",
                           all_casting=False,
                           refresh_delay=31):
    """Scrape an IMDB title page (data-testid layout) into a metadata dict.

    :param url: IMDB title URL, or a bare title resolved via the /find page.
    :param title: initial title (punctuation removed before use).
    :param name: person whose job on the film should be extracted from fullcredits.
    :param job: fallback job when none is found.
    :param all_casting: when True, collect the full casting list instead of
                        stopping at the first name match.
    :param refresh_delay: cache refresh delay forwarded to load_page.
    :return: metadata dict, or None when the title cannot be resolved/parsed.
    """
    if not url.startswith("http"):
        # Resolve a bare title through the IMDB search page.
        page = load_page("https://www.imdb.com/find?s=tt&q=" +
                         parse.quote(url))
        bFind = False
        for link in page.find_all("a"):
            if link and equal_str(
                    link.text, url) and link["href"].startswith("/title/tt"):
                url = "https://www.imdb.com" + link["href"]
                bFind = True
                break
        if not bFind:
            log(url + " introuvable sur IMDB")
            return None
    page = load_page(url, refresh_delay)
    title = remove_ponctuation(title)
    rc = dict({
        "title": title,
        "nature": "",
        "casting": list(),
        "url": url,
        "source": "auto:IMDB"
    })
    # Index every element that carries a data-testid attribute, and try to
    # derive the nature from element texts along the way.
    divs = dict()
    elts = page.find_all("div", recursive=True) + page.find_all(
        "h1", recursive=True) + page.find_all(
            "ul", recursive=True) + page.find_all("p") + page.find_all("li")
    for div in elts:
        s = div.text
        s_t = translate(s)
        if s_t in MOVIE_NATURE:
            rc["nature"] = s_t
        # NOTE(review): `or` binds looser than `and`, so "1h..." strings pass
        # regardless of the endswith/empty-nature conditions — confirm intent.
        if s.startswith("1h") or s.startswith("2h") and s.endswith(
                "m") and len(rc["nature"]) == 0:
            rc["nature"] = translate("long")
        if "data-testid" in div.attrs:
            divs[div.attrs["data-testid"]] = div
    # Determine nature and category.
    if not "genres" in divs:
        elt = page.find("li", {
            "role": "presentation",
            "class": "ipc-inline-list__item"
        })
        if elt is not None:
            cat = elt.text
        else:
            cat = "inconnu"
    else:
        cat = ""
        for div in divs["genres"]:
            cat = cat + translate(div.text.lower()) + " "
    if cat.split(" ")[0] in MOVIE_NATURE:
        # The first word was actually the nature, not a genre.
        rc["nature"] = cat.split(" ")[0]
        cat = cat.replace(rc["nature"], "").strip()
    rc["category"] = cat.strip()
    try:
        title = divs["hero-title-block__title"].text
        year = divs["hero-title-block__metadata"].text
        if year is not None:
            rc["year"] = re.search(r"(\d{4})", year).group(1)
    except:
        # Missing hero block: page layout not understood, give up.
        log("Erreur sur title=" + title)
        return None
    affiche = divs["hero-media__poster"]
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")
    rc["synopsis"] = ""
    if "plot" in divs:
        rc["synopsis"] = divs["plot"].text.replace("Read all", "")
    # Look the person's job up in the full credits page.
    credits = load_page(url + "fullcredits", refresh_delay)
    if credits is not None:
        credits = credits.find("div", {"id": "fullcredits_content"})
        if credits is not None:
            # Each <h4> is a job section heading matching the table at the
            # same index.
            sur_jobs = credits.find_all("h4")
            tables = credits.find_all("table")
            for i in range(0, len(tables)):
                trs = tables[i].find_all("tr")
                for tr in trs:
                    tds = tr.find_all("td")
                    if len(tds) > 1:
                        findname = tds[0].text.replace("\n", "").replace(
                            " ", " ").strip()
                        if len(findname) == 0:
                            findname = tds[1].text.replace("\n", "").replace(
                                " ", " ").strip()
                        if len(findname) > 0:
                            if equal_str(findname, name):
                                sur_job = sur_jobs[i].text.replace(
                                    "\n", " ").strip()
                                if "Cast" in sur_job or "Serie Cast" in sur_job:
                                    # Cast section: "Self" rows get no job.
                                    if len(tds) > 3 and "Self" in tds[3].text:
                                        job = ""
                                    else:
                                        job = "Actor"
                                else:
                                    job = tds[len(tds) - 1].text.split(
                                        "(")[0].split("/")[0].strip()
                                    if len(job) == 0 and len(
                                            sur_jobs[i].text) > 0:
                                        job = sur_job.replace(" by", "").strip()
                                job = job.split("\n")[0]
                                rc["job"] = translate(job)
                                if len(job) == 0:
                                    log("Job non identifié pour " + name +
                                        " sur " + url)
                                else:
                                    if not all_casting:
                                        break
                            else:
                                if all_casting:
                                    # NOTE(review): tds[0] is a bs4 Tag, which
                                    # has no .split — this line likely raises;
                                    # probably meant tds[0].text.split(" ").
                                    names = tds[0].split(" ")
                                    rc["casting"].append({
                                        "name": " ".join(names),
                                        "source": "imdb",
                                        "job": job
                                    })
    if not "job" in rc:
        rc["job"] = job
    return rc
def extract_awards_from_imdb(profil_url, profil):
    """Scrape a person's IMDB awards page and persist Award records.

    For each award row, the referenced film is matched (by title and year)
    against existing PieceOfWork records; the festival is created on demand
    and an Award row is saved if it does not exist yet.

    :param profil_url: base IMDB profile URL ("awards?ref_=nm_awd" is appended).
    :param profil: the Profil model instance the awards belong to.
    :return: None (side effects on the database only).
    """
    page = load_page(profil_url + "awards?ref_=nm_awd")
    # Each <h3> (after the page heading) names the festival of the table
    # at the same index.
    awards = page.find_all("h3")
    if len(awards) > 0:
        awards.pop(0)
    tables = page.find_all("table", {"class": "awards"})
    for i in range(0, len(tables)):
        for tr in tables[i].find_all("tr"):
            if tr:
                festival_title = translate(
                    awards[i].text.split(",")[0].lower().strip())
                tds = tr.find_all("td")
                if len(tds) <= 2:
                    log("Format non conforme " + tr.text)
                else:
                    year = tds[0].text.replace("\n", "").replace(" ", "").strip()
                    award = tds[1].text
                    film = tds[2].find("a")
                    if film and award:
                        win = ("Winner" in award)
                        film_title = film.text
                        # BUGFIX: film_year was only assigned inside the "(" test,
                        # so rows without a year reused the previous row's value
                        # (or raised NameError on the first row). Reset per row.
                        film_year = ""
                        if "(" in tds[2].text:
                            film_year = tds[2].text.split("(")[1].split(")")[0]
                        # Local renamed from `pow` to avoid shadowing the builtin.
                        pows = PieceOfWork.objects.filter(
                            title__iexact=film_title, year__iexact=film_year)
                        if pows.exists():
                            movie = pows.first()
                            f = Festival.objects.filter(
                                title__iexact=festival_title)
                            if f.exists():
                                f = f.first()
                            else:
                                f = Festival(title=festival_title)
                                f.save()
                            a = Award.objects.filter(pow__id=movie.id,
                                                     year=year,
                                                     festival__id=f.id,
                                                     profil__id=profil.id)
                            if a.exists():
                                a = a.first()
                            else:
                                # Clean the award label before saving.
                                award = award.replace("\n", "").replace(
                                    "Winner", "").replace("Nominee", "").strip()
                                if award.startswith("(") and ")" in award:
                                    award = award.split(")")[1]
                                a = Award(description=award,
                                          year=year,
                                          pow=movie,
                                          festival=f,
                                          profil=profil,
                                          winner=win)
                                try:
                                    a.save()
                                except Exception:
                                    # Best-effort: log and continue with the
                                    # remaining rows.
                                    log("!!Probleme d'enregistrement de l'award sur "
                                        + movie.title)
def extract_film_from_unifrance(url: str,
                                job_for=None,
                                all_casting=False,
                                refresh_delay=30):
    """Scrape a unifrance.org film page into a rich metadata dict.

    Supersedes the earlier, simpler definition of the same name in this file.

    :param url: film page URL, or a bare title resolved via the search page.
    :param job_for: person URL or name; when given, their job on the film is
                    looked up ("job" key) and prizes default to them.
    :param all_casting: when True, collect the whole casting list as well.
    :param refresh_delay: cache refresh delay forwarded to load_page.
    :return: metadata dict, or None when the title cannot be resolved.
    """
    rc = dict({"casting": [], "source": "auto:unifrance", "url": url})
    if not url.startswith("http"):
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" +
                         parse.quote(url),
                        refresh_delay=refresh_delay)
        _link = page.find("a",
                          attrs={
                              'href':
                              wikipedia.re.compile(
                                  "^https://www.unifrance.org/film/[0-9][0-9]")
                          })
        if _link is None:
            return None
        url = _link.get("href")
        rc["url"] = url
    page = load_page(url, refresh_delay)
    _title = page.find('h1', attrs={'itemprop': "name"})
    if not _title is None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])
    # The poster lives in the section whose <h1> starts with "Affiches".
    for title in page.findAll('h1'):
        if title.text.startswith("Affiches"):
            section = title.parent
            _img = section.find("img", attrs={'itemprop': "image"})
            if not _img is None:
                src: str = _img.get("src")
                if not src.startswith("/ressource"):
                    rc["visual"] = src
                    log("Enregistrement de l'affiche " + src)
    _real = page.find("div", attrs={"itemprop": "director"})
    if not _real is None and not _real.find("a", attrs={"itemprop": "name"
                                                        }) is None:
        rc["real"] = _real.find("a", attrs={"itemprop": "name"}).get("href")
    # The first details_bloc without a colon is the nature; the others carry
    # labelled fields (visa, languages, year, genres).
    idx_div = 0
    for div in page.findAll("div", attrs={'class': "details_bloc"}):
        if idx_div == 0:
            if not ":" in div.text:
                rc["nature"] = div.text
        if "Numéro de visa" in div.text:
            rc["visa"] = div.text.split(" : ")[1].replace(".", "")
        if "Langues de tournage" in div.text:
            rc["langue"] = div.text.split(" : ")[1]
        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))
        idx_div = idx_div + 1
    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"
    # Collect prizes ("palmares") from the distinction blocks.
    rc["prix"] = []
    for section_prix in page.find_all("div",
                                      attrs={"class": "distinction palmares"}):
        if len(section_prix.find_all("div")) > 0:
            content = section_prix.find_all("div")[1].text
            if content is not None:
                content = content.replace("PlusMoins", "")
                _prix = {
                    "description": content.split(")Prix")[1].split(" : ")[0]
                }
                for l in section_prix.find_all("div")[1].find_all("a"):
                    if "festivals" in l.attrs["href"]:
                        _prix["title"] = l.text.split("(")[0]
                        _prix["year"] = re.findall(r"[1-2][0-9]{3}",
                                                   l.text)[0]
                    if "person" in l.attrs["href"] and "profil" not in _prix:
                        _prix["profil"] = index_string(l.text)
                if not "profil" in _prix:
                    # No laureate link on the page: credit the requested person.
                    # NOTE(review): raises TypeError if job_for is None here.
                    log("Attribution du prix à " + job_for)
                    _prix["profil"] = index_string(job_for)
                if "year" in _prix and "title" in _prix:
                    rc["prix"].append(_prix)
                    log("Ajout du prix " + str(_prix))
                else:
                    log("!Prix non conforme sur " + url)
    if not job_for is None:
        # First try: the description paragraph's first link is the director.
        real_links = page.find("div", {
            "id": "description"
        }).find("p").find_all("a")
        if len(real_links) > 0 and equal_str(real_links[0].text, job_for):
            rc["job"] = translate("Réalisation")
        else:
            # Search among the director block.
            section = page.find("div", {"itemprop": "director"})
            if section and (job_for.lower() in section.text.lower()):
                rc["job"] = translate("Réalisation")
            # Search the detailed casting section.
            section = page.find("section", {"id": "casting"})
            if not section is None:
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                for idx in range(len(paras)):
                    links = paras[idx].findAll("a")
                    for l in links:
                        job = jobs[idx].text.replace(":", "").strip()
                        if "/personne" in l.get("href"):
                            if (job_for.startswith("http")
                                    and l.get("href") == job_for) or equal_str(
                                        job_for, l.text):
                                rc["job"] = job
                                break
                            else:
                                if all_casting:
                                    # Add the whole casting to the system.
                                    names = str(l.getText()).split(" ")
                                    lastname = names[len(names) - 1]
                                    rc["casting"].append({
                                        "lastname":
                                        lastname,
                                        "url":
                                        l.attrs["href"],
                                        "source":
                                        "unifrance",
                                        "firstname":
                                        l.getText().replace(lastname,
                                                            "").strip(),
                                        "job":
                                        job
                                    })
            # Search among the actors.
            for actor in page.find_all("div", {"itemprop": "actors"}):
                if "data-title" in actor.attrs:
                    if actor.attrs["data-title"].lower() == job_for.lower():
                        rc["job"] = "actor"
    if not "job" in rc:
        pass
    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if not _synopsis is None:
        rc["synopsis"] = _synopsis.getText(strip=True)
    return rc
def extract_film_from_leFilmFrancais(url: str,
                                     job_for=None,
                                     all_casting=False,
                                     refresh_delay=30,
                                     bot=None):
    """Scrape a lefilmfrancais.com film page into a metadata dict.

    :param url: film page URL, or a bare title resolved via the search page.
    :param job_for: unused here (kept for signature parity with the other
                    extract_film_* functions).
    :param all_casting: unused here (same reason).
    :param refresh_delay: unused here (same reason).
    :param bot: optional session/bot forwarded to load_page.
    :return: metadata dict, or None when the title cannot be resolved.
    """
    rc = dict({
        "nature": "",
        "title": "",
        "source": "auto:LeFilmFrancais",
        "url": url
    })
    if not url.startswith("http"):
        # Resolve a bare title through the site's search page.
        page = load_page(
            "http://www.lefilmfrancais.com/index.php?option=com_papyrus&view=recherche&searchword="
            + parse.quote(url))
        bFind = False
        fiche_film = page.find("div", {"id": "fiche_film"})
        if fiche_film:
            for l in fiche_film.find_all("a"):
                if l and l["href"].startswith(
                        "http://www.lefilmfrancais.com/film/"):
                    url = l["href"]
                    bFind = True
                    break
        if not bFind:
            return None
    page = load_page(url, bot=bot)
    if page.find("div", {"id": "synopsis"}):
        rc["synopsis"] = remove_html(page.find("div", {"id": "synopsis"}).text)
    elts = page.find_all("h1")
    if len(elts) > 0:
        rc["title"] = elts[0].text.split("(")[0]
    # NOTE(review): this block inspects the detail container's text nodes but
    # does nothing with them — looks like unfinished code.
    elt = page.find("div", {"id": "detail"})
    if elt:
        for item in elt:
            if item.name is None:
                if "sortie" in item.lower():
                    pass
    # Labelled <span> fields: the "realisation" class carries the nature,
    # every other span is a "Label: value" pair.
    for span in page.find_all("span"):
        if "class" in span.attrs and len(
                span.attrs["class"]
        ) > 0 and span.attrs["class"][0] == "realisation":
            if not "Réalisation" in span.text.split(",")[0]:
                rc["nature"] = span.text.split(",")[0].split("(")[0]
        else:
            if ":" in span.text:
                val = span.text.split(":")[1].strip()
                if "Visa" in span.text:
                    rc["visa"] = val
                if "Titre original" in span.text:
                    rc["original_title"] = val
                if "Réalisation" in span.text:
                    rc["real"] = val
                if "Sortie" in span.text:
                    rc["sortie"] = val
                if "copies" in span.text:
                    # NOTE(review): int(val) raises ValueError on non-numeric
                    # text — confirm the site always emits a bare number here.
                    rc["copies"] = int(val)
                if "Nationalité" in span.text:
                    rc["Nationality"] = val
                if "Distribution France" in span.text:
                    rc["distribution"] = val
    # Labelled <li> fields (production, partners, prizes, press).
    for item in page.find_all("li"):
        lab = item.text.split(":")[0]
        if ":" in item.text:
            val = item.text.split(":")[1].split("|")[0].strip()
            if "production :" in lab:
                rc["production"] = val
            if "Partenaires" in lab:
                rc["financial"] = val
            if "Récompense" in lab:
                rc["prix"] = val
            if "Presse" in lab:
                rc["presse"] = val
    if "title" in rc:
        log("Extraction de " + rc["title"] + " : " + str(rc))
    return rc