# NOTE (assumption): the imports below are reconstructed from usage in this
# file; the model/helper import paths are inferred, not confirmed by the
# source — adjust them to the project's actual layout.
import base64
import csv
import re
from datetime import datetime
from io import StringIO
from urllib import parse

import wikipedia  # used for wikipedia.re.compile on the search pages
from django.forms.models import model_to_dict
from rest_framework.response import Response

from .models import Award, Festival, PieceOfWork, Profil, Work
from .utils import (MOVIE_CATEGORIES, MOVIE_NATURE, create_article, equal_str,
                    extract_film_from_leFilmFrancais,
                    extract_film_from_senscritique, extract_text_from_pdf,
                    fusion, index_string, load_page, log, remove_accents,
                    remove_ponctuation, translate)


def update_dictionnary(request):
    # Re-apply `translate` to every stored job and category so that records
    # created before a dictionary update get normalized.
    for w in Work.objects.all():
        job = translate(w.job)
        if job != w.job:
            log("Traitement de " + str(w.job))
            w.job = job
            w.save()
    for p in PieceOfWork.objects.all():
        category = translate(p.category)
        if category != p.category:
            p.category = category
            p.save()
    return Response({"message": "ok"})

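# `translate` and `log` are project helpers, not shown in this file. A
# minimal sketch of the behaviour this endpoint relies on (hypothetical
# names and mapping, for illustration only):
#
#   def translate(label: str) -> str:
#       mapping = {"réalisateur": "Réalisation", "comédien": "Actor"}
#       return mapping.get(label.strip().lower(), label)
#
# Records are rewritten only when the translation differs, so (assuming
# `translate` is idempotent) running the endpoint twice is a no-op.
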
def dict_to_pow(film: dict, content=None):
    # Build a PieceOfWork instance from a scraped film dictionary.
    pow = PieceOfWork(title=film["title"], title_index=index_string(film["title"]))
    pow.add_link(url=film["url"], title=film["source"])
    if content is not None and content["senscritique"]:
        pow.add_link(extract_film_from_senscritique(film["title"]), title="Sens-critique")
    # Copy every model field that has a matching key in the scraped dict.
    for k in list(model_to_dict(pow).keys()):
        if k in film:
            setattr(pow, k, film[k])
    if "nature" in film:
        pow.nature = translate(film["nature"])
    else:
        pow.nature = "Film"
    if "category" in film:
        pow.category = translate(film["category"])
    if "synopsis" in film:
        pow.description = film["synopsis"]
    return pow

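# Hypothetical usage sketch for dict_to_pow; the dict shape is inferred from
# the extractors below ("title", "url" and "source" are the keys this code
# path requires, the rest are optional):
#
#   film = {"title": "Les Misérables", "url": "https://www.unifrance.org/film/...",
#           "source": "auto:unifrance", "nature": "long", "year": "2019"}
#   pow = dict_to_pow(film)
#   pow.save()
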
def add_pows_to_profil(profil, links, all_links, job_for):
    """
    Add works (pieces of work) to the profile.
    :param profil:
    :param links:
    :param all_links:
    :return:
    """
    for l in links:
        source = "auto"
        pow = None
        # Skip links that already belong to a known PieceOfWork.
        for p in PieceOfWork.objects.filter(title__iexact=l["text"]):
            for link in p.links:
                if l["url"] == link["url"]:
                    pow = p
                    break
        if not pow:
            # NOTE: assumes every link is a unifrance or imdb URL; otherwise
            # `film` is left unbound below.
            if "unifrance" in l["url"]:
                film = extract_film_from_unifrance(l["url"], job_for=job_for)
                source = "auto:unifrance"
            if "imdb" in l["url"]:
                film = extract_film_from_imdb(l["url"], l["text"],
                                              name=profil.firstname + " " + profil.lastname,
                                              job=l["job"])
                if "nature" not in film:
                    film["nature"] = l["nature"]
                source = "auto:IMDB"
            log("Traitement de " + film["title"] + " à l'adresse " + l["url"])
            pow = PieceOfWork(title=film["title"])
            pow.add_link(url=l["url"], title=source)
            if "nature" in film:
                pow.nature = translate(film["nature"])
            else:
                pow.nature = "Film"
            if "synopsis" in film:
                pow.description = film["synopsis"]
            if "visual" in film:
                pow.visual = film["visual"]
            if "category" in film:
                pow.category = translate(film["category"])
            if "year" in film:
                pow.year = film["year"]
            try:
                result = PieceOfWork.objects.filter(title__iexact=pow.title)
                if len(result) > 0:
                    log("Le film existe déjà dans la base, on le récupére")
                    pow = result.first()
                    pow.add_link(l["url"], source)
                pow.save()
                # TODO: to revisit, as page updates could make us miss films.
                # The code below should be dedented, but then the pow would
                # have to be looked up again.
                job = profil.job
                if "job" in film:
                    job = film["job"]
            except Exception as inst:
                log("Impossible d'enregistrer le film: " + str(inst.args))
        else:
            job = l["job"]
        t_job = translate(job)
        if not Work.objects.filter(pow_id=pow.id, profil_id=profil.id, job=t_job).exists():
            log("Ajout de l'experience " + job + " traduit en " + t_job +
                " sur " + pow.title + " à " + profil.lastname)
            work = Work(pow=pow, profil=profil, job=t_job, source=source)
            work.save()

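# NOTE: a more complete version of add_pows_to_profil (deduplication on
# title_index, LeFilmFrancais support, awards and casting import) is defined
# later in this file; if both definitions share a module, the later one
# silently replaces this one at import time.
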
def extract_film_from_unifrance(url: str, job_for=None):
    rc = dict()
    if not url.startswith("http"):
        # `url` is actually a title: go through the search page to find the film.
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" + parse.quote(url))
        _link = page.find("a", attrs={
            'href': wikipedia.re.compile("^https://www.unifrance.org/film/[0-9][0-9]")
        })
        if _link is None:
            return rc
        url = _link.get("href")

    # r = wikipedia.requests.get(url, headers={'User-Agent': 'Mozilla/5.0', "accept-encoding": "gzip, deflate"})
    # page = wikipedia.BeautifulSoup(str(r.content, encoding="utf-8"), "html5lib")
    page = load_page(url)

    _title = page.find('h1', attrs={'itemprop': "name"})
    if _title is not None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])

    # Poster: take the first non-placeholder image of the "Affiches" section.
    for title in page.findAll('h1'):
        if "Affiches" in title.text:
            section = title.parent
            _img = section.find("img", attrs={'itemprop': "image"})
            if _img is not None:
                src: str = _img.get("src")
                if not src.startswith("/ressource"):
                    rc["visual"] = src
                    log("Enregistrement de l'affiche " + src)

    _real = page.find("div", attrs={"itemprop": "director"})
    if _real is not None:
        rc["real"] = _real.find("a", attrs={"itemprop": "name"}).get("href")

    idx_div = 0
    for div in page.findAll("div", attrs={'class': "details_bloc"}):
        if idx_div == 0:
            if ":" not in div.text:
                rc["nature"] = div.text
        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))
        idx_div = idx_div + 1
    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"

    if job_for is not None:
        if rc["real"] == job_for:
            rc["job"] = "Réalisation"
        else:
            section = page.find("section", {"id": "casting"})
            if section is not None:
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                # if not "personne" in links[0].href: links.remove(0)
                for idx in range(len(paras)):
                    links = paras[idx].findAll("a")
                    for l in links:
                        if "/personne" in l.get("href"):
                            if l.get("href") == job_for:
                                rc["job"] = jobs[idx].text.replace(" : ", "")
                                break
        if "job" not in rc:
            pass  # no role identified for job_for on this page

    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if _synopsis is not None:
        rc["synopsis"] = _synopsis.getText(strip=True)

    return rc

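# NOTE: like add_pows_to_profil, extract_film_from_unifrance is redefined
# further down with extra fields (visa number, shooting languages, awards,
# full casting) and a refresh_delay parameter; the later definition wins.
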
def extract_film_from_imdb(url: str, title: str, name="", job=""):
    """
    :return:
    """
    page = load_page(url)
    rc = dict({"title": title, "nature": translate("film")})
    zone_info = page.find("div", {"class": "title_block"})

    if title.startswith("Episode") or "Episode" in zone_info.getText():
        section_title = page.find("div", {"class": "titleParent"})
        if section_title is not None:
            title = section_title.find("a").text + " " + title
        # Look up the episode number
        rc["nature"] = MOVIE_NATURE[0]
        zone_info_comp = page.find("div", {"class": "button_panel navigation_panel"})
        if zone_info_comp is not None and "Season" in zone_info_comp.getText():
            extract_text = "S" + zone_info_comp.getText().split("Season")[1] \
                .replace("Episode ", "E").replace(" | ", "").replace(" ", "")
            rc["title"] = title + " " + extract_text.split("\n")[0]

    for cat in MOVIE_CATEGORIES:
        if cat.lower() in zone_info.getText().lower():
            rc["category"] = cat
    if "category" not in rc:
        rc["category"] = "Inconnue"
        log("Pas de categorie pour " + url)

    affiche = page.find("div", attrs={"class": "poster"})
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")

    try:
        rc["year"] = re.search('[1-2][0-9][0-9][0-9]', page.title.text).group(0)
    except:
        try:
            rc["year"] = re.search('[1-2][0-9][0-9][0-9]', zone_info.getText()).group(0)
        except:
            pass

    summary_section = page.find("div", attrs={"class": "summary_text"})
    if summary_section is not None and "Add a Plot" not in summary_section.text:
        rc["synopsis"] = summary_section.text.replace("\n", "").strip()

    log("Recherche du role sur le film")
    credits = load_page(url + "fullcredits")
    if credits is not None:
        credits = credits.find("div", {"id": "main"})
        if credits is not None:
            links = credits.find_all("a")
            for l in links:
                if name.upper() in l.text.upper():
                    parent = l.parent.parent.find("td", {"class": "credit"})
                    if parent is not None:
                        rc["job"] = str(parent.getText().replace("\n", "")).strip()
                        rc["job"] = rc["job"].split("(")[0]
                        # Collapse runs of spaces (the doubled space in the
                        # pattern is intentional).
                        while "  " in rc["job"]:
                            rc["job"] = rc["job"].replace("  ", " ")
                        break

    if "job" not in rc:
        rc["job"] = job

    return rc

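# This first IMDB extractor targets the legacy IMDB markup (title_block,
# titleParent, summary_text CSS classes); the second version further down
# reads the newer data-testid based layout instead, so only one of the two
# matches a given page snapshot.
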
def add_pows_to_profil(profil, links, job_for, refresh_delay_page,
                       templates=[], bot=None, content=None):
    """
    Add works (pieces of work) to the profile.
    :param profil:
    :param links:
    :param job_for:
    :return: (number of new films, number of new works, generated articles)
    """
    n_films = 0
    n_works = 0
    articles = list()
    job_for = remove_accents(remove_ponctuation(job_for))
    for l in links:
        source = "auto"
        film = None
        pow = None
        job = l["job"] if "job" in l else ""

        # for p in PieceOfWork.objects.filter(title__iexact=l["text"]):
        #     # if the source has already been analyzed, do nothing
        #     for link in p.links:
        #         if l["url"] == link["url"]:
        #             pow = p
        #             break

        if "unifrance" in l["url"]:
            film = extract_film_from_unifrance(l["url"], job_for=job_for,
                                               refresh_delay=refresh_delay_page)
        if "source" in l and "LeFilmFrancais" in l["source"]:
            film = extract_film_from_leFilmFrancais(l["url"], job_for=job_for,
                                                    refresh_delay=refresh_delay_page,
                                                    bot=bot)
        if "imdb" in l["url"]:
            film = extract_film_from_imdb(l["url"], l["text"],
                                          name=profil.firstname + " " + profil.lastname,
                                          job=l["job"],
                                          refresh_delay=refresh_delay_page)

        # News items and entries without a nature are not pieces of work.
        if film and (film["category"] == "News" or len(film["nature"]) == 0):
            log("Ce type d'événement est exlue :" + str(film))
            film = None

        if film is not None:
            if "nature" not in film:
                film["nature"] = l["nature"]
            if "title" in film:
                log("Traitement de " + film["title"] + " à l'adresse " + l["url"])
                pow = dict_to_pow(film, content)
                job = profil.job
                if "job" in film:
                    job = film["job"]
                try:
                    result = PieceOfWork.objects.filter(title_index__iexact=pow.title_index)
                    if len(result) > 0:
                        for p in result:
                            # Tolerate a one-year gap on the production year.
                            if abs(int(p.year) - int(pow.year)) <= 1:
                                log("Le film existe déjà dans la base, on le met a jour avec les nouvelles données")
                                pow, hasChanged = fusion(p, pow)
                                if hasChanged:
                                    pow.dtLastSearch = datetime.now()
                                    pow.save()
                    else:
                        n_films = n_films + 1
                        pow.dtLastSearch = datetime.now()
                        pow.save()
                        # TODO: to revisit, as page updates could make us miss
                        # films. The code below should be dedented, but then
                        # the pow would have to be looked up again.
                except Exception as inst:
                    log("Impossible d'enregistrer le film: " + str(inst.args))
            else:
                log("Impossible de retrouver le film" + str(film))

        if pow is not None:
            # Record the awards attached to the film.
            if film is not None and "prix" in film and film["prix"] is not None and len(film["prix"]) > 0:
                for prix in film["prix"]:
                    f = Festival.objects.filter(title__iexact=prix["title"])
                    if f.exists():
                        f = f.first()
                    else:
                        f = Festival(title=prix["title"].strip().lower())
                        f.save()
                    a = Award.objects.filter(pow__id=pow.id, year=int(prix["year"]),
                                             festival__id=f.id)
                    if a.exists():
                        a = a.first()
                    else:
                        desc = prix["description"][:249]
                        if desc.startswith("(") and ")" in desc:
                            desc = desc.split(")")[1]
                        a = Award(description=desc,
                                  year=prix["year"],
                                  pow=pow,
                                  festival=f,
                                  profil=None if "profil" not in prix else
                                  Profil.objects.filter(name_index__iexact=prix["profil"]).first())
                        try:
                            a.save()
                        except:
                            log("!!Probleme d'enregistrement de l'award sur " + pow.title)

            if job is None:
                job = ""
            t_job = translate(job)
            if len(t_job) == 0:
                if job_for and pow and pow.title:
                    log("!Job non identifié pour " + job_for + " sur " + pow.title)
                    # t_job = "Non identifié"
            else:
                if not Work.objects.filter(pow_id=pow.id, profil_id=profil.id,
                                           job=t_job).exists():
                    if len(t_job) > 0:
                        log("Ajout de l'experience " + job + " traduit en " + t_job +
                            " sur " + pow.title + " à " + profil.lastname)
                        work = Work(pow=pow, profil=profil, job=t_job, source=source)
                        try:
                            work.save()
                        except Exception as inst:
                            log("Impossible d'enregistrer le travail: " + str(inst.args))
                        if len(templates) > 0:
                            articles.append(create_article(profil, pow, work, templates[0]))
                else:
                    log("Pas d'enregistrement de la contribution job=" + job)

            # Record the casting
            if film is not None and "casting" in film:
                for p in film["casting"]:
                    _ps = list(Profil.objects.filter(lastname=p["lastname"],
                                                     firstname=p["firstname"]))
                    if len(_ps) == 0:
                        log("Ajout de " + p["lastname"] + " comme externe en tant que " + p["job"])
                        _p = Profil(firstname=p["firstname"],
                                    lastname=p["lastname"],
                                    name_index=index_string(p["firstname"] + p["lastname"]),
                                    department="Ext", cursus="E", school="",
                                    email=p["firstname"] + "." + p["lastname"] + "@fictif")
                        _p.add_link(url=p["url"], title=p["source"])
                        _p.save()
                    else:
                        _p = _ps[0]
                    if not Work.objects.filter(pow_id=pow.id, profil_id=_p.id,
                                               job=p["job"]).exists():
                        work = Work(pow=pow, profil=_p, job=p["job"], source=source)
                        work.save()
                        n_works = n_works + 1

    return n_films, n_works, articles

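# Hypothetical caller sketch (argument names from the signature above):
#
#   n_films, n_works, articles = add_pows_to_profil(
#       profil, links,
#       job_for=profil.firstname + " " + profil.lastname,
#       refresh_delay_page=31)
#
# Each item of `links` is a dict with at least "url" and "text", plus the
# optional "job", "nature" and "source" keys read by the loop above.
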
def extract_film_from_imdb(url: str, title: str, name="", job="",
                           all_casting=False, refresh_delay=31):
    """
    :return:
    """
    if not url.startswith("http"):
        # `url` is actually a title: resolve it through the IMDB search page.
        page = load_page("https://www.imdb.com/find?s=tt&q=" + parse.quote(url))
        bFind = False
        for link in page.find_all("a"):
            if link and equal_str(link.text, url) and link["href"].startswith("/title/tt"):
                url = "https://www.imdb.com" + link["href"]
                bFind = True
                break
        if not bFind:
            log(url + " introuvable sur IMDB")
            return None

    page = load_page(url, refresh_delay)
    title = remove_ponctuation(title)
    rc = dict({"title": title, "nature": "", "casting": list(),
               "url": url, "source": "auto:IMDB"})

    # Index every element carrying a data-testid attribute.
    divs = dict()
    elts = page.find_all("div", recursive=True) + page.find_all("h1", recursive=True) \
        + page.find_all("ul", recursive=True) + page.find_all("p") + page.find_all("li")
    for div in elts:
        s = div.text
        s_t = translate(s)
        if s_t in MOVIE_NATURE:
            rc["nature"] = s_t
        # A running time such as "1h30m" marks a feature film (parentheses
        # added: "or" would otherwise bind after "and").
        if (s.startswith("1h") or s.startswith("2h")) and s.endswith("m") \
                and len(rc["nature"]) == 0:
            rc["nature"] = translate("long")
        if "data-testid" in div.attrs:
            divs[div.attrs["data-testid"]] = div

    # Find the nature and the category.
    if "genres" not in divs:
        elt = page.find("li", {"role": "presentation", "class": "ipc-inline-list__item"})
        if elt is not None:
            cat = elt.text
        else:
            cat = "inconnu"
    else:
        cat = ""
        for div in divs["genres"]:
            cat = cat + translate(div.text.lower()) + " "
        if cat.split(" ")[0] in MOVIE_NATURE:
            rc["nature"] = cat.split(" ")[0]
            cat = cat.replace(rc["nature"], "").strip()
    rc["category"] = cat.strip()

    try:
        title = divs["hero-title-block__title"].text
        year = divs["hero-title-block__metadata"].text
        if year is not None:
            rc["year"] = re.search(r"(\d{4})", year).group(1)
    except:
        log("Erreur sur title=" + title)
        return None

    affiche = divs.get("hero-media__poster")  # .get(): the poster block may be absent
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")

    rc["synopsis"] = ""
    if "plot" in divs:
        rc["synopsis"] = divs["plot"].text.replace("Read all", "")

    # log("Recherche du role sur le film")
    credits = load_page(url + "fullcredits", refresh_delay)
    if credits is not None:
        credits = credits.find("div", {"id": "fullcredits_content"})
        if credits is not None:
            sur_jobs = credits.find_all("h4")
            tables = credits.find_all("table")
            for i in range(0, len(tables)):
                trs = tables[i].find_all("tr")
                for tr in trs:
                    tds = tr.find_all("td")
                    if len(tds) > 1:
                        findname = tds[0].text.replace("\n", "").replace("  ", " ").strip()
                        if len(findname) == 0:
                            findname = tds[1].text.replace("\n", "").replace("  ", " ").strip()
                        if len(findname) > 0:
                            # log("Nom identifié " + findname)
                            if equal_str(findname, name):
                                sur_job = sur_jobs[i].text.replace("\n", " ").strip()
                                if "Cast" in sur_job or "Serie Cast" in sur_job:
                                    if len(tds) > 3 and "Self" in tds[3].text:
                                        job = ""
                                    else:
                                        job = "Actor"
                                else:
                                    job = tds[len(tds) - 1].text.split("(")[0].split("/")[0].strip()
                                    if len(job) == 0 and len(sur_jobs[i].text) > 0:
                                        job = sur_job.replace(" by", "").strip()
                                job = job.split("\n")[0]
                                rc["job"] = translate(job)
                                if len(job) == 0:
                                    log("Job non identifié pour " + name + " sur " + url)
                                else:
                                    if not all_casting:
                                        break
                            else:
                                if all_casting:
                                    # .text added: tds[0] is a Tag, not a string
                                    names = tds[0].text.split(" ")
                                    rc["casting"].append({"name": " ".join(names),
                                                          "source": "imdb",
                                                          "job": job})

    if "job" not in rc:
        rc["job"] = job

    return rc

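# The fullcredits parsing above relies on IMDB rendering one <h4> section
# heading ("Cast", "Produced by", ...) per credits <table>, with the job in
# the last cell of each row; if that layout changes, the lookup falls back
# to the `job` argument supplied by the caller.
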
def extract_awards_from_imdb(profil_url, profil):
    # Look up the awards page of the profile.
    page = load_page(profil_url + "awards?ref_=nm_awd")
    awards = page.find_all("h3")
    if len(awards) > 0:
        awards.pop(0)  # drop the page heading
    tables = page.find_all("table", {"class": "awards"})
    for i in range(0, len(tables)):
        for tr in tables[i].find_all("tr"):
            if tr:
                festival_title = translate(awards[i].text.split(",")[0].lower().strip())
                tds = tr.find_all("td")
                if len(tds) <= 2:
                    log("Format non conforme " + tr.text)
                else:
                    year = tds[0].text.replace("\n", "").replace(" ", "").strip()
                    award = tds[1].text
                    film = tds[2].find("a")
                    if film and award:
                        win = ("Winner" in award)
                        film_title = film.text
                        if "(" in tds[2].text:
                            film_year = tds[2].text.split("(")[1].split(")")[0]
                            pow = PieceOfWork.objects.filter(title__iexact=film_title,
                                                             year__iexact=film_year)
                            if pow.exists():
                                pow = pow.first()
                                f = Festival.objects.filter(title__iexact=festival_title)
                                if f.exists():
                                    f = f.first()
                                else:
                                    f = Festival(title=festival_title)
                                    f.save()
                                a = Award.objects.filter(pow__id=pow.id, year=year,
                                                         festival__id=f.id,
                                                         profil__id=profil.id)
                                if a.exists():
                                    a = a.first()
                                else:
                                    award = award.replace("\n", "").replace("Winner", "") \
                                        .replace("Nominee", "").strip()
                                    if award.startswith("(") and ")" in award:
                                        award = award.split(")")[1]
                                    a = Award(description=award, year=year, pow=pow,
                                              festival=f, profil=profil, winner=win)
                                    try:
                                        a.save()
                                    except:
                                        log("!!Probleme d'enregistrement de l'award sur " + pow.title)

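# The awards page is parsed positionally: after the page heading <h3> is
# dropped, h3[i] names the festival for table[i]. Awards are only recorded
# for films already present in PieceOfWork with a matching production year.
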
def extract_film_from_unifrance(url: str, job_for=None, all_casting=False,
                                refresh_delay=30):
    rc = dict({"casting": [], "source": "auto:unifrance", "url": url})
    if not url.startswith("http"):
        # `url` is actually a title: go through the search page to find the film.
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" + parse.quote(url),
                         refresh_delay=refresh_delay)
        _link = page.find("a", attrs={
            'href': wikipedia.re.compile("^https://www.unifrance.org/film/[0-9][0-9]")
        })
        if _link is None:
            return None
        url = _link.get("href")
        rc["url"] = url

    # r = wikipedia.requests.get(url, headers={'User-Agent': 'Mozilla/5.0', "accept-encoding": "gzip, deflate"})
    # page = wikipedia.BeautifulSoup(str(r.content, encoding="utf-8"), "html5lib")
    page = load_page(url, refresh_delay)

    _title = page.find('h1', attrs={'itemprop': "name"})
    if _title is not None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])

    for title in page.findAll('h1'):
        if title.text.startswith("Affiches"):
            section = title.parent
            _img = section.find("img", attrs={'itemprop': "image"})
            if _img is not None:
                src: str = _img.get("src")
                if not src.startswith("/ressource"):
                    rc["visual"] = src
                    log("Enregistrement de l'affiche " + src)

    _real = page.find("div", attrs={"itemprop": "director"})
    if _real is not None and _real.find("a", attrs={"itemprop": "name"}) is not None:
        rc["real"] = _real.find("a", attrs={"itemprop": "name"}).get("href")

    idx_div = 0
    for div in page.findAll("div", attrs={'class': "details_bloc"}):
        if idx_div == 0:
            if ":" not in div.text:
                rc["nature"] = div.text
        if "Numéro de visa" in div.text:
            rc["visa"] = div.text.split(" : ")[1].replace(".", "")
        if "Langues de tournage" in div.text:
            rc["langue"] = div.text.split(" : ")[1]
        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))
        idx_div = idx_div + 1
    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"

    # Collect the awards ("prix") listed in the palmares section.
    rc["prix"] = []
    for section_prix in page.find_all("div", attrs={"class": "distinction palmares"}):
        if len(section_prix.find_all("div")) > 0:
            content = section_prix.find_all("div")[1].text
            if content is not None:
                content = content.replace("PlusMoins", "")
                _prix = {"description": content.split(")Prix")[1].split(" : ")[0]}
                for l in section_prix.find_all("div")[1].find_all("a"):
                    if "festivals" in l.attrs["href"]:
                        _prix["title"] = l.text.split("(")[0]
                        _prix["year"] = re.findall(r"[1-2][0-9]{3}", l.text)[0]
                    if "person" in l.attrs["href"] and "profil" not in _prix:
                        _prix["profil"] = index_string(l.text)
                if "profil" not in _prix:
                    log("Attribution du prix à " + job_for)
                    _prix["profil"] = index_string(job_for)
                if "year" in _prix and "title" in _prix:
                    rc["prix"].append(_prix)
                    log("Ajout du prix " + str(_prix))
                else:
                    log("!Prix non conforme sur " + url)

    if job_for is not None:
        real_links = page.find("div", {"id": "description"}).find("p").find_all("a")
        if len(real_links) > 0 and equal_str(real_links[0].text, job_for):
            rc["job"] = translate("Réalisation")
        else:
            # Check the director block
            section = page.find("div", {"itemprop": "director"})
            if section and (job_for.lower() in section.text.lower()):
                rc["job"] = translate("Réalisation")

            # Search the detailed credits
            section = page.find("section", {"id": "casting"})
            if section is not None:
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                # if not "personne" in links[0].href: links.remove(0)
                for idx in range(len(paras)):
                    links = paras[idx].findAll("a")
                    for l in links:
                        job = jobs[idx].text.replace(":", "").strip()
                        if "/personne" in l.get("href"):
                            if (job_for.startswith("http") and l.get("href") == job_for) \
                                    or equal_str(job_for, l.text):
                                rc["job"] = job
                                break
                            else:
                                if all_casting:
                                    # Add the whole casting to the system
                                    names = str(l.getText()).split(" ")
                                    lastname = names[len(names) - 1]
                                    rc["casting"].append({
                                        "lastname": lastname,
                                        "url": l.attrs["href"],
                                        "source": "unifrance",
                                        "firstname": l.getText().replace(lastname, "").strip(),
                                        "job": job
                                    })

            # Search among the actors
            for actor in page.find_all("div", {"itemprop": "actors"}):
                if "data-title" in actor.attrs:
                    if actor.attrs["data-title"].lower() == job_for.lower():
                        rc["job"] = "actor"

            if "job" not in rc:
                pass  # no role identified for job_for

    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if _synopsis is not None:
        rc["synopsis"] = _synopsis.getText(strip=True)

    return rc

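# Compared with the earlier unifrance extractor, this version also captures
# the visa number, shooting languages and awards ("prix"), and with
# all_casting=True returns the full credited cast, which add_pows_to_profil
# then stores as external ("Ext") Profil records.
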
def movie_importer(request):
    log("Importation de films")
    header = str(request.body)[20:35]
    if "excel" in header:
        txt = str(base64.b64decode(str(request.body).split("base64,")[1]), encoding="utf-8")
        d = csv.reader(StringIO(txt), delimiter=";")
    else:
        # PDF imports: the text is extracted but not processed yet.
        d = extract_text_from_pdf(base64.b64decode(str(request.body).split("base64,")[1]))
        return

    i = 0
    record = 0
    for row in list(d):
        pow = None
        if len(row) > 10:
            # Long rows: full film description (skip the header row, i == 0).
            if i > 0:
                if row[6] == "":
                    row[6] = "0"
                if row[11] == "":
                    row[11] = "1800"
                pow: PieceOfWork = PieceOfWork(
                    title=row[0].replace(u'\xa0', u' '),
                    description=row[1],
                    visual=row[4],
                    nature=row[5],
                    dtStart=row[2],
                    budget=int(row[6]),
                    category=row[7],
                    links=[{"url": row[9], "text": row[8]}],
                    lang="US",
                    year=int(row[11]),
                    owner=row[10]
                )
            if pow is not None:
                try:
                    pow.category = pow.category.replace("|", " ")
                    rc = pow.save()
                    log("Ajout de " + pow.title)
                    record = record + 1
                except Exception as inst:
                    log("Probléme d'enregistrement" + str(inst))
        else:
            # Short rows: FEMIS referencing data.
            pows = PieceOfWork.objects.filter(title__iexact=row[0])
            if len(pows) == 0:
                pow: PieceOfWork = PieceOfWork(
                    title=row[0],
                    description=translate(row[4]),
                    nature=translate(row[2]),
                    category=row[3],
                    lang="FR"
                )
                if len(row[1]) > 0:
                    pow.year = int(str(row[1]).split(",")[0])
                pow.add_link("", "FEMIS", "Film ajouter depuis le référencement FEMIS")
                pow.save()
                log("Ajout de " + pow.title)
            else:
                pow = pows.first()
            name = row[6].replace("\n", "")
            if " " in name:
                profils = Profil.objects.filter(lastname__icontains=name.split(" ")[1],
                                                firstname__icontains=name.split(" ")[0])
                if len(profils) > 0:
                    work = Work(pow_id=pow.id, job=translate(row[5]),
                                profil_id=profils.first().id)
                    work.save()
        i = i + 1

    log("Importation terminé de " + str(record) + " films")
    return Response(str(record) + " films importés", 200)

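# The importer expects a data-URL style body ("data:...;base64,<payload>");
# the 15 characters at offset 20 are used to sniff the content type. For the
# Excel/CSV path, the semicolon-separated columns are (inferred from the
# indexes above):
#   0 title; 1 description; 2 dtStart; 4 visual; 5 nature; 6 budget;
#   7 category; 8 link text; 9 link url; 10 owner; 11 year
# Shorter rows are read as FEMIS data:
#   0 title; 1 year; 2 nature; 3 category; 4 description; 5 job; 6 full name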