コード例 #1
0
ファイル: views.py プロジェクト: f80dev/testdcp
def update_dictionnary(request):
    """
    Re-apply the translation dictionary to the stored vocabulary.

    Every ``Work.job`` and every ``PieceOfWork.category`` is passed through
    ``translate``; a record is written back only when the translated value
    differs from the stored one.

    :param request: incoming request (not read by this view)
    :return: ``Response`` carrying ``{"message": "ok"}``
    """
    for work in Work.objects.all():
        translated_job = translate(work.job)
        if translated_job == work.job:
            # Already up to date: nothing to save.
            continue
        log("Traitement de "+str(work.job))
        work.job = translated_job
        work.save()

    for piece in PieceOfWork.objects.all():
        translated_category = translate(piece.category)
        if translated_category == piece.category:
            continue
        piece.category = translated_category
        piece.save()

    return Response({"message":"ok"})
コード例 #2
0
ファイル: Batch.py プロジェクト: f80dev/DataCulturePro
def dict_to_pow(film: dict, content=None):
    """
    Build an unsaved ``PieceOfWork`` from a scraped film description.

    :param film: scraped data; "title", "url" and "source" are required.
        Every other key that matches a field reported by ``model_to_dict``
        is copied onto the instance; "nature", "category" and "synopsis"
        get the dedicated handling below.
    :param content: optional mapping of options; a truthy "senscritique"
        entry adds a Sens-critique link. A missing key means "disabled".
    :return: the populated ``PieceOfWork`` (``save`` is not called here)
    """
    piece = PieceOfWork(title=film["title"],
                        title_index=index_string(film["title"]))
    piece.add_link(url=film["url"], title=film["source"])

    # Test for the key first so that an options mapping without
    # "senscritique" does not raise KeyError.
    if (content is not None and "senscritique" in content
            and content["senscritique"]):
        piece.add_link(extract_film_from_senscritique(film["title"]),
                       title="Sens-critique")

    # Copy every scraped value whose key is also a model field.
    for field in model_to_dict(piece):
        if field in film:
            setattr(piece, field, film[field])

    # These values are set after the generic copy so they take precedence.
    piece.nature = translate(film["nature"]) if "nature" in film else "Film"
    if "category" in film:
        piece.category = translate(film["category"])
    if "synopsis" in film:
        piece.description = film["synopsis"]

    return piece
コード例 #3
0
def add_pows_to_profil(profil, links, all_links, job_for):
    """
    Attach the works described by ``links`` to ``profil``.

    For each link, a stored ``PieceOfWork`` with the same title and the same
    url is reused. Otherwise the film is scraped (unifrance or IMDB, chosen
    from the url) and saved. A ``Work`` linking the film to the profile is
    then created unless one already exists for the translated job.

    Links whose url matches no known source, whose scraped data has no
    title, or whose film cannot be saved are logged and skipped.

    :param profil: profile receiving the works; ``id``, ``firstname``,
        ``lastname`` and ``job`` are read
    :param links: iterable of dicts with "url", "text" and "job" keys
        ("nature" is used as a fallback for IMDB links)
    :param all_links: not used by this function
    :param job_for: forwarded to ``extract_film_from_unifrance``
    :return: None
    """
    for l in links:
        source = "auto"

        # Look for a stored work with the same title that already carries
        # this url; when several match, the last one is kept.
        piece = None
        for candidate in PieceOfWork.objects.filter(title__iexact=l["text"]):
            for link in candidate.links:
                if l["url"] == link["url"]:
                    piece = candidate
                    break

        if piece:
            job = l["job"]
        else:
            film = None
            if "unifrance" in l["url"]:
                film = extract_film_from_unifrance(l["url"], job_for=job_for)
                source = "auto:unifrance"

            if "imdb" in l["url"]:
                film = extract_film_from_imdb(l["url"],
                                              l["text"],
                                              name=profil.firstname + " " +
                                              profil.lastname,
                                              job=l["job"])
                if (film is not None and "nature" not in film
                        and "nature" in l):
                    film["nature"] = l["nature"]
                source = "auto:IMDB"

            # No scraper handled the url, or the page gave no title:
            # there is nothing to store for this link.
            if not film or "title" not in film:
                log("Impossible de retrouver le film à l'adresse " + l["url"])
                continue

            log("Traitement de " + film["title"] + " à l'adresse " + l["url"])

            piece = PieceOfWork(title=film["title"])
            piece.add_link(url=l["url"], title=source)
            if "nature" in film:
                piece.nature = translate(film["nature"])
            else:
                piece.nature = "Film"

            if "synopsis" in film: piece.description = film["synopsis"]
            if "visual" in film: piece.visual = film["visual"]
            if "category" in film: piece.category = translate(film["category"])
            if "year" in film: piece.year = film["year"]

            try:
                result = PieceOfWork.objects.filter(title__iexact=piece.title)
                if len(result) > 0:
                    log("Le film existe déjà dans la base, on le récupére")
                    piece = result.first()
                    piece.add_link(l["url"], source)
                piece.save()
            except Exception as inst:
                log("Impossible d'enregistrer le film: " + str(inst.args))
                # Without a saved film no Work can reference it.
                continue

            # TODO: revisit this, because record updates could make us miss
            # films.
            job = film["job"] if "job" in film else profil.job

        t_job = translate(job)
        if not Work.objects.filter(
                pow_id=piece.id, profil_id=profil.id, job=t_job).exists():
            log("Ajout de l'experience " + job + " traduit en " + t_job +
                " sur " + piece.title + " à " + profil.lastname)
            work = Work(pow=piece, profil=profil, job=t_job, source=source)
            work.save()
コード例 #4
0
def extract_film_from_unifrance(url: str, job_for=None):
    """
    Scrape a unifrance film page into a dict.

    :param url: url of the film page, or free text (anything that does not
        start with "http") which is first resolved through the unifrance
        search page
    :param job_for: href of the person whose job on the film is wanted; it
        is compared with the director link and with the "/personne" links
        of the casting section
    :return: dict holding whichever of "title", "visual", "real", "nature",
        "year", "category", "job" and "synopsis" could be read; an empty
        dict when the search page yields no film link
    """
    rc = dict()
    if not url.startswith("http"):
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" +
                         parse.quote(url))
        _link = page.find("a",
                          attrs={
                              'href':
                              wikipedia.re.compile(
                                  "^https://www.unifrance.org/film/[0-9][0-9]")
                          })
        if _link is None: return rc

        url = _link.get("href")

    page = load_page(url)
    _title = page.find('h1', attrs={'itemprop': "name"})
    if _title is not None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])

    # Poster: image of the section whose heading mentions "Affiches".
    for title in page.findAll('h1'):
        if "Affiches" in title.text:
            _img = title.parent.find("img", attrs={'itemprop': "image"})
            src = _img.get("src") if _img is not None else None
            # Sources under "/ressource" are not kept as a visual.
            if src and not src.startswith("/ressource"):
                rc["visual"] = src
                log("Enregistrement de l'affiche " + src)

    _real = page.find("div", attrs={"itemprop": "director"})
    if _real is not None:
        _real_link = _real.find("a", attrs={"itemprop": "name"})
        # The director block may carry no link at all.
        if _real_link is not None:
            rc["real"] = _real_link.get("href")

    for idx_div, div in enumerate(
            page.findAll("div", attrs={'class': "details_bloc"})):
        # The first detail block holds the nature when it is not "label : value".
        if idx_div == 0 and ":" not in div.text:
            rc["nature"] = div.text

        if "Année de production : " in div.text:
            rc["year"] = div.text.replace("Année de production : ", "")
        if "Genre(s) : " in div.text:
            rc["category"] = translate(div.text.replace("Genre(s) : ", ""))

    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"

    if job_for is not None:
        # "real" is absent when the page has no director link, hence .get().
        if rc.get("real") == job_for:
            rc["job"] = "Réalisation"
        else:
            section = page.find("section", {"id": "casting"})
            if section is not None:
                # The n-th <h2> names the job of the people in the n-th <p>.
                jobs = section.findAll("h2")
                paras = section.findAll("p")
                for job_title, para in zip(jobs, paras):
                    for l in para.findAll("a"):
                        href = l.get("href") or ""
                        if "/personne" in href and href == job_for:
                            rc["job"] = job_title.text.replace(" : ", "")
                            break

    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if _synopsis is not None: rc["synopsis"] = _synopsis.getText(strip=True)

    return rc
コード例 #5
0
def extract_film_from_imdb(
    url: str,
    title: str,
    name="",
    job="",
):
    """
    Scrape an IMDB title page, and its "fullcredits" page, into a dict.

    :param url: url of the title page; "fullcredits" is appended to it to
        reach the credits page
    :param title: title to record; for episodes it is prefixed with the
        parent title and suffixed with the season/episode code
    :param name: name of the person whose credit is searched
        (case-insensitive containment in the link text)
    :param job: job recorded when no credit is found for ``name``
    :return: dict with "title", "nature", "category" and "job", plus
        "visual", "year" and "synopsis" when they could be read
    """
    page = load_page(url)

    rc = dict({"title": title, "nature": translate("film")})

    zone_info = page.find("div", {"class": "title_block"})
    # find() gives None when the block is missing: work on an empty text
    # instead of failing on the missing block.
    zone_text = zone_info.getText() if zone_info is not None else ""

    if title.startswith("Episode") or "Episode" in zone_text:
        section_title = page.find("div", {"class": "titleParent"})
        parent_link = (section_title.find("a")
                       if section_title is not None else None)
        if parent_link is not None:
            title = parent_link.text + " " + title
        # Look up the episode
        rc["nature"] = MOVIE_NATURE[0]
        zone_info_comp = page.find("div",
                                   {"class": "button_panel navigation_panel"})
        if zone_info_comp is not None and "Season" in zone_info_comp.getText():
            extract_text = "S" + zone_info_comp.getText().split(
                "Season")[1].replace("Episode ", "E").replace(
                    " | ", "").replace(" ", "")
            rc["title"] = title + " " + extract_text.split("\n")[0]

    # The last category found in the title block wins.
    zone_text_lower = zone_text.lower()
    for cat in MOVIE_CATEGORIES:
        if cat.lower() in zone_text_lower:
            rc["category"] = cat
    if "category" not in rc:
        rc["category"] = "Inconnue"
        log("Pas de categorie pour " + url)

    affiche = page.find("div", attrs={"class": "poster"})
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")

    # Year: first 4-digit match in the page <title>, else in the title block.
    year_pattern = '[1-2][0-9][0-9][0-9]'
    page_title = page.title.text if page.title is not None else ""
    year_match = (re.search(year_pattern, page_title)
                  or re.search(year_pattern, zone_text))
    if year_match is not None:
        rc["year"] = year_match.group(0)

    summary_section = page.find("div", attrs={"class": "summary_text"})
    if summary_section is not None and "Add a Plot" not in summary_section.text:
        rc["synopsis"] = summary_section.text.replace("\n", "").strip()

    log("Recherche du role sur le film")

    credits = load_page(url + "fullcredits")
    if credits is not None:
        credits = credits.find("div", {"id": "main"})
    if credits is not None:
        for l in credits.find_all("a"):
            if name.upper() in l.text.upper():
                parent = l.parent.parent.find("td", {"class": "credit"})
                if parent is not None:
                    rc["job"] = str(parent.getText().replace("\n",
                                                             "")).strip()
                    # Drop any parenthesised suffix, then collapse runs of spaces.
                    rc["job"] = rc["job"].split("(")[0]
                    while "  " in rc["job"]:
                        rc["job"] = rc["job"].replace("  ", " ")

                # Only the first link matching the name is considered.
                break

    if "job" not in rc: rc["job"] = job

    return rc
コード例 #6
0
ファイル: Batch.py プロジェクト: f80dev/DataCulturePro
def add_pows_to_profil(profil,
                       links,
                       job_for,
                       refresh_delay_page,
                       templates=None,
                       bot=None,
                       content=None):
    """
    Scrape the films behind ``links`` and attach them to ``profil``.

    For every link the film is extracted (unifrance, LeFilmFrancais or IMDB)
    and turned into a ``PieceOfWork`` by ``dict_to_pow``. When stored works
    share its title index, it is merged through ``fusion`` with those whose
    year is at most one year apart; otherwise it is saved as a new film.
    Awards, the profile's ``Work`` and the scraped casting are then recorded.

    :param profil: profile being enriched
    :param links: iterable of dicts; "url" is always read, "text" is read
        for IMDB links, "job", "source" and "nature" are optional
    :param job_for: name of the person, normalised with
        ``remove_ponctuation`` and ``remove_accents`` before being handed to
        the extractors
    :param refresh_delay_page: forwarded to the extractors as
        ``refresh_delay``
    :param templates: optional list; when non-empty its first item is passed
        to ``create_article`` for each ``Work`` added to the profile
    :param bot: forwarded to ``extract_film_from_leFilmFrancais``
    :param content: forwarded to ``dict_to_pow``
    :return: tuple ``(n_films, n_works, articles)``: number of new films
        saved, number of casting ``Work`` rows created, created articles
    """
    n_films = 0
    n_works = 0
    articles = list()
    job_for = remove_accents(remove_ponctuation(job_for))

    for l in links:
        source = "auto"
        film = None
        piece = None
        job = l["job"] if "job" in l else ""

        if "unifrance" in l["url"]:
            film = extract_film_from_unifrance(
                l["url"], job_for=job_for, refresh_delay=refresh_delay_page)

        if "source" in l and "LeFilmFrancais" in l["source"]:
            film = extract_film_from_leFilmFrancais(
                l["url"],
                job_for=job_for,
                refresh_delay=refresh_delay_page,
                bot=bot)

        if "imdb" in l["url"]:
            # "job" is optional in a link: pass the value resolved above
            # rather than indexing the link again.
            film = extract_film_from_imdb(l["url"],
                                          l["text"],
                                          name=profil.firstname + " " +
                                          profil.lastname,
                                          job=job,
                                          refresh_delay=refresh_delay_page)
            if film and (film["category"] == "News"
                         or len(film["nature"]) == 0):
                log("Ce type d'événement est exlue :" + str(film))
                film = None

        # dict_to_pow needs a title, so a film without one is treated as
        # not found.
        if film is not None and "title" in film:
            if "nature" not in film and "nature" in l:
                film["nature"] = l["nature"]
            log("Traitement de " + film["title"] + " à l'adresse " +
                l["url"])

            piece = dict_to_pow(film, content)

            job = profil.job
            if "job" in film: job = film["job"]

            try:
                result = PieceOfWork.objects.filter(
                    title_index__iexact=piece.title_index)
                if len(result) > 0:
                    # NOTE(review): when no stored work is within one year,
                    # the new film is never saved — confirm this is intended.
                    for stored in result:
                        if abs(int(stored.year) - int(piece.year)) <= 1:
                            log("Le film existe déjà dans la base, on le met a jour avec les nouvelles données"
                                )
                            piece, hasChanged = fusion(stored, piece)
                            if hasChanged:
                                piece.dtLastSearch = datetime.now()
                                piece.save()
                else:
                    n_films = n_films + 1
                    piece.dtLastSearch = datetime.now()
                    piece.save()

                # TODO: revisit this, because record updates could make us
                # miss films.

            except Exception as inst:
                log("Impossible d'enregistrer le film: " + str(inst.args))
        else:
            log("Impossible de retrouver le film" + str(film))

        if piece is None:
            continue

        # --- Awards -------------------------------------------------------
        for prix in film.get("prix") or []:
            # Look the festival up with the same normalised title it is
            # stored under, so that the lookup matches what was saved.
            festival_title = prix["title"].strip().lower()
            f = Festival.objects.filter(title__iexact=festival_title).first()
            if f is None:
                f = Festival(title=festival_title)
                f.save()

            if Award.objects.filter(pow__id=piece.id,
                                    year=int(prix["year"]),
                                    festival__id=f.id).exists():
                continue

            desc = prix["description"][:249]
            if desc.startswith("(") and ")" in desc:
                desc = desc.split(")")[1]

            a = Award(
                description=desc,
                year=prix["year"],
                pow=piece,
                festival=f,
                profil=None
                if not "profil" in prix else Profil.objects.filter(
                    name_index__iexact=prix["profil"]).first())
            try:
                a.save()
            except Exception:
                log("!!Probleme d'enregistrement de l'award sur " +
                    piece.title)

        # --- Work of the profile on the film --------------------------------
        if job is None: job = ""
        t_job = translate(job)
        if len(t_job) == 0:
            if job_for and piece.title:
                log("!Job non identifié pour " + job_for + " sur " +
                    piece.title)
        elif not Work.objects.filter(pow_id=piece.id,
                                     profil_id=profil.id,
                                     job=t_job).exists():
            log("Ajout de l'experience " + job + " traduit en " +
                t_job + " sur " + piece.title + " à " +
                profil.lastname)
            work = Work(pow=piece,
                        profil=profil,
                        job=t_job,
                        source=source)
            try:
                work.save()
            except Exception as inst:
                log("Impossible d'enregistrer le travail: " +
                    str(inst.args))

            if templates:
                articles.append(
                    create_article(profil, piece, work, templates[0]))

        # --- Casting ----------------------------------------------------------
        for member in film.get("casting") or []:
            _ps = list(
                Profil.objects.filter(lastname=member["lastname"],
                                      firstname=member["firstname"]))
            if len(_ps) == 0:
                log("Ajout de " + member["lastname"] +
                    " comme externe en tant que " + member["job"])
                _p = Profil(firstname=member["firstname"],
                            lastname=member["lastname"],
                            name_index=index_string(member["firstname"] +
                                                    member["lastname"]),
                            department="Ext",
                            cursus="E",
                            school="",
                            email=member["firstname"] + "." +
                            member["lastname"] + "@fictif")
                _p.add_link(url=member["url"], title=member["source"])
                _p.save()
            else:
                _p = _ps[0]

            if not Work.objects.filter(pow_id=piece.id,
                                       profil_id=_p.id,
                                       job=member["job"]).exists():
                work = Work(pow=piece,
                            profil=_p,
                            job=member["job"],
                            source=source)

                work.save()
                n_works = n_works + 1

    return n_films, n_works, articles
コード例 #7
0
ファイル: Batch.py プロジェクト: f80dev/DataCulturePro
def _imdb_credit_job(sur_job, tds):
    """Return the job described by one credits row, given its section heading."""
    if "Cast" in sur_job:
        # Cast rows: people appearing as themselves get no job.
        if len(tds) > 3 and "Self" in tds[3].text:
            row_job = ""
        else:
            row_job = "Actor"
    else:
        row_job = tds[len(tds) - 1].text.split("(")[0].split("/")[0].strip()
        if len(row_job) == 0:
            # No job on the row: fall back to the section heading.
            row_job = sur_job.replace(" by", "").strip()
    return row_job.split("\n")[0]


def extract_film_from_imdb(url: str,
                           title: str,
                           name="",
                           job="",
                           all_casting=False,
                           refresh_delay=31):
    """
    Scrape an IMDB title page, and its "fullcredits" page, into a dict.

    :param url: url of the title page, or free text (anything that does not
        start with "http") which is first resolved through the IMDB search
    :param title: title to record (punctuation is removed)
    :param name: name of the person whose credit is searched
    :param job: job recorded when no credit is found for ``name``
    :param all_casting: when true, every other credited person is appended
        to the "casting" list
    :param refresh_delay: forwarded to ``load_page``
    :return: dict with "title", "nature", "casting", "url", "source",
        "category", "year", "synopsis" and "job" (plus "visual" when a
        poster image is found); ``None`` when the search finds nothing or
        when the title block, the metadata block or the year is missing
    """
    if not url.startswith("http"):
        page = load_page("https://www.imdb.com/find?s=tt&q=" +
                         parse.quote(url))
        found_url = None
        for link in page.find_all("a"):
            # Anchors without href are skipped.
            href = link.get("href") or ""
            if equal_str(link.text, url) and href.startswith("/title/tt"):
                found_url = "https://www.imdb.com" + href
                break
        if found_url is None:
            log(url + " introuvable sur IMDB")
            return None
        url = found_url

    page = load_page(url, refresh_delay)

    title = remove_ponctuation(title)

    rc = dict({
        "title": title,
        "nature": "",
        "casting": list(),
        "url": url,
        "source": "auto:IMDB"
    })

    divs = dict()
    elts = page.find_all("div", recursive=True) + page.find_all(
        "h1", recursive=True) + page.find_all(
            "ul", recursive=True) + page.find_all("p") + page.find_all("li")
    for div in elts:
        s = div.text
        s_t = translate(s)
        if s_t in MOVIE_NATURE:
            rc["nature"] = s_t
        # A text that starts with "1h"/"2h" and ends with "m" marks a long
        # film, but only while no nature has been found. The parentheses
        # matter: "and" binds tighter than "or".
        if (s.startswith(("1h", "2h")) and s.endswith("m")
                and len(rc["nature"]) == 0):
            rc["nature"] = translate("long")
        if "data-testid" in div.attrs:
            # Index the elements by their data-testid attribute.
            divs[div.attrs["data-testid"]] = div

    # Look up the nature and the category
    if "genres" not in divs:
        elt = page.find("li", {
            "role": "presentation",
            "class": "ipc-inline-list__item"
        })
        cat = elt.text if elt is not None else "inconnu"
    else:
        cat = ""
        for div in divs["genres"]:
            cat = cat + translate(div.text.lower()) + " "
        # A leading genre that is in fact a nature is moved to "nature".
        if cat.split(" ")[0] in MOVIE_NATURE:
            rc["nature"] = cat.split(" ")[0]
            cat = cat.replace(rc["nature"], "").strip()

    rc["category"] = cat.strip()

    # The title block, the metadata block and a 4-digit year are mandatory.
    title_block = divs.get("hero-title-block__title")
    metadata_block = divs.get("hero-title-block__metadata")
    if title_block is not None:
        title = title_block.text
    year_match = None
    if title_block is not None and metadata_block is not None:
        year_match = re.search(r"(\d{4})", metadata_block.text)
    if year_match is None:
        log("Erreur sur title=" + title)
        return None
    rc["year"] = year_match.group(1)

    # The poster is optional: a page without it must not raise.
    affiche = divs.get("hero-media__poster")
    if affiche is not None and affiche.find("img") is not None:
        rc["visual"] = affiche.find("img").get("src")

    rc["synopsis"] = ""
    if "plot" in divs:
        rc["synopsis"] = divs["plot"].text.replace("Read all", "")

    credits = load_page(url + "fullcredits", refresh_delay)
    if credits is not None:
        credits = credits.find("div", {"id": "fullcredits_content"})
    if credits is not None:
        # The i-th <h4> is used as the heading of the i-th <table>.
        sur_jobs = credits.find_all("h4")
        tables = credits.find_all("table")
        for i, table in enumerate(tables):
            sur_job = ""
            if i < len(sur_jobs):
                sur_job = sur_jobs[i].text.replace("\n", " ").strip()

            for tr in table.find_all("tr"):
                tds = tr.find_all("td")
                if len(tds) <= 1:
                    continue

                # The name is in the first cell, or in the second one when
                # the first is empty.
                findname = tds[0].text.replace("\n", "").replace(
                    "  ", " ").strip()
                if len(findname) == 0:
                    findname = tds[1].text.replace("\n", "").replace(
                        "  ", " ").strip()
                if len(findname) == 0:
                    continue

                if equal_str(findname, name):
                    found_job = _imdb_credit_job(sur_job, tds)
                    rc["job"] = translate(found_job)
                    if len(found_job) == 0:
                        log("Job non identifié pour " + name +
                            " sur " + url)
                    elif not all_casting:
                        # Stops scanning this table only.
                        break
                elif all_casting:
                    # NOTE(review): these entries carry "name" only, not
                    # the "firstname"/"lastname"/"url" keys of the
                    # unifrance casting — confirm what consumers expect.
                    rc["casting"].append({
                        "name": findname,
                        "source": "imdb",
                        "job": _imdb_credit_job(sur_job, tds)
                    })

    if "job" not in rc: rc["job"] = job

    return rc
コード例 #8
0
ファイル: Batch.py プロジェクト: f80dev/DataCulturePro
def extract_awards_from_imdb(profil_url, profil):
    """
    Record the awards listed on a person's IMDB awards page.

    The page is read as a sequence of ``table.awards`` elements, each paired
    with an ``<h3>`` heading (the first ``<h3>`` of the page is skipped).
    A row is recorded only when its film, identified by title and by the
    year found between parentheses, is already stored as a ``PieceOfWork``.
    The festival is created when missing.

    :param profil_url: url of the person's page; "awards?ref_=nm_awd" is
        appended to it
    :param profil: profile the awards are attached to
    :return: None
    """
    # Look up the awards
    page = load_page(profil_url + "awards?ref_=nm_awd")

    # The first <h3> is dropped; the others are paired with the tables.
    awards = page.find_all("h3")[1:]
    tables = page.find_all("table", {"class": "awards"})

    # zip() stops at the shorter list, so a table without heading is skipped
    # instead of raising IndexError.
    for heading, table in zip(awards, tables):
        festival_title = translate(
            heading.text.split(",")[0].lower().strip())

        for tr in table.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) <= 2:
                log("Format non conforme " + tr.text)
                continue

            year = tds[0].text.replace("\n", "").replace(" ", "").strip()
            award = tds[1].text
            film = tds[2].find("a")

            # A row needs a film link, an award text and a "(year)".
            if not (film and award) or "(" not in tds[2].text:
                continue

            # Read the flag before the award text is cleaned below.
            win = ("Winner" in award)
            film_year = tds[2].text.split("(")[1].split(")")[0]
            pow = PieceOfWork.objects.filter(title__iexact=film.text,
                                             year__iexact=film_year).first()
            if pow is None:
                continue

            f = Festival.objects.filter(title__iexact=festival_title).first()
            if f is None:
                f = Festival(title=festival_title)
                f.save()

            a = Award.objects.filter(pow__id=pow.id,
                                     year=year,
                                     festival__id=f.id,
                                     profil__id=profil.id)
            if a.exists():
                a = a.first()
            else:
                award = award.replace("\n", "").replace(
                    "Winner", "").replace("Nominee", "").strip()
                if award.startswith("(") and ")" in award:
                    award = award.split(")")[1]
                a = Award(description=award,
                          year=year,
                          pow=pow,
                          festival=f,
                          profil=profil,
                          winner=win)
            try:
                a.save()
            except Exception:
                log("!!Probleme d'enregistrement de l'award sur "
                    + pow.title)
0
ファイル: Batch.py プロジェクト: f80dev/DataCulturePro
def extract_film_from_unifrance(url: str,
                                job_for=None,
                                all_casting=False,
                                refresh_delay=30):
    """
    Scrape a unifrance film page into a dict.

    :param url: url of the film page, or free text (anything that does not
        start with "http") which is first resolved through the unifrance
        search page
    :param job_for: name (or "/personne" url) of the person whose job on the
        film is wanted; also used as the recipient of awards that name
        nobody
    :param all_casting: when true, every other person of the casting section
        is appended to the "casting" list
    :param refresh_delay: forwarded to ``load_page``
    :return: dict always holding "casting", "source", "url" and "prix", plus
        whichever of "title", "visual", "real", "nature", "visa", "langue",
        "year", "category", "job" and "synopsis" could be read; ``None``
        when the search page yields no film link
    """
    rc = dict({"casting": [], "source": "auto:unifrance", "url": url})
    if not url.startswith("http"):
        log("On passe par la page de recherche pour retrouver le titre")
        page = load_page("https://unifrance.org/recherche?q=" +
                         parse.quote(url),
                         refresh_delay=refresh_delay)
        _link = page.find("a",
                          attrs={
                              'href':
                              wikipedia.re.compile(
                                  "^https://www.unifrance.org/film/[0-9][0-9]")
                          })
        if _link is None: return None

        url = _link.get("href")
        rc["url"] = url

    page = load_page(url, refresh_delay)
    _title = page.find('h1', attrs={'itemprop': "name"})
    if _title is not None:
        rc["title"] = _title.text
        log("Analyse du film " + rc["title"])

    # Poster: image of the section whose heading starts with "Affiches".
    for title in page.find_all('h1'):
        if title.text.startswith("Affiches"):
            _img = title.parent.find("img", attrs={'itemprop': "image"})
            src = _img.get("src") if _img is not None else None
            # Sources under "/ressource" are not kept as a visual.
            if src and not src.startswith("/ressource"):
                rc["visual"] = src
                log("Enregistrement de l'affiche " + src)

    _real = page.find("div", attrs={"itemprop": "director"})
    _real_link = (_real.find("a", attrs={"itemprop": "name"})
                  if _real is not None else None)
    if _real_link is not None:
        rc["real"] = _real_link.get("href")

    for idx_div, div in enumerate(
            page.find_all("div", attrs={'class': "details_bloc"})):
        text = div.text
        # The first detail block holds the nature when it is not "label : value".
        if idx_div == 0 and ":" not in text:
            rc["nature"] = text

        # "label : value" blocks; a block without " : " is skipped.
        parts = text.split(" : ")
        if "Numéro de visa" in text and len(parts) > 1:
            rc["visa"] = parts[1].replace(".", "")

        if "Langues de tournage" in text and len(parts) > 1:
            rc["langue"] = parts[1]

        if "Année de production : " in text:
            rc["year"] = text.replace("Année de production : ", "")
        if "Genre(s) : " in text:
            rc["category"] = translate(text.replace("Genre(s) : ", ""))

    if "category" in rc and len(rc["category"]) == 0:
        rc["category"] = "inconnue"

    rc["prix"] = []
    for section_prix in page.find_all("div",
                                      attrs={"class": "distinction palmares"}):
        inner_divs = section_prix.find_all("div")
        # The second inner div is read, so at least two are required.
        if len(inner_divs) < 2:
            continue

        detail = inner_divs[1]
        content = detail.text.replace("PlusMoins", "")
        if ")Prix" not in content:
            log("!Prix non conforme sur " + url)
            continue

        _prix = {"description": content.split(")Prix")[1].split(" : ")[0]}

        for l in detail.find_all("a"):
            href = l.attrs.get("href", "")
            if "festivals" in href:
                _prix["title"] = l.text.split("(")[0]
                years = re.findall(r"[1-2][0-9]{3}", l.text)
                if years:
                    _prix["year"] = years[0]
            # Only the first person link is kept as the recipient.
            if "person" in href and "profil" not in _prix:
                _prix["profil"] = index_string(l.text)

        # An award naming nobody goes to the searched person, when known.
        if "profil" not in _prix and job_for is not None:
            log("Attribution du prix à " + job_for)
            _prix["profil"] = index_string(job_for)

        if "year" in _prix and "title" in _prix:
            rc["prix"].append(_prix)
            log("Ajout du prix " + str(_prix))
        else:
            log("!Prix non conforme sur " + url)

    if job_for is not None:
        # Links of the first paragraph of the description block; any missing
        # level simply yields no link.
        description = page.find("div", {"id": "description"})
        first_para = description.find("p") if description is not None else None
        real_links = first_para.find_all("a") if first_para is not None else []

        if len(real_links) > 0 and equal_str(real_links[0].text, job_for):
            rc["job"] = translate("Réalisation")
        else:
            # Search in the director block
            section = page.find("div", {"itemprop": "director"})
            if section and (job_for.lower() in section.text.lower()):
                rc["job"] = translate("Réalisation")

            # Search in the detailed credits: the n-th <h2> names the job of
            # the people linked in the n-th <p>.
            section = page.find("section", {"id": "casting"})
            if section is not None:
                jobs = section.find_all("h2")
                paras = section.find_all("p")
                for heading, para in zip(jobs, paras):
                    job = heading.text.replace(":", "").strip()
                    for l in para.find_all("a"):
                        href = l.get("href") or ""
                        if "/personne" not in href:
                            continue
                        if (job_for.startswith("http")
                                and href == job_for) or equal_str(
                                    job_for, l.text):
                            rc["job"] = job
                            # Stops scanning this paragraph only.
                            break
                        if all_casting:
                            # Add the rest of the casting; the last word of
                            # the name is taken as the last name.
                            names = str(l.getText()).split(" ")
                            lastname = names[len(names) - 1]
                            rc["casting"].append({
                                "lastname":
                                lastname,
                                "url":
                                href,
                                "source":
                                "unifrance",
                                "firstname":
                                l.getText().replace(lastname, "").strip(),
                                "job":
                                job
                            })

            # Search among the actors
            for actor in page.find_all("div", {"itemprop": "actors"}):
                if "data-title" in actor.attrs:
                    if actor.attrs["data-title"].lower() == job_for.lower():
                        rc["job"] = "actor"

    _synopsis = page.find("div", attrs={"itemprop": "description"})
    if _synopsis is not None:
        rc["synopsis"] = _synopsis.getText(strip=True)

    return rc
コード例 #10
0
ファイル: views.py プロジェクト: f80dev/testdcp
def movie_importer(request):
    """
    Import films from an uploaded file sent as a base64 data URL.

    Only the ";"-separated CSV payload (header containing "excel") is
    imported. Two row layouts are handled:

    * rows with more than 10 columns describe a full PieceOfWork; the first
      row of the file is skipped as a header for this layout;
    * shorter rows (at least 7 columns) reference a film by title, creating
      it when missing, and attach a Work to the profil named in column 6
      when that profil exists.

    Rows that are blank or too short to be read are ignored, and a long row
    that cannot be converted or saved is logged and skipped.

    :param request: HTTP request whose body holds the "...base64,<data>" payload
    :return: Response with the number of films created from long rows, or a
             501 Response when the payload is not a CSV/Excel export
    """
    log("Importation de films")
    # The slice is taken on the str() form of the body (presumably the
    # "b'...'" repr of bytes), where the MIME type is expected -- TODO confirm.
    header = str(request.body)[20:35]
    payload = base64.b64decode(str(request.body).split("base64,")[1])

    if "excel" not in header:
        # PDF import is not implemented: the text is extracted but not used.
        # Answer explicitly instead of returning None from the view.
        extract_text_from_pdf(payload)
        return Response("Import PDF non supporté", 501)

    rows = csv.reader(StringIO(str(payload, encoding="utf-8")), delimiter=";")

    record = 0
    for i, row in enumerate(rows):
        if len(row) > 10:
            # Full layout: the very first row of the file is the header.
            if i == 0:
                continue
            try:
                # Empty budget defaults to 0; empty or missing year to 1800.
                budget = int(row[6] or "0")
                year = int(row[11] if len(row) > 11 and row[11] != "" else "1800")
                piece = PieceOfWork(
                    title=row[0].replace(u'\xa0', u' '),
                    description=row[1],
                    visual=row[4],
                    nature=row[5],
                    dtStart=row[2],
                    budget=budget,
                    category=row[7],
                    links=[{"url": row[9], "text": row[8]}],
                    lang="US",
                    year=year,
                    owner=row[10],
                )
                piece.category = piece.category.replace("|", " ")
                piece.save()
                log("Ajout de " + piece.title)
                record += 1
            except Exception as inst:
                # Best effort: one bad row must not abort the whole import.
                log("Probléme d'enregistrement" + str(inst))
            continue

        if len(row) < 7:
            # Blank or truncated line: columns 0..6 are required below.
            continue

        # Short layout: reuse the film with the same title when it exists.
        piece = PieceOfWork.objects.filter(title__iexact=row[0]).first()
        if piece is None:
            piece = PieceOfWork(
                title=row[0],
                description=translate(row[4]),
                nature=translate(row[2]),
                category=row[3],
                lang="FR",
            )
            if len(row[1]) > 0:
                # Only the part before the first comma is used as the year.
                piece.year = int(str(row[1]).split(",")[0])
            piece.add_link("", "FEMIS", "Film ajouter depuis le référencement FEMIS")
            piece.save()
            log("Ajout de " + piece.title)

        # Column 6 holds "<firstname> <lastname>"; link the film to that profil.
        name = row[6].replace("\n", "")
        if " " in name:
            parts = name.split(" ")
            profil = Profil.objects.filter(lastname__icontains=parts[1],
                                           firstname__icontains=parts[0]).first()
            if profil is not None:
                Work(pow_id=piece.id, job=translate(row[5]),
                     profil_id=profil.id).save()

    log("Importation terminé de " + str(record) + " films")

    return Response(str(record) + " films importés", 200)