Example #1
import wikipedia  # the wikipedia package re-exports the requests and BeautifulSoup modules it imports internally


def extract_profil_from_bellefaye(firstname, lastname):
    # Authenticate against bellefaye.com; the form body is sent pre-encoded,
    # hence the explicit application/x-www-form-urlencoded Content-Type.
    page = wikipedia.BeautifulSoup(
        wikipedia.requests.post(
            "https://www.bellefaye.com/fr/login_check",
            data=
            "_csrf_token=c8FvlHO5q-f0XpbhG2lQJifHlmhei_qpGO3WcaLgPqE&_username=h.hoareau%40femis.fr&_password=Femis2021&_submit=",
            headers={
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
                'Accept': 'text/html',
                'Content-Type': 'application/x-www-form-urlencoded'
            }).text, "html5lib")

    # Run the people search, substituting the caller's name into the form body.
    url = "https://www.bellefaye.com/fr/search"
    data = "name=%name%&firstName=%firstname%&searchCity=&searchZipCode=&searchEmail=&searchGender=&findPerson=&searchName=&searchCity2=&searchZipCode2=&searchEmail2="
    data = data.replace("%name%", lastname).replace("%firstname%", firstname)
    page = wikipedia.BeautifulSoup(
        wikipedia.requests.post(
            url,
            data=data,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
                'Accept': 'text/html',
                'Content-Type': 'application/x-www-form-urlencoded'
            }).text, "html5lib")
    print(page.text)
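Note that the two POSTs above share no cookie jar, so the session cookie set by login_check never reaches the search request. A minimal sketch of the same flow through a requests.Session, which does persist cookies between calls; URLs and field names are taken from the example, and the credentials become parameters instead of literals:

import requests
from bs4 import BeautifulSoup


def search_bellefaye(firstname, lastname, csrf_token, username, password):
    # The Session carries the authentication cookie from login to search.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0'})
        # Passing a dict lets requests form-encode the body itself.
        session.post("https://www.bellefaye.com/fr/login_check",
                     data={"_csrf_token": csrf_token, "_username": username,
                           "_password": password, "_submit": ""})
        response = session.post("https://www.bellefaye.com/fr/search",
                                data={"name": lastname,
                                      "firstName": firstname,
                                      "findPerson": ""})
    return BeautifulSoup(response.text, "html5lib")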
Example #2
import wikipedia


def extract_profil_from_cnca(title):
    """
    Extraction from the http://www.cnc-rca.fr/Pages/PageAccueil.aspx database.
    :param title: title of the work to look up
    :return: the title, unchanged (this is a stub: the fetched page is not parsed yet)
    """
    page = wikipedia.BeautifulSoup(
        wikipedia.requests.get(
            "http://www.cnc-rca.fr/Pages/Page.aspx?view=RecOeuvre",
            headers={
                'User-Agent': 'Mozilla/5.0'
            }).text, "html5lib")
    return title
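Since the target is an ASP.NET WebForms page, an actual search would have to echo back the hidden __VIEWSTATE and __EVENTVALIDATION fields scraped from the form the stub already fetches. A sketch under that assumption; the title field name is hypothetical and must be read from the real form:

import requests
from bs4 import BeautifulSoup


def cnca_search(title):
    url = "http://www.cnc-rca.fr/Pages/Page.aspx?view=RecOeuvre"
    headers = {'User-Agent': 'Mozilla/5.0'}
    form = BeautifulSoup(requests.get(url, headers=headers).text, "html5lib")
    data = {
        # WebForms rejects POSTs that do not echo these hidden state fields.
        "__VIEWSTATE": form.find("input", {"name": "__VIEWSTATE"})["value"],
        "__EVENTVALIDATION": form.find("input", {"name": "__EVENTVALIDATION"})["value"],
        "titre": title,  # hypothetical field name: inspect the real form to confirm
    }
    return BeautifulSoup(requests.post(url, data=data, headers=headers).text,
                         "html5lib")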
Example #3
from urllib.parse import urlparse

import wikipedia


def extract_actor_from_wikipedia(lastname, firstname):
    wikipedia.set_lang("fr")

    searches = wikipedia.search(lastname + " " + firstname)

    for search in searches:
        page = wikipedia.page(search)

        if lastname in page.title and firstname in page.title:
            rc = {"links": [], "name": firstname + " " + lastname}
            # Keep the last .jpg found on the page as the profile photo.
            for img in page.images:
                if img.endswith(".jpg"): rc["photo"] = img

            # External references worth keeping, with their display labels.
            save_domains = [
                "unifrance.org", "www.lefilmfrancais", "www.allocine.fr",
                "catalogue.bnf.fr", "www.allmovie.com"
            ]
            libs = [
                "UniFrance", "Le Film Francais", "Allocine", "La BNF",
                "All movie"
            ]
            try:
                for ref in page.references:
                    domain = urlparse(ref).netloc
                    try:
                        idx = save_domains.index(domain)
                        rc["links"].append({"title": libs[idx], "url": ref})
                    except ValueError:
                        pass  # domain not in the whitelist
            except Exception:
                pass

            html = wikipedia.BeautifulSoup(page.html(), "html5lib")
            # Look for the films section
            # for link in html.findAll('a', attrs={'href': wikipedia.re.compile("^http://")}):
            #     if "film" in link.text:
            #         pass

            rc["summary"] = page.summary
            rc["title"] = page.title
            rc["url"] = page.url

            return rc

    return None
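A usage sketch; wikipedia.page can raise DisambiguationError or PageError on ambiguous or missing titles, which the loop above does not catch (the example name is illustrative):

import wikipedia

try:
    profile = extract_actor_from_wikipedia("Adjani", "Isabelle")
except (wikipedia.DisambiguationError, wikipedia.PageError):
    profile = None

if profile is not None:
    print(profile["title"], "->", profile["url"])
    for link in profile["links"]:
        print(" ", link["title"], link["url"])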
Example #4
from urllib import parse

import wikipedia


def extract_profil_from_unifrance(name="céline sciamma", refresh_delay=31):
    # load_page and equal_str are helpers defined elsewhere in this module.
    page = load_page(
        "https://www.unifrance.org/recherche/personne?q=$query&sort=pertinence"
        .replace("$query", parse.quote(name)),
        refresh_delay=refresh_delay)
    # Anchors pointing at a personal directory entry.
    links = page.findAll(
        'a',
        attrs={
            'href':
            wikipedia.re.compile(
                "^https://www.unifrance.org/annuaires/personne/")
        })

    rc = list()
    if len(links) > 0:
        u = links[0].get("href")
        page = wikipedia.BeautifulSoup(
            wikipedia.requests.get(u, headers={
                'User-Agent': 'Mozilla/5.0'
            }).text, "html5lib")
        # Only keep the first hit if it actually matches the requested name.
        if equal_str(name,
                     page.title.text.split("-")[0]) or equal_str(
                         name, links[0].text.split("Activités : ")[0]):
            photo = ""
            _photo = page.find('div',
                               attrs={'class': "profil-picture pull-right"})
            if _photo is not None: photo = _photo.find("a").get("href")

            # Collect every film credited to this person.
            links_film = page.findAll(
                'a',
                attrs={
                    'href':
                    wikipedia.re.compile(
                        "^https://www.unifrance.org/film/[0-9][0-9]*/")
                })
            for link in links_film:
                rc.append({
                    "url": link.get("href"),
                    "text": link.text,  # .get("text") would always return None
                    "nature": ""
                })

            return {"links": rc, "photo": photo, "url": u}

    return None
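A usage sketch, assuming the module-level helpers load_page (a cached fetch returning a parsed page) and equal_str (a tolerant string comparison) behave as their names suggest:

profile = extract_profil_from_unifrance("céline sciamma")
if profile is not None:
    print("Profile:", profile["url"])
    print("Photo:", profile["photo"])
    for film in profile["links"]:
        print(" ", film["text"], film["url"])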