コード例 #1
0
def _career_from_anchor(anchor):
    """Return the Career row named after *anchor*'s text, creating it if absent.

    Spaces are stripped from the anchor text before it is used as the
    career name. A newly created row is committed immediately.
    """
    career_name = anchor.text.replace(' ', '')
    career = models.Career.query.filter_by(name=career_name).first()
    if career is None:
        career = models.Career(name=career_name)
        db.session.add(career)
        db.session.commit()
    return career


def parse_person():
    """Refresh the original name and career list of every stored Person.

    For each ``models.Person`` row the page at ``URL + person.links`` is
    downloaded through ``ProxyRequests`` and parsed:

    * ``name_original`` is taken from ``<span itemprop="alternateName">``,
      falling back to ``person.name`` when that span is missing;
    * ``person.career`` is replaced by the Career rows matching the
      ``#director`` and ``#actor`` anchors found on the page.

    The download is repeated until the response contains
    ``<h1 itemprop="name">``. There is no upper bound on the number of
    attempts. A person is skipped only when the request object cannot be
    constructed.
    """
    for person in models.Person.query.all():
        while True:
            try:
                r = ProxyRequests(f'{URL}{person.links}')
            except Exception:
                # Cannot even build the request: give up on this person.
                break
            r.get()
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.request, 'html.parser')
            if not soup.find('h1', {'itemprop': 'name'}):
                # Not a person page (no name heading): fetch again.
                continue

            alternate_name = soup.find('span', {'itemprop': 'alternateName'})
            if alternate_name:
                person.name_original = alternate_name.text
            else:
                person.name_original = person.name

            # Same order as before: director first, then actor.
            careers = []
            for href in ('#director', '#actor'):
                anchor = soup.find('a', {'href': href})
                if anchor:
                    careers.append(_career_from_anchor(anchor))

            person.career.clear()
            for career in careers:
                person.career.append(career)
            # One commit stores both the name and the new career list.
            db.session.add(person)
            db.session.commit()
            break
コード例 #2
0
def parse_links():
    """Collect film links from the paginated listing and store new films.

    Pages ``URL/top/navigator/.../page/<n>/`` are fetched one by one through
    ``ProxyRequests``. The number of the last page is read from the final
    ``<li class="arr">`` link while ``last_page`` is still 1. For every
    ``<div class="_NO_HIGHLIGHT_">`` entry a ``models.Film`` row is inserted
    unless a row with the same ``id_film`` already exists.

    Inserts whose commit fails are rolled back and recorded as
    ``{name: {page: link}}``; that mapping is written to ``data.txt`` as JSON
    when the walk finishes.

    A page is fetched again, with no upper bound on attempts, when the
    last-page link cannot be read or when it contains no film entries.
    """
    page = 1
    last_page = 1
    data = {}

    while page <= last_page:
        r = ProxyRequests(f'{URL}/top/navigator/m_act[rating]/1%3A/order/rating/page/{page}/#results')
        r.get()
        r.encoding = 'utf-8'
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(r.request, 'html.parser')
        if last_page == 1:
            try:
                last_link = soup.find_all('li', {'class': 'arr'})[-1].find('a').get('href')
                last_page = int(re.findall(r'\d{2,}', last_link)[0])
            except (AttributeError, IndexError, TypeError, ValueError):
                # Pagination block missing or malformed: fetch the page again.
                continue

        movie_link = soup.find_all('div', {'class': '_NO_HIGHLIGHT_'})
        if not movie_link:
            continue

        for entry in movie_link:
            # The entry is re-parsed as its own fragment, so the search below
            # also considers the entry's root element itself.
            anchor = BeautifulSoup(str(entry), 'html.parser').find(
                'div', {'class': 'name'}).find('a')
            i_text = anchor.text
            i_link = anchor.get('href')
            # The second number in the link is used as the film id.
            id_film = int(re.findall(r'\d{1,}', i_link)[1])
            if models.Film.query.filter_by(id_film=id_film).first() is None:
                film = models.Film(id_film=id_film, links=i_link, name=i_text)
                db.session.add(film)
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                    # Remember the failed insert for the report file.
                    data[i_text] = {page: i_link}

        page += 1
    with open('data.txt', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
コード例 #3
0
def _get_or_create(model, lookup, **extra):
    """Return the *model* row matching *lookup*, inserting it when absent.

    *lookup* is a dict of column values passed to ``filter_by``; *extra*
    holds additional column values used only when a new row is built.
    A failed commit is rolled back and the whole step is repeated, starting
    with a fresh query, so a row that appeared in the meantime is returned
    instead of being inserted again. The number of attempts is unbounded.
    """
    while True:
        row = model.query.filter_by(**lookup).first()
        if row is not None:
            return row
        row = model(**lookup, **extra)
        db.session.add(row)
        try:
            db.session.commit()
        except Exception:
            db.session.rollback()
        else:
            return row


def _fetch_film_soup(film):
    """Download *film*'s page and return it parsed with BeautifulSoup.

    The download is repeated until the response contains
    ``<span itemprop="genre">``; the number of attempts is unbounded.
    Returns ``None`` when the request object cannot be constructed.
    """
    while True:
        try:
            r = ProxyRequests(f'{URL}{film.links}')
        except Exception:
            return None
        r.get()
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.request, 'html.parser')
        if soup.find('span', {'itemprop': 'genre'}):
            return soup


def _scrape_film_page(soup):
    """Extract everything ``parse_films`` stores from a parsed film page.

    Returns a tuple ``(fields, genre_names, country_names, people)``:

    * ``fields`` maps Film attribute names to their new values;
    * ``genre_names`` / ``country_names`` are lists of link texts;
    * ``people`` is a list of ``(id_person_kp, name, link)`` tuples.

    Raises ``AttributeError``, ``IndexError``, ``TypeError`` or
    ``ValueError`` when an expected node is missing or a number cannot be
    converted. No database access happens here.
    """
    name = soup.find('span', {'class': 'moviename-title-wrapper'}).text
    headline = soup.find('span', {'class': 'alternativeHeadline'})
    relative_divs = soup.find_all('div', {'style': 'position: relative'})
    imdb_text = soup.find('div', {
        'style': 'color:#999;font:100 11px tahoma, verdana'
    }).text

    fields = {
        'name': name,
        # Fall back to the display name when the headline is absent or empty.
        'name_original': (headline.text
                          if headline is not None and headline.text
                          else name),
        'description': soup.find('div', {
            'itemprop': 'description'
        }).text.replace(chr(151), '-'),
        'rating_kp': float(soup.find('span', {'class': 'rating_ball'}).text),
        'rating_imdb': float(re.findall(r'[\d][^ ]+', imdb_text)[0]),
        # The first "position: relative" div supplies the release value,
        # the second one the country links.
        'date_released': int(relative_divs[0].find('a').text),
    }

    genre_names = [
        a.text
        for a in soup.find('span', {'itemprop': 'genre'}).find_all('a')
    ]
    country_names = [a.text for a in relative_divs[1].find_all('a')]

    person_tags = list(soup.find_all('li', {'itemprop': 'actors'}))
    person_tags.extend(soup.find_all('td', {'itemprop': 'director'}))
    people = []
    for tag in person_tags:
        anchor = tag.find('a')
        # Skip "..." placeholder entries instead of stopping at them, so
        # the directors appended after the actors are still processed.
        if anchor is None or anchor.text.replace(' ', '') == '...':
            continue
        link = anchor.get('href')
        id_person_kp = int(re.findall(r'\d{1,}', link)[0])
        people.append((id_person_kp, tag.text, link))

    return fields, genre_names, country_names, people


def parse_films():
    """Fill in details for every Film whose ``rating_kp`` is still unset.

    For each such film the page at ``URL + film.links`` is downloaded and
    scraped for name, original name, description, both ratings, release
    value, genres, countries and people (actors and directors). Missing
    Genre, Country and Person rows are created. The film's scalar fields
    and its ``genre`` / ``country`` / ``person`` collections are then
    replaced and stored in a single commit.

    A film is skipped when no request can be built for it or when its page
    lacks one of the expected nodes. A skipped film keeps ``rating_kp`` as
    ``None``, so the same query selects it again on a later run. Failed
    commits are rolled back and retried without an upper bound.
    """
    import logging

    logger = logging.getLogger(__name__)

    for film in models.Film.query.filter_by(rating_kp=None).all():
        soup = _fetch_film_soup(film)
        if soup is None:
            continue

        # Parse everything before touching the database: a parse failure on
        # this page would repeat identically, so retrying it is pointless.
        try:
            fields, genre_names, country_names, people = _scrape_film_page(soup)
        except (AttributeError, IndexError, TypeError, ValueError):
            logger.warning('skipping film %s: page could not be parsed',
                           film.links, exc_info=True)
            continue

        list_genres = [
            _get_or_create(models.Genre, {'name': genre_name})
            for genre_name in genre_names
        ]
        list_countrys = [
            _get_or_create(models.Country, {'name': country_name})
            for country_name in country_names
        ]
        list_person = []
        for id_person_kp, person_name, person_link in people:
            person = _get_or_create(models.Person,
                                    {'id_person_kp': id_person_kp},
                                    name=person_name,
                                    links=person_link)
            if person not in list_person:
                list_person.append(person)

        while True:
            # Re-applied on every attempt: a rollback discards the pending
            # attribute and collection changes.
            for attribute, value in fields.items():
                setattr(film, attribute, value)
            film.genre.clear()
            film.country.clear()
            film.person.clear()
            for genre in list_genres:
                film.genre.append(genre)
            for country in list_countrys:
                film.country.append(country)
            for person in list_person:
                film.person.append(person)
            db.session.add(film)
            try:
                db.session.commit()
            except Exception:
                db.session.rollback()
            else:
                break