def _get_or_create_career(name):
    """Return the Career row with this name, creating and committing it first if absent."""
    career = models.Career.query.filter_by(name=name).first()
    if career is None:
        career = models.Career(name=name)
        db.session.add(career)
        db.session.commit()
    return career


def parse_person():
    """Fill in ``name_original`` and the career links for every stored Person.

    For each person, fetch their page through a proxy, read the
    'alternateName' span (falling back to the stored name), then rebuild
    the person's career associations from the '#director' / '#actor'
    anchors. A page that fails the sanity check (no ``<h1 itemprop="name">``)
    is re-fetched; a proxy-construction failure skips the person.
    """
    for person in models.Person.query.all():  # .filter_by(name_original=None)
        while True:
            try:
                r = ProxyRequests(f'{URL}{person.links}')
            except Exception:
                # Could not build a proxied request for this person; move on.
                break
            r.get()
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.request, 'html.parser')
            if not soup.find('h1', {'itemprop': 'name'}):
                # Page did not render the person header — re-fetch it.
                continue

            alternate_name = soup.find('span', {'itemprop': 'alternateName'})
            person.name_original = alternate_name.text if alternate_name else person.name
            db.session.add(person)
            db.session.commit()

            careers = []
            for anchor_href in ('#director', '#actor'):
                anchor = soup.find('a', {'href': anchor_href})
                if anchor:
                    careers.append(_get_or_create_career(anchor.text.replace(' ', '')))

            # Replace (not extend) the person's career associations.
            person.career.clear()
            for career in careers:
                person.career.append(career)
            db.session.add(person)
            db.session.commit()
            break
def parse_links():
    """Walk the top-rated navigator pages, storing new Film rows.

    The first successfully fetched page is also used to discover the last
    page number from the pagination arrows. Pages that come back broken or
    empty (e.g. a bad proxy) are retried. A name -> {page: link} map of the
    films seen as new is dumped to ``data.txt`` at the end.
    """
    page = 1
    last_page = 1
    data = {}
    while page <= last_page:
        r = ProxyRequests(f'{URL}/top/navigator/m_act[rating]/1%3A/order/rating/page/{page}/#results')
        r.get()
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.request, 'html.parser')

        if last_page == 1:
            try:
                last_link = soup.find_all('li', {'class': 'arr'})[-1].find('a').get('href')
                last_page = int(re.findall(r'\d{2,}', last_link)[0])
            except (IndexError, AttributeError, ValueError):
                # Pagination block missing or malformed — the page is broken;
                # fetch it again rather than trusting its content.
                continue

        movie_blocks = soup.find_all('div', {'class': '_NO_HIGHLIGHT_'})
        if not movie_blocks:
            continue  # empty page (proxy error) — retry the same page

        for block in movie_blocks:
            # The 'b' prefix before re-parsing is preserved from the original
            # code — presumably padding to work around a parser quirk; confirm.
            anchor = BeautifulSoup(f'b{block}', 'html.parser').find('div', {'class': 'name'}).find('a')
            film_name = anchor.text
            film_link = anchor.get('href')
            id_film = int(re.findall(r'\d{1,}', film_link)[1])
            if models.Film.query.filter_by(id_film=id_film).first() is None:
                db.session.add(models.Film(id_film=id_film, links=film_link, name=film_name))
                try:
                    db.session.commit()
                except Exception:
                    db.session.rollback()
                data[film_name] = {page: film_link}
        page += 1

    with open('data.txt', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
def _get_or_create_named(model, name):
    """Return the row of `model` with this name, creating it if absent.

    Retries the insert after a rollback (e.g. a uniqueness race with a
    parallel worker), re-checking existence each time so the loop always
    terminates.
    """
    while True:
        row = model.query.filter_by(name=name).first()
        if row is not None:
            return row
        row = model(name=name)
        db.session.add(row)
        try:
            db.session.commit()
            return row
        except Exception:
            db.session.rollback()


def _get_or_create_person(name, link):
    """Return the Person for this profile link (keyed by the numeric id in the
    link), creating and committing it if absent. Same retry shape as
    ``_get_or_create_named``."""
    id_person_kp = int(re.findall(r'\d{1,}', link)[0])
    while True:
        row = models.Person.query.filter_by(id_person_kp=id_person_kp).first()
        if row is not None:
            return row
        row = models.Person(name=name, links=link, id_person_kp=id_person_kp)
        db.session.add(row)
        try:
            db.session.commit()
            return row
        except Exception:
            db.session.rollback()


def parse_films():
    """Scrape details for every Film that has no rating yet.

    For each film page: parse genres, countries, the cast and the first
    director; get-or-create the corresponding rows; then fill the film's
    scalar fields (names, description, ratings, release year) and rebuild
    its genre/country/person associations. A page that fails to parse is
    re-fetched; a proxy-construction failure skips the film.
    """
    for film in models.Film.query.filter_by(rating_kp=None).all():
        while True:  # retry loop: one iteration per fetch attempt of this film
            try:
                r = ProxyRequests(f'{URL}{film.links}')
            except Exception:
                break  # cannot build a proxied request — skip this film
            r.get()
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.request, 'html.parser')

            genre_span = soup.find('span', {'itemprop': 'genre'})
            # Guard against a missing genre span (the original iterated None).
            genre_anchors = genre_span.find_all('a') if genre_span else []
            country_anchors = soup.find_all('div', {'style': 'position: relative'})[1].find_all('a')
            persons = soup.find_all('li', {'itemprop': 'actors'})
            # Only the first director is kept (the original broke after one).
            directors = soup.find_all('td', {'itemprop': 'director'})
            if directors:
                persons.append(directors[0])

            list_genres = [_get_or_create_named(models.Genre, a.text) for a in genre_anchors]
            list_countries = [_get_or_create_named(models.Country, a.text) for a in country_anchors]

            list_person = []
            for person in persons:
                anchor = person.find('a')
                if anchor.text.replace(' ', '') == '...':
                    break  # '...' marks the truncated tail of the cast list
                row = _get_or_create_person(person.text, anchor.get('href'))
                if row not in list_person:
                    list_person.append(row)

            parsed = True
            while True:
                try:
                    film.name = soup.find('span', {'class': 'moviename-title-wrapper'}).text
                    alt = soup.find('span', {'class': 'alternativeHeadline'}).text
                    film.name_original = alt if alt else film.name
                    # chr(151) is the Windows-1252 em dash left over from the
                    # page encoding; normalise it to a plain hyphen.
                    film.description = soup.find('div', {'itemprop': 'description'}).text.replace(chr(151), '-')
                    film.rating_kp = float(soup.find('span', {'class': 'rating_ball'}).text)
                    film.rating_imdb = float(re.findall(
                        r'[\d][^ ]+',
                        soup.find('div', {'style': 'color:#999;font:100 11px tahoma, verdana'}).text)[0])
                    film.date_released = int(soup.find('div', {'style': 'position: relative'}).find('a').text)
                    try:
                        db.session.commit()
                    except Exception:
                        db.session.rollback()
                        continue  # commit of scalar fields failed — retry them

                    # Rebuild the associations from scratch.
                    film.genre.clear()
                    film.country.clear()
                    film.person.clear()
                    while True:
                        for g in list_genres:
                            film.genre.append(g)
                        for c in list_countries:
                            film.country.append(c)
                        for p in list_person:
                            film.person.append(p)
                        db.session.add(film)
                        try:
                            db.session.commit()
                            break
                        except Exception:
                            # NOTE(review): on retry the in-memory collections
                            # are re-appended without clearing, as the original
                            # did — confirm the rollback restores them.
                            db.session.rollback()
                    break
                except Exception:
                    # Parse failed (page incomplete). The original retried the
                    # same soup forever; re-fetch the page instead.
                    db.session.rollback()
                    parsed = False
                    break

            if parsed:
                # Done with this film. The original never broke here, so the
                # fetch loop re-requested each finished film forever.
                break
            # else: fall through and re-fetch the page