def load(self):
    url_from_name = form_url_from_name(self.film.name_orig)
    if url_from_name is not None:
        reqobj = simple_tor_get_page(url_from_name)
        html_with_type = HTML_with_type(reqobj.decode('utf-8'))
        len(html_with_type)
        html_with_type.page_type = 'film_page'
        return {'html': html_with_type, 'url_view': url_from_name, 'url': url_from_name}
    else:
        html_with_type = HTML_with_type(simple_tor_get_page(form_search_url(self.film.name)))
        html_with_type.page_type = 'search_page'
        return {'html': html_with_type, 'url_view': url_from_name, 'url': url_from_name}
def get_film_data(self):
    search_film_url = '/hbo/api/v1/films.json?'
    site_name = 'www.amediateka.ru'
    filter_film_search = 'limit=1000&offset=0&expand=genres&client_id=amediateka&platform=desktop'
    url = "http://{}{}{}".format(site_name, search_film_url, filter_film_search)
    response = simple_tor_get_page(url)
    data_site = json.loads(response)['films']
    film = Films.objects.values_list('id', 'name')
    data = film.values('name', 'id')
    locations = {
        'info': [],
        'type': 'amediateka_ru'
    }
    for f in data_site:
        for film in data:
            if f['name'] == film['name']:
                film_data = Films.objects.filter(id=film['id'])
                for dict_film in film_data:
                    d = self.film_dict(dict_film, f)
                    one_loc_res = save_location(**d)
                    save_existed_location_to_locs_dict(locations, one_loc_res)
                break
    fill_log_table_for_not_schema_corresponded_robots(locations)
    robot_is_banned = MultiLocationRobotsBunCheck.is_result_looks_like_robot_banned(locations)
    if not robot_is_banned:
        LocationRobotsCorrector.correct_locations(locations, 'amediateka')
    return locations
def update_kinopoisk_persone(pid):
    try:
        response = simple_tor_get_page('http://www.kinopoisk.ru/name/{}/view_info/ok/#trivia'.format(pid), True)
        soup = BeautifulSoup(response)
        tag = soup.find('span', attrs={'itemprop': 'alternativeHeadline'})
        orig_name = tag.text.strip()
        p = Persons.objects.get(kinopoisk_id=pid)
        tag_birthdate = soup.find('td', attrs={'class': 'birth'})
        birthdate = ''
        print "ID = ", p.id
        if tag_birthdate is not None:
            birthdate = tag_birthdate.get('birthdate')
        else:
            print 'No birthdate data for this person id = {}'.format(pid)
        tags_bio = soup.findAll('li', attrs={'class': 'trivia'})
        bio = ''
        if len(tags_bio):
            for li in tags_bio:
                bio = bio + ' ' + li.text
        else:
            print 'No biography for this person id = {}'.format(pid)
        p.name_orig = orig_name
        p.birthdate = birthdate
        p.bio = bio
        p.kinopoisk_id = pid
        if p.photo == '' and p.kinopoisk_id != 0:
            p.photo.save('profile.jpg', File(get_photo(p.kinopoisk_id)))
        p.save()
    except Exception, e:
        import traceback
        traceback.print_exc()
def parse(self, response, dict_gen, film, url):
    d = dict_gen(film)
    content = simple_tor_get_page(url)
    value = ''
    isFilm = False
    try:
        soup = BeautifulSoup(content)
        tag = soup.find('meta', attrs={'property': 'og:type'})
        if (tag is not None and film.type == APP_FILM_FULL_FILM) \
                or (film.type == APP_FILM_SERIAL and tag is None):
            tag = soup.find('div', attrs={'class': 'big_rating'})
            tag_id = tag.get('id')
            value = self.parse_value + tag_id
            isFilm = True
    except:
        pass
    if isFilm:
        d['url_view'] = url
        d['value'] = value
        d['price_type'] = 0
        d['price'] = self.get_price()
        d['type'] = 'zoomby'
        return [d]
    return []
def get_film_data(self):
    url = 'http://my.mail.ru/video/catalog/movies'
    content = simple_tor_get_page(url)
    soup = BeautifulSoup(content)
    locations = {
        'info': [],
        'type': 'mail_ru'
    }
    items = soup.findAll('a', {'class': 'link-default'})
    for item in items:
        film_name = item.text
        film_link = item.get('href')
        film_url = HOST + film_link
        film_dict = self.get_film_dict(film_name)
        if film_dict is not None:
            film_dict['type'] = 'mail_ru'
            film_dict['url_view'] = film_url
            film_dict['price'] = 0
            film_dict['price_type'] = APP_CONTENTS_PRICE_TYPE_FREE
            one_loc_res = save_location(**film_dict)
            save_existed_location_to_locs_dict(locations, one_loc_res)
    fill_log_table_for_not_schema_corresponded_robots(locations)
    return locations
def parse_search(response, film_name, year):
    film_link = None
    try:
        soup = BeautifulSoup(response)
        if soup.find('div', {'class': 'empty-search'}) is not None:
            return None
        search_div = soup.find('div', {'class': 'card-list'})
        film_divs = search_div.find_all(
            'div', {'class': ['card', 'no-rationale', 'tall-cover', 'movies tiny']})
        for film in film_divs:
            film_tag = film.find('a', {'class': 'title'})
            if film_name == film_tag.get('title'):
                film_link = 'http://play.google.com' + film_tag.get('href')
        if film_link:
            page = simple_tor_get_page(film_link)
            soup_page = BeautifulSoup(page)
            film_year = soup_page.find('div', {'itemprop': 'datePublished'}).text
            film_year = re.search(ur'\d+', film_year)
            if str(year) not in film_year.group():
                return None
    except IndexError:
        film_link = None
    return film_link
def get_data(self):
    all_film_url = 'http://viaplay.ru/filmy/vse/5/alphabetical'
    locations = {
        'info': [],
        'type': 'viaplay'
    }
    content = simple_tor_get_page(all_film_url)
    soup_films = BeautifulSoup(content).find('ul', {'class': 'atoz-list'}).li.ul.find_all('li')
    films = Films.objects.values('name', 'id')
    for li_film in soup_films:
        for film in films:
            site_name = li_film.a.text.lower().strip().encode('utf-8').translate(None, string.punctuation)
            db_name = film['name'].lower().strip().encode('utf-8').translate(None, string.punctuation)
            if site_name == db_name:
                link = 'http://viaplay.ru' + li_film.a.get('href')
                film_query_set = Films.objects.filter(id=film['id'])
                for obj in film_query_set:
                    d = self.film_dict(obj, link)
                    one_loc_res = save_location(**d)
                    save_existed_location_to_locs_dict(locations, one_loc_res)
                break
    fill_log_table_for_not_schema_corresponded_robots(locations)
    robot_is_banned = MultiLocationRobotsBunCheck.is_result_looks_like_robot_banned(locations)
    if not robot_is_banned:
        LocationRobotsCorrector.correct_locations(locations, 'viaplay')
    return locations
def get_soup():
    '''
    Get data from the playfamily site and parse its XML into a BeautifulSoup object.
    '''
    xmldata = simple_tor_get_page(PLAYFAMILY_XML)
    soup = BeautifulSoup(xmldata, 'xml')
    return soup
def get_image(template, actor_id):
    try:
        r = simple_tor_get_page(template.format(actor_id), tor_flag=False)
        return convert_file(r)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None
def parse_translation_live_russia_tv():
    translation_list = []
    # Get current date
    current_date = timezone.now()
    # Get page
    translation_page = simple_tor_get_page(TRANSLATION_URL)
    soup = BeautifulSoup(translation_page)
    translation_bloc = soup.find('div', {'class': ['broadcasts', 'tab-non-active', 'tab-broadcasts']})
    for trans in translation_bloc.find_all('li'):
        try:
            # Get date and time
            time_tag = trans.find('div', {'class': 'time'})
            time = re.findall(ur'\d+', time_tag.text)
            date_tag = trans.find('div', {'class': 'label'})
            date_str = date_tag.text.lower().strip().split()
            if len(date_str) == 1:
                if date_str[0] == u'сегодня':
                    date = timezone.datetime(year=current_date.year, month=current_date.month,
                                             day=current_date.day, hour=int(time[0]), minute=int(time[1]))
            elif len(date_str) == 3:
                date = timezone.datetime(year=int(date_str[2]), month=MONTHS[date_str[1]],
                                         day=int(date_str[0]), hour=int(time[0]), minute=int(time[1]))
            else:
                continue
            # Get title
            title_tag = trans.h2
            title = title_tag.text
            # Get link
            href = trans.a.get('href')
            link = TRANSLATION_URL + href
            # Get video id
            video_id = href.split('/')[-1]
            # Create dict with information about the translation
            translation_data = {
                'title': title,
                'date': date,
                'price': float(0),
                'link': link,
                'meta': {},
                'embed_code': None,
                'value': video_id,
                'player': PLAYER_LINK % video_id
            }
            translation_list.append(translation_data)
        except Exception, e:
            print e.message
def __init__(self, film_id):
    self.film = Films.objects.get(id=film_id)
    search_film = urllib.urlencode({'text': self.film.name.encode('utf-8')})
    search_url = URL_SEARCH.format(search_film)
    url = "https://%s/%s" % (HOST, search_url)
    self.response = simple_tor_get_page(url)
def kinopoisk_news():
    data = BeautifulSoup(
        simple_tor_get_page(KINOPOISK_PREMIERES_URL).content.decode('cp1251'))
    big_names = data.select('span.name_big')
    names = data.select('span.name')
    for name in big_names + names:
        if name.a:
            kinopoisk_id = int(name.a.attrs['href'].split('/')[2])
            kinopoisk_name = name.a.text
            yield kinopoisk_name, kinopoisk_id
def parse_search(response, film_name, film_year):
    film_link = None
    try:
        flag = False
        soup = BeautifulSoup(response)
        if soup.find(attrs={'data-href': 'video_search'}) is None:
            return None
        class_tag = soup.find('aside', {'role': 'complementary'})
        if class_tag:
            li_list = class_tag.find_all('li')
            for li in li_list:
                if u'Видео' in li.text:
                    flag = True
                    break
        if flag:
            search_divs = soup.find_all('div', {'class': 'catalog-list search'})
            film_div = None
            for div in search_divs:
                if div.figure:
                    film_div = div
                    break
            if film_div:
                films = film_div.find_all('figure')
                for film in films:
                    search_name = film_name.lower().strip().encode('utf-8').translate(None, string.punctuation)
                    found_name = film.figcaption.div.header.strong.text.lower().strip().encode('utf-8').translate(None, string.punctuation)
                    if search_name == found_name:
                        film_link = 'http://www.zabava.ru' + film.a.get('href')
                        break
            else:
                return None
        else:
            return None
        if film_link:
            film_page = simple_tor_get_page(film_link)
            film_soup = BeautifulSoup(film_page)
            year_bloc = film_soup.find('div', {'class': 'mbottom10'})
            reg = re.compile(ur'Год издания')
            year_tag = None
            for e in year_bloc.find_all('em'):
                if reg.match(e.text):
                    year_tag = e.parent
                    break
            if year_tag:
                year = re.search(ur'\d+', year_tag.text)
                year = int(year.group())
                if film_year.year != year:
                    film_link = None
            else:
                return None
    except:
        film_link = None
    return film_link
def get_vote(soup):
    csslink = [lnk.attrs['href'] for lnk in soup.find_all('link')
               if 'votes' in lnk.attrs['href']][0]
    # TODO implement caching
    css = simple_tor_get_page(csslink, tor_flag=True)
    m = re.search('[.]starbar[ ]{width[:][ ](?P<width>[0-9]+)px', css)
    parent_width = float(m.groupdict()['width'])
    starbar_div = soup.select('div.starbar_w')
    child_width = float(dict([i.split(':') for i in starbar_div[0].attrs['style'].split(';')])['width'].replace('px', ''))
    return round(child_width / (parent_width * 10), 2)
def get_data(self):
    all_film_url = 'http://viaplay.ru/filmy/vse/5/alphabetical'
    content = simple_tor_get_page(all_film_url)
    soup_films = BeautifulSoup(content).find('ul', {'class': 'atoz-list'}).li.ul.find_all('li')
    films = Films.objects.values('name', 'id')
    for li_film in soup_films:
        for film in films:
            if li_film.a.text == film['name']:
                link = 'http://viaplay.ru' + li_film.a.get('href')
                film_query_set = Films.objects.filter(id=film['id'])
                for obj in film_query_set:
                    d = self.film_dict(obj, link)
                    save_location(**d)
                break
def parse_news(robot_name):
    # Get site
    site = SITE_DICT[robot_name]
    # Get page
    page = simple_tor_get_page(site['news_url'])
    soup = BeautifulSoup(page)
    # Constant for punctuation
    punctuation = string.punctuation
    trans = string.maketrans(punctuation, ' ' * len(punctuation))
    # String constant
    pattern = '!"#$%&\'()*+,-./:;<=>–№«»?@[\\]^_`{|}~ '  # ascii: 160
    cursor = connection.cursor()
    # Get films from page
    films = soup.find_all(*site['films_tag_args'])
    news = []
    for film in films:
        try:
            # Find name from tag
            name_tag = film.find(*site['name_tag_args'])
            name = name_tag.text.lower().strip().encode('utf-8').translate(trans)
            # Find year from tag
            year_tag = film.find(*site['year_tag_args'])
            year = int(re.search(ur'\d+', year_tag.text).group())
            query = """SELECT *
                       FROM (SELECT films.id, films.name,
                                    EXTRACT(YEAR FROM films.release_date) AS year,
                                    regexp_split_to_array(trim(both lower(translate(films.name, E%s, %s))), E'\\s+') AS new_name
                             FROM films) AS t
                       WHERE t.year=%s AND t.new_name=regexp_split_to_array(%s, E'\\s+')"""
            cursor.execute(query, [pattern, ' ' * len(pattern), year, name])
            result = dict_fetch_all_without_gen(cursor)
            if len(result) == 1:
                link_tag = name_tag if robot_name == ROBOT_STREAM else name_tag.a
                link = site['host'] + link_tag.get('href')
                news.append({'film_id': result[0]['id'], 'url': link})
        except Exception, e:
            print e.message
def get_film_data(self):
    search_film_url = '/hbo/api/v1/films.json?'
    filter_film_search = 'limit=1000&offset=0&expand=genres&client_id=amediateka&platform=desktop'
    url = "http://{}{}{}".format('www.amediateka.ru', search_film_url, filter_film_search)
    response = simple_tor_get_page(url)
    data_site = json.loads(response)['films']
    film = Films.objects.values_list('id', 'name')
    data = film.values('name', 'id')
    for f in data_site:
        for film in data:
            if f['name'] == film['name']:
                film_data = Films.objects.filter(id=film['id'])
                for dict_film in film_data:
                    d = self.film_dict(dict_film, f)
                    save_location(**d)
                break
def get_serials_data(self):
    search_serials_url = '/hbo/api/v1/serials.json?'
    filter_serials_search = 'limit=1000&offset=0&expand=seasons,genres&client_id=amediateka&platform=desktop'
    url = "http://{}{}{}".format('www.amediateka.ru', search_serials_url, filter_serials_search)
    response = simple_tor_get_page(url)
    data_site = json.loads(response)['serials']
    data = Films.objects.values_list('id', 'name').values('name', 'id')
    for s in data_site:
        for serials in data:
            if s['name'] == serials['name']:
                serials_data = Films.objects.filter(id=serials['id'])
                for dict_serials in serials_data:
                    if dict_serials.type == APP_FILM_SERIAL:
                        list_serial = self.serial_dict(dict_serials, s)
                        for ser in list_serial:
                            save_location(**ser)
                break
def parse_translation_championat_com():
    translation_list = []
    champ_dict = {}
    # Get page
    list_translations_page = simple_tor_get_page(TRANSLATION_URL + '/broadcast/')
    list_translations_soup = BeautifulSoup(list_translations_page)
    championship_bloc = list_translations_soup.find('div', {'class': 'broadcast__menu'})
    # Create championship map
    for champ in championship_bloc.find_all('div', {'class': 'broadcast__menu__i'}):
        try:
            img = champ.find('div', {'class': 'broadcast__tournament'})
            if img:
                champ_dict[img.get('class')[-1]] = champ.p.text
        except Exception, e:
            print e.message
def acquire_page(page_id):
    if not os.path.exists(PAGE_ARCHIVE):
        os.mkdir(PAGE_ARCHIVE)
    dump_path = os.path.join(PAGE_ARCHIVE, str(page_id))
    page_dump = ''
    if os.path.exists(dump_path):
        with open(dump_path) as fd:
            page_dump = fd.read().decode('utf-8')
    if not page_dump:
        url = u"http://www.kinopoisk.ru/film/%d/cast/" % page_id
        res = simple_tor_get_page(url, tor_flag=True)
        page_dump = res.decode('cp1251')
        with open(dump_path, 'w') as fdw:
            fdw.write(page_dump.encode('utf-8'))
    return page_dump
def get_vote(soup):
    csslink = [
        lnk.attrs['href'] for lnk in soup.find_all('link')
        if 'votes' in lnk.attrs['href']
    ][0]
    # TODO implement caching
    css = simple_tor_get_page(csslink, tor_flag=True)
    m = re.search('[.]starbar[ ]{width[:][ ](?P<width>[0-9]+)px', css)
    parent_width = float(m.groupdict()['width'])
    starbar_div = soup.select('div.starbar_w')
    child_width = float(
        dict([i.split(':') for i in starbar_div[0].attrs['style'].split(';')])['width'].replace('px', ''))
    return round(child_width / parent_width * 10, 2)
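# A minimal, self-contained sketch of the star-bar arithmetic in get_vote above,
# following the child_width / parent_width * 10 variant. The pixel widths are
# made-up example values, not data scraped from the site: the CSS rule supplies
# the full bar width, the inline style supplies the filled portion.
parent_width = 250.0  # hypothetical '.starbar {width: 250px}' rule from the votes stylesheet
child_width = 212.5   # hypothetical 'width:212.5px' style on the div.starbar_w element
vote = round(child_width / parent_width * 10, 2)  # filled fraction scaled to a 10-point rating
print(vote)  # 8.5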
def acquire_page(page_id, force_reload=False):
    if not os.path.exists(PAGE_ARCHIVE):
        os.mkdir(PAGE_ARCHIVE)
    page_dump = ''
    dump_path = os.path.join(PAGE_ARCHIVE, str(page_id))
    if os.path.exists(dump_path):
        with open(dump_path) as fd:
            page_dump = fd.read().decode('utf-8')
    if not page_dump or force_reload:
        url = u"http://www.kinopoisk.ru/film/%d/cast/" % page_id
        res = simple_tor_get_page(url, tor_flag=True)
        page_dump = res.decode('cp1251')
        with open(dump_path, 'w') as fdw:
            fdw.write(page_dump.encode('utf-8'))
    return page_dump
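# Hypothetical usage sketch for acquire_page above. The numeric page id is a
# made-up example value, and PAGE_ARCHIVE is assumed to point at a writable
# directory configured elsewhere in the project.
cast_html = acquire_page(12345)                     # served from the local dump when one exists
cast_html = acquire_page(12345, force_reload=True)  # always re-fetched via Tor and re-cached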
def get_data(self):
    try:
        films = json.loads(self.response)['live_search']['search_movies_result']
        for film in films:
            if film['rus_title'] == self.film.name:
                film_link = 'https://www.ayyo.ru/movies/%s/' % film['slug']
                ayyo_film_id = film['movie']
                break
        film_url = 'https://www.ayyo.ru/api/movies/?{}'.format(
            urllib.urlencode({'id__in': ayyo_film_id}))
        film_response = simple_tor_get_page(film_url)
        price = float(
            json.loads(film_response)['movies']['data'][str(ayyo_film_id)]['streaming_price'])
        d = self.film_dict(self.film, film_link, price)
        save_location(**d)
    except Exception, e:
        pass
def get_data(self):
    locations = {
        'info': [],
        'type': 'www.ayyo.ru'
    }
    try:
        films = json.loads(self.response)['live_search']['search_movies_result']
        print films
        for film in films:
            site_name = film['rus_title'].lower().strip().encode('utf-8').translate(None, string.punctuation)
            db_name = self.film.name.lower().strip().encode('utf-8').translate(None, string.punctuation)
            if site_name == db_name:
                film_link = 'https://www.ayyo.ru/movies/%s/' % film['slug']
                ayyo_film_id = film['movie']
                break
        film_url = 'https://www.ayyo.ru/api/movies/?{}'.format(urllib.urlencode({'id__in': ayyo_film_id}))
        film_response = simple_tor_get_page(film_url)
        price = float(json.loads(film_response)['movies']['data'][str(ayyo_film_id)]['streaming_price'])
        d = self.film_dict(self.film, film_link, price)
        one_loc_res = save_location(**d)
        save_existed_location_to_locs_dict(locations, one_loc_res)
    except Exception, e:
        pass
def kinopoisk_films(pages):
    try:
        for page in range(1, pages + 1):
            print u"Page number: {0} of {1}".format(page, pages)
            html = simple_tor_get_page(KINOPOISK_LIST_FILMS_URL.format(page), tor_flag=True)
            soup = BeautifulSoup(html)
            films_list = soup.findAll('div', attrs={'class': 'name'})
            for film in films_list:
                name = film.a.text
                print u"Film name: {0}".format(name)
                kinopoisk_id = int(film.a.get('href').split('/')[4])
                if u'(сериал)' in name:
                    name = name.replace(u'(сериал)', u'')
                film, flag = Films.objects.get_or_create(
                    kinopoisk_id=kinopoisk_id, defaults={'type': '', 'name': name})
                print u"Film: {0} {1}".format(film.name, film.kinopoisk_id)
                kinopoisk_parse_one_film.apply_async((film.kinopoisk_id, film.name))
                persons_films_update_with_indexes.apply_async((film.kinopoisk_id,))
    except Exception, e:
        import traceback
        traceback.print_exc()
def update_kinopoisk_persone(pid):
    try:
        response = simple_tor_get_page('http://www.kinopoisk.ru/name/{}/view_info/ok/#trivia'.format(pid), True)
        soup = BeautifulSoup(response)
        tag = soup.find('span', attrs={'itemprop': 'alternativeHeadline'})
        orig_name = tag.text.strip()
        p = Persons.objects.get(kinopoisk_id=pid)
        tag_birthdate = soup.find('td', attrs={'class': 'birth'})
        birthdate = ''
        print "ID = ", p.id
        if tag_birthdate is not None:
            birthdate = tag_birthdate.get('birthdate')
        else:
            print 'No birthdate data for this person id = {}'.format(pid)
        bio = ''
        tags_bio = soup.findAll('li', attrs={'class': 'trivia'})
        if len(tags_bio):
            for li in tags_bio:
                bio = bio + ' ' + li.text
        else:
            print 'No biography for this person id = {}'.format(pid)
        p.bio = bio
        p.kinopoisk_id = pid
        p.name_orig = orig_name
        p.birthdate = birthdate
        if p.photo == '' and p.kinopoisk_id != 0:
            p.photo.save('profile.jpg', File(get_photo(p.kinopoisk_id)))
        p.save()
    except Exception, e:
        traceback_own(e)
def parse_tvzor_news():
    # Get page
    page = simple_tor_get_page(NEW_URL)
    # Constant for punctuation
    punctuation = string.punctuation
    trans = string.maketrans(punctuation, ' ' * len(punctuation))
    # String constant
    pattern = '!"#$%&\'()*+,-./:;<=>–№«»?@[\\]^_`{|}~ '  # ascii: 160
    cursor = connection.cursor()
    # Get films from page in json format
    films = json.loads(page)
    news = []
    for film in films:
        try:
            # Film name
            name = film['name'].lower().strip().encode('utf-8').translate(trans)
            # Film year
            year = int(film['releaseDate'])
            query = """SELECT *
                       FROM (SELECT films.name, films.id,
                                    EXTRACT(YEAR FROM films.release_date) AS year,
                                    regexp_split_to_array(trim(both lower(translate(films.name, E%s, %s))), E'\\s+') AS new_name
                             FROM films) AS t
                       WHERE t.year=%s AND t.new_name=regexp_split_to_array(%s, E'\\s+')"""
            cursor.execute(query, [pattern, ' ' * len(pattern), year, name])
            result = dict_fetch_all(cursor)
            if len(result) == 1:
                link = HOST + '/movie/' + film['assetId']
                news.append({'film_id': result[0]['id'], 'url': link})
        except Exception, e:
            print e.message
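# A minimal sketch of the name normalisation that the SQL in parse_news and
# parse_tvzor_news mirrors on the Python side: punctuation is translated to
# spaces, the result is lower-cased and compared as a whitespace-split word
# list. The film title below is a made-up example.
import string

punctuation = string.punctuation
trans = string.maketrans(punctuation, ' ' * len(punctuation))
name = 'Mad Max: Fury Road'.lower().strip().translate(trans)
print(name.split())  # ['mad', 'max', 'fury', 'road']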
def kinopoisk_films(pages):
    try:
        for page in range(1, pages + 1):
            print u"Page number: {0} of {1}".format(page, pages)
            html = simple_tor_get_page(KINOPOISK_LIST_FILMS_URL.format(page), tor_flag=True)
            soup = BeautifulSoup(html)
            films_list = soup.findAll('div', attrs={'class': 'name'})
            for film in films_list:
                kinopoisk_id = int(film.a.get('href').split('/')[4])
                name = film.a.text
                print u"Film name: {0}".format(name)
                if u'(сериал)' in name:
                    name = name.replace(u'(сериал)', u'')
                film, flag = Films.objects.get_or_create(
                    kinopoisk_id=kinopoisk_id, defaults={'type': '', 'name': name})
                print u"Film: {0} {1}".format(film.name, film.kinopoisk_id)
                kinopoisk_parse_one_film.apply_async((film.kinopoisk_id, film.name))
                persons_films_update_with_indexes.apply_async((film.kinopoisk_id,))
    except Exception, e:
        traceback_own(e)
def get_data_dict(kinopoisk_id):
    url = 'http://www.kinopoisk.ru/handler_trailer_popup.php?ids={id}'.format(id=kinopoisk_id)
    content = simple_tor_get_page(url)
    return json.loads(content)
def get_data_dict(kinopoisk_id):
    content = simple_tor_get_page(
        'http://www.kinopoisk.ru/handler_trailer_popup.php?ids={}'.format(kinopoisk_id))
    return json.loads(content)
            time = re.findall(ur'\d+', dates[1].text)
            # Get price
            price_tag = trans.find('div', {'class': '_paid'})
            price = re.search(ur'\d+', price_tag.text).group()
            # Get link
            link_tag = trans.find('a', {'class': 'broadcast__table__link'})
            link = TRANSLATION_URL + link_tag.get('href')
            # Get championship
            championship_img = trans.find('td', {'class': '_icon'}).div.get('class')
            championship = champ_dict[championship_img[-1]]
            # Get value from translation page
            trans_page = simple_tor_get_page(link)
            trans_soup = BeautifulSoup(trans_page)
            value_div = trans_soup.find('div', {'class': 'broadcast'})
            value = value_div.iframe.get('src')
            # Create dict with information about the translation
            translation_data = {
                'title': title,
                'date': timezone.datetime(year=current_year, month=int(date[1]), day=int(date[0]),
                                          hour=int(time[0]), minute=int(time[1])),
                'price': float(price),
                'link': link,
                'meta': {'championship': championship if championship else None},
                'value': value,
            }
            translation_list.append(translation_data)
def get_image(template, actor_id):
    try:
        result = simple_tor_get_page(template.format(actor_id), tor_flag=False)
        return convert_file(result)
    except Exception, e:
        traceback_own(e)
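# Every snippet above relies on a project-internal helper named simple_tor_get_page.
# Its real implementation is not shown in this collection; the stub below is only an
# assumption reconstructed from the call sites (a URL plus an optional tor_flag,
# returning the raw response body). Some variants above treat the return value as a
# response object (.content) while others treat it as a byte string, so this sketch
# picks the byte-string form; the requests call and the local Tor SOCKS proxy address
# are hypothetical placeholders, not the project's actual code or settings.
import requests

def simple_tor_get_page(url, tor_flag=True):
    proxies = None
    if tor_flag:
        # Assumed local Tor SOCKS proxy; requires requests[socks] to be installed.
        proxies = {'http': 'socks5h://127.0.0.1:9050',
                   'https': 'socks5h://127.0.0.1:9050'}
    response = requests.get(url, proxies=proxies, timeout=60)
    return response.content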