def get_kinoteatrua_films_and_persons():
    ''' Fetch films '''
    opener = give_me_cookie()
    source = ImportSources.objects.get(url='http://kino-teatr.ua/')
    sfilm_clean(source)
    try:
        with open('%s/dump_%s_nof_film.xml' % (settings.NOF_DUMP_PATH, source.dump), 'r') as f:
            xml_data = BeautifulSoup(f.read(), from_encoding="utf-8")
    except IOError:
        xml_data = BeautifulSoup('', from_encoding="utf-8")
    ignored = get_ignored_films()
    films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]
    data_nof_film = ''
    persons_dict = {}
    data_nof_persons = ''
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    year = datetime.datetime.now().year
    lang = Language.objects.get(name='Украинский')

    def get_persons(data):
        persons = {}
        tags = ['director', 'actor']
        for tag in tags:
            for p in data.findAll('span', itemprop=tag):
                person_id = p.a.get('href')
                person_id = long(re.findall(r'\d+', person_id)[0])
                if p.a.text:
                    persons[person_id] = p.a.text.encode('utf-8')
        return persons

    films_urls = get_kinoteatrua_films_links('http://kino-teatr.ua/films-near.phtml', 1, year, source, opener)

    for ind, film in enumerate(films_urls):
        film_ua_url = film['url'].replace(source.url, '%suk/' % source.url)
        req_text = opener.open(urllib2.Request(film_ua_url))
        if req_text.getcode() == 200:
            film_data = BeautifulSoup(req_text.read(), from_encoding="utf-8")
            persons = get_persons(film_data)
            persons_dict[film['id']] = persons
            name = film_data.find('div', {'class': 'myriadFilm'}).text.encode('utf-8')
            name = name.replace('Фільм ', '').strip()
            text = film_data.find('div', itemprop='description')
            text_data = text.findAll('p', limit=1)
            if text_data:
                text = text_data[0].text.encode('utf-8')
            else:
                text = text.text.encode('utf-8').strip()
            text = text.replace('редактирование синопсиса', '').strip()
            if text in ('Проект оголошений', 'Підготовка до зйомок'):
                text = ''
            film_slug = low(del_separator(film['name']))
            temp_film_slug = film_slug.decode('utf-8')
            if temp_film_slug not in ignored and temp_film_slug not in films_slugs:
                obj = films.get(film['id'])
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(film_slug, None, {}, {}, year, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film['id'], kid, source, name, year=film.get('year'), txt=text)
                            films[film['id']] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        if temp_film_slug not in films_slugs:
                            data_nof_film += xml_noffilm(film['name'], film_slug, None, None, film['id'], info, film['url'].encode('utf-8'), source.id)
                    if objt and not create_new:
                        try:
                            film_text = objt.text.encode('utf-8')
                        except UnicodeDecodeError:
                            film_text = objt.text
                        if film_text != text:
                            objt.text = text
                            objt.save()
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_film)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, xml_data)
    cron_success('html', source.dump, 'films', 'Укр. фильмы')

    # persons
    persons_nof_list = []
    persons_list = []
    for ind, film in enumerate(films_urls):
        req = opener.open(urllib2.Request(film['url']))
        if req.getcode() == 200:
            film_data = BeautifulSoup(req.read(), from_encoding="utf-8")
            persons = get_persons(film_data)
            for person_id, person_ru_name in persons.iteritems():
                if person_id not in persons_nof_list and person_id not in persons_list:
                    ukr_person = persons_dict.get(film['id'])
                    if ukr_person:
                        ukr_person_name = ukr_person.get(person_id)
                        if ukr_person_name:
                            ukr_person_name_slug = low(del_separator(ukr_person_name))
                            person_ru_name_slug = low(del_separator(person_ru_name))
                            person_obj = Person.objects.filter(name__name=person_ru_name_slug).exclude(kid=None)
                            if person_obj.count() == 1:
                                names = [
                                    {'name': ukr_person_name, 'status': 1},
                                    {'name': ukr_person_name_slug, 'status': 2},
                                ]
                                for i in names:
                                    name_obj, name_created = NamePerson.objects.get_or_create(
                                        name=i['name'], status=i['status'], language=lang,
                                        defaults={'name': i['name'], 'status': i['status'], 'language': lang})
                                    if name_obj not in person_obj[0].name.all():
                                        person_obj[0].name.add(name_obj)
                            else:
                                data_nof_persons += '<person name="%s" slug="%s" code="%s" name_alt="%s" slug_alt="%s"></person>' % (
                                    person_ru_name.replace('"', "'"), person_ru_name_slug, person_id,
                                    ukr_person_name.replace('"', "'"), ukr_person_name_slug)
                            persons_list.append(person_id)
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('%s_nof_person' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_persons)
    cron_success('html', source.dump, 'persons', 'Укр. персоны')
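
# The nof-film dump rewrite above round-trips the previous XML through BeautifulSoup
# (whose default parser wraps the fragment in <html><body><data>...), strips that
# wrapper, and re-wraps everything in <data>. A minimal dependency-free sketch of the
# intended result; `_rewrap_nof_dump_sketch` is illustrative only and assumes the old
# dump is already of the form '<data>...</data>':
def _rewrap_nof_dump_sketch(old_dump, new_records):
    inner = old_dump.strip()
    if inner.startswith('<data>') and inner.endswith('</data>'):
        inner = inner[len('<data>'):-len('</data>')]
    return '<data>%s%s</data>' % (inner, new_records)

# _rewrap_nof_dump_sketch('<data><film slug_ru="x"></film></data>', '<film slug_ru="y"></film>')
# -> '<data><film slug_ru="x"></film><film slug_ru="y"></film></data>'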
def get_okinoua_films():
    """ Parse films for Ukraine """
    xml = open('%s/dump_okinoua_nof_film.xml' % settings.NOF_DUMP_PATH, 'r')
    xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
    xml.close()
    films_slugs = []
    for i in xml_data.findAll('film'):
        slug = i.get('slug_ru')
        films_slugs.append(slug)
    source = ImportSources.objects.get(url='http://www.okino.ua/')
    data_nof_films = ''
    not_founded_films = []
    # identified OkinoUA films (list)
    okinoua_films = get_source_data(source, 'film', 'list')
    # identified OkinoUA cities (dict)
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')
    # identified OkinoUA cinemas (dict)
    okinoua_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    counter = 0
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        dates = []
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            for div in page.findAll('div', {'class': 'item0'}):
                for film in div.findAll('div', {'class': 'item2'}):
                    alt_name = None
                    if film.div.div.a:
                        film_name = film.div.div.a.string.encode('utf-8')
                        film_a = film.div.div.a.get('href')
                        film_id = film_a.replace('/film/', '').replace('/', '').encode('utf-8')
                        full_url = '%sfilm/%s' % (source.url, film_id)
                        req_name = urllib.urlopen(full_url)
                        if req_name.getcode() == 200:
                            filmpage = BeautifulSoup(req_name.read(), from_encoding="utf-8")
                            title = filmpage.find('div', {'class': 'item'})
                            if title.h4:
                                alt_name = title.h4.text.encode('utf-8')
                                alt_name = re.sub(r'\(.*?\)', '', alt_name).strip()
                    else:
                        film_name = film.div.div.string.strip().encode('utf-8')
                        film_id = None
                        full_url = url  # no film page link; fall back to the city listing URL
                    film_name_slug = low(del_separator(del_screen_type(film_name)))
                    if not film_id:
                        film_id = film_name_slug.decode('utf-8')
                    if film_id not in okinoua_films:
                        kid, info = film_identification(film_name_slug, None, {}, {}, source=source)
                        if kid:
                            film_obj, created = SourceFilms.objects.get_or_create(
                                source_id=film_id, source_obj=source,
                                defaults={'source_id': film_id, 'source_obj': source, 'name': film_name, 'kid': kid, 'name_alter': alt_name})
                        else:
                            slug_tag = 'slug_ru="%s"' % film_name_slug
                            if slug_tag not in data_nof_films and film_name_slug.decode('utf-8') not in films_slugs:
                                data_nof_films += xml_noffilm(film_name, film_name_slug, None, None, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                        okinoua_films.append(film_id)
        if counter % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)
    create_dump_file('okinoua_nof_film', settings.NOF_DUMP_PATH, xml_data)
    cron_success('html', 'okinoua', 'films', 'Фильмы')
def ivi_ident():
    source = ImportSources.objects.get(url='http://antipiracy.ivi.ru/')
    sfilm_clean(source)
    ignored = get_ignored_films()
    REG_YEAR = re.compile(r'(\,\s\d{4}$)|(\s\(\d{4}\)$)')
    data_nof_film = ''
    noffilms = []
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    films_data = []
    with open('%s/dump_%s.txt' % (settings.API_DUMP_PATH, source.dump), 'r') as f:
        ftype = False
        count = 0
        tmp = {}
        for line in f:
            try:
                l = line.strip()
                if l == 'ФИЛЬМЫ:':
                    ftype = True
                if ftype:
                    if l:
                        count += 1
                        if count == 1:
                            tmp['name'] = l
                        elif count == 2:
                            tmp['code'] = l
                            films_data.append(tmp)
                    else:
                        if tmp:
                            tmp = {}
                        count = 0
            except ValueError:
                pass
    for i in films_data:
        name = i['name']
        code = i['code']
        year = REG_YEAR.findall(name)
        if year:
            name_clear = REG_YEAR.sub('', name)
            year = ''.join(year[0])
            year = year.replace(',', '').replace('(', '').replace(')', '').strip()
        else:
            year = None
            name_clear = name
        name_slug = low(del_separator(name_clear))
        film_id = low(del_separator(name))
        if film_id.decode('utf-8') not in ignored and film_id not in noffilms:
            obj = films.get(film_id.decode('utf-8'))
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    kid, info = film_identification(name_slug, None, {}, {}, year=year, source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        new = create_sfilm(film_id, kid, source, name, txt=code)
                        films[film_id.decode('utf-8')] = new
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(new)
                elif not obj:
                    data_nof_film += xml_noffilm(name, name_slug, None, None, film_id, info, None, source.id)
                    noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'players', 'Онлайн плееры')
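
# The ivi dump parsed above is plain text: after a 'ФИЛЬМЫ:' marker, each record is a
# name line followed by a code line, with blank lines between records (the blank line
# after the marker is what flushes the marker itself out of `tmp`). A standalone,
# hedged restatement of that state machine over an in-memory list of lines:
def _parse_ivi_lines_sketch(lines):
    films_data, tmp, count, ftype = [], {}, 0, False
    for line in lines:
        l = line.strip()
        if l == 'ФИЛЬМЫ:':
            ftype = True
        if ftype:
            if l:
                count += 1
                if count == 1:
                    tmp['name'] = l
                elif count == 2:
                    tmp['code'] = l
                    films_data.append(tmp)
            else:
                if tmp:
                    tmp = {}
                count = 0
    return films_data

# _parse_ivi_lines_sketch(['ФИЛЬМЫ:', '', 'Имя, 2014', 'ABC123', '']) ->
# [{'name': 'Имя, 2014', 'code': 'ABC123'}]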
def get_cinemaarthall_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Норильск'
    cinema_name = 'Синема-АРТ-Холл'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://cinemaarthall.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug, source_obj=source,
        defaults={'source_id': city_slug, 'source_obj': source, 'city': city, 'name': city_name})
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug, source_obj=source,
        defaults={'source_id': cinema_slug, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': cinema_name})
    dates = []
    url = '%spage/kino/films/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        show_days = data.find('div', id='datachek')
        for a in show_days.findAll('a'):
            day = a.get('href').replace('/page/kino/films/&date=', '')
            dates.append(day)
    for d in dates:
        url = '%spage/kino/films/&date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            for div in data.findAll('div', {'class': 'media-block'}):
                film_name = div.find('h3')
                if film_name:
                    film_name = film_name.string.encode('utf-8')
                    film_id = div.findAll('a', limit=1)[0].get('href').replace('/', '').encode('utf-8')
                    film_slug = del_screen_type(low(del_separator(film_name)))
                    full_url = '%spage/kino/films/%s' % (source.url, film_id)
                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                            if objt:
                                div_sess = div.find('div', {'class': 'filmr'})
                                for t in div_sess.findAll('span'):
                                    if t.string:
                                        t = t.string.split(',')[0]
                                        hours, minutes = t.split(':')
                                        day, month, year = d.split('.')
                                        dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))
                                        sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                        sch_id = sch_id.replace(' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_okinoua_distributors(request):
    form = OkinoUploadForm()
    if request.POST:
        form = OkinoUploadForm(request.POST, request.FILES)
        if form.is_valid():
            source = ImportSources.objects.get(url='http://www.okino.ua/')
            with open('%s/dump_%s_nof_film.xml' % (settings.NOF_DUMP_PATH, source.dump), 'r') as f:
                xml_data = BeautifulSoup(f.read(), from_encoding="utf-8")
            ignored = get_ignored_films()
            films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]
            today = datetime.date.today()
            films_dict = get_source_data(source, 'film', 'dict')
            releases = SourceReleases.objects.select_related('film').filter(film__source_obj=source, release__gte=today)
            releases_dict = {}
            for i in releases:
                releases_dict[i.film.source_id] = i
            data_nof_films = ''
            data = request.FILES['file'].read()
            html_data = BeautifulSoup(data, from_encoding="utf-8")
            main = html_data.find('div', {'class': 'release_list'})
            year = datetime.date.today().year
            first_h3 = main.findAll('h3', limit=1)[0]
            for div in first_h3.find_next_siblings():
                film_tag = div.find('p', {'class': 'name'})
                flag = False
                if film_tag:
                    flag = True
                    film_tag = film_tag.a
                    film_name = film_tag.string.encode('utf-8')
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\d+\/$', full_url)[0].replace('/', '').encode('utf-8')
                    film_slug = low(del_separator(film_name))
                    film_year = div.find('span', {'class': 'y'}).string.encode('utf-8').replace('(', '').replace(')', '')
                    full_url = 'http://www.okino.ua%s' % full_url
                    release_day = int(div.find('span', {'class': 'day'}).string)
                    release_month = div.find('span', {'class': 'month'}).string.encode('utf-8')
                    release_month = get_month(release_month)
                    release_date = datetime.date(year, int(release_month), release_day)
                    film_obj = films_dict.get(film_id)
                    if not film_obj:
                        kid, info = film_identification(film_slug, None, {}, {}, year=film_year, source=source)
                        if kid:
                            film_obj = SourceFilms.objects.create(
                                source_id=film_id,
                                source_obj=source,
                                name=film_name,
                                kid=kid,
                                year=film_year,
                            )
                        else:
                            temp_film_slug = film_slug.decode('utf-8')
                            if temp_film_slug not in films_slugs and temp_film_slug not in ignored:
                                films_slugs.append(film_slug.decode('utf-8'))
                                data_nof_films += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                    if film_obj:
                        for p in div.findAll('p'):
                            if p.string:
                                text = p.string.encode('utf-8')
                                if 'Дистрибьютор:' in text:
                                    distr = text.replace('Дистрибьютор: ', '').decode('utf-8')
                                    release_obj = releases_dict.get(film_id)
                                    if release_obj:
                                        if release_obj.release != release_date or release_obj.distributor != distr:
                                            release_obj.release = release_date
                                            release_obj.distributor = distr
                                            release_obj.save()
                                    else:
                                        release_obj = SourceReleases.objects.create(
                                            source_obj=source,
                                            film=film_obj,
                                            release=release_date,
                                            distributor=distr,
                                        )
                                        releases_dict[film_id] = release_obj
                if div.string:
                    year = int(re.findall(r'\d+$', div.string.encode('utf-8'))[0])
            xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
            xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)
            create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, xml_data)
            return HttpResponseRedirect(reverse('admin_source_releases_show'))
    return render_to_response('release_parser/okinoua_upload.html', {'form': form}, context_instance=RequestContext(request))
def tvzavr_ident():
    source = ImportSources.objects.get(url='http://www.tvzavr.ru/')
    sfilm_clean(source)
    path = '%s/dump_%s_index.xml' % (settings.API_DUMP_PATH, source.dump)
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    with open(path, 'r') as f:
        data = BeautifulSoup(f.read(), "html.parser")
        for i in data.findAll('url'):
            title = i.find('video:video').find('video:title').text.encode('utf-8')
            slug = low(del_separator(title))
            film_id = i.find('tvzavr:video').find('tvzavr:id').text
            if 'серия' not in slug and film_id not in noffilms:
                if slug.decode('utf-8') not in ignored:
                    url = i.find('loc').text.encode('utf-8')
                    year = i.find('tvzavr:video').find('tvzavr:year').text
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(slug, None, {}, {}, year=year, source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                new = create_sfilm(film_id, kid, source, title, year=year, extra=url)
                                films[film_id] = new
                                if not fdict.get(kid):
                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                fdict[kid]['script_rel'].append(new)
                        elif not obj:
                            data_nof_film += xml_noffilm(title, slug, None, None, film_id.encode('utf-8'), info, url, source.id)
                            noffilms.append(film_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
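
# The tvzavr index is a sitemap-like XML with colon-prefixed tags
# (video:video/video:title, tvzavr:video/tvzavr:id). With the "html.parser"
# backend used above, such names are matched literally as lowercase tag names.
# A tiny self-contained illustration, assuming bs4 (as the from_encoding keyword
# elsewhere in this module suggests); the sample markup is invented:
def _tvzavr_tag_demo():
    from bs4 import BeautifulSoup
    sample = '<url><loc>http://example/1</loc><video:video><video:title>Имя</video:title></video:video></url>'
    url = BeautifulSoup(sample, 'html.parser').find('url')
    return url.find('video:video').find('video:title').text  # -> u'Имя'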
def get_rottentomatoes_films(everyday=True):

    def get_critic(block):
        critic = block.findAll('div', id="scoreStats", limit=1)
        if critic:
            critic = critic[0].findAll('div')
            average = critic[0].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            reviews = critic[1].findAll('span', limit=2)[1].text.strip()
            fresh = critic[2].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            rotten = critic[3].find('span', {'class': 'subtle superPageFontColor'}).next_sibling.string.strip()
            return '%s;%s;%s;%s' % (average.replace('/10', ''), reviews, fresh, rotten)
        else:
            return 'N/A;0;0;0'
        '''
        critic = block.findAll('p', {'class': 'critic_stats'}, limit=1)[0]
        average, reviews = critic.findAll('span', limit=2)
        try:
            fresh, rotten = reviews.next_sibling.next_sibling.encode('utf-8').strip().split(' | ')
        except AttributeError:
            return 'N/A;0;0;0'
        fresh = fresh.replace('Fresh:', '').strip()
        rotten = rotten.replace('Rotten:', '').strip()
        average = average.string.encode('utf-8').split('/')[0]
        reviews = reviews.string.encode('utf-8')
        return '%s;%s;%s;%s' % (average, reviews, fresh, rotten)
        '''

    source = ImportSources.objects.get(url='http://www.rottentomatoes.com/')
    sfilm_clean(source)
    noffilms = []
    data_nof_film = ''
    filter = {'source_obj': source}
    if everyday:
        today = datetime.datetime.today().date()
        day7 = today + datetime.timedelta(days=7)
        today = today - datetime.timedelta(days=30)
        filter['text__gte'] = today
        filter['text__lt'] = day7
    exists = get_source_data(source, 'film', 'list')
    films = {}
    source_films = SourceFilms.objects.filter(**filter)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    ignored = get_ignored_films()
    opener = urllib2.build_opener()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1'}
    opener.addheaders = headers.items()
    updated = []
    for k, f in films.items():
        film_url = '%s%s' % (source.url, k)
        req = opener.open(film_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            extra = get_critic(data)
            f.extra = extra
            f.save()
            updated.append(k)
        time.sleep(1)
    u = 'http://www.rottentomatoes.com/api/private/v1.0/m/list/find?page=1&limit=50&type=opening&minTomato=0&maxTomato=100&minPopcorn=0&maxPopcorn=100&services=&genres=1%3B2%3B4%3B5%3B6%3B8%3B9%3B10%3B11%3B13%3B14%3B18&sortBy=popularity&certified=false'
    req = opener.open(u)
    if req.getcode() == 200:
        data = json.loads(req.read(), encoding="latin-1")
        for i in data['results']:
            title = i['title'].encode('utf-8')
            title_slug = low(del_separator(title))
            url = i['url'].lstrip('/')
            full_url = '%s%s' % (source.url, url)
            if url not in exists and url not in noffilms:
                if title_slug.decode('utf-8') not in ignored and url not in updated:
                    time.sleep(1)
                    req2 = opener.open(full_url)
                    if req2.getcode() == 200:
                        data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                        year_block = data2.find('h1', {'class': 'title hidden-xs'})
                        if not year_block:
                            year_block = data2.find('h1', id='movie-title')
                        year_tmp = year_block.find('span', {'class': 'h3 year'}).text.encode('utf-8')
                        year = int(year_tmp.replace('(', '').replace(')', ''))
                        release_date = data2.find('td', itemprop="datePublished")
                        if release_date:
                            release_date = release_date.get('content')
                        extra = get_critic(data2)
                        obj = films.get(url)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                                obj.extra = extra
                                obj.save()
                            else:
                                kid, info = film_identification(None, title_slug, {}, {}, year, source=source)
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    new = create_sfilm(url, kid, source, title, txt=release_date, extra=extra)
                                    films[url] = new
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(new)
                            elif not obj:
                                data_nof_film += xml_noffilm(title, title_slug, None, None, url.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                                noffilms.append(url)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы, рейтинг')
def get_kino_ru():
    current_site = DjangoSite.objects.get(domain='kinoinfo.ru')
    REG_YEAR = re.compile(r'\d{4}\sгод.')
    REG_DATETIME = re.compile(r'\s?\-\s\d{2}\:\d{2}\:\d{2}\s\d{2}\s.*\s\d{4}')
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://www.kino.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    users = {}
    for i in SourceUsers.objects.select_related('profile').filter(source_obj=source):
        users[i.source_id] = i
    text_ids = list(NewsFilms.objects.filter(source_obj=source).values_list('source_id', flat=True))
    forum_dict = {}
    urls = (source.url, '%safisha/page/2' % source.url, '%safisha/page/3' % source.url)
    for url in urls:  # films
        req = urllib.urlopen(url)
        if not req.getcode() == 200:
            time.sleep(7)
            req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for article in data.findAll('article', {'class': "post"}):
                film_url = article.find('a', {'class': 'h2'})
                film_id = film_url.get('href')
                full_url = u'%s%s' % (source.url, film_id.lstrip('/'))
                film_id = film_id.replace('/film/', '')
                film_name = film_url.text.strip().encode('utf-8')
                film_slug = low(del_separator(film_name))
                info_country = article.find('div', {'class': 'info-country'})
                year = int(info_country.findAll('a')[-1].text.strip())
                comments_exist = article.find('div', {'class': 'comments'})
                if comments_exist and film_id.encode('utf-8') not in noffilms and film_slug.decode('utf-8') not in ignored:
                    forum_href = '%s/comments' % full_url
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None, {}, {}, year=year, source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source, film_name, year=year)
                                films[film_id] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id.encode('utf-8'))
                        if objt:
                            forum_dict[film_id] = {'obj': objt, 'href': forum_href}

    # reviews and their authors
    for k, v in forum_dict.iteritems():
        req = urllib.urlopen(v['href'])
        if not req.getcode() == 200:
            time.sleep(7)
            req = urllib.urlopen(v['href'])
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for post in data.findAll('div', {'class': 'post-comment'}):
                user_data = post.find('a', {'class': 'login_name'})
                if user_data:
                    user_name = user_data.text.strip().encode('utf-8')
                    user_id = user_data.get('href').replace('/user/', '')
                    user_obj = users.get(user_id)
                    if user_obj:
                        profile = user_obj.profile
                    else:
                        new_user = get_user()
                        new_user.first_name = user_name
                        new_user.save()
                        profile = Profile.objects.get(user=new_user)
                        user_obj = SourceUsers.objects.create(
                            source_id=user_id,
                            source_obj=source,
                            profile=profile,
                        )
                        users[user_id] = user_obj
                    date_comment = post.find('div', {'class': 'date-comment'})
                    com_time, com_date = date_comment.findAll('a')
                    com_day, com_month, com_year = com_date.text.encode('utf-8').strip().split()
                    com_month = get_month(com_month)
                    com_hour, com_minute = com_time.text.encode('utf-8').split(':')
                    com_dtime = datetime.datetime(int(com_year), int(com_month), int(com_day), int(com_hour), int(com_minute), 0)
                    text_id = com_time.get('href').replace('/film/%s/comments/' % k, '')
                    text = post.find('div', {'class': 'text-comment'}).text.encode('utf-8').strip()
                    if text_id not in text_ids:
                        news = News.objects.create(
                            title='',
                            text=text,
                            visible=True,
                            autor=profile,
                            autor_nick=1,
                            site=current_site,
                            subdomain=0,
                            reader_type='8',
                        )
                        news.dtime = com_dtime
                        news.save()
                        NewsFilms.objects.create(
                            kid=v['obj'].kid,
                            message=news,
                            source_id=text_id,
                            source_obj=source,
                        )
                        text_ids.append(text_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы и отзывы')
def get_zlat74ru_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Златоуст'
    cinema_name = 'Космос'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://www.zlat74.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug, source_obj=source,
        defaults={'source_id': city_slug, 'source_obj': source, 'city': city, 'name': city_name})
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug, source_obj=source,
        defaults={'source_id': cinema_slug, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': cinema_name})
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id='schedule')
        for tr in div.findAll('tr'):
            if tr.th:
                show_date = tr.th.string.encode('utf-8')
                day, month, year, temp = show_date.split()
                month = get_month(month)
                date = datetime.date(int(year), int(month), int(day))
            if tr.td:
                film_tag = tr.td.a
                film_id = film_tag.get('href').replace('/movies/', '').encode('utf-8')
                film_name = film_tag.string.encode('utf-8')
                film_slug = del_screen_type(low(del_separator(film_name)))
                full_url = '%smovies/%s' % (source.url, film_id.decode('utf-8'))
                if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, None, {}, {}, source=source)
                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source, film_name)
                                films[film_id] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id)
                        if objt:
                            for t in tr.findAll('span'):  # 't', not 'time', to avoid shadowing the time module
                                hours, minutes = t.string.split(':')
                                dtime = datetime.datetime(date.year, date.month, date.day, int(hours), int(minutes))
                                sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                sch_id = sch_id.replace(' ', '').decode('utf-8')
                                if sch_id not in schedules:
                                    SourceSchedules.objects.create(
                                        source_id=sch_id,
                                        source_obj=source,
                                        film=objt,
                                        cinema=cinema_obj,
                                        dtime=dtime,
                                    )
                                    schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
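
# The schedule dedup key used above (and in the other *_schedules parsers) is
# simply str(datetime) + cinema_slug + city_slug + film_id with spaces removed.
# A standalone restatement of that construction, for reference:
def _sch_id_sketch(dtime, cinema_slug, city_slug, film_id):
    # dtime is a datetime.datetime; its str() form is 'YYYY-MM-DD HH:MM:SS'
    sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
    return sch_id.replace(' ', '')

# _sch_id_sketch(datetime.datetime(2015, 3, 1, 12, 30), 'kosmos', 'zlatoust', '42')
# -> '2015-03-0112:30:00kosmoszlatoust42'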
def get_surkino_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    source = ImportSources.objects.get(url='http://surkino.ru/')
    sfilm_clean(source)
    cinemas = get_source_data(source, 'cinema', 'dict')
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    dates = []
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        show_days = data.find('div', {'class': 'days'})
        for a in show_days.findAll('a'):
            dates.append(a.get('href').replace('?date=', ''))
    for d in dates:
        url = '%s?date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="windows-1251")
            div = data.find('div', id='filmlist')
            if div:
                for cinema_tag in div.findAll('div', {'class': 'filmname'}):
                    cinema_id = cinema_tag.a.get('href').replace('?cinema=', '')
                    cinema_obj = cinemas.get(cinema_id)
                    if cinema_obj:
                        films_block = cinema_tag.find_next_siblings('div', limit=1)[0]
                        for tr in films_block.findAll('tr'):
                            film_tag = tr.findAll('a')
                            film_tag = film_tag[1] if len(film_tag) == 2 else film_tag[0]
                            full_url = '%s%s' % (source.url, film_tag.get('href'))
                            film_id = film_tag.get('href').replace('?film=', '').encode('utf-8')
                            film_name = film_tag.string.encode('utf-8')
                            film_slug = low(del_separator(film_name))
                            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_slug, None, {}, {}, source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        showtime = tr.td.string.encode('utf-8')
                                        hours, minutes = showtime.split('.')
                                        year, month, day = d.split('-')
                                        dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))
                                        sch_id = '%s%s%s' % (dtime, cinema_id, film_id)
                                        sch_id = sch_id.replace(' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_kinosaturn_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []
    city_name = 'Александров'
    cinema_name = 'Сатурн'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://www.kinosaturn.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug, source_obj=source,
        defaults={'source_id': city_slug, 'source_obj': source, 'city': city, 'name': city_name})
    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug, source_obj=source,
        defaults={'source_id': cinema_slug, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': cinema_name})
    today = datetime.datetime.now()
    next_month = datetime.date.today() + datetime.timedelta(days=40)
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  #, from_encoding="utf-8"
        tables = data.findAll('table', width="560", border="0", cellspacing="0", cellpadding="0")
        for table in tables:
            film_name = table.find('div', {'class': u'стиль25'}).text.strip().encode('utf-8')
            film_slug = del_screen_type(low(del_separator(film_name)))
            film_id = film_slug
            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                obj = films.get(film_id.decode('utf-8'))
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(film_slug, None, {}, {}, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film_id, kid, source, film_name)
                            films[film_id.decode('utf-8')] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, None, source.id)
                        noffilms.append(film_id)
                    if objt:
                        show_date = table.findAll('span', {'class': u'стиль23'}, limit=1)[0].string.strip().encode('utf-8')
                        try:
                            date_from, date_to = show_date.split(' по ')
                        except ValueError:
                            date_from, date_to = show_date.split(' - ')
                        date_from_day, date_from_month = date_from.replace('с ', '').split('.')
                        date_to_day, date_to_month = date_to.split('.')
                        date_from = datetime.date(today.year, int(date_from_month), int(date_from_day))
                        date_to = datetime.date(today.year, int(date_to_month), int(date_to_day))
                        if date_to < next_month:
                            for cl in (u'стиль23 стиль35 стиль37', u'стиль23 стиль35'):
                                for t in table.findAll('span', {'class': cl}):
                                    hours, minutes = t.string.strip().encode('utf-8').split(':')
                                    delta = date_to - date_from
                                    for day in range(delta.days + 1):
                                        d = date_from + datetime.timedelta(days=day)
                                        dtime = datetime.datetime(d.year, d.month, d.day, int(hours), int(minutes))
                                        sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                                        sch_id = sch_id.replace(' ', '').decode('utf-8')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_surkino_cinemas():
    data_nof_cinema = ''
    source = ImportSources.objects.get(url='http://surkino.ru/')
    city_name = 'Сургут'
    city_slug = low(del_separator(city_name))
    city = City.objects.get(name__name=city_name, name__status=1)
    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug, source_obj=source,
        defaults={'source_id': city_slug, 'source_obj': source, 'city': city, 'name': city_name})
    cinemas = get_source_data(source, 'cinema', 'list')
    ignored_cinemas = get_ignored_cinemas()
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="windows-1251")
        div = data.find('div', {'class': 'cinemas'})
        div_classes = ['ciname', 'ciname last']
        for cl in div_classes:
            for cinema_tag in div.findAll('div', {'class': cl}):
                cinema_name = cinema_tag.a.get('title').encode('utf-8').replace('Кинотеатр ', '')
                cinema_slug = low(del_separator(cinema_name))
                cinema_id = cinema_tag.a.get('href').replace('?cinema=', '')
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid)
                if cinema_id not in cinemas and cinema_ig_id not in ignored_cinemas:
                    filter1 = {'name__name': cinema_slug, 'name__status': 2, 'city__id': city_obj.city_id}
                    cinema_kid = cinema_identification(cinema_slug, filter1)
                    if cinema_kid:
                        try:
                            cinema = Cinema.objects.get(code=cinema_kid)
                            cinema_obj = SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cinema,
                                name=cinema_name,
                            )
                        except Cinema.DoesNotExist:
                            pass
                    else:
                        if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug, city_obj.name.encode('utf-8'), city_obj.city.kid)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
def get_luxor_chuvashia_schedules():
    today = datetime.datetime.now().strftime('%d.%m.%Y')
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://luxor.chuvashia.com/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    data = [
        {'city': 'Чебоксары', 'cinema': 'Мир Луксор', 'url': '%sschedule.aspx?kinoteatr=luxor' % source.url},
        {'city': 'Новочебоксарск', 'cinema': 'Атал', 'url': '%sschedule.aspx?kinoteatr=atal' % source.url},
    ]

    def get_page_data(date, data_list):
        # `i` is the current city/cinema dict of the enclosing loop below
        # (a late-bound closure), so this helper is only called from that loop
        url = '%s&date=%s' % (i['url'], date)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            page_data = BeautifulSoup(req.read())
            div = page_data.find('div', id='BodyContener_ScheduleBlock')
            table = div.find('table', id='BodyContener_TCalendar')
            for j in div.findAll('div', {'class': 'ScheduleTitle'}):
                data_list.append({'date': date, 'title': j, 'sch': j.next_sibling})
            day, month, year = date.split('.')
            date_obj_current = datetime.date(int(year), int(month), int(day))
            # follow calendar links forward and collect later pages recursively
            for a in table.findAll('a'):
                link = a.get('href')
                d = re.findall(r'\=[\d+\.?]+', link.encode('utf-8'))[0].replace('=', '')
                day, month, year = d.split('.')
                date_obj = datetime.date(int(year), int(month), int(day))
                if date_obj > date_obj_current:
                    get_page_data(d, data_list)
        return data_list

    for i in data:
        city_slug = low(del_separator(i['city']))
        city = City.objects.get(name__name=i['city'], name__status=1)
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug, source_obj=source,
            defaults={'source_id': city_slug, 'source_obj': source, 'city': city, 'name': i['city']})
        cinema_slug = low(del_separator(i['cinema']))
        cinema = Cinema.objects.get(name__name=i['cinema'], name__status=1, city=city)
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug, source_obj=source,
            defaults={'source_id': cinema_slug, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': i['cinema']})
        data_list = get_page_data(today, [])
        for schedule in data_list:
            tag_a = schedule['title'].find('a')
            film_name = tag_a.text.encode('utf-8')
            film_slug = low(del_separator(del_screen_type(film_name)))
            film_url = tag_a.get('href')
            film_id = film_url.replace('films.aspx?id=', '').encode('utf-8')
            full_url = '%s%s' % (source.url, film_url)
            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                obj = films.get(film_id)
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(film_slug, None, {}, {}, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            objt = create_sfilm(film_id, kid, source, film_name)
                            films[film_id] = objt
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                            fdict[kid]['script_rel'].append(objt)
                    elif not obj:
                        data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                        noffilms.append(film_id)
                    if objt:
                        sch_div = schedule['sch'].find('div', {'class': 'ScheduleClock'}).text.encode('utf-8').strip()
                        showtimes = re.findall(r'\d+\:\d+', sch_div)
                        day, month, year = schedule['date'].split('.')
                        for showtime in showtimes:
                            hours, minutes = showtime.split(':')
                            dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))
                            sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id)
                            sch_id = sch_id.replace(' ', '').decode('utf-8')
                            if sch_id not in schedules:
                                SourceSchedules.objects.create(
                                    source_id=sch_id,
                                    source_obj=source,
                                    film=objt,
                                    cinema=cinema_obj,
                                    dtime=dtime,
                                )
                                schedules.append(sch_id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_rutracker_topics():
    REG_SIZE = re.compile(r'\[\d+\.?\d+?\s?\w+\]')
    REG_SLUG = re.compile(ur'[a-zа-я0-9]+')
    data_nof_film = ''
    temp_data = {}
    source = ImportSources.objects.get(url='http://rutracker.org/')
    ignored = get_ignored_films()
    films = get_source_data(source, 'film', 'list')
    noffilms = []
    links = [
        'http://feed.rutracker.cc/atom/f/2200.atom',
        'http://feed.rutracker.cc/atom/f/2093.atom',
        'http://feed.rutracker.cc/atom/f/209.atom',
        'http://feed.rutracker.cc/atom/f/22.atom',
        'http://feed.rutracker.cc/atom/f/124.atom',
    ]
    send_msg = False
    for url in links:
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for entry in data.findAll('entry'):
                id = entry.link.get('href').replace("http://rutracker.org/forum/viewtopic.php?t=", "").replace("viewtopic.php?t=", "")
                date_upd, time_upd = entry.updated.text.replace('+00:00', '').split('T')
                dtime = '%s %s' % (date_upd, time_upd)
                if id not in films and id not in noffilms and id.decode('utf-8') not in ignored:
                    title = entry.title.text.encode('utf-8').replace('<![CDATA[', '').replace(']]>', '')
                    title = re.sub(REG_SIZE, '', title)
                    name = title.replace('[Обновлено]', '').replace('[Extended Cut]', '').strip()
                    year = ''  # default, so the dedup key below is always a string
                    year_temp = name.split('[')
                    if len(year_temp) > 1:
                        year_digits = re.findall(r'\d+', year_temp[1])
                        if year_digits:
                            year = year_digits[0]
                    name_alt = re.findall(REG_SLUG, low(name).decode('utf-8'))
                    name_alt = ''.join(name_alt)
                    name_for_search = name.split('/')[0].strip()
                    name_for_search_slug = re.findall(REG_SLUG, low(name_for_search).decode('utf-8'))
                    name_for_search_slug = ''.join(name_for_search_slug)
                    full_url = '%sforum/viewtopic.php?t=%s' % (source.url, id)
                    kid = temp_data.get(name_for_search_slug + year)
                    if not kid:
                        kid, info = film_identification(name_for_search_slug, None, {}, {}, year, source=source)
                    if kid:
                        film_obj = SourceFilms.objects.create(
                            source_id=id,
                            source_obj=source,
                            name=name,
                            name_alter=name_alt,
                            kid=kid,
                            text=dtime,
                            extra='new',
                        )
                        temp_data[name_for_search_slug + year] = kid
                        films.append(id)
                        send_msg = True
                    else:
                        data_nof_film += xml_noffilm(name_for_search, name_for_search_slug.encode('utf-8'), None, None, id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                        noffilms.append(id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Новые фильмы')
    if send_msg:
        current_site = DjangoSite.objects.get(domain='kinoinfo.ru')
        msg_from = Profile.objects.get(user__last_name='SYSTEM')
        msg_to = Profile.objects.get(accounts__login='******')  # [email protected]
        msg = 'В сети появились новые фильмы <a href="http://kinoinfo.ru/torrents/listing/%s/" target="_blank">http://kinoinfo.ru/torrents/listing/%s/</a>' % (source.id, source.id)
        try:
            dialog_exist = DialogMessages.objects.filter(readers__user=msg_to, readers__message__autor=msg_from).order_by('-id')[0]
        except IndexError:
            dialog_exist = None
        reader_type = '1'
        msg_obj = News.objects.create(
            title='Сообщение',
            text=msg,
            autor=msg_from,
            site=current_site,
            subdomain='0',
            reader_type=reader_type,
        )
        reader = NewsReaders.objects.create(user=msg_to, status='0', message=msg_obj)
        if dialog_exist:
            dialog_exist.readers.add(reader)
        else:
            dialog_obj = DialogMessages()
            dialog_obj.save()
            dialog_obj.readers.add(reader)
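
# Rutracker titles look roughly like 'Имя / Name (Жанр) [2014, BDRip] [1.46 GB]'
# (invented example): the parser above strips the size tag with REG_SIZE and takes
# the year from the first bracketed chunk. A compact standalone sketch of that
# cleanup, using the same regex patterns:
def _rutracker_title_sketch(title):
    title = re.sub(r'\[\d+\.?\d+?\s?\w+\]', '', title)  # same pattern as REG_SIZE
    name = title.replace('[Обновлено]', '').replace('[Extended Cut]', '').strip()
    parts = name.split('[')
    year = ''
    if len(parts) > 1:
        found = re.findall(r'\d+', parts[1])
        if found:
            year = found[0]
    return name.split('/')[0].strip(), year

# _rutracker_title_sketch('Имя / Name (Жанр) [2014, BDRip] [1.46 GB]')
# -> ('Имя', '2014')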
def get_rambler_films():
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    noffilms = []
    data_nof_films = ''
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_films.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
        if xml:
    # --- end localhost
    '''
    # SERVER
    f = open('%s/dump_rambler_index.xml' % settings.API_DUMP_PATH, 'r')
    xml_index = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    creations = xml_index.find('creations')
    filenames = []
    for i in creations.findAll('file'):
        filename = i.get('filename')
        if filename:
            filenames.append(filename)
    for i in filenames:
        url = 'http://api.kassa.rambler.ru/v2/%s/xml/Movie/export/sale/%s' % (RAMBLER_API_KEY, i)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            xml = BeautifulSoup(req.read(), from_encoding="utf-8")
            # --- end server
            for i in xml.findAll('creation'):
                film_id = i.objectid.string
                if film_id not in noffilms:
                    try:
                        year = int(i.year.string) if i.year.string else None
                    except UnicodeEncodeError:
                        year = None
                    full_url = 'https://kassa.rambler.ru/movie/%s' % film_id
                    name = i.find('name').string.encode('utf-8')
                    name_slug = low(del_separator(name))
                    if year and name_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                try:
                                    kid, info = film_identification(name_slug, None, {}, {}, year=year, source=source)
                                except db.backend.Database._mysql.OperationalError:
                                    next_step = False
                            if next_step:
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid, source, name, year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(new)
                                elif not obj:
                                    data_nof_films += xml_noffilm(name, name_slug, None, None, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
    # escape bare ampersands so the nof dump stays well-formed XML
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films.replace('&', '&amp;'))
    cron_success('xml', source.dump, 'films', 'Фильмы')
def get_kinoboomer_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    data_nof_hall = ''
    data_nof_cinema = ''
    noffilms = []
    nofhalls = []
    city_name = 'Киев'
    cinema_name = 'Boomer'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))
    source = ImportSources.objects.get(url='http://www.kinoboomer.com.ua/')
    sfilm_clean(source)
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    schedules = get_source_data(source, 'schedule', 'list')
    halls = get_source_data(source, 'hall', 'dict')
    city = City.objects.get(name__name=city_name, name__status=1)
    try:
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    except Cinema.DoesNotExist:
        cinema = None
        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city.kid)
    film_urls = []
    if cinema:
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug, source_obj=source,
            defaults={'source_id': city_slug, 'source_obj': source, 'city': city, 'name': city_name})
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug, source_obj=source,
            defaults={'source_id': cinema_slug, 'source_obj': source, 'city': city_obj, 'cinema': cinema, 'name': cinema_name})
        cinema_kid = cinema.code
        city_kid = city.kid
        today = datetime.date.today()
        url = '%sseances' % source.url
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            content = data.find('div', {'class': 'view-content'})
            for i in content.findAll('h3'):
                a = i.find('a')
                film_id = a.get('href').strip().encode('utf-8')
                full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                film_name = a.text.strip().encode('utf-8')
                film_slug = low(del_separator(film_name))
                film_urls.append({
                    'film_id': film_id,
                    'film_name': film_name,
                    'film_slug': film_slug,
                    'full_url': full_url,
                })
        for i in film_urls:
            if i['film_id'] not in noffilms and i['film_slug'].decode('utf-8') not in ignored:
                # identify the film
                obj = films.get(i['film_id'])
                next_step = checking_obj(obj)
                if next_step:
                    if obj:
                        kid = obj.kid
                    else:
                        kid, info = film_identification(i['film_slug'], None, {}, {}, source=source)
                    objt = None
                    if kid:
                        create_new, objt = unique_func(fdict, kid, obj)
                        if create_new:
                            new = create_sfilm(i['film_id'], kid, source, i['film_name'])
                            films[i['film_id']] = new
                            if not fdict.get(kid):
                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                            fdict[kid]['script_rel'].append(new)
                    elif not obj:
                        data_nof_film += xml_noffilm(i['film_name'], i['film_slug'], None, None, i['film_id'], info, i['full_url'].encode('utf-8'), source.id)
                        noffilms.append(i['film_id'])
                    # if the film was identified, identify the hall
                    if objt:
                        req = urllib.urlopen(i['full_url'])
                        if req.getcode() == 200:
                            data = BeautifulSoup(req.read())
                            hall_name = ''
                            content = data.find('div', {'class': 'view-grouping-content'})
                            if content:
                                wrapper = content.findAll('div', {'class': 'group-wrapper'}, limit=1)
                                if wrapper:
                                    widget_links = wrapper[0].findAll('a', {'class': 'vkino-link'}, limit=1)
                                    widget_req = urllib.urlopen(widget_links[0].get('href'))
                                    if widget_req.getcode() == 200:
                                        widget_data = BeautifulSoup(widget_req.read(), from_encoding="utf-8")
                                        nav = widget_data.find('div', id='purchase-navigation')
                                        li = nav.findAll('li', limit=1)[0]
                                        li.a.extract()
                                        li.nobr.extract()
                                        hall_name = li.text.strip().encode('utf-8').split('«')[-1].split('»')[0]
                                        hall_name_slug = low(del_separator(hall_name))
                                        if hall_name and hall_name_slug not in nofhalls:
                                            hall_obj = halls.get(hall_name_slug)
                                            if not hall_obj:
                                                halls_obj = Hall.objects.filter(name__name=hall_name_slug, cinema=cinema_obj.cinema).distinct('pk')
                                                if halls_obj.count() == 1:
                                                    hall_kid = halls_obj[0].kid
                                                    hall_obj = SourceHalls.objects.create(
                                                        source_id=hall_name_slug,
                                                        source_obj=source,
                                                        cinema=cinema_obj,
                                                        name=hall_name,
                                                        kid=hall_kid,
                                                    )
                                                    halls[hall_name_slug] = hall_obj
                                                else:
                                                    id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name, hall_name_slug)
                                                    id = id.replace(' ', '')
                                                    data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (
                                                        city_name, city_kid, cinema_name, cinema_kid, hall_name, hall_name_slug, id)
                                                    nofhalls.append(hall_name_slug)
                                            if hall_obj:
                                                # if the hall was identified, collect showtimes and create schedules
                                                #day, day_month = wrapper.find('h3').text.strip().split()
                                                #day, month = day_month.split('.')
                                                #date_sch = datetime.date(today.year, int(month), int(day))
                                                for wrapper in content.findAll('div', {'class': 'group-wrapper'}):
                                                    widget_links = wrapper.findAll('a', {'class': 'vkino-link'})
                                                    for link in widget_links:
                                                        dtime = link.find('span').get('content').replace('T', ' ').split('+')[0]
                                                        dtime = datetime.datetime.strptime(dtime, "%Y-%m-%d %H:%M:%S")
                                                        sch_id = '%s%s%s' % (dtime, hall_obj.id, i['film_id'])
                                                        sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                        if sch_id not in schedules:
                                                            SourceSchedules.objects.create(
                                                                source_id=sch_id,
                                                                source_obj=source,
                                                                film=objt,
                                                                cinema=cinema_obj,
                                                                hall=hall_obj.kid,
                                                                dtime=dtime,
                                                            )
                                                            schedules.append(sch_id)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_hall)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
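
# The vkino widget exposes each showtime as an ISO-ish content attribute, e.g.
# '2015-03-01T12:30:00+02:00' (invented sample); the parser above trims the 'T'
# and the timezone suffix before strptime. A standalone restatement:
def _widget_dtime_sketch(content):
    import datetime
    raw = content.replace('T', ' ').split('+')[0]
    return datetime.datetime.strptime(raw, "%Y-%m-%d %H:%M:%S")

# _widget_dtime_sketch('2015-03-01T12:30:00+02:00') -> datetime.datetime(2015, 3, 1, 12, 30)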
def get_rambler_cinemas():
    data_nof_cinema = ''
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    cinemas_ids = get_source_data(source, 'cinema', 'list')
    rambler_cities_dict = get_source_data(source, 'city', 'dict')
    cinemass = Cinema.objects.all()
    cinemass_dict = {}
    for i in cinemass:
        cinemass_dict[i.code] = i
    ignored_cinemas = get_ignored_cinemas()
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_cinema.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
        if xml:
    # --- end localhost
    '''
    # SERVER
    f = open('%s/dump_rambler_index.xml' % settings.API_DUMP_PATH, 'r')
    xml_index = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    places = xml_index.find('places')
    filenames = []
    for i in places.findAll('file'):
        filename = i.get('filename')
        if filename:
            filenames.append(filename)
    for i in filenames:
        url = 'http://api.kassa.rambler.ru/v2/%s/xml/Movie/export/sale/%s' % (RAMBLER_API_KEY, i)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            xml = BeautifulSoup(req.read(), from_encoding="utf-8")
            # --- end server
            for i in xml.findAll('place'):
                id = i.objectid.string
                name = i.find('name').string.encode('utf-8')
                name_slug = low(del_separator(name))
                address = i.address.string.encode('utf-8') if i.address.string else None
                latitude = i.latitude.string
                longitude = i.longitude.string
                city_id = i.cityid.string
                city_obj = rambler_cities_dict.get(city_id)
                if city_obj:
                    cinema_ig_id = u'%s__%s' % (name_slug.decode('utf-8'), city_obj.city.kid)
                    if id not in cinemas_ids and cinema_ig_id not in ignored_cinemas:
                        filter1 = {'name__name': name_slug, 'name__status': 2, 'city__id': city_obj.city_id}
                        cinema = cinema_identification(name_slug, filter1)
                        cin_obj = cinemass_dict.get(cinema)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cin_obj,
                                name=name,
                                address=address,
                                latitude=latitude,
                                longitude=longitude,
                            )
                        else:
                            if 'slug="%s"' % name_slug not in data_nof_cinema:
                                name_city = city_obj.name
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    name, name_slug, name_city.encode('utf-8'), city_obj.city.kid)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('xml', source.dump, 'cinemas', 'Кинотеатры')
def get_mailru_soon():
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='https://afisha.mail.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    # The current month plus the next twelve.
    today = datetime.datetime.today()
    dates = list(map((lambda x: today.date() + relativedelta(months=x)), xrange(1, 13)))
    dates.insert(0, today.date())

    for d in dates:
        main_url = '%scinema/soon/%s/%s/' % (source.url, d.year, d.month)
        opener = give_me_cookie()
        #headers = {
        #    'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.2.2; en-us; Nexus 7 Build/JDQ39E) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Safari/534.30 CyanogenMod/10.1.3/grouper',
        #}
        #opener.addheaders = headers.items()
        try:
            req = opener.open(urllib2.Request(main_url))
        except urllib2.HTTPError:
            req = None
        if req:
            data = BeautifulSoup(req.read(), "html.parser")
            for block in data.findAll('div', {'class': 'premiere__date'}):
                day = block.find('div', {'class': 'premiere__date__mday'}).text
                if day:
                    release_date = datetime.date(d.year, d.month, int(day))
                    for item in block.findAll('div', {'class': 'clearin'}):
                        a = item.find('div', {'class': 'itemevent__head__name'}).find('a')
                        film_name = a.text.strip().encode('utf-8')
                        film_slug = low(del_separator(film_name))
                        href = a.get('href')
                        film_id = href.replace('/cinema/movies/', '').replace('/', '').encode('utf-8')
                        full_url = '%s%s' % (source.url, href.lstrip('/'))
                        details = item.find('div', {'class': 'itemevent__head__info'}).text.encode('utf-8')
                        year = re.findall(r'\/\d{4}\/', details)
                        if year:
                            year = int(year[0].replace('/', ''))
                        if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                            obj = films.get(film_id.decode('utf-8'))
                            # Some titles contain U+2009 (THIN SPACE, e2 80 89);
                            # strip it so the slug matches.
                            film_slug = film_slug.decode("utf-8").replace(u"\u2009", '').encode("utf-8")
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug, None, {}, {},
                                                                    year=year, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, film_name)
                                        films[film_id.decode('utf-8')] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id,
                                                                 info, full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    sr_obj, sr_created = SourceReleases.objects.get_or_create(
                                        film=objt,
                                        source_obj=source,
                                        defaults={
                                            'film': objt,
                                            'source_obj': source,
                                            'release': release_date,
                                        })
                                    if sr_created:
                                        # New release: fetch the film page for a poster and synopsis.
                                        try:
                                            req = opener.open(urllib2.Request(full_url))
                                        except urllib2.HTTPError:
                                            req = None
                                        if req:
                                            data = BeautifulSoup(req.read(), "html.parser")
                                            movie_pic = data.find('div', {'class': 'movieabout__info__left'})
                                            pic = None
                                            if movie_pic:
                                                pic = movie_pic.find('a', {'data-module': 'Gallery'}).get('href')
                                            txt = None
                                            movie_txt = data.find('div', {'class': 'movieabout__info__descr__txt'})
                                            if movie_txt:
                                                txt = movie_txt.text.strip().encode('utf-8')
                                            if pic or txt:
                                                objt.text = txt
                                                objt.extra = pic
                                                objt.save()
                                            time.sleep(random.uniform(1.0, 1.5))
                                    else:
                                        if sr_obj.release != release_date:
                                            sr_obj.release = release_date
                                            sr_obj.save()
        time.sleep(random.uniform(1.0, 2.0))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Релизы')
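# The 13-entry date list above (current month plus the next twelve), pulled
# out as a quick reference sketch using the same dateutil arithmetic.
import datetime
from dateutil.relativedelta import relativedelta

def rolling_months(start=None, months=12):
    # Returns [start, start+1 month, ..., start+`months` months].
    start = start or datetime.date.today()
    return [start] + [start + relativedelta(months=m) for m in range(1, months + 1)]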
def get_top250():
    source = ImportSources.objects.get(url='http://top250.info/')

    films_exist = SourceFilms.objects.filter(source_obj=source)
    films_exist_dict = {}
    for i in films_exist:
        films_exist_dict[int(i.source_id)] = i

    data_nof_films = ''
    films = {}
    keys = []

    url = '%scharts/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        main = data.find('div', {'class': "layout"})
        tables = main.findAll('table', limit=2)

        # The chart date sits in the second cell of the first row.
        trr = tables[0].findAll('tr', limit=1)[0]
        tdd = trr.findAll('td', limit=2)[1]
        date_tmp = tdd.text.encode('utf-8').replace('Date: ', '').replace(',', '').strip()
        month, day, year, times = date_tmp.split()
        month = get_month_en(low(month))
        date_upd = datetime.date(int(year), month, int(day))

        # Movement markers: '-' unchanged, '↑' climbed, '↓' dropped, '*' new entry.
        for tr in tables[1].findAll('tr', {'class': ['row_same', 'row_up', 'row_down', 'row_new']}):
            td = tr.findAll('td')
            position = int(td[0].text)
            change = td[1].text.encode('utf-8')
            rating = float(td[3].text)
            votes = int(td[4].text)
            title = td[2].a.text.encode('utf-8')
            year = re.findall(r'\(.*?\)', title)[0].replace('(', '').replace(')', '')
            title = re.sub(r'\(.*?\)', '', title).strip()
            imdb_id = int(td[2].a.get('href').encode('utf-8').replace('/movie/?', ''))

            if '-' in change:
                change = 1
                change_val = None
            elif '↑' in change:
                change_val = int(change.replace('↑', '').strip())
                change = 2
            elif '↓' in change:
                change_val = int(change.replace('↓', '').strip())
                change = 3
            elif '*' in change:
                change_val = None
                change = 4

            # Look up the film object from this source in the DB, if it exists.
            obj = films_exist_dict.get(imdb_id)
            unique = '%s%s' % (imdb_id, date_upd)
            keys.append(unique)
            # Store all the parsed data plus the object in one dict.
            films[imdb_id] = {
                'imdb_id': imdb_id,
                'position': position,
                'change': change,
                'change_val': change_val,
                'rating': rating,
                'votes': votes,
                'title': title,
                'year': year,
                'obj': obj,
                'key': unique,
            }

        top = Top250.objects.filter(key__in=keys)
        tops = [i.key.encode('utf-8') for i in top]

        # Pull every film whose IMDb id matched from the DB, for identification.
        films_afisha = Film.objects.using('afisha').only('id', 'idalldvd').filter(idalldvd__in=films.keys())
        films_afisha_dict = {}
        for i in films_afisha:
            films_afisha_dict[int(i.idalldvd)] = i.id

        # Walk over every parsed film.
        for i in films.values():
            # Identify the film.
            kid = films_afisha_dict.get(i['imdb_id'])
            # If we already have this film from the source...
            if i['obj']:
                # ...and it was unidentified but identifies now, save the kid.
                if kid and not i['obj'].kid:
                    i['obj'].kid = kid
                    i['obj'].save()
            # Otherwise create the source film.
            else:
                sobj = SourceFilms.objects.create(
                    source_id=i['imdb_id'],
                    source_obj=source,
                    name=i['title'],
                    kid=kid,
                    imdb=i['imdb_id'],
                    year=i['year'],
                )
                i['obj'] = sobj
            if i['key'] not in tops:
                Top250.objects.create(
                    key=i['key'],
                    date_upd=date_upd,
                    film=i['obj'],
                    position=i['position'],
                    change=i['change'],
                    change_val=i['change_val'],
                    rating=i['rating'],
                    votes=i['votes'],
                )

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'films', 'Фильмы')
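# How the rank-movement markers in the top250.info chart decode, pulled out
# as a hedged standalone helper mirroring the branch above: '-' unchanged
# (code 1), '↑' climbed (2), '↓' dropped (3), '*' new entry (4).
def parse_change(cell):
    # `cell` is the raw utf-8 text of the second <td> in a chart row.
    if '-' in cell:
        return 1, None
    if '↑' in cell:
        return 2, int(cell.replace('↑', '').strip())
    if '↓' in cell:
        return 3, int(cell.replace('↓', '').strip())
    if '*' in cell:
        return 4, None
    return None, None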
def get_cinemaplex_releases():
    ignored = get_ignored_films()
    distr_nof_data = ''
    data_nof_film = ''
    noffilms = []
    nof_distributors = []
    distributors = {}

    source = ImportSources.objects.get(url='http://cinemaplex.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.today()
    url = '%s2013/01/30/release-schedule.html' % source.url

    '''
    with open('cinemaplex.htm', 'r') as f:
        main = BeautifulSoup(f.read(), from_encoding="utf-8")
    if main:
    '''

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        main = data.find('div', {'class': 'post-entry'})
        main = main.find('tbody')
        release_date = None
        for tr in main.findAll('tr'):
            all_td = tr.findAll('td')
            if len(all_td) == 1:
                # A single-cell row is a date-range header; the dash flavour varies.
                if all_td[0].text.strip():
                    try:
                        release_first, release_last = all_td[0].text.encode('utf-8').split('—')
                    except ValueError:
                        try:
                            release_first, release_last = all_td[0].text.encode('utf-8').split('–')
                        except ValueError:
                            release_first, release_last = all_td[0].text.encode('utf-8').split('-')
                    release_first = release_first.replace('\xc2\xa0', '').strip()
                    try:
                        release_first = int(release_first)
                    except ValueError:
                        release_last = release_first
                        release_first = release_first.split()[0].strip()
                    release_month = release_last.strip().split()[1]
                    release_day = int(release_first)
                    release_month = int(get_month(release_month))

                    past_month_range = []
                    for m in [1, 2, 3, 4]:
                        past_dates = today - relativedelta(months=+m)
                        past_month_range.append(past_dates.month)

                    if release_month in past_month_range or (
                            release_month == today.month and release_day <= today.day):
                        release_date = None
                    else:
                        release_year = today.year if release_month >= today.month else today.year + 1
                        release_date = datetime.date(release_year, release_month, release_day)
            elif release_date:
                film_name = all_td[0].text.encode('utf-8').strip()
                distributor = all_td[1].text.encode('utf-8').replace('&amp;', '&').split(',')[0].strip()
                #copies = all_td[2].text.encode('utf-8').strip()
                runtime = all_td[3].text.encode('utf-8').strip()
                #genres = all_td[5].text.encode('utf-8').strip()
                #limits = all_td[7].text.encode('utf-8').strip()
                try:
                    details = all_td[8].text.encode('utf-8').strip()
                except IndexError:
                    details = ''

                f_name = film_name.split('/')
                if len(f_name) == 2:
                    f_name_ru, f_name_en = (f_name[0].strip(), f_name[1].strip())
                else:
                    f_name_ru, f_name_en = (f_name[0].strip(), f_name[0].strip())

                film_slug_ru = low(del_separator(f_name_ru))
                film_slug_en = low(del_separator(f_name_en))
                film_slug = low(del_separator(film_name))
                film_id = film_slug
                full_url = None

                '''
                current_release_date = re.findall(r'с\s\d+\.\d+', details)
                if current_release_date:
                    current_release_day = current_release_date[0].replace('с ', '').split('.')[0]
                    current_release_date = datetime.date(int(release_date.year), int(release_date.month), int(current_release_day))
                else:
                    current_release_date = release_date
                '''

                if film_slug_ru:
                    if film_id not in noffilms and film_slug_ru.decode('utf-8') not in ignored:
                        # Identify the distributor.
                        distributor_slug = low(del_separator(distributor))
                        distributor_kid = distributors.get(distributor_slug)
                        if not distributor_kid and distributor_slug.decode('utf-8') not in nof_distributors:
                            distr, status = distributor_identification(distributor, distributor_slug)
                            if distr:
                                distributor_kid = distr.kid if distr.kid else None
                                distributors[distributor_slug] = distributor_kid
                            else:
                                distr_nof_data += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (
                                    distributor, distributor_slug, '')
                                nof_distributors.append(distributor_slug.decode('utf-8'))
                        if distributor_kid:
                            obj = films.get(film_id.decode('utf-8'))
                            next_step = checking_obj(obj)
                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(film_slug_ru, film_slug_en,
                                                                    distributor_kid, {}, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(film_id, kid, source, f_name_ru)
                                        films[film_id.decode('utf-8')] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(f_name_ru, film_slug_ru, f_name_en,
                                                                 film_slug_en, film_id, info, full_url, source.id)
                                    noffilms.append(film_id)
                                if objt:
                                    sr_obj, sr_created = SourceReleases.objects.get_or_create(
                                        film=objt,
                                        source_obj=source,
                                        defaults={
                                            'film': objt,
                                            'distributor': distributor,
                                            'source_obj': source,
                                            'release': release_date,
                                        })
                                    if not sr_created:
                                        if sr_obj.release != release_date:
                                            sr_obj.release = release_date
                                            sr_obj.save()
                                    runtime = runtime.replace('-', '').strip()
                                    if runtime:
                                        runtime = runtime.split("'")[0].split('’')[0]
                                        runtime = runtime.replace("'", '').replace('’', '')
                                        extra = '%s' % runtime
                                        if objt.extra != extra:
                                            objt.extra = extra
                                            objt.save()
                        else:
                            info = 'Нет такого дистрибьютора'
                            data_nof_film += xml_noffilm(f_name_ru, film_slug_ru, f_name_en,
                                                         film_slug_en, film_id, info, full_url, source.id)
                            noffilms.append(film_id)

    create_dump_file('%s_nof_distributor' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % distr_nof_data)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'releases', 'Релизы')
def get_kinobusiness(request, country_data):
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://www.kinobusiness.com/')
    country = Country.objects.get(name=country_data['ru'])

    bx_ids = list(BoxOffice.objects.filter(country=country).values_list('bx_id', flat=True))
    films = BoxOffice.objects.filter(country=country).distinct('kid')
    films_dict = {}
    for i in films:
        films_dict[i.source_id] = i.kid

    data_nof_films = ''
    data_nof_distr = ''
    noffilms = []
    nofdistr = []

    if country_data['en'] == 'usa':
        main_url = '%skassa_world_prokata/kassa-usa/' % source.url
        add = ''
    else:
        main_url = '%skassovye_sbory/weekend/' % source.url
        add = 'usd/'

    req = urllib.urlopen(main_url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', {'class': 'table-responsive'})
        data = div.findAll('table', limit=1)[0]
        # data = data.find('table', {'class': "table table-striped table-hover calendar_year ned"})
        tr = data.findAll('tr', limit=2)[1]
        a = tr.findAll('a')[0].get('href').lstrip('/')
        req = urllib.urlopen('%s%s%s' % (source.url, a, add))
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")

            # The reporting period ends on the date in the heading; the
            # weekend itself is the three days up to that date.
            date = data.find('h1', {'class': 'film__title'})
            date = date.find('small').text.encode('utf-8')
            to_day, to_month, to_year = re.findall(
                r'\-\s[\d+\.?]+', date)[0].replace('- ', '').split('.')
            date_to = datetime.date(int(to_year), int(to_month), int(to_day))
            date_from = date_to - datetime.timedelta(days=3)

            counter = 0
            main = data.find('table', id="krestable")
            for index, tr in enumerate(main.findAll('tr')):
                if index != 0:
                    if country_data['en'] == 'usa':
                        trs = tr.findAll('td', limit=5)
                        film_name = trs[2].text.strip().encode('utf-8')
                        film_name_orig = trs[3].text.strip().encode('utf-8')
                        a = trs[2].find('a')
                    else:
                        trs = tr.findAll('td', limit=5)
                        film_name = trs[3].text.strip().encode('utf-8')
                        film_name_orig = trs[4].text.strip().encode('utf-8')
                        a = trs[3].find('a')
                    url = a.get('href').encode('utf-8') if a else None

                    film_name = film_name.replace('*', '')
                    film_slug = low(del_separator(film_name))
                    film_slug_orig = low(del_separator(film_name_orig))

                    full_url = ''
                    if url:
                        full_url = '%s%s' % (source.url, url.lstrip('/'))
                        full_url = full_url.encode('utf-8')

                    film_id = film_slug.decode('utf-8')
                    film_slug_orig = film_slug_orig.decode('utf-8')
                    bx_id = '%s%s%s%s%s' % (film_id, film_slug_orig, date_from,
                                            date_to, country_data['dump'])

                    if bx_id not in bx_ids:
                        distributors = []
                        week_audience = None
                        td = tr.findAll('td')
                        # Money cells use spaces as thousands separators and
                        # commas as decimal marks; '-' means no data.
                        if country_data['en'] == 'usa':
                            distributors = td[4].text
                            week_sum = int(float(td[5].text.replace(u' ', '').replace(u',', u'.')))
                            screens = int(float(td[7].text.replace(u' ', '').replace(u',', u'.').replace(u'-', u'0')))
                            all_sum = int(float(td[9].text.replace(u' ', '').replace(u',', u'.')))
                            days = int(float(td[11].text.replace(u' ', '').replace(u',', u'.'))) * 7
                            all_audience = None
                        else:
                            distributors = td[5].text
                            week_sum = int(float(td[6].text.replace(u' ', '').replace(u',', u'.')))
                            screens = int(float(td[8].text.replace(u' ', '').replace(u',', u'.').replace(u'-', u'0')))
                            days = int(float(td[10].text.replace(u' ', '').replace(u',', u'.')))
                            all_sum = int(float(td[11].text.replace(u' ', '').replace(u',', u'.')))
                            all_audience = td[12].text.replace(u' ', '').replace(u',', u'.')
                            all_audience = int(float(all_audience)) if all_audience else None

                        if distributors:
                            distributors = distributors.encode('utf-8').replace('*', '').split('/')
                        else:
                            distributors = []

                        dlist = []
                        for dname in distributors:
                            dname = dname.strip().replace('&amp;', '&')
                            dname_slug = low(del_separator(dname))
                            if dname_slug not in nofdistr:
                                distr, status = distributor_identification(dname, dname_slug)
                                if distr:
                                    dlist.append(distr)
                                else:
                                    data_nof_distr += '<distributor value="%s" slug="%s" alt="%s"></distributor>' % (
                                        dname.replace('&', '&amp;'), dname_slug, None)
                                    nofdistr.append(dname_slug)

                        if dlist:
                            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                film_obj = films_dict.get(film_id)
                                if not film_obj:
                                    '''
                                    req2 = urllib.urlopen(full_url)
                                    if req2.getcode() == 200:
                                        counter += 1
                                        data2 = BeautifulSoup(req2.read())
                                        film_details = data2.find('table', {'class': 'news-detail'})
                                        year = None
                                        for p in film_details.findAll('p'):
                                            if p.b:
                                                year_tag = p.b.string.encode('utf-8').strip()
                                                if year_tag == 'Год:':
                                                    year = re.findall(r'\d+', p.text.encode('utf-8').strip())[0]
                                        if year:
                                    '''
                                    d1, d2 = (dlist[0].kid, dlist[1].kid) if len(dlist) > 1 else (dlist[0].kid, None)
                                    kid, info = film_identification(film_slug, None, d1, d2, source=source)
                                    if kid:
                                        film_obj = kid
                                        films_dict[film_id] = kid
                                    else:
                                        data_nof_films += xml_noffilm(film_name, film_slug, None, None,
                                                                      film_id.encode('utf-8'), info,
                                                                      full_url, source.id)
                                        noffilms.append(film_id)
                                if film_obj:
                                    boxoffice = BoxOffice.objects.create(
                                        bx_id=bx_id,
                                        source_id=film_id,
                                        source_obj=source,
                                        name=film_name,
                                        kid=film_obj,
                                        screens=screens,
                                        date_from=date_from,
                                        date_to=date_to,
                                        week_sum=week_sum,
                                        all_sum=all_sum,
                                        week_audience=week_audience,
                                        all_audience=all_audience,
                                        days=days,
                                        country=country,
                                    )
                                    for i in dlist:
                                        boxoffice.distributor.add(i)
                                    bx_ids.append(bx_id)
                        if counter % 3 == 0:
                            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('%s_nof_distributor' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_distr)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'boxoffice%s' % country_data['dump'],
                 'Кассовые сборы %s' % country_data['ru'])
    return HttpResponseRedirect(reverse("boxoffice_admin", kwargs={'country': country_data['en']}))
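# The numeric-cell normalization above, condensed into one hedged helper.
# An assumed equivalent, not taken verbatim from the parser: which space
# character (plain, NBSP, or thin space) separates thousands varies by page,
# so several are stripped here.
def parse_amount(cell_text):
    # `cell_text` is the unicode text of a box-office <td>.
    cleaned = cell_text.replace(u'\xa0', u'').replace(u'\u2009', u'').replace(u' ', u'')
    cleaned = cleaned.replace(u',', u'.').replace(u'-', u'0')
    return int(float(cleaned or 0))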
def get_luxor_films():
    query = 'QueryCode=GetMovies'
    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')
    sfilm_clean(source)

    #create_dump_file('%s_films' % source.dump, settings.API_DUMP_PATH, data)

    data_nof_films = ''
    noffilms = []

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    '''
    xml = open('%s/dump_%s_films.xml' % (settings.API_DUMP_PATH, source.dump), 'r')  # temp
    data = xml.read()  # temp
    xml.close()  # temp
    '''

    ignored = get_ignored_films()

    xml_data = BeautifulSoup(data, from_encoding="utf-8")
    for film in xml_data.findAll('movie'):
        film_id = film['id'].encode('utf-8')
        film_name = film.find('othername').string.encode('utf-8').replace('[CDATA[', '').replace(']]', '')
        film_slug = low(del_separator(del_screen_type(film_name)))
        if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
            obj = films.get(film_id)
            next_step = checking_obj(obj)
            if next_step:
                if obj:
                    kid = obj.kid
                else:
                    kid, info = film_identification(film_slug, None, {}, {}, source=source)
                objt = None
                if kid:
                    create_new, objt = unique_func(fdict, kid, obj)
                    if create_new:
                        new = create_sfilm(film_id, kid, source, film_name)
                        films[film_id] = new
                        if not fdict.get(kid):
                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                        fdict[kid]['script_rel'].append(new)
                elif not obj:
                    data_nof_films += xml_noffilm(film_name, film_slug, None, None,
                                                  film_id, info, None, source.id)
                    noffilms.append(film_id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('xml', source.dump, 'films', 'Фильмы')
                            'utf-8') not in films_slugs:
                                data_nof_film += xml_noffilm(name_ru, name_slug, name_ua, None,
                                                             film_id.encode('utf-8'), info,
                                                             url.encode('utf-8'), source.id)
        # Pause for 1-3 seconds every third request to the source.
        if index % 3 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_film)
    create_dump_file('okinoua_nof_film', settings.NOF_DUMP_PATH, xml_data)
    cron_success('html', 'okinoua', 'releases', 'Укр. релизы')


@timer
def get_okinoua_cities():
    """Parse the cities of Ukraine."""
    source = ImportSources.objects.get(url='http://www.okino.ua/')
    # Fetch the known city ids from the SourceCities table as a list.
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    # Open the page with the cities.
def get_luxor_cinemas():
    query = 'QueryCode=GetHalls'
    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')

    #create_dump_file('%s_cinemas' % source.dump, settings.API_DUMP_PATH, data)

    data_nof_cinema = ''
    data_nof_city = ''
    data_nof_hall = ''
    nofcities = []
    nofcinemas = []

    cinemas = get_source_data(source, 'cinema', 'dict')
    cities = get_source_data(source, 'city', 'dict')
    ignored_cinemas = get_ignored_cinemas()
    halls = get_source_data(source, 'hall', 'dict')

    '''
    xml = open('%s/dump_%s_cinemas.xml' % (settings.API_DUMP_PATH, source.dump), 'r')  # temp
    data = xml.read()  # temp
    xml.close()  # temp
    '''

    xml_data = BeautifulSoup(data, from_encoding="utf-8")
    for cinema in xml_data.findAll('theatre'):
        cinema_id = cinema['id'].encode('utf-8')
        cinema_name = cinema.find('name').text.encode('utf-8')
        cinema_name = cinema_name.replace('[CDATA[', '').replace(']]', '').strip()
        cinema_slug = low(del_separator(cinema_name))
        cinema_alt_name = 'Люксор'
        cinema_alt_slug = 'люксор'
        address = cinema.find('address').text.encode('utf-8')
        address = address.replace('[CDATA[', '').replace(']]', '').replace('"', "'").strip()

        # Luxor names its cinemas after the city, so the cinema slug doubles
        # as the city key.
        city_obj = cities.get(cinema_slug.decode('utf-8'))
        if not city_obj and cinema_slug not in nofcities:
            city = City.objects.filter(name__name=cinema_slug, name__status=2).distinct('pk')
            if city.count() == 1:
                city_obj = SourceCities.objects.create(
                    source_id=cinema_slug,
                    source_obj=source,
                    city=city[0],
                    name=cinema_name,
                )
                cities[cinema_slug] = city_obj
            else:
                data_nof_city += '<city name="%s" slug="%s" info="%s"></city>' % (
                    cinema_name, cinema_slug, address)
                nofcities.append(cinema_slug)

        if city_obj:
            cinema_obj = cinemas.get(cinema_id)
            city_kid = city_obj.city.kid
            cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_kid)
            if cinema_ig_id not in ignored_cinemas:
                if not cinema_obj:
                    filter1 = {
                        'name__name': cinema_slug,
                        'name__status': 2,
                        'city__kid': city_kid,
                    }
                    cinema_kid = cinema_identification(cinema_slug, filter1, {}, city_kid)
                    if cinema_kid:
                        cin_obj = Cinema.objects.get(code=cinema_kid)
                        cinema_obj = SourceCinemas.objects.create(
                            source_id=cinema_id,
                            source_obj=source,
                            city=city_obj,
                            cinema=cin_obj,
                            name=cinema_name,
                        )
                        cinemas[cinema_id] = cinema_obj
                    else:
                        city_name = ''
                        for n in city_obj.city.name.all():
                            if n.status == 1:
                                city_name = n.name.encode('utf-8')
                        data_nof_cinema += '<cinema name="Люксор %s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                            cinema_name, cinema_slug, city_name, city_kid)
                if cinema_obj:
                    for i in cinema.findAll('hall'):
                        hall_id = i['id'].encode('utf-8')
                        hall_name = i.find('name').string.encode('utf-8')
                        hall_name = hall_name.replace('[CDATA[', '').replace(']]', '').strip()
                        hall_slug = low(del_separator(hall_name))
                        hall_obj = halls.get(hall_id)
                        if not hall_obj:
                            hall_obj = Hall.objects.filter(name__name=hall_slug,
                                                           cinema=cinema_obj.cinema).distinct('pk')
                            if hall_obj.count() == 1:
                                hall_kid = hall_obj[0].kid
                                SourceHalls.objects.create(
                                    source_id=hall_id,
                                    source_obj=source,
                                    cinema=cinema_obj,
                                    name=hall_name,
                                    kid=hall_kid,
                                )
                            else:
                                city_name = ''
                                for n in city_obj.city.name.all():
                                    if n.status == 1:
                                        city_name = n.name.encode('utf-8')
                                cinema_kid = cinema_obj.cinema.code
                                hall_nof_id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name, hall_slug)
                                hall_nof_id = hall_nof_id.replace(' ', '')
                                data_nof_hall += '<hall city="%s" city_kid="%s" cinema="Люксор %s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (
                                    city_name, city_kid, cinema_name, cinema_kid,
                                    hall_name, hall_slug, hall_nof_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_hall)
    cron_success('xml', source.dump, 'cities_and_cinemas', 'Города и кинотеатры')
def get_okinoua_cinemas():
    """Parse the cinemas of Ukraine."""
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # The list of already identified OkinoUA cinemas.
    cinemas_ids = get_source_data(source, 'cinema', 'list')
    data_nof_cinema = ''

    # A dict of the identified OkinoUA cities.
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')

    cinemas = Cinema.objects.all()
    cinemas_dict = {}
    for i in cinemas:
        cinemas_dict[i.code] = i

    counter = 0

    # Open each city page, if reachable, and parse it with BeautifulSoup.
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            # Find every cinema block and read the cinema id and name out of it.
            for div in page.findAll('div', {'class': 'item0'}):
                cinema_tag = div.find('h3')
                cinema_id = cinema_tag.a.get('href').replace('/', '')
                cinema_name = cinema_tag.a.string.encode('utf-8')
                cinema_slug = low(del_separator(cinema_name))
                if cinema_id not in cinemas_ids:
                    filter1 = {
                        'name__name': cinema_slug,
                        'name__status': 2,
                        'city__id': city_obj.city_id,
                    }
                    cinema_kid = cinema_identification(cinema_slug, filter1)
                    if cinema_kid:
                        try:
                            cinema = Cinema.objects.get(code=cinema_kid)
                            cinema_obj = SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cinema,
                                name=cinema_name,
                            )
                        except Cinema.DoesNotExist:
                            pass
                    else:
                        if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug,
                                city_obj.name.encode('utf-8'), city_obj.city.kid)
        # Throttle: pause for 1-3 seconds every fourth city.
        if counter % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('okinoua_nof_cinema', settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', 'okinoua', 'cinemas', 'Укр. кинотеатры')
def get_kinoteatrua_schedules():
    opener = give_me_cookie()
    # Cities with their kino-teatr.ua bill pages; status=1 cities are
    # processed first (see the sort below).
    kinoteatrua_urls = [
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-alushta.phtml', 'name': 'Алушта', 'id': 51},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-alchevsk.phtml', 'name': 'Алчевск', 'id': 42},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-artemovsk.phtml', 'name': 'Артемовск', 'id': 37},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-belaya-tserkov.phtml', 'name': 'Белая церковь', 'id': 25},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-berdyansk.phtml', 'name': 'Бердянськ', 'id': 64},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-borispol.phtml', 'name': 'Борисполь', 'id': 38},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-brovary.phtml', 'name': 'Бровары', 'id': 2},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-bucha.phtml', 'name': 'Буча', 'id': 12},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-vinnitsa.phtml', 'name': 'Винница', 'id': 20},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-gorlovka.phtml', 'name': 'Горловка', 'id': 46},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-dnepropetrovsk.phtml', 'name': 'Днепропетровск', 'id': 5},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-donetsk.phtml', 'name': 'Донецк', 'id': 6},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-evpatoriya.phtml', 'name': 'Евпатория', 'id': 39},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-zhitomir.phtml', 'name': 'Житомир', 'id': 17},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-zaporozhe.phtml', 'name': 'Запорожье', 'id': 18},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-ivano-frankovsk.phtml', 'name': 'Ивано-Франковск', 'id': 7},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-irpen.phtml', 'name': 'Ирпень', 'id': 3},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kamenets-podolskiy.phtml', 'name': 'Каменец-Подольский', 'id': 40},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-kahovka.phtml', 'name': 'Каховка', 'id': 54},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-kerch.phtml', 'name': 'Керчь', 'id': 35},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kiev.phtml', 'name': 'Киев', 'id': 1},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kirovograd.phtml', 'name': 'Кировоград', 'id': 36},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-kovel.phtml', 'name': 'Ковель', 'id': 31},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-kolomiya.phtml', 'name': 'Коломыя', 'id': 58},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-komsomolsk.phtml', 'name': 'Комсомольск', 'id': 62},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-konotop.phtml', 'name': 'Конотоп', 'id': 52},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-korosten.phtml', 'name': 'Коростень', 'id': 49},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kramatorsk.phtml', 'name': 'Краматорск', 'id': 14},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-krasnyy-luch.phtml', 'name': 'Красный Луч', 'id': 55},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kremenchug.phtml', 'name': 'Кременчуг', 'id': 41},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-krivoy-rog.phtml', 'name': 'Кривой Рог', 'id': 15},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-lubny.phtml', 'name': 'Лубны', 'id': 59},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-lugansk.phtml', 'name': 'Луганск', 'id': 32},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-lutsk.phtml', 'name': 'Луцк', 'id': 29},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-lvov.phtml', 'name': 'Львов', 'id': 9},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-makeevka.phtml', 'name': 'Макеевка', 'id': 67},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-mariupol.phtml', 'name': 'Мариуполь', 'id': 19},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-melitopol.phtml', 'name': 'Мелитополь', 'id': 57},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-mukachevo.phtml', 'name': 'Мукачево', 'id': 43},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-nikolaev.phtml', 'name': 'Николаев', 'id': 11},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-nikopol.phtml', 'name': 'Никополь', 'id': 65},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-novaya-kahovka.phtml', 'name': 'Новая Каховка', 'id': 53},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-odessa.phtml', 'name': 'Одесса', 'id': 4},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-pavlograd.phtml', 'name': 'Павлоград', 'id': 66},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-pervomaysk.phtml', 'name': 'Первомайськ', 'id': 61},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-poltava.phtml', 'name': 'Полтава', 'id': 27},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-rovno.phtml', 'name': 'Ровно', 'id': 26},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-svetlovodsk.phtml', 'name': 'Светловодск', 'id': 60},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-sevastopol.phtml', 'name': 'Севастополь', 'id': 23},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-severodonetsk.phtml', 'name': 'Северодонецк', 'id': 56},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-simferopol.phtml', 'name': 'Симферополь', 'id': 21},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-stahanov.phtml', 'name': 'Стаханов', 'id': 44},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-sumy.phtml', 'name': 'Сумы', 'id': 16},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-ternopol.phtml', 'name': 'Тернополь', 'id': 10},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-uzhgorod.phtml', 'name': 'Ужгород', 'id': 33},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-fastov.phtml', 'name': 'Фастов', 'id': 45},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-feodosiya.phtml', 'name': 'Феодосия', 'id': 48},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kharkov.phtml', 'name': 'Харьков', 'id': 13},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-kherson.phtml', 'name': 'Херсон', 'id': 34},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-khmelnitskiy.phtml', 'name': 'Хмельницкий', 'id': 30},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-tsyurupinsk.phtml', 'name': 'Цюрупинск', 'id': 47},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-cherkassy.phtml', 'name': 'Черкассы', 'id': 8},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-chernigov.phtml', 'name': 'Чернигов', 'id': 28},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-chernovtsy.phtml', 'name': 'Черновцы', 'id': 24},
        {'status': 0, 'url': 'http://kino-teatr.ua/kinoafisha-shostka.phtml', 'name': 'Шостка', 'id': 50},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-yugnoe.phtml', 'name': 'Южное', 'id': 63},
        {'status': 1, 'url': 'http://kino-teatr.ua/kinoafisha-yalta.phtml', 'name': 'Ялта', 'id': 22},
    ]

    ignored = get_ignored_films()

    kinoteatrua_urls = sorted(kinoteatrua_urls, key=operator.itemgetter('status'), reverse=True)

    source = ImportSources.objects.get(url='http://kino-teatr.ua/')

    xml = open('%s/dump_%s_nof_film.xml' % (settings.NOF_DUMP_PATH, source.dump), 'r')
    xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
    xml.close()

    films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]

    data_nof_films = ''
    data_nof_cinemas = ''
    data_nof_city = ''
    noffilms = []
    nofcinemas = []

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    # A two-week window starting today.
    date_from = datetime.date.today()
    date_to = date_from + datetime.timedelta(days=14)
    dates = []
    delta = date_to - date_from
    for day in range(delta.days + 1):
        d = date_from + datetime.timedelta(days=day)
        dates.append({'str': d.strftime('%d.%m.%Y'), 'obj': d})

    def get_kinoteatr_data(opener, date, city_obj):
        nof_films = ''
        nof_cinemas = ''
        url = '%sru/main/bill/order/cinemas/date/%s.phtml' % (source.url, date['str'])
        req = opener.open(urllib2.Request(url))
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='news_page')
            if main:
                if main.find('center', {'class': 'xErr'}):
                    return nof_films, nof_cinemas, 'error'
                for cinema_tag in main.findAll('span', id='afishaKtName'):
                    cinema_name_block = cinema_tag.findAll('a', limit=1)[0]
                    cinema_name = cinema_name_block.text.encode('utf-8').replace('Кинотеатр', '')
                    cinema_slug = low(del_separator(del_screen_type(cinema_name)))
                    cinema_name = cinema_name.replace('"', "'").replace('&amp;', '&').strip()
                    cinema_id = cinema_name_block.get('href').replace('.phtml', '')
                    if 'cinema_id' in cinema_id:
                        cinema_id = cinema_id.replace(
                            'http://kino-teatr.ua/ru/main/cinema/cinema_id/', '').encode('utf-8')
                    else:
                        cinema_id = re.findall(r'\d+$', cinema_id)[0]
                    if cinema_id not in nofcinemas:
                        cinema_obj = cinemas_dict.get(str(cinema_id))
                        if not cinema_obj:
                            filter1 = {
                                'name__name': cinema_slug,
                                'name__status': 2,
                                'city__id': city_obj.city_id,
                            }
                            cinema_kid = cinema_identification(cinema_slug, filter1)
                            if cinema_kid:
                                try:
                                    cinema = Cinema.objects.get(code=cinema_kid)
                                    cinema_obj = SourceCinemas.objects.create(
                                        source_id=cinema_id,
                                        source_obj=source,
                                        city=city_obj,
                                        cinema=cinema,
                                        name=cinema_name,
                                    )
                                    cinemas_dict[str(cinema_id)] = cinema_obj
                                except Cinema.DoesNotExist:
                                    pass
                            else:
                                try:
                                    name_city = city_obj.name.encode('utf-8')
                                except UnicodeDecodeError:
                                    name_city = city_obj.name
                                nof_cinemas += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    cinema_name, cinema_slug, name_city, city_obj.city.kid)
                                nofcinemas.append(cinema_id)
                        if cinema_obj:
                            films_block = cinema_tag.find_next_sibling('div')
                            for film_block in films_block.findAll('div', id='afishaItem'):
                                film_name = film_block.find('div', {'class': 'filmName'})
                                full_url = film_name.a.get('href').encode('utf-8')
                                if film_name.a.text:
                                    film_name = film_name.a.text.encode('utf-8').strip()
                                    film_slug = low(del_separator(film_name))
                                    film_id = full_url.replace('http://kino-teatr.ua/film/',
                                                               '').replace('.phtml', '').encode('utf-8')
                                    if film_slug.decode('utf-8') not in ignored and film_id not in noffilms:
                                        obj = films.get(film_id)
                                        # Same relation check as the other parsers in this module.
                                        next_step = checking_obj(obj)
                                        if next_step:
                                            if obj:
                                                kid = obj.kid
                                            else:
                                                kid, info = film_identification(film_slug, None, {}, {},
                                                                                source=source)
                                            if not obj:
                                                if kid:
                                                    # Pull the Ukrainian title and synopsis from the /uk/ page.
                                                    uk_url = '%suk/film/%s' % (source.url, film_id)
                                                    uk_req = opener.open(urllib2.Request(uk_url))
                                                    if uk_req.getcode() == 200:
                                                        uk_data = BeautifulSoup(uk_req.read().decode('utf-8'),
                                                                                from_encoding="utf-8")
                                                        uk_name = uk_data.find(
                                                            'div', {'class': 'myriadFilm'}).text.encode('utf-8')
                                                        uk_text = uk_data.find('div', itemprop='description')
                                                        uk_text_data = uk_text.findAll('p', limit=1)
                                                        if uk_text_data:
                                                            uk_text = uk_text_data[0].text.encode('utf-8')
                                                        else:
                                                            uk_text = uk_text.text.encode('utf-8').strip()
                                                        uk_text = uk_text.replace(
                                                            'редактирование синопсиса', '').strip()
                                                        obj = create_sfilm(film_id, kid, source,
                                                                           uk_name, txt=uk_text)
                                                        films[film_id] = obj
                                                        if not fdict.get(kid):
                                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                                        fdict[kid]['script_rel'].append(obj)
                                                else:
                                                    if film_slug.decode('utf-8') not in films_slugs:
                                                        nof_films += xml_noffilm(film_name, film_slug,
                                                                                 None, None, film_id, info,
                                                                                 full_url, source.id)
                                                    noffilms.append(film_id)
                                            if obj:
                                                shows = film_block.find('div', {'class': 'filmShows'})
                                                for times in shows.findAll('a', {'class': 'time'}):
                                                    try:
                                                        hours, minutes = times.text.split(':')
                                                    except AttributeError:
                                                        # A <sup> mark inside the time breaks the split; drop it.
                                                        times.find('sup').extract()
                                                        hours, minutes = times.text.split(':')
                                                    dtime = datetime.datetime(
                                                        date['obj'].year, date['obj'].month, date['obj'].day,
                                                        int(hours), int(minutes))
                                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                                           city_slug, film_id)
                                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                    if sch_id not in schedules:
                                                        SourceSchedules.objects.create(
                                                            source_id=sch_id,
                                                            source_obj=source,
                                                            film=obj,
                                                            cinema=cinema_obj,
                                                            dtime=dtime,
                                                        )
                                                        schedules.append(sch_id)
        return nof_films, nof_cinemas, ''

    for ind, i in enumerate(kinoteatrua_urls):
        city_name = i['name']
        city_slug = low(del_separator(city_name))
        city_obj = cities_dict.get(str(i['id']))
        if not city_obj:
            city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
            if city.count() == 1:
                city_obj = SourceCities.objects.create(
                    source_id=i['id'],
                    source_obj=source,
                    city=city[0],
                    name=city_name,
                )
                cities_dict[str(i['id'])] = city_obj
            else:
                data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)
        if city_obj:
            opener = give_me_cookie()
            opener.addheaders.append(('Cookie', 'main::city_id=%s' % i['id']))
            city_req = opener.open(urllib2.Request(i['url']))
            if city_req.getcode() == 200:
                for index, date in enumerate(dates):
                    nof_film, nof_cinema, error = get_kinoteatr_data(opener, date, city_obj)
                    data_nof_films += nof_film
                    data_nof_cinemas += nof_cinema
                    if error:
                        break
                    if index % 3 == 0:
                        time.sleep(random.uniform(1.0, 3.0))
        if ind % 2 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    xml_data = str(xml_data).replace('<html><head></head><body><data>', '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, xml_data)
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinemas)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
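# The showtime-cell parsing above in isolation: kino-teatr.ua sometimes wraps
# a hall mark in <sup> inside the "HH:MM" link, which breaks a plain split,
# so the <sup> is dropped and the split retried.  A hedged standalone sketch
# (`time_tag` is a bs4 <a class="time"> tag); ValueError is also caught here,
# which the loop above does not do.
def parse_showtime(time_tag):
    try:
        hours, minutes = time_tag.text.split(':')
    except (AttributeError, ValueError):
        sup = time_tag.find('sup')
        if sup:
            sup.extract()
        hours, minutes = time_tag.text.split(':')
    return int(hours), int(minutes)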
def get_planeta_films():
    ignored = get_ignored_films()
    source = ImportSources.objects.get(url='http://planeta-kino.com.ua/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    data_nof_film = ''
    noffilms = []

    for i in planeta_kino_urls:
        xml = open('%s/dump_planetakino_%s.xml' % (settings.API_DUMP_PATH, i['city']), 'r')
        xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
        xml.close()

        for film in xml_data.findAll('movie'):
            film_id = film['id']
            if film_id not in noffilms:
                film_url = film['url']
                film_name = film.title.text.replace('"', "'").encode('utf-8').strip()
                film_slug = low(del_separator(del_screen_type(film_name)))
                if film_slug.decode('utf-8') not in ignored:
                    obj = films.get(film_id)
                    next_step = checking_obj(obj)
                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug, film_name, {}, {}, source=source)
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                new = create_sfilm(film_id, kid, source, film_name)
                                films[film_id] = new
                                if not fdict.get(kid):
                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                fdict[kid]['script_rel'].append(new)
                        elif not obj:
                            data_nof_film += xml_noffilm(film_name, film_slug, None, None,
                                                         film_id.encode('utf-8'), info, None, source.id)
                            noffilms.append(film_id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Фильмы')
def get_imdb_film_list():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    url = '%scalendar/?region=us' % source.url

    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))

    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        old_date = ''
        for h4 in div.findAll('h4'):
            release = h4.string.encode('utf-8')
            day, month, year = release.split()
            month = get_month_en(low(month))
            rel_date = '%s-%s-%s' % (year, month, day)
            xml += '<date v="%s">' % rel_date
            ul = h4.find_next('ul')
            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>', '').replace('</i>', '')
                            details = details.replace('(', '').replace(')', '')
                        else:
                            details = ''
                        if 'limited' not in low(details) and 'fest' not in low(details) or 'tv premiere' not in low(details):
                            film_name = li.a.string.encode('utf-8').replace('&quot;', '"').replace('&amp;', '&')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')
                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (
                                film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)
            xml += '</date>'

    ids = ';'.join(set(ids))
    xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)
    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')
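# Reading the dump written above back out: a usage sketch, assuming the
# <data><ids value="..."><date v="..."><film .../></date>...</ids></data>
# shape that get_imdb_film_list() serializes, and the module-level
# BeautifulSoup import.  The hypothetical helper returns a dict keyed by
# release date.
def read_imdb_film_list(path):
    with open(path, 'r') as f:
        doc = BeautifulSoup(f.read(), from_encoding="utf-8")
    releases = {}
    for date_tag in doc.findAll('date'):
        # Each <film> carries id / name / year in its id, n, and y attributes.
        releases[date_tag.get('v')] = [
            (film.get('id'), film.get('n'), film.get('y'))
            for film in date_tag.findAll('film')
        ]
    return releases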