def get_imdb_film():
    data_nof_persons, distr_nof_data, dump, good = get_imdb_data(None, True, 1)
    create_dump_file('%s_nof_person' % dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_persons)
    create_dump_file('%s_nof_distributor' % dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % distr_nof_data)
    cron_success('html', dump, 'films_data', 'Данные релизов')

def imdb_film_ident():
    source = ImportSources.objects.get(url='http://www.imdb.com/')

    films = Films.objects.filter(kid=None)
    films_ids = [i.imdb_id for i in films]

    exist_films = Film.objects.using('afisha').filter(idalldvd__in=films_ids)
    exist_ids = {}
    for i in exist_films:
        exist_ids[i.idalldvd] = i.id

    data_nof_film = ''
    for i in films:
        name = None
        for j in i.name.filter(status=1, language__id=2):
            name = j.name.encode('utf-8')
        slug = low(del_separator(name))
        kid = exist_ids.get(long(i.imdb_id))
        if kid:
            i.kid = kid
            i.save()
        else:
            full_url = '%stitle/tt%s/' % (source.url, i.imdb_id)
            data_nof_film += xml_noffilm(name, slug, None, None, i.imdb_id, 'Фильм не найден', full_url.encode('utf-8'), source.id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films_ident', 'Идентификация')

def nowru_ident():
    source = ImportSources.objects.get(url='http://www.now.ru/')
    ignored = get_ignored_films()
    data_nof_film = ''

    nowru_data = Nowru.objects.filter(kid=None)
    for i in nowru_data:
        name_ru_slug = low(del_separator(i.name_ru.encode('utf-8')))
        if name_ru_slug.decode('utf-8') not in ignored:
            name_en_slug = low(del_separator(i.name_en.encode('utf-8')))
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, year=i.year, source=source)
            if kid:
                i.kid = kid
                i.save()
            else:
                if 'slug="%s"' % name_ru_slug not in data_nof_film:
                    name_ru = i.name_ru.encode('utf-8')
                    name_en = i.name_en.encode('utf-8')
                    data_nof_film += xml_noffilm(name_ru, name_ru_slug, name_en, name_en_slug, i.nowru_id, info, None, source.id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')

def cron_dump_schedules_v4():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tschedules *\n' % datetime.datetime.now())
    res = query_schedule_v4(None, None)
    result_xml, result_json = get_schedule_v4(res, None, True)
    save_dump(result_xml, None, None, 'schedule_v4')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'schedule_v4', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'schedule_v4', 'schedules_v4', 'Сеансы v4')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tschedules\n' % datetime.datetime.now())

def cron_dump_releases_ua():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\treleases_ua *\n' % datetime.datetime.now())
    res = query_releases_ua(None)
    result_xml, result_json = get_releases_ua(res, None, True)
    save_dump(result_xml, None, None, 'releases_ua')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'releases_ua', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'releases_ua', 'releases_ua', 'Укр. релизы')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\treleases_ua\n' % datetime.datetime.now())

def cron_dump_film_trailers():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tfilm_trailers *\n' % datetime.datetime.now())
    res, version = query_film_trailers(None, None)
    result_xml, result_json = get_film_trailers(res, None, True)
    save_dump(result_xml, None, None, 'film_trailers')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'film_trailers', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'film_trailers', 'film_trailers', 'Трейлеры')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tfilm_trailers\n' % datetime.datetime.now())

def cron_dump_movie_reviews():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tmovie_reviews *\n' % datetime.datetime.now())
    res = query_movie_reviews(None, None)
    result_xml, result_json = get_movie_reviews(res, None, True)
    save_dump(result_xml, None, None, 'movie_reviews')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'movie_reviews', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'movie_reviews', 'movie_reviews', 'Рецензии')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tmovie_reviews\n' % datetime.datetime.now())

def cron_dump_imdb_rate():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\timdb_rate *\n' % datetime.datetime.now())
    res = query_imdb_rate(None, None)
    result_xml, result_json = get_imdb_rate(res, None, True)
    save_dump(result_xml, None, None, 'imdb_rate')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'imdb_rate', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'imdb_rate', 'imdb_rate', 'IMDB рейтинги')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\timdb_rate\n' % datetime.datetime.now())

def cron_dump_films_name():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tfilms_name *\n' % datetime.datetime.now())
    res = query_films_name(None)
    result_xml, result_json = get_films_name(res, None, True)
    save_dump(result_xml, None, None, 'films_name')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'films_name', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'films_name', 'films_name', 'Названия фильмов')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tfilms_name\n' % datetime.datetime.now())

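# The cron_dump_* functions above (and cron_dump_cinemas, cron_dump_screens,
# cron_dump_films below) repeat the same "append a timestamp to
# api_time_log.txt" pattern. A hypothetical helper, not part of the original
# module, could replace those open/write calls and close the handle explicitly:
def log_api_time(label):
    # append "<timestamp>\t<label>\n" to the shared API time log
    log = open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a')
    try:
        log.write('%s\t%s\n' % (datetime.datetime.now(), label))
    finally:
        log.close()
# usage: log_api_time('films_name *') before the dump, log_api_time('films_name') after
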
def vkinocomua_schedules_export_to_kinoafisha():
    from release_parser.views import schedules_export
    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    authors = (source.code, 0, 75, 100)
    log = schedules_export(source, authors, False)
    # write the log to an xml file
    create_dump_file('%s_export_to_kinoafisha_log' % source.dump, settings.LOG_DUMP_PATH, '<data>%s</data>' % log)
    cron_success('export', source.dump, 'schedules', 'Сеансы')

def cron_dump_cinemas():
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tcinemas *\n' % datetime.datetime.now())
    res = query_cinema(None)
    result_xml, result_json = get_cinema(res, None, True)
    save_dump(result_xml, None, None, 'cinema')
    save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'cinema', '', 'json')
    # assumption: the dump label matches the dump name; `source` was undefined here
    cron_success('api', 'cinema', 'cinemas', 'Кинотеатры')
    open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tcinemas\n' % datetime.datetime.now())

def get_okinoua_links():
    ''' Fetch the urls of Ukrainian releases '''
    links = []

    def get_link_from_tag(i):
        tag = i.find('p', {'class': 'name'})
        film_id = tag.a.get('href').replace('/film/', '').replace('/', '')
        link = 'http://www.okino.ua%s' % tag.a.get('href')
        return link, film_id

    url = 'http://www.okino.ua/comingsoon/'
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        html_data = BeautifulSoup(req.read(), from_encoding="utf-8")
        divs = [
            {'class': 'film'},
            {'class': 'film last'},
            {'class': 'film film-s'},
            {'class': 'film last film-s'},
        ]
        for div in divs:
            for i in html_data.findAll('div', div):
                distr = None
                link, film_id = get_link_from_tag(i)
                for j in i.findAll('p'):
                    if u'Дистрибьютор:' in j.text:
                        distr = j.text.split(':')
                        distr = distr[1].strip()
                links.append({'link': link, 'distr': distr, 'id': film_id})

        f = open('%s/dump_okino.ua.links.xml' % settings.API_DUMP_PATH, 'w')
        xml = ''
        for i in links:
            xml += '<release>'
            xml += '<link value="%s"></link>' % i['link']
            # escape '&' so the attribute stays valid XML; guard against a missing distributor
            xml += '<distr value="%s"></distr>' % (i['distr'] or '').replace('&', '&amp;')
            xml += '<id value="%s"></id>' % i['id']
            xml += '</release>'
        f.write('<data>%s</data>' % xml.encode('utf-8'))
        f.close()
    cron_success('html', 'okino.ua', 'links', 'Ссылки укр. релизов')

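# The dump writers in this module build XML by string concatenation, which
# breaks whenever a value contains '&', '<' or '"'. A hypothetical helper,
# not in the original module, based on the standard library could make the
# escaping uniform across all of them:
from xml.sax.saxutils import quoteattr

def xml_attr(value):
    # return the value quoted and escaped for use as an XML attribute,
    # e.g. xml_attr('Fast & Furious') -> '"Fast &amp; Furious"'
    if isinstance(value, str):
        value = value.decode('utf-8')
    return quoteattr(value or u'').encode('utf-8')
# usage: xml += '<link value=%s></link>' % xml_attr(i['link'])
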
def get_rutracker_topics_closed():
    REG_SIZE = re.compile(r'\[\d+\.?\d+?\s?\w+\]')
    REG_SLUG = re.compile(ur'[a-zа-я0-9]+')
    source = ImportSources.objects.get(url='http://rutracker.org/')

    films = SourceFilms.objects.filter(source_obj=source)
    films_dict = {}
    for i in films:
        films_dict[i.name_alter] = i

    url = 'http://rutracker.org/forum/index.php?closed=1'
    req = urllib.urlopen(url)
    for_del = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="windows-1251")
        nav = data.find('ul')
        if nav:
            for i in nav.findAll('li'):
                title = i.b.text.strip().encode('utf-8')
                if ' / ' in title:
                    name_alt = re.findall(REG_SLUG, low(title).decode('utf-8'))
                    name_alt = ''.join(name_alt)
                    obj = films_dict.get(name_alt)
                    if obj:
                        for_del.append(obj.id)
    SourceFilms.objects.filter(pk__in=set(for_del)).delete()
    cron_success('xml', source.dump, 'films_closed', 'Закрытые фильмы')

def cron_dump_screens():
    vers = [{'ver': 1, 'name': 'screens'}, {'ver': 2, 'name': 'screens_v2'}]
    qresult = query_screens(None)
    for i in vers:
        open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tСеансы v.%s (Дания) *\n' % (datetime.datetime.now(), i['ver']))
        result_xml, result_json = get_screens(qresult, i['ver'], None, True)
        save_dump(result_xml, None, None, i['name'])
        save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, i['name'], '', 'json')
        # assumption: the dump label matches the dump name; `source` was undefined here
        cron_success('api', i['name'], i['name'], 'Сеансы v.%s (Дания)' % i['ver'])
        open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tСеансы v.%s (Дания)\n' % (datetime.datetime.now(), i['ver']))

def cron_dump_films():
    years_list = ['1990', '1990_1999', '2000_2009', '2010_2011'] + map(str, range(2012, datetime.date.today().year + 1))
    for i in years_list:
        open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tfilms_%s *\n' % (datetime.datetime.now(), i))
        res = get_year_films(i)
        result_xml, result_json = get_film(res, None, None, True)
        save_dump(result_xml, None, None, 'film', i, 'xml')
        save_dump(simplejson.dumps(result_json, ensure_ascii=False).encode('utf-8'), None, None, 'film', i, 'json')
        # assumption: the dump label matches the dump name; `source` was undefined here
        cron_success('api', 'film', 'films_%s' % i, 'Фильмы %s' % i)
        open('%s/api_time_log.txt' % settings.API_DUMP_PATH, 'a').write('%s\tfilms_%s\n' % (datetime.datetime.now(), i))

def get_tvzavr_dump():
    ''' Fetch the film dump '''
    source = ImportSources.objects.get(url='http://www.tvzavr.ru/')
    main_url = '%sapi/mgm/sitemap-video.xml' % source.url
    req = urllib.URLopener()
    path = '%s/dump_%s_index.xml' % (settings.API_DUMP_PATH, source.dump)
    req.retrieve(main_url, path)
    cron_success('xml', source.dump, 'index', 'Дамп с фильмами')

def get_kinoteatrua_releases():
    ''' Fetch Ukrainian release dates '''
    opener = give_me_cookie()
    source = ImportSources.objects.get(url='http://kino-teatr.ua/')

    films_dict = get_source_data(source, 'film', 'dict')

    releases = SourceReleases.objects.select_related('film').filter(source_obj=source)
    releases_dict = {}
    for i in releases:
        releases_dict[i.film.source_id] = i

    url = '%sfilms-near.phtml' % source.url
    req = opener.open(urllib2.Request(url))
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for i in data.findAll('a', {'class': 'searchItemLink'}):
            film_url = i.get('href')
            film_id = film_url.replace('http://kino-teatr.ua/film/', '').replace('.phtml', '').encode('utf-8')
            film_obj = films_dict.get(film_id)
            if film_obj:
                req2 = opener.open(urllib2.Request(film_url))
                if req2.getcode() == 200:
                    data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                    block = data2.find('div', id='filmInfo')
                    strong = block.find('strong', text=u"Премьера (в Украине): ")
                    day, month, year = strong.find_next_sibling("a").text.strip().split('.')
                    showdate = datetime.date(int(year), int(month), int(day))

                    release_obj = releases_dict.get(film_id)
                    if release_obj:
                        if release_obj.release != showdate:
                            release_obj.release = showdate
                            release_obj.save()
                    else:
                        release_obj = SourceReleases.objects.create(
                            source_obj=source,
                            film=film_obj,
                            release=showdate,
                        )
                        releases_dict[film_id] = release_obj
            # the original condition `ind % 1 == 0` is always true, so throttle every request
            time.sleep(random.uniform(1.0, 3.0))
    cron_success('html', source.dump, 'releases', 'Укр.релизы')

def get_imdb_film_list():
    source = ImportSources.objects.get(url='http://www.imdb.com/')

    url = '%scalendar/?region=us' % source.url
    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))

    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        for h4 in div.findAll('h4'):
            release = h4.string.encode('utf-8')
            day, month, year = release.split()
            month = get_month_en(low(month))
            rel_date = '%s-%s-%s' % (year, month, day)
            xml += '<date v="%s">' % rel_date
            ul = h4.find_next('ul')
            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>', '').replace('</i>', '')
                            details = details.replace('(', '').replace(')', '')
                        else:
                            details = ''
                        if 'limited' not in low(details) and 'fest' not in low(details) and 'tv premiere' not in low(details):
                            # escape for the XML attribute: '&' first, then '"'
                            film_name = li.a.string.encode('utf-8').replace('&', '&amp;').replace('"', '&quot;')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')
                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)
            xml += '</date>'

    ids = ';'.join(set(ids))
    xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)
    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')

def get_rambler_indexfile():
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    # dump_rambler_index.xml
    url = 'http://api.kassa.rambler.ru/v2/%s/xml/Movie/export/sale/' % RAMBLER_API_KEY
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = req.read()
        if 'InvalidClientIp' in data:
            return HttpResponse(str('InvalidClientIp'))
        create_dump_file('%s_index' % source.dump, settings.API_DUMP_PATH, data)
        cron_success('xml', source.dump, 'index', 'Индексный файл')
    return HttpResponse(str('OK'))

def cinemate_cc_get_links():
    source = ImportSources.objects.get(url='http://cinemate.cc/')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)[:50]
    for i in source_films:
        films[int(i.source_id)] = i

    torrents = list(CinemateTorrents.objects.filter(film__source_id__in=films.keys()).values_list('go_link_id', flat=True))

    opener = give_me_cookie()
    for source_id, film in films.iteritems():
        url = '%smovie/%s/links/#tabs' % (source.url, source_id)
        req = opener.open(urllib2.Request(url))
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        table = data.find('div', {'class': "table"})
        for div in table.findAll('div', {'class': "row delimiter"}):
            td_div = div.findAll('div')
            tracker = td_div[2].text.strip().encode('utf-8')
            quality = td_div[3].text.strip().encode('utf-8')
            size = td_div[-1].text.strip().encode('utf-8')
            link_id = div.find('a', {'class': "icon_t download-link"}).get('href', '').replace('/go/s/', '').replace('/', '')
            if link_id not in torrents:
                go_url = '%sgo/s/%s' % (source.url, link_id)
                go_req = opener.open(urllib2.Request(go_url))
                go_data = BeautifulSoup(go_req.read(), from_encoding="utf-8")
                main = go_data.find('div', {'class': "main"})
                a = main.find('a', rel="nofollow").get('href')
                CinemateTorrents.objects.create(
                    film=film,
                    go_link_id=link_id,
                    link=a,
                    tracker=tracker,
                    quality=quality,
                    file_size=size,
                )
        time.sleep(random.uniform(0.8, 1.2))
    cron_success('html', source.dump, 'links', 'Ссылки на трекеры')

def nowru_player_to_kinoafisha():
    source = ImportSources.objects.get(url='http://www.now.ru/')

    nowru_data = Nowru.objects.exclude(kid=None)
    nowru_ids = [i.kid for i in nowru_data]

    ivi_data = SourceFilms.objects.exclude(kid__in=set(nowru_ids)).filter(source_obj__url="http://antipiracy.ivi.ru/")
    ivi_ids = [i.kid for i in ivi_data]

    nowru_ivi = nowru_ids + ivi_ids

    megogo_data = MovieMegogo.objects.exclude(Q(afisha_id=0) | Q(afisha_id=None) | Q(afisha_id__in=set(nowru_ivi)))
    megogo_ids = [i.afisha_id for i in megogo_data]

    nowru_ivi_megogo = set(nowru_ivi + megogo_ids)

    afisha_code = FilmsCodes.objects.using('afisha').exclude(player='').filter(film__id__in=nowru_ivi_megogo)
    afisha_code_dict = {}
    for i in afisha_code:
        afisha_code_dict[i.film_id] = i

    for ind, data in enumerate((nowru_data, ivi_data, megogo_data)):
        for i in data:
            if ind == 0:  # now.ru
                kid = i.kid
                player = i.player_code
            elif ind == 1:  # ivi
                kid = i.kid
                player = i.text
            elif ind == 2:  # megogo
                kid = i.afisha_id
                player = '<iframe width="607" height="360" src="http://megogo.net/e/%s" frameborder="0" allowfullscreen></iframe>' % i.megogo_id
            if kid:
                afisha_obj = afisha_code_dict.get(kid)
                if afisha_obj:
                    afisha_obj.player = player
                    afisha_obj.save()
                else:
                    FilmsCodes.objects.using('afisha').create(
                        film_id=kid,
                        player=player,
                    )
    cron_success('export', source.dump, 'players', 'Онлайн плееры')

def get_luxor_schedules():
    query = 'QueryCode=GetSessions'
    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')
    #create_dump_file('%s_schedules' % source.dump, settings.API_DUMP_PATH, data)
    '''
    # temp: read the dump from disk instead of the socket
    xml = open('%s/dump_%s_schedules.xml' % (settings.API_DUMP_PATH, source.dump), 'r')
    data = xml.read()
    xml.close()
    '''
    films = get_source_data(source, 'film', 'dict')
    cinemas = get_source_data(source, 'cinema', 'dict')
    halls = get_source_data(source, 'hall', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    xml_data = BeautifulSoup(data, from_encoding="utf-8")
    for session in xml_data.findAll('session'):
        sch_id = session['id']
        if sch_id not in schedules:
            cinema_id = session.theatre['id'].encode('utf-8')
            hall_id = session.theatre.hall['id'].encode('utf-8')
            film_id = session.movie['id'].encode('utf-8')

            cinema_obj = cinemas.get(cinema_id)
            film_obj = films.get(film_id)
            hall_obj = halls.get(hall_id)

            if cinema_obj and film_obj and hall_obj:
                showdate = session.date.string.encode('utf-8')
                showtime = session.time.string.encode('utf-8')
                day, month, year = showdate.split('.')
                hours, minutes = showtime.split(':')
                dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))

                SourceSchedules.objects.create(
                    source_id=sch_id,
                    source_obj=source,
                    film=film_obj,
                    cinema=cinema_obj,
                    dtime=dtime,
                    hall=hall_obj.kid,
                )
    cron_success('xml', source.dump, 'schedules', 'Сеансы')

def get_currency_rate():
    source = ImportSources.objects.get(url='http://cbrf.magazinfo.ru/')
    pairs = [
        {
            'url': '%srur/USD' % source.url,
            'cur_1': '4',  # ruble
            'cur_2': '1',  # US dollar
        },
        {
            'url': '%srur/AUD' % source.url,
            'cur_1': '4',  # ruble
            'cur_2': '3',  # Australian dollar
        },
    ]
    for i in pairs:
        req = urllib.urlopen(i['url'])
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            table = page.find('table', border="1", cellspacing="0", cellpadding="5")
            tr = table.findAll('tr', limit=2)
            td = tr[1].findAll('td')

            cur_day, cur_month, cur_year = td[0].text.split('.')
            cur_date = datetime.datetime(int(cur_year), int(cur_month), int(cur_day))
            value = td[1].text.encode('utf-8')

            obj, created = CurrencyRate.objects.get_or_create(
                currency=i['cur_1'],
                by_currency=i['cur_2'],
                defaults={
                    'currency': i['cur_1'],
                    'by_currency': i['cur_2'],
                    'country_id': 2,
                    'date': cur_date,
                    'value': value,
                })
            if not created:
                # refresh an existing rate; a freshly created row already has these values
                obj.value = value
                obj.date = cur_date
                obj.save()
    get_currency_rate_NZD()
    cron_success('html', source.dump, 'currency_rate', 'Курс валют')

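# Several parsers in this module repeat the "dd.mm.yyyy" split-and-int dance
# (get_kinoteatrua_releases, get_luxor_schedules, get_currency_rate). A tiny
# hypothetical helper, not part of the original module, would centralize it:
def parse_dotted_date(value):
    # 'dd.mm.yyyy' -> datetime.date; raises ValueError on malformed input
    day, month, year = value.strip().split('.')
    return datetime.date(int(year), int(month), int(day))
# usage: parse_dotted_date('25.07.2014') == datetime.date(2014, 7, 25)
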
def raspishi_relations():
    source = ImportSources.objects.get(url='http://распиши.рф/')
    ignored = get_ignored_films()
    data_nof_film = ''

    domain = u'распиши.рф'
    url = 'http://%s/getfilmxml.php' % domain.encode('idna')

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        films_rid = list(RaspishiRelations.objects.exclude(kid=0).values_list('rid', flat=True))
        xml_data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for i in xml_data.findAll('movie'):
            id = int(i['id'])
            if id not in films_rid:
                name_ru = i.find('name').text.encode('utf-8')
                name_en = i.find('nameeng').text.encode('utf-8')

                name_ru = re.sub(r'\(.*?\)', '', name_ru).strip()
                name_en = re.sub(r'\(.*?\)', '', name_en).strip()

                name_slug = low(del_separator(del_screen_type(name_ru)))
                name_en_slug = low(del_separator(del_screen_type(name_en)))
                if name_slug.decode('utf-8') not in ignored:
                    try:
                        kid, info = film_identification(name_slug, None, {}, {}, source=source)
                        if kid:
                            created = RaspishiRelations.objects.create(
                                rid=id,
                                kid=kid,
                                name_ru=name_ru,
                                name_en=name_en,
                            )
                        else:
                            data_nof_film += xml_noffilm(name_ru, name_slug, name_en, name_en_slug, id, info, None, source.id)
                    # broad catch: the original named a MySQL OperationalError via an invalid module path
                    except Exception:
                        pass
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Укр. сеансы')

def get_ktmir_and_ktrussia_schedules():
    city_name = 'Балаково'

    cinema_name = 'Мир'
    source = 'http://ktmir.ru/'
    data_nof_film = page_parser(city_name, cinema_name, source)
    create_dump_file('ktmir_nof_film', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', 'ktmir', 'schedules', 'Сеансы')

    cinema_name = 'Россия'
    source = 'http://kt-russia.ru/'
    data_nof_film = page_parser(city_name, cinema_name, source)
    create_dump_file('ktrussia_nof_film', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', 'ktrussia', 'schedules', 'Сеансы')

def get_okinoua_cities():
    """ Parse the list of Ukrainian cities """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # ids of cities already known from the SourceCities table
    cities_ids = get_source_data(source, 'city', 'list')

    data_nof_city = ''

    # open the page listing the cities
    url = '%skinoafisha-kiev/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        page = BeautifulSoup(req.read(), from_encoding="utf-8")
        # find all city tags and read each city's id and name
        for ul in page.findAll('ul', {'class': 'blist'}):
            for li in ul.findAll('li'):
                id = li.a.get('href').replace('/', '')
                name = li.a.string.encode('utf-8').strip()
                name_slug = low(del_separator(name))
                # compare against the cities in our DB; if there is no match yet,
                if id not in cities_ids:
                    # try to identify the new city
                    city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                    # if identified, record it in SourceCities
                    if city.count() == 1:
                        SourceCities.objects.create(
                            source_id=id,
                            source_obj=source,
                            city=city[0],
                            name=name,
                        )
                    # otherwise write the unmatched city to xml for later review
                    else:
                        if 'slug="%s"' % name_slug not in data_nof_city:
                            data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
    create_dump_file('okinoua_nof_city', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', 'okinoua', 'cities', 'Укр. города')

def get_rambler_cities():
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_city.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
    # --- end localhost
    '''
    # SERVER
    # dump_rambler_city.xml
    url = 'http://api.kassa.rambler.ru/v2/%s/xml/cities/' % RAMBLER_API_KEY
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        xml = BeautifulSoup(req.read(), from_encoding="utf-8")
        # --- end server
        for i in xml.findAll('city'):
            id = i.cityid.string
            name = i.find('name').string.encode('utf-8')
            name_slug = low(del_separator(name))
            if id not in cities_ids:
                city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                    )
                else:
                    if 'slug="%s"' % name_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('xml', source.dump, 'cities', 'Города')

def get_premierzal_cities():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    cities = get_source_data(source, 'city', 'list')
    data_nof_city = ''

    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        block = data.find('div', {'class': 'drop'})
        for i in block.findAll('a'):
            city_name = i.text.encode('utf-8').strip()
            city_id = low(del_separator(city_name))
            if city_id.decode('utf-8') not in cities:
                city = City.objects.filter(name__name=city_id, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_id)
                cities.append(city_id.decode('utf-8'))
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', source.dump, 'cities', 'Города')

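# get_okinoua_cities, get_rambler_cities, get_premierzal_cities and
# get_zapad24ru (below) all repeat the same identify-or-report step for a new
# city. A hypothetical consolidation, not part of the original module, could
# look like this:
def identify_city(source, source_id, name, name_slug):
    # returns (SourceCities object, '') when exactly one City matches the slug,
    # otherwise (None, xml fragment describing the unmatched city)
    city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
    if city.count() == 1:
        obj = SourceCities.objects.create(
            source_id=source_id,
            source_obj=source,
            city=city[0],
            name=name,
        )
        return obj, ''
    return None, '<city name="%s" slug="%s"></city>' % (name, name_slug)
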
def get_ivi_file():
    ''' Fetch the txt data file '''
    source = ImportSources.objects.get(url='http://antipiracy.ivi.ru/')
    films = get_source_data(source, 'film', 'list')
    url = '%s-/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        links = BeautifulSoup(req.read(), from_encoding="windows-1251")
        for i in links.findAll('a'):
            link = i.string.encode('utf-8')
            if 'in one file.txt' in link:
                req2 = urllib.urlopen('%s%s' % (url, i.get('href')))
                data = BeautifulSoup(req2.read(), from_encoding="windows-1251")
                file = str(data).replace('<html><head></head><body>', '').replace('</body></html>', '')
                create_dump_file(source.dump, settings.API_DUMP_PATH, file, 'txt')
    cron_success('html', source.dump, 'file', 'txt файл с данными')

def get_zapad24ru():
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.now()
    next_month = datetime.date.today() + datetime.timedelta(days=40)

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []

    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  # , from_encoding="utf-8"
        div = data.find('div', align="left")
        for ind, table in enumerate(div.findAll('table', border="0", cellpadding="0", cellspacing="0", width="100%")):
            cinema_tag = table.find('strong').string.encode('utf-8')

            cinema_name = re.findall(r'\".+\"', cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')

            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace('(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')

            city_obj = cities_dict.get(city_id)
            if not city_obj:
                city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)

            if city_obj:
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid)
                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                        cinema_kid = cinema_identification(cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city_obj.city.kid)

                    if cinema_obj:
                        film_table = table.find('table')
                        date_from = None
                        date_to = None
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                film_name = tr.find('b').string.encode('utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find('span').string.encode('utf-8').strip()
                                    else:
                                        film_name = f.string.encode('utf-8').strip()
                                        film_name = re.findall(r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace('«', '').replace('»', '').strip()
                                        film_slug = low(del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')
                                if showdate and film_name:
                                    try:
                                        date_from, date_to = showdate.split('-')
                                        date_from_day, date_from_month = date_from.strip().split('.')
                                        date_to_day, date_to_month = date_to.strip().split('.')
                                    except ValueError:
                                        date_from, date_to = showdate.split(' – ')
                                        date_from_day, date_from_month = date_from.strip().split()
                                        date_from_month = get_month(date_from_month)
                                        date_to_day, date_to_month = date_to.strip().split()
                                        date_to_month = get_month(date_to_month)
                                    date_from = datetime.date(today.year, int(date_from_month), int(date_from_day))
                                    date_to = datetime.date(today.year, int(date_to_month), int(date_to_day))

                            full_url = tr.find('a').get('href').encode('utf-8')

                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_slug, None, {}, {}, source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(film_name, film_slug, None, None, film_id.encode('utf-8'), info, full_url, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(req_film.read())  # , from_encoding="utf-8"
                                            td = data_film.find('td', {'class': 'news'}).div.text.encode('utf-8')

                                            showtime = []
                                            if ind == 0:
                                                showtime = re.findall(r'\d+\:\d+\s\s?', td)
                                            else:
                                                if date_from and date_to:
                                                    if date_to < next_month:
                                                        showtimes = re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td)
                                                        times = []
                                                        for t in showtimes:
                                                            t = t.replace('Начало сеансов:', '').split(',')
                                                            times = [i.strip() for i in t if i.strip()]
                                                        delta = date_to - date_from
                                                        for day in range(delta.days + 1):
                                                            d = date_from + datetime.timedelta(days=day)
                                                            for t in times:
                                                                hours, minutes = t.split('-')
                                                                dtime = datetime.datetime(d.year, d.month, d.day, int(hours), int(minutes))
                                                                showtime.append(dtime)

                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip().split(':')
                                                    dtime = datetime.datetime(today.year, today.month, today.day, int(hours), int(minutes))
                                                else:
                                                    dtime = t

                                                sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')